HW5 solution, Due Monday September 26, 2016.
--------------------------------------------

(a)
> set.seed(75)
  Z = sample(1:4, 160, replace=T)
  X = rbeta(160, .6, .8)
  Y = 2 + 3*X + 0.2*(Z-2.5)*X + rchisq(160,3)
> sum(Y > 12)          ### = 6, as mentioned in the problem
> plot(X, Y, xlab="X values", ylab="Y values",
       main="Scatterplot of Y versus X Stratified by Z", type="n")
  for(i in 1:4) points(X[Z==i], Y[Z==i], pch=i+1)   ### pch = 2:5 by stratum
  abline(h=12, col="cyan")
  identify(X,Y)
[1]   6  21  42  87 103 127    ### indices printed after right mouse-click
> legend(locator(), legend=paste("Z=",1:4," points",sep=""), pch=2:5)
### graph saved as "HW5F16(a).pdf"

(b)
> set.seed(33333)
  Xvec = rbeta(200, 3.3, 1.4)
  NlogL = function(theta, xv) -sum(dbeta(xv, theta[1], theta[2], log=T))

(i)
> unlist(optim(c(3.3,1.4), NlogL, xv=Xvec)[c(1,2,4)])
       par1        par2       value convergence
   3.331952    1.542406  -64.159509    0.000000
> unlist(nlm(NlogL, c(3.3,1.4), xv=Xvec))
      minimum     estimate1     estimate2     gradient1     gradient2
-6.415951e+01  3.331688e+00  1.542352e+00 -1.025329e-06  1.892828e-06
         code    iterations
 1.000000e+00  1.100000e+01
Warning messages:
1: In dbeta(xv, theta[1], theta[2], log = T) : NaNs produced
2: In nlm(NlogL, c(3.3, 1.4), xv = Xvec) :
  NA/Inf replaced by maximum positive value

### Everything was very quick.  To get a timing run, augment the dataset
### by repeating it 1000 times.
> XvecBig = rep(Xvec, 1000)
> system.time({ tmp = optim(c(3.3,1.4), NlogL, xv=XvecBig) })
   user  system elapsed       #### Times on home laptop
   7.46    0.02    7.55
> system.time({ tmp = nlm(NlogL, c(3.3,1.4), xv=XvecBig) })
   user  system elapsed
   7.35    0.02    7.41
### with warnings again

(ii) The gradient of the sum of -log beta densities is
> attr(NlogL, "gradient") = function(theta, xv) {
     n = length(xv)
     tau = theta[1] + theta[2]
     n*c(digamma(theta[1]) - digamma(tau) - mean(log(xv)),
         digamma(theta[2]) - digamma(tau) - mean(log(1-xv))) }
> system.time({ tmp = optim(c(3.3,1.4), NlogL,
        gr = attr(NlogL,"gradient"), xv=XvecBig) })
   user  system elapsed
   7.46    0.10    7.60
> system.time({ tmp = nlm(NlogL, c(3.3,1.4), xv=XvecBig) })
   user  system elapsed
   7.37    0.07    7.48
> attr(NlogL, "hessian") = function(theta, xv) {
     n = length(xv)
     z = trigamma(theta[1] + theta[2])
     n*array(c(trigamma(theta[1]) - z, rep(-z,2),
               trigamma(theta[2]) - z), c(2,2)) }
### it would make sense to use "check.analyticals=T" in nlm
### at least some of the time for debugging purposes.
> system.time({ tmp = nlm(NlogL, c(3.3,1.4), xv=XvecBig) })
   user  system elapsed
   7.42    0.02    7.45
### So there is really no benefit seen in this example for analytical
### gradients and Hessians.
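### One caveat about the nlm timings above: according to the nlm help page,
### nlm looks for "gradient"/"hessian" attributes on the VALUE returned by
### the objective at each iterate, not on the function object itself, so
### attributes attached directly to NlogL are not seen by nlm (optim is
### different: it takes the gradient through its gr argument, as used above).
### A minimal sketch of nlm's documented convention, reusing the gradient and
### hessian functions defined above; the wrapper name NlogL2 is mine, not
### part of the assigned solution.

NlogL2 = function(theta, xv) {
   ## the objective value carries the analytic derivatives as attributes,
   ## which is the form nlm actually inspects
   val = -sum(dbeta(xv, theta[1], theta[2], log=TRUE))
   attr(val, "gradient") = attr(NlogL, "gradient")(theta, xv)
   attr(val, "hessian")  = attr(NlogL, "hessian")(theta, xv)
   val }
nlm(NlogL2, c(3.3,1.4), xv=XvecBig, check.analyticals=TRUE)

### With this wrapper nlm skips its internal finite differencing; whether
### that changes the timings here would have to be re-checked, since the
### two-parameter beta problem is cheap either way.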
#----------------------------------------------------------------
EXTRA CREDIT part.  Examples of minimization problems where it makes a LOT
of difference whether you supply analytical gradient and hessian attributes.

Attempt 1. Minimization of a very erratic function.
> set.seed(10001)
  Zvec = rnorm(100)/(1:100)^2.5
  snx = function(n,x) sin(n*x)
  Zvec2 = Zvec*(1:100)
### Random-coefficient d-variable Fourier series
  g = function(xy, dvec, n=100) {
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     sum(apply(mat,1,prod)*Zvec[1:n]) }
> xt = seq(0, 10, by=.1)
  y = replace(xt, T, 0)
  for(i in 1:101) y[i] = g(c(xt[i], 0.5), c(5,.3))
  plot(xt, y, type="l")      ### OK, very erratic!

### Multistart search: B random uniform starts, keep the best nlminb run.
> trymin = function(B, h, dvec, gr=NULL, hess=NULL, low=0, up=10, d=2, n=100) {
     uv = array(runif(d*B, low, up), c(B,d))
     out = array(0, c(B, d+1))
     for (k in 1:B) out[k,] = unlist(nlminb(uv[k,], h, gradient=gr,
           hessian=hess, lower=low, upper=up, dvec=dvec, n=n)[1:2])
     k = which.min(out[,d+1])
     out[k,] }
> system.time({ est.val = trymin(1e3, g, dvec=c(5,.3)) })
   user  system elapsed
  13.06    0.00   13.11      ### workstation at College Park
> est.val
[1]  9.9041223  7.9890727 -0.1864898
> trymin(1e3, g, c(5,.3))    ### same objective function again, but a
                             ### different minimizing point
> gprim = function(xy, dvec, n=100) {
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     out = numeric(d)
     for(i in 1:d) out[i] = dvec[i]*sum( apply(mat[,-i, drop=F],1,prod)*
           cos((1:n)*xy[i]*dvec[i])*Zvec2[1:n])
     out }
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3)) })
   user  system elapsed
  11.71    0.00   11.76
> est.val
[1]  9.5737521  2.4829028 -0.1864898
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3)) })
   user  system elapsed
  11.68    0.00   11.69
> est.val
[1]  9.9041223  7.9890728 -0.1864898
### small (10%) improvement with gradient

#-----------------------------------------
#### now compare times with and without gradient when the argument dim d=4
> system.time({ est.val = trymin(1e3, g, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  37.05    0.00   37.15
> est.val
[1]  2.827433  5.235988  9.424778  5.759587 -0.183337
> system.time({ est.val = trymin(1e3, g, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  36.69    0.05   36.86
> est.val
[1]  0.9424778  5.2359878  3.1415927  9.9483767 -0.1833370
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3,1.5,3), d=4) })
   user  system elapsed
  31.80    0.01   31.87
> est.val
[1]  0.3141593  5.2359878  7.3303829  6.8067841 -0.1833370
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3,1.5,3), d=4) })
   user  system elapsed
  31.52    0.00   31.61
> est.val
[1]  8.482300  5.235988  1.047198  6.806784 -0.183337
### Improvement with gradient is better now: 14%
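### Before attributing these timing differences to the analytic gradient, it
### is cheap to check that gprim actually matches g.  A sketch of a one-off
### central-difference comparison at a few random points; the helper numgrad
### is mine, not part of the assigned solution.

numgrad = function(h, xy, eps=1e-6, ...) {
   ## central-difference approximation to the gradient of h at xy
   d = length(xy)
   sapply(1:d, function(i) {
      e = replace(numeric(d), i, eps)
      (h(xy + e, ...) - h(xy - e, ...))/(2*eps) }) }
set.seed(1)
for (r in 1:3) {
   xy = runif(2, 0, 10)
   print(range(gprim(xy, c(5,.3)) - numgrad(g, xy, dvec=c(5,.3)))) }
### discrepancies should be at numerical-noise level (roughly 1e-6 or
### smaller), confirming the analytic formula before timing with it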
###-------------------------------------------------------
### Try it again with a different optimizing function
> trymin2 = function(B, h, dvec, gr=NULL, hess=NULL, low=0, up=10, d=2, n=100) {
     uv = array(runif(d*B, low, up), c(B,d))
     out = array(0, c(B, d+1))
     for (k in 1:B) out[k,] = unlist(optim(uv[k,], h, gr=gr, lower=low,
           upper=up, dvec=dvec, n=n, method="L-BFGS-B")[1:2])
     k = which.min(out[,d+1])
     out[k,] }
> system.time({ est.val = trymin2(1e3, g, dvec=c(5,.3)) })
   user  system elapsed
  30.04    0.00   30.14
> est.val
[1]  7.3908482  7.9890731 -0.1864898
> system.time({ est.val = trymin2(1e3, g, gr=gprim, dvec=c(5,.3)) })
   user  system elapsed
  14.33    0.00   14.40
> est.val
[1]  9.9041223  7.9890727 -0.1864898
### OK, a real speedup!

### Now d=4 with optim
> system.time({ est.val = trymin2(1e3, g, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  87.81    0.01   88.09
> est.val
[1]  2.199115  5.235987  5.235988  2.617994 -0.183337
### same objective value, different minimizer than with nlminb
> system.time({ est.val = trymin2(1e3, g, gr=gprim, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  65.50    0.03   66.12
> est.val
[1]  9.110619  5.235988  3.141593  2.617994 -0.183337
### greater speedup from the gradient with optim, although optim itself was
### slower than nlminb on the d=4 problem
> system.time({ est.val = trymin2(1e3, g, gr=gprim, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  43.84    0.00   44.03
> est.val
[1]  5.340707  5.235986  7.330383  4.712389 -0.183337

##------------ d=6 cases
> system.time({ est.val = trymin(1e3, g,                  ## nlminb, d=6
       c(5,.3,1.5,3,2,.8), d=6, n=30) })
   user  system elapsed
  27.04    0.00   27.09
> est.val
[1]  0.9424778  5.2359878  1.0471976  8.9011792  0.7853982  1.9634954
[7] -0.1836903
> system.time({ est.val = trymin(1e3, g, gr=gprim,        ## nlminb, d=6
       c(5,.3,1.5,3,2,.8), d=6, n=30) })
   user  system elapsed
  22.79    0.02   22.89
> est.val
[1]  2.8274334  5.2359878  3.1415927  0.5235988  0.7853982  1.9634954
[7] -0.1836903

#-----------------------------------------------------------------------
##### Try another higher-dimensional function that is slightly less crazy
> qfcn = function(xy, dvec, n=100, lam=1) {
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     sum(apply(mat,1,prod)*Zvec[1:n])/(1 + lam*sum(xy^2)) }
> qprim = function(xy, dvec, n=100, lam=1) {    ### gradient by quotient rule
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     out = numeric(d)
     for(i in 1:d) out[i] = dvec[i]*sum( apply(mat[,-i, drop=F],1,prod)*
           cos((1:n)*xy[i]*dvec[i])*Zvec2[1:n])
     out/(1 + lam*sum(xy^2)) -
        (2*lam/(1 + lam*sum(xy^2)))*xy*qfcn(xy,dvec,n,lam) }
> qfcn2 = qfcn; attr(qfcn2, "gradient") = qprim
### The point of the term 1/(1+lam*sum(xy^2)) is to ensure that there are no
### minima at very large magnitudes of xy, so that we can try nlm to solve
### without box-constraints.  [Recall that with nlm, we must use attributes
### to include the gradient or hessian functions.]
> trymin3 = function(B, h, dvec, d=2, n=100, lam=1, low=0, up=5) {
     uv = array(runif(d*B, low, up), c(B,d))
     out = array(0, c(B, d+1))
     for (k in 1:B) out[k,] = unlist(nlm(h, uv[k,], dvec=dvec,
           n=n, lam=lam)[2:1])
     k = which.min(out[,d+1])
     out[k,] }
> system.time({ est.val = trymin(1e3, qfcn,          ## nlminb, d=6, n=10
       c(5,.3,1.5,3,2,.8), d=6, n=10) })
   user  system elapsed
  34.97    0.00   35.06
> est.val
[1]  1.391046505  2.122399098  4.627019900  1.270959753  0.336311250
[6]  0.835050112 -0.003767413
> system.time({ est.val = trymin(1e3, qfcn, gr=qprim, ## nlminb, d=6, n=10
       c(5,.3,1.5,3,2,.8), d=6, n=10) })
   user  system elapsed
  32.99    0.00   33.07
> est.val
[1]  2.826372283  3.470929629  0.751565606  1.569159621  0.783557773
[6]  1.935316844 -0.003741943

###--------------------- now with nlm using trymin3
> system.time({ est.val = trymin3(1e3, qfcn,          ## nlm, d=6, n=10
       c(.5,.1,1.5,.3,.2,.1), d=6, n=10, lam=1) })    ### lam=1
   user  system elapsed
  35.05    0.03   35.24
> est.val
[1]  0.730459899  2.365617240 -0.248549954  1.169499504  1.619412122
[6]  2.365686234 -0.002230617
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=10
       c(.5,.1,1.5,.3,.2,.1), d=6, n=10, lam=1) })    ### lam=1
   user  system elapsed
  34.85    0.00   34.91
> est.val
[1]  0.730462147  2.365608364 -0.248551815  1.169457896  1.619418362
[6]  2.365570442 -0.002230617
### not much advantage when the function is this smooth
> system.time({ est.val = trymin3(1e3, qfcn,          ## nlm, d=6, n=30
       2*c(.5,.1,1.5,.3,.2,.1), d=6, n=30, lam=1) })  ### lam=1, dvec*2
   user  system elapsed
  52.62    0.00   52.70
> est.val
[1]  0.413607801  1.461983225  0.400586399  0.658903764  0.932461269
[6]  1.461997933 -0.007261782
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=30
       2*c(.5,.1,1.5,.3,.2,.1), d=6, n=30, lam=1) })  ### lam=1, dvec*2
   user  system elapsed
  50.06    0.00   50.15
> est.val
[1]  0.413621313  1.461962399  0.400587762  0.658889245  0.932480579
[6]  1.461953209 -0.007261782
### Not much advantage
> system.time({ est.val = trymin3(1e3, qfcn,          ## nlm, d=6, n=50
       6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1) })  ### lam=1
   user  system elapsed
  53.81    0.00   53.96
> est.val
[1]  1.26015802  1.01206284  0.77019940  0.35862784  0.53176807  1.01205678
[7] -0.01972945
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=50
       6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1) })  ### lam=1
   user  system elapsed
  53.81    0.00   53.96
> est.val
[1]  1.26015802  1.01206284  0.77019940  0.35862784  0.53176807  1.01205678
[7] -0.01972945
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=50
+      6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1) })
   user  system elapsed
  52.23    0.00   52.40
> est.val
[1] -0.21603707  0.93448021  0.27646817  0.35633932  0.52180588  0.93448021
[7] -0.03164016
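### The caveat from part (b)(ii) applies to qfcn2 as well: nlm only uses an
### analytic gradient supplied as an attribute of the VALUE the objective
### returns, so the "gradient" attribute attached to the qfcn2 function
### object is ignored, which would be consistent with the nearly identical
### qfcn and qfcn2 timings and minima above.  A sketch of a wrapper in the
### form nlm actually inspects; the name qfcn3 is mine, not part of the
### assigned solution.

qfcn3 = function(xy, dvec, n=100, lam=1) {
   ## the returned value carries the analytic gradient as an attribute
   val = qfcn(xy, dvec, n, lam)
   attr(val, "gradient") = qprim(xy, dvec, n, lam)
   val }
## e.g. rerun the n=50 comparison:
##   trymin3(1e3, qfcn3, 6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1)

### Re-timing with qfcn3 would show whether nlm's internal finite
### differencing was actually costing anything on this smoother function.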
#### So it seems that the advantage achieved by using the gradient depends
### a lot on the specific optimization algorithm, on the dimension, and on
### the irregularity of the function, and it is fairly hard to find examples
### where that advantage is dramatic.
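### For anyone extending this experiment, a small harness makes the
### with/without-gradient comparison systematic instead of a series of
### hand-run system.time() calls.  A sketch using the trymin(), g(), and
### gprim() defined above; the helper name gradratio and its defaults are
### mine, not part of the assigned solution.

gradratio = function(B, dvec, d, n=100, seed=1) {
   ## same seed before both runs, so both searches use identical random starts
   set.seed(seed)
   t0 = unname(system.time(trymin(B, g, dvec=dvec, d=d, n=n))["elapsed"])
   set.seed(seed)
   t1 = unname(system.time(trymin(B, g, gr=gprim, dvec=dvec, d=d, n=n))["elapsed"])
   c(no.grad=t0, with.grad=t1, ratio=t1/t0) }
## e.g. gradratio(200, c(5,.3), d=2)
##      gradratio(200, c(5,.3,1.5,3), d=4)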