Homework 10 solution, Oct 2009.
===============================

> HW10Data = rlogis(5000, runif(1,2,2.5), runif(1,.7,.85))
> mean(HW10Data)
[1] 2.480233
> sqrt(var(HW10Data))
[1] 1.366586
> integrate(function(x) x^2*dlogis(x), -20,20,abs.tol=1e-9)$val
[1] 3.289866

> unix.time({xfit = nlm(function(x) -sum(log(dlogis(HW10Data,x[1],x[2]))),
     c(2,1))})
[1] 0.12 0.00 0.13 NA NA
> str(xfit)
List of 5
 $ minimum   : num 8573
 $ estimate  : num [1:2] 2.48 0.75
 $ gradient  : num [1:2] -0.00251 0.00363
 $ code      : int 1
 $ iterations: int 7
> unix.time({xfit = nlm(function(x) -sum(log(dlogis(HW10Data,x[1],x[2]))),
     c(0,1))})
[1] 0.19 0.00 0.20 NA NA
> unix.time({xfit = nlm(function(x) -sum(log(dlogis( HW10Data,x[1],x[2]))),
     c(0,1), hessian=T)})
[1] 0.18 0.02 0.20 NA NA

> unix.time({xfit2 = nlm(function(x) {
     obj = -sum(log(dlogis(HW10Data,x[1],x[2])))
     pvec=plogis(HW10Data,x[1],x[2])
     nd = length(HW10Data)
     attr(obj,"gradient") = c(nd - 2*sum(pvec),
        nd-sum((HW10Data-x[1])*(2*pvec-1))/x[2])/x[2]
     obj}, c(2,1))})
[1] 0.15 0.00 0.15 NA NA
> str(xfit2)
List of 5
 $ minimum   : num 8573
 $ estimate  : num [1:2] 2.48 0.75
 $ gradient  : num [1:2] -0.00251 0.00364
 $ code      : int 1
 $ iterations: int 7
> unix.time({xfit2 = nlm(function(x) {
     obj = -sum(log(dlogis(HW10Data,x[1],x[2])))
     pvec=plogis(HW10Data,x[1],x[2])
     nd = length(HW10Data)
     attr(obj,"gradient") = c(nd - 2*sum(pvec),
        nd-sum((HW10Data-x[1])*(2*pvec-1))/x[2])/x[2]
     obj}, c(2,1), hessian=T)})
[1] 0.18 0.00 0.17 NA NA
> unix.time({xfit2 = nlm(function(x) {
     obj = -sum(log(dlogis(HW10Data,x[1],x[2])))
     pvec=plogis(HW10Data,x[1],x[2])
     nd = length(HW10Data)
     attr(obj,"gradient") = c(nd - 2*sum(pvec),
        nd-sum((HW10Data-x[1])*(2*pvec-1))/x[2])/x[2]
     obj}, c(0,1))})
[1] 0.16 0.00 0.16

> unix.time({xfit3 = nlm(function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     attr(obj,"gradient") = c(nd - 2*sum(pvec),
        nd-sum(stdvec*(2*pvec-1)))/x[2]
     attr(obj,"hessian") =
        -matrix(c(-2*sum(dvec), rep(nd-2*sum(pvec+stdvec*dvec),2),
           nd+2*sum( stdvec*(1-2*pvec-dvec*stdvec))), ncol=2)/x[2]^2
     obj}, c(2,1))})
[1] 0.68 0.00 0.76 NA NA

## So in simple problems or maybe not-so-simple there
## might be no benefit in using the analytical
## gradients unless you want the output Hessian
## (which we almost always do want for statistical
## applications), but even then the analytical
## Hessian might be more trouble than it is worth !!

### Now for part (b): so far we have found using nlm:

> xfit$est
[1] 2.2160326 0.8122672
> xfit2$est
[1] 2.2160326 0.8122672
> xfit3$est
[1] 2.2160321 0.8122672

> unix.time({xfit4 = optim(c(2,1),function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     obj})})
   user  system elapsed
   0.23    0.00    0.23
> xfit4$par
[1] 2.2161128 0.8122323
> unix.time({cat(optim(c(2,1),function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     obj}, control=list(reltol=1e-10))$par,"\n")})
2.216024 0.8122737
   user  system elapsed
    0.3     0.0     0.3

### so it looks as though the default
### accuracy for "optim" is a little less than for "nlm"

### The "Gradient-Search" solution is done using the "Gradmat"
### and "GradSrch" functions provided in the notes.
### Listings of these same functions are given at the end of
### this Solution file for convenience.

> likfunc = function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     obj}
> Gradmat(c(2,1),likfunc)
[1] -392.566 1248.681
### so the steps in the GradSrch function
#### will be much too large if we are not careful !
> unix.time({xfit5=GradSrch(c(2,1), function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     obj}, .05, unitfac=T)})
   user  system elapsed
   0.73    0.00    0.75
> xfit5
$nstep
[1] 25   ### ran up to iteration limit
$initial
[1] 2 1
$final
[1] 2.2159634 0.7853559
$funcval
[1] 8964.238
> Gradmat(xfit5$final,likfunc)
[1]   -0.788279 -312.608450
## still very far from convergence
unix.time({xfit5=GradSrch(xfit5$final, function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     obj}, 1e-4, unitfac=F)})
   user  system elapsed
   0.14    0.00    0.14
#### Now do it again using small gradient steps
### with this "final" point as new starting point
> unix.time({xfit5=GradSrch(xfit5$final, function(x) {
     dvec=dlogis(HW10Data,x[1],x[2])
     pvec=plogis(HW10Data,x[1],x[2])
     stdvec=(HW10Data-x[1])/x[2]
     obj = -sum(log(dvec))
     nd = length(HW10Data)
     obj}, .001)})
> Gradmat(xfit5$final,likfunc)
[1] -0.001060471  0.001962690
### much better !!
> unlist(xfit5[c(1,3)])
    nstep    final1    final2
5.0000000 2.2160322 0.8122674

---------- Function Listings follow ------------

> Gradmat
function(parvec, infcn, eps = 1e-06) {
   # Function to calculate the difference-quotient approx gradient
   # (matrix) of an arbitrary input (vector) function infcn
   # Now recoded to use central differences !
   # Each coordinate is perturbed by +eps/2 and -eps/2, so the
   # total spread is eps and the quotient below divides by eps.
   dd = length(parvec)            # number of parameters (columns of gmat)
   aa = length(infcn(parvec))     # length of the function value (rows of gmat)
   epsmat = (diag(dd) * eps)/2    # column i = half-step perturbation of coord i
   gmat = array(0, dim = c(aa, dd))
   for(i in 1:dd)
      gmat[, i] = (infcn(parvec + epsmat[, i]) -
         infcn(parvec - epsmat[, i]))/eps
   # Return a matrix for vector-valued infcn, a plain vector otherwise
   if(aa > 1) gmat else c(gmat)
}

> GradSrch
function(inipar, infcn, step, nmax = 25, stoptol = 1e-05,
   unitfac = F, eps = 1e-06, gradfunc = NULL) {
   # Function to implement Steepest-descent. The unitfac
   # condition indicates whether or not the supplied step-length
   # factor(s) multiply the negative gradient itself, or
   # the unit vector in the same direction.
   # step may be a single factor (recycled for all nmax iterations)
   # or a vector of per-iteration factors.  gradfunc defaults to the
   # numerical central-difference gradient Gradmat above.
   if(is.null(gradfunc))
      gradfunc = function(x) Gradmat(x, infcn, eps)
   steps = if(length(step) > 1) step else rep(step, nmax)
   newpar = inipar
   oldpar = newpar - 1    # offset start point so the loop runs at least once
   ctr = 0
   # Stop after nmax iterations, or when successive iterates move
   # less than stoptol in Euclidean norm.
   # NOTE(review): `&` is elementwise; `&&` would be the scalar idiom
   # here -- behavior is identical for these length-1 conditions.
   while(ctr < nmax & sqrt(sum((newpar - oldpar)^2)) > stoptol) {
      ctr = ctr + 1
      oldpar = newpar
      newstep = gradfunc(oldpar)
      newstep = if(unitfac) newstep/sqrt(sum(
         newstep^2)) else newstep
      ### use unit vector in gradient direction if unitfac=T
      newpar = oldpar - steps[ctr] * newstep
   }
   # Return the iteration count, start/end points, and final objective value.
   list(nstep = ctr, initial = inipar, final = newpar,
      funcval = infcn(newpar))
}