HW5 solution, Due Monday September 26, 2016.
--------------------------------------------

(a)
> set.seed(75)
  Z = sample(1:4, 160, replace=T)
  X = rbeta(160, .6, .8)
  Y = 2 + 3*X + 0.2*(Z-2.5)*X + rchisq(160,3)
> sum(Y > 12)          ### = 6, as mentioned in the problem
> plot(X, Y, xlab="X values", ylab="Y values",
       main="Scatterplot of Y versus X Stratified by Z", type="n")
  for(i in 1:4) points(X[Z==i], Y[Z==i], pch=i+1)   ### pch = 2:5 by stratum
  abline(h=12, col="cyan")
  identify(X,Y)
[1]   6  21  42  87 103 127    ### indices printed after right mouse-click
> legend(locator(), legend=paste("Z=",1:4," points",sep=""), pch=2:5)
### graph saved as "HW5F16(a).pdf"

(b)
> set.seed(33333)
  Xvec = rbeta(200, 3.3, 1.4)
  NlogL = function(theta, xv) -sum(dbeta(xv, theta[1], theta[2], log=T))

(i)
> unlist(optim(c(3.3,1.4), NlogL, xv=Xvec)[c(1,2,4)])
       par1        par2       value convergence
   3.331952    1.542406  -64.159509    0.000000
> unlist(nlm(NlogL, c(3.3,1.4), xv=Xvec))
      minimum     estimate1     estimate2     gradient1     gradient2
-6.415951e+01  3.331688e+00  1.542352e+00 -1.025329e-06  1.892828e-06
         code    iterations
 1.000000e+00  1.100000e+01
Warning messages:
1: In dbeta(xv, theta[1], theta[2], log = T) : NaNs produced
2: In nlm(NlogL, c(3.3, 1.4), xv = Xvec) :
  NA/Inf replaced by maximum positive value

### Everything was very quick.  To get a timing run, augment the dataset
### by repeating it 1000 times.
> XvecBig = rep(Xvec, 1000)
> system.time({ tmp = optim(c(3.3,1.4), NlogL, xv=XvecBig) })
   user  system elapsed       #### Times on home laptop
   7.46    0.02    7.55
> system.time({ tmp = nlm(NlogL, c(3.3,1.4), xv=XvecBig) })
   user  system elapsed
   7.35    0.02    7.41
### with warnings again

(ii) The gradient of the sum of -log beta densities is
> attr(NlogL, "gradient") = function(theta, xv) {
     n = length(xv)
     tau = theta[1] + theta[2]
     n*c(digamma(theta[1]) - digamma(tau) - mean(log(xv)),
         digamma(theta[2]) - digamma(tau) - mean(log(1-xv))) }
> system.time({ tmp = optim(c(3.3,1.4), NlogL,
        gr = attr(NlogL,"gradient"), xv=XvecBig) })
   user  system elapsed
   7.46    0.10    7.60
> system.time({ tmp = nlm(NlogL, c(3.3,1.4), xv=XvecBig) })
   user  system elapsed
   7.37    0.07    7.48
> attr(NlogL, "hessian") = function(theta, xv) {
     n = length(xv)
     z = trigamma(theta[1] + theta[2])
     n*array(c(trigamma(theta[1]) - z, rep(-z,2),
               trigamma(theta[2]) - z), c(2,2)) }
### it would make sense to use "check.analyticals=T" in nlm
### at least some of the time for debugging purposes.
> system.time({ tmp = nlm(NlogL, c(3.3,1.4), xv=XvecBig) })
   user  system elapsed
   7.42    0.02    7.45
### So there is really no benefit seen in this example for analytical
### gradients and Hessians.
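### One caveat about the nlm timings above: according to the nlm help page,
### nlm looks for "gradient"/"hessian" attributes on the VALUE returned by
### the objective at each iterate, not on the function object itself, so
### attributes attached directly to NlogL are not seen by nlm (optim is
### different: it takes the gradient through its gr argument, as used above).
### A minimal sketch of nlm's documented convention, reusing the gradient and
### hessian functions defined above; the wrapper name NlogL2 is mine, not
### part of the assigned solution.

NlogL2 = function(theta, xv) {
   ## the objective value carries the analytic derivatives as attributes,
   ## which is the form nlm actually inspects
   val = -sum(dbeta(xv, theta[1], theta[2], log=TRUE))
   attr(val, "gradient") = attr(NlogL, "gradient")(theta, xv)
   attr(val, "hessian")  = attr(NlogL, "hessian")(theta, xv)
   val }
nlm(NlogL2, c(3.3,1.4), xv=XvecBig, check.analyticals=TRUE)

### With this wrapper nlm skips its internal finite differencing; whether
### that changes the timings here would have to be re-checked, since the
### two-parameter beta problem is cheap either way.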
#----------------------------------------------------------------
EXTRA CREDIT part.  Examples of minimization problems where it makes a LOT
of difference whether you supply analytical gradient and hessian attributes.

Attempt 1. Minimization of a very erratic function.
> set.seed(10001)
  Zvec = rnorm(100)/(1:100)^2.5
  snx = function(n,x) sin(n*x)
  Zvec2 = Zvec*(1:100)
### Random-coefficient d-variable Fourier series
  g = function(xy, dvec, n=100) {
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     sum(apply(mat,1,prod)*Zvec[1:n]) }
> xt = seq(0, 10, by=.1)
  y = replace(xt, T, 0)
  for(i in 1:101) y[i] = g(c(xt[i], 0.5), c(5,.3))
  plot(xt, y, type="l")      ### OK, very erratic!

### Multistart search: B random uniform starts, keep the best nlminb run.
> trymin = function(B, h, dvec, gr=NULL, hess=NULL, low=0, up=10, d=2, n=100) {
     uv = array(runif(d*B, low, up), c(B,d))
     out = array(0, c(B, d+1))
     for (k in 1:B) out[k,] = unlist(nlminb(uv[k,], h, gradient=gr,
           hessian=hess, lower=low, upper=up, dvec=dvec, n=n)[1:2])
     k = which.min(out[,d+1])
     out[k,] }
> system.time({ est.val = trymin(1e3, g, dvec=c(5,.3)) })
   user  system elapsed
  13.06    0.00   13.11      ### workstation at College Park
> est.val
[1]  9.9041223  7.9890727 -0.1864898
> trymin(1e3, g, c(5,.3))    ### same objective function again, but a
                             ### different minimizing point
> gprim = function(xy, dvec, n=100) {
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     out = numeric(d)
     for(i in 1:d) out[i] = dvec[i]*sum( apply(mat[,-i, drop=F],1,prod)*
           cos((1:n)*xy[i]*dvec[i])*Zvec2[1:n])
     out }
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3)) })
   user  system elapsed
  11.71    0.00   11.76
> est.val
[1]  9.5737521  2.4829028 -0.1864898
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3)) })
   user  system elapsed
  11.68    0.00   11.69
> est.val
[1]  9.9041223  7.9890728 -0.1864898
### small (10%) improvement with gradient

#-----------------------------------------
#### now compare times with and without gradient when the argument dim d=4
> system.time({ est.val = trymin(1e3, g, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  37.05    0.00   37.15
> est.val
[1]  2.827433  5.235988  9.424778  5.759587 -0.183337
> system.time({ est.val = trymin(1e3, g, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  36.69    0.05   36.86
> est.val
[1]  0.9424778  5.2359878  3.1415927  9.9483767 -0.1833370
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3,1.5,3), d=4) })
   user  system elapsed
  31.80    0.01   31.87
> est.val
[1]  0.3141593  5.2359878  7.3303829  6.8067841 -0.1833370
> system.time({ est.val = trymin(1e3, g, gr=gprim, dvec=c(5,.3,1.5,3), d=4) })
   user  system elapsed
  31.52    0.00   31.61
> est.val
[1]  8.482300  5.235988  1.047198  6.806784 -0.183337
### Improvement with gradient is better now: 14%
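### Before attributing these timing differences to the analytic gradient, it
### is cheap to check that gprim actually matches g.  A sketch of a one-off
### central-difference comparison at a few random points; the helper numgrad
### is mine, not part of the assigned solution.

numgrad = function(h, xy, eps=1e-6, ...) {
   ## central-difference approximation to the gradient of h at xy
   d = length(xy)
   sapply(1:d, function(i) {
      e = replace(numeric(d), i, eps)
      (h(xy + e, ...) - h(xy - e, ...))/(2*eps) }) }
set.seed(1)
for (r in 1:3) {
   xy = runif(2, 0, 10)
   print(range(gprim(xy, c(5,.3)) - numgrad(g, xy, dvec=c(5,.3)))) }
### discrepancies should be at numerical-noise level (roughly 1e-6 or
### smaller), confirming the analytic formula before timing with it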
###-------------------------------------------------------
### Try it again with a different optimizing function
> trymin2 = function(B, h, dvec, gr=NULL, hess=NULL, low=0, up=10, d=2, n=100) {
     uv = array(runif(d*B, low, up), c(B,d))
     out = array(0, c(B, d+1))
     for (k in 1:B) out[k,] = unlist(optim(uv[k,], h, gr=gr, lower=low,
           upper=up, dvec=dvec, n=n, method="L-BFGS-B")[1:2])
     k = which.min(out[,d+1])
     out[k,] }
> system.time({ est.val = trymin2(1e3, g, dvec=c(5,.3)) })
   user  system elapsed
  30.04    0.00   30.14
> est.val
[1]  7.3908482  7.9890731 -0.1864898
> system.time({ est.val = trymin2(1e3, g, gr=gprim, dvec=c(5,.3)) })
   user  system elapsed
  14.33    0.00   14.40
> est.val
[1]  9.9041223  7.9890727 -0.1864898
### OK, a real speedup!

### Now d=4 with optim
> system.time({ est.val = trymin2(1e3, g, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  87.81    0.01   88.09
> est.val
[1]  2.199115  5.235987  5.235988  2.617994 -0.183337
### same objective value, different minimizer than with nlminb
> system.time({ est.val = trymin2(1e3, g, gr=gprim, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  65.50    0.03   66.12
> est.val
[1]  9.110619  5.235988  3.141593  2.617994 -0.183337
### greater speedup from the gradient with optim, although optim itself was
### slower than nlminb on the d=4 problem
> system.time({ est.val = trymin2(1e3, g, gr=gprim, c(5,.3,1.5,3), d=4) })
   user  system elapsed
  43.84    0.00   44.03
> est.val
[1]  5.340707  5.235986  7.330383  4.712389 -0.183337

##------------ d=6 cases
> system.time({ est.val = trymin(1e3, g,                  ## nlminb, d=6
       c(5,.3,1.5,3,2,.8), d=6, n=30) })
   user  system elapsed
  27.04    0.00   27.09
> est.val
[1]  0.9424778  5.2359878  1.0471976  8.9011792  0.7853982  1.9634954
[7] -0.1836903
> system.time({ est.val = trymin(1e3, g, gr=gprim,        ## nlminb, d=6
       c(5,.3,1.5,3,2,.8), d=6, n=30) })
   user  system elapsed
  22.79    0.02   22.89
> est.val
[1]  2.8274334  5.2359878  3.1415927  0.5235988  0.7853982  1.9634954
[7] -0.1836903

#-----------------------------------------------------------------------
##### Try another higher-dimensional function that is slightly less crazy
> qfcn = function(xy, dvec, n=100, lam=1) {
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     sum(apply(mat,1,prod)*Zvec[1:n])/(1 + lam*sum(xy^2)) }
> qprim = function(xy, dvec, n=100, lam=1) {    ### gradient by quotient rule
     d = length(xy)
     mat = outer(1:n, xy*dvec, snx)
     out = numeric(d)
     for(i in 1:d) out[i] = dvec[i]*sum( apply(mat[,-i, drop=F],1,prod)*
           cos((1:n)*xy[i]*dvec[i])*Zvec2[1:n])
     out/(1 + lam*sum(xy^2)) -
        (2*lam/(1 + lam*sum(xy^2)))*xy*qfcn(xy,dvec,n,lam) }
> qfcn2 = qfcn; attr(qfcn2, "gradient") = qprim
### The point of the term 1/(1+lam*sum(xy^2)) is to ensure that there are no
### minima at very large magnitudes of xy, so that we can try nlm to solve
### without box-constraints.  [Recall that with nlm, we must use attributes
### to include the gradient or hessian functions.]
> trymin3 = function(B, h, dvec, d=2, n=100, lam=1, low=0, up=5) {
     uv = array(runif(d*B, low, up), c(B,d))
     out = array(0, c(B, d+1))
     for (k in 1:B) out[k,] = unlist(nlm(h, uv[k,], dvec=dvec,
           n=n, lam=lam)[2:1])
     k = which.min(out[,d+1])
     out[k,] }
> system.time({ est.val = trymin(1e3, qfcn,          ## nlminb, d=6, n=10
       c(5,.3,1.5,3,2,.8), d=6, n=10) })
   user  system elapsed
  34.97    0.00   35.06
> est.val
[1]  1.391046505  2.122399098  4.627019900  1.270959753  0.336311250
[6]  0.835050112 -0.003767413
> system.time({ est.val = trymin(1e3, qfcn, gr=qprim, ## nlminb, d=6, n=10
       c(5,.3,1.5,3,2,.8), d=6, n=10) })
   user  system elapsed
  32.99    0.00   33.07
> est.val
[1]  2.826372283  3.470929629  0.751565606  1.569159621  0.783557773
[6]  1.935316844 -0.003741943

###--------------------- now with nlm using trymin3
> system.time({ est.val = trymin3(1e3, qfcn,          ## nlm, d=6, n=10
       c(.5,.1,1.5,.3,.2,.1), d=6, n=10, lam=1) })    ### lam=1
   user  system elapsed
  35.05    0.03   35.24
> est.val
[1]  0.730459899  2.365617240 -0.248549954  1.169499504  1.619412122
[6]  2.365686234 -0.002230617
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=10
       c(.5,.1,1.5,.3,.2,.1), d=6, n=10, lam=1) })    ### lam=1
   user  system elapsed
  34.85    0.00   34.91
> est.val
[1]  0.730462147  2.365608364 -0.248551815  1.169457896  1.619418362
[6]  2.365570442 -0.002230617
### not much advantage when the function is this smooth
> system.time({ est.val = trymin3(1e3, qfcn,          ## nlm, d=6, n=30
       2*c(.5,.1,1.5,.3,.2,.1), d=6, n=30, lam=1) })  ### lam=1, dvec*2
   user  system elapsed
  52.62    0.00   52.70
> est.val
[1]  0.413607801  1.461983225  0.400586399  0.658903764  0.932461269
[6]  1.461997933 -0.007261782
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=30
       2*c(.5,.1,1.5,.3,.2,.1), d=6, n=30, lam=1) })  ### lam=1, dvec*2
   user  system elapsed
  50.06    0.00   50.15
> est.val
[1]  0.413621313  1.461962399  0.400587762  0.658889245  0.932480579
[6]  1.461953209 -0.007261782
### Not much advantage
> system.time({ est.val = trymin3(1e3, qfcn,          ## nlm, d=6, n=50
       6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1) })  ### lam=1
   user  system elapsed
  53.81    0.00   53.96
> est.val
[1]  1.26015802  1.01206284  0.77019940  0.35862784  0.53176807  1.01205678
[7] -0.01972945
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=50
       6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1) })  ### lam=1
   user  system elapsed
  53.81    0.00   53.96
> est.val
[1]  1.26015802  1.01206284  0.77019940  0.35862784  0.53176807  1.01205678
[7] -0.01972945
> system.time({ est.val = trymin3(1e3, qfcn2,         ## nlm, d=6, n=50
+      6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1) })
   user  system elapsed
  52.23    0.00   52.40
> est.val
[1] -0.21603707  0.93448021  0.27646817  0.35633932  0.52180588  0.93448021
[7] -0.03164016
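### The caveat from part (b)(ii) applies to qfcn2 as well: nlm only uses an
### analytic gradient supplied as an attribute of the VALUE the objective
### returns, so the "gradient" attribute attached to the qfcn2 function
### object is ignored, which would be consistent with the nearly identical
### qfcn and qfcn2 timings and minima above.  A sketch of a wrapper in the
### form nlm actually inspects; the name qfcn3 is mine, not part of the
### assigned solution.

qfcn3 = function(xy, dvec, n=100, lam=1) {
   ## the returned value carries the analytic gradient as an attribute
   val = qfcn(xy, dvec, n, lam)
   attr(val, "gradient") = qprim(xy, dvec, n, lam)
   val }
## e.g. rerun the n=50 comparison:
##   trymin3(1e3, qfcn3, 6*c(.5,.1,1.5,.3,.2,.1), d=6, n=50, lam=1)

### Re-timing with qfcn3 would show whether nlm's internal finite
### differencing was actually costing anything on this smoother function.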
#### So it seems that the advantage achieved by using the gradient depends
### a lot on the specific optimization algorithm, on the dimension, and on
### the irregularity of the function, and it is fairly hard to find examples
### where that advantage is dramatic.
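### For anyone extending this experiment, a small harness makes the
### with/without-gradient comparison systematic instead of a series of
### hand-run system.time() calls.  A sketch using the trymin(), g(), and
### gprim() defined above; the helper name gradratio and its defaults are
### mine, not part of the assigned solution.

gradratio = function(B, dvec, d, n=100, seed=1) {
   ## same seed before both runs, so both searches use identical random starts
   set.seed(seed)
   t0 = unname(system.time(trymin(B, g, dvec=dvec, d=d, n=n))["elapsed"])
   set.seed(seed)
   t1 = unname(system.time(trymin(B, g, gr=gprim, dvec=dvec, d=d, n=n))["elapsed"])
   c(no.grad=t0, with.grad=t1, ratio=t1/t0) }
## e.g. gradratio(200, c(5,.3), d=2)
##      gradratio(200, c(5,.3,1.5,3), d=4)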