Example Logs, STAT 705, Fall 2015
=================================

Part III, November 2015
=========================

11/9/2015   GLM material roughly following log "RlogF09.GLM.txt"

X1 = runif(120);  X2 = rpois(120,3)
ntrials = rep(c(5,6,3), 40)
success = rbinom( 120, ntrials, plogis(-1 + 0.6*X1 + 0.1*X2) )
failure = ntrials - success
simlogis = data.frame( X1, X2, success, failure )

> fitB0 = glm ( cbind(success, failure) ~ X1 + X2, family=binomial,
                data=simlogis )
### NB: within family=binomial, link="logit" is the default,
###     but other possibilities exist, most often "probit" or "cloglog".
### NB: there is another syntax, sometimes more useful, giving EXACTLY THE
###     SAME MODEL FITTED OBJECTS.
> fitBAlt = glm ( I(success/ntrials) ~ X1 + X2, weights=ntrials,
                  family=binomial, data=simlogis )
> sum(abs(fitBAlt$fit - fitB0$fit))
[1] 0
> sum(abs(c(summary(fitBAlt)$coef - summary(fitB0)$coef)))
[1] 0
### OFTEN THE OUTCOME OF USING "weights" ARGUMENTS IN MODEL-FITTING
### FUNCTIONS IN R IS MUCH LESS TRANSPARENT !!

-------------------------------------------------
Following log "GLMlog.R", get data "Data/pbcdata.asc"

> pbcdat = read.table("http://www.math.umd.edu/~evs/s705/Data/pbcdata.asc",
                      header=T)
> names(pbcdat)
 [1] "OBS"     "IDNUM"   "DTH"     "EVTTIME" "TREATGP" "LOGBILI" "AGEVAR"
 [8] "CIRRH"   "CCHOL"   "ALBUMIN"
> ind = pbcdat$DTH==1 | pbcdat$EVTTIME >= 41
Surv41 = as.numeric(pbcdat$EVTTIME < 41)
pbcdat = cbind.data.frame(pbcdat[,3:10], Surv41)[ind,]     ## 172 x 9
> tmpglm = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR,
               data=pbcdat, family = binomial)

### Simulation to introduce "dispersion" in the form of a shared random
### effect within cluster
reff  = runif(172,-2,2)
yrsp1 = rbinom(172, rep(10,172), plogis(tmpglm$lin + reff))
#### 172 y's like the binary outcomes in PBCdata except that they now
##   arise from clusters of 10 with shared "reff".
##   Think of yrsp1[i]/10 = Ybar_{i.} as the "data" for the ith cluster.

> mean(dlogis(tmpglm$lin + reff))   ### E( p_i(1-p_i) ) = 10*E( Var Ybar_{i.} | reff )
[1] 0.132752
> mean((10/9)*.1*yrsp1*(1-0.1*yrsp1))   ### unbiased estimator of E( p_i(1-p_i) ),
[1] 0.1264858                           ### averaged across i.
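As a side check (an added sketch, not from the class log), we can approximate by
Monte Carlo how much the shared random effect inflates the variance of Ybar_{i.}
relative to its binomial value p_i(1-p_i)/10.  The seed and replication count
below are arbitrary choices, and the resulting ratio is only roughly comparable
to the quasibinomial dispersion estimate obtained just below (about 2.5), since
the glm fits there re-estimate the mean.

## Added sketch: approximate overdispersion factor induced by the shared
## random effect (assumes tmpglm from above is still in the workspace)
set.seed(77)
linpred  = tmpglm$lin
Ybar.sim = array(0, c(2000,172))
for(r in 1:2000)
    Ybar.sim[r,] = rbinom(172, 10, plogis(linpred + runif(172,-2,2)))/10
p.marg = apply(Ybar.sim, 2, mean)
mean( apply(Ybar.sim, 2, var) / (p.marg*(1-p.marg)/10) )
### should fall in the general neighborhood of the dispersion estimated below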
glmfitA = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR,
              data=pbcdat, family=binomial)
glmfitB = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR,
              data=pbcdat, family=quasibinomial)
sum(abs(glmfitA$coef-glmfitB$coef))    ### 0
> c(summary(glmfitB)$disp, summary(glmfitA)$disp)
[1] 2.494836 1.000000
> sum(10*(yrsp1/10-glmfitB$fit)^2/(glmfitB$fit*(1-glmfitB$fit)))/(172-4)
[1] 2.494836

=======================================================================
11/11/2015

Small Initial Demonstration of Smoothing Spline
===============================================
> xv = runif(500, 0,2)
fxv = sqrt(plogis(xv + 2.5*xv - 5*xv^2 + 2*xv^3))
yv = fxv + 0.03*rnorm(500)
ordx = order(xv)
plot(xv[ordx], yv[ordx])
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
tmpspl1 = smooth.spline(xv,yv)
lines(xv[ordx], predict(tmpspl1,xv[ordx])$y, col="blue", lwd=3)
> tmpspl1$spar
[1] 0.7934125
> tmpspl2 = smooth.spline(xv,yv, spar=0.01)
lines(xv[ordx], predict(tmpspl2,xv[ordx])$y, col="green", lwd=3)

plot(xv[ordx], yv[ordx])
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
tmpspl3 = smooth.spline(xv,yv, spar=1)
lines(xv[ordx], predict(tmpspl3,xv[ordx])$y, lwd=3)

yv2 = fxv + 0.001*rnorm(500)
plot(xv[ordx], yv2[ordx])
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
tmpspl4 = smooth.spline(xv,yv)
lines(xv[ordx], predict(tmpspl4,xv[ordx])$y, col="blue", lwd=2)
summary(fxv - predict(tmpspl4,xv)$y)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-6.230e-03 -3.092e-03 -5.708e-05 -7.092e-04  1.010e-03  6.539e-03

>>>>> Continue with other smoothing and nonparametric regression techniques

nprgfram = cbind.data.frame(x=xv, y=yv)
> plot(nprgfram$x, nprgfram$y, main="Scatterplot of Raw Data, n=500")
for(i in 1:8) abline(v=i*.2, lty=3)
for(i in 1:9) {
    inds <- (1:500)[abs(nprgfram$x-.2*i+.1) < 0.1]
    itmp <- order(nprgfram$x[inds])
    lines(nprgfram$x[inds[itmp]], lm(y ~ x, data=nprgfram[inds,])$fit[itmp],
          lty=1, lwd=2) }
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)

### Now consider a lowess fit:
> plot(nprgfram$x, nprgfram$y, main="Raw Data with Lowess Line, n=500")
tmplow = lowess(nprgfram$x, nprgfram$y)
lines(tmplow$x, tmplow$y, lty=1, lwd=2)
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)

### Local polynomial regression, from library KernSmooth:
###   locpoly(x, y, drv = 0, degree, kernel = "normal", bandwidth,
###           gridsize = 401L, bwdisc = 25, range.x, binned = FALSE,
###           truncate = TRUE)
library(KernSmooth)
tmppoly = locpoly(xv, yv, degree=3, bandwidth=0.1)
plot(tmppoly)
points(nprgfram$x, nprgfram$y)
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
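The bandwidth 0.1 above was chosen by eye.  As a small added sketch (not part of
the class log), KernSmooth also provides a direct plug-in bandwidth selector,
dpill(), for the local-linear (degree=1) case, and the resulting fit can be
overlaid for comparison:

## Added sketch: data-driven plug-in bandwidth, local-LINEAR fit
h.pi = dpill(xv, yv)
h.pi                                    ### the selected bandwidth
tmppoly1 = locpoly(xv, yv, degree=1, bandwidth=h.pi)
lines(tmppoly1, col="purple", lwd=2)    ### overlay on the locpoly plot above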
### DISCUSSION:  USE OF SPLINES FOR NUMERICAL APPROXIMATION

11/18/15
## Consider the problem of sampling from a specified distribution,
## by the probability integral transform.
## Let's do this with t_5.
xv = c((1:500)/10, 10 + (1:20)/2)
xv = c(-rev(xv), 0, xv)
> pt(xv[1],5)
[1] 2.887758e-06
> yv = pt(xv,5)
curve(log(pt(x,5)), -5,5)
tmpy = smooth.spline(log(yv), xv)
plot((1:999)/1000, predict(tmpy, log((1:999)/1000))$y, typ="l")
curve(qt(x,5), .001,.999, add=T, col="red")

### Now to simulate 100,000 of them using the spline-fitted function.
### Note that the evaluation of the fitted function is quick:
unix.time({rant = predict(tmpy, log(runif(1e5)))$y})
   user  system elapsed
   0.03    0.00    0.03
hist(rant, prob=T, nclass=128, xlim=c(-6,6))
curve(dt(x,5), -6,6, add=T, col="green")
### I will ask you to do this in a more difficult case, for an exercise.

### SPLINE BASIS EXAMPLE
## Start with a fairly complicated curve
mixran = function(x, pts, seednum) {
     set.seed(seednum)
     coef = 10*(runif(length(pts))-0.5)
     structure(c(outer(x, pts, function(x,y) dnorm(x-y)) %*% coef),
               seed=seednum, coef=coef) }
xv = seq(-5,5, by=0.1)
mixran(xv[1:5], c(-1,0.5,2), 333)
[1] -4.461042e-05 -6.632645e-05 -9.765776e-05 -1.424013e-04 -2.056501e-04
attr(,"seed")
[1] 333
attr(,"coef")
[1] -0.3299934 -4.1540185  4.7348527

plot(xv, mixran(xv, c(-1,0.5,2), 333))
yv3 = mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333) + rnorm(101,0,0.8)
> library(splines)
library(KernSmooth)
mat3 = bs(xv, knots=seq(-4,4,by=2), degree=3)    ### dim = length(knots)+degree
dim(mat3)     ### 101 x 8
fity = lm(yv3 ~ . , data=cbind.data.frame(yv3,mat3))
plot(xv, yv3)
lines(xv, mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333), col="red")
points(xv, fity$fit, pch=5)
### What if we took sparser knots ?
mat4 = bs(xv, knots=c(-3,0,3), degree=3)
points(xv, lm(yv3 ~ . , data=cbind.data.frame(yv3,mat4))$fit, pch=18)

==================================================================
Illustration of Data Replication Methods
========================================

11/23/2015   DATA-SPLITTING, PARAMETRIC BOOTSTRAP, & BAYES POSTERIOR
             PREDICTIVE SIMULATION

## Data illustration
set.seed(5)
x = runif(200)
y = 3 - 7*x + 10*(0.1+abs(x-.5))^1.5*rt(200,10)
rtwt = 1/(0.1+abs(x-.5))^1.5
Dset = cbind.data.frame(y, x, wt=rtwt^2)
> summary(rtwt^2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  4.760   9.806  22.580  91.930  90.610 761.900
x1st = rtwt
x2st = x*rtwt
yst = y*rtwt
Dset2 = cbind.data.frame(yst, x1st, x2st)
AuxY = array(0, c(1000,200))

> tmp1 = lm(y~x, data=Dset[1:100,], weights=wt)
## NOTE: the S3 method of predict() for class 'lm' has usage
##   predict(object, newdata, se.fit = FALSE, scale = NULL, df = Inf,
##           interval = c("none", "confidence", "prediction"), level = 0.95,
##           type = c("response", "terms"), terms = NULL, na.action = na.pass,
##           pred.var = res.var/weights, weights = 1, ...)
tmp2 = predict(tmp1, Dset[1:100,], interval="none", type="response",
               weights=Dset$wt[1:100])
> summary(tmp2-tmp1$fit)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
0.000e+00 8.882e-16 1.332e-15 1.578e-15 1.776e-15 2.642e-14

NOTE THAT IN ALL OF THE COMPUTATIONAL METHODS STUDIED IN THIS EXTENDED
EXAMPLE, WE COMPUTE THE (unweighted) RMSE FOR Y-PREDICTIONS BUT DO THE
MODEL-FITTING BY A WEIGHTED LEAST-SQUARES METHOD.
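An added aside (a sketch, not in the class log): the rescaled data frame Dset2
built above is the "pre-whitened" form of this regression, so weighted least
squares of y on x with weights rtwt^2 is exactly ordinary (unweighted) least
squares of yst on x1st and x2st with no intercept.  A quick check, assuming
tmp1 and Dset2 as defined above:

## Added sketch: WLS with weights rtwt^2  ==  OLS on the rescaled variables
tmp1.alt = lm(yst ~ x1st + x2st - 1, data=Dset2[1:100,])
sum(abs(coef(tmp1.alt) - coef(tmp1)))    ### essentially 0, up to rounding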
Data-splitting method:  50-50 splits for RMSE
---------------------
TstDat = TrnDat = array(0, c(1000,100))
TrnMSE = SplMSE = numeric(1000)
for(i in 1:1000) {
    ind = sample(1:200, 100)
    TrnDat[i,] = setdiff(1:200, ind)
    tmp = lm(y ~ x, weights=wt, data=Dset[TrnDat[i,],])
    TrnMSE[i] = mean(tmp$resid^2)
    SplMSE[i] = mean((Dset$y[ind] - predict(tmp, Dset[ind,], interval="none",
                       type="response", weights=Dset$wt[ind]))^2) }
> summary(sqrt(SplMSE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.898   2.421   2.559   2.552   2.694   3.137
> summary(sqrt(TrnMSE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.809   2.371   2.514   2.508   2.652   3.034

IT IS NO SURPRISE THAT THE TrnMSE VALUES ARE SMALLER THAN THE TEST-DATA
SplMSE VALUES, ALTHOUGH HERE THE DIFFERENCE IS SMALL.  AS WE REMARKED IN
CLASS, THIS METHOD OF PSEUDO-SAMPLE REPLICATION GENERATES X'S AND Y'S
TOGETHER.

"Parametric Bootstrap"
----------------------
## generate residuals anew from N(0, sighat)
## "predictors" kept from tmp1 with
xvec = x[1:100]
wts  = Dset$wt[1:100]
AuxY = tmp1$fitted + array(rnorm(100*1000, 0,
           rep((0.1+abs(xvec-0.5))^1.5*summary(tmp1)$sig, 100)), c(100,1000))
### Now can look at the MSEs from re-fitting the model 1000 times
MSEboot = numeric(1000)
for(i in 1:1000) MSEboot[i] = mean(lm(AuxY[,i] ~ xvec, weights=wts)$resid^2)
> summary(sqrt(MSEboot))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.651   2.218   2.361   2.364   2.502   3.107

THIS FIRST WAY OF DOING THE PARAMETRIC BOOTSTRAP GENERATES ONLY Y'S FOR A
FIXED SET OF X'S.  YOU COULD SAY THAT WHAT WE GENERATE IN THIS WAY IS A SET
OF "CONDITIONAL" PSEUDO-SAMPLES.  THAT IS, THE VARIABILITY REPLICATED HERE
IS STRICTLY THAT DUE TO THE CONDITIONAL DENSITY OF Y GIVEN X.

NEXT LET'S DO A SLIGHTLY MORE ELABORATE PARAMETRIC-BOOTSTRAP SIMULATION IN
WHICH X'S AND Y'S ARE RE-GENERATED.  BUT I PROPOSE, AS I MENTIONED IN CLASS,
A SIMULATION DESIGN IN WHICH WE CAN SEPARATE THE EFFECTS ON MSE OF THE
ESTIMATOR OF DIFFERENT SETS OF X'S FROM THE CONDITIONAL SAMPLING VARIABILITY
OF Y'S GIVEN FIXED X'S.

We choose to simulate B=100 different X-vectors X^{(b)}, and then for each
one, R=1000 independently generated Y-vectors Y^{(b,r)} from the conditional
distribution of Y given X = X^{(b)}.

Xarr = array(runif(1e4), c(100,100))     ### index (b,i)
Yarr = array(0, c(1000,100,100))
## indices (r,b,i):  r indexes the y-vector replicate within fixed X^{(b)},
## and i=1:100 indexes data points within each batch of n=100.
abhat  = tmp1$coef
sighat = summary(tmp1)$sig
epsarr = array(0, c(100,1000))
Wtarr = (0.1+abs(Xarr-0.5))^(-1.5)
Wtarr = (1/apply(Wtarr,1,sum))*Wtarr
for(b in 1:100) {
    epsarr[,] = rnorm(1e5)
    Yarr[,b,] = t( (abhat[1] - abhat[2]*Xarr[b,]) +
                   (sighat*(0.1+abs(Xarr[b,]-0.5))^1.5)*epsarr ) }

## Now we have the large arrays of Y's and X's, and we need to vectorize the
## weighted least squares estimates in order to make the computations quick.
Ypred = replace(Yarr, T, 0)      ### array like Yarr filled with 0's
Xmn = apply(Wtarr*Xarr, 1, sum)
XctrArr = -Xmn + Xarr
SXXsq = apply(Wtarr*XctrArr^2, 1, sum)
auxmat = array(outer(rep(1,1000), Wtarr*XctrArr)*Yarr, c(1000*100,100))
b2hatarr = array( c( auxmat %*% rep(1,100) )/
                  rep(SXXsq, rep(1000,100)), c(1000,100))    ### 1000 x 100  (r,b)
b1hatarr = apply(outer(rep(1,1000),Wtarr)*Yarr, 1:2, sum) - t( Xmn*t(b2hatarr) )
for(b in 1:100) Ypred[,b,] = b1hatarr[,b] + outer(b2hatarr[,b], Xarr[b,])
MSEarr  = apply( (Yarr-Ypred)^2, 1:2, mean)     ### 1000 x 100  (r,b)
Biasarr = apply( Ypred - Yarr,   1:2, mean)
> summary(c(Biasarr))
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-0.6123000 -0.0921800 -0.0005000 -0.0004574  0.0912800  0.5894000
### Biases are small but not 0.  They would be 0 if the averages of
### prediction errors within sample were taken weighted by Wtarr.
### All of this was quick.
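A sketch added here (not in the class log) to spot-check the vectorized weighted
least squares above against a direct lm() call for a single replicate; the
weights mirror Wtarr, i.e. they are proportional to (0.1+abs(x-0.5))^(-1.5).

## Added sketch: check the vectorized WLS coefficients for replicate (r,b)=(1,1)
r = 1;  b = 1
chk = lm(Yarr[r,b,] ~ Xarr[b,], weights = (0.1+abs(Xarr[b,]-0.5))^(-1.5))
rbind(vectorized = c(b1hatarr[r,b], b2hatarr[r,b]), lm.fit = coef(chk))
### the two rows should agree up to rounding error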
Now we have an array of MSEs for all (r,b), and we break the MSE into 3
parts, each of which we estimate via simulation:

###  MSE( Ypred_i^{(r,b)} )  =  [ E( Y_i^{(r,b)} - Ypred_i^{(r,b)} ) ]^2
###        + E( Var( Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.] ) )
###        + Var( E( Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.] ) )

## First term is bias^2, which we estimate from Biasarr^2:
> mean(c(Biasarr^2))
[1] 0.01865613
> mean( apply((Yarr - Ypred)^2, 2, mean) )
[1] 5.081514
> sd( apply((Yarr - Ypred)^2, 2, mean) )
[1] 0.5081083
> var( c(Yarr-Ypred))
[1] 5.081514

DID WE LEARN ANYTHING EXTRA BY DOING THE SIMULATION BY BLOCKS b ?  WE FOUND
OUT HOW VARIABLE OUR OVERALL ESTIMATED ANSWER WAS, DUE TO SAMPLING VARIATION
FROM ONE SET OF X'S TO ANOTHER.

HOW DID THIS SIMULATION COMPARE WITH THE DATA-SPLITTING MSE ?  THE RMSE WE
FOUND THERE WAS 2.508, WHILE WE JUST FOUND THE PARAMETRIC-BOOTSTRAP RMSE
TO BE
> sqrt(5.081514)
[1] 2.254221

THE RMSE FOUND FROM THE PARAMETRIC BOOTSTRAP WITH A SINGLE FIXED Xvec WAS
2.364, but since that was for just a single Xvec, and we now know from the
two-level (r and b) simulation that the variance of the MSE estimator was
(0.5081)^2, these different parametric-bootstrap estimators (one with fixed
X's, the other with varying X's) are within range of one another.

NOTE HOWEVER THAT WE MAY ALSO GUESS THAT THE DATA-SPLITTING MSE IS LARGER
BECAUSE IT TAKES INTO ACCOUNT THE ACTUAL DATA (200 DATA-PAIRS), WHICH DID
NOT PRECISELY SATISFY THE MODEL ASSUMED BY THE PARAMETRIC BOOTSTRAP.

BAYES posterior predictive sampling
-----------------------------------
We continue with the same model as above, but define a standard conjugate
prior for the parameters theta = (b1, b2, sigma^2) within a normal-errors
model for the data with known weight-pattern (i.e. the standard deviation of
y given x is known to be proportional to (0.1+abs(x-.5))^1.5).  We will
continue with that model as we did for the parametric bootstrap, although
recall that it is slightly misspecified because the errors were actually
t_{10} distributed rather than normal.

### We want to define a prior conjugate for the model
###     y ~ N( b1 + b2*x, (sig*(0.1+abs(x-0.5))^1.5)^2 )
## For this, could take b1, b2 ~ indep N(0, sig0^2), sig0 large and known,
## say 5, and 1/sig^2 ~ Gamma(1, lam0) for small lam0, say 0.1.
## Then with tau = 1/sig^2 and weights w_i = (0.1+abs(x_i-0.5))^(-3), the
## posterior is proportional to
##     tau^(n/2) * exp( -0.02*(b1^2+b2^2) - lam0*tau
##                      - 0.5*tau*sum_i w_i*(y_i - b1 - b2*x_i)^2 )
## This is conjugate (posterior of the same form as prior) for (b1,b2)
## if sig had been known and fixed, and for tau or sig^2 if b1, b2 were
## known and fixed.  But jointly, it is not conjugate.
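Although the joint posterior is not conjugate, its kernel is simple to write
down.  Here is an added sketch (not in the class log) of the unnormalized
log-posterior implied by the prior above, with sig0 = 5 and lam0 = 0.1 and the
known weights wt = rtwt^2 for the first 100 observations; it could be handed to
an optimizer or a sampler.

## Added sketch: unnormalized log-posterior of (b1, b2, tau),  tau = 1/sig^2
logpost = function(b1, b2, tau, y=Dset$y[1:100], x=Dset$x[1:100],
                   w=Dset$wt[1:100], sig0=5, lam0=0.1)
    (length(y)/2)*log(tau) - (b1^2+b2^2)/(2*sig0^2) - lam0*tau -
        0.5*tau*sum(w*(y - b1 - b2*x)^2)
### e.g.  logpost(tmp1$coef[1], tmp1$coef[2], 1/summary(tmp1)$sig^2)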
NOTE:  there is a Theorem (Bernstein-von Mises) which says that in large iid
samples the posterior is approximately normal, centered at the MLE and with
variance (like the MLE!) equal to the inverse of the Fisher Information.

Here the Fisher Information from the fitted model is obtained as
> Infmat = array(c(sum(Dset$wt), sum(Dset$wt*Dset$x), 0,
                   sum(Dset$wt*Dset$x), sum(Dset$wt*Dset$x^2), 0,
                   0, 0, nrow(Dset)/sighat^2)/sighat^2, c(3,3))
> round(solve(Infmat), 4)
        [,1]    [,2]    [,3]
[1,]  0.0840 -0.1613  0.0000
[2,] -0.1613  0.3267  0.0000
[3,]  0.0000  0.0000 32.0117

##  mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE)
## Approximate Bayesian idea for simulating new-data pseudo-samples:
##  (a) draw multivariate normal
##        (b1H, b2H, sigH^2) ~ N( (b1hat, b2hat, sighat^2), solve(Infmat) )
##  (b) generate x ~ Unif(0,1) and y given x as
##        N( b1H + b2H*x, {sigH*(0.1+abs(x-0.5))^1.5}^2 )
## HOWEVER: use the Delta method on the transformed parameter nu = log(sig^2)
## to force the simulated variance values to be positive !

NB.  We can get an idea of the degree of (NON-)normality of the sighat
distribution by using the array of Yarr - Ypred = Yresid values to form a
1000 x 100 (r,b) array of parametric-bootstrap Monte Carlo simulated
sighat^2 values, as follows:

> library(MASS)
###  mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE)
> dim(Yarr)
[1] 1000  100  100
> sghatsqArr = apply( (Yarr-Ypred)^2, 1:2, mean)
> dim(sghatsqArr)
[1] 1000  100
> hist( sghatsqArr[,2], nclass=50, prob=T)
> hist( sghatsqArr, nclass=50, prob=T)
> curve(dnorm(x, mean(sghatsqArr), sd(sghatsqArr)), 2,12, add=T, col="green")
#### definitely off !!!
### Try to overplot a gamma with matching 1st two moments instead
> beta = mean(c(sghatsqArr))/var(c(sghatsqArr))
alph = beta*mean(c(sghatsqArr))
curve(dgamma(x, alph, beta), 2,12, add=T, col="blue")

##----------------------------------------------------------
> c(coef = tmp1$coef, sigsq = summary(tmp1)$sig^2)
coef.(Intercept)           coef.x            sigsq
        2.282569        -5.560408        80.014571
### "True" b1, b2 values from the simulation were 3, -7.
### When sig^2 = 80, obtain the "theoretical" MSE as the expectation
###     E[ 80*(t_{10})^2*(0.1+abs(U-0.5))^3 ] = 80*1.25*0.06475 = 6.475
### derived from
> integrate(function(x) x^2*dt(x,10), -Inf,Inf)$val
[1] 1.25
> integrate(function(x) (0.1+abs(x-0.5))^3, 0,1)$val
[1] 0.06475

##>>>===========================================   Continuation 11/30/2015

USE THE PRECEDING NON-NORMALITY OF sighatsq TO ARGUE THAT WE SHOULD NOT
SIMPLY USE THE BERNSTEIN-VON MISES THEOREM TO SAMPLE FROM A MULTIVARIATE
NORMAL AS A SHORTCUT FOR THE BAYESIAN POSTERIOR, BUT RATHER APPROXIMATE THE
POSTERIOR BY A MULTIVARIATE NORMAL IN (b1,b2) ALONG WITH AN INDEPENDENT
GAMMA (motivated by the form of the information matrix) FOR SIGSQ.

> rhovec = numeric(100)
> for(i in 1:100) rhovec[i] = cor(b1hatarr[,i], b2hatarr[,i])
> hist(rhovec)
> summary(rhovec)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
 -0.9807  -0.9716  -0.9673  -0.9662  -0.9617  -0.9464
#### STRONG NEGATIVE CORRELATION BETWEEN B1, B2
> var(cbind(c(b1hatarr), c(b2hatarr)))
           [,1]       [,2]
[1,]  0.2207807 -0.4124677
[2,] -0.4124677  0.8253712
### corr = -0.966
> c(alpha=alph, beta=beta)     ### gamma parameters for sigsq
    alpha      beta
20.538710  4.041849
### mean here is 5.0815, while
> mean((Dset$y[1:100]-tmp1$fit)^2)
[1] 6.153456
#### Again, the actual data show greater variance, which is partly due to
#### the true t_10 errors.
#### Using Infmat^{-1} above for the variance would suggest sd's
> round(sqrt(diag(solve(Infmat))), 4)
[1] 0.2898 0.5716 5.6579
### and b1hat, b2hat correlation
> aux = solve(Infmat);  aux[1,2]/sqrt(aux[1,1]*aux[2,2])
[1] -0.9737398
### NOTE that sampling from this posterior would also suffer from
### the same parametric-model misspecification ?!

## We would be sampling from the neighborhood
##     b2 in -5.560 +/- 1.96*0.572,
##     b1 approx. = ybar - b2*xbar,
##     sigsq approx. Gamma(17.68, 3.04)
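Here is one concrete way to carry out the posterior-predictive recipe just
described; this is an added sketch with assumptions, not the instructor's code.
It draws (b1,b2) from the bivariate normal with the (b1,b2) block of
solve(Infmat) as its covariance, draws the variance parameter sig^2 from a
Gamma whose first two moments match the information-based approximation
(mean sighat^2, variance solve(Infmat)[3,3]) so that simulated variances stay
positive, and then generates new (x,y) data and refits.  The moment-matching
choice is for illustration only; its parameterization differs from the
Gamma(17.68, 3.04) quoted just above.

## Added sketch: approximate Bayes posterior-predictive pseudo-samples and MSEs
library(MASS)
Vb      = solve(Infmat)
sigsq.h = summary(tmp1)$sig^2
rate.s  = sigsq.h / Vb[3,3]             ### gamma rate  = mean/variance
shape.s = rate.s * sigsq.h              ### gamma shape = rate*mean
MSE.bayes = numeric(1000)
for(r in 1:1000) {
    bH   = mvrnorm(1, mu=tmp1$coef, Sigma=Vb[1:2,1:2])
    sigH = sqrt(rgamma(1, shape=shape.s, rate=rate.s))
    xnew = runif(100)
    ynew = bH[1] + bH[2]*xnew + sigH*(0.1+abs(xnew-0.5))^1.5*rnorm(100)
    MSE.bayes[r] = mean(lm(ynew ~ xnew,
                           weights=(0.1+abs(xnew-0.5))^(-3))$resid^2) }
### summary(sqrt(MSE.bayes)) is then comparable with the data-splitting and
### parametric-bootstrap RMSEs found earlier.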
##>=====================================================

NONPARAMETRIC BOOTSTRAP      11/30/2015

IS THERE AN IDEA THAT LETS US LEARN ABOUT THE PERFORMANCE OF THIS KIND OF
DATA ANALYSIS WITHOUT ASSUMING WE KNOW THE CORRECT (X,Y) MODEL ?
Yes, the NONPARAMETRIC BOOTSTRAP !

## (I)  Bootstrap resampling of the iid pairs (X_i, Y_i)
## Assume the only data we have is Dset[1:100,]  (including weights).
indArr = array(sample(1:100, 1e5, replace=T), c(1000,100))
#### corrected the next statements to reflect weighted least-squares
auxXB = array(Dset$x[c(indArr)], c(1000,100))
auxYB = array(Dset$y[c(indArr)], c(1000,100))
Ones = rep(1/100, 100)
wtB = (0.1+abs(auxXB-0.5))^(-3)
wtB = (1/apply(wtB,1,sum))*wtB
avXB = apply(wtB*auxXB, 1, sum)
avYB = apply(wtB*auxYB, 1, sum)
b2hatB = c( ((-avXB + auxXB)*wtB*auxYB) %*% Ones /
            (wtB*(-avXB + auxXB)^2) %*% Ones )
b1hatB = avYB - b2hatB*avXB
MSEB = c( (b1hatB + b2hatB*auxXB - auxYB)^2 %*% Ones )
tmp = hist(MSEB, nclass=30, prob=T)
library(KernSmooth)
lines(locpoly(tmp$mids, tmp$density, degree=3, bandw = 0.8), col="blue")
curve(dnorm(x, mean(MSEB), sd(MSEB)), 2,20, add=T, col="red")
### so the MSEs are skew-distributed, not quite normal
#### Picture saved as "NPBootMSEpic.pdf"
> c(mean = mean(MSEB), sd = sd(MSEB))
    mean       sd
6.072114 1.417756
### MSE quite a bit larger than the ~ 5 suggested by the parametric
### bootstrap, and SD larger too.
### NOTE that this way of bootstrapping is letting the residual
#### t-distribution express itself better !!

tmp2 = hist(b2hatB, nclass=30, prob=T)
lines(locpoly(tmp2$mids, tmp2$density, deg=3, bandw = 0.8), col="blue")
curve(dnorm(x, mean(b2hatB), sd(b2hatB)), -12,-2, add=T, col="red")
### much closer to normal, but still not quite there

## (II)  Bootstrap resampling of residuals*rtwt for fixed X_i
## recall xvec is Dset$x[1:100],  rtwt[1:100] = 1/(0.1+abs(xvec-0.5))^1.5
## use indArr above to resample residuals and then re-create Y's
epsBArr = array((tmp1$resid*rtwt[1:100])[c(indArr)], c(1000,100))
YNPboot = tmp1$fitted + (1/rtwt[1:100])*epsBArr
wgts = Dset$wt[1:100]/sum(Dset$wt[1:100])
avYB2 = c(YNPboot %*% wgts)
xvav = sum(xvec*wgts)
b2hatB2 = c( (outer(rep(1,1000), xvec-xvav)*YNPboot) %*% wgts ) /
          sum((xvec-xvav)^2*wgts)
b1hatB2 = avYB2 - b2hatB2*xvav
MSEB2 = c( (b1hatB2 + outer(b2hatB2,xvec) - YNPboot)^2 %*% Ones )
> c(mean(MSEB2), sd(MSEB2))
[1] 5.814071 5.229153       #### Note smaller MSE when X is fixed.
### THIS WAY OF BOOTSTRAPPING IMPOSES THE STRUCTURE OF INDEPENDENCE of X and
### the epsilon's, but the rescaling of tmp1$resid by rtwt seems to make the
### variability of the MSE estimates much larger, because of the large range
### of weights.

## (III)  Could separately and independently bootstrap X and residuals
auxXB3 = array(sample(xvec, 1e5, replace=T), c(1000,100))
YNPboot3 = tmp1$coef[1] + tmp1$coef[2]*auxXB3 + (1/rtwt[1:100])*epsBArr
wtB3 = (0.1+abs(auxXB3-0.5))^(-3)
wtB3 = (1/apply(wtB3,1,sum))*wtB3
avYB3 = apply(wtB3*YNPboot3, 1, sum)
avXB3 = apply(wtB3*auxXB3, 1, sum)
b2hatB3 = c( ((-avXB3 + auxXB3)*wtB3*YNPboot3) %*% Ones /
             (wtB3*(-avXB3 + auxXB3)^2) %*% Ones )
b1hatB3 = avYB3 - b2hatB3*avXB3
MSEB3 = c( (b1hatB3 + b2hatB3*auxXB3 - YNPboot3)^2 %*% Ones )
> c(mean(MSEB3), sd(MSEB3))
[1] 5.826440 5.249827
### VERY SIMILAR RESULTS TO THOSE IN (II) ABOVE.
#### The average MSE stays small when we incorporate the independence of the
#### X's and the epsilon's !
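The case-resampling scheme in (I) can also be run through the standard boot
package (shipped with R); this added sketch (not in the class log) computes the
same unweighted-MSE statistic for each resample.

## Added sketch: case-resampling bootstrap of the MSE via the boot package
library(boot)
mse.stat = function(dat, ind) {
    fit = lm(y ~ x, data=dat[ind,], weights=wt)
    mean(fit$resid^2) }                  ### unweighted MSE of the weighted-LS fit
bootMSE = boot(Dset[1:100,], mse.stat, R=1000)
c(mean(bootMSE$t), sd(bootMSE$t))        ### should be close to (mean, sd) of MSEB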
=============================================================
12/2

WE FINISH THIS EXAMPLE WITH WEIGHTED LEAST-SQUARES (WITH t_{10} ERRORS) BY
ASKING AND ANSWERING:  WHAT IS THE "RIGHT" ANSWER FOR THE AVERAGE AND SD OF
THE UNWEIGHTED MSE FOR Y-PREDICTORS BASED ON DATA-SAMPLES OF SIZE 100 WITH
KNOWN VARIANCE-WEIGHTING (AS A FUNCTION OF X_i), for the kind of data (with
t_{10}, not N(0,1), errors) we were actually working with ???

(a)  Conditional, for fixed X's at xvec

AuxY.MC = 3 - 7*xvec + array(rep(10*(0.1+abs(xvec-0.5))^1.5, 1000)*
                             rt(100*1000,10), c(100,1000))
### Now can look at the MSEs from re-fitting the model 1000 times
MSE.MC = numeric(1000)
for(i in 1:1000) MSE.MC[i] =
     mean(lm(AuxY.MC[,i] ~ xvec, weights=Dset$wt[1:100])$resid^2)
> summary(sqrt(MSE.MC))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.973   2.711   2.908   2.936   3.145   4.044
> c(mean=mean(MSE.MC), sd = sd(MSE.MC))
    mean       sd
8.728081 1.977124

(b)  UNconditional, with varying (iid) X's

AuxX.MC2 = array(runif(1e5), c(100,1000))
RtWt.MC2 = (0.1+abs(AuxX.MC2-0.5))^1.5
AuxY.MC2 = 3 - 7*AuxX.MC2 + 10*RtWt.MC2*array(rt(100*1000,10), c(100,1000))
MSE.MC2 = numeric(1000)
for(i in 1:1000) MSE.MC2[i] =
     mean(lm(AuxY.MC2[,i] ~ AuxX.MC2[,i], weights=RtWt.MC2[,i]^(-2))$resid^2)
> summary(sqrt(MSE.MC2))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.841   2.546   2.775   2.805   3.036   4.497
> c(mean=mean(MSE.MC2), sd = sd(MSE.MC2))
    mean       sd
7.998849 2.091280
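To connect these Monte Carlo answers back to the true generating model, here is
a closing check (an added sketch, not in the class log).  Under
y = 3 - 7*x + 10*(0.1+abs(x-0.5))^1.5 * t_10 with x ~ Unif(0,1), the
unconditional error variance, which the unweighted residual MSE estimates up to
a factor of about (n-2)/n, is

## Added check: theoretical unconditional error variance under the true model
100 * integrate(function(x) x^2*dt(x,10), -Inf,Inf)$val *
      integrate(function(x) (0.1+abs(x-0.5))^3, 0,1)$val
### = 100*1.25*0.06475 = 8.094,  in reasonable agreement with the Monte Carlo
### means 8.73 (for that particular fixed xvec) and 8.00 (freshly drawn X's).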