Example Logs, STAT 705, Fall 2015 9/2/2015 > tmp = sin(0.354 + 2*pi*(1:500)/10) > mat = array(tmp,c(10,50)) > apply(mat,1,var) [1] 5.927550e-29 4.168184e-31 9.749828e-29 3.043804e-28 2.945958e-28 [6] 1.140571e-28 4.980691e-31 9.790227e-29 1.896126e-28 2.672093e-28 > tmp1 = rep(c(1,3),500) > tmp2 = c(rbind(rep(1,500),rep(3,500))) > sum(abs(tmp1-tmp2)) [1] 0 > tmp3 = replace(rep(1,1000), 1:1000 %% 2 == 0, 3) > sum(abs(tmp3-tmp1)) [1] 0 > tmp4 = ifelse(1:1000 %% 2 == 0, 3,1) > sum(abs(tmp4-tmp1)) [1] 0 > toss = rbinom(10,1,.5) > cond = as.logical(toss) > cond [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE > replace(1:10,cond,101:105) [1] 1 2 101 4 102 103 104 105 9 10 > ifelse(cond,101:105,1:10) [1] 1 2 103 4 105 101 102 103 9 10 > sum(abs( (1:10)[toss==1] - which(cond) )) [1] 0 toss + array(rep(1:10,5), c(10,5)) [,1] [,2] [,3] [,4] [,5] [1,] 1 1 1 1 1 [2,] 2 2 2 2 2 [3,] 4 4 4 4 4 [4,] 4 4 4 4 4 [5,] 6 6 6 6 6 [6,] 7 7 7 7 7 [7,] 8 8 8 8 8 [8,] 9 9 9 9 9 [9,] 9 9 9 9 9 [10,] 10 10 10 10 10 outer(toss+1:10,rep(1,5),"*") [,1] [,2] [,3] [,4] [,5] [1,] 1 1 1 1 1 [2,] 2 2 2 2 2 [3,] 4 4 4 4 4 [4,] 4 4 4 4 4 [5,] 6 6 6 6 6 [6,] 7 7 7 7 7 [7,] 8 8 8 8 8 [8,] 9 9 9 9 9 [9,] 9 9 9 9 9 [10,] 10 10 10 10 10 > attributes(mat) $dim [1] 10 50 > fr = data.frame(mat) > attributes(fr) $names [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" [13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24" [25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36" [37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" [49] "X49" "X50" $row.names [1] 1 2 3 4 5 6 7 8 9 10 $class [1] "data.frame" > class(mat) [1] "matrix" > length(mat) [1] 500 > length(fr) [1] 50 > outer( 1:5, 4:6, "*") [,1] [,2] [,3] [1,] 4 5 6 [2,] 8 10 12 [3,] 12 15 18 [4,] 16 20 24 [5,] 20 25 30 > tmp = outer(1:6, 1:6, function(x,y) x<= y) tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] TRUE TRUE TRUE TRUE TRUE TRUE [2,] FALSE TRUE TRUE TRUE TRUE TRUE [3,] FALSE FALSE TRUE TRUE TRUE TRUE [4,] FALSE FALSE FALSE TRUE TRUE TRUE [5,] FALSE FALSE FALSE FALSE TRUE TRUE [6,] FALSE FALSE FALSE FALSE FALSE TRUE > 1*tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 1 1 1 1 1 [2,] 0 1 1 1 1 1 [3,] 0 0 1 1 1 1 [4,] 0 0 0 1 1 1 [5,] 0 0 0 0 1 1 [6,] 0 0 0 0 0 1 > class(tmp[1:36]) = "numeric" tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 1 1 1 1 1 [2,] 0 1 1 1 1 1 [3,] 0 0 1 1 1 1 [4,] 0 0 0 1 1 1 [5,] 0 0 0 0 1 1 [6,] 0 0 0 0 0 1 9/9/2015 > help(data.frame) Description This function creates data frames, tightly coupled collections of variables which share many of the properties of matrices and of lists, used as the fundamental data structure by most of R's modeling software. Usage data.frame(..., row.names = NULL, check.rows = FALSE, check.names = TRUE, stringsAsFactors = default.stringsAsFactors()) ... > fac1 = factor(tmp1) > attributes(fac1) $levels [1] "1" "3" $class [1] "factor" > fac1[1:20] [1] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 Levels: 1 3 > as.character(fac1)[1:10] [1] "1" "3" "1" "3" "1" "3" "1" "3" "1" "3" > as.numeric(fac1)[1:20] [1] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 > cond [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE > as.numeric(factor(cond)) [1] 1 1 2 1 2 2 2 2 1 1 > as.numeric(cond) > as.numeric(cond) [1] 0 0 1 0 1 1 1 1 0 0 > library() #### shows all available packages ... 
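### Side note (an added sketch, not part of the original class session):
### as.numeric() applied to a factor returns the internal level codes, as seen
### just above with fac1 and factor(cond).  To recover the original numeric
### values stored in fac1, convert through character first:
as.numeric(as.character(fac1))[1:10]   ## gives 1 3 1 3 ... rather than the codes 1 2 1 2 ...
## or, equivalently (and faster on very long factors):
as.numeric(levels(fac1))[fac1][1:10]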
> library(foreign) > search() [1] ".GlobalEnv" "package:foreign" "package:stats" [4] "package:graphics" "package:grDevices" "package:utils" [7] "package:datasets" "package:methods" "Autoloads" [10] "package:base" > objects(2) [1] "data.restore" "lookup.xport" "read.arff" "read.dbf" [5] "read.dta" "read.epiinfo" "read.mtp" "read.octave" [9] "read.S" "read.spss" "read.ssd" "read.systat" [13] "read.xport" "write.arff" "write.dbf" "write.dta" [17] "write.foreign" > help(read.table) read.table {utils} R Documentation Data Input Description Reads a file in table format and creates a data frame from it, with cases corresponding to lines and variables to fields in the file. Usage read.table(file, header = FALSE, sep = "", quote = "\"'", dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"), row.names, col.names, as.is = !stringsAsFactors, na.strings = "NA", colClasses = NA, nrows = -1, skip = 0, check.names = TRUE, fill = !blank.lines.skip, strip.white = FALSE, blank.lines.skip = TRUE, comment.char = "#", allowEscapes = FALSE, flush = FALSE, stringsAsFactors = default.stringsAsFactors(), fileEncoding = "", encoding = "unknown", text, skipNul = FALSE) ... [discussed in class] > help(write) [also discussed in class] ##### Some helpful functions > tmpmat = array( rbinom(20,2,.5), c(5,4) ) tmpmat [,1] [,2] [,3] [,4] [1,] 0 1 0 1 [2,] 1 1 2 1 [3,] 1 2 0 0 [4,] 2 2 1 1 [5,] 1 1 1 2 > apply(tmpmat, 2, sum) [1] 5 7 4 5 > apply(tmpmat, 1, sum) [1] 2 5 3 6 5 > c(tmpmat %*% rep(1,4)) [1] 2 5 3 6 5 > for(i in 1:5) cat(sum(tmpmat[i,]), ". "); cat("\n") 2 . 5 . 3 . 6 . 5 . > table(c(tmpmat)) 0 1 2 4 11 5 ######## > attributes(fr) $names [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" [13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24" [25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36" [37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" [49] "X49" "X50" $row.names [1] 1 2 3 4 5 6 7 8 9 10 $class [1] "data.frame" > unlist(lapply(fr,class))[1:7] X1 X2 X3 X4 X5 X6 X7 "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" > lst1 = list(A = 1:15, B = c("A","B","C","D"), C = diag(1:4)) > unlist(lapply(lst1, length)) A B C 15 4 16 > lapply(lst1,summary) $A Min. 1st Qu. Median Mean 3rd Qu. Max. 1.0 4.5 8.0 8.0 11.5 15.0 $B Length Class Mode 4 character character $C V1 V2 V3 V4 Min. :0.00 Min. :0.0 Min. :0.00 Min. :0 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0 Median :0.00 Median :0.0 Median :0.00 Median :0 Mean :0.25 Mean :0.5 Mean :0.75 Mean :1 3rd Qu.:0.25 3rd Qu.:0.5 3rd Qu.:0.75 3rd Qu.:1 Max. :1.00 Max. :2.0 Max. :3.00 Max. 
:4 > tmp0 = diag(4)/tmpmat[1:4,] > tmp0 [,1] [,2] [,3] [,4] [1,] Inf 0 NaN 0 [2,] 0 1 0 0 [3,] 0 0 Inf NaN [4,] 0 0 0 1 > apply(tmp0, 2, function(col) sum(is.na(col))) [1] 0 0 1 1 > apply(tmp0, 2, function(col) sum(is.nan(col))) [1] 0 0 1 1 ####### Other functions f1 = function(x, y) x - 3*y^2 f1(5:7, 1:3) > f1(5:7, 1:3) [1] 2 -6 -20 > > outer(5:7, 1:3, f1) [,1] [,2] [,3] [1,] 2 -7 -22 [2,] 3 -6 -21 [3,] 4 -5 -20 > rvec = runif(10000) > table(trunc(rvec * 20)) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 > tmp = hist(rvec, nclass = 20) > tmp$counts [1] 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 > svec = rnorm(1000, 1, sqrt(5)) hist(svec, nclass=25, prob=T, , xlab="svec values", ylab="Density", main= paste("Histogram with overplotted", "\n normal density curve")) curve(dnorm(x,1,sqrt(5)), add=T, col="red") > help(plot) ### plotting x versus y, see also "points", "lines" help(par) ### so-called "graphical parameters", including plotting symbols 9/16/2015 ## Discussion of histograms in class ## we talked about over-laying density with histogram ## Continuing previous uniform and normal examples: first 10,000 uniform[0,1] points > round(tmp$counts/10000 - .05, 4) [1] 0.0000 -0.0023 0.0000 -0.0005 -0.0016 0.0044 -0.0057 0.0041 -0.0026 [10] 0.0033 0.0000 0.0016 0.0012 0.0013 -0.0025 -0.0016 -0.0015 0.0018 [19] -0.0012 0.0018 ### Pretty accurate: each should be roughly within range > qnorm(.99)*c(-1,1)/sqrt(12*10000) [1] -0.006715588 0.006715588 ### Now for second case, "svec" 1000 normals with mean 1, var 5 > tmp2 = hist(svec, nclass=25, prob=T) table(diff(tmp2$breaks)) 0.5 30 round(tmp2$density - dnorm(tmp2$mids,1,sqrt(5)), 4) [1] 0.0018 0.0016 0.0011 0.0021 -0.0016 0.0015 0.0087 0.0053 -0.0033 [10] -0.0057 0.0020 0.0162 -0.0055 0.0327 -0.0026 -0.0107 0.0347 0.0227 [19] -0.0287 -0.0386 -0.0093 -0.0175 0.0082 0.0020 0.0003 -0.0113 0.0093 [28] -0.0073 -0.0025 -0.0016 round(tmp2$counts/1000 - (pnorm(tmp2$breaks[2:31],1,sqrt(5))- pnorm(tmp2$breaks[1:30],1,sqrt(5))), 4) [1] 0.0009 0.0008 0.0005 0.0010 -0.0008 0.0007 0.0043 0.0026 -0.0017 [10] -0.0029 0.0009 0.0081 -0.0028 0.0164 -0.0012 -0.0052 0.0175 0.0115 [19] -0.0142 -0.0192 -0.0046 -0.0088 0.0041 0.0009 0.0001 -0.0057 0.0046 [28] -0.0037 -0.0013 -0.0008 ### These last two calculated vectors are indices of "goodness of fit" of ### the histogram to the correct normal density. 
> max(abs(.Last.value)) [1] 0.0192 ### might be used in a quick assessment of fit ### Some GRAPHICAL commands > xv = rgamma(100,2) yv = rnorm(1+2*xv, 0.3*(xv^0.25)) plot(xv, yv) tmp1 = lm(yv~xv) lines(xv, tmp1$fit, col="blue") inds = identify(xv,yv) ### left-click successive points of interest in plot [1] 1 14 41 70 80 83 96 ### right-click mouse when finished sqrt( (yv - tmp1$fit)[inds]/summary(tmp1)$sigma ) 25 26 50 64 1.2113197 0.7710393 0.9728214 1.2791945 #### Managing arrays for simulations simarray = array( rgamma(100000,3,2), c(100,1000)) hist(apply(simarray - 1.5,2,mean), nclass =12, prob=T) curve(dnorm(x,0, sqrt(3/(4*100))), add=T) > sqrt(var(apply(simarray - 1.5,2,mean))) [1] 0.09068389 ### compare with theoretical sqrt(3/(4*100))=0.08660254 > curve(dnorm(x,0,sqrt(3/400)), -.4,.4) > curve(dgamma(x+1.5,300,200), add=T, col="green") ### compares two theoretical densities 9/28/2015 Method of Simulating Normal R.V.'s (Box-Mueller method) Exercise: if X,Y are indep N(0,1) then A = arcsin(X/sqrt(X^2+Y^2)) ~ Unif(-pi/2,pi/2) and S = X^2+Y^2 ~ Expon(1/2) are indep So can simulate independent standard-normal pair by S = 2*rexp(1), A = pi*(2*runif(1)-1) and (X,Y) = sqrt(S)*(sin(A), cos(A)) > Svec = 2*rexp(1000); Avec = pi*(2*runif(1000)-1) hist(sqrt(Svec)*sin(Avec), nclass=40, prob=T) curve(dnorm(x), col="red", add=T) Xmat = sqrt(Svec)*cbind(sin(Avec),cos(Avec)) plot(Xmat) ## this checks independence abline(h=0); abline(h=1); abline(v=1, col="red"); abline(v=2,col="red") > mean(abs(Xmat[,1]-1.5)<.5 & abs(Xmat[,2]-0.5)<0.5) [1] 0.049 > mean(abs(Xmat[,1]-1.5)<.5)*mean(abs(Xmat[,2]-.5)<.5) [1] 0.046843 --------------------------------------- Second (slower) method: accept-reject First simulate Logistic r.v. Can directly use rlogis(1000), alternatively recall plogis(x) = e^x/(1+e^x) and inverse is qlogis(r) = log(r/(1-r)), so could do qlogis(runif(1000)) Check the max of N(0,1) density divided by dlogis(x,0,C) > curve(dnorm(x)/dlogis(x),0,2) ### max <= 1.6 > curve(dnorm(x)/dlogis(x,0,1/sqrt(3)),0,2) ### max <= 1.16005 max(dnorm(seq(1,2,by=.001))/dlogis(seq(1,2,by=.001),0,1/sqrt(3))) ## = 1.16002 ## Use dlogis(x) : > Uvec = runif(1000)*1.6 Wvec = rlogis(1000) Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] ### length 632 hist(Yvec, nclass=15, prob=T) ## Next try it with dlogis(x,0,1/sqrt(3)) > Uvec = runif(1000)*1.16002 Wvec = rlogis(1000,0,1/sqrt(3)) Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] ### length 916 hist(Yvec, nclass=15, prob=T) ### Timing runs to generate 10 million N(0,1) rv's > memory.limit() [1] 3894 ### in Mb system.time({ Svec = sqrt(2*rexp(5e6)); Avec = pi*(2*runif(5e6)-1) tmp = c(Svec*sin(Avec), Svec*cos(Avec)) }) user system elapsed 1.06 0.20 1.26 > system.time({tmp = qnorm(runif(1e7))}) user system elapsed 1.03 0.09 1.15 > system.time({ Uvec = runif(1e7)*1.6 ; Wvec = rlogis(1e7) tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] }) user system elapsed 3.17 0.13 3.29 > system.time({ Uvec = runif(1e7)*1.16002; Wvec = rlogis(1e7,0,1/sqrt(3)) tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] }) user system elapsed 3.31 0.27 3.58 ======================================================================== 10/7/2015 We saw in class that the method of "control variates" involves linear regression. 
In the setting of the example in the log that we covered last time: http://www.math.umd.edu/~slud/s705/Rlogs/Antith_Contr09 > Control.fr = cbind.data.frame(X1 = rexp(1e5)*0.8, X2 = runif(1e5,2,4), X3 = rlogis(1e5), X4 = rgamma(1e5,1.5)) Y = c(data.matrix(Control.fr) %*% rep(1,4)) > Control.fr = cbind.data.frame(Control.fr, Y=Y, Yctr = Y-5.3, Yhi = as.numeric(Y>5.5)) rm(Y) tmp = lm(Yhi ~ Yctr, data=Control.fr) > names(tmp) [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "xlevels" "call" "terms" "model" > names(summary(tmp)) [1] "call" "terms" "residuals" "coefficients" [5] "aliased" "sigma" "df" "r.squared" [9] "adj.r.squared" "fstatistic" "cov.unscaled" > summary(tmp)$coef[,1:2] Estimate Std. Error (Intercept) 0.4440900 0.0009788089 Yctr 0.1623008 0.0004088673 ## Note that the SE 0.00098 is essentially the same as > sqrt(1-cor(Control.fr[,"Y"],Control.fr[,"Yhi"])^2)*sd(Control.fr$Yhi)/sqrt(1e5) [1] 0.0009788011 ==================================================================== ## Generating multivariate densities by conditional prob. integral transform ---------------------------------------------------------------------------- Want to generate random point x1 < x2 < ... < x6 in [0,1]^6. Several methods: > sort(runif(6)) > tmparr = array(runif(6*10000), c(10000,6)) inds = which(apply(tmparr, 1, function(row) min(diff(row)))>0) length(inds) [1] 19 > tmparr[inds[1],] [1] 0.01267986 0.02371346 0.22257134 0.58500541 0.62999614 0.92351900 > vec = c(rep(0,6),1) for(i in 6:1) vec[i]=vec[i+1]*rbeta(1,i,1) ### To do it in an array of many rows: > ordmat = cbind(array(0, c(1e4,6)), rep(1,1e4)) for(i in 6:1) ordmat[,i] = ordmat[,i+1]*rbeta(1e4,i,1) ### For example, can check that the 4th order-statistic has Beta(4,3) density: hist(ordmat[,4], nclass=40, prob=T) curve(dbeta(x,4,3),0,1, add=T, col="green") ### OK ## NB this is an instance of successive conditional univariate densities ### Other methods ? ... 
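### One more method (an added sketch, not from the class log): the uniform
### order statistics can be generated all at once from exponential spacings,
### since the running sums of 7 iid Expon(1) variables, divided by their total,
### have the same joint law as the 6 order statistics of a Uniform(0,1) sample.
Emat = array(rexp(7e4), c(1e4,7))
ordmat2 = t(apply(Emat,1,cumsum))[,1:6]/apply(Emat,1,sum)
hist(ordmat2[,4], nclass=40, prob=T)     ## 4th order-statistic, again Beta(4,3)
curve(dbeta(x,4,3),0,1, add=T, col="blue")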
### Another R topic: Aggregation, splitting and lists ----------------------------------------------------- Suppose we want to consider a ("mixture" or "stratified") model in which there are H = 6 population strata with respective sizes or probabilities (0.3, 0.1, 0.2, 0.15, 0.15, 0.1) and within stratum h Y_i ~ N(mu_h, sig_h) Let's generate a large populatioon (N=1e5) of this sort pvec = c(0.3, 0.1, 0.2, 0.15, 0.15, 0.1) muvec = c(4, 2, 1, 5, 3, 6) sigvec = sqrt(1:6) hvec = sample(1:6, 1e5, prob=pvec, replace=T) > rbind(Emp = table(hvec)/1e5, True=pvec) 1 2 3 4 5 6 Emp 0.29853 0.09915 0.20125 0.15079 0.15072 0.09956 True 0.30000 0.10000 0.20000 0.15000 0.15000 0.10000 Yvec = rnorm(1e5, muvec[hvec], sigvec[hvec]) ### Now want to display stratum properties ### Could do: gp1 = which(hvec==1) > c(mean=mean(Yvec[gp1]), sd=sd(Yvec[gp1])) mean sd 3.9977241 0.9993129 etcetera, or else: > tmplist = split(Yvec,hvec) > class(tmplist) [1] "list" > sapply(tmplist,length) 1 2 3 4 5 6 29853 9915 20125 15079 15072 9956 > sapply(tmplist, function(gp) c(mean(gp),sd(gp))) 1 2 3 4 5 6 [1,] 3.9977241 1.986277 1.006401 5.008875 2.969467 6.009526 [2,] 0.9993129 1.412418 1.720414 1.994702 2.246038 2.444997 or else > tmpfr = aggregate(Yvec, by=list(hvec), function(gp) c(mean(gp),sd(gp))) > class(tmpfr) [1] "data.frame" > tmpfr Group.1 x.1 x.2 1 1 3.9977241 0.9993129 2 2 1.9862770 1.4124179 3 3 1.0064005 1.7204142 4 4 5.0088751 1.9947024 5 5 2.9694670 2.2460384 6 6 6.0095257 2.4449966 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Log related to simulation and MLE behavior 10/14/2015 ### Let's start with a model Z ~ N(a,b^2) X ~ Poisson( exp(nu+tau*Z) ) Y ~ Binom(1, plogis(beta1 + beta2*X + beta3*Z)) ## Generate data n=400 based on vectors of parameters par = c(a=4.1, lb=log(0.8), nu=-2, tau = 0.2, beta=c(-.5,.8,.1)) DataGen = function(N, par) { Zvec = rnorm(N,par[1],exp(par[2])) Xvec = rpois(N,exp(par[3]+par[4]*Zvec)) Yvec = rbinom(N,1,plogis(par[5]+par[6]*Xvec+par[7]*Zvec)) cbind(Y=Yvec, X=Xvec, Z=Zvec) } tmp = cbind(DataGen(400, par),One=rep(1,400)) #### 400 x 4 ### Now a likelihood nlogL = function(thet, Dat) -mean(log(dnorm(Dat[,"Z"],thet[1],exp(thet[2])))) - mean(log(dpois(Dat[,"X"],exp(c(Dat[,c(4,3)]%*%thet[3:4]))))) - mean(log(dbinom(Dat[,"Y"],1,plogis(c(Dat[,c(4,2,3)]%*% thet[5:7]))))) ### Now MLE calculation MLE = nlm(nlogL, c(5,log(2),rep(0,5)), hessian=T, Dat=tmp) > MLE $minimum [1] 2.579204 $estimate [1] 4.1044118 -0.2136951 -2.5665736 0.3401607 -0.7309071 0.5007728 0.2446937 $gradient [1] -8.035650e-08 3.794565e-07 1.018789e-07 2.237255e-07 4.363399e-08 [6] 9.181322e-08 1.613332e-07 $hessian [,1] [,2] [,3] [,4] [,5] [1,] 1.533251e+00 -6.290957e-04 -1.081980e-08 1.081980e-08 0.000000e+00 [2,] -6.290957e-04 1.999599e+00 -4.440892e-08 0.000000e+00 -4.440892e-08 [3,] -1.081980e-08 -4.440892e-08 3.225323e-01 1.398468e+00 -4.440892e-08 [4,] 1.081980e-08 0.000000e+00 1.398468e+00 6.290156e+00 4.440892e-08 [5,] 0.000000e+00 -4.440892e-08 -4.440892e-08 4.440892e-08 2.327332e-01 [6,] 0.000000e+00 -4.440892e-08 -4.440892e-08 0.000000e+00 6.167920e-02 [7,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 9.449793e-01 [,6] [,7] [1,] 0.000000e+00 0.0000000 [2,] -4.440892e-08 0.0000000 [3,] -4.440892e-08 0.0000000 [4,] 0.000000e+00 0.0000000 [5,] 6.167920e-02 0.9449793 [6,] 7.749374e-02 0.2604345 [7,] 2.604345e-01 3.9821295 $code [1] 1 $iterations [1] 41 ### CONTINUE IN CLASS with an actual simulation (say of 100 data-batches) Example: ThetMat = array(0, c(1000,7,2)) ### This matrix will contain the 
parameter estimators and their individual ### estimated "theoretical" SE's derived from diagonal of the inverted ### observed-information matrix. ConvCod = logical(1000) for(r in 1:1000) { tmp = cbind(DataGen(400, par),One=rep(1,400)) MLE = nlm(nlogL, par, hessian=T, Dat=tmp) ThetMat[r,,1] = MLE$est ConvCod[r] = (MLE$code ==1 & sqrt(sum(MLE$grad^2))/7 < 1.e-6 & min(eigen(MLE$hess)$values)>0) ThetMat[r,,2] = sqrt(diag(solve(MLE$hess))) } > hist(ThetMat[,6,1], nclass=25, prob=T) curve(dnorm(x, par[6], sd(ThetMat[,6,1])), add=T, col="red") ### really nice agreement > round(rbind(Theor.SD = sqrt(apply(ThetMat[,,2]^2,2,mean)/400), Empir.SD = apply(ThetMat[,,1],2,sd)),5) [,1] [,2] [,3] [,4] [,5] [,6] [,7] Theor.SD 0.03994 0.03536 0.48483 0.11265 0.54209 0.21052 0.13034 Empir.SD 0.03999 0.03560 0.49943 0.11617 0.56854 0.21452 0.13754 ### ALSO VERY NICE AGREEMENT ! #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 10/19/2015 ### Another useful form of matrix indexing (applies also to more general arrays) > tmat = array(1:20,c(5,4)) > tmat [,1] [,2] [,3] [,4] [1,] 1 6 11 16 [2,] 2 7 12 17 [3,] 3 8 13 18 [4,] 4 9 14 19 [5,] 5 10 15 20 > tmat[cbind(1:5,c(3,2,1,4,1))] [1] 11 7 3 19 5 > tarr = array(1:60, c(5,4,3)) > tarr , , 1 [,1] [,2] [,3] [,4] [1,] 1 6 11 16 [2,] 2 7 12 17 [3,] 3 8 13 18 [4,] 4 9 14 19 [5,] 5 10 15 20 , , 2 [,1] [,2] [,3] [,4] [1,] 21 26 31 36 [2,] 22 27 32 37 [3,] 23 28 33 38 [4,] 24 29 34 39 [5,] 25 30 35 40 , , 3 [,1] [,2] [,3] [,4] [1,] 41 46 51 56 [2,] 42 47 52 57 [3,] 43 48 53 58 [4,] 44 49 54 59 [5,] 45 50 55 60 > tarr[cbind(1:5,c(2,3,2,4,1),c(1,3,2,1,3))] [1] 6 52 28 19 45 ##-------------------------------------------------------------------------------- ### Contrasting MLEs and variances from correctly and incorrectly specified models ## Suppose our data comes from lognormal(mu,sigma^2) and we analyze it as Gamma(a,lam) > Xdat = exp(rnorm(400, 0.5, 0.8)) ## "Correct" ML analysis gives parameter estimates > MLtru = c(mean(log(Xdat)), var(log(Xdat))) [1] 0.5355010 0.6796627 ### NB 2nd parameter is variance, to be compare with .8^2 and "empirical" or "observed" information matrix (per observation) > diag(c(1/MLtru[2]^2, 1/MLtru[2]^4)) [,1] [,2] [1,] 2.164777 0.00000 [2,] 0.000000 4.68626 #### Now let's analyze the data via ML as Gamma(a,lam) alam.ini = (mean(Xdat)/var(Xdat))*c(mean(Xdat),1) > alam.ini [1] 0.9997862 0.4183704 ### not ultimately meaningful, but not too wild Gfit = optim(alam.ini, function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), data=Xdat, hessian=T) ### 49 iterations, conv=0 > round( unlist(Gfit[1:2]), 6) par1 par2 value 1.636170 0.684693 723.132589 > Gfit$hess [,1] [,2] [1,] 333.4763 -584.2035 [2,] -584.2035 1396.0407 Gfit2 = nlm(function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), alam.ini, data=Xdat, hessian=T) ### only 7 iterations > Gfit2$est [1] 1.6363411 0.6847426 > Gfit2$hess [,1] [,2] [1,] 333.3872 -584.1184 [2,] -584.1184 1395.5721 > Gfit2$grad [1] -4.029622e-05 8.742518e-05 nlm(function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), alam.ini, data=Xdat, hessian=T, gradtol=1e-6)$est [1] 1.6363411 0.6847426 > nlm(function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), alam.ini, data=Xdat, hessian=T, gradtol=1e-9)$est [1] 1.6363475 0.6847457 ### Analytical gradient of logL contributions for each X_i, as nX2 matrix grlogL = function(alam, data) cbind(log(data)+log(alam[2])-digamma(alam[1]), alam[1]/alam[2] - data) tmp = grlogL(Gfit2$est,Xdat) > c(t(tmp) %*% rep(1,400)) [1] 0.0003131229 0.0006105371 ### 0 to 
sufficient precision, after dividing by 400 > Mis.Info = t(tmp) %*% tmp [,1] [,2] [1,] 271.1854 -642.7003 [2,] -642.7003 2279.0728 #### compare with Observed Info, above > Gfit2$hess [,1] [,2] [1,] 333.3872 -584.1184 [2,] -584.1184 1395.5721 ### As shown in class, a "robustified" variance for the ML estimators via ### the mis-specified model is > Rob.Var = solve(Mis.Info) %*% Gfit$hess %*% solve(Mis.Info) [,1] [,2] [1,] 0.014215887 0.0030794743 [2,] 0.003079474 0.0008750884 compared with the "naive" variance > solve(Gfit2$hess) [,1] [,2] [1,] 0.011248119 0.004707914 [2,] 0.004707914 0.002687055 ### The corresponding SE's for the gamma-based MLE's are: > round(rbind(Naive= sqrt(diag(solve(Gfit2$hess))), Robust= sqrt(diag(Rob.Var)) ), 4) [,1] [,2] Naive 0.1061 0.0518 Robust 0.1192 0.0296 ### This means that it might take a large simulation to feel the effects of ### incorrect CI coverage for the shape-parameter a , but the estimates ### for lambda will actually have a lot smaller variance than the naive ### variance matrix would tell us. #------------------------------------------------------------------ 10/21/2015 "Classic" EM Algorithm Problem Contingency Table with Missing Cells. Consider the following data problem. Xv = sample(1:3, 500, prob=c(.45,.25,.3), rep=T) Yv = sample(1:4, 500, prob=c(.22,.32, .18, .28), rep=T) > XYtab0 = table(Xv,Yv) XYtab0 Yv Xv 1 2 3 4 1 45 76 40 65 2 29 39 21 33 3 40 47 28 37 NB > apply(XYtab,1,sum)/500 1 2 3 0.452 0.244 0.304 ### close to .45,.25,.3 as expected > apply(XYtab,2,sum)/500 1 2 3 4 0.228 0.324 0.178 0.270 ### close to .22, .32, .18, .28 as expected ### Now suppose that the (3,3),(2,4), and (1,1) cells are blanked out ### ie treated as missing, with only the number 45+33+28 = 106 of such ### missing observations known. In that view, the missing data is the information of exactly which (or how many of) those 106 observations fall ito each of the (X=1,Y=1), (X=3,Y=3) and (X=2,Y=4) cells. XYtab=XYtab0; XYtab0[cbind(1:3,c(1,4,3))] = NA XYtab0 Yv Xv 1 2 3 4 1 76 40 65 2 29 39 21 3 40 47 37 along with table total N=500. Data is XYtab0. sum(c(XYtab0), na.rm=T) ### 394, so 106 fell in missing cells We apply the EM algorithm, in the form= EMiter = function(pvec, qvec) { denom = pvec[1]*qvec[1]+pvec[3]*qvec[3]+pvec[2]*qvec[4] ptmp = c(181+106*pvec[1]*qvec[1]/denom, 89+106*pvec[2]*qvec[4]/denom, 124+106*pvec[3]*qvec[3]/denom) qtmp = c(69+106*pvec[1]*qvec[1]/denom, 162, 61+106*pvec[3]*qvec[3]/denom, 102+106*pvec[2]*qvec[4]/denom) list(pout = ptmp/sum(ptmp), qout = qtmp/sum(qtmp)) } ### So EMiter gives a mapping from one pair of 3 and 4 dimensional prob vectors ### to another such pair, and we want to iterate to convergence. > pv = rep(.333,3); qv = rep(.25,4) listvec = list(pv,qv) > for(i in 1:10) listvec = EMiter(listvec[[1]], listvec[[2]]) round(unlist(listvec),4) pout1 pout2 pout3 qout1 qout2 qout3 qout4 0.4699 0.2357 0.2944 0.2459 0.3240 0.1684 0.2617 > for(i in 1:10) listvec = EMiter(listvec[[1]], listvec[[2]]) round(unlist(listvec),4) pout1 pout2 pout3 qout1 qout2 qout3 qout4 0.4701 0.2356 0.2943 0.2461 0.3240 0.1683 0.2616 > listvec0 = listvec for(i in 1:10) listvec = EMiter(listvec[[1]], listvec[[2]]) round(unlist(listvec),4) pout1 pout2 pout3 qout1 qout2 qout3 qout4 0.4701 0.2356 0.2943 0.2461 0.3240 0.1683 0.2616 > unlist(listvec)-unlist(listvec0) pout1 pout2 pout3 qout1 qout2 5.886801e-07 -3.658261e-07 -2.228540e-07 5.886801e-07 0.000000e+00 qout3 qout4 -2.228540e-07 -3.658261e-07 ### So good convergence has happened ! 
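### An added sketch (not part of the class log): instead of running EMiter a
### fixed number of times, iterate until the parameter vectors stop changing.
### EMrun below is a new wrapper name introduced only for illustration.
EMrun = function(pvec, qvec, tol=1e-8, maxit=500) {
    for(it in 1:maxit) {
       new = EMiter(pvec, qvec)
       del = max(abs(c(new$pout,new$qout) - c(pvec,qvec)))
       pvec = new$pout ; qvec = new$qout
       if(del < tol) break }
    list(pout=pvec, qout=qvec, iterations=it, last.change=del) }
## e.g.  round(unlist(EMrun(rep(1/3,3), rep(0.25,4))[1:2]), 4)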
##-------------------------------------------------------------- ## Now a continuous distributional example -- random effects ANOVA Let Y_{ij} = mu_i + a[j] + eps[i,j] i=1,..,5, j=1,..4 where eps[i,j] ~ N(0,sig^2) iid independent of a[j] ~ N(0,siga^2) iid and mu_i are unknown parameters along with sig, sigA NB aj given Dobs is N(Ybar_{.j}, sig^2*0.2*sigA^2/(sigA^2+0.2*sig^2) Data: avec = rnorm(4,0,1.5) eps = array(rnorm(20,0,0.8), c(5,4)) Ymat = outer(c(6,3,5,2,4),avec,"+") + eps Yidot = apply(Ymat,1,mean) Ydotj = apply(Ymat,2,mean) Ybar = mean(Yidot) listpar = list(mu=rep(1,5), sigAsq=2, sigsq=1) EMiter2 = function(listpar) { mubar = mean(listpar[[1]]) gam = listpar[[2]]/(listpar[[2]]+0.2*listpar[[3]]) avar = 0.2*listpar[[3]]*gam newEaj = mubar + gam*(Ydotj-mubar) newmuvec = Yidot - mean(newEaj) newsgAsq = avar + mean(newEaj^2) newsgsq = avar + mean(c(Ymat - outer(newmuvec,newEaj,"+"))^2) list(mu =newmuvec, sigAsq=newsgAsq, sigsq=newsgsq) } ####-------------------------------------------------------- ## Now a continuous distributional example -- random effects ANOVA Let Y_{ij} = mu_i + a[j] + eps[i,j] i=1,..,5, j=1,..4 where eps[i,j] ~ N(0,sig^2) iid independent of a[j] ~ N(0,siga^2) iid and mu_i are unknown parameters along with sig, sigA NB aj given Dobs is N(gam*(Ybar_{.j}-mubar), sig^2*0.2*) where gam = sigA^2/(sigA^2+0.2*sig^2) and mubar = sum_{i=1}^5 mu_i/5 NOTE THAT THIS CONDITIONAL DISTRIBUTION WAS WRITTEN WRONGLY WHEN I DID THIS MUCH IN CLASS. IT IS CORRECTED HERE. Data: avec = rnorm(4,0,1.5) eps = array(rnorm(20,0,0.8), c(5,4)) Ymat = outer(c(6,3,5,2,4),avec,"+") + eps ## correct mu = (6,3,5,2,4), sigAsq=1.5^2, sigsq=0.8^2 Yidot = apply(Ymat,1,mean) Ydotj = apply(Ymat,2,mean) Ybar = mean(Yidot) listpar = list(mu=rep(1,5), sigAsq=2, sigsq=1) ### Initial values EMiter2 = function(listpar) { mubar = mean(listpar[[1]]) gam = listpar[[2]]/(listpar[[2]]+0.2*listpar[[3]]) avar = 0.2*listpar[[3]]*gam newEaj = gam*(Ydotj-mubar) newmuvec = Yidot - mean(newEaj) newsgAsq = avar + mean(newEaj^2) newsgsq = avar + mean(c(Ymat - outer(newmuvec,newEaj,"+"))^2) list(mu =newmuvec, sigAsq=newsgAsq, sigsq=newsgsq) } > listpar = EMiter2(listpar) ### first EM iterate round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.340 0.907 3.037 -1.714 0.662 7.682 0.720 > listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.385 0.952 3.082 -1.669 0.707 7.658 0.666 > listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.427 0.993 3.124 -1.627 0.749 7.454 0.655 > listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.468 1.034 3.165 -1.586 0.790 7.258 0.653 ### will require many more iterates ... > for (i in 1:50) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.524 3.091 5.221 0.470 2.846 1.697 0.654 ### Stabilizing, but not quite there yet ! ### Recall that the "true" parameter vector is ### (6,3,5,2,4, 1.5^2, 0.8^2) > for (i in 1:50) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.797 3.364 5.494 0.743 3.119 1.605 0.655 > for (i in 1:50) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.803 3.369 5.500 0.749 3.125 1.605 0.655 ### You can see appropriate convergence, but it is slow. > for (i in 1:100) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.803 3.369 5.500 0.749 3.125 1.605 0.655 #### But we seem to have gotten there !! 
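### An added cross-check (not run in class; assumes the "lme4" package is
### installed): the same one-way random-effects model can be fitted by REML,
### for comparison with the EM limits above (REML and ML/EM estimates need
### not agree exactly, especially with only 4 random-effect levels).
library(lme4)
REfr = data.frame(Y=c(Ymat), row=factor(rep(1:5,4)), col=factor(rep(1:4,each=5)))
REfit = lmer(Y ~ 0 + row + (1|col), data=REfr)
fixef(REfit)        ## compare with the converged mu estimates
VarCorr(REfit)      ## compare with sqrt(sigAsq) and sqrt(sigsq)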
========================================================= 10/26/2015 Material from Slog4.txt log, updated: > steamdat <- matrix(c(10.98, 35.3, 11.13, 29.7, 12.51, 30.8, 8.40, 58.8, 9.27, 61.4, 8.73, 71.3, 6.36, 74.4, 8.50, 76.7, 7.82, 70.7, 9.14, 57.5, 8.24, 46.4, 12.19, 28.9, 11.88, 28.1, 9.57, 39.1, 10.94, 46.8, 9.58, 48.5, 10.09, 59.3, 8.11, 70.0, 6.83, 70.0, 8.88, 74.5, 7.68, 72.1, 8.47, 58.1, 8.86, 44.6, 10.36, 33.4, 11.08, 28.6), ncol=2, byrow=T, dimnames=list(NULL,c("steamuse","temp"))) > plot(steamdat[,2],steamdat[,1], xlab="Temp", ylab="Use", main="Draper-Smith Steam Data Example") > lmtmp = lm(steamuse ~ . , data=data.frame(steamdat)) > identify(steamdat[,2],steamdat[,1]) [1] 3 7 11 15 17 19 > summary(lmtmp) Call: lm(formula = steamuse ~ ., data = data.frame(steamdat)) Residuals: Min 1Q Median 3Q Max -1.6789 -0.5291 -0.1221 0.7988 1.3457 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 13.62299 0.58146 23.429 < 2e-16 *** temp -0.07983 0.01052 -7.586 1.05e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.8901 on 23 degrees of freedom Multiple R-squared: 0.7144, Adjusted R-squared: 0.702 F-statistic: 57.54 on 1 and 23 DF, p-value: 1.055e-07 > names(summary(lmtmp)) [1] "call" "terms" "residuals" "coefficients" [5] "aliased" "sigma" "df" "r.squared" [9] "adj.r.squared" "fstatistic" "cov.unscaled" #### It is a good idea to test your model knowledge so that you can compute "by hand" (in R) as many as possible of the lm output list components, e.g.: > c(summary(lmtmp)$sigma^2, sum(lmtmp$residuals^2)/23) [1] 0.7923217 0.7923217 > summary(lmtmp)$cov.unscaled (Intercept) temp (Intercept) 0.426720377 -0.0073520984 temp -0.007352098 0.0001397737 > mat = model.matrix(lmtmp) solve(t(mat) %*% mat) (Intercept) temp (Intercept) 0.426720377 -0.0073520984 temp -0.007352098 0.0001397737 > c(summary(lmtmp)$r.sq, cor(steamdat[,1], lmtmp$fitted)^2) [1] 0.7144375 0.7144375 > rvar = { mtmp = model.matrix(lmtmp) summary(lmtmp)$sigma^2*diag(diag(25)-mtmp %*% solve(t(mtmp) %*% mtmp, t(mtmp))) } stdrsd = lmtmp$resid/sqrt(rvar) > plot(steamdat[,2],steamdat[,1], xlab="Temp", ylab="Use", main="Draper-Smith Steam Data Example") identify(steamdat[,2],steamdat[,1], labels=round(stdrsd,2)) [1] 3 7 11 15 17 19 20 > anova(lmtmp) Analysis of Variance Table Response: steamuse Df Sum Sq Mean Sq F value Pr(>F) temp 1 45.592 45.592 57.543 1.055e-07 *** Residuals 23 18.223 0.792 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ## NB > c(sum(lmtmp$resid^2), sum((steamdat[,1]-mean(steamdat[,1]))^2)) [1] 18.2234 63.8158 ## 2nd number is "total (corrected) sum of squares" #### Continue using RlogF09.LinRegr.txt or StepExmp.Log >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Model syntax: y ~ x + z y ~ x*z > data(attitude) > dim(attitude) [1] 30 7 > tmpfit = lm( rating ~ . , data=attitude) update( ) update(tmpfit, .~ complaints) stpfit1 = step(tmpfit, ratings ~ (complaints + privileges + learning + raises + critical + advance)^2, direction = "forw") ### Further discussion about factors, contrasts ... 10/28/2015 GAUSSIAN QUADRATURE So far our numerical intergration formulas (Trapezoid, Simpson's) apply to finite ranges of integration and are taken from a large class of numerical quadrature formulas callled "Newton-Cotes formulas". 
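### For reference (an added sketch, not from the class log), a composite
### Simpson's rule on a finite interval, for a vectorized integrand f and an
### even number n of subintervals:
simpson = function(f, a, b, n=100) {
    x = seq(a, b, length=n+1)
    (b-a)/(3*n) * sum( f(x) * c(1, rep(c(4,2), (n-2)/2), 4, 1) ) }
## e.g.  simpson(dnorm, -3, 3)   versus   pnorm(3) - pnorm(-3)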
Another approach, called "Gaussian Quadrature", applies also to infinite ranges and approximates integrals \int g(x) f(x) dx for functions g (smooth, well-approximated by polynomials) by sums \sum_{k=1}^n g(p_k) w_k when f(x) is any of a number of special families of "kernels" (mostly, probability densities after scaling). The most important example for us is Gaussian quadrature with respect to the "error-function" kernel f(x) = exp(-x^2) on (-Infty,Infty) which we view as the N(0,1/2) scaled up by a factor sqrt(pi). > GHquad.wt = #### adapted from "Numerical Recipes" book function (N) { bvec <- sqrt((1:(N - 1))/2) elist <- eigen(matrix(replace(rep(0, N * N), (N + 1) * (1:(N - 1)), bvec) + replace(rep(0, N * N), seq(2, N * (N - 1), N + 1), bvec), ncol = N), symm = T) list(pts = elist$values, wts = sqrt(pi) * c(elist$vectors[1, ])^2) } Here N is the desired number of points (placed symmetrically around the origin) In order to calculate E g(Z) for Z ~ N(0,1), view Z = sqrt(2)*Z' where Z' ~ N(0, 1/2) tmp = GHquad.wt(20) ### N may be taken larger, but sometimes not necessary! sum( g(sqrt(2)*tmp$pts)*tmp$wts )/sqrt(pi) ### Example > tmp=GHquad.wt(20); tmpB=GHquad.wt(100) > sum( (sqrt(2)*tmp$pts)^2*tmp$wts ) /sqrt(pi) - 1 [1] 2.220446e-15 > sum( (sqrt(2)*tmpB$pts)^2*tmpB$wts ) /sqrt(pi) - 1 [1] -6.994405e-15 > sum( (sqrt(2)*tmp$pts)^4*tmp$wts ) /sqrt(pi) - 3 [1] 4.884981e-15 > sum( (sqrt(2)*tmpB$pts)^4*tmpB$wts ) /sqrt(pi) - 3 [1] -3.330669e-14 ### Now let's try a function g not a low-order polynomial > A1 = sum( log(1+2*3.5*tmp$pts^2) * tmp$wts )/sqrt(pi) > A2 = sum( log(1+2*3.5*tmpB$pts^2) * tmpB$wts )/sqrt(pi) > c(A1=A1, A2=A2, diff=A1-A2) A1 A2 diff 1.094413002 1.089033938 0.005379065 > integrate(function(x) log(1+3.5*x^2)*dnorm(x),-Inf,Inf) 1.089027 with absolute error < 4.8e-08 > integrate(function(x) log(1+3.5*x^2)*dnorm(x),-Inf,Inf, rel.tol=1.e-9) 1.089027 with absolute error < 2.4e-11 > tmpC = GHquad.wt(160) > sum( log(1+2*3.5*tmpC$pts^2) * tmpC$wts )/sqrt(pi) [1] 1.089028 ### Suppose we want integrals of log(1+b*x^2)*dnorm(x) for all b in seq(1,3.5,by=.01) bvec = seq(1,10,by=.01) vec1 = vec2 = numeric(901) unix.time({for(i in 1:901) vec1[i] = 2*integrate(function(x,b) log(1+b*x^2)*dnorm(x),0,Inf,rel.tol=1.e-10, b=bvec[i])$val }) user system elapsed 0.11 0.00 0.10 ### 900 high-accuracy integrals, 0.13 sec. > unix.time( { tmp = GHquad.wt(100) mat = log(1+outer(2*bvec, tmp$pts^2,"*")) vec2 = c(mat %*% tmp$wts)/sqrt(pi) } ) user system elapsed 0.01 0.00 0.02 > summary(vec1-vec2) ### errors range from 1e-9 near b=1 #### to 1e-4 near b=10 > unix.time( { tmp = GHquad.wt(200) mat = log(1+outer(2*bvec, tmp$pts^2,"*")) vec2 = c(mat %*% tmp$wts)/sqrt(pi) } ) user system elapsed 0.01 0.00 0.02 > summary(vec1-vec2) Min. 1st Qu. Median Mean 3rd Qu. Max. -2.431e-05 -7.312e-06 -1.122e-06 -4.661e-06 -3.171e-08 0.000e+00 > unix.time( { tmp = GHquad.wt(400) mat = log(1+outer(2*bvec, tmp$pts^2,"*")) vec2 = c(mat %*% tmp$wts)/sqrt(pi) } ) user system elapsed 0.11 0.00 0.11 summary(vec1-vec2) Min. 1st Qu. Median Mean 3rd Qu. Max. 
-4.229e-07 -7.689e-08 -5.371e-09 -6.133e-08 -3.380e-11 0.000e+00 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 11/9/2015 GLM material roughly following log "RlogF09.GLM.txt" X1 = runif(120); X2 = rpois(120,3) ntrials = rep(c(5,6,3), 40) success = rbinom( 120, ntrials, plogis(-1 + 0.6*X1 + 0.1*X2) ) failure = ntrials - success simlogis = data.frame( X1, X2, success, failure ) > fitB0 = glm ( cbind(success, failure) ~ X1 + X2, family=binomial, data=simlogis ) ### NB: within family=binomial, link="logit" is the default, ### but other possibilities exist, most often "probit" or "cloglog" ### NB: there is another syntax, sometimes more useful, giving EXACTLY THE SAME MODEL FITTED OBJECTS. > fitBAlt = glm ( I(success/ntrials) ~ X1 + X2, weights=ntrials, family=binomial, data=simlogis ) > sum(abs(fitBAlt$fit - fitB0$fit)) [1] 0 > sum(abs(c(summary(fitBAlt)$coef - summary(fitB0)$coef))) [1] 0 ### OFTEN THE OUTCOME OF USING "weights" ARGUMENTS IN MODEL-FITTING ## FUNCTIONS IN R IS MUCH LESS TRANSPARENT !! ------------------------------------------------- Following log "GLMlog.R", get data "Data/pbcdata.asc" > pbcdat = read.table("http://www.math.umd.edu/~evs/s705/Data/pbcdata.asc", header=T) > names(pbcdat) [1] "OBS" "IDNUM" "DTH" "EVTTIME" "TREATGP" "LOGBILI" "AGEVAR" [8] "CIRRH" "CCHOL" "ALBUMIN" > ind = pbcdat$DTH==1 | pbcdat$EVTTIME >= 41 Surv41 = as.numeric(pbcdat$EVTTIME < 41) pbcdat = cbind.data.frame(pbcdat[,3:10], Surv41)[ind,] ## 172 x 9 > tmpglm = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family = binomial) ### Simulation to introduce "dispersion" in the form of shared random ### effect within cluster reff = runif(172,-2,2) yrsp1 = rbinom(172, rep(10,172),plogis(tmpglm$lin + reff)) #### 172 y's like the binary outcomes in PBCdata except they now ### arise from clusters of 10 with shared "reff" ### Think of yrsp1[i]/10 = Ybar_{i.} as the "data" for the ith cluster > mean(dlogis(tmpglm$lin + reff)) ### E( Var Ybar_{i.} ) [1] 0.132752 > mean((10/9)*.1*yrsp1*(1-0.1*yrsp1)) ### estimator of E( Var Ybar_{i.} ) [1] 0.1264858 ### average across i. 
glmfitA = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=binomial) glmfitB = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasibinomial) sum(abs(glmfitA$coef-glmfitB$coef)) ### 0 > c(summary(glmfitB)$disp, summary(glmfitA)$disp) [1] 2.494836 1.000000 > sum(10*(yrsp1/10-glmfitB$fit)^2/(glmfitB$fit*(1-glmfitB$fit)))/(172-4) [1] 2.494836 ======================================================================= 11/11/2015 Small Initial Demonstration of Smoothing Spline =============================================== > xv = runif(500, 0,2) fxv = sqrt(plogis(xv +2.5*xv - 5*xv^2 + 2*xv^3)) yv = fxv + 0.03*rnorm(500) ordx = order(xv) plot(xv[ordx], yv[ordx]) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) tmpspl1 = smooth.spline(xv,yv) lines(xv[ordx], predict(tmpspl1,xv[ordx])$y, col="blue", lwd=3) > tmpspl1$spar [1] 0.7934125 > tmpspl2 = smooth.spline(xv,yv, spar=0.01) lines(xv[ordx], predict(tmpspl2,xv[ordx])$y, col="green", lwd=3) plot(xv[ordx], yv[ordx]) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) tmpspl3 = smooth.spline(xv,yv, spar=1) lines(xv[ordx], predict(tmpspl3,xv[ordx])$y, lwd=3) yv2 = fxv + 0.001*rnorm(500) plot(xv[ordx], yv2[ordx]) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) tmpspl4 = smooth.spline(xv,yv) lines(xv[ordx], predict(tmpspl4,xv[ordx])$y, col="blue", lwd=2) summary(fxv - predict(tmpspl4,xv)$y) Min. 1st Qu. Median Mean 3rd Qu. Max. -6.230e-03 -3.092e-03 -5.708e-05 -7.092e-04 1.010e-03 6.539e-03 >>>>> Continue with other smoothing and nonparametric regression techniques nprgfram = cbind.data.frame(x=xv, y=yv) > plot(nprgfram$x,nprgfram$y, main="Scatterplot of Raw Data, n=500") for(i in 1:8) abline(v=i*.2, lty=3) for(i in 1:9) { inds <- (1:500)[abs(nprgfram$x-.2*i+.1) < 0.1] itmp <- order(nprgfram$x[inds]) lines(nprgfram$x[inds[itmp]], lm(y ~ x, data=nprgfram[ inds,])$fit[itmp], lty=1, lwd=2) } curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) ### Now consider a lowess fit: > plot(nprgfram$x,nprgfram$y, main="Raw Data with Lowess Line, n=500") tmplow = lowess(nprgfram$x,nprgfram$y) lines(tmplow$x,tmplow$y, lty=1, lwd=2) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) ### locpoly(x, y, drv = 0, degree, kernel = "normal", bandwidth, gridsize = 401L, bwdisc = 25, range.x, binned = FALSE, truncate = TRUE) tmppoly = locpoly(xv, yv, degree=3, bandwidth=0.1) plot(tmppoly) points(nprgfram$x,nprgfram$y, main="Scatterplot of Raw Data, n=500") curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) ### DISCUSSION: USE OF SPLINES FOR NUMERICAL APPROXIMATION 11/18/15 ## Consider the problem of sampling from a specified distribution, ## by the probability integral transform. 
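### Warm-up (an added sketch, not from the class log): when the quantile
### function is available in closed form, the probability integral transform
### is one line.  For the Expon(1) distribution F(x) = 1 - exp(-x), so
Uv = runif(1e5)
Ev = -log(1-Uv)                     ## = qexp(Uv)
hist(Ev, nclass=60, prob=T)
curve(dexp(x), 0, 10, add=T, col="red")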
## Let's do this with t_5 xv = c((1:500)/10, 10 + (1:20)/2) xv = c(-rev(xv),0,xv) > pt(xv[1],5) [1] 2.887758e-06 > yv = pt(xv,5)) curve(log(pt(x,5)), -5,5) tmpy = smooth.spline(log(yv),xv) plot((1:999)/1000,predict(tmpy,log((1:999)/1000))$y, typ="l") curve(qt(x,5), .001,.999, add=T, col="red") ### Now to simulate 100,000 of them using the spline-fitted function ### Note that the evaluation function is unix.time({rant = predict(tmpy, log(runif(1e5)))$y}) user system elapsed 0.03 0.00 0.03 hist(rant, prob=T, nclass=128, xlim=c(-6,6)) curve(dt(x,5), -6,6, add=T, col="green") ### I will ask you to do this in a more difficult case, for an exercise ### SPLINE BASIS EXAMPLE ## Start with a fairly complicated curve mixran = function(x, pts, seednum) { set.seed(seednum) coef = 10*(runif(length(pts))-0.5) structure(c(outer(x, pts, function(x,y) dnorm(x-y)) %*% coef), seed=seednum, coef=coef) } xv = seq(-5,5,by=0.1) mixran(xv[1:5], c(-1,0.5,2), 333) [1] -4.461042e-05 -6.632645e-05 -9.765776e-05 -1.424013e-04 -2.056501e-04 attr(,"seed") [1] 333 attr(,"coef") [1] -0.3299934 -4.1540185 4.7348527 plot(xv, mixran(xv, c(-1,0.5,2), 333)) yv3 = mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333) + rnorm(101,0,0.8) > library(splines) library(KernSmooth) dim(mat3) ### 101 x 8 mat3 = bs(xv, knots=seq(-4,4,by=2), degree=3) ### dim=length(knots)+degree fity = lm(yv3 ~ . , data=cbind.data.frame(yv3,mat3)) plot(xv,yv3) lines(xv, mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333), col="red") points(xv, fity$fit, pch=5) ### What if we took sparser knots ? mat4 = bs(xv, knots=c(-3,0,3), degree=3) points(xv, lm(yv3 ~ . , data=cbind.data.frame(yv3,mat4))$fit, pch=18) ================================================================== Illustration of Data Replication Methods ======================================== 11/23/2015 DATA-SPLITTING, PARAMETRIC BOOTSTRAP, & BAYES POSTERIOR PREDICTIVE SIMULATION ## Data illustration set.seed(5) x = runif(200) y = 3 - 7*x + 10*(0.1+abs(x-.5))^1.5*rt(200,10) rtwt = 1/(0.1+abs(x-.5))^1.5 Dset = cbind.data.frame(y, x, wt=rtwt^2) > summary(rtwt^2) Min. 1st Qu. Median Mean 3rd Qu. Max. 4.760 9.806 22.580 91.930 90.610 761.900 x1st = rtwt x2st = x*rtwt yst = y*rtwt Dset2 = cbind.data.frame(yst,x1st,x2st) AuxY = array(0, c(1000,200)) > tmp1 = lm(y~x, data=Dset[1:100,], weight=wt) ## NOTE: S3 method for class 'lm' predict(object, newdata, se.fit = FALSE, scale = NULL, df = Inf, interval = c("none", "confidence", "prediction"), level = 0.95, type = c("response", "terms"), terms = NULL, na.action = na.pass, pred.var = res.var/weights, weights = 1, ...) tmp2 = predict(tmp1,Dset[1:100,], interval="none", type="response", weights=Dset$wt[1:100]) > summary(tmp2-tmp1$fit) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000e+00 8.882e-16 1.332e-15 1.578e-15 1.776e-15 2.642e-14 NOTE THAT IN ALL OF THE COMPUTATIONAL METHODS STUDIED IN THIS EXTENDED EXAMPLE, WE COMPUTE THE (unweighted) RMSE FOR Y-PREDICTIONS BUT DO THE MODEL-FITTING BY A WEIGHTED-LEAST SQUARES METHOD. Data-splitting method: 50-50 splits for RMSE --------------------- TstDat = TrnDat = array(0, c(1000,100)) TrnMSE = SplMSE = numeric(1000) for(i in 1:1000) { ind = sample(1:200, 100) TrnDat[i,] = setdiff(1:200,ind) tmp = lm(y ~ x, weights=wt, data=Dset[TrnDat[i,],]) TrnMSE[i] = mean(tmp$resid^2) SplMSE[i] = mean((Dset$y[ind] - predict(tmp,Dset[ind,], interval="none", type="response", weights=Dset$wt[ind]))^2) } > summary(sqrt(SplMSE)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.898 2.421 2.559 2.552 2.694 3.137 > summary(sqrt(TrnMSE)) Min. 1st Qu. Median Mean 3rd Qu. Max. 
1.809 2.371 2.514 2.508 2.652 3.034 IT IS NO SURPRISE THAT THE TrnMSE VALUES ARE SMALLER THAN THE TEST-DATA SplMSE VALUES, ALTHOUGH HERE THE DIFFERENCE IS SMALL. AS WE REMARKED IN CLASS, THIS METHOD OF PSEUDO-SAMPLE REPLICATION GENERATES X'S AND Y'S TOGETHER. "Parametric Bootstrap" ---------------------- ## generate residuals anew from N(0,sighat) ## "predictors" kept from tmp1 with xvec = x[1:100] wts = Dset$wt[1:100] AuxY = tmp1$fitted + array(rnorm(100*1000,0, rep((0.1+abs(xvec-0.5))^1.5*summary(tmp1)$sig,100)), c(100,1000)) ### Now can look at the MSEs from re-fitting the model 1000 times MSEboot = numeric(1000) for(i in 1:1000) MSEboot[i] = mean(lm(AuxY[,i] ~ xvec, weights=wts)$resid^2) > summary(sqrt(MSEboot)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.651 2.218 2.361 2.364 2.502 3.107 THIS FIRST WAY OF DOING THE PARAMETRIC BOOTSTRAP GENERATES ONLY Y'S FOR A FIXED SET OF X'S. YOU COULD SAY THAT WHAT WE GENERATE IN THIS WAY IS A SET OF "CONDITIONAL" PSEUDO-SAMPLES. THAT IS, THE VARIABILITY REPLICATED HERE IS STRICTLY THAT DUE TO THE CONDITIONAL DENSITY OF Y GIVEN X. NEXT LET'S DO A SLIGHTLY MORE ELABORATE PARAMETRIC-BOOTSTRAP SIMULATION IN WHICH X'S AND Y'S ARE RE-GENERATED. BUT I PROPOSE, AS I MENTIONED IN CLASS, A SIMULATION DESIGN IN WHICH WE CAN SEPARATE THE EFFECTS ON MSE OF THE ESTIMATOR OF DIFFERENT SETS OF X'S FROM THE CONDITIONAL SAMPLING VARIABILITY OF Y'S GIVEN FIXED X'S. We choose to simulate B=100 different X-vectors X^{(b)}, and then for each one, R=1000 independently generated Y-vectors Y^{(b,r)} from the conditional distribution of Y given X=X^{(b)}. Xarr = array(runif(1e4), c(100,100)) ### index (b,i) Yarr = array(0, c(1000,100,100)) ## indices (r,b,i) : r indexes y-vector replicate within fixed X^{(b)}, ### and i=1:100 indexes data points within each batch of n=100. abhat = tmp1$coef sighat = summary(tmp1)$sig epsarr = array(0, c(100,1000)) Wtarr = (0.1+abs(Xarr-0.5))^(-1.5) Wtarr = (1/apply(Wtarr,1,sum))*Wtarr for(b in 1:100) { epsarr[,] = rnorm(1e5) Yarr[,b,] = t( (abhat[1] - abhat[2]*Xarr[b,]) + (sighat*(0.1+abs(Xarr[b,]-0.5))^1.5)*epsarr ) } ## Now we have the large arrays of Y's and X's and we need to ## vectorize the weighted least squares estimates in order to make ## the computations quick. Ypred = replace(Yarr,T,0) ### array like Yarr filled with 0's Xmn = apply(Wtarr*Xarr,1,sum) XctrArr = -Xmn + Xarr SXXsq = apply(Wtarr*XctrArr^2,1,sum) auxmat = array(outer(rep(1,1000), Wtarr*XctrArr)*Yarr, c(1000*100,100)) b2hatarr = array( c( auxmat %*% rep(1,100))/ rep(SXXsq, rep(1000,100)), c(1000,100)) ### 1000 x 100 (r,b) b1hatarr = apply(outer(rep(1,1000),Wtarr)*Yarr,1:2,sum) - t( Xmn*t(b2hatarr) ) for(b in 1:100) Ypred[,b,] = b1hatarr[,b] + outer(b2hatarr[,b],Xarr[b,]) MSEarr = apply( (Yarr-Ypred)^2, 1:2, mean) ### 1000x100 (r,b) Biasarr = apply(Ypred - Yarr, 1:2, mean)> summary(c(Biasarr)) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.6123000 -0.0921800 -0.0005000 -0.0004574 0.0912800 0.5894000 ### Biases are small but not 0. They would be 0 if the averages of prediction errors within sample were taken weighted by Wtarr. ### All of this was quick. 
Now we have an array of MSEs for all (r,b)
### and we break MSE into 3 parts each of which we estimate via simulation:
###  MSE (Ypred_i^{(r,b)}) = ( E(Y_i^{(r,b)} - Ypred_i^{(r,b)}) )^2
###            + E( Var(Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.]) )
###            + Var( E(Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.]) )

## First term is bias^2, which we estimate from Biasarr^2

> mean(c(Biasarr^2))
[1] 0.01865613
> mean( apply((Yarr - Ypred)^2, 2, mean) )
[1] 5.081514
> sd( apply((Yarr - Ypred)^2, 2, mean) )
[1] 0.5081083
> var( c(Yarr-Ypred))
[1] 5.081514

DID WE LEARN ANYTHING EXTRA BY DOING THE SIMULATION BY BLOCKS b ?
WE FOUND OUT HOW VARIABLE OUR OVERALL ESTIMATED ANSWER WAS DUE TO
SAMPLING VARIATION FROM ONE SET OF X'S TO ANOTHER.

HOW DID THIS SIMULATION COMPARE WITH THE DATA-SPLITTING MSE ?
THE RMSE WE FOUND THERE WAS 2.508 WHILE WE JUST FOUND THE
PARAMETRIC-BOOTSTRAP RMSE TO BE

> sqrt(5.081514)
[1] 2.254221

THE RMSE FOUND FROM THE PARAMETRIC BOOTSTRAP WITH SINGLE FIXED Xvec WAS
2.364, but since that was for just a single Xvec, and we now know from the
two-level (r and b) simulation that the variance of the MSE estimator was
(0.5081)^2, these different parametric-bootstrap estimators (one with fixed
X's, the other with varying X's) are within range of one another.

NOTE HOWEVER THAT WE MAY ALSO GUESS THAT THE DATA-SPLITTING MSE IS LARGER
BECAUSE IT TAKES INTO ACCOUNT THE ACTUAL DATA (200 DATA-PAIRS) WHICH DID
NOT PRECISELY SATISFY THE MODEL ASSUMED BY THE PARAMETRIC BOOTSTRAP.

BAYES posterior predictive sampling
-----------------------------------
We continue with the same model as above, but define a standard conjugate
prior for the parameters theta = (b1,b2, sigma^2) within a normal-errors
model for the data with known weight-pattern (i.e. standard deviation of y
given x known to be proportional to (0.1+abs(x-.5))^1.5).  We will continue
with that model as we did for the parametric bootstrap, although recall
that it is slightly misspecified because the errors were actually t_{10}
distributed rather than normal.

### We want to define a prior conjugate for the model
        y ~ N( b1 + b2*x, (sig*(0.1+abs(x-0.5))^{1.5})^2 )
## For this, could take b1,b2 ~ indep N(0, sig0^2) , sig0 large and
## known, say 5, and 1/sig^2 ~ Gamma(1, lam0) for small lam0, say 0.1.
## Then with tau = 1/sig^2, the posterior is proportional to
     tau^{n/2} exp( -0.02*(b1^2+b2^2) - 0.1*tau
                    - 0.5*tau*sum(wi*(yi-b1-b2*xi)^2) ),  wi = (0.1+abs(xi-0.5))^(-3)
## This is conjugate (posterior of the same form as prior) for (b1,b2)
## if sig had been known and fixed,
## and for tau or sig^2 if b1,b2 were known and fixed.  But jointly, it
## is not conjugate.

NOTE: there is a Theorem (Bernstein-von Mises) which says that in large
iid samples, the posterior is approximately normal centered at the MLE
and with variance (like the MLE!)
the inverse of the Fisher Information Here the Fisher Information from the fitted model is obtained as > Infmat = array(c(sum(Dset$wt), sum(Dset$wt*Dset$x), 0, sum(Dset$wt*Dset$x), sum(Dset$wt*Dset$x^2), 0, 0, 0, nrow(Dset)/sighat^2)/sighat^2, c(3,3)) > round(solve(Infmat), 4) [,1] [,2] [,3] [1,] 0.0840 -0.1613 0.0000 [2,] -0.1613 0.3267 0.0000 [3,] 0.0000 0.0000 32.0117 ## mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE) ## Approximate Bayesian Idea in simulating new data pseudo-samples is (a) to draw multivariate normal (b1H,b2H,sigH^2) ~ N( (b1hat, b2hat,sighat^2), Infmat) (b) generate x ~ unif and y given x as N(b1H + b2H*x, {sigH*(0.1+abs(x-0.5))^1.5}^2) HOWEVER: use Delta method upon transformed parameter nu = log(sig^2) to force the simulated variance values to be positive ! NB. We can get an idea of the degree of (NON-) normality of sighat distribution by using the array of Yarr - Ypred = Yresid values to estimate a 1000 x 100 (r,b) array of parametric-bootstrap Monte Carlo simulated ones, as follows: > library(MASS) ### mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE) > dim(Yarr) [1] 1000 100 100 > sghatsqArr = apply( (Yarr-Ypred)^2, 1:2, mean) > dim(sghatsqArr) [1] 1000 100 > hist( sghatsqArr[,2], nclass=50, prob=T) > hist( sghatsqArr, nclass=50, prob=T) > curve(dnorm(x,mean(sghatsqArr), sd(sghatsqArr)), 2,12, add=T, col="green") #### definitely off !!! ### Try to overplot gamma with matching 1st two moments instead > beta = mean(c(sghatsqArr))/var(c(sghatsqArr)) alph = beta*mean(c(sghatsqArr)) curve(dgamma(x,alph, beta), 2,12, add=T, col="blue") ##---------------------------------------------------------- > c(coef = tmp1$coef, sigsq = summary(tmp1)$sig^2) coef.(Intercept) coef.x sigsq 2.282569 -5.560408 80.014571 ### "True" b1,b2 values from simulation were 3, -7. ### When sig^2=80, obtain "theoretical" MSE as the ### expectation of 80*(t_{10})^2*E((0.1+abs(U-0.5))^3) = > 80*1.25*0.06475 = 6.475 ### derived from > integrate(function(x) x^2*dt(x,10), -Inf,Inf)$val [1] 1.25 > integrate(function(x) (0.1+abs(x-0.5))^3, 0,1)$val [1] 0.06475 ##>>>=========================================== Continuation 11/30/2015 USE THE PRECEDING NON-NORMALITY OF sighatsq TO ARGUE THAT WE SHOULD NOT SIMPLY USE THE BERNSTEIN-VON MISES THEOREM TO SAMPLE FROM MULTIVARIATE NORMAL AS A SHORTCUT FOR THE BAYESIAN POSTERIOR, BUT RATHER APPROXIMATE THE POSTERIOR BY MULTIVARIATE NORMAL IN (b1,b2) ALONG WITH INDEPENDENT GAMMA (motivated by the form of the information matrix) FOR SIGSQ. > rhovec = numeric(100) > for(i in 1:100) rhovec[i] = cor(b1hatarr[,i],b2hatarr[,i]) > hist(rhovec) > summary(rhovec) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.9807 -0.9716 -0.9673 -0.9662 -0.9617 -0.9464 #### STRONG NEGATIVE CORRELATION BETWEEN B1, B2 > var(cbind(c(b1hatarr), c(b2hatarr))) [,1] [,2] [1,] 0.2207807 -0.4124677 [2,] -0.4124677 0.8253712 ### corr = -0.966 > c(alpha=alph, beta=beta) ### gamma parameters for sigsq alpha beta 20.538710 4.041849 ### mean here is 5.0815, while > mean((Dset$y[1:100]-tmp1$fit)^2) [1] 6.153456 #### Again, actual data shows greater variances ### which is partly due to true t_10 errors. #### Using Infmat^{-1} above for variance would suggest sd's > round(sqrt(diag(solve(Infmat))), 4) [1] 0.2898 0.5716 5.6579 ### and b1hat,b2hat correlation > aux = solve(Infmat); aux[1,2]/sqrt(aux[1,1]*aux[2,2]) [1] -0.9737398 ### NOTE that sampling from this posterior would also suffer from ### the same parametric-model misspecification ?! 
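### An added sketch (not run in class) of the approximate posterior /
### posterior-predictive sampling just described: multivariate normal for
### (b1,b2) centered at the fitted coefficients with the empirical covariance
### found above, together with the independent moment-matched gamma for sigsq.
### (Bdraw, sgsqdraw, xnew, ynew are new illustrative names.)
library(MASS)
Bdraw = mvrnorm(1000, mu=tmp1$coef, Sigma=var(cbind(c(b1hatarr), c(b2hatarr))))
sgsqdraw = rgamma(1000, alph, beta)
xnew = runif(1000)
ynew = rnorm(1000, Bdraw[,1] + Bdraw[,2]*xnew,
             sqrt(sgsqdraw)*(0.1+abs(xnew-0.5))^1.5)   ## posterior predictive y's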
## We would be sampling from neighborhood
##     b2 in -5.560 +/- 1.96*0.572
##     b1 approx. = ybar - b2*xbar
##     sigsq approx Gamma(17.68, 3.04)

##>=====================================================
    NONPARAMETRIC BOOTSTRAP                 11/30/2015

IS THERE AN IDEA THAT LETS US LEARN ABOUT THE PERFORMANCE OF THIS KIND OF
DATA ANALYSIS WITHOUT ASSUMING WE KNOW THE CORRECT (X,Y) MODEL ?

Yes, the NONPARAMETRIC BOOTSTRAP !

## (I) Bootstrap resampling of the iid pairs (X_i, Y_i)
## Assume the only data we have is Dset[1:100,] (including weights).

indArr = array(sample(1:100,1e5, replace=T), c(1000,100))
#### corrected the next statements to reflect weighted least-squares
auxXB = array(Dset$x[c(indArr)], c(1000,100))
auxYB = array(Dset$y[c(indArr)], c(1000,100))
Ones = rep(1/100,100)
wtB = (0.1+abs(auxXB-0.5))^(-3)
wtB = (1/apply(wtB,1,sum))*wtB
avXB = apply(wtB*auxXB,1,sum)
avYB = apply(wtB*auxYB,1,sum)
b2hatB = c( ((-avXB + auxXB)*wtB*auxYB) %*% Ones /
            (wtB*(-avXB + auxXB)^2) %*% Ones )
b1hatB = avYB - b2hatB*avXB
MSEB = c( (b1hatB + b2hatB*auxXB - auxYB)^2 %*% Ones )
tmp = hist(MSEB, nclass=30, prob=T)
library(KernSmooth)
lines(locpoly(tmp$mids, tmp$density, degree=3, bandw = 0.8), col="blue")
curve(dnorm(x,mean(MSEB), sd(MSEB)), 2,20, add=T, col="red")
### so the MSEs are skew-distributed, not quite normal
#### Picture saved as "NPBootMSEpic.pdf"

> c(mean = mean(MSEB), sd = sd(MSEB))
    mean       sd
6.072114 1.417756      ### MSE quite a bit larger than the ~ 5 suggested by
                       ### parametric bootstrap, and SD larger too
### NOTE that this way of bootstrapping is letting the residual
#### t-distribution express itself better !!

tmp2 = hist(b2hatB, nclass=30, prob=T)
lines(locpoly(tmp2$mids, tmp2$density, deg=3, bandw = 0.8), col="blue")
curve(dnorm(x,mean(b2hatB), sd(b2hatB)),-12,-2, add=T, col="red")
### much closer to normal, but still not quite there

## (II) Bootstrap resampling of residuals*rtwt for fixed X_i
## recall xvec is Dset$x[1:100],  rtwt[1:100] = 1/(0.1+abs(xvec-0.5))^1.5
## use indArr above to resample residuals and then re-create Y's

epsBArr = array((tmp1$resid*rtwt[1:100])[c(indArr)], c(1000,100))
YNPboot = tmp1$fitted + (1/rtwt[1:100])*epsBArr
wgts = Dset$wt[1:100]/sum(Dset$wt[1:100])
avYB2 = c(YNPboot %*% wgts)
xvav = sum(xvec*wgts)
b2hatB2 = c( (outer(rep(1,1000),xvec-xvav)*YNPboot) %*% wgts )/
          sum((xvec-xvav)^2*wgts)
b1hatB2 = avYB2 - b2hatB2*xvav
MSEB2 = c( (b1hatB2 + outer(b2hatB2,xvec) - YNPboot)^2 %*% Ones )
> c(mean(MSEB2), sd(MSEB2))
[1] 5.814071 5.229153      #### Note smaller MSE when X fixed.
### THIS WAY OF BOOTSTRAPPING IMPOSES THE STRUCTURE OF INDEPENDENCE
### of X and epsilon's but the rescaling of tmp1$resid
### by rtwt seems to make the variability of the MSE estimates
### much larger because of the large range of weights.

## (III) Could separately and independently bootstrap X and residuals

auxXB3 = array(sample(xvec,1e5,replace=T), c(1000,100))
YNPboot3 = tmp1$coef[1]+tmp1$coef[2]*auxXB3 + (1/rtwt[1:100])*epsBArr
wtB3 = (0.1+abs(auxXB3-0.5))^(-3)
wtB3 = (1/apply(wtB3,1,sum))*wtB3
avYB3 = apply(wtB3*YNPboot3,1,sum)
avXB3 = apply(wtB3*auxXB3,1,sum)
b2hatB3 = c( ((-avXB3 + auxXB3)*wtB3*YNPboot3) %*% Ones /
             (wtB3*(-avXB3 + auxXB3)^2) %*% Ones )
b1hatB3 = avYB3 - b2hatB3*avXB3
MSEB3 = c( (b1hatB3 + b2hatB3*auxXB3 - YNPboot3)^2 %*% Ones )
> c(mean(MSEB3), sd(MSEB3))
[1] 5.826440 5.249827    ### VERY SIMILAR RESULTS TO THOSE IN (II) ABOVE.
#### The average MSE stays small when
### we incorporate the independence of X's and epsilon's !
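### For reference (an added sketch, not part of the class log; assumes the
### "boot" package is installed): resampling scheme (I) can also be coded with
### boot(), whose statistic argument receives the data and a vector of
### resampled row indices.  msestat and NPboot are new illustrative names.
library(boot)
msestat = function(dat, ind) {
    fit = lm(y ~ x, weights=wt, data=dat[ind,])
    mean(fit$resid^2) }                 ## unweighted MSE of the weighted LS fit
NPboot = boot(Dset[1:100,], msestat, R=1000)
c(mean=mean(NPboot$t), sd=sd(NPboot$t))   ## compare with mean(MSEB), sd(MSEB)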
============================================================= 12/2 WE FINISH THIS EXAMPLE WITH WEIGHTED LEAST-SQUARES (WITH t_{10} ERRORS) BY ASKING AND ANSWERING: WHAT IS THE "RIGHT" ANSWER FOR THE AVERAGE AND SD OF UNWEIGHTED MSE FOR Y-PREDICTORS BASED ON THE DATA-SAMPLES OF SIZE 100 WITH KNOWN VARIANCE-WEIGHTING (AS A FUNCTION OF X_i) for the kind of data (with t_{10} not N(0,1) errors) we were actually working with ??? (a) Conditional for fixed X's at xvec AuxY.MC = 3 - 7*xvec + array(rep(10*(0.1+abs(xvec-0.5))^1.5,1000)* rt(100*1000,10), c(100,1000)) ### Now can look at the MSEs from re-fitting the model 1000 times MSE.MC = numeric(1000) for(i in 1:1000) MSE.MC[i] = mean(lm(AuxY.MC[,i] ~ xvec, weights=Dset$wt[1:100])$resid^2) > summary(sqrt(MSE.MC)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.973 2.711 2.908 2.936 3.145 4.044 > c(mean=mean(MSE.MC), sd = sd(MSE.MC)) mean sd 8.728081 1.977124 (b) UNconditional, with varying (iid) X's AuxX.MC2 = array(runif(1e5), c(100,1000)) RtWt.MC2 = (0.1+abs(AuxX.MC2-0.5))^1.5 AuxY.MC2 = 3 - 7*AuxX.MC2 + 10*RtWt.MC2* array(rt(100*1000,10), c(100,1000)) MSE.MC2 = numeric(1000) for(i in 1:1000) MSE.MC2[i] = mean(lm(AuxY.MC2[,i] ~ AuxX.MC2[,i], weights= RtWt.MC2[,i]^2)$resid^2) > summary(sqrt(MSE.MC2)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.841 2.546 2.775 2.805 3.036 4.497 > c(mean=mean(MSE.MC2), sd = sd(MSE.MC2)) mean sd 7.998849 2.091280
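### Closing summary (an added sketch, not from the class log): the unweighted
### RMSE estimates produced by the different methods in this extended example
### can be collected for a side-by-side look, e.g.
round(sqrt(c(data.split      = mean(SplMSE),
             par.boot.fixX   = mean(MSEboot),
             par.boot.varX   = mean(MSEarr),
             npar.boot.pairs = mean(MSEB),
             MC.true.fixX    = mean(MSE.MC),
             MC.true.varX    = mean(MSE.MC2))), 3)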