Example Logs, STAT 705, Fall 2015
=================================

Part III, November 2015
=========================

11/9/2015   GLM material roughly following log "RlogF09.GLM.txt"

X1 = runif(120);  X2 = rpois(120,3)
ntrials = rep(c(5,6,3), 40)
success = rbinom( 120, ntrials, plogis(-1 + 0.6*X1 + 0.1*X2) )
failure = ntrials - success
simlogis = data.frame( X1, X2, success, failure )

> fitB0 = glm ( cbind(success, failure) ~ X1 + X2, family=binomial,
                data=simlogis )
### NB: within family=binomial, link="logit" is the default,
###     but other possibilities exist, most often "probit" or "cloglog".
### NB: there is another syntax, sometimes more useful, giving EXACTLY THE
###     SAME MODEL FITTED OBJECTS.
> fitBAlt = glm ( I(success/ntrials) ~ X1 + X2, weights=ntrials,
                  family=binomial, data=simlogis )
> sum(abs(fitBAlt$fit - fitB0$fit))
[1] 0
> sum(abs(c(summary(fitBAlt)$coef - summary(fitB0)$coef)))
[1] 0
### OFTEN THE OUTCOME OF USING "weights" ARGUMENTS IN MODEL-FITTING
### FUNCTIONS IN R IS MUCH LESS TRANSPARENT !!

-------------------------------------------------
Following log "GLMlog.R", get data "Data/pbcdata.asc"

> pbcdat = read.table("http://www.math.umd.edu/~evs/s705/Data/pbcdata.asc",
                      header=T)
> names(pbcdat)
 [1] "OBS"     "IDNUM"   "DTH"     "EVTTIME" "TREATGP" "LOGBILI" "AGEVAR"
 [8] "CIRRH"   "CCHOL"   "ALBUMIN"
> ind = pbcdat$DTH==1 | pbcdat$EVTTIME >= 41
Surv41 = as.numeric(pbcdat$EVTTIME < 41)
pbcdat = cbind.data.frame(pbcdat[,3:10], Surv41)[ind,]     ## 172 x 9
> tmpglm = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR,
               data=pbcdat, family = binomial)

### Simulation to introduce "dispersion" in the form of a shared random
### effect within cluster
reff  = runif(172,-2,2)
yrsp1 = rbinom(172, rep(10,172), plogis(tmpglm$lin + reff))
#### 172 y's like the binary outcomes in PBCdata except that they now
##   arise from clusters of 10 with shared "reff".
##   Think of yrsp1[i]/10 = Ybar_{i.} as the "data" for the ith cluster.

> mean(dlogis(tmpglm$lin + reff))   ### E( p_i(1-p_i) ) = 10*E( Var Ybar_{i.} | reff )
[1] 0.132752
> mean((10/9)*.1*yrsp1*(1-0.1*yrsp1))   ### unbiased estimator of E( p_i(1-p_i) ),
[1] 0.1264858                           ### averaged across i.
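As a side check (an added sketch, not from the class log), we can approximate by
Monte Carlo how much the shared random effect inflates the variance of Ybar_{i.}
relative to its binomial value p_i(1-p_i)/10.  The seed and replication count
below are arbitrary choices, and the resulting ratio is only roughly comparable
to the quasibinomial dispersion estimate obtained just below (about 2.5), since
the glm fits there re-estimate the mean.

## Added sketch: approximate overdispersion factor induced by the shared
## random effect (assumes tmpglm from above is still in the workspace)
set.seed(77)
linpred  = tmpglm$lin
Ybar.sim = array(0, c(2000,172))
for(r in 1:2000)
    Ybar.sim[r,] = rbinom(172, 10, plogis(linpred + runif(172,-2,2)))/10
p.marg = apply(Ybar.sim, 2, mean)
mean( apply(Ybar.sim, 2, var) / (p.marg*(1-p.marg)/10) )
### should fall in the general neighborhood of the dispersion estimated below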
glmfitA = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR,
              data=pbcdat, family=binomial)
glmfitB = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR,
              data=pbcdat, family=quasibinomial)
sum(abs(glmfitA$coef-glmfitB$coef))    ### 0
> c(summary(glmfitB)$disp, summary(glmfitA)$disp)
[1] 2.494836 1.000000
> sum(10*(yrsp1/10-glmfitB$fit)^2/(glmfitB$fit*(1-glmfitB$fit)))/(172-4)
[1] 2.494836

=======================================================================
11/11/2015

Small Initial Demonstration of Smoothing Spline
===============================================
> xv = runif(500, 0,2)
fxv = sqrt(plogis(xv + 2.5*xv - 5*xv^2 + 2*xv^3))
yv = fxv + 0.03*rnorm(500)
ordx = order(xv)
plot(xv[ordx], yv[ordx])
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
tmpspl1 = smooth.spline(xv,yv)
lines(xv[ordx], predict(tmpspl1,xv[ordx])$y, col="blue", lwd=3)
> tmpspl1$spar
[1] 0.7934125
> tmpspl2 = smooth.spline(xv,yv, spar=0.01)
lines(xv[ordx], predict(tmpspl2,xv[ordx])$y, col="green", lwd=3)

plot(xv[ordx], yv[ordx])
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
tmpspl3 = smooth.spline(xv,yv, spar=1)
lines(xv[ordx], predict(tmpspl3,xv[ordx])$y, lwd=3)

yv2 = fxv + 0.001*rnorm(500)
plot(xv[ordx], yv2[ordx])
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
tmpspl4 = smooth.spline(xv,yv)
lines(xv[ordx], predict(tmpspl4,xv[ordx])$y, col="blue", lwd=2)
summary(fxv - predict(tmpspl4,xv)$y)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-6.230e-03 -3.092e-03 -5.708e-05 -7.092e-04  1.010e-03  6.539e-03

>>>>> Continue with other smoothing and nonparametric regression techniques

nprgfram = cbind.data.frame(x=xv, y=yv)
> plot(nprgfram$x, nprgfram$y, main="Scatterplot of Raw Data, n=500")
for(i in 1:8) abline(v=i*.2, lty=3)
for(i in 1:9) {
    inds <- (1:500)[abs(nprgfram$x-.2*i+.1) < 0.1]
    itmp <- order(nprgfram$x[inds])
    lines(nprgfram$x[inds[itmp]], lm(y ~ x, data=nprgfram[inds,])$fit[itmp],
          lty=1, lwd=2) }
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)

### Now consider a lowess fit:
> plot(nprgfram$x, nprgfram$y, main="Raw Data with Lowess Line, n=500")
tmplow = lowess(nprgfram$x, nprgfram$y)
lines(tmplow$x, tmplow$y, lty=1, lwd=2)
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)

### Local polynomial regression, from library KernSmooth:
###   locpoly(x, y, drv = 0, degree, kernel = "normal", bandwidth,
###           gridsize = 401L, bwdisc = 25, range.x, binned = FALSE,
###           truncate = TRUE)
library(KernSmooth)
tmppoly = locpoly(xv, yv, degree=3, bandwidth=0.1)
plot(tmppoly)
points(nprgfram$x, nprgfram$y)
curve(sqrt(plogis(x + 2.5*x - 5*x^2 + 2*x^3)), 0, 2, add=T, col="red", lwd=2)
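The bandwidth 0.1 above was chosen by eye.  As a small added sketch (not part of
the class log), KernSmooth also provides a direct plug-in bandwidth selector,
dpill(), for the local-linear (degree=1) case, and the resulting fit can be
overlaid for comparison:

## Added sketch: data-driven plug-in bandwidth, local-LINEAR fit
h.pi = dpill(xv, yv)
h.pi                                    ### the selected bandwidth
tmppoly1 = locpoly(xv, yv, degree=1, bandwidth=h.pi)
lines(tmppoly1, col="purple", lwd=2)    ### overlay on the locpoly plot above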
### DISCUSSION:  USE OF SPLINES FOR NUMERICAL APPROXIMATION

11/18/15
## Consider the problem of sampling from a specified distribution,
## by the probability integral transform.
## Let's do this with t_5.
xv = c((1:500)/10, 10 + (1:20)/2)
xv = c(-rev(xv), 0, xv)
> pt(xv[1],5)
[1] 2.887758e-06
> yv = pt(xv,5)
curve(log(pt(x,5)), -5,5)
tmpy = smooth.spline(log(yv), xv)
plot((1:999)/1000, predict(tmpy, log((1:999)/1000))$y, typ="l")
curve(qt(x,5), .001,.999, add=T, col="red")

### Now to simulate 100,000 of them using the spline-fitted function.
### Note that the evaluation of the fitted function is quick:
unix.time({rant = predict(tmpy, log(runif(1e5)))$y})
   user  system elapsed
   0.03    0.00    0.03
hist(rant, prob=T, nclass=128, xlim=c(-6,6))
curve(dt(x,5), -6,6, add=T, col="green")
### I will ask you to do this in a more difficult case, for an exercise.

### SPLINE BASIS EXAMPLE
## Start with a fairly complicated curve
mixran = function(x, pts, seednum) {
     set.seed(seednum)
     coef = 10*(runif(length(pts))-0.5)
     structure(c(outer(x, pts, function(x,y) dnorm(x-y)) %*% coef),
               seed=seednum, coef=coef) }
xv = seq(-5,5, by=0.1)
mixran(xv[1:5], c(-1,0.5,2), 333)
[1] -4.461042e-05 -6.632645e-05 -9.765776e-05 -1.424013e-04 -2.056501e-04
attr(,"seed")
[1] 333
attr(,"coef")
[1] -0.3299934 -4.1540185  4.7348527

plot(xv, mixran(xv, c(-1,0.5,2), 333))
yv3 = mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333) + rnorm(101,0,0.8)
> library(splines)
library(KernSmooth)
mat3 = bs(xv, knots=seq(-4,4,by=2), degree=3)    ### dim = length(knots)+degree
dim(mat3)     ### 101 x 8
fity = lm(yv3 ~ . , data=cbind.data.frame(yv3,mat3))
plot(xv, yv3)
lines(xv, mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333), col="red")
points(xv, fity$fit, pch=5)
### What if we took sparser knots ?
mat4 = bs(xv, knots=c(-3,0,3), degree=3)
points(xv, lm(yv3 ~ . , data=cbind.data.frame(yv3,mat4))$fit, pch=18)

==================================================================
Illustration of Data Replication Methods
========================================

11/23/2015   DATA-SPLITTING, PARAMETRIC BOOTSTRAP, & BAYES POSTERIOR
             PREDICTIVE SIMULATION

## Data illustration
set.seed(5)
x = runif(200)
y = 3 - 7*x + 10*(0.1+abs(x-.5))^1.5*rt(200,10)
rtwt = 1/(0.1+abs(x-.5))^1.5
Dset = cbind.data.frame(y, x, wt=rtwt^2)
> summary(rtwt^2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  4.760   9.806  22.580  91.930  90.610 761.900
x1st = rtwt
x2st = x*rtwt
yst = y*rtwt
Dset2 = cbind.data.frame(yst, x1st, x2st)
AuxY = array(0, c(1000,200))

> tmp1 = lm(y~x, data=Dset[1:100,], weights=wt)
## NOTE: the S3 method of predict() for class 'lm' has usage
##   predict(object, newdata, se.fit = FALSE, scale = NULL, df = Inf,
##           interval = c("none", "confidence", "prediction"), level = 0.95,
##           type = c("response", "terms"), terms = NULL, na.action = na.pass,
##           pred.var = res.var/weights, weights = 1, ...)
tmp2 = predict(tmp1, Dset[1:100,], interval="none", type="response",
               weights=Dset$wt[1:100])
> summary(tmp2-tmp1$fit)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
0.000e+00 8.882e-16 1.332e-15 1.578e-15 1.776e-15 2.642e-14

NOTE THAT IN ALL OF THE COMPUTATIONAL METHODS STUDIED IN THIS EXTENDED
EXAMPLE, WE COMPUTE THE (unweighted) RMSE FOR Y-PREDICTIONS BUT DO THE
MODEL-FITTING BY A WEIGHTED LEAST-SQUARES METHOD.
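An added aside (a sketch, not in the class log): the rescaled data frame Dset2
built above is the "pre-whitened" form of this regression, so weighted least
squares of y on x with weights rtwt^2 is exactly ordinary (unweighted) least
squares of yst on x1st and x2st with no intercept.  A quick check, assuming
tmp1 and Dset2 as defined above:

## Added sketch: WLS with weights rtwt^2  ==  OLS on the rescaled variables
tmp1.alt = lm(yst ~ x1st + x2st - 1, data=Dset2[1:100,])
sum(abs(coef(tmp1.alt) - coef(tmp1)))    ### essentially 0, up to rounding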
Data-splitting method:  50-50 splits for RMSE
---------------------
TstDat = TrnDat = array(0, c(1000,100))
TrnMSE = SplMSE = numeric(1000)
for(i in 1:1000) {
    ind = sample(1:200, 100)
    TrnDat[i,] = setdiff(1:200, ind)
    tmp = lm(y ~ x, weights=wt, data=Dset[TrnDat[i,],])
    TrnMSE[i] = mean(tmp$resid^2)
    SplMSE[i] = mean((Dset$y[ind] - predict(tmp, Dset[ind,], interval="none",
                       type="response", weights=Dset$wt[ind]))^2) }
> summary(sqrt(SplMSE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.898   2.421   2.559   2.552   2.694   3.137
> summary(sqrt(TrnMSE))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.809   2.371   2.514   2.508   2.652   3.034

IT IS NO SURPRISE THAT THE TrnMSE VALUES ARE SMALLER THAN THE TEST-DATA
SplMSE VALUES, ALTHOUGH HERE THE DIFFERENCE IS SMALL.  AS WE REMARKED IN
CLASS, THIS METHOD OF PSEUDO-SAMPLE REPLICATION GENERATES X'S AND Y'S
TOGETHER.

"Parametric Bootstrap"
----------------------
## generate residuals anew from N(0, sighat)
## "predictors" kept from tmp1 with
xvec = x[1:100]
wts  = Dset$wt[1:100]
AuxY = tmp1$fitted + array(rnorm(100*1000, 0,
           rep((0.1+abs(xvec-0.5))^1.5*summary(tmp1)$sig, 100)), c(100,1000))
### Now can look at the MSEs from re-fitting the model 1000 times
MSEboot = numeric(1000)
for(i in 1:1000) MSEboot[i] = mean(lm(AuxY[,i] ~ xvec, weights=wts)$resid^2)
> summary(sqrt(MSEboot))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.651   2.218   2.361   2.364   2.502   3.107

THIS FIRST WAY OF DOING THE PARAMETRIC BOOTSTRAP GENERATES ONLY Y'S FOR A
FIXED SET OF X'S.  YOU COULD SAY THAT WHAT WE GENERATE IN THIS WAY IS A SET
OF "CONDITIONAL" PSEUDO-SAMPLES.  THAT IS, THE VARIABILITY REPLICATED HERE
IS STRICTLY THAT DUE TO THE CONDITIONAL DENSITY OF Y GIVEN X.

NEXT LET'S DO A SLIGHTLY MORE ELABORATE PARAMETRIC-BOOTSTRAP SIMULATION IN
WHICH X'S AND Y'S ARE RE-GENERATED.  BUT I PROPOSE, AS I MENTIONED IN CLASS,
A SIMULATION DESIGN IN WHICH WE CAN SEPARATE THE EFFECTS ON MSE OF THE
ESTIMATOR OF DIFFERENT SETS OF X'S FROM THE CONDITIONAL SAMPLING VARIABILITY
OF Y'S GIVEN FIXED X'S.

We choose to simulate B=100 different X-vectors X^{(b)}, and then for each
one, R=1000 independently generated Y-vectors Y^{(b,r)} from the conditional
distribution of Y given X = X^{(b)}.

Xarr = array(runif(1e4), c(100,100))     ### index (b,i)
Yarr = array(0, c(1000,100,100))
## indices (r,b,i):  r indexes the y-vector replicate within fixed X^{(b)},
## and i=1:100 indexes data points within each batch of n=100.
abhat  = tmp1$coef
sighat = summary(tmp1)$sig
epsarr = array(0, c(100,1000))
Wtarr = (0.1+abs(Xarr-0.5))^(-1.5)
Wtarr = (1/apply(Wtarr,1,sum))*Wtarr
for(b in 1:100) {
    epsarr[,] = rnorm(1e5)
    Yarr[,b,] = t( (abhat[1] - abhat[2]*Xarr[b,]) +
                   (sighat*(0.1+abs(Xarr[b,]-0.5))^1.5)*epsarr ) }

## Now we have the large arrays of Y's and X's, and we need to vectorize the
## weighted least squares estimates in order to make the computations quick.
Ypred = replace(Yarr, T, 0)      ### array like Yarr filled with 0's
Xmn = apply(Wtarr*Xarr, 1, sum)
XctrArr = -Xmn + Xarr
SXXsq = apply(Wtarr*XctrArr^2, 1, sum)
auxmat = array(outer(rep(1,1000), Wtarr*XctrArr)*Yarr, c(1000*100,100))
b2hatarr = array( c( auxmat %*% rep(1,100) )/
                  rep(SXXsq, rep(1000,100)), c(1000,100))    ### 1000 x 100  (r,b)
b1hatarr = apply(outer(rep(1,1000),Wtarr)*Yarr, 1:2, sum) - t( Xmn*t(b2hatarr) )
for(b in 1:100) Ypred[,b,] = b1hatarr[,b] + outer(b2hatarr[,b], Xarr[b,])
MSEarr  = apply( (Yarr-Ypred)^2, 1:2, mean)     ### 1000 x 100  (r,b)
Biasarr = apply( Ypred - Yarr,   1:2, mean)
> summary(c(Biasarr))
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-0.6123000 -0.0921800 -0.0005000 -0.0004574  0.0912800  0.5894000
### Biases are small but not 0.  They would be 0 if the averages of
### prediction errors within sample were taken weighted by Wtarr.
### All of this was quick.
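A sketch added here (not in the class log) to spot-check the vectorized weighted
least squares above against a direct lm() call for a single replicate; the
weights mirror Wtarr, i.e. they are proportional to (0.1+abs(x-0.5))^(-1.5).

## Added sketch: check the vectorized WLS coefficients for replicate (r,b)=(1,1)
r = 1;  b = 1
chk = lm(Yarr[r,b,] ~ Xarr[b,], weights = (0.1+abs(Xarr[b,]-0.5))^(-1.5))
rbind(vectorized = c(b1hatarr[r,b], b2hatarr[r,b]), lm.fit = coef(chk))
### the two rows should agree up to rounding error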
Now we have an array of MSEs for all (r,b), and we break the MSE into 3
parts, each of which we estimate via simulation:

###  MSE( Ypred_i^{(r,b)} )  =  [ E( Y_i^{(r,b)} - Ypred_i^{(r,b)} ) ]^2
###        + E( Var( Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.] ) )
###        + Var( E( Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.] ) )

## First term is bias^2, which we estimate from Biasarr^2:
> mean(c(Biasarr^2))
[1] 0.01865613
> mean( apply((Yarr - Ypred)^2, 2, mean) )
[1] 5.081514
> sd( apply((Yarr - Ypred)^2, 2, mean) )
[1] 0.5081083
> var( c(Yarr-Ypred))
[1] 5.081514

DID WE LEARN ANYTHING EXTRA BY DOING THE SIMULATION BY BLOCKS b ?  WE FOUND
OUT HOW VARIABLE OUR OVERALL ESTIMATED ANSWER WAS, DUE TO SAMPLING VARIATION
FROM ONE SET OF X'S TO ANOTHER.

HOW DID THIS SIMULATION COMPARE WITH THE DATA-SPLITTING MSE ?  THE RMSE WE
FOUND THERE WAS 2.508, WHILE WE JUST FOUND THE PARAMETRIC-BOOTSTRAP RMSE
TO BE
> sqrt(5.081514)
[1] 2.254221

THE RMSE FOUND FROM THE PARAMETRIC BOOTSTRAP WITH A SINGLE FIXED Xvec WAS
2.364, but since that was for just a single Xvec, and we now know from the
two-level (r and b) simulation that the variance of the MSE estimator was
(0.5081)^2, these different parametric-bootstrap estimators (one with fixed
X's, the other with varying X's) are within range of one another.

NOTE HOWEVER THAT WE MAY ALSO GUESS THAT THE DATA-SPLITTING MSE IS LARGER
BECAUSE IT TAKES INTO ACCOUNT THE ACTUAL DATA (200 DATA-PAIRS), WHICH DID
NOT PRECISELY SATISFY THE MODEL ASSUMED BY THE PARAMETRIC BOOTSTRAP.

BAYES posterior predictive sampling
-----------------------------------
We continue with the same model as above, but define a standard conjugate
prior for the parameters theta = (b1, b2, sigma^2) within a normal-errors
model for the data with known weight-pattern (i.e. the standard deviation of
y given x is known to be proportional to (0.1+abs(x-.5))^1.5).  We will
continue with that model as we did for the parametric bootstrap, although
recall that it is slightly misspecified because the errors were actually
t_{10} distributed rather than normal.

### We want to define a prior conjugate for the model
###     y ~ N( b1 + b2*x, (sig*(0.1+abs(x-0.5))^1.5)^2 )
## For this, could take b1, b2 ~ indep N(0, sig0^2), sig0 large and known,
## say 5, and 1/sig^2 ~ Gamma(1, lam0) for small lam0, say 0.1.
## Then with tau = 1/sig^2 and weights w_i = (0.1+abs(x_i-0.5))^(-3), the
## posterior is proportional to
##     tau^(n/2) * exp( -0.02*(b1^2+b2^2) - lam0*tau
##                      - 0.5*tau*sum_i w_i*(y_i - b1 - b2*x_i)^2 )
## This is conjugate (posterior of the same form as prior) for (b1,b2)
## if sig had been known and fixed, and for tau or sig^2 if b1, b2 were
## known and fixed.  But jointly, it is not conjugate.
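Although the joint posterior is not conjugate, its kernel is simple to write
down.  Here is an added sketch (not in the class log) of the unnormalized
log-posterior implied by the prior above, with sig0 = 5 and lam0 = 0.1 and the
known weights wt = rtwt^2 for the first 100 observations; it could be handed to
an optimizer or a sampler.

## Added sketch: unnormalized log-posterior of (b1, b2, tau),  tau = 1/sig^2
logpost = function(b1, b2, tau, y=Dset$y[1:100], x=Dset$x[1:100],
                   w=Dset$wt[1:100], sig0=5, lam0=0.1)
    (length(y)/2)*log(tau) - (b1^2+b2^2)/(2*sig0^2) - lam0*tau -
        0.5*tau*sum(w*(y - b1 - b2*x)^2)
### e.g.  logpost(tmp1$coef[1], tmp1$coef[2], 1/summary(tmp1)$sig^2)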
NOTE:  there is a Theorem (Bernstein-von Mises) which says that in large iid
samples the posterior is approximately normal, centered at the MLE and with
variance (like the MLE!) equal to the inverse of the Fisher Information.

Here the Fisher Information from the fitted model is obtained as
> Infmat = array(c(sum(Dset$wt), sum(Dset$wt*Dset$x), 0,
                   sum(Dset$wt*Dset$x), sum(Dset$wt*Dset$x^2), 0,
                   0, 0, nrow(Dset)/sighat^2)/sighat^2, c(3,3))
> round(solve(Infmat), 4)
        [,1]    [,2]    [,3]
[1,]  0.0840 -0.1613  0.0000
[2,] -0.1613  0.3267  0.0000
[3,]  0.0000  0.0000 32.0117

##  mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE)
## Approximate Bayesian idea for simulating new-data pseudo-samples:
##  (a) draw multivariate normal
##        (b1H, b2H, sigH^2) ~ N( (b1hat, b2hat, sighat^2), solve(Infmat) )
##  (b) generate x ~ Unif(0,1) and y given x as
##        N( b1H + b2H*x, {sigH*(0.1+abs(x-0.5))^1.5}^2 )
## HOWEVER: use the Delta method on the transformed parameter nu = log(sig^2)
## to force the simulated variance values to be positive !

NB.  We can get an idea of the degree of (NON-)normality of the sighat
distribution by using the array of Yarr - Ypred = Yresid values to form a
1000 x 100 (r,b) array of parametric-bootstrap Monte Carlo simulated
sighat^2 values, as follows:

> library(MASS)
###  mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE)
> dim(Yarr)
[1] 1000  100  100
> sghatsqArr = apply( (Yarr-Ypred)^2, 1:2, mean)
> dim(sghatsqArr)
[1] 1000  100
> hist( sghatsqArr[,2], nclass=50, prob=T)
> hist( sghatsqArr, nclass=50, prob=T)
> curve(dnorm(x, mean(sghatsqArr), sd(sghatsqArr)), 2,12, add=T, col="green")
#### definitely off !!!
### Try to overplot a gamma with matching 1st two moments instead
> beta = mean(c(sghatsqArr))/var(c(sghatsqArr))
alph = beta*mean(c(sghatsqArr))
curve(dgamma(x, alph, beta), 2,12, add=T, col="blue")

##----------------------------------------------------------
> c(coef = tmp1$coef, sigsq = summary(tmp1)$sig^2)
coef.(Intercept)           coef.x            sigsq
        2.282569        -5.560408        80.014571
### "True" b1, b2 values from the simulation were 3, -7.
### When sig^2 = 80, obtain the "theoretical" MSE as the expectation
###     E[ 80*(t_{10})^2*(0.1+abs(U-0.5))^3 ] = 80*1.25*0.06475 = 6.475
### derived from
> integrate(function(x) x^2*dt(x,10), -Inf,Inf)$val
[1] 1.25
> integrate(function(x) (0.1+abs(x-0.5))^3, 0,1)$val
[1] 0.06475

##>>>===========================================   Continuation 11/30/2015

USE THE PRECEDING NON-NORMALITY OF sighatsq TO ARGUE THAT WE SHOULD NOT
SIMPLY USE THE BERNSTEIN-VON MISES THEOREM TO SAMPLE FROM A MULTIVARIATE
NORMAL AS A SHORTCUT FOR THE BAYESIAN POSTERIOR, BUT RATHER APPROXIMATE THE
POSTERIOR BY A MULTIVARIATE NORMAL IN (b1,b2) ALONG WITH AN INDEPENDENT
GAMMA (motivated by the form of the information matrix) FOR SIGSQ.

> rhovec = numeric(100)
> for(i in 1:100) rhovec[i] = cor(b1hatarr[,i], b2hatarr[,i])
> hist(rhovec)
> summary(rhovec)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
 -0.9807  -0.9716  -0.9673  -0.9662  -0.9617  -0.9464
#### STRONG NEGATIVE CORRELATION BETWEEN B1, B2
> var(cbind(c(b1hatarr), c(b2hatarr)))
           [,1]       [,2]
[1,]  0.2207807 -0.4124677
[2,] -0.4124677  0.8253712
### corr = -0.966
> c(alpha=alph, beta=beta)     ### gamma parameters for sigsq
    alpha      beta
20.538710  4.041849
### mean here is 5.0815, while
> mean((Dset$y[1:100]-tmp1$fit)^2)
[1] 6.153456
#### Again, the actual data show greater variance, which is partly due to
#### the true t_10 errors.
#### Using Infmat^{-1} above for the variance would suggest sd's
> round(sqrt(diag(solve(Infmat))), 4)
[1] 0.2898 0.5716 5.6579
### and b1hat, b2hat correlation
> aux = solve(Infmat);  aux[1,2]/sqrt(aux[1,1]*aux[2,2])
[1] -0.9737398
### NOTE that sampling from this posterior would also suffer from
### the same parametric-model misspecification ?!

## We would be sampling from the neighborhood
##     b2 in -5.560 +/- 1.96*0.572,
##     b1 approx. = ybar - b2*xbar,
##     sigsq approx. Gamma(17.68, 3.04)
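Here is one concrete way to carry out the posterior-predictive recipe just
described; this is an added sketch with assumptions, not the instructor's code.
It draws (b1,b2) from the bivariate normal with the (b1,b2) block of
solve(Infmat) as its covariance, draws the variance parameter sig^2 from a
Gamma whose first two moments match the information-based approximation
(mean sighat^2, variance solve(Infmat)[3,3]) so that simulated variances stay
positive, and then generates new (x,y) data and refits.  The moment-matching
choice is for illustration only; its parameterization differs from the
Gamma(17.68, 3.04) quoted just above.

## Added sketch: approximate Bayes posterior-predictive pseudo-samples and MSEs
library(MASS)
Vb      = solve(Infmat)
sigsq.h = summary(tmp1)$sig^2
rate.s  = sigsq.h / Vb[3,3]             ### gamma rate  = mean/variance
shape.s = rate.s * sigsq.h              ### gamma shape = rate*mean
MSE.bayes = numeric(1000)
for(r in 1:1000) {
    bH   = mvrnorm(1, mu=tmp1$coef, Sigma=Vb[1:2,1:2])
    sigH = sqrt(rgamma(1, shape=shape.s, rate=rate.s))
    xnew = runif(100)
    ynew = bH[1] + bH[2]*xnew + sigH*(0.1+abs(xnew-0.5))^1.5*rnorm(100)
    MSE.bayes[r] = mean(lm(ynew ~ xnew,
                           weights=(0.1+abs(xnew-0.5))^(-3))$resid^2) }
### summary(sqrt(MSE.bayes)) is then comparable with the data-splitting and
### parametric-bootstrap RMSEs found earlier.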
##>=====================================================

NONPARAMETRIC BOOTSTRAP      11/30/2015

IS THERE AN IDEA THAT LETS US LEARN ABOUT THE PERFORMANCE OF THIS KIND OF
DATA ANALYSIS WITHOUT ASSUMING WE KNOW THE CORRECT (X,Y) MODEL ?
Yes, the NONPARAMETRIC BOOTSTRAP !

## (I)  Bootstrap resampling of the iid pairs (X_i, Y_i)
## Assume the only data we have is Dset[1:100,]  (including weights).
indArr = array(sample(1:100, 1e5, replace=T), c(1000,100))
#### corrected the next statements to reflect weighted least-squares
auxXB = array(Dset$x[c(indArr)], c(1000,100))
auxYB = array(Dset$y[c(indArr)], c(1000,100))
Ones = rep(1/100, 100)
wtB = (0.1+abs(auxXB-0.5))^(-3)
wtB = (1/apply(wtB,1,sum))*wtB
avXB = apply(wtB*auxXB, 1, sum)
avYB = apply(wtB*auxYB, 1, sum)
b2hatB = c( ((-avXB + auxXB)*wtB*auxYB) %*% Ones /
            (wtB*(-avXB + auxXB)^2) %*% Ones )
b1hatB = avYB - b2hatB*avXB
MSEB = c( (b1hatB + b2hatB*auxXB - auxYB)^2 %*% Ones )
tmp = hist(MSEB, nclass=30, prob=T)
library(KernSmooth)
lines(locpoly(tmp$mids, tmp$density, degree=3, bandw = 0.8), col="blue")
curve(dnorm(x, mean(MSEB), sd(MSEB)), 2,20, add=T, col="red")
### so the MSEs are skew-distributed, not quite normal
#### Picture saved as "NPBootMSEpic.pdf"
> c(mean = mean(MSEB), sd = sd(MSEB))
    mean       sd
6.072114 1.417756
### MSE quite a bit larger than the ~ 5 suggested by the parametric
### bootstrap, and SD larger too.
### NOTE that this way of bootstrapping is letting the residual
#### t-distribution express itself better !!

tmp2 = hist(b2hatB, nclass=30, prob=T)
lines(locpoly(tmp2$mids, tmp2$density, deg=3, bandw = 0.8), col="blue")
curve(dnorm(x, mean(b2hatB), sd(b2hatB)), -12,-2, add=T, col="red")
### much closer to normal, but still not quite there

## (II)  Bootstrap resampling of residuals*rtwt for fixed X_i
## recall xvec is Dset$x[1:100],  rtwt[1:100] = 1/(0.1+abs(xvec-0.5))^1.5
## use indArr above to resample residuals and then re-create Y's
epsBArr = array((tmp1$resid*rtwt[1:100])[c(indArr)], c(1000,100))
YNPboot = tmp1$fitted + (1/rtwt[1:100])*epsBArr
wgts = Dset$wt[1:100]/sum(Dset$wt[1:100])
avYB2 = c(YNPboot %*% wgts)
xvav = sum(xvec*wgts)
b2hatB2 = c( (outer(rep(1,1000), xvec-xvav)*YNPboot) %*% wgts ) /
          sum((xvec-xvav)^2*wgts)
b1hatB2 = avYB2 - b2hatB2*xvav
MSEB2 = c( (b1hatB2 + outer(b2hatB2,xvec) - YNPboot)^2 %*% Ones )
> c(mean(MSEB2), sd(MSEB2))
[1] 5.814071 5.229153       #### Note smaller MSE when X is fixed.
### THIS WAY OF BOOTSTRAPPING IMPOSES THE STRUCTURE OF INDEPENDENCE of X and
### the epsilon's, but the rescaling of tmp1$resid by rtwt seems to make the
### variability of the MSE estimates much larger, because of the large range
### of weights.

## (III)  Could separately and independently bootstrap X and residuals
auxXB3 = array(sample(xvec, 1e5, replace=T), c(1000,100))
YNPboot3 = tmp1$coef[1] + tmp1$coef[2]*auxXB3 + (1/rtwt[1:100])*epsBArr
wtB3 = (0.1+abs(auxXB3-0.5))^(-3)
wtB3 = (1/apply(wtB3,1,sum))*wtB3
avYB3 = apply(wtB3*YNPboot3, 1, sum)
avXB3 = apply(wtB3*auxXB3, 1, sum)
b2hatB3 = c( ((-avXB3 + auxXB3)*wtB3*YNPboot3) %*% Ones /
             (wtB3*(-avXB3 + auxXB3)^2) %*% Ones )
b1hatB3 = avYB3 - b2hatB3*avXB3
MSEB3 = c( (b1hatB3 + b2hatB3*auxXB3 - YNPboot3)^2 %*% Ones )
> c(mean(MSEB3), sd(MSEB3))
[1] 5.826440 5.249827
### VERY SIMILAR RESULTS TO THOSE IN (II) ABOVE.
#### The average MSE stays small when we incorporate the independence of the
#### X's and the epsilon's !
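The case-resampling scheme in (I) can also be run through the standard boot
package (shipped with R); this added sketch (not in the class log) computes the
same unweighted-MSE statistic for each resample.

## Added sketch: case-resampling bootstrap of the MSE via the boot package
library(boot)
mse.stat = function(dat, ind) {
    fit = lm(y ~ x, data=dat[ind,], weights=wt)
    mean(fit$resid^2) }                  ### unweighted MSE of the weighted-LS fit
bootMSE = boot(Dset[1:100,], mse.stat, R=1000)
c(mean(bootMSE$t), sd(bootMSE$t))        ### should be close to (mean, sd) of MSEB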
=============================================================
12/2

WE FINISH THIS EXAMPLE WITH WEIGHTED LEAST-SQUARES (WITH t_{10} ERRORS) BY
ASKING AND ANSWERING:  WHAT IS THE "RIGHT" ANSWER FOR THE AVERAGE AND SD OF
THE UNWEIGHTED MSE FOR Y-PREDICTORS BASED ON DATA-SAMPLES OF SIZE 100 WITH
KNOWN VARIANCE-WEIGHTING (AS A FUNCTION OF X_i), for the kind of data (with
t_{10}, not N(0,1), errors) we were actually working with ???

(a)  Conditional, for fixed X's at xvec

AuxY.MC = 3 - 7*xvec + array(rep(10*(0.1+abs(xvec-0.5))^1.5, 1000)*
                             rt(100*1000,10), c(100,1000))
### Now can look at the MSEs from re-fitting the model 1000 times
MSE.MC = numeric(1000)
for(i in 1:1000) MSE.MC[i] =
     mean(lm(AuxY.MC[,i] ~ xvec, weights=Dset$wt[1:100])$resid^2)
> summary(sqrt(MSE.MC))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.973   2.711   2.908   2.936   3.145   4.044
> c(mean=mean(MSE.MC), sd = sd(MSE.MC))
    mean       sd
8.728081 1.977124

(b)  UNconditional, with varying (iid) X's

AuxX.MC2 = array(runif(1e5), c(100,1000))
RtWt.MC2 = (0.1+abs(AuxX.MC2-0.5))^1.5
AuxY.MC2 = 3 - 7*AuxX.MC2 + 10*RtWt.MC2*array(rt(100*1000,10), c(100,1000))
MSE.MC2 = numeric(1000)
for(i in 1:1000) MSE.MC2[i] =
     mean(lm(AuxY.MC2[,i] ~ AuxX.MC2[,i], weights=RtWt.MC2[,i]^(-2))$resid^2)
> summary(sqrt(MSE.MC2))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.841   2.546   2.775   2.805   3.036   4.497
> c(mean=mean(MSE.MC2), sd = sd(MSE.MC2))
    mean       sd
7.998849 2.091280
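To connect these Monte Carlo answers back to the true generating model, here is
a closing check (an added sketch, not in the class log).  Under
y = 3 - 7*x + 10*(0.1+abs(x-0.5))^1.5 * t_10 with x ~ Unif(0,1), the
unconditional error variance, which the unweighted residual MSE estimates up to
a factor of about (n-2)/n, is

## Added check: theoretical unconditional error variance under the true model
100 * integrate(function(x) x^2*dt(x,10), -Inf,Inf)$val *
      integrate(function(x) (0.1+abs(x-0.5))^3, 0,1)$val
### = 100*1.25*0.06475 = 8.094,  in reasonable agreement with the Monte Carlo
### means 8.73 (for that particular fixed xvec) and 8.00 (freshly drawn X's).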