Example Logs, STAT 705, Fall 2015 9/2/2015 > tmp = sin(0.354 + 2*pi*(1:500)/10) > mat = array(tmp,c(10,50)) > apply(mat,1,var) [1] 5.927550e-29 4.168184e-31 9.749828e-29 3.043804e-28 2.945958e-28 [6] 1.140571e-28 4.980691e-31 9.790227e-29 1.896126e-28 2.672093e-28 > tmp1 = rep(c(1,3),500) > tmp2 = c(rbind(rep(1,500),rep(3,500))) > sum(abs(tmp1-tmp2)) [1] 0 > tmp3 = replace(rep(1,1000), 1:1000 %% 2 == 0, 3) > sum(abs(tmp3-tmp1)) [1] 0 > tmp4 = ifelse(1:1000 %% 2 == 0, 3,1) > sum(abs(tmp4-tmp1)) [1] 0 > toss = rbinom(10,1,.5) > cond = as.logical(toss) > cond [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE > replace(1:10,cond,101:105) [1] 1 2 101 4 102 103 104 105 9 10 > ifelse(cond,101:105,1:10) [1] 1 2 103 4 105 101 102 103 9 10 > sum(abs( (1:10)[toss==1] - which(cond) )) [1] 0 toss + array(rep(1:10,5), c(10,5)) [,1] [,2] [,3] [,4] [,5] [1,] 1 1 1 1 1 [2,] 2 2 2 2 2 [3,] 4 4 4 4 4 [4,] 4 4 4 4 4 [5,] 6 6 6 6 6 [6,] 7 7 7 7 7 [7,] 8 8 8 8 8 [8,] 9 9 9 9 9 [9,] 9 9 9 9 9 [10,] 10 10 10 10 10 outer(toss+1:10,rep(1,5),"*") [,1] [,2] [,3] [,4] [,5] [1,] 1 1 1 1 1 [2,] 2 2 2 2 2 [3,] 4 4 4 4 4 [4,] 4 4 4 4 4 [5,] 6 6 6 6 6 [6,] 7 7 7 7 7 [7,] 8 8 8 8 8 [8,] 9 9 9 9 9 [9,] 9 9 9 9 9 [10,] 10 10 10 10 10 > attributes(mat) $dim [1] 10 50 > fr = data.frame(mat) > attributes(fr) $names [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" [13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24" [25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36" [37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" [49] "X49" "X50" $row.names [1] 1 2 3 4 5 6 7 8 9 10 $class [1] "data.frame" > class(mat) [1] "matrix" > length(mat) [1] 500 > length(fr) [1] 50 > outer( 1:5, 4:6, "*") [,1] [,2] [,3] [1,] 4 5 6 [2,] 8 10 12 [3,] 12 15 18 [4,] 16 20 24 [5,] 20 25 30 > tmp = outer(1:6, 1:6, function(x,y) x<= y) tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] TRUE TRUE TRUE TRUE TRUE TRUE [2,] FALSE TRUE TRUE TRUE TRUE TRUE [3,] FALSE FALSE TRUE TRUE TRUE TRUE [4,] FALSE FALSE FALSE TRUE TRUE TRUE [5,] FALSE FALSE FALSE FALSE TRUE TRUE [6,] FALSE FALSE FALSE FALSE FALSE TRUE > 1*tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 1 1 1 1 1 [2,] 0 1 1 1 1 1 [3,] 0 0 1 1 1 1 [4,] 0 0 0 1 1 1 [5,] 0 0 0 0 1 1 [6,] 0 0 0 0 0 1 > class(tmp[1:36]) = "numeric" tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 1 1 1 1 1 [2,] 0 1 1 1 1 1 [3,] 0 0 1 1 1 1 [4,] 0 0 0 1 1 1 [5,] 0 0 0 0 1 1 [6,] 0 0 0 0 0 1 9/9/2015 > help(data.frame) Description This function creates data frames, tightly coupled collections of variables which share many of the properties of matrices and of lists, used as the fundamental data structure by most of R's modeling software. Usage data.frame(..., row.names = NULL, check.rows = FALSE, check.names = TRUE, stringsAsFactors = default.stringsAsFactors()) ... > fac1 = factor(tmp1) > attributes(fac1) $levels [1] "1" "3" $class [1] "factor" > fac1[1:20] [1] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 Levels: 1 3 > as.character(fac1)[1:10] [1] "1" "3" "1" "3" "1" "3" "1" "3" "1" "3" > as.numeric(fac1)[1:20] [1] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 > cond [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE > as.numeric(factor(cond)) [1] 1 1 2 1 2 2 2 2 1 1 > as.numeric(cond) > as.numeric(cond) [1] 0 0 1 0 1 1 1 1 0 0 > library() #### shows all available packages ... 
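### Side note (an added sketch, not part of the original class session):
### as.numeric() applied to a factor returns the internal level codes, as seen
### just above with fac1 and factor(cond).  To recover the original numeric
### values stored in fac1, convert through character first:
as.numeric(as.character(fac1))[1:10]   ## gives 1 3 1 3 ... rather than the codes 1 2 1 2 ...
## or, equivalently (and faster on very long factors):
as.numeric(levels(fac1))[fac1][1:10]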
> library(foreign) > search() [1] ".GlobalEnv" "package:foreign" "package:stats" [4] "package:graphics" "package:grDevices" "package:utils" [7] "package:datasets" "package:methods" "Autoloads" [10] "package:base" > objects(2) [1] "data.restore" "lookup.xport" "read.arff" "read.dbf" [5] "read.dta" "read.epiinfo" "read.mtp" "read.octave" [9] "read.S" "read.spss" "read.ssd" "read.systat" [13] "read.xport" "write.arff" "write.dbf" "write.dta" [17] "write.foreign" > help(read.table) read.table {utils} R Documentation Data Input Description Reads a file in table format and creates a data frame from it, with cases corresponding to lines and variables to fields in the file. Usage read.table(file, header = FALSE, sep = "", quote = "\"'", dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"), row.names, col.names, as.is = !stringsAsFactors, na.strings = "NA", colClasses = NA, nrows = -1, skip = 0, check.names = TRUE, fill = !blank.lines.skip, strip.white = FALSE, blank.lines.skip = TRUE, comment.char = "#", allowEscapes = FALSE, flush = FALSE, stringsAsFactors = default.stringsAsFactors(), fileEncoding = "", encoding = "unknown", text, skipNul = FALSE) ... [discussed in class] > help(write) [also discussed in class] ##### Some helpful functions > tmpmat = array( rbinom(20,2,.5), c(5,4) ) tmpmat [,1] [,2] [,3] [,4] [1,] 0 1 0 1 [2,] 1 1 2 1 [3,] 1 2 0 0 [4,] 2 2 1 1 [5,] 1 1 1 2 > apply(tmpmat, 2, sum) [1] 5 7 4 5 > apply(tmpmat, 1, sum) [1] 2 5 3 6 5 > c(tmpmat %*% rep(1,4)) [1] 2 5 3 6 5 > for(i in 1:5) cat(sum(tmpmat[i,]), ". "); cat("\n") 2 . 5 . 3 . 6 . 5 . > table(c(tmpmat)) 0 1 2 4 11 5 ######## > attributes(fr) $names [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" [13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24" [25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36" [37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" [49] "X49" "X50" $row.names [1] 1 2 3 4 5 6 7 8 9 10 $class [1] "data.frame" > unlist(lapply(fr,class))[1:7] X1 X2 X3 X4 X5 X6 X7 "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" > lst1 = list(A = 1:15, B = c("A","B","C","D"), C = diag(1:4)) > unlist(lapply(lst1, length)) A B C 15 4 16 > lapply(lst1,summary) $A Min. 1st Qu. Median Mean 3rd Qu. Max. 1.0 4.5 8.0 8.0 11.5 15.0 $B Length Class Mode 4 character character $C V1 V2 V3 V4 Min. :0.00 Min. :0.0 Min. :0.00 Min. :0 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0 Median :0.00 Median :0.0 Median :0.00 Median :0 Mean :0.25 Mean :0.5 Mean :0.75 Mean :1 3rd Qu.:0.25 3rd Qu.:0.5 3rd Qu.:0.75 3rd Qu.:1 Max. :1.00 Max. :2.0 Max. :3.00 Max. 
:4 > tmp0 = diag(4)/tmpmat[1:4,] > tmp0 [,1] [,2] [,3] [,4] [1,] Inf 0 NaN 0 [2,] 0 1 0 0 [3,] 0 0 Inf NaN [4,] 0 0 0 1 > apply(tmp0, 2, function(col) sum(is.na(col))) [1] 0 0 1 1 > apply(tmp0, 2, function(col) sum(is.nan(col))) [1] 0 0 1 1 ####### Other functions f1 = function(x, y) x - 3*y^2 f1(5:7, 1:3) > f1(5:7, 1:3) [1] 2 -6 -20 > > outer(5:7, 1:3, f1) [,1] [,2] [,3] [1,] 2 -7 -22 [2,] 3 -6 -21 [3,] 4 -5 -20 > rvec = runif(10000) > table(trunc(rvec * 20)) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 > tmp = hist(rvec, nclass = 20) > tmp$counts [1] 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 > svec = rnorm(1000, 1, sqrt(5)) hist(svec, nclass=25, prob=T, , xlab="svec values", ylab="Density", main= paste("Histogram with overplotted", "\n normal density curve")) curve(dnorm(x,1,sqrt(5)), add=T, col="red") > help(plot) ### plotting x versus y, see also "points", "lines" help(par) ### so-called "graphical parameters", including plotting symbols 9/16/2015 ## Discussion of histograms in class ## we talked about over-laying density with histogram ## Continuing previous uniform and normal examples: first 10,000 uniform[0,1] points > round(tmp$counts/10000 - .05, 4) [1] 0.0000 -0.0023 0.0000 -0.0005 -0.0016 0.0044 -0.0057 0.0041 -0.0026 [10] 0.0033 0.0000 0.0016 0.0012 0.0013 -0.0025 -0.0016 -0.0015 0.0018 [19] -0.0012 0.0018 ### Pretty accurate: each should be roughly within range > qnorm(.99)*c(-1,1)/sqrt(12*10000) [1] -0.006715588 0.006715588 ### Now for second case, "svec" 1000 normals with mean 1, var 5 > tmp2 = hist(svec, nclass=25, prob=T) table(diff(tmp2$breaks)) 0.5 30 round(tmp2$density - dnorm(tmp2$mids,1,sqrt(5)), 4) [1] 0.0018 0.0016 0.0011 0.0021 -0.0016 0.0015 0.0087 0.0053 -0.0033 [10] -0.0057 0.0020 0.0162 -0.0055 0.0327 -0.0026 -0.0107 0.0347 0.0227 [19] -0.0287 -0.0386 -0.0093 -0.0175 0.0082 0.0020 0.0003 -0.0113 0.0093 [28] -0.0073 -0.0025 -0.0016 round(tmp2$counts/1000 - (pnorm(tmp2$breaks[2:31],1,sqrt(5))- pnorm(tmp2$breaks[1:30],1,sqrt(5))), 4) [1] 0.0009 0.0008 0.0005 0.0010 -0.0008 0.0007 0.0043 0.0026 -0.0017 [10] -0.0029 0.0009 0.0081 -0.0028 0.0164 -0.0012 -0.0052 0.0175 0.0115 [19] -0.0142 -0.0192 -0.0046 -0.0088 0.0041 0.0009 0.0001 -0.0057 0.0046 [28] -0.0037 -0.0013 -0.0008 ### These last two calculated vectors are indices of "goodness of fit" of ### the histogram to the correct normal density. 
> max(abs(.Last.value)) [1] 0.0192 ### might be used in a quick assessment of fit ### Some GRAPHICAL commands > xv = rgamma(100,2) yv = rnorm(1+2*xv, 0.3*(xv^0.25)) plot(xv, yv) tmp1 = lm(yv~xv) lines(xv, tmp1$fit, col="blue") inds = identify(xv,yv) ### left-click successive points of interest in plot [1] 1 14 41 70 80 83 96 ### right-click mouse when finished sqrt( (yv - tmp1$fit)[inds]/summary(tmp1)$sigma ) 25 26 50 64 1.2113197 0.7710393 0.9728214 1.2791945 #### Managing arrays for simulations simarray = array( rgamma(100000,3,2), c(100,1000)) hist(apply(simarray - 1.5,2,mean), nclass =12, prob=T) curve(dnorm(x,0, sqrt(3/(4*100))), add=T) > sqrt(var(apply(simarray - 1.5,2,mean))) [1] 0.09068389 ### compare with theoretical sqrt(3/(4*100))=0.08660254 > curve(dnorm(x,0,sqrt(3/400)), -.4,.4) > curve(dgamma(x+1.5,300,200), add=T, col="green") ### compares two theoretical densities 9/28/2015 Method of Simulating Normal R.V.'s (Box-Mueller method) Exercise: if X,Y are indep N(0,1) then A = arcsin(X/sqrt(X^2+Y^2)) ~ Unif(-pi/2,pi/2) and S = X^2+Y^2 ~ Expon(1/2) are indep So can simulate independent standard-normal pair by S = 2*rexp(1), A = pi*(2*runif(1)-1) and (X,Y) = sqrt(S)*(sin(A), cos(A)) > Svec = 2*rexp(1000); Avec = pi*(2*runif(1000)-1) hist(sqrt(Svec)*sin(Avec), nclass=40, prob=T) curve(dnorm(x), col="red", add=T) Xmat = sqrt(Svec)*cbind(sin(Avec),cos(Avec)) plot(Xmat) ## this checks independence abline(h=0); abline(h=1); abline(v=1, col="red"); abline(v=2,col="red") > mean(abs(Xmat[,1]-1.5)<.5 & abs(Xmat[,2]-0.5)<0.5) [1] 0.049 > mean(abs(Xmat[,1]-1.5)<.5)*mean(abs(Xmat[,2]-.5)<.5) [1] 0.046843 --------------------------------------- Second (slower) method: accept-reject First simulate Logistic r.v. Can directly use rlogis(1000), alternatively recall plogis(x) = e^x/(1+e^x) and inverse is qlogis(r) = log(r/(1-r)), so could do qlogis(runif(1000)) Check the max of N(0,1) density divided by dlogis(x,0,C) > curve(dnorm(x)/dlogis(x),0,2) ### max <= 1.6 > curve(dnorm(x)/dlogis(x,0,1/sqrt(3)),0,2) ### max <= 1.16005 max(dnorm(seq(1,2,by=.001))/dlogis(seq(1,2,by=.001),0,1/sqrt(3))) ## = 1.16002 ## Use dlogis(x) : > Uvec = runif(1000)*1.6 Wvec = rlogis(1000) Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] ### length 632 hist(Yvec, nclass=15, prob=T) ## Next try it with dlogis(x,0,1/sqrt(3)) > Uvec = runif(1000)*1.16002 Wvec = rlogis(1000,0,1/sqrt(3)) Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] ### length 916 hist(Yvec, nclass=15, prob=T) ### Timing runs to generate 10 million N(0,1) rv's > memory.limit() [1] 3894 ### in Mb system.time({ Svec = sqrt(2*rexp(5e6)); Avec = pi*(2*runif(5e6)-1) tmp = c(Svec*sin(Avec), Svec*cos(Avec)) }) user system elapsed 1.06 0.20 1.26 > system.time({tmp = qnorm(runif(1e7))}) user system elapsed 1.03 0.09 1.15 > system.time({ Uvec = runif(1e7)*1.6 ; Wvec = rlogis(1e7) tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] }) user system elapsed 3.17 0.13 3.29 > system.time({ Uvec = runif(1e7)*1.16002; Wvec = rlogis(1e7,0,1/sqrt(3)) tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] }) user system elapsed 3.31 0.27 3.58 ======================================================================== 10/7/2015 We saw in class that the method of "control variates" involves linear regression. 
In the setting of the example in the log that we covered last time: http://www.math.umd.edu/~slud/s705/Rlogs/Antith_Contr09 > Control.fr = cbind.data.frame(X1 = rexp(1e5)*0.8, X2 = runif(1e5,2,4), X3 = rlogis(1e5), X4 = rgamma(1e5,1.5)) Y = c(data.matrix(Control.fr) %*% rep(1,4)) > Control.fr = cbind.data.frame(Control.fr, Y=Y, Yctr = Y-5.3, Yhi = as.numeric(Y>5.5)) rm(Y) tmp = lm(Yhi ~ Yctr, data=Control.fr) > names(tmp) [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "xlevels" "call" "terms" "model" > names(summary(tmp)) [1] "call" "terms" "residuals" "coefficients" [5] "aliased" "sigma" "df" "r.squared" [9] "adj.r.squared" "fstatistic" "cov.unscaled" > summary(tmp)$coef[,1:2] Estimate Std. Error (Intercept) 0.4440900 0.0009788089 Yctr 0.1623008 0.0004088673 ## Note that the SE 0.00098 is essentially the same as > sqrt(1-cor(Control.fr[,"Y"],Control.fr[,"Yhi"])^2)*sd(Control.fr$Yhi)/sqrt(1e5) [1] 0.0009788011 ==================================================================== ## Generating multivariate densities by conditional prob. integral transform ---------------------------------------------------------------------------- Want to generate random point x1 < x2 < ... < x6 in [0,1]^6. Several methods: > sort(runif(6)) > tmparr = array(runif(6*10000), c(10000,6)) inds = which(apply(tmparr, 1, function(row) min(diff(row)))>0) length(inds) [1] 19 > tmparr[inds[1],] [1] 0.01267986 0.02371346 0.22257134 0.58500541 0.62999614 0.92351900 > vec = c(rep(0,6),1) for(i in 6:1) vec[i]=vec[i+1]*rbeta(1,i,1) ### To do it in an array of many rows: > ordmat = cbind(array(0, c(1e4,6)), rep(1,1e4)) for(i in 6:1) ordmat[,i] = ordmat[,i+1]*rbeta(1e4,i,1) ### For example, can check that the 4th order-statistic has Beta(4,3) density: hist(ordmat[,4], nclass=40, prob=T) curve(dbeta(x,4,3),0,1, add=T, col="green") ### OK ## NB this is an instance of successive conditional univariate densities ### Other methods ? ... 
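### One more method (an added sketch, not from the class log): the uniform
### order statistics can be generated all at once from exponential spacings,
### since the running sums of 7 iid Expon(1) variables, divided by their total,
### have the same joint law as the 6 order statistics of a Uniform(0,1) sample.
Emat = array(rexp(7e4), c(1e4,7))
ordmat2 = t(apply(Emat,1,cumsum))[,1:6]/apply(Emat,1,sum)
hist(ordmat2[,4], nclass=40, prob=T)     ## 4th order-statistic, again Beta(4,3)
curve(dbeta(x,4,3),0,1, add=T, col="blue")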
### Another R topic: Aggregation, splitting and lists ----------------------------------------------------- Suppose we want to consider a ("mixture" or "stratified") model in which there are H = 6 population strata with respective sizes or probabilities (0.3, 0.1, 0.2, 0.15, 0.15, 0.1) and within stratum h Y_i ~ N(mu_h, sig_h) Let's generate a large populatioon (N=1e5) of this sort pvec = c(0.3, 0.1, 0.2, 0.15, 0.15, 0.1) muvec = c(4, 2, 1, 5, 3, 6) sigvec = sqrt(1:6) hvec = sample(1:6, 1e5, prob=pvec, replace=T) > rbind(Emp = table(hvec)/1e5, True=pvec) 1 2 3 4 5 6 Emp 0.29853 0.09915 0.20125 0.15079 0.15072 0.09956 True 0.30000 0.10000 0.20000 0.15000 0.15000 0.10000 Yvec = rnorm(1e5, muvec[hvec], sigvec[hvec]) ### Now want to display stratum properties ### Could do: gp1 = which(hvec==1) > c(mean=mean(Yvec[gp1]), sd=sd(Yvec[gp1])) mean sd 3.9977241 0.9993129 etcetera, or else: > tmplist = split(Yvec,hvec) > class(tmplist) [1] "list" > sapply(tmplist,length) 1 2 3 4 5 6 29853 9915 20125 15079 15072 9956 > sapply(tmplist, function(gp) c(mean(gp),sd(gp))) 1 2 3 4 5 6 [1,] 3.9977241 1.986277 1.006401 5.008875 2.969467 6.009526 [2,] 0.9993129 1.412418 1.720414 1.994702 2.246038 2.444997 or else > tmpfr = aggregate(Yvec, by=list(hvec), function(gp) c(mean(gp),sd(gp))) > class(tmpfr) [1] "data.frame" > tmpfr Group.1 x.1 x.2 1 1 3.9977241 0.9993129 2 2 1.9862770 1.4124179 3 3 1.0064005 1.7204142 4 4 5.0088751 1.9947024 5 5 2.9694670 2.2460384 6 6 6.0095257 2.4449966 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Log related to simulation and MLE behavior 10/14/2015 ### Let's start with a model Z ~ N(a,b^2) X ~ Poisson( exp(nu+tau*Z) ) Y ~ Binom(1, plogis(beta1 + beta2*X + beta3*Z)) ## Generate data n=400 based on vectors of parameters par = c(a=4.1, lb=log(0.8), nu=-2, tau = 0.2, beta=c(-.5,.8,.1)) DataGen = function(N, par) { Zvec = rnorm(N,par[1],exp(par[2])) Xvec = rpois(N,exp(par[3]+par[4]*Zvec)) Yvec = rbinom(N,1,plogis(par[5]+par[6]*Xvec+par[7]*Zvec)) cbind(Y=Yvec, X=Xvec, Z=Zvec) } tmp = cbind(DataGen(400, par),One=rep(1,400)) #### 400 x 4 ### Now a likelihood nlogL = function(thet, Dat) -mean(log(dnorm(Dat[,"Z"],thet[1],exp(thet[2])))) - mean(log(dpois(Dat[,"X"],exp(c(Dat[,c(4,3)]%*%thet[3:4]))))) - mean(log(dbinom(Dat[,"Y"],1,plogis(c(Dat[,c(4,2,3)]%*% thet[5:7]))))) ### Now MLE calculation MLE = nlm(nlogL, c(5,log(2),rep(0,5)), hessian=T, Dat=tmp) > MLE $minimum [1] 2.579204 $estimate [1] 4.1044118 -0.2136951 -2.5665736 0.3401607 -0.7309071 0.5007728 0.2446937 $gradient [1] -8.035650e-08 3.794565e-07 1.018789e-07 2.237255e-07 4.363399e-08 [6] 9.181322e-08 1.613332e-07 $hessian [,1] [,2] [,3] [,4] [,5] [1,] 1.533251e+00 -6.290957e-04 -1.081980e-08 1.081980e-08 0.000000e+00 [2,] -6.290957e-04 1.999599e+00 -4.440892e-08 0.000000e+00 -4.440892e-08 [3,] -1.081980e-08 -4.440892e-08 3.225323e-01 1.398468e+00 -4.440892e-08 [4,] 1.081980e-08 0.000000e+00 1.398468e+00 6.290156e+00 4.440892e-08 [5,] 0.000000e+00 -4.440892e-08 -4.440892e-08 4.440892e-08 2.327332e-01 [6,] 0.000000e+00 -4.440892e-08 -4.440892e-08 0.000000e+00 6.167920e-02 [7,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 9.449793e-01 [,6] [,7] [1,] 0.000000e+00 0.0000000 [2,] -4.440892e-08 0.0000000 [3,] -4.440892e-08 0.0000000 [4,] 0.000000e+00 0.0000000 [5,] 6.167920e-02 0.9449793 [6,] 7.749374e-02 0.2604345 [7,] 2.604345e-01 3.9821295 $code [1] 1 $iterations [1] 41 ### CONTINUE IN CLASS with an actual simulation (say of 100 data-batches) Example: ThetMat = array(0, c(1000,7,2)) ### This matrix will contain the 
parameter estimators and their individual ### estimated "theoretical" SE's derived from diagonal of the inverted ### observed-information matrix. ConvCod = logical(1000) for(r in 1:1000) { tmp = cbind(DataGen(400, par),One=rep(1,400)) MLE = nlm(nlogL, par, hessian=T, Dat=tmp) ThetMat[r,,1] = MLE$est ConvCod[r] = (MLE$code ==1 & sqrt(sum(MLE$grad^2))/7 < 1.e-6 & min(eigen(MLE$hess)$values)>0) ThetMat[r,,2] = sqrt(diag(solve(MLE$hess))) } > hist(ThetMat[,6,1], nclass=25, prob=T) curve(dnorm(x, par[6], sd(ThetMat[,6,1])), add=T, col="red") ### really nice agreement > round(rbind(Theor.SD = sqrt(apply(ThetMat[,,2]^2,2,mean)/400), Empir.SD = apply(ThetMat[,,1],2,sd)),5) [,1] [,2] [,3] [,4] [,5] [,6] [,7] Theor.SD 0.03994 0.03536 0.48483 0.11265 0.54209 0.21052 0.13034 Empir.SD 0.03999 0.03560 0.49943 0.11617 0.56854 0.21452 0.13754 ### ALSO VERY NICE AGREEMENT ! #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 10/19/2015 ### Another useful form of matrix indexing (applies also to more general arrays) > tmat = array(1:20,c(5,4)) > tmat [,1] [,2] [,3] [,4] [1,] 1 6 11 16 [2,] 2 7 12 17 [3,] 3 8 13 18 [4,] 4 9 14 19 [5,] 5 10 15 20 > tmat[cbind(1:5,c(3,2,1,4,1))] [1] 11 7 3 19 5 > tarr = array(1:60, c(5,4,3)) > tarr , , 1 [,1] [,2] [,3] [,4] [1,] 1 6 11 16 [2,] 2 7 12 17 [3,] 3 8 13 18 [4,] 4 9 14 19 [5,] 5 10 15 20 , , 2 [,1] [,2] [,3] [,4] [1,] 21 26 31 36 [2,] 22 27 32 37 [3,] 23 28 33 38 [4,] 24 29 34 39 [5,] 25 30 35 40 , , 3 [,1] [,2] [,3] [,4] [1,] 41 46 51 56 [2,] 42 47 52 57 [3,] 43 48 53 58 [4,] 44 49 54 59 [5,] 45 50 55 60 > tarr[cbind(1:5,c(2,3,2,4,1),c(1,3,2,1,3))] [1] 6 52 28 19 45 ##-------------------------------------------------------------------------------- ### Contrasting MLEs and variances from correctly and incorrectly specified models ## Suppose our data comes from lognormal(mu,sigma^2) and we analyze it as Gamma(a,lam) > Xdat = exp(rnorm(400, 0.5, 0.8)) ## "Correct" ML analysis gives parameter estimates > MLtru = c(mean(log(Xdat)), var(log(Xdat))) [1] 0.5355010 0.6796627 ### NB 2nd parameter is variance, to be compare with .8^2 and "empirical" or "observed" information matrix (per observation) > diag(c(1/MLtru[2]^2, 1/MLtru[2]^4)) [,1] [,2] [1,] 2.164777 0.00000 [2,] 0.000000 4.68626 #### Now let's analyze the data via ML as Gamma(a,lam) alam.ini = (mean(Xdat)/var(Xdat))*c(mean(Xdat),1) > alam.ini [1] 0.9997862 0.4183704 ### not ultimately meaningful, but not too wild Gfit = optim(alam.ini, function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), data=Xdat, hessian=T) ### 49 iterations, conv=0 > round( unlist(Gfit[1:2]), 6) par1 par2 value 1.636170 0.684693 723.132589 > Gfit$hess [,1] [,2] [1,] 333.4763 -584.2035 [2,] -584.2035 1396.0407 Gfit2 = nlm(function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), alam.ini, data=Xdat, hessian=T) ### only 7 iterations > Gfit2$est [1] 1.6363411 0.6847426 > Gfit2$hess [,1] [,2] [1,] 333.3872 -584.1184 [2,] -584.1184 1395.5721 > Gfit2$grad [1] -4.029622e-05 8.742518e-05 nlm(function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), alam.ini, data=Xdat, hessian=T, gradtol=1e-6)$est [1] 1.6363411 0.6847426 > nlm(function(alam, data) -sum(log(dgamma(data,alam[1],alam[2]))), alam.ini, data=Xdat, hessian=T, gradtol=1e-9)$est [1] 1.6363475 0.6847457 ### Analytical gradient of logL contributions for each X_i, as nX2 matrix grlogL = function(alam, data) cbind(log(data)+log(alam[2])-digamma(alam[1]), alam[1]/alam[2] - data) tmp = grlogL(Gfit2$est,Xdat) > c(t(tmp) %*% rep(1,400)) [1] 0.0003131229 0.0006105371 ### 0 to 
sufficient precision, after dividing by 400 > Mis.Info = t(tmp) %*% tmp [,1] [,2] [1,] 271.1854 -642.7003 [2,] -642.7003 2279.0728 #### compare with Observed Info, above > Gfit2$hess [,1] [,2] [1,] 333.3872 -584.1184 [2,] -584.1184 1395.5721 ### As shown in class, a "robustified" variance for the ML estimators via ### the mis-specified model is > Rob.Var = solve(Mis.Info) %*% Gfit$hess %*% solve(Mis.Info) [,1] [,2] [1,] 0.014215887 0.0030794743 [2,] 0.003079474 0.0008750884 compared with the "naive" variance > solve(Gfit2$hess) [,1] [,2] [1,] 0.011248119 0.004707914 [2,] 0.004707914 0.002687055 ### The corresponding SE's for the gamma-based MLE's are: > round(rbind(Naive= sqrt(diag(solve(Gfit2$hess))), Robust= sqrt(diag(Rob.Var)) ), 4) [,1] [,2] Naive 0.1061 0.0518 Robust 0.1192 0.0296 ### This means that it might take a large simulation to feel the effects of ### incorrect CI coverage for the shape-parameter a , but the estimates ### for lambda will actually have a lot smaller variance than the naive ### variance matrix would tell us. #------------------------------------------------------------------ 10/21/2015 "Classic" EM Algorithm Problem Contingency Table with Missing Cells. Consider the following data problem. Xv = sample(1:3, 500, prob=c(.45,.25,.3), rep=T) Yv = sample(1:4, 500, prob=c(.22,.32, .18, .28), rep=T) > XYtab0 = table(Xv,Yv) XYtab0 Yv Xv 1 2 3 4 1 45 76 40 65 2 29 39 21 33 3 40 47 28 37 NB > apply(XYtab,1,sum)/500 1 2 3 0.452 0.244 0.304 ### close to .45,.25,.3 as expected > apply(XYtab,2,sum)/500 1 2 3 4 0.228 0.324 0.178 0.270 ### close to .22, .32, .18, .28 as expected ### Now suppose that the (3,3),(2,4), and (1,1) cells are blanked out ### ie treated as missing, with only the number 45+33+28 = 106 of such ### missing observations known. In that view, the missing data is the information of exactly which (or how many of) those 106 observations fall ito each of the (X=1,Y=1), (X=3,Y=3) and (X=2,Y=4) cells. XYtab=XYtab0; XYtab0[cbind(1:3,c(1,4,3))] = NA XYtab0 Yv Xv 1 2 3 4 1 76 40 65 2 29 39 21 3 40 47 37 along with table total N=500. Data is XYtab0. sum(c(XYtab0), na.rm=T) ### 394, so 106 fell in missing cells We apply the EM algorithm, in the form= EMiter = function(pvec, qvec) { denom = pvec[1]*qvec[1]+pvec[3]*qvec[3]+pvec[2]*qvec[4] ptmp = c(181+106*pvec[1]*qvec[1]/denom, 89+106*pvec[2]*qvec[4]/denom, 124+106*pvec[3]*qvec[3]/denom) qtmp = c(69+106*pvec[1]*qvec[1]/denom, 162, 61+106*pvec[3]*qvec[3]/denom, 102+106*pvec[2]*qvec[4]/denom) list(pout = ptmp/sum(ptmp), qout = qtmp/sum(qtmp)) } ### So EMiter gives a mapping from one pair of 3 and 4 dimensional prob vectors ### to another such pair, and we want to iterate to convergence. > pv = rep(.333,3); qv = rep(.25,4) listvec = list(pv,qv) > for(i in 1:10) listvec = EMiter(listvec[[1]], listvec[[2]]) round(unlist(listvec),4) pout1 pout2 pout3 qout1 qout2 qout3 qout4 0.4699 0.2357 0.2944 0.2459 0.3240 0.1684 0.2617 > for(i in 1:10) listvec = EMiter(listvec[[1]], listvec[[2]]) round(unlist(listvec),4) pout1 pout2 pout3 qout1 qout2 qout3 qout4 0.4701 0.2356 0.2943 0.2461 0.3240 0.1683 0.2616 > listvec0 = listvec for(i in 1:10) listvec = EMiter(listvec[[1]], listvec[[2]]) round(unlist(listvec),4) pout1 pout2 pout3 qout1 qout2 qout3 qout4 0.4701 0.2356 0.2943 0.2461 0.3240 0.1683 0.2616 > unlist(listvec)-unlist(listvec0) pout1 pout2 pout3 qout1 qout2 5.886801e-07 -3.658261e-07 -2.228540e-07 5.886801e-07 0.000000e+00 qout3 qout4 -2.228540e-07 -3.658261e-07 ### So good convergence has happened ! 
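### An added sketch (not part of the class log): instead of running EMiter a
### fixed number of times, iterate until the parameter vectors stop changing.
### EMrun below is a new wrapper name introduced only for illustration.
EMrun = function(pvec, qvec, tol=1e-8, maxit=500) {
    for(it in 1:maxit) {
       new = EMiter(pvec, qvec)
       del = max(abs(c(new$pout,new$qout) - c(pvec,qvec)))
       pvec = new$pout ; qvec = new$qout
       if(del < tol) break }
    list(pout=pvec, qout=qvec, iterations=it, last.change=del) }
## e.g.  round(unlist(EMrun(rep(1/3,3), rep(0.25,4))[1:2]), 4)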
##-------------------------------------------------------------- ## Now a continuous distributional example -- random effects ANOVA Let Y_{ij} = mu_i + a[j] + eps[i,j] i=1,..,5, j=1,..4 where eps[i,j] ~ N(0,sig^2) iid independent of a[j] ~ N(0,siga^2) iid and mu_i are unknown parameters along with sig, sigA NB aj given Dobs is N(Ybar_{.j}, sig^2*0.2*sigA^2/(sigA^2+0.2*sig^2) Data: avec = rnorm(4,0,1.5) eps = array(rnorm(20,0,0.8), c(5,4)) Ymat = outer(c(6,3,5,2,4),avec,"+") + eps Yidot = apply(Ymat,1,mean) Ydotj = apply(Ymat,2,mean) Ybar = mean(Yidot) listpar = list(mu=rep(1,5), sigAsq=2, sigsq=1) EMiter2 = function(listpar) { mubar = mean(listpar[[1]]) gam = listpar[[2]]/(listpar[[2]]+0.2*listpar[[3]]) avar = 0.2*listpar[[3]]*gam newEaj = mubar + gam*(Ydotj-mubar) newmuvec = Yidot - mean(newEaj) newsgAsq = avar + mean(newEaj^2) newsgsq = avar + mean(c(Ymat - outer(newmuvec,newEaj,"+"))^2) list(mu =newmuvec, sigAsq=newsgAsq, sigsq=newsgsq) } ####-------------------------------------------------------- ## Now a continuous distributional example -- random effects ANOVA Let Y_{ij} = mu_i + a[j] + eps[i,j] i=1,..,5, j=1,..4 where eps[i,j] ~ N(0,sig^2) iid independent of a[j] ~ N(0,siga^2) iid and mu_i are unknown parameters along with sig, sigA NB aj given Dobs is N(gam*(Ybar_{.j}-mubar), sig^2*0.2*) where gam = sigA^2/(sigA^2+0.2*sig^2) and mubar = sum_{i=1}^5 mu_i/5 NOTE THAT THIS CONDITIONAL DISTRIBUTION WAS WRITTEN WRONGLY WHEN I DID THIS MUCH IN CLASS. IT IS CORRECTED HERE. Data: avec = rnorm(4,0,1.5) eps = array(rnorm(20,0,0.8), c(5,4)) Ymat = outer(c(6,3,5,2,4),avec,"+") + eps ## correct mu = (6,3,5,2,4), sigAsq=1.5^2, sigsq=0.8^2 Yidot = apply(Ymat,1,mean) Ydotj = apply(Ymat,2,mean) Ybar = mean(Yidot) listpar = list(mu=rep(1,5), sigAsq=2, sigsq=1) ### Initial values EMiter2 = function(listpar) { mubar = mean(listpar[[1]]) gam = listpar[[2]]/(listpar[[2]]+0.2*listpar[[3]]) avar = 0.2*listpar[[3]]*gam newEaj = gam*(Ydotj-mubar) newmuvec = Yidot - mean(newEaj) newsgAsq = avar + mean(newEaj^2) newsgsq = avar + mean(c(Ymat - outer(newmuvec,newEaj,"+"))^2) list(mu =newmuvec, sigAsq=newsgAsq, sigsq=newsgsq) } > listpar = EMiter2(listpar) ### first EM iterate round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.340 0.907 3.037 -1.714 0.662 7.682 0.720 > listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.385 0.952 3.082 -1.669 0.707 7.658 0.666 > listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.427 0.993 3.124 -1.627 0.749 7.454 0.655 > listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 3.468 1.034 3.165 -1.586 0.790 7.258 0.653 ### will require many more iterates ... > for (i in 1:50) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.524 3.091 5.221 0.470 2.846 1.697 0.654 ### Stabilizing, but not quite there yet ! ### Recall that the "true" parameter vector is ### (6,3,5,2,4, 1.5^2, 0.8^2) > for (i in 1:50) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.797 3.364 5.494 0.743 3.119 1.605 0.655 > for (i in 1:50) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.803 3.369 5.500 0.749 3.125 1.605 0.655 ### You can see appropriate convergence, but it is slow. > for (i in 1:100) listpar = EMiter2(listpar) round(unlist(listpar),3) mu1 mu2 mu3 mu4 mu5 sigAsq sigsq 5.803 3.369 5.500 0.749 3.125 1.605 0.655 #### But we seem to have gotten there !! 
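### An added cross-check (not run in class; assumes the "lme4" package is
### installed): the same one-way random-effects model can be fitted by REML,
### for comparison with the EM limits above (REML and ML/EM estimates need
### not agree exactly, especially with only 4 random-effect levels).
library(lme4)
REfr = data.frame(Y=c(Ymat), row=factor(rep(1:5,4)), col=factor(rep(1:4,each=5)))
REfit = lmer(Y ~ 0 + row + (1|col), data=REfr)
fixef(REfit)        ## compare with the converged mu estimates
VarCorr(REfit)      ## compare with sqrt(sigAsq) and sqrt(sigsq)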
========================================================= 10/26/2015 Material from Slog4.txt log, updated: > steamdat <- matrix(c(10.98, 35.3, 11.13, 29.7, 12.51, 30.8, 8.40, 58.8, 9.27, 61.4, 8.73, 71.3, 6.36, 74.4, 8.50, 76.7, 7.82, 70.7, 9.14, 57.5, 8.24, 46.4, 12.19, 28.9, 11.88, 28.1, 9.57, 39.1, 10.94, 46.8, 9.58, 48.5, 10.09, 59.3, 8.11, 70.0, 6.83, 70.0, 8.88, 74.5, 7.68, 72.1, 8.47, 58.1, 8.86, 44.6, 10.36, 33.4, 11.08, 28.6), ncol=2, byrow=T, dimnames=list(NULL,c("steamuse","temp"))) > plot(steamdat[,2],steamdat[,1], xlab="Temp", ylab="Use", main="Draper-Smith Steam Data Example") > lmtmp = lm(steamuse ~ . , data=data.frame(steamdat)) > identify(steamdat[,2],steamdat[,1]) [1] 3 7 11 15 17 19 > summary(lmtmp) Call: lm(formula = steamuse ~ ., data = data.frame(steamdat)) Residuals: Min 1Q Median 3Q Max -1.6789 -0.5291 -0.1221 0.7988 1.3457 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 13.62299 0.58146 23.429 < 2e-16 *** temp -0.07983 0.01052 -7.586 1.05e-07 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.8901 on 23 degrees of freedom Multiple R-squared: 0.7144, Adjusted R-squared: 0.702 F-statistic: 57.54 on 1 and 23 DF, p-value: 1.055e-07 > names(summary(lmtmp)) [1] "call" "terms" "residuals" "coefficients" [5] "aliased" "sigma" "df" "r.squared" [9] "adj.r.squared" "fstatistic" "cov.unscaled" #### It is a good idea to test your model knowledge so that you can compute "by hand" (in R) as many as possible of the lm output list components, e.g.: > c(summary(lmtmp)$sigma^2, sum(lmtmp$residuals^2)/23) [1] 0.7923217 0.7923217 > summary(lmtmp)$cov.unscaled (Intercept) temp (Intercept) 0.426720377 -0.0073520984 temp -0.007352098 0.0001397737 > mat = model.matrix(lmtmp) solve(t(mat) %*% mat) (Intercept) temp (Intercept) 0.426720377 -0.0073520984 temp -0.007352098 0.0001397737 > c(summary(lmtmp)$r.sq, cor(steamdat[,1], lmtmp$fitted)^2) [1] 0.7144375 0.7144375 > rvar = { mtmp = model.matrix(lmtmp) summary(lmtmp)$sigma^2*diag(diag(25)-mtmp %*% solve(t(mtmp) %*% mtmp, t(mtmp))) } stdrsd = lmtmp$resid/sqrt(rvar) > plot(steamdat[,2],steamdat[,1], xlab="Temp", ylab="Use", main="Draper-Smith Steam Data Example") identify(steamdat[,2],steamdat[,1], labels=round(stdrsd,2)) [1] 3 7 11 15 17 19 20 > anova(lmtmp) Analysis of Variance Table Response: steamuse Df Sum Sq Mean Sq F value Pr(>F) temp 1 45.592 45.592 57.543 1.055e-07 *** Residuals 23 18.223 0.792 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ## NB > c(sum(lmtmp$resid^2), sum((steamdat[,1]-mean(steamdat[,1]))^2)) [1] 18.2234 63.8158 ## 2nd number is "total (corrected) sum of squares" #### Continue using RlogF09.LinRegr.txt or StepExmp.Log >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Model syntax: y ~ x + z y ~ x*z > data(attitude) > dim(attitude) [1] 30 7 > tmpfit = lm( rating ~ . , data=attitude) update( ) update(tmpfit, .~ complaints) stpfit1 = step(tmpfit, ratings ~ (complaints + privileges + learning + raises + critical + advance)^2, direction = "forw") ### Further discussion about factors, contrasts ... 10/28/2015 GAUSSIAN QUADRATURE So far our numerical intergration formulas (Trapezoid, Simpson's) apply to finite ranges of integration and are taken from a large class of numerical quadrature formulas callled "Newton-Cotes formulas". 
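### For reference (an added sketch, not from the class log), a composite
### Simpson's rule on a finite interval, for a vectorized integrand f and an
### even number n of subintervals:
simpson = function(f, a, b, n=100) {
    x = seq(a, b, length=n+1)
    (b-a)/(3*n) * sum( f(x) * c(1, rep(c(4,2), (n-2)/2), 4, 1) ) }
## e.g.  simpson(dnorm, -3, 3)   versus   pnorm(3) - pnorm(-3)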
Another approach, called "Gaussian Quadrature", applies also to infinite ranges and approximates integrals \int g(x) f(x) dx for functions g (smooth, well-approximated by polynomials) by sums \sum_{k=1}^n g(p_k) w_k when f(x) is any of a number of special families of "kernels" (mostly, probability densities after scaling). The most important example for us is Gaussian quadrature with respect to the "error-function" kernel f(x) = exp(-x^2) on (-Infty,Infty) which we view as the N(0,1/2) scaled up by a factor sqrt(pi). > GHquad.wt = #### adapted from "Numerical Recipes" book function (N) { bvec <- sqrt((1:(N - 1))/2) elist <- eigen(matrix(replace(rep(0, N * N), (N + 1) * (1:(N - 1)), bvec) + replace(rep(0, N * N), seq(2, N * (N - 1), N + 1), bvec), ncol = N), symm = T) list(pts = elist$values, wts = sqrt(pi) * c(elist$vectors[1, ])^2) } Here N is the desired number of points (placed symmetrically around the origin) In order to calculate E g(Z) for Z ~ N(0,1), view Z = sqrt(2)*Z' where Z' ~ N(0, 1/2) tmp = GHquad.wt(20) ### N may be taken larger, but sometimes not necessary! sum( g(sqrt(2)*tmp$pts)*tmp$wts )/sqrt(pi) ### Example > tmp=GHquad.wt(20); tmpB=GHquad.wt(100) > sum( (sqrt(2)*tmp$pts)^2*tmp$wts ) /sqrt(pi) - 1 [1] 2.220446e-15 > sum( (sqrt(2)*tmpB$pts)^2*tmpB$wts ) /sqrt(pi) - 1 [1] -6.994405e-15 > sum( (sqrt(2)*tmp$pts)^4*tmp$wts ) /sqrt(pi) - 3 [1] 4.884981e-15 > sum( (sqrt(2)*tmpB$pts)^4*tmpB$wts ) /sqrt(pi) - 3 [1] -3.330669e-14 ### Now let's try a function g not a low-order polynomial > A1 = sum( log(1+2*3.5*tmp$pts^2) * tmp$wts )/sqrt(pi) > A2 = sum( log(1+2*3.5*tmpB$pts^2) * tmpB$wts )/sqrt(pi) > c(A1=A1, A2=A2, diff=A1-A2) A1 A2 diff 1.094413002 1.089033938 0.005379065 > integrate(function(x) log(1+3.5*x^2)*dnorm(x),-Inf,Inf) 1.089027 with absolute error < 4.8e-08 > integrate(function(x) log(1+3.5*x^2)*dnorm(x),-Inf,Inf, rel.tol=1.e-9) 1.089027 with absolute error < 2.4e-11 > tmpC = GHquad.wt(160) > sum( log(1+2*3.5*tmpC$pts^2) * tmpC$wts )/sqrt(pi) [1] 1.089028 ### Suppose we want integrals of log(1+b*x^2)*dnorm(x) for all b in seq(1,3.5,by=.01) bvec = seq(1,10,by=.01) vec1 = vec2 = numeric(901) unix.time({for(i in 1:901) vec1[i] = 2*integrate(function(x,b) log(1+b*x^2)*dnorm(x),0,Inf,rel.tol=1.e-10, b=bvec[i])$val }) user system elapsed 0.11 0.00 0.10 ### 900 high-accuracy integrals, 0.13 sec. > unix.time( { tmp = GHquad.wt(100) mat = log(1+outer(2*bvec, tmp$pts^2,"*")) vec2 = c(mat %*% tmp$wts)/sqrt(pi) } ) user system elapsed 0.01 0.00 0.02 > summary(vec1-vec2) ### errors range from 1e-9 near b=1 #### to 1e-4 near b=10 > unix.time( { tmp = GHquad.wt(200) mat = log(1+outer(2*bvec, tmp$pts^2,"*")) vec2 = c(mat %*% tmp$wts)/sqrt(pi) } ) user system elapsed 0.01 0.00 0.02 > summary(vec1-vec2) Min. 1st Qu. Median Mean 3rd Qu. Max. -2.431e-05 -7.312e-06 -1.122e-06 -4.661e-06 -3.171e-08 0.000e+00 > unix.time( { tmp = GHquad.wt(400) mat = log(1+outer(2*bvec, tmp$pts^2,"*")) vec2 = c(mat %*% tmp$wts)/sqrt(pi) } ) user system elapsed 0.11 0.00 0.11 summary(vec1-vec2) Min. 1st Qu. Median Mean 3rd Qu. Max. 
-4.229e-07 -7.689e-08 -5.371e-09 -6.133e-08 -3.380e-11 0.000e+00 >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 11/9/2015 GLM material roughly following log "RlogF09.GLM.txt" X1 = runif(120); X2 = rpois(120,3) ntrials = rep(c(5,6,3), 40) success = rbinom( 120, ntrials, plogis(-1 + 0.6*X1 + 0.1*X2) ) failure = ntrials - success simlogis = data.frame( X1, X2, success, failure ) > fitB0 = glm ( cbind(success, failure) ~ X1 + X2, family=binomial, data=simlogis ) ### NB: within family=binomial, link="logit" is the default, ### but other possibilities exist, most often "probit" or "cloglog" ### NB: there is another syntax, sometimes more useful, giving EXACTLY THE SAME MODEL FITTED OBJECTS. > fitBAlt = glm ( I(success/ntrials) ~ X1 + X2, weights=ntrials, family=binomial, data=simlogis ) > sum(abs(fitBAlt$fit - fitB0$fit)) [1] 0 > sum(abs(c(summary(fitBAlt)$coef - summary(fitB0)$coef))) [1] 0 ### OFTEN THE OUTCOME OF USING "weights" ARGUMENTS IN MODEL-FITTING ## FUNCTIONS IN R IS MUCH LESS TRANSPARENT !! ------------------------------------------------- Following log "GLMlog.R", get data "Data/pbcdata.asc" > pbcdat = read.table("http://www.math.umd.edu/~evs/s705/Data/pbcdata.asc", header=T) > names(pbcdat) [1] "OBS" "IDNUM" "DTH" "EVTTIME" "TREATGP" "LOGBILI" "AGEVAR" [8] "CIRRH" "CCHOL" "ALBUMIN" > ind = pbcdat$DTH==1 | pbcdat$EVTTIME >= 41 Surv41 = as.numeric(pbcdat$EVTTIME < 41) pbcdat = cbind.data.frame(pbcdat[,3:10], Surv41)[ind,] ## 172 x 9 > tmpglm = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family = binomial) ### Simulation to introduce "dispersion" in the form of shared random ### effect within cluster reff = runif(172,-2,2) yrsp1 = rbinom(172, rep(10,172),plogis(tmpglm$lin + reff)) #### 172 y's like the binary outcomes in PBCdata except they now ### arise from clusters of 10 with shared "reff" ### Think of yrsp1[i]/10 = Ybar_{i.} as the "data" for the ith cluster > mean(dlogis(tmpglm$lin + reff)) ### E( Var Ybar_{i.} ) [1] 0.132752 > mean((10/9)*.1*yrsp1*(1-0.1*yrsp1)) ### estimator of E( Var Ybar_{i.} ) [1] 0.1264858 ### average across i. 
glmfitA = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=binomial) glmfitB = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasibinomial) sum(abs(glmfitA$coef-glmfitB$coef)) ### 0 > c(summary(glmfitB)$disp, summary(glmfitA)$disp) [1] 2.494836 1.000000 > sum(10*(yrsp1/10-glmfitB$fit)^2/(glmfitB$fit*(1-glmfitB$fit)))/(172-4) [1] 2.494836 ======================================================================= 11/11/2015 Small Initial Demonstration of Smoothing Spline =============================================== > xv = runif(500, 0,2) fxv = sqrt(plogis(xv +2.5*xv - 5*xv^2 + 2*xv^3)) yv = fxv + 0.03*rnorm(500) ordx = order(xv) plot(xv[ordx], yv[ordx]) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) tmpspl1 = smooth.spline(xv,yv) lines(xv[ordx], predict(tmpspl1,xv[ordx])$y, col="blue", lwd=3) > tmpspl1$spar [1] 0.7934125 > tmpspl2 = smooth.spline(xv,yv, spar=0.01) lines(xv[ordx], predict(tmpspl2,xv[ordx])$y, col="green", lwd=3) plot(xv[ordx], yv[ordx]) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) tmpspl3 = smooth.spline(xv,yv, spar=1) lines(xv[ordx], predict(tmpspl3,xv[ordx])$y, lwd=3) yv2 = fxv + 0.001*rnorm(500) plot(xv[ordx], yv2[ordx]) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) tmpspl4 = smooth.spline(xv,yv) lines(xv[ordx], predict(tmpspl4,xv[ordx])$y, col="blue", lwd=2) summary(fxv - predict(tmpspl4,xv)$y) Min. 1st Qu. Median Mean 3rd Qu. Max. -6.230e-03 -3.092e-03 -5.708e-05 -7.092e-04 1.010e-03 6.539e-03 >>>>> Continue with other smoothing and nonparametric regression techniques nprgfram = cbind.data.frame(x=xv, y=yv) > plot(nprgfram$x,nprgfram$y, main="Scatterplot of Raw Data, n=500") for(i in 1:8) abline(v=i*.2, lty=3) for(i in 1:9) { inds <- (1:500)[abs(nprgfram$x-.2*i+.1) < 0.1] itmp <- order(nprgfram$x[inds]) lines(nprgfram$x[inds[itmp]], lm(y ~ x, data=nprgfram[ inds,])$fit[itmp], lty=1, lwd=2) } curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) ### Now consider a lowess fit: > plot(nprgfram$x,nprgfram$y, main="Raw Data with Lowess Line, n=500") tmplow = lowess(nprgfram$x,nprgfram$y) lines(tmplow$x,tmplow$y, lty=1, lwd=2) curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) ### locpoly(x, y, drv = 0, degree, kernel = "normal", bandwidth, gridsize = 401L, bwdisc = 25, range.x, binned = FALSE, truncate = TRUE) tmppoly = locpoly(xv, yv, degree=3, bandwidth=0.1) plot(tmppoly) points(nprgfram$x,nprgfram$y, main="Scatterplot of Raw Data, n=500") curve(sqrt(plogis(x +2.5*x - 5*x^2 + 2*x^3)),c(0,2), add=T, col="red", lwd=2) ### DISCUSSION: USE OF SPLINES FOR NUMERICAL APPROXIMATION 11/18/15 ## Consider the problem of sampling from a specified distribution, ## by the probability integral transform. 
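### Warm-up (an added sketch, not from the class log): when the quantile
### function is available in closed form, the probability integral transform
### is one line.  For the Expon(1) distribution F(x) = 1 - exp(-x), so
Uv = runif(1e5)
Ev = -log(1-Uv)                     ## = qexp(Uv)
hist(Ev, nclass=60, prob=T)
curve(dexp(x), 0, 10, add=T, col="red")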
## Let's do this with t_5 xv = c((1:500)/10, 10 + (1:20)/2) xv = c(-rev(xv),0,xv) > pt(xv[1],5) [1] 2.887758e-06 > yv = pt(xv,5)) curve(log(pt(x,5)), -5,5) tmpy = smooth.spline(log(yv),xv) plot((1:999)/1000,predict(tmpy,log((1:999)/1000))$y, typ="l") curve(qt(x,5), .001,.999, add=T, col="red") ### Now to simulate 100,000 of them using the spline-fitted function ### Note that the evaluation function is unix.time({rant = predict(tmpy, log(runif(1e5)))$y}) user system elapsed 0.03 0.00 0.03 hist(rant, prob=T, nclass=128, xlim=c(-6,6)) curve(dt(x,5), -6,6, add=T, col="green") ### I will ask you to do this in a more difficult case, for an exercise ### SPLINE BASIS EXAMPLE ## Start with a fairly complicated curve mixran = function(x, pts, seednum) { set.seed(seednum) coef = 10*(runif(length(pts))-0.5) structure(c(outer(x, pts, function(x,y) dnorm(x-y)) %*% coef), seed=seednum, coef=coef) } xv = seq(-5,5,by=0.1) mixran(xv[1:5], c(-1,0.5,2), 333) [1] -4.461042e-05 -6.632645e-05 -9.765776e-05 -1.424013e-04 -2.056501e-04 attr(,"seed") [1] 333 attr(,"coef") [1] -0.3299934 -4.1540185 4.7348527 plot(xv, mixran(xv, c(-1,0.5,2), 333)) yv3 = mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333) + rnorm(101,0,0.8) > library(splines) library(KernSmooth) dim(mat3) ### 101 x 8 mat3 = bs(xv, knots=seq(-4,4,by=2), degree=3) ### dim=length(knots)+degree fity = lm(yv3 ~ . , data=cbind.data.frame(yv3,mat3)) plot(xv,yv3) lines(xv, mixran(xv, c(-3,-2,-0.5,0.8,2,4), 333), col="red") points(xv, fity$fit, pch=5) ### What if we took sparser knots ? mat4 = bs(xv, knots=c(-3,0,3), degree=3) points(xv, lm(yv3 ~ . , data=cbind.data.frame(yv3,mat4))$fit, pch=18) ================================================================== Illustration of Data Replication Methods ======================================== 11/23/2015 DATA-SPLITTING, PARAMETRIC BOOTSTRAP, & BAYES POSTERIOR PREDICTIVE SIMULATION ## Data illustration set.seed(5) x = runif(200) y = 3 - 7*x + 10*(0.1+abs(x-.5))^1.5*rt(200,10) rtwt = 1/(0.1+abs(x-.5))^1.5 Dset = cbind.data.frame(y, x, wt=rtwt^2) > summary(rtwt^2) Min. 1st Qu. Median Mean 3rd Qu. Max. 4.760 9.806 22.580 91.930 90.610 761.900 x1st = rtwt x2st = x*rtwt yst = y*rtwt Dset2 = cbind.data.frame(yst,x1st,x2st) AuxY = array(0, c(1000,200)) > tmp1 = lm(y~x, data=Dset[1:100,], weight=wt) ## NOTE: S3 method for class 'lm' predict(object, newdata, se.fit = FALSE, scale = NULL, df = Inf, interval = c("none", "confidence", "prediction"), level = 0.95, type = c("response", "terms"), terms = NULL, na.action = na.pass, pred.var = res.var/weights, weights = 1, ...) tmp2 = predict(tmp1,Dset[1:100,], interval="none", type="response", weights=Dset$wt[1:100]) > summary(tmp2-tmp1$fit) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000e+00 8.882e-16 1.332e-15 1.578e-15 1.776e-15 2.642e-14 NOTE THAT IN ALL OF THE COMPUTATIONAL METHODS STUDIED IN THIS EXTENDED EXAMPLE, WE COMPUTE THE (unweighted) RMSE FOR Y-PREDICTIONS BUT DO THE MODEL-FITTING BY A WEIGHTED-LEAST SQUARES METHOD. Data-splitting method: 50-50 splits for RMSE --------------------- TstDat = TrnDat = array(0, c(1000,100)) TrnMSE = SplMSE = numeric(1000) for(i in 1:1000) { ind = sample(1:200, 100) TrnDat[i,] = setdiff(1:200,ind) tmp = lm(y ~ x, weights=wt, data=Dset[TrnDat[i,],]) TrnMSE[i] = mean(tmp$resid^2) SplMSE[i] = mean((Dset$y[ind] - predict(tmp,Dset[ind,], interval="none", type="response", weights=Dset$wt[ind]))^2) } > summary(sqrt(SplMSE)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.898 2.421 2.559 2.552 2.694 3.137 > summary(sqrt(TrnMSE)) Min. 1st Qu. Median Mean 3rd Qu. Max. 
1.809 2.371 2.514 2.508 2.652 3.034 IT IS NO SURPRISE THAT THE TrnMSE VALUES ARE SMALLER THAN THE TEST-DATA SplMSE VALUES, ALTHOUGH HERE THE DIFFERENCE IS SMALL. AS WE REMARKED IN CLASS, THIS METHOD OF PSEUDO-SAMPLE REPLICATION GENERATES X'S AND Y'S TOGETHER. "Parametric Bootstrap" ---------------------- ## generate residuals anew from N(0,sighat) ## "predictors" kept from tmp1 with xvec = x[1:100] wts = Dset$wt[1:100] AuxY = tmp1$fitted + array(rnorm(100*1000,0, rep((0.1+abs(xvec-0.5))^1.5*summary(tmp1)$sig,100)), c(100,1000)) ### Now can look at the MSEs from re-fitting the model 1000 times MSEboot = numeric(1000) for(i in 1:1000) MSEboot[i] = mean(lm(AuxY[,i] ~ xvec, weights=wts)$resid^2) > summary(sqrt(MSEboot)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.651 2.218 2.361 2.364 2.502 3.107 THIS FIRST WAY OF DOING THE PARAMETRIC BOOTSTRAP GENERATES ONLY Y'S FOR A FIXED SET OF X'S. YOU COULD SAY THAT WHAT WE GENERATE IN THIS WAY IS A SET OF "CONDITIONAL" PSEUDO-SAMPLES. THAT IS, THE VARIABILITY REPLICATED HERE IS STRICTLY THAT DUE TO THE CONDITIONAL DENSITY OF Y GIVEN X. NEXT LET'S DO A SLIGHTLY MORE ELABORATE PARAMETRIC-BOOTSTRAP SIMULATION IN WHICH X'S AND Y'S ARE RE-GENERATED. BUT I PROPOSE, AS I MENTIONED IN CLASS, A SIMULATION DESIGN IN WHICH WE CAN SEPARATE THE EFFECTS ON MSE OF THE ESTIMATOR OF DIFFERENT SETS OF X'S FROM THE CONDITIONAL SAMPLING VARIABILITY OF Y'S GIVEN FIXED X'S. We choose to simulate B=100 different X-vectors X^{(b)}, and then for each one, R=1000 independently generated Y-vectors Y^{(b,r)} from the conditional distribution of Y given X=X^{(b)}. Xarr = array(runif(1e4), c(100,100)) ### index (b,i) Yarr = array(0, c(1000,100,100)) ## indices (r,b,i) : r indexes y-vector replicate within fixed X^{(b)}, ### and i=1:100 indexes data points within each batch of n=100. abhat = tmp1$coef sighat = summary(tmp1)$sig epsarr = array(0, c(100,1000)) Wtarr = (0.1+abs(Xarr-0.5))^(-1.5) Wtarr = (1/apply(Wtarr,1,sum))*Wtarr for(b in 1:100) { epsarr[,] = rnorm(1e5) Yarr[,b,] = t( (abhat[1] - abhat[2]*Xarr[b,]) + (sighat*(0.1+abs(Xarr[b,]-0.5))^1.5)*epsarr ) } ## Now we have the large arrays of Y's and X's and we need to ## vectorize the weighted least squares estimates in order to make ## the computations quick. Ypred = replace(Yarr,T,0) ### array like Yarr filled with 0's Xmn = apply(Wtarr*Xarr,1,sum) XctrArr = -Xmn + Xarr SXXsq = apply(Wtarr*XctrArr^2,1,sum) auxmat = array(outer(rep(1,1000), Wtarr*XctrArr)*Yarr, c(1000*100,100)) b2hatarr = array( c( auxmat %*% rep(1,100))/ rep(SXXsq, rep(1000,100)), c(1000,100)) ### 1000 x 100 (r,b) b1hatarr = apply(outer(rep(1,1000),Wtarr)*Yarr,1:2,sum) - t( Xmn*t(b2hatarr) ) for(b in 1:100) Ypred[,b,] = b1hatarr[,b] + outer(b2hatarr[,b],Xarr[b,]) MSEarr = apply( (Yarr-Ypred)^2, 1:2, mean) ### 1000x100 (r,b) Biasarr = apply(Ypred - Yarr, 1:2, mean)> summary(c(Biasarr)) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.6123000 -0.0921800 -0.0005000 -0.0004574 0.0912800 0.5894000 ### Biases are small but not 0. They would be 0 if the averages of prediction errors within sample were taken weighted by Wtarr. ### All of this was quick. 
Now we have an array of MSEs for all (r,b)
### and we break MSE into 3 parts each of which we estimate via simulation:
###  MSE (Ypred_i^{(r,b)}) = ( E(Y_i^{(r,b)} - Ypred_i^{(r,b)}) )^2
###            + E( Var(Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.]) )
###            + Var( E(Y_i^{(r,b)} - Ypred_i^{(r,b)} | X[b,.]) )

## First term is bias^2, which we estimate from Biasarr^2

> mean(c(Biasarr^2))
[1] 0.01865613
> mean( apply((Yarr - Ypred)^2, 2, mean) )
[1] 5.081514
> sd( apply((Yarr - Ypred)^2, 2, mean) )
[1] 0.5081083
> var( c(Yarr-Ypred))
[1] 5.081514

DID WE LEARN ANYTHING EXTRA BY DOING THE SIMULATION BY BLOCKS b ?
WE FOUND OUT HOW VARIABLE OUR OVERALL ESTIMATED ANSWER WAS DUE TO
SAMPLING VARIATION FROM ONE SET OF X'S TO ANOTHER.

HOW DID THIS SIMULATION COMPARE WITH THE DATA-SPLITTING MSE ?
THE RMSE WE FOUND THERE WAS 2.508 WHILE WE JUST FOUND THE
PARAMETRIC-BOOTSTRAP RMSE TO BE

> sqrt(5.081514)
[1] 2.254221

THE RMSE FOUND FROM THE PARAMETRIC BOOTSTRAP WITH SINGLE FIXED Xvec WAS
2.364, but since that was for just a single Xvec, and we now know from the
two-level (r and b) simulation that the variance of the MSE estimator was
(0.5081)^2, these different parametric-bootstrap estimators (one with fixed
X's, the other with varying X's) are within range of one another.

NOTE HOWEVER THAT WE MAY ALSO GUESS THAT THE DATA-SPLITTING MSE IS LARGER
BECAUSE IT TAKES INTO ACCOUNT THE ACTUAL DATA (200 DATA-PAIRS) WHICH DID
NOT PRECISELY SATISFY THE MODEL ASSUMED BY THE PARAMETRIC BOOTSTRAP.

BAYES posterior predictive sampling
-----------------------------------
We continue with the same model as above, but define a standard conjugate
prior for the parameters theta = (b1,b2, sigma^2) within a normal-errors
model for the data with known weight-pattern (i.e. standard deviation of y
given x known to be proportional to (0.1+abs(x-.5))^1.5).  We will continue
with that model as we did for the parametric bootstrap, although recall
that it is slightly misspecified because the errors were actually t_{10}
distributed rather than normal.

### We want to define a prior conjugate for the model
        y ~ N( b1 + b2*x, (sig*(0.1+abs(x-0.5))^{1.5})^2 )
## For this, could take b1,b2 ~ indep N(0, sig0^2) , sig0 large and
## known, say 5, and 1/sig^2 ~ Gamma(1, lam0) for small lam0, say 0.1.
## Then with tau = 1/sig^2, the posterior is proportional to
     tau^{n/2} exp( -0.02*(b1^2+b2^2) - 0.1*tau
                    - 0.5*tau*sum(wi*(yi-b1-b2*xi)^2) ),  wi = (0.1+abs(xi-0.5))^(-3)
## This is conjugate (posterior of the same form as prior) for (b1,b2)
## if sig had been known and fixed,
## and for tau or sig^2 if b1,b2 were known and fixed.  But jointly, it
## is not conjugate.

NOTE: there is a Theorem (Bernstein-von Mises) which says that in large
iid samples, the posterior is approximately normal centered at the MLE
and with variance (like the MLE!)
the inverse of the Fisher Information Here the Fisher Information from the fitted model is obtained as > Infmat = array(c(sum(Dset$wt), sum(Dset$wt*Dset$x), 0, sum(Dset$wt*Dset$x), sum(Dset$wt*Dset$x^2), 0, 0, 0, nrow(Dset)/sighat^2)/sighat^2, c(3,3)) > round(solve(Infmat), 4) [,1] [,2] [,3] [1,] 0.0840 -0.1613 0.0000 [2,] -0.1613 0.3267 0.0000 [3,] 0.0000 0.0000 32.0117 ## mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE) ## Approximate Bayesian Idea in simulating new data pseudo-samples is (a) to draw multivariate normal (b1H,b2H,sigH^2) ~ N( (b1hat, b2hat,sighat^2), Infmat) (b) generate x ~ unif and y given x as N(b1H + b2H*x, {sigH*(0.1+abs(x-0.5))^1.5}^2) HOWEVER: use Delta method upon transformed parameter nu = log(sig^2) to force the simulated variance values to be positive ! NB. We can get an idea of the degree of (NON-) normality of sighat distribution by using the array of Yarr - Ypred = Yresid values to estimate a 1000 x 100 (r,b) array of parametric-bootstrap Monte Carlo simulated ones, as follows: > library(MASS) ### mvrnorm(n = 1, mu, Sigma, tol = 1e-6, empirical = FALSE, EISPACK = FALSE) > dim(Yarr) [1] 1000 100 100 > sghatsqArr = apply( (Yarr-Ypred)^2, 1:2, mean) > dim(sghatsqArr) [1] 1000 100 > hist( sghatsqArr[,2], nclass=50, prob=T) > hist( sghatsqArr, nclass=50, prob=T) > curve(dnorm(x,mean(sghatsqArr), sd(sghatsqArr)), 2,12, add=T, col="green") #### definitely off !!! ### Try to overplot gamma with matching 1st two moments instead > beta = mean(c(sghatsqArr))/var(c(sghatsqArr)) alph = beta*mean(c(sghatsqArr)) curve(dgamma(x,alph, beta), 2,12, add=T, col="blue") ##---------------------------------------------------------- > c(coef = tmp1$coef, sigsq = summary(tmp1)$sig^2) coef.(Intercept) coef.x sigsq 2.282569 -5.560408 80.014571 ### "True" b1,b2 values from simulation were 3, -7. ### When sig^2=80, obtain "theoretical" MSE as the ### expectation of 80*(t_{10})^2*E((0.1+abs(U-0.5))^3) = > 80*1.25*0.06475 = 6.475 ### derived from > integrate(function(x) x^2*dt(x,10), -Inf,Inf)$val [1] 1.25 > integrate(function(x) (0.1+abs(x-0.5))^3, 0,1)$val [1] 0.06475 ##>>>=========================================== Continuation 11/30/2015 USE THE PRECEDING NON-NORMALITY OF sighatsq TO ARGUE THAT WE SHOULD NOT SIMPLY USE THE BERNSTEIN-VON MISES THEOREM TO SAMPLE FROM MULTIVARIATE NORMAL AS A SHORTCUT FOR THE BAYESIAN POSTERIOR, BUT RATHER APPROXIMATE THE POSTERIOR BY MULTIVARIATE NORMAL IN (b1,b2) ALONG WITH INDEPENDENT GAMMA (motivated by the form of the information matrix) FOR SIGSQ. > rhovec = numeric(100) > for(i in 1:100) rhovec[i] = cor(b1hatarr[,i],b2hatarr[,i]) > hist(rhovec) > summary(rhovec) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.9807 -0.9716 -0.9673 -0.9662 -0.9617 -0.9464 #### STRONG NEGATIVE CORRELATION BETWEEN B1, B2 > var(cbind(c(b1hatarr), c(b2hatarr))) [,1] [,2] [1,] 0.2207807 -0.4124677 [2,] -0.4124677 0.8253712 ### corr = -0.966 > c(alpha=alph, beta=beta) ### gamma parameters for sigsq alpha beta 20.538710 4.041849 ### mean here is 5.0815, while > mean((Dset$y[1:100]-tmp1$fit)^2) [1] 6.153456 #### Again, actual data shows greater variances ### which is partly due to true t_10 errors. #### Using Infmat^{-1} above for variance would suggest sd's > round(sqrt(diag(solve(Infmat))), 4) [1] 0.2898 0.5716 5.6579 ### and b1hat,b2hat correlation > aux = solve(Infmat); aux[1,2]/sqrt(aux[1,1]*aux[2,2]) [1] -0.9737398 ### NOTE that sampling from this posterior would also suffer from ### the same parametric-model misspecification ?! 
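### An added sketch (not run in class) of the approximate posterior /
### posterior-predictive sampling just described: multivariate normal for
### (b1,b2) centered at the fitted coefficients with the empirical covariance
### found above, together with the independent moment-matched gamma for sigsq.
### (Bdraw, sgsqdraw, xnew, ynew are new illustrative names.)
library(MASS)
Bdraw = mvrnorm(1000, mu=tmp1$coef, Sigma=var(cbind(c(b1hatarr), c(b2hatarr))))
sgsqdraw = rgamma(1000, alph, beta)
xnew = runif(1000)
ynew = rnorm(1000, Bdraw[,1] + Bdraw[,2]*xnew,
             sqrt(sgsqdraw)*(0.1+abs(xnew-0.5))^1.5)   ## posterior predictive y's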
## We would be sampling from neighborhood
##     b2 in -5.560 +/- 1.96*0.572
##     b1 approx. = ybar - b2*xbar
##     sigsq approx Gamma(17.68, 3.04)

##>=====================================================
    NONPARAMETRIC BOOTSTRAP                 11/30/2015

IS THERE AN IDEA THAT LETS US LEARN ABOUT THE PERFORMANCE OF THIS KIND OF
DATA ANALYSIS WITHOUT ASSUMING WE KNOW THE CORRECT (X,Y) MODEL ?

Yes, the NONPARAMETRIC BOOTSTRAP !

## (I) Bootstrap resampling of the iid pairs (X_i, Y_i)
## Assume the only data we have is Dset[1:100,] (including weights).

indArr = array(sample(1:100,1e5, replace=T), c(1000,100))
#### corrected the next statements to reflect weighted least-squares
auxXB = array(Dset$x[c(indArr)], c(1000,100))
auxYB = array(Dset$y[c(indArr)], c(1000,100))
Ones = rep(1/100,100)
wtB = (0.1+abs(auxXB-0.5))^(-3)
wtB = (1/apply(wtB,1,sum))*wtB
avXB = apply(wtB*auxXB,1,sum)
avYB = apply(wtB*auxYB,1,sum)
b2hatB = c( ((-avXB + auxXB)*wtB*auxYB) %*% Ones /
            (wtB*(-avXB + auxXB)^2) %*% Ones )
b1hatB = avYB - b2hatB*avXB
MSEB = c( (b1hatB + b2hatB*auxXB - auxYB)^2 %*% Ones )
tmp = hist(MSEB, nclass=30, prob=T)
library(KernSmooth)
lines(locpoly(tmp$mids, tmp$density, degree=3, bandw = 0.8), col="blue")
curve(dnorm(x,mean(MSEB), sd(MSEB)), 2,20, add=T, col="red")
### so the MSEs are skew-distributed, not quite normal
#### Picture saved as "NPBootMSEpic.pdf"

> c(mean = mean(MSEB), sd = sd(MSEB))
    mean       sd
6.072114 1.417756      ### MSE quite a bit larger than the ~ 5 suggested by
                       ### parametric bootstrap, and SD larger too
### NOTE that this way of bootstrapping is letting the residual
#### t-distribution express itself better !!

tmp2 = hist(b2hatB, nclass=30, prob=T)
lines(locpoly(tmp2$mids, tmp2$density, deg=3, bandw = 0.8), col="blue")
curve(dnorm(x,mean(b2hatB), sd(b2hatB)),-12,-2, add=T, col="red")
### much closer to normal, but still not quite there

## (II) Bootstrap resampling of residuals*rtwt for fixed X_i
## recall xvec is Dset$x[1:100],  rtwt[1:100] = 1/(0.1+abs(xvec-0.5))^1.5
## use indArr above to resample residuals and then re-create Y's

epsBArr = array((tmp1$resid*rtwt[1:100])[c(indArr)], c(1000,100))
YNPboot = tmp1$fitted + (1/rtwt[1:100])*epsBArr
wgts = Dset$wt[1:100]/sum(Dset$wt[1:100])
avYB2 = c(YNPboot %*% wgts)
xvav = sum(xvec*wgts)
b2hatB2 = c( (outer(rep(1,1000),xvec-xvav)*YNPboot) %*% wgts )/
          sum((xvec-xvav)^2*wgts)
b1hatB2 = avYB2 - b2hatB2*xvav
MSEB2 = c( (b1hatB2 + outer(b2hatB2,xvec) - YNPboot)^2 %*% Ones )
> c(mean(MSEB2), sd(MSEB2))
[1] 5.814071 5.229153      #### Note smaller MSE when X fixed.
### THIS WAY OF BOOTSTRAPPING IMPOSES THE STRUCTURE OF INDEPENDENCE
### of X and epsilon's but the rescaling of tmp1$resid
### by rtwt seems to make the variability of the MSE estimates
### much larger because of the large range of weights.

## (III) Could separately and independently bootstrap X and residuals

auxXB3 = array(sample(xvec,1e5,replace=T), c(1000,100))
YNPboot3 = tmp1$coef[1]+tmp1$coef[2]*auxXB3 + (1/rtwt[1:100])*epsBArr
wtB3 = (0.1+abs(auxXB3-0.5))^(-3)
wtB3 = (1/apply(wtB3,1,sum))*wtB3
avYB3 = apply(wtB3*YNPboot3,1,sum)
avXB3 = apply(wtB3*auxXB3,1,sum)
b2hatB3 = c( ((-avXB3 + auxXB3)*wtB3*YNPboot3) %*% Ones /
             (wtB3*(-avXB3 + auxXB3)^2) %*% Ones )
b1hatB3 = avYB3 - b2hatB3*avXB3
MSEB3 = c( (b1hatB3 + b2hatB3*auxXB3 - YNPboot3)^2 %*% Ones )
> c(mean(MSEB3), sd(MSEB3))
[1] 5.826440 5.249827    ### VERY SIMILAR RESULTS TO THOSE IN (II) ABOVE.
#### The average MSE stays small when
### we incorporate the independence of X's and epsilon's !
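### For reference (an added sketch, not part of the class log; assumes the
### "boot" package is installed): resampling scheme (I) can also be coded with
### boot(), whose statistic argument receives the data and a vector of
### resampled row indices.  msestat and NPboot are new illustrative names.
library(boot)
msestat = function(dat, ind) {
    fit = lm(y ~ x, weights=wt, data=dat[ind,])
    mean(fit$resid^2) }                 ## unweighted MSE of the weighted LS fit
NPboot = boot(Dset[1:100,], msestat, R=1000)
c(mean=mean(NPboot$t), sd=sd(NPboot$t))   ## compare with mean(MSEB), sd(MSEB)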
============================================================= 12/2 WE FINISH THIS EXAMPLE WITH WEIGHTED LEAST-SQUARES (WITH t_{10} ERRORS) BY ASKING AND ANSWERING: WHAT IS THE "RIGHT" ANSWER FOR THE AVERAGE AND SD OF UNWEIGHTED MSE FOR Y-PREDICTORS BASED ON THE DATA-SAMPLES OF SIZE 100 WITH KNOWN VARIANCE-WEIGHTING (AS A FUNCTION OF X_i) for the kind of data (with t_{10} not N(0,1) errors) we were actually working with ??? (a) Conditional for fixed X's at xvec AuxY.MC = 3 - 7*xvec + array(rep(10*(0.1+abs(xvec-0.5))^1.5,1000)* rt(100*1000,10), c(100,1000)) ### Now can look at the MSEs from re-fitting the model 1000 times MSE.MC = numeric(1000) for(i in 1:1000) MSE.MC[i] = mean(lm(AuxY.MC[,i] ~ xvec, weights=Dset$wt[1:100])$resid^2) > summary(sqrt(MSE.MC)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.973 2.711 2.908 2.936 3.145 4.044 > c(mean=mean(MSE.MC), sd = sd(MSE.MC)) mean sd 8.728081 1.977124 (b) UNconditional, with varying (iid) X's AuxX.MC2 = array(runif(1e5), c(100,1000)) RtWt.MC2 = (0.1+abs(AuxX.MC2-0.5))^1.5 AuxY.MC2 = 3 - 7*AuxX.MC2 + 10*RtWt.MC2* array(rt(100*1000,10), c(100,1000)) MSE.MC2 = numeric(1000) for(i in 1:1000) MSE.MC2[i] = mean(lm(AuxY.MC2[,i] ~ AuxX.MC2[,i], weights= RtWt.MC2[,i]^2)$resid^2) > summary(sqrt(MSE.MC2)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.841 2.546 2.775 2.805 3.036 4.497 > c(mean=mean(MSE.MC2), sd = sd(MSE.MC2)) mean sd 7.998849 2.091280
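### Closing summary (an added sketch, not from the class log): the unweighted
### RMSE estimates produced by the different methods in this extended example
### can be collected for a side-by-side look, e.g.
round(sqrt(c(data.split      = mean(SplMSE),
             par.boot.fixX   = mean(MSEboot),
             par.boot.varX   = mean(MSEarr),
             npar.boot.pairs = mean(MSEB),
             MC.true.fixX    = mean(MSE.MC),
             MC.true.varX    = mean(MSE.MC2))), 3)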