Example Logs, STAT 705, Fall 2015
=================================

            PART 1 September 2015
         =================================
                                                      9/2/2015

> tmp = sin(0.354 + 2*pi*(1:500)/10)
> mat = array(tmp,c(10,50))
> apply(mat,1,var)
 [1] 5.927550e-29 4.168184e-31 9.749828e-29 3.043804e-28 2.945958e-28
 [6] 1.140571e-28 4.980691e-31 9.790227e-29 1.896126e-28 2.672093e-28

> tmp1 = rep(c(1,3),500)
> tmp2 = c(rbind(rep(1,500),rep(3,500)))
> sum(abs(tmp1-tmp2))
[1] 0
> tmp3 = replace(rep(1,1000), 1:1000 %% 2 == 0, 3)
> sum(abs(tmp3-tmp1))
[1] 0
> tmp4 = ifelse(1:1000 %% 2 == 0, 3,1)
> sum(abs(tmp4-tmp1))
[1] 0

> toss = rbinom(10,1,.5)
> cond = as.logical(toss)
> cond
 [1] FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
> replace(1:10,cond,101:105)
 [1]   1   2 101   4 102 103 104 105   9  10
> ifelse(cond,101:105,1:10)
 [1]   1   2 103   4 105 101 102 103   9  10

> sum(abs(  (1:10)[toss==1] - which(cond) ))
[1] 0

toss + array(rep(1:10,5), c(10,5))
      [,1] [,2] [,3] [,4] [,5]
 [1,]    1    1    1    1    1
 [2,]    2    2    2    2    2
 [3,]    4    4    4    4    4
 [4,]    4    4    4    4    4
 [5,]    6    6    6    6    6
 [6,]    7    7    7    7    7
 [7,]    8    8    8    8    8
 [8,]    9    9    9    9    9
 [9,]    9    9    9    9    9
[10,]   10   10   10   10   10

outer(toss+1:10,rep(1,5),"*")
      [,1] [,2] [,3] [,4] [,5]
 [1,]    1    1    1    1    1
 [2,]    2    2    2    2    2
 [3,]    4    4    4    4    4
 [4,]    4    4    4    4    4
 [5,]    6    6    6    6    6
 [6,]    7    7    7    7    7
 [7,]    8    8    8    8    8
 [8,]    9    9    9    9    9
 [9,]    9    9    9    9    9
[10,]   10   10   10   10   10

> attributes(mat)
$dim
[1] 10 50

> fr = data.frame(mat)
> attributes(fr)
$names
 [1] "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10" "X11" "X12"
[13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24"
[25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36"
[37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48"
[49] "X49" "X50"

$row.names
 [1]  1  2  3  4  5  6  7  8  9 10

$class
[1] "data.frame"

> class(mat)
[1] "matrix"
> length(mat)
[1] 500
> length(fr)
[1] 50

> outer( 1:5, 4:6, "*")

     [,1] [,2] [,3]
[1,]    4    5    6
[2,]    8   10   12
[3,]   12   15   18
[4,]   16   20   24
[5,]   20   25   30

> tmp = outer(1:6, 1:6, function(x,y) x<= y)
  tmp
      [,1]  [,2]  [,3]  [,4]  [,5] [,6]
[1,]  TRUE  TRUE  TRUE  TRUE  TRUE TRUE
[2,] FALSE  TRUE  TRUE  TRUE  TRUE TRUE
[3,] FALSE FALSE  TRUE  TRUE  TRUE TRUE
[4,] FALSE FALSE FALSE  TRUE  TRUE TRUE
[5,] FALSE FALSE FALSE FALSE  TRUE TRUE
[6,] FALSE FALSE FALSE FALSE FALSE TRUE

> 1*tmp
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    1    1    1    1    1
[2,]    0    1    1    1    1    1
[3,]    0    0    1    1    1    1
[4,]    0    0    0    1    1    1
[5,]    0    0    0    0    1    1
[6,]    0    0    0    0    0    1

> class(tmp[1:36]) = "numeric"
  tmp
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    1    1    1    1    1
[2,]    0    1    1    1    1    1
[3,]    0    0    1    1    1    1
[4,]    0    0    0    1    1    1
[5,]    0    0    0    0    1    1
[6,]    0    0    0    0    0    1



                                                      9/9/2015

> help(data.frame)

Description

This function creates data frames, tightly coupled collections of variables which 
share many of the properties of matrices and of lists, used as the fundamental data 
structure by most of R's modeling software.

Usage

data.frame(..., row.names = NULL, check.rows = FALSE,
           check.names = TRUE,
           stringsAsFactors = default.stringsAsFactors())
...

> fac1 = factor(tmp1)
> attributes(fac1)
$levels
[1] "1" "3"

$class
[1] "factor"

> fac1[1:20]
 [1] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
Levels: 1 3

> as.character(fac1)[1:10]
 [1] "1" "3" "1" "3" "1" "3" "1" "3" "1" "3"

> as.numeric(fac1)[1:20]
 [1] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2

> cond
 [1] FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
> as.numeric(factor(cond))
 [1] 1 1 2 1 2 2 2 2 1 1
> as.numeric(cond)
> as.numeric(cond)
 [1] 0 0 1 0 1 1 1 1 0 0


> library()    #### shows all available packages ...

> library(foreign)
> search()
 [1] ".GlobalEnv"        "package:foreign"   "package:stats"    
 [4] "package:graphics"  "package:grDevices" "package:utils"    
 [7] "package:datasets"  "package:methods"   "Autoloads"        
[10] "package:base"   
> objects(2)
 [1] "data.restore"  "lookup.xport"  "read.arff"     "read.dbf"     
 [5] "read.dta"      "read.epiinfo"  "read.mtp"      "read.octave"  
 [9] "read.S"        "read.spss"     "read.ssd"      "read.systat"  
[13] "read.xport"    "write.arff"    "write.dbf"     "write.dta"    
[17] "write.foreign"


> help(read.table)
read.table {utils}	R Documentation
Data Input
Description

Reads a file in table format and creates a data frame from it, with cases corresponding 
to lines and variables to fields in the file.
Usage

read.table(file, header = FALSE, sep = "", quote = "\"'",
           dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
           row.names, col.names, as.is = !stringsAsFactors,
           na.strings = "NA", colClasses = NA, nrows = -1,
           skip = 0, check.names = TRUE, fill = !blank.lines.skip,
           strip.white = FALSE, blank.lines.skip = TRUE,
           comment.char = "#",
           allowEscapes = FALSE, flush = FALSE,
           stringsAsFactors = default.stringsAsFactors(),
           fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)

... [discussed in class]

> help(write)                [also discussed in class]

##### Some helpful functions

> tmpmat = array( rbinom(20,2,.5), c(5,4) )
tmpmat
     [,1] [,2] [,3] [,4]
[1,]    0    1    0    1
[2,]    1    1    2    1
[3,]    1    2    0    0
[4,]    2    2    1    1
[5,]    1    1    1    2
> apply(tmpmat, 2, sum)
[1] 5 7 4 5

> apply(tmpmat, 1, sum)
[1] 2 5 3 6 5
> c(tmpmat %*% rep(1,4))
[1] 2 5 3 6 5
> for(i in 1:5) cat(sum(tmpmat[i,]), ".  "); cat("\n")
2 .  5 .  3 .  6 .  5 .  

> table(c(tmpmat))
 0  1  2 
 4 11  5 

########

> attributes(fr)
$names
 [1] "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10" "X11" "X12"
[13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24"
[25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36"
[37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48"
[49] "X49" "X50"

$row.names
 [1]  1  2  3  4  5  6  7  8  9 10

$class
[1] "data.frame"

> unlist(lapply(fr,class))[1:7]
       X1        X2        X3        X4        X5        X6        X7 
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
 

> lst1 = list(A = 1:15, B = c("A","B","C","D"), C = diag(1:4)) 
> unlist(lapply(lst1, length))
 A  B  C 
15  4 16 

> lapply(lst1,summary)
$A
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0     4.5     8.0     8.0    11.5    15.0 

$B
   Length     Class      Mode 
        4 character character 

$C
       V1             V2            V3             V4   
 Min.   :0.00   Min.   :0.0   Min.   :0.00   Min.   :0  
 1st Qu.:0.00   1st Qu.:0.0   1st Qu.:0.00   1st Qu.:0  
 Median :0.00   Median :0.0   Median :0.00   Median :0  
 Mean   :0.25   Mean   :0.5   Mean   :0.75   Mean   :1  
 3rd Qu.:0.25   3rd Qu.:0.5   3rd Qu.:0.75   3rd Qu.:1  
 Max.   :1.00   Max.   :2.0   Max.   :3.00   Max.   :4  



> tmp0 = diag(4)/tmpmat[1:4,]
> tmp0
     [,1] [,2] [,3] [,4]
[1,]  Inf    0  NaN    0
[2,]    0    1    0    0
[3,]    0    0  Inf  NaN
[4,]    0    0    0    1
> apply(tmp0, 2, function(col) sum(is.na(col)))
[1] 0 0 1 1
>  apply(tmp0, 2, function(col) sum(is.nan(col)))
[1] 0 0 1 1

####### Other functions

f1 = function(x, y)   x - 3*y^2

f1(5:7, 1:3)
> f1(5:7, 1:3)
[1]   2  -6 -20
> 
> outer(5:7, 1:3, f1)
     [,1] [,2] [,3]
[1,]    2   -7  -22
[2,]    3   -6  -21
[3,]    4   -5  -20


> rvec = runif(10000) 
> table(trunc(rvec * 20))

  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 

> tmp = hist(rvec, nclass = 20)
> tmp$counts
 [1] 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518

> svec = rnorm(1000, 1, sqrt(5))
  hist(svec, nclass=25, prob=T, ,
      xlab="svec values", ylab="Density", main=
      paste("Histogram with overplotted",
            "\n normal density curve"))
  curve(dnorm(x,1,sqrt(5)), add=T, col="red")

> help(plot)     ### plotting x versus y, see also "points", "lines"
  help(par)      ### so-called "graphical parameters", including plotting symbols



                                                               9/16/2015

## Discussion of histograms in class
##   we talked about over-laying density with histogram

## Continuing previous uniform and normal examples:  first 10,000 uniform[0,1] points

> round(tmp$counts/10000 - .05, 4)
 [1]  0.0000 -0.0023  0.0000 -0.0005 -0.0016  0.0044 -0.0057  0.0041 -0.0026
[10]  0.0033  0.0000  0.0016  0.0012  0.0013 -0.0025 -0.0016 -0.0015  0.0018
[19] -0.0012  0.0018

### Pretty accurate: each should be roughly within range 
> qnorm(.99)*c(-1,1)/sqrt(12*10000)
[1] -0.006715588  0.006715588

### Now for second case, "svec" 1000 normals with mean 1, var 5
> tmp2 = hist(svec, nclass=25, prob=T)
  table(diff(tmp2$breaks))
0.5 
 30 
  round(tmp2$density - dnorm(tmp2$mids,1,sqrt(5)), 4)
 [1]  0.0018  0.0016  0.0011  0.0021 -0.0016  0.0015  0.0087  0.0053 -0.0033
[10] -0.0057  0.0020  0.0162 -0.0055  0.0327 -0.0026 -0.0107  0.0347  0.0227
[19] -0.0287 -0.0386 -0.0093 -0.0175  0.0082  0.0020  0.0003 -0.0113  0.0093
[28] -0.0073 -0.0025 -0.0016
  round(tmp2$counts/1000 - (pnorm(tmp2$breaks[2:31],1,sqrt(5))-
              pnorm(tmp2$breaks[1:30],1,sqrt(5))), 4)
 [1]  0.0009  0.0008  0.0005  0.0010 -0.0008  0.0007  0.0043  0.0026 -0.0017
[10] -0.0029  0.0009  0.0081 -0.0028  0.0164 -0.0012 -0.0052  0.0175  0.0115
[19] -0.0142 -0.0192 -0.0046 -0.0088  0.0041  0.0009  0.0001 -0.0057  0.0046
[28] -0.0037 -0.0013 -0.0008

### These last two calculated vectors are indices of "goodness of fit" of 
###   the histogram to the correct normal density.
> max(abs(.Last.value))
[1] 0.0192                ### might be used in a quick assessment of fit


### Some GRAPHICAL commands

> xv = rgamma(100,2)
  yv = rnorm(1+2*xv, 0.3*(xv^0.25))
  plot(xv, yv)

  tmp1 = lm(yv~xv)
  lines(xv, tmp1$fit, col="blue")
  inds = identify(xv,yv)          ### left-click successive points of interest in plot
[1]  1 14 41 70 80 83 96                ### right-click mouse when finished

  sqrt( (yv - tmp1$fit)[inds]/summary(tmp1)$sigma )
       25        26        50        64 
1.2113197 0.7710393 0.9728214 1.2791945 

#### Managing arrays for simulations

simarray = array( rgamma(100000,3,2), c(100,1000))

hist(apply(simarray - 1.5,2,mean), nclass =12, prob=T)

curve(dnorm(x,0, sqrt(3/(4*100))), add=T)

> sqrt(var(apply(simarray - 1.5,2,mean)))
[1] 0.09068389   ### compare with theoretical sqrt(3/(4*100))=0.08660254

> curve(dnorm(x,0,sqrt(3/400)), -.4,.4)
> curve(dgamma(x+1.5,300,200), add=T, col="green") 
            ### compares two theoretical densities

							9/28/2015

Method of Simulating Normal R.V.'s (Box-Mueller method)

Exercise: if X,Y are indep N(0,1) then A = arcsin(X/sqrt(X^2+Y^2)) ~ Unif(-pi/2,pi/2)
                  and S = X^2+Y^2 ~ Expon(1/2) are indep
So can simulate independent standard-normal pair by S = 2*rexp(1), A = pi*(2*runif(1)-1)
                  and  (X,Y) = sqrt(S)*(sin(A), cos(A))

> Svec = 2*rexp(1000); Avec = pi*(2*runif(1000)-1)
  hist(sqrt(Svec)*sin(Avec), nclass=40, prob=T)
  curve(dnorm(x), col="red", add=T)
  Xmat = sqrt(Svec)*cbind(sin(Avec),cos(Avec))
  plot(Xmat)   ## this checks independence

abline(h=0); abline(h=1); abline(v=1, col="red"); abline(v=2,col="red")
> mean(abs(Xmat[,1]-1.5)<.5 & abs(Xmat[,2]-0.5)<0.5)
[1] 0.049
> mean(abs(Xmat[,1]-1.5)<.5)*mean(abs(Xmat[,2]-.5)<.5)
[1] 0.046843
---------------------------------------

Second (slower) method: accept-reject

First simulate Logistic r.v.

Can directly use   rlogis(1000), alternatively recall  plogis(x) = e^x/(1+e^x)
and inverse is  qlogis(r) = log(r/(1-r)), so could do  qlogis(runif(1000))

Check the max of N(0,1) density divided by dlogis(x,0,C) 

> curve(dnorm(x)/dlogis(x),0,2)                  ### max <= 1.6
> curve(dnorm(x)/dlogis(x,0,1/sqrt(3)),0,2)      ### max <= 1.16005
 max(dnorm(seq(1,2,by=.001))/dlogis(seq(1,2,by=.001),0,1/sqrt(3)))  ## = 1.16002

## Use  dlogis(x) :  
> Uvec = runif(1000)*1.6
  Wvec = rlogis(1000)
  Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ]  ### length 632
  hist(Yvec, nclass=15, prob=T)

## Next try it with  dlogis(x,0,1/sqrt(3))
> Uvec = runif(1000)*1.16002
  Wvec = rlogis(1000,0,1/sqrt(3))
  Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ]  ### length 916
  hist(Yvec, nclass=15, prob=T)

### Timing runs to generate 10 million N(0,1) rv's

> memory.limit()
[1] 3894              ### in Mb

system.time({ Svec = sqrt(2*rexp(5e6)); Avec = pi*(2*runif(5e6)-1)
              tmp = c(Svec*sin(Avec), Svec*cos(Avec)) })
   user  system elapsed 
   1.06    0.20    1.26  

> system.time({tmp = qnorm(runif(1e7))})
   user  system elapsed 
   1.03    0.09    1.15  

> system.time({  Uvec = runif(1e7)*1.6 ;  Wvec = rlogis(1e7)
    tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] })
   user  system elapsed 
   3.17    0.13    3.29 

> system.time({  Uvec = runif(1e7)*1.16002;  Wvec = rlogis(1e7,0,1/sqrt(3))
    tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] })
   user  system elapsed 
   3.31    0.27    3.58   

========================================================================