Example Logs, STAT 705, Fall 2015 ================================= PART 1 September 2015 ================================= 9/2/2015 > tmp = sin(0.354 + 2*pi*(1:500)/10) > mat = array(tmp,c(10,50)) > apply(mat,1,var) [1] 5.927550e-29 4.168184e-31 9.749828e-29 3.043804e-28 2.945958e-28 [6] 1.140571e-28 4.980691e-31 9.790227e-29 1.896126e-28 2.672093e-28 > tmp1 = rep(c(1,3),500) > tmp2 = c(rbind(rep(1,500),rep(3,500))) > sum(abs(tmp1-tmp2)) [1] 0 > tmp3 = replace(rep(1,1000), 1:1000 %% 2 == 0, 3) > sum(abs(tmp3-tmp1)) [1] 0 > tmp4 = ifelse(1:1000 %% 2 == 0, 3,1) > sum(abs(tmp4-tmp1)) [1] 0 > toss = rbinom(10,1,.5) > cond = as.logical(toss) > cond [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE > replace(1:10,cond,101:105) [1] 1 2 101 4 102 103 104 105 9 10 > ifelse(cond,101:105,1:10) [1] 1 2 103 4 105 101 102 103 9 10 > sum(abs( (1:10)[toss==1] - which(cond) )) [1] 0 toss + array(rep(1:10,5), c(10,5)) [,1] [,2] [,3] [,4] [,5] [1,] 1 1 1 1 1 [2,] 2 2 2 2 2 [3,] 4 4 4 4 4 [4,] 4 4 4 4 4 [5,] 6 6 6 6 6 [6,] 7 7 7 7 7 [7,] 8 8 8 8 8 [8,] 9 9 9 9 9 [9,] 9 9 9 9 9 [10,] 10 10 10 10 10 outer(toss+1:10,rep(1,5),"*") [,1] [,2] [,3] [,4] [,5] [1,] 1 1 1 1 1 [2,] 2 2 2 2 2 [3,] 4 4 4 4 4 [4,] 4 4 4 4 4 [5,] 6 6 6 6 6 [6,] 7 7 7 7 7 [7,] 8 8 8 8 8 [8,] 9 9 9 9 9 [9,] 9 9 9 9 9 [10,] 10 10 10 10 10 > attributes(mat) $dim [1] 10 50 > fr = data.frame(mat) > attributes(fr) $names [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" [13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24" [25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36" [37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" [49] "X49" "X50" $row.names [1] 1 2 3 4 5 6 7 8 9 10 $class [1] "data.frame" > class(mat) [1] "matrix" > length(mat) [1] 500 > length(fr) [1] 50 > outer( 1:5, 4:6, "*") [,1] [,2] [,3] [1,] 4 5 6 [2,] 8 10 12 [3,] 12 15 18 [4,] 16 20 24 [5,] 20 25 30 > tmp = outer(1:6, 1:6, function(x,y) x<= y) tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] TRUE TRUE TRUE TRUE TRUE TRUE [2,] FALSE TRUE TRUE TRUE TRUE TRUE [3,] FALSE FALSE TRUE TRUE TRUE TRUE [4,] FALSE FALSE FALSE TRUE TRUE TRUE [5,] FALSE FALSE FALSE FALSE TRUE TRUE [6,] FALSE FALSE FALSE FALSE FALSE TRUE > 1*tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 1 1 1 1 1 [2,] 0 1 1 1 1 1 [3,] 0 0 1 1 1 1 [4,] 0 0 0 1 1 1 [5,] 0 0 0 0 1 1 [6,] 0 0 0 0 0 1 > class(tmp[1:36]) = "numeric" tmp [,1] [,2] [,3] [,4] [,5] [,6] [1,] 1 1 1 1 1 1 [2,] 0 1 1 1 1 1 [3,] 0 0 1 1 1 1 [4,] 0 0 0 1 1 1 [5,] 0 0 0 0 1 1 [6,] 0 0 0 0 0 1 9/9/2015 > help(data.frame) Description This function creates data frames, tightly coupled collections of variables which share many of the properties of matrices and of lists, used as the fundamental data structure by most of R's modeling software. Usage data.frame(..., row.names = NULL, check.rows = FALSE, check.names = TRUE, stringsAsFactors = default.stringsAsFactors()) ... > fac1 = factor(tmp1) > attributes(fac1) $levels [1] "1" "3" $class [1] "factor" > fac1[1:20] [1] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 Levels: 1 3 > as.character(fac1)[1:10] [1] "1" "3" "1" "3" "1" "3" "1" "3" "1" "3" > as.numeric(fac1)[1:20] [1] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 > cond [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE > as.numeric(factor(cond)) [1] 1 1 2 1 2 2 2 2 1 1 > as.numeric(cond) > as.numeric(cond) [1] 0 0 1 0 1 1 1 1 0 0 > library() #### shows all available packages ... > library(foreign) > search() [1] ".GlobalEnv" "package:foreign" "package:stats" [4] "package:graphics" "package:grDevices" "package:utils" [7] "package:datasets" "package:methods" "Autoloads" [10] "package:base" > objects(2) [1] "data.restore" "lookup.xport" "read.arff" "read.dbf" [5] "read.dta" "read.epiinfo" "read.mtp" "read.octave" [9] "read.S" "read.spss" "read.ssd" "read.systat" [13] "read.xport" "write.arff" "write.dbf" "write.dta" [17] "write.foreign" > help(read.table) read.table {utils} R Documentation Data Input Description Reads a file in table format and creates a data frame from it, with cases corresponding to lines and variables to fields in the file. Usage read.table(file, header = FALSE, sep = "", quote = "\"'", dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"), row.names, col.names, as.is = !stringsAsFactors, na.strings = "NA", colClasses = NA, nrows = -1, skip = 0, check.names = TRUE, fill = !blank.lines.skip, strip.white = FALSE, blank.lines.skip = TRUE, comment.char = "#", allowEscapes = FALSE, flush = FALSE, stringsAsFactors = default.stringsAsFactors(), fileEncoding = "", encoding = "unknown", text, skipNul = FALSE) ... [discussed in class] > help(write) [also discussed in class] ##### Some helpful functions > tmpmat = array( rbinom(20,2,.5), c(5,4) ) tmpmat [,1] [,2] [,3] [,4] [1,] 0 1 0 1 [2,] 1 1 2 1 [3,] 1 2 0 0 [4,] 2 2 1 1 [5,] 1 1 1 2 > apply(tmpmat, 2, sum) [1] 5 7 4 5 > apply(tmpmat, 1, sum) [1] 2 5 3 6 5 > c(tmpmat %*% rep(1,4)) [1] 2 5 3 6 5 > for(i in 1:5) cat(sum(tmpmat[i,]), ". "); cat("\n") 2 . 5 . 3 . 6 . 5 . > table(c(tmpmat)) 0 1 2 4 11 5 ######## > attributes(fr) $names [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12" [13] "X13" "X14" "X15" "X16" "X17" "X18" "X19" "X20" "X21" "X22" "X23" "X24" [25] "X25" "X26" "X27" "X28" "X29" "X30" "X31" "X32" "X33" "X34" "X35" "X36" [37] "X37" "X38" "X39" "X40" "X41" "X42" "X43" "X44" "X45" "X46" "X47" "X48" [49] "X49" "X50" $row.names [1] 1 2 3 4 5 6 7 8 9 10 $class [1] "data.frame" > unlist(lapply(fr,class))[1:7] X1 X2 X3 X4 X5 X6 X7 "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" > lst1 = list(A = 1:15, B = c("A","B","C","D"), C = diag(1:4)) > unlist(lapply(lst1, length)) A B C 15 4 16 > lapply(lst1,summary) $A Min. 1st Qu. Median Mean 3rd Qu. Max. 1.0 4.5 8.0 8.0 11.5 15.0 $B Length Class Mode 4 character character $C V1 V2 V3 V4 Min. :0.00 Min. :0.0 Min. :0.00 Min. :0 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0 Median :0.00 Median :0.0 Median :0.00 Median :0 Mean :0.25 Mean :0.5 Mean :0.75 Mean :1 3rd Qu.:0.25 3rd Qu.:0.5 3rd Qu.:0.75 3rd Qu.:1 Max. :1.00 Max. :2.0 Max. :3.00 Max. :4 > tmp0 = diag(4)/tmpmat[1:4,] > tmp0 [,1] [,2] [,3] [,4] [1,] Inf 0 NaN 0 [2,] 0 1 0 0 [3,] 0 0 Inf NaN [4,] 0 0 0 1 > apply(tmp0, 2, function(col) sum(is.na(col))) [1] 0 0 1 1 > apply(tmp0, 2, function(col) sum(is.nan(col))) [1] 0 0 1 1 ####### Other functions f1 = function(x, y) x - 3*y^2 f1(5:7, 1:3) > f1(5:7, 1:3) [1] 2 -6 -20 > > outer(5:7, 1:3, f1) [,1] [,2] [,3] [1,] 2 -7 -22 [2,] 3 -6 -21 [3,] 4 -5 -20 > rvec = runif(10000) > table(trunc(rvec * 20)) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 > tmp = hist(rvec, nclass = 20) > tmp$counts [1] 500 477 500 495 484 544 443 541 474 533 500 516 512 513 475 484 485 518 488 518 > svec = rnorm(1000, 1, sqrt(5)) hist(svec, nclass=25, prob=T, , xlab="svec values", ylab="Density", main= paste("Histogram with overplotted", "\n normal density curve")) curve(dnorm(x,1,sqrt(5)), add=T, col="red") > help(plot) ### plotting x versus y, see also "points", "lines" help(par) ### so-called "graphical parameters", including plotting symbols 9/16/2015 ## Discussion of histograms in class ## we talked about over-laying density with histogram ## Continuing previous uniform and normal examples: first 10,000 uniform[0,1] points > round(tmp$counts/10000 - .05, 4) [1] 0.0000 -0.0023 0.0000 -0.0005 -0.0016 0.0044 -0.0057 0.0041 -0.0026 [10] 0.0033 0.0000 0.0016 0.0012 0.0013 -0.0025 -0.0016 -0.0015 0.0018 [19] -0.0012 0.0018 ### Pretty accurate: each should be roughly within range > qnorm(.99)*c(-1,1)/sqrt(12*10000) [1] -0.006715588 0.006715588 ### Now for second case, "svec" 1000 normals with mean 1, var 5 > tmp2 = hist(svec, nclass=25, prob=T) table(diff(tmp2$breaks)) 0.5 30 round(tmp2$density - dnorm(tmp2$mids,1,sqrt(5)), 4) [1] 0.0018 0.0016 0.0011 0.0021 -0.0016 0.0015 0.0087 0.0053 -0.0033 [10] -0.0057 0.0020 0.0162 -0.0055 0.0327 -0.0026 -0.0107 0.0347 0.0227 [19] -0.0287 -0.0386 -0.0093 -0.0175 0.0082 0.0020 0.0003 -0.0113 0.0093 [28] -0.0073 -0.0025 -0.0016 round(tmp2$counts/1000 - (pnorm(tmp2$breaks[2:31],1,sqrt(5))- pnorm(tmp2$breaks[1:30],1,sqrt(5))), 4) [1] 0.0009 0.0008 0.0005 0.0010 -0.0008 0.0007 0.0043 0.0026 -0.0017 [10] -0.0029 0.0009 0.0081 -0.0028 0.0164 -0.0012 -0.0052 0.0175 0.0115 [19] -0.0142 -0.0192 -0.0046 -0.0088 0.0041 0.0009 0.0001 -0.0057 0.0046 [28] -0.0037 -0.0013 -0.0008 ### These last two calculated vectors are indices of "goodness of fit" of ### the histogram to the correct normal density. > max(abs(.Last.value)) [1] 0.0192 ### might be used in a quick assessment of fit ### Some GRAPHICAL commands > xv = rgamma(100,2) yv = rnorm(1+2*xv, 0.3*(xv^0.25)) plot(xv, yv) tmp1 = lm(yv~xv) lines(xv, tmp1$fit, col="blue") inds = identify(xv,yv) ### left-click successive points of interest in plot [1] 1 14 41 70 80 83 96 ### right-click mouse when finished sqrt( (yv - tmp1$fit)[inds]/summary(tmp1)$sigma ) 25 26 50 64 1.2113197 0.7710393 0.9728214 1.2791945 #### Managing arrays for simulations simarray = array( rgamma(100000,3,2), c(100,1000)) hist(apply(simarray - 1.5,2,mean), nclass =12, prob=T) curve(dnorm(x,0, sqrt(3/(4*100))), add=T) > sqrt(var(apply(simarray - 1.5,2,mean))) [1] 0.09068389 ### compare with theoretical sqrt(3/(4*100))=0.08660254 > curve(dnorm(x,0,sqrt(3/400)), -.4,.4) > curve(dgamma(x+1.5,300,200), add=T, col="green") ### compares two theoretical densities 9/28/2015 Method of Simulating Normal R.V.'s (Box-Mueller method) Exercise: if X,Y are indep N(0,1) then A = arcsin(X/sqrt(X^2+Y^2)) ~ Unif(-pi/2,pi/2) and S = X^2+Y^2 ~ Expon(1/2) are indep So can simulate independent standard-normal pair by S = 2*rexp(1), A = pi*(2*runif(1)-1) and (X,Y) = sqrt(S)*(sin(A), cos(A)) > Svec = 2*rexp(1000); Avec = pi*(2*runif(1000)-1) hist(sqrt(Svec)*sin(Avec), nclass=40, prob=T) curve(dnorm(x), col="red", add=T) Xmat = sqrt(Svec)*cbind(sin(Avec),cos(Avec)) plot(Xmat) ## this checks independence abline(h=0); abline(h=1); abline(v=1, col="red"); abline(v=2,col="red") > mean(abs(Xmat[,1]-1.5)<.5 & abs(Xmat[,2]-0.5)<0.5) [1] 0.049 > mean(abs(Xmat[,1]-1.5)<.5)*mean(abs(Xmat[,2]-.5)<.5) [1] 0.046843 --------------------------------------- Second (slower) method: accept-reject First simulate Logistic r.v. Can directly use rlogis(1000), alternatively recall plogis(x) = e^x/(1+e^x) and inverse is qlogis(r) = log(r/(1-r)), so could do qlogis(runif(1000)) Check the max of N(0,1) density divided by dlogis(x,0,C) > curve(dnorm(x)/dlogis(x),0,2) ### max <= 1.6 > curve(dnorm(x)/dlogis(x,0,1/sqrt(3)),0,2) ### max <= 1.16005 max(dnorm(seq(1,2,by=.001))/dlogis(seq(1,2,by=.001),0,1/sqrt(3))) ## = 1.16002 ## Use dlogis(x) : > Uvec = runif(1000)*1.6 Wvec = rlogis(1000) Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] ### length 632 hist(Yvec, nclass=15, prob=T) ## Next try it with dlogis(x,0,1/sqrt(3)) > Uvec = runif(1000)*1.16002 Wvec = rlogis(1000,0,1/sqrt(3)) Yvec = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] ### length 916 hist(Yvec, nclass=15, prob=T) ### Timing runs to generate 10 million N(0,1) rv's > memory.limit() [1] 3894 ### in Mb system.time({ Svec = sqrt(2*rexp(5e6)); Avec = pi*(2*runif(5e6)-1) tmp = c(Svec*sin(Avec), Svec*cos(Avec)) }) user system elapsed 1.06 0.20 1.26 > system.time({tmp = qnorm(runif(1e7))}) user system elapsed 1.03 0.09 1.15 > system.time({ Uvec = runif(1e7)*1.6 ; Wvec = rlogis(1e7) tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] }) user system elapsed 3.17 0.13 3.29 > system.time({ Uvec = runif(1e7)*1.16002; Wvec = rlogis(1e7,0,1/sqrt(3)) tmp = Wvec[ Uvec < dnorm(Wvec)/dlogis(Wvec) ] }) user system elapsed 3.31 0.27 3.58 ========================================================================