HW8 SOLUTIONS                                                    10/10/16

(1) Generate 10,000 standard normal N(0,1) random variates, and put them
    in an array Zmat of dimensions 500 x 20.

#--- Part (1)(i) solution

Zmat = array(rnorm(500*20), c(500,20))
br = c(-Inf,-1,0,1,Inf)
Zrows = apply(Zmat, 1, function(row) hist(row, br, plot=F)$counts)
### this is a 4 x 500 matrix with the j'th column containing
### the counts of Zmat[j,]-values in the four Ak sets

### Theoretical probabilities of each point falling in the Ak sets:
pvec = diff(pnorm(br))
> pvec
[1] 0.1586553 0.3413447 0.3413447 0.1586553

## 1st: a coarse test of fit of the overall counts (columns of Zrows
## summed across all 500 rows) to the multinomial probabilities pvec
> chisq.test(c(Zrows %*% rep(1,500)), p=pvec)

        Chi-squared test for given probabilities

data:  c(Zrows %*% rep(1, 500))
X-squared = 2.832, df = 3, p-value = 0.4183

### But a more refined test would be to ask whether the pattern of Zrows[,j]
### fits a multinomial distribution.
### For example, to test whether the first row of Zrows conforms to a
### Binom(20,pvec[1]) distribution, we regard the values 0,...,6, and 7+
### as the 8 categories for which the binomial distribution gives the
### respective probabilities c(dbinom(0:6,20,pvec[1]), 1-pbinom(6,20,pvec[1]))

> round(rbind(Observed = table(pmin(Zrows[1,],7))/500,
              Expected = c(dbinom(0:6,20,pvec[1]), 1-pbinom(6,20,pvec[1]))), 4)
              0      1      2      3      4      5     6      7
Observed 0.0280 0.1120 0.2320 0.2440 0.2040 0.1120 0.044 0.0240
Expected 0.0316 0.1191 0.2134 0.2414 0.1935 0.1168 0.055 0.0291

> chisq.test(table(pmin(Zrows[1,],7)),
             p=c(dbinom(0:6,20,pvec[1]), 1-pbinom(6,20,pvec[1])))

        Chi-squared test for given probabilities

data:  table(pmin(Zrows[1, ], 7))
X-squared = 3.1813, df = 7, p-value = 0.8677

### Let's do the same kind of hypothesis test for the 2nd and 3rd rows of
### Zrows, using the 9 categories [0,3], 4:10, 11+ :

> chisq.test(table(pmax(3, pmin(Zrows[2,],11))),
             p=c(pbinom(3,20,pvec[2]), dbinom(4:10,20,pvec[2]),
                 1-pbinom(10,20,pvec[2])))

        Chi-squared test for given probabilities

data:  table(pmax(3, pmin(Zrows[2, ], 11)))
X-squared = 5.8686, df = 8, p-value = 0.6619

> chisq.test(table(pmax(3, pmin(Zrows[3,],11))),
             p=c(pbinom(3,20,pvec[2]), dbinom(4:10,20,pvec[2]),
                 1-pbinom(10,20,pvec[2])))

        Chi-squared test for given probabilities

data:  table(pmax(3, pmin(Zrows[3, ], 11)))
X-squared = 2.5266, df = 8, p-value = 0.9605

### Curiously, the last chi-squared statistic is unusually LOW compared to
### its reference distribution of chi-square with 8 df, but this is OK,
### because it is not very extreme and we are doing several hypothesis tests
### (i.e., "multiple comparisons").

#--- Part (1)(ii) solution

> qnorm(.9995)
[1] 3.290527
> table( apply(abs(Zmat) > qnorm(.9995), 1, sum) )
  0   1
493   7
### so 493 rows have this occurring not even once, and 7 rows have exactly
### 1 occurrence. Since P(|Z| > qnorm(.9995)) = 2*.0005 = .001, the sum in
### each row is Binom(20, .001), and therefore the probability that a row
### has >= 1 occurrence is 1-dbinom(0,20,.001) = .019811 ~ 1-dpois(0,.02).
### The number of rows out of 500 with >= 1 occurrence is then approximately
### a Poisson(500*.0198) = Poisson(9.9) r.v., for which the observed count
### of 7 is not a particularly unusual value.

#---------------------------------------

(2) We want to simulate a large number (>10^6) of independent identically
    distributed pseudo-random variable values from the density

        f(x) = C0/(1+x^2+3*x^4)^3,    for all x

#--- Part (2) solution

## 1st method: CRUDE SOLUTION WITH PROBABILITY INTEGRAL TRANSFORM,
## using piecewise linear interpolation to invert the cdf.
## NOTE that we must first transform to a finite interval! I transform
## by the change of variable y = arctan(sign(x)*abs(x)^(1/K)), so
## that |y| <= pi/2.
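As a quick sanity check that this change of variable round-trips as claimed
(a minimal sketch; the test values below are arbitrary, not part of the
assignment):

K = 4
x = c(-2.5, -0.3, 0, 0.7, 1.8)
y = sign(x)*atan(abs(x)^(1/K))
xback = sign(y)*abs(tan(y))^K
max(abs(x - xback))   ### ~ 1e-15: the map inverts cleanly up to rounding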
## We tabulate the cdf at a large grid of M points y_j in (-pi/2, pi/2)
## (M of order 100 or more), obtained by transforming an x-grid.

f1 = function(x) 1/(1+x^2+3*x^4)^3
C0 = 0.5/integrate(f1,0,Inf)$val    ### error << 1.e-8

> integrate(f1,-Inf,-3)
1.717825e-08 with absolute error < 1.9e-13
### so the mass outside [-3,3] is negligible, and we tabulate over [-3,3]:

> xvec = c(seq(0,1, by=.01), seq(1.02,2,.02), seq(2.05,3,by=.05))
xvec = c(-rev(xvec), xvec[-1])

## the density of y = arctan(sign(x)*abs(x)^(1/K)) when x has density C0*f1:
g = function(y, K=4) {
   vt = tan(y)^2
   (K*C0)*(1+vt)*vt^((K-1)/2)/(1+vt^K+3*vt^(2*K))^3
}
> integrate(g,-pi/2,pi/2)
1 with absolute error < 9.8e-05
> integrate(g,-pi/2,pi/2, K=1)
1 with absolute error < 2.3e-09
> integrate(g,-pi/2,pi/2, K=10)
1 with absolute error < 1.5e-07

### now create a function to interpolate the inverse cdf linearly

> SimFcn1 = function(K, xvec, rel.tol=.Machine$double.eps^0.25) {
     yvec = sign(xvec)*atan(abs(xvec)^(1/K))
     M = length(xvec)
     ints = numeric(M-1)
     for (i in 1:(M-1)) ints[i] =
           integrate(g, yvec[i], yvec[i+1], rel.tol=rel.tol, K=K)$val
     cdf = c(0, cumsum(ints))/sum(ints)  ## length M: cdf values at the xvec
                                         ## grid points, renormalized to end at 1
     invfcn = function(uv) {
        keep = !duplicated(cdf)
        newint = cdf[keep]
        xnew = xvec[keep]
        delta = diff(newint)
        dx = diff(xnew)
        uints = cut(uv, breaks=newint, labels=F)
        uvfrac = (uv - newint[uints])/delta[uints]
        xnew[uints] + dx[uints]*uvfrac   ## interpolate linearly in x
     }
     list(invfcn=invfcn, xyvals=cbind(x=xvec, yvec=yvec), cdf=cdf)
  }

> tmp = SimFcn1(4,xvec)
range(diff(tmp$cdf))
[1] 3.977421e-09 1.165561e-02
tmp$invfcn(seq(.1,.9,by=.1))
[1] -4.175975e-01 -2.798558e-01 -1.770826e-01 -8.594274e-02  2.002434e-08
[6]  8.734487e-02  1.776438e-01  2.787882e-01  4.134081e-01

### How can we check the accuracy of this inverse? One way is to consider
> plot(tmp$xyvals[,1], tmp$cdf, type="l")
> ut = runif(100); xt = tmp$invfcn(ut)
> points(xt,ut)
### all of these points are right on the visual cdf curve.
### But "accuracy" measured in terms of the true integrated cdf value
### can be summarized as follows.

acc = numeric(100)
err = numeric(100)
for(i in 1:100) {
   aux = integrate(f1,-Inf,xt[i])
   acc[i] = C0*aux$val - ut[i]
   err[i] = aux$abs.err
}
> summary(acc)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-2.409e-03 -5.774e-04  8.493e-05  5.700e-05  8.247e-04  1.882e-03
> summary(err)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
 9.300e-10  5.886e-07  2.310e-06  1.564e-05  1.812e-05  9.522e-05

### This says that we have achieved accuracy only of order .001.
### Now let's look at a timing run, for later comparison:
> system.time({ aux = runif(1e8) })
   user  system elapsed
   6.35    0.22    6.61
### so 1e6 uniforms take ~ .07 secs
> system.time({ aux = tmp$invfcn(runif(1e6)) })
   user  system elapsed
   0.25    0.02    0.27
### Pretty fast ... and the time required to run tmp = SimFcn1(4,xvec)
### was .03 sec.

### Now let's increase the accuracy, and check the speed.
### First try a changed K parameter, 8 instead of 4:
> system.time({ tmp2 = SimFcn1(8, xvec) })              ## < .02 secs
system.time({ aux = tmp2$invfcn(runif(1e6)) })          ## 0.22 secs
## Again an accuracy check:
> ut = runif(100); xt = tmp2$invfcn(ut)
for(i in 1:100) {
   aux = integrate(f1,-Inf,xt[i], rel.tol=1.e-8)
   acc[i] = C0*aux$val - ut[i]
   err[i] = aux$abs.err
}
> summary(err)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
 4.298e-12  4.536e-11  5.627e-10  1.390e-09  2.436e-09  9.340e-09
> summary(acc)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-2.380e-03 -5.063e-04 -8.722e-06 -8.032e-05  4.742e-04  1.689e-03
### accuracy is about the same as before.

### So now try making the xvec spacings MUCH finer, and decreasing the
### rel.tol in the SimFcn1 function call:
> xvec2 = c(seq(0,1, by=.0001), seq(1.0002,2,.0002), seq(2.01,3,by=.01))
xvec2 = c(-rev(xvec2), xvec2[-1])      #### 30201 points !!!
> system.time({ tmp3 = SimFcn1(8, xvec2, rel.tol=1.e-7) })
   user  system elapsed
   2.34    0.01    2.37
> system.time({ aux = tmp3$invfcn(runif(1e6)) })
   user  system elapsed
   0.32    0.02    0.35
### So the time to do the inversion by lookup and piecewise linear
### interpolation increases only modestly (about 30%), but the preprocessing
### to do accurate integration in the first place was very much longer.
### Now let's see the payoff in accuracy:
> ut = runif(100); xt = tmp3$invfcn(ut)
for(i in 1:100) {
   aux = integrate(f1,-Inf,xt[i], rel.tol=1.e-8)
   acc[i] = C0*aux$val - ut[i]
   err[i] = aux$abs.err
}
## all err's < 1e-8, and accuracy now of order 1e-5
> summary(acc)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-2.227e-05 -5.716e-06 -9.518e-08 -7.551e-07  4.655e-06  1.777e-05

### NB. The many integrations being done here could be sped up A LOT
### by doing a vectorized Simpson's Rule, e.g. along the lines sketched below.
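Here is one way such a vectorized Simpson's Rule could look (a sketch only;
simpson.cells is an illustrative name, not part of the assignment). Since the
integral of g over [y_i, y_{i+1}] equals the integral of C0*f1 over
[x_i, x_{i+1}] by the change of variables, we can apply one Simpson panel per
cell directly on the x-grid:

simpson.cells = function(f, grid) {
   a = grid[-length(grid)]
   b = grid[-1]
   ((b-a)/6)*(f(a) + 4*f((a+b)/2) + f(b))   ## one Simpson panel per grid cell
}
ints.simp = simpson.cells(function(x) C0*f1(x), xvec2)
cdf.simp = c(0, cumsum(ints.simp))/sum(ints.simp)
### the per-cell Simpson error is O(h^5), so at these spacings cdf.simp should
### agree with the integrate()-based cdf to roughly 1e-9 or better, while
### taking only a tiny fraction of the 2.3 sec preprocessing time.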
## All of the generated points give visually excellent histograms:
> hist(tmp3$invfcn(runif(1e6)), nclass=80, xlim=c(-1.3, 1.3), prob=T,
       main="Hist. of Prob Int Trans Method \n for density f, n=1e6")
curve(C0*f1(x), add=T, col="orange")

#---------
## 2nd METHOD: ACCEPT-REJECT using a piecewise defined bounding function

Since f(x) = C0/(1+x^2+3*x^4)^3 for all x, we cannot hope to bound f on
[-1/2,1/2] by anything much better than the constant C0; for |x| in (.5,1]
we bound by a multiple of the rescaled, truncated t_5 density
2*sqrt(5)*dt(2*sqrt(5)*x,5); and for |x| > 1 we bound by a multiple of the
Pareto-type density 11/|x|^12.

To start, we obtain the probabilities of the subintervals according to the
desired density as follows:

## A1 = [-0.5,0.5]
> p1 = C0*integrate(f1,-.5,.5, rel.tol=1.e-8)$val        ### = 0.8814516
## A2 = [-1,-.5] , A3 = [.5,1]
> p2 = p3 = C0*integrate(f1,-1,-.5, rel.tol=1.e-8)$val   ### = 0.05812933
## A4 = [1,Inf), A5 = (-Inf,-1]
> p4 = p5 = C0*integrate(f1,-Inf,-1, rel.tol=1.e-8)$val  ### = 0.001144859

Next, the bounds. The conditional density (C0/p1)*f1 on [-.5,.5] is bounded
above by the constant (C0/p1) = 1.322452, since f1 <= 1.

Then on A3 = (.5,1], the conditional target density
> h2 = function(x) (C0/p3)/(1+x^2+3*x^4)^3
must be bounded by a multiple of the (rescaled, truncated t_5) proposal density
> h1 = function(x) 2*sqrt(5)*dt(0,5)*(1+4*x^2)^(-3)/
                       (pt(2*sqrt(5),5)-pt(sqrt(5),5))
so we need the maximum of the ratio h2/h1 of target over proposal:
> M2 = optimize(function(x) h2(x)/h1(x), c(.5,1), maximum=TRUE)$obj  ### = 1.1211
### the maximum is attained at an interior point x ~ 0.556,
### so h2(x) <= 1.122*h1(x) on the whole interval (.5,1]

Finally on (1,Inf) the conditional target density
> h4 = function(x) (C0/p5)/(1+x^2+3*x^4)^3
must be bounded by a multiple of the proposal density
> h3 = function(x) 11/x^12
The ratio h4/h3 increases in x to its limit (C0/p5)/(27*11) as x -> Inf:
> M4 = optimize(function(x) h4(x)/h3(x), c(1, 1e4), maximum=TRUE)$obj ### = 3.4282
### So h4(x) <= 3.429*h3(x) on all of (1,Inf)

### So now we can implement the simulation as a mixture
### of accept-reject generated densities, as follows.
### The vector of bounds on the density ratios in the 5 intervals,
### each increased by a little bit (2% in the largest-probability
### interval A1, 5% in A2,A3 and 10% in A4,A5) so that we generate
### enough proposals:
mults = c(1.35, 1.18, 1.18, 3.78, 3.78)
## acceptance probabilities = target density/(bound * proposal density):
f2 = function(x) h2(x)/(1.122*h1(x))
f4 = function(x) h4(x)/(3.429*h3(x))
pvec = c(p1,p2,p3,p4,p5)
## Recall that the acceptance probability on A1 is just f1 (uniform proposal
## with bound C0/p1), that f2 applies on both A2 and A3, and f4 on both
## A4 and A5. A quick check of these bounds follows.
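As a numerical sanity check on the two bounds (a sketch; the evaluation grids
below are arbitrary), the acceptance probabilities must never exceed 1:

xs = seq(.5, 1, by=.001)
max(f2(xs))          ### ~ 0.999, attained near x = 0.556: OK
xs = exp(seq(0, log(1e4), length=2000))
max(f4(xs))          ### ~ 0.9998, approached as x -> Inf: OK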
SimFcn2 = function(N, pvec) {   ### simulates N r.v.'s
   Lvec = sample(1:5, N, prob=pvec, replace=T)
   Xvec = numeric(N)
   Mvec = table(Lvec)
   ### now define a list Uarr of matrices of all the random uniforms we need
   ### for accept-reject in the 5 subintervals, with Uarr[[k]][i,1] and
   ### Uarr[[k]][i,2] the pairs used in the k'th subinterval
   Uarr = NULL
   for (k in 1:5) Uarr = c(Uarr,
        list(matrix(runif(2*round(mults[k]*Mvec[k])), ncol=2)))
   ## A1: uniform proposal on [-.5,.5], acceptance probability f1(x)
   xtmp = Uarr[[1]][,1] - 0.5
   Xvec[Lvec==1] = (xtmp[Uarr[[1]][,2] <= f1(xtmp)])[1:Mvec[1]]
   ## A2, A3: rescaled truncated t_5 proposal via inverse cdf, acceptance f2(x)
   c1 = pt(sqrt(5),5)
   c2 = pt(2*sqrt(5),5) - pt(sqrt(5),5)
   xtmp = (0.5/sqrt(5))*qt( Uarr[[2]][,1]*c2 + 1-c1-c2, 5)
   Xvec[Lvec==2] = (xtmp[Uarr[[2]][,2] <= f2(xtmp)])[1:Mvec[2]]
   xtmp = (0.5/sqrt(5))*qt( Uarr[[3]][,1]*c2 + c1, 5)
   Xvec[Lvec==3] = (xtmp[Uarr[[3]][,2] <= f2(xtmp)])[1:Mvec[3]]
   ## A4, A5: Pareto-type proposal with cdf 1 - x^(-11) on (1,Inf),
   ## inverted as U^(-1/11) (negated for A5); acceptance probability f4(x)
   xtmp = Uarr[[4]][,1]^(-1/11)
   Xvec[Lvec==4] = (xtmp[Uarr[[4]][,2] <= f4(xtmp)])[1:Mvec[4]]
   xtmp = -(Uarr[[5]][,1]^(-1/11))
   Xvec[Lvec==5] = (xtmp[Uarr[[5]][,2] <= f4(xtmp)])[1:Mvec[5]]
   ## (the safety margins in mults make it very unlikely that fewer than
   ## Mvec[k] proposals are accepted; NAs would appear if that ever failed)
   list(Xvec, Mvec)
}

> system.time({ aux = SimFcn2(1e6, pvec) })
   user  system elapsed
   3.04    0.11    3.15
> aux[[2]]     ### the numbers of points in the respective Ak sets
Lvec
     1      2      3      4      5
881721  58114  57875   1143   1147

> hist(aux[[1]], nclass=80, xlim=c(-1.3, 1.3), prob=T,
       main="Hist. of Accept-Reject Method \n for density f, n=1e6")
curve(C0*f1(x), add=T, col="orange")

### So the time is longer, but the simulation is exact!
### If we really wanted to provide accuracy 1e-6 or better, it is not yet
### clear which method is better, although I suspect it is Method 1 with
### Simpson's-rule integrations in this one-dimensional example.
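As one more optional check (a sketch, reusing the aux just computed): compare
the empirical cdf of the accept-reject sample against the numerically
integrated cdf at a few reference points.

Fnum = function(x) C0*integrate(f1, -Inf, x, rel.tol=1.e-10)$val
xs = c(-1, -.5, 0, .5, 1)
Fhat = sapply(xs, function(x) mean(aux[[1]] <= x))
round(cbind(x=xs, empirical=Fhat, integrated=sapply(xs, Fnum)), 4)
### with n = 1e6 the two cdf columns should agree to within ~ 1e-3
### (binomial Monte Carlo error), since the accept-reject sampler is exact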