HW8 SOLUTIONS                                                    10/10/16

(1) Generate 10,000 standard normal N(0,1) random variates, and put them
    in an array Zmat of dimensions 500 x 20.

#--- Part (1)(i) solution

Zmat = array(rnorm(500*20), c(500,20))
br = c(-Inf,-1,0,1,Inf)
Zrows = apply(Zmat, 1, function(row) hist(row, br, plot=F)$counts)
### this is a 4 x 500 matrix with the j'th column containing
### the counts of Zmat[j,]-values in the four Ak sets

### Theoretical probabilities of each point falling in the Ak sets:
pvec = diff(pnorm(br))
> pvec
[1] 0.1586553 0.3413447 0.3413447 0.1586553

## 1st: a coarse test of fit of the overall counts (columns of Zrows
## summed across all 500 rows) to the multinomial probabilities pvec
> chisq.test(c(Zrows %*% rep(1,500)), p=pvec)

        Chi-squared test for given probabilities

data:  c(Zrows %*% rep(1, 500))
X-squared = 2.832, df = 3, p-value = 0.4183

### But a more refined test would be to ask whether the pattern of Zrows[,j]
### fits a multinomial distribution.
### For example, to test whether the first row of Zrows conforms to a
### Binom(20,pvec[1]) distribution, we regard the values 0,...,6, and 7+
### as the 8 categories for which the binomial distribution gives the
### respective probabilities c(dbinom(0:6,20,pvec[1]), 1-pbinom(6,20,pvec[1]))

> round(rbind(Observed = table(pmin(Zrows[1,],7))/500,
              Expected = c(dbinom(0:6,20,pvec[1]), 1-pbinom(6,20,pvec[1]))), 4)
              0      1      2      3      4      5     6      7
Observed 0.0280 0.1120 0.2320 0.2440 0.2040 0.1120 0.044 0.0240
Expected 0.0316 0.1191 0.2134 0.2414 0.1935 0.1168 0.055 0.0291

> chisq.test(table(pmin(Zrows[1,],7)),
             p=c(dbinom(0:6,20,pvec[1]), 1-pbinom(6,20,pvec[1])))

        Chi-squared test for given probabilities

data:  table(pmin(Zrows[1, ], 7))
X-squared = 3.1813, df = 7, p-value = 0.8677

### Let's do the same kind of hypothesis test for the 2nd and 3rd rows of
### Zrows, using the 9 categories [0,3], 4:10, 11+ :

> chisq.test(table(pmax(3, pmin(Zrows[2,],11))),
             p=c(pbinom(3,20,pvec[2]), dbinom(4:10,20,pvec[2]),
                 1-pbinom(10,20,pvec[2])))

        Chi-squared test for given probabilities

data:  table(pmax(3, pmin(Zrows[2, ], 11)))
X-squared = 5.8686, df = 8, p-value = 0.6619

> chisq.test(table(pmax(3, pmin(Zrows[3,],11))),
             p=c(pbinom(3,20,pvec[2]), dbinom(4:10,20,pvec[2]),
                 1-pbinom(10,20,pvec[2])))

        Chi-squared test for given probabilities

data:  table(pmax(3, pmin(Zrows[3, ], 11)))
X-squared = 2.5266, df = 8, p-value = 0.9605

### Curiously, the last chi-squared statistic is unusually LOW compared to
### its reference distribution of chi-square with 8 df, but this is OK,
### because it is not very extreme and we are doing several hypothesis tests
### (i.e., "multiple comparisons").

#--- Part (1)(ii) solution

> qnorm(.9995)
[1] 3.290527
> table( apply(abs(Zmat) > qnorm(.9995), 1, sum) )
  0   1
493   7
### so 493 rows have this occurring not even once, and 7 rows have exactly
### 1 occurrence. Since P(|Z| > qnorm(.9995)) = 2*.0005 = .001, the sum in
### each row is Binom(20, .001), and therefore the probability that a row
### has >= 1 occurrence is 1-dbinom(0,20,.001) = .019811 ~ 1-dpois(0,.02).
### The number of rows out of 500 with >= 1 occurrence is then approximately
### a Poisson(500*.0198) = Poisson(9.9) r.v., for which the observed count
### of 7 is not a particularly unusual value.

#---------------------------------------

(2) We want to simulate a large number (>10^6) of independent identically
    distributed pseudo-random variable values from the density

        f(x) = C0/(1+x^2+3*x^4)^3,    for all x

#--- Part (2) solution

## 1st method: CRUDE SOLUTION WITH PROBABILITY INTEGRAL TRANSFORM,
## using piecewise linear interpolation to invert the cdf.
## NOTE that we must first transform to a finite interval! I transform
## by the change of variable y = arctan(sign(x)*abs(x)^(1/K)), so
## that |y| <= pi/2.
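As a quick sanity check that this change of variable round-trips as claimed
(a minimal sketch; the test values below are arbitrary, not part of the
assignment):

K = 4
x = c(-2.5, -0.3, 0, 0.7, 1.8)
y = sign(x)*atan(abs(x)^(1/K))
xback = sign(y)*abs(tan(y))^K
max(abs(x - xback))   ### ~ 1e-15: the map inverts cleanly up to rounding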
## We tabulate the cdf at a large grid of M points y_j in (-pi/2, pi/2)
## (M of order 100 or more), obtained by transforming an x-grid.

f1 = function(x) 1/(1+x^2+3*x^4)^3
C0 = 0.5/integrate(f1,0,Inf)$val    ### error << 1.e-8

> integrate(f1,-Inf,-3)
1.717825e-08 with absolute error < 1.9e-13
### so the mass outside [-3,3] is negligible, and we tabulate over [-3,3]:

> xvec = c(seq(0,1, by=.01), seq(1.02,2,.02), seq(2.05,3,by=.05))
xvec = c(-rev(xvec), xvec[-1])

## the density of y = arctan(sign(x)*abs(x)^(1/K)) when x has density C0*f1:
g = function(y, K=4) {
   vt = tan(y)^2
   (K*C0)*(1+vt)*vt^((K-1)/2)/(1+vt^K+3*vt^(2*K))^3
}
> integrate(g,-pi/2,pi/2)
1 with absolute error < 9.8e-05
> integrate(g,-pi/2,pi/2, K=1)
1 with absolute error < 2.3e-09
> integrate(g,-pi/2,pi/2, K=10)
1 with absolute error < 1.5e-07

### now create a function to interpolate the inverse cdf linearly

> SimFcn1 = function(K, xvec, rel.tol=.Machine$double.eps^0.25) {
     yvec = sign(xvec)*atan(abs(xvec)^(1/K))
     M = length(xvec)
     ints = numeric(M-1)
     for (i in 1:(M-1)) ints[i] =
           integrate(g, yvec[i], yvec[i+1], rel.tol=rel.tol, K=K)$val
     cdf = c(0, cumsum(ints))/sum(ints)  ## length M: cdf values at the xvec
                                         ## grid points, renormalized to end at 1
     invfcn = function(uv) {
        keep = !duplicated(cdf)
        newint = cdf[keep]
        xnew = xvec[keep]
        delta = diff(newint)
        dx = diff(xnew)
        uints = cut(uv, breaks=newint, labels=F)
        uvfrac = (uv - newint[uints])/delta[uints]
        xnew[uints] + dx[uints]*uvfrac   ## interpolate linearly in x
     }
     list(invfcn=invfcn, xyvals=cbind(x=xvec, yvec=yvec), cdf=cdf)
  }

> tmp = SimFcn1(4,xvec)
range(diff(tmp$cdf))
[1] 3.977421e-09 1.165561e-02
tmp$invfcn(seq(.1,.9,by=.1))
[1] -4.175975e-01 -2.798558e-01 -1.770826e-01 -8.594274e-02  2.002434e-08
[6]  8.734487e-02  1.776438e-01  2.787882e-01  4.134081e-01

### How can we check the accuracy of this inverse? One way is to consider
> plot(tmp$xyvals[,1], tmp$cdf, type="l")
> ut = runif(100); xt = tmp$invfcn(ut)
> points(xt,ut)
### all of these points are right on the visual cdf curve.
### But "accuracy" measured in terms of the true integrated cdf value
### can be summarized as follows.

acc = numeric(100)
err = numeric(100)
for(i in 1:100) {
   aux = integrate(f1,-Inf,xt[i])
   acc[i] = C0*aux$val - ut[i]
   err[i] = aux$abs.err
}
> summary(acc)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-2.409e-03 -5.774e-04  8.493e-05  5.700e-05  8.247e-04  1.882e-03
> summary(err)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
 9.300e-10  5.886e-07  2.310e-06  1.564e-05  1.812e-05  9.522e-05

### This says that we have achieved accuracy only of order .001.
### Now let's look at a timing run, for later comparison:
> system.time({ aux = runif(1e8) })
   user  system elapsed
   6.35    0.22    6.61
### so 1e6 uniforms take ~ .07 secs
> system.time({ aux = tmp$invfcn(runif(1e6)) })
   user  system elapsed
   0.25    0.02    0.27
### Pretty fast ... and the time required to run tmp = SimFcn1(4,xvec)
### was .03 sec.

### Now let's increase the accuracy, and check the speed.
### First try a changed K parameter, 8 instead of 4:
> system.time({ tmp2 = SimFcn1(8, xvec) })              ## < .02 secs
system.time({ aux = tmp2$invfcn(runif(1e6)) })          ## 0.22 secs
## Again an accuracy check:
> ut = runif(100); xt = tmp2$invfcn(ut)
for(i in 1:100) {
   aux = integrate(f1,-Inf,xt[i], rel.tol=1.e-8)
   acc[i] = C0*aux$val - ut[i]
   err[i] = aux$abs.err
}
> summary(err)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
 4.298e-12  4.536e-11  5.627e-10  1.390e-09  2.436e-09  9.340e-09
> summary(acc)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-2.380e-03 -5.063e-04 -8.722e-06 -8.032e-05  4.742e-04  1.689e-03
### accuracy is about the same as before.

### So now try making the xvec spacings MUCH finer, and decreasing the
### rel.tol in the SimFcn1 function call:
> xvec2 = c(seq(0,1, by=.0001), seq(1.0002,2,.0002), seq(2.01,3,by=.01))
xvec2 = c(-rev(xvec2), xvec2[-1])      #### 30201 points !!!
> system.time({ tmp3 = SimFcn1(8, xvec2, rel.tol=1.e-7) })
   user  system elapsed
   2.34    0.01    2.37
> system.time({ aux = tmp3$invfcn(runif(1e6)) })
   user  system elapsed
   0.32    0.02    0.35
### So the time to do the inversion by lookup and piecewise linear
### interpolation increases only modestly (about 30%), but the preprocessing
### to do accurate integration in the first place was very much longer.
### Now let's see the payoff in accuracy:
> ut = runif(100); xt = tmp3$invfcn(ut)
for(i in 1:100) {
   aux = integrate(f1,-Inf,xt[i], rel.tol=1.e-8)
   acc[i] = C0*aux$val - ut[i]
   err[i] = aux$abs.err
}
## all err's < 1e-8, and accuracy now of order 1e-5
> summary(acc)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max.
-2.227e-05 -5.716e-06 -9.518e-08 -7.551e-07  4.655e-06  1.777e-05

### NB. The many integrations being done here could be sped up A LOT
### by doing a vectorized Simpson's Rule, e.g. along the lines sketched below.
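Here is one way such a vectorized Simpson's Rule could look (a sketch only;
simpson.cells is an illustrative name, not part of the assignment). Since the
integral of g over [y_i, y_{i+1}] equals the integral of C0*f1 over
[x_i, x_{i+1}] by the change of variables, we can apply one Simpson panel per
cell directly on the x-grid:

simpson.cells = function(f, grid) {
   a = grid[-length(grid)]
   b = grid[-1]
   ((b-a)/6)*(f(a) + 4*f((a+b)/2) + f(b))   ## one Simpson panel per grid cell
}
ints.simp = simpson.cells(function(x) C0*f1(x), xvec2)
cdf.simp = c(0, cumsum(ints.simp))/sum(ints.simp)
### the per-cell Simpson error is O(h^5), so at these spacings cdf.simp should
### agree with the integrate()-based cdf to roughly 1e-9 or better, while
### taking only a tiny fraction of the 2.3 sec preprocessing time.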
## All of the generated points give visually excellent histograms:
> hist(tmp3$invfcn(runif(1e6)), nclass=80, xlim=c(-1.3, 1.3), prob=T,
       main="Hist. of Prob Int Trans Method \n for density f, n=1e6")
curve(C0*f1(x), add=T, col="orange")

#---------
## 2nd METHOD: ACCEPT-REJECT using a piecewise defined bounding function

Since f(x) = C0/(1+x^2+3*x^4)^3 for all x, we cannot hope to bound f on
[-1/2,1/2] by anything much better than the constant C0; for |x| in (.5,1]
we bound by a multiple of the rescaled, truncated t_5 density
2*sqrt(5)*dt(2*sqrt(5)*x,5); and for |x| > 1 we bound by a multiple of the
Pareto-type density 11/|x|^12.

To start, we obtain the probabilities of the subintervals according to the
desired density as follows:

## A1 = [-0.5,0.5]
> p1 = C0*integrate(f1,-.5,.5, rel.tol=1.e-8)$val        ### = 0.8814516
## A2 = [-1,-.5] , A3 = [.5,1]
> p2 = p3 = C0*integrate(f1,-1,-.5, rel.tol=1.e-8)$val   ### = 0.05812933
## A4 = [1,Inf), A5 = (-Inf,-1]
> p4 = p5 = C0*integrate(f1,-Inf,-1, rel.tol=1.e-8)$val  ### = 0.001144859

Next, the bounds. The conditional density (C0/p1)*f1 on [-.5,.5] is bounded
above by the constant (C0/p1) = 1.322452, since f1 <= 1.

Then on A3 = (.5,1], the conditional target density
> h2 = function(x) (C0/p3)/(1+x^2+3*x^4)^3
must be bounded by a multiple of the (rescaled, truncated t_5) proposal density
> h1 = function(x) 2*sqrt(5)*dt(0,5)*(1+4*x^2)^(-3)/
                       (pt(2*sqrt(5),5)-pt(sqrt(5),5))
so we need the maximum of the ratio h2/h1 of target over proposal:
> M2 = optimize(function(x) h2(x)/h1(x), c(.5,1), maximum=TRUE)$obj  ### = 1.1211
### the maximum is attained at an interior point x ~ 0.556,
### so h2(x) <= 1.122*h1(x) on the whole interval (.5,1]

Finally on (1,Inf) the conditional target density
> h4 = function(x) (C0/p5)/(1+x^2+3*x^4)^3
must be bounded by a multiple of the proposal density
> h3 = function(x) 11/x^12
The ratio h4/h3 increases in x to its limit (C0/p5)/(27*11) as x -> Inf:
> M4 = optimize(function(x) h4(x)/h3(x), c(1, 1e4), maximum=TRUE)$obj ### = 3.4282
### So h4(x) <= 3.429*h3(x) on all of (1,Inf)

### So now we can implement the simulation as a mixture
### of accept-reject generated densities, as follows.
### The vector of bounds on the density ratios in the 5 intervals,
### each increased by a little bit (2% in the largest-probability
### interval A1, 5% in A2,A3 and 10% in A4,A5) so that we generate
### enough proposals:
mults = c(1.35, 1.18, 1.18, 3.78, 3.78)
## acceptance probabilities = target density/(bound * proposal density):
f2 = function(x) h2(x)/(1.122*h1(x))
f4 = function(x) h4(x)/(3.429*h3(x))
pvec = c(p1,p2,p3,p4,p5)
## Recall that the acceptance probability on A1 is just f1 (uniform proposal
## with bound C0/p1), that f2 applies on both A2 and A3, and f4 on both
## A4 and A5. A quick check of these bounds follows.
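As a numerical sanity check on the two bounds (a sketch; the evaluation grids
below are arbitrary), the acceptance probabilities must never exceed 1:

xs = seq(.5, 1, by=.001)
max(f2(xs))          ### ~ 0.999, attained near x = 0.556: OK
xs = exp(seq(0, log(1e4), length=2000))
max(f4(xs))          ### ~ 0.9998, approached as x -> Inf: OK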
SimFcn2 = function(N, pvec) {   ### simulates N r.v.'s
   Lvec = sample(1:5, N, prob=pvec, replace=T)
   Xvec = numeric(N)
   Mvec = table(Lvec)
   ### now define a list Uarr of matrices of all the random uniforms we need
   ### for accept-reject in the 5 subintervals, with Uarr[[k]][i,1] and
   ### Uarr[[k]][i,2] the pairs used in the k'th subinterval
   Uarr = NULL
   for (k in 1:5) Uarr = c(Uarr,
        list(matrix(runif(2*round(mults[k]*Mvec[k])), ncol=2)))
   ## A1: uniform proposal on [-.5,.5], acceptance probability f1(x)
   xtmp = Uarr[[1]][,1] - 0.5
   Xvec[Lvec==1] = (xtmp[Uarr[[1]][,2] <= f1(xtmp)])[1:Mvec[1]]
   ## A2, A3: rescaled truncated t_5 proposal via inverse cdf, acceptance f2(x)
   c1 = pt(sqrt(5),5)
   c2 = pt(2*sqrt(5),5) - pt(sqrt(5),5)
   xtmp = (0.5/sqrt(5))*qt( Uarr[[2]][,1]*c2 + 1-c1-c2, 5)
   Xvec[Lvec==2] = (xtmp[Uarr[[2]][,2] <= f2(xtmp)])[1:Mvec[2]]
   xtmp = (0.5/sqrt(5))*qt( Uarr[[3]][,1]*c2 + c1, 5)
   Xvec[Lvec==3] = (xtmp[Uarr[[3]][,2] <= f2(xtmp)])[1:Mvec[3]]
   ## A4, A5: Pareto-type proposal with cdf 1 - x^(-11) on (1,Inf),
   ## inverted as U^(-1/11) (negated for A5); acceptance probability f4(x)
   xtmp = Uarr[[4]][,1]^(-1/11)
   Xvec[Lvec==4] = (xtmp[Uarr[[4]][,2] <= f4(xtmp)])[1:Mvec[4]]
   xtmp = -(Uarr[[5]][,1]^(-1/11))
   Xvec[Lvec==5] = (xtmp[Uarr[[5]][,2] <= f4(xtmp)])[1:Mvec[5]]
   ## (the safety margins in mults make it very unlikely that fewer than
   ## Mvec[k] proposals are accepted; NAs would appear if that ever failed)
   list(Xvec, Mvec)
}

> system.time({ aux = SimFcn2(1e6, pvec) })
   user  system elapsed
   3.04    0.11    3.15
> aux[[2]]     ### the numbers of points in the respective Ak sets
Lvec
     1      2      3      4      5
881721  58114  57875   1143   1147

> hist(aux[[1]], nclass=80, xlim=c(-1.3, 1.3), prob=T,
       main="Hist. of Accept-Reject Method \n for density f, n=1e6")
curve(C0*f1(x), add=T, col="orange")

### So the time is longer, but the simulation is exact!
### If we really wanted to provide accuracy 1e-6 or better, it is not yet
### clear which method is better, although I suspect it is Method 1 with
### Simpson's-rule integrations in this one-dimensional example.
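As one more optional check (a sketch, reusing the aux just computed): compare
the empirical cdf of the accept-reject sample against the numerically
integrated cdf at a few reference points.

Fnum = function(x) C0*integrate(f1, -Inf, x, rel.tol=1.e-10)$val
xs = c(-1, -.5, 0, .5, 1)
Fhat = sapply(xs, function(x) mean(aux[[1]] <= x))
round(cbind(x=xs, empirical=Fhat, integrated=sapply(xs, Fnum)), 4)
### with n = 1e6 the two cdf columns should agree to within ~ 1e-3
### (binomial Monte Carlo error), since the accept-reject sampler is exact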