Homework Solutions to Problems 11--12.
=====================================

[NB: solutions are in Splus 6.0]

(11) (a)

> gmix <- function(N) {
     xout <- numeric(N)
     eps <- rbinom(N, 1, .3)             # mixture indicators: 1 = normal component
     naux <- sum(eps)
     xout[eps==1] <- rnorm(naux)         # N(0,1) piece, probability .3
     xout[eps==0] <- rlogis(N-naux)*1.5  # logistic(0, 1.5) piece, probability .7
     xout
  }

> xtmp <- gmix(1000)
> summary(xtmp)
      Min.   1st Qu.  Median    Mean 3rd Qu.     Max.
 -13.25289  -1.15870 0.00146 0.01452 1.23705 14.89885

> diff(c(0, 0.3*pnorm(c(-5,-1,.5,4)) + .7*plogis(c(-5,-1,.5,4)/1.5), 1))
[1] 0.02411172 0.26095540 0.33017076 0.33927420 0.04548792

> expec <- 1000*.Last.value
> obs <- hist(xtmp, breaks=c(-15,-5,-1,.5,4,20), prob=F, plot=F)$count
> sum((obs-expec)^2/expec)
[1] 2.217372    ### Very moderate for a chi-square with 4 df
                ### (5 cells, no estimated parameters) !!

(b) First generate a triple mm of counts of observations falling into the
respective intervals (-Inf,-1], (-1,1], (1,Inf), with respective
probabilities .2/e = .0736, 1 - .5/e = .8161, and .3/e = .1104. Then
simulate within these intervals from the corresponding densities, using
shifted exponentials for the extreme intervals and a rejection method
based on normals for the middle interval.

NB. The normal variates drawn for the middle-interval rejection step can
also be used to decide which observations fall in the tails: the three
interval probabilities are exactly the probabilities that a N(0,1)
variate falls below, between, or above the cutoffs

> c(qnorm(.2/exp(1)), qnorm(1 - .3/exp(1)))
[1] -1.449666  1.224595

> ThrSim <- function(N, fac=2) {
     rnvec <- rnorm(N)
     mm1 <- sum(rnvec < -1.449666)   # count for (-Inf,-1]
     mm3 <- sum(rnvec > 1.224595)    # count for (1,Inf)
     ind <- (1:N)[rnvec >= -1.449666 & rnvec <= 1.224595]
     rnaux <- c(rnvec[ind], rnorm(200 + length(ind)*(fac-1)))  # spare normals
     outvec <- numeric(N)
     outvec[rnvec < -1.449666] <- -1 - rexp(mm1)  # left tail: shifted exponential
     outvec[rnvec > 1.224595] <- 1 + rexp(mm3)    # right tail: shifted exponential
     outvec[ind] <- rnaux[abs(rnaux) < 1][1:(N-mm1-mm3)]  # middle: accept |x| < 1
     list(outvec = outvec, mcounts = c(mm1, N-mm1-mm3, mm3), normVal = ind)
  }

> dout <- ThrSim(1000)
> dout$mcounts/1000
[1] 0.078 0.818 0.104    ### These numbers match the interval probabilities well

> chisq.gof(dout$outvec[dout$outvec > 1] - 1, dist="exponential",
            rate=1/mean(dout$outvec[dout$outvec > 1] - 1), n.param.est=1)

	Chi-square Goodness of Fit Test

data:  dout$outvec[dout$outvec > 1] - 1
Chi-square = 6.25, df = 11, p-value = 0.8562     ### OK !!!

> chisq.gof(-dout$outvec[dout$outvec < -1] - 1, dist="exponential",
            rate=1/mean(-dout$outvec[dout$outvec < -1] - 1), n.param.est=1)

	Chi-square Goodness of Fit Test

data:  -dout$outvec[dout$outvec < -1] - 1
Chi-square = 11.2308, df = 10, p-value = 0.3398  ### OK.

(12) There are a few different ways to do this; I will cover three:

  (a) the multivariate probability integral transform method, applied
      directly (METHOD 0 below; very messy!);
  (b) the same method applied to affinely transformed data (METHOD 1;
      very simple!);
  (c) naive accept-reject (METHOD 2; slightly wasteful but very simple).

METHOD 0. This is a really ugly method when applied to the untransformed
problem. As an indication, the marginal density works out to

   f_Y(y) = c * I[0<y<1] * { (1-y)*min(y, 1.5-y)
                 + 0.5*((1-y)^2 - (max(0,y-0.5))^2) * I[y<0.75] },

so that 1/c is given numerically by

> integrate(function(y) (1-y)*pmin(y, 1.5-y), 0, 1)$integral +
  integrate(function(y) 0.5*((1-y)^2 - pmax(0,y-0.5)^2), 0, .75)$integral
[1] 0.3229167

This agrees to seven decimal places with the value 31/96 calculated
below, under METHOD 1.
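NB. As a quick sanity check of this piecewise formula (a sketch only:
the helper names areaY and fY are ours, and this snippet is written for
R, where integrate() returns its answer as $value rather than the Splus
$integral used above): f_Y(y)/c must equal the area of the (x,z)
cross-section {(x,z) in (0,1)^2 : x < 1-y, x+z < 1, z < 1.5-y} of the
region at height y, computed here by quadrature.

> areaY <- function(y)    # cross-section area at height y, by quadrature
     sapply(y, function(yy)
         integrate(function(x) pmin(1, 1-x, 1.5-yy), 0, 1-yy)$value)
> fY <- function(y)       # the closed-form expression for f_Y(y)/c above
     (1-y)*pmin(y, 1.5-y) + 0.5*((1-y)^2 - pmax(0,y-0.5)^2)*(y < 0.75)
> yy <- seq(.05, .95, by=.05)
> max(abs(areaY(yy) - fY(yy)))    # should be 0 up to quadrature error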
METHOD 1. We know how to do change-of-variable based on affine
transformations. Here let u = x, v = x+y, w = x+z. Then the mapping
(x,y,z) -> (u,v,w) is evidently linear and invertible, with Jacobian
determinant 1: so the inverse preserves volumes, and the region of
interest in (u,v,w) coordinates is

   (v,w) in (0,1)^2,   max(0.5*(v+w)-0.75, 0) < u < min(v,w).

However, we must still do the necessary integrations if we want to use
the (trivariate) probability integral transform method, yielding

   f_{V,W}(v,w) = c * (min(v,w) - max(0, 0.5*(v+w)-0.75)),  v,w in (0,1);

and then

   f_V(v) = c * (v - 0.5*v^2 - 0.25*(max(0,v-0.5))^2);

so that

   F_V(v) = c * (0.5*v^2 - v^3/6 - (1/12)*(max(0,v-0.5))^3).

## Setting v=1, F_V(1)=1, gives the value c = 96/31 mentioned above.
## NOTE that direct numerical inversion of this distribution function
## is not feasible in parallel fashion, so we resort to tabulation,
## spline-interpolation of the inverse, etc.

   F_{W|V}(w|v) = (c/f_V(v)) * (0.5*min(v,w)^2 + v*max(w-v,0)
                                 - 0.25*(max(0, v+w-1.5))^2).

> fv <- function(v) (96/31)*(v - 0.5*v^2 - 0.25*(pmax(0,v-0.5))^2)
> FV <- function(v) (96/31)*(0.5*v^2 - v^3/6 - (1/12)*(pmax(0,v-0.5))^3)
> FWcondV <- function(w,v)
     (0.5*pmin(v,w)^2 + v*pmax(w-v,0) - 0.25*pmax(0,v+w-1.5)^2) /
     (v - 0.5*v^2 - 0.25*(pmax(0,v-0.5))^2)

#### OK, now we can use the tabulate/interpolate idea for V, and
#### explicit inversion for FWcondV:

> FWinvV <- function(r, v) {
     ## inverts FWcondV in w; assumes r, v have the same length
     fout <- numeric(length(r))
     vsm <- (v < 0.5)
     wltv <- (r < FWcondV(v,v))    # solution w falls below v
     fout[vsm & wltv] <- {
         vaux <- v[vsm & wltv]
         sqrt(2*r[vsm & wltv]*vaux*(1 - 0.5*vaux)) }
     fout[vsm & !wltv] <- {
         vaux <- v[vsm & !wltv]
         r[vsm & !wltv]*(1 - 0.5*vaux) + vaux/2 }
     vwsumg <- (r >= FWcondV(pmin(1.5-v,1), v))  # solution w exceeds 1.5-v
     if(sum(!vsm)) {
         auxb <- (!vsm & !vwsumg)
         cc <- 96/31
         if(sum(auxb)) {
             fout[auxb & wltv] <- {
                 vaux <- v[auxb & wltv]
                 sqrt(2*r[auxb & wltv]*fv(vaux)/cc) }
             fout[auxb & !wltv] <- {
                 vaux <- v[auxb & !wltv]
                 r[auxb & !wltv]*fv(vaux)/(cc*vaux) + vaux/2 }
         }
         auxb <- (!vsm & vwsumg)
         if(sum(auxb)) {
             fout[auxb & wltv] <- {
                 vaux <- v[auxb & wltv]
                 sqrt(4*r[auxb & wltv]*fv(vaux)/cc + 2*(vaux-1.5)^2) + vaux - 1.5 }
             fout[auxb & !wltv] <- {
                 vaux <- v[auxb & !wltv]
                 vaux + 1.5 - sqrt(vaux*(6-2*vaux) - 4*r[auxb & !wltv]*fv(vaux)/cc) }
         }
     }
     fout
  }

> vt <- runif(1000); wt <- runif(1000)
> rt <- FWcondV(wt, vt)
> summary(FWinvV(rt, vt) - wt)
       Min.   1st Qu.    Median       Mean   3rd Qu.      Max.
 -2.220e-16 0.000e+00 0.000e+00 -1.629e-18 0.000e+00 2.220e-16

### Up to here, we have tested that FWinvV correctly inverts FWcondV.

> M1 <- function(N, spacng=.01) {
     spts <- seq(0, 1, by=spacng)    # tabulation grid for FV
     Vspl <- smooth.spline(FV(spts), spts, spar=1.e-6)  # spline for FV^{-1}
     Vsim <- predict.smooth.spline(Vspl, runif(N))$y    # V by inverse-CDF
     Wsim <- FWinvV(runif(N), Vsim)                     # W given V by inverse-CDF
     Usim <- runif(N, pmax((Vsim+Wsim)/2 - .75, 0), pmin(Vsim, Wsim))
     cbind(x=Usim, y=Vsim-Usim, z=Wsim-Usim)
  }

METHOD 2. Now we have only to generate a series of X,Y,Z values directly
in the cube (0,1)^3 and reject those that fall outside the region.

> M2 <- function(N, fac=1/0.321) {
     naux <- ceiling(200 + N*fac)
     xyzmat <- matrix(runif(3*naux), ncol=3,
                      dimnames=list(NULL, c("x","y","z")))
     bool <- xyzmat[,1]+xyzmat[,2] < 1 & xyzmat[,1]+xyzmat[,3] < 1 &
             xyzmat[,2]+xyzmat[,3] < 1.5
     list(xyzmat = xyzmat[bool,][1:N,], volum = mean(bool))
  }

> M2(1000)[[2]]
[1] 0.3309201
> M2(1.e6)[[2]]
[1] 0.3226401    ### took 10 minutes on a Sparc 5 !?
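NB. With the fixed oversampling factor fac=1/0.321, M2 can on an unlucky
run accept fewer than N points, in which case the [1:N,] subscript would
produce NA rows. A minimal sketch of a more defensive variant -- the
name M2safe is ours; plain R -- which accumulates accepted batches until
N points are available:

> M2safe <- function(N, batch=4*N) {
     acc <- matrix(numeric(0), ncol=3)
     while(nrow(acc) < N) {
         xyz <- matrix(runif(3*batch), ncol=3)
         keep <- xyz[,1]+xyz[,2] < 1 & xyz[,1]+xyz[,3] < 1 &
                 xyz[,2]+xyz[,3] < 1.5           # same region as in M2
         acc <- rbind(acc, xyz[keep, , drop=F])  # append accepted rows
     }
     dimnames(acc) <- list(NULL, c("x","y","z"))
     acc[1:N, ]
  }

The expected cost is essentially the same, since each batch is accepted
at rate 31/96 = 0.3229.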
### OK, now generate two different matrices 1000 x 3:

> mat1 <- matrix(unlist(M1(1000)), ncol=3)
> mat2 <- M2(1000)    ### recall this is a list by def'n
> mat2$volum
[1] 0.3257919         ### OK, still agrees with the volumes above
> mat2 <- mat2$xyzmat

## For the chi-square, use the cells based on x,y,z < .5 or >= .5.
## NB the cells with x,y >= .5 or with x,z >= .5 are empty (3 distinct
## cells in all, since the cell with x,y,z all >= .5 occurs in both pairs).

> t1 <- table(mat1[,1] >= .5, mat1[,2] >= .5, mat1[,3] >= .5)
> t2 <- table(mat2[,1] >= .5, mat2[,2] >= .5, mat2[,3] >= .5)
> t1
, , = FALSE

        FALSE TRUE
  FALSE   401  186
  TRUE    122    0

, , = TRUE

        FALSE TRUE
  FALSE   190  101
  TRUE      0    0

> t2
, , = FALSE

        FALSE TRUE
  FALSE   386  177
  TRUE    126    0

, , = TRUE

        FALSE TRUE
  FALSE   199  112
  TRUE      0    0

> sum(t1+t2 > 0)
[1] 5                 ### so the chi-square will have 4 df
> sum(((t1-t2)^2/(t1+t2))[t1+t2>0])
[1] 1.349854          #### Convincingly small !!!

METHOD 3. Suppose we had wanted only to test the validity of M2 without
working so hard to establish the correct marginal and conditional
densities. It would have sufficed to create a rejection-method
simulation which worked differently from (and so might be expected to
have errors, if any, different from) M2. For example, let's use the same
transformation as in METHOD 1; create (V,W) by an accept-reject method
using the known density f_{V,W}; and then generate U conditionally
uniform in [max(0.5*(v+w)-0.75, 0), min(v,w)].

> M3 <- function(N, fac=5) {
     naux <- 200 + N*fac
     vwrmat <- matrix(runif(2*naux), ncol=2)
     vwrmat <- vwrmat[runif(naux) < pmin(vwrmat[,1], vwrmat[,2]) -
                      pmax(0, 0.5*(vwrmat[,1]+vwrmat[,2])-0.75), ][1:N,]
     uvec <- runif(N, pmax(0, 0.5*(vwrmat[,1]+vwrmat[,2])-0.75),
                   pmin(vwrmat[,1], vwrmat[,2]))
     cbind(x=uvec, y=vwrmat[,1]-uvec, z=vwrmat[,2]-uvec)
  }

> mat3 <- M3(1000)
> t3 <- table(mat3[,1] > 0.5, mat3[,2] > 0.5, mat3[,3] > 0.5)
> t3
, , = FALSE

        FALSE TRUE
  FALSE   363  215
  TRUE    125    0

, , = TRUE

        FALSE TRUE
  FALSE   204   93
  TRUE      0    0

> sum(((t3-t2)^2/(t3+t2))[t3+t2>0])
[1] 6.216943          ### still OK for a 4 df chi-square
> 1 - pchisq(6.217, 4)    ### p-value = 0.184

-----------------------------------

(c) Now re-run the same functions M1, M2 for N=1000, regenerating
mat1, mat2:

> ks.gof(mat1[,1], mat2[,1])
  ks = 0.036, p-value = 0.5107
> ks.gof(mat1[,2], mat2[,2])
  ks = 0.038, p-value = 0.4421
> ks.gof(mat1[,3], mat2[,3])
  ks = 0.05, p-value = 0.1528
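NB. ks.gof is Splus-specific; in R the same two-sample comparisons can
be run with ks.test (a sketch, assuming mat1 and mat2 as generated
above):

> for(j in 1:3) print(ks.test(mat1[,j], mat2[,j]))  # two-sample KS, coordinate j

Of course, agreement of the three one-dimensional marginals does not by
itself establish agreement of the joint trivariate distributions; it is
the octant-cell chi-square comparisons above that check the joint
structure.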