Homework 14 Solution
====================

## saved as text-file all lines beginning with header of
## http://www.math.umd.edu/~evs/s430.old/Data/bass

> bass = read.table("Bass.txt", header=T)
  bass = cbind.data.frame(bass[,c(1:8)], logAvM=log(bass[,7]))
  names(bass)[1:8] = c("ID", "Lake", "Alk", "pH", "Calc", 
         "Chlor", "AvMrc", "nsamp")
  simpfit = lm(I(log(AvMrc))~Alk, data=bass)
  plot(bass$Alk, bass$logAvM, xlab="Alkalinity", ylab="log Avg Merc",
     main="Overplotted Scatterplot, \n Fitted line and CI")
  ind = order(bass$Alk)
  lines(bass$Alk[ind], simpfit$fit[ind], lty=1)
  mtmp = model.matrix(simpfit)
  residSE = summary(simpfit)$sig*sqrt(1-diag(mtmp %*% 
             summary(simpfit)$cov.unsc %*%t(mtmp)))
> summary(residSE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5541  0.5828  0.5837  0.5816  0.5857  0.5873 
  lines(bass$Alk[ind], simpfit$fit[ind] + 1.96*residSE[ind], 
         lty=3, col="red")
  lines(bass$Alk[ind], simpfit$fit[ind] - 1.96*residSE[ind], 
         lty=3, col="red")
  legend(locator(), legend=c("Fitted","Conf.Band"), lty=c(1,3))
> identify(bass$Alk, bass$logAvM)
[1]  3 18 36 38 40        ### 38 & 40 outside; other 3 just inside

(ii)  Remove only 38, 40

### For one-at-a-time selection with alpha=.05, use k=qchisq(.95,1)=3.84

> logLik(simpfit)
'log Lik.' -46.48546 (df=3)
  extractAIC(simpfit)
[1]   2.00000 -53.43657
  extractAIC(simpfit, k=3.84)
[1]   2.00000 -49.75657]
  extractAIC(update(simpfit, data=bass[c(1:37,39,41:53),]), k=3.84)
[1]   2.00000 -73.37294             ### initial "AIC" 

> stpfit = step(update(simpfit, data=bass[c(1:37,39,41:53),]), 
                 logAvM ~ Alk + pH + Calc + Chlor, 
                 direction = "forw", k=3.84)
> stpfit$call
lm(formula = logAvM ~ Alk + Chlor, data = bass[c(1:37, 39, 41:53), 
    ])
### this is the "final model, added only "Chlor" improving AIC 
###    from  -73.373  to  -78.982

> round(summary(stpfit)$coef,4)
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -0.2237     0.0848 -2.6392   0.0112
Alk          -0.0132     0.0019 -6.9493   0.0000
Chlor        -0.0089     0.0028 -3.1258   0.0030

### All coefs quite significant

> stpfit2 = step(update(simpfit, data=bass[c(1:37,39,41:53),]), 
                 logAvM ~ (Alk + pH + Calc + Chlor)^2, 
                 direction = "forw", k=3.84)
### Same "best" model is chosen even if pairwise interactions allowed.

> anova(stpfit)
Analysis of Variance Table

Response: logAvM
          Df  Sum Sq Mean Sq  F value    Pr(>F)    
Alk        1 19.8642 19.8642 110.2611 4.996e-14 ***
Chlor      1  1.7603  1.7603   9.7708  0.003008 ** 
Residuals 48  8.6475  0.1802            

### The F-test shows improved fit; for graphical display:

plot(simpfit$resid[c(1:37,39,41:53)], stpfit$resid, 
     xlab="Simp Reg Resid", ylab="Improved Model Resid", 
     main="Residuals from 2 Models")
abline(0,1, col="red")
### Most points below the line imply more small residuals from 2nd model

> summary(abs(stpfit$resid)/abs(simpfit$resid[c(1:37,39,41:53)]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1808  0.7377  0.8997  1.2350  1.2660  7.8520