Homework 14 Solution ==================== ## saved as text-file all lines beginning with header of ## http://www.math.umd.edu/~evs/s430.old/Data/bass > bass = read.table("Bass.txt", header=T) bass = cbind.data.frame(bass[,c(1:8)], logAvM=log(bass[,7])) names(bass)[1:8] = c("ID", "Lake", "Alk", "pH", "Calc", "Chlor", "AvMrc", "nsamp") simpfit = lm(I(log(AvMrc))~Alk, data=bass) plot(bass$Alk, bass$logAvM, xlab="Alkalinity", ylab="log Avg Merc", main="Overplotted Scatterplot, \n Fitted line and CI") ind = order(bass$Alk) lines(bass$Alk[ind], simpfit$fit[ind], lty=1) mtmp = model.matrix(simpfit) residSE = summary(simpfit)$sig*sqrt(1-diag(mtmp %*% summary(simpfit)$cov.unsc %*%t(mtmp))) > summary(residSE) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.5541 0.5828 0.5837 0.5816 0.5857 0.5873 lines(bass$Alk[ind], simpfit$fit[ind] + 1.96*residSE[ind], lty=3, col="red") lines(bass$Alk[ind], simpfit$fit[ind] - 1.96*residSE[ind], lty=3, col="red") legend(locator(), legend=c("Fitted","Conf.Band"), lty=c(1,3)) > identify(bass$Alk, bass$logAvM) [1] 3 18 36 38 40 ### 38 & 40 outside; other 3 just inside (ii) Remove only 38, 40 ### For one-at-a-time selection with alpha=.05, use k=qchisq(.95,1)=3.84 > logLik(simpfit) 'log Lik.' -46.48546 (df=3) extractAIC(simpfit) [1] 2.00000 -53.43657 extractAIC(simpfit, k=3.84) [1] 2.00000 -49.75657] extractAIC(update(simpfit, data=bass[c(1:37,39,41:53),]), k=3.84) [1] 2.00000 -73.37294 ### initial "AIC" > stpfit = step(update(simpfit, data=bass[c(1:37,39,41:53),]), logAvM ~ Alk + pH + Calc + Chlor, direction = "forw", k=3.84) > stpfit$call lm(formula = logAvM ~ Alk + Chlor, data = bass[c(1:37, 39, 41:53), ]) ### this is the "final model, added only "Chlor" improving AIC ### from -73.373 to -78.982 > round(summary(stpfit)$coef,4) Estimate Std. Error t value Pr(>|t|) (Intercept) -0.2237 0.0848 -2.6392 0.0112 Alk -0.0132 0.0019 -6.9493 0.0000 Chlor -0.0089 0.0028 -3.1258 0.0030 ### All coefs quite significant > stpfit2 = step(update(simpfit, data=bass[c(1:37,39,41:53),]), logAvM ~ (Alk + pH + Calc + Chlor)^2, direction = "forw", k=3.84) ### Same "best" model is chosen even if pairwise interactions allowed. > anova(stpfit) Analysis of Variance Table Response: logAvM Df Sum Sq Mean Sq F value Pr(>F) Alk 1 19.8642 19.8642 110.2611 4.996e-14 *** Chlor 1 1.7603 1.7603 9.7708 0.003008 ** Residuals 48 8.6475 0.1802 ### The F-test shows improved fit; for graphical display: plot(simpfit$resid[c(1:37,39,41:53)], stpfit$resid, xlab="Simp Reg Resid", ylab="Improved Model Resid", main="Residuals from 2 Models") abline(0,1, col="red") ### Most points below the line imply more small residuals from 2nd model > summary(abs(stpfit$resid)/abs(simpfit$resid[c(1:37,39,41:53)])) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.1808 0.7377 0.8997 1.2350 1.2660 7.8520