qqgamma <- function(x, shape, ...) plot(qgamma(ppoints(x), shape), sort(x), ...) qqweibull <- function(x, shape, ...) plot(qweibull(ppoints(x), shape), sort(x), ...) library(MASS) data(GAGurine) GAGurine$Agemod <- GAGurine$GAG GAGurine$Agemod[GAGurine$Agemod==Inf] <- 0 GAGurine$Agemod <- cut(GAGurine$Age, 10) plot(GAGurine$Age, 1/GAGurine$GAG) gg2 <- lm(GAG~Age, data=GAGurine) summary(gg2)$adj.r.squared ## Compute simple linear model gg <- lm(1/GAG~Age, data=GAGurine) summary(gg)$adj.r.squared gg1 <- lm(1/GAG~poly(Age,4) ,data=GAGurine) summary(gg1)$adj.r.squared ## Set threshold for large leverage as p/ ## Non-polynomial model thr <- 2/314 ## Polynomial model thr1 <- 5/314 ## Compute studentized residuals ggstr <- studres(gg) gg1str <- studres(gg1) ## Compute leverages gg.hat <- lm.influence(gg)$hat gg1.hat <- lm.influence(gg1)$hat ## Get the samples with the highest leverage ## leverages are the h_ii of the H matrix used to compute the linear model. ## The higher the leverage the higher the influence the sample has to move the model ## Common thresholds are 2 or 3 times the trace(H) (which is p) normalized by the number of samples ## Try with different threshold multipliers which(gg.hat>2*thr) which(gg1.hat>2*thr1) order(abs(ggstr), decreasing=TRUE) order(abs(gg1str), decreasing=TRUE) ## Note that some of the sample have both high studentized residuals and high leverages. ## Some of them can be considered outlier. Maybe in some further analysis we should consider removing this samples from the dataset. ## Diagnostic Plots plot(fitted(gg),ggstr, main="Poly(1)") abline(h = 0, lty = 2) plot(fitted(gg1),gg1str, main="Poly(4)") abline(h = 0, lty = 2) ## Check normality qqnorm(ggstr) qqline(ggstr) shapiro.test(ggstr) qqnorm(gg1str) qqline(gg1str) shapiro.test(gg1str) ## Plot the data, boxplots, regression models and loess fit. xyplot(1/GAG~Age, data=GAGurine,groups=trunc(Age),pch=19, panel=function(x,y,...){ panel.xyplot(x,y,...) panel.loess(x,y,..., col="red", lwd=2, family="gaussian", degree=3) panel.bwplot(trunc(x)+0.5,y, horizontal=FALSE, lwd=3, par.settings=list( box.rectangle=list(col="red"), box.umbrella=list(col="red"), box.dot = list(col="red"), plot.symbol=list(col="red")) ) panel.abline(gg, lwd=2) panel.lines(x,gg1$fitted.values, lwd=3, col="blue") }, key=list(text=list(c("lm", "poly(4)", "Loess Fit")), lines=list(lty=c(rep(1,3)), col=c("black", "blue", "red")), columns=3) ) ## With the command below we can get the information on the values plotted in the boxplots for each Age level. ## In particular in the "stats" object we have a matrix of 5 x 17 -> see the help of boxplots and slides for further information. rr <- boxplot(GAG~I(trunc(Age)), data=GAGurine, plot=FALSE)$stats ## Create a vector of ages as integers xx <- unique(trunc(GAGurine$Age))+1 ## Given the poly(4) model we can compute the prediction at each Age level with 95% CI mypred <- predict(gg1, data.frame(Age=xx), interval="confidence", level=0.95) ## Plot everything ## NB we compute the model using 1/GAG so we have to transform back the prediction of the linear model to produce a plot GAG by Age. plot(xx,1/mypred[,2], type="none", main="GAG level vs Age", ylab="GAG", xlab="Age") polygon(x=c(xx,rev(xx)),y=c(1/mypred[,2], rev(1/mypred[,3])), col="skyblue") lines(xx,1/mypred[,1], lty=3) lines(xx,1/mypred[,2]) lines(xx,1/mypred[,3])