press <- read.table("/media/data/work/Teaching/Unitn_Pugliese/Lesson6/pressione.txt",header=TRUE)
# we had already performed linear regression
reg <- lm(pressione ~ eta, data=press)
summary(reg)
plot(pressione ~ eta, data=press) # points
abline(reg) # regression line

# a way to check whether a regression is a good model is to look at the residuals
# if the model were correct, residuals should be independent and identically distributed (and normal)
plot(reg$residual ~ reg$fitted)
# reg$residual are y - yhat
# reg$fitted are yhat
abline(h=0)

asc <- 20:85 # the abscissae on which to use the model
# these are all integers from 20 to 85, with increment 1
# predict can also be used to obtain confidence intervals on y_hat or on a single
# predicted value
reg.pred <- predict(reg, data.frame(eta=asc), interval="confidence") # asking for C.I. on yhat
reg.pred
# we could simply ask for the 2nd column...
reg.pred[,2] # column 2
reg.pred[3,] # row 3
# or convert into a data frame....
as.data.frame(reg.pred)$lwr
plot(pressione ~ eta, data=press)
abline(reg)
lines(asc, reg.pred[,2], lty=2)
lines(asc, reg.pred[,3], lty=2)
# uncertainty increases away from the centre
reg.pred2 <- predict(reg, data.frame(eta=asc), interval="prediction") # asking for
# C.I. on a new observation with that value of eta (NOT SEEN IN CLASS)
lines(asc, reg.pred2[,2], lty=2, col=2)
lines(asc, reg.pred2[,3], lty=2, col=2)
# roughly we're adding plus or minus 2 sigma to the prediction

# model choice
# let us start from a dataset with many "predictor" variables
# the dataset "state" in the standard installation of R contains information on US states
help(state)
state.x77 # a matrix containing some statistics
state.abb
states <- data.frame(state.x77, row.names=state.abb)
## row.names... using the state abbreviation instead of the whole state name
# we explore whether it is possible to predict (or explain) life expectancy
# on the basis of the other variables

## start with all the variables
regtot <- lm(Life.Exp ~ ., data=states) # we do not need to write them all:
## writing ~ . uses all variables except the response as predictors
summary(regtot)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.692 (a quantity computed by R
## that penalizes R^2 when there are many parameters)

## remove Area (the worst predictor by t value) from the set of predictors,
## using update
reg <- update(regtot, ~ . - Area) # take Area away from the predictor variables
summary(reg)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.699
## after removing one predictor (Area), R^2 is the same,
## but check the other variables' p-values
# one can proceed in this way.

## remove Illiteracy
reg <- update(reg, ~ . - Illiteracy)
summary(reg)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.706
## R^2 is the same, the adjusted R^2 is growing

## remove Income
reg <- update(reg, ~ . - Income)
summary(reg)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.713

## if we stick to the reference value of 5% for significance (though...), we go on and remove Population
reg <- update(reg, ~ . - Population)
summary(reg)
## Multiple R-squared: 0.713
## Adjusted R-squared: 0.694
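## A minimal sketch (not part of the original lesson): the reduced model obtained by
## backward elimination can also be compared with the full model through a partial F-test;
## a large p-value suggests that the removed predictors (Area, Illiteracy, Income, Population)
## do not improve the fit significantly. This assumes reg and regtot as defined above.
anova(reg, regtot) # F-test comparing the nested models (reduced vs. full)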
AIC(reg)
AIC(regtot)
plot(reg$res ~ reg$fit) # seems quite alright
abline(h=0)
qqnorm(reg$res)
qqline(reg$res)

# an automatic selection procedure using Akaike's criterion
step(regtot)
AIC(regtot)

# application of cross-validation to this dataset
states1 <- states[-1,] ## remove the first row (state) of the data frame
# run, on the reduced dataset, the simpler regression
reg1 <- lm(Life.Exp ~ Murder+HS.Grad+Frost, data=states1) # a model with few predictor variables
summary(reg1)
ypred1 <- predict(reg1, states[1,]) ## apply the model to the first row of the data frame
ypred1 # predicted value
states$Life.Exp[1] # real value

## same procedure doing a loop over all the rows: leave-one-out cross-validation
err <- vector(length=50) # a vector that will hold the squared errors
for (i in 1:50) # R has a "for" loop, and this is its syntax
{
  states.i <- states[-i,] # create a dataset without row i
  reg.i <- lm(Life.Exp ~ Murder+HS.Grad+Frost, data=states.i) # lm on the dataset without row i
  ypred.i <- predict(reg.i, states[i,]) # value predicted by this model for row i
  err[i] <- (ypred.i - states$Life.Exp[i])^2 # resulting squared error
} # end of for-loop
ypred.i # last prediction computed in the loop
## evaluate the model
sum(err)
sum(err)/(50*var(states$Life.Exp)) # compare to the original variance

# what if I had used all the predictor variables
err2 <- vector(length=50)
for (i in 1:50)
{
  states.i <- states[-i,]
  regtot.i <- lm(Life.Exp ~ ., data=states.i)
  ypredtot.i <- predict(regtot.i, states[i,])
  err2[i] <- (ypredtot.i - states$Life.Exp[i])^2
}
## evaluate the model
sum(err2)
sum(err2)/(50*var(states$Life.Exp)) # compare to the original variance
sum(err2)/sum((states$Life.Exp - mean(states$Life.Exp))^2)
# using more predictor variables gives a larger cross-validation error here
# in principle, one could repeat this for all the possible models,
# and look for the one minimizing sum(err)

## what happens if you evaluate the model on samples that were also
## used for building/training it?
reg <- lm(Life.Exp ~ Murder+HS.Grad+Frost, data=states) ## build the model on all the samples
err_all <- vector(length=50)
for (i in 1:50)
{
  ypredtot.i <- predict(reg, states[i,])
  err_all[i] <- (ypredtot.i - states$Life.Exp[i])^2
}
## evaluate the model
sum(err_all)
sum(err_all)/(50*var(states$Life.Exp)) # compare to the original variance
# a more optimistic result than with cross-validation

regtot <- lm(Life.Exp ~ ., data=states) ## build the model on all the samples
err_alltot <- vector(length=50)
for (i in 1:50)
{
  ypredtot.i <- predict(regtot, states[i,])
  err_alltot[i] <- (ypredtot.i - states$Life.Exp[i])^2
}
## evaluate the model
sum(err_alltot)
sum(err_alltot)/(50*var(states$Life.Exp)) # compare to the original variance
# adding more variables always improves the fit when the error is computed
# on the same data used to fit the model
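## A minimal sketch (not shown in class): the leave-one-out loop above can be wrapped in a
## small helper function, so that different formulas can be compared through their
## cross-validated error; the name loocv.err is hypothetical and not part of the lesson.
loocv.err <- function(form, data)
{
  n <- nrow(data)
  yname <- all.vars(form)[1]  # name of the response variable
  err <- numeric(n)           # squared prediction errors
  for (i in 1:n)
  {
    fit.i <- lm(form, data=data[-i,])  # fit the model without row i
    pred.i <- predict(fit.i, data[i,]) # predict the left-out row
    err[i] <- (pred.i - data[[yname]][i])^2
  }
  sum(err)
}
loocv.err(Life.Exp ~ Murder + HS.Grad + Frost, states) # should reproduce sum(err) above
loocv.err(Life.Exp ~ ., states)                        # should reproduce sum(err2) above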