press <- read.table("/media/data/work/Teaching/Unitn_Pugliese/Lesson6/pressione.txt",header=TRUE)
# we had already performed linear regression
reg <- lm(pressione ~ eta, data=press)
summary(reg)
plot(pressione ~ eta, data=press) # points
abline(reg) # regression line

# a way to check whether a regression is a good model is to look at the residuals
# if the model were correct, residuals should be independent and identically distributed (and normal)
plot(reg$residual ~ reg$fitted)
# reg$residual are y - yhat
# reg$fitted are yhat
abline(h=0)

asc <- 20:85 # the abscissae on which to use the model
# these are all integers from 20 to 85, with increment 1
# predict can also be used to obtain confidence intervals on y_hat or on a single
# predicted value
reg.pred <- predict(reg, data.frame(eta=asc), interval="confidence") # asking for C.I. on yhat
reg.pred
# we could simply ask for the 2nd column...
reg.pred[,2] # column 2
reg.pred[3,] # row 3
# or convert into a data frame....
as.data.frame(reg.pred)$lwr
plot(pressione ~ eta, data=press)
abline(reg)
lines(asc, reg.pred[,2], lty=2)
lines(asc, reg.pred[,3], lty=2)
# uncertainty increases away from the centre
reg.pred2 <- predict(reg, data.frame(eta=asc), interval="prediction") # asking for
# C.I. on a new observation with that value of eta (NOT SEEN IN CLASS)
lines(asc, reg.pred2[,2], lty=2, col=2)
lines(asc, reg.pred2[,3], lty=2, col=2)
# roughly we're adding plus or minus 2 sigma to the prediction

# model choice
# let us start from a dataset with many "predictor" variables
# the dataset "state" in the standard installation of R contains information on US states
help(state)
state.x77 # a matrix containing some statistics
state.abb
states <- data.frame(state.x77, row.names=state.abb)
## row.names... using the state abbreviation instead of the whole state name
# we explore whether it is possible to predict (or explain) life expectancy
# on the basis of the other variables

## start with all the variables
regtot <- lm(Life.Exp ~ ., data=states) # we do not need to write them all:
## writing ~ . uses all variables except the response as predictors
summary(regtot)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.692 (a quantity computed by R
## that penalizes R^2 when there are many parameters)

## remove Area (the worst predictor by t value) from the set of predictors,
## using update
reg <- update(regtot, ~ . - Area) # take Area away from the predictor variables
summary(reg)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.699
## after removing one predictor (Area), R^2 is the same,
## but check the other variables' p-values
# one can proceed in this way.

## remove Illiteracy
reg <- update(reg, ~ . - Illiteracy)
summary(reg)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.706
## R^2 is the same, the adjusted R^2 is growing

## remove Income
reg <- update(reg, ~ . - Income)
summary(reg)
## Multiple R-squared: 0.736
## Adjusted R-squared: 0.713

## if we stick to the reference value of 5% for significance (though...), we go on and remove Population
reg <- update(reg, ~ . - Population)
summary(reg)
## Multiple R-squared: 0.713
## Adjusted R-squared: 0.694
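## A minimal sketch (not part of the original lesson): the reduced model obtained by
## backward elimination can also be compared with the full model through a partial F-test;
## a large p-value suggests that the removed predictors (Area, Illiteracy, Income, Population)
## do not improve the fit significantly. This assumes reg and regtot as defined above.
anova(reg, regtot) # F-test comparing the nested models (reduced vs. full)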
AIC(reg)
AIC(regtot)
plot(reg$res ~ reg$fit) # seems quite alright
abline(h=0)
qqnorm(reg$res)
qqline(reg$res)

# an automatic selection procedure using Akaike's criterion
step(regtot)
AIC(regtot)

# application of cross-validation to this dataset
states1 <- states[-1,] ## remove the first row (state) of the data frame
# run, on the reduced dataset, the simpler regression
reg1 <- lm(Life.Exp ~ Murder+HS.Grad+Frost, data=states1) # a model with few predictor variables
summary(reg1)
ypred1 <- predict(reg1, states[1,]) ## apply the model to the first row of the data frame
ypred1 # predicted value
states$Life.Exp[1] # real value

## same procedure doing a loop over all the rows: leave-one-out cross-validation
err <- vector(length=50) # a vector that will hold the squared errors
for (i in 1:50) # R has a "for" loop, and this is its syntax
{
  states.i <- states[-i,] # create a dataset without row i
  reg.i <- lm(Life.Exp ~ Murder+HS.Grad+Frost, data=states.i) # lm on the dataset without row i
  ypred.i <- predict(reg.i, states[i,]) # value predicted by this model for row i
  err[i] <- (ypred.i - states$Life.Exp[i])^2 # resulting squared error
} # end of for-loop
ypred.i # last prediction computed in the loop
## evaluate the model
sum(err)
sum(err)/(50*var(states$Life.Exp)) # compare to the original variance

# what if I had used all the predictor variables
err2 <- vector(length=50)
for (i in 1:50)
{
  states.i <- states[-i,]
  regtot.i <- lm(Life.Exp ~ ., data=states.i)
  ypredtot.i <- predict(regtot.i, states[i,])
  err2[i] <- (ypredtot.i - states$Life.Exp[i])^2
}
## evaluate the model
sum(err2)
sum(err2)/(50*var(states$Life.Exp)) # compare to the original variance
sum(err2)/sum((states$Life.Exp - mean(states$Life.Exp))^2)
# using more predictor variables gives a larger cross-validation error here
# in principle, one could repeat this for all the possible models,
# and look for the one minimizing sum(err)

## what happens if you evaluate the model on samples that were also
## used for building/training it?
reg <- lm(Life.Exp ~ Murder+HS.Grad+Frost, data=states) ## build the model on all the samples
err_all <- vector(length=50)
for (i in 1:50)
{
  ypredtot.i <- predict(reg, states[i,])
  err_all[i] <- (ypredtot.i - states$Life.Exp[i])^2
}
## evaluate the model
sum(err_all)
sum(err_all)/(50*var(states$Life.Exp)) # compare to the original variance
# a more optimistic result than with cross-validation

regtot <- lm(Life.Exp ~ ., data=states) ## build the model on all the samples
err_alltot <- vector(length=50)
for (i in 1:50)
{
  ypredtot.i <- predict(regtot, states[i,])
  err_alltot[i] <- (ypredtot.i - states$Life.Exp[i])^2
}
## evaluate the model
sum(err_alltot)
sum(err_alltot)/(50*var(states$Life.Exp)) # compare to the original variance
# adding more variables always improves the fit when the error is computed
# on the same data used to fit the model
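## A minimal sketch (not shown in class): the leave-one-out loop above can be wrapped in a
## small helper function, so that different formulas can be compared through their
## cross-validated error; the name loocv.err is hypothetical and not part of the lesson.
loocv.err <- function(form, data)
{
  n <- nrow(data)
  yname <- all.vars(form)[1]  # name of the response variable
  err <- numeric(n)           # squared prediction errors
  for (i in 1:n)
  {
    fit.i <- lm(form, data=data[-i,])  # fit the model without row i
    pred.i <- predict(fit.i, data[i,]) # predict the left-out row
    err[i] <- (pred.i - data[[yname]][i])^2
  }
  sum(err)
}
loocv.err(Life.Exp ~ Murder + HS.Grad + Frost, states) # should reproduce sum(err) above
loocv.err(Life.Exp ~ ., states)                        # should reproduce sum(err2) above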