# lm with two qualitative variables is called 2-way analysis of variance
rats <- read.table("rats.txt", header=T) # a dataset on the survival time (time) of rats after a poison (poison) and a treatment (treat)
rats # the data
boxplot(time ~ treat + poison, data=rats, ylab="survival time", cex.lab=1.5)
# boxplots of survival at all combinations of poison and treatment
# it is quite apparent that the assumption of equal variances does not hold
# we go ahead anyway
# one may assume additive effects of the two factors, or an interaction between them
interaction.plot(rats$treat, rats$poison, rats$time)
# graphical analysis of interaction (if there is no interaction, the lines should be parallel)
interaction.plot(rats$treat, rats$poison, rats$time, ylab="survival time",
                 xlab="treatment", trace.label="poison", cex.lab=1.25)
# you can add options to make a nicer output
# the lines are mean survival at the different treatments, one line per poison
# actually, they are not so far from parallel
# model with interaction terms
g <- lm(time ~ poison * treat, data = rats)
# a * between the two predictor variables gives the model with interactions
anova(g) # output as anova table
summary(g) # output coefficients; hard to interpret
g_add <- lm(time ~ poison + treat, data = rats) # additive model (no interactions)
anova(g_add) # output as anova table
summary(g_add) # parameter estimates
anova(g, g_add) # tests one model against the other
# the result is exactly what we saw in anova(g) as the test of the interaction
# what can we do about unequal variances?
qqnorm(g$res, main="Quantiles of residuals vs. normal ones", xlab="", ylab="")
# visual test for the normality of residuals
qqline(g$res) # very far from normal
boxplot(g$res ~ round(g$fitted,2), main="Residuals at each poison+treat combination", cex.main=1.5)
# look at box-plots of residuals
# try a transformation (others, such as the logarithm, have been explored)
# 1/time can be interpreted as mortality
boxplot(1/time ~ poison + treat, data = rats) # look at the data under this transformation
g1 <- lm(1/time ~ poison * treat, data = rats) # analysis of 1/survival (with interaction)
anova(g1) # interaction not significant
g1 <- lm(1/time ~ poison + treat, data = rats) # analysis of 1/survival (without interaction)
anova(g1)
summary(g1)
qqnorm(g1$res, main="Q-Q plot of residuals of ANOVA of inverses", xlab="", ylab="", cex.main=1.5)
# visual test for the normality of residuals
qqline(g1$res)
boxplot(g1$res ~ round(g1$fitted,2), main="Box-plots of residuals of ANOVA of inverses", cex.main=1.5)
# this seems the best model
# can we say anything about the differences among treatments?
# summary() makes comparisons (via tests of the single beta_i)
# it is not correct (why?) to test all possible differences directly
# there is an approximate procedure that works well (TukeyHSD)
TukeyHSD(aov(1/time ~ poison + treat, data = rats)) # Tukey's comparisons of effects on 1/survival
# the command TukeyHSD, however, needs to work on the output of the command "aov", not of "lm"
# "aov" does more or less the same as "lm" on qualitative variables, but the output is of a different class
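# An aside (a minimal sketch, not part of the original analysis): the object
# returned by TukeyHSD has a plot method drawing the simultaneous confidence
# intervals for all pairwise differences (one plot per factor);
# intervals not covering 0 correspond to significant differences
tuk <- TukeyHSD(aov(1/time ~ poison + treat, data = rats))
plot(tuk)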
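# Another aside (a sketch, assuming the MASS package is available): the choice
# of the inverse transformation above can be motivated by a Box-Cox analysis,
# which profiles the likelihood over power transformations of the response;
# for these data the likelihood peaks near lambda = -1, i.e. 1/time
library(MASS)
boxcox(lm(time ~ poison * treat, data = rats), lambda = seq(-2, 1, 0.1))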
# polynomial regression
# though we have a polynomial in the independent variable, it is still a linear model (in the parameters...)
# look again at the dataset with age ("eta") and pressure ("pressione")
press <- read.table("pressione.txt", header=TRUE)
# linear regression
reg <- lm(pressione ~ eta, data=press)
summary(reg)
plot(pressione ~ eta, data=press) # points
abline(reg) # regression line
# a way to assess whether a regression is a good model is to look at the residuals
# if the model were correct, residuals should be independent and equidistributed (and normal)
plot(reg$residual ~ reg$fitted)
# reg$residual are y - yhat
# reg$fitted are yhat
abline(h=0)
# residuals seem positive at low and high age
# and negative at intermediate age
# we can think of adding a quadratic term
reg2 <- lm(pressione ~ eta + eta^2, data=press)
summary(reg2) # does not work
# there are several ways to handle this
# one method is to use the function I(), which means: take the object as it is (see the Help for explanations)
reg2 <- lm(pressione ~ eta + I(eta^2), data=press)
summary(reg2) # it works
# it can be seen that the coefficient of x^2 is significantly different from 0
# a more elegant way is using the function poly(.,.)
regp <- lm(pressione ~ poly(eta,1), data=press)
# asking for the regression of pressione on a polynomial in eta of degree 1
summary(regp)
# compared to the linear regression computed before, we get the same R^2, fitted values and residuals...
# but different coefficients
# poly(eta,j) are orthogonal polynomials
# this has computational advantages, and is also such that
# the coefficients do not change when increasing the degree of the polynomial
regp2 <- lm(pressione ~ poly(eta,2), data=press)
summary(regp2)
# it is however more usual to write the model in terms of coefficients of x, x^2...
reg2 <- lm(pressione ~ poly(eta,2,raw=TRUE), data=press)
# raw = TRUE says to use the "normal" basis of polynomials: 1, x, x^2...
summary(reg2) # ok
# clearly R^2 increases (from 51% to 60%) moving from the linear to the quadratic model
# the coefficient of the quadratic term is significantly different from 0
# this is equivalent to the test
anova(reg, reg2) # ok
# let's look at the residuals
plot(reg2$res ~ reg2$fit)
abline(h=0)
# it's difficult to say very much, but there are no obvious trends
# how about a 3rd-degree polynomial?
reg3 <- lm(pressione ~ poly(eta,3,raw=TRUE), data=press)
summary(reg3)
# the third-degree term is not significant, even though R^2 has slightly increased
anova(reg2, reg3)
# now let's make a plot with the data, the regression line and the quadratic polynomial
# the first two are easy:
plot(pressione ~ eta, data=press)
abline(reg)
points(reg2$fit ~ press$eta, col="red")
# observed points are in black
# in red the values predicted by the quadratic
# to draw the polynomial, the best way is to use the command predict()
# predict() uses the model to produce the output at some given points
# for instance...
asc = 20:85 # the abscissae at which to use the model
# these are all the integers from 20 to 85 (increment 1)
asc
ord = predict(reg2, data.frame(eta=asc)) # the values obtained using reg2 on asc
ord
lines(asc, ord, lty=2, col="blue") # ok
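# A quick check (a sketch, not in the original script) of the earlier claim:
# with orthogonal polynomials the lower-order coefficients are unchanged when
# the degree increases, while with the raw basis they all change
coef(lm(pressione ~ poly(eta,2), data=press))
coef(lm(pressione ~ poly(eta,3), data=press)) # first three coefficients identical
coef(lm(pressione ~ poly(eta,2,raw=TRUE), data=press))
coef(lm(pressione ~ poly(eta,3,raw=TRUE), data=press)) # all coefficients change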
# predict() can also be used to obtain confidence intervals on yhat, or on a single predicted value
reg.pred = predict(reg, data.frame(eta=asc), interval="confidence") # asking for a C.I. on yhat
reg.pred
# how to access the columns
reg.pred[,2] # column 2
reg.pred[3,] # row 3
plot(pressione ~ eta, data=press)
abline(reg)
lines(asc, reg.pred[,2], lty=2)
lines(asc, reg.pred[,3], lty=2)
# uncertainty increases away from the centre
reg.pred2 <- predict(reg, data.frame(eta=asc), interval="prediction")
# asking for a C.I. on a new observation with that value of eta
lines(asc, reg.pred2[,2], lty=2, col=2)
lines(asc, reg.pred2[,3], lty=2, col=2)
# roughly, we are adding plus or minus 2 sigma to the prediction

# logistic regression. An example
library(MASS) # load the library, which also contains the dataset of interest
data()
help(rotifer) # describes the data
head(rotifer)
help(glm) # glm is the command to fit generalized linear models
reglogis <- glm(pm.y/pm.tot ~ density, weights=pm.tot, family=binomial, data=rotifer)
# logistic regression is performed through the command "glm" (generalized linear model),
# specifying that we require the binomial family
# there are different possible syntaxes in the formula. Here we give as response the frequency of "successes";
# with weights, we state the number of "trials"
summary(reglogis)
# we can read the coefficient estimates, with the corresponding standard errors,
# and the deviance, compared to that of a "saturated" model
# an alternative (equivalent) syntax
reglogis2 <- glm(cbind(pm.y, pm.tot-pm.y) ~ density, family=binomial, data=rotifer)
# the response is the pair (number of "successes", number of "failures"); then we do not need to give weights
summary(reglogis2)
# plotting the results...
plot(pm.y/pm.tot ~ density, data=rotifer)
lines(rotifer$density, reglogis$fitted)
# reglogis$fitted are the predicted probabilities
# the output is ok only because rotifer$density is ordered
# an alternative way to plot the model prediction, using predict(), so that the curve is smoother
# and there is no problem with ordering
M <- max(rotifer$density)
m <- min(rotifer$density)
asc <- m + (-10:210)*(M-m)/200
# these are the abscissae at which to compute the model (divide the interval (m, M) into 200 parts;
# add 10 points before and 10 after, at the same spacing)
ord <- predict(reglogis, data.frame(density=asc), type="response")
# with type="response" we get the predicted probability, not X*beta
# had we wished to obtain the predicted values of X*beta...
eta <- predict(reglogis, data.frame(density=asc), type="link")
# now plot the prediction together with the data points
plot(pm.y/pm.tot ~ density, data=rotifer, ylim=c(0,1)) # plot the data
lines(asc, ord) # add the model
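# A possible extension (a sketch, using the standard se.fit option of predict
# for glm objects): an approximate confidence band for the fitted curve,
# computed on the link scale, where the normal approximation is more reliable,
# and then mapped back to probabilities with the inverse link plogis()
pr <- predict(reglogis, data.frame(density=asc), type="link", se.fit=TRUE)
lines(asc, plogis(pr$fit + 2*pr$se.fit), lty=2) # upper band, roughly 95%
lines(asc, plogis(pr$fit - 2*pr$se.fit), lty=2) # lower band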