# lm with two qualitative variables is called 2-way analysis of variance
rats <- read.table("rats.txt", header=T) # a dataset on the survival time (time) of rats after a poison (poison) and a treatment (treat)
rats # the data
boxplot(time ~ treat + poison, data=rats, ylab="survival time", cex.lab=1.5)
# boxplots of survival at all combinations of poison and treatment
# it is quite apparent that the assumption of equal variances does not hold
# we go ahead anyway
# one may assume additive effects of the two factors, or an interaction between them
interaction.plot(rats$treat, rats$poison, rats$time)
# graphical analysis of interaction (if there is no interaction, the lines should be parallel)
interaction.plot(rats$treat, rats$poison, rats$time, ylab="survival time",
                 xlab="treatment", trace.label="poison", cex.lab=1.25)
# you can add options to make a nicer output
# the lines are mean survival at the different treatments, one line per poison
# actually, they are not so far from parallel
# model with interaction terms
g <- lm(time ~ poison * treat, data = rats)
# a * between the two predictor variables gives the model with interactions
anova(g) # output as anova table
summary(g) # output coefficients; hard to interpret
g_add <- lm(time ~ poison + treat, data = rats) # additive model (no interactions)
anova(g_add) # output as anova table
summary(g_add) # parameter estimates
anova(g, g_add) # tests one model against the other
# the result is exactly what we saw in anova(g) as the test of the interaction
# what can we do about unequal variances?
qqnorm(g$res, main="Quantiles of residuals vs. normal ones", xlab="", ylab="")
# visual test for the normality of residuals
qqline(g$res) # very far from normal
boxplot(g$res ~ round(g$fitted,2), main="Residuals at each poison+treat combination", cex.main=1.5)
# look at box-plots of residuals
# try a transformation (others, such as the logarithm, have been explored)
# 1/time can be interpreted as mortality
boxplot(1/time ~ poison + treat, data = rats) # look at the data under this transformation
g1 <- lm(1/time ~ poison * treat, data = rats) # analysis of 1/survival (with interaction)
anova(g1) # interaction not significant
g1 <- lm(1/time ~ poison + treat, data = rats) # analysis of 1/survival (without interaction)
anova(g1)
summary(g1)
qqnorm(g1$res, main="Q-Q plot of residuals of ANOVA of inverses", xlab="", ylab="", cex.main=1.5)
# visual test for the normality of residuals
qqline(g1$res)
boxplot(g1$res ~ round(g1$fitted,2), main="Box-plots of residuals of ANOVA of inverses", cex.main=1.5)
# this seems the best model
# can we say anything about the differences among treatments?
# summary() makes comparisons (via tests of the single beta_i)
# it is not correct (why?) to test all possible differences directly
# there is an approximate procedure that works well (TukeyHSD)
TukeyHSD(aov(1/time ~ poison + treat, data = rats)) # Tukey's comparisons of effects on 1/survival
# the command TukeyHSD, however, needs to work on the output of the command "aov", not of "lm"
# "aov" does more or less the same as "lm" on qualitative variables, but the output is of a different class
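# An aside (a minimal sketch, not part of the original analysis): the object
# returned by TukeyHSD has a plot method drawing the simultaneous confidence
# intervals for all pairwise differences (one plot per factor);
# intervals not covering 0 correspond to significant differences
tuk <- TukeyHSD(aov(1/time ~ poison + treat, data = rats))
plot(tuk)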
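# Another aside (a sketch, assuming the MASS package is available): the choice
# of the inverse transformation above can be motivated by a Box-Cox analysis,
# which profiles the likelihood over power transformations of the response;
# for these data the likelihood peaks near lambda = -1, i.e. 1/time
library(MASS)
boxcox(lm(time ~ poison * treat, data = rats), lambda = seq(-2, 1, 0.1))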
# polynomial regression
# though we have a polynomial in the independent variable, it is still a linear model (in the parameters...)
# look again at the dataset with age ("eta") and pressure ("pressione")
press <- read.table("pressione.txt", header=TRUE)
# linear regression
reg <- lm(pressione ~ eta, data=press)
summary(reg)
plot(pressione ~ eta, data=press) # points
abline(reg) # regression line
# a way to assess whether a regression is a good model is to look at the residuals
# if the model were correct, residuals should be independent and equidistributed (and normal)
plot(reg$residual ~ reg$fitted)
# reg$residual are y - yhat
# reg$fitted are yhat
abline(h=0)
# residuals seem positive at low and high age
# and negative at intermediate age
# we can think of adding a quadratic term
reg2 <- lm(pressione ~ eta + eta^2, data=press)
summary(reg2) # does not work
# there are several ways to handle this
# one method is to use the function I(), which means: take the object as it is (see the Help for explanations)
reg2 <- lm(pressione ~ eta + I(eta^2), data=press)
summary(reg2) # it works
# it can be seen that the coefficient of x^2 is significantly different from 0
# a more elegant way is using the function poly(.,.)
regp <- lm(pressione ~ poly(eta,1), data=press)
# asking for the regression of pressione on a polynomial in eta of degree 1
summary(regp)
# compared to the linear regression computed before, we get the same R^2, fitted values and residuals...
# but different coefficients
# poly(eta,j) are orthogonal polynomials
# this has computational advantages, and is also such that
# the coefficients do not change when increasing the degree of the polynomial
regp2 <- lm(pressione ~ poly(eta,2), data=press)
summary(regp2)
# it is however more usual to write the model in terms of coefficients of x, x^2...
reg2 <- lm(pressione ~ poly(eta,2,raw=TRUE), data=press)
# raw = TRUE says to use the "normal" basis of polynomials: 1, x, x^2...
summary(reg2) # ok
# clearly R^2 increases (from 51% to 60%) moving from the linear to the quadratic model
# the coefficient of the quadratic term is significantly different from 0
# this is equivalent to the test
anova(reg, reg2) # ok
# let's look at the residuals
plot(reg2$res ~ reg2$fit)
abline(h=0)
# it's difficult to say very much, but there are no obvious trends
# how about a 3rd-degree polynomial?
reg3 <- lm(pressione ~ poly(eta,3,raw=TRUE), data=press)
summary(reg3)
# the third-degree term is not significant, even though R^2 has slightly increased
anova(reg2, reg3)
# now let's make a plot with the data, the regression line and the quadratic polynomial
# the first two are easy:
plot(pressione ~ eta, data=press)
abline(reg)
points(reg2$fit ~ press$eta, col="red")
# observed points are in black
# in red the values predicted by the quadratic
# to draw the polynomial, the best way is to use the command predict()
# predict() uses the model to produce the output at some given points
# for instance...
asc = 20:85 # the abscissae at which to use the model
# these are all the integers from 20 to 85 (increment 1)
asc
ord = predict(reg2, data.frame(eta=asc)) # the values obtained using reg2 on asc
ord
lines(asc, ord, lty=2, col="blue") # ok
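# A quick check (a sketch, not in the original script) of the earlier claim:
# with orthogonal polynomials the lower-order coefficients are unchanged when
# the degree increases, while with the raw basis they all change
coef(lm(pressione ~ poly(eta,2), data=press))
coef(lm(pressione ~ poly(eta,3), data=press)) # first three coefficients identical
coef(lm(pressione ~ poly(eta,2,raw=TRUE), data=press))
coef(lm(pressione ~ poly(eta,3,raw=TRUE), data=press)) # all coefficients change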
# predict() can also be used to obtain confidence intervals on yhat, or on a single predicted value
reg.pred = predict(reg, data.frame(eta=asc), interval="confidence") # asking for a C.I. on yhat
reg.pred
# how to access the columns
reg.pred[,2] # column 2
reg.pred[3,] # row 3
plot(pressione ~ eta, data=press)
abline(reg)
lines(asc, reg.pred[,2], lty=2)
lines(asc, reg.pred[,3], lty=2)
# uncertainty increases away from the centre
reg.pred2 <- predict(reg, data.frame(eta=asc), interval="prediction")
# asking for a C.I. on a new observation with that value of eta
lines(asc, reg.pred2[,2], lty=2, col=2)
lines(asc, reg.pred2[,3], lty=2, col=2)
# roughly, we are adding plus or minus 2 sigma to the prediction

# logistic regression. An example
library(MASS) # load the library, which also contains the dataset of interest
data()
help(rotifer) # describes the data
head(rotifer)
help(glm) # glm is the command to fit generalized linear models
reglogis <- glm(pm.y/pm.tot ~ density, weights=pm.tot, family=binomial, data=rotifer)
# logistic regression is performed through the command "glm" (generalized linear model),
# specifying that we require the binomial family
# there are different possible syntaxes in the formula. Here we give as response the frequency of "successes";
# with weights, we state the number of "trials"
summary(reglogis)
# we can read the coefficient estimates, with the corresponding standard errors,
# and the deviance, compared to that of a "saturated" model
# an alternative (equivalent) syntax
reglogis2 <- glm(cbind(pm.y, pm.tot-pm.y) ~ density, family=binomial, data=rotifer)
# the response is the pair (number of "successes", number of "failures"); then we do not need to give weights
summary(reglogis2)
# plotting the results...
plot(pm.y/pm.tot ~ density, data=rotifer)
lines(rotifer$density, reglogis$fitted)
# reglogis$fitted are the predicted probabilities
# the output is ok only because rotifer$density is ordered
# an alternative way to plot the model prediction, using predict(), so that the curve is smoother
# and there is no problem with ordering
M <- max(rotifer$density)
m <- min(rotifer$density)
asc <- m + (-10:210)*(M-m)/200
# these are the abscissae at which to compute the model (divide the interval (m, M) into 200 parts;
# add 10 points before and 10 after, at the same spacing)
ord <- predict(reglogis, data.frame(density=asc), type="response")
# with type="response" we get the predicted probability, not X*beta
# had we wished to obtain the predicted values of X*beta...
eta <- predict(reglogis, data.frame(density=asc), type="link")
# now plot the prediction together with the data points
plot(pm.y/pm.tot ~ density, data=rotifer, ylim=c(0,1)) # plot the data
lines(asc, ord) # add the model
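# A possible extension (a sketch, using the standard se.fit option of predict
# for glm objects): an approximate confidence band for the fitted curve,
# computed on the link scale, where the normal approximation is more reliable,
# and then mapped back to probabilities with the inverse link plogis()
pr <- predict(reglogis, data.frame(density=asc), type="link", se.fit=TRUE)
lines(asc, plogis(pr$fit + 2*pr$se.fit), lty=2) # upper band, roughly 95%
lines(asc, plogis(pr$fit - 2*pr$se.fit), lty=2) # lower band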