# Linear models on built-in data sets: multiple regression (airquality),
# one-way analysis of variance (PlantGrowth) and a test of nested models.

summary(airquality)

# Multiple regression ----
# Note the particular (symbolic) use of the symbol + inside the formula.
reg <- lm(Ozone ~ Solar.R + Wind + Temp, data = airquality)
summary(reg)

# Using update() we can avoid rewriting the whole command. "." on the left
# of ~ means "keep the same term"; ". - Temp" on the right means "keep the
# same terms but without Temp" (- is used symbolically, like +).
reg1 <- update(reg, . ~ . - Temp)
summary(reg1)
# Note that after dropping one variable, the estimates of all the other
# coefficients have changed.

# An example of analysis of variance ----
summary(PlantGrowth)
# Produces a boxplot, since group is categorical.
plot(weight ~ group, data = PlantGrowth)
# Variances do not look strikingly different among groups, so we continue
# with lm().
g <- lm(weight ~ group, data = PlantGrowth)
summary(g)

# Test of two models nested into each other ----
reg2 <- update(reg, . ~ . - Solar.R - Wind)
# This comparison does not work: the two regressions are computed on
# different data (lm() drops rows with missing values, and each model uses
# different columns). The error is caught so that the script keeps running.
anova_attempt <- tryCatch(
  anova(reg, reg2),
  error = function(e) {
    message("anova(reg, reg2) failed: ", conditionMessage(e))
    NULL
  }
)

# Solution: take away all observations with missing values.
is.na(airquality$Ozone[1]) # asks whether Ozone[1] is a missing value
airquality$Ozone[1]        # indeed Ozone[1] is 41

# | is the (elementwise) logical operator "or", hence "missing" is TRUE for
# a row if at least one of the model variables is missing in that row.
# with() evaluates the expression inside the data frame, so there is no need
# to attach() it or to write "airquality$" in front of every variable.
# (The vector is named "missing" as in the original script; it does not
# interfere with calls to the base function missing().)
missing <- with(
  airquality,
  is.na(Ozone) | is.na(Solar.R) | is.na(Wind) | is.na(Temp)
)
missing # logical vector indicating which observations have missing values
# Row 10 has at least one missing value; in fact missing[10] is TRUE.
airquality[10, ]

# Create a data.frame with only the observations without missing values.
# [i, ] indicates row i ([, i] would have been column i).
cleanair <- airquality[!missing, ]
str(cleanair) # shows that there are only 111 observations (= rows)

reg <- lm(Ozone ~ Solar.R + Wind + Temp, data = cleanair)
summary(reg) # is the same as before
reg2 <- update(reg, . ~ . - Solar.R - Wind)
summary(reg2)
# Both models are now fitted to the same rows, so the F test is valid.
# We have to reject the null hypothesis (i.e. the simpler model reg2).
anova(reg, reg2)

# Graphical diagnostics. It appears that the assumptions of linear models
# are not satisfied...
plot(reg)