Chapter 7 - Moving Beyond Linearity, Question 8

library(ISLR)
summary(Auto)

      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
                                                                               
  acceleration        year           origin                      name    
 Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
 1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
 Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
 Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
 3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
 Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
                                                 (Other)           :365

plot(Auto)

Displacement, Horsepower and Weight seem to have a non-linear relation with mpg

Polynomial Regression¶

library(boot)
cv.err = c()
set.seed(1)
Auto = na.omit(Auto)
for(i in 1:6){
    glm.model = glm(mpg~poly(weight,i),data=Auto)
    cv.err[i] = cv.glm(Auto,glm.model,K=10)$delta[2]
}
plot(cv.err,type="l",xlab="no. of degrees",ylab="cv err")
title("10-fold Cross Validation Graph")

weight.limits = range(Auto$weight)
weight.grid = seq(weight.limits[1],weight.limits[2])
glm.model = glm(mpg~poly(weight,2),data=Auto)
pred = predict(glm.model,newdata=list(weight=weight.grid))
plot(Auto$weight,Auto$mpg,col="darkgrey",xlab="weight",ylab="mpg")
title("mpg vs weight graph")
lines(weight.grid,pred,col="blue")

Natural Spline¶

library(splines)
err = c()
set.seed(1)
for(i in 1:10){
    ns.model = glm(mpg~ns(weight,i),data=Auto)
    err[i]=cv.glm(Auto,ns.model,K=10)$delta[2]
}
err

The error doesn't significantly decrease with the increase in the degrees of freedom of the natural spline function. So we will choose a smaller number of degrees of freedom to make the model simpler and avoid overfitting the data. We will choose the degrees of freedom as 3.

ns.model = lm(mpg~ns(weight,3),data=Auto)
pred = predict(ns.model,newdata=list(weight=weight.grid))
plot(Auto$weight,Auto$mpg,col="darkgrey",xlab="weight",ylab="mpg")
title("mpg vs weight graph")
lines(weight.grid,pred,col="blue")

Smoothing Spline¶

#We will let the function choose the degree of freedom
smooth.model = smooth.spline(Auto$weight,Auto$mpg,cv=TRUE)
smooth.model$df

Warning message in smooth.spline(Auto$weight, Auto$mpg, cv = TRUE):
“cross-validation with non-unique 'x' values seems doubtful”

plot(Auto$weight,Auto$mpg,col="darkgrey",xlab="weight",ylab="mpg")
title("mpg vs weight graph")
lines(smooth.model,col="blue")

Generalized Additive Models¶

library(gam)
gam.model = gam(mpg~ns(weight,3)+ns(displacement,3),data=Auto)
deviance(ns.model) #deviance of natural spline
deviance(gam.model) #deviance of Generalized additive model

Polynomial Regression¶

Natural Spline¶

Smoothing Spline¶

Generalized Additive Models¶

Piecewise model¶