Chapter 8: Tree-Based Methods - Question 10

In [23]:
library(ISLR)
library(gbm)
summary(Hitters)
     AtBat            Hits         HmRun            Runs       
 Min.   : 16.0   Min.   :  1   Min.   : 0.00   Min.   :  0.00  
 1st Qu.:255.2   1st Qu.: 64   1st Qu.: 4.00   1st Qu.: 30.25  
 Median :379.5   Median : 96   Median : 8.00   Median : 48.00  
 Mean   :380.9   Mean   :101   Mean   :10.77   Mean   : 50.91  
 3rd Qu.:512.0   3rd Qu.:137   3rd Qu.:16.00   3rd Qu.: 69.00  
 Max.   :687.0   Max.   :238   Max.   :40.00   Max.   :130.00  
                                                               
      RBI             Walks            Years            CAtBat       
 Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
 1st Qu.: 28.00   1st Qu.: 22.00   1st Qu.: 4.000   1st Qu.:  816.8  
 Median : 44.00   Median : 35.00   Median : 6.000   Median : 1928.0  
 Mean   : 48.03   Mean   : 38.74   Mean   : 7.444   Mean   : 2648.7  
 3rd Qu.: 64.75   3rd Qu.: 53.00   3rd Qu.:11.000   3rd Qu.: 3924.2  
 Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
                                                                     
     CHits            CHmRun           CRuns             CRBI        
 Min.   :   4.0   Min.   :  0.00   Min.   :   1.0   Min.   :   0.00  
 1st Qu.: 209.0   1st Qu.: 14.00   1st Qu.: 100.2   1st Qu.:  88.75  
 Median : 508.0   Median : 37.50   Median : 247.0   Median : 220.50  
 Mean   : 717.6   Mean   : 69.49   Mean   : 358.8   Mean   : 330.12  
 3rd Qu.:1059.2   3rd Qu.: 90.00   3rd Qu.: 526.2   3rd Qu.: 426.25  
 Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.00  
                                                                     
     CWalks        League  Division    PutOuts          Assists     
 Min.   :   0.00   A:175   E:157    Min.   :   0.0   Min.   :  0.0  
 1st Qu.:  67.25   N:147   W:165    1st Qu.: 109.2   1st Qu.:  7.0  
 Median : 170.50                    Median : 212.0   Median : 39.5  
 Mean   : 260.24                    Mean   : 288.9   Mean   :106.9  
 3rd Qu.: 339.25                    3rd Qu.: 325.0   3rd Qu.:166.0  
 Max.   :1566.00                    Max.   :1378.0   Max.   :492.0  
                                                                    
     Errors          Salary       NewLeague
 Min.   : 0.00   Min.   :  67.5   A:176    
 1st Qu.: 3.00   1st Qu.: 190.0   N:146    
 Median : 6.00   Median : 425.0            
 Mean   : 8.04   Mean   : 535.9            
 3rd Qu.:11.00   3rd Qu.: 750.0            
 Max.   :32.00   Max.   :2460.0            
                 NA's   :59                

(a) Remove the observations with missing Salary values and log-transform the salaries.

In [12]:
# Remove the 59 observations with missing Salary values
hitters_dataset = na.omit(Hitters)
In [13]:
# Log-transform Salary
hitters_dataset$Salary = log(hitters_dataset$Salary)

(b) Create a training set of 200 observations, leaving the rest as a test set.

In [54]:
set.seed(1)
# Randomly assign 200 of the 263 complete observations to the training set
train = sample(1:nrow(hitters_dataset),200)

(c) Fit boosting models on the training set with 1,000 trees for a range of shrinkage values, and plot the training MSE against shrinkage.

In [55]:
# Grid of shrinkage (learning-rate) values to try
lambda = seq(0.1,0.9,0.1)
In [56]:
# Single boosted model at a fixed shrinkage of 0.2
boost.model = gbm(Salary~.,data=hitters_dataset[train,],n.trees=1000,shrinkage=0.2,distribution="gaussian",
                  interaction.depth=4)
In [57]:
# Training MSE for this fit; essentially zero, so the boosted model
# fits the training data almost perfectly
yhat = predict(boost.model,hitters_dataset[train,],n.trees=1000)
mean((yhat-hitters_dataset$Salary[train])^2)
1.36501282312376e-05
In [58]:
# Training MSE for each shrinkage value in the grid
mse = c()
for(i in 1:length(lambda)){
    boost.model = gbm(Salary~.,data=hitters_dataset[train,],n.trees=1000,shrinkage=lambda[i],distribution="gaussian",
                      interaction.depth=4)
    yhat = predict(boost.model,newdata=hitters_dataset[train,],n.trees=1000)
    mse[i]=mean((yhat-hitters_dataset$Salary[train])^2)
}
In [63]:
plot(lambda,mse,type="b",ylab="training mse")

(d) Plot the test MSE against the same shrinkage values.

In [65]:
# Test MSE for each shrinkage value in the grid
mse = c()
for(i in 1:length(lambda)){
    boost.model = gbm(Salary~.,data=hitters_dataset[train,],n.trees=1000,shrinkage=lambda[i],distribution="gaussian",
                      interaction.depth=4)
    yhat = predict(boost.model,newdata=hitters_dataset[-train,],n.trees=1000)
    mse[i]=mean((yhat-hitters_dataset$Salary[-train])^2)
}
In [67]:
plot(lambda,mse,ylab="test mse",type="b")
In [71]:
# Shrinkage value (learning rate) that gives the lowest test MSE
lambda[which.min(mse)]
0.3
In [73]:
# Lowest test MSE across the shrinkage grid
min(mse)
0.208067334005743
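The loop above fixes n.trees at 1,000 for every shrinkage value. As an optional extension (not part of the original exercise), gbm can also cross-validate the number of trees via its cv.folds argument, and gbm.perf returns the CV-optimal iteration. A minimal sketch at the best shrinkage found above:

# Optional: 5-fold CV to choose the number of trees at shrinkage 0.3
cv.model = gbm(Salary~.,data=hitters_dataset[train,],distribution="gaussian",
               n.trees=1000,shrinkage=0.3,interaction.depth=4,cv.folds=5)
best.iter = gbm.perf(cv.model,method="cv")  # CV-optimal number of trees
yhat = predict(cv.model,newdata=hitters_dataset[-train,],n.trees=best.iter)
mean((yhat-hitters_dataset$Salary[-train])^2)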

(e) Compare the test MSE of boosting with the test MSE of linear regression, with and without best subset selection.

In [74]:
# Test MSE of a linear regression model fit on the training set
lm.model = lm(Salary~.,data=hitters_dataset,subset=train)
yhat = predict(lm.model,newdata=hitters_dataset[-train,])
mean((yhat-hitters_dataset$Salary[-train])^2)
0.393866557226116
In [78]:
# Linear regression with best subset selection
# Note: regsubsets is fit on the full dataset here; fitting it on the
# training observations only would give a cleaner test-MSE comparison
library(leaps)
bs.model = regsubsets(Salary~.,data=hitters_dataset,nvmax=19)
bs.model.summary = summary(bs.model)
In [79]:
names(bs.model.summary)
[1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
In [85]:
# Plot the model-selection criteria against the number of predictors
par(mfrow=c(2,2))
plot(bs.model.summary$adjr2,type="l",xlab="number of variables",ylab="adjusted R-squared")
plot(bs.model.summary$cp,type="l",xlab="number of variables",ylab="Cp")
plot(bs.model.summary$bic,type="l",xlab="number of variables",ylab="BIC")
In [89]:
#no. of predictors that give the highest adjusted R-squared value
which.max(bs.model.summary$adjr2)
13
In [90]:
#no. of predictors that give the lowest cp value
which.min(bs.model.summary$cp)
9
In [92]:
#no. of predictors that give the lowest bic
which.min(bs.model.summary$bic)
3
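The three criteria disagree (13, 9, and 3 variables), so it can help to mark each optimum on its curve. A small sketch for the BIC panel; the other two panels work the same way:

# Highlight the BIC-optimal model size on the criterion curve
plot(bs.model.summary$bic,type="l",xlab="number of variables",ylab="BIC")
points(which.min(bs.model.summary$bic),min(bs.model.summary$bic),col="red",pch=19)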
In [115]:
# Test MSE for the 13-variable model. regsubsets has no predict method,
# so the test model matrix is built by hand.
n = names(coef(bs.model,13))
test.mat = model.matrix(Salary~.,data=hitters_dataset[-train,])
yhat = test.mat[,n]%*%coef(bs.model,13)
mean((yhat-hitters_dataset$Salary[-train])^2)
0.242233822755051
In [116]:
# Test MSE for the 9-variable model
n = names(coef(bs.model,9))
test.mat = model.matrix(Salary~.,data=hitters_dataset[-train,])
yhat = test.mat[,n]%*%coef(bs.model,9)
mean((yhat-hitters_dataset$Salary[-train])^2)
0.261273947796572
In [117]:
# Test MSE for the 3-variable model
n = names(coef(bs.model,3))
test.mat = model.matrix(Salary~.,data=hitters_dataset[-train,])
yhat = test.mat[,n]%*%coef(bs.model,3)
mean((yhat-hitters_dataset$Salary[-train])^2)
0.250683079506544

In part (d), boosting achieves a test MSE of about 0.21, which is lower than the test MSE of the full linear regression model (0.39) and of the best-subset models with 13, 9, and 3 variables (0.24, 0.26, and 0.25 respectively).
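For a side-by-side view, the test MSEs reported above can be collected into a named vector (values hard-coded from the outputs in this notebook, rounded to three decimals):

# Test MSEs of the models compared in this exercise
test.mse = c(boosting=0.208, lm.full=0.394, subset.13=0.242,
             subset.9=0.261, subset.3=0.251)
sort(test.mse)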

(f) Which variables appear to be the most important predictors in the boosted model?

In [118]:
summary(boost.model)
      var     rel.inf
     CRBI  22.3654877
      RBI   7.6115471
  PutOuts   7.2129052
   CWalks   7.0474275
    Walks   6.2574998
    AtBat   5.4665939
  Assists   4.7977665
   CHmRun   4.5353878
    CHits   4.4261210
     Runs   4.3462279
   Errors   4.0873342
    Years   3.9898132
     Hits   3.8697769
    HmRun   3.7694035
   CAtBat   3.3025405
    CRuns   2.7592266
 Division   1.8578448
NewLeague   1.5045714
   League   0.7925247

CRBI is by far the most important variable in the boosted model, followed by RBI, PutOuts, and CWalks.
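To see how the top predictor drives the fit, gbm's plot method can draw a partial dependence plot; a sketch for CRBI:

# Partial dependence of log-salary on CRBI, the most influential predictor
plot(boost.model,i.var="CRBI",n.trees=1000)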

(g) Apply bagging to the training set and report the test MSE.

In [120]:
library(randomForest)
In [121]:
# Bagging is a random forest that uses all p predictors at each split.
# randomForest takes ntree (not n.trees) for the number of trees.
p = ncol(hitters_dataset)-1
bag.model = randomForest(Salary~.,data=hitters_dataset[train,],ntree=500,mtry=p)
In [123]:
# Test MSE of the bagged model
yhat = predict(bag.model,newdata=hitters_dataset[-train,])
mean((yhat-hitters_dataset$Salary[-train])^2)
0.116289237234559

Bagging gives a test MSE of about 0.12, which is lower than that of every other model fit in this exercise.
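For comparison (beyond what the exercise asks), a random forest that samples only a subset of predictors at each split, e.g. the package's regression default of p/3, can be evaluated the same way. A sketch:

# Random forest with mtry = p/3, the randomForest default for regression
rf.model = randomForest(Salary~.,data=hitters_dataset[train,],ntree=500,
                        mtry=floor(p/3))
yhat = predict(rf.model,newdata=hitters_dataset[-train,])
mean((yhat-hitters_dataset$Salary[-train])^2)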