Chapter 8 Tree-Based Methods - Question 8

In [1]:
library(ISLR)
In [2]:
names(Carseats)
 [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population"
 [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"
[11] "US"

a

In [14]:
set.seed(1)
train = sample(1:nrow(Carseats),nrow(Carseats)/2)
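The held-out half of the data serves as the test set. A minimal sketch for reference (the cells below simply index Carseats[-train,] directly instead of creating these objects):

carseats.test = Carseats[-train,]      # test predictors
sales.test = Carseats$Sales[-train]    # test response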

b

In [15]:
library(tree)
carseats.tree = tree(Sales~.,data=Carseats,subset=train)
In [16]:
plot(carseats.tree)
text(carseats.tree,pretty=0)
In [17]:
#Finding the mean squared error
carseats.pred = predict(carseats.tree,newdata=Carseats[-train,])
mean((carseats.pred-Carseats$Sales[-train])^2)
4.14889745049246

c

In [20]:
cv.carseats.tree = cv.tree(carseats.tree)
cv.carseats.tree
$size
 [1] 18 17 16 15 14 12 11 10  9  8  7  6  5  4  3  1

$dev
 [1] 1013.2727  995.5856  995.5856 1040.5112 1040.5112  979.9660  983.3205
 [8]  991.4825  991.3305 1024.1369 1018.1733 1038.2824 1124.6938 1127.3483
[15] 1236.5701 1562.7692

$k
 [1]      -Inf  15.48181  15.53599  18.69038  18.74886  21.05038  23.79480
 [8]  25.78579  26.01210  30.10435  32.74801  53.28569  72.33061  78.19599
[15] 141.73781 251.22901

$method
[1] "deviance"

attr(,"class")
[1] "prune"         "tree.sequence"
In [21]:
plot(cv.carseats.tree$size,cv.carseats.tree$dev,type="b")
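Rather than reading the minimum off the plot, the optimal size can be extracted programmatically. A small sketch using the cv.tree output above (best.size is an illustrative name, not part of the original notebook):

# size with the smallest cross-validated deviance
best.size = cv.carseats.tree$size[which.min(cv.carseats.tree$dev)]
best.size   # 12 for this seed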
In [25]:
#As we can see, the tree with 12 terminal nodes gives the lowest cross-validated deviance.
prune.carseats.tree = prune.tree(carseats.tree,best=12)
plot(prune.carseats.tree)
text(prune.carseats.tree)
In [26]:
yhat = predict(prune.carseats.tree,newdata=Carseats[-train,])
mean((yhat-Carseats$Sales[-train])^2)
4.61032165137014

Pruning the tree does not decrease the test error: the test MSE is 4.1 without pruning and 4.6 with pruning.
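To see how the test error depends on the amount of pruning, one could compute the test MSE for every candidate size from the cross-validation output. A sketch along those lines (the loop and variable names are illustrative):

# test MSE for each pruned tree size (excluding the single-node stump)
sizes = cv.carseats.tree$size[cv.carseats.tree$size > 1]
test.mse = sapply(sizes, function(s) {
  pruned = prune.tree(carseats.tree, best = s)
  pred = predict(pruned, newdata = Carseats[-train,])
  mean((pred - Carseats$Sales[-train])^2)
})
cbind(size = sizes, test.mse)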

d

In [29]:
library(randomForest)
In [38]:
# p = the number of variables considered at each split
p = ncol(Carseats)-1
bag.carseats = randomForest(Sales~.,data=Carseats,mtry=p,ntree=500,importance=TRUE,subset=train)
In [39]:
yhat = predict(bag.carseats,newdata=Carseats[-train,])
mean((yhat-Carseats$Sales[-train])^2)
2.59330551265318

Bagging has reduced the test MSE from 4.1 to 2.6.

In [40]:
varImpPlot(bag.carseats)

Price and ShelveLoc are the two most important variables in predicting Sales.
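The same ranking can be read off numerically rather than from the plot. A small sketch using the importance matrix returned by randomForest (this assumes importance=TRUE was set when fitting, as above):

# variables sorted by %IncMSE for the bagged model
imp = importance(bag.carseats)
imp[order(imp[,"%IncMSE"], decreasing = TRUE),]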

e

In [41]:
# p = the number of variables considered at each split (here sqrt(p) and p/2)
p = sqrt(ncol(Carseats)-1)
p2 = (ncol(Carseats)-1)/2
randf.carseats = randomForest(Sales~.,data=Carseats,mtry=p,ntree=500,importance=TRUE,subset=train)
randf.carseats.p2 = randomForest(Sales~.,data=Carseats,mtry=p2,ntree=500,importance=TRUE,subset=train)
In [37]:
yhat = predict(randf.carseats,newdata=Carseats[-train,])
mean((yhat-Carseats$Sales[-train])^2)
3.32981861576787
In [42]:
yhat = predict(randf.carseats.p2,newdata=Carseats[-train,])
mean((yhat-Carseats$Sales[-train])^2)
2.85454077752842

Relative to bagging, the random forest increases the test MSE: 3.33 with mtry = sqrt(p) and 2.85 with mtry = p/2, compared to 2.59 for bagging.
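To describe the effect of m (the number of variables tried at each split) more fully, one could sweep mtry over its whole range and record the test MSE. A sketch of that idea (illustrative only; results depend on the seed):

# test MSE as a function of mtry
mtry.values = 1:(ncol(Carseats)-1)
mse.by.mtry = sapply(mtry.values, function(m) {
  fit = randomForest(Sales~., data = Carseats, subset = train, mtry = m, ntree = 500)
  pred = predict(fit, newdata = Carseats[-train,])
  mean((pred - Carseats$Sales[-train])^2)
})
plot(mtry.values, mse.by.mtry, type = "b", xlab = "mtry", ylab = "Test MSE")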

In [44]:
importance(randf.carseats)
varImpPlot(randf.carseats)
               %IncMSE IncNodePurity
CompPrice    7.0760560     129.67547
Income       5.4464122     126.45442
Advertising 12.9804105     138.40878
Population  -1.3719784      97.99927
Price       37.0633392     382.26242
ShelveLoc   30.4561325     236.74282
Age         17.9715510     196.77469
Education    2.0792595      72.60662
Urban       -0.5662764      16.19988
US           5.2840921      32.31290

According to the random forest as well, Price and ShelveLoc are the two most important variables.