In [3]:
library(ISLR)
In [15]:
set.seed(1)
train = sample(1:nrow(Hitters),nrow(Hitters)/2)
hitters_dataset = na.omit(Hitters)
summary(hitters_dataset)
     AtBat            Hits           HmRun            Runs       
 Min.   : 19.0   Min.   :  1.0   Min.   : 0.00   Min.   :  0.00  
 1st Qu.:282.5   1st Qu.: 71.5   1st Qu.: 5.00   1st Qu.: 33.50  
 Median :413.0   Median :103.0   Median : 9.00   Median : 52.00  
 Mean   :403.6   Mean   :107.8   Mean   :11.62   Mean   : 54.75  
 3rd Qu.:526.0   3rd Qu.:141.5   3rd Qu.:18.00   3rd Qu.: 73.00  
 Max.   :687.0   Max.   :238.0   Max.   :40.00   Max.   :130.00  
      RBI             Walks            Years            CAtBat       
 Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
 1st Qu.: 30.00   1st Qu.: 23.00   1st Qu.: 4.000   1st Qu.:  842.5  
 Median : 47.00   Median : 37.00   Median : 6.000   Median : 1931.0  
 Mean   : 51.49   Mean   : 41.11   Mean   : 7.312   Mean   : 2657.5  
 3rd Qu.: 71.00   3rd Qu.: 57.00   3rd Qu.:10.000   3rd Qu.: 3890.5  
 Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
     CHits            CHmRun           CRuns             CRBI       
 Min.   :   4.0   Min.   :  0.00   Min.   :   2.0   Min.   :   3.0  
 1st Qu.: 212.0   1st Qu.: 15.00   1st Qu.: 105.5   1st Qu.:  95.0  
 Median : 516.0   Median : 40.00   Median : 250.0   Median : 230.0  
 Mean   : 722.2   Mean   : 69.24   Mean   : 361.2   Mean   : 330.4  
 3rd Qu.:1054.0   3rd Qu.: 92.50   3rd Qu.: 497.5   3rd Qu.: 424.5  
 Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.0  
     CWalks       League  Division    PutOuts          Assists     
 Min.   :   1.0   A:139   E:129    Min.   :   0.0   Min.   :  0.0  
 1st Qu.:  71.0   N:124   W:134    1st Qu.: 113.5   1st Qu.:  8.0  
 Median : 174.0                    Median : 224.0   Median : 45.0  
 Mean   : 260.3                    Mean   : 290.7   Mean   :118.8  
 3rd Qu.: 328.5                    3rd Qu.: 322.5   3rd Qu.:192.0  
 Max.   :1566.0                    Max.   :1377.0   Max.   :492.0  
     Errors           Salary       NewLeague
 Min.   : 0.000   Min.   :  67.5   A:141    
 1st Qu.: 3.000   1st Qu.: 190.0   N:122    
 Median : 7.000   Median : 425.0            
 Mean   : 8.593   Mean   : 535.9            
 3rd Qu.:13.000   3rd Qu.: 750.0            
 Max.   :32.000   Max.   :2460.0            
In [16]:
hitters_dataset$Salary = log(hitters_dataset$Salary)

Boosting

In [21]:
set.seed(1)
library(gbm)
boost.model = gbm(Salary~.,data=hitters_dataset[train,],shrinkage=0.2,n.trees=1000,distribution="gaussian")
yhat = predict(boost.model,newdata=hitters_dataset[-train,],n.trees=1000)
#mse
mean((yhat-hitters_dataset$Salary[-train])^2)
0.310433860823912

Bagging

In [27]:
set.seed(1)
p = ncol(hitters_dataset)-1
library(randomForest)
bag.model = randomForest(Salary~.,data=hitters_dataset,mtry=p,n.trees=1000,subset=train)
yhat = predict(bag.model,newdata=hitters_dataset[-train,])
#mse
mean((yhat-hitters_dataset$Salary[-train])^2)
0.186204695514195

Random Forests

In [28]:
set.seed(1)
p = sqrt(ncol(hitters_dataset)-1)
rf.model = randomForest(Salary~.,data=hitters_dataset,mtry=p,n.trees=1000,subset=train)
yhat=predict(rf.model,newdata=hitters_dataset[-train,])
#mse
mean((yhat-hitters_dataset$Salary[-train])^2)
0.170901228749609

Linear Regression

In [30]:
lm.model = lm(Salary~.,data=hitters_dataset,subset=train)
yhat = predict(lm.model,newdata=hitters_dataset[-train,])
#mse
mean((yhat-hitters_dataset$Salary)^2)
Warning message in yhat - hitters_dataset$Salary:
“longer object length is not a multiple of shorter object length”
1.1924944665497

Result

Random Forest model gives the lowest mse