Chapter 4-Classification, Q13

In [1]:
library(MASS)
names(Boston)
  1. 'crim'
  2. 'zn'
  3. 'indus'
  4. 'chas'
  5. 'nox'
  6. 'rm'
  7. 'age'
  8. 'dis'
  9. 'rad'
  10. 'tax'
  11. 'ptratio'
  12. 'black'
  13. 'lstat'
  14. 'medv'
In [2]:
crime_class = rep(0,length(Boston$crim))
crim_median = median(Boston$crim)
crime_class[Boston$crim>crim_median]=1
new_Boston = data.frame(Boston,crime_class)
names(new_Boston)
  1. 'crim'
  2. 'zn'
  3. 'indus'
  4. 'chas'
  5. 'nox'
  6. 'rm'
  7. 'age'
  8. 'dis'
  9. 'rad'
  10. 'tax'
  11. 'ptratio'
  12. 'black'
  13. 'lstat'
  14. 'medv'
  15. 'crime_class'
In [3]:
cor(new_Boston)
crimzninduschasnoxrmagedisradtaxptratioblacklstatmedvcrime_class
crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171 -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456 -0.38506394 0.4556215 -0.3883046 0.40939545
zn-0.20046922 1.00000000 -0.53382819 -0.042696719-0.51660371 0.31199059 -0.56953734 0.66440822 -0.311947826-0.31456332 -0.3916785 0.17552032 -0.4129946 0.3604453 -0.43615103
indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145 -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476 -0.35697654 0.6037997 -0.4837252 0.60326017
chas-0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281 0.09125123 0.08651777 -0.09917578 -0.007368241-0.03558652 -0.1215152 0.04878848 -0.0539293 0.1752602 0.07009677
nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000 -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327 -0.38005064 0.5908789 -0.4273208 0.72323480
rm-0.21924670 0.31199059 -0.39167585 0.091251225-0.30218819 1.00000000 -0.24026493 0.20524621 -0.209846668-0.29204783 -0.3555015 0.12806864 -0.6138083 0.6953599 -0.15637178
age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010 -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150 -0.27353398 0.6023385 -0.3769546 0.61393992
dis-0.37967009 0.66440822 -0.70802699 -0.099175780-0.76923011 0.20524621 -0.74788054 1.00000000 -0.494587930-0.53443158 -0.2324705 0.29151167 -0.4969958 0.2499287 -0.61634164
rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056 -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412 -0.44441282 0.4886763 -0.3816262 0.61978625
tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320 -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530 -0.44180801 0.5439934 -0.4685359 0.60874128
ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268 -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000 -0.17738330 0.3740443 -0.5077867 0.25356836
black-0.38506394 0.17552032 -0.35697654 0.048788485-0.38005064 0.12806864 -0.27353398 0.29151167 -0.444412816-0.44180801 -0.1773833 1.00000000 -0.3660869 0.3334608 -0.35121093
lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892 -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443 -0.36608690 1.0000000 -0.7376627 0.45326273
medv-0.38830461 0.36044534 -0.48372516 0.175260177-0.42732077 0.69535995 -0.37695457 0.24992873 -0.381626231-0.46853593 -0.5077867 0.33346082 -0.7376627 1.0000000 -0.26301673
crime_class 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684 -0.35121093 0.4532627 -0.2630167 1.00000000

The predictors indus, nox, age, dis, rad and tax have a significant correlation with the crime_class

In [4]:
#creating training data and test data
nrow(new_Boston)
506
In [45]:
#new_Boston = new_Boston[sample(nrow(new_Boston)),]
training_data = new_Boston[1:406,]
test_data = new_Boston[407:nrow(new_Boston),]

LDA

In [44]:
lda.model = lda(crime_class~indus+nox+age+dis+rad+tax,data=training_data)
lda.pred = predict(lda.model,newdata=test_data)
table(lda.pred$class,test_data$crime_class)
   
     0  1
  0  2  1
  1 13 84
In [47]:
#error rate
mean(lda.pred$class!=test_data$crime_class)*100
14
In [48]:
#False positives is high. Increasing the cutoff probability
lda.pred2 = rep(0,length(test_data$crime_class))
lda.pred2[lda.pred$posterior[,2]>0.7]=1
table(lda.pred2,test_data$crime_class)
         
lda.pred2  0  1
        0 10  5
        1  5 80
In [40]:
#error rate
mean(lda.pred2!=test_data$crime_class)*100
10

Logistic Regression

In [22]:
lgs.model = glm(crime_class~indus+nox+rad+age+dis+tax,data=training_data,family=binomial)
summary(lgs.model)
Call:
glm(formula = crime_class ~ indus + nox + rad + age + dis + tax, 
    family = binomial, data = training_data)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.18209  -0.28262  -0.08559   0.04478   2.79128  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -28.284373   4.321520  -6.545 5.95e-11 ***
indus        -0.088696   0.049488  -1.792   0.0731 .  
nox          50.338126   8.474764   5.940 2.85e-09 ***
rad           0.557643   0.129025   4.322 1.55e-05 ***
age           0.003107   0.009210   0.337   0.7359    
dis           0.167513   0.151451   1.106   0.2687    
tax          -0.003928   0.003159  -1.243   0.2138    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 550.71  on 405  degrees of freedom
Residual deviance: 214.16  on 399  degrees of freedom
AIC: 228.16

Number of Fisher Scoring iterations: 8
In [23]:
lgs.probs = predict(lgs.model,newdata=test_data,type="response")
lgs.pred = rep(0,length(test_data$crime_class))
lgs.pred[lgs.probs>0.5]=1
#error rate
mean(lgs.pred!=test_data$crime_class)*100
5
In [24]:
table(lgs.pred,test_data$crime_class)
        
lgs.pred  0  1
       0 10  0
       1  5 85

KNN

In [33]:
library(class)
train.y = training_data$crime_class
test.y = test_data$crime_class
training_data_k = training_data[,-c(1,2,4,6)]
test_data_k = test_data[,-c(1,2,4,6)]
knn.pred = knn(training_data_k,test_data_k,train.y,k=3)
In [34]:
#error rate
mean(knn.pred!=test.y)*100
8
In [27]:
table(knn.pred,test.y)
        test.y
knn.pred  0  1
       0 10  3
       1  5 82