Chapter 4-Classification, Q13

library(MASS)
names(Boston)

crime_class = rep(0,length(Boston$crim))
crim_median = median(Boston$crim)
crime_class[Boston$crim>crim_median]=1
new_Boston = data.frame(Boston,crime_class)
names(new_Boston)

cor(new_Boston)

The predictors indus, nox, age, dis, rad and tax have a significant correlation with the crime_class

#creating training data and test data
nrow(new_Boston)

#new_Boston = new_Boston[sample(nrow(new_Boston)),]
training_data = new_Boston[1:406,]
test_data = new_Boston[407:nrow(new_Boston),]

LDA¶

lda.model = lda(crime_class~indus+nox+age+dis+rad+tax,data=training_data)
lda.pred = predict(lda.model,newdata=test_data)
table(lda.pred$class,test_data$crime_class)

   
     0  1
  0  2  1
  1 13 84

#error rate
mean(lda.pred$class!=test_data$crime_class)*100

#False positives is high. Increasing the cutoff probability
lda.pred2 = rep(0,length(test_data$crime_class))
lda.pred2[lda.pred$posterior[,2]>0.7]=1
table(lda.pred2,test_data$crime_class)

         
lda.pred2  0  1
        0 10  5
        1  5 80

#error rate
mean(lda.pred2!=test_data$crime_class)*100

Logistic Regression¶

lgs.model = glm(crime_class~indus+nox+rad+age+dis+tax,data=training_data,family=binomial)
summary(lgs.model)

Call:
glm(formula = crime_class ~ indus + nox + rad + age + dis + tax, 
    family = binomial, data = training_data)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.18209  -0.28262  -0.08559   0.04478   2.79128  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -28.284373   4.321520  -6.545 5.95e-11 ***
indus        -0.088696   0.049488  -1.792   0.0731 .  
nox          50.338126   8.474764   5.940 2.85e-09 ***
rad           0.557643   0.129025   4.322 1.55e-05 ***
age           0.003107   0.009210   0.337   0.7359    
dis           0.167513   0.151451   1.106   0.2687    
tax          -0.003928   0.003159  -1.243   0.2138    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 550.71  on 405  degrees of freedom
Residual deviance: 214.16  on 399  degrees of freedom
AIC: 228.16

Number of Fisher Scoring iterations: 8

lgs.probs = predict(lgs.model,newdata=test_data,type="response")
lgs.pred = rep(0,length(test_data$crime_class))
lgs.pred[lgs.probs>0.5]=1
#error rate
mean(lgs.pred!=test_data$crime_class)*100

table(lgs.pred,test_data$crime_class)

        
lgs.pred  0  1
       0 10  0
       1  5 85

KNN¶

library(class)
train.y = training_data$crime_class
test.y = test_data$crime_class
training_data_k = training_data[,-c(1,2,4,6)]
test_data_k = test_data[,-c(1,2,4,6)]
knn.pred = knn(training_data_k,test_data_k,train.y,k=3)

#error rate
mean(knn.pred!=test.y)*100

table(knn.pred,test.y)

        test.y
knn.pred  0  1
       0 10  3
       1  5 82

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv	crime_class
crim	1.00000000	-0.20046922	0.40658341	-0.055891582	0.42097171	-0.21924670	0.35273425	-0.37967009	0.625505145	0.58276431	0.2899456	-0.38506394	0.4556215	-0.3883046	0.40939545
zn	-0.20046922	1.00000000	-0.53382819	-0.042696719	-0.51660371	0.31199059	-0.56953734	0.66440822	-0.311947826	-0.31456332	-0.3916785	0.17552032	-0.4129946	0.3604453	-0.43615103
indus	0.40658341	-0.53382819	1.00000000	0.062938027	0.76365145	-0.39167585	0.64477851	-0.70802699	0.595129275	0.72076018	0.3832476	-0.35697654	0.6037997	-0.4837252	0.60326017
chas	-0.05589158	-0.04269672	0.06293803	1.000000000	0.09120281	0.09125123	0.08651777	-0.09917578	-0.007368241	-0.03558652	-0.1215152	0.04878848	-0.0539293	0.1752602	0.07009677
nox	0.42097171	-0.51660371	0.76365145	0.091202807	1.00000000	-0.30218819	0.73147010	-0.76923011	0.611440563	0.66802320	0.1889327	-0.38005064	0.5908789	-0.4273208	0.72323480
rm	-0.21924670	0.31199059	-0.39167585	0.091251225	-0.30218819	1.00000000	-0.24026493	0.20524621	-0.209846668	-0.29204783	-0.3555015	0.12806864	-0.6138083	0.6953599	-0.15637178
age	0.35273425	-0.56953734	0.64477851	0.086517774	0.73147010	-0.24026493	1.00000000	-0.74788054	0.456022452	0.50645559	0.2615150	-0.27353398	0.6023385	-0.3769546	0.61393992
dis	-0.37967009	0.66440822	-0.70802699	-0.099175780	-0.76923011	0.20524621	-0.74788054	1.00000000	-0.494587930	-0.53443158	-0.2324705	0.29151167	-0.4969958	0.2499287	-0.61634164
rad	0.62550515	-0.31194783	0.59512927	-0.007368241	0.61144056	-0.20984667	0.45602245	-0.49458793	1.000000000	0.91022819	0.4647412	-0.44441282	0.4886763	-0.3816262	0.61978625
tax	0.58276431	-0.31456332	0.72076018	-0.035586518	0.66802320	-0.29204783	0.50645559	-0.53443158	0.910228189	1.00000000	0.4608530	-0.44180801	0.5439934	-0.4685359	0.60874128
ptratio	0.28994558	-0.39167855	0.38324756	-0.121515174	0.18893268	-0.35550149	0.26151501	-0.23247054	0.464741179	0.46085304	1.0000000	-0.17738330	0.3740443	-0.5077867	0.25356836
black	-0.38506394	0.17552032	-0.35697654	0.048788485	-0.38005064	0.12806864	-0.27353398	0.29151167	-0.444412816	-0.44180801	-0.1773833	1.00000000	-0.3660869	0.3334608	-0.35121093
lstat	0.45562148	-0.41299457	0.60379972	-0.053929298	0.59087892	-0.61380827	0.60233853	-0.49699583	0.488676335	0.54399341	0.3740443	-0.36608690	1.0000000	-0.7376627	0.45326273
medv	-0.38830461	0.36044534	-0.48372516	0.175260177	-0.42732077	0.69535995	-0.37695457	0.24992873	-0.381626231	-0.46853593	-0.5077867	0.33346082	-0.7376627	1.0000000	-0.26301673
crime_class	0.40939545	-0.43615103	0.60326017	0.070096774	0.72323480	-0.15637178	0.61393992	-0.61634164	0.619786249	0.60874128	0.2535684	-0.35121093	0.4532627	-0.2630167	1.00000000