Exercise 4.13

This is similar to exercise 4.11. We define a new binary variable, CRIM01, with the value 1 if the crime rate is above the median and 0 otherwise.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score

from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2, so this needs an older version

%matplotlib inline
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
df['CRIM01'] = np.where(df['CRIM'] > df['CRIM'].median(), 1, 0)
df.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM01
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 0
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 0
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 0
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 0
df = df.drop('CRIM', axis=1)
df.head()
ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM01
0 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 0
1 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 0
2 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 0
3 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 0
4 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 0

As in 4.11, we try to assess which variables are most associated with 'CRIM01' by looking at the correlations between the predictors and the response.

df.corr()
ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM01
ZN 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 -0.436151
INDUS -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 0.603260
CHAS -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.070097
NOX -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 0.723235
RM 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 -0.156372
AGE -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 0.613940
DIS 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 -0.616342
RAD -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 0.619786
TAX -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 0.608741
PTRATIO -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 0.253568
B 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 -0.351211
LSTAT -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 0.453263
CRIM01 -0.436151 0.603260 0.070097 0.723235 -0.156372 0.613940 -0.616342 0.619786 0.608741 0.253568 -0.351211 0.453263 1.000000
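
Since seaborn is already imported, a heatmap makes the correlation matrix easier to scan at a glance (a quick sketch; the figure size and colormap are arbitrary choices):

plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')  # annotate each cell with its correlation
plt.show()
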
x_all = df.iloc[:,:-1].values   # all 12 predictors
x_6 = df[['INDUS','NOX','AGE','DIS','RAD','TAX']].values   # the six with |correlation| > 0.6 with CRIM01
y = df['CRIM01'].values

# the same random_state gives identical row splits, so y_train/y_test are consistent across both calls
x_all_train, x_all_test, y_train, y_test = train_test_split(x_all, y, random_state=1)
x_6_train, x_6_test, y_train, y_test = train_test_split(x_6, y, random_state=1)
lr = LogisticRegression()
lr.fit(x_6_train, y_train)
print("Accuracy with x_6  :",accuracy_score(y_test, lr.predict(x_6_test)))
lr.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, lr.predict(x_all_test)))
Accuracy with x_6  : 0.842519685039
Accuracy with x_all: 0.858267716535
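
To get a rough sense of which predictors drive the fitted logistic regression, we can inspect its coefficients (a minimal sketch; lr was last fit on x_all, and the coefficients are on the original feature scales, so their magnitudes are not directly comparable across predictors):

# column order of x_all follows df.columns (CRIM01 excluded)
for name, coef in zip(df.columns[:-1], lr.coef_[0]):
    print('{:>8}: {:+.4f}'.format(name, coef))
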
lda = LinearDiscriminantAnalysis()
lda.fit(x_6_train, y_train)
print("Accuracy with x_6  :",accuracy_score(y_test, lda.predict(x_6_test)))
lda.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, lda.predict(x_all_test)))
Accuracy with x_6  : 0.818897637795
Accuracy with x_all: 0.834645669291
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_6_train, y_train)
print("Accuracy with x_6  :",accuracy_score(y_test, qda.predict(x_6_test)))
qda.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, qda.predict(x_all_test)))
Accuracy with x_6  : 0.874015748031
Accuracy with x_all: 0.905511811024
best_acc = 0
best_k = 0
# note: picking K by test-set accuracy is optimistic; cross-validation (see the end) is the proper fix
for K in range(1,101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_all_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_all_test))
    if acc > best_acc:
        best_acc, best_k = acc, K

print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))
Best accuracy = 0.9370 with K =   1
best_acc = 0
best_k = 0
for K in range(1,101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_6_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_6_test))
    if acc > best_acc:
        best_acc, best_k = acc, K

print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))
Best accuracy = 0.9370 with K =   1
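
A caveat: KNN is distance-based, so predictors on large scales (e.g. TAX, in the hundreds) can dominate the distance metric over small-scale ones (e.g. NOX, below 1). Below is a sketch of the same search with standardized features; whether standardization actually changes the result here would need to be checked:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_all_train)   # fit the scaling on the training set only
x_all_train_s = scaler.transform(x_all_train)
x_all_test_s = scaler.transform(x_all_test)

best_acc, best_k = 0, 0
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_all_train_s, y_train)
    acc = accuracy_score(y_test, knn.predict(x_all_test_s))
    if acc > best_acc:
        best_acc, best_k = acc, K

print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))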

We used two subsets of the predictors: all of them, and the six most correlated with the response. From the results above, including all the predictors improves test accuracy (except for KNN, where the two subsets tie), but only by about 1.5-3 percentage points. QDA has better test accuracy than LDA, suggesting a non-linear Bayes decision boundary. The best model was KNN with K = 1. For a more robust analysis we should perform cross-validation, which leads us to the next chapter :)
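
As a quick preview, a minimal sketch of what that could look like with cross_val_score (5-fold accuracy on the full data, reusing the models from above):

from sklearn.model_selection import cross_val_score

for name, model in [('LogReg', LogisticRegression()),
                    ('LDA', LinearDiscriminantAnalysis()),
                    ('QDA', QuadraticDiscriminantAnalysis()),
                    ('KNN-1', KNeighborsClassifier(n_neighbors=1))]:
    scores = cross_val_score(model, x_all, y, cv=5)   # default scoring for classifiers is accuracy
    print('{:6}: {:.4f} (+/- {:.4f})'.format(name, scores.mean(), scores.std()))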