Exercise 4.13

This is similar to exercise 4.11. We define a new binary variable, CRIM01, with the value 1 if the crime rate is above the median and 0 otherwise.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score

from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2, so this needs an older version

%matplotlib inline
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
df['CRIM01'] = np.where(df['CRIM'] > df['CRIM'].median(), 1, 0)
df.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM01
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 0
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 0
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 0
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 0
df = df.drop('CRIM', axis=1)
df.head()
ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM01
0 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 0
1 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 0
2 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 0
3 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 0
4 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 0

As in 4.11, we try to assess which variables are most associated with 'CRIM01' by looking at the correlations between the predictors and the response.

df.corr()
ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM01
ZN 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 -0.436151
INDUS -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 0.603260
CHAS -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.070097
NOX -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 0.723235
RM 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 -0.156372
AGE -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 0.613940
DIS 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 -0.616342
RAD -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 0.619786
TAX -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 0.608741
PTRATIO -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 0.253568
B 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 -0.351211
LSTAT -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 0.453263
CRIM01 -0.436151 0.603260 0.070097 0.723235 -0.156372 0.613940 -0.616342 0.619786 0.608741 0.253568 -0.351211 0.453263 1.000000
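
Since seaborn is already imported, a heatmap makes the correlation matrix easier to scan at a glance (a quick sketch; the figure size and colormap are arbitrary choices):

plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')  # annotate each cell with its correlation
plt.show()
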
x_all = df.iloc[:,:-1].values   # all 12 predictors
x_6 = df[['INDUS','NOX','AGE','DIS','RAD','TAX']].values   # the six with |correlation| > 0.6 with CRIM01
y = df['CRIM01'].values

# the same random_state gives identical row splits, so y_train/y_test are consistent across both calls
x_all_train, x_all_test, y_train, y_test = train_test_split(x_all, y, random_state=1)
x_6_train, x_6_test, y_train, y_test = train_test_split(x_6, y, random_state=1)
lr = LogisticRegression()
lr.fit(x_6_train, y_train)
print("Accuracy with x_6  :",accuracy_score(y_test, lr.predict(x_6_test)))
lr.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, lr.predict(x_all_test)))
Accuracy with x_6  : 0.842519685039
Accuracy with x_all: 0.858267716535
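
To get a rough sense of which predictors drive the fitted logistic regression, we can inspect its coefficients (a minimal sketch; lr was last fit on x_all, and the coefficients are on the original feature scales, so their magnitudes are not directly comparable across predictors):

# column order of x_all follows df.columns (CRIM01 excluded)
for name, coef in zip(df.columns[:-1], lr.coef_[0]):
    print('{:>8}: {:+.4f}'.format(name, coef))
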
lda = LinearDiscriminantAnalysis()
lda.fit(x_6_train, y_train)
print("Accuracy with x_6  :",accuracy_score(y_test, lda.predict(x_6_test)))
lda.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, lda.predict(x_all_test)))
Accuracy with x_6  : 0.818897637795
Accuracy with x_all: 0.834645669291
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_6_train, y_train)
print("Accuracy with x_6  :",accuracy_score(y_test, qda.predict(x_6_test)))
qda.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, qda.predict(x_all_test)))
Accuracy with x_6  : 0.874015748031
Accuracy with x_all: 0.905511811024
best_acc = 0
best_k = 0
# note: picking K by test-set accuracy is optimistic; cross-validation (see the end) is the proper fix
for K in range(1,101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_all_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_all_test))
    if acc > best_acc:
        best_acc, best_k = acc, K

print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))
Best accuracy = 0.9370 with K =   1
best_acc = 0
best_k = 0
for K in range(1,101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_6_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_6_test))
    if acc > best_acc:
        best_acc, best_k = acc, K

print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))
Best accuracy = 0.9370 with K =   1
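
A caveat: KNN is distance-based, so predictors on large scales (e.g. TAX, in the hundreds) can dominate the distance metric over small-scale ones (e.g. NOX, below 1). Below is a sketch of the same search with standardized features; whether standardization actually changes the result here would need to be checked:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_all_train)   # fit the scaling on the training set only
x_all_train_s = scaler.transform(x_all_train)
x_all_test_s = scaler.transform(x_all_test)

best_acc, best_k = 0, 0
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_all_train_s, y_train)
    acc = accuracy_score(y_test, knn.predict(x_all_test_s))
    if acc > best_acc:
        best_acc, best_k = acc, K

print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))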

We used two subsets of the predictors: all of them, and the six most correlated with the response. From the results above, including all the predictors improves test accuracy (except for KNN, where the two subsets tie), but only by about 1.5-3 percentage points. QDA has better test accuracy than LDA, suggesting a non-linear Bayes decision boundary. The best model was KNN with K = 1. For a more robust analysis we should perform cross-validation, which leads us to the next chapter :)
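
As a quick preview, a minimal sketch of what that could look like with cross_val_score (5-fold accuracy on the full data, reusing the models from above):

from sklearn.model_selection import cross_val_score

for name, model in [('LogReg', LogisticRegression()),
                    ('LDA', LinearDiscriminantAnalysis()),
                    ('QDA', QuadraticDiscriminantAnalysis()),
                    ('KNN-1', KNeighborsClassifier(n_neighbors=1))]:
    scores = cross_val_score(model, x_all, y, cv=5)   # default scoring for classifiers is accuracy
    print('{:6}: {:.4f} (+/- {:.4f})'.format(name, scores.mean(), scores.std()))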