Exercise 4.13
This is similar to exercise 4.11. We define a new variable, CRIM01, equal to 1 if the crime rate is above the median and 0 otherwise (implemented below with np.where).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_boston
%matplotlib inline
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.head()
|   | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|---|------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
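(Side note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the import above fails on recent releases. A minimal sketch of an alternative loader, assuming the OpenML copy named "boston" exposes the same column names; this is an assumption, not something the exercise relies on.)

from sklearn.datasets import fetch_openml

# Assumption: the OpenML dataset "boston" (version 1) mirrors the original feature columns
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.data.astype(float)  # some columns (e.g. CHAS, RAD) may arrive as categorical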
df['CRIM01'] = np.where(df['CRIM'] > df['CRIM'].median(), 1, 0)
df.head()
|   | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | CRIM01 |
|---|------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|--------|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 0 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 0 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 0 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 0 |
df = df.drop('CRIM', axis=1)
df.head()
|   | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | CRIM01 |
|---|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|--------|
| 0 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 0 |
| 1 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 0 |
| 2 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 0 |
| 3 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 0 |
| 4 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 0 |
As in 4.11, we assess which variables are most strongly associated with 'CRIM01' by looking at its correlations with the other variables (see the short ranking sketch after the correlation matrix).
df.corr()
|         | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | CRIM01 |
|---------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|--------|
| ZN      | 1.000000 | -0.533828 | -0.042697 | -0.516604 | 0.311991 | -0.569537 | 0.664408 | -0.311948 | -0.314563 | -0.391679 | 0.175520 | -0.412995 | -0.436151 |
| INDUS   | -0.533828 | 1.000000 | 0.062938 | 0.763651 | -0.391676 | 0.644779 | -0.708027 | 0.595129 | 0.720760 | 0.383248 | -0.356977 | 0.603800 | 0.603260 |
| CHAS    | -0.042697 | 0.062938 | 1.000000 | 0.091203 | 0.091251 | 0.086518 | -0.099176 | -0.007368 | -0.035587 | -0.121515 | 0.048788 | -0.053929 | 0.070097 |
| NOX     | -0.516604 | 0.763651 | 0.091203 | 1.000000 | -0.302188 | 0.731470 | -0.769230 | 0.611441 | 0.668023 | 0.188933 | -0.380051 | 0.590879 | 0.723235 |
| RM      | 0.311991 | -0.391676 | 0.091251 | -0.302188 | 1.000000 | -0.240265 | 0.205246 | -0.209847 | -0.292048 | -0.355501 | 0.128069 | -0.613808 | -0.156372 |
| AGE     | -0.569537 | 0.644779 | 0.086518 | 0.731470 | -0.240265 | 1.000000 | -0.747881 | 0.456022 | 0.506456 | 0.261515 | -0.273534 | 0.602339 | 0.613940 |
| DIS     | 0.664408 | -0.708027 | -0.099176 | -0.769230 | 0.205246 | -0.747881 | 1.000000 | -0.494588 | -0.534432 | -0.232471 | 0.291512 | -0.496996 | -0.616342 |
| RAD     | -0.311948 | 0.595129 | -0.007368 | 0.611441 | -0.209847 | 0.456022 | -0.494588 | 1.000000 | 0.910228 | 0.464741 | -0.444413 | 0.488676 | 0.619786 |
| TAX     | -0.314563 | 0.720760 | -0.035587 | 0.668023 | -0.292048 | 0.506456 | -0.534432 | 0.910228 | 1.000000 | 0.460853 | -0.441808 | 0.543993 | 0.608741 |
| PTRATIO | -0.391679 | 0.383248 | -0.121515 | 0.188933 | -0.355501 | 0.261515 | -0.232471 | 0.464741 | 0.460853 | 1.000000 | -0.177383 | 0.374044 | 0.253568 |
| B       | 0.175520 | -0.356977 | 0.048788 | -0.380051 | 0.128069 | -0.273534 | 0.291512 | -0.444413 | -0.441808 | -0.177383 | 1.000000 | -0.366087 | -0.351211 |
| LSTAT   | -0.412995 | 0.603800 | -0.053929 | 0.590879 | -0.613808 | 0.602339 | -0.496996 | 0.488676 | 0.543993 | 0.374044 | -0.366087 | 1.000000 | 0.453263 |
| CRIM01  | -0.436151 | 0.603260 | 0.070097 | 0.723235 | -0.156372 | 0.613940 | -0.616342 | 0.619786 | 0.608741 | 0.253568 | -0.351211 | 0.453263 | 1.000000 |
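A compact way to pick predictors is to rank the CRIM01 column of this matrix by absolute value; the six features used below are exactly the ones whose |correlation| with CRIM01 exceeds 0.6. A short sketch (the 0.6 cutoff is just an illustrative choice):

# Rank the predictors by the absolute value of their correlation with CRIM01
corr_with_target = df.corr()['CRIM01'].drop('CRIM01').abs().sort_values(ascending=False)
print(corr_with_target)
# Predictors above an (arbitrary) 0.6 cutoff: INDUS, NOX, AGE, DIS, RAD, TAX
print(corr_with_target[corr_with_target > 0.6].index.tolist())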
x_all = df.iloc[:,:-1].values
x_6 = df[['INDUS','NOX','AGE','DIS','RAD','TAX']].values
y = df['CRIM01'].values
x_all_train, x_all_test, y_train, y_test = train_test_split(x_all, y, random_state=1)
x_6_train, x_6_test, y_train, y_test = train_test_split(x_6, y, random_state=1)
lr = LogisticRegression()
lr.fit(x_6_train, y_train)
print("Accuracy with x_6 :",accuracy_score(y_test, lr.predict(x_6_test)))
lr.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, lr.predict(x_all_test)))
Accuracy with x_6 : 0.842519685039
Accuracy with x_all: 0.858267716535
lda = LinearDiscriminantAnalysis()
lda.fit(x_6_train, y_train)
print("Accuracy with x_6 :",accuracy_score(y_test, lda.predict(x_6_test)))
lda.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, lda.predict(x_all_test)))
Accuracy with x_6 : 0.818897637795
Accuracy with x_all: 0.834645669291
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_6_train, y_train)
print("Accuracy with x_6 :",accuracy_score(y_test, qda.predict(x_6_test)))
qda.fit(x_all_train, y_train)
print("Accuracy with x_all:",accuracy_score(y_test, qda.predict(x_all_test)))
Accuracy with x_6 : 0.874015748031
Accuracy with x_all: 0.905511811024
best_acc = 0
best_k = 0
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_all_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_all_test))
    if acc > best_acc:
        best_acc, best_k = acc, K
print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))
Best accuracy = 0.9370 with K = 1
best_acc = 0
best_k = 0
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_6_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_6_test))
    if acc > best_acc:
        best_acc, best_k = acc, K
print('Best accuracy = {:.4f} with K = {:3}'.format(best_acc, best_k))
Best accuracy = 0.9370 with K = 1
We used two subsets of the predictors: all of them and the 6 most strongly correlated with the response. From the results above, we can see that including all the predictors improves the test accuracy (except for KNN, where it made no difference), but only modestly (about 3 percentage points at most, for QDA). QDA outperforms LDA on the test set, suggesting a non-linear Bayes decision boundary. The best model was KNN with K = 1. For a more robust analysis we should use cross-validation, which leads us to the next chapter :)
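As a teaser, here is a minimal sketch of what such a comparison could look like with cross_val_score, reusing x_all and y from above (the 10-fold setup and max_iter value are illustrative choices, not part of the original exercise):

from sklearn.model_selection import cross_val_score

# Compare the four classifiers on all predictors using 10-fold cross-validation
models = [('LogReg', LogisticRegression(max_iter=1000)),
          ('LDA', LinearDiscriminantAnalysis()),
          ('QDA', QuadraticDiscriminantAnalysis()),
          ('KNN K=1', KNeighborsClassifier(n_neighbors=1))]
for name, model in models:
    scores = cross_val_score(model, x_all, y, cv=10)
    print('{:8} mean CV accuracy = {:.4f}'.format(name, scores.mean()))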