Exercise 4.11
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
df = pd.read_csv('../data/Auto.csv')
df.head()
|   | mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name |
|---|-----|-----------|--------------|------------|--------|--------------|------|--------|------|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
(a)
df['mpg01']= np.where(df['mpg'] > df['mpg'].median(), 1, 0)
df.head()
|   | mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | mpg01 |
|---|-----|-----------|--------------|------------|--------|--------------|------|--------|------|-------|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu | 0 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite | 0 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst | 0 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino | 0 |
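Since mpg01 is defined by a strict comparison with the median, the two classes should be roughly balanced. A quick sanity check (a minimal sketch; output not shown, and the exact counts depend on ties at the median):
df['mpg01'].value_counts()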
df = df.drop('mpg', axis=1)
df.head()
|   | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | mpg01 |
|---|-----------|--------------|------------|--------|--------------|------|--------|------|-------|
| 0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu | 0 |
| 1 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 | 0 |
| 2 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite | 0 |
| 3 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst | 0 |
| 4 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino | 0 |
(b)
g = sns.PairGrid(df, height=2)  # 'height' replaces the deprecated 'size' argument in newer seaborn
g.map_upper(plt.scatter, s=3)
g.map_diag(plt.hist)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.fig.set_size_inches(12, 12)
df.corr()
|              | cylinders | displacement | weight | acceleration | year | origin | mpg01 |
|--------------|-----------|--------------|--------|--------------|------|--------|-------|
| cylinders    | 1.000000 | 0.950920 | 0.897017 | -0.504061 | -0.346717 | -0.564972 | -0.740327 |
| displacement | 0.950920 | 1.000000 | 0.933104 | -0.544162 | -0.369804 | -0.610664 | -0.738607 |
| weight       | 0.897017 | 0.933104 | 1.000000 | -0.419502 | -0.307900 | -0.581265 | -0.745734 |
| acceleration | -0.504061 | -0.544162 | -0.419502 | 1.000000 | 0.282901 | 0.210084 | 0.322629 |
| year         | -0.346717 | -0.369804 | -0.307900 | 0.282901 | 1.000000 | 0.184314 | 0.454108 |
| origin       | -0.564972 | -0.610664 | -0.581265 | 0.210084 | 0.184314 | 1.000000 | 0.511393 |
| mpg01        | -0.740327 | -0.738607 | -0.745734 | 0.322629 | 0.454108 | 0.511393 | 1.000000 |
From the scatterplots and the last row of the correlation matrix above, we see that several variables are strongly correlated with mpg01, especially cylinders, displacement, and weight. We take these three as the variables most associated with mpg01 for the rest of the exercise. (Note that horsepower is missing from the correlation matrix: Auto.csv marks missing horsepower values with '?', so pandas reads that column as non-numeric and df.corr() drops it.)
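A quick way to rank the candidate predictors is to sort the absolute correlations with mpg01 directly (a minimal sketch using only the objects already defined above; output not shown):
df.corr()['mpg01'].drop('mpg01').abs().sort_values(ascending=False)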
(c)
x = df[['cylinders', 'displacement', 'weight']].values
y = df['mpg01'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
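By default, train_test_split holds out 25% of the observations as the test set. A quick check of the resulting shapes (a sketch; output not shown):
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)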
(d)
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
accuracy_score(y_test, lda.predict(x_test))
0.89
(e)
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
accuracy_score(y_test, qda.predict(x_test))
0.88
(f)
lr = LogisticRegression()
lr.fit(x_train, y_train)
accuracy_score(y_test, lr.predict(x_test))
0.87
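The exercise asks for the test error, which is simply one minus the accuracy reported above; for any of the fitted models it can be computed directly, e.g.:
1 - accuracy_score(y_test, lr.predict(x_test))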
(g)
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_test))
    print('K = {:3}, accuracy = {:.4f}'.format(K, acc))
K = 1, accuracy = 0.9000
K = 2, accuracy = 0.8800
K = 3, accuracy = 0.9100
K = 4, accuracy = 0.8900
K = 5, accuracy = 0.8700
K = 6, accuracy = 0.8900
K = 7, accuracy = 0.9000
K = 8, accuracy = 0.9100
K = 9, accuracy = 0.8900
K = 10, accuracy = 0.8900
K = 11, accuracy = 0.8900
K = 12, accuracy = 0.8800
K = 13, accuracy = 0.8900
K = 14, accuracy = 0.8900
K = 15, accuracy = 0.8900
K = 16, accuracy = 0.8900
K = 17, accuracy = 0.8700
K = 18, accuracy = 0.8900
K = 19, accuracy = 0.8800
K = 20, accuracy = 0.8900
K = 21, accuracy = 0.8900
K = 22, accuracy = 0.8900
K = 23, accuracy = 0.8800
K = 24, accuracy = 0.8800
K = 25, accuracy = 0.8800
K = 26, accuracy = 0.8800
K = 27, accuracy = 0.8800
K = 28, accuracy = 0.8800
K = 29, accuracy = 0.8800
K = 30, accuracy = 0.8800
K = 31, accuracy = 0.8800
K = 32, accuracy = 0.8900
K = 33, accuracy = 0.8900
K = 34, accuracy = 0.8900
K = 35, accuracy = 0.8700
K = 36, accuracy = 0.8800
K = 37, accuracy = 0.8700
K = 38, accuracy = 0.8800
K = 39, accuracy = 0.8800
K = 40, accuracy = 0.8800
K = 41, accuracy = 0.8800
K = 42, accuracy = 0.8800
K = 43, accuracy = 0.8800
K = 44, accuracy = 0.8800
K = 45, accuracy = 0.8800
K = 46, accuracy = 0.8800
K = 47, accuracy = 0.8700
K = 48, accuracy = 0.8800
K = 49, accuracy = 0.8700
K = 50, accuracy = 0.8700
K = 51, accuracy = 0.8700
K = 52, accuracy = 0.8700
K = 53, accuracy = 0.8700
K = 54, accuracy = 0.8700
K = 55, accuracy = 0.8700
K = 56, accuracy = 0.8700
K = 57, accuracy = 0.8700
K = 58, accuracy = 0.8700
K = 59, accuracy = 0.8700
K = 60, accuracy = 0.8700
K = 61, accuracy = 0.8700
K = 62, accuracy = 0.8700
K = 63, accuracy = 0.8700
K = 64, accuracy = 0.8700
K = 65, accuracy = 0.8700
K = 66, accuracy = 0.8700
K = 67, accuracy = 0.8700
K = 68, accuracy = 0.8700
K = 69, accuracy = 0.8700
K = 70, accuracy = 0.8700
K = 71, accuracy = 0.8700
K = 72, accuracy = 0.8700
K = 73, accuracy = 0.8700
K = 74, accuracy = 0.8800
K = 75, accuracy = 0.8700
K = 76, accuracy = 0.8800
K = 77, accuracy = 0.8700
K = 78, accuracy = 0.8800
K = 79, accuracy = 0.8800
K = 80, accuracy = 0.8800
K = 81, accuracy = 0.8800
K = 82, accuracy = 0.8800
K = 83, accuracy = 0.8800
K = 84, accuracy = 0.8800
K = 85, accuracy = 0.8800
K = 86, accuracy = 0.8800
K = 87, accuracy = 0.8700
K = 88, accuracy = 0.8800
K = 89, accuracy = 0.8800
K = 90, accuracy = 0.8800
K = 91, accuracy = 0.8800
K = 92, accuracy = 0.8900
K = 93, accuracy = 0.8800
K = 94, accuracy = 0.8800
K = 95, accuracy = 0.8800
K = 96, accuracy = 0.8900
K = 97, accuracy = 0.8800
K = 98, accuracy = 0.8800
K = 99, accuracy = 0.8800
K = 100, accuracy = 0.8800
The results above seem to indicate that the best values of K lie somewhere between 1 and about 22, although the differences in accuracy across values of K are not very large.
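To avoid scanning the printout by eye, the same loop can store the accuracies and report the best K directly (a minimal sketch reusing the data and imports defined above):
accuracies = {}
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_train, y_train)
    accuracies[K] = accuracy_score(y_test, knn.predict(x_test))
best_K = max(accuracies, key=accuracies.get)
print('Best K = {}, accuracy = {:.4f}'.format(best_K, accuracies[best_K]))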