Exercise 4.11

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

try:
    # train_test_split lives in model_selection since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older releases; this module was removed in 0.20.
    from sklearn.cross_validation import train_test_split

%matplotlib inline

/Users/disciplina/anaconda3/envs/islp/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

# Load the Auto data set (path is relative to this notebook) and preview the
# first rows.
auto_csv = '../data/Auto.csv'
df = pd.read_csv(auto_csv)

df.head()

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18.0	8	307.0	130	3504	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165	3693	11.5	70	1	buick skylark 320
2	18.0	8	318.0	150	3436	11.0	70	1	plymouth satellite
3	16.0	8	304.0	150	3433	12.0	70	1	amc rebel sst
4	17.0	8	302.0	140	3449	10.5	70	1	ford torino

(a)

# Binary response: 1 when a car's mpg is strictly above the sample median,
# 0 otherwise.
mpg_median = df['mpg'].median()
above_median = df['mpg'] > mpg_median
df['mpg01'] = np.where(above_median, 1, 0)

df.head()

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18.0	8	307.0	130	3504	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165	3693	11.5	70	1	buick skylark 320
2	18.0	8	318.0	150	3436	11.0	70	1	plymouth satellite
3	16.0	8	304.0	150	3433	12.0	70	1	amc rebel sst
4	17.0	8	302.0	140	3449	10.5	70	1	ford torino

# Remove the raw mpg column from the working frame; only the binary mpg01
# column is kept from here on.
df = df.drop(labels=['mpg'], axis='columns')
df.head()

	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	8	307.0	130	3504	12.0	70	1	chevrolet chevelle malibu
1	8	350.0	165	3693	11.5	70	1	buick skylark 320
2	8	318.0	150	3436	11.0	70	1	plymouth satellite
3	8	304.0	150	3433	12.0	70	1	amc rebel sst
4	8	302.0	140	3449	10.5	70	1	ford torino

(b)

# Pairwise overview of the columns: scatter plots above the diagonal,
# histograms on the diagonal and density contours below it.
# The per-facet `size` keyword is intentionally not passed: seaborn renamed it
# to `height` in 0.9 (the old name is deprecated), and the figure is resized
# explicitly to 12x12 inches on the last line, which fixes the final size.
g = sns.PairGrid(df)
g.map_upper(plt.scatter, s=3)
g.map_diag(plt.hist)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.fig.set_size_inches(12, 12)

/Users/disciplina/anaconda3/envs/islp/lib/python3.6/site-packages/matplotlib/contour.py:967: UserWarning: The following kwargs were not used by contour: 'label', 'color'
  s)

png

# Correlation matrix of the numeric columns only. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# silently skips non-numeric columns (such as `name`) and raises instead.
df.select_dtypes(include=[np.number]).corr()

	cylinders	displacement	weight	acceleration	year	origin	mpg01
cylinders	1.000000	0.950920	0.897017	-0.504061	-0.346717	-0.564972	-0.740327
displacement	0.950920	1.000000	0.933104	-0.544162	-0.369804	-0.610664	-0.738607
weight	0.897017	0.933104	1.000000	-0.419502	-0.307900	-0.581265	-0.745734
acceleration	-0.504061	-0.544162	-0.419502	1.000000	0.282901	0.210084	0.322629
year	-0.346717	-0.369804	-0.307900	0.282901	1.000000	0.184314	0.454108
origin	-0.564972	-0.610664	-0.581265	0.210084	0.184314	1.000000	0.511393
mpg01	-0.740327	-0.738607	-0.745734	0.322629	0.454108	0.511393	1.000000

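From the scatterplots and the last line of the correlation matrix above, we see that several variables are strongly (negatively) correlated with mpg01, especially cylinders, displacement and weight. Note that horsepower does not appear in the correlation matrix, presumably because it was not read as a numeric column. We take cylinders, displacement and weight as the variables most associated with mpg01 for the rest of the exercise.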

(c)

# Predictors chosen in part (b) and the binary response, as NumPy arrays.
predictors = ['cylinders', 'displacement', 'weight']
x, y = df[predictors].values, df['mpg01'].values

# Hold out a test set; 0.25 is train_test_split's default test fraction,
# spelled out here, and the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)

(d)

# Fit LDA on the training split and report its accuracy on the test split.
lda = LinearDiscriminantAnalysis().fit(x_train, y_train)
lda_pred = lda.predict(x_test)
accuracy_score(y_test, lda_pred)

0.89000000000000001

(e)

# Fit QDA on the training split and report its accuracy on the test split.
qda = QuadraticDiscriminantAnalysis().fit(x_train, y_train)
qda_pred = qda.predict(x_test)
accuracy_score(y_test, qda_pred)

0.88

(f)

# Fit logistic regression (library defaults) on the training split and report
# its accuracy on the test split.
lr = LogisticRegression().fit(x_train, y_train)
lr_pred = lr.predict(x_test)
accuracy_score(y_test, lr_pred)

0.87

(g)

# Sweep the number of neighbours from 1 to 100, fitting on the training split
# and printing the test-set accuracy for each K.
report_line = 'K = {:3}, accuracy = {:.4f}'
for K in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=K).fit(x_train, y_train)
    acc = accuracy_score(y_test, knn.predict(x_test))
    print(report_line.format(K, acc))

K =   1, accuracy = 0.9000
K =   2, accuracy = 0.8800
K =   3, accuracy = 0.9100
K =   4, accuracy = 0.8900
K =   5, accuracy = 0.8700
K =   6, accuracy = 0.8900
K =   7, accuracy = 0.9000
K =   8, accuracy = 0.9100
K =   9, accuracy = 0.8900
K =  10, accuracy = 0.8900
K =  11, accuracy = 0.8900
K =  12, accuracy = 0.8800
K =  13, accuracy = 0.8900
K =  14, accuracy = 0.8900
K =  15, accuracy = 0.8900
K =  16, accuracy = 0.8900
K =  17, accuracy = 0.8700
K =  18, accuracy = 0.8900
K =  19, accuracy = 0.8800
K =  20, accuracy = 0.8900
K =  21, accuracy = 0.8900
K =  22, accuracy = 0.8900
K =  23, accuracy = 0.8800
K =  24, accuracy = 0.8800
K =  25, accuracy = 0.8800
K =  26, accuracy = 0.8800
K =  27, accuracy = 0.8800
K =  28, accuracy = 0.8800
K =  29, accuracy = 0.8800
K =  30, accuracy = 0.8800
K =  31, accuracy = 0.8800
K =  32, accuracy = 0.8900
K =  33, accuracy = 0.8900
K =  34, accuracy = 0.8900
K =  35, accuracy = 0.8700
K =  36, accuracy = 0.8800
K =  37, accuracy = 0.8700
K =  38, accuracy = 0.8800
K =  39, accuracy = 0.8800
K =  40, accuracy = 0.8800
K =  41, accuracy = 0.8800
K =  42, accuracy = 0.8800
K =  43, accuracy = 0.8800
K =  44, accuracy = 0.8800
K =  45, accuracy = 0.8800
K =  46, accuracy = 0.8800
K =  47, accuracy = 0.8700
K =  48, accuracy = 0.8800
K =  49, accuracy = 0.8700
K =  50, accuracy = 0.8700
K =  51, accuracy = 0.8700
K =  52, accuracy = 0.8700
K =  53, accuracy = 0.8700
K =  54, accuracy = 0.8700
K =  55, accuracy = 0.8700
K =  56, accuracy = 0.8700
K =  57, accuracy = 0.8700
K =  58, accuracy = 0.8700
K =  59, accuracy = 0.8700
K =  60, accuracy = 0.8700
K =  61, accuracy = 0.8700
K =  62, accuracy = 0.8700
K =  63, accuracy = 0.8700
K =  64, accuracy = 0.8700
K =  65, accuracy = 0.8700
K =  66, accuracy = 0.8700
K =  67, accuracy = 0.8700
K =  68, accuracy = 0.8700
K =  69, accuracy = 0.8700
K =  70, accuracy = 0.8700
K =  71, accuracy = 0.8700
K =  72, accuracy = 0.8700
K =  73, accuracy = 0.8700
K =  74, accuracy = 0.8800
K =  75, accuracy = 0.8700
K =  76, accuracy = 0.8800
K =  77, accuracy = 0.8700
K =  78, accuracy = 0.8800
K =  79, accuracy = 0.8800
K =  80, accuracy = 0.8800
K =  81, accuracy = 0.8800
K =  82, accuracy = 0.8800
K =  83, accuracy = 0.8800
K =  84, accuracy = 0.8800
K =  85, accuracy = 0.8800
K =  86, accuracy = 0.8800
K =  87, accuracy = 0.8700
K =  88, accuracy = 0.8800
K =  89, accuracy = 0.8800
K =  90, accuracy = 0.8800
K =  91, accuracy = 0.8800
K =  92, accuracy = 0.8900
K =  93, accuracy = 0.8800
K =  94, accuracy = 0.8800
K =  95, accuracy = 0.8800
K =  96, accuracy = 0.8900
K =  97, accuracy = 0.8800
K =  98, accuracy = 0.8800
K =  99, accuracy = 0.8800
K = 100, accuracy = 0.8800

The results above seem to indicate that the best values of K are somewhere between 1 and 22, although the difference in accuracy between different values of K is not very large.