Exercise 9.8

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

%matplotlib inline
df = pd.read_csv('../data/OJ.csv', index_col=0)
# Data overview
df.head()
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1 CH 237 1 1.75 1.99 0.00 0.0 0 0 0.500000 1.99 1.75 0.24 No 0.000000 0.000000 0.24 1
2 CH 239 1 1.75 1.99 0.00 0.3 0 1 0.600000 1.69 1.75 -0.06 No 0.150754 0.000000 0.24 1
3 CH 245 1 1.86 2.09 0.17 0.0 0 0 0.680000 2.09 1.69 0.40 No 0.000000 0.091398 0.23 1
4 MM 227 1 1.69 1.69 0.00 0.0 0 0 0.400000 1.69 1.69 0.00 No 0.000000 0.000000 0.00 1
5 CH 228 7 1.69 1.69 0.00 0.0 0 0 0.956535 1.69 1.69 0.00 Yes 0.000000 0.000000 0.00 0
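The response Purchase records whether the customer bought Citrus Hill (CH) or Minute Maid (MM) orange juice; the remaining 17 columns describe the customer, the store, and the prices and discounts on offer.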
# Define predictors and response 
X = df.drop(axis=1, labels=['Purchase'])
y = df['Purchase']
# Create dummy variables to encode the qualitative predictors (e.g. Store7) numerically
X = pd.get_dummies(X)

(a)

# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=800, random_state=1)
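This places 800 of the 1,070 observations in the training set and leaves the remaining 270 as the test set.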

(b)

# Fit SVC to data
svc = SVC(C=0.01, kernel='linear', random_state=1)
svc.fit(X_train, y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)
# Number of support vectors for each class
svc.n_support_
array([307, 304])

Our training set has 800 observations and 2 classes, and the fitted classifier uses a total of 611 support vectors. Of these, 307 belong to class CH and 304 to class MM.
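As a quick sanity check (not required by the exercise), the fitted estimator also exposes the support vectors themselves:

# The rows of support_vectors_ are the support vectors; their count matches n_support_
print(svc.support_vectors_.shape)  # (611, number of dummy-encoded predictors)
print(svc.n_support_.sum())        # 611 = 307 + 304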

(c)

The error rate (ERR) is the number of incorrect predictions divided by the total number of observations in the dataset. The best possible error rate is 0.0 and the worst is 1.0. It can be read directly off the confusion matrix.

Source: https://classeval.wordpress.com/introduction/basic-evaluation-measures/
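In symbols: ERR = (FP + FN) / (TP + TN + FP + FN).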

# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, svc.predict(X_test)))
Train confusion matrix:  [[489   7]
 [241  63]]
Test confusion matrix:  [[150   7]
 [ 90  23]]

Following scikit-learn's convention, the count of true negatives is C_{0,0}, false negatives is C_{1,0}, true positives is C_{1,1}, and false positives is C_{0,1}.

# Error rate
train_err = (7+241)/(489+7+241+63)
test_err = (7+90)/(150+7+90+23)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.31
Test error rate:  0.3592592592592593
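The same quantities can be computed without copying counts by hand; here is a small sketch that unpacks the confusion matrix with ravel() (equivalently, 1 - svc.score(X_test, y_test)):

# tn, fp, fn, tp follow the C_{i,j} layout described above
tn, fp, fn, tp = confusion_matrix(y_test, svc.predict(X_test)).ravel()
print('Test error rate: ', (fp + fn) / (tn + fp + fn + tp))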

(d)

Since selecting an optimal cost is a hyperparameter-tuning problem, we use GridSearchCV.

# Hypertune cost using GridSearchCV
svc = SVC(kernel='linear', random_state=1)

parameters = {'C':np.arange(0.01, 10, 2)}

clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.01,  2.01,  4.01,  6.01,  8.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
# Best value for cost
clf.best_params_
{'C': 2.0099999999999998}
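Before accepting this value, it is worth checking how the cross-validated accuracy varies across the grid; a short sketch reading the cv_results_ attribute populated by the fit above:

# Mean cross-validated accuracy for each candidate cost
for c, score in zip(clf.cv_results_['param_C'], clf.cv_results_['mean_test_score']):
    print('C = %.2f: CV accuracy = %.3f' % (c, score))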

(e)

# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, clf.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, clf.predict(X_test)))
Train confusion matrix:  [[436  60]
 [ 72 232]]
Test confusion matrix:  [[143  14]
 [ 30  83]]
# Error rate
train_err = (60+72)/(436+60+72+232)
test_err = (14+30)/(143+14+30+83)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.165
Test error rate:  0.16296296296296298
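Relative to the classifier with cost 0.01 in (c), tuning the cost lowers the test error rate from about 0.359 to about 0.163.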

(f)

# Fit SVC to data
svc = SVC(C=0.01, kernel='rbf', random_state=1)
svc.fit(X_train, y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)
# Number of support vectors for each class
svc.n_support_
array([321, 304])
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, svc.predict(X_test)))
Train confusion matrix:  [[496   0]
 [304   0]]
Test confusion matrix:  [[157   0]
 [113   0]]
# Error rate
train_err = (0+304)/(496+0+304+0)
test_err = (0+113)/(157+0+113+0)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.38
Test error rate:  0.4185185185185185
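With a cost this small, the radial-kernel classifier is so heavily regularized that it predicts CH for every observation: the train error of 0.38 is exactly the share of MM purchases in the training set (304/800), and the test error of 0.419 is the MM share of the test set (113/270).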
# Hypertune cost using GridSearchCV
svc = SVC(kernel='rbf', random_state=1)

parameters = {'C':np.arange(0.01, 10, 2)}

clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.01,  2.01,  4.01,  6.01,  8.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
# Best value for cost
clf.best_params_
{'C': 4.0099999999999998}
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, clf.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, clf.predict(X_test)))
Train confusion matrix:  [[450  46]
 [ 94 210]]
Test confusion matrix:  [[145  12]
 [ 39  74]]
# Error rate
train_err = (46+94)/(450+46+94+210)
test_err = (12+39)/(145+12+39+74)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.175
Test error rate:  0.18888888888888888
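As with the linear kernel, tuning helps considerably: the test error rate falls from about 0.419 at cost 0.01 to about 0.189.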

(g)

# Fit SVC to data
svc = SVC(C=0.01, kernel='poly', degree=2, random_state=1)
svc.fit(X_train, y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)
# Number of support vectors for each class
svc.n_support_
array([164, 164])
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, svc.predict(X_test)))
Train confusion matrix:  [[435  61]
 [ 70 234]]
Test confusion matrix:  [[140  17]
 [ 30  83]]
# Error rate
train_err = (61+70)/(435+61+70+234)
test_err = (17+30)/(140+17+30+83)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.16375
Test error rate:  0.17407407407407408
# Hypertune cost using GridSearchCV
svc = SVC(kernel='poly', degree=2, random_state=1)

parameters = {'C':np.arange(0.01, 10, 2)}

clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
# Best value for cost
clf.best_params_
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, clf.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, clf.predict(X_test)))
# Error rate, computed from the confusion matrices to avoid hard-coding counts
cm_train = confusion_matrix(y_train, clf.predict(X_train))
cm_test = confusion_matrix(y_test, clf.predict(X_test))
train_err = (cm_train[0, 1] + cm_train[1, 0]) / cm_train.sum()
test_err = (cm_test[0, 1] + cm_test[1, 0]) / cm_test.sum()

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)

(h)
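Collecting the error rates computed above:

Kernel             Cost    Train error    Test error
Linear             0.01    0.310          0.359
Linear (tuned)     2.01    0.165          0.163
Radial             0.01    0.380          0.419
Radial (tuned)     4.01    0.175          0.189
Polynomial (d=2)   0.01    0.164          0.174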

Overall, the approach that seems to give the best results on this data is the support vector classifier with a linear kernel and a tuned cost (C ≈ 2.01), which attains the lowest test error rate among the models we evaluated (about 0.163).