Exercise 9.8

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

%matplotlib inline
df = pd.read_csv('../data/OJ.csv', index_col=0)
# Data overview
df.head()
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1 CH 237 1 1.75 1.99 0.00 0.0 0 0 0.500000 1.99 1.75 0.24 No 0.000000 0.000000 0.24 1
2 CH 239 1 1.75 1.99 0.00 0.3 0 1 0.600000 1.69 1.75 -0.06 No 0.150754 0.000000 0.24 1
3 CH 245 1 1.86 2.09 0.17 0.0 0 0 0.680000 2.09 1.69 0.40 No 0.000000 0.091398 0.23 1
4 MM 227 1 1.69 1.69 0.00 0.0 0 0 0.400000 1.69 1.69 0.00 No 0.000000 0.000000 0.00 1
5 CH 228 7 1.69 1.69 0.00 0.0 0 0 0.956535 1.69 1.69 0.00 Yes 0.000000 0.000000 0.00 0
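The response Purchase records whether the customer bought Citrus Hill (CH) or Minute Maid (MM) orange juice; the remaining 17 columns describe the customer, the store, and the prices and discounts on offer.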
# Define predictors and response 
X = df.drop(axis=1, labels=['Purchase'])
y = df['Purchase']
# Create dummy variables to encode the qualitative predictors (e.g. Store7) numerically
X = pd.get_dummies(X)

(a)

# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=800, random_state=1)
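This places 800 of the 1,070 observations in the training set and leaves the remaining 270 as the test set.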

(b)

# Fit SVC to data
svc = SVC(C=0.01, kernel='linear', random_state=1)
svc.fit(X_train, y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)
# Number of support vectors for each class
svc.n_support_
array([307, 304])

Our training set has 800 observations and 2 classes, and the fitted classifier uses a total of 611 support vectors. Of these, 307 belong to class CH and 304 to class MM.
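As a quick sanity check (not required by the exercise), the fitted estimator also exposes the support vectors themselves:

# The rows of support_vectors_ are the support vectors; their count matches n_support_
print(svc.support_vectors_.shape)  # (611, number of dummy-encoded predictors)
print(svc.n_support_.sum())        # 611 = 307 + 304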

(c)

The error rate (ERR) is the number of incorrect predictions divided by the total number of observations in the dataset. The best possible error rate is 0.0 and the worst is 1.0. It can be read directly off the confusion matrix.

Source: https://classeval.wordpress.com/introduction/basic-evaluation-measures/
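In symbols: ERR = (FP + FN) / (TP + TN + FP + FN).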

# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, svc.predict(X_test)))
Train confusion matrix:  [[489   7]
 [241  63]]
Test confusion matrix:  [[150   7]
 [ 90  23]]

Following scikit-learn's convention, the count of true negatives is C_{0,0}, false negatives is C_{1,0}, true positives is C_{1,1}, and false positives is C_{0,1}.

# Error rate
train_err = (7+241)/(489+7+241+63)
test_err = (7+90)/(150+7+90+23)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.31
Test error rate:  0.3592592592592593
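The same quantities can be computed without copying counts by hand; here is a small sketch that unpacks the confusion matrix with ravel() (equivalently, 1 - svc.score(X_test, y_test)):

# tn, fp, fn, tp follow the C_{i,j} layout described above
tn, fp, fn, tp = confusion_matrix(y_test, svc.predict(X_test)).ravel()
print('Test error rate: ', (fp + fn) / (tn + fp + fn + tp))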

(d)

Since selecting an optimal cost is a hyperparameter-tuning problem, we use GridSearchCV.

# Hypertune cost using GridSearchCV
svc = SVC(kernel='linear', random_state=1)

parameters = {'C':np.arange(0.01, 10, 2)}

clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.01,  2.01,  4.01,  6.01,  8.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
# Best value for cost
clf.best_params_
{'C': 2.0099999999999998}
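Before accepting this value, it is worth checking how the cross-validated accuracy varies across the grid; a short sketch reading the cv_results_ attribute populated by the fit above:

# Mean cross-validated accuracy for each candidate cost
for c, score in zip(clf.cv_results_['param_C'], clf.cv_results_['mean_test_score']):
    print('C = %.2f: CV accuracy = %.3f' % (c, score))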

(e)

# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, clf.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, clf.predict(X_test)))
Train confusion matrix:  [[436  60]
 [ 72 232]]
Test confusion matrix:  [[143  14]
 [ 30  83]]
# Error rate
train_err = (60+72)/(436+60+72+232)
test_err = (14+30)/(143+14+30+83)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.165
Test error rate:  0.16296296296296298
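Relative to the classifier with cost 0.01 in (c), tuning the cost lowers the test error rate from about 0.359 to about 0.163.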

(f)

# Fit SVC to data
svc = SVC(C=0.01, kernel='rbf', random_state=1)
svc.fit(X_train, y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)
# Number of support vectors for each class
svc.n_support_
array([321, 304])
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, svc.predict(X_test)))
Train confusion matrix:  [[496   0]
 [304   0]]
Test confusion matrix:  [[157   0]
 [113   0]]
# Error rate
train_err = (0+304)/(496+0+304+0)
test_err = (0+113)/(157+0+113+0)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.38
Test error rate:  0.4185185185185185
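With a cost this small, the radial-kernel classifier is so heavily regularized that it predicts CH for every observation: the train error of 0.38 is exactly the share of MM purchases in the training set (304/800), and the test error of 0.419 is the MM share of the test set (113/270).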
# Hypertune cost using GridSearchCV
svc = SVC(kernel='rbf', random_state=1)

parameters = {'C':np.arange(0.01, 10, 2)}

clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.01,  2.01,  4.01,  6.01,  8.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
# Best value for cost
clf.best_params_
{'C': 4.0099999999999998}
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, clf.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, clf.predict(X_test)))
Train confusion matrix:  [[450  46]
 [ 94 210]]
Test confusion matrix:  [[145  12]
 [ 39  74]]
# Error rate
train_err = (46+94)/(450+46+94+210)
test_err = (12+39)/(145+12+39+74)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.175
Test error rate:  0.18888888888888888
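As with the linear kernel, tuning helps considerably: the test error rate falls from about 0.419 at cost 0.01 to about 0.189.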

(g)

# Fit SVC to data
svc = SVC(C=0.01, kernel='poly', degree=2, random_state=1)
svc.fit(X_train, y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False)
# Number of support vectors for each class
svc.n_support_
array([164, 164])
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, svc.predict(X_test)))
Train confusion matrix:  [[435  61]
 [ 70 234]]
Test confusion matrix:  [[140  17]
 [ 30  83]]
# Error rate
train_err = (61+70)/(435+61+70+234)
test_err = (17+30)/(140+17+30+83)

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)
Train error rate:  0.16375
Test error rate:  0.17407407407407408
# Hypertune cost using GridSearchCV
svc = SVC(kernel='poly', degree=2, random_state=1)

parameters = {'C':np.arange(0.01, 10, 2)}

clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
# Best value for cost
clf.best_params_
# Confusion matrix
print('Train confusion matrix: ', confusion_matrix(y_train, clf.predict(X_train)))
print('Test confusion matrix: ', confusion_matrix(y_test, clf.predict(X_test)))
# Error rate, computed from the confusion matrices to avoid hard-coding counts
cm_train = confusion_matrix(y_train, clf.predict(X_train))
cm_test = confusion_matrix(y_test, clf.predict(X_test))
train_err = (cm_train[0, 1] + cm_train[1, 0]) / cm_train.sum()
test_err = (cm_test[0, 1] + cm_test[1, 0]) / cm_test.sum()

print('Train error rate: ', train_err)
print('Test error rate: ', test_err)

(h)
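Collecting the error rates computed above:

Kernel             Cost    Train error    Test error
Linear             0.01    0.310          0.359
Linear (tuned)     2.01    0.165          0.163
Radial             0.01    0.380          0.419
Radial (tuned)     4.01    0.175          0.189
Polynomial (d=2)   0.01    0.164          0.174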

Overall, the approach that seems to give the best results on this data is the support vector classifier with a linear kernel and a tuned cost (C ≈ 2.01), which attains the lowest test error rate among the models we evaluated (about 0.163).