Exercise 7.10

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

%matplotlib inline
# Load the College data set; column 0 (the college name) becomes the index.
csv_path = '../data/College.csv'
df = pd.read_csv(csv_path, index_col=0)
df.head()
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
Abilene Christian University Yes 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
Adelphi University Yes 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
Adrian College Yes 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
Agnes Scott College Yes 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
Alaska Pacific University Yes 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15
# Dummy variables:
# 'Private' is the only categorical feature; one-hot encode it so the
# linear models can consume it (this adds Private_No / Private_Yes columns,
# and the feature indices used later assume both columns are present).
one_hot = pd.get_dummies(df)
df = one_hot
df.head()
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate Private_No Private_Yes
Abilene Christian University 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60 0.0 1.0
Adelphi University 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56 0.0 1.0
Adrian College 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54 0.0 1.0
Agnes Scott College 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59 0.0 1.0
Alaska Pacific University 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15 0.0 1.0

(a)

# Dataset: all columns except the response 'Outstate'.
# FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0 — use .loc.
X = df.loc[:, df.columns != 'Outstate']
y = df['Outstate']

# Split into train (70%) and test (30%) subsets; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)

# Forward stepwise selection with a plain linear regression as the estimator.
lr = LinearRegression()

sfs = SFS(lr,
          k_features=18,   # consider subsets up to all 18 features
          forward=True,    # forward (not backward) selection
          floating=False,  # plain stepwise, no floating add/remove
          scoring='r2',
          cv=0)            # cv=0: score on the training data, no cross-validation

# FIX: DataFrame.as_matrix() was removed in pandas 1.0 — use .to_numpy().
sfs = sfs.fit(X_train.to_numpy(), y_train)

# Plot R^2 against the number of selected features.
fig = plot_sfs(sfs.get_metric_dict())

plt.title('Sequential forward selection')
plt.grid()
plt.show()
c:\program files\anaconda3\lib\site-packages\numpy\core\_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)

png

We will choose 6 features: the figure shows that adding further features does not increase the R² score significantly.

# Tabulate the selection metrics: one row per subset size, with the
# average score and the chosen feature indices at that step.
metrics = sfs.get_metric_dict()
pd.DataFrame.from_dict(metrics).T
c:\program files\anaconda3\lib\site-packages\numpy\core\_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
avg_score ci_bound cv_scores feature_idx std_dev std_err
1 0.43622 NaN [0.436219807936] (7,) 0 NaN
2 0.589207 NaN [0.589206513899] (13, 7) 0 NaN
3 0.661242 NaN [0.661241618445] (13, 14, 7) 0 NaN
4 0.718047 NaN [0.71804725948] (16, 13, 14, 7) 0 NaN
5 0.744539 NaN [0.744538917009] (16, 10, 13, 14, 7) 0 NaN
6 0.75298 NaN [0.752979718057] (16, 7, 10, 13, 14, 15) 0 NaN
7 0.755819 NaN [0.755818691645] (16, 7, 9, 10, 13, 14, 15) 0 NaN
8 0.757751 NaN [0.757750769449] (16, 4, 7, 9, 10, 13, 14, 15) 0 NaN
9 0.759088 NaN [0.759088496118] (1, 4, 7, 9, 10, 13, 14, 15, 16) 0 NaN
10 0.763593 NaN [0.763592832578] (0, 1, 4, 7, 9, 10, 13, 14, 15, 16) 0 NaN
11 0.766277 NaN [0.766276542748] (0, 1, 2, 4, 7, 9, 10, 13, 14, 15, 16) 0 NaN
12 0.767782 NaN [0.767781703908] (0, 1, 2, 3, 4, 7, 9, 10, 13, 14, 15, 16) 0 NaN
13 0.768725 NaN [0.768724891242] (0, 1, 2, 3, 4, 7, 9, 10, 12, 13, 14, 15, 16) 0 NaN
14 0.769542 NaN [0.769541795295] (0, 1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16) 0 NaN
15 0.77035 NaN [0.770349525842] (0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 1... 0 NaN
16 0.770451 NaN [0.770451018385] (0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14... 0 NaN
17 0.770452 NaN [0.770451587] (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... 0 NaN
18 0.770452 NaN [0.770451587] (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... 0 NaN
# Names of the six variables selected by forward stepwise selection
# (column indices taken from the 6-feature row of the metrics table).
chosen = (16, 7, 10, 13, 14, 15)
print('Variables: ' + ', '.join(X.columns[i] for i in chosen))
Variables: Private_No, Room.Board, PhD, perc.alumni, Expend, Grad.Rate

References