Exercise 7.10
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
%matplotlib inline
df = pd.read_csv('../data/College.csv', index_col=0)
df.head()
College | Private | Apps | Accept | Enroll | Top10perc | Top25perc | F.Undergrad | P.Undergrad | Outstate | Room.Board | Books | Personal | PhD | Terminal | S.F.Ratio | perc.alumni | Expend | Grad.Rate
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
Abilene Christian University | Yes | 1660 | 1232 | 721 | 23 | 52 | 2885 | 537 | 7440 | 3300 | 450 | 2200 | 70 | 78 | 18.1 | 12 | 7041 | 60 |
Adelphi University | Yes | 2186 | 1924 | 512 | 16 | 29 | 2683 | 1227 | 12280 | 6450 | 750 | 1500 | 29 | 30 | 12.2 | 16 | 10527 | 56 |
Adrian College | Yes | 1428 | 1097 | 336 | 22 | 50 | 1036 | 99 | 11250 | 3750 | 400 | 1165 | 53 | 66 | 12.9 | 30 | 8735 | 54 |
Agnes Scott College | Yes | 417 | 349 | 137 | 60 | 89 | 510 | 63 | 12960 | 5450 | 450 | 875 | 92 | 97 | 7.7 | 37 | 19016 | 59 |
Alaska Pacific University | Yes | 193 | 146 | 55 | 16 | 44 | 249 | 869 | 7560 | 4120 | 800 | 1500 | 76 | 72 | 11.9 | 2 | 10922 | 15 |
# Dummy variables
# The feature 'Private' is categorical; to include it in our models, we encode it as dummy variables.
df = pd.get_dummies(df)
df.head()
College | Apps | Accept | Enroll | Top10perc | Top25perc | F.Undergrad | P.Undergrad | Outstate | Room.Board | Books | Personal | PhD | Terminal | S.F.Ratio | perc.alumni | Expend | Grad.Rate | Private_No | Private_Yes
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
Abilene Christian University | 1660 | 1232 | 721 | 23 | 52 | 2885 | 537 | 7440 | 3300 | 450 | 2200 | 70 | 78 | 18.1 | 12 | 7041 | 60 | 0.0 | 1.0 |
Adelphi University | 2186 | 1924 | 512 | 16 | 29 | 2683 | 1227 | 12280 | 6450 | 750 | 1500 | 29 | 30 | 12.2 | 16 | 10527 | 56 | 0.0 | 1.0 |
Adrian College | 1428 | 1097 | 336 | 22 | 50 | 1036 | 99 | 11250 | 3750 | 400 | 1165 | 53 | 66 | 12.9 | 30 | 8735 | 54 | 0.0 | 1.0 |
Agnes Scott College | 417 | 349 | 137 | 60 | 89 | 510 | 63 | 12960 | 5450 | 450 | 875 | 92 | 97 | 7.7 | 37 | 19016 | 59 | 0.0 | 1.0 |
Alaska Pacific University | 193 | 146 | 55 | 16 | 44 | 249 | 869 | 7560 | 4120 | 800 | 1500 | 76 | 72 | 11.9 | 2 | 10922 | 15 | 0.0 | 1.0 |
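Note that `pd.get_dummies` produces two mutually exclusive columns, `Private_No` and `Private_Yes`, which are perfectly collinear; a linear model only needs one of them. A minimal sketch of the alternative (not used below, since the rest of the notebook assumes both dummy columns are present):

# Sketch: same College.csv as above, but keep a single 0/1 indicator.
# drop_first=True drops 'Private_No', avoiding the redundant dummy pair.
df_alt = pd.get_dummies(pd.read_csv('../data/College.csv', index_col=0), drop_first=True)
df_alt.head()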
(a) Forward stepwise selection on the training set
# Dataset: response is 'Outstate'; predictors are all remaining columns
X = df.loc[:, df.columns != 'Outstate']
y = df['Outstate']
# Split into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1)
# Forward stepwise selection
lr = LinearRegression()
sfs = SFS(lr,
          k_features=18,  # we have 18 candidate features
          forward=True,
          floating=False,
          scoring='r2',
          cv=0)
sfs = sfs.fit(X_train.values, y_train)  # .values: pass a NumPy array so SFS can index columns by position
fig = plot_sfs(sfs.get_metric_dict())
plt.title('Sequential forward selection')
plt.grid()
plt.show()
c:\program files\anaconda3\lib\site-packages\numpy\core\_methods.py:82: RuntimeWarning: Degrees of freedom <= 0 for slice
warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
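The RuntimeWarning is expected with cv=0: each feature subset is scored exactly once on the training data, so no standard deviation or confidence interval can be computed (hence the NaN columns in the table below). A sketch with 5-fold cross-validation instead, which would make the error bands in the plot meaningful:

# Sketch: same selector, but with 5-fold CV so std_dev/ci_bound are defined
sfs_cv = SFS(lr,
             k_features=18,
             forward=True,
             floating=False,
             scoring='r2',
             cv=5)  # 5 scores per subset size instead of 1
sfs_cv = sfs_cv.fit(X_train.values, y_train)
plot_sfs(sfs_cv.get_metric_dict(), kind='std_dev')  # shaded std-dev band
plt.title('Sequential forward selection (w. StdDev)')
plt.grid()
plt.show()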
We will choose six features: the figure shows that adding more features does not increase the (training) R² appreciably beyond that point.
# Visualizing the results in dataframes
pd.DataFrame.from_dict(sfs.get_metric_dict()).T
n_features | avg_score | ci_bound | cv_scores | feature_idx | std_dev | std_err
---|---|---|---|---|---|---
1 | 0.43622 | NaN | [0.436219807936] | (7,) | 0 | NaN |
2 | 0.589207 | NaN | [0.589206513899] | (13, 7) | 0 | NaN |
3 | 0.661242 | NaN | [0.661241618445] | (13, 14, 7) | 0 | NaN |
4 | 0.718047 | NaN | [0.71804725948] | (16, 13, 14, 7) | 0 | NaN |
5 | 0.744539 | NaN | [0.744538917009] | (16, 10, 13, 14, 7) | 0 | NaN |
6 | 0.75298 | NaN | [0.752979718057] | (16, 7, 10, 13, 14, 15) | 0 | NaN |
7 | 0.755819 | NaN | [0.755818691645] | (16, 7, 9, 10, 13, 14, 15) | 0 | NaN |
8 | 0.757751 | NaN | [0.757750769449] | (16, 4, 7, 9, 10, 13, 14, 15) | 0 | NaN |
9 | 0.759088 | NaN | [0.759088496118] | (1, 4, 7, 9, 10, 13, 14, 15, 16) | 0 | NaN |
10 | 0.763593 | NaN | [0.763592832578] | (0, 1, 4, 7, 9, 10, 13, 14, 15, 16) | 0 | NaN |
11 | 0.766277 | NaN | [0.766276542748] | (0, 1, 2, 4, 7, 9, 10, 13, 14, 15, 16) | 0 | NaN |
12 | 0.767782 | NaN | [0.767781703908] | (0, 1, 2, 3, 4, 7, 9, 10, 13, 14, 15, 16) | 0 | NaN |
13 | 0.768725 | NaN | [0.768724891242] | (0, 1, 2, 3, 4, 7, 9, 10, 12, 13, 14, 15, 16) | 0 | NaN |
14 | 0.769542 | NaN | [0.769541795295] | (0, 1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16) | 0 | NaN |
15 | 0.77035 | NaN | [0.770349525842] | (0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 1... | 0 | NaN |
16 | 0.770451 | NaN | [0.770451018385] | (0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 0 | NaN |
17 | 0.770452 | NaN | [0.770451587] | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 0 | NaN |
18 | 0.770452 | NaN | [0.770451587] | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | 0 | NaN |
# Variables that we will choose
print('Variables: %s, %s, %s, %s, %s, %s' % (X.columns[16], X.columns[7], X.columns[10], X.columns[13], X.columns[14], X.columns[15]))
Variables: Private_No, Room.Board, PhD, perc.alumni, Expend, Grad.Rate
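As a check on the six-feature choice, here is a sketch that pulls the selected column indices from sfs.subsets_ (mlxtend records the best subset at every size) and scores the corresponding linear model on the held-out test set; the exact R² will depend on the split above:

# Column indices of the best 6-feature subset from the forward pass
feat_idx = list(sfs.subsets_[6]['feature_idx'])
print('Selected:', list(X.columns[feat_idx]))

# Refit on just those columns and evaluate on the test set
lr6 = LinearRegression()
lr6.fit(X_train.iloc[:, feat_idx], y_train)
print('Test R^2: %.3f' % lr6.score(X_test.iloc[:, feat_idx], y_test))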
References
- https://github.com/dswah/pyGAM (pyGAM: generalized additive models in Python)
- http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html (scikit-learn train_test_split)
- http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/ (mlxtend SequentialFeatureSelector, forward stepwise selection)
- https://github.com/rasbt/mlxtend/blob/master/mlxtend/plotting/plot_sequential_feature_selection.py (plot_sequential_feature_selection source)
- https://github.com/bsilverthorn/gampy/blob/master/src/python/gampy/backfit.py (gampy: an alternative GAM backfitting implementation)