Exercise 8.10
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import BaggingRegressor
%matplotlib inline
df = pd.read_csv('../data/Hitters.csv', index_col=0)
df.head()
| | AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | CHmRun | CRuns | CRBI | CWalks | League | Division | PutOuts | Assists | Errors | Salary | NewLeague |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| -Andy Allanson | 293 | 66 | 1 | 30 | 29 | 14 | 1 | 293 | 66 | 1 | 30 | 29 | 14 | A | E | 446 | 33 | 20 | NaN | A |
| -Alan Ashby | 315 | 81 | 7 | 24 | 38 | 39 | 14 | 3449 | 835 | 69 | 321 | 414 | 375 | N | W | 632 | 43 | 10 | 475.0 | N |
| -Alvin Davis | 479 | 130 | 18 | 66 | 72 | 76 | 3 | 1624 | 457 | 63 | 224 | 266 | 263 | A | W | 880 | 82 | 14 | 480.0 | A |
| -Andre Dawson | 496 | 141 | 20 | 65 | 78 | 37 | 11 | 5628 | 1575 | 225 | 828 | 838 | 354 | N | E | 200 | 11 | 3 | 500.0 | N |
| -Andres Galarraga | 321 | 87 | 10 | 39 | 42 | 30 | 2 | 396 | 101 | 12 | 48 | 46 | 33 | N | E | 805 | 40 | 4 | 91.5 | N |
df = pd.get_dummies(df)
df.head()
| | AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | CHmRun | ... | PutOuts | Assists | Errors | Salary | League_A | League_N | Division_E | Division_W | NewLeague_A | NewLeague_N |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| -Andy Allanson | 293 | 66 | 1 | 30 | 29 | 14 | 1 | 293 | 66 | 1 | ... | 446 | 33 | 20 | NaN | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| -Alan Ashby | 315 | 81 | 7 | 24 | 38 | 39 | 14 | 3449 | 835 | 69 | ... | 632 | 43 | 10 | 475.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| -Alvin Davis | 479 | 130 | 18 | 66 | 72 | 76 | 3 | 1624 | 457 | 63 | ... | 880 | 82 | 14 | 480.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| -Andre Dawson | 496 | 141 | 20 | 65 | 78 | 37 | 11 | 5628 | 1575 | 225 | ... | 200 | 11 | 3 | 500.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| -Andres Galarraga | 321 | 87 | 10 | 39 | 42 | 30 | 2 | 396 | 101 | 12 | ... | 805 | 40 | 4 | 91.5 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 |
5 rows × 23 columns
(a)
# Remove observations for which the salary information is missing.
df = df.dropna(subset=['Salary'])
# Log transform salaries
df['Salary'] = np.log(df['Salary'])
(b)
# Create training (first 200 observations) and test sets.
# We don't use train_test_split because the exercise asks for the first 200
# observations as the training set rather than a random split.
X = df.drop(['Salary'], axis=1)
y = df['Salary']
X_train = X.iloc[:200, :]
y_train = y.iloc[:200]
X_test = X.iloc[200:, :]
y_test = y.iloc[200:]
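As a quick optional check (not part of the exercise), the sizes of the two splits can be verified:

# First 200 observations for training, the remaining ones for testing
print(X_train.shape, X_test.shape)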
(c)
# Boosting with different shrinkage (learning rate) values
shrinkage_values = [.001, .0025, .005, .01, .025, .05, .1, .25, .5]
mses = []
for i in shrinkage_values:
    bst = GradientBoostingRegressor(learning_rate=i, n_estimators=1000, random_state=1)
    bst.fit(X_train, y_train)
    mses.append(mean_squared_error(y_train, bst.predict(X_train)))
# Plot training set MSE for different shrinkage values
plt.scatter(shrinkage_values, mses)
[Scatter plot: training set MSE vs. shrinkage value]
(d)
# Boosting with the same shrinkage values, now evaluated on the test set
shrinkage_values = [.001, .0025, .005, .01, .025, .05, .1, .25, .5]
mses = []
for i in shrinkage_values:
    bst = GradientBoostingRegressor(learning_rate=i, n_estimators=1000, random_state=1)
    bst.fit(X_train, y_train)
    mses.append(mean_squared_error(y_test, bst.predict(X_test)))
# Plot test set MSE for different shrinkage values
plt.scatter(shrinkage_values, mses)
[Scatter plot: test set MSE vs. shrinkage value]
# Get minimum test MSE value
print('Minimum test MSE:', np.min(mses))
Minimum test MSE: 0.208753925111
# Index of the shrinkage_value that leads to the minimum test MSE
np.where(mses == np.min(mses))
(array([2], dtype=int64),)
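The index above can also be mapped back to the corresponding shrinkage value directly; an equivalent small sketch:

# Position of the smallest test MSE; index 2 corresponds to learning_rate = 0.005
best_shrinkage = shrinkage_values[int(np.argmin(mses))]
print('Best shrinkage value:', best_shrinkage)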
(e)
# Linear regression
rgr = LinearRegression()
rgr.fit(X_train, y_train)
print('Test MSE:', mean_squared_error(y_test, rgr.predict(X_test)))
Test MSE: 0.491795937545
# Cross-validated lasso
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
print('Test MSE:', mean_squared_error(y_test, lasso.predict(X_test)))
Test MSE: 0.486586369603
The test MSE obtained with boosting is lower than the test MSE obtained with either linear regression or the lasso. According to this error metric, boosting has the best predictive performance of the three models on this dataset.
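For a side-by-side comparison, the test MSEs reported above can be collected in a small Series (values copied, rounded, from the printed outputs):

# Test MSEs from the outputs above (rounded)
comparison = pd.Series({
    'Boosting (learning_rate=0.005)': 0.2088,
    'Linear regression': 0.4918,
    'Lasso (5-fold CV)': 0.4866,
})
print(comparison.sort_values())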
(f)
# Plot feature importances to identify the most influential predictors.
bst = GradientBoostingRegressor(learning_rate=0.005)  # 0.005 gave the lowest test MSE in (d)
bst.fit(X_train, y_train)
feature_importance = bst.feature_importances_ * 100
rel_imp = pd.Series(feature_importance, index=X.columns).sort_values(inplace=False)
rel_imp.plot(kind='barh', color='r')
plt.xlabel('Variable importance')
[Horizontal bar chart: variable importance for each predictor]
According to the figure, the most important predictors seem to be: CAtBat, AtBat, CRuns, Walks and CRBI.
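To complement the figure, the largest relative importances can be printed from the rel_imp Series computed above:

# Five predictors with the largest importance (scaled to percentages)
print(rel_imp.sort_values(ascending=False).head(5))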
(g)
# Fit bagging regressor
bagging = BaggingRegressor()
bagging.fit(X_train, y_train)
BaggingRegressor(base_estimator=None, bootstrap=True,
bootstrap_features=False, max_features=1.0, max_samples=1.0,
n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# Test MSE
print('Test MSE:', mean_squared_error(y_test, bagging.predict(X_test)))
Test MSE: 0.253358375264
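BaggingRegressor was fitted with its defaults (10 estimators, no fixed seed), so the test MSE above will vary slightly across runs. A sketch with more trees and a fixed seed, with purely illustrative parameter choices:

# Bagging with more bootstrap samples and a reproducible seed
bagging = BaggingRegressor(n_estimators=500, random_state=1)
bagging.fit(X_train, y_train)
print('Test MSE:', mean_squared_error(y_test, bagging.predict(X_test)))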