Exercise 8.10

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import BaggingRegressor

%matplotlib inline
df = pd.read_csv('../data/Hitters.csv', index_col=0)
df.head()
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
-Andy Allanson 293 66 1 30 29 14 1 293 66 1 30 29 14 A E 446 33 20 NaN A
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 321 414 375 N W 632 43 10 475.0 N
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 224 266 263 A W 880 82 14 480.0 A
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 828 838 354 N E 200 11 3 500.0 N
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 48 46 33 N E 805 40 4 91.5 N
df = pd.get_dummies(df)
df.head()
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun ... PutOuts Assists Errors Salary League_A League_N Division_E Division_W NewLeague_A NewLeague_N
-Andy Allanson 293 66 1 30 29 14 1 293 66 1 ... 446 33 20 NaN 1.0 0.0 1.0 0.0 1.0 0.0
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 ... 632 43 10 475.0 0.0 1.0 0.0 1.0 0.0 1.0
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 ... 880 82 14 480.0 1.0 0.0 0.0 1.0 1.0 0.0
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 ... 200 11 3 500.0 0.0 1.0 1.0 0.0 0.0 1.0
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 ... 805 40 4 91.5 0.0 1.0 1.0 0.0 0.0 1.0

5 rows × 23 columns
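Note that pd.get_dummies keeps both levels of each factor (e.g. League_A and League_N), which is redundant but harmless for tree-based methods. For the linear and lasso fits later on, one could instead drop one level per factor; a minimal sketch (drop_first has to be applied to the original, non-dummified data):

# Optional: one-hot encode while dropping one level per factor to avoid redundant columns
df_alt = pd.get_dummies(pd.read_csv('../data/Hitters.csv', index_col=0), drop_first=True)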

(a)

# Remove observations for whom the salary information is unknown.
df = df.dropna(subset=['Salary'])
# Log transform salaries
df['Salary'] = np.log(df['Salary'])

(b)

# Create training and test sets.
# We don't use train_test_split because the exercise asks for the first 200 observations as the training set rather than a random split.
X = df.drop(['Salary'], axis=1)
y = df['Salary']

X_train = X.iloc[:200, :]
y_train = y.iloc[:200]
X_test = X.iloc[200:, :]
y_test = y.iloc[200:]
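A quick sanity check on the split (a small sketch; after dropping the players with missing salaries, the Hitters data has 263 observations, so this should report 200 training and 63 test rows):

# Verify the sizes of the non-random split
print('Train:', X_train.shape, ' Test:', X_test.shape)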

(c)

# Boosting with different shrinkage values
shrinkage_values = [.001, .0025, .005, .01, .025, .05, .1, .25, .5]
mses = []
for i in shrinkage_values:
    bst = GradientBoostingRegressor(learning_rate=i, n_estimators=1000, random_state=1)
    bst.fit(X_train, y_train)
    mses.append(mean_squared_error(y_train, bst.predict(X_train)))
# Plot training set MSE for different shrinkage values
plt.scatter(shrinkage_values, mses)

[Figure: training set MSE for the different shrinkage values]

(d)

# Boosting with different shrinkage values
shrinkage_values = [.001, .0025, .005, .01, .025, .05, .1, .25, .5]
mses = []
for i in shrinkage_values:
    bst = GradientBoostingRegressor(learning_rate=i, n_estimators=1000, random_state=1)
    bst.fit(X_train, y_train)
    mses.append(mean_squared_error(y_test, bst.predict(X_test)))
# Plot test set MSE for different shrinkage values
plt.scatter(shrinkage_values, mses)

[Figure: test set MSE for the different shrinkage values]

# Get minimum test MSE value
print('Minimum test MSE:', np.min(mses))
Minimum test MSE: 0.208753925111
# Index of the shrinkage_value that leads to the minimum test MSE
np.where(mses == np.min(mses))
(array([2], dtype=int64),)
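To map that index back to the corresponding learning rate, a minimal sketch using np.argmin on the same mses list:

# Shrinkage value that minimises the test MSE (index 2 above, i.e. 0.005)
best_idx = int(np.argmin(mses))
print('Best shrinkage value:', shrinkage_values[best_idx])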

(e)

# Linear regression
rgr = LinearRegression()
rgr.fit(X_train, y_train)

print('Test MSE:', mean_squared_error(y_test, rgr.predict(X_test)))
Test MSE: 0.491795937545
# Cross-validated lasso
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

print('Test MSE:', mean_squared_error(y_test, lasso.predict(X_test)))
Test MSE: 0.486586369603

The test MSE obtained with boosting is lower than the test MSEs obtained with linear regression and with the lasso. According to this error metric, boosting is the model with the best predictive performance of the three on this test set.
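For a side-by-side view of the three test MSEs, a small sketch (it refits the boosted model with the best shrinkage value found above, since bst currently holds the last model from the loop):

# Compare the test MSEs of the three approaches in one place
best_bst = GradientBoostingRegressor(learning_rate=0.005, n_estimators=1000, random_state=1)
best_bst.fit(X_train, y_train)
results = pd.Series({
    'Boosting (shrinkage=0.005)': mean_squared_error(y_test, best_bst.predict(X_test)),
    'Linear regression': mean_squared_error(y_test, rgr.predict(X_test)),
    'Lasso (5-fold CV)': mean_squared_error(y_test, lasso.predict(X_test)),
})
print(results.sort_values())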

(f)

# Plot feature importances to see which predictors matter most.
bst = GradientBoostingRegressor(learning_rate=0.005, n_estimators=1000, random_state=1)  # 0.005 gave the lowest test MSE above
bst.fit(X_train, y_train)

feature_importance = bst.feature_importances_*100
rel_imp = pd.Series(feature_importance, index=X.columns).sort_values(inplace=False)
rel_imp.plot(kind='barh', color='r')
plt.xlabel('Variable importance')

[Figure: relative variable importance from the boosted model]

According to the figure, the most important predictors appear to be CAtBat, AtBat, CRuns, Walks, and CRBI.
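The same ranking can be read off programmatically (a minimal sketch using the rel_imp series computed above):

# Five most important predictors by relative importance
print(rel_imp.sort_values(ascending=False).head(5))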

(g)

# Fit a bagging regressor with its default settings (10 bootstrapped trees)
bagging = BaggingRegressor()
bagging.fit(X_train, y_train)
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)
# Test MSE
print('Test MSE:', mean_squared_error(y_test, bagging.predict(X_test)))
Test MSE: 0.253358375264
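The default BaggingRegressor uses only 10 bootstrapped trees and no fixed seed, so this test MSE varies from run to run. A minimal sketch of a more stable estimate (the 500 trees and the seed are illustrative choices, not part of the original solution):

# Bagging with more trees and a fixed seed for a steadier test MSE estimate
bagging_large = BaggingRegressor(n_estimators=500, random_state=1)
bagging_large.fit(X_train, y_train)
print('Test MSE (500 trees):', mean_squared_error(y_test, bagging_large.predict(X_test)))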