Exercise 4.10

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #visualization library
from sklearn.linear_model import LogisticRegression #problem will be solved with scikit
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #linear discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #quadratic discriminant analysis
from sklearn.neighbors import KNeighborsClassifier #K nearest neighbours (KNN)

import statsmodels.api as sm #to compute p-values
from patsy import dmatrices

%matplotlib inline
df = pd.read_csv('../data/Weekly.csv',index_col=0)
df.head()
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.154976 -0.270 Down
2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.148574 -2.576 Down
3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.159837 3.514 Up
4 1990 3.514 -2.576 -0.270 0.816 1.572 0.161630 0.712 Up
5 1990 0.712 3.514 -2.576 -0.270 0.816 0.153728 1.178 Up

(a)

df.describe() #descriptive statistics
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
count 1089.000000 1089.000000 1089.000000 1089.000000 1089.000000 1089.000000 1089.000000 1089.000000
mean 2000.048669 0.150585 0.151079 0.147205 0.145818 0.139893 1.574618 0.149899
std 6.033182 2.357013 2.357254 2.360502 2.360279 2.361285 1.686636 2.356927
min 1990.000000 -18.195000 -18.195000 -18.195000 -18.195000 -18.195000 0.087465 -18.195000
25% 1995.000000 -1.154000 -1.154000 -1.158000 -1.158000 -1.166000 0.332022 -1.154000
50% 2000.000000 0.241000 0.241000 0.241000 0.238000 0.234000 1.002680 0.241000
75% 2005.000000 1.405000 1.409000 1.409000 1.409000 1.405000 2.053727 1.405000
max 2010.000000 12.026000 12.026000 12.026000 12.026000 12.026000 9.328214 12.026000
df.corr() #correlation matrix
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
Year 1.000000 -0.032289 -0.033390 -0.030006 -0.031128 -0.030519 0.841942 -0.032460
Lag1 -0.032289 1.000000 -0.074853 0.058636 -0.071274 -0.008183 -0.064951 -0.075032
Lag2 -0.033390 -0.074853 1.000000 -0.075721 0.058382 -0.072499 -0.085513 0.059167
Lag3 -0.030006 0.058636 -0.075721 1.000000 -0.075396 0.060657 -0.069288 -0.071244
Lag4 -0.031128 -0.071274 0.058382 -0.075396 1.000000 -0.075675 -0.061075 -0.007826
Lag5 -0.030519 -0.008183 -0.072499 0.060657 -0.075675 1.000000 -0.058517 0.011013
Volume 0.841942 -0.064951 -0.085513 -0.069288 -0.061075 -0.058517 1.000000 -0.033078
Today -0.032460 -0.075032 0.059167 -0.071244 -0.007826 0.011013 -0.033078 1.000000
sns.distplot(df['Today']);

[figure: distribution plot of 'Today']

sns.boxplot(x='Direction', y='Volume', data=df);

[figure: boxplot of 'Volume' by 'Direction']

sns.pairplot(df);

[figure: pairplot of the Weekly variables]

plt.scatter(df['Year'],df['Volume']);

[figure: 'Volume' vs. 'Year' scatter plot]

(b)

#logistic regression model
train_cols = ['Lag1','Lag2','Lag3','Lag4','Lag5','Volume'] #independent var. considered in the logistic model
lr = LogisticRegression()
mod = lr.fit(df[train_cols], df['Direction'])
mod.coef_ #independent var. coefficients
array([[-0.04117292,  0.05846974, -0.01599122, -0.02769998, -0.01440289,
        -0.02212844]])
mod.intercept_ #intercept
array([ 0.26484745])
#p-values tell us whether the predictors appear to be statistically significant
#it's easier to get p-values with statsmodels

#we need to transform the target into a numeric (non-categorical) variable
#if we don't, patsy will encode it with dummy variables,
#which would turn the target into two columns instead of one
df['Direction'] = df['Direction'].map({'Down':0, 'Up':1})

#fit model
y, X = dmatrices('Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume', data=df, return_type='dataframe')
#logit = sm.Logit(y.ix[:,0], X) #
logit = sm.Logit(y, X) 
result = logit.fit()
Optimization terminated successfully.
         Current function value: 0.682441
         Iterations 4
print(result.summary())
                           Logit Regression Results                           
==============================================================================
Dep. Variable:              Direction   No. Observations:                 1089
Model:                          Logit   Df Residuals:                     1082
Method:                           MLE   Df Model:                            6
Date:                Fri, 27 Jan 2017   Pseudo R-squ.:                0.006580
Time:                        05:36:33   Log-Likelihood:                -743.18
converged:                       True   LL-Null:                       -748.10
                                        LLR p-value:                    0.1313
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.2669      0.086      3.106      0.002         0.098     0.435
Lag1          -0.0413      0.026     -1.563      0.118        -0.093     0.010
Lag2           0.0584      0.027      2.175      0.030         0.006     0.111
Lag3          -0.0161      0.027     -0.602      0.547        -0.068     0.036
Lag4          -0.0278      0.026     -1.050      0.294        -0.080     0.024
Lag5          -0.0145      0.026     -0.549      0.583        -0.066     0.037
Volume        -0.0227      0.037     -0.616      0.538        -0.095     0.050
==============================================================================

Lag2 seems to be the only predictor with statistical significance. We can say that because Lag2 has a small P>|z| value (0.030, below the usual 0.05 threshold), meaning it is unlikely we would observe an estimate this far from zero if the true Lag2 coefficient were zero.
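
As a quick check (a minimal sketch, relying only on the result object fitted above), the same p-values can be read directly from statsmodels:

#p-value of each coefficient, straight from the fitted result
print(result.pvalues)
print(result.pvalues['Lag2']) #only Lag2 falls below the usual 0.05 threshold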

Note: the coefficients from scikit-learn and statsmodels are not exactly the same. That happens because scikit-learn applies L2 regularization by default. You can confirm this by reading the scikit-learn documentation, as suggested here: http://stats.stackexchange.com/questions/203740/logistic-regression-scikit-learn-vs-statsmodels. One way to overcome this is to give LogisticRegression a large C value (e.g. 1e9), which makes the regularization negligible.
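
A minimal sketch of that workaround (the name lr_noreg is ours): refit scikit-learn's model with a very large C, which makes the penalty negligible, and compare the coefficients with the statsmodels estimates above.

#large C => almost no regularization; coefficients should approach the statsmodels estimates
lr_noreg = LogisticRegression(C=1e9)
lr_noreg.fit(df[train_cols], df['Direction'])
print(lr_noreg.coef_)      #compare with result.params (Lag1..Volume)
print(lr_noreg.intercept_) #compare with result.params['Intercept']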

(c)

#refit the logistic regression on the encoded (0/1) target so that predictions and labels match
lr.fit(df[train_cols], df['Direction'])

#confusion matrix
conf_mat = confusion_matrix(df['Direction'], lr.predict(df[train_cols]))
print(conf_mat) #alternative to 'fancy' plot
[[ 55 429]
 [ 47 558]]
#'fancy' confusion matrix plot
#based on: Raschka (2014)
fig, ax = plt.subplots(figsize=(2, 2))
ax.matshow(conf_mat, cmap=plt.cm.Reds, alpha=0.3)
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax.text(x=j, y=i, s=conf_mat[i, j],
                va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

[figure: confusion matrix heatmap]

Note: 'Down' is class 0 and 'Up' is class 1. That is the encoding we applied above with map({'Down': 0, 'Up': 1}); scikit-learn itself orders classes by sorting the labels, so 'Down' would come before 'Up' in any case.
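
If in doubt, the class ordering can be checked directly on the fitted estimator:

print(lr.classes_) #classes in the order scikit-learn uses internally; class 0 ('Down') comes first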

#overall fraction of correct predictions
lr.score(df[train_cols],df['Direction'])
0.56290174471992649

(d)

#partitioning the dataset
df_9908 = df[(df['Year'] >=1990) & (df['Year'] <=2008)]
df_0910 = df[(df['Year'] >=2009) & (df['Year'] <=2010)]
#scikit-learn expects a 2D feature array of shape (n_samples, 1), not a 1D array of shape (n_samples,)
X = df_9908['Lag2'].values.reshape(-1, 1)
#logistic regression
mod = lr.fit(X,df_9908['Direction']) #regression object already exists; just need to fit it to the new data
#confusion matrix
X = df_0910['Lag2'].values.reshape(-1, 1)

conf_mat = confusion_matrix(df_0910['Direction'], lr.predict(X))
print(conf_mat)
[[ 9 34]
 [ 5 56]]
#overall fraction of correct predictions
lr.score(X, df_0910['Direction'])
0.625

(e)

#getting data ready
X = df_9908['Lag2'].values.reshape(-1, 1)

#linear discriminant analysis (LDA)
lda = LinearDiscriminantAnalysis()
lda.fit(X,df_9908['Direction'])
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
#getting data ready
X = df_0910['Lag2'].values.reshape(-1, 1)

#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], lda.predict(X))
print(conf_mat)
[[ 9 34]
 [ 5 56]]
#overall fraction of correct predictions
#it will be helpful for the next question
lda.score(X, df_0910['Direction'])
0.625

(f)

#getting data ready
X = df_9908['Lag2'].values.reshape(-1, 1)

#quadratic discriminant analysis (QDA)
qda = QuadraticDiscriminantAnalysis()
qda.fit(X,df_9908['Direction'])
QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariances=False, tol=0.0001)
#getting data ready
X = df_0910['Lag2'].values.reshape(-1, 1)

#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], qda.predict(X))
print(conf_mat)
[[ 0 43]
 [ 0 61]]
#overall fraction of correct predictions
#it will be helpful for the next question
qda.score(X, df_0910['Direction'])
0.58653846153846156

(g)

#getting data ready
X = df_9908['Lag2'].values.reshape(-1, 1)

#creating an instance of Neighbours Classifier and fitting the data
nbrs = KNeighborsClassifier(n_neighbors=1)
nbrs.fit(X,df_9908['Direction'])
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
#getting data ready
X = df_0910['Lag2'].values.reshape(-1, 1)

#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], nbrs.predict(X))
print(conf_mat)
[[21 22]
 [31 30]]
#overall fraction of correct predictions
#it will be helpful for the next question
nbrs.score(X, df_0910['Direction'])
0.49038461538461536

(h)

The methods that appear to provide the best results on the held-out 2009-2010 data are logistic regression and linear discriminant analysis (LDA). They achieve the highest score (overall fraction of correct predictions): 0.625, against 0.587 for QDA and 0.490 for KNN with K=1.
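
A small sketch to gather those held-out scores in one place (the names X_test, y_test and scores are ours; the fitted objects come from the previous parts):

#2009-2010 accuracy of each classifier fitted on Lag2 alone
X_test = df_0910['Lag2'].values.reshape(-1, 1)
y_test = df_0910['Direction']
scores = {'logistic regression': lr.score(X_test, y_test),
          'LDA': lda.score(X_test, y_test),
          'QDA': qda.score(X_test, y_test),
          'KNN (K=1)': nbrs.score(X_test, y_test)}
print(scores)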

(i)

#trying with a different number of neighbors
n_nbrs = 10

#getting data ready for the neighbours classifier
X = df_9908['Lag2'].values.reshape(-1, 1)

#creating an instance of Neighbours Classifier and fitting the data
nbrs = KNeighborsClassifier(n_neighbors=n_nbrs)
nbrs.fit(X,df_9908['Direction'])

#getting data ready for the confusion matrix
X = df_0910['Lag2'].values.reshape(-1, 1)

#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], nbrs.predict(X))
print(conf_mat)
[[22 21]
 [24 37]]

Now it's time to play a little with the models. We leave this entertaining task to the reader as homework.
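
As a possible starting point (a minimal sketch; the values of K are arbitrary), one could scan a few values of K for the KNN classifier on the same Lag2-only train/test split:

#test accuracy of KNN on Lag2 for several values of K
X_train = df_9908['Lag2'].values.reshape(-1, 1)
X_test = df_0910['Lag2'].values.reshape(-1, 1)
for k in [1, 3, 5, 10, 25, 50]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, df_9908['Direction'])
    print(k, knn.score(X_test, df_0910['Direction']))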

References