Exercise 4.10
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #visualization library
from sklearn.linear_model import LogisticRegression #problem will be solved with scikit
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #linear discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #quadratic discriminant analysis
from sklearn.neighbors import KNeighborsClassifier #K nearest neighbours (KNN)
import statsmodels.api as sm #to compute p-values
from patsy import dmatrices
%matplotlib inline
df = pd.read_csv('../data/Weekly.csv',index_col=0)
df.head()
| | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 1990 | 0.816 | 1.572 | -3.936 | -0.229 | -3.484 | 0.154976 | -0.270 | Down |
| 2 | 1990 | -0.270 | 0.816 | 1.572 | -3.936 | -0.229 | 0.148574 | -2.576 | Down |
| 3 | 1990 | -2.576 | -0.270 | 0.816 | 1.572 | -3.936 | 0.159837 | 3.514 | Up |
| 4 | 1990 | 3.514 | -2.576 | -0.270 | 0.816 | 1.572 | 0.161630 | 0.712 | Up |
| 5 | 1990 | 0.712 | 3.514 | -2.576 | -0.270 | 0.816 | 0.153728 | 1.178 | Up |
(a)
df.describe() #descriptive statistics
| | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today |
|---|---|---|---|---|---|---|---|---|
| count | 1089.000000 | 1089.000000 | 1089.000000 | 1089.000000 | 1089.000000 | 1089.000000 | 1089.000000 | 1089.000000 |
| mean | 2000.048669 | 0.150585 | 0.151079 | 0.147205 | 0.145818 | 0.139893 | 1.574618 | 0.149899 |
| std | 6.033182 | 2.357013 | 2.357254 | 2.360502 | 2.360279 | 2.361285 | 1.686636 | 2.356927 |
| min | 1990.000000 | -18.195000 | -18.195000 | -18.195000 | -18.195000 | -18.195000 | 0.087465 | -18.195000 |
| 25% | 1995.000000 | -1.154000 | -1.154000 | -1.158000 | -1.158000 | -1.166000 | 0.332022 | -1.154000 |
| 50% | 2000.000000 | 0.241000 | 0.241000 | 0.241000 | 0.238000 | 0.234000 | 1.002680 | 0.241000 |
| 75% | 2005.000000 | 1.405000 | 1.409000 | 1.409000 | 1.409000 | 1.405000 | 2.053727 | 1.405000 |
| max | 2010.000000 | 12.026000 | 12.026000 | 12.026000 | 12.026000 | 12.026000 | 9.328214 | 12.026000 |
df.corr(numeric_only=True) #correlation matrix (Direction is not numeric yet)
| | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today |
|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | -0.032289 | -0.033390 | -0.030006 | -0.031128 | -0.030519 | 0.841942 | -0.032460 |
| Lag1 | -0.032289 | 1.000000 | -0.074853 | 0.058636 | -0.071274 | -0.008183 | -0.064951 | -0.075032 |
| Lag2 | -0.033390 | -0.074853 | 1.000000 | -0.075721 | 0.058382 | -0.072499 | -0.085513 | 0.059167 |
| Lag3 | -0.030006 | 0.058636 | -0.075721 | 1.000000 | -0.075396 | 0.060657 | -0.069288 | -0.071244 |
| Lag4 | -0.031128 | -0.071274 | 0.058382 | -0.075396 | 1.000000 | -0.075675 | -0.061075 | -0.007826 |
| Lag5 | -0.030519 | -0.008183 | -0.072499 | 0.060657 | -0.075675 | 1.000000 | -0.058517 | 0.011013 |
| Volume | 0.841942 | -0.064951 | -0.085513 | -0.069288 | -0.061075 | -0.058517 | 1.000000 | -0.033078 |
| Today | -0.032460 | -0.075032 | 0.059167 | -0.071244 | -0.007826 | 0.011013 | -0.033078 | 1.000000 |
- The Year/Volume relationship should be explored
sns.distplot(df['Today']); #distplot is deprecated in newer seaborn; sns.histplot(df['Today'], kde=True) is the modern equivalent
sns.boxplot(x='Direction', y='Volume', data=df);
sns.pairplot(df);
plt.scatter(df['Year'],df['Volume']);
- The Year/Volume relationship is the only one with a visible pattern
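To make that pattern more concrete, here is a minimal sketch (assuming the `df` loaded above) that plots the average weekly trading volume per year; the steady increase is what drives the 0.84 Year/Volume correlation.
#average weekly Volume per year; an upward trend should be visible
df.groupby('Year')['Volume'].mean().plot(marker='o')
plt.xlabel('Year')
plt.ylabel('mean Volume');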
(b)
#logistic regression model
train_cols = ['Lag1','Lag2','Lag3','Lag4','Lag5','Volume'] #independent var. considered in the logistic model
lr = LogisticRegression()
mod = lr.fit(df[train_cols], df['Direction'])
mod.coef_ #independent var. coefficients
array([[-0.04117292, 0.05846974, -0.01599122, -0.02769998, -0.01440289,
-0.02212844]])
mod.intercept_ #interception
array([ 0.26484745])
#p-values tell us whether the predictors appear to be statistically significant
#it's easier to get p-values with statsmodels
#we first transform the target variable to a numeric (0/1) encoding
#otherwise, the formula interface would expand 'Direction' into dummy variables,
#creating a two-column target instead of a single response
df['Direction'] = df['Direction'].map({'Down':0, 'Up':1})
#fit model
y, X = dmatrices('Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume', data=df, return_type='dataframe')
logit = sm.Logit(y, X)
result = logit.fit()
Optimization terminated successfully.
Current function value: 0.682441
Iterations 4
print(result.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Direction No. Observations: 1089
Model: Logit Df Residuals: 1082
Method: MLE Df Model: 6
Date: Fri, 27 Jan 2017 Pseudo R-squ.: 0.006580
Time: 05:36:33 Log-Likelihood: -743.18
converged: True LL-Null: -748.10
LLR p-value: 0.1313
==============================================================================
coef std err z P>|z| [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept 0.2669 0.086 3.106 0.002 0.098 0.435
Lag1 -0.0413 0.026 -1.563 0.118 -0.093 0.010
Lag2 0.0584 0.027 2.175 0.030 0.006 0.111
Lag3 -0.0161 0.027 -0.602 0.547 -0.068 0.036
Lag4 -0.0278 0.026 -1.050 0.294 -0.080 0.024
Lag5 -0.0145 0.026 -0.549 0.583 -0.066 0.037
Volume -0.0227 0.037 -0.616 0.538 -0.095 0.050
==============================================================================
Lag2 seems to be a statistically significant predictor. We can say that because Lag2 has a small P>|z| value (0.030), meaning it would be unlikely to observe an estimate this far from zero if the true Lag2 coefficient were zero.
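As a quick check, the same conclusion can be read off programmatically from the fitted statsmodels result; a minimal sketch, assuming the `result` object from above:
#p-values of all terms, sorted from most to least significant
print(result.pvalues.sort_values())
#terms significant at the 5% level (only the intercept and Lag2)
print(result.pvalues[result.pvalues < 0.05])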
Note: Notice that the coefficients are not exactly the same as the ones given by scikit-learn. That happens because scikit-learn's LogisticRegression applies L2 regularization by default. You can confirm this by reading the scikit-learn documentation, as suggested here: http://stats.stackexchange.com/questions/203740/logistic-regression-scikit-learn-vs-statsmodels. One way around it is to use a very large C value in LogisticRegression (e.g. C=1e9), which makes the regularization negligible.
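A minimal sketch of that workaround, assuming `df` and `train_cols` from above; with a very large C the penalty becomes negligible, so the coefficients should be close to the statsmodels ones:
#C is the inverse regularization strength, so a huge C effectively disables the penalty
lr_noreg = LogisticRegression(C=1e9, max_iter=1000)
lr_noreg.fit(df[train_cols], df['Direction'])
print(lr_noreg.intercept_) #should be close to the statsmodels Intercept (~0.267)
print(lr_noreg.coef_)      #should be close to the statsmodels Lag1..Volume coefficients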
(c)
#confusion matrix
#refit the model with the numeric (0/1) labels, so that predictions and true values share the same encoding
lr.fit(df[train_cols], df['Direction'])
conf_mat = confusion_matrix(df['Direction'], lr.predict(df[train_cols]))
print(conf_mat) #alternative to 'fancy' plot
[[ 55 429]
[ 47 558]]
#'fancy' confusion matrix plot
#based on: Raschka (2014)
fig, ax = plt.subplots(figsize=(2, 2))
ax.matshow(conf_mat, cmap=plt.cm.Reds, alpha=0.3)
for i in range(conf_mat.shape[0]):
    for j in range(conf_mat.shape[1]):
        ax.text(x=j, y=i,
                s=conf_mat[i, j],
                va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()
- The confusion matrix shows us two types of mistakes: 1) false positives; 2) false negatives.
- A false positive occurs when our prediction is positive but the real value is negative. A false negative occurs when our prediction is negative but the real value is positive.
- Assuming that class 1 ('Up') is the positive class, our model correctly classified 55 samples that belong to class 0 (true negatives) and 558 samples that belong to class 1 (true positives). However, it also misclassified 429 class 0 samples as class 1 (false positives), and it predicted 'Down' for 47 samples that were actually 'Up' (false negatives). The sketch below shows how to read these counts directly from the matrix.
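A minimal sketch, assuming the 2x2 `conf_mat` computed above and 'Up' as the positive class; scikit-learn's confusion_matrix stores true labels in rows and predicted labels in columns, so ravel() unpacks it row by row:
tn, fp, fn, tp = conf_mat.ravel()
print('true negatives  (Down predicted as Down):', tn) #55
print('false positives (Down predicted as Up):', fp)   #429
print('false negatives (Up predicted as Down):', fn)   #47
print('true positives  (Up predicted as Up):', tp)     #558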
Note: 'Down' is class 0 here because of the explicit mapping we applied above ({'Down':0, 'Up':1}). In general, scikit-learn assigns class indices by sorting the labels, so 'Down' would come before 'Up' anyway.
#overall fraction of correct predictions
lr.score(df[train_cols],df['Direction'])
0.56290174471992649
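The same number can also be obtained with the accuracy_score function imported above, or read off the diagonal of the confusion matrix; a minimal sketch, assuming `lr`, `conf_mat` and `df` from above:
print(accuracy_score(df['Direction'], lr.predict(df[train_cols]))) #same value as lr.score(...)
print(conf_mat.diagonal().sum() / conf_mat.sum())                  #(55 + 558) / 1089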
(d)
#partitioning the dataset
df_9908 = df[(df['Year'] >=1990) & (df['Year'] <=2008)]
df_0910 = df[(df['Year'] >=2009) & (df['Year'] <=2010)]
#to avoid 'ValueError: Found arrays with inconsistent numbers of sample'
#shape must be (X,1) and not (X,)
X = df_9908['Lag2'].values.reshape(-1, 1)
#logistic regression
mod = lr.fit(X,df_9908['Direction']) #regression object already exists; just need to fit it to the new data
#confusion matrix
X = df_0910['Lag2'].values.reshape(-1, 1)
conf_mat = confusion_matrix(df_0910['Direction'], lr.predict(X))
print(conf_mat)
[[ 9 34]
[ 5 56]]
#overall fraction of correct predictions
lr.score(X, df_0910['Direction'])
0.625
(e)
#getting data ready
X = df_9908['Lag2'].values.reshape(-1, 1)
#linear discriminant analysis (LDA)
lda = LinearDiscriminantAnalysis()
lda.fit(X,df_9908['Direction'])
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
solver='svd', store_covariance=False, tol=0.0001)
#getting data ready
X = df_0910['Lag2'].values.reshape(-1, 1)
#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], lda.predict(X))
print(conf_mat)
[[ 9 34]
[ 5 56]]
#overall fraction of correct predictions
#it will be helpful for the next question
lda.score(X, df_0910['Direction'])
0.625
(f)
#getting data ready
X = df_9908['Lag2'].values.reshape(-1, 1)
#quadratic discriminant analysis (QDA)
qda = QuadraticDiscriminantAnalysis()
qda.fit(X,df_9908['Direction'])
QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
store_covariances=False, tol=0.0001)
#getting data ready
X = df_0910['Lag2'].values.reshape(-1, 1)
#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], qda.predict(X))
print(conf_mat)
[[ 0 43]
[ 0 61]]
#overall fraction of correct predictions
#it will be helpful for the next question
qda.score(X, df_0910['Direction'])
0.58653846153846156
(g)
#getting data ready
X = df_9908['Lag2'].values.reshape(-1, 1)
#creating an instance of Neighbours Classifier and fitting the data
nbrs = KNeighborsClassifier(n_neighbors=1)
nbrs.fit(X,df_9908['Direction'])
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=1, p=2,
weights='uniform')
#getting data ready
X = df_0910['Lag2'].values.reshape(-1, 1)
#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], nbrs.predict(X))
print(conf_mat)
[[21 22]
[31 30]]
#overall fraction of correct predictions
#it will be helpful for the next question
nbrs.score(X, df_0910['Direction'])
0.49038461538461536
(h)
The methods that appear to provide the best results are logistic regression and linear discriminant analysis (LDA). They achieve the highest overall fraction of correct predictions on the held-out 2009-2010 data (0.625 for both, vs 0.587 for QDA and 0.490 for KNN with K=1).
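For reference, a minimal sketch that collects the held-out scores side by side (assuming the fitted `lr`, `lda`, `qda` and `nbrs` objects and the 2009-2010 data from above):
X_test = df_0910['Lag2'].values.reshape(-1, 1)
y_test = df_0910['Direction']
for name, model in [('logistic regression', lr), ('LDA', lda), ('QDA', qda), ('KNN, K=1', nbrs)]:
    print(name, model.score(X_test, y_test))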
(i)
#trying with a different number of neighbors
n_nbrs = 10
#getting data ready for the neighbors classifier
X = df_9908['Lag2'].values.reshape(-1, 1)
#creating an instance of Neighbours Classifier and fitting the data
nbrs = KNeighborsClassifier(n_neighbors=n_nbrs)
nbrs.fit(X,df_9908['Direction'])
#getting data ready for the confusion matrix
X = df_0910['Lag2'].values.reshape(-1, 1)
#confusion matrix
conf_mat = confusion_matrix(df_0910['Direction'], nbrs.predict(X))
print(conf_mat)
[[22 21]
[24 37]]
Now it's time to play a little with the models: try other predictors, transformations, interactions, and different values of K. We leave this entertaining task to the reader, as homework.
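As a starting point for that experimentation, a minimal sketch (assuming `df_9908` and `df_0910` from above) that scans a few values of K and prints the held-out accuracy for each:
X_train = df_9908['Lag2'].values.reshape(-1, 1)
y_train = df_9908['Direction']
X_test = df_0910['Lag2'].values.reshape(-1, 1)
y_test = df_0910['Direction']
for k in [1, 3, 5, 10, 25, 50]:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(k, knn.score(X_test, y_test))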
References
- Raschka, S., 2014, Python Machine Learning