Exercise 5.9
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
%matplotlib inline
# Load the Boston housing data and wrap the feature matrix in a DataFrame,
# one column per feature name.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2 -- this cell presumably needs an older scikit-learn; confirm the
# environment before re-running.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# Attach the target values as the MEDV column and keep a handle to that column.
df['MEDV'] = pd.Series(boston.target)
medv = df['MEDV']
# Preview the first rows of the assembled frame.
df.head()
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
(a)
# Sample mean of MEDV.
mu = medv.mean()
(b)
# Standard error of the sample mean: standard deviation of MEDV divided by
# the square root of the number of observations.
sample_size = len(df)
medv.std() / np.sqrt(sample_size)
0.40886114749753505
(c)
# Bootstrap the mean: draw 1000 resamples of MEDV (same size as the data,
# with replacement) and record the mean of each one.
means = []
for _ in range(1000):
    resample = medv.sample(n=len(df), replace=True)
    means.append(resample.mean())
# Spread of the bootstrap means = bootstrap estimate of the standard error.
np.std(means)
0.4126021332293619
(d)
# Interval around the sample mean: mu plus/minus two bootstrap standard errors.
SE = np.std(means)
lower_bound = mu - 2*SE
upper_bound = mu + 2*SE
print(lower_bound, upper_bound)
21.7076020577 23.3580105906
(e)
# Sample median of MEDV.
medv.median()
21.2
(f)
# Bootstrap the median: 1000 resamples of MEDV (same size as the data, with
# replacement), keeping the median of each one.
medians = []
for _ in range(1000):
    resample = medv.sample(n=len(df), replace=True)
    medians.append(resample.median())
# Spread of the bootstrap medians = bootstrap standard error of the median.
np.std(medians)
0.37716367799670181
(g)
# 0.1 quantile (tenth percentile) of MEDV.
medv.quantile(.1)
12.75
(h)
# Bootstrap the 0.1 quantile: 1000 resamples of MEDV (same size as the data,
# with replacement), keeping the 0.1 quantile of each one.
quantiles = [medv.sample(n=len(df), replace=True).quantile(.1) for _ in range(1000)]
# Spread of the bootstrap quantiles = bootstrap standard error of the 0.1
# quantile. (A stray trailing "x" after this call made the cell a syntax error.)
np.std(quantiles)
0.50477123531358237