
Mission211Solution.ipynb

import pandas

# Load the hourly bike rental data
bike_rentals = pandas.read_csv("bike_rental_hour.csv")
bike_rentals.head()
   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  weathersit  temp   atemp   hum  windspeed  casual  registered  cnt
0        1  2011-01-01       1   0     1   0        0        6           0           1  0.24  0.2879  0.81          0       3          13   16
1        2  2011-01-01       1   0     1   1        0        6           0           1  0.22  0.2727  0.80          0       8          32   40
2        3  2011-01-01       1   0     1   2        0        6           0           1  0.22  0.2727  0.80          0       5          27   32
3        4  2011-01-01       1   0     1   3        0        6           0           1  0.24  0.2879  0.75          0       3          10   13
4        5  2011-01-01       1   0     1   4        0        6           0           1  0.24  0.2879  0.75          0       0           1    1
%matplotlib inline

import matplotlib.pyplot as plt

# Distribution of the target: total rentals per hour
plt.hist(bike_rentals["cnt"])
[Histogram of cnt: heavily right-skewed. Most hours fall in the lowest bin (roughly 1-99 rentals; 6,972 of 17,379 hours), tapering off to just 37 hours in the top bin near the maximum of 977.]
# Correlation of each numeric column with the target
# (numeric_only skips the dteday string column, required in newer pandas)
bike_rentals.corr(numeric_only=True)["cnt"]
instant       0.278379
season        0.178056
yr            0.250495
mnth          0.120638
hr            0.394071
holiday      -0.030927
weekday       0.026900
workingday    0.030284
weathersit   -0.142426
temp          0.404772
atemp         0.400929
hum          -0.322911
windspeed     0.093234
casual        0.694564
registered    0.972151
cnt           1.000000
Name: cnt, dtype: float64
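
The near-perfect correlations of casual and registered with cnt are expected: in this dataset the two columns sum to cnt, so either one would leak the target into the features. A quick sanity check (a minimal sketch, assuming the standard layout of the UCI bike sharing dataset):

# casual + registered should reconstruct cnt exactly; if so, both must be
# dropped from the predictors later to avoid target leakage
reconstructed = bike_rentals["casual"] + bike_rentals["registered"]
print((reconstructed == bike_rentals["cnt"]).all())  # expected: True
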
def assign_label(hour):
    # Bucket the hour of day (0-23) into four time-of-day labels
    if 0 <= hour < 6:
        return 4  # night
    elif 6 <= hour < 12:
        return 1  # morning
    elif 12 <= hour < 18:
        return 2  # afternoon
    else:
        return 3  # evening (18-23)

# Map each hour to its time-of-day label
bike_rentals["time_label"] = bike_rentals["hr"].apply(assign_label)

Error metric

Mean squared error (MSE) is a sensible error metric here: the target, cnt, is continuous numeric data, and MSE penalizes large mistakes more heavily than small ones.
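
For reference, MSE is just the mean of the squared residuals; a minimal sketch (scikit-learn's mean_squared_error computes the same quantity):

import numpy

def mse(predictions, actual):
    # Mean of squared differences between predicted and actual values
    predictions = numpy.asarray(predictions)
    actual = numpy.asarray(actual)
    return numpy.mean((predictions - actual) ** 2)

print(mse([1.0, 2.0, 4.0], [1.0, 3.0, 2.0]))  # (0 + 1 + 4) / 3 ≈ 1.667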

# Hold out a random 20% of rows for testing
# (pass random_state to sample() for a reproducible split)
train = bike_rentals.sample(frac=0.8)
test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]
from sklearn.linear_model import LinearRegression

# Use every column except the target (cnt), its leaky components
# (casual and registered, which sum to cnt), and the raw date string
predictors = list(train.columns)
predictors.remove("cnt")
predictors.remove("casual")
predictors.remove("registered")
predictors.remove("dteday")

reg = LinearRegression()

reg.fit(train[predictors], train["cnt"])
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
import numpy

predictions = reg.predict(test[predictors])

# Mean squared error on the held-out set
numpy.mean((predictions - test["cnt"]) ** 2)
16586.154698429491
predictions
array([ -75.31906346,  144.15652539,  125.29713548, ...,  167.94469909,
        181.44415684,  165.3047817 ])
test["cnt"]
4          1
10        36
16        93
24        17
36        75
        ... 
17349    125
17351     72
17353     36
17354     49
17373    122
Name: cnt, dtype: int64

Error

The error is very high. The data contains a few hours with extremely high rental counts while most hours have low ones, and because MSE penalizes large errors quadratically, those few hours inflate the total. The linear model also produces some negative predictions, which are impossible for a count.
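
To see how much the squaring drives the number up, it helps to compare MSE against the mean absolute error, which penalizes every residual linearly; a minimal sketch reusing the predictions above:

# MAE treats a residual of 100 as 100 rather than 10,000, so it is far
# less sensitive to the handful of very high rental counts
mae = numpy.mean(numpy.abs(predictions - test["cnt"]))
rmse = numpy.sqrt(numpy.mean((predictions - test["cnt"]) ** 2))
print(mae, rmse)  # an RMSE far above the MAE signals a few large errors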

from sklearn.tree import DecisionTreeRegressor

# min_samples_leaf stops leaves from fitting just a handful of rows,
# which limits overfitting
reg = DecisionTreeRegressor(min_samples_leaf=5)

reg.fit(train[predictors], train["cnt"])
DecisionTreeRegressor(compute_importances=None, criterion='mse',
           max_depth=None, max_features=None, max_leaf_nodes=None,
           min_density=None, min_samples_leaf=5, min_samples_split=2,
           random_state=None, splitter='best')
predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
2644.2820429330714
# A smaller leaf size lets the tree fit the training data more closely
reg = DecisionTreeRegressor(min_samples_leaf=2)

reg.fit(train[predictors], train["cnt"])

predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
2964.7288070579207

Decision tree error

By capturing nonlinear relationships between the predictors and cnt, the decision tree regressor achieves a much lower error than linear regression. Note that loosening min_samples_leaf from 5 to 2 makes the test error worse, a sign that the deeper tree is starting to overfit.
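
Rather than trying leaf sizes one at a time, a small sweep makes the trade-off visible; a minimal sketch (exact numbers will vary with the random train/test split):

# Compare test MSE across several leaf sizes
for leaf_size in [1, 2, 5, 10, 25, 50]:
    tree = DecisionTreeRegressor(min_samples_leaf=leaf_size)
    tree.fit(train[predictors], train["cnt"])
    preds = tree.predict(test[predictors])
    print(leaf_size, numpy.mean((preds - test["cnt"]) ** 2))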

from sklearn.ensemble import RandomForestRegressor

# An ensemble of trees, each trained on a bootstrap sample of the rows
# with a random subset of features, then averaged together
reg = RandomForestRegressor(min_samples_leaf=5)
reg.fit(train[predictors], train["cnt"])
RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=5,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0)
predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
1911.9827104170736

Random forest error

By averaging many decorrelated trees, the random forest removes some of the sources of overfitting that affect a single decision tree, and its error improves on the decision tree error accordingly.
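
The repr above shows the old scikit-learn default of only 10 trees (n_estimators=10). More trees usually lower the error further at the cost of training time; a minimal sketch, assuming the same predictors:

# More trees reduce variance, with diminishing returns past a point
reg = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)
reg.fit(train[predictors], train["cnt"])
predictions = reg.predict(test[predictors])
print(numpy.mean((predictions - test["cnt"]) ** 2))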