import pandas
bike_rentals = pandas.read_csv("bike_rental_hour.csv")
bike_rentals.head()
| | instant | dteday | season | yr | mnth | hr | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.81 | 0 | 3 | 13 | 16 |
| 1 | 2 | 2011-01-01 | 1 | 0 | 1 | 1 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0 | 8 | 32 | 40 |
| 2 | 3 | 2011-01-01 | 1 | 0 | 1 | 2 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0 | 5 | 27 | 32 |
| 3 | 4 | 2011-01-01 | 1 | 0 | 1 | 3 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0 | 3 | 10 | 13 |
| 4 | 5 | 2011-01-01 | 1 | 0 | 1 | 4 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0 | 0 | 1 | 1 |
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(bike_rentals["cnt"])
(array([ 6972., 3705., 2659., 1660., 987., 663., 369., 188., 139., 37.]), array([ 1. , 98.6, 196.2, 293.8, 391.4, 489. , 586.6, 684.2, 781.8, 879.4, 977. ]), <a list of 10 Patch objects>)
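The tuple above is just the return value of plt.hist: the bin counts, the bin edges, and the patch objects. The counts already show that most hours have low rental totals. A labeled version of the plot (a sketch; the bin count and labels are arbitrary choices, not from the original notebook) makes the right-skewed shape easier to read:

plt.hist(bike_rentals["cnt"], bins=20)
plt.xlabel("Total rentals in the hour (cnt)")
plt.ylabel("Number of hours")
plt.title("Distribution of hourly rental counts")
plt.show()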
bike_rentals.corr()["cnt"]
instant       0.278379
season        0.178056
yr            0.250495
mnth          0.120638
hr            0.394071
holiday      -0.030927
weekday       0.026900
workingday    0.030284
weathersit   -0.142426
temp          0.404772
atemp         0.400929
hum          -0.322911
windspeed     0.093234
casual        0.694564
registered    0.972151
cnt           1.000000
Name: cnt, dtype: float64
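The strongest correlations with cnt come from casual and registered (which sum to cnt and will be dropped from the predictors), followed by temperature, hour, and humidity. Sorting by absolute value (a sketch; this extra step is not in the original notebook) makes that ranking explicit:

corrs = bike_rentals.corr()["cnt"]
print(corrs.abs().sort_values(ascending=False))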
def assign_label(hour):
    # Bucket the hour of day into one of four time-of-day labels:
    # 1 = morning (6-12), 2 = afternoon (12-18), 3 = evening (18-24), 4 = overnight (0-6).
    if 0 <= hour < 6:
        return 4
    elif 6 <= hour < 12:
        return 1
    elif 12 <= hour < 18:
        return 2
    elif 18 <= hour <= 24:
        return 3
bike_rentals["time_label"] = bike_rentals["hr"].apply(assign_label)
Mean squared error (MSE) is a sensible metric for evaluating our predictions: the target, cnt, is continuous and numeric, which is exactly the kind of data MSE is designed for.
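As a reminder of what the metric computes, here is a minimal sketch of MSE (the helper name mse is hypothetical; mean_squared_error from sklearn.metrics computes the same quantity):

import numpy

def mse(predictions, actual):
    # Mean of the squared differences between predicted and true values.
    return numpy.mean((numpy.asarray(predictions) - numpy.asarray(actual)) ** 2)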
train = bike_rentals.sample(frac=.8)
test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]
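sample(frac=.8) draws a random 80% of the rows for training, and the remaining rows become the test set. A sketch of the same split with a fixed seed (the random_state value is arbitrary; the numbers reported below came from an unseeded split) makes the split reproducible from run to run:

train = bike_rentals.sample(frac=0.8, random_state=1)
test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]
print(train.shape, test.shape)  # roughly an 80/20 split of the 17,379 rows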
from sklearn.linear_model import LinearRegression
predictors = list(train.columns)
predictors.remove("cnt")
predictors.remove("casual")
predictors.remove("registered")
predictors.remove("dteday")
reg = LinearRegression()
reg.fit(train[predictors], train["cnt"])
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
import numpy
predictions = reg.predict(test[predictors])
numpy.mean((predictions - test["cnt"]) ** 2)
16586.154698429491
predictions
array([ -75.31906346, 144.15652539, 125.29713548, ..., 167.94469909, 181.44415684, 165.3047817 ])
test["cnt"]
4          1
10        36
16        93
24        17
36        75
39        76
40        65
45         9
48         2
52        64
68        12
72         2
76       179
80        78
81        97
87       112
88        54
90        35
92         6
109      169
111       89
112       43
113       42
115       11
122      219
133      112
138       17
144       84
146      134
147       63
        ...
17232     34
17243     31
17245      8
17255     32
17265     45
17269     75
17280     63
17289     51
17291    239
17292    191
17298    225
17301    213
17302    128
17304     92
17309     19
17311      3
17312      3
17315     44
17316     49
17327     66
17339     33
17340     74
17343    144
17346    138
17348    123
17349    125
17351     72
17353     36
17354     49
17373    122
Name: cnt, dtype: int64
The error is quite high. One likely reason is that the data contains a few hours with extremely high rental counts but is otherwise dominated by low counts; because MSE penalizes large errors disproportionately, those few big misses drive up the total error.
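Taking the square root (a sketch, not part of the original analysis) converts the MSE back into units of rentals, which makes the size of the error easier to interpret:

rmse = numpy.sqrt(numpy.mean((predictions - test["cnt"]) ** 2))
print(rmse)  # roughly 129 rentals per hour for the MSE of about 16586 above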
from sklearn.tree import DecisionTreeRegressor
reg = DecisionTreeRegressor(min_samples_leaf=5)
reg.fit(train[predictors], train["cnt"])
DecisionTreeRegressor(compute_importances=None, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=5, min_samples_split=2, random_state=None, splitter='best')
predictions = reg.predict(test[predictors])
numpy.mean((predictions - test["cnt"]) ** 2)
2644.2820429330714
reg = DecisionTreeRegressor(min_samples_leaf=2)
reg.fit(train[predictors], train["cnt"])
predictions = reg.predict(test[predictors])
numpy.mean((predictions - test["cnt"]) ** 2)
2964.7288070579207
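The two runs above suggest that the minimum leaf size matters; a small sweep (a sketch; the loop and candidate values are not part of the original experiment) makes the comparison systematic:

for leaf in [1, 2, 5, 10, 20]:
    reg = DecisionTreeRegressor(min_samples_leaf=leaf)
    reg.fit(train[predictors], train["cnt"])
    predictions = reg.predict(test[predictors])
    print(leaf, numpy.mean((predictions - test["cnt"]) ** 2))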
Because it can capture nonlinear relationships between the predictors and cnt, the decision tree regressor is far more accurate than linear regression on this data.
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(min_samples_leaf=5)
reg.fit(train[predictors], train["cnt"])
RandomForestRegressor(bootstrap=True, compute_importances=None, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_density=None, min_samples_leaf=5, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0)
predictions = reg.predict(test[predictors])
numpy.mean((predictions - test["cnt"]) ** 2)
1911.9827104170736
By averaging the predictions of many trees, the random forest removes some of the individual decision tree's overfitting, and its accuracy improves over the decision tree's accordingly.
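To see which columns the forest relies on most, one could inspect the fitted model's feature_importances_ (a sketch; wrapping the array in a pandas Series is just for readable output and is not part of the original project):

importances = pandas.Series(reg.feature_importances_, index=predictors)
print(importances.sort_values(ascending=False))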