{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instantdtedayseasonyrmnthhrholidayweekdayworkingdayweathersittempatemphumwindspeedcasualregisteredcnt
012011-01-01101006010.240.28790.81031316
122011-01-01101106010.220.27270.80083240
232011-01-01101206010.220.27270.80052732
342011-01-01101306010.240.28790.75031013
452011-01-01101406010.240.28790.750011
\n", "
" ], "text/plain": [ " instant dteday season yr mnth hr holiday weekday workingday \\\n", "0 1 2011-01-01 1 0 1 0 0 6 0 \n", "1 2 2011-01-01 1 0 1 1 0 6 0 \n", "2 3 2011-01-01 1 0 1 2 0 6 0 \n", "3 4 2011-01-01 1 0 1 3 0 6 0 \n", "4 5 2011-01-01 1 0 1 4 0 6 0 \n", "\n", " weathersit temp atemp hum windspeed casual registered cnt \n", "0 1 0.24 0.2879 0.81 0 3 13 16 \n", "1 1 0.22 0.2727 0.80 0 8 32 40 \n", "2 1 0.22 0.2727 0.80 0 5 27 32 \n", "3 1 0.24 0.2879 0.75 0 3 10 13 \n", "4 1 0.24 0.2879 0.75 0 0 1 1 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas\n", "\n", "bike_rentals = pandas.read_csv(\"bike_rental_hour.csv\")\n", "bike_rentals.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([ 6972., 3705., 2659., 1660., 987., 663., 369., 188.,\n", " 139., 37.]),\n", " array([ 1. , 98.6, 196.2, 293.8, 391.4, 489. , 586.6, 684.2,\n", " 781.8, 879.4, 977. ]),\n", " )" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": [ "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEACAYAAABcXmojAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n", "AAALEgAACxIB0t1+/AAAFD5JREFUeJzt3V+MnNd93vHvI1FMaFsVK7igKJGFiWJViEVT22xFN7UR\n", "OnVVxkgpXUkyUIJIiNywjt0WSCz6ouJV6gRoYxmFdBH/ESXYTFmnIWhUkEUrXjRAAa+dSA0jipXY\n", "lqi4CZdu40hJilak+OvFnD2cEhR3lxzujDTfDzDgec97zsx5D7nz8D3vOzupKiRJArhp3AOQJE0O\n", "Q0GS1BkKkqTOUJAkdYaCJKkzFCRJ3ZKhkOSvJ3lh6PF6ks8kuT3JsSSvJHkuyfqhPvuTvJrkZJL7\n", "huq3JTne9j12ow5KknRtspLPKSS5CZgH7gV+EfifVfVrST4H/OWqeiTJVuAbwN8B7gK+A8xUVSWZ\n", "Az5dVXNJngG+VFXPjviYJEnXaKXLR58ATlXVa8Au4GCrPwg80Mr3A4eq6nxVnQZOAduTbARuraq5\n", "1u6poT6SpAmw0lB4GDjUyhuqaqGVF4ANrXwncGaozxkGZwyX18+3eknShFh2KCRZC/wj4N9dvq8G\n", "a1D+vgxJeodbs4K2PwP8XlX9sG0vJLmjqs62paFzrX4e2DzUbxODM4T5Vh6un7/8RZIYLpK0QlWV\n", "UTzPSkLhU1xaOgI4CuwBfrX9eWSo/htJ/jWD5aEZYK5daH4jyXZgDtgNfOnKLzXOXFhzEd5aV1Vv\n", "jnEQJDlQVQfGOYZJ4Vxc4lxc4lxcMsr/TC8rFJK8l8FF5l8Yqv4CcDjJXuA08CBAVZ1Ichg4AVwA\n", "9tWlW5z2AU8C64BnvPNIkibLskKhqv4CeP9ldX/CICiu1P5XgF+5Qv3vAX9z5cOUJK0GP9E8uWbH\n", "PYAJMjvuAUyQ2XEPYILMjnsA70Yr+vDaahisjXlNQZKWK0mN6kKzZwqSpM5QkCR1hoIkqTMUJEmd\n", "oSBJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqTO\n", "UJAkdYaCJKkzFCRJnaEgSeqWFQpJ1if5ZpKXk5xIsj3J7UmOJXklyXNJ1g+135/k1SQnk9w3VL8t\n", "yfG277EbcUCSpGu33DOFx4Bnquoe4CeAk8AjwLGquht4vm2TZCvwELAV2Ak8niTteZ4A9lbVDDCT\n", "ZOfIjkSSdN2WDIUktwEfq6qvAlTVhap6HdgFHGzNDgIPtPL9wKGqOl9Vp4FTwPYkG4Fbq2qutXtq\n", "qI8kaQIs50xhC/DDJF9L8vtJfiPJe4ENVbXQ2iwAG1r5TuDMUP8zwF1XqJ9v9ZKkCbFmmW0+DHy6\n", "qr6f5Iu0paJFVVVJanTDOjBU3tEekiSAJDu4QW+MywmFM8CZqvp+2/4msB84m+SOqjrblobOtf3z\n", "wOah/pvac8y38nD9/JVf8sAyhy9J06eqZoHZxe0kj47quZdcPqqqs8BrSe5uVZ8AXgK+BexpdXuA\n", "I618FHg4ydokW4AZYK49zxvtzqUAu4f6SJImwHLOFAB+Efh6krXAfwV+DrgZOJxkL3AaeBCgqk4k\n", "OQycAC4A+6pqcWlpH/AksI7B3UzPjug4JEkjkEvv15NhcG1inGNacxHeWldVb45xEJK0bEmqqrJ0\n", "y6X5iWZJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKkzFCRJnaEgSeoMBUlSZyhIkjpDQZLUGQqSpM5Q\n", "kCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSd2yQiHJ6SR/kOSF\n", "JHOt7vYkx5K8kuS5JOuH2u9P8mqSk0nuG6rfluR42/fY6A9HknQ9lnumUMCOqvpQVd3b6h4BjlXV\n", "3cDzbZskW4GHgK3ATuDxJGl9ngD2VtUMMJNk54iOQ5I0AitZPspl27uAg618EHigle8HDlXV+ao6\n", "DZwCtifZCNxaVXOt3VNDfSRJE2AlZwrfSfKDJL/Q6jZU1UIrLwAbWvlO4MxQ3zPAXVeon2/1kqQJ\n", "sWaZ7f5eVf1xkr8CHEtycnhnVVWSGt2wDgyVd7SHJAkgyQ5u0BvjskKhqv64/fnDJL8N3AssJLmj\n", "qs62paFzrfk8sHmo+yYGZwjzrTxcP3/lVzyw/COQpClTVbPA7OJ2kkdH9dxLLh8leU+SW1v5vcB9\n", "wHHgKLCnNdsDHGnlo8DDSdYm2QLMAHNVdRZ4I8n2duF591AfSdIEWM6Zwgbgt9sNRGuAr1fVc0l+\n", "ABxOshc4DTwIUFUnkhwGTgAXgH1Vtbi0tA94ElgHPFNVz47wWCRJ1ymX3q8nw+DaxDjHtOYivLWu\n", "qt4c4yAkadmSVFVdfofoNfETzZKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1BkKkqTO\n", "UJAkdYaCJKkzFCRJnaEgSeoMBUlSZyhIkjpDQZLUGQqSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJn\n", "KEiSumWFQpKbk7yQ5Ftt+/Ykx5K8kuS5JOuH2u5P8mqSk0nuG6rfluR42/fY6A9FknS9lnum8Fng\n", "BFBt+xHgWFXdDTzftkmyFXgI2ArsBB5PktbnCWBvVc0AM0l2juYQJEmjsmQoJNkEfBL4MrD4Br8L\n", "ONjKB4EHWvl+4FBVna+q08ApYHuSjcCtVTXX2j011EeSNCGWc6bw68AvAReH6jZU1UIrLwAbWvlO\n", "4MxQuzPAXVeon2/1kqQJsuZqO5P8LHCuql5IsuNKbaqqktSV9l27A0PlHe0hSQJo78c7bsRzXzUU\n", "gJ8EdiX5JPDjwF9K8jSwkOSOqjrblobOtfbzwOah/psYnCHMt/Jw/fzbv+yBFRyCJE2XqpoFZhe3\n", "kzw6que+6vJRVX2+qjZX1RbgYeB3qmo3cBTY05rtAY608lHg4SRrk2wBZoC5qjoLvJFke7vwvHuo\n", "jyRpQix1pnC5xWWiLwCHk+wFTgMPAlTViSSHGdypdAHYV1WLffYBTwLrgGeq6tnrG7okadRy6T17\n", "MgyuT4xzTGsuwlvrqurNMQ5CkpYtSVVVlm65ND/RLEnqDAVJUmcoSJI6Q0GS1BkKkqRupbekTov/\n", "e+n3+I3HqO4kkKSVMBTe1jhvizUPJI2Hy0eSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOkNB\n", "ktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJI6Q0GS1F01FJL8eJLvJXkxyYkk/7LV357k\n", "WJJXkjyXZP1Qn/1JXk1yMsl9Q/Xbkhxv+x67cYckSbpWVw2Fqvo/wMer6oPATwAfT/JR4BHgWFXd\n", "DTzftkmyFXgI2ArsBB7Ppe+1fALYW1UzwEySnTfigCRJ127J5aOq+t+tuBa4GfgRsAs42OoPAg+0\n", "8v3Aoao6X1WngVPA9iQbgVuraq61e2qojyRpQiwZCkluSvIisAB8t6peAjZU1UJrsgBsaOU7gTND\n", "3c8Ad12hfr7VS5ImyJqlGlTVReCDSW4Dvp3k45ftryQj/pb7A0PlHe0hSQJIsoMb9Ma4ZCgsqqrX\n", "k/wHYBuwkOSOqjrblobOtWbzwOahbpsYnCHMt/Jw/fzbv9qB5Q5LkqZOVc0Cs4vbSR4d1XMvdffR\n", "+xfvLEqyDvgHwAvAUWBPa7YHONLKR4GHk6xNsgWYAeaq6izwRpLt7cLz7qE+kqQJsdSZwkbgYJKb\n", "GATI01X1fJIXgMNJ9gKngQcBqupEksPACeACsK+qFpeW9gFPAuuAZ6rq2VEfjCTp+uTSe/ZkGFyf\n", "GOeY1lyEt24a7xhCVWXpdpI0eN8c1XuGn2iWJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJK6ZX+iWatr\n", "9L86ZOW8LVaaPobCxBp3JpgH0jRy+UiS1BkKkqTOUJAkdYaCJKkzFCRJnaEgSeoMBUlSZyhIkjpD\n", "QZLUGQqSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVK3ZCgk2Zzku0leSvKHST7T6m9PcizJK0meS7J+\n", "qM/+JK8mOZnkvqH6bUmOt32P3ZhDkiRdq+WcKZwH/llV/Q3gI8A/SXIP8AhwrKruBp5v2yTZCjwE\n", "bAV2Ao8nWfzGlieAvVU1A8wk2TnSo5EkXZclQ6GqzlbVi63858DLwF3ALuBga3YQeKCV7wcOVdX5\n", "qjoNnAK2J9kI3FpVc63dU0N9JEkTYEXXFJJ8APgQ8D1gQ1UttF0LwIZWvhM4M9TtDIMQubx+vtVL\n", "kibEsr+jOcn7gN8CPltVf3ZpRQiqqkb7RfMHhso72kOSBJBkBzfojXFZoZDkFgaB8HRVHWnVC0nu\n", "qKqzbWnoXKufBzYPdd/E4AxhvpWH6+ev/IoHljl8SZo+VTULzC5uJ3l0VM+9nLuPAnwFOFFVXxza\n", "dRTY08p7gCND9Q8nWZtkCzADzFXVWeCNJNvbc+4e6iNJmgCpuvqqT5KPAv8R+ANgsfF+YA44DPxV\n", "4DTwYFX9aevzeeDngQsMlpu+3eq3AU8C64BnquozV3i9uvQy47DmIrx103jHEMb7+oMxVFWWbidp\n", "3JLUqH5elwyF1WYogKEgaSVGGQp+olmS1BkKkqTOUJAkdYaCJKkzFCRJnaEgSeqW/WsuNH1G+6tL\n", "ro23xUqry1DQVYw7E8wDabW5fCRJ6gwFSVJnKEiSOkNBktQZCpKkzlCQJHWGgiSpMxQkSZ2hIEnq\n", "DAVJUmcoSJI6Q0GS1BkKkqTOUJAkdYaCJKlbMhSSfDXJQpLjQ3W3JzmW5JUkzyVZP7Rvf5JXk5xM\n", "ct9Q/bYkx9u+x0Z/KJKk67WcM4WvATsvq3sEOFZVdwPPt22SbAUeAra2Po8nWfymlCeAvVU1A8wk\n", "ufw5JUljtmQoVNXvAj+6rHoXcLCVDwIPtPL9wKGqOl9Vp4FTwPYkG4Fbq2qutXtqqI8kaUJc6zWF\n", "DVW10MoLwIZWvhM4M9TuDHDXFernW70kaYJc93c0V1WN/gveDwyVd7SHptHo/22tTFX5RdGaOEl2\n", "cIPeGK81FBaS3FFVZ9vS0LlWPw9sHmq3icEZwnwrD9fPv/3TH7jGYendZ5yZYB5oMlXVLDC7uJ3k\n", "0VE997UuHx0F9rTyHuDIUP3DSdYm2QLMAHNVdRZ4I8n2duF591AfSdKEWPJMIckh4KeA9yd5DfgX\n", "wBeAw0n2AqeBBwGq6kSSw8AJ4AKwr6oW/6u3D3gSWAc8U1XPjvZQJEnXK5fesyfDYA15nGNacxHe\n", "umn8yxbj/ntxDBCvKegdIUmN6t+qn2iWJHWGgiSpMxQkSZ2hIEnqDAVJUmcoSJK66/41F9K72bh/\n", "zQb4qza0ugwF6arGnQnmgVaXy0eSpM5QkCR1hoIkqTMUJEmdoSBJ6gwFSVJnKEiSOj+nIE04P0Cn\n", "1WQoSBNv3JlgHkwTl48kSZ2hIEnqDAVJUmcoSJI6LzRLWtK474Dy7qfVs+qhkGQn8EXgZuDLVfWr\n", "qz0GSSs1zkzI2EMJpieYVnX5KMnNwL8BdgJbgU8luWc1x/DOMTvuAUyQ2XEPYILMjnsAY1JXeHz3\n", "bepvxGN6rPY1hXuBU1V1uqrOA78J3L/KY3iHmB33ACbI7LgHMEFmxz2ACTI77gG8K612KNwFvDa0\n", "fabVSZImwGpfU1jmedhPv35jh3E1b902vteWNKmm5bpGqlbvOJN8BDhQVTvb9n7g4vDF5kmYeEl6\n", "pxlVYKx2KKwB/gvw94E/AuaAT1XVy6s2CEnS21rV5aOqupDk08C3GdyS+hUDQZImx6qeKUiSJtvE\n", "/JqLJDuTnEzyapLPjXs8N1qSzUm+m+SlJH+Y5DOt/vYkx5K8kuS5JOuH+uxv83MyyX3jG/3oJbk5\n", "yQtJvtW2p3IeAJKsT/LNJC8nOZFk+zTORzuul5IcT/KNJD82TfOQ5KtJFpIcH6pb8fEn2dbm8NUk\n", "jy35wlU19geDpaRTwAeAW4AXgXvGPa4bfMx3AB9s5fcxuNZyD/BrwC+3+s8BX2jlrW1ebmnzdAq4\n", "adzHMcL5+OfA14GjbXsq56Ed40Hg51t5DXDbtM1HO5b/BvxY2/63wJ5pmgfgY8CHgONDdSs5/sWV\n", "oDng3lZ+Bth5tdedlDOFqftQW1WdraoXW/nPgZcZfGZjF4M3BdqfD7Ty/cChqjpfVacZ/KXfu6qD\n", "vkGSbAI+CXyZS9/oMnXzAJDkNuBjVfVVGFyHq6rXmb75eAM4D7yn3aDyHgY3p0zNPFTV7wI/uqx6\n", "Jce/PclG4Naqmmvtnhrqc0WTEgpT/aG2JB9g8D+C7wEbqmqh7VoANrTynQzmZdG7aY5+Hfgl4OJQ\n", "3TTOA8AW4IdJvpbk95P8RpL3MmXzUVV/Avwr4H8wCIM/rapjTNk8XMFKj//y+nmWmJdJCYWpvdqd\n", "5H3AbwGfrao/G95Xg/O9q83NO37ekvwscK6qXuBtvvdxGuZhyBrgw8DjVfVh4C+AR4YbTMN8JPlr\n", "wD9lsBRyJ/C+JP94uM00zMPVLOP4r8mkhMI8sHloezP/f7q9KyW5hUEgPF1VR1r1QpI72v6NwLlW\n", "f/kcbWp173Q/CexK8t+BQ8BPJ3ma6ZuHRWeAM1X1/bb9TQYhcXbK5uNvA/+pqv5XVV0A/j3wd5m+\n", "ebjcSn4uzrT6TZfVX3VeJiUUfgDMJPlAkrXAQ8DRMY/phkoS4CvAiar64tCuowwuqNH+PDJU/3CS\n", "tUm2ADMMLiC9o1XV56tqc1VtAR4GfqeqdjNl87Coqs4CryW5u1V9AngJ+BbTNR8ngY8kWdd+Vj4B\n", "nGD65uFyK/q5aP+e3mh3sAXYPdTnysZ9hX3oqvrPMLgD5xSwf9zjWYXj/SiDNfQXgRfaYydwO/Ad\n", "4BXgOWD9UJ/Pt/k5CfzDcR/DDZiTn+LS3UfTPA9/C/g+8J8Z/A/5tmmcD+CXGQTicQYXVW+Zpnlg\n", "cOb8R8CbDK65/ty1HD+wrc3hKeBLS72uH16TJHWTsnwkSZoAhoIkqTMUJEmdoSBJ6gwFSVJnKEiS\n", "OkNBktQZCpKk7v8BIgy2anPl5soAAAAASUVORK5CYII=\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "plt.hist(bike_rentals[\"cnt\"])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "instant 0.278379\n", "season 0.178056\n", "yr 0.250495\n", "mnth 0.120638\n", "hr 0.394071\n", "holiday -0.030927\n", "weekday 0.026900\n", "workingday 0.030284\n", "weathersit -0.142426\n", "temp 0.404772\n", "atemp 0.400929\n", "hum -0.322911\n", "windspeed 0.093234\n", "casual 0.694564\n", "registered 0.972151\n", "cnt 1.000000\n", "Name: cnt, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bike_rentals.corr()[\"cnt\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def assign_label(hour):\n", " if hour >=0 and hour < 6:\n", " return 4\n", " elif hour >=6 and hour < 12:\n", " return 1\n", " elif hour >= 12 and hour < 18:\n", " return 2\n", " elif hour >= 18 and hour <=24:\n", " return 3\n", "\n", "bike_rentals[\"time_label\"] = bike_rentals[\"hr\"].apply(assign_label)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Error metric\n", "\n", "The mean squared error metric makes the most sense to evaluate our error. MSE works on continuous numeric data, which fits our data quite well." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "train = bike_rentals.sample(frac=.8)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "\n", "predictors = list(train.columns)\n", "predictors.remove(\"cnt\")\n", "predictors.remove(\"casual\")\n", "predictors.remove(\"registered\")\n", "predictors.remove(\"dteday\")\n", "\n", "reg = LinearRegression()\n", "\n", "reg.fit(train[predictors], train[\"cnt\"])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "16586.154698429491" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy\n", "predictions = reg.predict(test[predictors])\n", "\n", "numpy.mean((predictions - test[\"cnt\"]) ** 2)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ -75.31906346, 144.15652539, 125.29713548, ..., 167.94469909,\n", " 181.44415684, 165.3047817 ])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "actual" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "4 1\n", "10 36\n", "16 93\n", "24 17\n", "36 75\n", "39 76\n", "40 65\n", "45 9\n", "48 2\n", "52 64\n", "68 12\n", "72 2\n", "76 179\n", "80 78\n", "81 97\n", "87 112\n", "88 54\n", "90 35\n", "92 6\n", "109 169\n", "111 89\n", "112 43\n", "113 42\n", "115 11\n", "122 219\n", "133 112\n", "138 17\n", "144 84\n", "146 134\n", "147 63\n", " ... \n", "17232 34\n", "17243 31\n", "17245 8\n", "17255 32\n", "17265 45\n", "17269 75\n", "17280 63\n", "17289 51\n", "17291 239\n", "17292 191\n", "17298 225\n", "17301 213\n", "17302 128\n", "17304 92\n", "17309 19\n", "17311 3\n", "17312 3\n", "17315 44\n", "17316 49\n", "17327 66\n", "17339 33\n", "17340 74\n", "17343 144\n", "17346 138\n", "17348 123\n", "17349 125\n", "17351 72\n", "17353 36\n", "17354 49\n", "17373 122\n", "Name: cnt, dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test[\"cnt\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Error\n", "\n", "The error is very high, which may be due to the fact that the data has a few extremely high rental counts, but otherwise mostly low counts. Larger errors are penalized more with MSE, which leads to a higher total error." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DecisionTreeRegressor(compute_importances=None, criterion='mse',\n", " max_depth=None, max_features=None, max_leaf_nodes=None,\n", " min_density=None, min_samples_leaf=5, min_samples_split=2,\n", " random_state=None, splitter='best')" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.tree import DecisionTreeRegressor\n", "\n", "reg = DecisionTreeRegressor(min_samples_leaf=5)\n", "\n", "reg.fit(train[predictors], train[\"cnt\"])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2644.2820429330714" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = reg.predict(test[predictors])\n", "\n", "numpy.mean((predictions - test[\"cnt\"]) ** 2)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2964.7288070579207" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reg = DecisionTreeRegressor(min_samples_leaf=2)\n", "\n", "reg.fit(train[predictors], train[\"cnt\"])\n", "\n", "predictions = reg.predict(test[predictors])\n", "\n", "numpy.mean((predictions - test[\"cnt\"]) ** 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Decision tree error\n", "\n", "By taking the nonlinear predictors into account, the decision tree regressor appears to have much higher accuracy than linear regression." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor(bootstrap=True, compute_importances=None,\n", " criterion='mse', max_depth=None, max_features='auto',\n", " max_leaf_nodes=None, min_density=None, min_samples_leaf=5,\n", " min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,\n", " random_state=None, verbose=0)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "\n", "reg = RandomForestRegressor(min_samples_leaf=5)\n", "reg.fit(train[predictors], train[\"cnt\"])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1911.9827104170736" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = reg.predict(test[predictors])\n", "\n", "numpy.mean((predictions - test[\"cnt\"]) ** 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random forest error\n", "\n", "By removing some of the sources of overfitting, the random forest accuracy is improved over the decision tree accuracy." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.2" } }, "nbformat": 4, "nbformat_minor": 0 }