{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Introducing Data Science Workflows" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "train = pd.read_csv(\"train.csv\")\n", "holdout = pd.read_csv(\"test.csv\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing the Data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# %load functions.py\n", "def process_missing(df):\n", " \"\"\"Handle various missing values from the data set\n", "\n", " Usage\n", " ------\n", "\n", " holdout = process_missing(holdout)\n", " \"\"\"\n", " df[\"Fare\"] = df[\"Fare\"].fillna(train[\"Fare\"].mean())\n", " df[\"Embarked\"] = df[\"Embarked\"].fillna(\"S\")\n", " return df\n", "\n", "def process_age(df):\n", " \"\"\"Process the Age column into pre-defined 'bins' \n", "\n", " Usage\n", " ------\n", "\n", " train = process_age(train)\n", " \"\"\"\n", " df[\"Age\"] = df[\"Age\"].fillna(-0.5)\n", " cut_points = [-1,0,5,12,18,35,60,100]\n", " label_names = [\"Missing\",\"Infant\",\"Child\",\"Teenager\",\"Young Adult\",\"Adult\",\"Senior\"]\n", " df[\"Age_categories\"] = pd.cut(df[\"Age\"],cut_points,labels=label_names)\n", " return df\n", "\n", "def process_fare(df):\n", " \"\"\"Process the Fare column into pre-defined 'bins' \n", "\n", " Usage\n", " ------\n", "\n", " train = process_fare(train)\n", " \"\"\"\n", " cut_points = [-1,12,50,100,1000]\n", " label_names = 
[\"0-12\",\"12-50\",\"50-100\",\"100+\"]\n", " df[\"Fare_categories\"] = pd.cut(df[\"Fare\"],cut_points,labels=label_names)\n", " return df\n", "\n", "def process_cabin(df):\n", " \"\"\"Reduce the Cabin column to its first letter ('Cabin_type'), using 'Unknown' when missing \n", "\n", " Usage\n", " ------\n", "\n", " train = process_cabin(train)\n", " \"\"\"\n", " df[\"Cabin_type\"] = df[\"Cabin\"].str[0]\n", " df[\"Cabin_type\"] = df[\"Cabin_type\"].fillna(\"Unknown\")\n", " df = df.drop('Cabin',axis=1)\n", " return df\n", "\n", "def process_titles(df):\n", " \"\"\"Extract and categorize the title from the name column \n", "\n", " Usage\n", " ------\n", "\n", " train = process_titles(train)\n", " \"\"\"\n", " titles = {\n", " \"Mr\" : \"Mr\",\n", " \"Mme\": \"Mrs\",\n", " \"Ms\": \"Mrs\",\n", " \"Mrs\" : \"Mrs\",\n", " \"Master\" : \"Master\",\n", " \"Mlle\": \"Miss\",\n", " \"Miss\" : \"Miss\",\n", " \"Capt\": \"Officer\",\n", " \"Col\": \"Officer\",\n", " \"Major\": \"Officer\",\n", " \"Dr\": \"Officer\",\n", " \"Rev\": \"Officer\",\n", " \"Jonkheer\": \"Royalty\",\n", " \"Don\": \"Royalty\",\n", " \"Sir\" : \"Royalty\",\n", " \"Countess\": \"Royalty\",\n", " \"Dona\": \"Royalty\",\n", " \"Lady\" : \"Royalty\"\n", " }\n", " extracted_titles = df[\"Name\"].str.extract(' ([A-Za-z]+)\\.',expand=False)\n", " df[\"Title\"] = extracted_titles.map(titles)\n", " return df\n", "\n", "def create_dummies(df,column_name):\n", " \"\"\"Create Dummy Columns (One Hot Encoding) from a single Column\n", "\n", " Usage\n", " ------\n", "\n", " train = create_dummies(train,\"Age\")\n", " \"\"\"\n", " dummies = pd.get_dummies(df[column_name],prefix=column_name)\n", " df = pd.concat([df,dummies],axis=1)\n", " return df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def pre_process(df):\n", " df = process_missing(df)\n", " df = process_age(df)\n", " df = process_fare(df)\n", " df = process_titles(df)\n", " df = process_cabin(df)\n", "\n", " for col in 
[\"Age_categories\",\"Fare_categories\",\n", " \"Title\",\"Cabin_type\",\"Sex\"]:\n", " df = create_dummies(df,col)\n", " \n", " return df\n", "\n", "train = pre_process(train)\n", "holdout = pre_process(holdout)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exploring the Data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 891 entries, 0 to 890\n", "Data columns (total 3 columns):\n", "SibSp 891 non-null int64\n", "Parch 891 non-null int64\n", "Survived 891 non-null int64\n", "dtypes: int64(3)\n", "memory usage: 21.0 KB\n" ] } ], "source": [ "explore_cols = [\"SibSp\",\"Parch\",\"Survived\"]\n", "explore = train[explore_cols].copy()\n", "explore.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "explore.drop(\"Survived\",axis=1).plot.hist(alpha=0.5,bins=8)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "explore[\"familysize\"] = explore[[\"SibSp\",\"Parch\"]].sum(axis=1)\n", "explore.drop(\"Survived\",axis=1).plot.hist(alpha=0.5,bins=10)\n", "plt.xticks(range(11))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXEAAAEGCAYAAACToKXdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFuJJREFUeJzt3X2UXHWd5/H3NyCIECGBSJCY4MAGNmhASQIIaCMPiQqSswhJRx53Z0DPOsqZXdEZGRJ1cPDIgXnQZUHUxYGEw6pjkhnEjIwNBiZIeEg0kgQZCSEBJA9okLAE8t0/quipVPqhOn2rq2/n/TqnD3Vv/ereb/VpPrn967q/b2QmkqRyGtbqAiRJu84Ql6QSM8QlqcQMcUkqMUNckkpsz4E8WUT4URhJ2gWZGV3tH/Ar8cxs2tfs2bObevxmf1m/9e+OtVt/7189cTpFkkrMEJekEhtSId7W1tbqEvrF+lurzPWXuXaw/v6I3uZbCj1ZRA7k+SRpKIgIsps/bA7op1MkDW6HHXYYa9asaXUZu61x48bx1FNP9ek1XolL6lS94mt1Gbut7r7/PV2JD6k5cUna3RjiklRihrgklZghLkklZohLUp1PfvKTXHPNNYUf94tf/CIXXnhhocc0xCX1aPTow4iIpn2NHn1Yw7UsXryYk046iQMOOICDDjqIU045hYcffrjw93zjjTfyhS98ofDjQuWTJkXyc+KSevT882uA5n3s8PnnGwu1LVu2cPbZZ3PTTTdx3nnn8eqrr/Kzn/2Mvffeu8/nzMzCw7RVvBKXVAqrV68mIjj//POJCPbee29OP/103vWud+00TbFmzRqGDRvG9u3bATj11FO56qqrOPnkk9l333352te+xuTJk3c4/g033MD06dMBuPTSS7n66qsBmDBhAnfddVfnuNdff523ve1tPPbYYwAsWbKEk046iREjRvCe97yHe++9t3PsU089RVtbG/vvvz9Tp05lw4YNhX9fDHFJpTB+/Hj22GMPLrnkEu6++25efPHFHZ6vv7Ku377tttu45ZZb2LJlC5/4xCdYvXo1Tz75ZOfz8+bN4+Mf//hO521vb2fu3Lmd23fffTejRo3i2GOPZd26dZx11llcffXVbN68meuuu45zzz2XjRs3AjBr1iwmT57Mhg0buOqqq7j11lv7/X2oZ4hLKoXhw4ezePFihg0bxmWXXcaoUaOYPn06v/3tbxt6/SWXXMJRRx3FsGHDeOtb38o555zDvHnzAHjiiSdYtWoVZ5999k6vmzVrFgsWLOCVV14BKmHf3t4OwO23385HPvIRpk6dCsBpp53GpEmTuOuuu1i7di1Lly7lS1/6Em9605s45ZRTujx+fxnikkrjyCOP5Nvf/jZPP/00K1asYP369VxxxRUNvfYd73jHDtvt7e2dIT537lymT5/Om9/85p1ed/jhhzNhwgQWLlzI1q1bWbBgQecV+5o1a7jzzjsZOXIkI0eOZMSIEdx///08++yzrF+/nhEjRrDPPvt0HmvcuHG7+ta71VCIR8S0iFgZEasj4nNdPH9ARPwgIpZFxJKImFB4pZJUY/z48Vx88cWsWLGC/fbbj5dffrnzuWeffXan8fXTK2eccQYvvPACy5Yt44477mDWrFndnmvmzJnMnTuX+fPnc/TRR/POd74TqPzDcNFFF7Fp0yY2bdrE5s2b2bJlC1deeSWHHHIImzdvZuvWrZ3Hefrpp/v7tnfSa4hHxDDg68BU4GigPSKOqhv2F8CjmXkMcDHwd0UXKmn3tmrVKq6//nrWrVsHwNq1a5k3bx4nnngixxxzDPfddx9r167ld7/7Hddee22vx9tzzz0577zz+OxnP8vmzZs544wzuh07c+ZMFi1axI033rhD2F9wwQUsXLiQRYsWsX37dl555RXuvfde1q9fz9ixY5k0aRKzZ89m27ZtLF68mIULF/b/G1GnkSvxKcATmbkmM7cBdwDn1I2ZAPwrQGauAg6LiFGFViqpJQ4+eBwQTfuqHL93w4cP58EHH+T4449n+PDhvO9972PixIlcd911nH766cyYMYOJEyc
yefLkneaeu/s4YXt7O/fccw/nn38+w4YN63b86NGjOfHEE1myZAkzZszo3D9mzBjmz5/PV77yFUaNGsW4ceO47rrrOj8Vc/vtt7NkyRIOPPBAvvzlL3PxxRc39F77otelaCPiXGBqZl5W3b4AmJKZn64Zcw3w5sz8HxExBVgMHJ+Zj9Ydy6VopUHMpWhba1eWoi3qZp9rgb+NiEeAXwCPAq93NXDOnDmdj9va2krflkmSitbR0UFHR0dDYxu5Ej8BmJOZ06rbnwcyM7/aw2t+A7w7M1+q2++VuDSIeSXeWs1qCvEQcEREjIuIvYCZwIK6E+wfEW+qPv4T4N76AJckFa/X6ZTMfD0iPgUsohL638rMxyPi8srTeTPwn4FbI2I7sAL4b80sWpJUYY9NSZ2cTmkte2xK0m7GpWgldRo3btyQWaK1jHbltnynUyRpkHM6RZKGKENckkrMEJekEjPEJanEDHFJKjFDXJJKzBCXpBIzxCWpxAxxSSoxQ1ySSqyobvcHRsSPIuKxiPhFRFxSeKWSpJ000tlnGLAaOA1YT6VJxMzMXFkzZjaVHpt/HhEHAauAgzPztbpjuXaKJPVRf9dOaaTb/XPA8Orj4cDG+gCXJBWvkaVoDwXW1mw/QyXYa30TuCci1gP7ATOKKU+S1JOi1hP/c2BZZp4aEYcD/xIRE7vqs2m3e0nq2YB3u4+Iu4BrMvP+6vY9wOcyc2ndsZwTl6Q+anq3e+Bx4PTqyQ4GxgP/vuslS5IaUVS3+78GvhMRy4AArszMTc0sXJJkezZJGvRszyZJQ5QhLkklZohLUokZ4pJUYoa4JJWYIS5JJWaIS1KJGeKSVGKGuCSVmCEuSSVmiEtSiRniklRihrgklVhR3e7/Z0Q8GhGPVLvdvxYRBxRfriSpViHd7uvGnwVckZmnd/GcS9FKUh8NRLf7Wu3AvL6XKUnqq0ZCvKtu94d2NTAi9gGmAd/vf2mSpN4U1e3+DWcDizPzxe4G2O1ekno24N3ua8b+ALgzM+/o5ljOiUtSH/U0J95IiO8BrKLyh81ngZ8D7Zn5eN24/al0uB+TmVu7OZYhLkl91FOIF9XtHmA68OPuAlySVDy73UvSIGe3e0kaogxxSSoxQ1ySSswQl6QSM8QlqcQMcUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJKzBCXpBIzxCWpxArpdl8d01bteP/LiPhpsWVKkrpSSLf7akOIB4AzM3NdRByUmRu6OJZL0UpSHw1Et/tZwPczcx1AVwEuSSpeUd3uxwMjI+KnEfFQRFxYVIGSpO4V1e1+T+C9wAeBfYF/i4h/y8xfF3R8SVIXGgnxdcDYmu0x1X21ngE2ZOYrwCsRcR9wDLBTiM+54ILOx20TJ9I2cWJfa5akIa1j+XI6li9vaGwh3e4j4ijg74FpwN7Ag8CMzPxV3bEyly5t/J1IkohJk5rb7T4zV0bEj4HlwOvAzfUBLkkq3sB3u/dKXJL6pKcrce/YlKQSM8QlqcQMcUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJKzBCXpBIzxCWpxIpailZdOPPMs9i46bmmHf/AkaNZtOifmnZ8SYOfId5ElQBv3loxGzdNatqxJZWD0ymSVGKGuCSVWEMhHhHTImJlRKyOiM918fwHIuLFiHik+nVV8aVKkur1OiceEcOAr1Pp7LMeeCgi5mfmyrqh92XmR5tQoySpG41ciU8BnsjMNZm5DbgDOKeLcV0uWC5Jap5GQvxQYG3N9jPVffVOjIjHIuKfI2JCIdVJknpU1EcMHwbGZubLEfEh4IfA+K4Gzrnpps7HbccdR9skPyYnSbU6li6l4+GHGxrbSLf7E4A5mTmtuv15Kg2Sv9rDa34DHJeZm+r271Y9No+bNIlmfk4cJvHwbvT9lHZX/e2x+RBwRESMi4i9gJnAgh1OEHFwzeMpVP5x2IQkqal6nU7JzNcj4lPAIiqh/63MfDwiLq88nTcDH4u
ITwLbgK3AjGYWLUmqaGhOPDPvBo6s23dTzeNvAN8otjRJUm+8Y1OSSswQl6QSM8QlqcRcilbdcj10afAzxNUt10OXBj+nUySpxAxxSSoxQ1ySSswQl6QSM8QlqcQMcUkqMUNckkrMEJekEiuk233NuMkRsS0i/ktxJUqSutNriNd0u58KHA20R8RR3Yy7Fvhx0UVKkrpWZLf7PwW+B/y2wPokST0opNt9RLwdmJ6ZNwJd9oGTJBWvqAWw/gaonSvvNsjtdi9JPetLt/tGQnwdMLZme0x1X61JwB0REcBBwIciYltmLqgbx5zLL2+oMEnaXbVNmrTDBe4Xv/nNbsc2EuKd3e6BZ6l0u2+vHZCZf/TG44j4DrCwqwCXJBWrqG73O7ykCXVKkrpQSLf7uv3/tYC6JEkN8I5NSSoxQ1ySSswQl6QSM8QlqcQMcUkqMUNckkrMEJekEitq7ZSmOPPMs9i46bmmHf/AkaNZtOifmnZ8SWq2QR3ilQBf2sTju/iWpHJzOkWSSswQl6QSM8QlqcQMcUkqsUK63UfERyNiWUQ8GhFLI+KDxZcqSarX66dTarrdnwasBx6KiPmZubJm2E/eaAIREe8G/hE4ogn1SpJqFNLtPjNfrtncD9hQXImSpO4U0u0eICKmR8TjwF3Ap4spT5LUk8Ju9snMHwI/jIiTgX+grhPQG+bccEPn47aJE2mbOLHbYx4EwAtFldi1F5p3fOtvQBPrl8qqY/lyOpYvb2hsZPbcEjMiTgDmZOa06vbnqfTW/GoPr3kSmJKZG+v2Z2/nqxtPc1t2Bn2pp89Ht/7eztDU+qWhIiLIzOjquUamUzq73UfEXlS63e/QyT4iDq95/F6A+gCXJBWvqG7350bERcCrwB+AGc0sWpJU0et0SqEnczql6DNYv7Qb6O90iiRpkDLEJanEDHFJKjFDXJJKzBCXpBIzxCWpxAxxSSoxQ1xD1ujRhxERTfsaPfqwVr9FyZt9vFmmh6Nbf29n8GYlDQhv9pGkIcoQl6QSM8QlqcQMcUkqMUNckkqsoRCPiGkRsTIiVkfE57p4flZELKt+La52vJckNVmvIR4Rw4CvA1OBo4H2iDiqbti/A+/PzGOAvwK+WXShkqSdNXIlPgV4IjPXZOY24A7gnNoBmbkkM39X3VwCHFpsmZKkrjQS4ocCa2u2n6HnkP5j4Ef9KUqS1Jhee2z2RUScClwKnNzdmDlz5nQ+bmtro62trcgSJKn0Ojo66OjoaGhsr7fdR8QJwJzMnFbd/jyVBslfrRs3Efg+MC0zn+zmWN52X+wZrL+no5e8fukN/b3t/iHgiIgYFxF7ATOBBXUnGEslwC/sLsAlScXrdTolM1+PiE8Bi6iE/rcy8/GIuLzydN4M/CUwEvhfUbn82ZaZU5pZuCTJVQz9db6no1t/b2dwOkUDwlUMJWmIMsQlqcQMcWkQsiuRGuWcuHOy3R/d+ns7Q9PqL3PtKp5z4pI0RBniklRihrgklZghLkklZohLUokZ4pJUYoa4JJWYIS5JJWaIS1KJFdXt/siIeCAiXomIPyu+TElSV3pdT7ym2/1pwHrgoYiYn5kra4ZtBP4UmN6UKiVJXSqq2/2GzHwYeK0JNUqSutGMbveSpAFSaLf7RtjtXpJ61pJu99XnZgNbMvP6bo7lUrTFnsH6ezp6iesvc+0qXtO73defbxdqlCTtgkK63UfEwcBSYDiwPSI+A0zIzJeaWbwk7e7s7OOv890f3fp7O4PTKRoQdvaRpCHKEJekEjPEJanEDHFJKjFDXJJKzBCXpBIzxCWpxAxxSSoxQ1ySSswQl6QSM8QlqcQMcUkqMUNckkqskG731TF/FxFPRMRjEXFssWU2qqM1py1MR6sL6KeOVhfQTx2tLqAfOlpdQL802sVmsGpl/b2GeE23+6nA0UB7RBxVN+ZDwOGZ+Z+Ay4H/3YRaG9DRmtMWpqPVBfRTR6sL6KeOVhfQDx2tLqBfDPFdV0i3++r
2dwEy80Fg/2qjCElSExXV7b5+zLouxkiSCtZIo+RzgamZeVl1+wJgSmZ+umbMQuCvM/OB6vZPgCsz85G6Y9lKRJJ2QXedfXrtsUnlqnpszfaY6r76Me/oZUy3RUiSdk1R3e4XABcBRMQJwIuZ+XyhlUqSdlJIt/vMvCsiPhwRvwb+AFza3LIlSTDA3e4lScXyjk1JKjFDXJJKrJFPpwxa1TtHz+E/PpO+DliQmY+3rqrdR/X7fyjwYGa+VLN/Wmbe3brKehcRJwGbM/NXEfEBYBLwWGbe0+LSdklEfDczL2p1HbsiIk6mclPhLzNzUavr6U3NBzzWZ+ZPImIW8D7gceDm6k2RA1dPWefEq2u4tFO5g/SZ6u4xVL65d2Tmta2qrb8i4tLM/E6r6+hJRHwa+O9UfnCPBT6TmfOrzz2Sme9tZX09iYivAB+k8ptoB/B+4J+BM6hcBFzXuup6FxH1nw4L4FTgXwEy86MDXlQfRMTPM3NK9fGfUPk5+kfgTGDhYP9/NyJup3IB/BbgRWA/4AfAaVQy9eIBrafEIb4aOLr+X73qv5Irquu4lFJEPJ2ZY3sf2ToR8QvgxMx8KSIOA74H/ENm/m1EPJqZ72lpgT2IiBXARGBv4DlgTGb+PiL2AZZk5jEtLbAXEfEI8CvgFiCphPg8KhcwZOa9rauud7U/HxHxEPDhzHwhIval8v1/d2sr7FlELM/MiRGxJ5Xf/t9e/RRfAMsyc+JA1lPm6ZTtwNuBNXX7D6k+N6hFxPLungLKsO7MsDemUDLzqYhoA74XEeOovIfB7NXMfB14OSKezMzfA2Tm1ogY9D87VKZ+PgN8AfhsZj4WEVsHe3jXGBYRI6j8JrRHZr4AkJl/iIjXWltaQ/aoXizuS+VqfH9gE5WLgj0Gupgyh/gVwD0R8QT/sW7LWOAI4FMtq6pxB1NZGXJz3f4AHhj4cvrs+Yg4NjMfA6hekZ8FfBsY1FdSwKsR8ZbMfBk47o2dEbE/lSvbQS0ztwM3RMT/rf73ecr1//L+wMNUftYzIg7JzGcjYj8G/wUAwG3ASuBV4M+AxRFxP3AC8H8GupjSTqdA5zK5U9jxD5sPVa+yBrWI+Bbwncxc3MVzczNzVgvKalhEjAFey8znunjupMy8vwVlNSQi9s7M/9fF/oOAQzLzFy0oa5dFxEeAkzLzL1pdS39ExFuAgzPzN62upTfV3zh/n5mbI+KPqPx2tCozlw14LWUOcUna3fk5cUkqMUNckkrMEJekEjPENSRFxBci4pcRsSwiHomIKRFx8xv9YSNiSzevOz4ilkTEoxGxIiKuHtjKpb4p08eSpIZU17T/MHBsZr4WESOBvd7oTlXV3V/0bwU+lpm/rN68cWSTy5X6xStxDUWHABsy8zWAzNyUmc9FxE8j4o3lACIirq9erf9LRBxY3T8KeL76uszMldXBsyPiuxHxQESsiog/Hug3JXXFENdQtAgYGxErI+IbEfH+LsbsC/w8M98F3AfMru7/G2BVRHw/Ii6LiL1rXvNuoI3KYkdXR8To5r0FqTGGuIaczPwD8F7gMuAF4I6IqF+U6HXgzurj24CTq6/9MpW7OBcBs4Af1bxmfma+mpkbqSw2NaVpb0JqkHPiGpKychfbfcB91cW6LqbnW+o7n6veMXhTRNwCvFBd52OHMVRvGS+2aqnvvBLXkBMR4yPiiJpdxwJP1Q3bA/hY9fHHgcXV1364Zsx44DUqy40CnBMRe1Xnzz9ApYm41FJeiWso2g/4++qCVq8Bv6YytfK9mjEvAVMi4i+p/CFzRnX/hRFxPfBy9bWzMjMrH1RhOZX1xw8EvtTVujHSQHPtFKkBETEb2JKZ17e6FqmW0ymSVGJeiUtSiXklLkklZohLUokZ4pJUYoa4JJWYIS5JJfb/AfCyiQMuNRw9AAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": 
"display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAEGCAYAAACToKXdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFplJREFUeJzt3X2UVfV97/H3F5+rREEJWAmYq0WLCRoF1KjtGB8gjUZWvQpDfLw3VbNumrjubUzaeJUkV2tWWPEmTevVxGQljUBz8wS0Sri1GQ1GDKiIMQjWVkRQI4IWA0SE7/3jHCfDYR7OMPvMmQ3v11qzPHuf3/nt75w1fNyz5+zfNzITSVI5DWp2AZKk3WeIS1KJGeKSVGKGuCSVmCEuSSW2b38eLCL8KIwk7YbMjM729/uZeGY27Ovmm29u6PyN/rJ+698ba7f+nr+64+UUSSoxQ1ySSmyPCvGWlpZml9An1t9cZa6/zLWD9fdF9HS9pdCDRWR/Hk+S9gQRQXbxh81+/XSKpIHt6KOPZvXq1c0uY681evRonnvuuV69xjNxSe2qZ3zNLmOv1dX7392Z+B51TVyS9jaGuCSVmCEuSSVmiEtSiRniklTjYx/7GLfcckvh837uc5/j8ssvL3ROQ1xSt0aMOJqIaNjXiBFH113LokWLOOOMMzjssMM44ogjOOuss3j00UcL/57vuOMOPvvZzxY+L1Q+aVIkPycuqVsvv7waaNzHDl9+ub5Q27RpExdeeCF33nknl1xyCW+++SY/+9nPOOCAA3p9zMwsPEybxTNxSaWwatUqIoJLL72UiOCAAw7g3HPP5T3vec8ulylWr17NoEGD2LFjBwBnn302N954I2eeeSYHH3wwX/rSl5gwYcJO899+++1MmTIFgKuvvpqbbroJgLFjx3Lvvfe2j9u+fTvvfOc7WbZsGQCLFy/mjDPOYMiQIbzvfe/jgQceaB/73HPP0dLSwqGHHsqkSZNYv3594e+LIS6pFMaMGcM+++zDVVddxYIFC3jttdd2er72zLp2+7vf/S7f+MY32LRpE9dddx2rVq3i2WefbX9+9uzZfOQjH9nluK2trcyaNat9e8GCBQwbNoyTTjqJtWvXcsEFF3DTTTexceNGZs6cycUXX8yrr74KwPTp05kwYQLr16/nxhtv5Nvf/naf34dahrikUhg8eDCLFi1i0KBBXHPNNQwbNowpU6bw61//uq7XX3XVVRx//PEMGjSId7zjHVx00UXMnj0bgGeeeYaVK1dy4YUX7vK66dOnM2/ePLZu3QpUwr61tRWAe+65hw996ENMmjQJgHPOOYfx48dz7733smbNGpYuXcrnP/959ttvP84666xO5+8rQ1xSaRx33HF885vf5Pnnn+epp55i3bp1XH/99XW99l3vetdO262tre0hPmvWLKZMmcKBBx64y+uOOeYYxo4dy/z589myZQvz5s1rP2NfvXo13/ve9xg6dChDhw5lyJAhPPTQQ7z44ousW7eOIUOGcNBBB7XPNXr06N391rtUV4hHxOSIeDoiVkXEpzt5/rCI+GFEPBERiyNibOGVSlIHY8aM4corr+Spp57ikEMOYfPmze3Pvfjii7uMr728ct555/HKK6/wxBNPMGfOHKZPn97lsaZNm8asWbOYO3cuJ5xwAu9+97uByv8YrrjiCjZs2MCGDRvYuHEjmzZt4oYbbuDII49k48aNbNmypX2e559/vq/f9i56DPGIGAR8DZgEnAC0RsTxNcP+Cng8M08ErgS+WnShkvZuK1eu5Mtf/jJr164FYM2aNcyePZvTTz+dE088kQcffJA1a9bw+uuvc9ttt/U437777ssll1zCpz71KTZu3Mh5553X5dhp06axcOFC7rjjjp3C/rLLLmP+/PksXLiQHTt2sHXrVh544AHWrVvHqFGjGD9+PDfffDPbtm1j0aJFzJ8/v+9vRI16zsQnAs9k5urM3AbMAS6qGTMW+BeAzFwJHB0RwwqtVFJTDB8+GoiGfVXm79ngwYN55JFH
OPXUUxk8eDDvf//7GTduHDNnzuTcc89l6tSpjBs3jgkTJuxy7bmrjxO2trZy//33c+mllzJo0KAux48YMYLTTz+dxYsXM3Xq1Pb9I0eOZO7cudx6660MGzaM0aNHM3PmzPZPxdxzzz0sXryYww8/nC984QtceeWVdX2vvdHjUrQRcTEwKTOvqW5fBkzMzE90GHMLcGBm/o+ImAgsAk7NzMdr5nIpWmkAcyna5tqdpWiLutnnNuArEfEY8CTwOLC9s4EzZsxof9zS0lL6tkySVLS2tjba2trqGlvPmfhpwIzMnFzd/gyQmfnFbl7z78B7M/ONmv2eiUsDmGfizdWophBLgGMjYnRE7A9MA+bVHODQiNiv+vjPgAdqA1ySVLweL6dk5vaI+DiwkEro352ZKyLi2srTeRfwh8C3I2IH8BTwXxtZtCSpwh6bktp5OaW57LEpSXsZl6KV1G706NF7zBKtZbQ7t+V7OUWSBjgvp0jSHsoQl6QSM8QlqcQMcUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJKzBCXpBIrqtv94RFxX0Qsi4gnI+KqwiuVJO2ins4+g4BVwDnAOipNIqZl5tMdxtxMpcfmX0bEEcBKYHhmvlUzl2unSFIv9XXtlHq63b8EDK4+Hgy8WhvgkqTi1bMU7VHAmg7bL1AJ9o6+DtwfEeuAQ4CpxZQnSepOUeuJ/yXwRGaeHRHHAP8vIsZ11mfTbveS1L1+73YfEfcCt2TmQ9Xt+4FPZ+bSmrm8Ji5JvdTwbvfACuDc6sGGA2OAf9v9kiVJ9Siq2/1fA9+KiCeAAG7IzA2NLFySZHs2SRrwbM8mSXsoQ1ySSswQl6QSM8QlqcQMcUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJKzBCXpBIzxCWpxAxxSSqxorrd/0VEPB4Rj1W73b8VEYcVX64kqaNCut3XjL8AuD4zz+3kOZeilaRe6o9u9x21ArN7X6YkqbfqCfHOut0f1dnAiDgImAz8oO+lSZJ6UlS3+7ddCCzKzNe6GmC3e0nqXr93u+8w9ofA9zJzThdzeU1cknqpu2vi9YT4PsBKKn/YfBH4BdCamStqxh1KpcP9yMzc0sVchrgk9VJ3IV5Ut3uAKcBPugpwSVLx7HYvSQOc3e4laQ9liEtSiRniklRihrgklZghLkklZohLUokZ4pJUYoa4JJWYIS5JJWaIS1KJGeKSVGKGuCSVmCEuSSVWSLf76piWasf7X0bET4stU5LUmUK63VcbQvwcOD8z10bEEZm5vpO5XIpWknqpP7rdTwd+kJlrAToLcElS8Yrqdj8GGBoRP42IJRFxeVEFSpK6VlS3+32Bk4EPAAcDD0fEw5n5rwXNL0nqRD0hvhYY1WF7ZHVfRy8A6zNzK7A1Ih4ETgR2CfEZl13W/rhl3Dhaxo3rbc1SXaa1XsHG115p2PxDDhvGnNnfadj82nu1LV9O2/LldY0tpNt9RBwP/A0wGTgAeASYmpm/qpkrc+nS+r8TqQ9OGT8eaOTP23ge9edZ/SDGj29st/vMfDoifgIsB7YDd9UGuCSpeHVdE8/MBcBxNfvurNmeCcwsrjRJUk+8Y1OSSswQl6QSM8QlqcQMcUkqMUNckkrMEJekEivqtnt14vzzL+DVDS81bP7Dh45g4cJ/bNj8kgY+Q7yBKgHeuDv6Xt0wvmFzSyoHL6dIUokZ4pJUYoa4JJWYIS5JJWaIS1KJGeKSVGJ1hXhETI6IpyNiVUR8upPn/zgiXouIx6pfNxZfqiSpVo+fE4+IQcDXqHT2WQcsiYi5mfl0zdAHM/PDDahRktSFes7EJwLPZObqzNwGzAEu6mRcp62DJEmNU0+IHwWs6bD9QnVfrdMjYllE/FNEjC2kOklSt4q67f5RYFRmbo6IDwI/BsZ0NnDGnb/r6tZyyim0jPfW8YHKtV+k5mhbupS2Rx+ta2w9Ib4WGNVhe2R1X7vMfKPD4/si4u8iYmhmbqid
bMa119ZVmJrPtV+k5mgZP36nE9zPff3rXY6t53LKEuDYiBgdEfsD04B5HQdExPAOjycC0VmAS5KK1eOZeGZuj4iPAwuphP7dmbkiIq6tPJ13Af85Ij4GbAO2AFMbWbQkqaKua+KZuQA4rmbfnR0e/y3wt8WWJknqiXdsSlKJGeKSVGKGuCSVmCEuSSVmiEtSiRniklRihrgklZghLkklZohLUokZ4pJUYoa4JJWYIS5JJWaIS1KJFdLtvsO4CRGxLSL+tLgSJUld6THEO3S7nwScALRGxPFdjLsN+EnRRUqSOldkt/s/B74P/LrA+iRJ3Sik231E/D4wJTPvAKK48iRJ3Smq2/3/BjpeK+8yyO12L0nd6/du98B4YE5EBHAE8MGI2JaZ82rG2e1eknrQm2739YR4e7d74EUq3e5bOw7IzP/09uOI+BYwv7MAlyQVq6hu9zu9pAF1SpI6UUi3+5r9/6WAuiRJdfCOTUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJKzBCXpBIzxCWpxIpaAKshzj//Al7d8FLD5j986AgWLvzHhs0vSY02oEO8EuBLGzi/KyhKKjcvp0hSiRniklRihrgklVgh3e4j4sMR8UREPB4RSyPiA8WXKkmq1eMfNjt0uz8HWAcsiYi5mfl0h2H//HYTiIh4L/Aj4NgG1CtJ6qCQbveZubnD5iHA+uJKlCR1pZBu9wARMSUiVgD3Ap8opjxJUncK+5x4Zv4Y+HFEnAn8PTWdgN424/bb2x+3jBtHy7hxXc55BACvFFVi515p3PzWXwfrl3bRtnw5bcuX1zU2MrtviRkRpwEzMnNydfszVHprfrGb1zwLTMzMV2v2Z0/HqxlPY1t2Br2pp9ezW39PR7B+qQ4RQWZGZ8/Vczmlvdt9ROxPpdv9Tp3sI+KYDo9PBqgNcElS8Yrqdn9xRFwBvAn8BpjayKIlSRU9Xk4p9GBeTin6CNbf3ewlr196W18vp0iSBihDXJJKzBCXpBIzxCWpxAxxSSoxQ1ySSswQl6QSM8QlqcQMcUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJKrK4Qj4jJEfF0RKyKiE938vz0iHii+rWo2vFektRgPYZ4RAwCvgZMAk4AWiPi+Jph/wb8UWaeCPwv4OtFFypJ2lU9Z+ITgWcyc3VmbgPmABd1HJCZizPz9ermYuCoYsuUJHWmnhA/CljTYfsFug/pjwL39aUoSVJ9euyx2RsRcTZwNXBmV2NmzJjR/rilpYWWlpYiS5A0AIwYcTQvv7y6YfMPHz6al156rmHzN1tbWxttbW11je2xx2ZEnAbMyMzJ1e3PUGmQ/MWaceOAHwCTM/PZLuayx2axR7D+7mYvef1l5ntfrL722FwCHBsRoyNif2AaMK/mAKOoBPjlXQW4JKl4PV5OycztEfFxYCGV0L87M1dExLWVp/Mu4H8CQ4G/i8r/grdl5sRGFi5JquNySqEH83JK0Uew/u5mL3n9ZeZ7X6y+Xk6RJA1QhrgklZghLkklZohLUokZ4pJUYoa4JJWYIS5JJWaIS1KJGeKSVGKGuCSVmCEuSSVmiEtSiRniklRiRXW7Py4ifh4RWyPivxdfpiSpMz2uJ96h2/05wDpgSUTMzcynOwx7FfhzYEpDqpQkdaqobvfrM/NR4K0G1ChJ6kIjut1LkvpJod3u62G3e0nqXlO63VefuxnYlJlf7mIu27MVewTr7272ktdfZr73xWp4t/va4+1GjZKk3VBIt/uIGA4sBQYDOyLik8DYzHyjkcVL0t7Obvf+Ot/17Nbf0xH2ql/pe8P3vlh2u5ekPZQhLkklZohLUokZ4pJUYoa4JJWYIS5JJWaIS1KJGeKSVGKGuCSVmCEuSSVmiEtSiRniklRihrgklVgh3e6rY74aEc9ExLKIOKnYMuvV1pzDFqat2QX0UVuzC+ijtmYXsNvq7QIzcLU1u4A+aeb732OId+h2Pwk4AWiNiONrxnwQ
OCYz/wC4Fvg/Dai1Dm3NOWxh2ppdQB+1NbuAPmprdgG7zRBvrgEd4tTR7b66/R2AzHwEOLTaKEKS1EBFdbuvHbO2kzGSpILV0yj5YmBSZl5T3b4MmJiZn+gwZj7w15n58+r2PwM3ZOZjNXPtPa04JKlAXXX26bHHJpWz6lEdtkdW99WOeVcPY7osQpK0e4rqdj8PuAIgIk4DXsvMlwutVJK0i0K63WfmvRHxJxHxr8BvgKsbW7YkCfq5270kqVjesSlJJWaIS1KJ1fPplAGreufoRfzuM+lrgXmZuaJ5Ve09qu//UcAjmflGh/2TM3NB8yrrWUScAWzMzF9FxB8D44FlmXl/k0vbLRHxncy8otl17I6IOJPKTYW/zMyFza6nJxFxKrAiM/8jIg4CPgOcDPwKuDUzX+/Xesp6Tby6hksrlTtIX6juHknl0zNzMvO2ZtXWVxFxdWZ+q9l1dCciPgH8N2AFcBLwycycW33uscw8uZn1dScibgU+QOU30Tbgj4B/As6jchIws3nV9Swiaj8dFsDZwL8AZOaH+72oXoiIX2TmxOrjP6Pyc/Qj4Hxg/kD/txsRTwEnZuZbEXEXsBn4PnBOdf+f9ms9JQ7xVcAJ1aUAOu7fH3iquo5LKUXE85k5queRzRMRTwKnZ+YbEXE0lR/iv8/Mr0TE45n5vqYW2I3qP8JxwAHAS8DIDmdVizPzxKYW2IOIeIzKWd83gKQS4rOpnMCQmQ80r7qedfz5iIglwJ9k5isRcTCV9/+9za2wexGxIjP/sPp4pxOWiFiWmf26AGCZL6fsAH4fWF2z/8jqcwNaRCzv6imgDOvODHr7EkpmPhcRLcD3I2I0le9hIHszM7cDmyPi2cz8D4DM3BIRA/5nh8qln08CnwU+lZnLImLLQA/vDgZFxBAqvwntk5mvAGTmbyLireaWVpdfdvht+YmIGJ+ZSyNiDLCtpxcXrcwhfj1wf0Q8w+/WbRkFHAt8vGlV1W84lZUhN9bsD+Dn/V9Or70cESdl5jKA6hn5BcA3gQF9JgW8GRG/l5mbgVPe3hkRh1I5sx3QMnMHcHtE/N/qf1+mXP+WDwUepfKznhFxZGa+GBGHMPBPAAA+CnwlIm4E1gMPR8QaKjn00f4uprSXU6B9mdyJ7PyHzSXVs6wBLSLuBr6VmYs6eW5WZk5vQll1i4iRwFuZ+VInz52RmQ81oay6RMQBmfnbTvYfARyZmU82oazdFhEfAs7IzL9qdi19ERG/BwzPzH9vdi31iIh3AO+m8j/QF5p1l3qpQ1yS9nZ+TlySSswQl6QSM8QlqcQMce2RImJ7RDwWEU9GxD9ExIEFzHllRPxNEfVJRTHEtaf6TWaeXL1xZBtwXb0vrH7qqSt+EkADiiGuvcHPqNw/QET8KCKWVM/Q2z/TGxGbImJmRDwOnBYR4yPioYhYFhGLq3cTAhwVEfdFxMqI+GITvhdpJ2W6QUDqjQCIiH2BDwL3VfdfnZmvVS+vLImIH2TmRuBg4OHM/IuI2A94GrgkMx+r3oSytfr6E6msFbMNWBkRX83MXVoRSv3FM3HtqQ6qrjHyCypLM9xd3X99RCwDFlNZMO3tNXbeAn5YfXwcsO7tRt+Z+UaHG8jur27/lsr6JaMb/61IXfNMXHuqzbUrKVaXnP0AcGpm/jYifgq8/QfPrbnznW9d3f7d8U7P7fhvSE3mmbj2VJ2F8KFU1hD/bXUt9NO6GL8SGBERpwBExCERsU/jSpV2n2cR2lN19imSBcB11aVoVwIPdzY+M7dFxFTga9XlaTcD59Z5DKlfuXaKJJWYl1MkqcQMcUkqMUNckkrMEJekEjPEJanEDHFJKjFDXJJK7P8D0nNgBvPTXtEAAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", 
"text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "\n", "for col in explore.columns.drop(\"Survived\"):\n", " pivot = explore.pivot_table(index=col,values=\"Survived\")\n", " pivot.plot.bar(ylim=(0,1),yticks=np.arange(0,1,.1))\n", " plt.axhspan(.3, .6, alpha=0.2, color='red')\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `SibSp` column shows the number of siblings and/or spouses each passenger had on board, while the `Parch` column shows the number of parents or children each passenger had on board. Neither column has any missing values.\n", "\n", "The distribution of values in both columns is skewed right, with the majority of values being zero.\n", "\n", "You can sum these two columns to explore the total number of family members each passenger had on board. The shape of the distribution of values in this case is similar; however, there are fewer values at zero, and the quantity tapers off less rapidly as the values increase.\n", "\n", "Looking at the survival rates of the combined family members, you can see that few of the over 500 passengers with no family members survived, while greater numbers of passengers with family members survived." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Engineering New Features" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def process_isalone(df):\n", " df[\"familysize\"] = df[[\"SibSp\",\"Parch\"]].sum(axis=1)\n", " df[\"isalone\"] = 0\n", " df.loc[(df[\"familysize\"] == 0),\"isalone\"] = 1\n", " df = df.drop(\"familysize\",axis=1)\n", " return df\n", "\n", "train = process_isalone(train)\n", "holdout = process_isalone(holdout)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting the Best-Performing Features" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Columns \n", "------------\n", "['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Young Adult', 'Fare_categories_12-50', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_Unknown', 'Sex_female', 'Sex_male', 'isalone']\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/joshuadevlin/.virtualenvs/dscontent/lib/python3.4/site-packages/sklearn/utils/__init__.py:54: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. 
In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", " if np.issubdtype(mask.dtype, np.int):\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.feature_selection import RFECV\n", "\n", "def select_features(df):\n", " # Remove non-numeric columns, columns that have null values\n", " df = df.select_dtypes([np.number]).dropna(axis=1)\n", " all_X = df.drop([\"Survived\",\"PassengerId\"],axis=1)\n", " all_y = df[\"Survived\"]\n", " \n", " clf = RandomForestClassifier(random_state=1)\n", " selector = RFECV(clf,cv=10)\n", " selector.fit(all_X,all_y)\n", " \n", " best_columns = list(all_X.columns[selector.support_])\n", " print(\"Best Columns \\n\"+\"-\"*12+\"\\n{}\\n\".format(best_columns))\n", " \n", " return best_columns\n", "\n", "cols = select_features(train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting and Tuning Different Algorithms" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LogisticRegression\n", "------------------\n", "Best Score: 0.8204264870931538\n", "Best Parameters: {'solver': 'liblinear'}\n", "\n", "KNeighborsClassifier\n", "--------------------\n", "Best Score: 0.7755331088664422\n", "Best Parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 5, 'algorithm': 'brute'}\n", "\n", "RandomForestClassifier\n", "----------------------\n", "Best Score: 0.8294051627384961\n", "Best Parameters: {'max_features': 'log2', 'criterion': 'entropy', 'n_estimators': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'max_depth': 5}\n", "\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "def select_model(df,features):\n", " \n", " all_X = df[features]\n", " all_y = df[\"Survived\"]\n", "\n", " # List of dictionaries, each 
containing a model name,\n", " # its estimator and a dict of hyperparameters\n", " models = [\n", " {\n", " \"name\": \"LogisticRegression\",\n", " \"estimator\": LogisticRegression(),\n", " \"hyperparameters\":\n", " {\n", " \"solver\": [\"newton-cg\", \"lbfgs\", \"liblinear\"]\n", " }\n", " },\n", " {\n", " \"name\": \"KNeighborsClassifier\",\n", " \"estimator\": KNeighborsClassifier(),\n", " \"hyperparameters\":\n", " {\n", " \"n_neighbors\": range(1,20,2),\n", " \"weights\": [\"distance\", \"uniform\"],\n", " \"algorithm\": [\"ball_tree\", \"kd_tree\", \"brute\"],\n", " \"p\": [1,2]\n", " }\n", " },\n", " {\n", " \"name\": \"RandomForestClassifier\",\n", " \"estimator\": RandomForestClassifier(random_state=1),\n", " \"hyperparameters\":\n", " {\n", " \"n_estimators\": [4, 6, 9],\n", " \"criterion\": [\"entropy\", \"gini\"],\n", " \"max_depth\": [2, 5, 10],\n", " \"max_features\": [\"log2\", \"sqrt\"],\n", " \"min_samples_leaf\": [1, 5, 8],\n", " \"min_samples_split\": [2, 3, 5]\n", "\n", " }\n", " }\n", " ]\n", "\n", " for model in models:\n", " print(model['name'])\n", " print('-'*len(model['name']))\n", "\n", " grid = GridSearchCV(model[\"estimator\"],\n", " param_grid=model[\"hyperparameters\"],\n", " cv=10)\n", " grid.fit(all_X,all_y)\n", " model[\"best_params\"] = grid.best_params_\n", " model[\"best_score\"] = grid.best_score_\n", " model[\"best_model\"] = grid.best_estimator_\n", "\n", " print(\"Best Score: {}\".format(model[\"best_score\"]))\n", " print(\"Best Parameters: {}\\n\".format(model[\"best_params\"]))\n", "\n", " return models\n", "\n", "result = select_model(train,cols)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Making a Submission to Kaggle" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def save_submission_file(model,cols,filename=\"submission.csv\"):\n", " holdout_data = holdout[cols]\n", " predictions = model.predict(holdout_data)\n", " \n", " holdout_ids = 
holdout[\"PassengerId\"]\n", " submission_df = {\"PassengerId\": holdout_ids,\n", " \"Survived\": predictions}\n", " submission = pd.DataFrame(submission_df)\n", "\n", " submission.to_csv(filename,index=False)\n", "\n", "best_rf_model = result[2][\"best_model\"]\n", "save_submission_file(best_rf_model,cols)" ] } ], "metadata": { "kernelspec": { "display_name": "dscontent", "language": "python", "name": "dscontent" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.4" } }, "nbformat": 4, "nbformat_minor": 2 }