{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "pd.options.display.max_columns = 999\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import KFold\n",
    "\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn import linear_model\n",
    "from sklearn.model_selection import KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57088.25161263909"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def transform_features(df):\n",
    "    return df\n",
    "\n",
    "def select_features(df):\n",
    "    return df[[\"Gr Liv Area\", \"SalePrice\"]]\n",
    "\n",
    "def train_and_test(df):  \n",
    "    train = df[:1460]\n",
    "    test = df[1460:]\n",
    "    \n",
    "    ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n",
    "    ## and return only those columns as a data frame.\n",
    "    numeric_train = train.select_dtypes(include=['integer', 'float'])\n",
    "    numeric_test = test.select_dtypes(include=['integer', 'float'])\n",
    "    \n",
    "    ## You can use `pd.Series.drop()` to drop a value.\n",
    "    features = numeric_train.columns.drop(\"SalePrice\")\n",
    "    lr = linear_model.LinearRegression()\n",
    "    lr.fit(train[features], train[\"SalePrice\"])\n",
    "    predictions = lr.predict(test[features])\n",
    "    mse = mean_squared_error(test[\"SalePrice\"], predictions)\n",
    "    rmse = np.sqrt(mse)\n",
    "    \n",
    "    return rmse\n",
    "\n",
    "transform_df = transform_features(df)\n",
    "filtered_df = select_features(transform_df)\n",
    "rmse = train_and_test(filtered_df)\n",
    "\n",
    "rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering\n",
    "\n",
    "- Handle missing values:\n",
    "    - All columns:\n",
    "        - Drop any with 5% or more missing values **for now**.\n",
    "    - Text columns:\n",
    "        - Drop any with 1 or more missing values **for now**.\n",
    "    - Numerical columns:\n",
    "        - For columns with missing values, fill in with the most common value in that column"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1: All columns: Drop any with 5% or more missing values **for now**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Series object: column name -> number of missing values\n",
    "num_missing = df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter Series to columns containing >5% missing values\n",
    "drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n",
    "\n",
    "# Drop those columns from the data frame. Note the use of the .index accessor\n",
    "df = df.drop(drop_missing_cols.index, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2: Text columns: Drop any with 1 or more missing values **for now**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Series object: column name -> number of missing values\n",
    "text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n",
    "\n",
    "## Filter Series to columns containing *any* missing values\n",
    "drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n",
    "\n",
    "df = df.drop(drop_missing_cols_2.index, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3: Numerical columns: For columns with missing values, fill in with the most common value in that column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BsmtFin SF 1       1\n",
       "BsmtFin SF 2       1\n",
       "Bsmt Unf SF        1\n",
       "Total Bsmt SF      1\n",
       "Garage Cars        1\n",
       "Garage Area        1\n",
       "Bsmt Full Bath     2\n",
       "Bsmt Half Bath     2\n",
       "Mas Vnr Area      23\n",
       "dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Compute column-wise missing value counts\n",
    "num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n",
    "fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n",
    "fixable_numeric_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'BsmtFin SF 1': 0.0,\n",
       " 'BsmtFin SF 2': 0.0,\n",
       " 'Bsmt Unf SF': 0.0,\n",
       " 'Total Bsmt SF': 0.0,\n",
       " 'Garage Cars': 2.0,\n",
       " 'Garage Area': 0.0,\n",
       " 'Bsmt Full Bath': 0.0,\n",
       " 'Bsmt Half Bath': 0.0,\n",
       " 'Mas Vnr Area': 0.0}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Compute the most common value for each column in `fixable_nmeric_missing_cols`.\n",
    "replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n",
    "replacement_values_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Use `pd.DataFrame.fillna()` to replace missing values.\n",
    "df = df.fillna(replacement_values_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    64\n",
       "dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Verify that every column has 0 missing values\n",
    "df.isnull().sum().value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What new features can we create, that better capture the information in some of the features?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2180   -1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "years_sold = df['Yr Sold'] - df['Year Built']\n",
    "years_sold[years_sold < 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1702   -1\n",
       "2180   -2\n",
       "2181   -1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n",
    "years_since_remod[years_since_remod < 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Create new columns\n",
    "df['Years Before Sale'] = years_sold\n",
    "df['Years Since Remod'] = years_since_remod\n",
    "\n",
    "## Drop rows with negative values for both of these new features\n",
    "df = df.drop([1702, 2180, 2181], axis=0)\n",
    "\n",
    "## No longer need original year columns\n",
    "df = df.drop([\"Year Built\", \"Year Remod/Add\"], axis = 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop columns that:\n",
    "- that aren't useful for ML\n",
    "- leak data about the final sale, read more about columns [here](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Drop columns that aren't useful for ML\n",
    "df = df.drop([\"PID\", \"Order\"], axis=1)\n",
    "\n",
    "## Drop columns that leak info about the final sale\n",
    "df = df.drop([\"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Yr Sold\"], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's update transform_features()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "55275.367312413066"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def transform_features(df):\n",
    "    num_missing = df.isnull().sum()\n",
    "    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n",
    "    df = df.drop(drop_missing_cols.index, axis=1)\n",
    "    \n",
    "    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n",
    "    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n",
    "    df = df.drop(drop_missing_cols_2.index, axis=1)\n",
    "    \n",
    "    num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n",
    "    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n",
    "    replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n",
    "    df = df.fillna(replacement_values_dict)\n",
    "    \n",
    "    years_sold = df['Yr Sold'] - df['Year Built']\n",
    "    years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n",
    "    df['Years Before Sale'] = years_sold\n",
    "    df['Years Since Remod'] = years_since_remod\n",
    "    df = df.drop([1702, 2180, 2181], axis=0)\n",
    "\n",
    "    df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n",
    "    return df\n",
    "\n",
    "def select_features(df):\n",
    "    return df[[\"Gr Liv Area\", \"SalePrice\"]]\n",
    "\n",
    "def train_and_test(df):  \n",
    "    train = df[:1460]\n",
    "    test = df[1460:]\n",
    "    \n",
    "    ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n",
    "    ## and return only those columns as a data frame.\n",
    "    numeric_train = train.select_dtypes(include=['integer', 'float'])\n",
    "    numeric_test = test.select_dtypes(include=['integer', 'float'])\n",
    "    \n",
    "    ## You can use `pd.Series.drop()` to drop a value.\n",
    "    features = numeric_train.columns.drop(\"SalePrice\")\n",
    "    lr = linear_model.LinearRegression()\n",
    "    lr.fit(train[features], train[\"SalePrice\"])\n",
    "    predictions = lr.predict(test[features])\n",
    "    mse = mean_squared_error(test[\"SalePrice\"], predictions)\n",
    "    rmse = np.sqrt(mse)\n",
    "    \n",
    "    return rmse\n",
    "\n",
    "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n",
    "transform_df = transform_features(df)\n",
    "filtered_df = select_features(transform_df)\n",
    "rmse = train_and_test(filtered_df)\n",
    "\n",
    "rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MS SubClass</th>\n",
       "      <th>Lot Area</th>\n",
       "      <th>Overall Qual</th>\n",
       "      <th>Overall Cond</th>\n",
       "      <th>Mas Vnr Area</th>\n",
       "      <th>BsmtFin SF 1</th>\n",
       "      <th>BsmtFin SF 2</th>\n",
       "      <th>Bsmt Unf SF</th>\n",
       "      <th>Total Bsmt SF</th>\n",
       "      <th>1st Flr SF</th>\n",
       "      <th>2nd Flr SF</th>\n",
       "      <th>Low Qual Fin SF</th>\n",
       "      <th>Gr Liv Area</th>\n",
       "      <th>Bsmt Full Bath</th>\n",
       "      <th>Bsmt Half Bath</th>\n",
       "      <th>Full Bath</th>\n",
       "      <th>Half Bath</th>\n",
       "      <th>Bedroom AbvGr</th>\n",
       "      <th>Kitchen AbvGr</th>\n",
       "      <th>TotRms AbvGrd</th>\n",
       "      <th>Fireplaces</th>\n",
       "      <th>Garage Cars</th>\n",
       "      <th>Garage Area</th>\n",
       "      <th>Wood Deck SF</th>\n",
       "      <th>Open Porch SF</th>\n",
       "      <th>Enclosed Porch</th>\n",
       "      <th>3Ssn Porch</th>\n",
       "      <th>Screen Porch</th>\n",
       "      <th>Pool Area</th>\n",
       "      <th>Misc Val</th>\n",
       "      <th>Yr Sold</th>\n",
       "      <th>SalePrice</th>\n",
       "      <th>Years Before Sale</th>\n",
       "      <th>Years Since Remod</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>20</td>\n",
       "      <td>31770</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>112.0</td>\n",
       "      <td>639.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>441.0</td>\n",
       "      <td>1080.0</td>\n",
       "      <td>1656</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1656</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>528.0</td>\n",
       "      <td>210</td>\n",
       "      <td>62</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010</td>\n",
       "      <td>215000</td>\n",
       "      <td>50</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>11622</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>0.0</td>\n",
       "      <td>468.0</td>\n",
       "      <td>144.0</td>\n",
       "      <td>270.0</td>\n",
       "      <td>882.0</td>\n",
       "      <td>896</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>896</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>730.0</td>\n",
       "      <td>140</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010</td>\n",
       "      <td>105000</td>\n",
       "      <td>49</td>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>14267</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>108.0</td>\n",
       "      <td>923.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>406.0</td>\n",
       "      <td>1329.0</td>\n",
       "      <td>1329</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1329</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>312.0</td>\n",
       "      <td>393</td>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12500</td>\n",
       "      <td>2010</td>\n",
       "      <td>172000</td>\n",
       "      <td>52</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>11160</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1065.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1045.0</td>\n",
       "      <td>2110.0</td>\n",
       "      <td>2110</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2110</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>522.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010</td>\n",
       "      <td>244000</td>\n",
       "      <td>42</td>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>60</td>\n",
       "      <td>13830</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>791.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>928.0</td>\n",
       "      <td>928</td>\n",
       "      <td>701</td>\n",
       "      <td>0</td>\n",
       "      <td>1629</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>482.0</td>\n",
       "      <td>212</td>\n",
       "      <td>34</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2010</td>\n",
       "      <td>189900</td>\n",
       "      <td>13</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   MS SubClass  Lot Area  Overall Qual  Overall Cond  Mas Vnr Area  \\\n",
       "0           20     31770             6             5         112.0   \n",
       "1           20     11622             5             6           0.0   \n",
       "2           20     14267             6             6         108.0   \n",
       "3           20     11160             7             5           0.0   \n",
       "4           60     13830             5             5           0.0   \n",
       "\n",
       "   BsmtFin SF 1  BsmtFin SF 2  Bsmt Unf SF  Total Bsmt SF  1st Flr SF  \\\n",
       "0         639.0           0.0        441.0         1080.0        1656   \n",
       "1         468.0         144.0        270.0          882.0         896   \n",
       "2         923.0           0.0        406.0         1329.0        1329   \n",
       "3        1065.0           0.0       1045.0         2110.0        2110   \n",
       "4         791.0           0.0        137.0          928.0         928   \n",
       "\n",
       "   2nd Flr SF  Low Qual Fin SF  Gr Liv Area  Bsmt Full Bath  Bsmt Half Bath  \\\n",
       "0           0                0         1656             1.0             0.0   \n",
       "1           0                0          896             0.0             0.0   \n",
       "2           0                0         1329             0.0             0.0   \n",
       "3           0                0         2110             1.0             0.0   \n",
       "4         701                0         1629             0.0             0.0   \n",
       "\n",
       "   Full Bath  Half Bath  Bedroom AbvGr  Kitchen AbvGr  TotRms AbvGrd  \\\n",
       "0          1          0              3              1              7   \n",
       "1          1          0              2              1              5   \n",
       "2          1          1              3              1              6   \n",
       "3          2          1              3              1              8   \n",
       "4          2          1              3              1              6   \n",
       "\n",
       "   Fireplaces  Garage Cars  Garage Area  Wood Deck SF  Open Porch SF  \\\n",
       "0           2          2.0        528.0           210             62   \n",
       "1           0          1.0        730.0           140              0   \n",
       "2           0          1.0        312.0           393             36   \n",
       "3           2          2.0        522.0             0              0   \n",
       "4           1          2.0        482.0           212             34   \n",
       "\n",
       "   Enclosed Porch  3Ssn Porch  Screen Porch  Pool Area  Misc Val  Yr Sold  \\\n",
       "0               0           0             0          0         0     2010   \n",
       "1               0           0           120          0         0     2010   \n",
       "2               0           0             0          0     12500     2010   \n",
       "3               0           0             0          0         0     2010   \n",
       "4               0           0             0          0         0     2010   \n",
       "\n",
       "   SalePrice  Years Before Sale  Years Since Remod  \n",
       "0     215000                 50                 50  \n",
       "1     105000                 49                 49  \n",
       "2     172000                 52                 52  \n",
       "3     244000                 42                 42  \n",
       "4     189900                 13                 12  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "numerical_df = transform_df.select_dtypes(include=['int', 'float'])\n",
    "numerical_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BsmtFin SF 2         0.006127\n",
       "Misc Val             0.019273\n",
       "Yr Sold              0.030358\n",
       "3Ssn Porch           0.032268\n",
       "Bsmt Half Bath       0.035875\n",
       "Low Qual Fin SF      0.037629\n",
       "Pool Area            0.068438\n",
       "MS SubClass          0.085128\n",
       "Overall Cond         0.101540\n",
       "Screen Porch         0.112280\n",
       "Kitchen AbvGr        0.119760\n",
       "Enclosed Porch       0.128685\n",
       "Bedroom AbvGr        0.143916\n",
       "Bsmt Unf SF          0.182751\n",
       "Lot Area             0.267520\n",
       "2nd Flr SF           0.269601\n",
       "Bsmt Full Bath       0.276258\n",
       "Half Bath            0.284871\n",
       "Open Porch SF        0.316262\n",
       "Wood Deck SF         0.328183\n",
       "BsmtFin SF 1         0.439284\n",
       "Fireplaces           0.474831\n",
       "TotRms AbvGrd        0.498574\n",
       "Mas Vnr Area         0.506983\n",
       "Years Since Remod    0.534985\n",
       "Full Bath            0.546118\n",
       "Years Before Sale    0.558979\n",
       "1st Flr SF           0.635185\n",
       "Garage Area          0.641425\n",
       "Total Bsmt SF        0.644012\n",
       "Garage Cars          0.648361\n",
       "Gr Liv Area          0.717596\n",
       "Overall Qual         0.801206\n",
       "SalePrice            1.000000\n",
       "Name: SalePrice, dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n",
    "abs_corr_coeffs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BsmtFin SF 1         0.439284\n",
       "Fireplaces           0.474831\n",
       "TotRms AbvGrd        0.498574\n",
       "Mas Vnr Area         0.506983\n",
       "Years Since Remod    0.534985\n",
       "Full Bath            0.546118\n",
       "Years Before Sale    0.558979\n",
       "1st Flr SF           0.635185\n",
       "Garage Area          0.641425\n",
       "Total Bsmt SF        0.644012\n",
       "Garage Cars          0.648361\n",
       "Gr Liv Area          0.717596\n",
       "Overall Qual         0.801206\n",
       "SalePrice            1.000000\n",
       "Name: SalePrice, dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Let's only keep columns with a correlation coefficient of larger than 0.4 (arbitrary, worth experimenting later!)\n",
    "abs_corr_coeffs[abs_corr_coeffs > 0.4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Drop columns with less than 0.4 correlation with SalePrice\n",
    "transform_df = transform_df.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Which categorical columns should we keep?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Create a list of column names from documentation that are *meant* to be categorical\n",
    "nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n",
    "                    \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n",
    "                    \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n",
    "                    \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?\n",
    "- If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy code this column, hundreds of columns will need to be added back to the data frame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Which categorical columns have we still carried with us? We'll test tehse \n",
    "transform_cat_cols = []\n",
    "for col in nominal_features:\n",
    "    if col in transform_df.columns:\n",
    "        transform_cat_cols.append(col)\n",
    "\n",
    "## How many unique values in each categorical column?\n",
    "uniqueness_counts = transform_df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n",
    "## Aribtrary cutoff of 10 unique values (worth experimenting)\n",
    "drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n",
    "transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Select just the remaining text columns and convert to categorical\n",
    "text_cols = transform_df.select_dtypes(include=['object'])\n",
    "for col in text_cols:\n",
    "    transform_df[col] = transform_df[col].astype('category')\n",
    "    \n",
    "## Create dummy columns and add back to the dataframe!\n",
    "transform_df = pd.concat([\n",
    "    transform_df, \n",
    "    pd.get_dummies(transform_df.select_dtypes(include=['category']))\n",
    "], axis=1).drop(text_cols,axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Update `select_features()`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[27352.325452161054, 26865.145668097586, 26500.762368070868, 35730.36340669092]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "29112.149223755107"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def transform_features(df):\n",
    "    num_missing = df.isnull().sum()\n",
    "    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n",
    "    df = df.drop(drop_missing_cols.index, axis=1)\n",
    "    \n",
    "    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n",
    "    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n",
    "    df = df.drop(drop_missing_cols_2.index, axis=1)\n",
    "    \n",
    "    num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n",
    "    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n",
    "    replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n",
    "    df = df.fillna(replacement_values_dict)\n",
    "    \n",
    "    years_sold = df['Yr Sold'] - df['Year Built']\n",
    "    years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n",
    "    df['Years Before Sale'] = years_sold\n",
    "    df['Years Since Remod'] = years_since_remod\n",
    "    df = df.drop([1702, 2180, 2181], axis=0)\n",
    "\n",
    "    df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n",
    "    return df\n",
    "\n",
    "def select_features(df, coeff_threshold=0.4, uniq_threshold=10):\n",
    "    numerical_df = df.select_dtypes(include=['int', 'float'])\n",
    "    abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n",
    "    df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)\n",
    "    \n",
    "    nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n",
    "                    \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n",
    "                    \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n",
    "                    \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]\n",
    "    \n",
    "    transform_cat_cols = []\n",
    "    for col in nominal_features:\n",
    "        if col in df.columns:\n",
    "            transform_cat_cols.append(col)\n",
    "\n",
    "    uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n",
    "    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n",
    "    df = df.drop(drop_nonuniq_cols, axis=1)\n",
    "    \n",
    "    text_cols = df.select_dtypes(include=['object'])\n",
    "    for col in text_cols:\n",
    "        df[col] = df[col].astype('category')\n",
    "    df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols,axis=1)\n",
    "    \n",
    "    return df\n",
    "\n",
    "def train_and_test(df, k=0):\n",
    "    numeric_df = df.select_dtypes(include=['integer', 'float'])\n",
    "    features = numeric_df.columns.drop(\"SalePrice\")\n",
    "    lr = linear_model.LinearRegression()\n",
    "    \n",
    "    if k == 0:\n",
    "        train = df[:1460]\n",
    "        test = df[1460:]\n",
    "\n",
    "        lr.fit(train[features], train[\"SalePrice\"])\n",
    "        predictions = lr.predict(test[features])\n",
    "        mse = mean_squared_error(test[\"SalePrice\"], predictions)\n",
    "        rmse = np.sqrt(mse)\n",
    "\n",
    "        return rmse\n",
    "    \n",
    "    if k == 1:\n",
    "        # Randomize *all* rows (frac=1) from `df` and return\n",
    "        shuffled_df = df.sample(frac=1, )\n",
    "        train = df[:1460]\n",
    "        test = df[1460:]\n",
    "        \n",
    "        lr.fit(train[features], train[\"SalePrice\"])\n",
    "        predictions_one = lr.predict(test[features])        \n",
    "        \n",
    "        mse_one = mean_squared_error(test[\"SalePrice\"], predictions_one)\n",
    "        rmse_one = np.sqrt(mse_one)\n",
    "        \n",
    "        lr.fit(test[features], test[\"SalePrice\"])\n",
    "        predictions_two = lr.predict(train[features])        \n",
    "       \n",
    "        mse_two = mean_squared_error(train[\"SalePrice\"], predictions_two)\n",
    "        rmse_two = np.sqrt(mse_two)\n",
    "        \n",
    "        avg_rmse = np.mean([rmse_one, rmse_two])\n",
    "        print(rmse_one)\n",
    "        print(rmse_two)\n",
    "        return avg_rmse\n",
    "    else:\n",
    "        kf = KFold(n_splits=k, shuffle=True)\n",
    "        rmse_values = []\n",
    "        for train_index, test_index, in kf.split(df):\n",
    "            train = df.iloc[train_index]\n",
    "            test = df.iloc[test_index]\n",
    "            lr.fit(train[features], train[\"SalePrice\"])\n",
    "            predictions = lr.predict(test[features])\n",
    "            mse = mean_squared_error(test[\"SalePrice\"], predictions)\n",
    "            rmse = np.sqrt(mse)\n",
    "            rmse_values.append(rmse)\n",
    "        print(rmse_values)\n",
    "        avg_rmse = np.mean(rmse_values)\n",
    "        return avg_rmse\n",
    "\n",
    "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n",
    "transform_df = transform_features(df)\n",
    "filtered_df = select_features(transform_df)\n",
    "rmse = train_and_test(filtered_df, k=4)\n",
    "\n",
    "rmse"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}