{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.display.max_columns = 999\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import KFold\n", "\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn import linear_model\n", "from sklearn.model_selection import KFold" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "57088.25161263909" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " return df\n", "\n", "def select_features(df):\n", " return df[[\"Gr Liv Area\", \"SalePrice\"]]\n", "\n", "def train_and_test(df): \n", " train = df[:1460]\n", " test = df[1460:]\n", " \n", " ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n", " ## and return only those columns as a data frame.\n", " numeric_train = train.select_dtypes(include=['integer', 'float'])\n", " numeric_test = test.select_dtypes(include=['integer', 'float'])\n", " \n", " ## You can use `pd.Series.drop()` to drop a value.\n", " features = numeric_train.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " \n", " return rmse\n", "\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df)\n", "\n", "rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Engineering\n", "\n", "- Handle missing values:\n", " - All columns:\n", " - Drop any with 5% or more missing values **for now**.\n", " - Text columns:\n", " - Drop any with 1 or more missing values **for now**.\n", " - Numerical columns:\n", " - For columns with missing values, fill in with the most common value in that column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1: All columns: Drop any with 5% or more missing values **for now**." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Series object: column name -> number of missing values\n", "num_missing = df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Filter Series to columns containing >5% missing values\n", "drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", "\n", "# Drop those columns from the data frame. Note the use of the .index accessor\n", "df = df.drop(drop_missing_cols.index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2: Text columns: Drop any with 1 or more missing values **for now**." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Series object: column name -> number of missing values\n", "text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", "\n", "## Filter Series to columns containing *any* missing values\n", "drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", "\n", "df = df.drop(drop_missing_cols_2.index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "3: Numerical columns: For columns with missing values, fill in with the most common value in that column" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 1 1\n", "BsmtFin SF 2 1\n", "Bsmt Unf SF 1\n", "Total Bsmt SF 1\n", "Garage Cars 1\n", "Garage Area 1\n", "Bsmt Full Bath 2\n", "Bsmt Half Bath 2\n", "Mas Vnr Area 23\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Compute column-wise missing value counts\n", "num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", "fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", "fixable_numeric_cols" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'BsmtFin SF 1': 0.0,\n", " 'BsmtFin SF 2': 0.0,\n", " 'Bsmt Unf SF': 0.0,\n", " 'Total Bsmt SF': 0.0,\n", " 'Garage Cars': 2.0,\n", " 'Garage Area': 0.0,\n", " 'Bsmt Full Bath': 0.0,\n", " 'Bsmt Half Bath': 0.0,\n", " 'Mas Vnr Area': 0.0}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Compute the most common value for each column in `fixable_nmeric_missing_cols`.\n", "replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", "replacement_values_dict" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "## Use `pd.DataFrame.fillna()` to replace missing values.\n", "df = df.fillna(replacement_values_dict)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 64\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Verify that every column has 0 missing values\n", "df.isnull().sum().value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What new features can we create, that better capture the information in some of the features?" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "2180 -1\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "years_sold = df['Yr Sold'] - df['Year Built']\n", "years_sold[years_sold < 0]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1702 -1\n", "2180 -2\n", "2181 -1\n", "dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", "years_since_remod[years_since_remod < 0]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Create new columns\n", "df['Years Before Sale'] = years_sold\n", "df['Years Since Remod'] = years_since_remod\n", "\n", "## Drop rows with negative values for both of these new features\n", "df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", "## No longer need original year columns\n", "df = df.drop([\"Year Built\", \"Year Remod/Add\"], axis = 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop columns that:\n", "- that aren't useful for ML\n", "- leak data about the final sale, read more about columns [here](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "## Drop columns that aren't useful for ML\n", "df = df.drop([\"PID\", \"Order\"], axis=1)\n", "\n", "## Drop columns that leak info about the final sale\n", "df = df.drop([\"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Yr Sold\"], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's update transform_features()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "55275.367312413066" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " num_missing = df.isnull().sum()\n", " drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", " df = df.drop(drop_missing_cols.index, axis=1)\n", " \n", " text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", " drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", " df = df.drop(drop_missing_cols_2.index, axis=1)\n", " \n", " num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", " fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", " replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", " df = df.fillna(replacement_values_dict)\n", " \n", " years_sold = df['Yr Sold'] - df['Year Built']\n", " years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", " df['Years Before Sale'] = years_sold\n", " df['Years Since Remod'] = years_since_remod\n", " df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", " df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n", " return df\n", "\n", "def select_features(df):\n", " return df[[\"Gr Liv Area\", \"SalePrice\"]]\n", "\n", "def train_and_test(df): \n", " train = df[:1460]\n", " test = df[1460:]\n", " \n", " ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n", " ## and return only those columns as a data frame.\n", " numeric_train = train.select_dtypes(include=['integer', 'float'])\n", " numeric_test = test.select_dtypes(include=['integer', 'float'])\n", " \n", " ## You can use `pd.Series.drop()` to drop a value.\n", " features = numeric_train.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " \n", " return rmse\n", "\n", "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df)\n", "\n", "rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Selection" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | MS SubClass | \n", "Lot Area | \n", "Overall Qual | \n", "Overall Cond | \n", "Mas Vnr Area | \n", "BsmtFin SF 1 | \n", "BsmtFin SF 2 | \n", "Bsmt Unf SF | \n", "Total Bsmt SF | \n", "1st Flr SF | \n", "2nd Flr SF | \n", "Low Qual Fin SF | \n", "Gr Liv Area | \n", "Bsmt Full Bath | \n", "Bsmt Half Bath | \n", "Full Bath | \n", "Half Bath | \n", "Bedroom AbvGr | \n", "Kitchen AbvGr | \n", "TotRms AbvGrd | \n", "Fireplaces | \n", "Garage Cars | \n", "Garage Area | \n", "Wood Deck SF | \n", "Open Porch SF | \n", "Enclosed Porch | \n", "3Ssn Porch | \n", "Screen Porch | \n", "Pool Area | \n", "Misc Val | \n", "Yr Sold | \n", "SalePrice | \n", "Years Before Sale | \n", "Years Since Remod | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "20 | \n", "31770 | \n", "6 | \n", "5 | \n", "112.0 | \n", "639.0 | \n", "0.0 | \n", "441.0 | \n", "1080.0 | \n", "1656 | \n", "0 | \n", "0 | \n", "1656 | \n", "1.0 | \n", "0.0 | \n", "1 | \n", "0 | \n", "3 | \n", "1 | \n", "7 | \n", "2 | \n", "2.0 | \n", "528.0 | \n", "210 | \n", "62 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "2010 | \n", "215000 | \n", "50 | \n", "50 | \n", "
1 | \n", "20 | \n", "11622 | \n", "5 | \n", "6 | \n", "0.0 | \n", "468.0 | \n", "144.0 | \n", "270.0 | \n", "882.0 | \n", "896 | \n", "0 | \n", "0 | \n", "896 | \n", "0.0 | \n", "0.0 | \n", "1 | \n", "0 | \n", "2 | \n", "1 | \n", "5 | \n", "0 | \n", "1.0 | \n", "730.0 | \n", "140 | \n", "0 | \n", "0 | \n", "0 | \n", "120 | \n", "0 | \n", "0 | \n", "2010 | \n", "105000 | \n", "49 | \n", "49 | \n", "
2 | \n", "20 | \n", "14267 | \n", "6 | \n", "6 | \n", "108.0 | \n", "923.0 | \n", "0.0 | \n", "406.0 | \n", "1329.0 | \n", "1329 | \n", "0 | \n", "0 | \n", "1329 | \n", "0.0 | \n", "0.0 | \n", "1 | \n", "1 | \n", "3 | \n", "1 | \n", "6 | \n", "0 | \n", "1.0 | \n", "312.0 | \n", "393 | \n", "36 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "12500 | \n", "2010 | \n", "172000 | \n", "52 | \n", "52 | \n", "
3 | \n", "20 | \n", "11160 | \n", "7 | \n", "5 | \n", "0.0 | \n", "1065.0 | \n", "0.0 | \n", "1045.0 | \n", "2110.0 | \n", "2110 | \n", "0 | \n", "0 | \n", "2110 | \n", "1.0 | \n", "0.0 | \n", "2 | \n", "1 | \n", "3 | \n", "1 | \n", "8 | \n", "2 | \n", "2.0 | \n", "522.0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "2010 | \n", "244000 | \n", "42 | \n", "42 | \n", "
4 | \n", "60 | \n", "13830 | \n", "5 | \n", "5 | \n", "0.0 | \n", "791.0 | \n", "0.0 | \n", "137.0 | \n", "928.0 | \n", "928 | \n", "701 | \n", "0 | \n", "1629 | \n", "0.0 | \n", "0.0 | \n", "2 | \n", "1 | \n", "3 | \n", "1 | \n", "6 | \n", "1 | \n", "2.0 | \n", "482.0 | \n", "212 | \n", "34 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "2010 | \n", "189900 | \n", "13 | \n", "12 | \n", "