{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction" ] }, { "cell_type": "code", "execution_count": 409, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.display.max_columns = 999\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import KFold\n", "\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn import linear_model" ] }, { "cell_type": "code", "execution_count": 333, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 334, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "57088.251612639091" ] }, "execution_count": 334, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " return df\n", "\n", "def select_features(df):\n", " return df[[\"Gr Liv Area\", \"SalePrice\"]]\n", "\n", "def train_and_test(df): \n", " train = df[:1460]\n", " test = df[1460:]\n", " \n", " ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n", " ## and return only those columns as a data frame.\n", " numeric_train = train.select_dtypes(include=['integer', 'float'])\n", " numeric_test = test.select_dtypes(include=['integer', 'float'])\n", " \n", " ## You can use `pd.Series.drop()` to drop a value.\n", " features = numeric_train.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " \n", " return rmse\n", "\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df)\n", "\n", "rmse" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Feature Engineering\n", "\n", "- Handle missing values:\n", " - All columns:\n", " - Drop any with 5% or more missing values **for now**.\n", " - Text columns:\n", " - Drop any with 1 or more missing values **for now**.\n", " - Numerical columns:\n", " - For columns with missing values, fill in with the most common value in that column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1: All columns: Drop any with 5% or more missing values **for now**." ] }, { "cell_type": "code", "execution_count": 296, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Series object: column name -> number of missing values\n", "num_missing = df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 297, "metadata": {}, "outputs": [], "source": [ "# Filter Series to columns containing >5% missing values\n", "drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", "\n", "# Drop those columns from the data frame. Note the use of the .index accessor\n", "df = df.drop(drop_missing_cols.index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2: Text columns: Drop any with 1 or more missing values **for now**." 
] }, { "cell_type": "code", "execution_count": 298, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Series object: column name -> number of missing values\n", "text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", "\n", "## Filter Series to columns containing *any* missing values\n", "drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", "\n", "df = df.drop(drop_missing_cols_2.index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "3: Numerical columns: For columns with missing values, fill in with the most common value in that column" ] }, { "cell_type": "code", "execution_count": 299, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 1 1\n", "BsmtFin SF 2 1\n", "Bsmt Unf SF 1\n", "Total Bsmt SF 1\n", "Garage Cars 1\n", "Garage Area 1\n", "Bsmt Full Bath 2\n", "Bsmt Half Bath 2\n", "Mas Vnr Area 23\n", "dtype: int64" ] }, "execution_count": 299, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Compute column-wise missing value counts\n", "num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", "fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", "fixable_numeric_cols" ] }, { "cell_type": "code", "execution_count": 307, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Bsmt Full Bath': 0.0,\n", " 'Bsmt Half Bath': 0.0,\n", " 'Bsmt Unf SF': 0.0,\n", " 'BsmtFin SF 1': 0.0,\n", " 'BsmtFin SF 2': 0.0,\n", " 'Garage Area': 0.0,\n", " 'Garage Cars': 2.0,\n", " 'Mas Vnr Area': 0.0,\n", " 'Total Bsmt SF': 0.0}" ] }, "execution_count": 307, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Compute the most common value for each column in `fixable_numeric_cols`.\n", "replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", "replacement_values_dict" ] }, { "cell_type": "code", "execution_count": 308, 
"metadata": {}, "outputs": [], "source": [ "## Use `pd.DataFrame.fillna()` to replace missing values.\n", "df = df.fillna(replacement_values_dict)" ] }, { "cell_type": "code", "execution_count": 311, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 64\n", "dtype: int64" ] }, "execution_count": 311, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Verify that every column has 0 missing values\n", "df.isnull().sum().value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What new features can we create, that better capture the information in some of the features?" ] }, { "cell_type": "code", "execution_count": 320, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "2180 -1\n", "dtype: int64" ] }, "execution_count": 320, "metadata": {}, "output_type": "execute_result" } ], "source": [ "years_sold = df['Yr Sold'] - df['Year Built']\n", "years_sold[years_sold < 0]" ] }, { "cell_type": "code", "execution_count": 322, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1702 -1\n", "2180 -2\n", "2181 -1\n", "dtype: int64" ] }, "execution_count": 322, "metadata": {}, "output_type": "execute_result" } ], "source": [ "years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", "years_since_remod[years_since_remod < 0]" ] }, { "cell_type": "code", "execution_count": 329, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Create new columns\n", "df['Years Before Sale'] = years_sold\n", "df['Years Since Remod'] = years_since_remod\n", "\n", "## Drop rows with negative values for either of these new features\n", "df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", "## No longer need original year columns\n", "df = df.drop([\"Year Built\", \"Year Remod/Add\"], axis = 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop columns that:\n", "- aren't useful for ML\n", "- leak data about the final sale, read more about columns 
[here](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)" ] }, { "cell_type": "code", "execution_count": 327, "metadata": {}, "outputs": [], "source": [ "## Drop columns that aren't useful for ML\n", "df = df.drop([\"PID\", \"Order\"], axis=1)\n", "\n", "## Drop columns that leak info about the final sale\n", "df = df.drop([\"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Yr Sold\"], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's update transform_features()" ] }, { "cell_type": "code", "execution_count": 340, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "55275.367312413066" ] }, "execution_count": 340, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " num_missing = df.isnull().sum()\n", " drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", " df = df.drop(drop_missing_cols.index, axis=1)\n", " \n", " text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", " drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", " df = df.drop(drop_missing_cols_2.index, axis=1)\n", " \n", " num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", " fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", " replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", " df = df.fillna(replacement_values_dict)\n", " \n", " years_sold = df['Yr Sold'] - df['Year Built']\n", " years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", " df['Years Before Sale'] = years_sold\n", " df['Years Since Remod'] = years_since_remod\n", " df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", " ## Drop identifier columns and sale-leakage columns, including Yr Sold\n", " ## (Yr Sold was only needed above to derive the two year features)\n", " df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Yr Sold\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n", " return df\n", "\n", "def select_features(df):\n", " return df[[\"Gr Liv Area\", 
\"SalePrice\"]]\n", "\n", "def train_and_test(df): \n", " train = df[:1460]\n", " test = df[1460:]\n", " \n", " ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n", " ## and return only those columns as a data frame.\n", " numeric_train = train.select_dtypes(include=['integer', 'float'])\n", " numeric_test = test.select_dtypes(include=['integer', 'float'])\n", " \n", " ## You can use `pd.Series.drop()` to drop a value.\n", " features = numeric_train.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " \n", " return rmse\n", "\n", "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df)\n", "\n", "rmse" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Feature Selection" ] }, { "cell_type": "code", "execution_count": 389, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MS SubClassLot AreaOverall QualOverall CondMas Vnr AreaBsmtFin SF 1BsmtFin SF 2Bsmt Unf SFTotal Bsmt SF1st Flr SF2nd Flr SFLow Qual Fin SFGr Liv AreaBsmt Full BathBsmt Half BathFull BathHalf BathBedroom AbvGrKitchen AbvGrTotRms AbvGrdFireplacesGarage CarsGarage AreaWood Deck SFOpen Porch SFEnclosed Porch3Ssn PorchScreen PorchPool AreaMisc ValYr SoldSalePriceYears Before SaleYears Since Remod
0203177065112.0639.00.0441.01080.016560016561.00.01031722.0528.0210620000020102150005050
12011622560.0468.0144.0270.0882.0896008960.00.01021501.0730.01400001200020101050004949
2201426766108.0923.00.0406.01329.013290013290.00.01131601.0312.03933600001250020101720005252
32011160750.01065.00.01045.02110.021100021101.00.02131822.0522.0000000020102440004242
46013830550.0791.00.0137.0928.0928701016290.00.02131612.0482.0212340000020101899001312
56099786620.0602.00.0324.0926.0926678016040.00.02131712.0470.0360360000020101955001212
61204920850.0616.00.0722.01338.013380013381.00.02021602.0582.0001700000201021350099
71205005850.0263.00.01017.01280.012800012800.00.02021502.0506.0082001440020101915001818
81205389850.01180.00.0415.01595.016160016161.00.02021512.0608.02371520000020102365001514
9607500750.00.00.0994.0994.01028776018040.00.02131712.0442.0140600000020101890001111
106010000650.00.00.0763.0763.0763892016550.00.02131712.0440.0157840000020101759001716
11207980670.0935.00.0233.01168.011870011871.00.02031602.0420.04832100005002010185000183
12608402650.00.00.0789.0789.0789676014650.00.02131712.0393.00750000020101804001212
132010176750.0637.00.0663.01300.013410013411.00.01121512.0506.019200000020101715002020
141206820850.0368.01120.00.01488.015020015021.00.01111402.0528.0054001400020102120002525
15605350485603.01416.00.0234.01650.016901589032791.00.031411213.0841.0503360021000201053800077
165012134870.0427.00.0132.0559.01080672017520.00.02041802.0492.032512000002010164000225
17201139492350.01445.00.0411.01856.018560018561.00.01111813.0834.0113000000201039443200
182019138450.0120.00.0744.0864.0864008640.00.01021402.0400.0000000020101410005959
19201317566119.0790.0163.0589.01542.020730020731.00.02031722.0500.034900000020102100003222
20201175166480.0705.00.01139.01844.018440018440.00.02031712.0546.001220000020101900003333
2185106257681.0885.0168.00.01053.011730011731.00.02031622.0528.001200000020101700003636
22607500750.0533.00.0281.0814.0814860016741.00.02131702.0663.00960000020102160001010
23201124167180.0578.00.0426.01004.010040010041.00.01021512.0480.000000070020101490004040
242012537560.0734.00.0344.01078.010780010781.00.01131612.0500.000000002010149900392
25208450560.0775.00.0281.01056.010560010561.00.01031611.0304.0085184000020101420004242
26208400450.0804.078.00.0882.0882008821.00.01021402.0525.024000000020101260004040
272010500450.0432.00.0432.0864.0864008640.00.01031510.00.0000000020101150003939
281205858750.01051.00.0354.01405.013370013371.00.02021512.0511.0203680000020101840001111
29160168065504.0156.00.0327.0483.048350409870.00.01121501.0264.02750000002010960003939
.........................................................................................................
2900201361885198.01350.00.0378.01728.019600019601.00.02031823.0714.01723800000200632000010
2901201144385208.01460.00.0408.01868.020280020281.00.02021723.0880.03266600000200636990010
2902201157795382.01455.00.0383.01838.018380018381.00.02031913.0682.016122500000200635990010
29032031250130.00.00.00.00.016000016000.00.01131601.0270.00013500002006815005555
290490702075200.01243.00.045.01288.013680013682.00.02022804.0784.004800000200621500099
2905120450065116.0897.00.0319.01216.012160012161.00.02021502.0402.0012500000200616400088
2906120450065443.01201.00.036.01237.013370013371.00.02021502.0405.0019900000200615350088
29072017217550.00.00.01140.01140.011400011400.00.01031600.00.036560000020068450000
29081602665560.00.00.0264.0264.0616688013040.00.01131511.0336.014100000020061045002929
29091602665560.0548.0173.036.0757.0925550014750.00.02041611.0336.0104260000020061270002929
29101603964640.0837.00.0105.0942.012911230025211.00.021511012.0576.0728200000020061514003333
29112010172570.0441.00.0423.0864.0874008741.00.01031501.0288.00120000002006126500383
29129011836550.0149.00.01503.01652.016520016520.00.02042803.0928.0000000020061465003636
29131801470460.0522.00.0108.0630.0630006301.00.01011300.00.000000002006730003636
29141601484440.0252.00.0294.0546.0546546010920.00.01131501.0253.000000002006794003434
2915201338455194.0119.0344.0641.01104.013600013601.00.01031811.0336.016000000020061400003727
29161801533570.0553.00.077.0630.0630006301.00.01011300.00.000000002006920003636
29171601533450.0408.00.0138.0546.0546546010920.00.01131501.0286.000000002006875503636
29181601526450.00.00.0546.0546.0546546010920.00.01131500.00.0034000002006795003636
29191601936470.00.00.0546.0546.0546546010920.00.01131500.00.000000002006905003636
29201601894450.0252.00.0294.0546.0546546010920.00.01131601.0286.0024000002006710003636
29219012640650.0936.0396.0396.01728.017280017280.00.02042802.0574.04000000020061509003030
2922909297550.01606.00.0122.01728.017280017282.00.02042802.0560.0000000020061880003030
29232017400550.0936.00.0190.01126.011260011261.00.02031512.0484.0295410000020061600002929
29242020000570.01224.00.00.01224.012240012241.00.01041712.0576.047400000020061310004610
2925807937660.0819.00.0184.01003.010030010031.00.01031602.0588.012000000020061425002222
2926208885550.0301.0324.0239.0864.0902009021.00.01021502.0484.016400000020061310002323
29278510441550.0337.00.0575.0912.0970009700.01.01031600.00.08032000070020061320001414
29282010010550.01071.0123.0195.01389.013890013891.00.01021612.0418.0240380000020061700003231
29296096277594.0758.00.0238.0996.09961004020000.00.02131913.0650.0190480000020061880001312
\n", "

2927 rows × 34 columns

\n", "
" ], "text/plain": [ " MS SubClass Lot Area Overall Qual Overall Cond Mas Vnr Area \\\n", "0 20 31770 6 5 112.0 \n", "1 20 11622 5 6 0.0 \n", "2 20 14267 6 6 108.0 \n", "3 20 11160 7 5 0.0 \n", "4 60 13830 5 5 0.0 \n", "5 60 9978 6 6 20.0 \n", "6 120 4920 8 5 0.0 \n", "7 120 5005 8 5 0.0 \n", "8 120 5389 8 5 0.0 \n", "9 60 7500 7 5 0.0 \n", "10 60 10000 6 5 0.0 \n", "11 20 7980 6 7 0.0 \n", "12 60 8402 6 5 0.0 \n", "13 20 10176 7 5 0.0 \n", "14 120 6820 8 5 0.0 \n", "15 60 53504 8 5 603.0 \n", "16 50 12134 8 7 0.0 \n", "17 20 11394 9 2 350.0 \n", "18 20 19138 4 5 0.0 \n", "19 20 13175 6 6 119.0 \n", "20 20 11751 6 6 480.0 \n", "21 85 10625 7 6 81.0 \n", "22 60 7500 7 5 0.0 \n", "23 20 11241 6 7 180.0 \n", "24 20 12537 5 6 0.0 \n", "25 20 8450 5 6 0.0 \n", "26 20 8400 4 5 0.0 \n", "27 20 10500 4 5 0.0 \n", "28 120 5858 7 5 0.0 \n", "29 160 1680 6 5 504.0 \n", "... ... ... ... ... ... \n", "2900 20 13618 8 5 198.0 \n", "2901 20 11443 8 5 208.0 \n", "2902 20 11577 9 5 382.0 \n", "2903 20 31250 1 3 0.0 \n", "2904 90 7020 7 5 200.0 \n", "2905 120 4500 6 5 116.0 \n", "2906 120 4500 6 5 443.0 \n", "2907 20 17217 5 5 0.0 \n", "2908 160 2665 5 6 0.0 \n", "2909 160 2665 5 6 0.0 \n", "2910 160 3964 6 4 0.0 \n", "2911 20 10172 5 7 0.0 \n", "2912 90 11836 5 5 0.0 \n", "2913 180 1470 4 6 0.0 \n", "2914 160 1484 4 4 0.0 \n", "2915 20 13384 5 5 194.0 \n", "2916 180 1533 5 7 0.0 \n", "2917 160 1533 4 5 0.0 \n", "2918 160 1526 4 5 0.0 \n", "2919 160 1936 4 7 0.0 \n", "2920 160 1894 4 5 0.0 \n", "2921 90 12640 6 5 0.0 \n", "2922 90 9297 5 5 0.0 \n", "2923 20 17400 5 5 0.0 \n", "2924 20 20000 5 7 0.0 \n", "2925 80 7937 6 6 0.0 \n", "2926 20 8885 5 5 0.0 \n", "2927 85 10441 5 5 0.0 \n", "2928 20 10010 5 5 0.0 \n", "2929 60 9627 7 5 94.0 \n", "\n", " BsmtFin SF 1 BsmtFin SF 2 Bsmt Unf SF Total Bsmt SF 1st Flr SF \\\n", "0 639.0 0.0 441.0 1080.0 1656 \n", "1 468.0 144.0 270.0 882.0 896 \n", "2 923.0 0.0 406.0 1329.0 1329 \n", "3 1065.0 0.0 1045.0 2110.0 2110 \n", "4 791.0 0.0 137.0 928.0 
928 \n", "5 602.0 0.0 324.0 926.0 926 \n", "6 616.0 0.0 722.0 1338.0 1338 \n", "7 263.0 0.0 1017.0 1280.0 1280 \n", "8 1180.0 0.0 415.0 1595.0 1616 \n", "9 0.0 0.0 994.0 994.0 1028 \n", "10 0.0 0.0 763.0 763.0 763 \n", "11 935.0 0.0 233.0 1168.0 1187 \n", "12 0.0 0.0 789.0 789.0 789 \n", "13 637.0 0.0 663.0 1300.0 1341 \n", "14 368.0 1120.0 0.0 1488.0 1502 \n", "15 1416.0 0.0 234.0 1650.0 1690 \n", "16 427.0 0.0 132.0 559.0 1080 \n", "17 1445.0 0.0 411.0 1856.0 1856 \n", "18 120.0 0.0 744.0 864.0 864 \n", "19 790.0 163.0 589.0 1542.0 2073 \n", "20 705.0 0.0 1139.0 1844.0 1844 \n", "21 885.0 168.0 0.0 1053.0 1173 \n", "22 533.0 0.0 281.0 814.0 814 \n", "23 578.0 0.0 426.0 1004.0 1004 \n", "24 734.0 0.0 344.0 1078.0 1078 \n", "25 775.0 0.0 281.0 1056.0 1056 \n", "26 804.0 78.0 0.0 882.0 882 \n", "27 432.0 0.0 432.0 864.0 864 \n", "28 1051.0 0.0 354.0 1405.0 1337 \n", "29 156.0 0.0 327.0 483.0 483 \n", "... ... ... ... ... ... \n", "2900 1350.0 0.0 378.0 1728.0 1960 \n", "2901 1460.0 0.0 408.0 1868.0 2028 \n", "2902 1455.0 0.0 383.0 1838.0 1838 \n", "2903 0.0 0.0 0.0 0.0 1600 \n", "2904 1243.0 0.0 45.0 1288.0 1368 \n", "2905 897.0 0.0 319.0 1216.0 1216 \n", "2906 1201.0 0.0 36.0 1237.0 1337 \n", "2907 0.0 0.0 1140.0 1140.0 1140 \n", "2908 0.0 0.0 264.0 264.0 616 \n", "2909 548.0 173.0 36.0 757.0 925 \n", "2910 837.0 0.0 105.0 942.0 1291 \n", "2911 441.0 0.0 423.0 864.0 874 \n", "2912 149.0 0.0 1503.0 1652.0 1652 \n", "2913 522.0 0.0 108.0 630.0 630 \n", "2914 252.0 0.0 294.0 546.0 546 \n", "2915 119.0 344.0 641.0 1104.0 1360 \n", "2916 553.0 0.0 77.0 630.0 630 \n", "2917 408.0 0.0 138.0 546.0 546 \n", "2918 0.0 0.0 546.0 546.0 546 \n", "2919 0.0 0.0 546.0 546.0 546 \n", "2920 252.0 0.0 294.0 546.0 546 \n", "2921 936.0 396.0 396.0 1728.0 1728 \n", "2922 1606.0 0.0 122.0 1728.0 1728 \n", "2923 936.0 0.0 190.0 1126.0 1126 \n", "2924 1224.0 0.0 0.0 1224.0 1224 \n", "2925 819.0 0.0 184.0 1003.0 1003 \n", "2926 301.0 324.0 239.0 864.0 902 \n", "2927 337.0 0.0 575.0 912.0 
970 \n", "2928 1071.0 123.0 195.0 1389.0 1389 \n", "2929 758.0 0.0 238.0 996.0 996 \n", "\n", " 2nd Flr SF Low Qual Fin SF Gr Liv Area Bsmt Full Bath \\\n", "0 0 0 1656 1.0 \n", "1 0 0 896 0.0 \n", "2 0 0 1329 0.0 \n", "3 0 0 2110 1.0 \n", "4 701 0 1629 0.0 \n", "5 678 0 1604 0.0 \n", "6 0 0 1338 1.0 \n", "7 0 0 1280 0.0 \n", "8 0 0 1616 1.0 \n", "9 776 0 1804 0.0 \n", "10 892 0 1655 0.0 \n", "11 0 0 1187 1.0 \n", "12 676 0 1465 0.0 \n", "13 0 0 1341 1.0 \n", "14 0 0 1502 1.0 \n", "15 1589 0 3279 1.0 \n", "16 672 0 1752 0.0 \n", "17 0 0 1856 1.0 \n", "18 0 0 864 0.0 \n", "19 0 0 2073 1.0 \n", "20 0 0 1844 0.0 \n", "21 0 0 1173 1.0 \n", "22 860 0 1674 1.0 \n", "23 0 0 1004 1.0 \n", "24 0 0 1078 1.0 \n", "25 0 0 1056 1.0 \n", "26 0 0 882 1.0 \n", "27 0 0 864 0.0 \n", "28 0 0 1337 1.0 \n", "29 504 0 987 0.0 \n", "... ... ... ... ... \n", "2900 0 0 1960 1.0 \n", "2901 0 0 2028 1.0 \n", "2902 0 0 1838 1.0 \n", "2903 0 0 1600 0.0 \n", "2904 0 0 1368 2.0 \n", "2905 0 0 1216 1.0 \n", "2906 0 0 1337 1.0 \n", "2907 0 0 1140 0.0 \n", "2908 688 0 1304 0.0 \n", "2909 550 0 1475 0.0 \n", "2910 1230 0 2521 1.0 \n", "2911 0 0 874 1.0 \n", "2912 0 0 1652 0.0 \n", "2913 0 0 630 1.0 \n", "2914 546 0 1092 0.0 \n", "2915 0 0 1360 1.0 \n", "2916 0 0 630 1.0 \n", "2917 546 0 1092 0.0 \n", "2918 546 0 1092 0.0 \n", "2919 546 0 1092 0.0 \n", "2920 546 0 1092 0.0 \n", "2921 0 0 1728 0.0 \n", "2922 0 0 1728 2.0 \n", "2923 0 0 1126 1.0 \n", "2924 0 0 1224 1.0 \n", "2925 0 0 1003 1.0 \n", "2926 0 0 902 1.0 \n", "2927 0 0 970 0.0 \n", "2928 0 0 1389 1.0 \n", "2929 1004 0 2000 0.0 \n", "\n", " Bsmt Half Bath Full Bath Half Bath Bedroom AbvGr Kitchen AbvGr \\\n", "0 0.0 1 0 3 1 \n", "1 0.0 1 0 2 1 \n", "2 0.0 1 1 3 1 \n", "3 0.0 2 1 3 1 \n", "4 0.0 2 1 3 1 \n", "5 0.0 2 1 3 1 \n", "6 0.0 2 0 2 1 \n", "7 0.0 2 0 2 1 \n", "8 0.0 2 0 2 1 \n", "9 0.0 2 1 3 1 \n", "10 0.0 2 1 3 1 \n", "11 0.0 2 0 3 1 \n", "12 0.0 2 1 3 1 \n", "13 0.0 1 1 2 1 \n", "14 0.0 1 1 1 1 \n", "15 0.0 3 1 4 1 \n", "16 0.0 2 0 4 
1 \n", "17 0.0 1 1 1 1 \n", "18 0.0 1 0 2 1 \n", "19 0.0 2 0 3 1 \n", "20 0.0 2 0 3 1 \n", "21 0.0 2 0 3 1 \n", "22 0.0 2 1 3 1 \n", "23 0.0 1 0 2 1 \n", "24 0.0 1 1 3 1 \n", "25 0.0 1 0 3 1 \n", "26 0.0 1 0 2 1 \n", "27 0.0 1 0 3 1 \n", "28 0.0 2 0 2 1 \n", "29 0.0 1 1 2 1 \n", "... ... ... ... ... ... \n", "2900 0.0 2 0 3 1 \n", "2901 0.0 2 0 2 1 \n", "2902 0.0 2 0 3 1 \n", "2903 0.0 1 1 3 1 \n", "2904 0.0 2 0 2 2 \n", "2905 0.0 2 0 2 1 \n", "2906 0.0 2 0 2 1 \n", "2907 0.0 1 0 3 1 \n", "2908 0.0 1 1 3 1 \n", "2909 0.0 2 0 4 1 \n", "2910 0.0 2 1 5 1 \n", "2911 0.0 1 0 3 1 \n", "2912 0.0 2 0 4 2 \n", "2913 0.0 1 0 1 1 \n", "2914 0.0 1 1 3 1 \n", "2915 0.0 1 0 3 1 \n", "2916 0.0 1 0 1 1 \n", "2917 0.0 1 1 3 1 \n", "2918 0.0 1 1 3 1 \n", "2919 0.0 1 1 3 1 \n", "2920 0.0 1 1 3 1 \n", "2921 0.0 2 0 4 2 \n", "2922 0.0 2 0 4 2 \n", "2923 0.0 2 0 3 1 \n", "2924 0.0 1 0 4 1 \n", "2925 0.0 1 0 3 1 \n", "2926 0.0 1 0 2 1 \n", "2927 1.0 1 0 3 1 \n", "2928 0.0 1 0 2 1 \n", "2929 0.0 2 1 3 1 \n", "\n", " TotRms AbvGrd Fireplaces Garage Cars Garage Area Wood Deck SF \\\n", "0 7 2 2.0 528.0 210 \n", "1 5 0 1.0 730.0 140 \n", "2 6 0 1.0 312.0 393 \n", "3 8 2 2.0 522.0 0 \n", "4 6 1 2.0 482.0 212 \n", "5 7 1 2.0 470.0 360 \n", "6 6 0 2.0 582.0 0 \n", "7 5 0 2.0 506.0 0 \n", "8 5 1 2.0 608.0 237 \n", "9 7 1 2.0 442.0 140 \n", "10 7 1 2.0 440.0 157 \n", "11 6 0 2.0 420.0 483 \n", "12 7 1 2.0 393.0 0 \n", "13 5 1 2.0 506.0 192 \n", "14 4 0 2.0 528.0 0 \n", "15 12 1 3.0 841.0 503 \n", "16 8 0 2.0 492.0 325 \n", "17 8 1 3.0 834.0 113 \n", "18 4 0 2.0 400.0 0 \n", "19 7 2 2.0 500.0 349 \n", "20 7 1 2.0 546.0 0 \n", "21 6 2 2.0 528.0 0 \n", "22 7 0 2.0 663.0 0 \n", "23 5 1 2.0 480.0 0 \n", "24 6 1 2.0 500.0 0 \n", "25 6 1 1.0 304.0 0 \n", "26 4 0 2.0 525.0 240 \n", "27 5 1 0.0 0.0 0 \n", "28 5 1 2.0 511.0 203 \n", "29 5 0 1.0 264.0 275 \n", "... ... ... ... ... ... 
\n", "2900 8 2 3.0 714.0 172 \n", "2901 7 2 3.0 880.0 326 \n", "2902 9 1 3.0 682.0 161 \n", "2903 6 0 1.0 270.0 0 \n", "2904 8 0 4.0 784.0 0 \n", "2905 5 0 2.0 402.0 0 \n", "2906 5 0 2.0 405.0 0 \n", "2907 6 0 0.0 0.0 36 \n", "2908 5 1 1.0 336.0 141 \n", "2909 6 1 1.0 336.0 104 \n", "2910 10 1 2.0 576.0 728 \n", "2911 5 0 1.0 288.0 0 \n", "2912 8 0 3.0 928.0 0 \n", "2913 3 0 0.0 0.0 0 \n", "2914 5 0 1.0 253.0 0 \n", "2915 8 1 1.0 336.0 160 \n", "2916 3 0 0.0 0.0 0 \n", "2917 5 0 1.0 286.0 0 \n", "2918 5 0 0.0 0.0 0 \n", "2919 5 0 0.0 0.0 0 \n", "2920 6 0 1.0 286.0 0 \n", "2921 8 0 2.0 574.0 40 \n", "2922 8 0 2.0 560.0 0 \n", "2923 5 1 2.0 484.0 295 \n", "2924 7 1 2.0 576.0 474 \n", "2925 6 0 2.0 588.0 120 \n", "2926 5 0 2.0 484.0 164 \n", "2927 6 0 0.0 0.0 80 \n", "2928 6 1 2.0 418.0 240 \n", "2929 9 1 3.0 650.0 190 \n", "\n", " Open Porch SF Enclosed Porch 3Ssn Porch Screen Porch Pool Area \\\n", "0 62 0 0 0 0 \n", "1 0 0 0 120 0 \n", "2 36 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 34 0 0 0 0 \n", "5 36 0 0 0 0 \n", "6 0 170 0 0 0 \n", "7 82 0 0 144 0 \n", "8 152 0 0 0 0 \n", "9 60 0 0 0 0 \n", "10 84 0 0 0 0 \n", "11 21 0 0 0 0 \n", "12 75 0 0 0 0 \n", "13 0 0 0 0 0 \n", "14 54 0 0 140 0 \n", "15 36 0 0 210 0 \n", "16 12 0 0 0 0 \n", "17 0 0 0 0 0 \n", "18 0 0 0 0 0 \n", "19 0 0 0 0 0 \n", "20 122 0 0 0 0 \n", "21 120 0 0 0 0 \n", "22 96 0 0 0 0 \n", "23 0 0 0 0 0 \n", "24 0 0 0 0 0 \n", "25 85 184 0 0 0 \n", "26 0 0 0 0 0 \n", "27 0 0 0 0 0 \n", "28 68 0 0 0 0 \n", "29 0 0 0 0 0 \n", "... ... ... ... ... ... 
\n", "2900 38 0 0 0 0 \n", "2901 66 0 0 0 0 \n", "2902 225 0 0 0 0 \n", "2903 0 135 0 0 0 \n", "2904 48 0 0 0 0 \n", "2905 125 0 0 0 0 \n", "2906 199 0 0 0 0 \n", "2907 56 0 0 0 0 \n", "2908 0 0 0 0 0 \n", "2909 26 0 0 0 0 \n", "2910 20 0 0 0 0 \n", "2911 120 0 0 0 0 \n", "2912 0 0 0 0 0 \n", "2913 0 0 0 0 0 \n", "2914 0 0 0 0 0 \n", "2915 0 0 0 0 0 \n", "2916 0 0 0 0 0 \n", "2917 0 0 0 0 0 \n", "2918 34 0 0 0 0 \n", "2919 0 0 0 0 0 \n", "2920 24 0 0 0 0 \n", "2921 0 0 0 0 0 \n", "2922 0 0 0 0 0 \n", "2923 41 0 0 0 0 \n", "2924 0 0 0 0 0 \n", "2925 0 0 0 0 0 \n", "2926 0 0 0 0 0 \n", "2927 32 0 0 0 0 \n", "2928 38 0 0 0 0 \n", "2929 48 0 0 0 0 \n", "\n", " Misc Val Yr Sold SalePrice Years Before Sale Years Since Remod \n", "0 0 2010 215000 50 50 \n", "1 0 2010 105000 49 49 \n", "2 12500 2010 172000 52 52 \n", "3 0 2010 244000 42 42 \n", "4 0 2010 189900 13 12 \n", "5 0 2010 195500 12 12 \n", "6 0 2010 213500 9 9 \n", "7 0 2010 191500 18 18 \n", "8 0 2010 236500 15 14 \n", "9 0 2010 189000 11 11 \n", "10 0 2010 175900 17 16 \n", "11 500 2010 185000 18 3 \n", "12 0 2010 180400 12 12 \n", "13 0 2010 171500 20 20 \n", "14 0 2010 212000 25 25 \n", "15 0 2010 538000 7 7 \n", "16 0 2010 164000 22 5 \n", "17 0 2010 394432 0 0 \n", "18 0 2010 141000 59 59 \n", "19 0 2010 210000 32 22 \n", "20 0 2010 190000 33 33 \n", "21 0 2010 170000 36 36 \n", "22 0 2010 216000 10 10 \n", "23 700 2010 149000 40 40 \n", "24 0 2010 149900 39 2 \n", "25 0 2010 142000 42 42 \n", "26 0 2010 126000 40 40 \n", "27 0 2010 115000 39 39 \n", "28 0 2010 184000 11 11 \n", "29 0 2010 96000 39 39 \n", "... ... ... ... ... ... 
\n", "2900 0 2006 320000 1 0 \n", "2901 0 2006 369900 1 0 \n", "2902 0 2006 359900 1 0 \n", "2903 0 2006 81500 55 55 \n", "2904 0 2006 215000 9 9 \n", "2905 0 2006 164000 8 8 \n", "2906 0 2006 153500 8 8 \n", "2907 0 2006 84500 0 0 \n", "2908 0 2006 104500 29 29 \n", "2909 0 2006 127000 29 29 \n", "2910 0 2006 151400 33 33 \n", "2911 0 2006 126500 38 3 \n", "2912 0 2006 146500 36 36 \n", "2913 0 2006 73000 36 36 \n", "2914 0 2006 79400 34 34 \n", "2915 0 2006 140000 37 27 \n", "2916 0 2006 92000 36 36 \n", "2917 0 2006 87550 36 36 \n", "2918 0 2006 79500 36 36 \n", "2919 0 2006 90500 36 36 \n", "2920 0 2006 71000 36 36 \n", "2921 0 2006 150900 30 30 \n", "2922 0 2006 188000 30 30 \n", "2923 0 2006 160000 29 29 \n", "2924 0 2006 131000 46 10 \n", "2925 0 2006 142500 22 22 \n", "2926 0 2006 131000 23 23 \n", "2927 700 2006 132000 14 14 \n", "2928 0 2006 170000 32 31 \n", "2929 0 2006 188000 13 12 \n", "\n", "[2927 rows x 34 columns]" ] }, "execution_count": 389, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numerical_df = transform_df.select_dtypes(include=['int', 'float'])\n", "numerical_df" ] }, { "cell_type": "code", "execution_count": 390, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 2 0.006127\n", "Misc Val 0.019273\n", "Yr Sold 0.030358\n", "3Ssn Porch 0.032268\n", "Bsmt Half Bath 0.035875\n", "Low Qual Fin SF 0.037629\n", "Pool Area 0.068438\n", "MS SubClass 0.085128\n", "Overall Cond 0.101540\n", "Screen Porch 0.112280\n", "Kitchen AbvGr 0.119760\n", "Enclosed Porch 0.128685\n", "Bedroom AbvGr 0.143916\n", "Bsmt Unf SF 0.182751\n", "Lot Area 0.267520\n", "2nd Flr SF 0.269601\n", "Bsmt Full Bath 0.276258\n", "Half Bath 0.284871\n", "Open Porch SF 0.316262\n", "Wood Deck SF 0.328183\n", "BsmtFin SF 1 0.439284\n", "Fireplaces 0.474831\n", "TotRms AbvGrd 0.498574\n", "Mas Vnr Area 0.506983\n", "Years Since Remod 0.534985\n", "Full Bath 0.546118\n", "Years Before Sale 0.558979\n", "1st Flr SF 
0.635185\n", "Garage Area 0.641425\n", "Total Bsmt SF 0.644012\n", "Garage Cars 0.648361\n", "Gr Liv Area 0.717596\n", "Overall Qual 0.801206\n", "SalePrice 1.000000\n", "Name: SalePrice, dtype: float64" ] }, "execution_count": 390, "metadata": {}, "output_type": "execute_result" } ], "source": [ "abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n", "abs_corr_coeffs" ] }, { "cell_type": "code", "execution_count": 391, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 1 0.439284\n", "Fireplaces 0.474831\n", "TotRms AbvGrd 0.498574\n", "Mas Vnr Area 0.506983\n", "Years Since Remod 0.534985\n", "Full Bath 0.546118\n", "Years Before Sale 0.558979\n", "1st Flr SF 0.635185\n", "Garage Area 0.641425\n", "Total Bsmt SF 0.644012\n", "Garage Cars 0.648361\n", "Gr Liv Area 0.717596\n", "Overall Qual 0.801206\n", "SalePrice 1.000000\n", "Name: SalePrice, dtype: float64" ] }, "execution_count": 391, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Let's only keep columns with a correlation coefficient of larger than 0.4 (arbitrary, worth experimenting later!)\n", "abs_corr_coeffs[abs_corr_coeffs > 0.4]" ] }, { "cell_type": "code", "execution_count": 392, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Drop columns with less than 0.4 correlation with SalePrice\n", "transform_df = transform_df.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Which categorical columns should we keep?" 
] }, { "cell_type": "code", "execution_count": 393, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Create a list of column names from documentation that are *meant* to be categorical\n", "nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n", " \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n", " \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n", " \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?\n", "- If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy code this column, hundreds of columns will need to be added back to the data frame." ] }, { "cell_type": "code", "execution_count": 394, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Which categorical columns have we still carried with us? 
We'll test these\n", "transform_cat_cols = []\n", "for col in nominal_features:\n", "    if col in transform_df.columns:\n", "        transform_cat_cols.append(col)\n", "\n", "## How many unique values in each categorical column?\n", "uniqueness_counts = transform_df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n", "## Arbitrary cutoff of 10 unique values (worth experimenting)\n", "drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n", "transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)" ] }, { "cell_type": "code", "execution_count": 395, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Select just the remaining text columns and convert to categorical\n", "text_cols = transform_df.select_dtypes(include=['object'])\n", "for col in text_cols:\n", "    transform_df[col] = transform_df[col].astype('category')\n", "    \n", "## Create dummy columns and add back to the dataframe!\n", "transform_df = pd.concat([\n", "    transform_df, \n", "    pd.get_dummies(transform_df.select_dtypes(include=['category']))\n", "], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Update `select_features()`" ] }, { "cell_type": "code", "execution_count": 426, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[26865.660668584878, 36061.238855571239, 24743.661620667855, 27948.830118737143]\n" ] }, { "data": { "text/plain": [ "28904.84781589028" ] }, "execution_count": 426, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", "    num_missing = df.isnull().sum()\n", "    drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", "    df = df.drop(drop_missing_cols.index, axis=1)\n", "    \n", "    text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", "    drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", "    df = df.drop(drop_missing_cols_2.index, axis=1)\n", "    \n", "    num_missing = 
df.select_dtypes(include=['int', 'float']).isnull().sum()\n", " fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", " replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", " df = df.fillna(replacement_values_dict)\n", " \n", " years_sold = df['Yr Sold'] - df['Year Built']\n", " years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", " df['Years Before Sale'] = years_sold\n", " df['Years Since Remod'] = years_since_remod\n", " df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", " df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n", " return df\n", "\n", "def select_features(df, coeff_threshold=0.4, uniq_threshold=10):\n", " numerical_df = df.select_dtypes(include=['int', 'float'])\n", " abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n", " df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)\n", " \n", " nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n", " \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n", " \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n", " \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]\n", " \n", " transform_cat_cols = []\n", " for col in nominal_features:\n", " if col in df.columns:\n", " transform_cat_cols.append(col)\n", "\n", " uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n", " drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n", " df = df.drop(drop_nonuniq_cols, axis=1)\n", " \n", " text_cols = df.select_dtypes(include=['object'])\n", " for col in text_cols:\n", " df[col] = df[col].astype('category')\n", " df = pd.concat([df, 
pd.get_dummies(df.select_dtypes(include=['category']))], axis=1)\n", "    \n", "    return df\n", "\n", "def train_and_test(df, k=0):\n", "    \"\"\"Fit a linear regression on the numeric features and return RMSE.\n", "\n", "    k=0 -> simple holdout validation on a fixed 1460-row train split.\n", "    k=1 -> holdout validation on shuffled rows, averaged over both directions.\n", "    k>1 -> k-fold cross validation; returns the mean RMSE across folds.\n", "    \"\"\"\n", "    numeric_df = df.select_dtypes(include=['integer', 'float'])\n", "    features = numeric_df.columns.drop(\"SalePrice\")\n", "    lr = linear_model.LinearRegression()\n", "    \n", "    if k == 0:\n", "        train = df[:1460]\n", "        test = df[1460:]\n", "\n", "        lr.fit(train[features], train[\"SalePrice\"])\n", "        predictions = lr.predict(test[features])\n", "        mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", "        rmse = np.sqrt(mse)\n", "\n", "        return rmse\n", "    \n", "    if k == 1:\n", "        # Randomize *all* rows (frac=1) so the holdout split is not order-dependent\n", "        shuffled_df = df.sample(frac=1)\n", "        # Split the *shuffled* frame (previously this split `df`, which silently\n", "        # made the shuffle a no-op)\n", "        train = shuffled_df[:1460]\n", "        test = shuffled_df[1460:]\n", "        \n", "        lr.fit(train[features], train[\"SalePrice\"])\n", "        predictions_one = lr.predict(test[features])\n", "        \n", "        mse_one = mean_squared_error(test[\"SalePrice\"], predictions_one)\n", "        rmse_one = np.sqrt(mse_one)\n", "        \n", "        # Swap the folds so each half serves once as train and once as test\n", "        lr.fit(test[features], test[\"SalePrice\"])\n", "        predictions_two = lr.predict(train[features])\n", "        \n", "        mse_two = mean_squared_error(train[\"SalePrice\"], predictions_two)\n", "        rmse_two = np.sqrt(mse_two)\n", "        \n", "        avg_rmse = np.mean([rmse_one, rmse_two])\n", "        print(rmse_one)\n", "        print(rmse_two)\n", "        return avg_rmse\n", "    else:\n", "        kf = KFold(n_splits=k, shuffle=True)\n", "        rmse_values = []\n", "        for train_index, test_index in kf.split(df):\n", "            train = df.iloc[train_index]\n", "            test = df.iloc[test_index]\n", "            lr.fit(train[features], train[\"SalePrice\"])\n", "            predictions = lr.predict(test[features])\n", "            mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", "            rmse = np.sqrt(mse)\n", "            rmse_values.append(rmse)\n", "        print(rmse_values)\n", "        avg_rmse = np.mean(rmse_values)\n", "        return avg_rmse\n", "\n", "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df, k=4)\n", "\n", "rmse" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }