{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.display.max_columns = 999\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.model_selection import KFold\n", "\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn import linear_model\n", "from sklearn.model_selection import KFold" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "57088.25161263909" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " return df\n", "\n", "def select_features(df):\n", " return df[[\"Gr Liv Area\", \"SalePrice\"]]\n", "\n", "def train_and_test(df): \n", " train = df[:1460]\n", " test = df[1460:]\n", " \n", " ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n", " ## and return only those columns as a data frame.\n", " numeric_train = train.select_dtypes(include=['integer', 'float'])\n", " numeric_test = test.select_dtypes(include=['integer', 'float'])\n", " \n", " ## You can use `pd.Series.drop()` to drop a value.\n", " features = numeric_train.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " \n", " return rmse\n", "\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df)\n", "\n", "rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Engineering\n", "\n", "- Handle missing values:\n", " - All columns:\n", " - Drop any with 5% or more missing values **for now**.\n", " - Text columns:\n", " - Drop any with 1 or more missing values **for now**.\n", " - Numerical columns:\n", " - For columns with missing values, fill in with the most common value in that column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1: All columns: drop any with 5% or more missing values **for now**." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Series object: column name -> number of missing values\n", "num_missing = df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Filter Series to columns containing >5% missing values\n", "drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", "\n", "# Drop those columns from the data frame. Note the use of the .index accessor\n", "df = df.drop(drop_missing_cols.index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2: Text columns: drop any with 1 or more missing values **for now**." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Series object: column name -> number of missing values\n", "text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", "\n", "## Filter Series to columns containing *any* missing values\n", "drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", "\n", "df = df.drop(drop_missing_cols_2.index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "3: Numerical columns: for columns with missing values, fill in with the most common value in that column" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 1 1\n", "BsmtFin SF 2 1\n", "Bsmt Unf SF 1\n", "Total Bsmt SF 1\n", "Garage Cars 1\n", "Garage Area 1\n", "Bsmt Full Bath 2\n", "Bsmt Half Bath 2\n", "Mas Vnr Area 23\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Compute column-wise missing value counts\n", "num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", "fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", "fixable_numeric_cols" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'BsmtFin SF 1': 0.0,\n", " 'BsmtFin SF 2': 0.0,\n", " 'Bsmt Unf SF': 0.0,\n", " 'Total Bsmt SF': 0.0,\n", " 'Garage Cars': 2.0,\n", " 'Garage Area': 0.0,\n", " 'Bsmt Full Bath': 0.0,\n", " 'Bsmt Half Bath': 0.0,\n", " 'Mas Vnr Area': 0.0}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Compute the most common value for each column in `fixable_nmeric_missing_cols`.\n", "replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", "replacement_values_dict" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "## Use `pd.DataFrame.fillna()` to replace missing values.\n", "df = df.fillna(replacement_values_dict)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 64\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Verify that every column has 0 missing values\n", "df.isnull().sum().value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What new features can we create, that better capture the information in some of the features?" 
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "2180 -1\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "years_sold = df['Yr Sold'] - df['Year Built']\n", "years_sold[years_sold < 0]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1702 -1\n", "2180 -2\n", "2181 -1\n", "dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", "years_since_remod[years_since_remod < 0]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Create new columns\n", "df['Years Before Sale'] = years_sold\n", "df['Years Since Remod'] = years_since_remod\n", "\n", "## Drop rows with negative values for both of these new features\n", "df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", "## No longer need original year columns\n", "df = df.drop([\"Year Built\", \"Year Remod/Add\"], axis = 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop columns that:\n", "- that aren't useful for ML\n", "- leak data about the final sale, read more about columns [here](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "## Drop columns that aren't useful for ML\n", "df = df.drop([\"PID\", \"Order\"], axis=1)\n", "\n", "## Drop columns that leak info about the final sale\n", "df = df.drop([\"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Yr Sold\"], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's update transform_features()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "55275.367312413066" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " num_missing = df.isnull().sum()\n", " drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", " df = df.drop(drop_missing_cols.index, axis=1)\n", " \n", " text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n", " drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", " df = df.drop(drop_missing_cols_2.index, axis=1)\n", " \n", " num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", " fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", " replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", " df = df.fillna(replacement_values_dict)\n", " \n", " years_sold = df['Yr Sold'] - df['Year Built']\n", " years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", " df['Years Before Sale'] = years_sold\n", " df['Years Since Remod'] = years_since_remod\n", " df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", " df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n", " return df\n", "\n", "def select_features(df):\n", " return df[[\"Gr Liv Area\", \"SalePrice\"]]\n", "\n", "def train_and_test(df): \n", " train = df[:1460]\n", " test = df[1460:]\n", " \n", " ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n", " ## and return only those columns as a 
DataFrame.\n", " numeric_train = train.select_dtypes(include=['integer', 'float'])\n", " numeric_test = test.select_dtypes(include=['integer', 'float'])\n", " \n", " ## You can use `pd.Index.drop()` to drop a label.\n", " features = numeric_train.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " \n", " return rmse\n", "\n", "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df)\n", "\n", "rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Selection" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MS SubClassLot AreaOverall QualOverall CondMas Vnr AreaBsmtFin SF 1BsmtFin SF 2Bsmt Unf SFTotal Bsmt SF1st Flr SF2nd Flr SFLow Qual Fin SFGr Liv AreaBsmt Full BathBsmt Half BathFull BathHalf BathBedroom AbvGrKitchen AbvGrTotRms AbvGrdFireplacesGarage CarsGarage AreaWood Deck SFOpen Porch SFEnclosed Porch3Ssn PorchScreen PorchPool AreaMisc ValYr SoldSalePriceYears Before SaleYears Since Remod
0203177065112.0639.00.0441.01080.016560016561.00.01031722.0528.0210620000020102150005050
12011622560.0468.0144.0270.0882.0896008960.00.01021501.0730.01400001200020101050004949
2201426766108.0923.00.0406.01329.013290013290.00.01131601.0312.03933600001250020101720005252
32011160750.01065.00.01045.02110.021100021101.00.02131822.0522.0000000020102440004242
46013830550.0791.00.0137.0928.0928701016290.00.02131612.0482.0212340000020101899001312
\n", "
" ], "text/plain": [ " MS SubClass Lot Area Overall Qual Overall Cond Mas Vnr Area \\\n", "0 20 31770 6 5 112.0 \n", "1 20 11622 5 6 0.0 \n", "2 20 14267 6 6 108.0 \n", "3 20 11160 7 5 0.0 \n", "4 60 13830 5 5 0.0 \n", "\n", " BsmtFin SF 1 BsmtFin SF 2 Bsmt Unf SF Total Bsmt SF 1st Flr SF \\\n", "0 639.0 0.0 441.0 1080.0 1656 \n", "1 468.0 144.0 270.0 882.0 896 \n", "2 923.0 0.0 406.0 1329.0 1329 \n", "3 1065.0 0.0 1045.0 2110.0 2110 \n", "4 791.0 0.0 137.0 928.0 928 \n", "\n", " 2nd Flr SF Low Qual Fin SF Gr Liv Area Bsmt Full Bath Bsmt Half Bath \\\n", "0 0 0 1656 1.0 0.0 \n", "1 0 0 896 0.0 0.0 \n", "2 0 0 1329 0.0 0.0 \n", "3 0 0 2110 1.0 0.0 \n", "4 701 0 1629 0.0 0.0 \n", "\n", " Full Bath Half Bath Bedroom AbvGr Kitchen AbvGr TotRms AbvGrd \\\n", "0 1 0 3 1 7 \n", "1 1 0 2 1 5 \n", "2 1 1 3 1 6 \n", "3 2 1 3 1 8 \n", "4 2 1 3 1 6 \n", "\n", " Fireplaces Garage Cars Garage Area Wood Deck SF Open Porch SF \\\n", "0 2 2.0 528.0 210 62 \n", "1 0 1.0 730.0 140 0 \n", "2 0 1.0 312.0 393 36 \n", "3 2 2.0 522.0 0 0 \n", "4 1 2.0 482.0 212 34 \n", "\n", " Enclosed Porch 3Ssn Porch Screen Porch Pool Area Misc Val Yr Sold \\\n", "0 0 0 0 0 0 2010 \n", "1 0 0 120 0 0 2010 \n", "2 0 0 0 0 12500 2010 \n", "3 0 0 0 0 0 2010 \n", "4 0 0 0 0 0 2010 \n", "\n", " SalePrice Years Before Sale Years Since Remod \n", "0 215000 50 50 \n", "1 105000 49 49 \n", "2 172000 52 52 \n", "3 244000 42 42 \n", "4 189900 13 12 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numerical_df = transform_df.select_dtypes(include=['int', 'float'])\n", "numerical_df.head(5)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 2 0.006127\n", "Misc Val 0.019273\n", "Yr Sold 0.030358\n", "3Ssn Porch 0.032268\n", "Bsmt Half Bath 0.035875\n", "Low Qual Fin SF 0.037629\n", "Pool Area 0.068438\n", "MS SubClass 0.085128\n", "Overall Cond 0.101540\n", "Screen Porch 0.112280\n", "Kitchen AbvGr 0.119760\n", "Enclosed Porch 0.128685\n", "Bedroom AbvGr 0.143916\n", "Bsmt Unf SF 0.182751\n", "Lot Area 0.267520\n", "2nd Flr SF 0.269601\n", "Bsmt Full Bath 0.276258\n", "Half Bath 0.284871\n", "Open Porch SF 0.316262\n", "Wood Deck SF 0.328183\n", "BsmtFin SF 1 0.439284\n", "Fireplaces 0.474831\n", "TotRms AbvGrd 0.498574\n", "Mas Vnr Area 0.506983\n", "Years Since Remod 0.534985\n", "Full Bath 0.546118\n", "Years Before Sale 0.558979\n", "1st Flr SF 0.635185\n", "Garage Area 0.641425\n", "Total Bsmt SF 0.644012\n", "Garage Cars 0.648361\n", "Gr Liv Area 0.717596\n", "Overall Qual 0.801206\n", "SalePrice 1.000000\n", "Name: SalePrice, dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n", "abs_corr_coeffs" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BsmtFin SF 1 0.439284\n", "Fireplaces 0.474831\n", "TotRms AbvGrd 0.498574\n", "Mas Vnr Area 0.506983\n", "Years Since Remod 0.534985\n", "Full Bath 0.546118\n", "Years Before Sale 0.558979\n", "1st Flr SF 0.635185\n", "Garage Area 0.641425\n", "Total Bsmt SF 0.644012\n", "Garage Cars 0.648361\n", "Gr Liv Area 0.717596\n", "Overall Qual 0.801206\n", "SalePrice 1.000000\n", "Name: SalePrice, dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Let's only keep columns with a correlation coefficient larger than 0.4 
(an arbitrary cutoff; worth experimenting with later).\n", "abs_corr_coeffs[abs_corr_coeffs > 0.4]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Drop columns with less than 0.4 correlation with SalePrice.\n", "transform_df = transform_df.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Which categorical columns should we keep?" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Create a list of column names from documentation that are *meant* to be categorical.\n", "nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n", " \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n", " \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n", " \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?\n", "- If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy-code this column, hundreds of columns will need to be added back to the DataFrame." ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": true }, "outputs": [], "source": [ "## Which categorical columns have we still carried with us? We'll test these.\n", "transform_cat_cols = []\n", "for col in nominal_features:\n", " if col in transform_df.columns:\n", " transform_cat_cols.append(col)\n", "\n", "## How many unique values in each categorical column?\n", "uniqueness_counts = transform_df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n", "## Arbitrary cutoff of 10 unique values (worth experimenting).\n", "drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n", "transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Select only the remaining text columns, and convert to categorical.\n", "text_cols = transform_df.select_dtypes(include=['object'])\n", "for col in text_cols:\n", " transform_df[col] = transform_df[col].astype('category')\n", " \n", "## Create dummy columns, and add back to the DataFrame!\n", "transform_df = pd.concat([\n", " transform_df, \n", " pd.get_dummies(transform_df.select_dtypes(include=['category']))\n", "], axis=1).drop(text_cols.columns, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Update `select_features()` and `train_and_test()`" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[27352.325452161054, 26865.145668097586, 26500.762368070868, 35730.36340669092]\n" ] }, { "data": { "text/plain": [ "29112.149223755107" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def transform_features(df):\n", " num_missing = df.isnull().sum()\n", " drop_missing_cols = num_missing[(num_missing > len(df)/20)].sort_values()\n", " df = df.drop(drop_missing_cols.index, axis=1)\n", " \n", " text_mv_counts = df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)\n",
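" ## Text columns: drop any that still contain missing values.\n",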
" drop_missing_cols_2 = text_mv_counts[text_mv_counts > 0]\n", " df = df.drop(drop_missing_cols_2.index, axis=1)\n", " \n", " num_missing = df.select_dtypes(include=['int', 'float']).isnull().sum()\n", " fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()\n", " replacement_values_dict = df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]\n", " df = df.fillna(replacement_values_dict)\n", " \n", " years_sold = df['Yr Sold'] - df['Year Built']\n", " years_since_remod = df['Yr Sold'] - df['Year Remod/Add']\n", " df['Years Before Sale'] = years_sold\n", " df['Years Since Remod'] = years_since_remod\n", " df = df.drop([1702, 2180, 2181], axis=0)\n", "\n", " df = df.drop([\"PID\", \"Order\", \"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Year Built\", \"Year Remod/Add\"], axis=1)\n", " return df\n", "\n", "def select_features(df, coeff_threshold=0.4, uniq_threshold=10):\n", " numerical_df = df.select_dtypes(include=['int', 'float'])\n", " abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n", " df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)\n", " \n", " nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n", " \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n", " \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n", " \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]\n", " \n", " transform_cat_cols = []\n", " for col in nominal_features:\n", " if col in df.columns:\n", " transform_cat_cols.append(col)\n", "\n", " uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n", " drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index\n", " df = df.drop(drop_nonuniq_cols, axis=1)\n", " \n", " text_cols = df.select_dtypes(include=['object'])\n", " for col in text_cols:\n", " df[col] = df[col].astype('category')\n", " df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols.columns, axis=1)\n", " \n", " return df\n", "\n", "def train_and_test(df, k=0):\n", " numeric_df = df.select_dtypes(include=['integer', 'float'])\n", " features = numeric_df.columns.drop(\"SalePrice\")\n", " lr = linear_model.LinearRegression()\n", " \n", " if k == 0:\n", " train = df[:1460]\n", " test = df[1460:]\n", "\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", "\n", " return rmse\n", " \n", " if k == 1:\n", " ## Shuffle *all* rows (frac=1) of `df`, then split the shuffled frame\n", " shuffled_df = df.sample(frac=1)\n", " train = shuffled_df[:1460]\n", " test = shuffled_df[1460:]\n", " \n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions_one = lr.predict(test[features]) \n", " \n", " mse_one = mean_squared_error(test[\"SalePrice\"], predictions_one)\n", " rmse_one = np.sqrt(mse_one)\n", " \n", " lr.fit(test[features], test[\"SalePrice\"])\n", " predictions_two = lr.predict(train[features]) \n", " \n", " mse_two = mean_squared_error(train[\"SalePrice\"], predictions_two)\n", " rmse_two = np.sqrt(mse_two)\n", " \n", " avg_rmse = np.mean([rmse_one, rmse_two])\n", " print(rmse_one)\n", " print(rmse_two)\n", " return avg_rmse\n", " else:\n", " kf = KFold(n_splits=k, shuffle=True)\n", " rmse_values = []\n",
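" ## Each of the k folds serves once as the test set; average the k RMSE values.\n",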
[]\n", " for train_index, test_index, in kf.split(df):\n", " train = df.iloc[train_index]\n", " test = df.iloc[test_index]\n", " lr.fit(train[features], train[\"SalePrice\"])\n", " predictions = lr.predict(test[features])\n", " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n", " rmse = np.sqrt(mse)\n", " rmse_values.append(rmse)\n", " print(rmse_values)\n", " avg_rmse = np.mean(rmse_values)\n", " return avg_rmse\n", "\n", "df = pd.read_csv(\"AmesHousing.tsv\", delimiter=\"\\t\")\n", "transform_df = transform_features(df)\n", "filtered_df = select_features(transform_df)\n", "rmse = train_and_test(filtered_df, k=4)\n", "\n", "rmse" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }