
Update Mission240Solutions.ipynb

darinbradley 2 years ago
parent
commit
38762f9ea6
1 changed file with 13 additions and 13 deletions

+ 13 - 13
Mission240Solutions.ipynb

@@ -108,7 +108,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "1: All columns: Drop any with 5% or more missing values **for now**."
+    "1: All columns: drop any with 5% or more missing values **for now**."
    ]
   },
   {
@@ -140,7 +140,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "2: Text columns: Drop any with 1 or more missing values **for now**."
+    "2: Text columns: drop any with 1 or more missing values **for now**."
    ]
   },
   {
@@ -164,7 +164,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "3: Numerical columns: For columns with missing values, fill in with the most common value in that column"
+    "3: Numerical columns: for columns with missing values, fill in with the most common value in that column"
    ]
   },
   {
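Taken together, cells 1–3 spell out the missing-value policy. A minimal sketch of those steps in pandas (hedged: `df` is a hypothetical name for the raw housing data frame; the notebook's own code cells for steps 1–3 are not shown in this diff):

```python
import pandas as pd

## 1: Drop any column with 5% or more missing values.
num_missing = df.isnull().sum()
df = df.drop(num_missing[num_missing >= len(df) * 0.05].index, axis=1)

## 2: Drop any text column with 1 or more missing values.
text_missing = df.select_dtypes(include=['object']).isnull().sum()
df = df.drop(text_missing[text_missing > 0].index, axis=1)

## 3: Fill remaining numeric missing values with each column's most
## common value (the mode).
numeric_cols = df.select_dtypes(include=['integer', 'float'])
df = df.fillna(numeric_cols.mode().iloc[0].to_dict())
```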
@@ -412,7 +412,7 @@
     "    test = df[1460:]\n",
     "    \n",
     "    ## You can use `pd.DataFrame.select_dtypes()` to specify column types\n",
-    "    ## and return only those columns as a data frame.\n",
+    "    ## and return only those columns as a DataFrame.\n",
     "    numeric_train = train.select_dtypes(include=['integer', 'float'])\n",
     "    numeric_test = test.select_dtypes(include=['integer', 'float'])\n",
     "    \n",
@@ -844,7 +844,7 @@
     }
    ],
    "source": [
-    "## Let's only keep columns with a correlation coefficient of larger than 0.4 (arbitrary, worth experimenting later!)\n",
+    "## Let's only keep columns with a correlation coefficient larger than 0.4 (arbitrary — worth experimenting later!).\n",
     "abs_corr_coeffs[abs_corr_coeffs > 0.4]"
    ]
   },
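For context, the `abs_corr_coeffs` series filtered above is not constructed in this hunk; a sketch of the usual construction, assuming `transform_df` still contains the `SalePrice` target:

```python
## Absolute correlation of each numeric column with SalePrice,
## sorted ascending (a sketch; the notebook builds this series earlier).
numeric_df = transform_df.select_dtypes(include=['integer', 'float'])
abs_corr_coeffs = numeric_df.corr()['SalePrice'].abs().sort_values()
```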
@@ -856,7 +856,7 @@
    },
    "outputs": [],
    "source": [
-    "## Drop columns with less than 0.4 correlation with SalePrice\n",
+    "## Drop columns with less than 0.4 correlation with SalePrice.\n",
     "transform_df = transform_df.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)"
    ]
   },
@@ -875,7 +875,7 @@
    },
    "outputs": [],
    "source": [
-    "## Create a list of column names from documentation that are *meant* to be categorical\n",
+    "## Create a list of column names from documentation that are *meant* to be categorical.\n",
     "nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n",
     "                    \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n",
     "                    \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n",
@@ -887,7 +887,7 @@
    "metadata": {},
    "source": [
     "- Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?\n",
-    "- If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy code this column, hundreds of columns will need to be added back to the data frame."
+    "- If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy-code this column, hundreds of columns will need to be added back to the DataFrame."
    ]
   },
   {
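The cost this question alludes to is easy to demonstrate on a toy column: `pd.get_dummies` emits one indicator column per unique value, so a categorical column with hundreds of categories adds hundreds of columns (hypothetical data):

```python
import pandas as pd

## Four rows, three categories -> three indicator columns.
toy = pd.Series(['A', 'B', 'C', 'A'], name='Neighborhood')
print(pd.get_dummies(toy).shape)  ## (4, 3)
```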
@@ -898,7 +898,7 @@
    },
    "outputs": [],
    "source": [
-    "## Which categorical columns have we still carried with us? We'll test these \n",
+    "## Which categorical columns have we still carried with us? We'll test these. \n",
     "transform_cat_cols = []\n",
     "for col in nominal_features:\n",
     "    if col in transform_df.columns:\n",
@@ -906,7 +906,7 @@
     "\n",
     "## How many unique values in each categorical column?\n",
     "uniqueness_counts = transform_df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n",
-    "## Aribtrary cutoff of 10 unique values (worth experimenting)\n",
+    "## Aribtrary cutoff of 10 unique values (worth experimenting).\n",
     "drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n",
     "transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)"
    ]
@@ -919,12 +919,12 @@
    },
    "outputs": [],
    "source": [
-    "## Select just the remaining text columns and convert to categorical\n",
+    "## Select only the remaining text columns, and convert to categorical.\n",
     "text_cols = transform_df.select_dtypes(include=['object'])\n",
     "for col in text_cols:\n",
     "    transform_df[col] = transform_df[col].astype('category')\n",
     "    \n",
-    "## Create dummy columns and add back to the dataframe!\n",
+    "## Create dummy columns, and add back to the DataFrame!\n",
     "transform_df = pd.concat([\n",
     "    transform_df, \n",
     "    pd.get_dummies(transform_df.select_dtypes(include=['category']))\n",
@@ -1089,7 +1089,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.3"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,