{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Introduction To The Dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"pd.options.display.max_columns = 99"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" symboling | \n",
" normalized-losses | \n",
" make | \n",
" fuel-type | \n",
" aspiration | \n",
" num-of-doors | \n",
" body-style | \n",
" drive-wheels | \n",
" engine-location | \n",
" wheel-base | \n",
" length | \n",
" width | \n",
" height | \n",
" curb-weight | \n",
" engine-type | \n",
" num-of-cylinders | \n",
" engine-size | \n",
" fuel-system | \n",
" bore | \n",
" stroke | \n",
" compression-rate | \n",
" horsepower | \n",
" peak-rpm | \n",
" city-mpg | \n",
" highway-mpg | \n",
" price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3 | \n",
" ? | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" 88.6 | \n",
" 168.8 | \n",
" 64.1 | \n",
" 48.8 | \n",
" 2548 | \n",
" dohc | \n",
" four | \n",
" 130 | \n",
" mpfi | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111 | \n",
" 5000 | \n",
" 21 | \n",
" 27 | \n",
" 13495 | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" ? | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" 88.6 | \n",
" 168.8 | \n",
" 64.1 | \n",
" 48.8 | \n",
" 2548 | \n",
" dohc | \n",
" four | \n",
" 130 | \n",
" mpfi | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111 | \n",
" 5000 | \n",
" 21 | \n",
" 27 | \n",
" 16500 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" ? | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" hatchback | \n",
" rwd | \n",
" front | \n",
" 94.5 | \n",
" 171.2 | \n",
" 65.5 | \n",
" 52.4 | \n",
" 2823 | \n",
" ohcv | \n",
" six | \n",
" 152 | \n",
" mpfi | \n",
" 2.68 | \n",
" 3.47 | \n",
" 9.0 | \n",
" 154 | \n",
" 5000 | \n",
" 19 | \n",
" 26 | \n",
" 16500 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" 164 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" 99.8 | \n",
" 176.6 | \n",
" 66.2 | \n",
" 54.3 | \n",
" 2337 | \n",
" ohc | \n",
" four | \n",
" 109 | \n",
" mpfi | \n",
" 3.19 | \n",
" 3.40 | \n",
" 10.0 | \n",
" 102 | \n",
" 5500 | \n",
" 24 | \n",
" 30 | \n",
" 13950 | \n",
"
\n",
" \n",
" 4 | \n",
" 2 | \n",
" 164 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" 4wd | \n",
" front | \n",
" 99.4 | \n",
" 176.6 | \n",
" 66.4 | \n",
" 54.3 | \n",
" 2824 | \n",
" ohc | \n",
" five | \n",
" 136 | \n",
" mpfi | \n",
" 3.19 | \n",
" 3.40 | \n",
" 8.0 | \n",
" 115 | \n",
" 5500 | \n",
" 18 | \n",
" 22 | \n",
" 17450 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" symboling normalized-losses make fuel-type aspiration num-of-doors \\\n",
"0 3 ? alfa-romero gas std two \n",
"1 3 ? alfa-romero gas std two \n",
"2 1 ? alfa-romero gas std two \n",
"3 2 164 audi gas std four \n",
"4 2 164 audi gas std four \n",
"\n",
" body-style drive-wheels engine-location wheel-base length width \\\n",
"0 convertible rwd front 88.6 168.8 64.1 \n",
"1 convertible rwd front 88.6 168.8 64.1 \n",
"2 hatchback rwd front 94.5 171.2 65.5 \n",
"3 sedan fwd front 99.8 176.6 66.2 \n",
"4 sedan 4wd front 99.4 176.6 66.4 \n",
"\n",
" height curb-weight engine-type num-of-cylinders engine-size fuel-system \\\n",
"0 48.8 2548 dohc four 130 mpfi \n",
"1 48.8 2548 dohc four 130 mpfi \n",
"2 52.4 2823 ohcv six 152 mpfi \n",
"3 54.3 2337 ohc four 109 mpfi \n",
"4 54.3 2824 ohc five 136 mpfi \n",
"\n",
" bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg \\\n",
"0 3.47 2.68 9.0 111 5000 21 27 \n",
"1 3.47 2.68 9.0 111 5000 21 27 \n",
"2 2.68 3.47 9.0 154 5000 19 26 \n",
"3 3.19 3.40 10.0 102 5500 24 30 \n",
"4 3.19 3.40 8.0 115 5500 18 22 \n",
"\n",
" price \n",
"0 13495 \n",
"1 16500 \n",
"2 16500 \n",
"3 13950 \n",
"4 17450 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column labels handed to read_csv through `names=`.\n",
"cols = [\n",
"    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',\n",
"    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',\n",
"    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',\n",
"    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',\n",
"    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',\n",
"    'price',\n",
"]\n",
"cars = pd.read_csv('imports-85.data', names=cols)\n",
"\n",
"cars.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Keep only the columns that hold continuous values, as listed in\n",
"# https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names\n",
"continuous_values_cols = [\n",
"    'normalized-losses', 'wheel-base', 'length', 'width', 'height',\n",
"    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',\n",
"    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',\n",
"]\n",
"numeric_cars = cars[continuous_values_cols]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" normalized-losses | \n",
" wheel-base | \n",
" length | \n",
" width | \n",
" height | \n",
" curb-weight | \n",
" engine-size | \n",
" bore | \n",
" stroke | \n",
" compression-rate | \n",
" horsepower | \n",
" peak-rpm | \n",
" city-mpg | \n",
" highway-mpg | \n",
" price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ? | \n",
" 88.6 | \n",
" 168.8 | \n",
" 64.1 | \n",
" 48.8 | \n",
" 2548 | \n",
" 130 | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111 | \n",
" 5000 | \n",
" 21 | \n",
" 27 | \n",
" 13495 | \n",
"
\n",
" \n",
" 1 | \n",
" ? | \n",
" 88.6 | \n",
" 168.8 | \n",
" 64.1 | \n",
" 48.8 | \n",
" 2548 | \n",
" 130 | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111 | \n",
" 5000 | \n",
" 21 | \n",
" 27 | \n",
" 16500 | \n",
"
\n",
" \n",
" 2 | \n",
" ? | \n",
" 94.5 | \n",
" 171.2 | \n",
" 65.5 | \n",
" 52.4 | \n",
" 2823 | \n",
" 152 | \n",
" 2.68 | \n",
" 3.47 | \n",
" 9.0 | \n",
" 154 | \n",
" 5000 | \n",
" 19 | \n",
" 26 | \n",
" 16500 | \n",
"
\n",
" \n",
" 3 | \n",
" 164 | \n",
" 99.8 | \n",
" 176.6 | \n",
" 66.2 | \n",
" 54.3 | \n",
" 2337 | \n",
" 109 | \n",
" 3.19 | \n",
" 3.40 | \n",
" 10.0 | \n",
" 102 | \n",
" 5500 | \n",
" 24 | \n",
" 30 | \n",
" 13950 | \n",
"
\n",
" \n",
" 4 | \n",
" 164 | \n",
" 99.4 | \n",
" 176.6 | \n",
" 66.4 | \n",
" 54.3 | \n",
" 2824 | \n",
" 136 | \n",
" 3.19 | \n",
" 3.40 | \n",
" 8.0 | \n",
" 115 | \n",
" 5500 | \n",
" 18 | \n",
" 22 | \n",
" 17450 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" normalized-losses wheel-base length width height curb-weight \\\n",
"0 ? 88.6 168.8 64.1 48.8 2548 \n",
"1 ? 88.6 168.8 64.1 48.8 2548 \n",
"2 ? 94.5 171.2 65.5 52.4 2823 \n",
"3 164 99.8 176.6 66.2 54.3 2337 \n",
"4 164 99.4 176.6 66.4 54.3 2824 \n",
"\n",
" engine-size bore stroke compression-rate horsepower peak-rpm city-mpg \\\n",
"0 130 3.47 2.68 9.0 111 5000 21 \n",
"1 130 3.47 2.68 9.0 111 5000 21 \n",
"2 152 2.68 3.47 9.0 154 5000 19 \n",
"3 109 3.19 3.40 10.0 102 5500 24 \n",
"4 136 3.19 3.40 8.0 115 5500 18 \n",
"\n",
" highway-mpg price \n",
"0 27 13495 \n",
"1 27 16500 \n",
"2 26 16500 \n",
"3 30 13950 \n",
"4 22 17450 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"numeric_cars.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" normalized-losses | \n",
" wheel-base | \n",
" length | \n",
" width | \n",
" height | \n",
" curb-weight | \n",
" engine-size | \n",
" bore | \n",
" stroke | \n",
" compression-rate | \n",
" horsepower | \n",
" peak-rpm | \n",
" city-mpg | \n",
" highway-mpg | \n",
" price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" NaN | \n",
" 88.6 | \n",
" 168.8 | \n",
" 64.1 | \n",
" 48.8 | \n",
" 2548 | \n",
" 130 | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111 | \n",
" 5000 | \n",
" 21 | \n",
" 27 | \n",
" 13495 | \n",
"
\n",
" \n",
" 1 | \n",
" NaN | \n",
" 88.6 | \n",
" 168.8 | \n",
" 64.1 | \n",
" 48.8 | \n",
" 2548 | \n",
" 130 | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111 | \n",
" 5000 | \n",
" 21 | \n",
" 27 | \n",
" 16500 | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" 94.5 | \n",
" 171.2 | \n",
" 65.5 | \n",
" 52.4 | \n",
" 2823 | \n",
" 152 | \n",
" 2.68 | \n",
" 3.47 | \n",
" 9.0 | \n",
" 154 | \n",
" 5000 | \n",
" 19 | \n",
" 26 | \n",
" 16500 | \n",
"
\n",
" \n",
" 3 | \n",
" 164 | \n",
" 99.8 | \n",
" 176.6 | \n",
" 66.2 | \n",
" 54.3 | \n",
" 2337 | \n",
" 109 | \n",
" 3.19 | \n",
" 3.40 | \n",
" 10.0 | \n",
" 102 | \n",
" 5500 | \n",
" 24 | \n",
" 30 | \n",
" 13950 | \n",
"
\n",
" \n",
" 4 | \n",
" 164 | \n",
" 99.4 | \n",
" 176.6 | \n",
" 66.4 | \n",
" 54.3 | \n",
" 2824 | \n",
" 136 | \n",
" 3.19 | \n",
" 3.40 | \n",
" 8.0 | \n",
" 115 | \n",
" 5500 | \n",
" 18 | \n",
" 22 | \n",
" 17450 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" normalized-losses wheel-base length width height curb-weight \\\n",
"0 NaN 88.6 168.8 64.1 48.8 2548 \n",
"1 NaN 88.6 168.8 64.1 48.8 2548 \n",
"2 NaN 94.5 171.2 65.5 52.4 2823 \n",
"3 164 99.8 176.6 66.2 54.3 2337 \n",
"4 164 99.4 176.6 66.4 54.3 2824 \n",
"\n",
" engine-size bore stroke compression-rate horsepower peak-rpm city-mpg \\\n",
"0 130 3.47 2.68 9.0 111 5000 21 \n",
"1 130 3.47 2.68 9.0 111 5000 21 \n",
"2 152 2.68 3.47 9.0 154 5000 19 \n",
"3 109 3.19 3.40 10.0 102 5500 24 \n",
"4 136 3.19 3.40 8.0 115 5500 18 \n",
"\n",
" highway-mpg price \n",
"0 27 13495 \n",
"1 27 16500 \n",
"2 26 16500 \n",
"3 30 13950 \n",
"4 22 17450 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Treat the '?' placeholder as a missing value by swapping it for NaN.\n",
"numeric_cars = numeric_cars.replace(to_replace='?', value=np.nan)\n",
"numeric_cars.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"normalized-losses 41\n",
"wheel-base 0\n",
"length 0\n",
"width 0\n",
"height 0\n",
"curb-weight 0\n",
"engine-size 0\n",
"bore 4\n",
"stroke 4\n",
"compression-rate 0\n",
"horsepower 2\n",
"peak-rpm 2\n",
"city-mpg 0\n",
"highway-mpg 0\n",
"price 4\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cast every column to float, then count the missing entries per column.\n",
"numeric_cars = numeric_cars.astype(float)\n",
"numeric_cars.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"normalized-losses 37\n",
"wheel-base 0\n",
"length 0\n",
"width 0\n",
"height 0\n",
"curb-weight 0\n",
"engine-size 0\n",
"bore 4\n",
"stroke 4\n",
"compression-rate 0\n",
"horsepower 2\n",
"peak-rpm 2\n",
"city-mpg 0\n",
"highway-mpg 0\n",
"price 0\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `price` is the column we want to predict, so rows without it are dropped rather than imputed.\n",
"has_price = numeric_cars['price'].notna()\n",
"numeric_cars = numeric_cars[has_price]\n",
"numeric_cars.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Fill the remaining gaps in each column with that column's mean.\n",
"column_means = numeric_cars.mean()\n",
"numeric_cars = numeric_cars.fillna(column_means)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"normalized-losses 0\n",
"wheel-base 0\n",
"length 0\n",
"width 0\n",
"height 0\n",
"curb-weight 0\n",
"engine-size 0\n",
"bore 0\n",
"stroke 0\n",
"compression-rate 0\n",
"horsepower 0\n",
"peak-rpm 0\n",
"city-mpg 0\n",
"highway-mpg 0\n",
"price 0\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm that there are no more missing values!\n",
"numeric_cars.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Min-max scale every column to the 0-1 range, then put the original target\n",
"# values back so `price` is left unscaled.\n",
"price_col = numeric_cars['price']\n",
"col_min = numeric_cars.min()\n",
"col_span = numeric_cars.max() - col_min\n",
"numeric_cars = (numeric_cars - col_min) / col_span\n",
"numeric_cars['price'] = price_col"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Univariate Model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"engine-size 3238.462830\n",
"horsepower 4037.037713\n",
"curb-weight 4401.118255\n",
"highway-mpg 4630.026799\n",
"width 4704.482590\n",
"city-mpg 4766.422505\n",
"length 5427.200961\n",
"wheel-base 5461.553998\n",
"compression-rate 6610.812153\n",
"bore 6780.627785\n",
"normalized-losses 7330.197653\n",
"peak-rpm 7697.459696\n",
"stroke 8006.529545\n",
"height 8144.441043\n",
"dtype: float64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"def knn_train_test(train_col, target_col, df, k=5):\n",
"    \"\"\"Fit a one-feature KNN regressor on half of `df` and return its test RMSE.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    train_col : str\n",
"        Name of the single feature column used for fitting and predicting.\n",
"    target_col : str\n",
"        Name of the column to predict.\n",
"    df : pandas.DataFrame\n",
"        Frame holding both columns.\n",
"    k : int, default 5\n",
"        Number of neighbors. 5 is scikit-learn's default for\n",
"        `KNeighborsRegressor`, so omitting it matches the earlier behavior.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Root mean squared error on the held-out half of the rows.\n",
"    \"\"\"\n",
"    # Re-seed NumPy's global RNG so every call shuffles the rows the same way.\n",
"    np.random.seed(1)\n",
"    shuffled_index = np.random.permutation(df.index)\n",
"    rand_df = df.reindex(shuffled_index)\n",
"\n",
"    # The first half (rounded down) trains the model; the rest is the test set.\n",
"    last_train_row = len(rand_df) // 2\n",
"    train_df = rand_df.iloc[:last_train_row]\n",
"    test_df = rand_df.iloc[last_train_row:]\n",
"\n",
"    # Double brackets keep the feature as a 2-D frame, as scikit-learn expects.\n",
"    knn = KNeighborsRegressor(n_neighbors=k)\n",
"    knn.fit(train_df[[train_col]], train_df[target_col])\n",
"    predicted_labels = knn.predict(test_df[[train_col]])\n",
"\n",
"    mse = mean_squared_error(test_df[target_col], predicted_labels)\n",
"    return np.sqrt(mse)\n",
"\n",
"# Score each feature column on its own (every column except `price`)\n",
"# and collect the RMSE values keyed by column name.\n",
"train_cols = numeric_cars.columns.drop('price')\n",
"rmse_results = {}\n",
"for col in train_cols:\n",
"    rmse_val = knn_train_test(col, 'price', numeric_cars)\n",
"    rmse_results[col] = rmse_val\n",
"\n",
"# A Series makes the results easy to sort and inspect.\n",
"rmse_results_series = pd.Series(rmse_results)\n",
"rmse_results_series.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'normalized-losses': {1: 7846.750605148984,\n",
" 3: 7500.5698123109905,\n",
" 5: 7330.197653434445,\n",
" 7: 7756.421586234123,\n",
" 9: 7688.096096891432},\n",
" 'wheel-base': {1: 4493.734068810494,\n",
" 3: 5120.161506064513,\n",
" 5: 5461.553997873057,\n",
" 7: 5448.1070513823315,\n",
" 9: 5738.405685192312},\n",
" 'length': {1: 4628.45550121557,\n",
" 3: 5129.8358210721635,\n",
" 5: 5427.2009608367125,\n",
" 7: 5313.427720847974,\n",
" 9: 5383.054514833446},\n",
" 'width': {1: 4559.257297950061,\n",
" 3: 4606.413692169901,\n",
" 5: 4704.482589704386,\n",
" 7: 4571.485046194653,\n",
" 9: 4652.914172067787},\n",
" 'height': {1: 8904.04645636071,\n",
" 3: 8277.609643045525,\n",
" 5: 8144.441042663747,\n",
" 7: 7679.598124393773,\n",
" 9: 7811.03606291223},\n",
" 'curb-weight': {1: 5264.290230758878,\n",
" 3: 5022.318011757233,\n",
" 5: 4401.118254793124,\n",
" 7: 4330.608104418053,\n",
" 9: 4632.044474454401},\n",
" 'engine-size': {1: 3258.4861059962027,\n",
" 3: 2840.562805643501,\n",
" 5: 3238.4628296477176,\n",
" 7: 3563.086774256415,\n",
" 9: 3831.8244149840766},\n",
" 'bore': {1: 8602.58848450066,\n",
" 3: 6984.239489480916,\n",
" 5: 6780.627784685976,\n",
" 7: 6878.097965921532,\n",
" 9: 6866.808502038413},\n",
" 'stroke': {1: 9116.495955406906,\n",
" 3: 7338.68466990294,\n",
" 5: 8006.529544647101,\n",
" 7: 7803.937796804327,\n",
" 9: 7735.554366079291},\n",
" 'compression-rate': {1: 8087.205346523092,\n",
" 3: 7375.063685578359,\n",
" 5: 6610.812153159129,\n",
" 7: 6732.801282941515,\n",
" 9: 7024.485525463435},\n",
" 'horsepower': {1: 4170.054848037801,\n",
" 3: 4020.8492630885394,\n",
" 5: 4037.0377131537603,\n",
" 7: 4353.811860277134,\n",
" 9: 4515.135617419103},\n",
" 'peak-rpm': {1: 9511.480067750124,\n",
" 3: 8537.550899973421,\n",
" 5: 7697.4596964334805,\n",
" 7: 7510.294160083481,\n",
" 9: 7340.041341263401},\n",
" 'city-mpg': {1: 5901.143574354764,\n",
" 3: 4646.746408727155,\n",
" 5: 4766.422505090134,\n",
" 7: 5232.523034167316,\n",
" 9: 5465.209492527533},\n",
" 'highway-mpg': {1: 6025.594966720739,\n",
" 3: 4617.305019788554,\n",
" 5: 4630.026798588056,\n",
" 7: 4796.061440186946,\n",
" 9: 5278.358056953987}}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def knn_train_test(train_col, target_col, df, k_values=(1, 3, 5, 7, 9)):\n",
"    \"\"\"Evaluate a one-feature KNN regressor at several k values.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    train_col : str\n",
"        Name of the single feature column used for fitting and predicting.\n",
"    target_col : str\n",
"        Name of the column to predict.\n",
"    df : pandas.DataFrame\n",
"        Frame holding both columns.\n",
"    k_values : iterable of int, default (1, 3, 5, 7, 9)\n",
"        Neighbor counts to try; the default matches the previously\n",
"        hard-coded list.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        Maps each k to the RMSE measured on the held-out half of the rows.\n",
"    \"\"\"\n",
"    # Re-seed NumPy's global RNG so every call shuffles the rows the same way.\n",
"    np.random.seed(1)\n",
"    shuffled_index = np.random.permutation(df.index)\n",
"    rand_df = df.reindex(shuffled_index)\n",
"\n",
"    # The first half (rounded down) trains the model; the rest is the test set.\n",
"    last_train_row = len(rand_df) // 2\n",
"    train_df = rand_df.iloc[:last_train_row]\n",
"    test_df = rand_df.iloc[last_train_row:]\n",
"\n",
"    # The split does not depend on k, so select the columns once.\n",
"    train_features, train_target = train_df[[train_col]], train_df[target_col]\n",
"    test_features, test_target = test_df[[train_col]], test_df[target_col]\n",
"\n",
"    k_rmses = {}\n",
"    for k in k_values:\n",
"        # Fit a fresh model for this neighbor count and score it on the test half.\n",
"        knn = KNeighborsRegressor(n_neighbors=k)\n",
"        knn.fit(train_features, train_target)\n",
"        predicted_labels = knn.predict(test_features)\n",
"\n",
"        mse = mean_squared_error(test_target, predicted_labels)\n",
"        k_rmses[k] = np.sqrt(mse)\n",
"    return k_rmses\n",
"\n",
"# For each feature column (every column except `price`), evaluate the model\n",
"# at every k and store the returned {k: RMSE} dict in `k_rmse_results`.\n",
"train_cols = numeric_cars.columns.drop('price')\n",
"k_rmse_results = {}\n",
"for col in train_cols:\n",
"    # `rmse_val` holds a dict of RMSE values here, one entry per k.\n",
"    rmse_val = knn_train_test(col, 'price', numeric_cars)\n",
"    k_rmse_results[col] = rmse_val\n",
"\n",
"k_rmse_results"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"