Queer European MD passionate about IT
소스 검색

Merge branch 'master' of github.com:dataquestio/solutions

yunoac 4 년 전
부모
커밋
22a54470d0
5개의 변경된 파일707개의 추가작업 그리고 1762개의 파일을 삭제
  1. 100 1552
      Mission155Solutions.ipynb
  2. 143 183
      Mission191Solutions.ipynb
  3. 42 27
      Mission257Solutions.ipynb
  4. 292 0
      Mission280Solutions.ipynb
  5. 130 0
      Mission487Solutions.Rmd

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 100 - 1552
Mission155Solutions.ipynb


파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 143 - 183
Mission191Solutions.ipynb


+ 42 - 27
Mission257Solutions.ipynb

@@ -11,20 +11,12 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Connected: None@factbook.db'"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
     }
-   ],
+   },
+   "outputs": [],
    "source": [
     "%%capture\n",
     "%load_ext sql\n",
@@ -42,13 +34,17 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      " * sqlite:///factbook.db\n",
       "Done.\n"
      ]
     },
@@ -165,13 +161,17 @@
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      " * sqlite:///factbook.db\n",
       "Done.\n"
      ]
     },
@@ -223,13 +223,17 @@
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      " * sqlite:///factbook.db\n",
       "Done.\n"
      ]
     },
@@ -286,13 +290,17 @@
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      " * sqlite:///factbook.db\n",
       "Done.\n"
      ]
     },
@@ -356,13 +364,17 @@
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      " * sqlite:///factbook.db\n",
       "Done.\n"
      ]
     },
@@ -375,13 +387,13 @@
        "        <th>avg_area</th>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "        <td>62094928.32231405</td>\n",
+       "        <td>32242666.56846473</td>\n",
        "        <td>555093.546184739</td>\n",
        "    </tr>\n",
        "</table>"
       ],
       "text/plain": [
-       "[(62094928.32231405, 555093.546184739)]"
+       "[(32242666.56846473, 555093.546184739)]"
       ]
      },
      "execution_count": 6,
@@ -391,10 +403,9 @@
    ],
    "source": [
     "%%sql\n",
-    "SELECT\n",
-    "    AVG(population) avg_population,\n",
-    "    AVG(area) avg_area\n",
-    "FROM facts;"
+    "SELECT AVG(population) avg_population, AVG(area) avg_area\n",
+    "  FROM facts\n",
+    " WHERE name <> 'World';"
    ]
   },
   {
@@ -408,13 +419,17 @@
    "cell_type": "code",
    "execution_count": 7,
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      " * sqlite:///factbook.db\n",
       "Done.\n"
      ]
     },
@@ -573,9 +588,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.4.3"
+   "version": "3.7.6"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 292 - 0
Mission280Solutions.ipynb


+ 130 - 0
Mission487Solutions.Rmd

@@ -0,0 +1,130 @@
+---
+title: 'Predicting Car Prices: Guided Project Solutions'
+output: html_document
+---
+
+# Introduction to the data
+
+```{r, message = FALSE, warning = FALSE }
+library(readr)
+library(tidyr)
+library(dplyr)
+# The .data file has no header row. Reading it with header = FALSE keeps the
+# first car as data (the default header = TRUE would consume it as column
+# names), and the real column names are supplied directly via col.names.
+# stringsAsFactors = FALSE keeps columns containing "?" as character, so that
+# later as.numeric() conversions parse the text instead of factor level codes.
+cars <- read.csv(
+  "./data/imports-85.data",
+  header = FALSE,
+  stringsAsFactors = FALSE,
+  col.names = c(
+    "symboling",
+    "normalized_losses",
+    "make",
+    "fuel_type",
+    "aspiration",
+    "num_doors",
+    "body_style",
+    "drive_wheels",
+    "engine_location",
+    "wheel_base",
+    "length",
+    "width",
+    "height",
+    "curb_weight",
+    "engine_type",
+    "num_cylinders",
+    "engine_size",
+    "fuel_system",
+    "bore",
+    "stroke",
+    "compression_ratio",
+    "horsepower",
+    "peak_rpm",
+    "city_mpg",
+    "highway_mpg",
+    "price"
+  )
+)
+
+# Keeping only the numerical columns and removing rows with missing data.
+# Missing values are encoded as "?" in the raw file, so the affected columns
+# are read as text (or as factors, depending on how the file was loaded).
+cars <- cars %>% 
+  select(
+    symboling, wheel_base, length, width, height, curb_weight,
+    engine_size, bore, stroke, compression_ratio, horsepower, 
+    peak_rpm, city_mpg, highway_mpg, price
+  ) %>% 
+  filter(
+    stroke != "?",
+    bore != "?",
+    horsepower != "?",
+    peak_rpm != "?",
+    price != "?"
+  ) %>% 
+  mutate(
+    # Go through as.character() first: as.numeric() applied directly to a
+    # factor returns its internal level codes rather than the printed values.
+    stroke = as.numeric(as.character(stroke)),
+    bore = as.numeric(as.character(bore)),
+    horsepower = as.numeric(as.character(horsepower)),
+    peak_rpm = as.numeric(as.character(peak_rpm)),
+    price = as.numeric(as.character(price))
+  )
+
+# Confirming that every column is numeric. is.numeric() is used instead of
+# typeof() because typeof() also reports "integer" for factor columns, which
+# would make a non-numeric column look fine.
+library(purrr)
+map_lgl(cars, is.numeric)
+```
+
+# Examining Relationships Between Predictors
+
+```{r}
+library(caret)
+# Scatter each predictor against price. The price column is left out of `x`
+# so that the response is not plotted against itself.
+featurePlot(
+  x = cars[, setdiff(names(cars), "price"), drop = FALSE],
+  y = cars$price
+)
+```
+
+There looks to be a somewhat positive relationship between horsepower and price. City MPG and highway MPG look positive too, but there's a curious grouping that looks like it pops up. Many features look like they plateau in terms of price (i.e., even as the feature value increases, price does not increase). Height seems not to have any meaningful relationship with price since the dots look like an evenly scattered plot.
+
+```{r}
+library(ggplot2)
+# Histogram of car prices, with red bar outlines
+ggplot(data = cars, mapping = aes(x = price)) +
+  geom_histogram(colour = "red") +
+  ggtitle("Distribution of prices in cars dataset") +
+  xlab("Price") +
+  ylab("Frequency")
+```
+
+It looks like there's a reasonably even distribution of the prices in the dataset, so there are no outliers. There are 2 cars whose price is zero, so this might be suspect. This only represents 1% of the entire dataset, so it shouldn't have too much impact on predictions, especially if we use a high number of neighbors.
+
+# Setting up the train-test split
+
+```{r}
+library(caret)
+# createDataPartition() samples at random; fixing the seed makes the 80/20
+# split (and every metric computed from it) reproducible across knits.
+set.seed(1)
+split_indices <- createDataPartition(cars$price, p = 0.8, list = FALSE)
+# Rows selected by the partition form the training set; the rest are held out.
+train_cars <- cars[split_indices, ]
+test_cars <- cars[-split_indices, ]
+```
+
+
+# Cross-validation and hyperparameter optimization
+
+```{r}
+# Candidate values of k (number of neighbors) to try: 1 through 20
+tuning_grid <- expand.grid(k = seq_len(20))
+
+# Resampling scheme used during tuning: 5-fold cross-validation
+five_fold_control <- trainControl(number = 5, method = "cv")
+```
+
+# Choosing a model
+
+```{r}
+# k-nearest-neighbors model of price on every other column. Predictors are
+# centered and scaled, and k is tuned over the grid with the
+# cross-validation control defined earlier.
+full_model <- train(
+  price ~ .,
+  method = "knn",
+  data = train_cars,
+  preProcess = c("center", "scale"),
+  tuneGrid = tuning_grid,
+  trControl = five_fold_control
+)
+```
+
+# Final model evaluation
+
+```{r}
+# Predict prices for the held-out cars with the tuned model
+predictions <- predict(full_model, test_cars)
+
+# Compare the predictions against the observed test-set prices
+postResample(predictions, test_cars$price)
+```
+

이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.