
forgot to add data eng solutions notebooks!

Srini Kadamati 8 years ago
parent commit 6f1c6654e0
1 changed file with 367 additions and 0 deletions

+ 367 - 0
Mission165Solutions.ipynb

@@ -0,0 +1,367 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Introduction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "pd.options.display.max_columns = 99\n",
+    "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n",
+    "first_five"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n",
+    "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "for chunk in chunk_iter:\n",
+    "    print(chunk.memory_usage(deep=True).sum()/(1024*1024))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How many rows in the data set?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "total_rows = 0\n",
+    "for chunk in chunk_iter:\n",
+    "    total_rows += len(chunk)\n",
+    "print(total_rows)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exploring the Data in Chunks\n",
+    "\n",
+    "## How many columns have a numeric type? How many have a string type?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Numeric columns\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "for chunk in chunk_iter:\n",
+    "    print(chunk.dtypes.value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Are string columns consistent across chunks?\n",
+    "obj_cols = []\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "\n",
+    "for chunk in chunk_iter:\n",
+    "    chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n",
+    "    if len(obj_cols) > 0:\n",
+    "        is_same = obj_cols == chunk_obj_cols\n",
+    "        if not is_same:\n",
+    "            print(\"overall obj cols:\", obj_cols, \"\\n\")\n",
+    "            print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\")    \n",
+    "    else:\n",
+    "        obj_cols = chunk_obj_cols"
+   ]
+  },
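+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick sketch (not part of the original solution): since the dtype counts vary between chunks, check how the `id` column specifically is typed in each chunk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Print the dtype of the `id` column for every chunk\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "for chunk in chunk_iter:\n",
+    "    print(chunk['id'].dtype)"
+   ]
+  },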
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "### Observation 1: By default -- 31 numeric columns and 21 string columns.\n",
+    "\n",
+    "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling let's ignore this column.\n",
+    "\n",
+    "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "str_cols_vc = {}\n",
+    "for chunk in chunk_iter:\n",
+    "    str_cols = chunk.select_dtypes(include=['object'])\n",
+    "    for col in str_cols.columns:\n",
+    "        current_col_vc = str_cols[col].value_counts()\n",
+    "        if col in str_cols_vc:\n",
+    "            str_cols_vc[col].append(current_col_vc)\n",
+    "        else:\n",
+    "            str_cols_vc[col] = [current_col_vc]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## Combine the value counts.\n",
+    "combined_vcs = {}\n",
+    "\n",
+    "for col in str_cols_vc:\n",
+    "    combined_vc = pd.concat(str_cols_vc[col])\n",
+    "    final_vc = combined_vc.groupby(combined_vc.index).sum()\n",
+    "    combined_vcs[col] = final_vc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "combined_vcs.keys()"
+   ]
+  },
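+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch (not part of the original solution) for the second question above: for each string column, compare the number of unique values in `combined_vcs` to the number of non-null values and keep the columns that are less than 50% unique."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# String columns whose unique values make up less than 50% of the non-null values\n",
+    "low_unique_cols = []\n",
+    "for col, vc in combined_vcs.items():\n",
+    "    unique_ratio = len(vc) / vc.sum()\n",
+    "    if unique_ratio < 0.5:\n",
+    "        low_unique_cols.append(col)\n",
+    "print(len(low_unique_cols))\n",
+    "low_unique_cols"
+   ]
+  },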
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optimizing String Columns\n",
+    "\n",
+    "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modelling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "obj_cols"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "for col in useful_obj_cols:\n",
+    "    print(col)\n",
+    "    print(combined_vcs[col])\n",
+    "    print(\"-----------\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Convert to category"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "convert_col_dtypes = {\n",
+    "    \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n",
+    "    \"verification_status\": \"category\", \"purpose\": \"category\"\n",
+    "}"
+   ]
+  },
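+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A rough sketch (not part of the original solution): estimate the savings from the category conversion by comparing the object and category memory footprints of these columns in a single 3000-row chunk. The names `sample_chunk` and `cat_cols` are just illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Memory (in MB) of the candidate columns as object vs. category\n",
+    "sample_chunk = pd.read_csv('loans_2007.csv', nrows=3000)\n",
+    "cat_cols = list(convert_col_dtypes.keys())\n",
+    "as_object = sample_chunk[cat_cols].memory_usage(deep=True).sum()/(1024*1024)\n",
+    "as_category = sum(sample_chunk[c].astype('category').memory_usage(deep=True) for c in cat_cols)/(1024*1024)\n",
+    "print(as_object, as_category)"
+   ]
+  },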
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Convert `term` and `revol_util` to numerical by data cleaning.\n",
+    "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "chunk[useful_obj_cols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "\n",
+    "for chunk in chunk_iter:\n",
+    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
+    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
+    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
+    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
+    "    \n",
+    "chunk.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "mv_counts = {}\n",
+    "for chunk in chunk_iter:\n",
+    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
+    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
+    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
+    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
+    "    float_cols = chunk.select_dtypes(include=['float'])\n",
+    "    for col in float_cols.columns:\n",
+    "        missing_values = len(chunk) - chunk[col].count()\n",
+    "        if col in mv_counts:\n",
+    "            mv_counts[col] = mv_counts[col] + missing_values\n",
+    "        else:\n",
+    "            mv_counts[col] = missing_values\n",
+    "mv_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "mv_counts = {}\n",
+    "for chunk in chunk_iter:\n",
+    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
+    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
+    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
+    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
+    "    chunk = chunk.dropna(how='all')\n",
+    "    float_cols = chunk.select_dtypes(include=['float'])\n",
+    "    for col in float_cols.columns:\n",
+    "        missing_values = len(chunk) - chunk[col].count()\n",
+    "        if col in mv_counts:\n",
+    "            mv_counts[col] = mv_counts[col] + missing_values\n",
+    "        else:\n",
+    "            mv_counts[col] = missing_values\n",
+    "mv_counts"
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}