{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>member_id</th>\n",
       "      <th>loan_amnt</th>\n",
       "      <th>funded_amnt</th>\n",
       "      <th>funded_amnt_inv</th>\n",
       "      <th>term</th>\n",
       "      <th>int_rate</th>\n",
       "      <th>installment</th>\n",
       "      <th>grade</th>\n",
       "      <th>sub_grade</th>\n",
       "      <th>emp_title</th>\n",
       "      <th>emp_length</th>\n",
       "      <th>home_ownership</th>\n",
       "      <th>annual_inc</th>\n",
       "      <th>verification_status</th>\n",
       "      <th>issue_d</th>\n",
       "      <th>loan_status</th>\n",
       "      <th>pymnt_plan</th>\n",
       "      <th>purpose</th>\n",
       "      <th>title</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>addr_state</th>\n",
       "      <th>dti</th>\n",
       "      <th>delinq_2yrs</th>\n",
       "      <th>earliest_cr_line</th>\n",
       "      <th>inq_last_6mths</th>\n",
       "      <th>open_acc</th>\n",
       "      <th>pub_rec</th>\n",
       "      <th>revol_bal</th>\n",
       "      <th>revol_util</th>\n",
       "      <th>total_acc</th>\n",
       "      <th>initial_list_status</th>\n",
       "      <th>out_prncp</th>\n",
       "      <th>out_prncp_inv</th>\n",
       "      <th>total_pymnt</th>\n",
       "      <th>total_pymnt_inv</th>\n",
       "      <th>total_rec_prncp</th>\n",
       "      <th>total_rec_int</th>\n",
       "      <th>total_rec_late_fee</th>\n",
       "      <th>recoveries</th>\n",
       "      <th>collection_recovery_fee</th>\n",
       "      <th>last_pymnt_d</th>\n",
       "      <th>last_pymnt_amnt</th>\n",
       "      <th>last_credit_pull_d</th>\n",
       "      <th>collections_12_mths_ex_med</th>\n",
       "      <th>policy_code</th>\n",
       "      <th>application_type</th>\n",
       "      <th>acc_now_delinq</th>\n",
       "      <th>chargeoff_within_12_mths</th>\n",
       "      <th>delinq_amnt</th>\n",
       "      <th>pub_rec_bankruptcies</th>\n",
       "      <th>tax_liens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1077501</td>\n",
       "      <td>1296599.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>4975.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>10.65%</td>\n",
       "      <td>162.87</td>\n",
       "      <td>B</td>\n",
       "      <td>B2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10+ years</td>\n",
       "      <td>RENT</td>\n",
       "      <td>24000.0</td>\n",
       "      <td>Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Fully Paid</td>\n",
       "      <td>n</td>\n",
       "      <td>credit_card</td>\n",
       "      <td>Computer</td>\n",
       "      <td>860xx</td>\n",
       "      <td>AZ</td>\n",
       "      <td>27.65</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Jan-1985</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13648.0</td>\n",
       "      <td>83.7%</td>\n",
       "      <td>9.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>5863.155187</td>\n",
       "      <td>5833.84</td>\n",
       "      <td>5000.00</td>\n",
       "      <td>863.16</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jan-2015</td>\n",
       "      <td>171.62</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1077430</td>\n",
       "      <td>1314167.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>60 months</td>\n",
       "      <td>15.27%</td>\n",
       "      <td>59.83</td>\n",
       "      <td>C</td>\n",
       "      <td>C4</td>\n",
       "      <td>Ryder</td>\n",
       "      <td>&lt; 1 year</td>\n",
       "      <td>RENT</td>\n",
       "      <td>30000.0</td>\n",
       "      <td>Source Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Charged Off</td>\n",
       "      <td>n</td>\n",
       "      <td>car</td>\n",
       "      <td>bike</td>\n",
       "      <td>309xx</td>\n",
       "      <td>GA</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Apr-1999</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1687.0</td>\n",
       "      <td>9.4%</td>\n",
       "      <td>4.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1008.710000</td>\n",
       "      <td>1008.71</td>\n",
       "      <td>456.46</td>\n",
       "      <td>435.17</td>\n",
       "      <td>0.00</td>\n",
       "      <td>117.08</td>\n",
       "      <td>1.11</td>\n",
       "      <td>Apr-2013</td>\n",
       "      <td>119.66</td>\n",
       "      <td>Sep-2013</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1077175</td>\n",
       "      <td>1313524.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>15.96%</td>\n",
       "      <td>84.33</td>\n",
       "      <td>C</td>\n",
       "      <td>C5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10+ years</td>\n",
       "      <td>RENT</td>\n",
       "      <td>12252.0</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Fully Paid</td>\n",
       "      <td>n</td>\n",
       "      <td>small_business</td>\n",
       "      <td>real estate business</td>\n",
       "      <td>606xx</td>\n",
       "      <td>IL</td>\n",
       "      <td>8.72</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Nov-2001</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2956.0</td>\n",
       "      <td>98.5%</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>3005.666844</td>\n",
       "      <td>3005.67</td>\n",
       "      <td>2400.00</td>\n",
       "      <td>605.67</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jun-2014</td>\n",
       "      <td>649.91</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1076863</td>\n",
       "      <td>1277178.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>13.49%</td>\n",
       "      <td>339.31</td>\n",
       "      <td>C</td>\n",
       "      <td>C1</td>\n",
       "      <td>AIR RESOURCES BOARD</td>\n",
       "      <td>10+ years</td>\n",
       "      <td>RENT</td>\n",
       "      <td>49200.0</td>\n",
       "      <td>Source Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Fully Paid</td>\n",
       "      <td>n</td>\n",
       "      <td>other</td>\n",
       "      <td>personel</td>\n",
       "      <td>917xx</td>\n",
       "      <td>CA</td>\n",
       "      <td>20.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Feb-1996</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5598.0</td>\n",
       "      <td>21%</td>\n",
       "      <td>37.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>12231.890000</td>\n",
       "      <td>12231.89</td>\n",
       "      <td>10000.00</td>\n",
       "      <td>2214.92</td>\n",
       "      <td>16.97</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jan-2015</td>\n",
       "      <td>357.48</td>\n",
       "      <td>Apr-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1075358</td>\n",
       "      <td>1311748.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>60 months</td>\n",
       "      <td>12.69%</td>\n",
       "      <td>67.79</td>\n",
       "      <td>B</td>\n",
       "      <td>B5</td>\n",
       "      <td>University Medical Group</td>\n",
       "      <td>1 year</td>\n",
       "      <td>RENT</td>\n",
       "      <td>80000.0</td>\n",
       "      <td>Source Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Current</td>\n",
       "      <td>n</td>\n",
       "      <td>other</td>\n",
       "      <td>Personal</td>\n",
       "      <td>972xx</td>\n",
       "      <td>OR</td>\n",
       "      <td>17.94</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Jan-1996</td>\n",
       "      <td>0.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27783.0</td>\n",
       "      <td>53.9%</td>\n",
       "      <td>38.0</td>\n",
       "      <td>f</td>\n",
       "      <td>461.73</td>\n",
       "      <td>461.73</td>\n",
       "      <td>3581.120000</td>\n",
       "      <td>3581.12</td>\n",
       "      <td>2538.27</td>\n",
       "      <td>1042.85</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>67.79</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \\\n",
       "0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   \n",
       "1  1077430  1314167.0     2500.0       2500.0           2500.0   60 months   \n",
       "2  1077175  1313524.0     2400.0       2400.0           2400.0   36 months   \n",
       "3  1076863  1277178.0    10000.0      10000.0          10000.0   36 months   \n",
       "4  1075358  1311748.0     3000.0       3000.0           3000.0   60 months   \n",
       "\n",
       "  int_rate  installment grade sub_grade                 emp_title emp_length  \\\n",
       "0   10.65%       162.87     B        B2                       NaN  10+ years   \n",
       "1   15.27%        59.83     C        C4                     Ryder   < 1 year   \n",
       "2   15.96%        84.33     C        C5                       NaN  10+ years   \n",
       "3   13.49%       339.31     C        C1       AIR RESOURCES BOARD  10+ years   \n",
       "4   12.69%        67.79     B        B5  University Medical Group     1 year   \n",
       "\n",
       "  home_ownership  annual_inc verification_status   issue_d  loan_status  \\\n",
       "0           RENT     24000.0            Verified  Dec-2011   Fully Paid   \n",
       "1           RENT     30000.0     Source Verified  Dec-2011  Charged Off   \n",
       "2           RENT     12252.0        Not Verified  Dec-2011   Fully Paid   \n",
       "3           RENT     49200.0     Source Verified  Dec-2011   Fully Paid   \n",
       "4           RENT     80000.0     Source Verified  Dec-2011      Current   \n",
       "\n",
       "  pymnt_plan         purpose                 title zip_code addr_state    dti  \\\n",
       "0          n     credit_card              Computer    860xx         AZ  27.65   \n",
       "1          n             car                  bike    309xx         GA   1.00   \n",
       "2          n  small_business  real estate business    606xx         IL   8.72   \n",
       "3          n           other              personel    917xx         CA  20.00   \n",
       "4          n           other              Personal    972xx         OR  17.94   \n",
       "\n",
       "   delinq_2yrs earliest_cr_line  inq_last_6mths  open_acc  pub_rec  revol_bal  \\\n",
       "0          0.0         Jan-1985             1.0       3.0      0.0    13648.0   \n",
       "1          0.0         Apr-1999             5.0       3.0      0.0     1687.0   \n",
       "2          0.0         Nov-2001             2.0       2.0      0.0     2956.0   \n",
       "3          0.0         Feb-1996             1.0      10.0      0.0     5598.0   \n",
       "4          0.0         Jan-1996             0.0      15.0      0.0    27783.0   \n",
       "\n",
       "  revol_util  total_acc initial_list_status  out_prncp  out_prncp_inv  \\\n",
       "0      83.7%        9.0                   f       0.00           0.00   \n",
       "1       9.4%        4.0                   f       0.00           0.00   \n",
       "2      98.5%       10.0                   f       0.00           0.00   \n",
       "3        21%       37.0                   f       0.00           0.00   \n",
       "4      53.9%       38.0                   f     461.73         461.73   \n",
       "\n",
       "    total_pymnt  total_pymnt_inv  total_rec_prncp  total_rec_int  \\\n",
       "0   5863.155187          5833.84          5000.00         863.16   \n",
       "1   1008.710000          1008.71           456.46         435.17   \n",
       "2   3005.666844          3005.67          2400.00         605.67   \n",
       "3  12231.890000         12231.89         10000.00        2214.92   \n",
       "4   3581.120000          3581.12          2538.27        1042.85   \n",
       "\n",
       "   total_rec_late_fee  recoveries  collection_recovery_fee last_pymnt_d  \\\n",
       "0                0.00        0.00                     0.00     Jan-2015   \n",
       "1                0.00      117.08                     1.11     Apr-2013   \n",
       "2                0.00        0.00                     0.00     Jun-2014   \n",
       "3               16.97        0.00                     0.00     Jan-2015   \n",
       "4                0.00        0.00                     0.00     Jun-2016   \n",
       "\n",
       "   last_pymnt_amnt last_credit_pull_d  collections_12_mths_ex_med  \\\n",
       "0           171.62           Jun-2016                         0.0   \n",
       "1           119.66           Sep-2013                         0.0   \n",
       "2           649.91           Jun-2016                         0.0   \n",
       "3           357.48           Apr-2016                         0.0   \n",
       "4            67.79           Jun-2016                         0.0   \n",
       "\n",
       "   policy_code application_type  acc_now_delinq  chargeoff_within_12_mths  \\\n",
       "0          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "1          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "2          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "3          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "4          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "\n",
       "   delinq_amnt  pub_rec_bankruptcies  tax_liens  \n",
       "0          0.0                   0.0        0.0  \n",
       "1          0.0                   0.0        0.0  \n",
       "2          0.0                   0.0        0.0  \n",
       "3          0.0                   0.0        0.0  \n",
       "4          0.0                   0.0        0.0  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "pd.options.display.max_columns = 99\n",
    "\n",
    "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n",
    "first_five"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.5502548217773438"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n",
    "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.649059295654297\n",
      "4.644805908203125\n",
      "4.646563529968262\n",
      "4.647915840148926\n",
      "4.644108772277832\n",
      "4.645991325378418\n",
      "4.644582748413086\n",
      "4.646951675415039\n",
      "4.645077705383301\n",
      "4.64512825012207\n",
      "4.657840728759766\n",
      "4.656707763671875\n",
      "4.663515090942383\n",
      "4.896956443786621\n",
      "0.880854606628418\n"
     ]
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "for chunk in chunk_iter:\n",
    "    print(chunk.memory_usage(deep=True).sum()/(1024*1024))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How many rows in the data set?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42538\n"
     ]
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "total_rows = 0\n",
    "for chunk in chunk_iter:\n",
    "    total_rows += len(chunk)\n",
    "print(total_rows)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploring the Data in Chunks\n",
    "\n",
    "## How many columns have a numeric type? How many have a string type?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]\n",
      "[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]\n"
     ]
    }
   ],
   "source": [
    "# Numeric columns\n",
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "numeric = []\n",
    "string = []\n",
    "for lc in loans_chunks:\n",
    "    nums = lc.select_dtypes(include=[np.number]).shape[1]\n",
    "    numeric.append(nums)\n",
    "    strs = lc.select_dtypes(include=['object']).shape[1]\n",
    "    string.append(strs)\n",
    "\n",
    "print(numeric)\n",
    "print(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n",
      "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n",
      "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n",
      "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Are string columns consistent across chunks?\n",
    "obj_cols = []\n",
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "\n",
    "for chunk in chunk_iter:\n",
    "    chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n",
    "    if len(obj_cols) > 0:\n",
    "        is_same = obj_cols == chunk_obj_cols\n",
    "        if not is_same:\n",
    "            print(\"overall obj cols:\", obj_cols, \"\\n\")\n",
    "            print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\")    \n",
    "    else:\n",
    "        obj_cols = chunk_obj_cols"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Observation 1: By default -- 31 numeric columns and 21 string columns.\n",
    "\n",
    "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling let's ignore this column.\n",
    "\n",
    "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "term 2\n",
      "grade 7\n",
      "sub_grade 35\n",
      "emp_length 11\n",
      "home_ownership 5\n",
      "verification_status 3\n",
      "loan_status 9\n",
      "pymnt_plan 2\n",
      "purpose 14\n",
      "initial_list_status 1\n",
      "application_type 1\n"
     ]
    }
   ],
   "source": [
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "uniques = {}\n",
    "for lc in loans_chunks:\n",
    "    strings_only = lc.select_dtypes(include=['object'])\n",
    "    cols = strings_only.columns\n",
    "    for c in cols:\n",
    "        val_counts = strings_only[c].value_counts()\n",
    "        if c in uniques:\n",
    "            uniques[c].append(val_counts)\n",
    "        else:\n",
    "            uniques[c] = [val_counts]\n",
    "\n",
    "uniques_combined = {}\n",
    "unique_stats = {\n",
    "    'column_name': [],\n",
    "    'total_values': [],\n",
    "    'unique_values': [],\n",
    "}\n",
    "for col in uniques:\n",
    "    u_concat = pd.concat(uniques[col])\n",
    "    u_group = u_concat.groupby(u_concat.index).sum()\n",
    "    uniques_combined[col] = u_group\n",
    "    if u_group.shape[0] < 50:\n",
    "        print(col, u_group.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Which float columns have no missing values and could be candidates for conversion to the integer type?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "member_id                        3\n",
       "total_rec_int                    3\n",
       "total_pymnt_inv                  3\n",
       "total_pymnt                      3\n",
       "revol_bal                        3\n",
       "recoveries                       3\n",
       "policy_code                      3\n",
       "out_prncp_inv                    3\n",
       "out_prncp                        3\n",
       "total_rec_late_fee               3\n",
       "loan_amnt                        3\n",
       "last_pymnt_amnt                  3\n",
       "total_rec_prncp                  3\n",
       "funded_amnt_inv                  3\n",
       "funded_amnt                      3\n",
       "dti                              3\n",
       "collection_recovery_fee          3\n",
       "installment                      3\n",
       "annual_inc                       7\n",
       "inq_last_6mths                  32\n",
       "total_acc                       32\n",
       "delinq_2yrs                     32\n",
       "pub_rec                         32\n",
       "delinq_amnt                     32\n",
       "open_acc                        32\n",
       "acc_now_delinq                  32\n",
       "tax_liens                      108\n",
       "collections_12_mths_ex_med     148\n",
       "chargeoff_within_12_mths       148\n",
       "pub_rec_bankruptcies          1368\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "missing = []\n",
    "for lc in loans_chunks:\n",
    "    floats = lc.select_dtypes(include=['float'])\n",
    "    missing.append(floats.apply(pd.isnull).sum())\n",
    "\n",
    "combined_missing = pd.concat(missing)\n",
    "combined_missing.groupby(combined_missing.index).sum().sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the total memory usage across all of the chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "66.21605968475342"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "mem_usage = []\n",
    "\n",
    "for lc in loans_chunks:\n",
    "    mem_usage.append(lc.memory_usage(deep=True).sum() / 1024 ** 2)\n",
    "\n",
    "sum(mem_usage)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Optimizing String Columns\n",
    "\n",
    "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modelling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['term',\n",
       " 'int_rate',\n",
       " 'grade',\n",
       " 'sub_grade',\n",
       " 'emp_title',\n",
       " 'emp_length',\n",
       " 'home_ownership',\n",
       " 'verification_status',\n",
       " 'issue_d',\n",
       " 'loan_status',\n",
       " 'pymnt_plan',\n",
       " 'purpose',\n",
       " 'title',\n",
       " 'zip_code',\n",
       " 'addr_state',\n",
       " 'earliest_cr_line',\n",
       " 'revol_util',\n",
       " 'initial_list_status',\n",
       " 'last_pymnt_d',\n",
       " 'last_credit_pull_d',\n",
       " 'application_type']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n",
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "str_cols_vc = {}\n",
    "for chunk in chunk_iter:\n",
    "    str_cols = chunk.select_dtypes(include=['object'])\n",
    "    for col in str_cols.columns:\n",
    "        current_col_vc = str_cols[col].value_counts()\n",
    "        if col in str_cols_vc:\n",
    "            str_cols_vc[col].append(current_col_vc)\n",
    "        else:\n",
    "            str_cols_vc[col] = [current_col_vc]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Combine the value counts.\n",
    "combined_vcs = {}\n",
    "\n",
    "for col in str_cols_vc:\n",
    "    combined_vc = pd.concat(str_cols_vc[col])\n",
    "    final_vc = combined_vc.groupby(combined_vc.index).sum()\n",
    "    combined_vcs[col] = final_vc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "term\n",
      " 36 months    31534\n",
      " 60 months    11001\n",
      "Name: term, dtype: int64\n",
      "-----------\n",
      "sub_grade\n",
      "A1    1142\n",
      "A2    1520\n",
      "A3    1823\n",
      "A4    2905\n",
      "A5    2793\n",
      "B1    1882\n",
      "B2    2113\n",
      "B3    2997\n",
      "B4    2590\n",
      "B5    2807\n",
      "C1    2264\n",
      "C2    2157\n",
      "C3    1658\n",
      "C4    1370\n",
      "C5    1291\n",
      "D1    1053\n",
      "D2    1485\n",
      "D3    1322\n",
      "D4    1140\n",
      "D5    1016\n",
      "E1     884\n",
      "E2     791\n",
      "E3     668\n",
      "E4     552\n",
      "E5     499\n",
      "F1     392\n",
      "F2     308\n",
      "F3     236\n",
      "F4     211\n",
      "F5     154\n",
      "G1     141\n",
      "G2     107\n",
      "G3      79\n",
      "G4      99\n",
      "G5      86\n",
      "Name: sub_grade, dtype: int64\n",
      "-----------\n",
      "emp_title\n",
      "  old palm inc                       1\n",
      " Brocade Communications              1\n",
      " CenturyLink                         1\n",
      " Department of Homeland Security     1\n",
      " Down To Earth Distributors, Inc.    1\n",
      "                                    ..\n",
      "zashko inc.                          1\n",
      "zeno office solutions                1\n",
      "zion lutheran school                 1\n",
      "zoll medical corp                    1\n",
      "zozaya officiating                   1\n",
      "Name: emp_title, Length: 30658, dtype: int64\n",
      "-----------\n",
      "home_ownership\n",
      "MORTGAGE    18959\n",
      "NONE            8\n",
      "OTHER         136\n",
      "OWN          3251\n",
      "RENT        20181\n",
      "Name: home_ownership, dtype: int64\n",
      "-----------\n",
      "verification_status\n",
      "Not Verified       18758\n",
      "Source Verified    10306\n",
      "Verified           13471\n",
      "Name: verification_status, dtype: int64\n",
      "-----------\n",
      "issue_d\n",
      "Apr-2008     259\n",
      "Apr-2009     333\n",
      "Apr-2010     912\n",
      "Apr-2011    1563\n",
      "Aug-2007      74\n",
      "Aug-2008     100\n",
      "Aug-2009     446\n",
      "Aug-2010    1175\n",
      "Aug-2011    1934\n",
      "Dec-2007     172\n",
      "Dec-2008     253\n",
      "Dec-2009     658\n",
      "Dec-2010    1335\n",
      "Dec-2011    2267\n",
      "Feb-2008     306\n",
      "Feb-2009     302\n",
      "Feb-2010     682\n",
      "Feb-2011    1298\n",
      "Jan-2008     305\n",
      "Jan-2009     269\n",
      "Jan-2010     662\n",
      "Jan-2011    1380\n",
      "Jul-2007      63\n",
      "Jul-2008     141\n",
      "Jul-2009     411\n",
      "Jul-2010    1204\n",
      "Jul-2011    1875\n",
      "Jun-2007      24\n",
      "Jun-2008     124\n",
      "Jun-2009     406\n",
      "Jun-2010    1105\n",
      "Jun-2011    1835\n",
      "Mar-2008     402\n",
      "Mar-2009     324\n",
      "Mar-2010     828\n",
      "Mar-2011    1448\n",
      "May-2008     115\n",
      "May-2009     359\n",
      "May-2010     989\n",
      "May-2011    1704\n",
      "Nov-2007     112\n",
      "Nov-2008     209\n",
      "Nov-2009     662\n",
      "Nov-2010    1224\n",
      "Nov-2011    2232\n",
      "Oct-2007     105\n",
      "Oct-2008     122\n",
      "Oct-2009     604\n",
      "Oct-2010    1232\n",
      "Oct-2011    2118\n",
      "Sep-2007      53\n",
      "Sep-2008      57\n",
      "Sep-2009     507\n",
      "Sep-2010    1189\n",
      "Sep-2011    2067\n",
      "Name: issue_d, dtype: int64\n",
      "-----------\n",
      "purpose\n",
      "car                    1615\n",
      "credit_card            5477\n",
      "debt_consolidation    19776\n",
      "educational             422\n",
      "home_improvement       3199\n",
      "house                   426\n",
      "major_purchase         2311\n",
      "medical                 753\n",
      "moving                  629\n",
      "other                  4425\n",
      "renewable_energy        106\n",
      "small_business         1992\n",
      "vacation                400\n",
      "wedding                1004\n",
      "Name: purpose, dtype: int64\n",
      "-----------\n",
      "earliest_cr_line\n",
      "Apr-1964      3\n",
      "Apr-1966      1\n",
      "Apr-1967      4\n",
      "Apr-1968      1\n",
      "Apr-1969      1\n",
      "           ... \n",
      "Sep-2004    221\n",
      "Sep-2005    162\n",
      "Sep-2006    150\n",
      "Sep-2007     63\n",
      "Sep-2008      8\n",
      "Name: earliest_cr_line, Length: 530, dtype: int64\n",
      "-----------\n",
      "revol_util\n",
      "0%       1070\n",
      "0.01%       1\n",
      "0.03%       1\n",
      "0.04%       1\n",
      "0.05%       1\n",
      "         ... \n",
      "99.5%      24\n",
      "99.6%      27\n",
      "99.7%      32\n",
      "99.8%      25\n",
      "99.9%      29\n",
      "Name: revol_util, Length: 1119, dtype: int64\n",
      "-----------\n",
      "last_pymnt_d\n",
      "Apr-2008     23\n",
      "Apr-2009     72\n",
      "Apr-2010    145\n",
      "Apr-2011    519\n",
      "Apr-2012    781\n",
      "           ... \n",
      "Sep-2011    491\n",
      "Sep-2012    802\n",
      "Sep-2013    712\n",
      "Sep-2014    694\n",
      "Sep-2015    211\n",
      "Name: last_pymnt_d, Length: 103, dtype: int64\n",
      "-----------\n",
      "last_credit_pull_d\n",
      "Apr-2009     24\n",
      "Apr-2010     77\n",
      "Apr-2011    177\n",
      "Apr-2012    326\n",
      "Apr-2013    445\n",
      "           ... \n",
      "Sep-2011    175\n",
      "Sep-2012    414\n",
      "Sep-2013    408\n",
      "Sep-2014    564\n",
      "Sep-2015    531\n",
      "Name: last_credit_pull_d, Length: 108, dtype: int64\n",
      "-----------\n"
     ]
    }
   ],
   "source": [
    "for col in useful_obj_cols:\n",
    "    print(col)\n",
    "    print(combined_vcs[col])\n",
    "    print(\"-----------\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert to category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "convert_col_dtypes = {\n",
    "    \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n",
    "    \"verification_status\": \"category\", \"purpose\": \"category\"\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert `term` and `revol_util` to numerical by data cleaning.\n",
    "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>term</th>\n",
       "      <th>sub_grade</th>\n",
       "      <th>emp_title</th>\n",
       "      <th>home_ownership</th>\n",
       "      <th>verification_status</th>\n",
       "      <th>issue_d</th>\n",
       "      <th>purpose</th>\n",
       "      <th>earliest_cr_line</th>\n",
       "      <th>revol_util</th>\n",
       "      <th>last_pymnt_d</th>\n",
       "      <th>last_credit_pull_d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>42000</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C2</td>\n",
       "      <td>Best Buy</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Jul-2000</td>\n",
       "      <td>100.7%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42001</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G2</td>\n",
       "      <td>CVS PHARMACY</td>\n",
       "      <td>OWN</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Mar-1989</td>\n",
       "      <td>51.9%</td>\n",
       "      <td>Nov-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42002</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E4</td>\n",
       "      <td>General Motors</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Dec-1998</td>\n",
       "      <td>80.7%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42003</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G4</td>\n",
       "      <td>usa medical center</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Jul-1995</td>\n",
       "      <td>57.2%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42004</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B3</td>\n",
       "      <td>InvestSource Inc</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Sep-2005</td>\n",
       "      <td>74%</td>\n",
       "      <td>Mar-2010</td>\n",
       "      <td>Aug-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42533</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>May-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42534</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NONE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Aug-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42535</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A3</td>\n",
       "      <td>Homemaker</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Feb-2015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42536</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42537</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>538 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             term sub_grade           emp_title home_ownership  \\\n",
       "42000   36 months        C2            Best Buy           RENT   \n",
       "42001   36 months        G2        CVS PHARMACY            OWN   \n",
       "42002   36 months        E4      General Motors           RENT   \n",
       "42003   36 months        G4  usa medical center           RENT   \n",
       "42004   36 months        B3    InvestSource Inc           RENT   \n",
       "...           ...       ...                 ...            ...   \n",
       "42533   36 months        B3                 NaN           RENT   \n",
       "42534   36 months        A5                 NaN           NONE   \n",
       "42535   36 months        A3           Homemaker       MORTGAGE   \n",
       "42536         NaN       NaN                 NaN            NaN   \n",
       "42537         NaN       NaN                 NaN            NaN   \n",
       "\n",
       "      verification_status   issue_d             purpose earliest_cr_line  \\\n",
       "42000        Not Verified  Feb-2008  debt_consolidation         Jul-2000   \n",
       "42001        Not Verified  Feb-2008  debt_consolidation         Mar-1989   \n",
       "42002        Not Verified  Feb-2008  debt_consolidation         Dec-1998   \n",
       "42003        Not Verified  Feb-2008  debt_consolidation         Jul-1995   \n",
       "42004        Not Verified  Feb-2008  debt_consolidation         Sep-2005   \n",
       "...                   ...       ...                 ...              ...   \n",
       "42533        Not Verified  Jun-2007               other              NaN   \n",
       "42534        Not Verified  Jun-2007               other              NaN   \n",
       "42535        Not Verified  Jun-2007               other              NaN   \n",
       "42536                 NaN       NaN                 NaN              NaN   \n",
       "42537                 NaN       NaN                 NaN              NaN   \n",
       "\n",
       "      revol_util last_pymnt_d last_credit_pull_d  \n",
       "42000     100.7%     Feb-2011           Jun-2016  \n",
       "42001      51.9%     Nov-2008           Jun-2016  \n",
       "42002      80.7%     Feb-2011           Jun-2016  \n",
       "42003      57.2%     Feb-2011           Jun-2011  \n",
       "42004        74%     Mar-2010           Aug-2010  \n",
       "...          ...          ...                ...  \n",
       "42533        NaN     Jun-2010           May-2007  \n",
       "42534        NaN     Jun-2010           Aug-2007  \n",
       "42535        NaN     Jun-2010           Feb-2015  \n",
       "42536        NaN          NaN                NaN  \n",
       "42537        NaN          NaN                NaN  \n",
       "\n",
       "[538 rows x 11 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk[useful_obj_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                                    object\n",
       "member_id                            float64\n",
       "loan_amnt                            float64\n",
       "funded_amnt                          float64\n",
       "funded_amnt_inv                      float64\n",
       "term                                 float64\n",
       "int_rate                              object\n",
       "installment                          float64\n",
       "grade                                 object\n",
       "sub_grade                           category\n",
       "emp_title                             object\n",
       "emp_length                            object\n",
       "home_ownership                      category\n",
       "annual_inc                           float64\n",
       "verification_status                 category\n",
       "issue_d                       datetime64[ns]\n",
       "loan_status                           object\n",
       "pymnt_plan                            object\n",
       "purpose                             category\n",
       "title                                 object\n",
       "zip_code                              object\n",
       "addr_state                            object\n",
       "dti                                  float64\n",
       "delinq_2yrs                          float64\n",
       "earliest_cr_line              datetime64[ns]\n",
       "inq_last_6mths                       float64\n",
       "open_acc                             float64\n",
       "pub_rec                              float64\n",
       "revol_bal                            float64\n",
       "revol_util                           float64\n",
       "total_acc                            float64\n",
       "initial_list_status                   object\n",
       "out_prncp                            float64\n",
       "out_prncp_inv                        float64\n",
       "total_pymnt                          float64\n",
       "total_pymnt_inv                      float64\n",
       "total_rec_prncp                      float64\n",
       "total_rec_int                        float64\n",
       "total_rec_late_fee                   float64\n",
       "recoveries                           float64\n",
       "collection_recovery_fee              float64\n",
       "last_pymnt_d                  datetime64[ns]\n",
       "last_pymnt_amnt                      float64\n",
       "last_credit_pull_d            datetime64[ns]\n",
       "collections_12_mths_ex_med           float64\n",
       "policy_code                          float64\n",
       "application_type                      object\n",
       "acc_now_delinq                       float64\n",
       "chargeoff_within_12_mths             float64\n",
       "delinq_amnt                          float64\n",
       "pub_rec_bankruptcies                 float64\n",
       "tax_liens                            float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
    "\n",
    "for chunk in chunk_iter:\n",
    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
    "    \n",
    "chunk.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'member_id': 3,\n",
       " 'loan_amnt': 3,\n",
       " 'funded_amnt': 3,\n",
       " 'funded_amnt_inv': 3,\n",
       " 'installment': 3,\n",
       " 'annual_inc': 7,\n",
       " 'dti': 3,\n",
       " 'delinq_2yrs': 32,\n",
       " 'inq_last_6mths': 32,\n",
       " 'open_acc': 32,\n",
       " 'pub_rec': 32,\n",
       " 'revol_bal': 3,\n",
       " 'revol_util': 93,\n",
       " 'total_acc': 32,\n",
       " 'out_prncp': 3,\n",
       " 'out_prncp_inv': 3,\n",
       " 'total_pymnt': 3,\n",
       " 'total_pymnt_inv': 3,\n",
       " 'total_rec_prncp': 3,\n",
       " 'total_rec_int': 3,\n",
       " 'total_rec_late_fee': 3,\n",
       " 'recoveries': 3,\n",
       " 'collection_recovery_fee': 3,\n",
       " 'last_pymnt_amnt': 3,\n",
       " 'collections_12_mths_ex_med': 148,\n",
       " 'policy_code': 3,\n",
       " 'acc_now_delinq': 32,\n",
       " 'chargeoff_within_12_mths': 148,\n",
       " 'delinq_amnt': 32,\n",
       " 'pub_rec_bankruptcies': 1368,\n",
       " 'tax_liens': 108,\n",
       " 'term': 3}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
    "mv_counts = {}\n",
    "for chunk in chunk_iter:\n",
    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
    "    float_cols = chunk.select_dtypes(include=['float'])\n",
    "    for col in float_cols.columns:\n",
    "        missing_values = len(chunk) - chunk[col].count()\n",
    "        if col in mv_counts:\n",
    "            mv_counts[col] = mv_counts[col] + missing_values\n",
    "        else:\n",
    "            mv_counts[col] = missing_values\n",
    "mv_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'member_id': 3,\n",
       " 'loan_amnt': 3,\n",
       " 'funded_amnt': 3,\n",
       " 'funded_amnt_inv': 3,\n",
       " 'installment': 3,\n",
       " 'annual_inc': 7,\n",
       " 'dti': 3,\n",
       " 'delinq_2yrs': 32,\n",
       " 'inq_last_6mths': 32,\n",
       " 'open_acc': 32,\n",
       " 'pub_rec': 32,\n",
       " 'revol_bal': 3,\n",
       " 'revol_util': 93,\n",
       " 'total_acc': 32,\n",
       " 'out_prncp': 3,\n",
       " 'out_prncp_inv': 3,\n",
       " 'total_pymnt': 3,\n",
       " 'total_pymnt_inv': 3,\n",
       " 'total_rec_prncp': 3,\n",
       " 'total_rec_int': 3,\n",
       " 'total_rec_late_fee': 3,\n",
       " 'recoveries': 3,\n",
       " 'collection_recovery_fee': 3,\n",
       " 'last_pymnt_amnt': 3,\n",
       " 'collections_12_mths_ex_med': 148,\n",
       " 'policy_code': 3,\n",
       " 'acc_now_delinq': 32,\n",
       " 'chargeoff_within_12_mths': 148,\n",
       " 'delinq_amnt': 32,\n",
       " 'pub_rec_bankruptcies': 1368,\n",
       " 'tax_liens': 108,\n",
       " 'term': 3}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
    "mv_counts = {}\n",
    "for chunk in chunk_iter:\n",
    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
    "    chunk = chunk.dropna(how='all')\n",
    "    float_cols = chunk.select_dtypes(include=['float'])\n",
    "    for col in float_cols.columns:\n",
    "        missing_values = len(chunk) - chunk[col].count()\n",
    "        if col in mv_counts:\n",
    "            mv_counts[col] = mv_counts[col] + missing_values\n",
    "        else:\n",
    "            mv_counts[col] = missing_values\n",
    "mv_counts"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}