{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>member_id</th>\n",
       "      <th>loan_amnt</th>\n",
       "      <th>funded_amnt</th>\n",
       "      <th>funded_amnt_inv</th>\n",
       "      <th>term</th>\n",
       "      <th>int_rate</th>\n",
       "      <th>installment</th>\n",
       "      <th>grade</th>\n",
       "      <th>sub_grade</th>\n",
       "      <th>emp_title</th>\n",
       "      <th>emp_length</th>\n",
       "      <th>home_ownership</th>\n",
       "      <th>annual_inc</th>\n",
       "      <th>verification_status</th>\n",
       "      <th>issue_d</th>\n",
       "      <th>loan_status</th>\n",
       "      <th>pymnt_plan</th>\n",
       "      <th>purpose</th>\n",
       "      <th>title</th>\n",
       "      <th>zip_code</th>\n",
       "      <th>addr_state</th>\n",
       "      <th>dti</th>\n",
       "      <th>delinq_2yrs</th>\n",
       "      <th>earliest_cr_line</th>\n",
       "      <th>inq_last_6mths</th>\n",
       "      <th>open_acc</th>\n",
       "      <th>pub_rec</th>\n",
       "      <th>revol_bal</th>\n",
       "      <th>revol_util</th>\n",
       "      <th>total_acc</th>\n",
       "      <th>initial_list_status</th>\n",
       "      <th>out_prncp</th>\n",
       "      <th>out_prncp_inv</th>\n",
       "      <th>total_pymnt</th>\n",
       "      <th>total_pymnt_inv</th>\n",
       "      <th>total_rec_prncp</th>\n",
       "      <th>total_rec_int</th>\n",
       "      <th>total_rec_late_fee</th>\n",
       "      <th>recoveries</th>\n",
       "      <th>collection_recovery_fee</th>\n",
       "      <th>last_pymnt_d</th>\n",
       "      <th>last_pymnt_amnt</th>\n",
       "      <th>last_credit_pull_d</th>\n",
       "      <th>collections_12_mths_ex_med</th>\n",
       "      <th>policy_code</th>\n",
       "      <th>application_type</th>\n",
       "      <th>acc_now_delinq</th>\n",
       "      <th>chargeoff_within_12_mths</th>\n",
       "      <th>delinq_amnt</th>\n",
       "      <th>pub_rec_bankruptcies</th>\n",
       "      <th>tax_liens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1077501</td>\n",
       "      <td>1296599.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>4975.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>10.65%</td>\n",
       "      <td>162.87</td>\n",
       "      <td>B</td>\n",
       "      <td>B2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10+ years</td>\n",
       "      <td>RENT</td>\n",
       "      <td>24000.0</td>\n",
       "      <td>Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Fully Paid</td>\n",
       "      <td>n</td>\n",
       "      <td>credit_card</td>\n",
       "      <td>Computer</td>\n",
       "      <td>860xx</td>\n",
       "      <td>AZ</td>\n",
       "      <td>27.65</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Jan-1985</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13648.0</td>\n",
       "      <td>83.7%</td>\n",
       "      <td>9.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>5863.155187</td>\n",
       "      <td>5833.84</td>\n",
       "      <td>5000.00</td>\n",
       "      <td>863.16</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jan-2015</td>\n",
       "      <td>171.62</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1077430</td>\n",
       "      <td>1314167.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>60 months</td>\n",
       "      <td>15.27%</td>\n",
       "      <td>59.83</td>\n",
       "      <td>C</td>\n",
       "      <td>C4</td>\n",
       "      <td>Ryder</td>\n",
       "      <td>&lt; 1 year</td>\n",
       "      <td>RENT</td>\n",
       "      <td>30000.0</td>\n",
       "      <td>Source Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Charged Off</td>\n",
       "      <td>n</td>\n",
       "      <td>car</td>\n",
       "      <td>bike</td>\n",
       "      <td>309xx</td>\n",
       "      <td>GA</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Apr-1999</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1687.0</td>\n",
       "      <td>9.4%</td>\n",
       "      <td>4.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1008.710000</td>\n",
       "      <td>1008.71</td>\n",
       "      <td>456.46</td>\n",
       "      <td>435.17</td>\n",
       "      <td>0.00</td>\n",
       "      <td>117.08</td>\n",
       "      <td>1.11</td>\n",
       "      <td>Apr-2013</td>\n",
       "      <td>119.66</td>\n",
       "      <td>Sep-2013</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1077175</td>\n",
       "      <td>1313524.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>2400.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>15.96%</td>\n",
       "      <td>84.33</td>\n",
       "      <td>C</td>\n",
       "      <td>C5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10+ years</td>\n",
       "      <td>RENT</td>\n",
       "      <td>12252.0</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Fully Paid</td>\n",
       "      <td>n</td>\n",
       "      <td>small_business</td>\n",
       "      <td>real estate business</td>\n",
       "      <td>606xx</td>\n",
       "      <td>IL</td>\n",
       "      <td>8.72</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Nov-2001</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2956.0</td>\n",
       "      <td>98.5%</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>3005.666844</td>\n",
       "      <td>3005.67</td>\n",
       "      <td>2400.00</td>\n",
       "      <td>605.67</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jun-2014</td>\n",
       "      <td>649.91</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1076863</td>\n",
       "      <td>1277178.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>36 months</td>\n",
       "      <td>13.49%</td>\n",
       "      <td>339.31</td>\n",
       "      <td>C</td>\n",
       "      <td>C1</td>\n",
       "      <td>AIR RESOURCES BOARD</td>\n",
       "      <td>10+ years</td>\n",
       "      <td>RENT</td>\n",
       "      <td>49200.0</td>\n",
       "      <td>Source Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Fully Paid</td>\n",
       "      <td>n</td>\n",
       "      <td>other</td>\n",
       "      <td>personel</td>\n",
       "      <td>917xx</td>\n",
       "      <td>CA</td>\n",
       "      <td>20.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Feb-1996</td>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5598.0</td>\n",
       "      <td>21%</td>\n",
       "      <td>37.0</td>\n",
       "      <td>f</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>12231.890000</td>\n",
       "      <td>12231.89</td>\n",
       "      <td>10000.00</td>\n",
       "      <td>2214.92</td>\n",
       "      <td>16.97</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jan-2015</td>\n",
       "      <td>357.48</td>\n",
       "      <td>Apr-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1075358</td>\n",
       "      <td>1311748.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>60 months</td>\n",
       "      <td>12.69%</td>\n",
       "      <td>67.79</td>\n",
       "      <td>B</td>\n",
       "      <td>B5</td>\n",
       "      <td>University Medical Group</td>\n",
       "      <td>1 year</td>\n",
       "      <td>RENT</td>\n",
       "      <td>80000.0</td>\n",
       "      <td>Source Verified</td>\n",
       "      <td>Dec-2011</td>\n",
       "      <td>Current</td>\n",
       "      <td>n</td>\n",
       "      <td>other</td>\n",
       "      <td>Personal</td>\n",
       "      <td>972xx</td>\n",
       "      <td>OR</td>\n",
       "      <td>17.94</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Jan-1996</td>\n",
       "      <td>0.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>27783.0</td>\n",
       "      <td>53.9%</td>\n",
       "      <td>38.0</td>\n",
       "      <td>f</td>\n",
       "      <td>461.73</td>\n",
       "      <td>461.73</td>\n",
       "      <td>3581.120000</td>\n",
       "      <td>3581.12</td>\n",
       "      <td>2538.27</td>\n",
       "      <td>1042.85</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>67.79</td>\n",
       "      <td>Jun-2016</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>INDIVIDUAL</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \\\n",
       "0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   \n",
       "1  1077430  1314167.0     2500.0       2500.0           2500.0   60 months   \n",
       "2  1077175  1313524.0     2400.0       2400.0           2400.0   36 months   \n",
       "3  1076863  1277178.0    10000.0      10000.0          10000.0   36 months   \n",
       "4  1075358  1311748.0     3000.0       3000.0           3000.0   60 months   \n",
       "\n",
       "  int_rate  installment grade sub_grade                 emp_title emp_length  \\\n",
       "0   10.65%       162.87     B        B2                       NaN  10+ years   \n",
       "1   15.27%        59.83     C        C4                     Ryder   < 1 year   \n",
       "2   15.96%        84.33     C        C5                       NaN  10+ years   \n",
       "3   13.49%       339.31     C        C1       AIR RESOURCES BOARD  10+ years   \n",
       "4   12.69%        67.79     B        B5  University Medical Group     1 year   \n",
       "\n",
       "  home_ownership  annual_inc verification_status   issue_d  loan_status  \\\n",
       "0           RENT     24000.0            Verified  Dec-2011   Fully Paid   \n",
       "1           RENT     30000.0     Source Verified  Dec-2011  Charged Off   \n",
       "2           RENT     12252.0        Not Verified  Dec-2011   Fully Paid   \n",
       "3           RENT     49200.0     Source Verified  Dec-2011   Fully Paid   \n",
       "4           RENT     80000.0     Source Verified  Dec-2011      Current   \n",
       "\n",
       "  pymnt_plan         purpose                 title zip_code addr_state    dti  \\\n",
       "0          n     credit_card              Computer    860xx         AZ  27.65   \n",
       "1          n             car                  bike    309xx         GA   1.00   \n",
       "2          n  small_business  real estate business    606xx         IL   8.72   \n",
       "3          n           other              personel    917xx         CA  20.00   \n",
       "4          n           other              Personal    972xx         OR  17.94   \n",
       "\n",
       "   delinq_2yrs earliest_cr_line  inq_last_6mths  open_acc  pub_rec  revol_bal  \\\n",
       "0          0.0         Jan-1985             1.0       3.0      0.0    13648.0   \n",
       "1          0.0         Apr-1999             5.0       3.0      0.0     1687.0   \n",
       "2          0.0         Nov-2001             2.0       2.0      0.0     2956.0   \n",
       "3          0.0         Feb-1996             1.0      10.0      0.0     5598.0   \n",
       "4          0.0         Jan-1996             0.0      15.0      0.0    27783.0   \n",
       "\n",
       "  revol_util  total_acc initial_list_status  out_prncp  out_prncp_inv  \\\n",
       "0      83.7%        9.0                   f       0.00           0.00   \n",
       "1       9.4%        4.0                   f       0.00           0.00   \n",
       "2      98.5%       10.0                   f       0.00           0.00   \n",
       "3        21%       37.0                   f       0.00           0.00   \n",
       "4      53.9%       38.0                   f     461.73         461.73   \n",
       "\n",
       "    total_pymnt  total_pymnt_inv  total_rec_prncp  total_rec_int  \\\n",
       "0   5863.155187          5833.84          5000.00         863.16   \n",
       "1   1008.710000          1008.71           456.46         435.17   \n",
       "2   3005.666844          3005.67          2400.00         605.67   \n",
       "3  12231.890000         12231.89         10000.00        2214.92   \n",
       "4   3581.120000          3581.12          2538.27        1042.85   \n",
       "\n",
       "   total_rec_late_fee  recoveries  collection_recovery_fee last_pymnt_d  \\\n",
       "0                0.00        0.00                     0.00     Jan-2015   \n",
       "1                0.00      117.08                     1.11     Apr-2013   \n",
       "2                0.00        0.00                     0.00     Jun-2014   \n",
       "3               16.97        0.00                     0.00     Jan-2015   \n",
       "4                0.00        0.00                     0.00     Jun-2016   \n",
       "\n",
       "   last_pymnt_amnt last_credit_pull_d  collections_12_mths_ex_med  \\\n",
       "0           171.62           Jun-2016                         0.0   \n",
       "1           119.66           Sep-2013                         0.0   \n",
       "2           649.91           Jun-2016                         0.0   \n",
       "3           357.48           Apr-2016                         0.0   \n",
       "4            67.79           Jun-2016                         0.0   \n",
       "\n",
       "   policy_code application_type  acc_now_delinq  chargeoff_within_12_mths  \\\n",
       "0          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "1          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "2          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "3          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "4          1.0       INDIVIDUAL             0.0                       0.0   \n",
       "\n",
       "   delinq_amnt  pub_rec_bankruptcies  tax_liens  \n",
       "0          0.0                   0.0        0.0  \n",
       "1          0.0                   0.0        0.0  \n",
       "2          0.0                   0.0        0.0  \n",
       "3          0.0                   0.0        0.0  \n",
       "4          0.0                   0.0        0.0  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "pd.options.display.max_columns = 99\n",
    "\n",
    "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n",
    "first_five"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.5506629943847656"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n",
    "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.6512298584\n",
      "4.64817810059\n",
      "4.64966869354\n",
      "4.65059375763\n",
      "4.64692020416\n",
      "4.64837551117\n",
      "4.64736747742\n",
      "4.64941596985\n",
      "4.64743518829\n",
      "4.64727210999\n",
      "4.6598777771\n",
      "4.6566619873\n",
      "4.66349601746\n",
      "4.89784526825\n",
      "0.880808830261\n"
     ]
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "for chunk in chunk_iter:\n",
    "    print(chunk.memory_usage(deep=True).sum()/(1024*1024))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How many rows in the data set?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "42538\n"
     ]
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "total_rows = 0\n",
    "for chunk in chunk_iter:\n",
    "    total_rows += len(chunk)\n",
    "print(total_rows)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploring the Data in Chunks\n",
    "\n",
    "## How many columns have a numeric type? How many have a string type?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]\n",
      "[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]\n"
     ]
    }
   ],
   "source": [
    "# Numeric columns\n",
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "numeric = []\n",
    "string = []\n",
    "for lc in loans_chunks:\n",
    "    nums = lc.select_dtypes(include=[np.number]).shape[1]\n",
    "    numeric.append(nums)\n",
    "    strs = lc.select_dtypes(include=['object']).shape[1]\n",
    "    string.append(strs)\n",
    "\n",
    "print(numeric)\n",
    "print(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n",
      "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n",
      "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n",
      "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Are string columns consistent across chunks?\n",
    "obj_cols = []\n",
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "\n",
    "for chunk in chunk_iter:\n",
    "    chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n",
    "    if len(obj_cols) > 0:\n",
    "        is_same = obj_cols == chunk_obj_cols\n",
    "        if not is_same:\n",
    "            print(\"overall obj cols:\", obj_cols, \"\\n\")\n",
    "            print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\")    \n",
    "    else:\n",
    "        obj_cols = chunk_obj_cols"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Observation 1: By default -- 31 numeric columns and 21 string columns.\n",
    "\n",
    "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling let's ignore this column.\n",
    "\n",
    "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "term 2\n",
      "loan_status 9\n",
      "purpose 14\n",
      "grade 7\n",
      "sub_grade 35\n",
      "initial_list_status 1\n",
      "verification_status 3\n",
      "application_type 1\n",
      "emp_length 12\n",
      "home_ownership 5\n",
      "pymnt_plan 2\n"
     ]
    }
   ],
   "source": [
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "uniques = {}\n",
    "for lc in loans_chunks:\n",
    "    strings_only = lc.select_dtypes(include=['object'])\n",
    "    cols = strings_only.columns\n",
    "    for c in cols:\n",
    "        val_counts = strings_only[c].value_counts()\n",
    "        if c in uniques:\n",
    "            uniques[c].append(val_counts)\n",
    "        else:\n",
    "            uniques[c] = [val_counts]\n",
    "\n",
    "uniques_combined = {}\n",
    "unique_stats = {\n",
    "    'column_name': [],\n",
    "    'total_values': [],\n",
    "    'unique_values': [],\n",
    "}\n",
    "for col in uniques:\n",
    "    u_concat = pd.concat(uniques[col])\n",
    "    u_group = u_concat.groupby(u_concat.index).sum()\n",
    "    uniques_combined[col] = u_group\n",
    "    if u_group.shape[0] < 50:\n",
    "        print(col, u_group.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Which float columns have no missing values and could be candidates for conversion to the integer type?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "member_id                        3\n",
       "total_rec_int                    3\n",
       "total_pymnt_inv                  3\n",
       "total_pymnt                      3\n",
       "revol_bal                        3\n",
       "recoveries                       3\n",
       "policy_code                      3\n",
       "out_prncp_inv                    3\n",
       "out_prncp                        3\n",
       "total_rec_late_fee               3\n",
       "loan_amnt                        3\n",
       "last_pymnt_amnt                  3\n",
       "total_rec_prncp                  3\n",
       "funded_amnt_inv                  3\n",
       "funded_amnt                      3\n",
       "dti                              3\n",
       "collection_recovery_fee          3\n",
       "installment                      3\n",
       "annual_inc                       7\n",
       "inq_last_6mths                  32\n",
       "total_acc                       32\n",
       "delinq_2yrs                     32\n",
       "pub_rec                         32\n",
       "delinq_amnt                     32\n",
       "open_acc                        32\n",
       "acc_now_delinq                  32\n",
       "tax_liens                      108\n",
       "collections_12_mths_ex_med     148\n",
       "chargeoff_within_12_mths       148\n",
       "pub_rec_bankruptcies          1368\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "missing = []\n",
    "for lc in loans_chunks:\n",
    "    floats = lc.select_dtypes(include=['float'])\n",
    "    missing.append(floats.apply(pd.isnull).sum())\n",
    "\n",
    "combined_missing = pd.concat(missing)\n",
    "combined_missing.groupby(combined_missing.index).sum().sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate the total memory usage across all of the chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "66.245146751403809"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
    "\n",
    "mem_usage = []\n",
    "\n",
    "for lc in loans_chunks:\n",
    "    mem_usage.append(lc.memory_usage(deep=True).sum() / 1024 ** 2)\n",
    "\n",
    "sum(mem_usage)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Optimizing String Columns\n",
    "\n",
    "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modelling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['term',\n",
       " 'int_rate',\n",
       " 'grade',\n",
       " 'sub_grade',\n",
       " 'emp_title',\n",
       " 'emp_length',\n",
       " 'home_ownership',\n",
       " 'verification_status',\n",
       " 'issue_d',\n",
       " 'loan_status',\n",
       " 'pymnt_plan',\n",
       " 'purpose',\n",
       " 'title',\n",
       " 'zip_code',\n",
       " 'addr_state',\n",
       " 'earliest_cr_line',\n",
       " 'revol_util',\n",
       " 'initial_list_status',\n",
       " 'last_pymnt_d',\n",
       " 'last_credit_pull_d',\n",
       " 'application_type']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n",
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
    "str_cols_vc = {}\n",
    "for chunk in chunk_iter:\n",
    "    str_cols = chunk.select_dtypes(include=['object'])\n",
    "    for col in str_cols.columns:\n",
    "        current_col_vc = str_cols[col].value_counts()\n",
    "        if col in str_cols_vc:\n",
    "            str_cols_vc[col].append(current_col_vc)\n",
    "        else:\n",
    "            str_cols_vc[col] = [current_col_vc]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Combine the value counts.\n",
    "combined_vcs = {}\n",
    "\n",
    "for col in str_cols_vc:\n",
    "    combined_vc = pd.concat(str_cols_vc[col])\n",
    "    final_vc = combined_vc.groupby(combined_vc.index).sum()\n",
    "    combined_vcs[col] = final_vc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "term\n",
      " 36 months    31534\n",
      " 60 months    11001\n",
      "Name: term, dtype: int64\n",
      "-----------\n",
      "sub_grade\n",
      "A1    1142\n",
      "A2    1520\n",
      "A3    1823\n",
      "A4    2905\n",
      "A5    2793\n",
      "B1    1882\n",
      "B2    2113\n",
      "B3    2997\n",
      "B4    2590\n",
      "B5    2807\n",
      "C1    2264\n",
      "C2    2157\n",
      "C3    1658\n",
      "C4    1370\n",
      "C5    1291\n",
      "D1    1053\n",
      "D2    1485\n",
      "D3    1322\n",
      "D4    1140\n",
      "D5    1016\n",
      "E1     884\n",
      "E2     791\n",
      "E3     668\n",
      "E4     552\n",
      "E5     499\n",
      "F1     392\n",
      "F2     308\n",
      "F3     236\n",
      "F4     211\n",
      "F5     154\n",
      "G1     141\n",
      "G2     107\n",
      "G3      79\n",
      "G4      99\n",
      "G5      86\n",
      "Name: sub_grade, dtype: int64\n",
      "-----------\n",
      "emp_title\n",
      "  old palm inc                                               1\n",
      " Brocade Communications                                      1\n",
      " CenturyLink                                                 1\n",
      " Department of Homeland Security                             1\n",
      " Down To Earth Distributors, Inc.                            1\n",
      " Plaid, Inc.                                                 1\n",
      " U.S. Dept. Of Homeland Security                             1\n",
      " pacific crane maintenance company                           1\n",
      "$260M '06 vintage technology venture capital firm            1\n",
      "(Collaborative) Abbott Nutrition Intl                        1\n",
      "(self) Castleforte Group                                     1\n",
      "0CEAN VIEW INTL REALTY/C21 UNITED PLATINUM                   1\n",
      "1 Cochran Hyundai                                            1\n",
      "1 and 1 Internet Inc.                                        1\n",
      "1)-Yavapai Regional Medical Center 2)- Dr. cantors office    1\n",
      "1-800 Contacts                                               1\n",
      "101 restaurant                                               1\n",
      "1013 communications                                          1\n",
      "1125 memorex dr                                              1\n",
      "1199SEIU Benefit and Pension Funds                           1\n",
      "1199SEIU UHE                                                 1\n",
      "123 Recovery                                                 1\n",
      "123 appartment corp.                                         1\n",
      "128 Air Refueling Wing (USAF)                                1\n",
      "1400                                                         1\n",
      "15                                                           1\n",
      "162 fighter wing                                             1\n",
      "16th MP BDE, U.S. Army                                       1\n",
      "1800-got-junk?                                               1\n",
      "182nd Airlift Wing Fire Dept.                                1\n",
      "                                                            ..\n",
      "wyoming valley hospital                                      1\n",
      "wythe county community hospital                              1\n",
      "xcel fire protection                                         1\n",
      "xerox cor.                                                   1\n",
      "xerox corp                                                   1\n",
      "xo communication                                             1\n",
      "xotic deliveries                                             1\n",
      "xpedite technologies inc                                     1\n",
      "xpedx                                                        1\n",
      "yankee candle company                                        1\n",
      "ye olde pepper co                                            1\n",
      "yellow cab com.                                              1\n",
      "yellow cab taxi                                              1\n",
      "ymca                                                         3\n",
      "yonkers fire department                                      1\n",
      "york hospital dental center                                  1\n",
      "yorkville physical therapy                                   1\n",
      "young and associates                                         1\n",
      "young atitudes aveda                                         1\n",
      "youth villages                                               1\n",
      "yrbnyc.com / Yellow Rat Bastard                              1\n",
      "yrc                                                          1\n",
      "yrcw                                                         1\n",
      "zachry industral inc                                         1\n",
      "zakheim and lavrar                                           1\n",
      "zashko inc.                                                  1\n",
      "zeno office solutions                                        1\n",
      "zion lutheran school                                         1\n",
      "zoll medical corp                                            1\n",
      "zozaya officiating                                           1\n",
      "Name: emp_title, dtype: int64\n",
      "-----------\n",
      "home_ownership\n",
      "MORTGAGE    18959\n",
      "NONE            8\n",
      "OTHER         136\n",
      "OWN          3251\n",
      "RENT        20181\n",
      "Name: home_ownership, dtype: int64\n",
      "-----------\n",
      "verification_status\n",
      "Not Verified       18758\n",
      "Source Verified    10306\n",
      "Verified           13471\n",
      "Name: verification_status, dtype: int64\n",
      "-----------\n",
      "issue_d\n",
      "Apr-2008     259\n",
      "Apr-2009     333\n",
      "Apr-2010     912\n",
      "Apr-2011    1563\n",
      "Aug-2007      74\n",
      "Aug-2008     100\n",
      "Aug-2009     446\n",
      "Aug-2010    1175\n",
      "Aug-2011    1934\n",
      "Dec-2007     172\n",
      "Dec-2008     253\n",
      "Dec-2009     658\n",
      "Dec-2010    1335\n",
      "Dec-2011    2267\n",
      "Feb-2008     306\n",
      "Feb-2009     302\n",
      "Feb-2010     682\n",
      "Feb-2011    1298\n",
      "Jan-2008     305\n",
      "Jan-2009     269\n",
      "Jan-2010     662\n",
      "Jan-2011    1380\n",
      "Jul-2007      63\n",
      "Jul-2008     141\n",
      "Jul-2009     411\n",
      "Jul-2010    1204\n",
      "Jul-2011    1875\n",
      "Jun-2007      24\n",
      "Jun-2008     124\n",
      "Jun-2009     406\n",
      "Jun-2010    1105\n",
      "Jun-2011    1835\n",
      "Mar-2008     402\n",
      "Mar-2009     324\n",
      "Mar-2010     828\n",
      "Mar-2011    1448\n",
      "May-2008     115\n",
      "May-2009     359\n",
      "May-2010     989\n",
      "May-2011    1704\n",
      "Nov-2007     112\n",
      "Nov-2008     209\n",
      "Nov-2009     662\n",
      "Nov-2010    1224\n",
      "Nov-2011    2232\n",
      "Oct-2007     105\n",
      "Oct-2008     122\n",
      "Oct-2009     604\n",
      "Oct-2010    1232\n",
      "Oct-2011    2118\n",
      "Sep-2007      53\n",
      "Sep-2008      57\n",
      "Sep-2009     507\n",
      "Sep-2010    1189\n",
      "Sep-2011    2067\n",
      "Name: issue_d, dtype: int64\n",
      "-----------\n",
      "purpose\n",
      "car                    1615\n",
      "credit_card            5477\n",
      "debt_consolidation    19776\n",
      "educational             422\n",
      "home_improvement       3199\n",
      "house                   426\n",
      "major_purchase         2311\n",
      "medical                 753\n",
      "moving                  629\n",
      "other                  4425\n",
      "renewable_energy        106\n",
      "small_business         1992\n",
      "vacation                400\n",
      "wedding                1004\n",
      "Name: purpose, dtype: int64\n",
      "-----------\n",
      "earliest_cr_line\n",
      "Apr-1964      3\n",
      "Apr-1966      1\n",
      "Apr-1967      4\n",
      "Apr-1968      1\n",
      "Apr-1969      1\n",
      "Apr-1970      3\n",
      "Apr-1971      6\n",
      "Apr-1972      2\n",
      "Apr-1973      7\n",
      "Apr-1974      6\n",
      "Apr-1975     11\n",
      "Apr-1976      7\n",
      "Apr-1977     10\n",
      "Apr-1978     14\n",
      "Apr-1979     15\n",
      "Apr-1980     17\n",
      "Apr-1981     15\n",
      "Apr-1982     23\n",
      "Apr-1983     29\n",
      "Apr-1984     27\n",
      "Apr-1985     31\n",
      "Apr-1986     29\n",
      "Apr-1987     48\n",
      "Apr-1988     55\n",
      "Apr-1989     64\n",
      "Apr-1990     79\n",
      "Apr-1991     77\n",
      "Apr-1992     90\n",
      "Apr-1993    100\n",
      "Apr-1994    134\n",
      "           ... \n",
      "Sep-1979     20\n",
      "Sep-1980     18\n",
      "Sep-1981     28\n",
      "Sep-1982     16\n",
      "Sep-1983     33\n",
      "Sep-1984     46\n",
      "Sep-1985     41\n",
      "Sep-1986     59\n",
      "Sep-1987     59\n",
      "Sep-1988     63\n",
      "Sep-1989     83\n",
      "Sep-1990     93\n",
      "Sep-1991     70\n",
      "Sep-1992     72\n",
      "Sep-1993    124\n",
      "Sep-1994    119\n",
      "Sep-1995    181\n",
      "Sep-1996    187\n",
      "Sep-1997    196\n",
      "Sep-1998    267\n",
      "Sep-1999    277\n",
      "Sep-2000    325\n",
      "Sep-2001    282\n",
      "Sep-2002    251\n",
      "Sep-2003    220\n",
      "Sep-2004    221\n",
      "Sep-2005    162\n",
      "Sep-2006    150\n",
      "Sep-2007     63\n",
      "Sep-2008      8\n",
      "Name: earliest_cr_line, dtype: int64\n",
      "-----------\n",
      "revol_util\n",
      "0%       1070\n",
      "0.01%       1\n",
      "0.03%       1\n",
      "0.04%       1\n",
      "0.05%       1\n",
      "0.1%       61\n",
      "0.12%       1\n",
      "0.16%       1\n",
      "0.2%       64\n",
      "0.3%       43\n",
      "0.4%       45\n",
      "0.46%       1\n",
      "0.49%       1\n",
      "0.5%       49\n",
      "0.54%       1\n",
      "0.6%       39\n",
      "0.7%       47\n",
      "0.75%       1\n",
      "0.8%       43\n",
      "0.83%       1\n",
      "0.86%       1\n",
      "0.9%       41\n",
      "1%         51\n",
      "1.1%       39\n",
      "1.2%       37\n",
      "1.3%       36\n",
      "1.4%       32\n",
      "1.5%       33\n",
      "1.6%       24\n",
      "1.7%       30\n",
      "         ... \n",
      "97%        31\n",
      "97.1%      39\n",
      "97.2%      32\n",
      "97.3%      36\n",
      "97.4%      33\n",
      "97.5%      35\n",
      "97.6%      35\n",
      "97.7%      45\n",
      "97.8%      38\n",
      "97.9%      27\n",
      "98%        28\n",
      "98.1%      25\n",
      "98.2%      25\n",
      "98.3%      36\n",
      "98.4%      31\n",
      "98.5%      29\n",
      "98.6%      36\n",
      "98.7%      27\n",
      "98.8%      38\n",
      "98.9%      20\n",
      "99%        35\n",
      "99.1%      33\n",
      "99.2%      22\n",
      "99.3%      32\n",
      "99.4%      24\n",
      "99.5%      24\n",
      "99.6%      27\n",
      "99.7%      32\n",
      "99.8%      25\n",
      "99.9%      29\n",
      "Name: revol_util, dtype: int64\n",
      "-----------\n",
      "last_pymnt_d\n",
      "Apr-2008     23\n",
      "Apr-2009     72\n",
      "Apr-2010    145\n",
      "Apr-2011    519\n",
      "Apr-2012    781\n",
      "Apr-2013    890\n",
      "Apr-2014    682\n",
      "Apr-2015    141\n",
      "Apr-2016    178\n",
      "Aug-2008     31\n",
      "Aug-2009     66\n",
      "Aug-2010    198\n",
      "Aug-2011    479\n",
      "Aug-2012    868\n",
      "Aug-2013    744\n",
      "Aug-2014    836\n",
      "Aug-2015    217\n",
      "Dec-2007      2\n",
      "Dec-2008     31\n",
      "Dec-2009    116\n",
      "Dec-2010    332\n",
      "Dec-2011    585\n",
      "Dec-2012    732\n",
      "Dec-2013    799\n",
      "Dec-2014    949\n",
      "Dec-2015    185\n",
      "Feb-2008      8\n",
      "Feb-2009     65\n",
      "Feb-2010    148\n",
      "Feb-2011    437\n",
      "           ... \n",
      "May-2011    448\n",
      "May-2012    773\n",
      "May-2013    943\n",
      "May-2014    686\n",
      "May-2015    151\n",
      "May-2016    203\n",
      "Nov-2008     31\n",
      "Nov-2009     80\n",
      "Nov-2010    258\n",
      "Nov-2011    487\n",
      "Nov-2012    782\n",
      "Nov-2013    701\n",
      "Nov-2014    593\n",
      "Nov-2015    231\n",
      "Oct-2008     55\n",
      "Oct-2009     85\n",
      "Oct-2010    292\n",
      "Oct-2011    482\n",
      "Oct-2012    853\n",
      "Oct-2013    712\n",
      "Oct-2014    813\n",
      "Oct-2015    200\n",
      "Sep-2008     40\n",
      "Sep-2009     55\n",
      "Sep-2010    215\n",
      "Sep-2011    491\n",
      "Sep-2012    802\n",
      "Sep-2013    712\n",
      "Sep-2014    694\n",
      "Sep-2015    211\n",
      "Name: last_pymnt_d, dtype: int64\n",
      "-----------\n",
      "last_credit_pull_d\n",
      "Apr-2009     24\n",
      "Apr-2010     77\n",
      "Apr-2011    177\n",
      "Apr-2012    326\n",
      "Apr-2013    445\n",
      "Apr-2014    507\n",
      "Apr-2015    404\n",
      "Apr-2016    912\n",
      "Aug-2007     17\n",
      "Aug-2008     15\n",
      "Aug-2009     32\n",
      "Aug-2010     68\n",
      "Aug-2011    290\n",
      "Aug-2012    391\n",
      "Aug-2013    375\n",
      "Aug-2014    528\n",
      "Aug-2015    435\n",
      "Dec-2007      3\n",
      "Dec-2008      7\n",
      "Dec-2009     73\n",
      "Dec-2010    156\n",
      "Dec-2011    268\n",
      "Dec-2012    391\n",
      "Dec-2013    431\n",
      "Dec-2014    567\n",
      "Dec-2015    656\n",
      "Feb-2008      3\n",
      "Feb-2009     20\n",
      "Feb-2010     87\n",
      "Feb-2011    199\n",
      "           ... \n",
      "May-2013    470\n",
      "May-2014    477\n",
      "May-2015    433\n",
      "May-2016    766\n",
      "Nov-2007      3\n",
      "Nov-2009     50\n",
      "Nov-2010    126\n",
      "Nov-2011    221\n",
      "Nov-2012    432\n",
      "Nov-2013    491\n",
      "Nov-2014    500\n",
      "Nov-2015    516\n",
      "Oct-2007      3\n",
      "Oct-2008     13\n",
      "Oct-2009     51\n",
      "Oct-2010    120\n",
      "Oct-2011    162\n",
      "Oct-2012    351\n",
      "Oct-2013    429\n",
      "Oct-2014    529\n",
      "Oct-2015    513\n",
      "Sep-2007      5\n",
      "Sep-2008      9\n",
      "Sep-2009     21\n",
      "Sep-2010    119\n",
      "Sep-2011    175\n",
      "Sep-2012    414\n",
      "Sep-2013    408\n",
      "Sep-2014    564\n",
      "Sep-2015    531\n",
      "Name: last_credit_pull_d, dtype: int64\n",
      "-----------\n"
     ]
    }
   ],
   "source": [
    "for col in useful_obj_cols:\n",
    "    print(col)\n",
    "    print(combined_vcs[col])\n",
    "    print(\"-----------\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert to category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "convert_col_dtypes = {\n",
    "    \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n",
    "    \"verification_status\": \"category\", \"purpose\": \"category\"\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert `term` and `revol_util` to numerical by data cleaning.\n",
    "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>term</th>\n",
       "      <th>sub_grade</th>\n",
       "      <th>emp_title</th>\n",
       "      <th>home_ownership</th>\n",
       "      <th>verification_status</th>\n",
       "      <th>issue_d</th>\n",
       "      <th>purpose</th>\n",
       "      <th>earliest_cr_line</th>\n",
       "      <th>revol_util</th>\n",
       "      <th>last_pymnt_d</th>\n",
       "      <th>last_credit_pull_d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>42000</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C2</td>\n",
       "      <td>Best Buy</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Jul-2000</td>\n",
       "      <td>100.7%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42001</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G2</td>\n",
       "      <td>CVS PHARMACY</td>\n",
       "      <td>OWN</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Mar-1989</td>\n",
       "      <td>51.9%</td>\n",
       "      <td>Nov-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42002</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E4</td>\n",
       "      <td>General Motors</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Dec-1998</td>\n",
       "      <td>80.7%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42003</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G4</td>\n",
       "      <td>usa medical center</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Jul-1995</td>\n",
       "      <td>57.2%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42004</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B3</td>\n",
       "      <td>InvestSource Inc</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Sep-2005</td>\n",
       "      <td>74%</td>\n",
       "      <td>Mar-2010</td>\n",
       "      <td>Aug-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42005</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D1</td>\n",
       "      <td>kaiser</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Nov-2000</td>\n",
       "      <td>37.6%</td>\n",
       "      <td>Mar-2010</td>\n",
       "      <td>Apr-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42006</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C1</td>\n",
       "      <td>Panoramic Software</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Dec-1995</td>\n",
       "      <td>56%</td>\n",
       "      <td>Apr-2010</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42007</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D5</td>\n",
       "      <td>AT&amp;T Inc.</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>small_business</td>\n",
       "      <td>Feb-2000</td>\n",
       "      <td>49.7%</td>\n",
       "      <td>Feb-2009</td>\n",
       "      <td>Oct-2009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42008</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D4</td>\n",
       "      <td>D&amp;Y</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Oct-1992</td>\n",
       "      <td>71.5%</td>\n",
       "      <td>Jan-2010</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42009</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D5</td>\n",
       "      <td>Frederick's of Hollywood</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Oct-1994</td>\n",
       "      <td>49%</td>\n",
       "      <td>Jul-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42010</th>\n",
       "      <td>36 months</td>\n",
       "      <td>F2</td>\n",
       "      <td>Yuma Nursing Home Ctr</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Oct-1993</td>\n",
       "      <td>18.1%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Nov-2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42011</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E5</td>\n",
       "      <td>Akiva Incorporated</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>small_business</td>\n",
       "      <td>Jan-2006</td>\n",
       "      <td>30.8%</td>\n",
       "      <td>Jan-2009</td>\n",
       "      <td>Aug-2009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42012</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G4</td>\n",
       "      <td>Citrix Online</td>\n",
       "      <td>OWN</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Sep-1998</td>\n",
       "      <td>98.1%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Mar-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42013</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G3</td>\n",
       "      <td>Virginia College Online</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>other</td>\n",
       "      <td>Dec-1990</td>\n",
       "      <td>78.4%</td>\n",
       "      <td>Oct-2008</td>\n",
       "      <td>Sep-2008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42014</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E2</td>\n",
       "      <td>Zone 7 Water Agency</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Nov-1998</td>\n",
       "      <td>94.3%</td>\n",
       "      <td>May-2009</td>\n",
       "      <td>May-2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42015</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G5</td>\n",
       "      <td>Integrity Appraisals</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Feb-1997</td>\n",
       "      <td>78.7%</td>\n",
       "      <td>Nov-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42016</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G4</td>\n",
       "      <td>Stordok Document Destruction/Moreno Roofing Co.</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>credit_card</td>\n",
       "      <td>Jun-1982</td>\n",
       "      <td>0%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Dec-2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42017</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E4</td>\n",
       "      <td>Aladdin Capital Management</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>credit_card</td>\n",
       "      <td>Nov-1999</td>\n",
       "      <td>65.2%</td>\n",
       "      <td>Nov-2009</td>\n",
       "      <td>Nov-2009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42018</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D4</td>\n",
       "      <td>Cayuga Medical Center</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Nov-1994</td>\n",
       "      <td>58.9%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42019</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E1</td>\n",
       "      <td>North Sound PT</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jan-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Feb-2000</td>\n",
       "      <td>94%</td>\n",
       "      <td>Oct-2009</td>\n",
       "      <td>Mar-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42020</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C3</td>\n",
       "      <td>UHS</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jan-2008</td>\n",
       "      <td>medical</td>\n",
       "      <td>Oct-1998</td>\n",
       "      <td>83.1%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Feb-2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42021</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D4</td>\n",
       "      <td>Adecco</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Nov-2002</td>\n",
       "      <td>60.7%</td>\n",
       "      <td>Aug-2009</td>\n",
       "      <td>Jan-2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42022</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E5</td>\n",
       "      <td>Greatwide logistics Services</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>home_improvement</td>\n",
       "      <td>Oct-1994</td>\n",
       "      <td>49.9%</td>\n",
       "      <td>Sep-2009</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42023</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D4</td>\n",
       "      <td>ASU University</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>credit_card</td>\n",
       "      <td>Sep-2000</td>\n",
       "      <td>59.9%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Feb-2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42024</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B5</td>\n",
       "      <td>Wells Fargo</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Nov-1995</td>\n",
       "      <td>32.6%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42025</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D5</td>\n",
       "      <td>lakeside memorial hospital</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jan-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Nov-1997</td>\n",
       "      <td>77.4%</td>\n",
       "      <td>Jan-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42026</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B5</td>\n",
       "      <td>Controller Group</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>small_business</td>\n",
       "      <td>Mar-1998</td>\n",
       "      <td>46.5%</td>\n",
       "      <td>Oct-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42027</th>\n",
       "      <td>36 months</td>\n",
       "      <td>F1</td>\n",
       "      <td>David Johnston Equestrian Training Lic.#29026</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>small_business</td>\n",
       "      <td>Nov-2000</td>\n",
       "      <td>18.1%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Feb-2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42028</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E3</td>\n",
       "      <td>Lockheed Martin Corp.</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Dec-1983</td>\n",
       "      <td>45.9%</td>\n",
       "      <td>Feb-2011</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42029</th>\n",
       "      <td>36 months</td>\n",
       "      <td>G4</td>\n",
       "      <td>Self</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>small_business</td>\n",
       "      <td>Apr-1994</td>\n",
       "      <td>92.3%</td>\n",
       "      <td>Apr-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42508</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D5</td>\n",
       "      <td>SUNY- ESF</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jul-2007</td>\n",
       "      <td>credit_card</td>\n",
       "      <td>Dec-2004</td>\n",
       "      <td>49.3%</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Sep-2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42509</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D5</td>\n",
       "      <td>Clinton Shop Rite of Hunterdon County</td>\n",
       "      <td>OWN</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jul-2007</td>\n",
       "      <td>car</td>\n",
       "      <td>Dec-2006</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Feb-2010</td>\n",
       "      <td>Jul-2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42510</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B4</td>\n",
       "      <td>Scheduall</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jul-2007</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Feb-2008</td>\n",
       "      <td>May-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42511</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E5</td>\n",
       "      <td>GA-PCOM</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jul-2007</td>\n",
       "      <td>educational</td>\n",
       "      <td>Sep-1999</td>\n",
       "      <td>85%</td>\n",
       "      <td>Aug-2010</td>\n",
       "      <td>Aug-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42512</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A1</td>\n",
       "      <td>Tzigane Inc</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jul-2007</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Mar-1984</td>\n",
       "      <td>2.2%</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Jun-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42513</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C2</td>\n",
       "      <td>Yale University</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Jan-1996</td>\n",
       "      <td>66%</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Oct-2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42514</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B4</td>\n",
       "      <td>Brick Township board of education</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>debt_consolidation</td>\n",
       "      <td>Jul-2004</td>\n",
       "      <td>63.5%</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Feb-2015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42515</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B2</td>\n",
       "      <td>Classic Components</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42516</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C4</td>\n",
       "      <td>Compensation Solutions</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Apr-2010</td>\n",
       "      <td>Apr-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42517</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C3</td>\n",
       "      <td>Stanford University Libraries, LOCKSS Project</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Sep-2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42518</th>\n",
       "      <td>36 months</td>\n",
       "      <td>E3</td>\n",
       "      <td>Macy's</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Jun-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42519</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A3</td>\n",
       "      <td>Diamond Management and Technology Consultants</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Aug-2009</td>\n",
       "      <td>Aug-2009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42520</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A2</td>\n",
       "      <td>U.S. Bank</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jan-2008</td>\n",
       "      <td>Jun-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42521</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A2</td>\n",
       "      <td>NC</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jan-2008</td>\n",
       "      <td>Jun-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42522</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A5</td>\n",
       "      <td>College Pro Painters</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Mar-2010</td>\n",
       "      <td>Mar-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42523</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A1</td>\n",
       "      <td>Mana Products</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Apr-2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42524</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C4</td>\n",
       "      <td>Apto Solutions</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Jun-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42525</th>\n",
       "      <td>36 months</td>\n",
       "      <td>D3</td>\n",
       "      <td>Infinitely law group</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>May-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42526</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C4</td>\n",
       "      <td>Town of Plainville</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Aug-2008</td>\n",
       "      <td>Jun-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42527</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B2</td>\n",
       "      <td>Tanks Tavern</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Jun-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42528</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OWN</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>May-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42529</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jul-2010</td>\n",
       "      <td>Jul-2010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42530</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B4</td>\n",
       "      <td>Air Force</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>May-2008</td>\n",
       "      <td>Jun-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42531</th>\n",
       "      <td>36 months</td>\n",
       "      <td>C1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Mar-2008</td>\n",
       "      <td>Feb-2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42532</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B4</td>\n",
       "      <td>Halping hands company inc.</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Sep-2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42533</th>\n",
       "      <td>36 months</td>\n",
       "      <td>B3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RENT</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>May-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42534</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NONE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Aug-2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42535</th>\n",
       "      <td>36 months</td>\n",
       "      <td>A3</td>\n",
       "      <td>Homemaker</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>Not Verified</td>\n",
       "      <td>Jun-2007</td>\n",
       "      <td>other</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Jun-2010</td>\n",
       "      <td>Feb-2015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42536</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42537</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>538 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             term sub_grade                                        emp_title  \\\n",
       "42000   36 months        C2                                         Best Buy   \n",
       "42001   36 months        G2                                     CVS PHARMACY   \n",
       "42002   36 months        E4                                   General Motors   \n",
       "42003   36 months        G4                               usa medical center   \n",
       "42004   36 months        B3                                 InvestSource Inc   \n",
       "42005   36 months        D1                                           kaiser   \n",
       "42006   36 months        C1                               Panoramic Software   \n",
       "42007   36 months        D5                                        AT&T Inc.   \n",
       "42008   36 months        D4                                              D&Y   \n",
       "42009   36 months        D5                         Frederick's of Hollywood   \n",
       "42010   36 months        F2                            Yuma Nursing Home Ctr   \n",
       "42011   36 months        E5                               Akiva Incorporated   \n",
       "42012   36 months        G4                                    Citrix Online   \n",
       "42013   36 months        G3                          Virginia College Online   \n",
       "42014   36 months        E2                              Zone 7 Water Agency   \n",
       "42015   36 months        G5                             Integrity Appraisals   \n",
       "42016   36 months        G4  Stordok Document Destruction/Moreno Roofing Co.   \n",
       "42017   36 months        E4                       Aladdin Capital Management   \n",
       "42018   36 months        D4                            Cayuga Medical Center   \n",
       "42019   36 months        E1                                   North Sound PT   \n",
       "42020   36 months        C3                                              UHS   \n",
       "42021   36 months        D4                                           Adecco   \n",
       "42022   36 months        E5                     Greatwide logistics Services   \n",
       "42023   36 months        D4                                   ASU University   \n",
       "42024   36 months        B5                                      Wells Fargo   \n",
       "42025   36 months        D5                       lakeside memorial hospital   \n",
       "42026   36 months        B5                                 Controller Group   \n",
       "42027   36 months        F1    David Johnston Equestrian Training Lic.#29026   \n",
       "42028   36 months        E3                            Lockheed Martin Corp.   \n",
       "42029   36 months        G4                                             Self   \n",
       "...           ...       ...                                              ...   \n",
       "42508   36 months        D5                                        SUNY- ESF   \n",
       "42509   36 months        D5            Clinton Shop Rite of Hunterdon County   \n",
       "42510   36 months        B4                                        Scheduall   \n",
       "42511   36 months        E5                                          GA-PCOM   \n",
       "42512   36 months        A1                                      Tzigane Inc   \n",
       "42513   36 months        C2                                  Yale University   \n",
       "42514   36 months        B4                Brick Township board of education   \n",
       "42515   36 months        B2                               Classic Components   \n",
       "42516   36 months        C4                           Compensation Solutions   \n",
       "42517   36 months        C3    Stanford University Libraries, LOCKSS Project   \n",
       "42518   36 months        E3                                           Macy's   \n",
       "42519   36 months        A3    Diamond Management and Technology Consultants   \n",
       "42520   36 months        A2                                        U.S. Bank   \n",
       "42521   36 months        A2                                               NC   \n",
       "42522   36 months        A5                             College Pro Painters   \n",
       "42523   36 months        A1                                    Mana Products   \n",
       "42524   36 months        C4                                   Apto Solutions   \n",
       "42525   36 months        D3                             Infinitely law group   \n",
       "42526   36 months        C4                               Town of Plainville   \n",
       "42527   36 months        B2                                     Tanks Tavern   \n",
       "42528   36 months        B3                                              NaN   \n",
       "42529   36 months        B5                                              NaN   \n",
       "42530   36 months        B4                                        Air Force   \n",
       "42531   36 months        C1                                              NaN   \n",
       "42532   36 months        B4                       Halping hands company inc.   \n",
       "42533   36 months        B3                                              NaN   \n",
       "42534   36 months        A5                                              NaN   \n",
       "42535   36 months        A3                                        Homemaker   \n",
       "42536         NaN       NaN                                              NaN   \n",
       "42537         NaN       NaN                                              NaN   \n",
       "\n",
       "      home_ownership verification_status   issue_d             purpose  \\\n",
       "42000           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42001            OWN        Not Verified  Feb-2008  debt_consolidation   \n",
       "42002           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42003           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42004           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42005       MORTGAGE        Not Verified  Feb-2008  debt_consolidation   \n",
       "42006       MORTGAGE        Not Verified  Feb-2008  debt_consolidation   \n",
       "42007           RENT        Not Verified  Feb-2008      small_business   \n",
       "42008           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42009           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42010           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42011           RENT        Not Verified  Feb-2008      small_business   \n",
       "42012            OWN        Not Verified  Feb-2008  debt_consolidation   \n",
       "42013       MORTGAGE        Not Verified  Feb-2008               other   \n",
       "42014           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42015       MORTGAGE        Not Verified  Feb-2008  debt_consolidation   \n",
       "42016       MORTGAGE        Not Verified  Feb-2008         credit_card   \n",
       "42017           RENT        Not Verified  Feb-2008         credit_card   \n",
       "42018           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42019           RENT        Not Verified  Jan-2008  debt_consolidation   \n",
       "42020           RENT        Not Verified  Jan-2008             medical   \n",
       "42021           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42022       MORTGAGE        Not Verified  Feb-2008    home_improvement   \n",
       "42023           RENT        Not Verified  Feb-2008         credit_card   \n",
       "42024           RENT        Not Verified  Feb-2008  debt_consolidation   \n",
       "42025       MORTGAGE        Not Verified  Jan-2008  debt_consolidation   \n",
       "42026           RENT        Not Verified  Feb-2008      small_business   \n",
       "42027           RENT        Not Verified  Feb-2008      small_business   \n",
       "42028       MORTGAGE        Not Verified  Feb-2008  debt_consolidation   \n",
       "42029       MORTGAGE        Not Verified  Feb-2008      small_business   \n",
       "...              ...                 ...       ...                 ...   \n",
       "42508           RENT        Not Verified  Jul-2007         credit_card   \n",
       "42509            OWN        Not Verified  Jul-2007                 car   \n",
       "42510       MORTGAGE        Not Verified  Jul-2007  debt_consolidation   \n",
       "42511           RENT        Not Verified  Jul-2007         educational   \n",
       "42512       MORTGAGE        Not Verified  Jul-2007  debt_consolidation   \n",
       "42513           RENT        Not Verified  Jun-2007  debt_consolidation   \n",
       "42514       MORTGAGE        Not Verified  Jun-2007  debt_consolidation   \n",
       "42515           RENT        Not Verified  Jun-2007               other   \n",
       "42516           RENT        Not Verified  Jun-2007               other   \n",
       "42517           RENT        Not Verified  Jun-2007               other   \n",
       "42518           RENT        Not Verified  Jun-2007               other   \n",
       "42519       MORTGAGE        Not Verified  Jun-2007               other   \n",
       "42520       MORTGAGE        Not Verified  Jun-2007               other   \n",
       "42521       MORTGAGE        Not Verified  Jun-2007               other   \n",
       "42522       MORTGAGE        Not Verified  Jun-2007               other   \n",
       "42523           RENT        Not Verified  Jun-2007               other   \n",
       "42524           RENT        Not Verified  Jun-2007               other   \n",
       "42525           RENT        Not Verified  Jun-2007               other   \n",
       "42526           RENT        Not Verified  Jun-2007               other   \n",
       "42527           RENT        Not Verified  Jun-2007               other   \n",
       "42528            OWN        Not Verified  Jun-2007               other   \n",
       "42529           RENT        Not Verified  Jun-2007               other   \n",
       "42530           RENT        Not Verified  Jun-2007               other   \n",
       "42531           RENT        Not Verified  Jun-2007               other   \n",
       "42532           RENT        Not Verified  Jun-2007               other   \n",
       "42533           RENT        Not Verified  Jun-2007               other   \n",
       "42534           NONE        Not Verified  Jun-2007               other   \n",
       "42535       MORTGAGE        Not Verified  Jun-2007               other   \n",
       "42536            NaN                 NaN       NaN                 NaN   \n",
       "42537            NaN                 NaN       NaN                 NaN   \n",
       "\n",
       "      earliest_cr_line revol_util last_pymnt_d last_credit_pull_d  \n",
       "42000         Jul-2000     100.7%     Feb-2011           Jun-2016  \n",
       "42001         Mar-1989      51.9%     Nov-2008           Jun-2016  \n",
       "42002         Dec-1998      80.7%     Feb-2011           Jun-2016  \n",
       "42003         Jul-1995      57.2%     Feb-2011           Jun-2011  \n",
       "42004         Sep-2005        74%     Mar-2010           Aug-2010  \n",
       "42005         Nov-2000      37.6%     Mar-2010           Apr-2010  \n",
       "42006         Dec-1995        56%     Apr-2010           Jun-2016  \n",
       "42007         Feb-2000      49.7%     Feb-2009           Oct-2009  \n",
       "42008         Oct-1992      71.5%     Jan-2010           Jun-2016  \n",
       "42009         Oct-1994        49%     Jul-2008           Jun-2016  \n",
       "42010         Oct-1993      18.1%     Feb-2011           Nov-2014  \n",
       "42011         Jan-2006      30.8%     Jan-2009           Aug-2009  \n",
       "42012         Sep-1998      98.1%     Feb-2011           Mar-2016  \n",
       "42013         Dec-1990      78.4%     Oct-2008           Sep-2008  \n",
       "42014         Nov-1998      94.3%     May-2009           May-2013  \n",
       "42015         Feb-1997      78.7%     Nov-2008           Jun-2016  \n",
       "42016         Jun-1982         0%     Feb-2011           Dec-2014  \n",
       "42017         Nov-1999      65.2%     Nov-2009           Nov-2009  \n",
       "42018         Nov-1994      58.9%     Feb-2011           Jun-2016  \n",
       "42019         Feb-2000        94%     Oct-2009           Mar-2010  \n",
       "42020         Oct-1998      83.1%     Feb-2011           Feb-2011  \n",
       "42021         Nov-2002      60.7%     Aug-2009           Jan-2011  \n",
       "42022         Oct-1994      49.9%     Sep-2009           Jun-2016  \n",
       "42023         Sep-2000      59.9%     Feb-2011           Feb-2011  \n",
       "42024         Nov-1995      32.6%     Feb-2011           Jun-2016  \n",
       "42025         Nov-1997      77.4%     Jan-2011           Jun-2016  \n",
       "42026         Mar-1998      46.5%     Oct-2008           Jun-2016  \n",
       "42027         Nov-2000      18.1%     Feb-2011           Feb-2011  \n",
       "42028         Dec-1983      45.9%     Feb-2011           Jun-2016  \n",
       "42029         Apr-1994      92.3%     Apr-2008           Jun-2016  \n",
       "...                ...        ...          ...                ...  \n",
       "42508         Dec-2004      49.3%     Jul-2010           Sep-2012  \n",
       "42509         Dec-2006        NaN     Feb-2010           Jul-2013  \n",
       "42510              NaN        NaN     Feb-2008           May-2007  \n",
       "42511         Sep-1999        85%     Aug-2010           Aug-2010  \n",
       "42512         Mar-1984       2.2%     Jul-2010           Jun-2010  \n",
       "42513         Jan-1996        66%     Jun-2010           Oct-2014  \n",
       "42514         Jul-2004      63.5%     Jun-2010           Feb-2015  \n",
       "42515              NaN        NaN     Jul-2010           Jun-2016  \n",
       "42516              NaN        NaN     Apr-2010           Apr-2010  \n",
       "42517              NaN        NaN     Jul-2010           Sep-2014  \n",
       "42518              NaN        NaN     Jun-2010           Jun-2010  \n",
       "42519              NaN        NaN     Aug-2009           Aug-2009  \n",
       "42520              NaN        NaN     Jan-2008           Jun-2007  \n",
       "42521              NaN        NaN     Jan-2008           Jun-2007  \n",
       "42522              NaN        NaN     Mar-2010           Mar-2010  \n",
       "42523              NaN        NaN     Jun-2010           Apr-2014  \n",
       "42524              NaN        NaN     Jul-2010           Jun-2010  \n",
       "42525              NaN        NaN     May-2008           Jun-2016  \n",
       "42526              NaN        NaN     Aug-2008           Jun-2007  \n",
       "42527              NaN        NaN     Jul-2010           Jun-2010  \n",
       "42528              NaN        NaN     Jun-2010           May-2007  \n",
       "42529              NaN        NaN     Jul-2010           Jul-2010  \n",
       "42530              NaN        NaN     May-2008           Jun-2016  \n",
       "42531              NaN        NaN     Mar-2008           Feb-2013  \n",
       "42532              NaN        NaN     Jun-2010           Sep-2014  \n",
       "42533              NaN        NaN     Jun-2010           May-2007  \n",
       "42534              NaN        NaN     Jun-2010           Aug-2007  \n",
       "42535              NaN        NaN     Jun-2010           Feb-2015  \n",
       "42536              NaN        NaN          NaN                NaN  \n",
       "42537              NaN        NaN          NaN                NaN  \n",
       "\n",
       "[538 rows x 11 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk[useful_obj_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                                    object\n",
       "member_id                            float64\n",
       "loan_amnt                            float64\n",
       "funded_amnt                          float64\n",
       "funded_amnt_inv                      float64\n",
       "term                                 float64\n",
       "int_rate                              object\n",
       "installment                          float64\n",
       "grade                                 object\n",
       "sub_grade                           category\n",
       "emp_title                             object\n",
       "emp_length                            object\n",
       "home_ownership                      category\n",
       "annual_inc                           float64\n",
       "verification_status                 category\n",
       "issue_d                       datetime64[ns]\n",
       "loan_status                           object\n",
       "pymnt_plan                            object\n",
       "purpose                             category\n",
       "title                                 object\n",
       "zip_code                              object\n",
       "addr_state                            object\n",
       "dti                                  float64\n",
       "delinq_2yrs                          float64\n",
       "earliest_cr_line              datetime64[ns]\n",
       "inq_last_6mths                       float64\n",
       "open_acc                             float64\n",
       "pub_rec                              float64\n",
       "revol_bal                            float64\n",
       "revol_util                           float64\n",
       "total_acc                            float64\n",
       "initial_list_status                   object\n",
       "out_prncp                            float64\n",
       "out_prncp_inv                        float64\n",
       "total_pymnt                          float64\n",
       "total_pymnt_inv                      float64\n",
       "total_rec_prncp                      float64\n",
       "total_rec_int                        float64\n",
       "total_rec_late_fee                   float64\n",
       "recoveries                           float64\n",
       "collection_recovery_fee              float64\n",
       "last_pymnt_d                  datetime64[ns]\n",
       "last_pymnt_amnt                      float64\n",
       "last_credit_pull_d            datetime64[ns]\n",
       "collections_12_mths_ex_med           float64\n",
       "policy_code                          float64\n",
       "application_type                      object\n",
       "acc_now_delinq                       float64\n",
       "chargeoff_within_12_mths             float64\n",
       "delinq_amnt                          float64\n",
       "pub_rec_bankruptcies                 float64\n",
       "tax_liens                            float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
    "\n",
    "for chunk in chunk_iter:\n",
    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
    "    \n",
    "chunk.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'acc_now_delinq': 32,\n",
       " 'annual_inc': 7,\n",
       " 'chargeoff_within_12_mths': 148,\n",
       " 'collection_recovery_fee': 3,\n",
       " 'collections_12_mths_ex_med': 148,\n",
       " 'delinq_2yrs': 32,\n",
       " 'delinq_amnt': 32,\n",
       " 'dti': 3,\n",
       " 'funded_amnt': 3,\n",
       " 'funded_amnt_inv': 3,\n",
       " 'inq_last_6mths': 32,\n",
       " 'installment': 3,\n",
       " 'last_pymnt_amnt': 3,\n",
       " 'loan_amnt': 3,\n",
       " 'member_id': 3,\n",
       " 'open_acc': 32,\n",
       " 'out_prncp': 3,\n",
       " 'out_prncp_inv': 3,\n",
       " 'policy_code': 3,\n",
       " 'pub_rec': 32,\n",
       " 'pub_rec_bankruptcies': 1368,\n",
       " 'recoveries': 3,\n",
       " 'revol_bal': 3,\n",
       " 'revol_util': 93,\n",
       " 'tax_liens': 108,\n",
       " 'term': 3,\n",
       " 'total_acc': 32,\n",
       " 'total_pymnt': 3,\n",
       " 'total_pymnt_inv': 3,\n",
       " 'total_rec_int': 3,\n",
       " 'total_rec_late_fee': 3,\n",
       " 'total_rec_prncp': 3}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
    "mv_counts = {}\n",
    "for chunk in chunk_iter:\n",
    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
    "    float_cols = chunk.select_dtypes(include=['float'])\n",
    "    for col in float_cols.columns:\n",
    "        missing_values = len(chunk) - chunk[col].count()\n",
    "        if col in mv_counts:\n",
    "            mv_counts[col] = mv_counts[col] + missing_values\n",
    "        else:\n",
    "            mv_counts[col] = missing_values\n",
    "mv_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'acc_now_delinq': 32,\n",
       " 'annual_inc': 7,\n",
       " 'chargeoff_within_12_mths': 148,\n",
       " 'collection_recovery_fee': 3,\n",
       " 'collections_12_mths_ex_med': 148,\n",
       " 'delinq_2yrs': 32,\n",
       " 'delinq_amnt': 32,\n",
       " 'dti': 3,\n",
       " 'funded_amnt': 3,\n",
       " 'funded_amnt_inv': 3,\n",
       " 'inq_last_6mths': 32,\n",
       " 'installment': 3,\n",
       " 'last_pymnt_amnt': 3,\n",
       " 'loan_amnt': 3,\n",
       " 'member_id': 3,\n",
       " 'open_acc': 32,\n",
       " 'out_prncp': 3,\n",
       " 'out_prncp_inv': 3,\n",
       " 'policy_code': 3,\n",
       " 'pub_rec': 32,\n",
       " 'pub_rec_bankruptcies': 1368,\n",
       " 'recoveries': 3,\n",
       " 'revol_bal': 3,\n",
       " 'revol_util': 93,\n",
       " 'tax_liens': 108,\n",
       " 'term': 3,\n",
       " 'total_acc': 32,\n",
       " 'total_pymnt': 3,\n",
       " 'total_pymnt_inv': 3,\n",
       " 'total_rec_int': 3,\n",
       " 'total_rec_late_fee': 3,\n",
       " 'total_rec_prncp': 3}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
    "mv_counts = {}\n",
    "for chunk in chunk_iter:\n",
    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
    "    chunk = chunk.dropna(how='all')\n",
    "    float_cols = chunk.select_dtypes(include=['float'])\n",
    "    for col in float_cols.columns:\n",
    "        missing_values = len(chunk) - chunk[col].count()\n",
    "        if col in mv_counts:\n",
    "            mv_counts[col] = mv_counts[col] + missing_values\n",
    "        else:\n",
    "            mv_counts[col] = missing_values\n",
    "mv_counts"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}