|
@@ -1,1621 +0,0 @@
|
|
|
-{
|
|
|
- "cells": [
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "# Introduction"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 1,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/html": [
|
|
|
- "<div>\n",
|
|
|
- "<style scoped>\n",
|
|
|
- " .dataframe tbody tr th:only-of-type {\n",
|
|
|
- " vertical-align: middle;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe tbody tr th {\n",
|
|
|
- " vertical-align: top;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe thead th {\n",
|
|
|
- " text-align: right;\n",
|
|
|
- " }\n",
|
|
|
- "</style>\n",
|
|
|
- "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
- " <thead>\n",
|
|
|
- " <tr style=\"text-align: right;\">\n",
|
|
|
- " <th></th>\n",
|
|
|
- " <th>id</th>\n",
|
|
|
- " <th>member_id</th>\n",
|
|
|
- " <th>loan_amnt</th>\n",
|
|
|
- " <th>funded_amnt</th>\n",
|
|
|
- " <th>funded_amnt_inv</th>\n",
|
|
|
- " <th>term</th>\n",
|
|
|
- " <th>int_rate</th>\n",
|
|
|
- " <th>installment</th>\n",
|
|
|
- " <th>grade</th>\n",
|
|
|
- " <th>sub_grade</th>\n",
|
|
|
- " <th>emp_title</th>\n",
|
|
|
- " <th>emp_length</th>\n",
|
|
|
- " <th>home_ownership</th>\n",
|
|
|
- " <th>annual_inc</th>\n",
|
|
|
- " <th>verification_status</th>\n",
|
|
|
- " <th>issue_d</th>\n",
|
|
|
- " <th>loan_status</th>\n",
|
|
|
- " <th>pymnt_plan</th>\n",
|
|
|
- " <th>purpose</th>\n",
|
|
|
- " <th>title</th>\n",
|
|
|
- " <th>zip_code</th>\n",
|
|
|
- " <th>addr_state</th>\n",
|
|
|
- " <th>dti</th>\n",
|
|
|
- " <th>delinq_2yrs</th>\n",
|
|
|
- " <th>earliest_cr_line</th>\n",
|
|
|
- " <th>inq_last_6mths</th>\n",
|
|
|
- " <th>open_acc</th>\n",
|
|
|
- " <th>pub_rec</th>\n",
|
|
|
- " <th>revol_bal</th>\n",
|
|
|
- " <th>revol_util</th>\n",
|
|
|
- " <th>total_acc</th>\n",
|
|
|
- " <th>initial_list_status</th>\n",
|
|
|
- " <th>out_prncp</th>\n",
|
|
|
- " <th>out_prncp_inv</th>\n",
|
|
|
- " <th>total_pymnt</th>\n",
|
|
|
- " <th>total_pymnt_inv</th>\n",
|
|
|
- " <th>total_rec_prncp</th>\n",
|
|
|
- " <th>total_rec_int</th>\n",
|
|
|
- " <th>total_rec_late_fee</th>\n",
|
|
|
- " <th>recoveries</th>\n",
|
|
|
- " <th>collection_recovery_fee</th>\n",
|
|
|
- " <th>last_pymnt_d</th>\n",
|
|
|
- " <th>last_pymnt_amnt</th>\n",
|
|
|
- " <th>last_credit_pull_d</th>\n",
|
|
|
- " <th>collections_12_mths_ex_med</th>\n",
|
|
|
- " <th>policy_code</th>\n",
|
|
|
- " <th>application_type</th>\n",
|
|
|
- " <th>acc_now_delinq</th>\n",
|
|
|
- " <th>chargeoff_within_12_mths</th>\n",
|
|
|
- " <th>delinq_amnt</th>\n",
|
|
|
- " <th>pub_rec_bankruptcies</th>\n",
|
|
|
- " <th>tax_liens</th>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </thead>\n",
|
|
|
- " <tbody>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>0</th>\n",
|
|
|
- " <td>1077501</td>\n",
|
|
|
- " <td>1296599.0</td>\n",
|
|
|
- " <td>5000.0</td>\n",
|
|
|
- " <td>5000.0</td>\n",
|
|
|
- " <td>4975.0</td>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>10.65%</td>\n",
|
|
|
- " <td>162.87</td>\n",
|
|
|
- " <td>B</td>\n",
|
|
|
- " <td>B2</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>10+ years</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>24000.0</td>\n",
|
|
|
- " <td>Verified</td>\n",
|
|
|
- " <td>Dec-2011</td>\n",
|
|
|
- " <td>Fully Paid</td>\n",
|
|
|
- " <td>n</td>\n",
|
|
|
- " <td>credit_card</td>\n",
|
|
|
- " <td>Computer</td>\n",
|
|
|
- " <td>860xx</td>\n",
|
|
|
- " <td>AZ</td>\n",
|
|
|
- " <td>27.65</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>Jan-1985</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>3.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>13648.0</td>\n",
|
|
|
- " <td>83.7%</td>\n",
|
|
|
- " <td>9.0</td>\n",
|
|
|
- " <td>f</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>5863.155187</td>\n",
|
|
|
- " <td>5833.84</td>\n",
|
|
|
- " <td>5000.00</td>\n",
|
|
|
- " <td>863.16</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>Jan-2015</td>\n",
|
|
|
- " <td>171.62</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>INDIVIDUAL</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>1</th>\n",
|
|
|
- " <td>1077430</td>\n",
|
|
|
- " <td>1314167.0</td>\n",
|
|
|
- " <td>2500.0</td>\n",
|
|
|
- " <td>2500.0</td>\n",
|
|
|
- " <td>2500.0</td>\n",
|
|
|
- " <td>60 months</td>\n",
|
|
|
- " <td>15.27%</td>\n",
|
|
|
- " <td>59.83</td>\n",
|
|
|
- " <td>C</td>\n",
|
|
|
- " <td>C4</td>\n",
|
|
|
- " <td>Ryder</td>\n",
|
|
|
- " <td>< 1 year</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>30000.0</td>\n",
|
|
|
- " <td>Source Verified</td>\n",
|
|
|
- " <td>Dec-2011</td>\n",
|
|
|
- " <td>Charged Off</td>\n",
|
|
|
- " <td>n</td>\n",
|
|
|
- " <td>car</td>\n",
|
|
|
- " <td>bike</td>\n",
|
|
|
- " <td>309xx</td>\n",
|
|
|
- " <td>GA</td>\n",
|
|
|
- " <td>1.00</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>Apr-1999</td>\n",
|
|
|
- " <td>5.0</td>\n",
|
|
|
- " <td>3.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>1687.0</td>\n",
|
|
|
- " <td>9.4%</td>\n",
|
|
|
- " <td>4.0</td>\n",
|
|
|
- " <td>f</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>1008.710000</td>\n",
|
|
|
- " <td>1008.71</td>\n",
|
|
|
- " <td>456.46</td>\n",
|
|
|
- " <td>435.17</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>117.08</td>\n",
|
|
|
- " <td>1.11</td>\n",
|
|
|
- " <td>Apr-2013</td>\n",
|
|
|
- " <td>119.66</td>\n",
|
|
|
- " <td>Sep-2013</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>INDIVIDUAL</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>2</th>\n",
|
|
|
- " <td>1077175</td>\n",
|
|
|
- " <td>1313524.0</td>\n",
|
|
|
- " <td>2400.0</td>\n",
|
|
|
- " <td>2400.0</td>\n",
|
|
|
- " <td>2400.0</td>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>15.96%</td>\n",
|
|
|
- " <td>84.33</td>\n",
|
|
|
- " <td>C</td>\n",
|
|
|
- " <td>C5</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>10+ years</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>12252.0</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Dec-2011</td>\n",
|
|
|
- " <td>Fully Paid</td>\n",
|
|
|
- " <td>n</td>\n",
|
|
|
- " <td>small_business</td>\n",
|
|
|
- " <td>real estate business</td>\n",
|
|
|
- " <td>606xx</td>\n",
|
|
|
- " <td>IL</td>\n",
|
|
|
- " <td>8.72</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>Nov-2001</td>\n",
|
|
|
- " <td>2.0</td>\n",
|
|
|
- " <td>2.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>2956.0</td>\n",
|
|
|
- " <td>98.5%</td>\n",
|
|
|
- " <td>10.0</td>\n",
|
|
|
- " <td>f</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>3005.666844</td>\n",
|
|
|
- " <td>3005.67</td>\n",
|
|
|
- " <td>2400.00</td>\n",
|
|
|
- " <td>605.67</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>Jun-2014</td>\n",
|
|
|
- " <td>649.91</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>INDIVIDUAL</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3</th>\n",
|
|
|
- " <td>1076863</td>\n",
|
|
|
- " <td>1277178.0</td>\n",
|
|
|
- " <td>10000.0</td>\n",
|
|
|
- " <td>10000.0</td>\n",
|
|
|
- " <td>10000.0</td>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>13.49%</td>\n",
|
|
|
- " <td>339.31</td>\n",
|
|
|
- " <td>C</td>\n",
|
|
|
- " <td>C1</td>\n",
|
|
|
- " <td>AIR RESOURCES BOARD</td>\n",
|
|
|
- " <td>10+ years</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>49200.0</td>\n",
|
|
|
- " <td>Source Verified</td>\n",
|
|
|
- " <td>Dec-2011</td>\n",
|
|
|
- " <td>Fully Paid</td>\n",
|
|
|
- " <td>n</td>\n",
|
|
|
- " <td>other</td>\n",
|
|
|
- " <td>personel</td>\n",
|
|
|
- " <td>917xx</td>\n",
|
|
|
- " <td>CA</td>\n",
|
|
|
- " <td>20.00</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>Feb-1996</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>10.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>5598.0</td>\n",
|
|
|
- " <td>21%</td>\n",
|
|
|
- " <td>37.0</td>\n",
|
|
|
- " <td>f</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>12231.890000</td>\n",
|
|
|
- " <td>12231.89</td>\n",
|
|
|
- " <td>10000.00</td>\n",
|
|
|
- " <td>2214.92</td>\n",
|
|
|
- " <td>16.97</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>Jan-2015</td>\n",
|
|
|
- " <td>357.48</td>\n",
|
|
|
- " <td>Apr-2016</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>INDIVIDUAL</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>4</th>\n",
|
|
|
- " <td>1075358</td>\n",
|
|
|
- " <td>1311748.0</td>\n",
|
|
|
- " <td>3000.0</td>\n",
|
|
|
- " <td>3000.0</td>\n",
|
|
|
- " <td>3000.0</td>\n",
|
|
|
- " <td>60 months</td>\n",
|
|
|
- " <td>12.69%</td>\n",
|
|
|
- " <td>67.79</td>\n",
|
|
|
- " <td>B</td>\n",
|
|
|
- " <td>B5</td>\n",
|
|
|
- " <td>University Medical Group</td>\n",
|
|
|
- " <td>1 year</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>80000.0</td>\n",
|
|
|
- " <td>Source Verified</td>\n",
|
|
|
- " <td>Dec-2011</td>\n",
|
|
|
- " <td>Current</td>\n",
|
|
|
- " <td>n</td>\n",
|
|
|
- " <td>other</td>\n",
|
|
|
- " <td>Personal</td>\n",
|
|
|
- " <td>972xx</td>\n",
|
|
|
- " <td>OR</td>\n",
|
|
|
- " <td>17.94</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>Jan-1996</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>15.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>27783.0</td>\n",
|
|
|
- " <td>53.9%</td>\n",
|
|
|
- " <td>38.0</td>\n",
|
|
|
- " <td>f</td>\n",
|
|
|
- " <td>461.73</td>\n",
|
|
|
- " <td>461.73</td>\n",
|
|
|
- " <td>3581.120000</td>\n",
|
|
|
- " <td>3581.12</td>\n",
|
|
|
- " <td>2538.27</td>\n",
|
|
|
- " <td>1042.85</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>0.00</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " <td>67.79</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>1.0</td>\n",
|
|
|
- " <td>INDIVIDUAL</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " <td>0.0</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </tbody>\n",
|
|
|
- "</table>\n",
|
|
|
- "</div>"
|
|
|
- ],
|
|
|
- "text/plain": [
|
|
|
- " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n",
|
|
|
- "0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months \n",
|
|
|
- "1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months \n",
|
|
|
- "2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months \n",
|
|
|
- "3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months \n",
|
|
|
- "4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months \n",
|
|
|
- "\n",
|
|
|
- " int_rate installment grade sub_grade emp_title emp_length \\\n",
|
|
|
- "0 10.65% 162.87 B B2 NaN 10+ years \n",
|
|
|
- "1 15.27% 59.83 C C4 Ryder < 1 year \n",
|
|
|
- "2 15.96% 84.33 C C5 NaN 10+ years \n",
|
|
|
- "3 13.49% 339.31 C C1 AIR RESOURCES BOARD 10+ years \n",
|
|
|
- "4 12.69% 67.79 B B5 University Medical Group 1 year \n",
|
|
|
- "\n",
|
|
|
- " home_ownership annual_inc verification_status issue_d loan_status \\\n",
|
|
|
- "0 RENT 24000.0 Verified Dec-2011 Fully Paid \n",
|
|
|
- "1 RENT 30000.0 Source Verified Dec-2011 Charged Off \n",
|
|
|
- "2 RENT 12252.0 Not Verified Dec-2011 Fully Paid \n",
|
|
|
- "3 RENT 49200.0 Source Verified Dec-2011 Fully Paid \n",
|
|
|
- "4 RENT 80000.0 Source Verified Dec-2011 Current \n",
|
|
|
- "\n",
|
|
|
- " pymnt_plan purpose title zip_code addr_state dti \\\n",
|
|
|
- "0 n credit_card Computer 860xx AZ 27.65 \n",
|
|
|
- "1 n car bike 309xx GA 1.00 \n",
|
|
|
- "2 n small_business real estate business 606xx IL 8.72 \n",
|
|
|
- "3 n other personel 917xx CA 20.00 \n",
|
|
|
- "4 n other Personal 972xx OR 17.94 \n",
|
|
|
- "\n",
|
|
|
- " delinq_2yrs earliest_cr_line inq_last_6mths open_acc pub_rec revol_bal \\\n",
|
|
|
- "0 0.0 Jan-1985 1.0 3.0 0.0 13648.0 \n",
|
|
|
- "1 0.0 Apr-1999 5.0 3.0 0.0 1687.0 \n",
|
|
|
- "2 0.0 Nov-2001 2.0 2.0 0.0 2956.0 \n",
|
|
|
- "3 0.0 Feb-1996 1.0 10.0 0.0 5598.0 \n",
|
|
|
- "4 0.0 Jan-1996 0.0 15.0 0.0 27783.0 \n",
|
|
|
- "\n",
|
|
|
- " revol_util total_acc initial_list_status out_prncp out_prncp_inv \\\n",
|
|
|
- "0 83.7% 9.0 f 0.00 0.00 \n",
|
|
|
- "1 9.4% 4.0 f 0.00 0.00 \n",
|
|
|
- "2 98.5% 10.0 f 0.00 0.00 \n",
|
|
|
- "3 21% 37.0 f 0.00 0.00 \n",
|
|
|
- "4 53.9% 38.0 f 461.73 461.73 \n",
|
|
|
- "\n",
|
|
|
- " total_pymnt total_pymnt_inv total_rec_prncp total_rec_int \\\n",
|
|
|
- "0 5863.155187 5833.84 5000.00 863.16 \n",
|
|
|
- "1 1008.710000 1008.71 456.46 435.17 \n",
|
|
|
- "2 3005.666844 3005.67 2400.00 605.67 \n",
|
|
|
- "3 12231.890000 12231.89 10000.00 2214.92 \n",
|
|
|
- "4 3581.120000 3581.12 2538.27 1042.85 \n",
|
|
|
- "\n",
|
|
|
- " total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d \\\n",
|
|
|
- "0 0.00 0.00 0.00 Jan-2015 \n",
|
|
|
- "1 0.00 117.08 1.11 Apr-2013 \n",
|
|
|
- "2 0.00 0.00 0.00 Jun-2014 \n",
|
|
|
- "3 16.97 0.00 0.00 Jan-2015 \n",
|
|
|
- "4 0.00 0.00 0.00 Jun-2016 \n",
|
|
|
- "\n",
|
|
|
- " last_pymnt_amnt last_credit_pull_d collections_12_mths_ex_med \\\n",
|
|
|
- "0 171.62 Jun-2016 0.0 \n",
|
|
|
- "1 119.66 Sep-2013 0.0 \n",
|
|
|
- "2 649.91 Jun-2016 0.0 \n",
|
|
|
- "3 357.48 Apr-2016 0.0 \n",
|
|
|
- "4 67.79 Jun-2016 0.0 \n",
|
|
|
- "\n",
|
|
|
- " policy_code application_type acc_now_delinq chargeoff_within_12_mths \\\n",
|
|
|
- "0 1.0 INDIVIDUAL 0.0 0.0 \n",
|
|
|
- "1 1.0 INDIVIDUAL 0.0 0.0 \n",
|
|
|
- "2 1.0 INDIVIDUAL 0.0 0.0 \n",
|
|
|
- "3 1.0 INDIVIDUAL 0.0 0.0 \n",
|
|
|
- "4 1.0 INDIVIDUAL 0.0 0.0 \n",
|
|
|
- "\n",
|
|
|
- " delinq_amnt pub_rec_bankruptcies tax_liens \n",
|
|
|
- "0 0.0 0.0 0.0 \n",
|
|
|
- "1 0.0 0.0 0.0 \n",
|
|
|
- "2 0.0 0.0 0.0 \n",
|
|
|
- "3 0.0 0.0 0.0 \n",
|
|
|
- "4 0.0 0.0 0.0 "
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 1,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "import pandas as pd\n",
|
|
|
- "import numpy as np\n",
|
|
|
- "pd.options.display.max_columns = 99\n",
|
|
|
- "\n",
|
|
|
- "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n",
|
|
|
- "first_five"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 2,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "1.5502548217773438"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 2,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n",
|
|
|
- "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk."
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 3,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "name": "stdout",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "4.649059295654297\n",
|
|
|
- "4.644805908203125\n",
|
|
|
- "4.646563529968262\n",
|
|
|
- "4.647915840148926\n",
|
|
|
- "4.644108772277832\n",
|
|
|
- "4.645991325378418\n",
|
|
|
- "4.644582748413086\n",
|
|
|
- "4.646951675415039\n",
|
|
|
- "4.645077705383301\n",
|
|
|
- "4.64512825012207\n",
|
|
|
- "4.657840728759766\n",
|
|
|
- "4.656707763671875\n",
|
|
|
- "4.663515090942383\n",
|
|
|
- "4.896956443786621\n",
|
|
|
- "0.880854606628418\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " print(chunk.memory_usage(deep=True).sum()/(1024*1024))"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "## How many rows are in the dataset?"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 4,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "name": "stdout",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "42538\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
|
|
|
- "total_rows = 0\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " total_rows += len(chunk)\n",
|
|
|
- "print(total_rows)"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "# Exploring the Data in Chunks\n",
|
|
|
- "\n",
|
|
|
- "## How many columns have a numeric type? How many have a string type?"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 5,
|
|
|
- "metadata": {
|
|
|
- "scrolled": true
|
|
|
- },
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "name": "stdout",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]\n",
|
|
|
- "[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "# Numeric columns\n",
|
|
|
- "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
|
|
|
- "\n",
|
|
|
- "numeric = []\n",
|
|
|
- "string = []\n",
|
|
|
- "for lc in loans_chunks:\n",
|
|
|
- " nums = lc.select_dtypes(include=[np.number]).shape[1]\n",
|
|
|
- " numeric.append(nums)\n",
|
|
|
- " strs = lc.select_dtypes(include=['object']).shape[1]\n",
|
|
|
- " string.append(strs)\n",
|
|
|
- "\n",
|
|
|
- "print(numeric)\n",
|
|
|
- "print(string)"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 6,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "name": "stdout",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
|
|
|
- "\n",
|
|
|
- "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
|
|
|
- "\n",
|
|
|
- "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
|
|
|
- "\n",
|
|
|
- "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n",
|
|
|
- "\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "# Are string columns consistent across chunks?\n",
|
|
|
- "obj_cols = []\n",
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
|
|
|
- "\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n",
|
|
|
- " if len(obj_cols) > 0:\n",
|
|
|
- " is_same = obj_cols == chunk_obj_cols\n",
|
|
|
- " if not is_same:\n",
|
|
|
- " print(\"overall obj cols:\", obj_cols, \"\\n\")\n",
|
|
|
- " print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\") \n",
|
|
|
- " else:\n",
|
|
|
- " obj_cols = chunk_obj_cols"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {
|
|
|
- "collapsed": true
|
|
|
- },
|
|
|
- "source": [
|
|
|
- "### Observation 1: By default โ 31 numeric columns and 21 string columns.\n",
|
|
|
- "\n",
|
|
|
- "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modeling, let's ignore this column.\n",
|
|
|
- "\n",
|
|
|
- "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 7,
|
|
|
- "metadata": {
|
|
|
- "scrolled": true
|
|
|
- },
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "name": "stdout",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "term 2\n",
|
|
|
- "grade 7\n",
|
|
|
- "sub_grade 35\n",
|
|
|
- "emp_length 11\n",
|
|
|
- "home_ownership 5\n",
|
|
|
- "verification_status 3\n",
|
|
|
- "loan_status 9\n",
|
|
|
- "pymnt_plan 2\n",
|
|
|
- "purpose 14\n",
|
|
|
- "initial_list_status 1\n",
|
|
|
- "application_type 1\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
|
|
|
- "\n",
|
|
|
- "uniques = {}\n",
|
|
|
- "for lc in loans_chunks:\n",
|
|
|
- " strings_only = lc.select_dtypes(include=['object'])\n",
|
|
|
- " cols = strings_only.columns\n",
|
|
|
- " for c in cols:\n",
|
|
|
- " val_counts = strings_only[c].value_counts()\n",
|
|
|
- " if c in uniques:\n",
|
|
|
- " uniques[c].append(val_counts)\n",
|
|
|
- " else:\n",
|
|
|
- " uniques[c] = [val_counts]\n",
|
|
|
- "\n",
|
|
|
- "uniques_combined = {}\n",
|
|
|
- "unique_stats = {\n",
|
|
|
- " 'column_name': [],\n",
|
|
|
- " 'total_values': [],\n",
|
|
|
- " 'unique_values': [],\n",
|
|
|
- "}\n",
|
|
|
- "for col in uniques:\n",
|
|
|
- " u_concat = pd.concat(uniques[col])\n",
|
|
|
- " u_group = u_concat.groupby(u_concat.index).sum()\n",
|
|
|
- " uniques_combined[col] = u_group\n",
|
|
|
- " if u_group.shape[0] < 50:\n",
|
|
|
- " print(col, u_group.shape[0])"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "## Which float columns have no missing values and could be candidates for conversion to the integer type?"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 8,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "member_id 3\n",
|
|
|
- "total_rec_int 3\n",
|
|
|
- "total_pymnt_inv 3\n",
|
|
|
- "total_pymnt 3\n",
|
|
|
- "revol_bal 3\n",
|
|
|
- "recoveries 3\n",
|
|
|
- "policy_code 3\n",
|
|
|
- "out_prncp_inv 3\n",
|
|
|
- "out_prncp 3\n",
|
|
|
- "total_rec_late_fee 3\n",
|
|
|
- "loan_amnt 3\n",
|
|
|
- "last_pymnt_amnt 3\n",
|
|
|
- "total_rec_prncp 3\n",
|
|
|
- "funded_amnt_inv 3\n",
|
|
|
- "funded_amnt 3\n",
|
|
|
- "dti 3\n",
|
|
|
- "collection_recovery_fee 3\n",
|
|
|
- "installment 3\n",
|
|
|
- "annual_inc 7\n",
|
|
|
- "inq_last_6mths 32\n",
|
|
|
- "total_acc 32\n",
|
|
|
- "delinq_2yrs 32\n",
|
|
|
- "pub_rec 32\n",
|
|
|
- "delinq_amnt 32\n",
|
|
|
- "open_acc 32\n",
|
|
|
- "acc_now_delinq 32\n",
|
|
|
- "tax_liens 108\n",
|
|
|
- "collections_12_mths_ex_med 148\n",
|
|
|
- "chargeoff_within_12_mths 148\n",
|
|
|
- "pub_rec_bankruptcies 1368\n",
|
|
|
- "dtype: int64"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 8,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
|
|
|
- "\n",
|
|
|
- "missing = []\n",
|
|
|
- "for lc in loans_chunks:\n",
|
|
|
- " floats = lc.select_dtypes(include=['float'])\n",
|
|
|
- " missing.append(floats.apply(pd.isnull).sum())\n",
|
|
|
- "\n",
|
|
|
- "combined_missing = pd.concat(missing)\n",
|
|
|
- "combined_missing.groupby(combined_missing.index).sum().sort_values()"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "## Calculate the total memory usage across all of the chunks."
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 9,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "66.21605968475342"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 9,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n",
|
|
|
- "\n",
|
|
|
- "mem_usage = []\n",
|
|
|
- "\n",
|
|
|
- "for lc in loans_chunks:\n",
|
|
|
- " mem_usage.append(lc.memory_usage(deep=True).sum() / 1024 ** 2)\n",
|
|
|
- "\n",
|
|
|
- "sum(mem_usage)"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "## Optimizing String Columns\n",
|
|
|
- "\n",
|
|
|
- "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modeling."
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 10,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "['term',\n",
|
|
|
- " 'int_rate',\n",
|
|
|
- " 'grade',\n",
|
|
|
- " 'sub_grade',\n",
|
|
|
- " 'emp_title',\n",
|
|
|
- " 'emp_length',\n",
|
|
|
- " 'home_ownership',\n",
|
|
|
- " 'verification_status',\n",
|
|
|
- " 'issue_d',\n",
|
|
|
- " 'loan_status',\n",
|
|
|
- " 'pymnt_plan',\n",
|
|
|
- " 'purpose',\n",
|
|
|
- " 'title',\n",
|
|
|
- " 'zip_code',\n",
|
|
|
- " 'addr_state',\n",
|
|
|
- " 'earliest_cr_line',\n",
|
|
|
- " 'revol_util',\n",
|
|
|
- " 'initial_list_status',\n",
|
|
|
- " 'last_pymnt_d',\n",
|
|
|
- " 'last_credit_pull_d',\n",
|
|
|
- " 'application_type']"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 10,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "obj_cols"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 11,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 12,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n",
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
|
|
|
- "str_cols_vc = {}\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " str_cols = chunk.select_dtypes(include=['object'])\n",
|
|
|
- " for col in str_cols.columns:\n",
|
|
|
- " current_col_vc = str_cols[col].value_counts()\n",
|
|
|
- " if col in str_cols_vc:\n",
|
|
|
- " str_cols_vc[col].append(current_col_vc)\n",
|
|
|
- " else:\n",
|
|
|
- " str_cols_vc[col] = [current_col_vc]"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 13,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "## Combine the value counts.\n",
|
|
|
- "combined_vcs = {}\n",
|
|
|
- "\n",
|
|
|
- "for col in str_cols_vc:\n",
|
|
|
- " combined_vc = pd.concat(str_cols_vc[col])\n",
|
|
|
- " final_vc = combined_vc.groupby(combined_vc.index).sum()\n",
|
|
|
- " combined_vcs[col] = final_vc"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 14,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "name": "stdout",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "term\n",
|
|
|
- " 36 months 31534\n",
|
|
|
- " 60 months 11001\n",
|
|
|
- "Name: term, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "sub_grade\n",
|
|
|
- "A1 1142\n",
|
|
|
- "A2 1520\n",
|
|
|
- "A3 1823\n",
|
|
|
- "A4 2905\n",
|
|
|
- "A5 2793\n",
|
|
|
- "B1 1882\n",
|
|
|
- "B2 2113\n",
|
|
|
- "B3 2997\n",
|
|
|
- "B4 2590\n",
|
|
|
- "B5 2807\n",
|
|
|
- "C1 2264\n",
|
|
|
- "C2 2157\n",
|
|
|
- "C3 1658\n",
|
|
|
- "C4 1370\n",
|
|
|
- "C5 1291\n",
|
|
|
- "D1 1053\n",
|
|
|
- "D2 1485\n",
|
|
|
- "D3 1322\n",
|
|
|
- "D4 1140\n",
|
|
|
- "D5 1016\n",
|
|
|
- "E1 884\n",
|
|
|
- "E2 791\n",
|
|
|
- "E3 668\n",
|
|
|
- "E4 552\n",
|
|
|
- "E5 499\n",
|
|
|
- "F1 392\n",
|
|
|
- "F2 308\n",
|
|
|
- "F3 236\n",
|
|
|
- "F4 211\n",
|
|
|
- "F5 154\n",
|
|
|
- "G1 141\n",
|
|
|
- "G2 107\n",
|
|
|
- "G3 79\n",
|
|
|
- "G4 99\n",
|
|
|
- "G5 86\n",
|
|
|
- "Name: sub_grade, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "emp_title\n",
|
|
|
- " old palm inc 1\n",
|
|
|
- " Brocade Communications 1\n",
|
|
|
- " CenturyLink 1\n",
|
|
|
- " Department of Homeland Security 1\n",
|
|
|
- " Down To Earth Distributors, Inc. 1\n",
|
|
|
- " ..\n",
|
|
|
- "zashko inc. 1\n",
|
|
|
- "zeno office solutions 1\n",
|
|
|
- "zion lutheran school 1\n",
|
|
|
- "zoll medical corp 1\n",
|
|
|
- "zozaya officiating 1\n",
|
|
|
- "Name: emp_title, Length: 30658, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "home_ownership\n",
|
|
|
- "MORTGAGE 18959\n",
|
|
|
- "NONE 8\n",
|
|
|
- "OTHER 136\n",
|
|
|
- "OWN 3251\n",
|
|
|
- "RENT 20181\n",
|
|
|
- "Name: home_ownership, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "verification_status\n",
|
|
|
- "Not Verified 18758\n",
|
|
|
- "Source Verified 10306\n",
|
|
|
- "Verified 13471\n",
|
|
|
- "Name: verification_status, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "issue_d\n",
|
|
|
- "Apr-2008 259\n",
|
|
|
- "Apr-2009 333\n",
|
|
|
- "Apr-2010 912\n",
|
|
|
- "Apr-2011 1563\n",
|
|
|
- "Aug-2007 74\n",
|
|
|
- "Aug-2008 100\n",
|
|
|
- "Aug-2009 446\n",
|
|
|
- "Aug-2010 1175\n",
|
|
|
- "Aug-2011 1934\n",
|
|
|
- "Dec-2007 172\n",
|
|
|
- "Dec-2008 253\n",
|
|
|
- "Dec-2009 658\n",
|
|
|
- "Dec-2010 1335\n",
|
|
|
- "Dec-2011 2267\n",
|
|
|
- "Feb-2008 306\n",
|
|
|
- "Feb-2009 302\n",
|
|
|
- "Feb-2010 682\n",
|
|
|
- "Feb-2011 1298\n",
|
|
|
- "Jan-2008 305\n",
|
|
|
- "Jan-2009 269\n",
|
|
|
- "Jan-2010 662\n",
|
|
|
- "Jan-2011 1380\n",
|
|
|
- "Jul-2007 63\n",
|
|
|
- "Jul-2008 141\n",
|
|
|
- "Jul-2009 411\n",
|
|
|
- "Jul-2010 1204\n",
|
|
|
- "Jul-2011 1875\n",
|
|
|
- "Jun-2007 24\n",
|
|
|
- "Jun-2008 124\n",
|
|
|
- "Jun-2009 406\n",
|
|
|
- "Jun-2010 1105\n",
|
|
|
- "Jun-2011 1835\n",
|
|
|
- "Mar-2008 402\n",
|
|
|
- "Mar-2009 324\n",
|
|
|
- "Mar-2010 828\n",
|
|
|
- "Mar-2011 1448\n",
|
|
|
- "May-2008 115\n",
|
|
|
- "May-2009 359\n",
|
|
|
- "May-2010 989\n",
|
|
|
- "May-2011 1704\n",
|
|
|
- "Nov-2007 112\n",
|
|
|
- "Nov-2008 209\n",
|
|
|
- "Nov-2009 662\n",
|
|
|
- "Nov-2010 1224\n",
|
|
|
- "Nov-2011 2232\n",
|
|
|
- "Oct-2007 105\n",
|
|
|
- "Oct-2008 122\n",
|
|
|
- "Oct-2009 604\n",
|
|
|
- "Oct-2010 1232\n",
|
|
|
- "Oct-2011 2118\n",
|
|
|
- "Sep-2007 53\n",
|
|
|
- "Sep-2008 57\n",
|
|
|
- "Sep-2009 507\n",
|
|
|
- "Sep-2010 1189\n",
|
|
|
- "Sep-2011 2067\n",
|
|
|
- "Name: issue_d, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "purpose\n",
|
|
|
- "car 1615\n",
|
|
|
- "credit_card 5477\n",
|
|
|
- "debt_consolidation 19776\n",
|
|
|
- "educational 422\n",
|
|
|
- "home_improvement 3199\n",
|
|
|
- "house 426\n",
|
|
|
- "major_purchase 2311\n",
|
|
|
- "medical 753\n",
|
|
|
- "moving 629\n",
|
|
|
- "other 4425\n",
|
|
|
- "renewable_energy 106\n",
|
|
|
- "small_business 1992\n",
|
|
|
- "vacation 400\n",
|
|
|
- "wedding 1004\n",
|
|
|
- "Name: purpose, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "earliest_cr_line\n",
|
|
|
- "Apr-1964 3\n",
|
|
|
- "Apr-1966 1\n",
|
|
|
- "Apr-1967 4\n",
|
|
|
- "Apr-1968 1\n",
|
|
|
- "Apr-1969 1\n",
|
|
|
- " ... \n",
|
|
|
- "Sep-2004 221\n",
|
|
|
- "Sep-2005 162\n",
|
|
|
- "Sep-2006 150\n",
|
|
|
- "Sep-2007 63\n",
|
|
|
- "Sep-2008 8\n",
|
|
|
- "Name: earliest_cr_line, Length: 530, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "revol_util\n",
|
|
|
- "0% 1070\n",
|
|
|
- "0.01% 1\n",
|
|
|
- "0.03% 1\n",
|
|
|
- "0.04% 1\n",
|
|
|
- "0.05% 1\n",
|
|
|
- " ... \n",
|
|
|
- "99.5% 24\n",
|
|
|
- "99.6% 27\n",
|
|
|
- "99.7% 32\n",
|
|
|
- "99.8% 25\n",
|
|
|
- "99.9% 29\n",
|
|
|
- "Name: revol_util, Length: 1119, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "last_pymnt_d\n",
|
|
|
- "Apr-2008 23\n",
|
|
|
- "Apr-2009 72\n",
|
|
|
- "Apr-2010 145\n",
|
|
|
- "Apr-2011 519\n",
|
|
|
- "Apr-2012 781\n",
|
|
|
- " ... \n",
|
|
|
- "Sep-2011 491\n",
|
|
|
- "Sep-2012 802\n",
|
|
|
- "Sep-2013 712\n",
|
|
|
- "Sep-2014 694\n",
|
|
|
- "Sep-2015 211\n",
|
|
|
- "Name: last_pymnt_d, Length: 103, dtype: int64\n",
|
|
|
- "-----------\n",
|
|
|
- "last_credit_pull_d\n",
|
|
|
- "Apr-2009 24\n",
|
|
|
- "Apr-2010 77\n",
|
|
|
- "Apr-2011 177\n",
|
|
|
- "Apr-2012 326\n",
|
|
|
- "Apr-2013 445\n",
|
|
|
- " ... \n",
|
|
|
- "Sep-2011 175\n",
|
|
|
- "Sep-2012 414\n",
|
|
|
- "Sep-2013 408\n",
|
|
|
- "Sep-2014 564\n",
|
|
|
- "Sep-2015 531\n",
|
|
|
- "Name: last_credit_pull_d, Length: 108, dtype: int64\n",
|
|
|
- "-----------\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "for col in useful_obj_cols:\n",
|
|
|
- " print(col)\n",
|
|
|
- " print(combined_vcs[col])\n",
|
|
|
- " print(\"-----------\")"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "### Convert to category."
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 15,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "convert_col_dtypes = {\n",
|
|
|
- " \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n",
|
|
|
- " \"verification_status\": \"category\", \"purpose\": \"category\"\n",
|
|
|
- "}"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "### Convert `term` and `revol_util` to numerical by data cleaning.\n",
|
|
|
- "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime."
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 16,
|
|
|
- "metadata": {
|
|
|
- "scrolled": true
|
|
|
- },
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/html": [
|
|
|
- "<div>\n",
|
|
|
- "<style scoped>\n",
|
|
|
- " .dataframe tbody tr th:only-of-type {\n",
|
|
|
- " vertical-align: middle;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe tbody tr th {\n",
|
|
|
- " vertical-align: top;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe thead th {\n",
|
|
|
- " text-align: right;\n",
|
|
|
- " }\n",
|
|
|
- "</style>\n",
|
|
|
- "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
- " <thead>\n",
|
|
|
- " <tr style=\"text-align: right;\">\n",
|
|
|
- " <th></th>\n",
|
|
|
- " <th>term</th>\n",
|
|
|
- " <th>sub_grade</th>\n",
|
|
|
- " <th>emp_title</th>\n",
|
|
|
- " <th>home_ownership</th>\n",
|
|
|
- " <th>verification_status</th>\n",
|
|
|
- " <th>issue_d</th>\n",
|
|
|
- " <th>purpose</th>\n",
|
|
|
- " <th>earliest_cr_line</th>\n",
|
|
|
- " <th>revol_util</th>\n",
|
|
|
- " <th>last_pymnt_d</th>\n",
|
|
|
- " <th>last_credit_pull_d</th>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </thead>\n",
|
|
|
- " <tbody>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42000</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>C2</td>\n",
|
|
|
- " <td>Best Buy</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Feb-2008</td>\n",
|
|
|
- " <td>debt_consolidation</td>\n",
|
|
|
- " <td>Jul-2000</td>\n",
|
|
|
- " <td>100.7%</td>\n",
|
|
|
- " <td>Feb-2011</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42001</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>G2</td>\n",
|
|
|
- " <td>CVS PHARMACY</td>\n",
|
|
|
- " <td>OWN</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Feb-2008</td>\n",
|
|
|
- " <td>debt_consolidation</td>\n",
|
|
|
- " <td>Mar-1989</td>\n",
|
|
|
- " <td>51.9%</td>\n",
|
|
|
- " <td>Nov-2008</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42002</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>E4</td>\n",
|
|
|
- " <td>General Motors</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Feb-2008</td>\n",
|
|
|
- " <td>debt_consolidation</td>\n",
|
|
|
- " <td>Dec-1998</td>\n",
|
|
|
- " <td>80.7%</td>\n",
|
|
|
- " <td>Feb-2011</td>\n",
|
|
|
- " <td>Jun-2016</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42003</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>G4</td>\n",
|
|
|
- " <td>usa medical center</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Feb-2008</td>\n",
|
|
|
- " <td>debt_consolidation</td>\n",
|
|
|
- " <td>Jul-1995</td>\n",
|
|
|
- " <td>57.2%</td>\n",
|
|
|
- " <td>Feb-2011</td>\n",
|
|
|
- " <td>Jun-2011</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42004</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>B3</td>\n",
|
|
|
- " <td>InvestSource Inc</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Feb-2008</td>\n",
|
|
|
- " <td>debt_consolidation</td>\n",
|
|
|
- " <td>Sep-2005</td>\n",
|
|
|
- " <td>74%</td>\n",
|
|
|
- " <td>Mar-2010</td>\n",
|
|
|
- " <td>Aug-2010</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>...</th>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42533</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>B3</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>RENT</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Jun-2007</td>\n",
|
|
|
- " <td>other</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>Jun-2010</td>\n",
|
|
|
- " <td>May-2007</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42534</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>A5</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NONE</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Jun-2007</td>\n",
|
|
|
- " <td>other</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>Jun-2010</td>\n",
|
|
|
- " <td>Aug-2007</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42535</th>\n",
|
|
|
- " <td>36 months</td>\n",
|
|
|
- " <td>A3</td>\n",
|
|
|
- " <td>Homemaker</td>\n",
|
|
|
- " <td>MORTGAGE</td>\n",
|
|
|
- " <td>Not Verified</td>\n",
|
|
|
- " <td>Jun-2007</td>\n",
|
|
|
- " <td>other</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>Jun-2010</td>\n",
|
|
|
- " <td>Feb-2015</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42536</th>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>42537</th>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " <td>NaN</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </tbody>\n",
|
|
|
- "</table>\n",
|
|
|
- "<p>538 rows ร 11 columns</p>\n",
|
|
|
- "</div>"
|
|
|
- ],
|
|
|
- "text/plain": [
|
|
|
- " term sub_grade emp_title home_ownership \\\n",
|
|
|
- "42000 36 months C2 Best Buy RENT \n",
|
|
|
- "42001 36 months G2 CVS PHARMACY OWN \n",
|
|
|
- "42002 36 months E4 General Motors RENT \n",
|
|
|
- "42003 36 months G4 usa medical center RENT \n",
|
|
|
- "42004 36 months B3 InvestSource Inc RENT \n",
|
|
|
- "... ... ... ... ... \n",
|
|
|
- "42533 36 months B3 NaN RENT \n",
|
|
|
- "42534 36 months A5 NaN NONE \n",
|
|
|
- "42535 36 months A3 Homemaker MORTGAGE \n",
|
|
|
- "42536 NaN NaN NaN NaN \n",
|
|
|
- "42537 NaN NaN NaN NaN \n",
|
|
|
- "\n",
|
|
|
- " verification_status issue_d purpose earliest_cr_line \\\n",
|
|
|
- "42000 Not Verified Feb-2008 debt_consolidation Jul-2000 \n",
|
|
|
- "42001 Not Verified Feb-2008 debt_consolidation Mar-1989 \n",
|
|
|
- "42002 Not Verified Feb-2008 debt_consolidation Dec-1998 \n",
|
|
|
- "42003 Not Verified Feb-2008 debt_consolidation Jul-1995 \n",
|
|
|
- "42004 Not Verified Feb-2008 debt_consolidation Sep-2005 \n",
|
|
|
- "... ... ... ... ... \n",
|
|
|
- "42533 Not Verified Jun-2007 other NaN \n",
|
|
|
- "42534 Not Verified Jun-2007 other NaN \n",
|
|
|
- "42535 Not Verified Jun-2007 other NaN \n",
|
|
|
- "42536 NaN NaN NaN NaN \n",
|
|
|
- "42537 NaN NaN NaN NaN \n",
|
|
|
- "\n",
|
|
|
- " revol_util last_pymnt_d last_credit_pull_d \n",
|
|
|
- "42000 100.7% Feb-2011 Jun-2016 \n",
|
|
|
- "42001 51.9% Nov-2008 Jun-2016 \n",
|
|
|
- "42002 80.7% Feb-2011 Jun-2016 \n",
|
|
|
- "42003 57.2% Feb-2011 Jun-2011 \n",
|
|
|
- "42004 74% Mar-2010 Aug-2010 \n",
|
|
|
- "... ... ... ... \n",
|
|
|
- "42533 NaN Jun-2010 May-2007 \n",
|
|
|
- "42534 NaN Jun-2010 Aug-2007 \n",
|
|
|
- "42535 NaN Jun-2010 Feb-2015 \n",
|
|
|
- "42536 NaN NaN NaN \n",
|
|
|
- "42537 NaN NaN NaN \n",
|
|
|
- "\n",
|
|
|
- "[538 rows x 11 columns]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 16,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "chunk[useful_obj_cols]"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 17,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "id object\n",
|
|
|
- "member_id float64\n",
|
|
|
- "loan_amnt float64\n",
|
|
|
- "funded_amnt float64\n",
|
|
|
- "funded_amnt_inv float64\n",
|
|
|
- "term float64\n",
|
|
|
- "int_rate object\n",
|
|
|
- "installment float64\n",
|
|
|
- "grade object\n",
|
|
|
- "sub_grade category\n",
|
|
|
- "emp_title object\n",
|
|
|
- "emp_length object\n",
|
|
|
- "home_ownership category\n",
|
|
|
- "annual_inc float64\n",
|
|
|
- "verification_status category\n",
|
|
|
- "issue_d datetime64[ns]\n",
|
|
|
- "loan_status object\n",
|
|
|
- "pymnt_plan object\n",
|
|
|
- "purpose category\n",
|
|
|
- "title object\n",
|
|
|
- "zip_code object\n",
|
|
|
- "addr_state object\n",
|
|
|
- "dti float64\n",
|
|
|
- "delinq_2yrs float64\n",
|
|
|
- "earliest_cr_line datetime64[ns]\n",
|
|
|
- "inq_last_6mths float64\n",
|
|
|
- "open_acc float64\n",
|
|
|
- "pub_rec float64\n",
|
|
|
- "revol_bal float64\n",
|
|
|
- "revol_util float64\n",
|
|
|
- "total_acc float64\n",
|
|
|
- "initial_list_status object\n",
|
|
|
- "out_prncp float64\n",
|
|
|
- "out_prncp_inv float64\n",
|
|
|
- "total_pymnt float64\n",
|
|
|
- "total_pymnt_inv float64\n",
|
|
|
- "total_rec_prncp float64\n",
|
|
|
- "total_rec_int float64\n",
|
|
|
- "total_rec_late_fee float64\n",
|
|
|
- "recoveries float64\n",
|
|
|
- "collection_recovery_fee float64\n",
|
|
|
- "last_pymnt_d datetime64[ns]\n",
|
|
|
- "last_pymnt_amnt float64\n",
|
|
|
- "last_credit_pull_d datetime64[ns]\n",
|
|
|
- "collections_12_mths_ex_med float64\n",
|
|
|
- "policy_code float64\n",
|
|
|
- "application_type object\n",
|
|
|
- "acc_now_delinq float64\n",
|
|
|
- "chargeoff_within_12_mths float64\n",
|
|
|
- "delinq_amnt float64\n",
|
|
|
- "pub_rec_bankruptcies float64\n",
|
|
|
- "tax_liens float64\n",
|
|
|
- "dtype: object"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 17,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
|
|
|
- "\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
|
|
|
- " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
|
|
|
- " chunk['term'] = pd.to_numeric(term_cleaned)\n",
|
|
|
- " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
|
|
|
- " \n",
|
|
|
- "chunk.dtypes"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 18,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "{'member_id': 3,\n",
|
|
|
- " 'loan_amnt': 3,\n",
|
|
|
- " 'funded_amnt': 3,\n",
|
|
|
- " 'funded_amnt_inv': 3,\n",
|
|
|
- " 'installment': 3,\n",
|
|
|
- " 'annual_inc': 7,\n",
|
|
|
- " 'dti': 3,\n",
|
|
|
- " 'delinq_2yrs': 32,\n",
|
|
|
- " 'inq_last_6mths': 32,\n",
|
|
|
- " 'open_acc': 32,\n",
|
|
|
- " 'pub_rec': 32,\n",
|
|
|
- " 'revol_bal': 3,\n",
|
|
|
- " 'revol_util': 93,\n",
|
|
|
- " 'total_acc': 32,\n",
|
|
|
- " 'out_prncp': 3,\n",
|
|
|
- " 'out_prncp_inv': 3,\n",
|
|
|
- " 'total_pymnt': 3,\n",
|
|
|
- " 'total_pymnt_inv': 3,\n",
|
|
|
- " 'total_rec_prncp': 3,\n",
|
|
|
- " 'total_rec_int': 3,\n",
|
|
|
- " 'total_rec_late_fee': 3,\n",
|
|
|
- " 'recoveries': 3,\n",
|
|
|
- " 'collection_recovery_fee': 3,\n",
|
|
|
- " 'last_pymnt_amnt': 3,\n",
|
|
|
- " 'collections_12_mths_ex_med': 148,\n",
|
|
|
- " 'policy_code': 3,\n",
|
|
|
- " 'acc_now_delinq': 32,\n",
|
|
|
- " 'chargeoff_within_12_mths': 148,\n",
|
|
|
- " 'delinq_amnt': 32,\n",
|
|
|
- " 'pub_rec_bankruptcies': 1368,\n",
|
|
|
- " 'tax_liens': 108,\n",
|
|
|
- " 'term': 3}"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 18,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
|
|
|
- "mv_counts = {}\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
|
|
|
- " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
|
|
|
- " chunk['term'] = pd.to_numeric(term_cleaned)\n",
|
|
|
- " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
|
|
|
- " float_cols = chunk.select_dtypes(include=['float'])\n",
|
|
|
- " for col in float_cols.columns:\n",
|
|
|
- " missing_values = len(chunk) - chunk[col].count()\n",
|
|
|
- " if col in mv_counts:\n",
|
|
|
- " mv_counts[col] = mv_counts[col] + missing_values\n",
|
|
|
- " else:\n",
|
|
|
- " mv_counts[col] = missing_values\n",
|
|
|
- "mv_counts"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 19,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/plain": [
|
|
|
- "{'member_id': 3,\n",
|
|
|
- " 'loan_amnt': 3,\n",
|
|
|
- " 'funded_amnt': 3,\n",
|
|
|
- " 'funded_amnt_inv': 3,\n",
|
|
|
- " 'installment': 3,\n",
|
|
|
- " 'annual_inc': 7,\n",
|
|
|
- " 'dti': 3,\n",
|
|
|
- " 'delinq_2yrs': 32,\n",
|
|
|
- " 'inq_last_6mths': 32,\n",
|
|
|
- " 'open_acc': 32,\n",
|
|
|
- " 'pub_rec': 32,\n",
|
|
|
- " 'revol_bal': 3,\n",
|
|
|
- " 'revol_util': 93,\n",
|
|
|
- " 'total_acc': 32,\n",
|
|
|
- " 'out_prncp': 3,\n",
|
|
|
- " 'out_prncp_inv': 3,\n",
|
|
|
- " 'total_pymnt': 3,\n",
|
|
|
- " 'total_pymnt_inv': 3,\n",
|
|
|
- " 'total_rec_prncp': 3,\n",
|
|
|
- " 'total_rec_int': 3,\n",
|
|
|
- " 'total_rec_late_fee': 3,\n",
|
|
|
- " 'recoveries': 3,\n",
|
|
|
- " 'collection_recovery_fee': 3,\n",
|
|
|
- " 'last_pymnt_amnt': 3,\n",
|
|
|
- " 'collections_12_mths_ex_med': 148,\n",
|
|
|
- " 'policy_code': 3,\n",
|
|
|
- " 'acc_now_delinq': 32,\n",
|
|
|
- " 'chargeoff_within_12_mths': 148,\n",
|
|
|
- " 'delinq_amnt': 32,\n",
|
|
|
- " 'pub_rec_bankruptcies': 1368,\n",
|
|
|
- " 'tax_liens': 108,\n",
|
|
|
- " 'term': 3}"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 19,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
|
|
|
- "mv_counts = {}\n",
|
|
|
- "for chunk in chunk_iter:\n",
|
|
|
- " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
|
|
|
- " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
|
|
|
- " chunk['term'] = pd.to_numeric(term_cleaned)\n",
|
|
|
- " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
|
|
|
- " chunk = chunk.dropna(how='all')\n",
|
|
|
- " float_cols = chunk.select_dtypes(include=['float'])\n",
|
|
|
- " for col in float_cols.columns:\n",
|
|
|
- " missing_values = len(chunk) - chunk[col].count()\n",
|
|
|
- " if col in mv_counts:\n",
|
|
|
- " mv_counts[col] = mv_counts[col] + missing_values\n",
|
|
|
- " else:\n",
|
|
|
- " mv_counts[col] = missing_values\n",
|
|
|
- "mv_counts"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "metadata": {
|
|
|
- "anaconda-cloud": {},
|
|
|
- "kernelspec": {
|
|
|
- "display_name": "Python 3",
|
|
|
- "language": "python",
|
|
|
- "name": "python3"
|
|
|
- },
|
|
|
- "language_info": {
|
|
|
- "codemirror_mode": {
|
|
|
- "name": "ipython",
|
|
|
- "version": 3
|
|
|
- },
|
|
|
- "file_extension": ".py",
|
|
|
- "mimetype": "text/x-python",
|
|
|
- "name": "python",
|
|
|
- "nbconvert_exporter": "python",
|
|
|
- "pygments_lexer": "ipython3",
|
|
|
- "version": "3.8.5"
|
|
|
- }
|
|
|
- },
|
|
|
- "nbformat": 4,
|
|
|
- "nbformat_minor": 2
|
|
|
-}
|