{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>id</th>\n", " <th>member_id</th>\n", " <th>loan_amnt</th>\n", " <th>funded_amnt</th>\n", " <th>funded_amnt_inv</th>\n", " <th>term</th>\n", " <th>int_rate</th>\n", " <th>installment</th>\n", " <th>grade</th>\n", " <th>sub_grade</th>\n", " <th>emp_title</th>\n", " <th>emp_length</th>\n", " <th>home_ownership</th>\n", " <th>annual_inc</th>\n", " <th>verification_status</th>\n", " <th>issue_d</th>\n", " <th>loan_status</th>\n", " <th>pymnt_plan</th>\n", " <th>purpose</th>\n", " <th>title</th>\n", " <th>zip_code</th>\n", " <th>addr_state</th>\n", " <th>dti</th>\n", " <th>delinq_2yrs</th>\n", " <th>earliest_cr_line</th>\n", " <th>inq_last_6mths</th>\n", " <th>open_acc</th>\n", " <th>pub_rec</th>\n", " <th>revol_bal</th>\n", " <th>revol_util</th>\n", " <th>total_acc</th>\n", " <th>initial_list_status</th>\n", " <th>out_prncp</th>\n", " <th>out_prncp_inv</th>\n", " <th>total_pymnt</th>\n", " <th>total_pymnt_inv</th>\n", " <th>total_rec_prncp</th>\n", " <th>total_rec_int</th>\n", " <th>total_rec_late_fee</th>\n", " <th>recoveries</th>\n", " <th>collection_recovery_fee</th>\n", " <th>last_pymnt_d</th>\n", " <th>last_pymnt_amnt</th>\n", " <th>last_credit_pull_d</th>\n", " <th>collections_12_mths_ex_med</th>\n", " <th>policy_code</th>\n", " <th>application_type</th>\n", " <th>acc_now_delinq</th>\n", " <th>chargeoff_within_12_mths</th>\n", " <th>delinq_amnt</th>\n", " <th>pub_rec_bankruptcies</th>\n", " <th>tax_liens</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1077501</td>\n", " <td>1296599.0</td>\n", " <td>5000.0</td>\n", " <td>5000.0</td>\n", " <td>4975.0</td>\n", " <td>36 months</td>\n", " <td>10.65%</td>\n", " <td>162.87</td>\n", " <td>B</td>\n", " <td>B2</td>\n", " <td>NaN</td>\n", " <td>10+ years</td>\n", " <td>RENT</td>\n", " <td>24000.0</td>\n", " <td>Verified</td>\n", " <td>Dec-2011</td>\n", " <td>Fully Paid</td>\n", " <td>n</td>\n", " <td>credit_card</td>\n", " <td>Computer</td>\n", " <td>860xx</td>\n", " <td>AZ</td>\n", " <td>27.65</td>\n", " <td>0.0</td>\n", " <td>Jan-1985</td>\n", " <td>1.0</td>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " <td>13648.0</td>\n", " <td>83.7%</td>\n", " <td>9.0</td>\n", " <td>f</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>5863.155187</td>\n", " <td>5833.84</td>\n", " <td>5000.00</td>\n", " <td>863.16</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>Jan-2015</td>\n", " <td>171.62</td>\n", " <td>Jun-2016</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>INDIVIDUAL</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1077430</td>\n", " <td>1314167.0</td>\n", " <td>2500.0</td>\n", " <td>2500.0</td>\n", " <td>2500.0</td>\n", " <td>60 months</td>\n", " <td>15.27%</td>\n", " <td>59.83</td>\n", " <td>C</td>\n", " <td>C4</td>\n", " <td>Ryder</td>\n", " <td>< 1 year</td>\n", " <td>RENT</td>\n", " <td>30000.0</td>\n", " <td>Source Verified</td>\n", " <td>Dec-2011</td>\n", " <td>Charged Off</td>\n", " <td>n</td>\n", " <td>car</td>\n", " <td>bike</td>\n", " <td>309xx</td>\n", " <td>GA</td>\n", " <td>1.00</td>\n", " <td>0.0</td>\n", " <td>Apr-1999</td>\n", " <td>5.0</td>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " <td>1687.0</td>\n", " <td>9.4%</td>\n", " <td>4.0</td>\n", " <td>f</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>1008.710000</td>\n", " <td>1008.71</td>\n", " <td>456.46</td>\n", " <td>435.17</td>\n", " <td>0.00</td>\n", " <td>117.08</td>\n", " <td>1.11</td>\n", " <td>Apr-2013</td>\n", " <td>119.66</td>\n", " <td>Sep-2013</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>INDIVIDUAL</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1077175</td>\n", " <td>1313524.0</td>\n", " <td>2400.0</td>\n", " <td>2400.0</td>\n", " <td>2400.0</td>\n", " <td>36 months</td>\n", " <td>15.96%</td>\n", " <td>84.33</td>\n", " <td>C</td>\n", " <td>C5</td>\n", " <td>NaN</td>\n", " <td>10+ years</td>\n", " <td>RENT</td>\n", " <td>12252.0</td>\n", " <td>Not Verified</td>\n", " <td>Dec-2011</td>\n", " <td>Fully Paid</td>\n", " <td>n</td>\n", " <td>small_business</td>\n", " <td>real estate business</td>\n", " <td>606xx</td>\n", " <td>IL</td>\n", " <td>8.72</td>\n", " <td>0.0</td>\n", " <td>Nov-2001</td>\n", " <td>2.0</td>\n", " <td>2.0</td>\n", " <td>0.0</td>\n", " <td>2956.0</td>\n", " <td>98.5%</td>\n", " <td>10.0</td>\n", " <td>f</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>3005.666844</td>\n", " <td>3005.67</td>\n", " <td>2400.00</td>\n", " <td>605.67</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>Jun-2014</td>\n", " <td>649.91</td>\n", " <td>Jun-2016</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>INDIVIDUAL</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1076863</td>\n", " <td>1277178.0</td>\n", " <td>10000.0</td>\n", " <td>10000.0</td>\n", " <td>10000.0</td>\n", " <td>36 months</td>\n", " <td>13.49%</td>\n", " <td>339.31</td>\n", " <td>C</td>\n", " <td>C1</td>\n", " <td>AIR RESOURCES BOARD</td>\n", " <td>10+ years</td>\n", " <td>RENT</td>\n", " <td>49200.0</td>\n", " <td>Source Verified</td>\n", " <td>Dec-2011</td>\n", " <td>Fully Paid</td>\n", " <td>n</td>\n", " <td>other</td>\n", " <td>personel</td>\n", " <td>917xx</td>\n", " <td>CA</td>\n", " <td>20.00</td>\n", " <td>0.0</td>\n", " <td>Feb-1996</td>\n", " <td>1.0</td>\n", " <td>10.0</td>\n", " <td>0.0</td>\n", " <td>5598.0</td>\n", " <td>21%</td>\n", " <td>37.0</td>\n", " <td>f</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>12231.890000</td>\n", " <td>12231.89</td>\n", " <td>10000.00</td>\n", " <td>2214.92</td>\n", " <td>16.97</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>Jan-2015</td>\n", " <td>357.48</td>\n", " <td>Apr-2016</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>INDIVIDUAL</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1075358</td>\n", " <td>1311748.0</td>\n", " <td>3000.0</td>\n", " <td>3000.0</td>\n", " <td>3000.0</td>\n", " <td>60 months</td>\n", " <td>12.69%</td>\n", " <td>67.79</td>\n", " <td>B</td>\n", " <td>B5</td>\n", " <td>University Medical Group</td>\n", " <td>1 year</td>\n", " <td>RENT</td>\n", " <td>80000.0</td>\n", " <td>Source Verified</td>\n", " <td>Dec-2011</td>\n", " <td>Current</td>\n", " <td>n</td>\n", " <td>other</td>\n", " <td>Personal</td>\n", " <td>972xx</td>\n", " <td>OR</td>\n", " <td>17.94</td>\n", " <td>0.0</td>\n", " <td>Jan-1996</td>\n", " <td>0.0</td>\n", " <td>15.0</td>\n", " <td>0.0</td>\n", " <td>27783.0</td>\n", " <td>53.9%</td>\n", " <td>38.0</td>\n", " <td>f</td>\n", " <td>461.73</td>\n", " <td>461.73</td>\n", " <td>3581.120000</td>\n", " <td>3581.12</td>\n", " <td>2538.27</td>\n", " <td>1042.85</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>0.00</td>\n", " <td>Jun-2016</td>\n", " <td>67.79</td>\n", " <td>Jun-2016</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>INDIVIDUAL</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", "0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months \n", "1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months \n", "2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months \n", "3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months \n", "4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months \n", "\n", " int_rate installment grade sub_grade emp_title emp_length \\\n", "0 10.65% 162.87 B B2 NaN 10+ years \n", "1 15.27% 59.83 C C4 Ryder < 1 year \n", "2 15.96% 84.33 C C5 NaN 10+ years \n", "3 13.49% 339.31 C C1 AIR RESOURCES BOARD 10+ years \n", "4 12.69% 67.79 B B5 University Medical Group 1 year \n", "\n", " home_ownership annual_inc verification_status issue_d loan_status \\\n", "0 RENT 24000.0 Verified Dec-2011 Fully Paid \n", "1 RENT 30000.0 Source Verified Dec-2011 Charged Off \n", "2 RENT 12252.0 Not Verified Dec-2011 Fully Paid \n", "3 RENT 49200.0 Source Verified Dec-2011 Fully Paid \n", "4 RENT 80000.0 Source Verified Dec-2011 Current \n", "\n", " pymnt_plan purpose title zip_code addr_state dti \\\n", "0 n credit_card Computer 860xx AZ 27.65 \n", "1 n car bike 309xx GA 1.00 \n", "2 n small_business real estate business 606xx IL 8.72 \n", "3 n other personel 917xx CA 20.00 \n", "4 n other Personal 972xx OR 17.94 \n", "\n", " delinq_2yrs earliest_cr_line inq_last_6mths open_acc pub_rec revol_bal \\\n", "0 0.0 Jan-1985 1.0 3.0 0.0 13648.0 \n", "1 0.0 Apr-1999 5.0 3.0 0.0 1687.0 \n", "2 0.0 Nov-2001 2.0 2.0 0.0 2956.0 \n", "3 0.0 Feb-1996 1.0 10.0 0.0 5598.0 \n", "4 0.0 Jan-1996 0.0 15.0 0.0 27783.0 \n", "\n", " revol_util total_acc initial_list_status out_prncp out_prncp_inv \\\n", "0 83.7% 9.0 f 0.00 0.00 \n", "1 9.4% 4.0 f 0.00 0.00 \n", "2 98.5% 10.0 f 0.00 0.00 \n", "3 21% 37.0 f 0.00 0.00 \n", "4 53.9% 38.0 f 461.73 461.73 \n", "\n", " total_pymnt total_pymnt_inv total_rec_prncp total_rec_int \\\n", "0 5863.155187 5833.84 5000.00 863.16 \n", "1 1008.710000 1008.71 456.46 435.17 \n", "2 3005.666844 3005.67 2400.00 605.67 \n", "3 12231.890000 12231.89 10000.00 2214.92 \n", "4 3581.120000 3581.12 2538.27 1042.85 \n", "\n", " total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d \\\n", "0 0.00 0.00 0.00 Jan-2015 \n", "1 0.00 117.08 1.11 Apr-2013 \n", "2 0.00 0.00 0.00 Jun-2014 \n", "3 16.97 0.00 0.00 Jan-2015 \n", "4 0.00 0.00 0.00 Jun-2016 \n", "\n", " last_pymnt_amnt last_credit_pull_d collections_12_mths_ex_med \\\n", "0 171.62 Jun-2016 0.0 \n", "1 119.66 Sep-2013 0.0 \n", "2 649.91 Jun-2016 0.0 \n", "3 357.48 Apr-2016 0.0 \n", "4 67.79 Jun-2016 0.0 \n", "\n", " policy_code application_type acc_now_delinq chargeoff_within_12_mths \\\n", "0 1.0 INDIVIDUAL 0.0 0.0 \n", "1 1.0 INDIVIDUAL 0.0 0.0 \n", "2 1.0 INDIVIDUAL 0.0 0.0 \n", "3 1.0 INDIVIDUAL 0.0 0.0 \n", "4 1.0 INDIVIDUAL 0.0 0.0 \n", "\n", " delinq_amnt pub_rec_bankruptcies tax_liens \n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "pd.options.display.max_columns = 99\n", "\n", "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n", "first_five" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.5506629943847656" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n", "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.6512298584\n", "4.64817810059\n", "4.64966869354\n", "4.65059375763\n", "4.64692020416\n", "4.64837551117\n", "4.64736747742\n", "4.64941596985\n", "4.64743518829\n", "4.64727210999\n", "4.6598777771\n", "4.6566619873\n", "4.66349601746\n", "4.89784526825\n", "0.880808830261\n" ] } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "for chunk in chunk_iter:\n", " print(chunk.memory_usage(deep=True).sum()/(1024*1024))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How many rows in the data set?" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "42538\n" ] } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "total_rows = 0\n", "for chunk in chunk_iter:\n", " total_rows += len(chunk)\n", "print(total_rows)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring the Data in Chunks\n", "\n", "## How many columns have a numeric type? How many have a string type?" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]\n", "[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]\n" ] } ], "source": [ "# Numeric columns\n", "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "numeric = []\n", "string = []\n", "for lc in loans_chunks:\n", " nums = lc.select_dtypes(include=[np.number]).shape[1]\n", " numeric.append(nums)\n", " strs = lc.select_dtypes(include=['object']).shape[1]\n", " string.append(strs)\n", "\n", "print(numeric)\n", "print(string)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n" ] } ], "source": [ "# Are string columns consistent across chunks?\n", "obj_cols = []\n", "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "\n", "for chunk in chunk_iter:\n", " chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n", " if len(obj_cols) > 0:\n", " is_same = obj_cols == chunk_obj_cols\n", " if not is_same:\n", " print(\"overall obj cols:\", obj_cols, \"\\n\")\n", " print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\") \n", " else:\n", " obj_cols = chunk_obj_cols" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Observation 1: By default -- 31 numeric columns and 21 string columns.\n", "\n", "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling let's ignore this column.\n", "\n", "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "term 2\n", "loan_status 9\n", "purpose 14\n", "grade 7\n", "sub_grade 35\n", "initial_list_status 1\n", "verification_status 3\n", "application_type 1\n", "emp_length 12\n", "home_ownership 5\n", "pymnt_plan 2\n" ] } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "uniques = {}\n", "for lc in loans_chunks:\n", " strings_only = lc.select_dtypes(include=['object'])\n", " cols = strings_only.columns\n", " for c in cols:\n", " val_counts = strings_only[c].value_counts()\n", " if c in uniques:\n", " uniques[c].append(val_counts)\n", " else:\n", " uniques[c] = [val_counts]\n", "\n", "uniques_combined = {}\n", "unique_stats = {\n", " 'column_name': [],\n", " 'total_values': [],\n", " 'unique_values': [],\n", "}\n", "for col in uniques:\n", " u_concat = pd.concat(uniques[col])\n", " u_group = u_concat.groupby(u_concat.index).sum()\n", " uniques_combined[col] = u_group\n", " if u_group.shape[0] < 50:\n", " print(col, u_group.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which float columns have no missing values and could be candidates for conversion to the integer type?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "member_id 3\n", "total_rec_int 3\n", "total_pymnt_inv 3\n", "total_pymnt 3\n", "revol_bal 3\n", "recoveries 3\n", "policy_code 3\n", "out_prncp_inv 3\n", "out_prncp 3\n", "total_rec_late_fee 3\n", "loan_amnt 3\n", "last_pymnt_amnt 3\n", "total_rec_prncp 3\n", "funded_amnt_inv 3\n", "funded_amnt 3\n", "dti 3\n", "collection_recovery_fee 3\n", "installment 3\n", "annual_inc 7\n", "inq_last_6mths 32\n", "total_acc 32\n", "delinq_2yrs 32\n", "pub_rec 32\n", "delinq_amnt 32\n", "open_acc 32\n", "acc_now_delinq 32\n", "tax_liens 108\n", "collections_12_mths_ex_med 148\n", "chargeoff_within_12_mths 148\n", "pub_rec_bankruptcies 1368\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "missing = []\n", "for lc in loans_chunks:\n", " floats = lc.select_dtypes(include=['float'])\n", " missing.append(floats.apply(pd.isnull).sum())\n", "\n", "combined_missing = pd.concat(missing)\n", "combined_missing.groupby(combined_missing.index).sum().sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate the total memory usage across all of the chunks." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "66.245146751403809" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "mem_usage = []\n", "\n", "for lc in loans_chunks:\n", " mem_usage.append(lc.memory_usage(deep=True).sum() / 1024 ** 2)\n", "\n", "sum(mem_usage)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optimizing String Columns\n", "\n", "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modelling." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['term',\n", " 'int_rate',\n", " 'grade',\n", " 'sub_grade',\n", " 'emp_title',\n", " 'emp_length',\n", " 'home_ownership',\n", " 'verification_status',\n", " 'issue_d',\n", " 'loan_status',\n", " 'pymnt_plan',\n", " 'purpose',\n", " 'title',\n", " 'zip_code',\n", " 'addr_state',\n", " 'earliest_cr_line',\n", " 'revol_util',\n", " 'initial_list_status',\n", " 'last_pymnt_d',\n", " 'last_credit_pull_d',\n", " 'application_type']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_cols" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n", "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "str_cols_vc = {}\n", "for chunk in chunk_iter:\n", " str_cols = chunk.select_dtypes(include=['object'])\n", " for col in str_cols.columns:\n", " current_col_vc = str_cols[col].value_counts()\n", " if col in str_cols_vc:\n", " str_cols_vc[col].append(current_col_vc)\n", " else:\n", " str_cols_vc[col] = [current_col_vc]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "## Combine the value counts.\n", "combined_vcs = {}\n", "\n", "for col in str_cols_vc:\n", " combined_vc = pd.concat(str_cols_vc[col])\n", " final_vc = combined_vc.groupby(combined_vc.index).sum()\n", " combined_vcs[col] = final_vc" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "term\n", " 36 months 31534\n", " 60 months 11001\n", "Name: term, dtype: int64\n", "-----------\n", "sub_grade\n", "A1 1142\n", "A2 1520\n", "A3 1823\n", "A4 2905\n", "A5 2793\n", "B1 1882\n", "B2 2113\n", "B3 2997\n", "B4 2590\n", "B5 2807\n", "C1 2264\n", "C2 2157\n", "C3 1658\n", "C4 1370\n", "C5 1291\n", "D1 1053\n", "D2 1485\n", "D3 1322\n", "D4 1140\n", "D5 1016\n", "E1 884\n", "E2 791\n", "E3 668\n", "E4 552\n", "E5 499\n", "F1 392\n", "F2 308\n", "F3 236\n", "F4 211\n", "F5 154\n", "G1 141\n", "G2 107\n", "G3 79\n", "G4 99\n", "G5 86\n", "Name: sub_grade, dtype: int64\n", "-----------\n", "emp_title\n", " old palm inc 1\n", " Brocade Communications 1\n", " CenturyLink 1\n", " Department of Homeland Security 1\n", " Down To Earth Distributors, Inc. 1\n", " Plaid, Inc. 1\n", " U.S. Dept. Of Homeland Security 1\n", " pacific crane maintenance company 1\n", "$260M '06 vintage technology venture capital firm 1\n", "(Collaborative) Abbott Nutrition Intl 1\n", "(self) Castleforte Group 1\n", "0CEAN VIEW INTL REALTY/C21 UNITED PLATINUM 1\n", "1 Cochran Hyundai 1\n", "1 and 1 Internet Inc. 1\n", "1)-Yavapai Regional Medical Center 2)- Dr. cantors office 1\n", "1-800 Contacts 1\n", "101 restaurant 1\n", "1013 communications 1\n", "1125 memorex dr 1\n", "1199SEIU Benefit and Pension Funds 1\n", "1199SEIU UHE 1\n", "123 Recovery 1\n", "123 appartment corp. 1\n", "128 Air Refueling Wing (USAF) 1\n", "1400 1\n", "15 1\n", "162 fighter wing 1\n", "16th MP BDE, U.S. Army 1\n", "1800-got-junk? 1\n", "182nd Airlift Wing Fire Dept. 1\n", " ..\n", "wyoming valley hospital 1\n", "wythe county community hospital 1\n", "xcel fire protection 1\n", "xerox cor. 1\n", "xerox corp 1\n", "xo communication 1\n", "xotic deliveries 1\n", "xpedite technologies inc 1\n", "xpedx 1\n", "yankee candle company 1\n", "ye olde pepper co 1\n", "yellow cab com. 1\n", "yellow cab taxi 1\n", "ymca 3\n", "yonkers fire department 1\n", "york hospital dental center 1\n", "yorkville physical therapy 1\n", "young and associates 1\n", "young atitudes aveda 1\n", "youth villages 1\n", "yrbnyc.com / Yellow Rat Bastard 1\n", "yrc 1\n", "yrcw 1\n", "zachry industral inc 1\n", "zakheim and lavrar 1\n", "zashko inc. 1\n", "zeno office solutions 1\n", "zion lutheran school 1\n", "zoll medical corp 1\n", "zozaya officiating 1\n", "Name: emp_title, dtype: int64\n", "-----------\n", "home_ownership\n", "MORTGAGE 18959\n", "NONE 8\n", "OTHER 136\n", "OWN 3251\n", "RENT 20181\n", "Name: home_ownership, dtype: int64\n", "-----------\n", "verification_status\n", "Not Verified 18758\n", "Source Verified 10306\n", "Verified 13471\n", "Name: verification_status, dtype: int64\n", "-----------\n", "issue_d\n", "Apr-2008 259\n", "Apr-2009 333\n", "Apr-2010 912\n", "Apr-2011 1563\n", "Aug-2007 74\n", "Aug-2008 100\n", "Aug-2009 446\n", "Aug-2010 1175\n", "Aug-2011 1934\n", "Dec-2007 172\n", "Dec-2008 253\n", "Dec-2009 658\n", "Dec-2010 1335\n", "Dec-2011 2267\n", "Feb-2008 306\n", "Feb-2009 302\n", "Feb-2010 682\n", "Feb-2011 1298\n", "Jan-2008 305\n", "Jan-2009 269\n", "Jan-2010 662\n", "Jan-2011 1380\n", "Jul-2007 63\n", "Jul-2008 141\n", "Jul-2009 411\n", "Jul-2010 1204\n", "Jul-2011 1875\n", "Jun-2007 24\n", "Jun-2008 124\n", "Jun-2009 406\n", "Jun-2010 1105\n", "Jun-2011 1835\n", "Mar-2008 402\n", "Mar-2009 324\n", "Mar-2010 828\n", "Mar-2011 1448\n", "May-2008 115\n", "May-2009 359\n", "May-2010 989\n", "May-2011 1704\n", "Nov-2007 112\n", "Nov-2008 209\n", "Nov-2009 662\n", "Nov-2010 1224\n", "Nov-2011 2232\n", "Oct-2007 105\n", "Oct-2008 122\n", "Oct-2009 604\n", "Oct-2010 1232\n", "Oct-2011 2118\n", "Sep-2007 53\n", "Sep-2008 57\n", "Sep-2009 507\n", "Sep-2010 1189\n", "Sep-2011 2067\n", "Name: issue_d, dtype: int64\n", "-----------\n", "purpose\n", "car 1615\n", "credit_card 5477\n", "debt_consolidation 19776\n", "educational 422\n", "home_improvement 3199\n", "house 426\n", "major_purchase 2311\n", "medical 753\n", "moving 629\n", "other 4425\n", "renewable_energy 106\n", "small_business 1992\n", "vacation 400\n", "wedding 1004\n", "Name: purpose, dtype: int64\n", "-----------\n", "earliest_cr_line\n", "Apr-1964 3\n", "Apr-1966 1\n", "Apr-1967 4\n", "Apr-1968 1\n", "Apr-1969 1\n", "Apr-1970 3\n", "Apr-1971 6\n", "Apr-1972 2\n", "Apr-1973 7\n", "Apr-1974 6\n", "Apr-1975 11\n", "Apr-1976 7\n", "Apr-1977 10\n", "Apr-1978 14\n", "Apr-1979 15\n", "Apr-1980 17\n", "Apr-1981 15\n", "Apr-1982 23\n", "Apr-1983 29\n", "Apr-1984 27\n", "Apr-1985 31\n", "Apr-1986 29\n", "Apr-1987 48\n", "Apr-1988 55\n", "Apr-1989 64\n", "Apr-1990 79\n", "Apr-1991 77\n", "Apr-1992 90\n", "Apr-1993 100\n", "Apr-1994 134\n", " ... \n", "Sep-1979 20\n", "Sep-1980 18\n", "Sep-1981 28\n", "Sep-1982 16\n", "Sep-1983 33\n", "Sep-1984 46\n", "Sep-1985 41\n", "Sep-1986 59\n", "Sep-1987 59\n", "Sep-1988 63\n", "Sep-1989 83\n", "Sep-1990 93\n", "Sep-1991 70\n", "Sep-1992 72\n", "Sep-1993 124\n", "Sep-1994 119\n", "Sep-1995 181\n", "Sep-1996 187\n", "Sep-1997 196\n", "Sep-1998 267\n", "Sep-1999 277\n", "Sep-2000 325\n", "Sep-2001 282\n", "Sep-2002 251\n", "Sep-2003 220\n", "Sep-2004 221\n", "Sep-2005 162\n", "Sep-2006 150\n", "Sep-2007 63\n", "Sep-2008 8\n", "Name: earliest_cr_line, dtype: int64\n", "-----------\n", "revol_util\n", "0% 1070\n", "0.01% 1\n", "0.03% 1\n", "0.04% 1\n", "0.05% 1\n", "0.1% 61\n", "0.12% 1\n", "0.16% 1\n", "0.2% 64\n", "0.3% 43\n", "0.4% 45\n", "0.46% 1\n", "0.49% 1\n", "0.5% 49\n", "0.54% 1\n", "0.6% 39\n", "0.7% 47\n", "0.75% 1\n", "0.8% 43\n", "0.83% 1\n", "0.86% 1\n", "0.9% 41\n", "1% 51\n", "1.1% 39\n", "1.2% 37\n", "1.3% 36\n", "1.4% 32\n", "1.5% 33\n", "1.6% 24\n", "1.7% 30\n", " ... \n", "97% 31\n", "97.1% 39\n", "97.2% 32\n", "97.3% 36\n", "97.4% 33\n", "97.5% 35\n", "97.6% 35\n", "97.7% 45\n", "97.8% 38\n", "97.9% 27\n", "98% 28\n", "98.1% 25\n", "98.2% 25\n", "98.3% 36\n", "98.4% 31\n", "98.5% 29\n", "98.6% 36\n", "98.7% 27\n", "98.8% 38\n", "98.9% 20\n", "99% 35\n", "99.1% 33\n", "99.2% 22\n", "99.3% 32\n", "99.4% 24\n", "99.5% 24\n", "99.6% 27\n", "99.7% 32\n", "99.8% 25\n", "99.9% 29\n", "Name: revol_util, dtype: int64\n", "-----------\n", "last_pymnt_d\n", "Apr-2008 23\n", "Apr-2009 72\n", "Apr-2010 145\n", "Apr-2011 519\n", "Apr-2012 781\n", "Apr-2013 890\n", "Apr-2014 682\n", "Apr-2015 141\n", "Apr-2016 178\n", "Aug-2008 31\n", "Aug-2009 66\n", "Aug-2010 198\n", "Aug-2011 479\n", "Aug-2012 868\n", "Aug-2013 744\n", "Aug-2014 836\n", "Aug-2015 217\n", "Dec-2007 2\n", "Dec-2008 31\n", "Dec-2009 116\n", "Dec-2010 332\n", "Dec-2011 585\n", "Dec-2012 732\n", "Dec-2013 799\n", "Dec-2014 949\n", "Dec-2015 185\n", "Feb-2008 8\n", "Feb-2009 65\n", "Feb-2010 148\n", "Feb-2011 437\n", " ... \n", "May-2011 448\n", "May-2012 773\n", "May-2013 943\n", "May-2014 686\n", "May-2015 151\n", "May-2016 203\n", "Nov-2008 31\n", "Nov-2009 80\n", "Nov-2010 258\n", "Nov-2011 487\n", "Nov-2012 782\n", "Nov-2013 701\n", "Nov-2014 593\n", "Nov-2015 231\n", "Oct-2008 55\n", "Oct-2009 85\n", "Oct-2010 292\n", "Oct-2011 482\n", "Oct-2012 853\n", "Oct-2013 712\n", "Oct-2014 813\n", "Oct-2015 200\n", "Sep-2008 40\n", "Sep-2009 55\n", "Sep-2010 215\n", "Sep-2011 491\n", "Sep-2012 802\n", "Sep-2013 712\n", "Sep-2014 694\n", "Sep-2015 211\n", "Name: last_pymnt_d, dtype: int64\n", "-----------\n", "last_credit_pull_d\n", "Apr-2009 24\n", "Apr-2010 77\n", "Apr-2011 177\n", "Apr-2012 326\n", "Apr-2013 445\n", "Apr-2014 507\n", "Apr-2015 404\n", "Apr-2016 912\n", "Aug-2007 17\n", "Aug-2008 15\n", "Aug-2009 32\n", "Aug-2010 68\n", "Aug-2011 290\n", "Aug-2012 391\n", "Aug-2013 375\n", "Aug-2014 528\n", "Aug-2015 435\n", "Dec-2007 3\n", "Dec-2008 7\n", "Dec-2009 73\n", "Dec-2010 156\n", "Dec-2011 268\n", "Dec-2012 391\n", "Dec-2013 431\n", "Dec-2014 567\n", "Dec-2015 656\n", "Feb-2008 3\n", "Feb-2009 20\n", "Feb-2010 87\n", "Feb-2011 199\n", " ... \n", "May-2013 470\n", "May-2014 477\n", "May-2015 433\n", "May-2016 766\n", "Nov-2007 3\n", "Nov-2009 50\n", "Nov-2010 126\n", "Nov-2011 221\n", "Nov-2012 432\n", "Nov-2013 491\n", "Nov-2014 500\n", "Nov-2015 516\n", "Oct-2007 3\n", "Oct-2008 13\n", "Oct-2009 51\n", "Oct-2010 120\n", "Oct-2011 162\n", "Oct-2012 351\n", "Oct-2013 429\n", "Oct-2014 529\n", "Oct-2015 513\n", "Sep-2007 5\n", "Sep-2008 9\n", "Sep-2009 21\n", "Sep-2010 119\n", "Sep-2011 175\n", "Sep-2012 414\n", "Sep-2013 408\n", "Sep-2014 564\n", "Sep-2015 531\n", "Name: last_credit_pull_d, dtype: int64\n", "-----------\n" ] } ], "source": [ "for col in useful_obj_cols:\n", " print(col)\n", " print(combined_vcs[col])\n", " print(\"-----------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert to category" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "convert_col_dtypes = {\n", " \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n", " \"verification_status\": \"category\", \"purpose\": \"category\"\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert `term` and `revol_util` to numerical by data cleaning.\n", "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>term</th>\n", " <th>sub_grade</th>\n", " <th>emp_title</th>\n", " <th>home_ownership</th>\n", " <th>verification_status</th>\n", " <th>issue_d</th>\n", " <th>purpose</th>\n", " <th>earliest_cr_line</th>\n", " <th>revol_util</th>\n", " <th>last_pymnt_d</th>\n", " <th>last_credit_pull_d</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>42000</th>\n", " <td>36 months</td>\n", " <td>C2</td>\n", " <td>Best Buy</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Jul-2000</td>\n", " <td>100.7%</td>\n", " <td>Feb-2011</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42001</th>\n", " <td>36 months</td>\n", " <td>G2</td>\n", " <td>CVS PHARMACY</td>\n", " <td>OWN</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Mar-1989</td>\n", " <td>51.9%</td>\n", " <td>Nov-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42002</th>\n", " <td>36 months</td>\n", " <td>E4</td>\n", " <td>General Motors</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Dec-1998</td>\n", " <td>80.7%</td>\n", " <td>Feb-2011</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42003</th>\n", " <td>36 months</td>\n", " <td>G4</td>\n", " <td>usa medical center</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Jul-1995</td>\n", " <td>57.2%</td>\n", " <td>Feb-2011</td>\n", " <td>Jun-2011</td>\n", " </tr>\n", " <tr>\n", " <th>42004</th>\n", " <td>36 months</td>\n", " <td>B3</td>\n", " <td>InvestSource Inc</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Sep-2005</td>\n", " <td>74%</td>\n", " <td>Mar-2010</td>\n", " <td>Aug-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42005</th>\n", " <td>36 months</td>\n", " <td>D1</td>\n", " <td>kaiser</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Nov-2000</td>\n", " <td>37.6%</td>\n", " <td>Mar-2010</td>\n", " <td>Apr-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42006</th>\n", " <td>36 months</td>\n", " <td>C1</td>\n", " <td>Panoramic Software</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Dec-1995</td>\n", " <td>56%</td>\n", " <td>Apr-2010</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42007</th>\n", " <td>36 months</td>\n", " <td>D5</td>\n", " <td>AT&T Inc.</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>small_business</td>\n", " <td>Feb-2000</td>\n", " <td>49.7%</td>\n", " <td>Feb-2009</td>\n", " <td>Oct-2009</td>\n", " </tr>\n", " <tr>\n", " <th>42008</th>\n", " <td>36 months</td>\n", " <td>D4</td>\n", " <td>D&Y</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Oct-1992</td>\n", " <td>71.5%</td>\n", " <td>Jan-2010</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42009</th>\n", " <td>36 months</td>\n", " <td>D5</td>\n", " <td>Frederick's of Hollywood</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Oct-1994</td>\n", " <td>49%</td>\n", " <td>Jul-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42010</th>\n", " <td>36 months</td>\n", " <td>F2</td>\n", " <td>Yuma Nursing Home Ctr</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Oct-1993</td>\n", " <td>18.1%</td>\n", " <td>Feb-2011</td>\n", " <td>Nov-2014</td>\n", " </tr>\n", " <tr>\n", " <th>42011</th>\n", " <td>36 months</td>\n", " <td>E5</td>\n", " <td>Akiva Incorporated</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>small_business</td>\n", " <td>Jan-2006</td>\n", " <td>30.8%</td>\n", " <td>Jan-2009</td>\n", " <td>Aug-2009</td>\n", " </tr>\n", " <tr>\n", " <th>42012</th>\n", " <td>36 months</td>\n", " <td>G4</td>\n", " <td>Citrix Online</td>\n", " <td>OWN</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Sep-1998</td>\n", " <td>98.1%</td>\n", " <td>Feb-2011</td>\n", " <td>Mar-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42013</th>\n", " <td>36 months</td>\n", " <td>G3</td>\n", " <td>Virginia College Online</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>other</td>\n", " <td>Dec-1990</td>\n", " <td>78.4%</td>\n", " <td>Oct-2008</td>\n", " <td>Sep-2008</td>\n", " </tr>\n", " <tr>\n", " <th>42014</th>\n", " <td>36 months</td>\n", " <td>E2</td>\n", " <td>Zone 7 Water Agency</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Nov-1998</td>\n", " <td>94.3%</td>\n", " <td>May-2009</td>\n", " <td>May-2013</td>\n", " </tr>\n", " <tr>\n", " <th>42015</th>\n", " <td>36 months</td>\n", " <td>G5</td>\n", " <td>Integrity Appraisals</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Feb-1997</td>\n", " <td>78.7%</td>\n", " <td>Nov-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42016</th>\n", " <td>36 months</td>\n", " <td>G4</td>\n", " <td>Stordok Document Destruction/Moreno Roofing Co.</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>credit_card</td>\n", " <td>Jun-1982</td>\n", " <td>0%</td>\n", " <td>Feb-2011</td>\n", " <td>Dec-2014</td>\n", " </tr>\n", " <tr>\n", " <th>42017</th>\n", " <td>36 months</td>\n", " <td>E4</td>\n", " <td>Aladdin Capital Management</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>credit_card</td>\n", " <td>Nov-1999</td>\n", " <td>65.2%</td>\n", " <td>Nov-2009</td>\n", " <td>Nov-2009</td>\n", " </tr>\n", " <tr>\n", " <th>42018</th>\n", " <td>36 months</td>\n", " <td>D4</td>\n", " <td>Cayuga Medical Center</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Nov-1994</td>\n", " <td>58.9%</td>\n", " <td>Feb-2011</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42019</th>\n", " <td>36 months</td>\n", " <td>E1</td>\n", " <td>North Sound PT</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jan-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Feb-2000</td>\n", " <td>94%</td>\n", " <td>Oct-2009</td>\n", " <td>Mar-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42020</th>\n", " <td>36 months</td>\n", " <td>C3</td>\n", " <td>UHS</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jan-2008</td>\n", " <td>medical</td>\n", " <td>Oct-1998</td>\n", " <td>83.1%</td>\n", " <td>Feb-2011</td>\n", " <td>Feb-2011</td>\n", " </tr>\n", " <tr>\n", " <th>42021</th>\n", " <td>36 months</td>\n", " <td>D4</td>\n", " <td>Adecco</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Nov-2002</td>\n", " <td>60.7%</td>\n", " <td>Aug-2009</td>\n", " <td>Jan-2011</td>\n", " </tr>\n", " <tr>\n", " <th>42022</th>\n", " <td>36 months</td>\n", " <td>E5</td>\n", " <td>Greatwide logistics Services</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>home_improvement</td>\n", " <td>Oct-1994</td>\n", " <td>49.9%</td>\n", " <td>Sep-2009</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42023</th>\n", " <td>36 months</td>\n", " <td>D4</td>\n", " <td>ASU University</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>credit_card</td>\n", " <td>Sep-2000</td>\n", " <td>59.9%</td>\n", " <td>Feb-2011</td>\n", " <td>Feb-2011</td>\n", " </tr>\n", " <tr>\n", " <th>42024</th>\n", " <td>36 months</td>\n", " <td>B5</td>\n", " <td>Wells Fargo</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Nov-1995</td>\n", " <td>32.6%</td>\n", " <td>Feb-2011</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42025</th>\n", " <td>36 months</td>\n", " <td>D5</td>\n", " <td>lakeside memorial hospital</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jan-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Nov-1997</td>\n", " <td>77.4%</td>\n", " <td>Jan-2011</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42026</th>\n", " <td>36 months</td>\n", " <td>B5</td>\n", " <td>Controller Group</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>small_business</td>\n", " <td>Mar-1998</td>\n", " <td>46.5%</td>\n", " <td>Oct-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42027</th>\n", " <td>36 months</td>\n", " <td>F1</td>\n", " <td>David Johnston Equestrian Training Lic.#29026</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>small_business</td>\n", " <td>Nov-2000</td>\n", " <td>18.1%</td>\n", " <td>Feb-2011</td>\n", " <td>Feb-2011</td>\n", " </tr>\n", " <tr>\n", " <th>42028</th>\n", " <td>36 months</td>\n", " <td>E3</td>\n", " <td>Lockheed Martin Corp.</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>debt_consolidation</td>\n", " <td>Dec-1983</td>\n", " <td>45.9%</td>\n", " <td>Feb-2011</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42029</th>\n", " <td>36 months</td>\n", " <td>G4</td>\n", " <td>Self</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Feb-2008</td>\n", " <td>small_business</td>\n", " <td>Apr-1994</td>\n", " <td>92.3%</td>\n", " <td>Apr-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>42508</th>\n", " <td>36 months</td>\n", " <td>D5</td>\n", " <td>SUNY- ESF</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jul-2007</td>\n", " <td>credit_card</td>\n", " <td>Dec-2004</td>\n", " <td>49.3%</td>\n", " <td>Jul-2010</td>\n", " <td>Sep-2012</td>\n", " </tr>\n", " <tr>\n", " <th>42509</th>\n", " <td>36 months</td>\n", " <td>D5</td>\n", " <td>Clinton Shop Rite of Hunterdon County</td>\n", " <td>OWN</td>\n", " <td>Not Verified</td>\n", " <td>Jul-2007</td>\n", " <td>car</td>\n", " <td>Dec-2006</td>\n", " <td>NaN</td>\n", " <td>Feb-2010</td>\n", " <td>Jul-2013</td>\n", " </tr>\n", " <tr>\n", " <th>42510</th>\n", " <td>36 months</td>\n", " <td>B4</td>\n", " <td>Scheduall</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jul-2007</td>\n", " <td>debt_consolidation</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Feb-2008</td>\n", " <td>May-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42511</th>\n", " <td>36 months</td>\n", " <td>E5</td>\n", " <td>GA-PCOM</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jul-2007</td>\n", " <td>educational</td>\n", " <td>Sep-1999</td>\n", " <td>85%</td>\n", " <td>Aug-2010</td>\n", " <td>Aug-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42512</th>\n", " <td>36 months</td>\n", " <td>A1</td>\n", " <td>Tzigane Inc</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jul-2007</td>\n", " <td>debt_consolidation</td>\n", " <td>Mar-1984</td>\n", " <td>2.2%</td>\n", " <td>Jul-2010</td>\n", " <td>Jun-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42513</th>\n", " <td>36 months</td>\n", " <td>C2</td>\n", " <td>Yale University</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>debt_consolidation</td>\n", " <td>Jan-1996</td>\n", " <td>66%</td>\n", " <td>Jun-2010</td>\n", " <td>Oct-2014</td>\n", " </tr>\n", " <tr>\n", " <th>42514</th>\n", " <td>36 months</td>\n", " <td>B4</td>\n", " <td>Brick Township board of education</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>debt_consolidation</td>\n", " <td>Jul-2004</td>\n", " <td>63.5%</td>\n", " <td>Jun-2010</td>\n", " <td>Feb-2015</td>\n", " </tr>\n", " <tr>\n", " <th>42515</th>\n", " <td>36 months</td>\n", " <td>B2</td>\n", " <td>Classic Components</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jul-2010</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42516</th>\n", " <td>36 months</td>\n", " <td>C4</td>\n", " <td>Compensation Solutions</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Apr-2010</td>\n", " <td>Apr-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42517</th>\n", " <td>36 months</td>\n", " <td>C3</td>\n", " <td>Stanford University Libraries, LOCKSS Project</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jul-2010</td>\n", " <td>Sep-2014</td>\n", " </tr>\n", " <tr>\n", " <th>42518</th>\n", " <td>36 months</td>\n", " <td>E3</td>\n", " <td>Macy's</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>Jun-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42519</th>\n", " <td>36 months</td>\n", " <td>A3</td>\n", " <td>Diamond Management and Technology Consultants</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Aug-2009</td>\n", " <td>Aug-2009</td>\n", " </tr>\n", " <tr>\n", " <th>42520</th>\n", " <td>36 months</td>\n", " <td>A2</td>\n", " <td>U.S. Bank</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jan-2008</td>\n", " <td>Jun-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42521</th>\n", " <td>36 months</td>\n", " <td>A2</td>\n", " <td>NC</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jan-2008</td>\n", " <td>Jun-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42522</th>\n", " <td>36 months</td>\n", " <td>A5</td>\n", " <td>College Pro Painters</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Mar-2010</td>\n", " <td>Mar-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42523</th>\n", " <td>36 months</td>\n", " <td>A1</td>\n", " <td>Mana Products</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>Apr-2014</td>\n", " </tr>\n", " <tr>\n", " <th>42524</th>\n", " <td>36 months</td>\n", " <td>C4</td>\n", " <td>Apto Solutions</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jul-2010</td>\n", " <td>Jun-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42525</th>\n", " <td>36 months</td>\n", " <td>D3</td>\n", " <td>Infinitely law group</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>May-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42526</th>\n", " <td>36 months</td>\n", " <td>C4</td>\n", " <td>Town of Plainville</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Aug-2008</td>\n", " <td>Jun-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42527</th>\n", " <td>36 months</td>\n", " <td>B2</td>\n", " <td>Tanks Tavern</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jul-2010</td>\n", " <td>Jun-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42528</th>\n", " <td>36 months</td>\n", " <td>B3</td>\n", " <td>NaN</td>\n", " <td>OWN</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>May-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42529</th>\n", " <td>36 months</td>\n", " <td>B5</td>\n", " <td>NaN</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jul-2010</td>\n", " <td>Jul-2010</td>\n", " </tr>\n", " <tr>\n", " <th>42530</th>\n", " <td>36 months</td>\n", " <td>B4</td>\n", " <td>Air Force</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>May-2008</td>\n", " <td>Jun-2016</td>\n", " </tr>\n", " <tr>\n", " <th>42531</th>\n", " <td>36 months</td>\n", " <td>C1</td>\n", " <td>NaN</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Mar-2008</td>\n", " <td>Feb-2013</td>\n", " </tr>\n", " <tr>\n", " <th>42532</th>\n", " <td>36 months</td>\n", " <td>B4</td>\n", " <td>Halping hands company inc.</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>Sep-2014</td>\n", " </tr>\n", " <tr>\n", " <th>42533</th>\n", " <td>36 months</td>\n", " <td>B3</td>\n", " <td>NaN</td>\n", " <td>RENT</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>May-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42534</th>\n", " <td>36 months</td>\n", " <td>A5</td>\n", " <td>NaN</td>\n", " <td>NONE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>Aug-2007</td>\n", " </tr>\n", " <tr>\n", " <th>42535</th>\n", " <td>36 months</td>\n", " <td>A3</td>\n", " <td>Homemaker</td>\n", " <td>MORTGAGE</td>\n", " <td>Not Verified</td>\n", " <td>Jun-2007</td>\n", " <td>other</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Jun-2010</td>\n", " <td>Feb-2015</td>\n", " </tr>\n", " <tr>\n", " <th>42536</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>42537</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>538 rows × 11 columns</p>\n", "</div>" ], "text/plain": [ " term sub_grade emp_title \\\n", "42000 36 months C2 Best Buy \n", "42001 36 months G2 CVS PHARMACY \n", "42002 36 months E4 General Motors \n", "42003 36 months G4 usa medical center \n", "42004 36 months B3 InvestSource Inc \n", "42005 36 months D1 kaiser \n", "42006 36 months C1 Panoramic Software \n", "42007 36 months D5 AT&T Inc. \n", "42008 36 months D4 D&Y \n", "42009 36 months D5 Frederick's of Hollywood \n", "42010 36 months F2 Yuma Nursing Home Ctr \n", "42011 36 months E5 Akiva Incorporated \n", "42012 36 months G4 Citrix Online \n", "42013 36 months G3 Virginia College Online \n", "42014 36 months E2 Zone 7 Water Agency \n", "42015 36 months G5 Integrity Appraisals \n", "42016 36 months G4 Stordok Document Destruction/Moreno Roofing Co. \n", "42017 36 months E4 Aladdin Capital Management \n", "42018 36 months D4 Cayuga Medical Center \n", "42019 36 months E1 North Sound PT \n", "42020 36 months C3 UHS \n", "42021 36 months D4 Adecco \n", "42022 36 months E5 Greatwide logistics Services \n", "42023 36 months D4 ASU University \n", "42024 36 months B5 Wells Fargo \n", "42025 36 months D5 lakeside memorial hospital \n", "42026 36 months B5 Controller Group \n", "42027 36 months F1 David Johnston Equestrian Training Lic.#29026 \n", "42028 36 months E3 Lockheed Martin Corp. \n", "42029 36 months G4 Self \n", "... ... ... ... \n", "42508 36 months D5 SUNY- ESF \n", "42509 36 months D5 Clinton Shop Rite of Hunterdon County \n", "42510 36 months B4 Scheduall \n", "42511 36 months E5 GA-PCOM \n", "42512 36 months A1 Tzigane Inc \n", "42513 36 months C2 Yale University \n", "42514 36 months B4 Brick Township board of education \n", "42515 36 months B2 Classic Components \n", "42516 36 months C4 Compensation Solutions \n", "42517 36 months C3 Stanford University Libraries, LOCKSS Project \n", "42518 36 months E3 Macy's \n", "42519 36 months A3 Diamond Management and Technology Consultants \n", "42520 36 months A2 U.S. Bank \n", "42521 36 months A2 NC \n", "42522 36 months A5 College Pro Painters \n", "42523 36 months A1 Mana Products \n", "42524 36 months C4 Apto Solutions \n", "42525 36 months D3 Infinitely law group \n", "42526 36 months C4 Town of Plainville \n", "42527 36 months B2 Tanks Tavern \n", "42528 36 months B3 NaN \n", "42529 36 months B5 NaN \n", "42530 36 months B4 Air Force \n", "42531 36 months C1 NaN \n", "42532 36 months B4 Halping hands company inc. \n", "42533 36 months B3 NaN \n", "42534 36 months A5 NaN \n", "42535 36 months A3 Homemaker \n", "42536 NaN NaN NaN \n", "42537 NaN NaN NaN \n", "\n", " home_ownership verification_status issue_d purpose \\\n", "42000 RENT Not Verified Feb-2008 debt_consolidation \n", "42001 OWN Not Verified Feb-2008 debt_consolidation \n", "42002 RENT Not Verified Feb-2008 debt_consolidation \n", "42003 RENT Not Verified Feb-2008 debt_consolidation \n", "42004 RENT Not Verified Feb-2008 debt_consolidation \n", "42005 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42006 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42007 RENT Not Verified Feb-2008 small_business \n", "42008 RENT Not Verified Feb-2008 debt_consolidation \n", "42009 RENT Not Verified Feb-2008 debt_consolidation \n", "42010 RENT Not Verified Feb-2008 debt_consolidation \n", "42011 RENT Not Verified Feb-2008 small_business \n", "42012 OWN Not Verified Feb-2008 debt_consolidation \n", "42013 MORTGAGE Not Verified Feb-2008 other \n", "42014 RENT Not Verified Feb-2008 debt_consolidation \n", "42015 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42016 MORTGAGE Not Verified Feb-2008 credit_card \n", "42017 RENT Not Verified Feb-2008 credit_card \n", "42018 RENT Not Verified Feb-2008 debt_consolidation \n", "42019 RENT Not Verified Jan-2008 debt_consolidation \n", "42020 RENT Not Verified Jan-2008 medical \n", "42021 RENT Not Verified Feb-2008 debt_consolidation \n", "42022 MORTGAGE Not Verified Feb-2008 home_improvement \n", "42023 RENT Not Verified Feb-2008 credit_card \n", "42024 RENT Not Verified Feb-2008 debt_consolidation \n", "42025 MORTGAGE Not Verified Jan-2008 debt_consolidation \n", "42026 RENT Not Verified Feb-2008 small_business \n", "42027 RENT Not Verified Feb-2008 small_business \n", "42028 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42029 MORTGAGE Not Verified Feb-2008 small_business \n", "... ... ... ... ... \n", "42508 RENT Not Verified Jul-2007 credit_card \n", "42509 OWN Not Verified Jul-2007 car \n", "42510 MORTGAGE Not Verified Jul-2007 debt_consolidation \n", "42511 RENT Not Verified Jul-2007 educational \n", "42512 MORTGAGE Not Verified Jul-2007 debt_consolidation \n", "42513 RENT Not Verified Jun-2007 debt_consolidation \n", "42514 MORTGAGE Not Verified Jun-2007 debt_consolidation \n", "42515 RENT Not Verified Jun-2007 other \n", "42516 RENT Not Verified Jun-2007 other \n", "42517 RENT Not Verified Jun-2007 other \n", "42518 RENT Not Verified Jun-2007 other \n", "42519 MORTGAGE Not Verified Jun-2007 other \n", "42520 MORTGAGE Not Verified Jun-2007 other \n", "42521 MORTGAGE Not Verified Jun-2007 other \n", "42522 MORTGAGE Not Verified Jun-2007 other \n", "42523 RENT Not Verified Jun-2007 other \n", "42524 RENT Not Verified Jun-2007 other \n", "42525 RENT Not Verified Jun-2007 other \n", "42526 RENT Not Verified Jun-2007 other \n", "42527 RENT Not Verified Jun-2007 other \n", "42528 OWN Not Verified Jun-2007 other \n", "42529 RENT Not Verified Jun-2007 other \n", "42530 RENT Not Verified Jun-2007 other \n", "42531 RENT Not Verified Jun-2007 other \n", "42532 RENT Not Verified Jun-2007 other \n", "42533 RENT Not Verified Jun-2007 other \n", "42534 NONE Not Verified Jun-2007 other \n", "42535 MORTGAGE Not Verified Jun-2007 other \n", "42536 NaN NaN NaN NaN \n", "42537 NaN NaN NaN NaN \n", "\n", " earliest_cr_line revol_util last_pymnt_d last_credit_pull_d \n", "42000 Jul-2000 100.7% Feb-2011 Jun-2016 \n", "42001 Mar-1989 51.9% Nov-2008 Jun-2016 \n", "42002 Dec-1998 80.7% Feb-2011 Jun-2016 \n", "42003 Jul-1995 57.2% Feb-2011 Jun-2011 \n", "42004 Sep-2005 74% Mar-2010 Aug-2010 \n", "42005 Nov-2000 37.6% Mar-2010 Apr-2010 \n", "42006 Dec-1995 56% Apr-2010 Jun-2016 \n", "42007 Feb-2000 49.7% Feb-2009 Oct-2009 \n", "42008 Oct-1992 71.5% Jan-2010 Jun-2016 \n", "42009 Oct-1994 49% Jul-2008 Jun-2016 \n", "42010 Oct-1993 18.1% Feb-2011 Nov-2014 \n", "42011 Jan-2006 30.8% Jan-2009 Aug-2009 \n", "42012 Sep-1998 98.1% Feb-2011 Mar-2016 \n", "42013 Dec-1990 78.4% Oct-2008 Sep-2008 \n", "42014 Nov-1998 94.3% May-2009 May-2013 \n", "42015 Feb-1997 78.7% Nov-2008 Jun-2016 \n", "42016 Jun-1982 0% Feb-2011 Dec-2014 \n", "42017 Nov-1999 65.2% Nov-2009 Nov-2009 \n", "42018 Nov-1994 58.9% Feb-2011 Jun-2016 \n", "42019 Feb-2000 94% Oct-2009 Mar-2010 \n", "42020 Oct-1998 83.1% Feb-2011 Feb-2011 \n", "42021 Nov-2002 60.7% Aug-2009 Jan-2011 \n", "42022 Oct-1994 49.9% Sep-2009 Jun-2016 \n", "42023 Sep-2000 59.9% Feb-2011 Feb-2011 \n", "42024 Nov-1995 32.6% Feb-2011 Jun-2016 \n", "42025 Nov-1997 77.4% Jan-2011 Jun-2016 \n", "42026 Mar-1998 46.5% Oct-2008 Jun-2016 \n", "42027 Nov-2000 18.1% Feb-2011 Feb-2011 \n", "42028 Dec-1983 45.9% Feb-2011 Jun-2016 \n", "42029 Apr-1994 92.3% Apr-2008 Jun-2016 \n", "... ... ... ... ... \n", "42508 Dec-2004 49.3% Jul-2010 Sep-2012 \n", "42509 Dec-2006 NaN Feb-2010 Jul-2013 \n", "42510 NaN NaN Feb-2008 May-2007 \n", "42511 Sep-1999 85% Aug-2010 Aug-2010 \n", "42512 Mar-1984 2.2% Jul-2010 Jun-2010 \n", "42513 Jan-1996 66% Jun-2010 Oct-2014 \n", "42514 Jul-2004 63.5% Jun-2010 Feb-2015 \n", "42515 NaN NaN Jul-2010 Jun-2016 \n", "42516 NaN NaN Apr-2010 Apr-2010 \n", "42517 NaN NaN Jul-2010 Sep-2014 \n", "42518 NaN NaN Jun-2010 Jun-2010 \n", "42519 NaN NaN Aug-2009 Aug-2009 \n", "42520 NaN NaN Jan-2008 Jun-2007 \n", "42521 NaN NaN Jan-2008 Jun-2007 \n", "42522 NaN NaN Mar-2010 Mar-2010 \n", "42523 NaN NaN Jun-2010 Apr-2014 \n", "42524 NaN NaN Jul-2010 Jun-2010 \n", "42525 NaN NaN May-2008 Jun-2016 \n", "42526 NaN NaN Aug-2008 Jun-2007 \n", "42527 NaN NaN Jul-2010 Jun-2010 \n", "42528 NaN NaN Jun-2010 May-2007 \n", "42529 NaN NaN Jul-2010 Jul-2010 \n", "42530 NaN NaN May-2008 Jun-2016 \n", "42531 NaN NaN Mar-2008 Feb-2013 \n", "42532 NaN NaN Jun-2010 Sep-2014 \n", "42533 NaN NaN Jun-2010 May-2007 \n", "42534 NaN NaN Jun-2010 Aug-2007 \n", "42535 NaN NaN Jun-2010 Feb-2015 \n", "42536 NaN NaN NaN NaN \n", "42537 NaN NaN NaN NaN \n", "\n", "[538 rows x 11 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk[useful_obj_cols]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id object\n", "member_id float64\n", "loan_amnt float64\n", "funded_amnt float64\n", "funded_amnt_inv float64\n", "term float64\n", "int_rate object\n", "installment float64\n", "grade object\n", "sub_grade category\n", "emp_title object\n", "emp_length object\n", "home_ownership category\n", "annual_inc float64\n", "verification_status category\n", "issue_d datetime64[ns]\n", "loan_status object\n", "pymnt_plan object\n", "purpose category\n", "title object\n", "zip_code object\n", "addr_state object\n", "dti float64\n", "delinq_2yrs float64\n", "earliest_cr_line datetime64[ns]\n", "inq_last_6mths float64\n", "open_acc float64\n", "pub_rec float64\n", "revol_bal float64\n", "revol_util float64\n", "total_acc float64\n", "initial_list_status object\n", "out_prncp float64\n", "out_prncp_inv float64\n", "total_pymnt float64\n", "total_pymnt_inv float64\n", "total_rec_prncp float64\n", "total_rec_int float64\n", "total_rec_late_fee float64\n", "recoveries float64\n", "collection_recovery_fee float64\n", "last_pymnt_d datetime64[ns]\n", "last_pymnt_amnt float64\n", "last_credit_pull_d datetime64[ns]\n", "collections_12_mths_ex_med float64\n", "policy_code float64\n", "application_type object\n", "acc_now_delinq float64\n", "chargeoff_within_12_mths float64\n", "delinq_amnt float64\n", "pub_rec_bankruptcies float64\n", "tax_liens float64\n", "dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " \n", "chunk.dtypes" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'acc_now_delinq': 32,\n", " 'annual_inc': 7,\n", " 'chargeoff_within_12_mths': 148,\n", " 'collection_recovery_fee': 3,\n", " 'collections_12_mths_ex_med': 148,\n", " 'delinq_2yrs': 32,\n", " 'delinq_amnt': 32,\n", " 'dti': 3,\n", " 'funded_amnt': 3,\n", " 'funded_amnt_inv': 3,\n", " 'inq_last_6mths': 32,\n", " 'installment': 3,\n", " 'last_pymnt_amnt': 3,\n", " 'loan_amnt': 3,\n", " 'member_id': 3,\n", " 'open_acc': 32,\n", " 'out_prncp': 3,\n", " 'out_prncp_inv': 3,\n", " 'policy_code': 3,\n", " 'pub_rec': 32,\n", " 'pub_rec_bankruptcies': 1368,\n", " 'recoveries': 3,\n", " 'revol_bal': 3,\n", " 'revol_util': 93,\n", " 'tax_liens': 108,\n", " 'term': 3,\n", " 'total_acc': 32,\n", " 'total_pymnt': 3,\n", " 'total_pymnt_inv': 3,\n", " 'total_rec_int': 3,\n", " 'total_rec_late_fee': 3,\n", " 'total_rec_prncp': 3}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "mv_counts = {}\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " float_cols = chunk.select_dtypes(include=['float'])\n", " for col in float_cols.columns:\n", " missing_values = len(chunk) - chunk[col].count()\n", " if col in mv_counts:\n", " mv_counts[col] = mv_counts[col] + missing_values\n", " else:\n", " mv_counts[col] = missing_values\n", "mv_counts" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'acc_now_delinq': 32,\n", " 'annual_inc': 7,\n", " 'chargeoff_within_12_mths': 148,\n", " 'collection_recovery_fee': 3,\n", " 'collections_12_mths_ex_med': 148,\n", " 'delinq_2yrs': 32,\n", " 'delinq_amnt': 32,\n", " 'dti': 3,\n", " 'funded_amnt': 3,\n", " 'funded_amnt_inv': 3,\n", " 'inq_last_6mths': 32,\n", " 'installment': 3,\n", " 'last_pymnt_amnt': 3,\n", " 'loan_amnt': 3,\n", " 'member_id': 3,\n", " 'open_acc': 32,\n", " 'out_prncp': 3,\n", " 'out_prncp_inv': 3,\n", " 'policy_code': 3,\n", " 'pub_rec': 32,\n", " 'pub_rec_bankruptcies': 1368,\n", " 'recoveries': 3,\n", " 'revol_bal': 3,\n", " 'revol_util': 93,\n", " 'tax_liens': 108,\n", " 'term': 3,\n", " 'total_acc': 32,\n", " 'total_pymnt': 3,\n", " 'total_pymnt_inv': 3,\n", " 'total_rec_int': 3,\n", " 'total_rec_late_fee': 3,\n", " 'total_rec_prncp': 3}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "mv_counts = {}\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " chunk = chunk.dropna(how='all')\n", " float_cols = chunk.select_dtypes(include=['float'])\n", " for col in float_cols.columns:\n", " missing_values = len(chunk) - chunk[col].count()\n", " if col in mv_counts:\n", " mv_counts[col] = mv_counts[col] + missing_values\n", " else:\n", " mv_counts[col] = missing_values\n", "mv_counts" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.4" } }, "nbformat": 4, "nbformat_minor": 2 }