{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_gradeemp_titleemp_lengthhome_ownershipannual_incverification_statusissue_dloan_statuspymnt_planpurposetitlezip_codeaddr_statedtidelinq_2yrsearliest_cr_lineinq_last_6mthsopen_accpub_recrevol_balrevol_utiltotal_accinitial_list_statusout_prncpout_prncp_invtotal_pymnttotal_pymnt_invtotal_rec_prncptotal_rec_inttotal_rec_late_feerecoveriescollection_recovery_feelast_pymnt_dlast_pymnt_amntlast_credit_pull_dcollections_12_mths_ex_medpolicy_codeapplication_typeacc_now_delinqchargeoff_within_12_mthsdelinq_amntpub_rec_bankruptciestax_liens
010775011296599.05000.05000.04975.036 months10.65%162.87BB2NaN10+ yearsRENT24000.0VerifiedDec-2011Fully Paidncredit_cardComputer860xxAZ27.650.0Jan-19851.03.00.013648.083.7%9.0f0.000.005863.1551875833.845000.00863.160.000.000.00Jan-2015171.62Jun-20160.01.0INDIVIDUAL0.00.00.00.00.0
110774301314167.02500.02500.02500.060 months15.27%59.83CC4Ryder< 1 yearRENT30000.0Source VerifiedDec-2011Charged Offncarbike309xxGA1.000.0Apr-19995.03.00.01687.09.4%4.0f0.000.001008.7100001008.71456.46435.170.00117.081.11Apr-2013119.66Sep-20130.01.0INDIVIDUAL0.00.00.00.00.0
210771751313524.02400.02400.02400.036 months15.96%84.33CC5NaN10+ yearsRENT12252.0Not VerifiedDec-2011Fully Paidnsmall_businessreal estate business606xxIL8.720.0Nov-20012.02.00.02956.098.5%10.0f0.000.003005.6668443005.672400.00605.670.000.000.00Jun-2014649.91Jun-20160.01.0INDIVIDUAL0.00.00.00.00.0
310768631277178.010000.010000.010000.036 months13.49%339.31CC1AIR RESOURCES BOARD10+ yearsRENT49200.0Source VerifiedDec-2011Fully Paidnotherpersonel917xxCA20.000.0Feb-19961.010.00.05598.021%37.0f0.000.0012231.89000012231.8910000.002214.9216.970.000.00Jan-2015357.48Apr-20160.01.0INDIVIDUAL0.00.00.00.00.0
410753581311748.03000.03000.03000.060 months12.69%67.79BB5University Medical Group1 yearRENT80000.0Source VerifiedDec-2011CurrentnotherPersonal972xxOR17.940.0Jan-19960.015.00.027783.053.9%38.0f461.73461.733581.1200003581.122538.271042.850.000.000.00Jun-201667.79Jun-20160.01.0INDIVIDUAL0.00.00.00.00.0
\n", "
" ], "text/plain": [ " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", "0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months \n", "1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months \n", "2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months \n", "3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months \n", "4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months \n", "\n", " int_rate installment grade sub_grade emp_title emp_length \\\n", "0 10.65% 162.87 B B2 NaN 10+ years \n", "1 15.27% 59.83 C C4 Ryder < 1 year \n", "2 15.96% 84.33 C C5 NaN 10+ years \n", "3 13.49% 339.31 C C1 AIR RESOURCES BOARD 10+ years \n", "4 12.69% 67.79 B B5 University Medical Group 1 year \n", "\n", " home_ownership annual_inc verification_status issue_d loan_status \\\n", "0 RENT 24000.0 Verified Dec-2011 Fully Paid \n", "1 RENT 30000.0 Source Verified Dec-2011 Charged Off \n", "2 RENT 12252.0 Not Verified Dec-2011 Fully Paid \n", "3 RENT 49200.0 Source Verified Dec-2011 Fully Paid \n", "4 RENT 80000.0 Source Verified Dec-2011 Current \n", "\n", " pymnt_plan purpose title zip_code addr_state dti \\\n", "0 n credit_card Computer 860xx AZ 27.65 \n", "1 n car bike 309xx GA 1.00 \n", "2 n small_business real estate business 606xx IL 8.72 \n", "3 n other personel 917xx CA 20.00 \n", "4 n other Personal 972xx OR 17.94 \n", "\n", " delinq_2yrs earliest_cr_line inq_last_6mths open_acc pub_rec revol_bal \\\n", "0 0.0 Jan-1985 1.0 3.0 0.0 13648.0 \n", "1 0.0 Apr-1999 5.0 3.0 0.0 1687.0 \n", "2 0.0 Nov-2001 2.0 2.0 0.0 2956.0 \n", "3 0.0 Feb-1996 1.0 10.0 0.0 5598.0 \n", "4 0.0 Jan-1996 0.0 15.0 0.0 27783.0 \n", "\n", " revol_util total_acc initial_list_status out_prncp out_prncp_inv \\\n", "0 83.7% 9.0 f 0.00 0.00 \n", "1 9.4% 4.0 f 0.00 0.00 \n", "2 98.5% 10.0 f 0.00 0.00 \n", "3 21% 37.0 f 0.00 0.00 \n", "4 53.9% 38.0 f 461.73 461.73 \n", "\n", " total_pymnt total_pymnt_inv total_rec_prncp total_rec_int \\\n", "0 5863.155187 5833.84 5000.00 863.16 \n", "1 1008.710000 1008.71 456.46 435.17 \n", "2 3005.666844 3005.67 2400.00 605.67 \n", "3 12231.890000 12231.89 10000.00 2214.92 \n", "4 3581.120000 3581.12 2538.27 1042.85 \n", "\n", " total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d \\\n", "0 0.00 0.00 0.00 Jan-2015 \n", "1 0.00 117.08 1.11 Apr-2013 \n", "2 0.00 0.00 0.00 Jun-2014 \n", "3 16.97 0.00 0.00 Jan-2015 \n", "4 0.00 0.00 0.00 Jun-2016 \n", "\n", " last_pymnt_amnt last_credit_pull_d collections_12_mths_ex_med \\\n", "0 171.62 Jun-2016 0.0 \n", "1 119.66 Sep-2013 0.0 \n", "2 649.91 Jun-2016 0.0 \n", "3 357.48 Apr-2016 0.0 \n", "4 67.79 Jun-2016 0.0 \n", "\n", " policy_code application_type acc_now_delinq chargeoff_within_12_mths \\\n", "0 1.0 INDIVIDUAL 0.0 0.0 \n", "1 1.0 INDIVIDUAL 0.0 0.0 \n", "2 1.0 INDIVIDUAL 0.0 0.0 \n", "3 1.0 INDIVIDUAL 0.0 0.0 \n", "4 1.0 INDIVIDUAL 0.0 0.0 \n", "\n", " delinq_amnt pub_rec_bankruptcies tax_liens \n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "pd.options.display.max_columns = 99\n", "\n", "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n", "first_five" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.5506629943847656" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n", "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.6512298584\n", "4.64817810059\n", "4.64966869354\n", "4.65059375763\n", "4.64692020416\n", "4.64837551117\n", "4.64736747742\n", "4.64941596985\n", "4.64743518829\n", "4.64727210999\n", "4.6598777771\n", "4.6566619873\n", "4.66349601746\n", "4.89784526825\n", "0.880808830261\n" ] } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "for chunk in chunk_iter:\n", " print(chunk.memory_usage(deep=True).sum()/(1024*1024))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How many rows in the data set?" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "42538\n" ] } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "total_rows = 0\n", "for chunk in chunk_iter:\n", " total_rows += len(chunk)\n", "print(total_rows)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring the Data in Chunks\n", "\n", "## How many columns have a numeric type? How many have a string type?" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]\n", "[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]\n" ] } ], "source": [ "# Numeric columns\n", "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "numeric = []\n", "string = []\n", "for lc in loans_chunks:\n", " nums = lc.select_dtypes(include=[np.number]).shape[1]\n", " numeric.append(nums)\n", " strs = lc.select_dtypes(include=['object']).shape[1]\n", " string.append(strs)\n", "\n", "print(numeric)\n", "print(string)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n" ] } ], "source": [ "# Are string columns consistent across chunks?\n", "obj_cols = []\n", "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "\n", "for chunk in chunk_iter:\n", " chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n", " if len(obj_cols) > 0:\n", " is_same = obj_cols == chunk_obj_cols\n", " if not is_same:\n", " print(\"overall obj cols:\", obj_cols, \"\\n\")\n", " print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\") \n", " else:\n", " obj_cols = chunk_obj_cols" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Observation 1: By default -- 31 numeric columns and 21 string columns.\n", "\n", "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling let's ignore this column.\n", "\n", "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "term 2\n", "loan_status 9\n", "purpose 14\n", "grade 7\n", "sub_grade 35\n", "initial_list_status 1\n", "verification_status 3\n", "application_type 1\n", "emp_length 12\n", "home_ownership 5\n", "pymnt_plan 2\n" ] } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "uniques = {}\n", "for lc in loans_chunks:\n", " strings_only = lc.select_dtypes(include=['object'])\n", " cols = strings_only.columns\n", " for c in cols:\n", " val_counts = strings_only[c].value_counts()\n", " if c in uniques:\n", " uniques[c].append(val_counts)\n", " else:\n", " uniques[c] = [val_counts]\n", "\n", "uniques_combined = {}\n", "unique_stats = {\n", " 'column_name': [],\n", " 'total_values': [],\n", " 'unique_values': [],\n", "}\n", "for col in uniques:\n", " u_concat = pd.concat(uniques[col])\n", " u_group = u_concat.groupby(u_concat.index).sum()\n", " uniques_combined[col] = u_group\n", " if u_group.shape[0] < 50:\n", " print(col, u_group.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which float columns have no missing values and could be candidates for conversion to the integer type?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "member_id 3\n", "total_rec_int 3\n", "total_pymnt_inv 3\n", "total_pymnt 3\n", "revol_bal 3\n", "recoveries 3\n", "policy_code 3\n", "out_prncp_inv 3\n", "out_prncp 3\n", "total_rec_late_fee 3\n", "loan_amnt 3\n", "last_pymnt_amnt 3\n", "total_rec_prncp 3\n", "funded_amnt_inv 3\n", "funded_amnt 3\n", "dti 3\n", "collection_recovery_fee 3\n", "installment 3\n", "annual_inc 7\n", "inq_last_6mths 32\n", "total_acc 32\n", "delinq_2yrs 32\n", "pub_rec 32\n", "delinq_amnt 32\n", "open_acc 32\n", "acc_now_delinq 32\n", "tax_liens 108\n", "collections_12_mths_ex_med 148\n", "chargeoff_within_12_mths 148\n", "pub_rec_bankruptcies 1368\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "missing = []\n", "for lc in loans_chunks:\n", " floats = lc.select_dtypes(include=['float'])\n", " missing.append(floats.apply(pd.isnull).sum())\n", "\n", "combined_missing = pd.concat(missing)\n", "combined_missing.groupby(combined_missing.index).sum().sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate the total memory usage across all of the chunks." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "66.245146751403809" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "mem_usage = []\n", "\n", "for lc in loans_chunks:\n", " mem_usage.append(lc.memory_usage(deep=True).sum() / 1024 ** 2)\n", "\n", "sum(mem_usage)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optimizing String Columns\n", "\n", "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modelling." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['term',\n", " 'int_rate',\n", " 'grade',\n", " 'sub_grade',\n", " 'emp_title',\n", " 'emp_length',\n", " 'home_ownership',\n", " 'verification_status',\n", " 'issue_d',\n", " 'loan_status',\n", " 'pymnt_plan',\n", " 'purpose',\n", " 'title',\n", " 'zip_code',\n", " 'addr_state',\n", " 'earliest_cr_line',\n", " 'revol_util',\n", " 'initial_list_status',\n", " 'last_pymnt_d',\n", " 'last_credit_pull_d',\n", " 'application_type']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_cols" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n", "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "str_cols_vc = {}\n", "for chunk in chunk_iter:\n", " str_cols = chunk.select_dtypes(include=['object'])\n", " for col in str_cols.columns:\n", " current_col_vc = str_cols[col].value_counts()\n", " if col in str_cols_vc:\n", " str_cols_vc[col].append(current_col_vc)\n", " else:\n", " str_cols_vc[col] = [current_col_vc]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "## Combine the value counts.\n", "combined_vcs = {}\n", "\n", "for col in str_cols_vc:\n", " combined_vc = pd.concat(str_cols_vc[col])\n", " final_vc = combined_vc.groupby(combined_vc.index).sum()\n", " combined_vcs[col] = final_vc" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "term\n", " 36 months 31534\n", " 60 months 11001\n", "Name: term, dtype: int64\n", "-----------\n", "sub_grade\n", "A1 1142\n", "A2 1520\n", "A3 1823\n", "A4 2905\n", "A5 2793\n", "B1 1882\n", "B2 2113\n", "B3 2997\n", "B4 2590\n", "B5 2807\n", "C1 2264\n", "C2 2157\n", "C3 1658\n", "C4 1370\n", "C5 1291\n", "D1 1053\n", "D2 1485\n", "D3 1322\n", "D4 1140\n", "D5 1016\n", "E1 884\n", "E2 791\n", "E3 668\n", "E4 552\n", "E5 499\n", "F1 392\n", "F2 308\n", "F3 236\n", "F4 211\n", "F5 154\n", "G1 141\n", "G2 107\n", "G3 79\n", "G4 99\n", "G5 86\n", "Name: sub_grade, dtype: int64\n", "-----------\n", "emp_title\n", " old palm inc 1\n", " Brocade Communications 1\n", " CenturyLink 1\n", " Department of Homeland Security 1\n", " Down To Earth Distributors, Inc. 1\n", " Plaid, Inc. 1\n", " U.S. Dept. Of Homeland Security 1\n", " pacific crane maintenance company 1\n", "$260M '06 vintage technology venture capital firm 1\n", "(Collaborative) Abbott Nutrition Intl 1\n", "(self) Castleforte Group 1\n", "0CEAN VIEW INTL REALTY/C21 UNITED PLATINUM 1\n", "1 Cochran Hyundai 1\n", "1 and 1 Internet Inc. 1\n", "1)-Yavapai Regional Medical Center 2)- Dr. cantors office 1\n", "1-800 Contacts 1\n", "101 restaurant 1\n", "1013 communications 1\n", "1125 memorex dr 1\n", "1199SEIU Benefit and Pension Funds 1\n", "1199SEIU UHE 1\n", "123 Recovery 1\n", "123 appartment corp. 1\n", "128 Air Refueling Wing (USAF) 1\n", "1400 1\n", "15 1\n", "162 fighter wing 1\n", "16th MP BDE, U.S. Army 1\n", "1800-got-junk? 1\n", "182nd Airlift Wing Fire Dept. 1\n", " ..\n", "wyoming valley hospital 1\n", "wythe county community hospital 1\n", "xcel fire protection 1\n", "xerox cor. 1\n", "xerox corp 1\n", "xo communication 1\n", "xotic deliveries 1\n", "xpedite technologies inc 1\n", "xpedx 1\n", "yankee candle company 1\n", "ye olde pepper co 1\n", "yellow cab com. 1\n", "yellow cab taxi 1\n", "ymca 3\n", "yonkers fire department 1\n", "york hospital dental center 1\n", "yorkville physical therapy 1\n", "young and associates 1\n", "young atitudes aveda 1\n", "youth villages 1\n", "yrbnyc.com / Yellow Rat Bastard 1\n", "yrc 1\n", "yrcw 1\n", "zachry industral inc 1\n", "zakheim and lavrar 1\n", "zashko inc. 1\n", "zeno office solutions 1\n", "zion lutheran school 1\n", "zoll medical corp 1\n", "zozaya officiating 1\n", "Name: emp_title, dtype: int64\n", "-----------\n", "home_ownership\n", "MORTGAGE 18959\n", "NONE 8\n", "OTHER 136\n", "OWN 3251\n", "RENT 20181\n", "Name: home_ownership, dtype: int64\n", "-----------\n", "verification_status\n", "Not Verified 18758\n", "Source Verified 10306\n", "Verified 13471\n", "Name: verification_status, dtype: int64\n", "-----------\n", "issue_d\n", "Apr-2008 259\n", "Apr-2009 333\n", "Apr-2010 912\n", "Apr-2011 1563\n", "Aug-2007 74\n", "Aug-2008 100\n", "Aug-2009 446\n", "Aug-2010 1175\n", "Aug-2011 1934\n", "Dec-2007 172\n", "Dec-2008 253\n", "Dec-2009 658\n", "Dec-2010 1335\n", "Dec-2011 2267\n", "Feb-2008 306\n", "Feb-2009 302\n", "Feb-2010 682\n", "Feb-2011 1298\n", "Jan-2008 305\n", "Jan-2009 269\n", "Jan-2010 662\n", "Jan-2011 1380\n", "Jul-2007 63\n", "Jul-2008 141\n", "Jul-2009 411\n", "Jul-2010 1204\n", "Jul-2011 1875\n", "Jun-2007 24\n", "Jun-2008 124\n", "Jun-2009 406\n", "Jun-2010 1105\n", "Jun-2011 1835\n", "Mar-2008 402\n", "Mar-2009 324\n", "Mar-2010 828\n", "Mar-2011 1448\n", "May-2008 115\n", "May-2009 359\n", "May-2010 989\n", "May-2011 1704\n", "Nov-2007 112\n", "Nov-2008 209\n", "Nov-2009 662\n", "Nov-2010 1224\n", "Nov-2011 2232\n", "Oct-2007 105\n", "Oct-2008 122\n", "Oct-2009 604\n", "Oct-2010 1232\n", "Oct-2011 2118\n", "Sep-2007 53\n", "Sep-2008 57\n", "Sep-2009 507\n", "Sep-2010 1189\n", "Sep-2011 2067\n", "Name: issue_d, dtype: int64\n", "-----------\n", "purpose\n", "car 1615\n", "credit_card 5477\n", "debt_consolidation 19776\n", "educational 422\n", "home_improvement 3199\n", "house 426\n", "major_purchase 2311\n", "medical 753\n", "moving 629\n", "other 4425\n", "renewable_energy 106\n", "small_business 1992\n", "vacation 400\n", "wedding 1004\n", "Name: purpose, dtype: int64\n", "-----------\n", "earliest_cr_line\n", "Apr-1964 3\n", "Apr-1966 1\n", "Apr-1967 4\n", "Apr-1968 1\n", "Apr-1969 1\n", "Apr-1970 3\n", "Apr-1971 6\n", "Apr-1972 2\n", "Apr-1973 7\n", "Apr-1974 6\n", "Apr-1975 11\n", "Apr-1976 7\n", "Apr-1977 10\n", "Apr-1978 14\n", "Apr-1979 15\n", "Apr-1980 17\n", "Apr-1981 15\n", "Apr-1982 23\n", "Apr-1983 29\n", "Apr-1984 27\n", "Apr-1985 31\n", "Apr-1986 29\n", "Apr-1987 48\n", "Apr-1988 55\n", "Apr-1989 64\n", "Apr-1990 79\n", "Apr-1991 77\n", "Apr-1992 90\n", "Apr-1993 100\n", "Apr-1994 134\n", " ... \n", "Sep-1979 20\n", "Sep-1980 18\n", "Sep-1981 28\n", "Sep-1982 16\n", "Sep-1983 33\n", "Sep-1984 46\n", "Sep-1985 41\n", "Sep-1986 59\n", "Sep-1987 59\n", "Sep-1988 63\n", "Sep-1989 83\n", "Sep-1990 93\n", "Sep-1991 70\n", "Sep-1992 72\n", "Sep-1993 124\n", "Sep-1994 119\n", "Sep-1995 181\n", "Sep-1996 187\n", "Sep-1997 196\n", "Sep-1998 267\n", "Sep-1999 277\n", "Sep-2000 325\n", "Sep-2001 282\n", "Sep-2002 251\n", "Sep-2003 220\n", "Sep-2004 221\n", "Sep-2005 162\n", "Sep-2006 150\n", "Sep-2007 63\n", "Sep-2008 8\n", "Name: earliest_cr_line, dtype: int64\n", "-----------\n", "revol_util\n", "0% 1070\n", "0.01% 1\n", "0.03% 1\n", "0.04% 1\n", "0.05% 1\n", "0.1% 61\n", "0.12% 1\n", "0.16% 1\n", "0.2% 64\n", "0.3% 43\n", "0.4% 45\n", "0.46% 1\n", "0.49% 1\n", "0.5% 49\n", "0.54% 1\n", "0.6% 39\n", "0.7% 47\n", "0.75% 1\n", "0.8% 43\n", "0.83% 1\n", "0.86% 1\n", "0.9% 41\n", "1% 51\n", "1.1% 39\n", "1.2% 37\n", "1.3% 36\n", "1.4% 32\n", "1.5% 33\n", "1.6% 24\n", "1.7% 30\n", " ... \n", "97% 31\n", "97.1% 39\n", "97.2% 32\n", "97.3% 36\n", "97.4% 33\n", "97.5% 35\n", "97.6% 35\n", "97.7% 45\n", "97.8% 38\n", "97.9% 27\n", "98% 28\n", "98.1% 25\n", "98.2% 25\n", "98.3% 36\n", "98.4% 31\n", "98.5% 29\n", "98.6% 36\n", "98.7% 27\n", "98.8% 38\n", "98.9% 20\n", "99% 35\n", "99.1% 33\n", "99.2% 22\n", "99.3% 32\n", "99.4% 24\n", "99.5% 24\n", "99.6% 27\n", "99.7% 32\n", "99.8% 25\n", "99.9% 29\n", "Name: revol_util, dtype: int64\n", "-----------\n", "last_pymnt_d\n", "Apr-2008 23\n", "Apr-2009 72\n", "Apr-2010 145\n", "Apr-2011 519\n", "Apr-2012 781\n", "Apr-2013 890\n", "Apr-2014 682\n", "Apr-2015 141\n", "Apr-2016 178\n", "Aug-2008 31\n", "Aug-2009 66\n", "Aug-2010 198\n", "Aug-2011 479\n", "Aug-2012 868\n", "Aug-2013 744\n", "Aug-2014 836\n", "Aug-2015 217\n", "Dec-2007 2\n", "Dec-2008 31\n", "Dec-2009 116\n", "Dec-2010 332\n", "Dec-2011 585\n", "Dec-2012 732\n", "Dec-2013 799\n", "Dec-2014 949\n", "Dec-2015 185\n", "Feb-2008 8\n", "Feb-2009 65\n", "Feb-2010 148\n", "Feb-2011 437\n", " ... \n", "May-2011 448\n", "May-2012 773\n", "May-2013 943\n", "May-2014 686\n", "May-2015 151\n", "May-2016 203\n", "Nov-2008 31\n", "Nov-2009 80\n", "Nov-2010 258\n", "Nov-2011 487\n", "Nov-2012 782\n", "Nov-2013 701\n", "Nov-2014 593\n", "Nov-2015 231\n", "Oct-2008 55\n", "Oct-2009 85\n", "Oct-2010 292\n", "Oct-2011 482\n", "Oct-2012 853\n", "Oct-2013 712\n", "Oct-2014 813\n", "Oct-2015 200\n", "Sep-2008 40\n", "Sep-2009 55\n", "Sep-2010 215\n", "Sep-2011 491\n", "Sep-2012 802\n", "Sep-2013 712\n", "Sep-2014 694\n", "Sep-2015 211\n", "Name: last_pymnt_d, dtype: int64\n", "-----------\n", "last_credit_pull_d\n", "Apr-2009 24\n", "Apr-2010 77\n", "Apr-2011 177\n", "Apr-2012 326\n", "Apr-2013 445\n", "Apr-2014 507\n", "Apr-2015 404\n", "Apr-2016 912\n", "Aug-2007 17\n", "Aug-2008 15\n", "Aug-2009 32\n", "Aug-2010 68\n", "Aug-2011 290\n", "Aug-2012 391\n", "Aug-2013 375\n", "Aug-2014 528\n", "Aug-2015 435\n", "Dec-2007 3\n", "Dec-2008 7\n", "Dec-2009 73\n", "Dec-2010 156\n", "Dec-2011 268\n", "Dec-2012 391\n", "Dec-2013 431\n", "Dec-2014 567\n", "Dec-2015 656\n", "Feb-2008 3\n", "Feb-2009 20\n", "Feb-2010 87\n", "Feb-2011 199\n", " ... \n", "May-2013 470\n", "May-2014 477\n", "May-2015 433\n", "May-2016 766\n", "Nov-2007 3\n", "Nov-2009 50\n", "Nov-2010 126\n", "Nov-2011 221\n", "Nov-2012 432\n", "Nov-2013 491\n", "Nov-2014 500\n", "Nov-2015 516\n", "Oct-2007 3\n", "Oct-2008 13\n", "Oct-2009 51\n", "Oct-2010 120\n", "Oct-2011 162\n", "Oct-2012 351\n", "Oct-2013 429\n", "Oct-2014 529\n", "Oct-2015 513\n", "Sep-2007 5\n", "Sep-2008 9\n", "Sep-2009 21\n", "Sep-2010 119\n", "Sep-2011 175\n", "Sep-2012 414\n", "Sep-2013 408\n", "Sep-2014 564\n", "Sep-2015 531\n", "Name: last_credit_pull_d, dtype: int64\n", "-----------\n" ] } ], "source": [ "for col in useful_obj_cols:\n", " print(col)\n", " print(combined_vcs[col])\n", " print(\"-----------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert to category" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "convert_col_dtypes = {\n", " \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n", " \"verification_status\": \"category\", \"purpose\": \"category\"\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert `term` and `revol_util` to numerical by data cleaning.\n", "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
termsub_gradeemp_titlehome_ownershipverification_statusissue_dpurposeearliest_cr_linerevol_utillast_pymnt_dlast_credit_pull_d
4200036 monthsC2Best BuyRENTNot VerifiedFeb-2008debt_consolidationJul-2000100.7%Feb-2011Jun-2016
4200136 monthsG2CVS PHARMACYOWNNot VerifiedFeb-2008debt_consolidationMar-198951.9%Nov-2008Jun-2016
4200236 monthsE4General MotorsRENTNot VerifiedFeb-2008debt_consolidationDec-199880.7%Feb-2011Jun-2016
4200336 monthsG4usa medical centerRENTNot VerifiedFeb-2008debt_consolidationJul-199557.2%Feb-2011Jun-2011
4200436 monthsB3InvestSource IncRENTNot VerifiedFeb-2008debt_consolidationSep-200574%Mar-2010Aug-2010
4200536 monthsD1kaiserMORTGAGENot VerifiedFeb-2008debt_consolidationNov-200037.6%Mar-2010Apr-2010
4200636 monthsC1Panoramic SoftwareMORTGAGENot VerifiedFeb-2008debt_consolidationDec-199556%Apr-2010Jun-2016
4200736 monthsD5AT&T Inc.RENTNot VerifiedFeb-2008small_businessFeb-200049.7%Feb-2009Oct-2009
4200836 monthsD4D&YRENTNot VerifiedFeb-2008debt_consolidationOct-199271.5%Jan-2010Jun-2016
4200936 monthsD5Frederick's of HollywoodRENTNot VerifiedFeb-2008debt_consolidationOct-199449%Jul-2008Jun-2016
4201036 monthsF2Yuma Nursing Home CtrRENTNot VerifiedFeb-2008debt_consolidationOct-199318.1%Feb-2011Nov-2014
4201136 monthsE5Akiva IncorporatedRENTNot VerifiedFeb-2008small_businessJan-200630.8%Jan-2009Aug-2009
4201236 monthsG4Citrix OnlineOWNNot VerifiedFeb-2008debt_consolidationSep-199898.1%Feb-2011Mar-2016
4201336 monthsG3Virginia College OnlineMORTGAGENot VerifiedFeb-2008otherDec-199078.4%Oct-2008Sep-2008
4201436 monthsE2Zone 7 Water AgencyRENTNot VerifiedFeb-2008debt_consolidationNov-199894.3%May-2009May-2013
4201536 monthsG5Integrity AppraisalsMORTGAGENot VerifiedFeb-2008debt_consolidationFeb-199778.7%Nov-2008Jun-2016
4201636 monthsG4Stordok Document Destruction/Moreno Roofing Co.MORTGAGENot VerifiedFeb-2008credit_cardJun-19820%Feb-2011Dec-2014
4201736 monthsE4Aladdin Capital ManagementRENTNot VerifiedFeb-2008credit_cardNov-199965.2%Nov-2009Nov-2009
4201836 monthsD4Cayuga Medical CenterRENTNot VerifiedFeb-2008debt_consolidationNov-199458.9%Feb-2011Jun-2016
4201936 monthsE1North Sound PTRENTNot VerifiedJan-2008debt_consolidationFeb-200094%Oct-2009Mar-2010
4202036 monthsC3UHSRENTNot VerifiedJan-2008medicalOct-199883.1%Feb-2011Feb-2011
4202136 monthsD4AdeccoRENTNot VerifiedFeb-2008debt_consolidationNov-200260.7%Aug-2009Jan-2011
4202236 monthsE5Greatwide logistics ServicesMORTGAGENot VerifiedFeb-2008home_improvementOct-199449.9%Sep-2009Jun-2016
4202336 monthsD4ASU UniversityRENTNot VerifiedFeb-2008credit_cardSep-200059.9%Feb-2011Feb-2011
4202436 monthsB5Wells FargoRENTNot VerifiedFeb-2008debt_consolidationNov-199532.6%Feb-2011Jun-2016
4202536 monthsD5lakeside memorial hospitalMORTGAGENot VerifiedJan-2008debt_consolidationNov-199777.4%Jan-2011Jun-2016
4202636 monthsB5Controller GroupRENTNot VerifiedFeb-2008small_businessMar-199846.5%Oct-2008Jun-2016
4202736 monthsF1David Johnston Equestrian Training Lic.#29026RENTNot VerifiedFeb-2008small_businessNov-200018.1%Feb-2011Feb-2011
4202836 monthsE3Lockheed Martin Corp.MORTGAGENot VerifiedFeb-2008debt_consolidationDec-198345.9%Feb-2011Jun-2016
4202936 monthsG4SelfMORTGAGENot VerifiedFeb-2008small_businessApr-199492.3%Apr-2008Jun-2016
....................................
4250836 monthsD5SUNY- ESFRENTNot VerifiedJul-2007credit_cardDec-200449.3%Jul-2010Sep-2012
4250936 monthsD5Clinton Shop Rite of Hunterdon CountyOWNNot VerifiedJul-2007carDec-2006NaNFeb-2010Jul-2013
4251036 monthsB4ScheduallMORTGAGENot VerifiedJul-2007debt_consolidationNaNNaNFeb-2008May-2007
4251136 monthsE5GA-PCOMRENTNot VerifiedJul-2007educationalSep-199985%Aug-2010Aug-2010
4251236 monthsA1Tzigane IncMORTGAGENot VerifiedJul-2007debt_consolidationMar-19842.2%Jul-2010Jun-2010
4251336 monthsC2Yale UniversityRENTNot VerifiedJun-2007debt_consolidationJan-199666%Jun-2010Oct-2014
4251436 monthsB4Brick Township board of educationMORTGAGENot VerifiedJun-2007debt_consolidationJul-200463.5%Jun-2010Feb-2015
4251536 monthsB2Classic ComponentsRENTNot VerifiedJun-2007otherNaNNaNJul-2010Jun-2016
4251636 monthsC4Compensation SolutionsRENTNot VerifiedJun-2007otherNaNNaNApr-2010Apr-2010
4251736 monthsC3Stanford University Libraries, LOCKSS ProjectRENTNot VerifiedJun-2007otherNaNNaNJul-2010Sep-2014
4251836 monthsE3Macy'sRENTNot VerifiedJun-2007otherNaNNaNJun-2010Jun-2010
4251936 monthsA3Diamond Management and Technology ConsultantsMORTGAGENot VerifiedJun-2007otherNaNNaNAug-2009Aug-2009
4252036 monthsA2U.S. BankMORTGAGENot VerifiedJun-2007otherNaNNaNJan-2008Jun-2007
4252136 monthsA2NCMORTGAGENot VerifiedJun-2007otherNaNNaNJan-2008Jun-2007
4252236 monthsA5College Pro PaintersMORTGAGENot VerifiedJun-2007otherNaNNaNMar-2010Mar-2010
4252336 monthsA1Mana ProductsRENTNot VerifiedJun-2007otherNaNNaNJun-2010Apr-2014
4252436 monthsC4Apto SolutionsRENTNot VerifiedJun-2007otherNaNNaNJul-2010Jun-2010
4252536 monthsD3Infinitely law groupRENTNot VerifiedJun-2007otherNaNNaNMay-2008Jun-2016
4252636 monthsC4Town of PlainvilleRENTNot VerifiedJun-2007otherNaNNaNAug-2008Jun-2007
4252736 monthsB2Tanks TavernRENTNot VerifiedJun-2007otherNaNNaNJul-2010Jun-2010
4252836 monthsB3NaNOWNNot VerifiedJun-2007otherNaNNaNJun-2010May-2007
4252936 monthsB5NaNRENTNot VerifiedJun-2007otherNaNNaNJul-2010Jul-2010
4253036 monthsB4Air ForceRENTNot VerifiedJun-2007otherNaNNaNMay-2008Jun-2016
4253136 monthsC1NaNRENTNot VerifiedJun-2007otherNaNNaNMar-2008Feb-2013
4253236 monthsB4Halping hands company inc.RENTNot VerifiedJun-2007otherNaNNaNJun-2010Sep-2014
4253336 monthsB3NaNRENTNot VerifiedJun-2007otherNaNNaNJun-2010May-2007
4253436 monthsA5NaNNONENot VerifiedJun-2007otherNaNNaNJun-2010Aug-2007
4253536 monthsA3HomemakerMORTGAGENot VerifiedJun-2007otherNaNNaNJun-2010Feb-2015
42536NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42537NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

538 rows × 11 columns

\n", "
" ], "text/plain": [ " term sub_grade emp_title \\\n", "42000 36 months C2 Best Buy \n", "42001 36 months G2 CVS PHARMACY \n", "42002 36 months E4 General Motors \n", "42003 36 months G4 usa medical center \n", "42004 36 months B3 InvestSource Inc \n", "42005 36 months D1 kaiser \n", "42006 36 months C1 Panoramic Software \n", "42007 36 months D5 AT&T Inc. \n", "42008 36 months D4 D&Y \n", "42009 36 months D5 Frederick's of Hollywood \n", "42010 36 months F2 Yuma Nursing Home Ctr \n", "42011 36 months E5 Akiva Incorporated \n", "42012 36 months G4 Citrix Online \n", "42013 36 months G3 Virginia College Online \n", "42014 36 months E2 Zone 7 Water Agency \n", "42015 36 months G5 Integrity Appraisals \n", "42016 36 months G4 Stordok Document Destruction/Moreno Roofing Co. \n", "42017 36 months E4 Aladdin Capital Management \n", "42018 36 months D4 Cayuga Medical Center \n", "42019 36 months E1 North Sound PT \n", "42020 36 months C3 UHS \n", "42021 36 months D4 Adecco \n", "42022 36 months E5 Greatwide logistics Services \n", "42023 36 months D4 ASU University \n", "42024 36 months B5 Wells Fargo \n", "42025 36 months D5 lakeside memorial hospital \n", "42026 36 months B5 Controller Group \n", "42027 36 months F1 David Johnston Equestrian Training Lic.#29026 \n", "42028 36 months E3 Lockheed Martin Corp. \n", "42029 36 months G4 Self \n", "... ... ... ... \n", "42508 36 months D5 SUNY- ESF \n", "42509 36 months D5 Clinton Shop Rite of Hunterdon County \n", "42510 36 months B4 Scheduall \n", "42511 36 months E5 GA-PCOM \n", "42512 36 months A1 Tzigane Inc \n", "42513 36 months C2 Yale University \n", "42514 36 months B4 Brick Township board of education \n", "42515 36 months B2 Classic Components \n", "42516 36 months C4 Compensation Solutions \n", "42517 36 months C3 Stanford University Libraries, LOCKSS Project \n", "42518 36 months E3 Macy's \n", "42519 36 months A3 Diamond Management and Technology Consultants \n", "42520 36 months A2 U.S. Bank \n", "42521 36 months A2 NC \n", "42522 36 months A5 College Pro Painters \n", "42523 36 months A1 Mana Products \n", "42524 36 months C4 Apto Solutions \n", "42525 36 months D3 Infinitely law group \n", "42526 36 months C4 Town of Plainville \n", "42527 36 months B2 Tanks Tavern \n", "42528 36 months B3 NaN \n", "42529 36 months B5 NaN \n", "42530 36 months B4 Air Force \n", "42531 36 months C1 NaN \n", "42532 36 months B4 Halping hands company inc. \n", "42533 36 months B3 NaN \n", "42534 36 months A5 NaN \n", "42535 36 months A3 Homemaker \n", "42536 NaN NaN NaN \n", "42537 NaN NaN NaN \n", "\n", " home_ownership verification_status issue_d purpose \\\n", "42000 RENT Not Verified Feb-2008 debt_consolidation \n", "42001 OWN Not Verified Feb-2008 debt_consolidation \n", "42002 RENT Not Verified Feb-2008 debt_consolidation \n", "42003 RENT Not Verified Feb-2008 debt_consolidation \n", "42004 RENT Not Verified Feb-2008 debt_consolidation \n", "42005 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42006 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42007 RENT Not Verified Feb-2008 small_business \n", "42008 RENT Not Verified Feb-2008 debt_consolidation \n", "42009 RENT Not Verified Feb-2008 debt_consolidation \n", "42010 RENT Not Verified Feb-2008 debt_consolidation \n", "42011 RENT Not Verified Feb-2008 small_business \n", "42012 OWN Not Verified Feb-2008 debt_consolidation \n", "42013 MORTGAGE Not Verified Feb-2008 other \n", "42014 RENT Not Verified Feb-2008 debt_consolidation \n", "42015 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42016 MORTGAGE Not Verified Feb-2008 credit_card \n", "42017 RENT Not Verified Feb-2008 credit_card \n", "42018 RENT Not Verified Feb-2008 debt_consolidation \n", "42019 RENT Not Verified Jan-2008 debt_consolidation \n", "42020 RENT Not Verified Jan-2008 medical \n", "42021 RENT Not Verified Feb-2008 debt_consolidation \n", "42022 MORTGAGE Not Verified Feb-2008 home_improvement \n", "42023 RENT Not Verified Feb-2008 credit_card \n", "42024 RENT Not Verified Feb-2008 debt_consolidation \n", "42025 MORTGAGE Not Verified Jan-2008 debt_consolidation \n", "42026 RENT Not Verified Feb-2008 small_business \n", "42027 RENT Not Verified Feb-2008 small_business \n", "42028 MORTGAGE Not Verified Feb-2008 debt_consolidation \n", "42029 MORTGAGE Not Verified Feb-2008 small_business \n", "... ... ... ... ... \n", "42508 RENT Not Verified Jul-2007 credit_card \n", "42509 OWN Not Verified Jul-2007 car \n", "42510 MORTGAGE Not Verified Jul-2007 debt_consolidation \n", "42511 RENT Not Verified Jul-2007 educational \n", "42512 MORTGAGE Not Verified Jul-2007 debt_consolidation \n", "42513 RENT Not Verified Jun-2007 debt_consolidation \n", "42514 MORTGAGE Not Verified Jun-2007 debt_consolidation \n", "42515 RENT Not Verified Jun-2007 other \n", "42516 RENT Not Verified Jun-2007 other \n", "42517 RENT Not Verified Jun-2007 other \n", "42518 RENT Not Verified Jun-2007 other \n", "42519 MORTGAGE Not Verified Jun-2007 other \n", "42520 MORTGAGE Not Verified Jun-2007 other \n", "42521 MORTGAGE Not Verified Jun-2007 other \n", "42522 MORTGAGE Not Verified Jun-2007 other \n", "42523 RENT Not Verified Jun-2007 other \n", "42524 RENT Not Verified Jun-2007 other \n", "42525 RENT Not Verified Jun-2007 other \n", "42526 RENT Not Verified Jun-2007 other \n", "42527 RENT Not Verified Jun-2007 other \n", "42528 OWN Not Verified Jun-2007 other \n", "42529 RENT Not Verified Jun-2007 other \n", "42530 RENT Not Verified Jun-2007 other \n", "42531 RENT Not Verified Jun-2007 other \n", "42532 RENT Not Verified Jun-2007 other \n", "42533 RENT Not Verified Jun-2007 other \n", "42534 NONE Not Verified Jun-2007 other \n", "42535 MORTGAGE Not Verified Jun-2007 other \n", "42536 NaN NaN NaN NaN \n", "42537 NaN NaN NaN NaN \n", "\n", " earliest_cr_line revol_util last_pymnt_d last_credit_pull_d \n", "42000 Jul-2000 100.7% Feb-2011 Jun-2016 \n", "42001 Mar-1989 51.9% Nov-2008 Jun-2016 \n", "42002 Dec-1998 80.7% Feb-2011 Jun-2016 \n", "42003 Jul-1995 57.2% Feb-2011 Jun-2011 \n", "42004 Sep-2005 74% Mar-2010 Aug-2010 \n", "42005 Nov-2000 37.6% Mar-2010 Apr-2010 \n", "42006 Dec-1995 56% Apr-2010 Jun-2016 \n", "42007 Feb-2000 49.7% Feb-2009 Oct-2009 \n", "42008 Oct-1992 71.5% Jan-2010 Jun-2016 \n", "42009 Oct-1994 49% Jul-2008 Jun-2016 \n", "42010 Oct-1993 18.1% Feb-2011 Nov-2014 \n", "42011 Jan-2006 30.8% Jan-2009 Aug-2009 \n", "42012 Sep-1998 98.1% Feb-2011 Mar-2016 \n", "42013 Dec-1990 78.4% Oct-2008 Sep-2008 \n", "42014 Nov-1998 94.3% May-2009 May-2013 \n", "42015 Feb-1997 78.7% Nov-2008 Jun-2016 \n", "42016 Jun-1982 0% Feb-2011 Dec-2014 \n", "42017 Nov-1999 65.2% Nov-2009 Nov-2009 \n", "42018 Nov-1994 58.9% Feb-2011 Jun-2016 \n", "42019 Feb-2000 94% Oct-2009 Mar-2010 \n", "42020 Oct-1998 83.1% Feb-2011 Feb-2011 \n", "42021 Nov-2002 60.7% Aug-2009 Jan-2011 \n", "42022 Oct-1994 49.9% Sep-2009 Jun-2016 \n", "42023 Sep-2000 59.9% Feb-2011 Feb-2011 \n", "42024 Nov-1995 32.6% Feb-2011 Jun-2016 \n", "42025 Nov-1997 77.4% Jan-2011 Jun-2016 \n", "42026 Mar-1998 46.5% Oct-2008 Jun-2016 \n", "42027 Nov-2000 18.1% Feb-2011 Feb-2011 \n", "42028 Dec-1983 45.9% Feb-2011 Jun-2016 \n", "42029 Apr-1994 92.3% Apr-2008 Jun-2016 \n", "... ... ... ... ... \n", "42508 Dec-2004 49.3% Jul-2010 Sep-2012 \n", "42509 Dec-2006 NaN Feb-2010 Jul-2013 \n", "42510 NaN NaN Feb-2008 May-2007 \n", "42511 Sep-1999 85% Aug-2010 Aug-2010 \n", "42512 Mar-1984 2.2% Jul-2010 Jun-2010 \n", "42513 Jan-1996 66% Jun-2010 Oct-2014 \n", "42514 Jul-2004 63.5% Jun-2010 Feb-2015 \n", "42515 NaN NaN Jul-2010 Jun-2016 \n", "42516 NaN NaN Apr-2010 Apr-2010 \n", "42517 NaN NaN Jul-2010 Sep-2014 \n", "42518 NaN NaN Jun-2010 Jun-2010 \n", "42519 NaN NaN Aug-2009 Aug-2009 \n", "42520 NaN NaN Jan-2008 Jun-2007 \n", "42521 NaN NaN Jan-2008 Jun-2007 \n", "42522 NaN NaN Mar-2010 Mar-2010 \n", "42523 NaN NaN Jun-2010 Apr-2014 \n", "42524 NaN NaN Jul-2010 Jun-2010 \n", "42525 NaN NaN May-2008 Jun-2016 \n", "42526 NaN NaN Aug-2008 Jun-2007 \n", "42527 NaN NaN Jul-2010 Jun-2010 \n", "42528 NaN NaN Jun-2010 May-2007 \n", "42529 NaN NaN Jul-2010 Jul-2010 \n", "42530 NaN NaN May-2008 Jun-2016 \n", "42531 NaN NaN Mar-2008 Feb-2013 \n", "42532 NaN NaN Jun-2010 Sep-2014 \n", "42533 NaN NaN Jun-2010 May-2007 \n", "42534 NaN NaN Jun-2010 Aug-2007 \n", "42535 NaN NaN Jun-2010 Feb-2015 \n", "42536 NaN NaN NaN NaN \n", "42537 NaN NaN NaN NaN \n", "\n", "[538 rows x 11 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk[useful_obj_cols]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id object\n", "member_id float64\n", "loan_amnt float64\n", "funded_amnt float64\n", "funded_amnt_inv float64\n", "term float64\n", "int_rate object\n", "installment float64\n", "grade object\n", "sub_grade category\n", "emp_title object\n", "emp_length object\n", "home_ownership category\n", "annual_inc float64\n", "verification_status category\n", "issue_d datetime64[ns]\n", "loan_status object\n", "pymnt_plan object\n", "purpose category\n", "title object\n", "zip_code object\n", "addr_state object\n", "dti float64\n", "delinq_2yrs float64\n", "earliest_cr_line datetime64[ns]\n", "inq_last_6mths float64\n", "open_acc float64\n", "pub_rec float64\n", "revol_bal float64\n", "revol_util float64\n", "total_acc float64\n", "initial_list_status object\n", "out_prncp float64\n", "out_prncp_inv float64\n", "total_pymnt float64\n", "total_pymnt_inv float64\n", "total_rec_prncp float64\n", "total_rec_int float64\n", "total_rec_late_fee float64\n", "recoveries float64\n", "collection_recovery_fee float64\n", "last_pymnt_d datetime64[ns]\n", "last_pymnt_amnt float64\n", "last_credit_pull_d datetime64[ns]\n", "collections_12_mths_ex_med float64\n", "policy_code float64\n", "application_type object\n", "acc_now_delinq float64\n", "chargeoff_within_12_mths float64\n", "delinq_amnt float64\n", "pub_rec_bankruptcies float64\n", "tax_liens float64\n", "dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " \n", "chunk.dtypes" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'acc_now_delinq': 32,\n", " 'annual_inc': 7,\n", " 'chargeoff_within_12_mths': 148,\n", " 'collection_recovery_fee': 3,\n", " 'collections_12_mths_ex_med': 148,\n", " 'delinq_2yrs': 32,\n", " 'delinq_amnt': 32,\n", " 'dti': 3,\n", " 'funded_amnt': 3,\n", " 'funded_amnt_inv': 3,\n", " 'inq_last_6mths': 32,\n", " 'installment': 3,\n", " 'last_pymnt_amnt': 3,\n", " 'loan_amnt': 3,\n", " 'member_id': 3,\n", " 'open_acc': 32,\n", " 'out_prncp': 3,\n", " 'out_prncp_inv': 3,\n", " 'policy_code': 3,\n", " 'pub_rec': 32,\n", " 'pub_rec_bankruptcies': 1368,\n", " 'recoveries': 3,\n", " 'revol_bal': 3,\n", " 'revol_util': 93,\n", " 'tax_liens': 108,\n", " 'term': 3,\n", " 'total_acc': 32,\n", " 'total_pymnt': 3,\n", " 'total_pymnt_inv': 3,\n", " 'total_rec_int': 3,\n", " 'total_rec_late_fee': 3,\n", " 'total_rec_prncp': 3}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "mv_counts = {}\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " float_cols = chunk.select_dtypes(include=['float'])\n", " for col in float_cols.columns:\n", " missing_values = len(chunk) - chunk[col].count()\n", " if col in mv_counts:\n", " mv_counts[col] = mv_counts[col] + missing_values\n", " else:\n", " mv_counts[col] = missing_values\n", "mv_counts" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'acc_now_delinq': 32,\n", " 'annual_inc': 7,\n", " 'chargeoff_within_12_mths': 148,\n", " 'collection_recovery_fee': 3,\n", " 'collections_12_mths_ex_med': 148,\n", " 'delinq_2yrs': 32,\n", " 'delinq_amnt': 32,\n", " 'dti': 3,\n", " 'funded_amnt': 3,\n", " 'funded_amnt_inv': 3,\n", " 'inq_last_6mths': 32,\n", " 'installment': 3,\n", " 'last_pymnt_amnt': 3,\n", " 'loan_amnt': 3,\n", " 'member_id': 3,\n", " 'open_acc': 32,\n", " 'out_prncp': 3,\n", " 'out_prncp_inv': 3,\n", " 'policy_code': 3,\n", " 'pub_rec': 32,\n", " 'pub_rec_bankruptcies': 1368,\n", " 'recoveries': 3,\n", " 'revol_bal': 3,\n", " 'revol_util': 93,\n", " 'tax_liens': 108,\n", " 'term': 3,\n", " 'total_acc': 32,\n", " 'total_pymnt': 3,\n", " 'total_pymnt_inv': 3,\n", " 'total_rec_int': 3,\n", " 'total_rec_late_fee': 3,\n", " 'total_rec_prncp': 3}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "mv_counts = {}\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " chunk = chunk.dropna(how='all')\n", " float_cols = chunk.select_dtypes(include=['float'])\n", " for col in float_cols.columns:\n", " missing_values = len(chunk) - chunk[col].count()\n", " if col in mv_counts:\n", " mv_counts[col] = mv_counts[col] + missing_values\n", " else:\n", " mv_counts[col] = missing_values\n", "mv_counts" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.4" } }, "nbformat": 4, "nbformat_minor": 2 }