{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_gradeemp_titleemp_lengthhome_ownershipannual_incverification_statusissue_dloan_statuspymnt_planpurposetitlezip_codeaddr_statedtidelinq_2yrsearliest_cr_lineinq_last_6mthsopen_accpub_recrevol_balrevol_utiltotal_accinitial_list_statusout_prncpout_prncp_invtotal_pymnttotal_pymnt_invtotal_rec_prncptotal_rec_inttotal_rec_late_feerecoveriescollection_recovery_feelast_pymnt_dlast_pymnt_amntlast_credit_pull_dcollections_12_mths_ex_medpolicy_codeapplication_typeacc_now_delinqchargeoff_within_12_mthsdelinq_amntpub_rec_bankruptciestax_liens
010775011296599.05000.05000.04975.036 months10.65%162.87BB2NaN10+ yearsRENT24000.0VerifiedDec-2011Fully Paidncredit_cardComputer860xxAZ27.650.0Jan-19851.03.00.013648.083.7%9.0f0.000.005863.1551875833.845000.00863.160.000.000.00Jan-2015171.62Jun-20160.01.0INDIVIDUAL0.00.00.00.00.0
110774301314167.02500.02500.02500.060 months15.27%59.83CC4Ryder< 1 yearRENT30000.0Source VerifiedDec-2011Charged Offncarbike309xxGA1.000.0Apr-19995.03.00.01687.09.4%4.0f0.000.001008.7100001008.71456.46435.170.00117.081.11Apr-2013119.66Sep-20130.01.0INDIVIDUAL0.00.00.00.00.0
210771751313524.02400.02400.02400.036 months15.96%84.33CC5NaN10+ yearsRENT12252.0Not VerifiedDec-2011Fully Paidnsmall_businessreal estate business606xxIL8.720.0Nov-20012.02.00.02956.098.5%10.0f0.000.003005.6668443005.672400.00605.670.000.000.00Jun-2014649.91Jun-20160.01.0INDIVIDUAL0.00.00.00.00.0
310768631277178.010000.010000.010000.036 months13.49%339.31CC1AIR RESOURCES BOARD10+ yearsRENT49200.0Source VerifiedDec-2011Fully Paidnotherpersonel917xxCA20.000.0Feb-19961.010.00.05598.021%37.0f0.000.0012231.89000012231.8910000.002214.9216.970.000.00Jan-2015357.48Apr-20160.01.0INDIVIDUAL0.00.00.00.00.0
410753581311748.03000.03000.03000.060 months12.69%67.79BB5University Medical Group1 yearRENT80000.0Source VerifiedDec-2011CurrentnotherPersonal972xxOR17.940.0Jan-19960.015.00.027783.053.9%38.0f461.73461.733581.1200003581.122538.271042.850.000.000.00Jun-201667.79Jun-20160.01.0INDIVIDUAL0.00.00.00.00.0
\n", "
" ], "text/plain": [ " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", "0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months \n", "1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months \n", "2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months \n", "3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months \n", "4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months \n", "\n", " int_rate installment grade sub_grade emp_title emp_length \\\n", "0 10.65% 162.87 B B2 NaN 10+ years \n", "1 15.27% 59.83 C C4 Ryder < 1 year \n", "2 15.96% 84.33 C C5 NaN 10+ years \n", "3 13.49% 339.31 C C1 AIR RESOURCES BOARD 10+ years \n", "4 12.69% 67.79 B B5 University Medical Group 1 year \n", "\n", " home_ownership annual_inc verification_status issue_d loan_status \\\n", "0 RENT 24000.0 Verified Dec-2011 Fully Paid \n", "1 RENT 30000.0 Source Verified Dec-2011 Charged Off \n", "2 RENT 12252.0 Not Verified Dec-2011 Fully Paid \n", "3 RENT 49200.0 Source Verified Dec-2011 Fully Paid \n", "4 RENT 80000.0 Source Verified Dec-2011 Current \n", "\n", " pymnt_plan purpose title zip_code addr_state dti \\\n", "0 n credit_card Computer 860xx AZ 27.65 \n", "1 n car bike 309xx GA 1.00 \n", "2 n small_business real estate business 606xx IL 8.72 \n", "3 n other personel 917xx CA 20.00 \n", "4 n other Personal 972xx OR 17.94 \n", "\n", " delinq_2yrs earliest_cr_line inq_last_6mths open_acc pub_rec revol_bal \\\n", "0 0.0 Jan-1985 1.0 3.0 0.0 13648.0 \n", "1 0.0 Apr-1999 5.0 3.0 0.0 1687.0 \n", "2 0.0 Nov-2001 2.0 2.0 0.0 2956.0 \n", "3 0.0 Feb-1996 1.0 10.0 0.0 5598.0 \n", "4 0.0 Jan-1996 0.0 15.0 0.0 27783.0 \n", "\n", " revol_util total_acc initial_list_status out_prncp out_prncp_inv \\\n", "0 83.7% 9.0 f 0.00 0.00 \n", "1 9.4% 4.0 f 0.00 0.00 \n", "2 98.5% 10.0 f 0.00 0.00 \n", "3 21% 37.0 f 0.00 0.00 \n", "4 53.9% 38.0 f 461.73 461.73 \n", "\n", " total_pymnt total_pymnt_inv total_rec_prncp total_rec_int \\\n", "0 5863.155187 5833.84 5000.00 863.16 \n", "1 1008.710000 1008.71 456.46 435.17 \n", "2 3005.666844 3005.67 2400.00 605.67 \n", "3 12231.890000 12231.89 10000.00 2214.92 \n", "4 3581.120000 3581.12 2538.27 1042.85 \n", "\n", " total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d \\\n", "0 0.00 0.00 0.00 Jan-2015 \n", "1 0.00 117.08 1.11 Apr-2013 \n", "2 0.00 0.00 0.00 Jun-2014 \n", "3 16.97 0.00 0.00 Jan-2015 \n", "4 0.00 0.00 0.00 Jun-2016 \n", "\n", " last_pymnt_amnt last_credit_pull_d collections_12_mths_ex_med \\\n", "0 171.62 Jun-2016 0.0 \n", "1 119.66 Sep-2013 0.0 \n", "2 649.91 Jun-2016 0.0 \n", "3 357.48 Apr-2016 0.0 \n", "4 67.79 Jun-2016 0.0 \n", "\n", " policy_code application_type acc_now_delinq chargeoff_within_12_mths \\\n", "0 1.0 INDIVIDUAL 0.0 0.0 \n", "1 1.0 INDIVIDUAL 0.0 0.0 \n", "2 1.0 INDIVIDUAL 0.0 0.0 \n", "3 1.0 INDIVIDUAL 0.0 0.0 \n", "4 1.0 INDIVIDUAL 0.0 0.0 \n", "\n", " delinq_amnt pub_rec_bankruptcies tax_liens \n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "pd.options.display.max_columns = 99\n", "\n", "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n", "first_five" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.5502548217773438" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n", "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's try tripling to 3000 rows and calculate the memory footprint for each chunk." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.649059295654297\n", "4.644805908203125\n", "4.646563529968262\n", "4.647915840148926\n", "4.644108772277832\n", "4.645991325378418\n", "4.644582748413086\n", "4.646951675415039\n", "4.645077705383301\n", "4.64512825012207\n", "4.657840728759766\n", "4.656707763671875\n", "4.663515090942383\n", "4.896956443786621\n", "0.880854606628418\n" ] } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "for chunk in chunk_iter:\n", " print(chunk.memory_usage(deep=True).sum()/(1024*1024))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How many rows in the data set?" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "42538\n" ] } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "total_rows = 0\n", "for chunk in chunk_iter:\n", " total_rows += len(chunk)\n", "print(total_rows)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring the Data in Chunks\n", "\n", "## How many columns have a numeric type? How many have a string type?" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]\n", "[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]\n" ] } ], "source": [ "# Numeric columns\n", "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "numeric = []\n", "string = []\n", "for lc in loans_chunks:\n", " nums = lc.select_dtypes(include=[np.number]).shape[1]\n", " numeric.append(nums)\n", " strs = lc.select_dtypes(include=['object']).shape[1]\n", " string.append(strs)\n", "\n", "print(numeric)\n", "print(string)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n", "chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] \n", "\n" ] } ], "source": [ "# Are string columns consistent across chunks?\n", "obj_cols = []\n", "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "\n", "for chunk in chunk_iter:\n", " chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n", " if len(obj_cols) > 0:\n", " is_same = obj_cols == chunk_obj_cols\n", " if not is_same:\n", " print(\"overall obj cols:\", obj_cols, \"\\n\")\n", " print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\") \n", " else:\n", " obj_cols = chunk_obj_cols" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Observation 1: By default -- 31 numeric columns and 21 string columns.\n", "\n", "### Observation 2: It seems like one column in particular (the `id` column) is being cast to int64 in the last 2 chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling let's ignore this column.\n", "\n", "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "term 2\n", "grade 7\n", "sub_grade 35\n", "emp_length 11\n", "home_ownership 5\n", "verification_status 3\n", "loan_status 9\n", "pymnt_plan 2\n", "purpose 14\n", "initial_list_status 1\n", "application_type 1\n" ] } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "uniques = {}\n", "for lc in loans_chunks:\n", " strings_only = lc.select_dtypes(include=['object'])\n", " cols = strings_only.columns\n", " for c in cols:\n", " val_counts = strings_only[c].value_counts()\n", " if c in uniques:\n", " uniques[c].append(val_counts)\n", " else:\n", " uniques[c] = [val_counts]\n", "\n", "uniques_combined = {}\n", "unique_stats = {\n", " 'column_name': [],\n", " 'total_values': [],\n", " 'unique_values': [],\n", "}\n", "for col in uniques:\n", " u_concat = pd.concat(uniques[col])\n", " u_group = u_concat.groupby(u_concat.index).sum()\n", " uniques_combined[col] = u_group\n", " if u_group.shape[0] < 50:\n", " print(col, u_group.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which float columns have no missing values and could be candidates for conversion to the integer type?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "member_id 3\n", "total_rec_int 3\n", "total_pymnt_inv 3\n", "total_pymnt 3\n", "revol_bal 3\n", "recoveries 3\n", "policy_code 3\n", "out_prncp_inv 3\n", "out_prncp 3\n", "total_rec_late_fee 3\n", "loan_amnt 3\n", "last_pymnt_amnt 3\n", "total_rec_prncp 3\n", "funded_amnt_inv 3\n", "funded_amnt 3\n", "dti 3\n", "collection_recovery_fee 3\n", "installment 3\n", "annual_inc 7\n", "inq_last_6mths 32\n", "total_acc 32\n", "delinq_2yrs 32\n", "pub_rec 32\n", "delinq_amnt 32\n", "open_acc 32\n", "acc_now_delinq 32\n", "tax_liens 108\n", "collections_12_mths_ex_med 148\n", "chargeoff_within_12_mths 148\n", "pub_rec_bankruptcies 1368\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "missing = []\n", "for lc in loans_chunks:\n", " floats = lc.select_dtypes(include=['float'])\n", " missing.append(floats.apply(pd.isnull).sum())\n", "\n", "combined_missing = pd.concat(missing)\n", "combined_missing.groupby(combined_missing.index).sum().sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate the total memory usage across all of the chunks." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "66.21605968475342" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3000)\n", "\n", "mem_usage = []\n", "\n", "for lc in loans_chunks:\n", " mem_usage.append(lc.memory_usage(deep=True).sum() / 1024 ** 2)\n", "\n", "sum(mem_usage)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optimizing String Columns\n", "\n", "### Determine which string columns you can convert to a numeric type if you clean them. Let's focus on columns that would actually be useful for analysis and modelling." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['term',\n", " 'int_rate',\n", " 'grade',\n", " 'sub_grade',\n", " 'emp_title',\n", " 'emp_length',\n", " 'home_ownership',\n", " 'verification_status',\n", " 'issue_d',\n", " 'loan_status',\n", " 'pymnt_plan',\n", " 'purpose',\n", " 'title',\n", " 'zip_code',\n", " 'addr_state',\n", " 'earliest_cr_line',\n", " 'revol_util',\n", " 'initial_list_status',\n", " 'last_pymnt_d',\n", " 'last_credit_pull_d',\n", " 'application_type']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_cols" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "## Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n", "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n", "str_cols_vc = {}\n", "for chunk in chunk_iter:\n", " str_cols = chunk.select_dtypes(include=['object'])\n", " for col in str_cols.columns:\n", " current_col_vc = str_cols[col].value_counts()\n", " if col in str_cols_vc:\n", " str_cols_vc[col].append(current_col_vc)\n", " else:\n", " str_cols_vc[col] = [current_col_vc]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "## Combine the value counts.\n", "combined_vcs = {}\n", "\n", "for col in str_cols_vc:\n", " combined_vc = pd.concat(str_cols_vc[col])\n", " final_vc = combined_vc.groupby(combined_vc.index).sum()\n", " combined_vcs[col] = final_vc" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "term\n", " 36 months 31534\n", " 60 months 11001\n", "Name: term, dtype: int64\n", "-----------\n", "sub_grade\n", "A1 1142\n", "A2 1520\n", "A3 1823\n", "A4 2905\n", "A5 2793\n", "B1 1882\n", "B2 2113\n", "B3 2997\n", "B4 2590\n", "B5 2807\n", "C1 2264\n", "C2 2157\n", "C3 1658\n", "C4 1370\n", "C5 1291\n", "D1 1053\n", "D2 1485\n", "D3 1322\n", "D4 1140\n", "D5 1016\n", "E1 884\n", "E2 791\n", "E3 668\n", "E4 552\n", "E5 499\n", "F1 392\n", "F2 308\n", "F3 236\n", "F4 211\n", "F5 154\n", "G1 141\n", "G2 107\n", "G3 79\n", "G4 99\n", "G5 86\n", "Name: sub_grade, dtype: int64\n", "-----------\n", "emp_title\n", " old palm inc 1\n", " Brocade Communications 1\n", " CenturyLink 1\n", " Department of Homeland Security 1\n", " Down To Earth Distributors, Inc. 1\n", " ..\n", "zashko inc. 1\n", "zeno office solutions 1\n", "zion lutheran school 1\n", "zoll medical corp 1\n", "zozaya officiating 1\n", "Name: emp_title, Length: 30658, dtype: int64\n", "-----------\n", "home_ownership\n", "MORTGAGE 18959\n", "NONE 8\n", "OTHER 136\n", "OWN 3251\n", "RENT 20181\n", "Name: home_ownership, dtype: int64\n", "-----------\n", "verification_status\n", "Not Verified 18758\n", "Source Verified 10306\n", "Verified 13471\n", "Name: verification_status, dtype: int64\n", "-----------\n", "issue_d\n", "Apr-2008 259\n", "Apr-2009 333\n", "Apr-2010 912\n", "Apr-2011 1563\n", "Aug-2007 74\n", "Aug-2008 100\n", "Aug-2009 446\n", "Aug-2010 1175\n", "Aug-2011 1934\n", "Dec-2007 172\n", "Dec-2008 253\n", "Dec-2009 658\n", "Dec-2010 1335\n", "Dec-2011 2267\n", "Feb-2008 306\n", "Feb-2009 302\n", "Feb-2010 682\n", "Feb-2011 1298\n", "Jan-2008 305\n", "Jan-2009 269\n", "Jan-2010 662\n", "Jan-2011 1380\n", "Jul-2007 63\n", "Jul-2008 141\n", "Jul-2009 411\n", "Jul-2010 1204\n", "Jul-2011 1875\n", "Jun-2007 24\n", "Jun-2008 124\n", "Jun-2009 406\n", "Jun-2010 1105\n", "Jun-2011 1835\n", "Mar-2008 402\n", "Mar-2009 324\n", "Mar-2010 828\n", "Mar-2011 1448\n", "May-2008 115\n", "May-2009 359\n", "May-2010 989\n", "May-2011 1704\n", "Nov-2007 112\n", "Nov-2008 209\n", "Nov-2009 662\n", "Nov-2010 1224\n", "Nov-2011 2232\n", "Oct-2007 105\n", "Oct-2008 122\n", "Oct-2009 604\n", "Oct-2010 1232\n", "Oct-2011 2118\n", "Sep-2007 53\n", "Sep-2008 57\n", "Sep-2009 507\n", "Sep-2010 1189\n", "Sep-2011 2067\n", "Name: issue_d, dtype: int64\n", "-----------\n", "purpose\n", "car 1615\n", "credit_card 5477\n", "debt_consolidation 19776\n", "educational 422\n", "home_improvement 3199\n", "house 426\n", "major_purchase 2311\n", "medical 753\n", "moving 629\n", "other 4425\n", "renewable_energy 106\n", "small_business 1992\n", "vacation 400\n", "wedding 1004\n", "Name: purpose, dtype: int64\n", "-----------\n", "earliest_cr_line\n", "Apr-1964 3\n", "Apr-1966 1\n", "Apr-1967 4\n", "Apr-1968 1\n", "Apr-1969 1\n", " ... \n", "Sep-2004 221\n", "Sep-2005 162\n", "Sep-2006 150\n", "Sep-2007 63\n", "Sep-2008 8\n", "Name: earliest_cr_line, Length: 530, dtype: int64\n", "-----------\n", "revol_util\n", "0% 1070\n", "0.01% 1\n", "0.03% 1\n", "0.04% 1\n", "0.05% 1\n", " ... \n", "99.5% 24\n", "99.6% 27\n", "99.7% 32\n", "99.8% 25\n", "99.9% 29\n", "Name: revol_util, Length: 1119, dtype: int64\n", "-----------\n", "last_pymnt_d\n", "Apr-2008 23\n", "Apr-2009 72\n", "Apr-2010 145\n", "Apr-2011 519\n", "Apr-2012 781\n", " ... \n", "Sep-2011 491\n", "Sep-2012 802\n", "Sep-2013 712\n", "Sep-2014 694\n", "Sep-2015 211\n", "Name: last_pymnt_d, Length: 103, dtype: int64\n", "-----------\n", "last_credit_pull_d\n", "Apr-2009 24\n", "Apr-2010 77\n", "Apr-2011 177\n", "Apr-2012 326\n", "Apr-2013 445\n", " ... \n", "Sep-2011 175\n", "Sep-2012 414\n", "Sep-2013 408\n", "Sep-2014 564\n", "Sep-2015 531\n", "Name: last_credit_pull_d, Length: 108, dtype: int64\n", "-----------\n" ] } ], "source": [ "for col in useful_obj_cols:\n", " print(col)\n", " print(combined_vcs[col])\n", " print(\"-----------\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert to category" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "convert_col_dtypes = {\n", " \"sub_grade\": \"category\", \"home_ownership\": \"category\", \n", " \"verification_status\": \"category\", \"purpose\": \"category\"\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert `term` and `revol_util` to numerical by data cleaning.\n", "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
termsub_gradeemp_titlehome_ownershipverification_statusissue_dpurposeearliest_cr_linerevol_utillast_pymnt_dlast_credit_pull_d
4200036 monthsC2Best BuyRENTNot VerifiedFeb-2008debt_consolidationJul-2000100.7%Feb-2011Jun-2016
4200136 monthsG2CVS PHARMACYOWNNot VerifiedFeb-2008debt_consolidationMar-198951.9%Nov-2008Jun-2016
4200236 monthsE4General MotorsRENTNot VerifiedFeb-2008debt_consolidationDec-199880.7%Feb-2011Jun-2016
4200336 monthsG4usa medical centerRENTNot VerifiedFeb-2008debt_consolidationJul-199557.2%Feb-2011Jun-2011
4200436 monthsB3InvestSource IncRENTNot VerifiedFeb-2008debt_consolidationSep-200574%Mar-2010Aug-2010
....................................
4253336 monthsB3NaNRENTNot VerifiedJun-2007otherNaNNaNJun-2010May-2007
4253436 monthsA5NaNNONENot VerifiedJun-2007otherNaNNaNJun-2010Aug-2007
4253536 monthsA3HomemakerMORTGAGENot VerifiedJun-2007otherNaNNaNJun-2010Feb-2015
42536NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
42537NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

538 rows × 11 columns

\n", "
" ], "text/plain": [ " term sub_grade emp_title home_ownership \\\n", "42000 36 months C2 Best Buy RENT \n", "42001 36 months G2 CVS PHARMACY OWN \n", "42002 36 months E4 General Motors RENT \n", "42003 36 months G4 usa medical center RENT \n", "42004 36 months B3 InvestSource Inc RENT \n", "... ... ... ... ... \n", "42533 36 months B3 NaN RENT \n", "42534 36 months A5 NaN NONE \n", "42535 36 months A3 Homemaker MORTGAGE \n", "42536 NaN NaN NaN NaN \n", "42537 NaN NaN NaN NaN \n", "\n", " verification_status issue_d purpose earliest_cr_line \\\n", "42000 Not Verified Feb-2008 debt_consolidation Jul-2000 \n", "42001 Not Verified Feb-2008 debt_consolidation Mar-1989 \n", "42002 Not Verified Feb-2008 debt_consolidation Dec-1998 \n", "42003 Not Verified Feb-2008 debt_consolidation Jul-1995 \n", "42004 Not Verified Feb-2008 debt_consolidation Sep-2005 \n", "... ... ... ... ... \n", "42533 Not Verified Jun-2007 other NaN \n", "42534 Not Verified Jun-2007 other NaN \n", "42535 Not Verified Jun-2007 other NaN \n", "42536 NaN NaN NaN NaN \n", "42537 NaN NaN NaN NaN \n", "\n", " revol_util last_pymnt_d last_credit_pull_d \n", "42000 100.7% Feb-2011 Jun-2016 \n", "42001 51.9% Nov-2008 Jun-2016 \n", "42002 80.7% Feb-2011 Jun-2016 \n", "42003 57.2% Feb-2011 Jun-2011 \n", "42004 74% Mar-2010 Aug-2010 \n", "... ... ... ... \n", "42533 NaN Jun-2010 May-2007 \n", "42534 NaN Jun-2010 Aug-2007 \n", "42535 NaN Jun-2010 Feb-2015 \n", "42536 NaN NaN NaN \n", "42537 NaN NaN NaN \n", "\n", "[538 rows x 11 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk[useful_obj_cols]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id object\n", "member_id float64\n", "loan_amnt float64\n", "funded_amnt float64\n", "funded_amnt_inv float64\n", "term float64\n", "int_rate object\n", "installment float64\n", "grade object\n", "sub_grade category\n", "emp_title object\n", "emp_length object\n", "home_ownership category\n", "annual_inc float64\n", "verification_status category\n", "issue_d datetime64[ns]\n", "loan_status object\n", "pymnt_plan object\n", "purpose category\n", "title object\n", "zip_code object\n", "addr_state object\n", "dti float64\n", "delinq_2yrs float64\n", "earliest_cr_line datetime64[ns]\n", "inq_last_6mths float64\n", "open_acc float64\n", "pub_rec float64\n", "revol_bal float64\n", "revol_util float64\n", "total_acc float64\n", "initial_list_status object\n", "out_prncp float64\n", "out_prncp_inv float64\n", "total_pymnt float64\n", "total_pymnt_inv float64\n", "total_rec_prncp float64\n", "total_rec_int float64\n", "total_rec_late_fee float64\n", "recoveries float64\n", "collection_recovery_fee float64\n", "last_pymnt_d datetime64[ns]\n", "last_pymnt_amnt float64\n", "last_credit_pull_d datetime64[ns]\n", "collections_12_mths_ex_med float64\n", "policy_code float64\n", "application_type object\n", "acc_now_delinq float64\n", "chargeoff_within_12_mths float64\n", "delinq_amnt float64\n", "pub_rec_bankruptcies float64\n", "tax_liens float64\n", "dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " \n", "chunk.dtypes" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'member_id': 3,\n", " 'loan_amnt': 3,\n", " 'funded_amnt': 3,\n", " 'funded_amnt_inv': 3,\n", " 'installment': 3,\n", " 'annual_inc': 7,\n", " 'dti': 3,\n", " 'delinq_2yrs': 32,\n", " 'inq_last_6mths': 32,\n", " 'open_acc': 32,\n", " 'pub_rec': 32,\n", " 'revol_bal': 3,\n", " 'revol_util': 93,\n", " 'total_acc': 32,\n", " 'out_prncp': 3,\n", " 'out_prncp_inv': 3,\n", " 'total_pymnt': 3,\n", " 'total_pymnt_inv': 3,\n", " 'total_rec_prncp': 3,\n", " 'total_rec_int': 3,\n", " 'total_rec_late_fee': 3,\n", " 'recoveries': 3,\n", " 'collection_recovery_fee': 3,\n", " 'last_pymnt_amnt': 3,\n", " 'collections_12_mths_ex_med': 148,\n", " 'policy_code': 3,\n", " 'acc_now_delinq': 32,\n", " 'chargeoff_within_12_mths': 148,\n", " 'delinq_amnt': 32,\n", " 'pub_rec_bankruptcies': 1368,\n", " 'tax_liens': 108,\n", " 'term': 3}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "mv_counts = {}\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " float_cols = chunk.select_dtypes(include=['float'])\n", " for col in float_cols.columns:\n", " missing_values = len(chunk) - chunk[col].count()\n", " if col in mv_counts:\n", " mv_counts[col] = mv_counts[col] + missing_values\n", " else:\n", " mv_counts[col] = missing_values\n", "mv_counts" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'member_id': 3,\n", " 'loan_amnt': 3,\n", " 'funded_amnt': 3,\n", " 'funded_amnt_inv': 3,\n", " 'installment': 3,\n", " 'annual_inc': 7,\n", " 'dti': 3,\n", " 'delinq_2yrs': 32,\n", " 'inq_last_6mths': 32,\n", " 'open_acc': 32,\n", " 'pub_rec': 32,\n", " 'revol_bal': 3,\n", " 'revol_util': 93,\n", " 'total_acc': 32,\n", " 'out_prncp': 3,\n", " 'out_prncp_inv': 3,\n", " 'total_pymnt': 3,\n", " 'total_pymnt_inv': 3,\n", " 'total_rec_prncp': 3,\n", " 'total_rec_int': 3,\n", " 'total_rec_late_fee': 3,\n", " 'recoveries': 3,\n", " 'collection_recovery_fee': 3,\n", " 'last_pymnt_amnt': 3,\n", " 'collections_12_mths_ex_med': 148,\n", " 'policy_code': 3,\n", " 'acc_now_delinq': 32,\n", " 'chargeoff_within_12_mths': 148,\n", " 'delinq_amnt': 32,\n", " 'pub_rec_bankruptcies': 1368,\n", " 'tax_liens': 108,\n", " 'term': 3}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n", "mv_counts = {}\n", "for chunk in chunk_iter:\n", " term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n", " revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n", " chunk['term'] = pd.to_numeric(term_cleaned)\n", " chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n", " chunk = chunk.dropna(how='all')\n", " float_cols = chunk.select_dtypes(include=['float'])\n", " for col in float_cols.columns:\n", " missing_values = len(chunk) - chunk[col].count()\n", " if col in mv_counts:\n", " mv_counts[col] = mv_counts[col] + missing_values\n", " else:\n", " mv_counts[col] = missing_values\n", "mv_counts" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 2 }