@@ -0,0 +1,367 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Introduction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "pd.options.display.max_columns = 99\n",
+    "first_five = pd.read_csv('loans_2007.csv', nrows=5)\n",
+    "first_five"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "thousand_chunk = pd.read_csv('loans_2007.csv', nrows=1000)\n",
+    "thousand_chunk.memory_usage(deep=True).sum()/(1024*1024)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Let's triple the chunk size to 3,000 rows and calculate the memory footprint of each chunk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "for chunk in chunk_iter:\n",
+    "    print(chunk.memory_usage(deep=True).sum()/(1024*1024))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How many rows are in the data set?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "total_rows = 0\n",
+    "for chunk in chunk_iter:\n",
+    "    total_rows += len(chunk)\n",
+    "print(total_rows)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exploring the Data in Chunks\n",
+    "\n",
+    "## How many columns have a numeric type? How many have a string type?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Count column dtypes in each chunk\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "for chunk in chunk_iter:\n",
+    "    print(chunk.dtypes.value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Are the string columns consistent across chunks?\n",
+    "obj_cols = []\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "\n",
+    "for chunk in chunk_iter:\n",
+    "    chunk_obj_cols = chunk.select_dtypes(include=['object']).columns.tolist()\n",
+    "    if len(obj_cols) > 0:\n",
+    "        is_same = obj_cols == chunk_obj_cols\n",
+    "        if not is_same:\n",
+    "            print(\"overall obj cols:\", obj_cols, \"\\n\")\n",
+    "            print(\"chunk obj cols:\", chunk_obj_cols, \"\\n\")\n",
+    "    else:\n",
+    "        obj_cols = chunk_obj_cols"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "### Observation 1: By default, pandas infers 31 numeric columns and 21 string columns.\n",
+    "\n",
+    "### Observation 2: One column in particular (the `id` column) is cast to int64 in the last two chunks but not in the earlier chunks. Since the `id` column won't be useful for analysis, visualization, or predictive modelling, let's ignore it.\n",
+    "\n",
+    "## How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "## Create a dictionary (key: column, value: list of Series objects representing each chunk's value counts)\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)\n",
+    "str_cols_vc = {}\n",
+    "for chunk in chunk_iter:\n",
+    "    str_cols = chunk.select_dtypes(include=['object'])\n",
+    "    for col in str_cols.columns:\n",
+    "        current_col_vc = str_cols[col].value_counts()\n",
+    "        if col in str_cols_vc:\n",
+    "            str_cols_vc[col].append(current_col_vc)\n",
+    "        else:\n",
+    "            str_cols_vc[col] = [current_col_vc]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## Combine each column's value counts into a single Series.\n",
+    "combined_vcs = {}\n",
+    "\n",
+    "for col in str_cols_vc:\n",
+    "    combined_vc = pd.concat(str_cols_vc[col])\n",
+    "    final_vc = combined_vc.groupby(combined_vc.index).sum()\n",
+    "    combined_vcs[col] = final_vc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "combined_vcs.keys()"
+   ]
+  },
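+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### To actually answer the uniqueness question, here's a quick sketch that compares each column's number of unique values against the total row count from earlier. (It divides by `total_rows`, which includes missing values, so the ratios are slight underestimates.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Approximate uniqueness of each string column using the combined value counts.\n",
+    "# Assumes `combined_vcs` and `total_rows` from the cells above.\n",
+    "for col, vc in combined_vcs.items():\n",
+    "    unique_ratio = len(vc) / total_rows\n",
+    "    print(col, len(vc), round(unique_ratio * 100, 2))"
+   ]
+  },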
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optimizing String Columns\n",
+    "\n",
+    "### Determine which string columns can be converted to a numeric type after cleaning. Let's focus on columns that would actually be useful for analysis and modelling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "obj_cols"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "useful_obj_cols = ['term', 'sub_grade', 'emp_title', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "for col in useful_obj_cols:\n",
+    "    print(col)\n",
+    "    print(combined_vcs[col])\n",
+    "    print(\"-----------\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Convert the low-cardinality columns to the category dtype"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "convert_col_dtypes = {\n",
+    "    \"sub_grade\": \"category\", \"home_ownership\": \"category\",\n",
+    "    \"verification_status\": \"category\", \"purpose\": \"category\"\n",
+    "}"
+   ]
+  },
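+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### As a quick sanity check (a minimal sketch, not part of the original walkthrough), we can read one 3,000-row chunk with and without the category dtypes and compare memory footprints. The `before`/`after` names are just illustrative locals."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Compare one chunk's memory footprint (MB) before and after applying the category dtypes\n",
+    "before = pd.read_csv('loans_2007.csv', nrows=3000)\n",
+    "after = pd.read_csv('loans_2007.csv', nrows=3000, dtype=convert_col_dtypes)\n",
+    "print(before.memory_usage(deep=True).sum()/(1024*1024))\n",
+    "print(after.memory_usage(deep=True).sum()/(1024*1024))"
+   ]
+  },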
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Convert `term` and `revol_util` to numeric by cleaning the strings.\n",
+    "### Convert `issue_d`, `earliest_cr_line`, `last_pymnt_d`, and `last_credit_pull_d` to datetime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "chunk[useful_obj_cols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "\n",
+    "for chunk in chunk_iter:\n",
+    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
+    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
+    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
+    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
+    "\n",
+    "chunk.dtypes"
+   ]
+  },
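+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### For comparison with the per-chunk footprints at the start, here's a sketch that totals memory usage across all chunks with every optimization applied. The `total_mb` accumulator is an assumption for illustration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Total memory (MB) across chunks with the category, numeric, and datetime conversions applied\n",
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "total_mb = 0\n",
+    "for chunk in chunk_iter:\n",
+    "    chunk['term'] = pd.to_numeric(chunk['term'].str.lstrip(\" \").str.rstrip(\" months\"))\n",
+    "    chunk['revol_util'] = pd.to_numeric(chunk['revol_util'].str.rstrip(\"%\"))\n",
+    "    total_mb += chunk.memory_usage(deep=True).sum()/(1024*1024)\n",
+    "print(total_mb)"
+   ]
+  },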
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "mv_counts = {}\n",
+    "for chunk in chunk_iter:\n",
+    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
+    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
+    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
+    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
+    "    float_cols = chunk.select_dtypes(include=['float'])\n",
+    "    for col in float_cols.columns:\n",
+    "        missing_values = len(chunk) - chunk[col].count()\n",
+    "        if col in mv_counts:\n",
+    "            mv_counts[col] = mv_counts[col] + missing_values\n",
+    "        else:\n",
+    "            mv_counts[col] = missing_values\n",
+    "mv_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000, dtype=convert_col_dtypes, parse_dates=[\"issue_d\", \"earliest_cr_line\", \"last_pymnt_d\", \"last_credit_pull_d\"])\n",
+    "mv_counts = {}\n",
+    "for chunk in chunk_iter:\n",
+    "    term_cleaned = chunk['term'].str.lstrip(\" \").str.rstrip(\" months\")\n",
+    "    revol_cleaned = chunk['revol_util'].str.rstrip(\"%\")\n",
+    "    chunk['term'] = pd.to_numeric(term_cleaned)\n",
+    "    chunk['revol_util'] = pd.to_numeric(revol_cleaned)\n",
+    "    chunk = chunk.dropna(how='all')\n",
+    "    float_cols = chunk.select_dtypes(include=['float'])\n",
+    "    for col in float_cols.columns:\n",
+    "        missing_values = len(chunk) - chunk[col].count()\n",
+    "        if col in mv_counts:\n",
+    "            mv_counts[col] = mv_counts[col] + missing_values\n",
+    "        else:\n",
+    "            mv_counts[col] = missing_values\n",
+    "mv_counts"
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}