{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.display.max_columns = 99\n", "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute each column's missing value counts" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "company_country_code 1\n", "company_name 1\n", "company_permalink 1\n", "company_region 1\n", "investor_region 2\n", "investor_permalink 2\n", "investor_name 2\n", "funded_quarter 3\n", "funded_at 3\n", "funded_month 3\n", "funded_year 3\n", "funding_round_type 3\n", "company_state_code 492\n", "company_city 533\n", "company_category_code 643\n", "raised_amount_usd 3599\n", "investor_country_code 12001\n", "investor_city 12480\n", "investor_state_code 16809\n", "investor_category_code 50427\n", "dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mv_list = []\n", "for chunk in chunk_iter:\n", " mv_list.append(chunk.isnull().sum())\n", " \n", "combined_mv_vc = pd.concat(mv_list)\n", "unique_combined_mv_vc = combined_mv_vc.groupby(combined_mv_vc.index).sum()\n", "unique_combined_mv_vc.sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Total memory footprint for each column" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "company_permalink 4057788\n", "company_name 3591326\n", "company_category_code 3421104\n", "company_country_code 3172176\n", "company_state_code 3106051\n", "company_region 3411585\n", "company_city 3505926\n", "investor_permalink 4980548\n", "investor_name 3915666\n", "investor_category_code 622424\n", "investor_country_code 2647292\n", "investor_state_code 2476607\n", "investor_region 3396281\n", "investor_city 2885083\n", "funding_round_type 3410707\n", "funded_at 3542185\n", "funded_month 3383584\n", "funded_quarter 3383584\n", "funded_year 422960\n", "raised_amount_usd 422960\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')\n", "counter = 0\n", "series_memory_fp = pd.Series(dtype='float64')\n", "for chunk in chunk_iter:\n", " if counter == 0:\n", " series_memory_fp = chunk.memory_usage(deep=True)\n", " else:\n", " series_memory_fp += chunk.memory_usage(deep=True)\n", " counter += 1\n", "\n", "# Drop memory footprint calculation for the index.\n", "series_memory_fp = series_memory_fp.drop('Index')\n", "series_memory_fp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Total memory footprint of the data (in megabytes)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "56.9876070022583" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series_memory_fp.sum() / (1024 * 1024)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "company_country_code 1\n", "company_name 1\n", "company_permalink 1\n", "company_region 1\n", "investor_region 2\n", "investor_permalink 2\n", "investor_name 2\n", "funded_quarter 3\n", "funded_at 3\n", "funded_month 3\n", "funded_year 3\n", "funding_round_type 3\n", "company_state_code 492\n", "company_city 533\n", "company_category_code 643\n", "raised_amount_usd 3599\n", "investor_country_code 12001\n", "investor_city 12480\n", "investor_state_code 16809\n", "investor_category_code 50427\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_combined_mv_vc.sort_values()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Drop columns representing URLs or containing too many missing values (>90% missing)\n", "drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']\n", "keep_cols = chunk.columns.drop(drop_cols)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keep_cols.tolist" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting Data Types\n", "\n", "Let's first determine which columns shift types across chunks. Note that we only lay the groundwork for this step." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Key: Column name, Value: List of types\n", "col_types = {}\n", "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=keep_cols)\n", "\n", "for chunk in chunk_iter:\n", " for col in chunk.columns:\n", " if col not in col_types:\n", " col_types[col] = [str(chunk.dtypes[col])]\n", " else:\n", " col_types[col].append(str(chunk.dtypes[col]))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'company_name': {'object'},\n", " 'company_category_code': {'object'},\n", " 'company_country_code': {'object'},\n", " 'company_state_code': {'object'},\n", " 'company_region': {'object'},\n", " 'company_city': {'object'},\n", " 'investor_name': {'object'},\n", " 'investor_country_code': {'float64', 'object'},\n", " 'investor_state_code': {'float64', 'object'},\n", " 'investor_region': {'object'},\n", " 'investor_city': {'float64', 'object'},\n", " 'funding_round_type': {'object'},\n", " 'funded_at': {'object'},\n", " 'funded_month': {'object'},\n", " 'funded_quarter': {'object'},\n", " 'funded_year': {'float64', 'int64'},\n", " 'raised_amount_usd': {'float64'}}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "uniq_col_types = {}\n", "for k,v in col_types.items():\n", " uniq_col_types[k] = set(col_types[k])\n", "uniq_col_types" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_namecompany_category_codecompany_country_codecompany_state_codecompany_regioncompany_cityinvestor_nameinvestor_country_codeinvestor_state_codeinvestor_regioninvestor_cityfunding_round_typefunded_atfunded_monthfunded_quarterfunded_yearraised_amount_usd
50000NuORDERfashionUSACALos AngelesWest HollywoodMortimer SingerNaNNaNunknownNaNseries-a2012-10-012012-102012-Q420123060000.0
50001ChaChaadvertisingUSAINIndianapolisCarmelMorton MeyersonNaNNaNunknownNaNseries-b2007-10-012007-102007-Q4200712000000.0
50002BinfiresoftwareUSAFLBocat RatonBocat RatonMoshe ArielNaNNaNunknownNaNangel2008-04-182008-042008-Q22008500000.0
50003BinfiresoftwareUSAFLBocat RatonBocat RatonMoshe ArielNaNNaNunknownNaNangel2010-01-012010-012010-Q12010750000.0
50004Unified ColorsoftwareUSACASF BaySouth San FrnaciscoMr. Andrew OungNaNNaNunknownNaNangel2010-01-012010-012010-Q12010NaN
......................................................
52865Garantia DataenterpriseUSACASF BaySanta ClaraZohar GilonNaNNaNunknownNaNseries-a2012-08-082012-082012-Q320123800000.0
52866DudaMobilemobileUSACASF BayPalo AltoZohar GilonNaNNaNunknownNaNseries-c+2013-04-082013-042013-Q2201310300000.0
52867SiteBrainssoftwareUSACASF BaySan Franciscozohar israelNaNNaNunknownNaNangel2010-08-012010-082010-Q32010350000.0
52868Comprehend SystemsenterpriseUSACASF BayPalo AltoZorba LiebermanNaNNaNunknownNaNseries-a2013-07-112013-072013-Q320138400000.0
52869SmartThingsmobileUSADCunknownMinneapolisZorik GordonNaNNaNunknownNaNseries-a2012-12-042012-122012-Q420123000000.0
\n", "

2870 rows × 17 columns

\n", "
" ], "text/plain": [ " company_name company_category_code company_country_code \\\n", "50000 NuORDER fashion USA \n", "50001 ChaCha advertising USA \n", "50002 Binfire software USA \n", "50003 Binfire software USA \n", "50004 Unified Color software USA \n", "... ... ... ... \n", "52865 Garantia Data enterprise USA \n", "52866 DudaMobile mobile USA \n", "52867 SiteBrains software USA \n", "52868 Comprehend Systems enterprise USA \n", "52869 SmartThings mobile USA \n", "\n", " company_state_code company_region company_city investor_name \\\n", "50000 CA Los Angeles West Hollywood Mortimer Singer \n", "50001 IN Indianapolis Carmel Morton Meyerson \n", "50002 FL Bocat Raton Bocat Raton Moshe Ariel \n", "50003 FL Bocat Raton Bocat Raton Moshe Ariel \n", "50004 CA SF Bay South San Frnacisco Mr. Andrew Oung \n", "... ... ... ... ... \n", "52865 CA SF Bay Santa Clara Zohar Gilon \n", "52866 CA SF Bay Palo Alto Zohar Gilon \n", "52867 CA SF Bay San Francisco zohar israel \n", "52868 CA SF Bay Palo Alto Zorba Lieberman \n", "52869 DC unknown Minneapolis Zorik Gordon \n", "\n", " investor_country_code investor_state_code investor_region \\\n", "50000 NaN NaN unknown \n", "50001 NaN NaN unknown \n", "50002 NaN NaN unknown \n", "50003 NaN NaN unknown \n", "50004 NaN NaN unknown \n", "... ... ... ... \n", "52865 NaN NaN unknown \n", "52866 NaN NaN unknown \n", "52867 NaN NaN unknown \n", "52868 NaN NaN unknown \n", "52869 NaN NaN unknown \n", "\n", " investor_city funding_round_type funded_at funded_month \\\n", "50000 NaN series-a 2012-10-01 2012-10 \n", "50001 NaN series-b 2007-10-01 2007-10 \n", "50002 NaN angel 2008-04-18 2008-04 \n", "50003 NaN angel 2010-01-01 2010-01 \n", "50004 NaN angel 2010-01-01 2010-01 \n", "... ... ... ... ... \n", "52865 NaN series-a 2012-08-08 2012-08 \n", "52866 NaN series-c+ 2013-04-08 2013-04 \n", "52867 NaN angel 2010-08-01 2010-08 \n", "52868 NaN series-a 2013-07-11 2013-07 \n", "52869 NaN series-a 2012-12-04 2012-12 \n", "\n", " funded_quarter funded_year raised_amount_usd \n", "50000 2012-Q4 2012 3060000.0 \n", "50001 2007-Q4 2007 12000000.0 \n", "50002 2008-Q2 2008 500000.0 \n", "50003 2010-Q1 2010 750000.0 \n", "50004 2010-Q1 2010 NaN \n", "... ... ... ... \n", "52865 2012-Q3 2012 3800000.0 \n", "52866 2013-Q2 2013 10300000.0 \n", "52867 2010-Q3 2010 350000.0 \n", "52868 2013-Q3 2013 8400000.0 \n", "52869 2012-Q4 2012 3000000.0 \n", "\n", "[2870 rows x 17 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading Chunks into SQLite" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "conn = sqlite3.connect('crunchbase.db')\n", "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')\n", "\n", "for chunk in chunk_iter:\n", " chunk.to_sql(\"investments\", conn, if_exists='append', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }