{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.display.max_columns = 99\n", "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute each column's missing value counts" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "company_country_code 1\n", "company_name 1\n", "company_permalink 1\n", "company_region 1\n", "investor_region 2\n", "investor_permalink 2\n", "investor_name 2\n", "funded_quarter 3\n", "funded_at 3\n", "funded_month 3\n", "funded_year 3\n", "funding_round_type 3\n", "company_state_code 492\n", "company_city 533\n", "company_category_code 643\n", "raised_amount_usd 3599\n", "investor_country_code 12001\n", "investor_city 12480\n", "investor_state_code 16809\n", "investor_category_code 50427\n", "dtype: int64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mv_list = []\n", "for chunk in chunk_iter:\n", " mv_list.append(chunk.isnull().sum())\n", " \n", "combined_mv_vc = pd.concat(mv_list)\n", "unique_combined_mv_vc = combined_mv_vc.groupby(combined_mv_vc.index).sum()\n", "unique_combined_mv_vc.sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Total memory footprint for each column" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "company_permalink 4057788\n", "company_name 3591326\n", "company_category_code 3421104\n", "company_country_code 3172176\n", "company_state_code 3106051\n", "company_region 3411585\n", "company_city 3505926\n", "investor_permalink 4980548\n", "investor_name 3915666\n", "investor_category_code 622424\n", "investor_country_code 2647292\n", "investor_state_code 2476607\n", "investor_region 3396281\n", "investor_city 2885083\n", "funding_round_type 3410707\n", "funded_at 3542185\n", "funded_month 3383584\n", "funded_quarter 3383584\n", "funded_year 422960\n", "raised_amount_usd 422960\n", "dtype: int64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')\n", "counter = 0\n", "series_memory_fp = pd.Series()\n", "for chunk in chunk_iter:\n", " if counter == 0:\n", " series_memory_fp = chunk.memory_usage(deep=True)\n", " else:\n", " series_memory_fp += chunk.memory_usage(deep=True)\n", " counter += 1\n", "\n", "# Drop memory footprint calculation for the index.\n", "series_memory_fp = series_memory_fp.drop('Index')\n", "series_memory_fp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Total memory footprint of the data (in megabytes)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "56.987607002258301" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series_memory_fp.sum() / (1024 * 1024)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "company_country_code 1\n", "company_name 1\n", "company_permalink 1\n", "company_region 1\n", "investor_region 2\n", "investor_permalink 2\n", "investor_name 2\n", "funded_quarter 3\n", "funded_at 3\n", "funded_month 3\n", "funded_year 3\n", "funding_round_type 3\n", "company_state_code 492\n", "company_city 533\n", "company_category_code 643\n", "raised_amount_usd 3599\n", "investor_country_code 12001\n", "investor_city 12480\n", "investor_state_code 16809\n", "investor_category_code 50427\n", "dtype: int64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_combined_mv_vc.sort_values()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Drop columns representing URL's or containing way too many missing values (>90% missing)\n", "drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']\n", "keep_cols = chunk.columns.drop(drop_cols)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['company_name', 'company_category_code', 'company_country_code',\n", " 'company_state_code', 'company_region', 'company_city', 'investor_name',\n", " 'investor_country_code', 'investor_state_code', 'investor_region',\n", " 'investor_city', 'funding_round_type', 'funded_at', 'funded_month',\n", " 'funded_quarter', 'funded_year', 'raised_amount_usd'],\n", " dtype='object')" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keep_cols.tolist" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting Data Types\n", "\n", "Let's first determine which columns shift types across chunks. Note that we only lay the groundwork for this step." ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Key: Column name, Value: List of types\n", "col_types = {}\n", "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=keep_cols)\n", "\n", "for chunk in chunk_iter:\n", " for col in chunk.columns:\n", " if col not in col_types:\n", " col_types[col] = [str(chunk.dtypes[col])]\n", " else:\n", " col_types[col].append(str(chunk.dtypes[col]))" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'company_category_code': {'object'},\n", " 'company_city': {'object'},\n", " 'company_country_code': {'object'},\n", " 'company_name': {'object'},\n", " 'company_region': {'object'},\n", " 'company_state_code': {'object'},\n", " 'funded_at': {'object'},\n", " 'funded_month': {'object'},\n", " 'funded_quarter': {'object'},\n", " 'funded_year': {'float64', 'int64'},\n", " 'funding_round_type': {'object'},\n", " 'investor_city': {'float64', 'object'},\n", " 'investor_country_code': {'float64', 'object'},\n", " 'investor_name': {'object'},\n", " 'investor_region': {'object'},\n", " 'investor_state_code': {'float64', 'object'},\n", " 'raised_amount_usd': {'float64'}}" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "uniq_col_types = {}\n", "for k,v in col_types.items():\n", " uniq_col_types[k] = set(col_types[k])\n", "uniq_col_types" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_namecompany_category_codecompany_country_codecompany_state_codecompany_regioncompany_cityinvestor_nameinvestor_country_codeinvestor_state_codeinvestor_regioninvestor_cityfunding_round_typefunded_atfunded_monthfunded_quarterfunded_yearraised_amount_usd
50000NuORDERfashionUSACALos AngelesWest HollywoodMortimer SingerNaNNaNunknownNaNseries-a2012-10-012012-102012-Q420123060000.0
50001ChaChaadvertisingUSAINIndianapolisCarmelMorton MeyersonNaNNaNunknownNaNseries-b2007-10-012007-102007-Q4200712000000.0
50002BinfiresoftwareUSAFLBocat RatonBocat RatonMoshe ArielNaNNaNunknownNaNangel2008-04-182008-042008-Q22008500000.0
50003BinfiresoftwareUSAFLBocat RatonBocat RatonMoshe ArielNaNNaNunknownNaNangel2010-01-012010-012010-Q12010750000.0
50004Unified ColorsoftwareUSACASF BaySouth San FrnaciscoMr. Andrew OungNaNNaNunknownNaNangel2010-01-012010-012010-Q12010NaN
50005HItviewsadvertisingUSANYNew YorkNew York Citymultiple partiesNaNNaNunknownNaNangel2007-11-292007-112007-Q42007485000.0
50006LockerDomesocialUSAMOSaint LouisSt. Louismultiple partiesNaNNaNunknownNaNangel2012-04-172012-042012-Q22012300000.0
50007ThirdLoveecommerceUSACASF BaySan FranciscoMunjal ShahNaNNaNunknownNaNseries-a2012-12-012012-122012-Q420125600000.0
50008HakiasearchUSANaNTBDNaNMurat VargiNaNNaNunknownNaNseries-a2006-11-012006-112006-Q4200616000000.0
50009bookacoachsportsUSAINIndianapolisIndianapolisMyles GroteNaNNaNunknownNaNangel2012-11-012012-112012-Q42012NaN
50010LocalCirclessocialUSACASF BaySanta ClaraNadir GodrejNaNNaNunknownNaNangel2012-09-012012-092012-Q32012NaN
50011GraphdiveanalyticsUSACASF BayMenlo ParkNaguib SawirisNaNNaNunknownNaNangel2012-10-042012-102012-Q420121000000.0
50012RibbonecommerceUSACASF BaySan FranciscoNaguib SawirisNaNNaNunknownNaNseries-a2013-02-052013-022013-Q120131630000.0
50013Dokkankom.comecommerceUSANYNew Yorknew yorkNamek Zu'biNaNNaNunknownNaNangel2011-10-102011-102011-Q4201130000.0
50014LookerywebUSACASF BaySan FranciscoNana ShinNaNNaNunknownNaNangel2008-02-072008-022008-Q12008900000.0
50015TrustDegreeswebUSANYKenmoreKenmoreNancy BarrettNaNNaNunknownNaNangel2011-06-092011-062011-Q220118000.0
50016Altavozgames_videoUSADCWashington DCWashingtonNancy JacobsenNaNNaNunknownNaNangel2012-09-112012-092012-Q32012150000.0
50017EdSurgeeducationUSACASF BayBurlingameNancy PeretsmanNaNNaNunknownNaNangel2012-08-292012-082012-Q32012400000.0
50018FullContactenterpriseUSACODenverDenverNancy PierceNaNNaNunknownNaNseries-b2012-07-092012-072012-Q320127000000.0
50019Rapt MediaenterpriseUSACODenverBoulderNancy PierceNaNNaNunknownNaNseries-a2013-01-232013-012013-Q120132288803.0
50020HumanoidsoftwareUSACASF BaySan FranciscoNat FriedmanNaNNaNunknownNaNangel2010-12-012010-122010-Q420101100000.0
50021RunscopewebUSACASF BaySan FranciscoNat FriedmanNaNNaNunknownNaNangel2013-05-222013-052013-Q220131100000.0
50022AdzerkadvertisingUSANCRaleigh-DurhamDurhamNat TurnerNaNNaNunknownNaNangel2011-07-122011-072011-Q32011650000.0
50023AdaptlyadvertisingUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNseries-a2011-04-182011-042011-Q220112000000.0
50024LoreeducationUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNangel2011-06-272011-062011-Q220111000000.0
50025Tasted MenuhospitalityUSAMABostonBostonNat TurnerNaNNaNunknownNaNangel2011-05-012011-052011-Q22011NaN
50026Lua TechnologiesmobileUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNseries-a2012-08-012012-082012-Q320122500000.0
50027Blue ApronhospitalityUSANYNew YorkBrooklynNat TurnerNaNNaNunknownNaNseries-a2013-02-192013-022013-Q120133000000.0
50028ChatIDmobileUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNangel2012-01-012012-012012-Q12012NaN
50029Breakthrough BehavioralhealthUSACASF BayRedwood CityNat TurnerNaNNaNunknownNaNangel2012-08-132012-082012-Q32012900000.0
......................................................
52840MeddikhealthUSANYNew YorkNew YorkZach WeinbergNaNNaNunknownNaNangel2012-05-242012-052012-Q22012750000.0
52841Blue ApronhospitalityUSANYNew YorkBrooklynZach WeinbergNaNNaNunknownNaNseries-a2013-02-192013-022013-Q120133000000.0
52842ChatIDmobileUSANYNew YorkNew YorkZach WeinbergNaNNaNunknownNaNangel2012-01-012012-012012-Q12012NaN
52843Breakthrough BehavioralhealthUSACASF BayRedwood CityZach WeinbergNaNNaNunknownNaNangel2012-08-132012-082012-Q32012900000.0
52844PlaidsoftwareUSACASF BaySan FranciscoZach WeinbergNaNNaNunknownNaNseries-a2013-09-192013-092013-Q320132800000.0
52845PokitDokmobileUSACASF BayMenlo ParkZach ZeitlinNaNNaNunknownNaNangel2012-07-122012-072012-Q320121300000.0
52846FitocracywebUSANYNew YorkNew YorkZachary AaronsNaNNaNunknownNaNangel2011-09-012011-092011-Q32011250000.0
52847SquaremobileUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNseries-a2009-11-012009-112009-Q4200910000000.0
52848MixRankadvertisingUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNseries-a2011-11-182011-112011-Q420111500000.0
52849SocialcammobileUSACASanta Clara CountySanta Clara CountyZachary BogueNaNNaNunknownNaNangel2012-04-302012-042012-Q22012NaN
52850NuzzelnewsUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNventure2012-11-152012-112012-Q420121700000.0
52851ThirdLoveecommerceUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNseries-a2012-12-012012-122012-Q420125600000.0
52852MXD3DwebUSACASF BaySan FranciscoZaid AyoubNaNNaNunknownNaNangel2012-01-012012-012012-Q12012300000.0
52853MXD3DwebUSACASF BaySan FranciscoZaid AyoubNaNNaNunknownNaNangel2011-01-012011-012011-Q12011300000.0
52854VeriousmobileUSACASF BaySan CarlosZain KhanNaNNaNunknownNaNangel2011-05-302011-052011-Q22011800000.0
52855IdentifiedanalyticsUSACASF BaySan FranciscoZao YangNaNNaNunknownNaNseries-b2012-06-052012-062012-Q2201221000000.0
52856HaulerDealsfashionUSACALos AngelesLos AngelesZaw ThetNaNNaNunknownNaNangel2012-10-312012-102012-Q420121250000.0
52857When You WishnonprofitUSACALos AngelesMarina Del ReyZelda MarzecNaNNaNunknownNaNseries-a2011-02-012011-022011-Q120111500000.0
52858FarmeronanalyticsUSACASF BayMountain ViewZeljko MataijaNaNNaNunknownNaNangel2010-10-012010-102010-Q4201015000.0
52859Theraclone SciencesbiotechUSAWASeattleSeattleZenyaku KogyoNaNNaNunknownNaNseries-b2013-03-252013-032013-Q120138000000.0
52860SimpleGeoadvertisingUSACASF BaySan FranciscoZiv NavothNaNNaNunknownNaNother2009-11-102009-112009-Q42009195000.0
52861Open MeecommerceUSACALos AngelesLos AngelesZiver BirgNaNNaNunknownNaNangel2013-08-012013-082013-Q32013NaN
52862Comprehend SystemsenterpriseUSACASF BayPalo AltoZod NazemNaNNaNunknownNaNseries-a2013-07-112013-072013-Q320138400000.0
52863PayoneerotherUSANYNew YorkNew YorkZohar GilonNaNNaNunknownNaNseries-a2005-01-012005-012005-Q120052000000.0
52864OutbrainwebUSANYNew YorkNew York CityZohar GilonNaNNaNunknownNaNseries-b2009-02-112009-022009-Q1200912000000.0
52865Garantia DataenterpriseUSACASF BaySanta ClaraZohar GilonNaNNaNunknownNaNseries-a2012-08-082012-082012-Q320123800000.0
52866DudaMobilemobileUSACASF BayPalo AltoZohar GilonNaNNaNunknownNaNseries-c+2013-04-082013-042013-Q2201310300000.0
52867SiteBrainssoftwareUSACASF BaySan Franciscozohar israelNaNNaNunknownNaNangel2010-08-012010-082010-Q32010350000.0
52868Comprehend SystemsenterpriseUSACASF BayPalo AltoZorba LiebermanNaNNaNunknownNaNseries-a2013-07-112013-072013-Q320138400000.0
52869SmartThingsmobileUSADCunknownMinneapolisZorik GordonNaNNaNunknownNaNseries-a2012-12-042012-122012-Q420123000000.0
\n", "

2870 rows × 17 columns

\n", "
" ], "text/plain": [ " company_name company_category_code company_country_code \\\n", "50000 NuORDER fashion USA \n", "50001 ChaCha advertising USA \n", "50002 Binfire software USA \n", "50003 Binfire software USA \n", "50004 Unified Color software USA \n", "50005 HItviews advertising USA \n", "50006 LockerDome social USA \n", "50007 ThirdLove ecommerce USA \n", "50008 Hakia search USA \n", "50009 bookacoach sports USA \n", "50010 LocalCircles social USA \n", "50011 Graphdive analytics USA \n", "50012 Ribbon ecommerce USA \n", "50013 Dokkankom.com ecommerce USA \n", "50014 Lookery web USA \n", "50015 TrustDegrees web USA \n", "50016 Altavoz games_video USA \n", "50017 EdSurge education USA \n", "50018 FullContact enterprise USA \n", "50019 Rapt Media enterprise USA \n", "50020 Humanoid software USA \n", "50021 Runscope web USA \n", "50022 Adzerk advertising USA \n", "50023 Adaptly advertising USA \n", "50024 Lore education USA \n", "50025 Tasted Menu hospitality USA \n", "50026 Lua Technologies mobile USA \n", "50027 Blue Apron hospitality USA \n", "50028 ChatID mobile USA \n", "50029 Breakthrough Behavioral health USA \n", "... ... ... ... \n", "52840 Meddik health USA \n", "52841 Blue Apron hospitality USA \n", "52842 ChatID mobile USA \n", "52843 Breakthrough Behavioral health USA \n", "52844 Plaid software USA \n", "52845 PokitDok mobile USA \n", "52846 Fitocracy web USA \n", "52847 Square mobile USA \n", "52848 MixRank advertising USA \n", "52849 Socialcam mobile USA \n", "52850 Nuzzel news USA \n", "52851 ThirdLove ecommerce USA \n", "52852 MXD3D web USA \n", "52853 MXD3D web USA \n", "52854 Verious mobile USA \n", "52855 Identified analytics USA \n", "52856 HaulerDeals fashion USA \n", "52857 When You Wish nonprofit USA \n", "52858 Farmeron analytics USA \n", "52859 Theraclone Sciences biotech USA \n", "52860 SimpleGeo advertising USA \n", "52861 Open Me ecommerce USA \n", "52862 Comprehend Systems enterprise USA \n", "52863 Payoneer other USA \n", "52864 Outbrain web USA \n", "52865 Garantia Data enterprise USA \n", "52866 DudaMobile mobile USA \n", "52867 SiteBrains software USA \n", "52868 Comprehend Systems enterprise USA \n", "52869 SmartThings mobile USA \n", "\n", " company_state_code company_region company_city \\\n", "50000 CA Los Angeles West Hollywood \n", "50001 IN Indianapolis Carmel \n", "50002 FL Bocat Raton Bocat Raton \n", "50003 FL Bocat Raton Bocat Raton \n", "50004 CA SF Bay South San Frnacisco \n", "50005 NY New York New York City \n", "50006 MO Saint Louis St. Louis \n", "50007 CA SF Bay San Francisco \n", "50008 NaN TBD NaN \n", "50009 IN Indianapolis Indianapolis \n", "50010 CA SF Bay Santa Clara \n", "50011 CA SF Bay Menlo Park \n", "50012 CA SF Bay San Francisco \n", "50013 NY New York new york \n", "50014 CA SF Bay San Francisco \n", "50015 NY Kenmore Kenmore \n", "50016 DC Washington DC Washington \n", "50017 CA SF Bay Burlingame \n", "50018 CO Denver Denver \n", "50019 CO Denver Boulder \n", "50020 CA SF Bay San Francisco \n", "50021 CA SF Bay San Francisco \n", "50022 NC Raleigh-Durham Durham \n", "50023 NY New York New York \n", "50024 NY New York New York \n", "50025 MA Boston Boston \n", "50026 NY New York New York \n", "50027 NY New York Brooklyn \n", "50028 NY New York New York \n", "50029 CA SF Bay Redwood City \n", "... ... ... ... \n", "52840 NY New York New York \n", "52841 NY New York Brooklyn \n", "52842 NY New York New York \n", "52843 CA SF Bay Redwood City \n", "52844 CA SF Bay San Francisco \n", "52845 CA SF Bay Menlo Park \n", "52846 NY New York New York \n", "52847 CA SF Bay San Francisco \n", "52848 CA SF Bay San Francisco \n", "52849 CA Santa Clara County Santa Clara County \n", "52850 CA SF Bay San Francisco \n", "52851 CA SF Bay San Francisco \n", "52852 CA SF Bay San Francisco \n", "52853 CA SF Bay San Francisco \n", "52854 CA SF Bay San Carlos \n", "52855 CA SF Bay San Francisco \n", "52856 CA Los Angeles Los Angeles \n", "52857 CA Los Angeles Marina Del Rey \n", "52858 CA SF Bay Mountain View \n", "52859 WA Seattle Seattle \n", "52860 CA SF Bay San Francisco \n", "52861 CA Los Angeles Los Angeles \n", "52862 CA SF Bay Palo Alto \n", "52863 NY New York New York \n", "52864 NY New York New York City \n", "52865 CA SF Bay Santa Clara \n", "52866 CA SF Bay Palo Alto \n", "52867 CA SF Bay San Francisco \n", "52868 CA SF Bay Palo Alto \n", "52869 DC unknown Minneapolis \n", "\n", " investor_name investor_country_code investor_state_code \\\n", "50000 Mortimer Singer NaN NaN \n", "50001 Morton Meyerson NaN NaN \n", "50002 Moshe Ariel NaN NaN \n", "50003 Moshe Ariel NaN NaN \n", "50004 Mr. Andrew Oung NaN NaN \n", "50005 multiple parties NaN NaN \n", "50006 multiple parties NaN NaN \n", "50007 Munjal Shah NaN NaN \n", "50008 Murat Vargi NaN NaN \n", "50009 Myles Grote NaN NaN \n", "50010 Nadir Godrej NaN NaN \n", "50011 Naguib Sawiris NaN NaN \n", "50012 Naguib Sawiris NaN NaN \n", "50013 Namek Zu'bi NaN NaN \n", "50014 Nana Shin NaN NaN \n", "50015 Nancy Barrett NaN NaN \n", "50016 Nancy Jacobsen NaN NaN \n", "50017 Nancy Peretsman NaN NaN \n", "50018 Nancy Pierce NaN NaN \n", "50019 Nancy Pierce NaN NaN \n", "50020 Nat Friedman NaN NaN \n", "50021 Nat Friedman NaN NaN \n", "50022 Nat Turner NaN NaN \n", "50023 Nat Turner NaN NaN \n", "50024 Nat Turner NaN NaN \n", "50025 Nat Turner NaN NaN \n", "50026 Nat Turner NaN NaN \n", "50027 Nat Turner NaN NaN \n", "50028 Nat Turner NaN NaN \n", "50029 Nat Turner NaN NaN \n", "... ... ... ... \n", "52840 Zach Weinberg NaN NaN \n", "52841 Zach Weinberg NaN NaN \n", "52842 Zach Weinberg NaN NaN \n", "52843 Zach Weinberg NaN NaN \n", "52844 Zach Weinberg NaN NaN \n", "52845 Zach Zeitlin NaN NaN \n", "52846 Zachary Aarons NaN NaN \n", "52847 Zachary Bogue NaN NaN \n", "52848 Zachary Bogue NaN NaN \n", "52849 Zachary Bogue NaN NaN \n", "52850 Zachary Bogue NaN NaN \n", "52851 Zachary Bogue NaN NaN \n", "52852 Zaid Ayoub NaN NaN \n", "52853 Zaid Ayoub NaN NaN \n", "52854 Zain Khan NaN NaN \n", "52855 Zao Yang NaN NaN \n", "52856 Zaw Thet NaN NaN \n", "52857 Zelda Marzec NaN NaN \n", "52858 Zeljko Mataija NaN NaN \n", "52859 Zenyaku Kogyo NaN NaN \n", "52860 Ziv Navoth NaN NaN \n", "52861 Ziver Birg NaN NaN \n", "52862 Zod Nazem NaN NaN \n", "52863 Zohar Gilon NaN NaN \n", "52864 Zohar Gilon NaN NaN \n", "52865 Zohar Gilon NaN NaN \n", "52866 Zohar Gilon NaN NaN \n", "52867 zohar israel NaN NaN \n", "52868 Zorba Lieberman NaN NaN \n", "52869 Zorik Gordon NaN NaN \n", "\n", " investor_region investor_city funding_round_type funded_at \\\n", "50000 unknown NaN series-a 2012-10-01 \n", "50001 unknown NaN series-b 2007-10-01 \n", "50002 unknown NaN angel 2008-04-18 \n", "50003 unknown NaN angel 2010-01-01 \n", "50004 unknown NaN angel 2010-01-01 \n", "50005 unknown NaN angel 2007-11-29 \n", "50006 unknown NaN angel 2012-04-17 \n", "50007 unknown NaN series-a 2012-12-01 \n", "50008 unknown NaN series-a 2006-11-01 \n", "50009 unknown NaN angel 2012-11-01 \n", "50010 unknown NaN angel 2012-09-01 \n", "50011 unknown NaN angel 2012-10-04 \n", "50012 unknown NaN series-a 2013-02-05 \n", "50013 unknown NaN angel 2011-10-10 \n", "50014 unknown NaN angel 2008-02-07 \n", "50015 unknown NaN angel 2011-06-09 \n", "50016 unknown NaN angel 2012-09-11 \n", "50017 unknown NaN angel 2012-08-29 \n", "50018 unknown NaN series-b 2012-07-09 \n", "50019 unknown NaN series-a 2013-01-23 \n", "50020 unknown NaN angel 2010-12-01 \n", "50021 unknown NaN angel 2013-05-22 \n", "50022 unknown NaN angel 2011-07-12 \n", "50023 unknown NaN series-a 2011-04-18 \n", "50024 unknown NaN angel 2011-06-27 \n", "50025 unknown NaN angel 2011-05-01 \n", "50026 unknown NaN series-a 2012-08-01 \n", "50027 unknown NaN series-a 2013-02-19 \n", "50028 unknown NaN angel 2012-01-01 \n", "50029 unknown NaN angel 2012-08-13 \n", "... ... ... ... ... \n", "52840 unknown NaN angel 2012-05-24 \n", "52841 unknown NaN series-a 2013-02-19 \n", "52842 unknown NaN angel 2012-01-01 \n", "52843 unknown NaN angel 2012-08-13 \n", "52844 unknown NaN series-a 2013-09-19 \n", "52845 unknown NaN angel 2012-07-12 \n", "52846 unknown NaN angel 2011-09-01 \n", "52847 unknown NaN series-a 2009-11-01 \n", "52848 unknown NaN series-a 2011-11-18 \n", "52849 unknown NaN angel 2012-04-30 \n", "52850 unknown NaN venture 2012-11-15 \n", "52851 unknown NaN series-a 2012-12-01 \n", "52852 unknown NaN angel 2012-01-01 \n", "52853 unknown NaN angel 2011-01-01 \n", "52854 unknown NaN angel 2011-05-30 \n", "52855 unknown NaN series-b 2012-06-05 \n", "52856 unknown NaN angel 2012-10-31 \n", "52857 unknown NaN series-a 2011-02-01 \n", "52858 unknown NaN angel 2010-10-01 \n", "52859 unknown NaN series-b 2013-03-25 \n", "52860 unknown NaN other 2009-11-10 \n", "52861 unknown NaN angel 2013-08-01 \n", "52862 unknown NaN series-a 2013-07-11 \n", "52863 unknown NaN series-a 2005-01-01 \n", "52864 unknown NaN series-b 2009-02-11 \n", "52865 unknown NaN series-a 2012-08-08 \n", "52866 unknown NaN series-c+ 2013-04-08 \n", "52867 unknown NaN angel 2010-08-01 \n", "52868 unknown NaN series-a 2013-07-11 \n", "52869 unknown NaN series-a 2012-12-04 \n", "\n", " funded_month funded_quarter funded_year raised_amount_usd \n", "50000 2012-10 2012-Q4 2012 3060000.0 \n", "50001 2007-10 2007-Q4 2007 12000000.0 \n", "50002 2008-04 2008-Q2 2008 500000.0 \n", "50003 2010-01 2010-Q1 2010 750000.0 \n", "50004 2010-01 2010-Q1 2010 NaN \n", "50005 2007-11 2007-Q4 2007 485000.0 \n", "50006 2012-04 2012-Q2 2012 300000.0 \n", "50007 2012-12 2012-Q4 2012 5600000.0 \n", "50008 2006-11 2006-Q4 2006 16000000.0 \n", "50009 2012-11 2012-Q4 2012 NaN \n", "50010 2012-09 2012-Q3 2012 NaN \n", "50011 2012-10 2012-Q4 2012 1000000.0 \n", "50012 2013-02 2013-Q1 2013 1630000.0 \n", "50013 2011-10 2011-Q4 2011 30000.0 \n", "50014 2008-02 2008-Q1 2008 900000.0 \n", "50015 2011-06 2011-Q2 2011 8000.0 \n", "50016 2012-09 2012-Q3 2012 150000.0 \n", "50017 2012-08 2012-Q3 2012 400000.0 \n", "50018 2012-07 2012-Q3 2012 7000000.0 \n", "50019 2013-01 2013-Q1 2013 2288803.0 \n", "50020 2010-12 2010-Q4 2010 1100000.0 \n", "50021 2013-05 2013-Q2 2013 1100000.0 \n", "50022 2011-07 2011-Q3 2011 650000.0 \n", "50023 2011-04 2011-Q2 2011 2000000.0 \n", "50024 2011-06 2011-Q2 2011 1000000.0 \n", "50025 2011-05 2011-Q2 2011 NaN \n", "50026 2012-08 2012-Q3 2012 2500000.0 \n", "50027 2013-02 2013-Q1 2013 3000000.0 \n", "50028 2012-01 2012-Q1 2012 NaN \n", "50029 2012-08 2012-Q3 2012 900000.0 \n", "... ... ... ... ... \n", "52840 2012-05 2012-Q2 2012 750000.0 \n", "52841 2013-02 2013-Q1 2013 3000000.0 \n", "52842 2012-01 2012-Q1 2012 NaN \n", "52843 2012-08 2012-Q3 2012 900000.0 \n", "52844 2013-09 2013-Q3 2013 2800000.0 \n", "52845 2012-07 2012-Q3 2012 1300000.0 \n", "52846 2011-09 2011-Q3 2011 250000.0 \n", "52847 2009-11 2009-Q4 2009 10000000.0 \n", "52848 2011-11 2011-Q4 2011 1500000.0 \n", "52849 2012-04 2012-Q2 2012 NaN \n", "52850 2012-11 2012-Q4 2012 1700000.0 \n", "52851 2012-12 2012-Q4 2012 5600000.0 \n", "52852 2012-01 2012-Q1 2012 300000.0 \n", "52853 2011-01 2011-Q1 2011 300000.0 \n", "52854 2011-05 2011-Q2 2011 800000.0 \n", "52855 2012-06 2012-Q2 2012 21000000.0 \n", "52856 2012-10 2012-Q4 2012 1250000.0 \n", "52857 2011-02 2011-Q1 2011 1500000.0 \n", "52858 2010-10 2010-Q4 2010 15000.0 \n", "52859 2013-03 2013-Q1 2013 8000000.0 \n", "52860 2009-11 2009-Q4 2009 195000.0 \n", "52861 2013-08 2013-Q3 2013 NaN \n", "52862 2013-07 2013-Q3 2013 8400000.0 \n", "52863 2005-01 2005-Q1 2005 2000000.0 \n", "52864 2009-02 2009-Q1 2009 12000000.0 \n", "52865 2012-08 2012-Q3 2012 3800000.0 \n", "52866 2013-04 2013-Q2 2013 10300000.0 \n", "52867 2010-08 2010-Q3 2010 350000.0 \n", "52868 2013-07 2013-Q3 2013 8400000.0 \n", "52869 2012-12 2012-Q4 2012 3000000.0 \n", "\n", "[2870 rows x 17 columns]" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading Chunks Into SQLite" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "conn = sqlite3.connect('crunchbase.db')\n", "chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')\n", "\n", "for chunk in chunk_iter:\n", " chunk.to_sql(\"investments\", conn, if_exists='append', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }