Queer European MD passionate about IT

Mission167Solutions.ipynb 23 KB

Introduction

import pandas as pd
# Raise the display limit so wide DataFrames (this dataset has ~20 columns) aren't truncated.
pd.options.display.max_columns = 99
# Stream the CSV in 5,000-row chunks; ISO-8859-1 because the file contains non-UTF-8 bytes.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

Compute each column's missing value counts

# One Series of per-column null counts for each chunk.
mv_list = [chunk.isnull().sum() for chunk in chunk_iter]

# Concatenating the per-chunk Series repeats each column name once per chunk,
# so group by the index (column name) and sum to get file-wide totals.
combined_mv_vc = pd.concat(mv_list)
unique_combined_mv_vc = combined_mv_vc.groupby(combined_mv_vc.index).sum()
unique_combined_mv_vc.sort_values()
company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

Total memory footprint for each column

# Re-create the iterator (the previous loop exhausted it).
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# Per-column deep memory usage (deep=True includes the string contents of
# object columns), accumulated across all chunks. sum() starts at 0 and
# pandas broadcasts the integer over each Series, so no counter is needed.
series_memory_fp = sum(chunk.memory_usage(deep=True) for chunk in chunk_iter)

# Drop memory footprint calculation for the index.
series_memory_fp = series_memory_fp.drop('Index')
series_memory_fp
company_permalink         4057788
company_name              3591326
company_category_code     3421104
company_country_code      3172176
company_state_code        3106051
company_region            3411585
company_city              3505926
investor_permalink        4980548
investor_name             3915666
investor_category_code     622424
investor_country_code     2647292
investor_state_code       2476607
investor_region           3396281
investor_city             2885083
funding_round_type        3410707
funded_at                 3542185
funded_month              3383584
funded_quarter            3383584
funded_year                422960
raised_amount_usd          422960
dtype: int64

Total memory footprint of the data (in megabytes)

series_memory_fp.sum() / (1024 * 1024)
56.9876070022583
unique_combined_mv_vc.sort_values()
company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64
# Drop columns representing URLs or containing way too many missing values (>90% missing).
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
# `chunk` is the last chunk from the previous loop; any chunk's columns work here.
keep_cols = chunk.columns.drop(drop_cols)
# BUG FIX: the original `keep_cols.tolist` was missing parentheses, so it
# displayed the bound method object instead of the list of column names.
keep_cols.tolist()
<bound method IndexOpsMixin.tolist of Index(['company_name', 'company_category_code', 'company_country_code',
       'company_state_code', 'company_region', 'company_city', 'investor_name',
       'investor_country_code', 'investor_state_code', 'investor_region',
       'investor_city', 'funding_round_type', 'funded_at', 'funded_month',
       'funded_quarter', 'funded_year', 'raised_amount_usd'],
      dtype='object')>

Selecting Data Types

Let's first determine which columns have inconsistent data types across chunks — here we only lay the groundwork; the actual type optimization would follow.

# Key: column name, Value: list of the dtype observed in each chunk.
col_types = {}
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=keep_cols)

for chunk in chunk_iter:
    for col in chunk.columns:
        # setdefault creates the list on first sight of the column,
        # replacing the explicit membership check.
        col_types.setdefault(col, []).append(str(chunk.dtypes[col]))

# Collapse each dtype list to the set of distinct dtypes; columns with more
# than one entry shift types across chunks.
uniq_col_types = {name: set(types) for name, types in col_types.items()}
uniq_col_types
{'company_name': {'object'},
 'company_category_code': {'object'},
 'company_country_code': {'object'},
 'company_state_code': {'object'},
 'company_region': {'object'},
 'company_city': {'object'},
 'investor_name': {'object'},
 'investor_country_code': {'float64', 'object'},
 'investor_state_code': {'float64', 'object'},
 'investor_region': {'object'},
 'investor_city': {'float64', 'object'},
 'funding_round_type': {'object'},
 'funded_at': {'object'},
 'funded_month': {'object'},
 'funded_quarter': {'object'},
 'funded_year': {'float64', 'int64'},
 'raised_amount_usd': {'float64'}}
chunk
company_name company_category_code company_country_code company_state_code company_region company_city investor_name investor_country_code investor_state_code investor_region investor_city funding_round_type funded_at funded_month funded_quarter funded_year raised_amount_usd
50000 NuORDER fashion USA CA Los Angeles West Hollywood Mortimer Singer NaN NaN unknown NaN series-a 2012-10-01 2012-10 2012-Q4 2012 3060000.0
50001 ChaCha advertising USA IN Indianapolis Carmel Morton Meyerson NaN NaN unknown NaN series-b 2007-10-01 2007-10 2007-Q4 2007 12000000.0
50002 Binfire software USA FL Bocat Raton Bocat Raton Moshe Ariel NaN NaN unknown NaN angel 2008-04-18 2008-04 2008-Q2 2008 500000.0
50003 Binfire software USA FL Bocat Raton Bocat Raton Moshe Ariel NaN NaN unknown NaN angel 2010-01-01 2010-01 2010-Q1 2010 750000.0
50004 Unified Color software USA CA SF Bay South San Frnacisco Mr. Andrew Oung NaN NaN unknown NaN angel 2010-01-01 2010-01 2010-Q1 2010 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52865 Garantia Data enterprise USA CA SF Bay Santa Clara Zohar Gilon NaN NaN unknown NaN series-a 2012-08-08 2012-08 2012-Q3 2012 3800000.0
52866 DudaMobile mobile USA CA SF Bay Palo Alto Zohar Gilon NaN NaN unknown NaN series-c+ 2013-04-08 2013-04 2013-Q2 2013 10300000.0
52867 SiteBrains software USA CA SF Bay San Francisco zohar israel NaN NaN unknown NaN angel 2010-08-01 2010-08 2010-Q3 2010 350000.0
52868 Comprehend Systems enterprise USA CA SF Bay Palo Alto Zorba Lieberman NaN NaN unknown NaN series-a 2013-07-11 2013-07 2013-Q3 2013 8400000.0
52869 SmartThings mobile USA DC unknown Minneapolis Zorik Gordon NaN NaN unknown NaN series-a 2012-12-04 2012-12 2012-Q4 2012 3000000.0

2870 rows × 17 columns

Loading Chunks Into SQLite

import sqlite3

conn = sqlite3.connect('crunchbase.db')
# Re-create the iterator (earlier loops exhausted it).
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# FIX: close the connection even if a chunk fails to load — the original
# left the sqlite3 connection open (resource leak / unflushed handle).
try:
    for chunk in chunk_iter:
        # Append each chunk to the `investments` table; to_sql commits per call.
        chunk.to_sql("investments", conn, if_exists='append', index=False)
finally:
    conn.close()