Queer European MD passionate about IT

Mission167Solutions.ipynb 23 KB

Introduction

import pandas as pd
# Raise the display limit so wide DataFrames (this dataset has ~20 columns) aren't truncated.
pd.options.display.max_columns = 99
# Stream the CSV in 5,000-row chunks; ISO-8859-1 because the file contains non-UTF-8 bytes.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

Compute each column's missing value counts

# One Series of per-column null counts for each chunk.
mv_list = [chunk.isnull().sum() for chunk in chunk_iter]

# Concatenating the per-chunk Series repeats each column name once per chunk,
# so group by the index (column name) and sum to get file-wide totals.
combined_mv_vc = pd.concat(mv_list)
unique_combined_mv_vc = combined_mv_vc.groupby(combined_mv_vc.index).sum()
unique_combined_mv_vc.sort_values()
company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

Total memory footprint for each column

# Re-create the iterator (the previous loop exhausted it).
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# Per-column deep memory usage (deep=True includes the string contents of
# object columns), accumulated across all chunks. sum() starts at 0 and
# pandas broadcasts the integer over each Series, so no counter is needed.
series_memory_fp = sum(chunk.memory_usage(deep=True) for chunk in chunk_iter)

# Drop memory footprint calculation for the index.
series_memory_fp = series_memory_fp.drop('Index')
series_memory_fp
company_permalink         4057788
company_name              3591326
company_category_code     3421104
company_country_code      3172176
company_state_code        3106051
company_region            3411585
company_city              3505926
investor_permalink        4980548
investor_name             3915666
investor_category_code     622424
investor_country_code     2647292
investor_state_code       2476607
investor_region           3396281
investor_city             2885083
funding_round_type        3410707
funded_at                 3542185
funded_month              3383584
funded_quarter            3383584
funded_year                422960
raised_amount_usd          422960
dtype: int64

Total memory footprint of the data (in megabytes)

series_memory_fp.sum() / (1024 * 1024)
56.9876070022583
unique_combined_mv_vc.sort_values()
company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64
# Drop columns representing URLs or containing way too many missing values (>90% missing).
drop_cols = ['investor_permalink', 'company_permalink', 'investor_category_code']
# `chunk` is the last chunk from the previous loop; any chunk's columns work here.
keep_cols = chunk.columns.drop(drop_cols)
# BUG FIX: the original `keep_cols.tolist` was missing parentheses, so it
# displayed the bound method object instead of the list of column names.
keep_cols.tolist()
<bound method IndexOpsMixin.tolist of Index(['company_name', 'company_category_code', 'company_country_code',
       'company_state_code', 'company_region', 'company_city', 'investor_name',
       'investor_country_code', 'investor_state_code', 'investor_region',
       'investor_city', 'funding_round_type', 'funded_at', 'funded_month',
       'funded_quarter', 'funded_year', 'raised_amount_usd'],
      dtype='object')>

Selecting Data Types

Let's first determine which columns have inconsistent data types across chunks — here we only lay the groundwork; the actual type optimization would follow.

# Key: column name, Value: list of the dtype observed in each chunk.
col_types = {}
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=keep_cols)

for chunk in chunk_iter:
    for col in chunk.columns:
        # setdefault creates the list on first sight of the column,
        # replacing the explicit membership check.
        col_types.setdefault(col, []).append(str(chunk.dtypes[col]))

# Collapse each dtype list to the set of distinct dtypes; columns with more
# than one entry shift types across chunks.
uniq_col_types = {name: set(types) for name, types in col_types.items()}
uniq_col_types
{'company_name': {'object'},
 'company_category_code': {'object'},
 'company_country_code': {'object'},
 'company_state_code': {'object'},
 'company_region': {'object'},
 'company_city': {'object'},
 'investor_name': {'object'},
 'investor_country_code': {'float64', 'object'},
 'investor_state_code': {'float64', 'object'},
 'investor_region': {'object'},
 'investor_city': {'float64', 'object'},
 'funding_round_type': {'object'},
 'funded_at': {'object'},
 'funded_month': {'object'},
 'funded_quarter': {'object'},
 'funded_year': {'float64', 'int64'},
 'raised_amount_usd': {'float64'}}
chunk
company_name company_category_code company_country_code company_state_code company_region company_city investor_name investor_country_code investor_state_code investor_region investor_city funding_round_type funded_at funded_month funded_quarter funded_year raised_amount_usd
50000 NuORDER fashion USA CA Los Angeles West Hollywood Mortimer Singer NaN NaN unknown NaN series-a 2012-10-01 2012-10 2012-Q4 2012 3060000.0
50001 ChaCha advertising USA IN Indianapolis Carmel Morton Meyerson NaN NaN unknown NaN series-b 2007-10-01 2007-10 2007-Q4 2007 12000000.0
50002 Binfire software USA FL Bocat Raton Bocat Raton Moshe Ariel NaN NaN unknown NaN angel 2008-04-18 2008-04 2008-Q2 2008 500000.0
50003 Binfire software USA FL Bocat Raton Bocat Raton Moshe Ariel NaN NaN unknown NaN angel 2010-01-01 2010-01 2010-Q1 2010 750000.0
50004 Unified Color software USA CA SF Bay South San Frnacisco Mr. Andrew Oung NaN NaN unknown NaN angel 2010-01-01 2010-01 2010-Q1 2010 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52865 Garantia Data enterprise USA CA SF Bay Santa Clara Zohar Gilon NaN NaN unknown NaN series-a 2012-08-08 2012-08 2012-Q3 2012 3800000.0
52866 DudaMobile mobile USA CA SF Bay Palo Alto Zohar Gilon NaN NaN unknown NaN series-c+ 2013-04-08 2013-04 2013-Q2 2013 10300000.0
52867 SiteBrains software USA CA SF Bay San Francisco zohar israel NaN NaN unknown NaN angel 2010-08-01 2010-08 2010-Q3 2010 350000.0
52868 Comprehend Systems enterprise USA CA SF Bay Palo Alto Zorba Lieberman NaN NaN unknown NaN series-a 2013-07-11 2013-07 2013-Q3 2013 8400000.0
52869 SmartThings mobile USA DC unknown Minneapolis Zorik Gordon NaN NaN unknown NaN series-a 2012-12-04 2012-12 2012-Q4 2012 3000000.0

2870 rows × 17 columns

Loading Chunks Into SQLite

import sqlite3

conn = sqlite3.connect('crunchbase.db')
# Re-create the iterator (earlier loops exhausted it).
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# FIX: close the connection even if a chunk fails to load — the original
# left the sqlite3 connection open (resource leak / unflushed handle).
try:
    for chunk in chunk_iter:
        # Append each chunk to the `investments` table; to_sql commits per call.
        chunk.to_sql("investments", conn, if_exists='append', index=False)
finally:
    conn.close()