# Introducing Wikipedia Data

In [2]:
import os

# Show the entries of the local "wiki" directory (one HTML file per article).
os.listdir("wiki")

['%C3%81lvaro_Sierra.html',
 '%C3%89cole_des_Mines_de_Douai.html',
 '%C3%89taule.html',
 '%C3%96lm%C3%BCs_bir_kadinin_evraki_metrukesi.html',
 '%C5%81osie-Do%C5%82%C4%99gi.html',
 '%C5%8Cnog%C5%8D_Station.html',
 '%C5%9Eah%C3%AEn%C3%AA_Bekir%C3%AA_Sorekl%C3%AE.html',
 '(1-4)-a-D-glucan_1-a-D-glucosylmutase.html',
 '1._FC_Eschborn.html',
 '100%25_Banco.html',
 '100_Greatest_Romanians.html',
 '100_mm_field_gun_M1944_(BS-3).html',
 '104th_Logistic_Support_Brigade_(United_Kingdom).html',
 '1208_(band).html',
 '125th_(Lancashire_Fusiliers)_Brigade.html',
 '16th_Virginia_Infantry.html',
 '1860_in_science.html',
 '1866_in_birding_and_ornithology.html',
 '1879_FA_Cup_Final.html',
 '1896_Indiana_Hoosiers_football_team.html',
 '1898_Colgate_football_team.html',
 '1904_Case_football_team.html',
 '1905%E2%80%9306_FC_Barcelona_season.html',
 '1910_in_literature.html',
 '1915_Montana_football_team.html',
 '1937_Social_Credit_backbenchers%27_revolt.html',
 '1947_Notre_Dame_Fighting_Irish_football_tea

In [3]:
# Number of entries in the "wiki" directory.
len(os.listdir("wiki"))

2997

In [4]:
# Print one raw article to inspect its structure. The page declares
# charset UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open("wiki/Cecil_Peak.html", encoding="utf-8") as f:
    print(f.read())

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Cecil Peak - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Cecil_Peak","wgTitle":"Cecil Peak","wgCurRevisionId":724716112,"wgRevisionId":724716112,"wgArticleId":36574331,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Mountains of Otago","Queenstown-Lakes District","Southern Alps","Otago geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","J

It appears that the main content is nested inside the `div` tag with the id `content`.

# Reading in the Data

In [12]:
import concurrent.futures
import time

# Executor with four worker threads, used below to read the article files concurrently.
pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)

def read_data(filename, encoding="utf-8"):
    """Read a text file and return its entire contents as a string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The decoded contents of the file.
    """
    with open(filename, encoding=encoding) as f:
        return f.read()

# Read every file in the "wiki" directory through the thread pool and time it.
start = time.time()
filenames = ["wiki/{}".format(f) for f in os.listdir("wiki")]
try:
    # pool.map is lazy; list() forces all reads to complete.
    content = list(pool.map(read_data, filenames))
finally:
    # Release the worker threads whether or not the reads succeeded.
    pool.shutdown()

end = time.time()
print(end - start)
# Article names: the file paths with the "wiki/" prefix and ".html" text removed.
articles = [f.replace(".html", "").replace("wiki/", "") for f in filenames]

2.952831983566284


After some profiling, threading doesn't appear to make a huge difference to performance.  This may be because, although the files are read concurrently, most of the gain is offset by the overhead of creating new threads.

# Remove Extraneous Markup

In [26]:
from bs4 import BeautifulSoup

def parse_html(html):
    """Extract the main content element of an article page.

    Args:
        html: The page's HTML markup as a string.

    Returns:
        The markup of the first `div` whose id is `content`, as a string.

    Raises:
        IndexError: If the page has no `div` with id `content`.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # limit=1 stops the search at the first match instead of collecting
    # every matching element only to discard all but the first.
    matches = soup.find_all("div", id="content", limit=1)
    return str(matches[0])

# Parse every article in parallel worker processes and time the run.
start = time.time()
# The context manager shuts the pool down afterwards, so the worker
# processes do not linger once parsing is finished.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    parsed = list(pool.map(parse_html, content))
end = time.time()

print(end - start)

139.67757391929626


In [27]:
parsed[0]

'<div class="mw-body" id="content" role="main">\n<a id="top"></a>\n<div id="siteNotice"><!-- CentralNotice --></div>\n<div class="mw-indicators">\n</div>\n<h1 class="firstHeading" id="firstHeading" lang="en">Álvaro Sierra</h1>\n<div class="mw-body-content" id="bodyContent">\n<div id="siteSub">From Wikipedia, the free encyclopedia</div>\n<div id="contentSub"></div>\n<div class="mw-jump" id="jump-to-nav">\n\t\t\t\t\tJump to:\t\t\t\t\t<a href="#mw-head">navigation</a>, \t\t\t\t\t<a href="#p-search">search</a>\n</div>\n<div class="mw-content-ltr" dir="ltr" id="mw-content-text" lang="en"><div class="hatnote" role="note">This name uses <a href="/wiki/Spanish_naming_customs" title="Spanish naming customs">Spanish naming customs</a>: the first or paternal <a href="/wiki/Surname" title="Surname">family name</a> is <i>Sierra</i>\xa0and the second or maternal family name is <i>Peña</i>.</div>\n<p><b>Álvaro Sierra Peña</b> (born April 4, 1967 in <a href="/wiki/Sogamoso" title="Sogamoso">Sogamoso</

This operation is quite slow and CPU-intensive.  It looks like using as many processes as there are available processors speeds things up.

# Finding Common Tags

In [33]:
from bs4 import BeautifulSoup

def count_tags(html):
    """Tally how often each tag name occurs in a piece of HTML.

    Args:
        html: HTML markup as a string.

    Returns:
        A dict mapping each tag name to its number of occurrences.
    """
    tree = BeautifulSoup(html, 'html.parser')
    totals = {}
    # find_all() with no arguments yields every tag in the document.
    for element in tree.find_all():
        name = element.name
        totals[name] = totals.get(name, 0) + 1
    return totals

# Count tags per article in parallel, then merge the per-article tallies.
start = time.time()
# The context manager shuts the pool down afterwards, so the worker
# processes do not linger once counting is finished.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    tags = list(pool.map(count_tags, parsed))

# Sum the per-article counts into one corpus-wide total per tag name.
tag_counts = {}
for tag in tags:
    for k, v in tag.items():
        tag_counts[k] = tag_counts.get(k, 0) + v
end = time.time()

print(end - start)
tag_counts

71.57306599617004


{'a': 498830,
 'abbr': 11536,
 'annotation': 205,
 'area': 138,
 'audio': 8,
 'b': 45291,
 'bdi': 34,
 'big': 416,
 'blockquote': 148,
 'br': 15599,
 'caption': 609,
 'center': 657,
 'cite': 11605,
 'code': 281,
 'dd': 3761,
 'del': 6,
 'div': 88787,
 'dl': 1425,
 'dt': 953,
 'font': 44,
 'h1': 2997,
 'h2': 12336,
 'h3': 2720,
 'h4': 405,
 'h5': 26,
 'h6': 1,
 'hr': 240,
 'i': 55074,
 'img': 24457,
 'li': 263520,
 'map': 9,
 'math': 205,
 'mfrac': 181,
 'mi': 1538,
 'mn': 657,
 'mo': 1399,
 'mover': 14,
 'mrow': 1302,
 'mspace': 65,
 'msqrt': 21,
 'mstyle': 214,
 'msub': 319,
 'msubsup': 56,
 'msup': 163,
 'mtable': 4,
 'mtd': 93,
 'mtext': 13,
 'mtr': 13,
 'munder': 7,
 'munderover': 17,
 'noscript': 2997,
 'ol': 2626,
 'p': 24099,
 'pre': 12,
 'q': 162,
 'rb': 16,
 'rp': 32,
 'rt': 16,
 'ruby': 16,
 's': 76,
 'samp': 2,
 'semantics': 205,
 'small': 9415,
 'source': 8,
 'span': 218982,
 'strong': 1847,
 'sub': 547,
 'sup': 36301,
 'table': 13114,
 'td': 175989,
 'th': 46258,
 'time': 

Based on our findings, it looks like there are quite a few `td`, `a`, `li`, and `span` tags.  This indicates that articles tend to have lots of links, along with lists and tables.  Links are the most numerous tag, which indicates how interconnected articles on Wikipedia are.

# Finding Common Words

In [40]:
from bs4 import BeautifulSoup
from collections import Counter
import re

def count_words(html):
    """Find the ten most frequent longer words in a piece of HTML.

    The document's text is lower-cased, runs of non-word characters are
    collapsed to single spaces, and only words of five or more characters
    are counted.

    Args:
        html: HTML markup as a string.

    Returns:
        A list of up to ten `(word, count)` tuples, most frequent first.
    """
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\W+", " ", text.lower())
    # The length filter also discards the empty strings split(" ") can produce.
    words = [w for w in text.split(" ") if len(w) >= 5]
    return Counter(words).most_common(10)

# Find each article's top words in parallel, then tally them across articles.
start = time.time()
# The context manager shuts the pool down afterwards, so the worker
# processes do not linger once counting is finished.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    words = list(pool.map(count_words, parsed))

# For each word, count the number of articles in which it appears in the top ten.
# NOTE(review): `count` is unused, so this is NOT the total number of
# occurrences. Confirm whether `+= count` was intended.
word_counts = {}
for wc in words:
    for word, count in wc:
        word_counts[word] = word_counts.get(word, 0) + 1
end = time.time()

print(end - start)
word_counts

133.99117708206177


{'junior': 8,
 'prannathji': 1,
 'touring': 1,
 'sofia': 1,
 'patrick': 5,
 'twice': 1,
 'managers': 3,
 'agostini': 1,
 'domination': 1,
 'billième': 1,
 'kavčič': 1,
 'dönhoff': 1,
 'uprising': 1,
 'moshe': 1,
 'bangladesh': 3,
 'muntaner': 1,
 'mayor': 11,
 'beasley': 1,
 'zeitweiser': 1,
 'first': 100,
 'gives': 1,
 'films1940s': 1,
 'januszkowo': 1,
 'minesweeper': 2,
 'commons': 6,
 '83222': 1,
 'rayon': 2,
 'humorist': 1,
 'hercules': 1,
 'encarnaçao': 1,
 'calling': 2,
 'today': 3,
 'kazuki': 1,
 'zoltán': 1,
 'fortified': 1,
 'martinez': 1,
 'båstad': 1,
 'presidential': 5,
 'contributor': 1,
 'trăng': 1,
 'preparatory': 1,
 'governor': 12,
 'pietism': 1,
 'least': 1,
 'sulfonic': 1,
 'manipuli': 1,
 'scindalmota': 1,
 'colliery': 1,
 'birchall': 1,
 'plotinus': 1,
 'narasaraopet': 1,
 'awareness': 2,
 'willimon': 1,
 'appear': 1,
 'monadenia': 1,
 'antioquia': 2,
 'polynomials': 1,
 'kitplanes': 1,
 'chiriguelo': 1,
 'asteraceae': 2,
 'nadia': 1,
 'thoracic': 1,
 'entrance': 

Only selecting the top `10` words from each article speeds up performance quite a bit.