{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introducing Wikipedia Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['%C3%81lvaro_Sierra.html',\n", " '%C3%89cole_des_Mines_de_Douai.html',\n", " '%C3%89taule.html',\n", " '%C3%96lm%C3%BCs_bir_kadinin_evraki_metrukesi.html',\n", " '%C5%81osie-Do%C5%82%C4%99gi.html',\n", " '%C5%8Cnog%C5%8D_Station.html',\n", " '%C5%9Eah%C3%AEn%C3%AA_Bekir%C3%AA_Sorekl%C3%AE.html',\n", " '(1-4)-a-D-glucan_1-a-D-glucosylmutase.html',\n", " '1._FC_Eschborn.html',\n", " '100%25_Banco.html',\n", " '100_Greatest_Romanians.html',\n", " '100_mm_field_gun_M1944_(BS-3).html',\n", " '104th_Logistic_Support_Brigade_(United_Kingdom).html',\n", " '1208_(band).html',\n", " '125th_(Lancashire_Fusiliers)_Brigade.html',\n", " '16th_Virginia_Infantry.html',\n", " '1860_in_science.html',\n", " '1866_in_birding_and_ornithology.html',\n", " '1879_FA_Cup_Final.html',\n", " '1896_Indiana_Hoosiers_football_team.html',\n", " '1898_Colgate_football_team.html',\n", " '1904_Case_football_team.html',\n", " '1905%E2%80%9306_FC_Barcelona_season.html',\n", " '1910_in_literature.html',\n", " '1915_Montana_football_team.html',\n", " '1937_Social_Credit_backbenchers%27_revolt.html',\n", " '1947_Notre_Dame_Fighting_Irish_football_team.html',\n", " '1951_National_League_tie-breaker_series.html',\n", " '1953%E2%80%9354_FA_Cup_qualifying_rounds.html',\n", " '1958_Wightman_Cup.html',\n", " '1963_Pittsburgh_Panthers_football_team.html',\n", " '1968_Duke_Blue_Devils_football_team.html',\n", " '1970_African_Cup_of_Nations.html',\n", " '1975%E2%80%9376_FIBA_Kora%C4%87_Cup.html',\n", " '1976_Kansas_State_Wildcats_football_team.html',\n", " '1978_College_Football_All-America_Team.html',\n", " '1982_Kentucky_Derby.html',\n", " '1982_World_Series.html',\n", " '1984%E2%80%9385_Southern_Football_League.html',\n", " '1984_WAC_Men%27s_Basketball_Tournament.html',\n", " 
'1987_Football_League_Trophy_Final.html',\n", " '1988_Chatham_Cup.html',\n", " '1988_State_of_Origin_series.html',\n", " '1989_Preakness_Stakes.html',\n", " '1990%E2%80%9391_Southern_Football_League.html',\n", " '1991%E2%80%9392_BHL_season.html',\n", " '1991_RTHK_Top_10_Gold_Songs_Awards.html',\n", " '1991_uprising_in_Basra.html',\n", " '1991_US_Open_%E2%80%93_Women%27s_Doubles.html',\n", " '1992_European_Athletics_Indoor_Championships.html',\n", " '1994_French_Open_%E2%80%93_Men%27s_Doubles.html',\n", " '1994_Scotland_rugby_union_tour_of_Argentina.html',\n", " '1995_FINA_Men%27s_Water_Polo_World_Cup.html',\n", " '1996_UEFA_Futsal_Championship.html',\n", " '1997_Edmonton_Eskimos_season.html',\n", " '1999%E2%80%932000_UEFA_Champions_League_second_group_stage.html',\n", " '1999_Caribbean_Cup.html',\n", " '1999_MTV_Video_Music_Awards.html',\n", " '1st_Strategic_Aerospace_Division.html',\n", " '2000_Pepsi_Southern_500.html',\n", " '2001%E2%80%9302_NBA_season.html',\n", " '2001_Australian_Individual_Speedway_Championship.html',\n", " '2001_Copa_del_Rey_Final.html',\n", " '2001_NCAA_Division_I_Field_Hockey_Championship.html',\n", " '2002%E2%80%9303_Scottish_Second_Division.html',\n", " '2003_World_Championships_in_Athletics_%E2%80%93_Women%27s_hammer_throw.html',\n", " '2004_Tuvalu_A-Division.html',\n", " '2005%E2%80%9306_in_Welsh_football.html',\n", " '2005_Asian_Women%27s_Club_Volleyball_Championship.html',\n", " '2005_in_radio.html',\n", " '2005_NCAA_Division_I_Men%27s_Basketball_Tournament.html',\n", " '2006_European_Athletics_Championships_%E2%80%93_Women%27s_triple_jump.html',\n", " '2007%E2%80%9308_Huddersfield_Town_A.F.C._season.html',\n", " '2007%E2%80%9308_Women%27s_EHF_Cup.html',\n", " '2007_in_Belgian_television.html',\n", " '2007_Kilkenny_Senior_Hurling_Championship.html',\n", " '2008_2._deild_karla.html',\n", " '2008_Fed_Cup_World_Group_II.html',\n", " '2008_Swedish_Open.html',\n", " '2009_English_cricket_season.html',\n", " 
'2009_European_Junior_Swimming_Championships.html',\n", " '2009_Giro_di_Lombardia.html',\n", " '2009_Open_Costa_Adeje_%E2%80%93_Isla_de_Tenerife_%E2%80%93_Singles.html',\n", " '2009_World_Junior_Ice_Hockey_Championships_rosters.html',\n", " '2010_Karshi_Challenger_%E2%80%93_Singles.html',\n", " '2010_New_Year%27s_Eve_tornado_outbreak.html',\n", " '2011%E2%80%9312_Western_Collegiate_Hockey_Association_women%27s_ice_hockey_season.html',\n", " '2011_Dutch_National_Track_Championships_%E2%80%93_Men%27s_1_km_time_trial.html',\n", " '2011_ITU_Duathlon_World_Championships.html',\n", " '2011_Mosconi_Cup.html',\n", " '2011_UK_Open_Qualifier_4.html',\n", " '2011_Wainwright_Roaming_Buffalo_Classic.html',\n", " '2012%E2%80%9313_Liga_Na%C8%9Bional%C4%83_(women%27s_handball).html',\n", " '2012%E2%80%9313_SVB_Hoofdklasse.html',\n", " '2013%E2%80%9314_Stirling_%26_District_Amateur_Football_Association.html',\n", " '2013_Curlers_Corner_Autumn_Gold_Curling_Classic.html',\n", " '2013_FIA_WTCC_Race_of_Russia.html',\n", " '2013_Mississippi_State_Bulldogs_baseball_team.html',\n", " '2013_San_Luis_Open_Challenger_%E2%80%93_Doubles.html',\n", " '2013_World_Grand_Prix_(darts).html',\n", " '2014%E2%80%9315_Albanian_Third_Division.html',\n", " '2014%E2%80%9315_Kansas_State_Wildcats_men%27s_basketball_team.html',\n", " '2014%E2%80%9316_European_Nations_Cup_Second_Division.html',\n", " '2014_Kano_bombing.html',\n", " '2014_Special_Honours.html',\n", " '2014_World_RX_of_Turkey.html',\n", " '2015%E2%80%9316_PLK_season.html',\n", " '2015%E2%80%9316_Washington_Huskies_men%27s_basketball_team.html',\n", " '2015_NWSL_College_Draft.html',\n", " '2015_Thai_League_Cup.html',\n", " '2016%E2%80%9317_Little_Rock_Trojans_women%27s_basketball_team.html',\n", " '2016%E2%80%9317_New_Zealand_Football_Championship.html',\n", " '2016_Danmark_Rundt.html',\n", " '2016_FK_Haugesund_season.html',\n", " '2016_Minnesota_Vikings_season.html',\n", " '2017_Orange_County_Breakers_season.html',\n", " 
'23rd_Reserve_Battalion,_CEF.html',\n", " '24th_Genie_Awards.html',\n", " '25B-NBOMe.html',\n", " '26th_Illinois_Volunteer_Infantry_Regiment.html',\n", " '3-(Trifluoromethyl)aniline.html',\n", " '33rd_New_Jersey_Volunteer_Infantry.html',\n", " '427.html',\n", " '4_Way_Street.html',\n", " '50_caliber_Mark_7_gun.html',\n", " '55th_parallel_south.html',\n", " '59th_Oza.html',\n", " '83_(number).html',\n", " '877_Naval_Air_Squadron.html',\n", " '985_Innovative_Platforms_for_Key_Disciplines_Project.html',\n", " 'A%C3%A9ro_Services_Gu%C3%A9pard_Gu%C3%A9pard_912.html',\n", " 'A._P._Komala.html',\n", " 'A_Beautiful_Valley.html',\n", " 'A_Month_of_Sundays_(miniseries).html',\n", " 'A_Slow_Messe.html',\n", " 'A_Sport_and_a_Pastime.html',\n", " 'Aaron_Paul.html',\n", " 'Aaron_Williams_and_the_Hoodoo.html',\n", " 'Aaronsburg_Historic_District.html',\n", " 'Aattanayagann.html',\n", " 'AAU_Junior_Olympic_Games.html',\n", " 'AB_v_CD.html',\n", " 'Abanycha_bicolor.html',\n", " 'Abbas_Jadidi.html',\n", " 'Abbeville_Sluggers.html',\n", " 'Abdelwahid_Aboud_Mackaye.html',\n", " 'Abdulla_Majid_Al_Naimi.html',\n", " 'Abdulrahman_al-Awlaki.html',\n", " 'Abortion_in_Italy.html',\n", " 'Abortion_in_Oklahoma.html',\n", " 'Abou_Redis.html',\n", " 'Abraham_Booth.html',\n", " 'Abraham_Stanyan.html',\n", " 'Absolutely_Zippo.html',\n", " 'Abu_Oleymeh,_Ramshir.html',\n", " 'Abyssobela_atoxica.html',\n", " 'Acacia_dermatophylla.html',\n", " 'Academic_grading_in_Bosnia_and_Herzegovina.html',\n", " 'Acceptance_(Heroes).html',\n", " 'Acceptilation.html',\n", " 'Accommodation_bridge.html',\n", " 'Achaea_faber.html',\n", " 'Acharapakkam_(State_Assembly_Constituency).html',\n", " 'Achim_Schwarze.html',\n", " 'ACM_Transactions_on_Computational_Biology_and_Bioinformatics.html',\n", " 'Acrocercops_calycophthalma.html',\n", " 'Acrolophus_simulatus.html',\n", " 'Adelaide%E2%80%93Darwin_railway.html',\n", " 'Adios_(B%C3%B6hse_Onkelz_album).html',\n", " 'Adish_Aggarwala.html',\n", " 'ADNI.html',\n", " 
'Adolf%C3%B3w,_%C5%81%C3%B3d%C5%BA_Voivodeship.html',\n", " 'Adolf_Schulte.html',\n", " 'Adoration_of_the_Magi_(Filippino_Lippi).html',\n", " 'Adrian_Bawtree.html',\n", " 'Adrian_Clarke_(footballer).html',\n", " 'Aechmea_%27Pica%27.html',\n", " 'Agaritine_gamma-glutamyltransferase.html',\n", " 'Agesarchus_of_Tritaea.html',\n", " 'Agnes_Tait.html',\n", " 'Agnolin.html',\n", " 'Agostino_Gemelli.html',\n", " 'Agripina_Samper_Agudelo.html',\n", " 'Agullent.html',\n", " 'Agyneta_allosubtilis.html',\n", " 'Ahanta_West_District.html',\n", " 'Ahmad_Khatami_(professor).html',\n", " 'AIM_Phoneline.html',\n", " 'Air_Barons.html',\n", " 'Aiva.html',\n", " 'Ajminal.html',\n", " 'Aku_Partanen.html',\n", " 'Al-Midan.html',\n", " 'Al-Nurayn_Mosque.html',\n", " 'Al_Azim_Mosque.html',\n", " 'Al_Mahon.html',\n", " 'Al_Sherman.html',\n", " 'Alan_Oakley.html',\n", " 'Alan_Orr.html',\n", " 'Alan_Sheehan.html',\n", " 'Albanians_in_Egypt.html',\n", " 'Albert_French.html',\n", " 'Album-equivalent_unit.html',\n", " 'Aldona_Orman.html',\n", " 'Aleksandra_Jagie%C5%82o.html',\n", " 'Aleksei_Lazarev.html',\n", " 'Alex_Karalexis.html',\n", " 'Alex_Kurtzman.html',\n", " 'Alex_McEachern.html',\n", " 'Alexander_Armstrong_(explorer).html',\n", " 'Alexander_King_(author).html',\n", " 'Alexander_McCue.html',\n", " 'Alexander_Rizzoni.html',\n", " 'Alexandru_Greab.html',\n", " 'Alexei_Vinogradov.html',\n", " 'Alexios_Aspietes.html',\n", " 'Alexis_Lloyd.html',\n", " 'Ali_Al_Bulaihi.html',\n", " 'Aliabad-e_Robat.html',\n", " 'Alireza_JJ.html',\n", " 'Alisa_(soap_opera).html',\n", " 'All-Polish_Youth.html',\n", " 'All_Ceylon_Tamil_Congress.html',\n", " 'All_Lights_Fucked_on_the_Hairy_Amp_Drooling.html',\n", " 'All_Tangled_Up_in_Love.html',\n", " 'All_the_Fun.html',\n", " 'All_You_Get_from_Love_Is_a_Love_Song.html',\n", " 'Allama_Iqbal_Medical_College.html',\n", " 'Allan_Egolf.html',\n", " 'Allan_Lister_Samuel_Brown.html',\n", " 'Allen_Snyder_(lawyer).html',\n", " 
'Alnwick_District_Council_election,_2007.html',\n", " 'Alojz_Knafelc.html',\n", " 'Alpine_skiing_at_the_1994_Winter_Olympics_%E2%80%93_Men%27s_combined.html',\n", " 'Amanda_Peet.html',\n", " 'Amano_Artisan_Chocolate.html',\n", " 'Amber_Butchart.html',\n", " 'Amborella.html',\n", " 'AMD_XGP.html',\n", " 'Ameenapuram.html',\n", " 'American_Federation_of_Labor_Building.html',\n", " 'American_Journal_of_Kidney_Diseases.html',\n", " 'Aminonaphthalenesulfonic_acids.html',\n", " 'Amorphogynia.html',\n", " 'Ampelasia.html',\n", " 'Amphidromia.html',\n", " 'Amy_Watkins.html',\n", " 'Ana_Beatriz.html',\n", " 'Anatolii_Sloiko.html',\n", " 'Ancita_antennata.html',\n", " 'Ancylostoma_duodenale.html',\n", " 'Anders_Ahlgren.html',\n", " 'Andrei_Gamalyan.html',\n", " 'Andrew_McNally.html',\n", " 'Andrew_S._Tanenbaum.html',\n", " 'Andrew_Wilson_(classical_archaeologist).html',\n", " 'Angelic_non-determinism.html',\n", " 'Angolan_Union_for_Peace,_Democracy_and_Development.html',\n", " 'Aniavan.html',\n", " 'Anish_Giri.html',\n", " 'Anjada_Gandu.html',\n", " 'Anne_Whateley.html',\n", " 'Annopole_Stare.html',\n", " 'Annunciation_Church,_Walsall.html',\n", " 'Ant%C3%B3nio_Maria_Baptista.html',\n", " 'Ant%C3%B3nio_Marinho_e_Pinto.html',\n", " 'Antae_temple.html',\n", " 'Anthony_Cesario.html',\n", " 'Anthony_Marten.html',\n", " 'Antibiotic_use_in_livestock.html',\n", " 'Antigonadotropin.html',\n", " 'Antipater_of_Thessalonica.html',\n", " 'Antoine_Gakeme.html',\n", " 'Anton_von_Troeltsch.html',\n", " 'Antonio_Ciano.html',\n", " 'Antonio_de_Zayas_(bishop).html',\n", " 'Anuradhapura_West_Electoral_District.html',\n", " 'Apache_Bloodhound.html',\n", " 'Aphonopelma_chiricahua.html',\n", " 'Apocephalus_borealis.html',\n", " 'Appa_(film).html',\n", " 'Aq_Dash,_East_Azerbaijan.html',\n", " 'Aquaphobia.html',\n", " 'ARA_Garibaldi.html',\n", " 'Arabic_Toilers%27_Movement.html',\n", " 'Arcadio_Gonz%C3%A1lez.html',\n", " 'Archibald_Gordon_(British_Army_officer).html',\n", " 
'Archiepiscopal_Palace,_Rouen.html',\n", " 'Area_code_218.html',\n", " 'Area_of_refuge.html',\n", " 'Argentina_at_the_2015_World_Championships_in_Athletics.html',\n", " 'Arkansas_gubernatorial_election,_1968.html',\n", " 'Arkansas_Highway_178.html',\n", " 'Arnold_Freiherr_von_Biegeleben.html',\n", " 'Arrest_warrant.html',\n", " 'Arrondissements_of_the_Yvelines_department.html',\n", " 'Art_collection_of_Fondazione_Cassa_di_Risparmio_di_Perugia.html',\n", " 'Arthur_Baldwinson.html',\n", " 'Arthur_L._Aidala.html',\n", " 'Arthur_Lelyveld.html',\n", " 'Arthur_Mendes.html',\n", " 'Arthur_Norman.html',\n", " 'Arturo_Liebstein.html',\n", " 'Arun_K._Pati.html',\n", " 'Arvind_Gokhale.html',\n", " 'Ash_Rees.html',\n", " 'Asheqlu.html',\n", " 'Ashley,_Wisconsin.html',\n", " 'Ashu_Dani.html',\n", " 'Asparuh_Peak.html',\n", " 'Assegaaibosch_Nature_Reserve.html',\n", " 'Asun_Balzola.html',\n", " 'Athletics_at_the_1994_Commonwealth_Games_%E2%80%93_Men%27s_pole_vault.html',\n", " 'Athletics_at_the_2007_Games_of_the_Small_States_of_Europe.html',\n", " 'Athletics_at_the_2012_Summer_Paralympics_%E2%80%93_Women%27s_200_metres_T36.html',\n", " 'Atlanta_Area_School_for_the_Deaf.html',\n", " 'Atlantic_Studios.html',\n", " 'Atractaspis_duerdeni.html',\n", " 'Atrypanius_scitulus.html',\n", " 'Attila,_Illinois.html',\n", " 'Aubrey_Fair.html',\n", " 'Audefroi_le_Bastart.html',\n", " 'August_Duranowski.html',\n", " 'Augustine_Chacon.html',\n", " 'Austral_Alien.html',\n", " 'Australian_federal_election,_1954.html',\n", " 'Australian_hardcore.html',\n", " 'Australian_Musician_(magazine).html',\n", " 'Australiteuthis_aldrichi.html',\n", " 'Avalara.html',\n", " 'Avengers_Academy.html',\n", " 'Avenira.html',\n", " 'AVM_GmbH.html',\n", " 'Avyarud.html',\n", " 'Awali.html',\n", " 'Axel_Revold.html',\n", " 'Ayish_Bayou.html',\n", " 'Aylon_Darwin_Tavella.html',\n", " 'Ayodhya_Mandapam.html',\n", " 'Baadj.html',\n", " 'Baadshah_(1999_film).html',\n", " 'Bad_Axe_River.html',\n", " 
'Bad_Luck_(Social_Distortion_song).html',\n", " 'Baddiewinkle.html',\n", " 'Bagrat_VI_of_Georgia.html',\n", " 'Bahia_bigelovii.html',\n", " 'Bahmanabad-e_Olya.html',\n", " 'Baigts-de-B%C3%A9arn.html',\n", " 'Baithakata_College.html',\n", " 'Bal%C3%A1zs_Vill%C3%A1m.html',\n", " 'Ballard_Memorial_High_School.html',\n", " 'Ballonet.html',\n", " 'Balloon_light.html',\n", " 'Baltic_Peak.html',\n", " 'Ban_On.html',\n", " 'Bandar_Botanik_LRT_Station.html',\n", " 'Banovina_of_Croatia.html',\n", " 'Bar_One_Racing_Juvenile_Hurdle.html',\n", " 'Baraf.html',\n", " 'Barbara_McKinzie.html',\n", " 'Barbus_caudosignatus.html',\n", " 'Baritius_cyclozonata.html',\n", " 'Barn.html',\n", " 'Barnsdall_Art_Park.html',\n", " 'Barra_do_Rocha.html',\n", " 'Barren_vegetation.html',\n", " 'Bart_van_Leeuwen.html',\n", " 'Bartrum_Glacier.html',\n", " 'Batfink.html',\n", " 'Battle_of_Abydos.html',\n", " 'Battle_of_Finta.html',\n", " 'Battle_of_Isonzo_(489).html',\n", " 'Battle_of_Sulukh.html',\n", " 'Battle_of_Wattignies.html',\n", " 'Battleford-Cut_Knife.html',\n", " 'Bawdrip_Halt_railway_station.html',\n", " 'Bay_of_Concepci%C3%B3n.html',\n", " 'Bayandalai,_%C3%96mn%C3%B6govi.html',\n", " 'Bazemore,_Alabama.html',\n", " 'BBCH-scale_(rice).html',\n", " 'BBG_Academy.html',\n", " 'Be%C5%82chat%C3%B3w_Power_Station.html',\n", " 'Beaubassin_East,_New_Brunswick.html',\n", " 'Beaune_Altarpiece.html',\n", " 'Beaupr%C3%A9,_Quebec.html',\n", " 'Bebearia_romboutsi.html',\n", " 'Bechdel_test.html',\n", " 'Bed_and_Breakfast_(album).html',\n", " 'Begin_(Riyu_Kosaka_album).html',\n", " 'Beijing_Dongyue_Temple.html',\n", " 'Belgium_women%27s_national_field_hockey_team.html',\n", " 'Beli_language_(South_Sudan).html',\n", " 'Bellanca_28-92.html',\n", " 'Belmont_Estate.html',\n", " 'Belosavci.html',\n", " 'Belvedere,_London.html',\n", " 'Belvedere_Park,_Georgia.html',\n", " 'Ben_H._Williams.html',\n", " 'Ben_Rimalower.html',\n", " 'Bengt_Forsberg.html',\n", " 'Benidorm_Bastards.html',\n", " 
'Benktander_type_II_distribution.html',\n", " 'Benny_Lee.html',\n", " 'Bernard-Augustin_Conroy.html',\n", " 'Bert_Leboe.html',\n", " 'Bertilda_Samper_Acosta.html',\n", " 'Berzeliustinden.html',\n", " 'Bethoncourt.html',\n", " 'Beverley_Hills_Apartment_Block.html',\n", " 'Beyond_Magnetic.html',\n", " 'Bhimnagar,_Supaul.html',\n", " 'Bhubanananda_Das.html',\n", " 'Bianna_Golodryga.html',\n", " 'Bias.html',\n", " 'Bibiana_Beglau.html',\n", " 'Bibliography_of_Niue.html',\n", " 'Bicycle_basket.html',\n", " 'Bifidocarpus.html',\n", " 'Bijou,_California.html',\n", " 'Bilan_d%27aptitude_d%C3%A9livr%C3%A9_par_les_grandes_%C3%A9coles.html',\n", " 'Bill_Overstreet.html',\n", " 'Bill_Price_(baseball).html',\n", " 'Bill_Widenhouse.html',\n", " 'Bill_Williamson_(footballer).html',\n", " 'Billi%C3%A8me.html',\n", " 'Billy_Grammer.html',\n", " 'Billy_McNeill.html',\n", " 'Biorock.html',\n", " 'Bishop_of_Kingston.html',\n", " 'Biz_Kid$.html',\n", " 'Black_Moshannon_State_Park_Historic_Districts.html',\n", " 'Black_Pond_Township,_Oregon_County,_Missouri.html',\n", " 'Blackshear_Prison.html',\n", " 'Blick_nach_Rechts.html',\n", " 'Blohm.html',\n", " 'Blomheller_Station.html',\n", " 'Blommersia.html',\n", " 'Blue_Heelers_(season_8).html',\n", " 'Blue_Sky_Mining.html',\n", " 'Blue_SWAT.html',\n", " 'Bluesin%27_Around.html',\n", " 'Board_of_Jewish_Education_(Toronto).html',\n", " 'Boardman_Township,_Mahoning_County,_Ohio.html',\n", " 'Bob_Allen_(shortstop).html',\n", " 'Bob_Bass.html',\n", " 'Bob_Lustig.html',\n", " 'Bobo_Sikorski.html',\n", " 'Bon_Accord_Free_Church.html',\n", " 'Bone_morphogenetic_protein_4.html',\n", " 'Bones_Apart.html',\n", " 'Bonny_River.html',\n", " 'Boogie_Bunnies.html',\n", " 'Boone_Square.html',\n", " 'Boothapandi.html',\n", " 'Botswana_passport.html',\n", " 'Bowie_Race_Track.html',\n", " 'Bowlers%27_Club_of_New_South_Wales.html',\n", " 'Boynton_baronets.html',\n", " 'Br%C3%A9zina_District.html',\n", " 'Brachylogus.html',\n", " 'Brady_Hicks.html',\n", " 
'Braille_pattern_dots-126.html',\n", " 'Branko_Maru%C5%A1i%C4%8D.html',\n", " 'Brede_Waterworks.html',\n", " 'Brendan_Foster.html',\n", " 'Brian_Collins_(2010s_singer).html',\n", " 'Brian_G._W._Manning.html',\n", " 'Brian_Sims.html',\n", " 'Brightside_Apartments.html',\n", " 'British_Academy_Television_Award_for_Best_Comedy_(Programme_or_Series).html',\n", " 'Bro_Safari.html',\n", " 'Brograve_Mill.html',\n", " 'Brownfield_(software_development).html',\n", " 'Bruce_Conte.html',\n", " 'Bruce_Lyon.html',\n", " 'Bryan_Payton.html',\n", " 'Bryzg%C3%B3w.html',\n", " 'Buchanan_%26_Press.html',\n", " 'Buffington_Township,_Indiana_County,_Pennsylvania.html',\n", " 'Buford,_North_Dakota.html',\n", " 'Buko_pie.html',\n", " 'Bukovica_(Cazin).html',\n", " 'Bulbonaricus.html',\n", " 'Bulbophyllum_biflorum.html',\n", " 'Burbridge_Creek.html',\n", " 'Burmese_shrike.html',\n", " 'Burnin%27_Sneakers.html',\n", " 'Burnley_(surname).html',\n", " 'Burnley_Borough_Council_election,_2011.html',\n", " 'Burrough_Court.html',\n", " 'Busan%E2%80%93Gimhae_Light_Rail_Transit_Operation_Corporation.html',\n", " 'Business.com.html',\n", " 'BW_Vulpeculae.html',\n", " 'Bycz,_Kuyavian-Pomeranian_Voivodeship.html',\n", " 'Bye!_My_Boy!.html',\n", " 'Bye_Bye_Mon_Cowboy.html',\n", " 'C%27est_la_vie_(Khaled_song).html',\n", " 'C%C3%AA.html',\n", " 'C10H8.html',\n", " 'C11orf30.html',\n", " 'C18H23NO6.html',\n", " 'Cabrini%E2%80%93Green,_Chicago.html',\n", " 'Caladenia_amoena.html',\n", " 'Caldelas,_Sequeiros_e_Paranhos.html',\n", " 'California_Historical_Landmarks_in_San_Bernardino_County,_California.html',\n", " 'California_roach.html',\n", " 'California_Unfair_Competition_Law.html',\n", " 'Calin_Rovinescu.html',\n", " 'Calvin_Harrell.html',\n", " 'Camp_Liberty_killings.html',\n", " 'Camp_Nelson_Confederate_Cemetery.html',\n", " 'Campus_of_Texas_A%26M_University.html',\n", " 'Canada_Bay.html',\n", " 'Canbelego_County.html',\n", " 'Caney_Creek_(Matagorda_Bay).html',\n", " 'Cannabis_in_Spain.html',\n", " 
'Canyamars.html',\n", " 'Cape_Bonavista.html',\n", " 'Cape_Mayo.html',\n", " 'Cappella_San_Donato,_Venafro.html',\n", " 'Capriccio_Italien.html',\n", " 'Caprinia_versicolor.html',\n", " 'Caramel_Box_Yarukibako.html',\n", " 'Cardinals_created_by_Alexander_VIII.html',\n", " 'Cardipeltis.html',\n", " 'Carley_State_Park.html',\n", " 'Carlos_Martins_(musician).html',\n", " 'Carmen_Weber.html',\n", " 'Carroll_O._Switzer.html',\n", " 'Carrollwood_Day_School.html',\n", " 'Carte_Imagine%27R.html',\n", " 'Carthage,_Cincinnati.html',\n", " 'Caryl_Nowson.html',\n", " 'Cascade_Summit,_Oregon.html',\n", " 'Cascate_del_Rio_Verde.html',\n", " 'Caste_system_in_India.html',\n", " 'Castle_of_Cardona.html',\n", " 'Catlett_House_(Staunton,_Virginia).html',\n", " 'CCL8.html',\n", " 'CDC42EP3.html',\n", " 'Cechenena_sperlingi.html',\n", " 'Cecil,_Oregon.html',\n", " 'Cecil_Biggs.html',\n", " 'Cecil_Bothwell.html',\n", " 'Cecil_Peak.html',\n", " 'Cecile_of_Baux.html',\n", " 'Cecyl%C3%B3wka-Brz%C3%B3zka.html',\n", " 'Celebrity_doll.html',\n", " 'Celta_de_Vigo_B.html',\n", " 'Center_(group_theory).html',\n", " 'Central_District_(Rezvanshahr_County).html',\n", " 'Central_Luzon.html',\n", " 'Central_Park_(Tolyatti).html',\n", " 'Centzon_Totochtin.html',\n", " 'Ceryx_hageni.html',\n", " 'Ch%C3%A2telet_surface.html',\n", " 'Ch%C3%A9nens.html',\n", " 'Chadefaudiellaceae.html',\n", " 'Chah-e_Zakaria.html',\n", " 'Challenger_Banque_Nationale_de_Drummondville.html',\n", " 'Channel_48_digital_TV_stations_in_the_United_States.html',\n", " 'Chapel_of_Mercy,_Monaco-Ville.html',\n", " 'Charalampos_Brilakis.html',\n", " 'Charged_Records.html',\n", " 'Charles_Golding_Constable.html',\n", " 'Charles_Micallef.html',\n", " 'Charles_Page_(cricketer).html',\n", " 'Charles_S._Benton.html',\n", " 'Charles_Sears.html',\n", " 'Charles_Smith_Wilcox.html',\n", " 'Charles_Stuart_(rugby_union).html',\n", " 'Charles_Trudeau_(politician).html',\n", " 'Charles_Wadsworth.html',\n", " 'Charlton_baronets.html',\n", " 
'Chaun_Thompson.html',\n", " 'Checkerspot_(magazine).html',\n", " 'Cheddar_Ales.html',\n", " 'Chef_Wan.html',\n", " 'Chemin_d%27Aylmer.html',\n", " 'Cherechiu.html',\n", " 'Cheryl_Prewitt.html',\n", " 'Chester_Morris.html',\n", " 'Chestnut_pie.html',\n", " 'Chhaparband_(Muslim).html',\n", " 'Chhatrasal.html',\n", " 'Chicoreus_cervicornis.html',\n", " 'Chief_Justice_of_New_Zealand.html',\n", " 'Chihuahuan_Desert_Nature_Center_and_Botanical_Gardens.html',\n", " 'Chilango_(magazine).html',\n", " 'Children_of_Air_India.html',\n", " 'Children_of_the_Grave.html',\n", " 'Chile_de_%C3%A1rbol.html',\n", " 'Chilo_auricilius.html',\n", " 'Chinlone.html',\n", " 'Chionodes_retiniella.html',\n", " 'Chiriguelo.html',\n", " 'Chisocheton_cumingianus_subsp._kinabaluensis.html',\n", " 'Chocolate_Factory.html',\n", " 'Choseng_Trungpa.html',\n", " 'Christia_Mercer.html',\n", " 'Christian_Amoroso.html',\n", " 'Christian_Jessen.html',\n", " 'Christianity_and_antisemitism.html',\n", " 'Christine_Gardner.html',\n", " 'Christopher_Speer.html',\n", " 'Chromis_iomelas.html',\n", " 'Church_of_St_Mary_the_Virgin,_Keysoe.html',\n", " 'Church_of_the_SubGenius.html',\n", " 'CIE_1964_color_space.html',\n", " 'Circle_Repertory_Company.html',\n", " 'Circus_Avenue.html',\n", " 'Cisy,_Byt%C3%B3w_County.html',\n", " 'Cit%C3%A9_du_Vin.html',\n", " 'City_of_Mandaluyong_Science_High_School.html',\n", " 'Claire_Danes.html',\n", " 'Clarinet_Quintet_(T%C3%A4glichsbeck).html',\n", " 'Clarksville,_Pennsylvania.html',\n", " 'Class_C_GPCR.html',\n", " 'Claude%27s_syndrome.html',\n", " 'Claude-Godefroy_Coquart.html',\n", " 'Claude_Mandil.html',\n", " 'Claudia_Neidig.html',\n", " 'Clay_Research_Award.html',\n", " 'Clay_Township,_Clark_County,_Missouri.html',\n", " 'Clear_Creek_Township,_Cooper_County,_Missouri.html',\n", " 'Cleveland,_Oklahoma.html',\n", " 'Cleveland_Institution_of_Engineers.html',\n", " 'Cleveland_Play_House.html',\n", " 'Clinton_Engineer_Works.html',\n", " 'Clive_Brown_(footballer).html',\n", " 
'Clocking_Out_Is_for_Suckers.html',\n", " 'Clothes_Show_Live.html',\n", " 'Coalition_for_Unity_and_Democracy.html',\n", " 'Coalville_Town_railway_station.html',\n", " 'Coastal_upwelling_of_the_South_Eastern_Arabian_Sea.html',\n", " 'Cobble_Hill,_Brooklyn.html',\n", " 'Coble_hypersurface.html',\n", " 'Code_page_1023.html',\n", " 'Coenaculum_secundum.html',\n", " 'Col_Pearce.html',\n", " 'Colantonio_Incorporated.html',\n", " 'Colchester_Village_Historic_District.html',\n", " 'Cole_Shade_Sule.html',\n", " 'Colleen_Haskell.html',\n", " 'Colors_for_Trombone.html',\n", " 'Colossal_Titan_Strife.html',\n", " 'Colosseum_LiveS_%E2%80%93_The_Reunion_Concerts.html',\n", " 'Columbia_Airport_(Ohio).html',\n", " 'Comedy_Now!.html',\n", " 'Comisi%C3%B3n_Femenil_Mexicana_Nacional.html',\n", " 'Common_Sense_Revolution.html',\n", " 'Communities_of_Tulu_Nadu.html',\n", " 'Companys,_proc%C3%A9s_a_Catalunya.html',\n", " 'Compton%27s_Most_Wanted_discography.html',\n", " 'Computational_Geometry_(journal).html',\n", " 'Congregation_Agudath_Achim.html',\n", " 'Conjugate_depth.html',\n", " 'Connecticut_House_of_Representatives.html',\n", " 'Connell_Mansion.html',\n", " 'Consolidated_Commodore.html',\n", " 'Constance_D%27Arcy_Mackay.html',\n", " 'Constantin_Ro%C8%99u.html',\n", " 'Constitution_of_Kyrgyzstan.html',\n", " 'Continuity_of_Operations.html',\n", " 'Coosa_County_School_District.html',\n", " 'Copamyntis_infusella.html',\n", " 'Copenhagen_Skatepark.html',\n", " 'Copier_family.html',\n", " 'Corey_Lee.html',\n", " 'Cornix.html',\n", " 'Coronium_(gastropod).html',\n", " 'Coronoid_process_of_the_ulna.html',\n", " 'Corozal_Hospital.html',\n", " 'Corporation_(university).html',\n", " 'Corredores_Ferroviarios.html',\n", " 'Cortile_del_Belvedere.html',\n", " 'Cosmopterix_similis.html',\n", " 'Coup_d%27%C3%A9tat.html',\n", " 'Cr%C3%A8me_de_violette.html',\n", " 'Crab-eating_frog.html',\n", " 'Craig_Chester.html',\n", " 'Crawley_Down.html',\n", " 'Crazy_(2000_film).html',\n", " 
'Crazy_House_(1928_film).html',\n", " 'Crepidula_atrasolea.html',\n", " 'Cricoid_pressure.html',\n", " 'Criminal_Code_(Canada).html',\n", " 'Crisfield_Municipal_Airport.html',\n", " 'Crispoldus.html',\n", " 'Criss_Cross_(film).html',\n", " 'Cristian_Berdeja.html',\n", " 'Cryptographic_primitive.html',\n", " 'Ctenodes_zonata.html',\n", " 'Cultural_tourism_in_Egypt.html',\n", " 'Cuproxena_hoffmanana.html',\n", " 'Curridge.html',\n", " 'Curtis_F._Marbut.html',\n", " 'Curtis_Jones_(law).html',\n", " 'Curtis_Whitley.html',\n", " 'Curtiss-Wright_Hangar_(Columbia,_South_Carolina).html',\n", " 'Cyclohexane_conformation.html',\n", " 'Cyclone_Gamede.html',\n", " 'Cygany,_Masovian_Voivodeship.html',\n", " 'Dabir_Khan.html',\n", " 'Dachuan_District.html',\n", " 'Dahan-e_Kanak.html',\n", " 'Dailyhunt.html',\n", " 'Dakar_2:_The_World%27s_Ultimate_Rally.html',\n", " 'Dalian_Road_Station.html',\n", " 'Danbury_Ridge_Nature_Reserves.html',\n", " 'Dance_notation.html',\n", " 'Dancing_with_the_Stars_(Greece_season_5).html',\n", " 'Dani%C3%ABl_de_Clercq.html',\n", " 'Daniel_Cerone.html',\n", " 'Daniel_Glazman.html',\n", " 'Daniela_Del_Din.html',\n", " 'Daniella_Abreu.html',\n", " 'Danish_Maritime_Safety_Administration.html',\n", " 'Danny_Gray.html',\n", " 'Darantasia_cuneiplena.html',\n", " 'Daryl_Stanley.html',\n", " 'DataMeet.html',\n", " 'Datang,_Zhuji.html',\n", " 'Dave_Hill_(golfer).html',\n", " 'David_Beasley.html',\n", " 'David_Jesson.html',\n", " 'David_Mandel.html',\n", " 'David_Sands_(psychologist).html',\n", " 'David_Sencer.html',\n", " 'David_Solomona.html',\n", " 'David_Thomson,_3rd_Baron_Thomson_of_Fleet.html',\n", " 'Days_Creek_Formation.html',\n", " 'De_Bruijn_index.html',\n", " 'De_La_Salle_University_%E2%80%93_Dasmari%C3%B1as.html',\n", " 'Dean_Downing.html',\n", " 'Dean_Kukan.html',\n", " 'Deception_(Irish_TV_series).html',\n", " 'Decker_Township,_Richland_County,_Illinois.html',\n", " 'Decoy_receptors.html',\n", " 'Degerfors_IF.html',\n", " 'Delta_Wedding.html',\n", 
" 'Delvenau.html',\n", " 'Democratic_Forum_of_Germans_in_Romania.html',\n", " 'Demographics_of_American_Samoa.html',\n", " 'Demographics_of_Wallis_and_Futuna.html',\n", " 'Denis_Augustine_Hanley.html',\n", " 'Dennis_Chapman.html',\n", " 'Dennis_Shere.html',\n", " 'Denyse_Sibley.html',\n", " 'Der_Opernball.html',\n", " 'Derek_Acorah.html',\n", " 'Dermatomycosis.html',\n", " 'Derrick_White_(baseball).html',\n", " 'Desert_Rain_(song).html',\n", " 'Deserticossus_murinus.html',\n", " 'Desmiphora_bijuba.html',\n", " 'Destroyer_Squadron_7.html',\n", " 'Deuterocohnia_schreiteri.html',\n", " 'Devarampally.html',\n", " 'Developmental_robotics.html',\n", " 'Devil_on_Horseback.html',\n", " 'Devitt_Insurance.html',\n", " 'Devizes_Town_F.C..html',\n", " 'Dewoitine_D.21.html',\n", " 'Dextran_1.html',\n", " 'Dhanbad_(Vidhan_Sabha_constituency).html',\n", " 'Diablo_Swing_Orchestra.html',\n", " 'Dialogue_for_Hungary.html',\n", " 'Dick_Johnson_(clarinetist).html',\n", " 'Dick_Persson.html',\n", " 'Diego_Galv%C3%A1n.html',\n", " 'DiGiorgio_Corporation.html',\n", " 'Digital_Light_Processing.html',\n", " 'Dik%C4%BCi_parish.html',\n", " 'Dimanche_%C3%A0_Bamako.html',\n", " 'Dimension_of_an_algebraic_variety.html',\n", " 'Diphilus_(physician).html',\n", " 'Diplacus_aurantiacus.html',\n", " 'Discretionary_trust.html',\n", " 'Distance_Education_Centre,_Victoria.html',\n", " 'Divine_Incantations_Scripture.html',\n", " 'DJ-Kicks:_Henrik_Schwarz.html',\n", " 'Dmytrivka,_Shakhtarsk_Raion.html',\n", " 'Dokri_Taluka.html',\n", " 'Domaniowice.html',\n", " 'Domenico_Siniscalco.html',\n", " 'Dominique_Voynet.html',\n", " 'Don_Kindt,_Jr..html',\n", " 'Don_Nelson_Laramore.html',\n", " 'Don_Parsons_(ice_hockey).html',\n", " 'Don_Raye.html',\n", " 'Donal_Moloney.html',\n", " 'Donegal_Township,_Westmoreland_County,_Pennsylvania.html',\n", " 'Double_Wing_Attack.html',\n", " 'Doug_Sahm_and_Band.html',\n", " 'Doughty.html',\n", " 'Douglas_Trojans.html',\n", " 'Doumanaba.html',\n", " 
'Dowell_Philip_O%27Reilly.html',\n", " 'Downhill_Domination.html',\n", " 'Dr._Feelgood_(album).html',\n", " 'Dragnet_(franchise).html',\n", " 'Dresden-Plauen_railway_station.html',\n", " 'Drumstruck.html',\n", " 'Dryas_integrifolia.html',\n", " 'Du%C5%A1anovo.html',\n", " 'Du%C5%BEi_Monastery.html',\n", " 'Durham_Women%27s_F.C..html',\n", " 'DWTE-TV.html',\n", " 'DXRA.html',\n", " 'Dysgonia_pudica.html',\n", " 'Dzahadjou_Lamzand%C3%A9.html',\n", " 'DZYM.html',\n", " 'E._E._Evans-Pritchard.html',\n", " 'E._F._McClellan.html',\n", " 'Earl_of_Anglesey.html',\n", " 'Early_medieval_states_in_Kazakhstan.html',\n", " 'Earth_Liberation_Front.html',\n", " 'EarthCheck_Assessed.html',\n", " 'East_Bloomfield_Historic_District.html',\n", " 'East_Down_(Northern_Ireland_Parliament_constituency).html',\n", " 'Eastbrook_Academy.html',\n", " 'Eastern_Finnmark_Police_District.html',\n", " 'Echembrotus.html',\n", " 'Eco-Drive.html',\n", " 'Ed_Jackson_(rugby_player).html',\n", " 'Eddie_Durie.html',\n", " 'Eddie_Eyre.html',\n", " 'Edmonton_municipal_election,_1961.html',\n", " 'EDP_Sarichioi_Wind_Farm.html',\n", " 'Eduardo_De_Filippo.html',\n", " 'Education_and_Training_Board.html',\n", " 'Education_in_Tacloban.html',\n", " 'Edward_Grosvenor.html',\n", " 'Edward_Holmes_Baldock_(dealer).html',\n", " 'Edwin_Bollier.html',\n", " 'Egg_coffee.html',\n", " 'Eggerberg_railway_station.html',\n", " 'Ego_Dominus_Tuus.html',\n", " 'Einstein_(horse).html',\n", " 'Eivind_Hiis_Hauge.html',\n", " 'Ek_Dil_Sau_Afsane.html',\n", " 'El%C5%BCbieta_Dru%C5%BCbacka.html',\n", " 'El_Bordj.html',\n", " 'El_Pas_de_la_Casa.html',\n", " 'Elaine_Fleming.html',\n", " 'ELED.html',\n", " 'Elgin_National_Watch_Company.html',\n", " 'Elio_Calderini.html',\n", " 'Elisa_von_der_Recke.html',\n", " 'Elizabeth_Gray_(fossil_collector).html',\n", " 'Elo%C3%ADna_Miyares_Berm%C3%BAdez.html',\n", " 'Els_Dottermans.html',\n", " 'Elzear_Torreggiani.html',\n", " 'Emarginula_koon.html',\n", " 
'Embraer_Unidade_Gavi%C3%A3o_Peixoto_Airport.html',\n", " 'EMD_SD40-2.html',\n", " 'Emily_(1964_song).html',\n", " 'Eminencia.html',\n", " 'Emmanuel_Muhammad.html',\n", " 'Emperor_(novel_series).html',\n", " 'Enamul_Haque_(cricketer,_born_1966).html',\n", " 'Endre_Church.html',\n", " 'ENESSERE.html',\n", " 'English_cricket_team_in_South_Africa_in_1956%E2%80%9357.html',\n", " 'ENMAX_Centrium.html',\n", " 'Enrique_Morales.html',\n", " 'Environmental_impact_of_agriculture.html',\n", " 'Epichorista.html',\n", " 'Epropetes_metallica.html',\n", " 'Equestrian_at_the_1936_Summer_Olympics.html',\n", " 'Erd%C5%91s%E2%80%93Faber%E2%80%93Lov%C3%A1sz_conjecture.html',\n", " 'Eressa_aperiens.html',\n", " 'Eric_Ziebold.html',\n", " 'Erigeron_canaani.html',\n", " 'Erwin_Lutzer.html',\n", " 'Eryngium_planum.html',\n", " 'Esher_Church_of_England_High_School.html',\n", " 'Estadio_Municipal_Pozoblanco.html',\n", " 'Estelle_v._Smith.html',\n", " 'Eternamente_Romanticos.html',\n", " 'Ethan_Van_der_Ryn.html',\n", " 'Ethel_Johns.html',\n", " 'ETS-VIII.html',\n", " 'Eudonia_notozeucta.html',\n", " 'Eugamandus_brunneus.html',\n", " 'Eugenio_Morelli.html',\n", " 'Eupithecia_trancasae.html',\n", " 'Europa_(Roman_province).html',\n", " 'Euxoa_simulata.html',\n", " 'Evanston,_Cincinnati.html',\n", " 'Evene.html',\n", " 'Ewa_Kasprzyk_(athlete).html',\n", " 'Explanation_module.html',\n", " 'Exploratorium_(film).html',\n", " 'Extension_of_the_Wish.html',\n", " 'Eye_of_the_Beholder_III:_Assault_on_Myth_Drannor.html',\n", " 'Eyes_Galaxies.html',\n", " 'F%C3%A9lix_C%C3%A1rdenas.html',\n", " 'F._L._Griggs.html',\n", " 'Faat_Zakirov.html',\n", " 'Face_Drop.html',\n", " 'Faces_(Run%E2%80%93D.M.C._song).html',\n", " 'Facial_cleft.html',\n", " 'Facundo_Gamband%C3%A9.html',\n", " 'Fahy,_County_Mayo.html',\n", " 'Failing_Office_Building.html',\n", " 'Faizullah.html',\n", " 'Falman-County_Acres,_Texas.html',\n", " 'False_pregnancy.html',\n", " 'Faryab,_Dashtestan.html',\n", " 'FC_Bobruisk.html',\n", " 
'FC_Khikhani_Khulo.html',\n", " 'Fe%27i_banana.html',\n", " 'Fear_play.html',\n", " 'Feel_No_Fret.html',\n", " 'Feiner_v._New_York.html',\n", " 'Feodosia_Morozova.html',\n", " 'Ferdinand_Arnodin.html',\n", " 'Ferdinand_Mainzer.html',\n", " 'Feria_del_Sol_(M%C3%A9rida).html',\n", " 'Fernanda_Vasconcellos.html',\n", " 'Feuerzangenbowle.html',\n", " 'Fictionwise.html',\n", " 'Fifty_Years_of_Music.html',\n", " 'Filip_Pyrochta.html',\n", " 'Filochy.html',\n", " 'Findwell.html',\n", " 'FinnSec_Security.html',\n", " 'Finnveden.html',\n", " 'Firebaugh,_California.html',\n", " 'Fireworks_EP.html',\n", " 'Firmin%C3%B3polis.html',\n", " 'Fischli.html',\n", " 'Flag_of_Chicago.html',\n", " 'Flag_of_the_Austral_Islands.html',\n", " 'Flashmob_(album).html',\n", " 'Flat_roof.html',\n", " 'Fleetwood,_Oklahoma.html',\n", " 'Flight_Design_Boxtair.html',\n", " 'Flour_Babies.html',\n", " 'Foidolite.html',\n", " 'Foilbacks.html',\n", " 'Follow_(album).html',\n", " 'Football_at_the_National_Games_of_China.html',\n", " 'Football_Foundation.html',\n", " 'Fort_Astoria.html',\n", " 'Fortabat_Art_Collection.html',\n", " 'Forth_and_Bargy_dialect.html',\n", " 'Fortuna_Glacier.html',\n", " 'Forward_Operating_Base_Grizzly.html',\n", " 'Fougangoue.html',\n", " 'Foulonia.html',\n", " 'Fractured_(Everything_I_Said_Was_True).html',\n", " 'France_in_the_Eurovision_Song_Contest_2010.html',\n", " 'Frances_Morris_(actress).html',\n", " 'Frank_C._Archibald_(Vermont_politician).html',\n", " 'Frank_Drum.html',\n", " 'Frank_Gross.html',\n", " 'Frans,_Ain.html',\n", " 'Franz_Georg_Benkert.html',\n", " 'Fraser_Lake_Airport.html',\n", " 'Fred_H._Hildebrandt.html',\n", " 'Fred_Marks.html',\n", " 'Frederick_Leadbetter.html',\n", " 'Freedom_(White_Heart_album).html',\n", " 'Freehold_Township_Schools.html',\n", " 'Friedrich_Albrecht_Anton_Meyer.html',\n", " 'Friedrich_Kirchner.html',\n", " 'Friendship,_Wake_County,_North_Carolina.html',\n", " 'Frisilia_sulcata.html',\n", " 'Frithjof_Schmidt.html',\n", " 
'From_All_Sides.html',\n", " 'Frost_Township,_Michigan.html',\n", " 'Fu_Biao.html',\n", " 'Fucibet.html',\n", " 'Functoid.html',\n", " 'Fureai_kippu.html',\n", " 'Furkan_Korkmaz.html',\n", " 'Furto_di_sera_bel_colpo_si_spera.html',\n", " 'Furubira_District,_Hokkaido.html',\n", " 'Fusinus_carvalhoriosi.html',\n", " 'Future_Weather.html',\n", " 'G%C3%B3ra,_Mogilno_County.html',\n", " 'G%C3%B6ncruszka.html',\n", " 'G.O.Y.A._(Gunz_Or_Yay_Available).html',\n", " 'G.S._Lamias_Achilleus.html',\n", " 'Ga%C5%A1parci.html',\n", " 'Gabriele_Poso.html',\n", " 'Gacharageini.html',\n", " 'Gad_(deity).html',\n", " 'Gaetanus_Matthew_Perez.html',\n", " 'Gal%C3%A1pagos,_Guadalajara.html',\n", " 'Galician_Nationalist_Bloc.html',\n", " 'Galloway_and_Upper_Nithsdale_(Scottish_Parliament_constituency).html',\n", " 'Gander_(surname).html',\n", " 'Gandigwad.html',\n", " 'Gang_run_printing.html',\n", " 'Garde_Civique.html',\n", " 'Garden-based_learning.html',\n", " ...]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "os.listdir(\"wiki\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2997" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(os.listdir(\"wiki\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n", "Cecil Peak - Wikipedia\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\t\t
\n", "\t\t
\n", "\t\t
\n", "\t\t\t\n", "\n", "\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t
\n", "
\n", "\t\t\t

Cecil Peak

\n", "\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t
From Wikipedia, the free encyclopedia
\n", "\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\tJump to:\t\t\t\t\tnavigation, \t\t\t\t\tsearch\n", "\t\t\t\t
\n", "\t\t\t\t

Cecil Peak is a mountain in the Wakatipu Basin, New Zealand and reaches a height of 1,978 metres.[1] It is on the south side of Lake Wakatipu, south south-west of Queenstown and is highly prominent from around this area. [2]

\n", "
\n", "
\"\"\n", "
\n", "
\n", "View from Fernhill of Cecil Peak (Walter Peak on right). Kā Kamu-a-Hakitekura is the Māori name for both mountains.
\n", "
\n", "
\n", "

Vegetation is mainly grass and tussock (as it is under a pastoral lease[3]) with trees near the waterline. Hidden Island is one of four islands in Lake Wakatipu and sits very close to the shoreline of Cecil Peak. On 27 March 2010 a local band performed an outdoor concert in a natural amphitheatre on the peak playing songs from the band Pink Floyd.[4]

\n", "

Name[edit]

\n", "

Both Cecil Peak and the nearby mountain of Walter Peak were named after William Rees' eldest sons' first given names by the surveyor James McKerrow in 1862.[5]

\n", "

See also[edit]

\n", "\n", "

References[edit]

\n", "
\n", "
    \n", "
  1. ^ Malcolm, McKinnon. \"Otago places - Wakatipu basin\". Te Ara - the Encyclopedia of New Zealand. 
  2. \n", "
  3. ^ \"Ngai Tahu Names\" (PDF). Te Karaka Issue 44, pg 13. tekaraka.co.nz. Retrieved 18 September 2013. 
  4. \n", "
  5. ^ \"Crown Pastoral Land Tenure Review\" (PDF). Land Information New Zealand. 
  6. \n", "
  7. ^ \"Rocking it on Cecil Peak\". Scoop Media. 
  8. \n", "
  9. ^ Jardine, D.G. (1978). Shadows on the Hill. A.H. & A.W. Reed Ltd. p. 185. ISBN 0589010093. 
  10. \n", "
\n", "
\n", "

Coordinates: 45°06′32″S 168°37′42″E / 45.108831°S 168.628463°E / -45.108831; 168.628463

\n", "


\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
\t\t\t\t\t
\n", "\t\t\t\t\t\tRetrieved from \"https://en.wikipedia.org/w/index.php?title=Cecil_Peak&oldid=724716112\"\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t
\n", "\t\t
\n", "\t\t
\n", "\t\t\t

Navigation menu

\n", "\n", "\t\t\t
\n", "\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t

Personal tools

\n", "\t\t\t\t\t\t\n", "\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t

Namespaces

\n", "\t\t\t\t\t\t
    \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
  • Article
  • \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
  • Talk
  • \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t\t\t

\n", "\t\t\t\t\t\t\tVariants\n", "\t\t\t\t\t\t

\n", "\n", "\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t
    \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t
\n", "\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t

Views

\n", "\t\t\t\t\t\t
    \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
  • Read
  • \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
  • Edit
  • \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
  • View history
  • \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t

More

\n", "\n", "\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t
    \n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t
\n", "\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t

\n", "\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t

\n", "\n", "\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\t\t\t\t\t
\n", "\t\t\t\t\t\t
\n", "\t\t\t\t\t
\n", "\t\t\t\t\t\t\t\t\t
\n", "\t\t\t
\n", "\t\t\t
\n", "\t\t\t\t
\n", "\t\t\t\t\t\t
\n", "\t\t\t

Navigation

\n", "\n", "\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t\t
\n", "\t\t
\n", "\t\t\t
\n", "\t\t\t

Interaction

\n", "\n", "\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t\t
\n", "\t\t
\n", "\t\t\t
\n", "\t\t\t

Tools

\n", "\n", "\t\t\t\n", "\t\t
\n", "\t\t\t
\n", "\t\t\t

Print/export

\n", "\n", "\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t\t
\n", "\t\t
\n", "\t\t\t
\n", "\t\t\t

Languages

\n", "\n", "\t\t\t
\n", "\t\t\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t\t
\n", "\t\t
\n", "\t\t\t\t
\n", "\t\t
\n", "\t\t
\n", "\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t\t\t\t\t\n", "\t\t\t\t\t\t
\n", "\t\t
\n", "\t\t\n", "\t\n", "\n", "\n" ] } ], "source": [ "with open(\"wiki/Cecil_Peak.html\") as f:\n", " print(f.read())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It appears that the main content is nested inside the `div` tag with the id `content`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reading in the Data" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.952831983566284\n" ] } ], "source": [ "import concurrent.futures\n", "import time\n", "\n", "pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)\n", "\n", "def read_data(filename):\n", " with open(filename) as f:\n", " data = f.read()\n", " return data\n", "\n", "start = time.time()\n", "filenames = [\"wiki/{}\".format(f) for f in os.listdir(\"wiki\")]\n", "content = pool.map(read_data, filenames)\n", "content = list(content)\n", "\n", "end = time.time()\n", "print(end - start)\n", "articles = [f.replace(\".html\", \"\").replace(\"wiki/\", \"\") for f in filenames]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After doing some profiling, it doesn't appear that threading makes a huge difference to performance. It may be that, although the file reads can overlap across threads, most of that gain is offset by the overhead of creating new threads." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Remove Extraneous Markup" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "139.67757391929626\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "\n", "def parse_html(html):\n", " soup = BeautifulSoup(html, 'html.parser')\n", " return str(soup.find_all(\"div\", id=\"content\")[0])\n", "\n", "start = time.time()\n", "pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)\n", "parsed = pool.map(parse_html, content)\n", "parsed = list(parsed)\n", "end = time.time()\n", "\n", "print(end - start)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'
\\n\\n
\\n
\\n
\\n

Álvaro Sierra

\\n
\\n
From Wikipedia, the free encyclopedia
\\n
\\n
\\n\\t\\t\\t\\t\\tJump to:\\t\\t\\t\\t\\tnavigation, \\t\\t\\t\\t\\tsearch\\n
\\n
This name uses Spanish naming customs: the first or paternal family name is Sierra\\xa0and the second or maternal family name is Peña.
\\n

Álvaro Sierra Peña (born April 4, 1967 in Sogamoso, Boyacá) is a retired male professional road racing cyclist from Colombia.

\\n

Career[edit]

\\n
\\n
\\n
1990
\\n
6th in General Classification GP Internacional de Café, Bogota (COL)
\\n
1991
\\n
1st in General Classification Vuelta a Colombia (COL)
\\n
1993
\\n
8th in General Classification Vuelta a Colombia (COL)
\\n
1994
\\n
2nd in General Classification Vuelta a Colombia (COL)
\\n
1995
\\n
3rd in Stage 4 Vuelta a Colombia, Manizales (COL)
\\n
2nd in Stage 8 Vuelta a Colombia, Buenaventura (COL)
\\n
3rd in Stage 11 Vuelta a Colombia, Ibagué (COL)
\\n
3rd in General Classification Vuelta a Colombia (COL)
\\n
1996
\\n
2nd in Stage 9 Clásico RCN, Santa Helena (COL)
\\n
6th in General Classification Clásico RCN (COL)
\\n
1997
\\n
7th in General Classification Clásico RCN (COL)
\\n
7th in General Classification Vuelta a Colombia (COL)
\\n
1998
\\n
1st in General Classification Vuelta a Antioquia (COL)
\\n
1st in General Classification Vuelta a Boyacà (COL)
\\n
1st in Stage 9 Clásico RCN, Bogotá (COL)
\\n
5th in General Classification Clásico RCN (COL)
\\n
1999
\\n
1st in General Classification Vuelta a Boyacà (COL)
\\n
7th in General Classification Vuelta a Colombia (COL)
\\n
1st in Stage 3 Clásico RCN, Alto de Patios (COL)
\\n
2nd in Stage 7 Clasico RCN, El Cable (COL)
\\n
2nd in General Classification Clasico RCN (COL)
\\n
2000
\\n
2nd in Stage 15 Vuelta a Colombia, Tunja (COL)
\\n
1st in Stage 2 Vuelta a Boyacà, El Humilladero (COL)
\\n
2nd in Stage 3 Vuelta a Boyacà, Mongui (COL)
\\n
2nd in General Classification Vuelta a Boyacá (COL)
\\n
5th in General Classification Clásico RCN (COL)
\\n
3rd in General Classification Vuelta a Venezuela (VEN)
\\n
2001
\\n
1st in General Classification Vuelta a Boyacà (COL)
\\n
3rd in Stage 7 Vuelta al Tachira, Circuito en Mérida con llegada en Tovar (VEN)
\\n
2002
\\n
2nd in Stage 4 Clasica del Meta, Alto de Buenavista (COL)
\\n
1st in Stage 4 Vuelta al Tolima, Ibagué (COL)
\\n
3rd in Stage 5 Vuelta a Boyacà, Toca (COL)
\\n
2nd in General Classification Vuelta a Boyacà (COL)
\\n
3rd in Stage 6 Vuelta a Boyacà, Tunja (COL)
\\n
2003
\\n
1st in General Classification Doble Sucre Potosí GP Cemento Fancesa (BOL)
\\n
1st in Stage 1 Clasica Integración de la Guadua-Gobernación de Risaralda, Marsella (COL)
\\n
2nd in Stage 2 Clasica Integración de la Guadua-Gobernación de Risaralda, Apia (COL)
\\n
1st in General Classification Clasica Integración de la Guadua-Gobernación de Risaralda (COL)
\\n
3rd in Stage 3 Vuelta a Antioquia, Jerico (COL)
\\n
1st in Stage 4 Vuelta a Antioquia, Alto de El Escobero (COL)
\\n
2nd in General Classification Vuelta a Antioquia (COL)
\\n
2nd in Stage 5 Vuelta a Colombia, El Escobero (COL)
\\n
2nd in Stage 13 Vuelta a Colombia, Cali (COL)
\\n
3rd in General Classification Vuelta a Colombia (COL)
\\n
3rd in Stage 4 Vuelta a Boyacà, Jenesano (COL)
\\n
3rd in Stage 5 Vuelta a Boyacà, Tunja (COL)
\\n
2nd in Stage 7 Clásico RCN, Tunja (COL)
\\n
3rd in Stage 8 Clasico RCN, Alto de Patios (COL)
\\n
2nd in General Classification Clasico RCN (COL)
\\n
1st in Stage 3 Doble Copacabana GP Fides, La Paz (BOL)
\\n
1st in Stage 4 Doble Copacabana GP Fides, Viarcha (BOL)
\\n
2nd in Stage 5 part b Doble Copacabana GP Fides, Copacabana (BOL)
\\n
1st in General Classification Doble Copacabana GP Fides (BOL)
\\n
2nd in Stage 4 Vuelta a Costa Rica, Ciudad Quesada (CRC)
\\n
1st in Stage 7 Vuelta a Costa Rica, Barva (CRC)
\\n
2nd in Stage 8 Vuelta a Costa Rica, Cascajal de Coronado (CRC)
\\n
1st in Stage 10 Vuelta a Costa Rica, Pérez Zeledón (CRC)
\\n
3rd in General Classification Vuelta a Costa Rica (CRC)
\\n
2004
\\n
2nd in Stage 1 Clásica Club Deportivo Boyacá, Alto del Topo (COL)
\\n
1st in General Classification Clásica Club Deportivo Boyacá (COL)
\\n
1st in Stage 2 Vuelta a Cundinamarca, Fusagasugá (COL)
\\n
1st in General Classification Vuelta a Cundinamarca (COL)
\\n
5th in General Classification Clásico RCN (COL)
\\n
2005
\\n
1st in Stage 1 Doble Sucre Potosí GP Cemento Fancesa, Potosí (BOL)
\\n
1st in Stage 2 Doble Sucre Potosí GP Cemento Fancesa, Potosí (BOL)
\\n
1st in General Classification Doble Sucre Potosí GP Cemento Fancesa (BOL)
\\n
1st in Stage 3 Vuelta a Boyacà, Saboyá (COL)
\\n
3rd in Stage 4 Vuelta a Boyacà, Cucaita (COL)
\\n
2nd in Stage 3 Vuelta a Colombia, Armenia (COL)
\\n
3rd in Stage 8 Vuelta a Colombia, Jerico (COL)
\\n
3rd in Stage 13 Vuelta a Colombia, La Y (COL)
\\n
3rd in General Classification Vuelta a Colombia (COL)
\\n
3rd in General Classification GP Cootranspensilvania (COL)
\\n
3rd in Stage 7 Clásico RCN, Armenia (COL)
\\n
2nd in General Classification Clasico RCN (COL)
\\n
3rd in Stage 8 Clasico RCN, Manizales (COL)
\\n
2nd in Stage 2 Doble Copacabana GP Fides, La Paz (BOL)
\\n
1st in Stage 4 Doble Copacabana GP Fides, Viacha (BOL)
\\n
2006
\\n
2nd in General Classification Doble Sucre Potosí GP Cemento Fancesa (BOL)
\\n
1st in Stage 8 Vuelta a Colombia, Armenia (COL)
\\n
1st in Stage 14 Vuelta a Colombia, Alto del Escobero (COL)
\\n
3rd in General Classification Vuelta a Colombia (COL)
\\n
2nd in Stage 1 Vuelta a Boyacà, Boyaca (COL)
\\n
1st in Stage 4 Doble Copacabana GP Fides, Viacha (BOL)
\\n
3rd in General Classification Doble Copacabana GP Fides (BOL)
\\n
6th in General Classification Clásico RCN (COL)
\\n
2007
\\n
3rd in Stage 2 Doble Sucre Potosí GP Cemento Fancesa, Potosi (BOL)
\\n
3rd in General Classification Doble Sucre Potosí GP Cemento Fancesa (BOL)
\\n
2008
\\n
3rd in Prologue Vuelta a Cundinamarca, Cogua (COL)
\\n
2nd in General Classification Vuelta a Cundinamarca (COL)
\\n
4th in General Classification Vuelta a Boyacà (COL)
\\n
2nd in Stage 6 Vuelta a Boyacà, Tunja (COL)
\\n
2nd in Stage 5 Vuelta a Bolivia, Oruro (BOL)
\\n
1st in Stage 7 part b Vuelta a Bolivia, Copacabana (BOL)
\\n
3rd in General Classification Vuelta a Bolivia (BOL)
\\n
3rd in Stage 8 part a Vuelta a Bolivia, San Pablo de Tiquina (BOL)
\\n
2nd in Stage 3 Vuelta Ciclista a la Republica del Ecuador, Quito (ECU)
\\n
2nd in Stage 5 Vuelta Ciclista a la Republica del Ecuador, Riobamba (ECU)
\\n
2nd in General Classification Vuelta Ciclista a la Republica del Ecuador (ECU)
\\n
\\n
\\n

References[edit]

\\n\\n\\n\\n\\n
\\n\\t\\t\\t\\t\\t\\tRetrieved from \"https://en.wikipedia.org/w/index.php?title=Álvaro_Sierra&oldid=729711134\"\\t\\t\\t\\t\\t
\\n
\\n
\\n
'" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This operation is quite slow and CPU-intensive. It looks like using as many processes as there are available processors speeds things up." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Finding Common Tags" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "71.57306599617004\n" ] }, { "data": { "text/plain": [ "{'a': 498830,\n", " 'abbr': 11536,\n", " 'annotation': 205,\n", " 'area': 138,\n", " 'audio': 8,\n", " 'b': 45291,\n", " 'bdi': 34,\n", " 'big': 416,\n", " 'blockquote': 148,\n", " 'br': 15599,\n", " 'caption': 609,\n", " 'center': 657,\n", " 'cite': 11605,\n", " 'code': 281,\n", " 'dd': 3761,\n", " 'del': 6,\n", " 'div': 88787,\n", " 'dl': 1425,\n", " 'dt': 953,\n", " 'font': 44,\n", " 'h1': 2997,\n", " 'h2': 12336,\n", " 'h3': 2720,\n", " 'h4': 405,\n", " 'h5': 26,\n", " 'h6': 1,\n", " 'hr': 240,\n", " 'i': 55074,\n", " 'img': 24457,\n", " 'li': 263520,\n", " 'map': 9,\n", " 'math': 205,\n", " 'mfrac': 181,\n", " 'mi': 1538,\n", " 'mn': 657,\n", " 'mo': 1399,\n", " 'mover': 14,\n", " 'mrow': 1302,\n", " 'mspace': 65,\n", " 'msqrt': 21,\n", " 'mstyle': 214,\n", " 'msub': 319,\n", " 'msubsup': 56,\n", " 'msup': 163,\n", " 'mtable': 4,\n", " 'mtd': 93,\n", " 'mtext': 13,\n", " 'mtr': 13,\n", " 'munder': 7,\n", " 'munderover': 17,\n", " 'noscript': 2997,\n", " 'ol': 2626,\n", " 'p': 24099,\n", " 'pre': 12,\n", " 'q': 162,\n", " 'rb': 16,\n", " 'rp': 32,\n", " 'rt': 16,\n", " 'ruby': 16,\n", " 's': 76,\n", " 'samp': 2,\n", " 'semantics': 205,\n", " 'small': 9415,\n", " 'source': 8,\n", " 'span': 218982,\n", " 'strong': 1847,\n", " 'sub': 547,\n", " 'sup': 36301,\n", " 'table': 13114,\n", " 'td': 175989,\n", " 'th': 46258,\n", " 'time': 34,\n", " 'tr': 84116,\n", " 'tt': 7,\n", " 'u': 110,\n", " 
'ul': 32928,\n", " 'wbr': 277}" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from bs4 import BeautifulSoup\n", "\n", "def count_tags(html):\n", " soup = BeautifulSoup(html, 'html.parser')\n", " tags = {}\n", " for tag in soup.find_all():\n", " if tag.name not in tags:\n", " tags[tag.name] = 0\n", " tags[tag.name] += 1\n", " return tags\n", "\n", "start = time.time()\n", "pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)\n", "tags = pool.map(count_tags, parsed)\n", "tags = list(tags)\n", "\n", "tag_counts = {}\n", "for tag in tags:\n", " for k,v in tag.items():\n", " if k not in tag_counts:\n", " tag_counts[k] = 0\n", " tag_counts[k] += v\n", "end = time.time()\n", "\n", "print(end - start)\n", "tag_counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Based on our findings, it looks like there are quite a few `td`, `a`, `li`, and `span` tags. This indicates that articles tend to have lots of links, along with lists and tables. Links are the most numerous tag, which indicates how interconnected articles on Wikipedia are." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Finding Common Words" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "133.99117708206177\n" ] }, { "data": { "text/plain": [ "{'junior': 8,\n", " 'prannathji': 1,\n", " 'touring': 1,\n", " 'sofia': 1,\n", " 'patrick': 5,\n", " 'twice': 1,\n", " 'managers': 3,\n", " 'agostini': 1,\n", " 'domination': 1,\n", " 'billième': 1,\n", " 'kavčič': 1,\n", " 'dönhoff': 1,\n", " 'uprising': 1,\n", " 'moshe': 1,\n", " 'bangladesh': 3,\n", " 'muntaner': 1,\n", " 'mayor': 11,\n", " 'beasley': 1,\n", " 'zeitweiser': 1,\n", " 'first': 100,\n", " 'gives': 1,\n", " 'films1940s': 1,\n", " 'januszkowo': 1,\n", " 'minesweeper': 2,\n", " 'commons': 6,\n", " '83222': 1,\n", " 'rayon': 2,\n", " 'humorist': 1,\n", " 'hercules': 1,\n", " 'encarnaçao': 1,\n", " 'calling': 2,\n", " 'today': 3,\n", " 'kazuki': 1,\n", " 'zoltán': 1,\n", " 'fortified': 1,\n", " 'martinez': 1,\n", " 'båstad': 1,\n", " 'presidential': 5,\n", " 'contributor': 1,\n", " 'trăng': 1,\n", " 'preparatory': 1,\n", " 'governor': 12,\n", " 'pietism': 1,\n", " 'least': 1,\n", " 'sulfonic': 1,\n", " 'manipuli': 1,\n", " 'scindalmota': 1,\n", " 'colliery': 1,\n", " 'birchall': 1,\n", " 'plotinus': 1,\n", " 'narasaraopet': 1,\n", " 'awareness': 2,\n", " 'willimon': 1,\n", " 'appear': 1,\n", " 'monadenia': 1,\n", " 'antioquia': 2,\n", " 'polynomials': 1,\n", " 'kitplanes': 1,\n", " 'chiriguelo': 1,\n", " 'asteraceae': 2,\n", " 'nadia': 1,\n", " 'thoracic': 1,\n", " 'entrance': 1,\n", " 'chiudinelli': 2,\n", " 'wilcox': 1,\n", " 'welfare': 2,\n", " 'choice': 1,\n", " 'cricoid': 1,\n", " 'zielona': 1,\n", " 'greyish': 2,\n", " 'naghizadeh': 1,\n", " 'arhndt': 1,\n", " 'vladimir': 1,\n", " 'flyin': 1,\n", " 'vilnius': 2,\n", " 'inflanty': 1,\n", " 'guaraldi': 1,\n", " 'viticultural': 1,\n", " 'scholars': 1,\n", " 'nagar': 5,\n", " 'hepburn': 1,\n", " 'reports': 1,\n", " '1357883': 
1,\n", " 'artificial': 1,\n", " 'biological': 2,\n", " 'francesco': 1,\n", " 'marguérite': 1,\n", " 'arxiv': 2,\n", " 'billion': 1,\n", " 'morgana': 1,\n", " 'confederation': 1,\n", " 'bahia': 2,\n", " 'brăila': 1,\n", " 'giuliani': 1,\n", " 'veronica': 1,\n", " 'sunde': 1,\n", " 'scoville': 1,\n", " 'surrounding': 1,\n", " 'chaun': 1,\n", " 'makindye': 1,\n", " 'derek': 1,\n", " 'regiments': 1,\n", " 'bosnia': 6,\n", " 'southend': 1,\n", " 'taylour': 1,\n", " 'waller': 1,\n", " 'opposite': 1,\n", " 'consultant': 1,\n", " 'ballyhale': 1,\n", " 'worcestershire': 1,\n", " 'godavari': 1,\n", " 'araneae': 1,\n", " 'physician': 1,\n", " 'draft': 10,\n", " 'raith': 1,\n", " 'congo': 1,\n", " 'paraguay': 2,\n", " 'highest': 5,\n", " 'argos': 1,\n", " 'citation': 9,\n", " 'chung': 1,\n", " 'erigeron': 1,\n", " 'ramirez': 1,\n", " 'fubinaca': 1,\n", " 'jordan': 3,\n", " 'iyama': 1,\n", " 'anatolia': 1,\n", " 'eugenio': 1,\n", " 'disney': 3,\n", " 'nacional': 2,\n", " 'funen': 1,\n", " 'luehderi': 1,\n", " 'urodilatin': 1,\n", " 'background': 1,\n", " 'vijay': 2,\n", " 'needs': 1,\n", " 'patriots': 1,\n", " 'permanent': 4,\n", " 'recuay': 1,\n", " 'clinic': 1,\n", " 'imperialis': 1,\n", " 'seibu': 1,\n", " 'ćetković': 1,\n", " 'wolfe': 1,\n", " 'fernanda': 1,\n", " 'scientist': 2,\n", " 'aminonaphthalenesulfonic': 1,\n", " 'vittra': 1,\n", " 'other': 38,\n", " 'zgierz': 2,\n", " 'bluegrass': 2,\n", " 'catalog': 2,\n", " 'nyblom': 1,\n", " '44333': 1,\n", " 'damsa': 1,\n", " 'forever': 1,\n", " 'advocates': 1,\n", " 'heckstall': 1,\n", " 'unincorporated': 17,\n", " 'munich': 3,\n", " 'avalon': 1,\n", " 'snakebite': 1,\n", " 'porrit': 1,\n", " 'ellison': 1,\n", " 'debian': 1,\n", " 'young': 14,\n", " 'nuway': 1,\n", " 'tyson': 1,\n", " 'bexley': 1,\n", " 'boehringer': 1,\n", " 'franklin': 2,\n", " 'pyrénées': 1,\n", " 'telescope': 1,\n", " 'metropolitan': 4,\n", " 'olfactory': 1,\n", " 'cardinals': 3,\n", " 'bausch': 1,\n", " 'middlesbrough': 3,\n", " 'score': 5,\n", " 
'henares': 1,\n", " 'flowered': 1,\n", " 'truck': 2,\n", " 'naichau': 1,\n", " 'production': 19,\n", " 'paschal': 1,\n", " 'jessen': 1,\n", " 'lebanon': 4,\n", " 'ratchaburi': 1,\n", " 'glazman': 1,\n", " 'daniels': 1,\n", " 'tottori': 1,\n", " 'información': 1,\n", " 'genome': 2,\n", " 'moscow': 6,\n", " 'nagashi': 1,\n", " 'abaúj': 1,\n", " 'allsvenskan': 1,\n", " 'vries': 1,\n", " 'poultry': 1,\n", " 'culkin': 1,\n", " 'chase': 1,\n", " 'ralph': 1,\n", " 'times': 22,\n", " 'gould': 1,\n", " 'borsod': 1,\n", " 'ghriss': 1,\n", " 'surgeon': 3,\n", " 'troglodytarum': 1,\n", " 'velika': 1,\n", " 'docking': 1,\n", " 'islam': 2,\n", " 'underground': 1,\n", " 'grackle': 1,\n", " 'points': 6,\n", " 'clean': 3,\n", " 'vanuatu': 1,\n", " 'pompey': 1,\n", " 'jarosław': 1,\n", " 'varus': 1,\n", " 'jaromír': 1,\n", " 'bibcode': 1,\n", " 'reader': 1,\n", " 'identifierswikipedia': 43,\n", " 'envelope': 1,\n", " 'wimberg': 1,\n", " 'xhcaa': 1,\n", " 'farida': 1,\n", " 'settlers': 1,\n", " 'ramanathan': 1,\n", " 'decimal': 1,\n", " 'stjørdal': 1,\n", " 'gender': 2,\n", " 'kamen': 1,\n", " 'arčikauskas': 1,\n", " 'simod': 1,\n", " 'wildlife': 4,\n", " 'holidays': 1,\n", " 'lighting': 1,\n", " 'kummerow': 1,\n", " 'siniscalco': 1,\n", " 'litera': 1,\n", " 'spookey': 1,\n", " 'member': 21,\n", " 'wilco': 1,\n", " 'songs': 23,\n", " 'abeno': 1,\n", " 'dordogne': 2,\n", " 'sciences': 2,\n", " 'maanikya': 1,\n", " '000000002016': 1,\n", " 'backs': 1,\n", " 'weapons': 2,\n", " 'spiel': 1,\n", " 'kirklees': 1,\n", " 'bruses': 1,\n", " 'sidéradougou': 1,\n", " 'studios': 4,\n", " 'vincent': 2,\n", " 'mandel': 1,\n", " 'named': 4,\n", " 'mangalore': 1,\n", " 'mickey': 1,\n", " 'salma': 1,\n", " 'lawsuit': 1,\n", " 'pietismus': 1,\n", " 'kalangis': 1,\n", " 'higgins': 1,\n", " 'tunisia': 2,\n", " 'vetriswaran': 1,\n", " 'dolni': 2,\n", " 'activities': 1,\n", " 'patanga': 1,\n", " 'coulommiers': 1,\n", " 'châtillon': 2,\n", " 'capriccio': 1,\n", " 'sędki': 1,\n", " 'accessory': 1,\n", " 
'turret': 1,\n", " 'visitors': 2,\n", " 'magazine': 17,\n", " 'yvelines': 1,\n", " 'caney': 1,\n", " 'bathurst': 1,\n", " 'brenda': 2,\n", " 'muğanlı': 1,\n", " 'detroit': 5,\n", " 'parrots': 1,\n", " 'railways': 1,\n", " 'rolling': 1,\n", " '59611': 1,\n", " 'köping': 1,\n", " 'ramachandran': 1,\n", " 'larvae': 6,\n", " 'likely': 1,\n", " 'pochyta': 1,\n", " 'mascara': 1,\n", " 'canot': 1,\n", " 'directive': 2,\n", " 'eighty': 1,\n", " 'bahuguna': 1,\n", " 'peptide': 2,\n", " 'macmillan': 1,\n", " 'furkan': 1,\n", " 'cable': 2,\n", " 'keonjhar': 1,\n", " 'boogie': 1,\n", " 'szczawin': 1,\n", " 'dašić': 1,\n", " 'antennata': 1,\n", " 'zschokke': 1,\n", " 'backyard': 1,\n", " 'sirusho': 1,\n", " 'pippo': 1,\n", " 'creative': 1,\n", " 'tropische': 1,\n", " 'chenar': 1,\n", " 'original': 19,\n", " 'gosling': 1,\n", " 'overstreet': 2,\n", " 'chisocheton': 1,\n", " 'öland': 1,\n", " 'chaux': 2,\n", " 'goiás': 2,\n", " 'ground': 3,\n", " 'event': 12,\n", " 'namhaedo': 1,\n", " 'teacher': 1,\n", " 'sylvian': 1,\n", " 'palmer': 5,\n", " 'writer': 11,\n", " 'macrobertson': 1,\n", " 'javier': 1,\n", " 'students': 20,\n", " 'helmond': 1,\n", " 'twiggy': 1,\n", " '73917': 1,\n", " 'rescue': 1,\n", " 'daniele': 1,\n", " 'innings': 1,\n", " 'rhialto': 1,\n", " 'dinamo': 3,\n", " 'edwin': 1,\n", " 'moderni': 1,\n", " 'shrek': 1,\n", " 'haider': 1,\n", " 'viamala': 1,\n", " 'pedro': 2,\n", " 'cheddar': 1,\n", " 'exploratorium': 1,\n", " 'bayfield': 1,\n", " 'subdivision': 1,\n", " 'sevens': 2,\n", " 'nelonen': 1,\n", " 'pergamus': 1,\n", " 'requires': 2,\n", " 'santa': 19,\n", " 'mengs': 1,\n", " 'vitaly': 1,\n", " 'bombardment': 1,\n", " 'gimhae': 1,\n", " 'deshamanya': 1,\n", " 'maine': 4,\n", " 'rivals': 1,\n", " 'ponder': 1,\n", " 'jeffs': 1,\n", " 'spicules': 1,\n", " 'unicode': 1,\n", " 'tomorrow': 1,\n", " 'cincinnati': 5,\n", " 'amborellaceae': 1,\n", " 'analog': 1,\n", " 'mediaș': 1,\n", " 'roxburgh': 1,\n", " 'huddersfield': 1,\n", " 'appenzell': 1,\n", " 'turista': 
1,\n", " 'branches': 1,\n", " 'kumamoto': 1,\n", " 'aliabad': 4,\n", " 'sloop': 1,\n", " 'montana': 5,\n", " 'funny': 1,\n", " 'document': 1,\n", " 'łosie': 1,\n", " '66083': 1,\n", " 'later': 7,\n", " 'atoxica': 1,\n", " 'paranhos': 1,\n", " 'former': 10,\n", " 'qualifier': 2,\n", " 'competed': 1,\n", " 'witzenhausen': 1,\n", " 'latin': 11,\n", " 'adams': 3,\n", " 'scales': 1,\n", " 'splash': 1,\n", " 'gorkha': 1,\n", " 'trojans': 2,\n", " 'floor': 2,\n", " 'hampson': 3,\n", " 'precinct': 1,\n", " 'arterial': 1,\n", " 'fidélis': 1,\n", " 'cutervo': 1,\n", " 'danmark': 1,\n", " 'groups': 3,\n", " 'veloso': 1,\n", " 'emulations': 1,\n", " 'pontiac': 1,\n", " 'dundas': 1,\n", " 'ariarathes': 1,\n", " 'channel': 8,\n", " 'description': 4,\n", " 'innocent': 1,\n", " 'sequences': 1,\n", " 'fuscozonata': 1,\n", " 'maharaj': 1,\n", " 'networking': 1,\n", " 'wizard': 2,\n", " 'criterium': 1,\n", " 'gasht': 1,\n", " 'acosta': 1,\n", " 'loving': 1,\n", " 'guarro': 1,\n", " 'bigelovii': 1,\n", " 'inches': 1,\n", " 'stian_aker': 1,\n", " 'gardner': 3,\n", " 'mudramothiram': 1,\n", " 'cobreloa': 1,\n", " 'mahony': 1,\n", " 'lithophila': 1,\n", " 'concepción': 1,\n", " 'smeaton': 1,\n", " 'haven': 2,\n", " 'reviews': 4,\n", " 'cindy': 1,\n", " 'references': 34,\n", " 'dzahadjou': 1,\n", " 'sicily': 1,\n", " 'garrisson': 1,\n", " 'schneider': 1,\n", " 'psychology': 5,\n", " 'warsaw': 2,\n", " 'guangde': 1,\n", " 'nominee': 1,\n", " 'muirí': 1,\n", " 'written': 10,\n", " 'leeuwen': 1,\n", " 'chaharduli': 1,\n", " 'tachinidae': 2,\n", " 'incumbent': 1,\n", " 'diego': 4,\n", " 'ennico': 1,\n", " 'kazan': 2,\n", " 'italiane': 1,\n", " 'coupier': 1,\n", " 'riffle': 1,\n", " 'saxophone': 1,\n", " 'fleet': 7,\n", " 'residence': 2,\n", " 'auryn': 1,\n", " 'hotels': 1,\n", " 'association': 10,\n", " 'histoplasmosis': 1,\n", " 'leuke': 1,\n", " 'imphal': 1,\n", " 'baloch': 1,\n", " 'asheqlu': 1,\n", " 'supernova': 1,\n", " 'namhae': 2,\n", " 'zebina': 1,\n", " 'mazowiecki': 1,\n", " 
'taluka': 2,\n", " 'rican': 1,\n", " 'subcutaneous': 1,\n", " 'admiral': 2,\n", " 'alakbarova': 1,\n", " 'nerve': 1,\n", " 'lyrics': 3,\n", " 'porto': 1,\n", " 'finnmark': 1,\n", " 'transistor': 1,\n", " 'pilot': 5,\n", " 'anwar': 1,\n", " 'mafeteng': 1,\n", " 'bridge': 13,\n", " 'tryavna': 2,\n", " 'doble': 2,\n", " 'maritime': 3,\n", " 'circumcarinata': 1,\n", " 'manor': 1,\n", " 'grichka': 1,\n", " 'planetary': 1,\n", " 'leprieuri': 1,\n", " 'plants': 4,\n", " 'sarojini': 1,\n", " 'lough': 1,\n", " 'safety': 2,\n", " 'meneng': 1,\n", " 'kartli': 1,\n", " 'badgers': 1,\n", " 'opinions': 1,\n", " 'contralateral': 1,\n", " 'tiranë': 1,\n", " 'bethoncourt': 1,\n", " 'kapinovo': 1,\n", " 'civil': 17,\n", " 'those': 1,\n", " 'estrone': 1,\n", " 'babak': 1,\n", " 'brian': 4,\n", " 'leaders': 7,\n", " 'mossman': 1,\n", " 'carley': 1,\n", " 'honey': 2,\n", " 'normandin': 1,\n", " 'known': 14,\n", " 'hydroxynaphthalene': 1,\n", " 'marcus': 1,\n", " 'personalities': 1,\n", " 'kapoor': 3,\n", " 'government': 29,\n", " 'ghisallo': 1,\n", " 'regional': 10,\n", " 'reaction': 1,\n", " 'works': 13,\n", " 'neville': 1,\n", " 'mausoleum': 1,\n", " 'icelandic': 1,\n", " 'methodist': 1,\n", " 'lakshmanan': 1,\n", " 'script': 1,\n", " 'three': 11,\n", " 'herzegovina': 7,\n", " 'runoff': 1,\n", " 'dawson': 1,\n", " 'eliskases': 1,\n", " 'vinca': 1,\n", " 'television': 50,\n", " 'bukovica': 1,\n", " 'reward': 1,\n", " 'noctuoidea': 1,\n", " 'jetpack': 1,\n", " 'picture': 2,\n", " 'kildare': 1,\n", " 'rheumatoid': 1,\n", " 'romagna': 1,\n", " 'phong': 1,\n", " 'abbas': 2,\n", " 'season': 61,\n", " 'dictionary': 4,\n", " 'peparethos': 1,\n", " 'brugnoli': 1,\n", " 'peptides': 1,\n", " 'passive': 1,\n", " 'spectra': 2,\n", " 'chandan': 1,\n", " 'circuit': 5,\n", " 'alvord': 1,\n", " 'keisling': 1,\n", " 'canberra': 1,\n", " 'adviser': 1,\n", " 'watford': 2,\n", " 'lubbock': 1,\n", " 'constant': 4,\n", " 'lantern': 1,\n", " 'pampanga': 1,\n", " 'membranacea': 1,\n", " 'partial': 1,\n", " 
'vessels': 2,\n", " 'residenztheater': 1,\n", " 'inducing': 1,\n", " 'chamber': 2,\n", " 'sales': 3,\n", " 'geforce': 1,\n", " 'suburban': 2,\n", " 'kissing': 1,\n", " 'video': 24,\n", " 'center': 27,\n", " '125th': 1,\n", " 'leyte': 1,\n", " 'christopher': 1,\n", " 'bradford': 1,\n", " 'stealing': 1,\n", " 'carracedo': 1,\n", " 'dołęgi': 1,\n", " 'commission': 8,\n", " 'homebuilding': 1,\n", " 'catholic': 21,\n", " 'whiting': 1,\n", " 'chacon': 1,\n", " 'healing': 1,\n", " 'aereon': 1,\n", " 'dasyure': 1,\n", " 'busan': 2,\n", " 'career': 82,\n", " 'bratz': 1,\n", " 'rolli': 1,\n", " 'holstein': 1,\n", " 'dewey': 1,\n", " 'utiel': 1,\n", " 'creek': 15,\n", " 'edward': 5,\n", " 'destroyer': 2,\n", " 'coalfield': 1,\n", " '354927000': 1,\n", " 'aspietes': 1,\n", " 'clothes': 1,\n", " 'relegated': 1,\n", " 'regiment': 7,\n", " 'annunciation': 1,\n", " 'corps': 6,\n", " 'monkey': 2,\n", " 'thompson': 2,\n", " 'saudi': 3,\n", " 'bicycle': 1,\n", " 'angèle': 1,\n", " 'sentence': 1,\n", " 'brzózka': 1,\n", " '85111': 1,\n", " 'mechanisms': 1,\n", " 'kappa': 1,\n", " 'syntomis': 1,\n", " 'larvicide': 1,\n", " 'bangkok': 3,\n", " 'observer': 1,\n", " 'uttarakhand': 1,\n", " 'rally': 3,\n", " 'heuch': 1,\n", " 'bayadh': 1,\n", " 'animals': 1,\n", " 'nicola': 1,\n", " 'feedjit': 1,\n", " 'koishiteru': 1,\n", " 'bloomfield': 1,\n", " 'thornaby': 1,\n", " 'human': 12,\n", " 'internal': 1,\n", " 'wilkes': 1,\n", " 'filmsturkish': 1,\n", " 'watch': 5,\n", " 'okoye': 1,\n", " 'fuchs': 1,\n", " 'metallurg': 1,\n", " 'sulukh': 1,\n", " 'khedoori': 1,\n", " 'coronium': 1,\n", " 'benítez': 1,\n", " 'fusiliers': 1,\n", " 'warrant': 1,\n", " 'precursor': 1,\n", " 'family': 73,\n", " 'theorem': 2,\n", " 'defence': 3,\n", " 'immunity': 2,\n", " 'kramer': 1,\n", " 'lucky': 1,\n", " 'livonia': 1,\n", " 'ereader': 1,\n", " 'israelites': 1,\n", " 'ramaranjan': 1,\n", " 'maria': 6,\n", " 'isaac': 1,\n", " 'force': 8,\n", " 'paralympic': 2,\n", " 'maribor': 1,\n", " 'tatars': 1,\n", " 
'baddie': 1,\n", " 'ruins': 1,\n", " 'stade': 1,\n", " 'ryots': 1,\n", " 'worldcat': 4,\n", " 'coccidioidomycosis': 1,\n", " 'reunion': 1,\n", " 'boland': 1,\n", " 'nellie': 2,\n", " 'speed': 10,\n", " 'redwater': 1,\n", " 'bowles': 1,\n", " 'katie': 1,\n", " 'merelim': 1,\n", " 'solkan': 1,\n", " 'germar': 1,\n", " 'missing': 6,\n", " 'greek': 25,\n", " 'fulcher': 1,\n", " 'monument': 3,\n", " 'harvard': 4,\n", " 'sustainability': 1,\n", " 'summer': 30,\n", " 'bromo': 1,\n", " 'chemokine': 1,\n", " 'rainstorm': 1,\n", " 'kenneth': 2,\n", " 'straloch': 1,\n", " 'stone': 8,\n", " 'hirst': 1,\n", " 'horsey': 1,\n", " 'šumadija': 1,\n", " 'parfitt': 1,\n", " 'galaxies': 2,\n", " 'climate': 9,\n", " 'westminster': 1,\n", " 'herbert': 3,\n", " 'fungal': 1,\n", " 'giovanni': 3,\n", " 'believer': 1,\n", " 'conveyor': 1,\n", " 'bechdel': 1,\n", " 'søren': 1,\n", " 'eugamandus': 1,\n", " 'medal': 21,\n", " 'ömnögovi': 1,\n", " 'begin': 1,\n", " '01333': 1,\n", " 'valle': 1,\n", " 'meaning': 1,\n", " 'baltimore': 4,\n", " 'gosei': 1,\n", " 'krispies': 1,\n", " 'eastern': 13,\n", " 'oplast': 1,\n", " 'serpentine': 1,\n", " 'january': 96,\n", " 'macroagelaius': 1,\n", " 'posters': 1,\n", " 'stuart': 3,\n", " 'elizabeth': 6,\n", " 'apocephalus': 1,\n", " 'philip': 1,\n", " 'sherry': 1,\n", " 'mussels': 1,\n", " 'raggett': 1,\n", " 'dream': 4,\n", " 'zhejiang': 1,\n", " 'forewings': 2,\n", " 'saves': 1,\n", " 'makkasan': 1,\n", " 'mansion': 1,\n", " 'trucks': 1,\n", " 'animated': 1,\n", " 'azerbaijani': 2,\n", " 'purged': 1,\n", " 'recke': 1,\n", " 'infections': 1,\n", " 'batted': 2,\n", " 'kosaka': 1,\n", " 'categories': 217,\n", " 'shanghai': 2,\n", " 'heroines': 1,\n", " 'laser': 1,\n", " 'always': 1,\n", " 'khikhani': 1,\n", " 'rovers': 7,\n", " 'verpelét': 1,\n", " 'germán': 1,\n", " 'evermore': 1,\n", " 'stefano': 1,\n", " 'asiab': 1,\n", " 'viola': 1,\n", " 'flour': 1,\n", " 'aviva': 1,\n", " 'volumes': 1,\n", " 'figaro': 1,\n", " 'mclaren': 1,\n", " 'gnome': 1,\n", " 
'junction': 6,\n", " 'against': 11,\n", " 'lorne': 1,\n", " 'slavery': 1,\n", " 'manga': 3,\n", " 'phorids': 1,\n", " 'gospel': 1,\n", " 'wiltshire': 1,\n", " 'discretization': 1,\n", " 'headache': 1,\n", " 'transilien': 1,\n", " 'volunteer': 1,\n", " 'авени': 1,\n", " 'hyderabad': 1,\n", " 'volleyball': 5,\n", " 'beverley': 1,\n", " 'induced': 1,\n", " 'yarumal': 1,\n", " 'cells': 1,\n", " 'romance': 3,\n", " 'evraki': 1,\n", " 'tacloban': 1,\n", " 'grows': 2,\n", " 'lagunitas': 1,\n", " 'elevation': 2,\n", " 'territory': 2,\n", " 'qualifying': 2,\n", " 'contains': 3,\n", " 'economics': 4,\n", " 'deseret': 1,\n", " 'noise': 2,\n", " 'kirby': 1,\n", " 'transportation': 5,\n", " 'agripina': 1,\n", " 'smoszewo': 1,\n", " 'stiles': 1,\n", " 'scott': 6,\n", " 'andorra': 2,\n", " 'italien': 1,\n", " 'göttingen': 1,\n", " 'presenters': 1,\n", " 'luigné': 1,\n", " 'vatican': 1,\n", " 'seyyed': 1,\n", " 'midnight': 3,\n", " 'vaibbhav': 1,\n", " 'fusidic': 1,\n", " 'newport': 1,\n", " 'unusual': 1,\n", " 'casey': 1,\n", " 'payments': 1,\n", " 'pages': 2,\n", " 'mullins': 1,\n", " 'gubernatorial': 1,\n", " 'density': 1,\n", " 'tanenbaum': 1,\n", " 'exchange': 1,\n", " 'hasan': 2,\n", " 'richt': 1,\n", " 'genealogical': 2,\n", " '000000001989': 1,\n", " 'leuty': 1,\n", " 'natural': 9,\n", " 'guineas': 1,\n", " 'carol': 1,\n", " 'policy': 2,\n", " 'woolley': 1,\n", " 'rolin': 1,\n", " 'vista': 1,\n", " 'murinus': 1,\n", " 'lawngtlai': 1,\n", " 'logan': 2,\n", " 'chapel': 3,\n", " 'banovinas': 1,\n", " 'general': 42,\n", " 'enhanced': 1,\n", " 'bresse': 1,\n", " 'fresno': 3,\n", " 'porch': 1,\n", " 'fisher': 2,\n", " 'anish': 1,\n", " 'equestrian': 1,\n", " 'smilax': 1,\n", " 'drain': 1,\n", " 'municipality': 32,\n", " 'chapter': 2,\n", " 'surnames': 1,\n", " 'steinbauer': 1,\n", " 'faber': 3,\n", " 'czech': 8,\n", " 'international': 45,\n", " 'choral': 2,\n", " 'class': 30,\n", " '94750': 1,\n", " 'moginie': 1,\n", " 'houghton': 1,\n", " 'cymru': 1,\n", " 'astoria': 2,\n", " 
'36528': 1,\n", " 'oznoz': 1,\n", " 'italy': 12,\n", " 'villers': 6,\n", " 'autism': 1,\n", " 'touch': 2,\n", " 'svalbard': 1,\n", " 'rapper': 1,\n", " 'spanish': 34,\n", " 'breads': 1,\n", " 'taproot': 1,\n", " 'founder': 1,\n", " 'braunschweig': 1,\n", " 'neolepetopsidae': 1,\n", " 'including': 1,\n", " 'firefox': 1,\n", " 'lincoln': 7,\n", " 'garuh': 1,\n", " 'giang': 2,\n", " 'eleuteri': 1,\n", " 'match': 12,\n", " 'iwaki': 1,\n", " 'osteoporosis': 1,\n", " 'préliminaires': 1,\n", " 'couple': 1,\n", " 'armstrong': 3,\n", " 'musica': 1,\n", " 'shackleton': 1,\n", " 'funeral': 1,\n", " 'ortiz': 1,\n", " 'kyril': 1,\n", " 'zvereva': 1,\n", " 'continuity': 1,\n", " 'brasil': 1,\n", " 'nepali': 1,\n", " 'winters': 1,\n", " 'runner': 1,\n", " 'hammerstein': 1,\n", " 'hanna': 1,\n", " 'lalden': 1,\n", " 'chemo': 1,\n", " 'granduciel': 1,\n", " 'anterior': 2,\n", " 'wolfpack': 1,\n", " 'lincolnshire': 1,\n", " 'kawabe': 1,\n", " 'nueva': 1,\n", " 'jagiellonia': 1,\n", " 'friendship': 4,\n", " 'stoke': 1,\n", " 'scenic': 1,\n", " 'ricardo': 2,\n", " 'stream': 1,\n", " 'sweden': 10,\n", " 'boost': 1,\n", " 'paranaense': 1,\n", " 'torkaman': 1,\n", " 'suvarnabhumi': 1,\n", " 'anglican': 1,\n", " 'frisian': 1,\n", " 'germany': 21,\n", " 'huesca': 1,\n", " 'shaun': 1,\n", " 'propantheline': 1,\n", " 'filled': 1,\n", " 'tomakomai': 1,\n", " 'gnophaela': 1,\n", " 'lecanoromycetes': 1,\n", " 'database': 6,\n", " 'johnstown': 1,\n", " 'celta': 2,\n", " 'vaughn': 1,\n", " 'gibraltar': 1,\n", " 'mitsuki': 1,\n", " 'hairy': 1,\n", " 'ayeyarwady': 1,\n", " 'ordinary': 2,\n", " 'bruderschaft': 1,\n", " 'operative': 1,\n", " 'cocktail': 1,\n", " 'tazeh': 2,\n", " 'hasanein': 1,\n", " 'shortest': 1,\n", " 'completed': 1,\n", " 'velike': 1,\n", " 'celtic': 3,\n", " 'darvazeh': 1,\n", " 'london': 40,\n", " 'myoictis': 1,\n", " 'component': 1,\n", " 'matilda': 1,\n", " 'arabic': 8,\n", " 'radomsko': 1,\n", " 'pirama': 1,\n", " 'baxter': 1,\n", " 'gliese': 2,\n", " 'cantonese': 1,\n", " 
'allegro': 1,\n", " 'brooks': 1,\n", " 'biggest': 1,\n", " 'fischli': 1,\n", " 'girolamo': 1,\n", " 'erksan': 1,\n", " 'affairs': 2,\n", " 'steamroller': 1,\n", " 'contest': 3,\n", " 'awarded': 2,\n", " 'selling': 1,\n", " 'msnbc': 1,\n", " 'pequeñas': 1,\n", " 'assault': 1,\n", " 'police': 9,\n", " 'censorship': 1,\n", " 'reloaded': 1,\n", " 'bušić': 1,\n", " 'encyclopædia': 1,\n", " 'barbus': 1,\n", " 'mobile': 6,\n", " 'finishes': 1,\n", " 'bills': 1,\n", " 'baronetcy': 1,\n", " 'kansas': 9,\n", " 'drooling': 1,\n", " 'philippines': 5,\n", " 'seine': 2,\n", " 'passing': 1,\n", " 'pregnant': 1,\n", " 'caspase': 1,\n", " 'liverpool': 1,\n", " 'green': 11,\n", " 'greenhouses': 1,\n", " 'princess': 1,\n", " 'uttar': 2,\n", " 'desmiphora': 1,\n", " 'coordinates': 56,\n", " ...}" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from bs4 import BeautifulSoup\n", "from collections import Counter\n", "import re\n", "\n", "def count_words(html):\n", " soup = BeautifulSoup(html, 'html.parser')\n", " words = {}\n", " text = soup.get_text()\n", " text = re.sub(\"\\W+\", \" \", text.lower())\n", " words = text.split(\" \")\n", " words = [w for w in words if len(w) >= 5]\n", " return Counter(words).most_common(10)\n", "\n", "start = time.time()\n", "pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)\n", "words = pool.map(count_words, parsed)\n", "words = list(words)\n", "\n", "word_counts = {}\n", "for wc in words:\n", " for word, count in wc:\n", " if word not in word_counts:\n", " word_counts[word] = 0\n", " word_counts[word] += 1\n", "end = time.time()\n", "\n", "print(end - start)\n", "word_counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Only selecting the top `10` words from each article speeds up performance quite a bit." 
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }