{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Top keywords in popular story titles\n",
    "\n",
    "Loads the stories in `hn_stories_2014.json`, keeps the popular ones, and counts how often each non-stop-word appears in their titles. The stages are chained with the project's `Pipeline` helper; the final cell prints the most frequent keywords."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import io\n",
    "import json\n",
    "import string\n",
    "from datetime import datetime\n",
    "\n",
    "from pipeline import build_csv, Pipeline\n",
    "from stop_words import stop_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "STORIES_PATH = 'hn_stories_2014.json'\n",
    "MIN_POINTS = 50    # a story needs strictly more points than this\n",
    "MIN_COMMENTS = 1   # ...and strictly more comments than this\n",
    "TOP_N = 100        # how many keywords to report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pipeline definition\n",
    "\n",
    "The `Pipeline` instance and all of its tasks are defined in a single cell, so re-running that cell always starts from a fresh `Pipeline` instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline()\n",
    "\n",
    "\n",
    "@pipeline.task()\n",
    "def file_to_json():\n",
    "    \"\"\"Read STORIES_PATH and return the value stored under its 'stories' key.\"\"\"\n",
    "    with open(STORIES_PATH, 'r', encoding='utf-8') as f:\n",
    "        data = json.load(f)\n",
    "    return data['stories']\n",
    "\n",
    "\n",
    "@pipeline.task(depends_on=file_to_json)\n",
    "def filter_stories(stories):\n",
    "    \"\"\"Lazily select the popular stories.\n",
    "\n",
    "    A story is kept when it has more than MIN_POINTS points, more than\n",
    "    MIN_COMMENTS comments, and a title that does not start with 'Ask HN'.\n",
    "    \"\"\"\n",
    "    def is_popular(story):\n",
    "        return (\n",
    "            story['points'] > MIN_POINTS\n",
    "            and story['num_comments'] > MIN_COMMENTS\n",
    "            and not story['title'].startswith('Ask HN')\n",
    "        )\n",
    "\n",
    "    return (story for story in stories if is_popular(story))\n",
    "\n",
    "\n",
    "@pipeline.task(depends_on=filter_stories)\n",
    "def json_to_csv(stories):\n",
    "    \"\"\"Turn each story into a row and pass the rows to build_csv.\n",
    "\n",
    "    Each row is (objectID, created_at parsed into a datetime, url, points,\n",
    "    title). build_csv receives an in-memory StringIO as its target file and\n",
    "    its return value is passed on unchanged.\n",
    "    \"\"\"\n",
    "    header = ['objectID', 'created_at', 'url', 'points', 'title']\n",
    "    lines = [\n",
    "        (\n",
    "            story['objectID'],\n",
    "            datetime.strptime(story['created_at'], '%Y-%m-%dT%H:%M:%SZ'),\n",
    "            story['url'],\n",
    "            story['points'],\n",
    "            story['title'],\n",
    "        )\n",
    "        for story in stories\n",
    "    ]\n",
    "    return build_csv(lines, header=header, file=io.StringIO())\n",
    "\n",
    "\n",
    "@pipeline.task(depends_on=json_to_csv)\n",
    "def extract_titles(csv_file):\n",
    "    \"\"\"Lazily yield the 'title' column of every row after the header.\"\"\"\n",
    "    reader = csv.reader(csv_file)\n",
    "    header = next(reader)\n",
    "    idx = header.index('title')\n",
    "    return (line[idx] for line in reader)\n",
    "\n",
    "\n",
    "@pipeline.task(depends_on=extract_titles)\n",
    "def clean_title(titles):\n",
    "    \"\"\"Yield each title lower-cased, with string.punctuation characters removed.\"\"\"\n",
    "    strip_punctuation = str.maketrans('', '', string.punctuation)\n",
    "    for title in titles:\n",
    "        yield title.lower().translate(strip_punctuation)\n",
    "\n",
    "\n",
    "@pipeline.task(depends_on=clean_title)\n",
    "def build_keyword_dictionary(titles):\n",
    "    \"\"\"Return a dict mapping each non-stop-word to its number of occurrences.\"\"\"\n",
    "    word_freq = {}\n",
    "    for title in titles:\n",
    "        for word in title.split(' '):\n",
    "            # split(' ') produces '' between consecutive spaces; skip those\n",
    "            # along with the stop words.\n",
    "            if word and word not in stop_words:\n",
    "                # Unseen words start at 0, so the first occurrence counts as 1.\n",
    "                word_freq[word] = word_freq.get(word, 0) + 1\n",
    "    return word_freq\n",
    "\n",
    "\n",
    "@pipeline.task(depends_on=build_keyword_dictionary)\n",
    "def top_keywords(word_freq):\n",
    "    \"\"\"Return the TOP_N (word, count) pairs, most frequent first.\n",
    "\n",
    "    The sort is stable, so words with equal counts keep the order in which\n",
    "    they were first inserted into word_freq.\n",
    "    \"\"\"\n",
    "    ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)\n",
    "    return ranked[:TOP_N]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results\n",
    "\n",
    "Run every task in dependency order and show the output of the last one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ran = pipeline.run()\n",
    "print(ran[top_keywords])"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}