vor 8 Jahren · 4fcdc73d63
--- a/Mission267Solutions.ipynb
+++ b/Mission267Solutions.ipynb
@@ -0,0 +1,123 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 17,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "[('new', 186), ('google', 168), ('bitcoin', 102), ('open', 93), ('programming', 91), ('web', 89), ('data', 86), ('video', 80), ('python', 76), ('code', 73), ('facebook', 72), ('released', 72), ('using', 71), ('javascript', 66), ('2013', 66), ('free', 65), ('source', 65), ('game', 64), ('internet', 63), ('microsoft', 60), ('c', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('language', 55), ('work', 55), ('software', 53), ('2014', 53), ('startup', 52), ('apple', 51), ('make', 51), ('use', 51), ('time', 49), ('yc', 49), ('security', 49), ('github', 46), ('nsa', 46), ('windows', 45), ('like', 42), ('world', 42), ('way', 42), ('computer', 41), ('heartbleed', 41), ('1', 41), ('project', 41), ('design', 38), ('users', 38), ('dont', 38), ('ios', 38), ('git', 38), ('vs', 37), ('developer', 37), ('os', 37), ('life', 37), ('twitter', 37), ('ceo', 37), ('day', 36), ('big', 36), ('online', 35), ('android', 35), ('years', 34), ('simple', 34), ('court', 34), ('mt', 33), ('apps', 33), ('says', 33), ('api', 33), ('browser', 33), ('guide', 33), ('learning', 33), ('mozilla', 32), ('site', 32), ('gox', 32), ('firefox', 32), ('engine', 32), ('problem', 32), ('server', 32), ('fast', 32), ('amazon', 31), ('year', 31), ('introducing', 31), ('support', 30), ('better', 30), ('stop', 30), ('million', 30), ('text', 30), ('people', 30), ('built', 30), ('does', 29), ('development', 29), ('tech', 29), ('3', 29), ('just', 28), ('inside', 28), ('did', 28), ('library', 28), ('money', 28), ('website', 28), ('chrome', 28), ('2048', 28)]\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "from datetime import datetime\n",
			
 
				+    "import json\n",
			
 
				+    "import io\n",
			
 
				+    "import string\n",
			
 
				+    "\n",
			
 
				+    "from pipeline import build_csv, Pipeline\n",
			
 
				+    "from stop_words import stop_words\n",
			
 
				+    "\n",
			
 
				+    "pipeline = Pipeline()\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task()\n",
			
 
				+    "def file_to_json():\n",
			
 
				+    "    with open('hn_stories_2014.json', 'r') as f:\n",
			
 
				+    "        data = json.load(f)\n",
			
 
				+    "        stories = data['stories']\n",
			
 
				+    "    return stories\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task(depends_on=file_to_json)\n",
			
 
				+    "def filter_stories(stories):\n",
			
 
				+    "    def is_popular(story):\n",
			
 
				+    "        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')\n",
			
 
				+    "    \n",
			
 
				+    "    return (\n",
			
 
				+    "        story for story in stories\n",
			
 
				+    "        if is_popular(story)\n",
			
 
				+    "    )\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task(depends_on=filter_stories)\n",
			
 
				+    "def json_to_csv(stories):\n",
			
 
				+    "    lines = []\n",
			
 
				+    "    for story in stories:\n",
			
 
				+    "        lines.append(\n",
			
 
				+    "            (story['objectID'], datetime.strptime(story['created_at'], \"%Y-%m-%dT%H:%M:%SZ\"), story['url'], story['points'], story['title'])\n",
			
 
				+    "        )\n",
			
 
				+    "    return build_csv(lines, header=['objectID', 'created_at', 'url', 'points', 'title'], file=io.StringIO())\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task(depends_on=json_to_csv)\n",
			
 
				+    "def extract_titles(csv_file):\n",
			
 
				+    "    reader = csv.reader(csv_file)\n",
			
 
				+    "    header = next(reader)\n",
			
 
				+    "    idx = header.index('title')\n",
			
 
				+    "    \n",
			
 
				+    "    return (line[idx] for line in reader)\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task(depends_on=extract_titles)\n",
			
 
				+    "def clean_title(titles):\n",
			
 
				+    "    for title in titles:\n",
			
 
				+    "        title = title.lower()\n",
			
 
				+    "        title = ''.join(c for c in title if c not in string.punctuation)\n",
			
 
				+    "        yield title\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task(depends_on=clean_title)\n",
			
 
				+    "def build_keyword_dictionary(titles):\n",
			
 
				+    "    word_freq = {}\n",
			
 
				+    "    for title in titles:\n",
			
 
				+    "        for word in title.split(' '):\n",
			
 
				+    "            if word and word not in stop_words:\n",
			
 
				+    "                if word not in word_freq:\n",
			
 
				+    "                    word_freq[word] = 1\n",
			
 
				+    "                word_freq[word] += 1\n",
			
 
				+    "    return word_freq\n",
			
 
				+    "\n",
			
 
				+    "@pipeline.task(depends_on=build_keyword_dictionary)\n",
			
 
				+    "def top_keywords(word_freq):\n",
			
 
				+    "    freq_tuple = [\n",
			
 
				+    "        (word, word_freq[word])\n",
			
 
				+    "        for word in sorted(word_freq, key=word_freq.get, reverse=True)\n",
			
 
				+    "    ]\n",
			
 
				+    "    return freq_tuple[:100]\n",
			
 
				+    "\n",
			
 
				+    "ran = pipeline.run()\n",
			
 
				+    "print(ran[top_keywords])"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "collapsed": true
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": []
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "anaconda-cloud": {},
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.5.2"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 1
			
 
				+}