преди 5 години · 1f31d3d4ed
--- a/Mission433Solutions.ipynb
+++ b/Mission433Solutions.ipynb
@@ -472,18 +472,11 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "word_counts_per_sms = {unique_word:[] for unique_word in vocabulary}\n",
			
 
				+    "word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}\n",
			
 
				     "\n",
			
 
				-    "for unique_word in vocabulary:\n",
			
 
				-    "    \n",
			
 
				-    "    for sms in training_set['SMS']:        \n",
			
 
				-    "        counter = 0\n",
			
 
				-    "        \n",
			
 
				-    "        for word in sms:            \n",
			
 
				-    "            if word == unique_word:\n",
			
 
				-    "                counter += 1\n",
			
 
				-    "                \n",
			
 
				-    "        word_counts_per_sms[unique_word].append(counter)"
			
 
				+    "for index, sms in enumerate(training_set['SMS']):\n",
			
 
				+    "    for word in sms:\n",
			
 
				+    "        word_counts_per_sms[word][index] += 1"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -512,27 +505,27 @@
 
				        "  <thead>\n",
			
 
				        "    <tr style=\"text-align: right;\">\n",
			
 
				        "      <th></th>\n",
			
 
				-       "      <th>should</th>\n",
			
 
				-       "      <th>huiming</th>\n",
			
 
				-       "      <th>double</th>\n",
			
 
				-       "      <th>freshers</th>\n",
			
 
				-       "      <th>here</th>\n",
			
 
				-       "      <th>tea</th>\n",
			
 
				-       "      <th>running</th>\n",
			
 
				-       "      <th>seat</th>\n",
			
 
				-       "      <th>mb</th>\n",
			
 
				-       "      <th>safely</th>\n",
			
 
				+       "      <th>woke</th>\n",
			
 
				+       "      <th>gr8</th>\n",
			
 
				+       "      <th>forum</th>\n",
			
 
				+       "      <th>bettr</th>\n",
			
 
				+       "      <th>std</th>\n",
			
 
				+       "      <th>pobox334</th>\n",
			
 
				+       "      <th>wap</th>\n",
			
 
				+       "      <th>kalstiya</th>\n",
			
 
				+       "      <th>skillgame</th>\n",
			
 
				+       "      <th>slap</th>\n",
			
 
				        "      <th>...</th>\n",
			
 
				-       "      <th>nike</th>\n",
			
 
				-       "      <th>ache</th>\n",
			
 
				-       "      <th>08000930705</th>\n",
			
 
				-       "      <th>printing</th>\n",
			
 
				-       "      <th>450</th>\n",
			
 
				-       "      <th>shanghai</th>\n",
			
 
				-       "      <th>purple</th>\n",
			
 
				-       "      <th>bday</th>\n",
			
 
				-       "      <th>nd</th>\n",
			
 
				-       "      <th>ignorant</th>\n",
			
 
				+       "      <th>sterm</th>\n",
			
 
				+       "      <th>click</th>\n",
			
 
				+       "      <th>person2die</th>\n",
			
 
				+       "      <th>amused</th>\n",
			
 
				+       "      <th>box434sk38wp150ppm18</th>\n",
			
 
				+       "      <th>bcaz</th>\n",
			
 
				+       "      <th>lodging</th>\n",
			
 
				+       "      <th>lyf</th>\n",
			
 
				+       "      <th>officially</th>\n",
			
 
				+       "      <th>again</th>\n",
			
 
				        "    </tr>\n",
			
 
				        "  </thead>\n",
			
 
				        "  <tbody>\n",
			
@@ -662,26 +655,26 @@
 
				        "</div>"
			
 
				       ],
			
 
				       "text/plain": [
			
 
				-       "   should  huiming  double  freshers  here  tea  running  seat  mb  safely  \\\n",
			
 
				-       "0       0        0       0         0     0    0        0     0   0       0   \n",
			
 
				-       "1       0        0       0         0     0    0        0     0   0       0   \n",
			
 
				-       "2       0        0       0         0     0    0        0     0   0       0   \n",
			
 
				-       "3       0        0       0         0     0    0        0     0   0       0   \n",
			
 
				-       "4       0        0       0         0     0    0        0     0   0       0   \n",
			
 
				+       "   woke  gr8  forum  bettr  std  pobox334  wap  kalstiya  skillgame  slap  \\\n",
			
 
				+       "0     0    0      0      0    0         0    0         0          0     0   \n",
			
 
				+       "1     0    0      0      0    0         0    0         0          0     0   \n",
			
 
				+       "2     0    0      0      0    0         0    0         0          0     0   \n",
			
 
				+       "3     0    0      0      0    0         0    0         0          0     0   \n",
			
 
				+       "4     0    0      0      0    0         0    0         0          0     0   \n",
			
 
				        "\n",
			
 
				-       "   ...  nike  ache  08000930705  printing  450  shanghai  purple  bday  nd  \\\n",
			
 
				-       "0  ...     0     0            0         0    0         0       0     0   0   \n",
			
 
				-       "1  ...     0     0            0         0    0         0       0     0   0   \n",
			
 
				-       "2  ...     0     0            0         0    0         0       0     0   0   \n",
			
 
				-       "3  ...     0     0            0         0    0         0       0     0   0   \n",
			
 
				-       "4  ...     0     0            0         0    0         0       0     0   0   \n",
			
 
				+       "   ...  sterm  click  person2die  amused  box434sk38wp150ppm18  bcaz  lodging  \\\n",
			
 
				+       "0  ...      0      0           0       0                     0     0        0   \n",
			
 
				+       "1  ...      0      0           0       0                     0     0        0   \n",
			
 
				+       "2  ...      0      0           0       0                     0     0        0   \n",
			
 
				+       "3  ...      0      0           0       0                     0     0        0   \n",
			
 
				+       "4  ...      0      0           0       0                     0     0        0   \n",
			
 
				        "\n",
			
 
				-       "   ignorant  \n",
			
 
				-       "0         0  \n",
			
 
				-       "1         0  \n",
			
 
				-       "2         0  \n",
			
 
				-       "3         0  \n",
			
 
				-       "4         0  \n",
			
 
				+       "   lyf  officially  again  \n",
			
 
				+       "0    0           0      0  \n",
			
 
				+       "1    0           0      0  \n",
			
 
				+       "2    0           0      0  \n",
			
 
				+       "3    0           0      0  \n",
			
 
				+       "4    0           0      0  \n",
			
 
				        "\n",
			
 
				        "[5 rows x 7783 columns]"
			
 
				       ]
			
@@ -724,25 +717,25 @@
 
				        "      <th></th>\n",
			
 
				        "      <th>Label</th>\n",
			
 
				        "      <th>SMS</th>\n",
			
 
				-       "      <th>should</th>\n",
			
 
				-       "      <th>huiming</th>\n",
			
 
				-       "      <th>double</th>\n",
			
 
				-       "      <th>freshers</th>\n",
			
 
				-       "      <th>here</th>\n",
			
 
				-       "      <th>tea</th>\n",
			
 
				-       "      <th>running</th>\n",
			
 
				-       "      <th>seat</th>\n",
			
 
				+       "      <th>woke</th>\n",
			
 
				+       "      <th>gr8</th>\n",
			
 
				+       "      <th>forum</th>\n",
			
 
				+       "      <th>bettr</th>\n",
			
 
				+       "      <th>std</th>\n",
			
 
				+       "      <th>pobox334</th>\n",
			
 
				+       "      <th>wap</th>\n",
			
 
				+       "      <th>kalstiya</th>\n",
			
 
				        "      <th>...</th>\n",
			
 
				-       "      <th>nike</th>\n",
			
 
				-       "      <th>ache</th>\n",
			
 
				-       "      <th>08000930705</th>\n",
			
 
				-       "      <th>printing</th>\n",
			
 
				-       "      <th>450</th>\n",
			
 
				-       "      <th>shanghai</th>\n",
			
 
				-       "      <th>purple</th>\n",
			
 
				-       "      <th>bday</th>\n",
			
 
				-       "      <th>nd</th>\n",
			
 
				-       "      <th>ignorant</th>\n",
			
 
				+       "      <th>sterm</th>\n",
			
 
				+       "      <th>click</th>\n",
			
 
				+       "      <th>person2die</th>\n",
			
 
				+       "      <th>amused</th>\n",
			
 
				+       "      <th>box434sk38wp150ppm18</th>\n",
			
 
				+       "      <th>bcaz</th>\n",
			
 
				+       "      <th>lodging</th>\n",
			
 
				+       "      <th>lyf</th>\n",
			
 
				+       "      <th>officially</th>\n",
			
 
				+       "      <th>again</th>\n",
			
 
				        "    </tr>\n",
			
 
				        "  </thead>\n",
			
 
				        "  <tbody>\n",
			
@@ -872,26 +865,26 @@
 
				        "</div>"
			
 
				       ],
			
 
				       "text/plain": [
			
 
				-       "  Label                                                SMS  should  huiming  \\\n",
			
 
				-       "0   ham                  [yep, by, the, pretty, sculpture]       0        0   \n",
			
 
				-       "1   ham  [yes, princess, are, you, going, to, make, me,...       0        0   \n",
			
 
				-       "2   ham                    [welp, apparently, he, retired]       0        0   \n",
			
 
				-       "3   ham                                           [havent]       0        0   \n",
			
 
				-       "4   ham  [i, forgot, 2, ask, ü, all, smth, there, s, a,...       0        0   \n",
			
 
				+       "  Label                                                SMS  woke  gr8  forum  \\\n",
			
 
				+       "0   ham                  [yep, by, the, pretty, sculpture]     0    0      0   \n",
			
 
				+       "1   ham  [yes, princess, are, you, going, to, make, me,...     0    0      0   \n",
			
 
				+       "2   ham                    [welp, apparently, he, retired]     0    0      0   \n",
			
 
				+       "3   ham                                           [havent]     0    0      0   \n",
			
 
				+       "4   ham  [i, forgot, 2, ask, ü, all, smth, there, s, a,...     0    0      0   \n",
			
 
				        "\n",
			
 
				-       "   double  freshers  here  tea  running  seat  ...  nike  ache  08000930705  \\\n",
			
 
				-       "0       0         0     0    0        0     0  ...     0     0            0   \n",
			
 
				-       "1       0         0     0    0        0     0  ...     0     0            0   \n",
			
 
				-       "2       0         0     0    0        0     0  ...     0     0            0   \n",
			
 
				-       "3       0         0     0    0        0     0  ...     0     0            0   \n",
			
 
				-       "4       0         0     0    0        0     0  ...     0     0            0   \n",
			
 
				+       "   bettr  std  pobox334  wap  kalstiya  ...  sterm  click  person2die  amused  \\\n",
			
 
				+       "0      0    0         0    0         0  ...      0      0           0       0   \n",
			
 
				+       "1      0    0         0    0         0  ...      0      0           0       0   \n",
			
 
				+       "2      0    0         0    0         0  ...      0      0           0       0   \n",
			
 
				+       "3      0    0         0    0         0  ...      0      0           0       0   \n",
			
 
				+       "4      0    0         0    0         0  ...      0      0           0       0   \n",
			
 
				        "\n",
			
 
				-       "   printing  450  shanghai  purple  bday  nd  ignorant  \n",
			
 
				-       "0         0    0         0       0     0   0         0  \n",
			
 
				-       "1         0    0         0       0     0   0         0  \n",
			
 
				-       "2         0    0         0       0     0   0         0  \n",
			
 
				-       "3         0    0         0       0     0   0         0  \n",
			
 
				-       "4         0    0         0       0     0   0         0  \n",
			
 
				+       "   box434sk38wp150ppm18  bcaz  lodging  lyf  officially  again  \n",
			
 
				+       "0                     0     0        0    0           0      0  \n",
			
 
				+       "1                     0     0        0    0           0      0  \n",
			
 
				+       "2                     0     0        0    0           0      0  \n",
			
 
				+       "3                     0     0        0    0           0      0  \n",
			
 
				+       "4                     0     0        0    0           0      0  \n",
			
 
				        "\n",
			
 
				        "[5 rows x 7785 columns]"
			
 
				       ]
			
@@ -915,27 +908,21 @@
 
				     "We're now done with cleaning the training set, and we can begin creating the spam filter. The Naive Bayes algorithm will need to answer these two probability questions to be able to classify new messages:\n",
			
 
				     "\n",
			
 
				     "\\begin{equation}\n",
			
 
				-    "P(Spam | w_1,w_2, ..., w_n) = P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam)\n",
			
 
				-    "\\end{equation}\n",
			
 
				-    "\n",
			
 
				-    "\\begin{equation}\n",
			
 
				-    "P(Ham | w_1,w_2, ..., w_n) = P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n",
			
 
				+    "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam) \\\\\\\n",
			
 
				+    "P(Ham | w_1,w_2, ..., w_n) \\propto P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n",
			
 
				     "\\end{equation}\n",
			
 
				     "\n",
			
 
				-    "Also, to calculate $P(w_i|Spam)$ and $P(w_i|Ham)$ inside the formulas above (where $w_i$ can be any word), we'll need to use:\n",
			
 
				+    "Also, to calculate P(w<sub>i</sub>|Spam) and P(w<sub>i</sub>|Ham) inside the formulas above, we'll need to use these equations:\n",
			
 
				     "\n",
			
 
				     "\\begin{equation}\n",
			
 
				-    "P(w_i|Spam) = \\frac{card(w_i|Spam) + \\alpha}{card(Spam) + \\alpha \\cdot card(Vocabulary)}\n",
			
 
				-    "\\end{equation}\n",
			
 
				-    "\n",
			
 
				-    "\\begin{equation}\n",
			
 
				-    "P(w_i|Ham) = \\frac{card(w_i|Ham) + \\alpha}{card(Ham) + \\alpha \\cdot card(Vocabulary)}\n",
			
 
				+    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
			
 
				+    "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
			
 
				     "\\end{equation}\n",
			
 
				     "\n",
			
 
				     "Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new message comes in. Below, we'll use our training set to calculate:\n",
			
 
				     "\n",
			
 
				     "- P(Spam) and P(Ham)\n",
			
 
				-    "- card(Spam), card(Ham), and card(Vocabulary)\n",
			
 
				+    "- N<sub>Spam</sub>, N<sub>Ham</sub>, N<sub>Vocabulary</sub>\n",
			
 
				     "\n",
			
 
				     "We'll also use Laplace smoothing and set $\\alpha = 1$."
			
 
				    ]
			
@@ -950,12 +937,12 @@
 
				     "p_spam = training_set_clean['Label'].value_counts(normalize=True)['spam']\n",
			
 
				     "p_ham = training_set_clean['Label'].value_counts(normalize=True)['ham']\n",
			
 
				     "\n",
			
 
				-    "# card(Spam), card(Ham), and card(Vocabulary)\n",
			
 
				-    "card_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n",
			
 
				+    "# N_Spam, N_Ham, and N_Vocabulary\n",
			
 
				+    "n_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n",
			
 
				     "                                                    axis=1).sum()\n",
			
 
				-    "card_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n",
			
 
				+    "n_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n",
			
 
				     "                                                    axis=1).sum()\n",
			
 
				-    "card_vocabulary = len(vocabulary)\n",
			
 
				+    "n_vocabulary = len(vocabulary)\n",
			
 
				     "\n",
			
 
				     "# Laplace smoothing\n",
			
 
				     "alpha = 1"
			
@@ -972,11 +959,8 @@
 
				     "The parameters are calculated using the formulas:\n",
			
 
				     "\n",
			
 
				     "\\begin{equation}\n",
			
 
				-    "P(w_i|Spam) = \\frac{card(w_i|Spam) + \\alpha}{card(Spam) + \\alpha \\cdot card(Vocabulary)}\n",
			
 
				-    "\\end{equation}\n",
			
 
				-    "\n",
			
 
				-    "\\begin{equation}\n",
			
 
				-    "P(w_i|Ham) = \\frac{card(w_i|Ham) + \\alpha}{card(Ham) + \\alpha \\cdot card(Vocabulary)}\n",
			
 
				+    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
			
 
				+    "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
			
 
				     "\\end{equation}"
			
 
				    ]
			
 
				   },
			
@@ -997,12 +981,12 @@
 
				     "\n",
			
 
				     "# Calculate parameters\n",
			
 
				     "for word in vocabulary:\n",
			
 
				-    "    card_word_given_spam = spam_messages[word].sum()\n",
			
 
				-    "    p_word_given_spam = (card_word_given_spam + alpha) / (card_spam + alpha*card_vocabulary)\n",
			
 
				+    "    n_word_given_spam = spam_messages[word].sum()\n",
			
 
				+    "    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)\n",
			
 
				     "    parameters_spam[word] = p_word_given_spam\n",
			
 
				     "    \n",
			
 
				-    "    card_word_given_ham = ham_messages[word].sum()\n",
			
 
				-    "    p_word_given_ham = (card_word_given_ham + alpha) / (card_ham + alpha*card_vocabulary)\n",
			
 
				+    "    n_word_given_ham = ham_messages[word].sum()\n",
			
 
				+    "    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)\n",
			
 
				     "    parameters_ham[word] = p_word_given_ham"
			
 
				    ]
			
 
				   },
			
@@ -1105,7 +1089,7 @@
 
				     "\n",
			
 
				     "The two results above look promising, but let's see how well the filter does on our test set, which has 1,114 messages.\n",
			
 
				     "\n",
			
 
				-    "We'll start by writing a function that returns classification labels instead of print them."
			
 
				+    "We'll start by writing a function that returns classification labels instead of printing them."
			
 
				    ]
			
 
				   },
			
 
				   {