Queer European MD passionate about IT
Browse Source

less intensive code and latex fixes

Alex 5 years ago
parent
commit
890a0fa54e
1 changed files with 106 additions and 93 deletions
  1. 106 93
      Mission433Solutions.ipynb

+ 106 - 93
Mission433Solutions.ipynb

@@ -236,9 +236,8 @@
     "\n",
     "Essentially, we want to bring data to this format:\n",
     "\n",
-    "<center>\n",
     "![img](https://dq-content.s3.amazonaws.com/433/cpgp_dataset_3.png)\n",
-    "</center>\n",
+    "\n",
     "\n",
     "### Letter Case and Punctuation\n",
     "\n",
@@ -505,27 +504,27 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>woke</th>\n",
-       "      <th>gr8</th>\n",
-       "      <th>forum</th>\n",
-       "      <th>bettr</th>\n",
-       "      <th>std</th>\n",
-       "      <th>pobox334</th>\n",
-       "      <th>wap</th>\n",
-       "      <th>kalstiya</th>\n",
-       "      <th>skillgame</th>\n",
-       "      <th>slap</th>\n",
+       "      <th>ticket</th>\n",
+       "      <th>kappa</th>\n",
+       "      <th>too</th>\n",
+       "      <th>abdomen</th>\n",
+       "      <th>unhappy</th>\n",
+       "      <th>hoody</th>\n",
+       "      <th>start</th>\n",
+       "      <th>die</th>\n",
+       "      <th>wild</th>\n",
+       "      <th>195</th>\n",
        "      <th>...</th>\n",
-       "      <th>sterm</th>\n",
-       "      <th>click</th>\n",
-       "      <th>person2die</th>\n",
-       "      <th>amused</th>\n",
-       "      <th>box434sk38wp150ppm18</th>\n",
-       "      <th>bcaz</th>\n",
-       "      <th>lodging</th>\n",
-       "      <th>lyf</th>\n",
-       "      <th>officially</th>\n",
-       "      <th>again</th>\n",
+       "      <th>09058095201</th>\n",
+       "      <th>chase</th>\n",
+       "      <th>thru</th>\n",
+       "      <th>ru</th>\n",
+       "      <th>xclusive</th>\n",
+       "      <th>fellow</th>\n",
+       "      <th>red</th>\n",
+       "      <th>entitled</th>\n",
+       "      <th>auto</th>\n",
+       "      <th>bothering</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -655,26 +654,26 @@
        "</div>"
       ],
       "text/plain": [
-       "   woke  gr8  forum  bettr  std  pobox334  wap  kalstiya  skillgame  slap  \\\n",
-       "0     0    0      0      0    0         0    0         0          0     0   \n",
-       "1     0    0      0      0    0         0    0         0          0     0   \n",
-       "2     0    0      0      0    0         0    0         0          0     0   \n",
-       "3     0    0      0      0    0         0    0         0          0     0   \n",
-       "4     0    0      0      0    0         0    0         0          0     0   \n",
+       "   ticket  kappa  too  abdomen  unhappy  hoody  start  die  wild  195  ...  \\\n",
+       "0       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "1       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "2       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "3       0      0    0        0        0      0      0    0     0    0  ...   \n",
+       "4       0      0    0        0        0      0      0    0     0    0  ...   \n",
        "\n",
-       "   ...  sterm  click  person2die  amused  box434sk38wp150ppm18  bcaz  lodging  \\\n",
-       "0  ...      0      0           0       0                     0     0        0   \n",
-       "1  ...      0      0           0       0                     0     0        0   \n",
-       "2  ...      0      0           0       0                     0     0        0   \n",
-       "3  ...      0      0           0       0                     0     0        0   \n",
-       "4  ...      0      0           0       0                     0     0        0   \n",
+       "   09058095201  chase  thru  ru  xclusive  fellow  red  entitled  auto  \\\n",
+       "0            0      0     0   0         0       0    0         0     0   \n",
+       "1            0      0     0   0         0       0    0         0     0   \n",
+       "2            0      0     0   0         0       0    0         0     0   \n",
+       "3            0      0     0   0         0       0    0         0     0   \n",
+       "4            0      0     0   0         0       0    0         0     0   \n",
        "\n",
-       "   lyf  officially  again  \n",
-       "0    0           0      0  \n",
-       "1    0           0      0  \n",
-       "2    0           0      0  \n",
-       "3    0           0      0  \n",
-       "4    0           0      0  \n",
+       "   bothering  \n",
+       "0          0  \n",
+       "1          0  \n",
+       "2          0  \n",
+       "3          0  \n",
+       "4          0  \n",
        "\n",
        "[5 rows x 7783 columns]"
       ]
@@ -717,25 +716,25 @@
        "      <th></th>\n",
        "      <th>Label</th>\n",
        "      <th>SMS</th>\n",
-       "      <th>woke</th>\n",
-       "      <th>gr8</th>\n",
-       "      <th>forum</th>\n",
-       "      <th>bettr</th>\n",
-       "      <th>std</th>\n",
-       "      <th>pobox334</th>\n",
-       "      <th>wap</th>\n",
-       "      <th>kalstiya</th>\n",
+       "      <th>ticket</th>\n",
+       "      <th>kappa</th>\n",
+       "      <th>too</th>\n",
+       "      <th>abdomen</th>\n",
+       "      <th>unhappy</th>\n",
+       "      <th>hoody</th>\n",
+       "      <th>start</th>\n",
+       "      <th>die</th>\n",
        "      <th>...</th>\n",
-       "      <th>sterm</th>\n",
-       "      <th>click</th>\n",
-       "      <th>person2die</th>\n",
-       "      <th>amused</th>\n",
-       "      <th>box434sk38wp150ppm18</th>\n",
-       "      <th>bcaz</th>\n",
-       "      <th>lodging</th>\n",
-       "      <th>lyf</th>\n",
-       "      <th>officially</th>\n",
-       "      <th>again</th>\n",
+       "      <th>09058095201</th>\n",
+       "      <th>chase</th>\n",
+       "      <th>thru</th>\n",
+       "      <th>ru</th>\n",
+       "      <th>xclusive</th>\n",
+       "      <th>fellow</th>\n",
+       "      <th>red</th>\n",
+       "      <th>entitled</th>\n",
+       "      <th>auto</th>\n",
+       "      <th>bothering</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -865,26 +864,26 @@
        "</div>"
       ],
       "text/plain": [
-       "  Label                                                SMS  woke  gr8  forum  \\\n",
-       "0   ham                  [yep, by, the, pretty, sculpture]     0    0      0   \n",
-       "1   ham  [yes, princess, are, you, going, to, make, me,...     0    0      0   \n",
-       "2   ham                    [welp, apparently, he, retired]     0    0      0   \n",
-       "3   ham                                           [havent]     0    0      0   \n",
-       "4   ham  [i, forgot, 2, ask, ü, all, smth, there, s, a,...     0    0      0   \n",
+       "  Label                                                SMS  ticket  kappa  \\\n",
+       "0   ham                  [yep, by, the, pretty, sculpture]       0      0   \n",
+       "1   ham  [yes, princess, are, you, going, to, make, me,...       0      0   \n",
+       "2   ham                    [welp, apparently, he, retired]       0      0   \n",
+       "3   ham                                           [havent]       0      0   \n",
+       "4   ham  [i, forgot, 2, ask, ü, all, smth, there, s, a,...       0      0   \n",
        "\n",
-       "   bettr  std  pobox334  wap  kalstiya  ...  sterm  click  person2die  amused  \\\n",
-       "0      0    0         0    0         0  ...      0      0           0       0   \n",
-       "1      0    0         0    0         0  ...      0      0           0       0   \n",
-       "2      0    0         0    0         0  ...      0      0           0       0   \n",
-       "3      0    0         0    0         0  ...      0      0           0       0   \n",
-       "4      0    0         0    0         0  ...      0      0           0       0   \n",
+       "   too  abdomen  unhappy  hoody  start  die  ...  09058095201  chase  thru  \\\n",
+       "0    0        0        0      0      0    0  ...            0      0     0   \n",
+       "1    0        0        0      0      0    0  ...            0      0     0   \n",
+       "2    0        0        0      0      0    0  ...            0      0     0   \n",
+       "3    0        0        0      0      0    0  ...            0      0     0   \n",
+       "4    0        0        0      0      0    0  ...            0      0     0   \n",
        "\n",
-       "   box434sk38wp150ppm18  bcaz  lodging  lyf  officially  again  \n",
-       "0                     0     0        0    0           0      0  \n",
-       "1                     0     0        0    0           0      0  \n",
-       "2                     0     0        0    0           0      0  \n",
-       "3                     0     0        0    0           0      0  \n",
-       "4                     0     0        0    0           0      0  \n",
+       "   ru  xclusive  fellow  red  entitled  auto  bothering  \n",
+       "0   0         0       0    0         0     0          0  \n",
+       "1   0         0       0    0         0     0          0  \n",
+       "2   0         0       0    0         0     0          0  \n",
+       "3   0         0       0    0         0     0          0  \n",
+       "4   0         0       0    0         0     0          0  \n",
        "\n",
        "[5 rows x 7785 columns]"
       ]
@@ -908,17 +907,25 @@
     "We're now done with cleaning the training set, and we can begin creating the spam filter. The Naive Bayes algorithm will need to answer these two probability questions to be able to classify new messages:\n",
     "\n",
     "\\begin{equation}\n",
-    "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam) \\\\\\\n",
+    "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam)\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
     "P(Ham | w_1,w_2, ..., w_n) \\propto P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n",
     "\\end{equation}\n",
     "\n",
+    "\n",
     "Also, to calculate P(w<sub>i</sub>|Spam) and P(w<sub>i</sub>|Ham) inside the formulas above, we'll need to use these equations:\n",
     "\n",
     "\\begin{equation}\n",
-    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
+    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}}\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
     "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
     "\\end{equation}\n",
     "\n",
+    "\n",
     "Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new messages comes in. Below, we'll use our training set to calculate:\n",
     "\n",
     "- P(Spam) and P(Ham)\n",
@@ -933,15 +940,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Isolating spam and ham messages first\n",
+    "spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']\n",
+    "ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']\n",
+    "\n",
     "# P(Spam) and P(Ham)\n",
-    "p_spam = training_set_clean['Label'].value_counts(normalize=True)['spam']\n",
-    "p_ham = training_set_clean['Label'].value_counts(normalize=True)['ham']\n",
+    "p_spam = len(spam_messages) / len(training_set_clean)\n",
+    "p_ham = len(ham_messages) / len(training_set_clean)\n",
+    "\n",
+    "# N_Spam\n",
+    "n_words_per_spam_message = spam_messages['SMS'].apply(lambda x: len(x))\n",
+    "n_spam = n_words_per_spam_message.sum()\n",
     "\n",
-    "# N_Spam, N_Ham, and N_Vocabulary\n",
-    "n_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n",
-    "                                                    axis=1).sum()\n",
-    "n_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n",
-    "                                                    axis=1).sum()\n",
+    "# N_Ham\n",
+    "n_words_per_ham_message = ham_messages['SMS'].apply(lambda x: len(x))\n",
+    "n_ham = n_words_per_ham_message.sum()\n",
+    "\n",
+    "# N_Vocabulary\n",
     "n_vocabulary = len(vocabulary)\n",
     "\n",
     "# Laplace smoothing\n",
@@ -959,7 +974,10 @@
     "The parameters are calculated using the formulas:\n",
     "\n",
     "\\begin{equation}\n",
-    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
+    "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}}\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
     "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
     "\\end{equation}"
    ]
@@ -974,18 +992,13 @@
     "parameters_spam = {unique_word:0 for unique_word in vocabulary}\n",
     "parameters_ham = {unique_word:0 for unique_word in vocabulary}\n",
     "\n",
-    "# Isolate spam and ham messages before starting the loop below\n",
-    "# Don't do this inside the loop, it'll add to code running time significantly\n",
-    "spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']\n",
-    "ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']\n",
-    "\n",
     "# Calculate parameters\n",
     "for word in vocabulary:\n",
-    "    n_word_given_spam = spam_messages[word].sum()\n",
+    "    n_word_given_spam = spam_messages[word].sum()   # spam_messages already defined in a cell above\n",
     "    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)\n",
     "    parameters_spam[word] = p_word_given_spam\n",
     "    \n",
-    "    n_word_given_ham = ham_messages[word].sum()\n",
+    "    n_word_given_ham = ham_messages[word].sum()   # ham_messages already defined in a cell above\n",
     "    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)\n",
     "    parameters_ham[word] = p_word_given_ham"
    ]