|
@@ -472,18 +472,11 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "word_counts_per_sms = {unique_word:[] for unique_word in vocabulary}\n",
|
|
|
+ "word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}\n",
|
|
|
"\n",
|
|
|
- "for unique_word in vocabulary:\n",
|
|
|
- " \n",
|
|
|
- " for sms in training_set['SMS']: \n",
|
|
|
- " counter = 0\n",
|
|
|
- " \n",
|
|
|
- " for word in sms: \n",
|
|
|
- " if word == unique_word:\n",
|
|
|
- " counter += 1\n",
|
|
|
- " \n",
|
|
|
- " word_counts_per_sms[unique_word].append(counter)"
|
|
|
+ "for index, sms in enumerate(training_set['SMS']):\n",
|
|
|
+ " for word in sms:\n",
|
|
|
+ " word_counts_per_sms[word][index] += 1"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -512,27 +505,27 @@
|
|
|
" <thead>\n",
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
" <th></th>\n",
|
|
|
- " <th>should</th>\n",
|
|
|
- " <th>huiming</th>\n",
|
|
|
- " <th>double</th>\n",
|
|
|
- " <th>freshers</th>\n",
|
|
|
- " <th>here</th>\n",
|
|
|
- " <th>tea</th>\n",
|
|
|
- " <th>running</th>\n",
|
|
|
- " <th>seat</th>\n",
|
|
|
- " <th>mb</th>\n",
|
|
|
- " <th>safely</th>\n",
|
|
|
+ " <th>woke</th>\n",
|
|
|
+ " <th>gr8</th>\n",
|
|
|
+ " <th>forum</th>\n",
|
|
|
+ " <th>bettr</th>\n",
|
|
|
+ " <th>std</th>\n",
|
|
|
+ " <th>pobox334</th>\n",
|
|
|
+ " <th>wap</th>\n",
|
|
|
+ " <th>kalstiya</th>\n",
|
|
|
+ " <th>skillgame</th>\n",
|
|
|
+ " <th>slap</th>\n",
|
|
|
" <th>...</th>\n",
|
|
|
- " <th>nike</th>\n",
|
|
|
- " <th>ache</th>\n",
|
|
|
- " <th>08000930705</th>\n",
|
|
|
- " <th>printing</th>\n",
|
|
|
- " <th>450</th>\n",
|
|
|
- " <th>shanghai</th>\n",
|
|
|
- " <th>purple</th>\n",
|
|
|
- " <th>bday</th>\n",
|
|
|
- " <th>nd</th>\n",
|
|
|
- " <th>ignorant</th>\n",
|
|
|
+ " <th>sterm</th>\n",
|
|
|
+ " <th>click</th>\n",
|
|
|
+ " <th>person2die</th>\n",
|
|
|
+ " <th>amused</th>\n",
|
|
|
+ " <th>box434sk38wp150ppm18</th>\n",
|
|
|
+ " <th>bcaz</th>\n",
|
|
|
+ " <th>lodging</th>\n",
|
|
|
+ " <th>lyf</th>\n",
|
|
|
+ " <th>officially</th>\n",
|
|
|
+ " <th>again</th>\n",
|
|
|
" </tr>\n",
|
|
|
" </thead>\n",
|
|
|
" <tbody>\n",
|
|
@@ -662,26 +655,26 @@
|
|
|
"</div>"
|
|
|
],
|
|
|
"text/plain": [
|
|
|
- " should huiming double freshers here tea running seat mb safely \\\n",
|
|
|
- "0 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "1 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "2 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "3 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "4 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
+ " woke gr8 forum bettr std pobox334 wap kalstiya skillgame slap \\\n",
|
|
|
+ "0 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
+ "1 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
+ "2 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
+ "3 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
+ "4 0 0 0 0 0 0 0 0 0 0 \n",
|
|
|
"\n",
|
|
|
- " ... nike ache 08000930705 printing 450 shanghai purple bday nd \\\n",
|
|
|
- "0 ... 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "1 ... 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "2 ... 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "3 ... 0 0 0 0 0 0 0 0 0 \n",
|
|
|
- "4 ... 0 0 0 0 0 0 0 0 0 \n",
|
|
|
+ " ... sterm click person2die amused box434sk38wp150ppm18 bcaz lodging \\\n",
|
|
|
+ "0 ... 0 0 0 0 0 0 0 \n",
|
|
|
+ "1 ... 0 0 0 0 0 0 0 \n",
|
|
|
+ "2 ... 0 0 0 0 0 0 0 \n",
|
|
|
+ "3 ... 0 0 0 0 0 0 0 \n",
|
|
|
+ "4 ... 0 0 0 0 0 0 0 \n",
|
|
|
"\n",
|
|
|
- " ignorant \n",
|
|
|
- "0 0 \n",
|
|
|
- "1 0 \n",
|
|
|
- "2 0 \n",
|
|
|
- "3 0 \n",
|
|
|
- "4 0 \n",
|
|
|
+ " lyf officially again \n",
|
|
|
+ "0 0 0 0 \n",
|
|
|
+ "1 0 0 0 \n",
|
|
|
+ "2 0 0 0 \n",
|
|
|
+ "3 0 0 0 \n",
|
|
|
+ "4 0 0 0 \n",
|
|
|
"\n",
|
|
|
"[5 rows x 7783 columns]"
|
|
|
]
|
|
@@ -724,25 +717,25 @@
|
|
|
" <th></th>\n",
|
|
|
" <th>Label</th>\n",
|
|
|
" <th>SMS</th>\n",
|
|
|
- " <th>should</th>\n",
|
|
|
- " <th>huiming</th>\n",
|
|
|
- " <th>double</th>\n",
|
|
|
- " <th>freshers</th>\n",
|
|
|
- " <th>here</th>\n",
|
|
|
- " <th>tea</th>\n",
|
|
|
- " <th>running</th>\n",
|
|
|
- " <th>seat</th>\n",
|
|
|
+ " <th>woke</th>\n",
|
|
|
+ " <th>gr8</th>\n",
|
|
|
+ " <th>forum</th>\n",
|
|
|
+ " <th>bettr</th>\n",
|
|
|
+ " <th>std</th>\n",
|
|
|
+ " <th>pobox334</th>\n",
|
|
|
+ " <th>wap</th>\n",
|
|
|
+ " <th>kalstiya</th>\n",
|
|
|
" <th>...</th>\n",
|
|
|
- " <th>nike</th>\n",
|
|
|
- " <th>ache</th>\n",
|
|
|
- " <th>08000930705</th>\n",
|
|
|
- " <th>printing</th>\n",
|
|
|
- " <th>450</th>\n",
|
|
|
- " <th>shanghai</th>\n",
|
|
|
- " <th>purple</th>\n",
|
|
|
- " <th>bday</th>\n",
|
|
|
- " <th>nd</th>\n",
|
|
|
- " <th>ignorant</th>\n",
|
|
|
+ " <th>sterm</th>\n",
|
|
|
+ " <th>click</th>\n",
|
|
|
+ " <th>person2die</th>\n",
|
|
|
+ " <th>amused</th>\n",
|
|
|
+ " <th>box434sk38wp150ppm18</th>\n",
|
|
|
+ " <th>bcaz</th>\n",
|
|
|
+ " <th>lodging</th>\n",
|
|
|
+ " <th>lyf</th>\n",
|
|
|
+ " <th>officially</th>\n",
|
|
|
+ " <th>again</th>\n",
|
|
|
" </tr>\n",
|
|
|
" </thead>\n",
|
|
|
" <tbody>\n",
|
|
@@ -872,26 +865,26 @@
|
|
|
"</div>"
|
|
|
],
|
|
|
"text/plain": [
|
|
|
- " Label SMS should huiming \\\n",
|
|
|
- "0 ham [yep, by, the, pretty, sculpture] 0 0 \n",
|
|
|
- "1 ham [yes, princess, are, you, going, to, make, me,... 0 0 \n",
|
|
|
- "2 ham [welp, apparently, he, retired] 0 0 \n",
|
|
|
- "3 ham [havent] 0 0 \n",
|
|
|
- "4 ham [i, forgot, 2, ask, ü, all, smth, there, s, a,... 0 0 \n",
|
|
|
+ " Label SMS woke gr8 forum \\\n",
|
|
|
+ "0 ham [yep, by, the, pretty, sculpture] 0 0 0 \n",
|
|
|
+ "1 ham [yes, princess, are, you, going, to, make, me,... 0 0 0 \n",
|
|
|
+ "2 ham [welp, apparently, he, retired] 0 0 0 \n",
|
|
|
+ "3 ham [havent] 0 0 0 \n",
|
|
|
+ "4 ham [i, forgot, 2, ask, ü, all, smth, there, s, a,... 0 0 0 \n",
|
|
|
"\n",
|
|
|
- " double freshers here tea running seat ... nike ache 08000930705 \\\n",
|
|
|
- "0 0 0 0 0 0 0 ... 0 0 0 \n",
|
|
|
- "1 0 0 0 0 0 0 ... 0 0 0 \n",
|
|
|
- "2 0 0 0 0 0 0 ... 0 0 0 \n",
|
|
|
- "3 0 0 0 0 0 0 ... 0 0 0 \n",
|
|
|
- "4 0 0 0 0 0 0 ... 0 0 0 \n",
|
|
|
+ " bettr std pobox334 wap kalstiya ... sterm click person2die amused \\\n",
|
|
|
+ "0 0 0 0 0 0 ... 0 0 0 0 \n",
|
|
|
+ "1 0 0 0 0 0 ... 0 0 0 0 \n",
|
|
|
+ "2 0 0 0 0 0 ... 0 0 0 0 \n",
|
|
|
+ "3 0 0 0 0 0 ... 0 0 0 0 \n",
|
|
|
+ "4 0 0 0 0 0 ... 0 0 0 0 \n",
|
|
|
"\n",
|
|
|
- " printing 450 shanghai purple bday nd ignorant \n",
|
|
|
- "0 0 0 0 0 0 0 0 \n",
|
|
|
- "1 0 0 0 0 0 0 0 \n",
|
|
|
- "2 0 0 0 0 0 0 0 \n",
|
|
|
- "3 0 0 0 0 0 0 0 \n",
|
|
|
- "4 0 0 0 0 0 0 0 \n",
|
|
|
+ " box434sk38wp150ppm18 bcaz lodging lyf officially again \n",
|
|
|
+ "0 0 0 0 0 0 0 \n",
|
|
|
+ "1 0 0 0 0 0 0 \n",
|
|
|
+ "2 0 0 0 0 0 0 \n",
|
|
|
+ "3 0 0 0 0 0 0 \n",
|
|
|
+ "4 0 0 0 0 0 0 \n",
|
|
|
"\n",
|
|
|
"[5 rows x 7785 columns]"
|
|
|
]
|
|
@@ -915,27 +908,21 @@
|
|
|
"We're now done with cleaning the training set, and we can begin creating the spam filter. The Naive Bayes algorithm will need to answer these two probability questions to be able to classify new messages:\n",
|
|
|
"\n",
|
|
|
"\\begin{equation}\n",
|
|
|
- "P(Spam | w_1,w_2, ..., w_n) = P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam)\n",
|
|
|
- "\\end{equation}\n",
|
|
|
- "\n",
|
|
|
- "\\begin{equation}\n",
|
|
|
- "P(Ham | w_1,w_2, ..., w_n) = P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n",
|
|
|
+ "P(Spam | w_1,w_2, ..., w_n) \\propto P(Spam) \\cdot \\prod_{i=1}^{n}P(w_i|Spam) \\\\\\\n",
|
|
|
+ "P(Ham | w_1,w_2, ..., w_n) \\propto P(Ham) \\cdot \\prod_{i=1}^{n}P(w_i|Ham)\n",
|
|
|
"\\end{equation}\n",
|
|
|
"\n",
|
|
|
- "Also, to calculate $P(w_i|Spam)$ and $P(w_i|Ham)$ inside the formulas above (where $w_i$ can be any word), we'll need to use:\n",
|
|
|
+ "Also, to calculate P(w<sub>i</sub>|Spam) and P(w<sub>i</sub>|Ham) inside the formulas above, we'll need to use these equations:\n",
|
|
|
"\n",
|
|
|
"\\begin{equation}\n",
|
|
|
- "P(w_i|Spam) = \\frac{card(w_i|Spam) + \\alpha}{card(Spam) + \\alpha \\cdot card(Vocabulary)}\n",
|
|
|
- "\\end{equation}\n",
|
|
|
- "\n",
|
|
|
- "\\begin{equation}\n",
|
|
|
- "P(w_i|Ham) = \\frac{card(w_i|Ham) + \\alpha}{card(Ham) + \\alpha \\cdot card(Vocabulary)}\n",
|
|
|
+ "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
|
|
|
+ "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
|
|
|
"\\end{equation}\n",
|
|
|
"\n",
|
|
|
"Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new message comes in. Below, we'll use our training set to calculate:\n",
|
|
|
"\n",
|
|
|
"- P(Spam) and P(Ham)\n",
|
|
|
- "- card(Spam), card(Ham), and card(Vocabulary)\n",
|
|
|
+ "- N<sub>Spam</sub>, N<sub>Ham</sub>, and N<sub>Vocabulary</sub>\n",
|
|
|
"\n",
|
|
|
"We'll also use Laplace smoothing and set $\\alpha = 1$."
|
|
|
]
|
|
@@ -950,12 +937,12 @@
|
|
|
"p_spam = training_set_clean['Label'].value_counts(normalize=True)['spam']\n",
|
|
|
"p_ham = training_set_clean['Label'].value_counts(normalize=True)['ham']\n",
|
|
|
"\n",
|
|
|
- "# card(Spam), card(Ham), and card(Vocabulary)\n",
|
|
|
- "card_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n",
|
|
|
+ "# N_Spam, N_Ham, and N_Vocabulary\n",
|
|
|
+ "n_spam = training_set_clean[training_set_clean['Label'] == 'spam'].sum(\n",
|
|
|
" axis=1).sum()\n",
|
|
|
- "card_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n",
|
|
|
+ "n_ham = training_set_clean[training_set_clean['Label'] == 'ham'].sum(\n",
|
|
|
" axis=1).sum()\n",
|
|
|
- "card_vocabulary = len(vocabulary)\n",
|
|
|
+ "n_vocabulary = len(vocabulary)\n",
|
|
|
"\n",
|
|
|
"# Laplace smoothing\n",
|
|
|
"alpha = 1"
|
|
@@ -972,11 +959,8 @@
|
|
|
"The parameters are calculated using the formulas:\n",
|
|
|
"\n",
|
|
|
"\\begin{equation}\n",
|
|
|
- "P(w_i|Spam) = \\frac{card(w_i|Spam) + \\alpha}{card(Spam) + \\alpha \\cdot card(Vocabulary)}\n",
|
|
|
- "\\end{equation}\n",
|
|
|
- "\n",
|
|
|
- "\\begin{equation}\n",
|
|
|
- "P(w_i|Ham) = \\frac{card(w_i|Ham) + \\alpha}{card(Ham) + \\alpha \\cdot card(Vocabulary)}\n",
|
|
|
+ "P(w_i|Spam) = \\frac{N_{w_i|Spam} + \\alpha}{N_{Spam} + \\alpha \\cdot N_{Vocabulary}} \\\\\\\n",
|
|
|
+ "P(w_i|Ham) = \\frac{N_{w_i|Ham} + \\alpha}{N_{Ham} + \\alpha \\cdot N_{Vocabulary}}\n",
|
|
|
"\\end{equation}"
|
|
|
]
|
|
|
},
|
|
@@ -997,12 +981,12 @@
|
|
|
"\n",
|
|
|
"# Calculate parameters\n",
|
|
|
"for word in vocabulary:\n",
|
|
|
- " card_word_given_spam = spam_messages[word].sum()\n",
|
|
|
- " p_word_given_spam = (card_word_given_spam + alpha) / (card_spam + alpha*card_vocabulary)\n",
|
|
|
+ " n_word_given_spam = spam_messages[word].sum()\n",
|
|
|
+ " p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)\n",
|
|
|
" parameters_spam[word] = p_word_given_spam\n",
|
|
|
" \n",
|
|
|
- " card_word_given_ham = ham_messages[word].sum()\n",
|
|
|
- " p_word_given_ham = (card_word_given_ham + alpha) / (card_ham + alpha*card_vocabulary)\n",
|
|
|
+ " n_word_given_ham = ham_messages[word].sum()\n",
|
|
|
+ " p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)\n",
|
|
|
" parameters_ham[word] = p_word_given_ham"
|
|
|
]
|
|
|
},
|
|
@@ -1105,7 +1089,7 @@
|
|
|
"\n",
|
|
|
"The two results above look promising, but let's see how well the filter does on our test set, which has 1,114 messages.\n",
|
|
|
"\n",
|
|
|
- "We'll start by writing a function that returns classification labels instead of print them."
|
|
|
+ "We'll start by writing a function that returns classification labels instead of printing them."
|
|
|
]
|
|
|
},
|
|
|
{
|