@@ -15,12 +15,12 @@ This analysis is an application of what we've learned in Dataquest's Conditional

```{r}
# Bring in the dataset
-spam = read.csv("spam.csv")
+spam = read_csv("spam.csv")
```
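
A quick note on this change: `read_csv()` comes from readr (attached with the tidyverse, which the pipes and tibbles below assume is loaded) and returns a tibble rather than a base data frame.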

The `spam` dataset has `r nrow(spam)` rows and `r ncol(spam)` columns. Of these messages, `r mean(spam$label == "ham") * 100`% are not spam ("ham"); the rest are spam.

-# Dividing Up Into Training and Test Sets
+# Training, Cross-Validation, and Test Sets

```{r}
# Calculate some helper values to split the dataset
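# (The rest of this chunk is elided by the diff. As a minimal sketch, one way
# to produce the three sets is an 80/10/10 random split; the names and
# proportions here are illustrative guesses, not the author's code. The
# cross-validation set presumably feeds the Hyperparameter Tuning section below.)
set.seed(1)  # fix the shuffle so the split is reproducible
n = nrow(spam)
shuffled = spam[sample(n), ]
train = shuffled[1:floor(0.8 * n), ]
cv = shuffled[(floor(0.8 * n) + 1):floor(0.9 * n), ]
test = shuffled[(floor(0.9 * n) + 1):n, ]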
@@ -113,9 +113,15 @@ n_ham = ham_vocab %>% length
n_vocabulary = vocabulary %>% length
```

+# Calculating Probability Parameters
+
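For reference, this chunk estimates the pieces of the Naive Bayes decision rule, and the diff hoists `p_spam` and `p_ham` to this top level so they are computed once rather than on every call to `classify()`. For a message with words $w_1, \dots, w_n$, the classifier compares

$$P(\text{spam}) \prod_{i=1}^{n} P(w_i \mid \text{spam}) \quad \text{against} \quad P(\text{ham}) \prod_{i=1}^{n} P(w_i \mid \text{ham}),$$

where each word probability is smoothed by the Laplace parameter $\alpha$:

$$P(w_i \mid \text{spam}) = \frac{N_{w_i \mid \text{spam}} + \alpha}{N_{\text{spam}} + \alpha \cdot N_{\text{vocabulary}}}.$$

Here $N_{w_i \mid \text{spam}}$ is the number of times $w_i$ appears in spam messages, and $N_{\text{spam}}$, $N_{\text{ham}}$, and $N_{\text{vocabulary}}$ are the `n_spam`, `n_ham`, and `n_vocabulary` counts computed above.
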
```{r}
# New vectorized approach to calculating ham and spam probabilities

+# Marginal probability of a training message being spam or ham
+p_spam = mean(tidy_train$label == "spam")
+p_ham = mean(tidy_train$label == "ham")
+
# Break up the spam and ham counting into their own tibbles
spam_counts = tibble(
  word = spam_vocab
@@ -172,10 +178,6 @@ word_counts = full_join(spam_counts, ham_counts, by = "word") %>%
# based on this alpha (default to 1)
classify = function(message, alpha = 1) {

-  # Initializing the probability product
-  p_spam = mean(tidy_train$label == "spam")
-  p_ham = mean(tidy_train$label == "ham")
-
  # Splitting and cleaning the new message
  # This is the same cleaning procedure used on the training messages
  clean_message = str_to_lower(message) %>%
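  # (classify() is truncated here by the diff. Conceptually, the remaining
  # steps look up each cleaned word's smoothed conditional probabilities and
  # compare the two posteriors; as a rough sketch, not the author's exact code:
  #   p_if_spam = p_spam * prod((word_spam_counts + alpha) / (n_spam + alpha * n_vocabulary))
  #   p_if_ham = p_ham * prod((word_ham_counts + alpha) / (n_ham + alpha * n_vocabulary))
  # then return "spam" or "ham" for whichever is larger.)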
@@ -233,13 +235,17 @@ final_train = tidy_train %>%
  mutate(
    prediction = map_chr(sms, function(m) { classify(m) })
  )
+```
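
As a quick spot check before measuring accuracy, `classify()` can be called on a single message; the text below is invented for illustration:

```{r}
classify("WINNER!! You have won a free prize, text CLAIM to 80082")
```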

+# Calculating Accuracy

+```{r}
# Results of classification on the training set
confusion = table(final_train$label, final_train$prediction)
accuracy = (confusion[1,1] + confusion[2,2]) / nrow(final_train)
```

+
The Naive Bayes classifier achieves an accuracy of about 89% on the training set. Pretty good! Let's see how well it works on messages it has never seen before.

# Hyperparameter Tuning
|