---
title: "Conditional Probability in R: Guided Project Solutions"
output: html_document
---

```{r, warning = FALSE, message = FALSE}
library(tidyverse)
set.seed(1)
```

# Introduction

This analysis is an application of what we've learned in Dataquest's Conditional Probability course. Using a dataset of pre-labeled SMS messages, we'll create a spam filter with the Naive Bayes algorithm.

# Data

```{r}
# quote = "" keeps stray quotation marks inside messages from
# pairing across lines and silently merging rows during the parse
spam = read.csv("./data/SMSSpamCollection", sep = "\t", header = FALSE,
                quote = "", stringsAsFactors = FALSE)
colnames(spam) = c("label", "sms")
```

The `spam` dataset has `r nrow(spam)` rows and `r ncol(spam)` columns. Of these messages, `r round(mean(spam$label == "ham") * 100, 1)`% are ham (not spam); the rest are spam.
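
A quick peek at the first few rows shows the label column alongside the raw message text:

```{r}
head(spam)
```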

# Dividing Up Into Training, Cross-Validation, and Test Sets

```{r}
n = nrow(spam)

# Roughly an 80/10/10 split; computing the sizes from n keeps the
# split correct even if the parsed row count changes
n.training = floor(0.8 * n)
n.cv = floor(0.1 * n)
n.test = n - n.training - n.cv

# Create the random indices for the training set
train.indices = sample(1:n, size = n.training, replace = FALSE)

# Get the indices not used by the training set
remaining.indices = setdiff(1:n, train.indices)

# The remaining indices are already randomized; just allocate them
cv.indices = remaining.indices[1:n.cv]
test.indices = remaining.indices[(n.cv + 1):length(remaining.indices)]

# Use the indices to create each of the datasets
spam.train = spam[train.indices, ]
spam.cv = spam[cv.indices, ]
spam.test = spam[test.indices, ]

# Sanity check: are the ratios of ham to spam relatively constant?
print(mean(spam.train$label == "ham"))
print(mean(spam.cv$label == "ham"))
print(mean(spam.test$label == "ham"))
```

The proportion of ham messages in each dataset stays close to the original 87%, so these splits look good for the rest of the analysis.

# Data Cleaning

```{r}
# Lowercase everything, then strip punctuation, digits, and a few
# stray special characters left over from the file's encoding
tidy.train = spam.train %>%
  mutate(
    sms = tolower(sms),
    sms = str_replace_all(sms, "[[:punct:]]", ""),
    sms = str_replace_all(sms, "[[:digit:]]", " "),
    sms = str_replace_all(sms, "[\u0094\u0092\n\t]", " ")
  )

# Creating the vocabulary
vocabulary = NULL
messages = pull(tidy.train, sms)

# Iterate through the messages and add each new word to the vocabulary
for (m in messages) {
  words = str_split(m, " ")[[1]]
  words = words[!words %in% ""]
  vocabulary = c(vocabulary, words)
}
vocabulary = unique(vocabulary)
```
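
To see what these steps do, here is the same pipeline applied to a single made-up message (the text below is hypothetical, not from the dataset):

```{r}
# A made-up example message, cleaned with the same steps as above
example = "FREE entry!! Text WIN to 81010 now."
example = tolower(example)
example = str_replace_all(example, "[[:punct:]]", "")
example = str_replace_all(example, "[[:digit:]]", " ")
example
```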

# Calculating Constants and Parameters

```{r}
# Calculating Constants

# The mean of a vector of logicals is a proportion
p.spam = mean(tidy.train$label == "spam")
p.ham = mean(tidy.train$label == "ham")

# Isolate the spam and ham messages
spam.messages = tidy.train %>%
  filter(label == "spam") %>%
  pull("sms")
ham.messages = tidy.train %>%
  filter(label == "ham") %>%
  pull("sms")

# Collect every word (with repeats) that appears in each class,
# dropping the empty strings produced by consecutive spaces
spam.words = NULL
for (sm in spam.messages) {
  words = str_split(sm, " ")[[1]]
  words = words[!words %in% ""]
  spam.words = c(spam.words, words)
}
ham.words = NULL
for (hm in ham.messages) {
  words = str_split(hm, " ")[[1]]
  words = words[!words %in% ""]
  ham.words = c(ham.words, words)
}

# Denominators for Laplace smoothing: the total word count in each
# class (not the unique count) and the number of distinct words overall
n.spam = length(spam.words)
n.ham = length(ham.words)
n.vocabulary = length(vocabulary)
alpha = 1
```
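
With these constants in hand, each word probability below is a Laplace-smoothed relative frequency:

$$
P(w_i \mid \text{spam}) = \frac{N_{w_i \mid \text{spam}} + \alpha}{N_{\text{spam}} + \alpha \cdot N_{\text{vocabulary}}}
\qquad
P(w_i \mid \text{ham}) = \frac{N_{w_i \mid \text{ham}} + \alpha}{N_{\text{ham}} + \alpha \cdot N_{\text{vocabulary}}}
$$

The $\alpha$ in the numerator guarantees that no word ever gets a probability of exactly zero, which would otherwise wipe out the whole product for any message containing it.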

```{r}
# Calculating Parameters

# Tabulate how often each word occurs in each class; splitting each
# corpus once up front avoids re-scanning every message per word
spam.tab = table(spam.words)
ham.tab = table(ham.words)

spam.counts = list()
ham.counts = list()
spam.probs = list()
ham.probs = list()

for (vocab in vocabulary) {

  # Look up the counts for the word (0 if the class never uses it)
  spam.counts[[vocab]] = if (vocab %in% names(spam.tab)) spam.tab[[vocab]] else 0
  ham.counts[[vocab]] = if (vocab %in% names(ham.tab)) ham.tab[[vocab]] else 0

  # Use the counts to calculate the smoothed probabilities
  spam.probs[[vocab]] = (spam.counts[[vocab]] + alpha) / (n.spam + alpha * n.vocabulary)
  ham.probs[[vocab]] = (ham.counts[[vocab]] + alpha) / (n.ham + alpha * n.vocabulary)
}
```
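
As a quick sanity check, we can inspect the smoothed probabilities for a single vocabulary word (the first entry, whatever it happens to be):

```{r}
w = vocabulary[1]
c(spam = spam.probs[[w]], ham = ham.probs[[w]])
```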

# Classifying New Messages
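
Naive Bayes scores a message by multiplying the class prior by the conditional probability of every word in it, then picks the class with the larger unnormalized posterior:

$$
P(\text{spam} \mid w_1, \dots, w_n) \propto P(\text{spam}) \prod_{i=1}^{n} P(w_i \mid \text{spam})
$$

The `classify()` function below implements exactly this, treating words outside the vocabulary as uninformative (a factor of 1).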

```{r}
# classify() relies on the globals p.spam, p.ham, spam.probs, and
# ham.probs, so re-fitting those (as in the tuning section) changes
# its output without any other modification
classify = function(message) {

  # Initializing the probability products with the priors
  p.spam.given.message = p.spam
  p.ham.given.message = p.ham

  # Splitting and cleaning the new message, same steps as training
  clean.message = tolower(message)
  clean.message = str_replace_all(clean.message, "[[:punct:]]", "")
  clean.message = str_replace_all(clean.message, "[[:digit:]]", " ")
  clean.message = str_replace_all(clean.message, "[\u0094\u0092\n\t]", " ")
  words = str_split(clean.message, " ")[[1]]

  for (word in words) {

    # Words outside the vocabulary contribute a factor of 1
    wi.spam.prob = ifelse(word %in% vocabulary, spam.probs[[word]], 1)
    wi.ham.prob = ifelse(word %in% vocabulary, ham.probs[[word]], 1)

    p.spam.given.message = p.spam.given.message * wi.spam.prob
    p.ham.given.message = p.ham.given.message * wi.ham.prob
  }

  result = case_when(
    p.spam.given.message >= p.ham.given.message ~ "spam",
    p.spam.given.message < p.ham.given.message ~ "ham"
  )

  return(result)
}

final.train = tidy.train %>%
  mutate(
    prediction = unlist(map(sms, classify))
  ) %>%
  select(label, prediction, sms)

# Results of classification on the training set
confusion = table(final.train$label, final.train$prediction)
accuracy = (confusion[1, 1] + confusion[2, 2]) / nrow(final.train)
accuracy
```

The classifier achieves roughly 97% accuracy on the training set. We aren't really interested in performance on the training data, though, since the classifier has already "seen" all of these messages.
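
One caveat with multiplying raw probabilities: for long messages, the product of many values below 1 can underflow to 0 in floating point. The SMS messages here are short enough that this isn't a problem in practice, but a common variant works in log space instead. A minimal sketch (the `classify.log` name is ours; it reuses the same globals):

```{r}
# Hypothetical log-space variant: summing log-probabilities avoids
# the floating-point underflow that multiplying raw probabilities risks
classify.log = function(message) {
  log.spam = log(p.spam)
  log.ham = log(p.ham)

  clean.message = tolower(message)
  clean.message = str_replace_all(clean.message, "[[:punct:]]", "")
  clean.message = str_replace_all(clean.message, "[[:digit:]]", " ")
  clean.message = str_replace_all(clean.message, "[\u0094\u0092\n\t]", " ")
  words = str_split(clean.message, " ")[[1]]

  for (word in words) {
    # Unknown words contribute log(1) = 0, matching classify()
    if (word %in% vocabulary) {
      log.spam = log.spam + log(spam.probs[[word]])
      log.ham = log.ham + log(ham.probs[[word]])
    }
  }

  ifelse(log.spam >= log.ham, "spam", "ham")
}
```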

# Hyperparameter Tuning

```{r}
alpha.grid = seq(0.1, 1, by = 0.1)
cv.accuracy = NULL

for (a in alpha.grid) {

  # Re-estimate the word probabilities with the candidate alpha;
  # classify() picks these up through the global environment
  spam.probs = list()
  ham.probs = list()
  for (vocab in vocabulary) {
    spam.probs[[vocab]] = (spam.counts[[vocab]] + a) / (n.spam + a * n.vocabulary)
    ham.probs[[vocab]] = (ham.counts[[vocab]] + a) / (n.ham + a * n.vocabulary)
  }

  cv = spam.cv %>%
    mutate(
      prediction = unlist(map(sms, classify))
    ) %>%
    select(label, prediction, sms)

  confusion = table(cv$label, cv$prediction)
  acc = (confusion[1, 1] + confusion[2, 2]) / nrow(cv)
  cv.accuracy = c(cv.accuracy, acc)
}

cv.check = tibble(
  alpha = alpha.grid,
  accuracy = cv.accuracy
)
cv.check
```
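
A quick plot of the grid makes the trend easier to see (optional, using the ggplot2 already loaded with the tidyverse):

```{r}
ggplot(cv.check, aes(x = alpha, y = accuracy)) +
  geom_line() +
  geom_point() +
  labs(x = "alpha", y = "Cross-validation accuracy")
```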

Judging from the cross-validation set, higher $\alpha$ values cause the accuracy to decrease. We'll go with $\alpha = 0.1$ since it produces the highest cross-validation prediction accuracy.

# Test Set Performance

```{r}
# Refit the word probabilities once with the optimal alpha
optimal.alpha = 0.1
spam.probs = list()
ham.probs = list()
for (vocab in vocabulary) {
  spam.probs[[vocab]] = (spam.counts[[vocab]] + optimal.alpha) / (n.spam + optimal.alpha * n.vocabulary)
  ham.probs[[vocab]] = (ham.counts[[vocab]] + optimal.alpha) / (n.ham + optimal.alpha * n.vocabulary)
}

spam.test = spam.test %>%
  mutate(
    prediction = unlist(map(sms, classify))
  ) %>%
  select(label, prediction, sms)

confusion = table(spam.test$label, spam.test$prediction)
test.accuracy = (confusion[1, 1] + confusion[2, 2]) / nrow(spam.test)
test.accuracy
```

We achieve an accuracy of about 93% on the test set. Not bad!
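
If we wanted to dig further, a natural follow-up is to look at the test messages the filter gets wrong:

```{r}
# The handful of misclassified test messages, for inspection
spam.test %>%
  filter(label != prediction) %>%
  head()
```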