Queer European MD passionate about IT
Pārlūkot izejas kodu

Better naming of solutions manuals

Christian Pascual 5 gadi atpakaļ
vecāks
revīzija
f0ec102d9c

+ 1 - 1
Mission374Solutions.Rmd

@@ -1,5 +1,5 @@
 ---
-title: "Mission 374 Solutions"
+title: "Answering Business Questions using SQL (Intermediate SQL in R): Guided Project Solutions"
 output: html_document
 ---
 

+ 1 - 1
Mission376Solutions.Rmd

@@ -1,5 +1,5 @@
 ---
-title: "Mission 376 Solutions"
+title: "Designing and Creating a Database (Intermediate SQL in R): Guided Project Solutions"
 output: html_document
 ---
 

+ 1 - 1
Mission409Solutions.Rmd

@@ -1,5 +1,5 @@
 ---
-title: "Mission 409 Solutions"
+title: "Probability Fundamentals in R: Guided Project Solutions"
 output: html_document
 ---
 

+ 1 - 1
Mission443Solutions.Rmd

@@ -1,5 +1,5 @@
 ---
-title: "Mission 443 Solutions"
+title: "Hypothesis Testing in R: Guided Project Solutions"
 output: html_document
 ---
 

+ 267 - 0
Mission475Solutions.Rmd

@@ -0,0 +1,267 @@
+---
+title: "Conditional Probability in R: Guided Project Solutions"
+output: html_document
+---
+
+```{r, warning = FALSE, message = FALSE }
+library(tidyverse)
+set.seed(1)
+```
+
+# Introduction
+
+This analysis is an application of what we've learned in Dataquest's Conditional Probability course. Using a dataset of pre-labeled SMS messages, we'll create a spam filter using the Naive Bayes algorithm.
+
+# Data 
+
+```{r}
+# Load the tab-separated SMS collection; the file has no header row.
+# Use the full constant FALSE rather than F (F is an ordinary, reassignable binding).
+# NOTE(review): SMS text may contain quote characters — if rows appear to be
+# lost on import, consider read.csv(..., quote = ""); verify against the raw file.
+spam = read.csv("./data/SMSSpamCollection", sep = "\t", header = FALSE)
+colnames(spam) = c("label", "sms")
+```
+
+The `spam` dataset has `r nrow(spam)` rows and `r ncol(spam)` columns. Of these messages, `r mean(spam$label == "ham") * 100`% are ham (i.e. not spam); the rest are spam.
+
+# Dividing Up Into Training and Test Sets
+
+```{r}
+n = nrow(spam)
+
+# Split sizes for training / cross-validation / test
+n.training = 2547
+n.cv = 318
+n.test = 319
+
+# Create the random indices for the training set
+train.indices = sample(1:n, size = n.training, replace = FALSE)
+
+# Get indices not used by the training set
+remaining.indices = setdiff(1:n, train.indices)
+
+# Remaining indices are already randomized; allocate them using the sizes
+# declared above instead of repeating the numbers as literals
+cv.indices = remaining.indices[1:n.cv]
+test.indices = remaining.indices[(n.cv + 1):length(remaining.indices)]
+
+# Use the indices to create each of the datasets
+spam.train = spam[train.indices, ]
+spam.cv = spam[cv.indices, ]
+spam.test = spam[test.indices, ]
+
+# Sanity check: are the ratios of ham to spam relatively constant?
+print(mean(spam.train$label == "ham"))
+print(mean(spam.cv$label == "ham"))
+print(mean(spam.test$label == "ham"))
+```
+
+The proportion of ham messages in each dataset is relatively close to the original 87%. These datasets look good for future analysis.
+
+# Data Cleaning
+
+```{r}
+# Normalize the training messages: lowercase, strip punctuation, and
+# replace digits and stray control/curly-quote characters with spaces
+tidy.train = spam.train %>% 
+  mutate(
+    sms = tolower(sms),
+    sms = str_replace_all(sms, "[[:punct:]]", ""),
+    sms = str_replace_all(sms, "[[:digit:]]", " "),
+    sms = str_replace_all(sms, "[\u0094\u0092\n\t]", " ")
+  )
+
+# Build the vocabulary in one vectorized pass instead of growing a vector
+# with c() inside a loop: split every message on spaces, flatten, keep each
+# distinct word, and drop the empty strings produced by consecutive spaces.
+# unique() preserves first-occurrence order, so the result matches the
+# loop-built vocabulary exactly.
+vocabulary = unique(unlist(str_split(pull(tidy.train, sms), " ")))
+vocabulary = vocabulary[vocabulary != ""]
+```
+
+# Calculating Constants and Parameters
+
+```{r}
+# Calculating Constants
+# Prior probabilities: mean of a vector of logicals is a percentage
+p.spam = mean(tidy.train$label == "spam")
+p.ham = mean(tidy.train$label == "ham")
+
+# Isolate the spam and ham messages
+spam.messages = tidy.train %>% 
+  filter(label == "spam") %>% 
+  pull("sms")
+
+ham.messages = tidy.train %>% 
+  filter(label == "ham") %>% 
+  pull("sms")
+
+# Flatten each class's messages into one word vector in a single pass
+# (str_split is vectorized) rather than growing a vector with c() in a loop.
+# NOTE(review): empty strings from consecutive spaces are deliberately kept,
+# matching the original loop's output (it did not filter them here).
+spam.words = unlist(str_split(spam.messages, " "))
+ham.words = unlist(str_split(ham.messages, " "))
+
+# Unique word counts per class and smoothing constant for Naive Bayes
+n.spam = length(unique(spam.words))
+n.ham = length(unique(ham.words))
+n.vocabulary = length(vocabulary)
+alpha = 1  # Laplace smoothing
+```
+
+```{r}
+# Calculating Parameters
+spam.counts = list()
+ham.counts = list()
+spam.probs = list()
+ham.probs = list()
+
+# Count every word once with table() instead of re-splitting every message
+# for every vocabulary word — the original nested loops were
+# O(vocabulary x messages) and very slow. The totals are identical:
+# summing sum(words == vocab) over all messages equals the frequency of
+# vocab in the flattened word vector.
+spam.word.freq = table(spam.words)
+ham.word.freq = table(ham.words)
+
+for (vocab in vocabulary) {
+  
+  # Look up the precomputed count; words absent from a class count as 0
+  spam.counts[[vocab]] = if (vocab %in% names(spam.word.freq)) spam.word.freq[[vocab]] else 0
+  ham.counts[[vocab]] = if (vocab %in% names(ham.word.freq)) ham.word.freq[[vocab]] else 0
+  
+  # Laplace-smoothed conditional probability of the word given each class
+  spam.probs[[vocab]] = (spam.counts[[vocab]] + alpha) / (n.spam + alpha * n.vocabulary)
+  ham.probs[[vocab]] = (ham.counts[[vocab]] + alpha) / (n.ham + alpha * n.vocabulary)
+}
+```
+
+# Classifying New Messages
+
+```{r}
+# Classify a single message as "spam" or "ham" with Naive Bayes.
+# Relies on the globals p.spam, p.ham, vocabulary, spam.probs and ham.probs
+# computed in the chunks above.
+classify = function(message) {
+  
+  # Initialize the probability products with the class priors
+  p.spam.given.message = p.spam
+  p.ham.given.message = p.ham
+  
+  # Clean and split the new message exactly like the training data
+  clean.message = tolower(message)
+  clean.message = str_replace_all(clean.message, "[[:punct:]]", "")
+  clean.message = str_replace_all(clean.message, "[[:digit:]]", " ")
+  clean.message = str_replace_all(clean.message, "[\u0094\u0092\n\t]", " ")
+  words = str_split(clean.message, " ")[[1]]
+  
+  for (word in words) {
+    
+    # Words outside the vocabulary contribute a neutral factor of 1.
+    # Plain if/else replaces the original scalar ifelse() calls —
+    # ifelse() is meant for vectors and is an anti-pattern on scalars.
+    if (word %in% vocabulary) {
+      wi.spam.prob = spam.probs[[word]]
+      wi.ham.prob = ham.probs[[word]]
+    } else {
+      wi.spam.prob = 1
+      wi.ham.prob = 1
+    }
+    
+    p.spam.given.message = p.spam.given.message * wi.spam.prob
+    p.ham.given.message = p.ham.given.message * wi.ham.prob
+  }
+  
+  # Ties go to "spam", matching the original case_when's >= branch
+  if (p.spam.given.message >= p.ham.given.message) "spam" else "ham"
+}
+
+final.train = tidy.train %>% 
+  mutate(
+    prediction = unlist(map(sms, classify))
+  ) %>% 
+  select(label, prediction, sms)
+
+# Results of classification on training
+confusion = table(final.train$label, final.train$prediction)
+accuracy = (confusion[1,1] + confusion[2,2]) / nrow(final.train)
+```
+
+Roughly, the classifier achieves about 97% accuracy on the training set. We aren't interested in how well the classifier performs with training data though, the classifier has already "seen" all of these messages.
+
+# Hyperparameter Tuning
+
+```{r}
+alpha.grid = seq(0.1, 1, by = 0.1)
+
+# Preallocate the accuracy vector instead of growing it with c() in the loop
+cv.accuracy = numeric(length(alpha.grid))
+
+for (i in seq_along(alpha.grid)) {
+  a = alpha.grid[i]
+  
+  # Rebuild the word probabilities with the candidate alpha. classify()
+  # reads spam.probs / ham.probs from the global environment, so
+  # overwriting them here deliberately changes its behavior.
+  spam.probs = list()
+  ham.probs = list()
+  for (vocab in vocabulary) {
+    spam.probs[[vocab]] = (spam.counts[[vocab]] + a) / (n.spam + a * n.vocabulary)
+    ham.probs[[vocab]] = (ham.counts[[vocab]] + a) / (n.ham + a * n.vocabulary)
+  }
+  
+  # Score the cross-validation set with this alpha
+  cv = spam.cv %>% 
+    mutate(
+      prediction = unlist(map(sms, classify))
+    ) %>% 
+    select(label, prediction, sms)
+  
+  confusion = table(cv$label, cv$prediction)
+  cv.accuracy[i] = (confusion[1,1] + confusion[2,2]) / nrow(cv)
+}
+
+cv.check = tibble(
+  alpha = alpha.grid,
+  accuracy = cv.accuracy
+)
+cv.check
+```
+
+Judging from the cross-validation set, higher $\alpha$ values cause the accuracy to decrease. We'll go with $\alpha = 0.1$ since it produces the highest cross-validation prediction accuracy.
+
+# Test Set Performance
+
+```{r}
+# Re-establish the parameters with the optimal alpha found by
+# cross-validation. (The original wrapped this in a leftover
+# `for (a in alpha.grid)` loop that recomputed the exact same values
+# ten times; a single pass suffices.)
+optimal.alpha = 0.1
+
+spam.probs = list()
+ham.probs = list()
+for (vocab in vocabulary) {
+  spam.probs[[vocab]] = (spam.counts[[vocab]] + optimal.alpha) / (n.spam + optimal.alpha * n.vocabulary)
+  ham.probs[[vocab]] = (ham.counts[[vocab]] + optimal.alpha) / (n.ham + optimal.alpha * n.vocabulary)
+}
+
+spam.test = spam.test %>% 
+  mutate(
+    prediction = unlist(map(sms, classify))
+    ) %>% 
+  select(label, prediction, sms)
+  
+confusion = table(spam.test$label, spam.test$prediction)
+
+# Bug fix: divide by the number of test rows, not nrow(cv) — the original
+# divided by the cross-validation set size (318) instead of the test set
+# size (319), slightly inflating the reported accuracy
+test.accuracy = (confusion[1,1] + confusion[2,2]) / nrow(spam.test)
+test.accuracy
+```
+
+We've achieved an accuracy of roughly 93% on the test set. Not bad!