@@ -15,12 +15,12 @@ This analysis is an application of what we've learned in Dataquest's Conditional

```{r}
# Bring in the dataset
-spam = read.csv("spam.csv")
+spam = read_csv("spam.csv")
```
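
A quick note on this change: `read_csv()` comes from readr (attached with the tidyverse, which the pipes and tibbles below assume is loaded) and returns a tibble rather than a base data frame.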

The `spam` dataset has `r nrow(spam)` rows and `r ncol(spam)` columns. Of these messages, `r mean(spam$label == "ham") * 100`% are not spam ("ham"); the rest are spam.

-# Dividing Up Into Training and Test Sets
+# Training, Cross-Validation, and Test Sets

```{r}
# Calculate some helper values to split the dataset
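# (The rest of this chunk is elided by the diff. As a minimal sketch, one way
# to produce the three sets is an 80/10/10 random split; the names and
# proportions here are illustrative guesses, not the author's code. The
# cross-validation set presumably feeds the Hyperparameter Tuning section below.)
set.seed(1)  # fix the shuffle so the split is reproducible
n = nrow(spam)
shuffled = spam[sample(n), ]
train = shuffled[1:floor(0.8 * n), ]
cv = shuffled[(floor(0.8 * n) + 1):floor(0.9 * n), ]
test = shuffled[(floor(0.9 * n) + 1):n, ]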
@@ -113,9 +113,15 @@ n_ham = ham_vocab %>% length
n_vocabulary = vocabulary %>% length
```

+# Calculating Probability Parameters
+
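For reference, this chunk estimates the pieces of the Naive Bayes decision rule, and the diff hoists `p_spam` and `p_ham` to this top level so they are computed once rather than on every call to `classify()`. For a message with words $w_1, \dots, w_n$, the classifier compares

$$P(\text{spam}) \prod_{i=1}^{n} P(w_i \mid \text{spam}) \quad \text{against} \quad P(\text{ham}) \prod_{i=1}^{n} P(w_i \mid \text{ham}),$$

where each word probability is smoothed by the Laplace parameter $\alpha$:

$$P(w_i \mid \text{spam}) = \frac{N_{w_i \mid \text{spam}} + \alpha}{N_{\text{spam}} + \alpha \cdot N_{\text{vocabulary}}}.$$

Here $N_{w_i \mid \text{spam}}$ is the number of times $w_i$ appears in spam messages, and $N_{\text{spam}}$, $N_{\text{ham}}$, and $N_{\text{vocabulary}}$ are the `n_spam`, `n_ham`, and `n_vocabulary` counts computed above.
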
```{r}
# New vectorized approach to calculating ham and spam probabilities

+# Marginal probability of a training message being spam or ham
+p_spam = mean(tidy_train$label == "spam")
+p_ham = mean(tidy_train$label == "ham")
+
# Break up the spam and ham counting into their own tibbles
spam_counts = tibble(
  word = spam_vocab
@@ -172,10 +178,6 @@ word_counts = full_join(spam_counts, ham_counts, by = "word") %>%
# based on this alpha (default to 1)
classify = function(message, alpha = 1) {

-  # Initializing the probability product
-  p_spam = mean(tidy_train$label == "spam")
-  p_ham = mean(tidy_train$label == "ham")
-
  # Splitting and cleaning the new message
  # This is the same cleaning procedure used on the training messages
  clean_message = str_to_lower(message) %>%
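  # (classify() is truncated here by the diff. Conceptually, the remaining
  # steps look up each cleaned word's smoothed conditional probabilities and
  # compare the two posteriors; as a rough sketch, not the author's exact code:
  #   p_if_spam = p_spam * prod((word_spam_counts + alpha) / (n_spam + alpha * n_vocabulary))
  #   p_if_ham = p_ham * prod((word_ham_counts + alpha) / (n_ham + alpha * n_vocabulary))
  # then return "spam" or "ham" for whichever is larger.)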
@@ -233,13 +235,17 @@ final_train = tidy_train %>%
  mutate(
    prediction = map_chr(sms, function(m) { classify(m) })
  )
+```
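
As a quick spot check before measuring accuracy, `classify()` can be called on a single message; the text below is invented for illustration:

```{r}
classify("WINNER!! You have won a free prize, text CLAIM to 80082")
```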

+# Calculating Accuracy

+```{r}
# Results of classification on the training set
confusion = table(final_train$label, final_train$prediction)
accuracy = (confusion[1,1] + confusion[2,2]) / nrow(final_train)
```

+
The Naive Bayes classifier achieves an accuracy of about 89% on the training set. Pretty good! Let's see how well it works on messages it has never seen before.

# Hyperparameter Tuning
|