---
title: "Hypothesis Testing in R: Guided Project Solutions"
output: html_document
---
We would like to remind our students that our solutions represent just one of the many ways that a programmer might perform the analyses. This solution merely provides a platform for those who need a bit more guidance.
```{r setup}
library(tidyverse)
```
```{r}
jeopardy = read_csv("./data/jeopardy.csv")
```
```{r}
head(jeopardy)
```
```{r}
colnames(jeopardy)
```
```{r}
colnames(jeopardy) = c("show_number", "air_date", "round", "category", "value", "question", "answer")
```
```{r}
sapply(jeopardy, typeof)
```
```{r}
unique(jeopardy$value)
```
```{r}
# Remove questions with no listed value, then strip "$" and "," so that
# value can be treated as a number
jeopardy = jeopardy %>% 
  filter(value != "None") %>% 
  mutate(
    value = str_replace_all(value, "[$,]", ""),
    value = as.numeric(value)
  )
```
```{r}
unique(jeopardy$value)
```
```{r}
# Lowercase the text columns and strip punctuation so that words can be
# compared consistently later on
jeopardy = jeopardy %>% 
  mutate(
    question = tolower(question),
    question = str_replace_all(question, "[^A-Za-z0-9 ]", ""),
    answer = tolower(answer),
    answer = str_replace_all(answer, "[^A-Za-z0-9 ]", ""),
    category = tolower(category),
    category = str_replace_all(category, "[^A-Za-z0-9 ]", "")
  )
```
```{r}
head(jeopardy)
```
```{r}
# Split the air date into year, month and day, and make each part numeric
jeopardy = jeopardy %>% 
  separate(air_date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(
    year = as.numeric(year),
    month = as.numeric(month),
    day = as.numeric(day)
  )
```
```{r}
n_questions = nrow(jeopardy)

# Under the null hypothesis, each of the 3369 unique categories is equally
# likely, so any single category has probability 1/3369
p_category_expected = 1/3369
p_not_category_expected = 3368/3369
```
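The expected probabilities above treat each distinct category as equally likely, with 3369 as the category count. If you want to check that figure against the cleaned data, one option (this check is ours, not part of the original solution) is:

```{r}
# Count how many distinct categories the cleaned data contains;
# this is the denominator behind the 1/3369 expected probability
n_distinct(jeopardy$category)
```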
```{r}
# Count the questions whose category is exactly "science"
categories = pull(jeopardy, category)
n_science_categories = 0

for (c in categories) {
  if ("science" %in% c) {
    n_science_categories = n_science_categories + 1
  }
}

science_obs = c(n_science_categories, n_questions - n_science_categories)
p_expected = c(1/3369, 3368/3369)
chisq.test(science_obs, p = p_expected)
```
```{r}
# Repeat the test for the "history" category
n_history_categories = 0

for (c in categories) {
  if ("history" %in% c) {
    n_history_categories = n_history_categories + 1
  }
}

history_obs = c(n_history_categories, n_questions - n_history_categories)
p_expected = c(1/3369, 3368/3369)
chisq.test(history_obs, p = p_expected)
```
```{r}
# And again for the "shakespeare" category
n_shakespeare_categories = 0

for (c in categories) {
  if ("shakespeare" %in% c) {
    n_shakespeare_categories = n_shakespeare_categories + 1
  }
}

shakespeare_obs = c(n_shakespeare_categories, n_questions - n_shakespeare_categories)
p_expected = c(1/3369, 3368/3369)
chisq.test(shakespeare_obs, p = p_expected)
```
We see p-values below 0.05 for each of the hypothesis tests, so we reject the null hypothesis that science appears no more often than it would if every category were equally likely. We reach the same conclusion for history and Shakespeare: all three topics are more prevalent in the Jeopardy data than chance alone would suggest.
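The three tests above all follow the same pattern, so the counting and testing steps can be wrapped in a small helper if you prefer. This is just one possible refactor rather than part of the required solution, and the function name `test_category` is our own:

```{r}
# Hypothetical helper: count the questions whose category matches the
# keyword exactly, then test that count against the expectation that
# every one of the 3369 categories is equally likely
test_category = function(keyword) {
  n_match = sum(categories == keyword, na.rm = TRUE)
  obs = c(n_match, n_questions - n_match)
  chisq.test(obs, p = c(1/3369, 3368/3369))
}

# Should reproduce the science test above
test_category("science")
```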
```{r}
# Collect every unique term of six or more characters that appears
# in the questions
questions = pull(jeopardy, question)
terms_used = character(0)

for (q in questions) {
  # Split the question into individual words
  split_sentence = str_split(q, " ")[[1]]
  
  # Keep a term if it is at least six characters long and not seen before
  for (term in split_sentence) {
    if (!term %in% terms_used & nchar(term) >= 6) {
      terms_used = c(terms_used, term)
    }
  }
}
```
```{r}
values = pull(jeopardy, value)
value_count_data = NULL

for (term in terms_used[1:20]) {
  n_high_value = 0
  n_low_value = 0
  
  for (i in 1:length(questions)) {
    split_sentence = str_split(questions[i], " ")[[1]]
    
    # Count how often the term appears in high value (>= $800)
    # versus low value (< $800) questions
    if (term %in% split_sentence & values[i] >= 800) {
      n_high_value = n_high_value + 1
    } else if (term %in% split_sentence & values[i] < 800) {
      n_low_value = n_low_value + 1
    }
  }
  
  # Compare the observed split against the expected proportions of
  # 2/5 high value and 3/5 low value
  test = chisq.test(c(n_high_value, n_low_value), p = c(2/5, 3/5))
  new_row = c(term, n_high_value, n_low_value, test$p.value)
  
  value_count_data = rbind(value_count_data, new_row)
}
```
```{r}
tidy_value_count_data = as_tibble(value_count_data)
colnames(tidy_value_count_data) = c("term", "n_high", "n_low", "p_value")

head(tidy_value_count_data)
```
We can see from the output that some of the counts are less than 5. Recall that the chi-squared test is prone to error when the count in any cell is below 5, so we may need to discard these terms and only look at terms where both counts are greater than 5.

From the 20 terms that we looked at, it seems that the term "indian" is more associated with high value questions. Interesting!
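If we wanted to act on that caveat, one possible next step (not part of the original solution) is to convert the count columns back to numbers and keep only the terms with enough observations in both cells. This assumes the `tidy_value_count_data` tibble built above, whose columns are still stored as character:

```{r}
# rbind() coerced everything to character, so convert the counts and
# p-values back to numbers before filtering
reliable_terms = tidy_value_count_data %>% 
  mutate(
    n_high = as.numeric(n_high),
    n_low = as.numeric(n_low),
    p_value = as.numeric(p_value)
  ) %>% 
  # Keep only terms where both cells have more than 5 observations
  filter(n_high > 5, n_low > 5) %>% 
  arrange(p_value)

reliable_terms
```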