Queer European MD passionate about IT

Mission443Solutions.Rmd 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. ---
  2. title: "Hypothesis Testing in R: Guided Project Solutions"
  3. output: html_document
  4. ---
  5. We would like to remind our students that our solutions represent just one of the many ways that a programmer might perform the analyses. This solution merely provides a platform for those who need a bit more guidance.
  6. ```{r setup }
  7. library(tidyverse)
  8. ```
  9. # Data Import
  10. ```{r}
  # Load the raw Jeopardy data set from the project's data directory
  jeopardy <- read_csv(file = "./data/jeopardy.csv")
  12. ```
  13. ```{r}
  # Preview the first rows of the data
  jeopardy %>% head()
  15. ```
  16. ```{r}
  # List the current column names
  names(jeopardy)
  18. ```
  19. ```{r}
  # Replace the column headers with snake_case names.
  # (the clean_names() function from the janitor package would have been great here too)
  names(jeopardy) <- c(
    "show_number", "air_date", "round", "category",
    "value", "question", "answer"
  )
  22. ```
  23. ```{r}
  # Report the storage type of every column. vapply() always returns a
  # character vector, whereas sapply() changes its return type with the input
  # (for example, it returns a list when there are no columns).
  vapply(jeopardy, typeof, FUN.VALUE = character(1))
  25. ```
  26. # Fixing Data Types
  27. ```{r}
  # Show the distinct entries of the value column
  jeopardy %>%
    pull(value) %>%
    unique()
  29. ```
  30. ```{r}
  # Drop the rows whose value is "None", strip "$" and "," from the remaining
  # entries, and convert the column to numeric
  jeopardy <- jeopardy %>%
    filter(value != "None") %>%
    mutate(
      value = value %>%
        str_replace_all("[$,]", "") %>%
        as.numeric()
    )
  38. ```
  39. ```{r}
  # Show the distinct entries of the value column
  jeopardy %>%
    pull(value) %>%
    unique()
  41. ```
  42. # Normalizing Text
  43. ```{r}
  # stringr is attached together with the rest of the tidyverse.
  # Each text column is lower-cased, then every character that is not a letter,
  # a digit or a space is removed (note the space inside the character class).
  jeopardy <- jeopardy %>%
    mutate(
      across(
        c(question, answer, category),
        ~ str_replace_all(tolower(.x), "[^A-Za-z0-9 ]", "")
      )
    )
  55. ```
  56. ```{r}
  # Preview the first rows of the data
  jeopardy %>% head()
  58. ```
  59. # Making Dates More Accessible
  60. ```{r}
  # Split air_date on "-" into year, month and day columns, then convert each
  # of the three new columns to numeric
  jeopardy <- jeopardy %>%
    separate(air_date, into = c("year", "month", "day"), sep = "-") %>%
    mutate(across(c(year, month, day), as.numeric))
  68. ```
  69. # Focusing On Particular Subject Areas
  70. ```{r}
  # Total number of questions in the data
  n_questions <- nrow(jeopardy)
  # Expected share of questions inside and outside one given category
  # (3369 is presumably the number of distinct categories -- TODO confirm)
  p_category_expected <- 1 / 3369
  p_not_category_expected <- 3368 / 3369
  74. ```
  75. ```{r}
  categories <- pull(jeopardy, category)
  # Count the questions whose category is exactly "science". `%in%` against a
  # single string is an equality test, not a substring search, so the count is
  # taken over the whole vector at once. This also avoids a loop variable named
  # `c`, which masks base::c().
  n_science_categories <- sum(categories %in% "science")
  # Observed counts: questions in the category vs. all other questions
  science_obs <- c(n_science_categories, n_questions - n_science_categories)
  # Expected proportions under the null hypothesis
  p_expected <- c(1 / 3369, 3368 / 3369)
  chisq.test(science_obs, p = p_expected)
  87. ```
  88. ```{r}
  # Count the questions whose category is exactly "history". `%in%` against a
  # single string is an equality test, not a substring search, so the count is
  # taken over the whole vector at once. This also avoids a loop variable named
  # `c`, which masks base::c().
  n_history_categories <- sum(categories %in% "history")
  # Observed counts: questions in the category vs. all other questions
  history_obs <- c(n_history_categories, n_questions - n_history_categories)
  # Expected proportions under the null hypothesis
  p_expected <- c(1 / 3369, 3368 / 3369)
  chisq.test(history_obs, p = p_expected)
  99. ```
  100. ```{r}
  # Count the questions whose category is exactly "shakespeare". `%in%` against
  # a single string is an equality test, not a substring search, so the count
  # is taken over the whole vector at once. This also avoids a loop variable
  # named `c`, which masks base::c().
  n_shakespeare_categories <- sum(categories %in% "shakespeare")
  # Observed counts: questions in the category vs. all other questions
  shakespeare_obs <- c(
    n_shakespeare_categories,
    n_questions - n_shakespeare_categories
  )
  # Expected proportions under the null hypothesis
  p_expected <- c(1 / 3369, 3368 / 3369)
  chisq.test(shakespeare_obs, p = p_expected)
  111. ```
  112. We see p-values less than 0.05 for each of the hypothesis tests. From this, we would conclude that we should reject the null hypothesis that science doesn't have a higher prevalence than other topics in the Jeopardy data. We would conclude the same with history and Shakespeare.
  113. # Unique Terms in Questions
  114. ```{r}
  # Pull just the questions from the jeopardy data
  questions <- pull(jeopardy, question)
  # Split every question on spaces and pool the resulting words.
  # as.character() keeps the result a character vector when there are no questions.
  all_terms <- as.character(unlist(str_split(questions, " ")))
  # Keep each word of at least 6 characters once, in order of first appearance.
  # unique() replaces appending to terms_used one word at a time, which
  # re-scanned and copied the whole vector for every candidate word.
  # Missing words are skipped instead of raising an error in the comparison.
  is_long_term <- !is.na(all_terms) & nchar(all_terms) >= 6
  terms_used <- unique(all_terms[is_long_term])
  128. ```
  129. # Terms In Low and High Value Questions
  130. ```{r}
  # Going only through the first 20 terms for shortness
  # But you can remove the head() call to perform this code on all the terms
  values <- pull(jeopardy, value)
  # Split each question into words once, instead of once per term
  question_words <- str_split(questions, " ")
  # Questions worth 800 or more count as high value, the rest as low value
  is_high_value <- values >= 800
  # head() returns fewer than 20 terms when fewer exist, where [1:20] would
  # pad the vector with NA
  term_rows <- lapply(head(terms_used, 20), function(term) {
    # TRUE for every question that contains the term as a whole word
    has_term <- vapply(
      question_words,
      function(words) term %in% words,
      FUN.VALUE = logical(1)
    )
    # Questions with a missing value are counted in neither group
    n_high_value <- sum(has_term & is_high_value, na.rm = TRUE)
    n_low_value <- sum(has_term & !is_high_value, na.rm = TRUE)
    # Testing if the counts for high and low value questions deviate from the
    # expected 2/5 vs. 3/5 split
    test <- chisq.test(c(n_high_value, n_low_value), p = c(2 / 5, 3 / 5))
    tibble(
      term = term,
      n_high = n_high_value,
      n_low = n_low_value,
      p_value = test$p.value
    )
  })
  # Bind all rows in one step. Building the table from typed columns keeps the
  # counts and p-values numeric; c(term, counts, p-value) turned them into text.
  value_count_data <- bind_rows(term_rows)
  154. ```
  155. ```{r}
  # Take the value count data and put it in a better format.
  # .name_repair = "minimal" accepts input without column names and skips the
  # deprecated automatic naming; the real names are assigned on the next line.
  tidy_value_count_data <- as_tibble(value_count_data, .name_repair = "minimal")
  colnames(tidy_value_count_data) <- c("term", "n_high", "n_low", "p_value")
  # Store the counts and p-values as numbers rather than text so they can be
  # compared and sorted numerically
  tidy_value_count_data <- tidy_value_count_data %>%
    mutate(across(c(n_high, n_low, p_value), as.numeric))
  head(tidy_value_count_data)
  160. ```
  161. We can see from the output that some of the counts are less than 5. Recall that the chi-squared test is prone to errors when the count in any of the cells is less than 5. We may need to discard these terms and only look at terms where both counts are at least 5.
  162. From the 20 terms that we looked at, it seems that the term "indian" is more associated with high value questions. Interesting!