Queer European MD passionate about IT

Mission572Solutions.Rmd 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. ---
  2. title: 'Guided Project: Analyzing Movie Ratings'
  3. author: "Dataquest"
  4. date: "11/26/2020"
  5. output: html_document
  6. ---
  7. # Introduction
  8. - Title: Movies' ratings versus user votes
  9. - A lot of ranking information is available online for movies, universities, supermarkets, etc. We can use these data to supplement information from another database or to facilitate trend analysis. However, choosing the right ranking criterion is not easy because several may be relevant (e.g., movies' ratings and user votes). In this project, we extract information on the most popular movies from early 2020 and check whether the ratings agree with the number of votes. If they do, we can use either criterion without loss of information.
  10. # Loading the Web Page
  11. ```{r}
  12. # Loading the `rvest`, `dplyr`, and `ggplot2` packages
  13. library(rvest)
  14. library(dplyr)
  15. library(ggplot2)
  16. # Specifying the URL of the page from which we will extract the movie data
  17. url <- "http://dataquestio.github.io/web-scraping-pages/IMDb-DQgp.html"
  18. # Loading the web page content using the `read_html()` function
  19. wp_content <- read_html(url)
  20. ```
  21. # String Manipulation Reminder
  22. ```{r}
  23. # Converting "10.50" into numeric
  24. as.numeric("10.50")
  25. # Converting the vector `c("14.59", "3.14", "55")` into numeric
  26. as.numeric(c("14.59", "3.14", "55"))
  27. # Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric
  28. readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
  29. # Removing the whitespace at the beginning and end of `" Space before and after should disappear "`
  30. stringr::str_trim(" Space before and after should disappear ")
  31. ```
  32. # Extracting Elements from the Header
  33. ```{r}
  # Movie titles
  ## CSS selector matching the anchor elements inside the item headers
  title_selector <- ".lister-item-header a"
  ## Number of elements Selector Gadget reports for this selector
  n_title <- 30
  ## Selecting the title nodes with `html_nodes()` and keeping their text
  ## with `html_text()`
  titles <- html_text(html_nodes(wp_content, title_selector))
  ## Displaying the titles vector
  titles
  # Movie years
  ## Same approach as for the titles, with the year selector
  year_selector <- ".lister-item-year"
  n_year <- 30
  ## The extracted text is parsed straight into numeric years
  years <- readr::parse_number(
    html_text(html_nodes(wp_content, year_selector))
  )
  ## Displaying the years vector
  years
  56. ```
  57. # Extracting Movie's Features
  58. ```{r}
  # Movie runtimes
  ## CSS selector matching the runtime elements
  runtime_selector <- ".runtime"
  ## Number of elements Selector Gadget reports for this selector
  n_runtime <- 30
  ## Selecting the runtime nodes, keeping their text, and parsing that text
  ## into numeric values
  runtimes <- readr::parse_number(
    html_text(html_nodes(wp_content, runtime_selector))
  )
  ## Displaying the runtimes vector
  runtimes
  # Movie genres
  ## Same approach as for the runtimes, with the genre selector
  genre_selector <- ".genre"
  n_genre <- 30
  ## The extracted text is trimmed of surrounding whitespace
  genres <- stringr::str_trim(
    html_text(html_nodes(wp_content, genre_selector))
  )
  ## Displaying the genres vector
  genres
  83. ```
  84. # Extracting Movie's Ratings
  85. ```{r}
  # Movie user ratings
  ## CSS selector matching the user rating elements
  user_rating_selector <- ".ratings-imdb-rating"
  ## Number of elements Selector Gadget reports for this selector
  n_user_rating <- 29
  ## Selecting the rating nodes, reading their `data-value` attribute with
  ## `html_attr()`, and converting it from character to numeric
  user_ratings <- as.numeric(
    html_attr(html_nodes(wp_content, user_rating_selector), "data-value")
  )
  ## Displaying the user ratings vector
  user_ratings
  # Movie metascores
  ## Same approach as before, reading the node text instead of an attribute
  metascore_selector <- ".metascore"
  n_metascore <- 25
  ## The extracted text is trimmed of surrounding whitespace and then
  ## converted to numeric
  metascores <- as.numeric(
    stringr::str_trim(html_text(html_nodes(wp_content, metascore_selector)))
  )
  ## Displaying the metascores vector
  metascores
  111. ```
  112. # Extracting Movie's Votes
  113. ```{r}
  # Movie votes
  ## CSS selector matching the vote elements
  vote_selector <- ".sort-num_votes-visible :nth-child(2)"
  ## Number of elements Selector Gadget reports for this selector
  n_vote <- 29
  ## Selecting the vote nodes, keeping their text, and parsing that text
  ## into numeric values
  votes <- readr::parse_number(
    html_text(html_nodes(wp_content, vote_selector))
  )
  ## Displaying the votes vector
  votes
  127. ```
  128. # Dealing with Missing Data
  129. ```{r}
  130. # Copy-pasting the `append_vector()` in our Markdown file
  # Insert `values` into `vector` right after the given positions.
  #
  # Arguments:
  #   vector           - the vector to extend.
  #   inserted_indices - positions after which a value is inserted. A position
  #                      may be repeated to insert several values at the same
  #                      place, and 0 inserts before the first element.
  #   values           - the values to insert, one per entry of
  #                      `inserted_indices`; a shorter `values` (e.g., a single
  #                      `NA`) is recycled.
  # Returns: a vector of length `length(vector) + length(inserted_indices)`.
  append_vector <- function(vector, inserted_indices, values) {
    ## Recycling `values` so that every inserted index has its own value.
    ## Without this, the final subsetting runs past the end of the combined
    ## vector and silently yields `NA` instead of the requested value
    values <- rep(values, length.out = length(inserted_indices))
    ## Creating the current indices of the vector (`seq_along()` stays empty
    ## for an empty vector, whereas `1:length(vector)` would give `c(1, 0)`)
    vector_current_indices <- seq_along(vector)
    ## Adding increasing fractional offsets (from 0 to 0.9) to the
    ## `inserted_indices` so that each new element sorts after the position
    ## it follows and repeated positions keep their relative order
    new_inserted_indices <- inserted_indices +
      seq(0, 0.9, length.out = length(inserted_indices))
    ## Appending the `new_inserted_indices` to the current vector indices
    indices <- c(vector_current_indices, new_inserted_indices)
    ## Ordering the indices
    ordered_indices <- order(indices)
    ## Appending the new values to the existing vector
    new_vector <- c(vector, values)
    ## Ordering the new vector wrt the ordered indices
    new_vector[ordered_indices]
  }
  # Inserting `NA` into the metascores vector after the positions 1, 1, 1, 13,
  # and 24 with `append_vector()`, and saving the result back in `metascores`
  metascores <- append_vector(
    vector = metascores,
    inserted_indices = c(1, 1, 1, 13, 24),
    values = NA
  )
  metascores
  # Dropping the 17th element from each of the vectors metascores, genres,
  # runtimes, years, and titles
  ## Each result overwrites the vector it came from.
  metascores <- metascores[-17]
  genres <- genres[-17]
  runtimes <- runtimes[-17]
  years <- years[-17]
  titles <- titles[-17]
  155. ```
  156. # Putting It All Together and Visualizing
  157. ```{r}
  # Assembling the previously extracted vectors (titles, years, runtimes,
  # genres, user ratings, metascores, and votes) into one tibble.
  ## `floor()` keeps only the integer part of each user rating. For example,
  ## `3.4` becomes `3`.
  movie_df <- tibble::tibble(
    title = titles,
    year = years,
    runtime = runtimes,
    genre = genres,
    rating = floor(user_ratings),
    metascore = metascores,
    vote = votes
  )
  # Drawing a boxplot of the number of votes for each user rating
  movie_df %>%
    ggplot(aes(x = rating, y = vote, group = rating)) +
    geom_boxplot()
  171. ```