Queer European MD passionate about IT

Mission572Solutions.Rmd 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. ---
  2. title: 'Guided Project: Analyzing Movie Ratings'
  3. author: "Dataquest"
  4. date: "11/26/2020"
  5. output: html_document
  6. ---
  7. # Loading the Web Page
  8. ```{r}
  # Attach the packages this document relies on: `rvest` for scraping,
  # `dplyr` for data manipulation, and `ggplot2` for plotting
  library(rvest)
  library(dplyr)
  library(ggplot2)
  # Address of the saved IMDb listing page that the movie data is scraped from
  url <- "http://dataquestio.github.io/web-scraping-pages/Feature%20Film,%20Released%20between%202020-03-01%20and%202020-07-31%20(Sorted%20by%20Popularity%20Ascending)%20-%20IMDb.html"
  # Fetch the page and parse it with `read_html()`; every later chunk queries
  # this parsed document
  wp_content <- read_html(x = url)
  17. ```
  18. # String Manipulation Reminder
  19. ```{r}
  # A single numeric-looking string converts directly with `as.numeric()`
  as.numeric(x = "10.50")
  # `as.numeric()` is vectorized, so a whole character vector converts at once
  as.numeric(x = c("14.59", "3.14", "55"))
  # `parse_number()` extracts the number from strings that also contain
  # other characters
  readr::parse_number(x = c("14 min", "17,35", "(2012)", "1,2,3,4"))
  # `str_trim()` strips whitespace from the beginning and the end of a string
  stringr::str_trim(string = " Space before and after should disappear ")
  28. ```
  29. # Extracting Elements from the Header
  30. ```{r}
  # ---- Movie titles ----
  # CSS selector used to locate the movie titles
  title_selector <- ".lister-item-header a"
  # Number of elements Selector Gadget reports for this selector
  # (recorded for reference; not used again in this chunk)
  n_title <- 30
  # Select the matching nodes with `html_nodes()` and read their text with
  # `html_text()`
  titles <- html_text(html_nodes(wp_content, title_selector))
  # Display the extracted titles
  titles

  # ---- Movie years ----
  # Same approach as for the titles, with the year selector
  year_selector <- ".lister-item-year"
  n_year <- 30
  # Read the year text and parse it straight into numeric values
  years <- readr::parse_number(
    html_text(html_nodes(wp_content, year_selector))
  )
  # Display the extracted years
  years
  53. ```
  54. # Extracting Movie's Features
  55. ```{r}
  # ---- Movie runtimes ----
  # CSS selector used to locate the movie runtimes
  runtime_selector <- ".runtime"
  # Number of elements Selector Gadget reports for this selector
  # (recorded for reference; not used again in this chunk)
  n_runtime <- 30
  # Read the runtime text with `html_nodes()` + `html_text()`, then parse the
  # numeric part out of each string
  runtimes <- readr::parse_number(
    html_text(html_nodes(wp_content, runtime_selector))
  )
  # Display the extracted runtimes
  runtimes

  # ---- Movie genres ----
  # Same approach as above, with the genre selector
  genre_selector <- ".genre"
  n_genre <- 30
  # Read the genre text and strip the surrounding whitespace
  genres <- stringr::str_trim(
    html_text(html_nodes(wp_content, genre_selector))
  )
  # Display the extracted genres
  genres
  80. ```
  81. # Extracting Movie's Ratings
  82. ```{r}
  # ---- User ratings ----
  # CSS selector used to locate the user ratings
  user_rating_selector <- ".ratings-imdb-rating"
  # Number of elements Selector Gadget reports for this selector
  # (recorded for reference; not used again in this chunk)
  n_user_rating <- 29
  # The rating is read from each node's `data-value` attribute via
  # `html_attr()`, then converted from character to numeric
  user_ratings <- as.numeric(
    html_attr(html_nodes(wp_content, user_rating_selector), "data-value")
  )
  # Display the extracted user ratings
  user_ratings

  # ---- Metascores ----
  # Same approach, but the value is read from the node text
  metascore_selector <- ".metascore"
  n_metascore <- 25
  # Read the text, strip the surrounding whitespace, and convert to numeric
  metascores <- as.numeric(
    stringr::str_trim(html_text(html_nodes(wp_content, metascore_selector)))
  )
  # Display the extracted metascores
  metascores
  108. ```
  109. # Extracting Movie's Votes
  110. ```{r}
  # ---- Vote counts ----
  # CSS selector used to locate the vote counts
  vote_selector <- ".sort-num_votes-visible :nth-child(2)"
  # Number of elements Selector Gadget reports for this selector
  # (recorded for reference; not used again in this chunk)
  n_vote <- 29
  # Read the vote text with `html_nodes()` + `html_text()`, then parse it
  # into numeric values
  votes <- readr::parse_number(
    html_text(html_nodes(wp_content, vote_selector))
  )
  # Display the extracted votes
  votes
  124. ```
  125. # Putting It All Together and Visualizing
  126. ```{r}
  # Assemble the scraped vectors into one data frame with the columns
  # title, year, runtime, genre, rating, and vote.
  # The 17th element is dropped from titles, years, runtimes, and genres;
  # presumably that movie has no rating or vote on the page (those vectors
  # are noted as 29 elements versus 30) -- verify against the page.
  # `floor()` keeps only the integer part of each rating, e.g. 3.4 becomes 3.
  movie_df <- tibble::tibble(
    title = titles[-17],
    year = years[-17],
    runtime = runtimes[-17],
    genre = genres[-17],
    rating = floor(user_ratings),
    vote = votes
  )
  # Boxplot of the number of votes against the (floored) user rating,
  # one box per rating value
  ggplot(movie_df, aes(x = rating, y = vote, group = rating)) +
    geom_boxplot()
  140. ```