123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- ---
- title: 'Guided Project: Analyzing Movie Ratings'
- author: "Dataquest"
- date: "11/26/2020"
- output: html_document
- ---
- - Title: Movies' ratings versus user votes
- - We can usually find a lot of information online about rankings of movies, universities, supermarkets, etc. We can use these data to supplement information from another database or to facilitate trend analysis. However, it's not easy to choose the right criterion because several might be interesting (e.g., movies' ratings and user votes). In this project, we want to extract information on the most popular movies from early 2020 and check whether the ratings are in alignment with the votes. If they are, we can use either one without losing information.
- ```{r}
- library(rvest)
- library(dplyr)
- library(ggplot2)
# Address of the page to scrape (judging by the host name, a copy of an IMDb
# listing served from Dataquest's GitHub Pages site).
url <- "http://dataquestio.github.io/web-scraping-pages/IMDb-DQgp.html"

# Download the page and parse it into a document that the selector-based
# extraction in the following chunks can query.
wp_content <- rvest::read_html(url)
- ```
- ```{r}
# Quick demonstrations of the conversion helpers used in the chunks below.

# Base R: turn numeric-looking strings into numbers (scalar, then vector).
"10.50" %>% as.numeric()
c("14.59", "3.14", "55") %>% as.numeric()

# readr::parse_number() pulls the first number out of each string, ignoring
# the surrounding text; note how it treats the commas in these examples.
c("14 min", "17,35", "(2012)", "1,2,3,4") %>% readr::parse_number()

# stringr::str_trim() removes leading and trailing whitespace.
" Space before and after should disappear " %>% stringr::str_trim()
- ```
- ```{r}
# --- Titles ----------------------------------------------------------------
# CSS selector for the link inside each list item's header.
title_selector <- ".lister-item-header a"
# Presumably the number of matches expected on the page; it is recorded here
# but not used by the code in this document.
n_title <- 30

# Text content of every matching node, printed for inspection.
titles <- html_text(html_nodes(wp_content, title_selector))
titles

# --- Release years ---------------------------------------------------------
year_selector <- ".lister-item-year"
n_year <- 30

# The node text is not a bare number (cf. the "(2012)" example earlier), so
# the numeric part is extracted with parse_number().
years <- readr::parse_number(html_text(html_nodes(wp_content, year_selector)))
years
- ```
- ```{r}
# --- Runtimes --------------------------------------------------------------
runtime_selector <- ".runtime"
# Presumably the expected number of matches; not used by the code below.
n_runtime <- 30

# Keep only the numeric part of each runtime string (cf. the "14 min"
# example earlier in the document).
runtimes <- readr::parse_number(
  html_text(html_nodes(wp_content, runtime_selector))
)
runtimes

# --- Genres ----------------------------------------------------------------
genre_selector <- ".genre"
n_genre <- 30

# Genres stay as text; only the surrounding whitespace is stripped.
genres <- stringr::str_trim(
  html_text(html_nodes(wp_content, genre_selector))
)
genres
- ```
- ```{r}
# --- User ratings ----------------------------------------------------------
user_rating_selector <- ".ratings-imdb-rating"
# Presumably the expected number of matches (one fewer than the 30 recorded
# for titles); not used by the code below.
n_user_rating <- 29

# The rating is read from each node's `data-value` attribute rather than from
# its text, then converted from character to numeric.
user_ratings <- as.numeric(
  html_attr(html_nodes(wp_content, user_rating_selector), "data-value")
)
user_ratings

# --- Metascores ------------------------------------------------------------
metascore_selector <- ".metascore"
# Presumably the expected number of matches (five fewer than titles); the
# missing entries are filled with NA in a later chunk.
n_metascore <- 25

# Node text -> trimmed text -> numeric.
metascores <- as.numeric(
  stringr::str_trim(
    html_text(html_nodes(wp_content, metascore_selector))
  )
)
metascores
- ```
- ```{r}
# --- Votes -----------------------------------------------------------------
# Second child of each element carrying the `sort-num_votes-visible` class.
vote_selector <- ".sort-num_votes-visible :nth-child(2)"
# Presumably the expected number of matches; not used by the code below.
n_vote <- 29

# Extract the node text and convert it to a number with parse_number().
votes <- readr::parse_number(
  html_text(html_nodes(wp_content, vote_selector))
)
votes
- ```
- ```{r}
#' Insert values into a vector after the given positions
#'
#' @param vector Vector to extend.
#' @param inserted_indices Numeric positions in `vector` after which a value
#'   is inserted. A position may be repeated to insert several values at the
#'   same place; they are inserted in the order given.
#' @param values Values to insert, one per entry of `inserted_indices`. A
#'   shorter vector (e.g. a single `NA`) is recycled to that length.
#' @return `vector` with the values inserted; combined with `c()`, so the
#'   usual type coercion applies.
append_vector <- function(vector, inserted_indices, values) {
  n_inserted <- length(inserted_indices)
  if (n_inserted == 0) {
    return(vector)
  }

  # Recycle so that one value can fill every insertion point. Without this,
  # only the first insertion receives the value and the others become NA.
  values <- rep_len(values, n_inserted)

  # Each insertion gets a sort key of "position + fraction". The fractions
  # stay below 1, so an inserted value can never move past the next original
  # element, and they increase, so repeated positions keep their given order.
  insertion_keys <- inserted_indices + seq(0, 0.9, length.out = n_inserted)

  # seq_along() (rather than 1:length()) keeps an empty `vector` safe.
  sort_keys <- c(seq_along(vector), insertion_keys)

  # Ties between an original element and an insertion at the same key resolve
  # in favour of the original, because order() keeps the earlier entry first.
  c(vector, values)[order(sort_keys)]
}
# Pad `metascores` with NA: three after position 1, one after position 13 and
# one after position 24 -- presumably the movies that have no metascore on
# the page, so that the vector lines up with the other columns.
metascores <- append_vector(
  metascores,
  inserted_indices = c(1, 1, 1, 13, 24),
  values = NA
)
metascores

# Drop the 17th entry from every 30-element vector. `user_ratings` and
# `votes` are left alone -- presumably because the 17th movie has neither a
# rating nor votes, which would explain why only 29 were recorded for them.
metascores <- metascores[-17]
genres <- genres[-17]
runtimes <- runtimes[-17]
years <- years[-17]
titles <- titles[-17]
- ```
- ```{r}
# Combine the aligned vectors into one tibble, one row per movie. Ratings are
# rounded down to whole numbers so they can serve as groups in the plot.
movie_df <- tibble::tibble(
  title = titles,
  year = years,
  runtime = runtimes,
  genre = genres,
  rating = floor(user_ratings),
  metascore = metascores,
  vote = votes
)

# One box per whole-number rating, showing the distribution of vote counts.
movie_df %>%
  ggplot(aes(x = rating, y = vote, group = rating)) +
  geom_boxplot()
- ```
|