123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- ---
- title: 'Guided Project: Analyzing Movie Ratings'
- author: "Dataquest"
- date: "11/26/2020"
- output: html_document
- ---
- # Loading the Web Page
- ```{r}
- # Attaching the packages used throughout: `rvest`, `dplyr`, and `ggplot2`
- library(rvest)
- library(dplyr)
- library(ggplot2)
- # Address of the web page the movie data will be extracted from
- url <- "http://dataquestio.github.io/web-scraping-pages/Feature%20Film,%20Released%20between%202020-03-01%20and%202020-07-31%20(Sorted%20by%20Popularity%20Ascending)%20-%20IMDb.html"
- # Fetching and parsing the page content with `read_html()`
- wp_content <- url %>% read_html()
- ```
- # String Manipulation Reminder
- ```{r}
- # Converting the single string "10.50" into numeric with base R's `as.numeric()`
- as.numeric("10.50")
- # Converting the character vector `c("14.59", "3.14", "55")` into numeric, element by element
- as.numeric(c("14.59", "3.14", "55"))
- # Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric; `parse_number()` ignores the surrounding non-numeric text, and with readr's default locale `,` is a grouping mark, so "17,35" is read as 1735 (not 17.35)
- readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
- # Removing whitespace at the beginning and end of `" Space before and after should disappear "`
- stringr::str_trim(" Space before and after should disappear ")
- ```
- # Extracting Elements from the Header
- ```{r}
- # Extracting the movies' titles
- ## CSS selector for the title: `a` elements inside `.lister-item-header`
- title_selector <- ".lister-item-header a"
- ## Number of elements Selector Gadget reports for this selector
- n_title <- 30
- ## Selecting the matching nodes with `html_nodes()` and reading their text with `html_text()`
- titles <- wp_content %>%
-   html_nodes(title_selector) %>%
-   html_text()
- ## Displaying the titles vector
- titles
- # Extracting the movies' years
- ## Same selector / count / extraction steps as for the titles
- year_selector <- ".lister-item-year"
- n_year <- 30
- ## The scraped text is character, so the pipeline ends by pulling out the
- ## number from each string with `readr::parse_number()`
- years <- wp_content %>%
-   html_nodes(year_selector) %>%
-   html_text() %>%
-   readr::parse_number()
- ## Displaying the years vector
- years
- ```
- # Extracting Movie's Features
- ```{r}
- # Extracting the movies' runtimes
- ## CSS selector for the runtime element
- runtime_selector <- ".runtime"
- ## Number of elements Selector Gadget reports for this selector
- n_runtime <- 30
- ## Selecting the nodes with `html_nodes()`, reading their text with `html_text()`,
- ## then converting the character values to numeric with `readr::parse_number()`
- runtimes <- wp_content %>%
-   html_nodes(runtime_selector) %>%
-   html_text() %>%
-   readr::parse_number()
- ## Displaying the runtimes vector
- runtimes
- # Extracting the movies' genres
- ## Same selector / count / extraction steps as for the runtimes
- genre_selector <- ".genre"
- n_genre <- 30
- ## The genre text is kept as character; only the whitespace around each
- ## value is stripped with `stringr::str_trim()`
- genres <- wp_content %>%
-   html_nodes(genre_selector) %>%
-   html_text() %>%
-   stringr::str_trim()
- ## Displaying the genres vector
- genres
- ```
- # Extracting Movie's Ratings
- ```{r}
- # Extracting the movies' user ratings
- ## CSS selector for the user rating element
- user_rating_selector <- ".ratings-imdb-rating"
- ## Number of elements Selector Gadget reports for this selector
- n_user_rating <- 29
- ## The rating is read from the `data-value` attribute (via `html_attr()`)
- ## rather than from the node text, then converted from character to numeric
- user_ratings <- wp_content %>%
-   html_nodes(user_rating_selector) %>%
-   html_attr("data-value") %>%
-   as.numeric()
- ## Displaying the user ratings vector
- user_ratings
- # Extracting the movies' metascores
- ## Same selector / count / extraction steps, reading the node text this time
- metascore_selector <- ".metascore"
- n_metascore <- 25
- ## Surrounding whitespace is trimmed before the text is converted to numeric
- metascores <- wp_content %>%
-   html_nodes(metascore_selector) %>%
-   html_text() %>%
-   stringr::str_trim() %>%
-   as.numeric()
- ## Displaying the metascores vector
- metascores
- ```
- # Extracting Movie's Votes
- ```{r}
- # Extracting the movies' votes
- ## CSS selector for the vote count: second child inside `.sort-num_votes-visible`
- vote_selector <- ".sort-num_votes-visible :nth-child(2)"
- ## Number of elements Selector Gadget reports for this selector
- n_vote <- 29
- ## Selecting the nodes with `html_nodes()`, reading their text with `html_text()`,
- ## then converting the character values to numeric with `readr::parse_number()`
- votes <- wp_content %>%
-   html_nodes(vote_selector) %>%
-   html_text() %>%
-   readr::parse_number()
- ## Displaying the votes vector
- votes
- ```
- # Putting It All Together and Visualizing
- ```{r}
- # Building a data frame from the vectors extracted earlier: titles, years, runtimes, genres, user ratings, and votes.
- ## The 17th element is dropped from titles, years, runtimes, and genres (30
- ## elements each per Selector Gadget) so they have the same length as the
- ## user ratings and votes (29 each); presumably the 17th movie has no rating yet.
- ## `floor()` keeps only the integer part of each user rating. For example, `3.4` becomes `3`.
- movie_df <- tibble::tibble(
-   title = titles[-17],
-   year = years[-17],
-   runtime = runtimes[-17],
-   genre = genres[-17],
-   rating = floor(user_ratings),
-   vote = votes
- )
- # Creating a boxplot that shows the number of votes against the user rating
- # (one box per integer rating, hence `group = rating`)
- movie_df %>%
-   ggplot(aes(x = rating, y = vote, group = rating)) +
-   geom_boxplot()
- ```
|