Davte
/
solutions


			
---
title: 'Guided Project: Analyzing Movie Ratings'
author: "Dataquest"
date: "11/26/2020"
output: html_document
---

# Loading the Web Page
```{r}
# Loading the `rvest`, `dplyr`, and `ggplot2` packages
library(rvest)
library(dplyr)
library(ggplot2)

# Address of the archived IMDb listing page from which the movie data is
# scraped. It is requested over HTTPS rather than plain HTTP so the page
# content is protected against modification in transit.
url <- "https://dataquestio.github.io/web-scraping-pages/Feature%20Film,%20Released%20between%202020-03-01%20and%202020-07-31%20(Sorted%20by%20Popularity%20Ascending)%20-%20IMDb.html"

# Download and parse the page with `read_html()`. `wp_content` is the parsed
# document that the following chunks query with CSS selectors.
wp_content <- read_html(url)
```

# String Manipulation Reminder
```{r}
# A single numeric string, "10.50", converted to a number
"10.50" %>% as.numeric()

# `as.numeric()` applied to the whole character vector
# `c("14.59", "3.14", "55")` at once
c("14.59", "3.14", "55") %>% as.numeric()

# Strings that mix digits with other characters,
# `c("14 min", "17,35", "(2012)", "1,2,3,4")`, parsed with
# `readr::parse_number()`
c("14 min", "17,35", "(2012)", "1,2,3,4") %>% readr::parse_number()

# Leading and trailing whitespace stripped from
# `" Space before and after should disappear     "`
" Space before and after should disappear     " %>% stringr::str_trim()
```

# Extracting Elements from the Header
```{r}

# Movie titles
## CSS selector for the title link of each listed movie
title_selector <- ".lister-item-header a"

## Number of elements Selector Gadget reports for this selector
n_title <- 30

## Select the matching nodes with `html_nodes()` and read their text with
## `html_text()`
titles <- html_text(html_nodes(wp_content, title_selector))

## Display the extracted titles
titles

# Movie release years
## Same approach as for the titles: selector, expected count, extraction
year_selector <- ".lister-item-year"
n_year <- 30

## Read the text of each year node and parse it straight into a number
years <- wp_content %>%
  html_nodes(year_selector) %>%
  html_text() %>%
  readr::parse_number()

## Display the extracted years
years
```

# Extracting Movies' Features
```{r}

# Movie runtimes
## CSS selector for the runtime of each listed movie
runtime_selector <- ".runtime"

## Number of elements Selector Gadget reports for this selector
n_runtime <- 30

## Read the text of each runtime node with `html_nodes()` and `html_text()`,
## then parse it from character to numeric
runtimes <- wp_content %>%
  html_nodes(runtime_selector) %>%
  html_text() %>%
  readr::parse_number()

## Display the extracted runtimes
runtimes

# Movie genres
## Same approach as for the runtimes: selector, expected count, extraction
genre_selector <- ".genre"
n_genre <- 30

## Read the text of each genre node and strip the surrounding whitespace
genres <- wp_content %>%
  html_nodes(genre_selector) %>%
  html_text() %>%
  stringr::str_trim()

## Display the extracted genres
genres
```

# Extracting Movies' Ratings
```{r}
# Movie user ratings
## CSS selector for the user-rating element of each listed movie
user_rating_selector <- ".ratings-imdb-rating"

## Number of elements Selector Gadget reports for this selector
n_user_rating <- 29

## The rating is read from the `data-value` attribute of each node (via
## `html_attr()`, not from the node text) and converted to numeric
user_ratings <- wp_content %>%
  html_nodes(user_rating_selector) %>%
  html_attr("data-value") %>%
  as.numeric()

## Display the extracted user ratings
user_ratings

# Movie metascores
## Same approach as for the user ratings, but reading the node text
metascore_selector <- ".metascore"
n_metascore <- 25

## Read the text of each metascore node, strip the surrounding whitespace,
## and convert the result to numeric
metascores <- wp_content %>%
  html_nodes(metascore_selector) %>%
  html_text() %>%
  stringr::str_trim() %>%
  as.numeric()

## Display the extracted metascores
metascores
```

# Extracting Movies' Votes
```{r}

# Movie vote counts
## CSS selector for the vote-count element of each listed movie
vote_selector <- ".sort-num_votes-visible :nth-child(2)"

## Number of elements Selector Gadget reports for this selector
n_vote <- 29

## Read the text of each vote node with `html_nodes()` and `html_text()`,
## then parse it from character to numeric
votes <- wp_content %>%
  html_nodes(vote_selector) %>%
  html_text() %>%
  readr::parse_number()

## Display the extracted vote counts
votes
```


# Putting It All Together and Visualizing
```{r}
# Assemble the previously extracted vectors into one table with the columns
# title, year, runtime, genre, rating, and vote.
## The 17th element is dropped from titles, years, runtimes, and genres.
## User ratings are truncated to their integer part with `floor()`, so `3.4`
## becomes `3`.
movie_df <- tibble::tibble(
  title = titles[-17],
  year = years[-17],
  runtime = runtimes[-17],
  genre = genres[-17],
  rating = floor(user_ratings),
  vote = votes
)

# Boxplot of the number of votes against the (integer) user rating, with one
# box per rating value
movie_df %>%
  ggplot(aes(x = rating, y = vote, group = rating)) +
  geom_boxplot()
```