4 gadi atpakaļ · 628fe36cdf
--- a/Mission571Solutions.Rmd
+++ b/Mission571Solutions.Rmd
@@ -6,17 +6,16 @@ output: html_document
 
				 ---
			
 
				 
			
 
				 # Introduction
			
 
				+
			
 
				 - Title: Analyzing New York solar data.
			
 
				-- By using APIs, we have access to an astronomical amount of data available only online. In this study, we want to extract New York solar data. Such data can, for example, allow us to decide on average the most fruitful periods of the year for solar panel deployment.
			
 
				-        
			
 
				+- Using APIs gives us access to an incredible amount of data only available online. In this exercise, we want to extract New York City solar data. Such data can, for example, allow us to determine on average the most productive periods of the year for solar panel deployment.
			
 
				+
			
 
				 # Finding the Suitable Endpoint and Parameters to Query the API
			
 
				 ```{r}
			
 
				 # Storing my api key in a variable
			
 
				 the_key = "" #TODO Store your API key here
			
 
				-
			
 
				 # Identifying the API URL
			
 
				 url <- "https://developer.nrel.gov/api/solar/solar_resource/v1.json"
			
 
				-
			
 
				 # Specifying the necessary parameters to request the New York City solar data
			
 
				 parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
			
 
				 ```
			
@@ -25,22 +24,17 @@ parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
 
				 ```{r}
			
 
				 # Loading the `httr` package
			
 
				 library(httr)
			
 
				-
			
 
				 # Using the `GET()` function to request the data from the API with `url` and `parameters_list`
			
 
				 response <- GET(url, query = parameters_list)
			
 
				-
			
 
				 # Tracking errors 
			
 
				 ## Displaying the status code with the `status_code()` function
			
 
				 status <- status_code(response)
			
 
				 status
			
 
				-
			
 
				 ## Displaying the API response format
			
 
				 response_type <- http_type(response)
			
 
				 response_type
			
 
				-
			
 
				 # Extracting the API response content as text
			
 
				 content <- content(response, "text")
			
 
				-
			
 
				 # Displaying this content to check how it looks visually.
			
 
				 print(content)
			
 
				 ```
			
@@ -49,62 +43,48 @@ print(content)
 
				 ```{r}
			
 
				 # Parsing the `content` text into an R object using the `jsonlite::fromJSON()` function
			
 
				 json_lists <- jsonlite::fromJSON(content)
			
 
				-
			
 
				 # Displaying the structure of the R object using the `str()` function
			
 
				 str(json_lists)
			
 
				 ```
			
 
				 
			
 
				 # How to Create a Dataframe from a Complex List
			
 
				-
			
 
				 # Building a Dataframe from a Complex List
			
 
				 ```{r}
			
 
				 # Extracting the outputs data
			
 
				 outputs_list <- json_lists$outputs
			
 
				-
			
 
				 # Extracting the monthly vector (`monthly`) from the (`avg_dni`) list in the outputs data
			
 
				 avg_dni <- outputs_list$avg_dni$monthly
			
 
				-
			
 
				 # Extracting the monthly vector (`monthly`) from the (`avg_ghi`) list in the outputs data
			
 
				 avg_ghi <- outputs_list$avg_ghi$monthly
			
 
				-
			
 
				 # Extracting the monthly vector (`monthly`) from the (`avg_lat_tilt`) list in the outputs data
			
 
				 avg_lat_tilt <- outputs_list$avg_lat_tilt$monthly
			
 
				-
			
 
				 # Combining the monthly vectors into a dataframe using the `tibble::tibble()` function
			
 
				 ## Adding the `month` column containing month abbreviations: `Jan`, `Feb`,...,`Dec`
			
 
				 dataframe <- tibble::tibble("month" = month.abb,
			
 
				                             "avg_dni" = avg_dni, 
			
 
				                             "avg_ghi" = avg_ghi, 
			
 
				                             "avg_lat_tilt" = avg_lat_tilt)
			
 
				-
			
 
				 # Displaying the dataframe
			
 
				 dataframe
			
 
				 ```
			
 
				-
			
 
				 - (Instruction 4's answer)
			
 
				-We can see that all the columns are still lists containing one item. For future use of this dataframe, it would probably be necessary to convert these columns to numeric.
			
 
				+We can see that all the columns are still lists containing one item. For future use of this dataframe, it would likely be necessary to convert these columns to numeric data type.
			
 
				 
			
 
				 # Extracting a Dataframe from a Complex List: 
			
 
				 ```{r}
			
 
				 # Extracting the outputs list
			
 
				 outputs_list <- json_lists$outputs
			
 
				-
			
 
				 # Simplifying the outputs list
			
 
				 simplified_outputs_list <- unlist(outputs_list)
			
 
				-
			
 
				 # Restructuring the simplified list into a matrix of 13 rows (the annual value and 12 months values)
			
 
				 data_matrix <- matrix(data = simplified_outputs_list, nrow = 13)
			
 
				-
			
 
				 # Removing the annual values from the data matrix
			
 
				 data_matrix <- data_matrix[-1, ]
			
 
				-
			
 
				 # Converting the matrix into a dataframe using the `as.data.frame()` function
			
 
				 another_dataframe <- as.data.frame(data_matrix)
			
 
				-
			
 
				 # Displaying the dataframe
			
 
				 another_dataframe
			
 
				 ```
			
 
				-
			
 
				 - (Instruction 6's answer)
			
 
				 We can see that all the columns are numeric. However, we haven't appended the `month` column yet.
			
 
				 
			
@@ -113,51 +93,39 @@ We can see that all the columns are numeric. However, we haven't appended the `m
 
				 library(httr)
			
 
				 library(dplyr)
			
 
				 the_key = "" #TODO Store your API key here 
			
 
				-
			
 
				 # Creating the custom `nrel_api_json_get_df()` function inspired by what we did in the previous missions
			
 
				 ## The function has two parameters
			
 
				 ### The `endpoint` parameter represents the endpoint we need
			
 
				 ### The `queries` parameter represents the list of API request parameters.
			
 
				 ## Returns a tibble with the columns `month` (taken from `month.abb`), `avg_dni`,
				 ## `avg_ghi` and `avg_lat_tilt`; the last three are the `monthly` entries of the
				 ## response `outputs`, coerced to numeric.
				 ## Stops with an error when the request fails or when the response is not JSON.
				 nrel_api_json_get_df <- function(endpoint, queries = list()) {
				   ## Preparing the URL
				   url <- modify_url("https://developer.nrel.gov", path = endpoint)
				   ## Querying the API
				   response <- GET(url, query = queries)
				   ## Tracking errors: the status details are displayed before aborting
				   if (http_error(response)) {
				     print(status_code(response))
				     print(http_status(response))
				     stop("Something went wrong.", call. = FALSE)
				   }
				   if (http_type(response) != "application/json") {
				     stop("API did not return json", call. = FALSE)
				   }
				   ## Extracting content
				   ## The encoding is given explicitly so that `content()` does not have to guess it
				   json_text <- content(response, as = "text", encoding = "UTF-8")
				   ## Converting content into Dataframe
				   table_lst <- jsonlite::fromJSON(json_text)
				   outputs <- table_lst$outputs
				   ## Returning the dataframe
				   tibble::tibble(
				     "month" = month.abb,
				     "avg_dni" = as.numeric(outputs$avg_dni$monthly),
				     "avg_ghi" = as.numeric(outputs$avg_ghi$monthly),
				     "avg_lat_tilt" = as.numeric(outputs$avg_lat_tilt$monthly)
				   )
				 }
			
 
				-
			
 
				 # Using the custom `nrel_api_json_get_df()` function to extract the solar resource as a dataframe
			
 
				 ## Providing the `"api/solar/solar_resource/v1.json"` as the `endpoint` parameter
			
 
				 ## Providing the `parameters_list` variable as `queries` parameter
			
 
				 solar_resource_df <- nrel_api_json_get_df("api/solar/solar_resource/v1.json", parameters_list)
			
 
				-
			
 
				 # Printing the output dataframe
			
 
				 solar_resource_df
			
 
				 ```
			
@@ -167,29 +135,22 @@ solar_resource_df
 
				 # Loading the `ggplot2` and `dplyr` packages
			
 
				 library(ggplot2)
			
 
				 library(dplyr)
			
 
				-
			
 
				 # Using the `ggplot()` function to plot the `avg_dni` value for each month 
			
 
				 ggplot(data = solar_resource_df,
			
 
				        aes(x = month, y = avg_dni, group = 1)) +
			
 
				   geom_line() +
			
 
				   geom_point() +
			
 
				   theme_bw()
			
 
				-
			
 
				 # Converting the `month` column into factor using the following command  
			
 
				 solar_resource_df <- solar_resource_df %>% 
			
 
				   mutate(month = factor(month, levels = month.abb))
			
 
				-
			
 
				 # Replotting the `avg_dni` value for each month 
			
 
				 ggplot(data = solar_resource_df,
			
 
				        aes(x = month, y = avg_dni, group = 1)) +
			
 
				   geom_line() +
			
 
				   geom_point() +
			
 
				   theme_bw()
			
 
				-
			
 
				 ```
			
 
				-
			
 
				 - (Instruction 5's answer)
			
 
				-The first plot x-axis is ordered alphabetically, while the second is ordered chronologically, from January to December. 
			
 
				-
			
 
				-This operation allows ordering the labels in the plot as we wish.
			
 
				-
			
 
				+The first plot x-axis is ordered alphabetically, while the second is ordered chronologically from January to December. 
			
 
				+This operation allows ordering the labels in the plot as we wish.
			
--- a/Mission572Solutions.Rmd
+++ b/Mission572Solutions.Rmd
@@ -6,8 +6,9 @@ output: html_document
 
				 ---
			
 
				 
			
 
				 # Introduction
			
 
				-- Title: Movie's ratings versus user votes
			
 
				-- Usually, we can find online a lot of information about the ranking of movies, universities, supermarkets. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several criteria can be interesting (e.g., movies' rating and user votes). In this project, we want to extract information on the most famous movies early this year and check if the ratings are in adequacy with the votes. If yes, then we can consider either one or the other without loss of information.
			
 
				+
			
 
				+- Title: Movies' ratings versus user votes
			
 
				+- Usually, we can find a lot of information online about the ranking of movies, universities, supermarkets, etc. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several might be interesting (e.g., movies' ratings and user votes). In this project, we want to extract information on the most popular movies from early 2020 and check if the ratings are in alignment with the votes. If yes, then we can consider either one or the other without loss of information.
			
 
				 
			
 
				 # Loading the Web Page
			
 
				 ```{r}
			
@@ -15,10 +16,8 @@ output: html_document
 
				 library(rvest)
			
 
				 library(dplyr)
			
 
				 library(ggplot2)
			
 
				-
			
 
				 # Specifying the URL where we will extract video data
			
 
				 url <- "http://dataquestio.github.io/web-scraping-pages/IMDb-DQgp.html"
			
 
				-
			
 
				 # Loading the web page content using the `read_html()` function
			
 
				 wp_content <- read_html(url)
			
 
				 ```
			
@@ -27,35 +26,27 @@ wp_content <- read_html(url)
 
				 ```{r}
			
 
				 # Converting "10.50" into numeric
			
 
				 as.numeric("10.50")
			
 
				-
			
 
				 # Converting the vector `c("14.59", "3.14", "55")` into numeric
			
 
				 as.numeric(c("14.59", "3.14", "55"))
			
 
				-
			
 
				 # Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric
			
 
				 readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
			
 
				-
			
 
				 # Removing whitespaces at the beginning and end of `" Space before and after should disappear     "`
			
 
				 stringr::str_trim(" Space before and after should disappear     ")
			
 
				 ```
			
 
				 
			
 
				 # Extracting Elements from the Header
			
 
				 ```{r}
			
 
				-
			
 
				 # Extracting the movie's titles
			
 
				 ## Finding the title CSS selector
			
 
				 title_selector <- ".lister-item-header a"
			
 
				-
			
 
				 ## Identifying the number of elements this selector will select from Selector Gadget 
			
 
				 n_title <- 30
			
 
				-
			
 
				 ## Extracting the movie titles combining the `html_nodes()` and `html_text()` function
			
 
				 titles <- wp_content %>% 
			
 
				   html_nodes(title_selector) %>% 
			
 
				   html_text()
			
 
				-
			
 
				 ## Printing titles vector
			
 
				 titles
			
 
				-
			
 
				 # Extracting the movie's years
			
 
				 ## Using a process similar to the one we used to extract the titles
			
 
				 year_selector <- ".lister-item-year"
			
@@ -63,35 +54,27 @@ n_year <- 30
 
				 years <- wp_content %>% 
			
 
				   html_nodes(year_selector) %>% 
			
 
				   html_text()
			
 
				-
			
 
				 ## Converting the years from character to numeric data type
			
 
				 years <- readr::parse_number(years)
			
 
				-
			
 
				 ## Printing years vector
			
 
				 years
			
 
				 ```
			
 
				 
			
 
				 # Extracting Movie's Features
			
 
				 ```{r}
			
 
				-
			
 
				 # Extracting the movie's runtimes
			
 
				 ## Finding the runtime CSS selector
			
 
				 runtime_selector <- ".runtime"
			
 
				-
			
 
				 ## Identifying the number of elements this selector will select from Selector Gadget 
			
 
				 n_runtime <- 30
			
 
				-
			
 
				 ## Extracting the movie runtimes combining the `html_nodes()` and `html_text()` function
			
 
				 runtimes <- wp_content %>% 
			
 
				   html_nodes(runtime_selector) %>% 
			
 
				   html_text()
			
 
				-
			
 
				 ## Converting the runtimes from character to numeric data type
			
 
				 runtimes <- readr::parse_number(runtimes)
			
 
				-
			
 
				 ## Printing runtimes vector
			
 
				 runtimes
			
 
				-
			
 
				 # Extracting the movie's genres
			
 
				 ## Extracting the movie genres using a similar process as previously
			
 
				 genre_selector <- ".genre"
			
@@ -99,10 +82,8 @@ n_genre <- 30
 
				 genres <- wp_content %>% 
			
 
				   html_nodes(genre_selector) %>% 
			
 
				   html_text()
			
 
				-
			
 
				 ## Removing whitespaces at the end of genre characters
			
 
				 genres <- stringr::str_trim(genres)
			
 
				-
			
 
				 ## Printing genres vector
			
 
				 genres
			
 
				 ```
			
@@ -112,21 +93,16 @@ genres
 
				 # Extracting the movie's user ratings
			
 
				 ## Finding the user rating CSS selector
			
 
				 user_rating_selector <- ".ratings-imdb-rating"
			
 
				-
			
 
				 ## Identifying the number of elements this selector will select from Selector Gadget 
			
 
				 n_user_rating <- 29
			
 
				-
			
 
				 ## Extracting the user rating combining the `html_nodes()` and `html_attr()` function
			
 
				 user_ratings <- wp_content %>% 
			
 
				   html_nodes(user_rating_selector) %>% 
			
 
				   html_attr("data-value")
			
 
				-
			
 
				 ## Converting the user rating from character to numeric data type
			
 
				 user_ratings <- as.numeric(user_ratings)
			
 
				-
			
 
				 ## Printing user ratings vector
			
 
				 user_ratings
			
 
				-
			
 
				 # Extracting the movie's metascores
			
 
				 ## Extracting the movie metascore using a similar process as previously
			
 
				 metascore_selector <- ".metascore"
			
@@ -134,33 +110,26 @@ n_metascore <- 25
 
				 metascores <- wp_content %>% 
			
 
				   html_nodes(metascore_selector) %>% 
			
 
				   html_text()
			
 
				-
			
 
				 ## Removing whitespaces at the end of metascores and converting them into numeric
			
 
				 metascores <- stringr::str_trim(metascores)
			
 
				 metascores <- as.numeric(metascores)
			
 
				-
			
 
				 ## Printing metascores vector
			
 
				 metascores
			
 
				 ```
			
 
				 
			
 
				 # Extracting Movie's Votes
			
 
				 ```{r}
			
 
				-
			
 
				 # Extracting the movie's votes
			
 
				 ## Finding the vote CSS selector
			
 
				 vote_selector <- ".sort-num_votes-visible :nth-child(2)"
			
 
				-
			
 
				 ## Identifying the number of elements this selector will select from Selector Gadget 
			
 
				 n_vote <- 29
			
 
				-
			
 
				 ## Extracting the votes combining the `html_nodes()` and `html_text()` function
			
 
				 votes <- wp_content %>% 
			
 
				   html_nodes(vote_selector) %>% 
			
 
				   html_text()
			
 
				-
			
 
				 ## Converting the vote from character to numeric data type
			
 
				 votes <- readr::parse_number(votes)
			
 
				-
			
 
				 ## Printing votes vector
			
 
				 votes
			
 
				 ```
			
@@ -169,30 +138,22 @@ votes
 
				 ```{r}
			
 
				 # Copy-pasting the `append_vector()` in our Markdown file
			
 
				 ## Inserts `values` into `vector` right after the positions given in `inserted_indices`.
				 ##   vector           - the vector to extend
				 ##   inserted_indices - positions after which a value is inserted; a position may be
				 ##                      repeated to insert several values at the same place
				 ##   values           - values to insert, recycled to one per insertion position
				 ## Returns a vector of length `length(vector) + length(inserted_indices)`.
				 append_vector <- function(vector, inserted_indices, values) {
				   ## Recycling `values` so that every insertion position receives a value
				   ## (a single value is repeated instead of being padded with `NA`)
				   values <- rep_len(values, length(inserted_indices))
				   ## Creating the current indices of the vector
				   ## (`seq_along()` stays empty for an empty vector, unlike `1:length(vector)`)
				   vector_current_indices <- seq_along(vector)
				   ## Adding increasing offsets between `0` and `0.9` to the `inserted_indices`
				   ## so that each inserted value sorts after its position and before the next one
				   new_inserted_indices <- inserted_indices + seq(0, 0.9, length.out = length(inserted_indices))
				   ## Appending the `new_inserted_indices` to the current vector indices
				   indices <- c(vector_current_indices, new_inserted_indices)
				   ## Ordering the indices
				   ordered_indices <- order(indices)
				   ## Appending the new values to the existing vector
				   new_vector <- c(vector, values)
				   ## Ordering the new vector wrt the ordered indices
				   new_vector[ordered_indices]
				 }
			
 
				-
			
 
				 # Using the `append_vector()` function to insert `NA` into the metascores vector after the positions 1, 1, 1, 13, and 24 and saving the result back in metascores vector
			
 
				 metascores <- append_vector(metascores, c(1, 1, 1, 13, 24), NA)
			
 
				 metascores
			
 
				-
			
 
				 # Removing the 17th element from the vectors: titles, years, runtimes, genres, and metascores
			
 
				 ## Saving the result back to these vectors.
			
 
				 titles <- titles[-17]
			
@@ -213,9 +174,8 @@ movie_df <- tibble::tibble("title" = titles,
 
				                            "rating" = floor(user_ratings), 
			
 
				                            "metascore" = metascores,
			
 
				                            "vote" = votes)
			
 
				-
			
 
				 # Creating a boxplot that shows the number of votes against the user rating
			
 
				 ggplot(data = movie_df,
			
 
				        aes(x = rating, y = vote, group = rating)) +
			
 
				   geom_boxplot()
			
 
				-```
			
 
				+```