5 лет назад · 628fe36cdf
--- a/Mission571Solutions.Rmd
+++ b/Mission571Solutions.Rmd
@@ -6,17 +6,16 @@ output: html_document
 
															 ---
														
 
															 # Introduction
														
 
															+
														
 
															 - Title: Analyzing New York solar data.
														
 
															-- By using APIs, we have access to an astronomical amount of data available only online. In this study, we want to extract New York solar data. Such data can, for example, allow us to decide on average the most fruitful periods of the year for solar panel deployment.
														
 
															-        
														
 
															+- Using APIs gives us access to an incredible amount of data only available online. In this exercise, we want to extract New York City solar data. Such data can, for example, allow us to determine on average the most productive periods of the year for solar panel deployment.
														
 
															+
														
 
															 # Finding the Suitable Endpoint and Parameters to Query the API
														
 
															 ```{r}
														
 
															 # Storing my api key in a variable
														
 
															 the_key = "" #TODO Store your API key here
														
 
															-
														
 
															 # Identifying the API URL
														
 
															 url <- "https://developer.nrel.gov/api/solar/solar_resource/v1.json"
														
 
															-
														
 
															 # Specifying the necessary parameters to request the New York City solar data
														
 
															 parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
														
 
															 ```
														
@@ -25,22 +24,17 @@ parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
 
															 ```{r}
														
 
															 # Loading the `httr` package
														
 
															 library(httr)
														
 
															-
														
 
															 # Using the `GET()` function to request the data from the API with `url` and `parameters_list`
														
 
															 response <- GET(url, query = parameters_list)
														
 
															-
														
 
															 # Tracking errors 
														
 
															 ## Displaying the status code with the `status_code()` function
														
 
															 status <- status_code(response)
														
 
															 status
														
 
															-
														
 
															 ## Displaying the API response format
														
 
															 response_type <- http_type(response)
														
 
															 response_type
														
 
															-
														
 
															 # Extracting the API response content as text
														
 
															 content <- content(response, "text")
														
 
															-
														
 
															 # Displaying this content to check how it looks visually.
														
 
															 print(content)
														
 
															 ```
														
@@ -49,62 +43,48 @@ print(content)
 
															 ```{r}
														
 
															 # Parsing the `content` text into an R object using the `jsonlite::fromJSON()` function
														
 
															 json_lists <- jsonlite::fromJSON(content)
														
 
															-
														
 
															 # Displaying the structure of the R object using the `str()` function
														
 
															 str(json_lists)
														
 
															 ```
														
 
															 # How to Create a Dataframe from a Complex List
														
 
															-
														
 
															 # Building a Dataframe from a Complex List
														
 
															 ```{r}
														
 
															 # Extracting the outputs data
														
 
															 outputs_list <- json_lists$outputs
														
 
															-
														
 
															 # Extracting the monthly vector (`monthly`) from the (`avg_dni`) list in the outputs data
														
 
															 avg_dni <- outputs_list$avg_dni$monthly
														
 
															-
														
 
															 # Extracting the monthly vector (`monthly`) from the (`avg_ghi`) list in the outputs data
														
 
															 avg_ghi <- outputs_list$avg_ghi$monthly
														
 
															-
														
 
															 # Extracting the monthly vector (`monthly`) from the (`avg_lat_tilt`) list in the outputs data
														
 
															 avg_lat_tilt <- outputs_list$avg_lat_tilt$monthly
														
 
															-
														
 
															 # Combining the monthly vectors into a dataframe using the `tibble::tibble()` function
														
 
															 ## Adding the `month` column containing month abbreviations: `Jan`, `Feb`,...,`Dec`
														
 
															 dataframe <- tibble::tibble("month" = month.abb,
														
 
															                             "avg_dni" = avg_dni, 
														
 
															                             "avg_ghi" = avg_ghi, 
														
 
															                             "avg_lat_tilt" = avg_lat_tilt)
														
 
															-
														
 
															 # Displaying the dataframe
														
 
															 dataframe
														
 
															 ```
														
 
															-
														
 
															 - (Instruction 4's answer)
														
 
															-We can see that all the columns are still lists containing one item. For future use of this dataframe, it would probably be necessary to convert these columns to numeric.
														
 
															+We can see that all the columns are still lists containing one item. For future use of this dataframe, it would likely be necessary to convert these columns to numeric data type.
														
 
															 # Extracting a Dataframe from a Complex List
														
 
															 ```{r}
														
 
															 # Extracting the outputs list
														
 
															 outputs_list <- json_lists$outputs
														
 
															-
														
 
															 # Simplifying the outputs list
														
 
															 simplified_outputs_list <- unlist(outputs_list)
														
 
															-
														
 
															 # Restructuring the simplified list into a matrix of 13 rows (the annual value and 12 months values)
														
 
															 data_matrix <- matrix(data = simplified_outputs_list, nrow = 13)
														
 
															-
														
 
															 # Removing the annual values from the data matrix
														
 
															 data_matrix <- data_matrix[-1, ]
														
 
															-
														
 
															 # Converting the matrix into a dataframe using the `as.data.frame()` function
														
 
															 another_dataframe <- as.data.frame(data_matrix)
														
 
															-
														
 
															 # Displaying the dataframe
														
 
															 another_dataframe
														
 
															 ```
														
 
															-
														
 
															 - (Instruction 6's answer)
														
 
															 We can see that all the columns are numeric. However, we haven't appended the `month` column yet.
														
@@ -113,51 +93,39 @@ We can see that all the columns are numeric. However, we haven't appended the `m
 
															 library(httr)
														
 
															 library(dplyr)
														
 
															 the_key = "" #TODO Store your API key here 
														
 
															-
														
 
															 # Creating the custom `nrel_api_json_get_df()` function, inspired by what we did in the previous missions
														
 
															 ## The function has two parameters
														
 
															 ### The `endpoint` parameter represents the endpoint we need
														
 
															 ### The `queries` parameter represents the list of API request parameters.
														
 
															 # Query an NREL developer API endpoint and return the monthly solar
															 # averages from its JSON response as a tibble.
															 #
															 # endpoint: path of the endpoint, appended to "https://developer.nrel.gov".
															 # queries:  named list of request parameters sent as the query string.
															 #
															 # Returns a tibble with one row per entry of `month.abb` and the columns
															 # `month`, `avg_dni`, `avg_ghi` and `avg_lat_tilt` (the last three numeric).
															 # Stops with an error when the request fails or the response is not JSON.
															 nrel_api_json_get_df <- function(endpoint, queries = list()) {
															   ## Preparing the URL
															   url <- modify_url("https://developer.nrel.gov", path = endpoint)
															   ## Querying the API
															   response <- GET(url, query = queries)
															   ## Tracking errors
															   ## The HTTP status is carried in the error message itself (rather than
															   ## printed separately) so it is still visible when the error is caught
															   ## or the document is knitted.
															   if (http_error(response)) {
															     stop("Something went wrong. ", http_status(response)$message, call. = FALSE)
															   }
															   if (http_type(response) != "application/json") {
															     stop("API did not return json", call. = FALSE)
															   }
															   ## Extracting content
															   ## Stating the encoding explicitly avoids httr guessing it (and the
															   ## "No encoding supplied" message that comes with the guess).
															   json_text <- content(response, "text", encoding = "UTF-8")
															   ## Converting content into Dataframe
															   table_lst <- jsonlite::fromJSON(json_text)
															   outputs <- table_lst$outputs
															   dataframe <- tibble::tibble("month" = month.abb,
															                               "avg_dni" = as.numeric(outputs$avg_dni$monthly),
															                               "avg_ghi" = as.numeric(outputs$avg_ghi$monthly),
															                               "avg_lat_tilt" = as.numeric(outputs$avg_lat_tilt$monthly))
															   ## Returning the dataframe
															   dataframe
															 }
														
 
															-
														
 
															 # Using the custom `nrel_api_json_get_df()` function to extract the solar resource as a dataframe
														
 
															 ## Providing the `"api/solar/solar_resource/v1.json"` as the `endpoint` parameter
														
 
															 ## Providing the `parameters_list` variable as `queries` parameter
														
 
															 solar_resource_df <- nrel_api_json_get_df("api/solar/solar_resource/v1.json", parameters_list)
														
 
															-
														
 
															 # Printing the output dataframe
														
 
															 solar_resource_df
														
 
															 ```
														
@@ -167,29 +135,22 @@ solar_resource_df
 
															 # Loading the `ggplot2` and `dplyr` packages
														
 
															 library(ggplot2)
														
 
															 library(dplyr)
														
 
															-
														
 
															 # Using the `ggplot()` function to plot the `avg_dni` value for each month 
														
 
															 ggplot(data = solar_resource_df,
														
 
															        aes(x = month, y = avg_dni, group = 1)) +
														
 
															   geom_line() +
														
 
															   geom_point() +
														
 
															   theme_bw()
														
 
															-
														
 
															 # Converting the `month` column into factor using the following command  
														
 
															 solar_resource_df <- solar_resource_df %>% 
														
 
															   mutate(month = factor(month, levels = month.abb))
														
 
															-
														
 
															 # Replotting the `avg_dni` value for each month 
														
 
															 ggplot(data = solar_resource_df,
														
 
															        aes(x = month, y = avg_dni, group = 1)) +
														
 
															   geom_line() +
														
 
															   geom_point() +
														
 
															   theme_bw()
														
 
															-
														
 
															 ```
														
 
															-
														
 
															 - (Instruction 5's answer)
														
 
															-The first plot x-axis is ordered alphabetically, while the second is ordered chronologically, from January to December. 
														
 
															-
														
 
															-This operation allows ordering the labels in the plot as we wish.
														
 
															-
														
 
															+The first plot x-axis is ordered alphabetically, while the second is ordered chronologically from January to December. 
														
 
															+This operation allows ordering the labels in the plot as we wish.
														
--- a/Mission572Solutions.Rmd
+++ b/Mission572Solutions.Rmd
@@ -6,8 +6,9 @@ output: html_document
 
															 ---
														
 
															 # Introduction
														
 
															-- Title: Movie's ratings versus user votes
														
 
															-- Usually, we can find online a lot of information about the ranking of movies, universities, supermarkets. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several criteria can be interesting (e.g., movies' rating and user votes). In this project, we want to extract information on the most famous movies early this year and check if the ratings are in adequacy with the votes. If yes, then we can consider either one or the other without loss of information.
														
 
															+
														
 
															+- Title: Movies' ratings versus user votes
														
 
															+- Usually, we can find a lot of information online about the ranking of movies, universities, supermarkets, etc. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several might be interesting (e.g., movies' ratings and user votes). In this project, we want to extract information on the most popular movies from early 2020 and check if the ratings are in alignment with the votes. If yes, then we can consider either one or the other without loss of information.
														
 
															 # Loading the Web Page
														
 
															 ```{r}
														
@@ -15,10 +16,8 @@ output: html_document
 
															 library(rvest)
														
 
															 library(dplyr)
														
 
															 library(ggplot2)
														
 
															-
														
 
															 # Specifying the URL where we will extract video data
														
 
															 url <- "http://dataquestio.github.io/web-scraping-pages/IMDb-DQgp.html"
														
 
															-
														
 
															 # Loading the web page content using the `read_html()` function
														
 
															 wp_content <- read_html(url)
														
 
															 ```
														
@@ -27,35 +26,27 @@ wp_content <- read_html(url)
 
															 ```{r}
														
 
															 # Converting "10.50" into numeric
														
 
															 as.numeric("10.50")
														
 
															-
														
 
															 # Converting the vector `c("14.59", "3.14", "55")` into numeric
														
 
															 as.numeric(c("14.59", "3.14", "55"))
														
 
															-
														
 
															 # Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric
														
 
															 readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
														
 
															-
														
 
															 # Removing whitespaces at the beginning and end of `" Space before and after should disappear     "`
														
 
															 stringr::str_trim(" Space before and after should disappear     ")
														
 
															 ```
														
 
															 # Extracting Elements from the Header
														
 
															 ```{r}
														
 
															-
														
 
															 # Extracting the movie's titles
														
 
															 ## Finding the title CSS selector
														
 
															 title_selector <- ".lister-item-header a"
														
 
															-
														
 
															 ## Identifying the number of elements this selector will select from Selector Gadget 
														
 
															 n_title <- 30
														
 
															-
														
 
															 ## Extracting the movie titles combining the `html_nodes()` and `html_text()` function
														
 
															 titles <- wp_content %>% 
														
 
															   html_nodes(title_selector) %>% 
														
 
															   html_text()
														
 
															-
														
 
															 ## Printing titles vector
														
 
															 titles
														
 
															-
														
 
															 # Extracting the movie's years
														
 
															 ## Using a process similar to the one we used to extract the titles
														
 
															 year_selector <- ".lister-item-year"
														
@@ -63,35 +54,27 @@ n_year <- 30
 
															 years <- wp_content %>% 
														
 
															   html_nodes(year_selector) %>% 
														
 
															   html_text()
														
 
															-
														
 
															 ## Converting the years from character to numeric data type
														
 
															 years <- readr::parse_number(years)
														
 
															-
														
 
															 ## Printing years vector
														
 
															 years
														
 
															 ```
														
 
															 # Extracting Movie's Features
														
 
															 ```{r}
														
 
															-
														
 
															 # Extracting the movie's runtimes
														
 
															 ## Finding the runtime CSS selector
														
 
															 runtime_selector <- ".runtime"
														
 
															-
														
 
															 ## Identifying the number of elements this selector will select from Selector Gadget 
														
 
															 n_runtime <- 30
														
 
															-
														
 
															 ## Extracting the movie runtimes combining the `html_nodes()` and `html_text()` function
														
 
															 runtimes <- wp_content %>% 
														
 
															   html_nodes(runtime_selector) %>% 
														
 
															   html_text()
														
 
															-
														
 
															 ## Converting the runtimes from character to numeric data type
														
 
															 runtimes <- readr::parse_number(runtimes)
														
 
															-
														
 
															 ## Printing runtimes vector
														
 
															 runtimes
														
 
															-
														
 
															 # Extracting the movie's genres
														
 
															 ## Extracting the movie genres using a similar process as previously
														
 
															 genre_selector <- ".genre"
														
@@ -99,10 +82,8 @@ n_genre <- 30
 
															 genres <- wp_content %>% 
														
 
															   html_nodes(genre_selector) %>% 
														
 
															   html_text()
														
 
															-
														
 
															 ## Removing whitespaces at the end of genre characters
														
 
															 genres <- stringr::str_trim(genres)
														
 
															-
														
 
															 ## Printing genres vector
														
 
															 genres
														
 
															 ```
														
@@ -112,21 +93,16 @@ genres
 
															 # Extracting the movie's user ratings
														
 
															 ## Finding the user rating CSS selector
														
 
															 user_rating_selector <- ".ratings-imdb-rating"
														
 
															-
														
 
															 ## Identifying the number of elements this selector will select from Selector Gadget 
														
 
															 n_user_rating <- 29
														
 
															-
														
 
															 ## Extracting the user rating combining the `html_nodes()` and `html_attr()` function
														
 
															 user_ratings <- wp_content %>% 
														
 
															   html_nodes(user_rating_selector) %>% 
														
 
															   html_attr("data-value")
														
 
															-
														
 
															 ## Converting the user rating from character to numeric data type
														
 
															 user_ratings <- as.numeric(user_ratings)
														
 
															-
														
 
															 ## Printing user ratings vector
														
 
															 user_ratings
														
 
															-
														
 
															 # Extracting the movie's metascores
														
 
															 ## Extracting the movie metascore using a similar process as previously
														
 
															 metascore_selector <- ".metascore"
														
@@ -134,33 +110,26 @@ n_metascore <- 25
 
															 metascores <- wp_content %>% 
														
 
															   html_nodes(metascore_selector) %>% 
														
 
															   html_text()
														
 
															-
														
 
															 ## Removing whitespaces at the end of metascores and converting them into numeric
														
 
															 metascores <- stringr::str_trim(metascores)
														
 
															 metascores <- as.numeric(metascores)
														
 
															-
														
 
															 ## Printing metascores vector
														
 
															 metascores
														
 
															 ```
														
 
															 # Extracting Movie's Votes
														
 
															 ```{r}
														
 
															-
														
 
															 # Extracting the movie's votes
														
 
															 ## Finding the vote CSS selector
														
 
															 vote_selector <- ".sort-num_votes-visible :nth-child(2)"
														
 
															-
														
 
															 ## Identifying the number of elements this selector will select from Selector Gadget 
														
 
															 n_vote <- 29
														
 
															-
														
 
															 ## Extracting the votes combining the `html_nodes()` and `html_text()` function
														
 
															 votes <- wp_content %>% 
														
 
															   html_nodes(vote_selector) %>% 
														
 
															   html_text()
														
 
															-
														
 
															 ## Converting the vote from character to numeric data type
														
 
															 votes <- readr::parse_number(votes)
														
 
															-
														
 
															 ## Printing votes vector
														
 
															 votes
														
 
															 ```
														
@@ -169,30 +138,22 @@ votes
 
															 ```{r}
														
 
															 # Copy-pasting the `append_vector()` in our Markdown file
														
 
															 # Insert `values` into `vector` after the given positions.
															 #
															 # vector:           the vector to extend.
															 # inserted_indices: positions of `vector` after which a value is inserted;
															 #                   a position may be repeated to insert several values
															 #                   at the same place.
															 # values:           value(s) to insert, recycled to one value per entry
															 #                   of `inserted_indices`.
															 #
															 # Returns a vector of length `length(vector) + length(inserted_indices)`.
															 append_vector <- function(vector, inserted_indices, values){
															   ## Recycling `values` so that every inserted index gets its own value
															   ## (a shorter `values` used to be read out of bounds, which silently
															   ## produced `NA` instead of the requested value)
															   values <- rep(values, length.out = length(inserted_indices))
															   ## Creating the current indices of the vector
															   ## (`seq_along()` stays empty for an empty vector, unlike `1:length()`)
															   vector_current_indices <- seq_along(vector)
															   ## Adding an offset between 0 and 0.9 to the `inserted_indices` so that
															   ## each new element sorts after its anchor position, before the next
															   ## position, and repeated positions keep their relative order
															   new_inserted_indices <- inserted_indices + seq(0, 0.9, length.out = length(inserted_indices))
															   ## Appending the `new_inserted_indices` to the current vector indices
															   indices <- c(vector_current_indices, new_inserted_indices)
															   ## Ordering the indices
															   ordered_indices <- order(indices)
															   ## Appending the new values to the existing vector
															   new_vector <- c(vector, values)
															   ## Ordering the new vector wrt the ordered indices
															   new_vector[ordered_indices]
															 }
														
 
															-
														
 
															 # Using the `append_vector()` function to insert `NA` into the metascores vector after the positions 1, 1, 1, 13, and 24 and saving the result back in metascores vector
														
 
															 metascores <- append_vector(metascores, c(1, 1, 1, 13, 24), NA)
														
 
															 metascores
														
 
															-
														
 
															 # Removing the 17th element from the vectors: titles, years, runtimes, genres, and metascores
														
 
															 ## Saving the result back to these vectors.
														
 
															 titles <- titles[-17]
														
@@ -213,9 +174,8 @@ movie_df <- tibble::tibble("title" = titles,
 
															                            "rating" = floor(user_ratings), 
														
 
															                            "metascore" = metascores,
														
 
															                            "vote" = votes)
														
 
															-
														
 
															 # Creating a boxplot that shows the number of votes against the user rating
														
 
															 ggplot(data = movie_df,
														
 
															        aes(x = rating, y = vote, group = rating)) +
														
 
															   geom_boxplot()
														
 
															-```
														
 
															+```