Queer European MD passionate about IT
Pārlūkot izejas kodu

Style edits, formatting

Casey Bates 4 gadi atpakaļ
vecāks
revīzija
628fe36cdf
2 mainītis faili ar 10 papildinājumiem un 89 dzēšanām
  1. 6 45
      Mission571Solutions.Rmd
  2. 4 44
      Mission572Solutions.Rmd

+ 6 - 45
Mission571Solutions.Rmd

@@ -6,17 +6,16 @@ output: html_document
 ---
 
 # Introduction
+
 - Title: Analyzing New York solar data.
-- By using APIs, we have access to an astronomical amount of data available only online. In this study, we want to extract New York solar data. Such data can, for example, allow us to decide on average the most fruitful periods of the year for solar panel deployment.
-        
+- Using APIs gives us access to an incredible amount of data only available online. In this exercise, we want to extract New York City solar data. Such data can, for example, allow us to determine on average the most productive periods of the year for solar panel deployment.
+
 # Finding the Suitable Endpoint and Parameters to Query the API
 ```{r}
 # Storing my api key in a variable
 the_key = "" #TODO Store your API key here
-
 # Identifying the API URL
 url <- "https://developer.nrel.gov/api/solar/solar_resource/v1.json"
-
 # Specifying the necessary parameters to request the New York City solar data
 parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
 ```
@@ -25,22 +24,17 @@ parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
 ```{r}
 # Loading the `httr` package
 library(httr)
-
 # Using the `GET()` function to request the data from the API with `url` and `parameters_list`
 response <- GET(url, query = parameters_list)
-
 # Tracking errors 
 ## Displaying the status code with the `status_code()` function
 status <- status_code(response)
 status
-
 ## Displaying the API response format
 response_type <- http_type(response)
 response_type
-
 # Extracting the API response content as text
 content <- content(response, "text")
-
 # Displaying this content to check how it looks visually.
 print(content)
 ```
@@ -49,62 +43,48 @@ print(content)
 ```{r}
 # Parsing the JSON text stored in `content` into an R object using the `jsonlite::fromJSON()` function
 json_lists <- jsonlite::fromJSON(content)
-
 # Displaying the structure of the R object using the `str()` function
 str(json_lists)
 ```
 
 # How to Create a Dataframe from a Complex List
-
 # Building a Dataframe from a Complex List
 ```{r}
 # Extracting the outputs data
 outputs_list <- json_lists$outputs
-
 # Extracting the monthly vector (`monthly`) from the (`avg_dni`) list in the outputs data
 avg_dni <- outputs_list$avg_dni$monthly
-
 # Extracting the monthly vector (`monthly`) from the (`avg_ghi`) list in the outputs data
 avg_ghi <- outputs_list$avg_ghi$monthly
-
 # Extracting the monthly vector (`monthly`) from the (`avg_lat_tilt`) list in the outputs data
 avg_lat_tilt <- outputs_list$avg_lat_tilt$monthly
-
 # Combining the monthly vectors into a dataframe using the `tibble::tibble()` function
 ## Adding the `month` column containing month abbreviations: `Jan`, `Feb`,...,`Dec`
 dataframe <- tibble::tibble("month" = month.abb,
                             "avg_dni" = avg_dni, 
                             "avg_ghi" = avg_ghi, 
                             "avg_lat_tilt" = avg_lat_tilt)
-
 # Displaying the dataframe
 dataframe
 ```
-
 - (Instruction 4's answer)
-We can see that all the columns are still lists containing one item. For future use of this dataframe, it would probably be necessary to convert these columns to numeric.
+We can see that all the columns are still lists containing one item. For future use of this dataframe, it would likely be necessary to convert these columns to a numeric data type.
 
 # Extracting a Dataframe from a Complex List
 ```{r}
 # Extracting the outputs list
 outputs_list <- json_lists$outputs
-
 # Simplifying the outputs list
 simplified_outputs_list <- unlist(outputs_list)
-
 # Restructuring the simplified list into a matrix of 13 rows (the annual value and the 12 monthly values)
 data_matrix <- matrix(data = simplified_outputs_list, nrow = 13)
-
 # Removing the annual values from the data matrix
 data_matrix <- data_matrix[-1, ]
-
 # Converting the matrix into a dataframe using the `as.data.frame()` function
 another_dataframe <- as.data.frame(data_matrix)
-
 # Displaying the dataframe
 another_dataframe
 ```
-
 - (Instruction 6's answer)
 We can see that all the columns are numeric. However, we haven't appended the `month` column yet.
 
@@ -113,51 +93,39 @@ We can see that all the columns are numeric. However, we haven't appended the `m
 library(httr)
 library(dplyr)
 the_key = "" #TODO Store your API key here 
-
 # Creating the custom `nrel_api_json_get_df()` function, based on what we did in the previous missions
 ## The function has two parameters
 ### The `endpoint` parameter represents the endpoint we need
 ### The `queries` parameter represents the list of API request parameters.
 nrel_api_json_get_df <- function(endpoint, queries = list()) {
-
   ## Preparing the URL 
   url <- modify_url("https://developer.nrel.gov", path = endpoint)
-  
   ## Querying the API
   response <- GET(url, query = queries)
-
   ## Tracking errors
   if ( http_error(response) ){
     print(status_code(response))
     print(http_status(response))
     stop("Something went wrong.", call. = FALSE)
   }
-    
   if (http_type(response) != "application/json") {
     stop("API did not return json", call. = FALSE)
   }
-  
   ## Extracting content
   json_text <- content(response, "text")
-    
   ## Converting content into Dataframe
   table_lst <- jsonlite::fromJSON(json_text)
-
   dataframe <- tibble::tibble("month" = month.abb,
                               "avg_dni" = as.numeric(table_lst$outputs$avg_dni$monthly),
                               "avg_ghi" = as.numeric(table_lst$outputs$avg_ghi$monthly),
                               "avg_lat_tilt" = as.numeric(table_lst$outputs$avg_lat_tilt$monthly))
-    
   ## Returning the dataframe  
   dataframe
-    
 }
-
 # Using the custom `nrel_api_json_get_df()` function to extract the solar resource as a dataframe
 ## Providing the `"api/solar/solar_resource/v1.json"` as the `endpoint` parameter
 ## Providing the `parameters_list` variable as `queries` parameter
 solar_resource_df <- nrel_api_json_get_df("api/solar/solar_resource/v1.json", parameters_list)
-
 # Printing the output dataframe
 solar_resource_df
 ```
@@ -167,29 +135,22 @@ solar_resource_df
 # Loading the `ggplot2` and `dplyr` packages
 library(ggplot2)
 library(dplyr)
-
 # Using the `ggplot()` function to plot the `avg_dni` value for each month 
 ggplot(data = solar_resource_df,
        aes(x = month, y = avg_dni, group = 1)) +
   geom_line() +
   geom_point() +
   theme_bw()
-
 # Converting the `month` column into factor using the following command  
 solar_resource_df <- solar_resource_df %>% 
   mutate(month = factor(month, levels = month.abb))
-
 # Replotting the `avg_dni` value for each month 
 ggplot(data = solar_resource_df,
        aes(x = month, y = avg_dni, group = 1)) +
   geom_line() +
   geom_point() +
   theme_bw()
-
 ```
-
 - (Instruction 5's answer)
-The first plot x-axis is ordered alphabetically, while the second is ordered chronologically, from January to December. 
-
-This operation allows ordering the labels in the plot as we wish.
-
+The x-axis of the first plot is ordered alphabetically, while that of the second is ordered chronologically, from January to December.
+Converting the `month` column into a factor with explicit levels allows us to order the labels in the plot as we wish.

+ 4 - 44
Mission572Solutions.Rmd

@@ -6,8 +6,9 @@ output: html_document
 ---
 
 # Introduction
-- Title: Movie's ratings versus user votes
-- Usually, we can find online a lot of information about the ranking of movies, universities, supermarkets. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several criteria can be interesting (e.g., movies' rating and user votes). In this project, we want to extract information on the most famous movies early this year and check if the ratings are in adequacy with the votes. If yes, then we can consider either one or the other without loss of information.
+
+- Title: Movies' ratings versus user votes
+- Usually, we can find a lot of information online about the ranking of movies, universities, supermarkets, etc. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several might be interesting (e.g., movies' ratings and user votes). In this project, we want to extract information on the most popular movies from early 2020 and check if the ratings are in alignment with the votes. If yes, then we can consider either one or the other without loss of information.
 
 # Loading the Web Page
 ```{r}
@@ -15,10 +16,8 @@ output: html_document
 library(rvest)
 library(dplyr)
 library(ggplot2)
-
 # Specifying the URL from which we will extract the movie data
 url <- "http://dataquestio.github.io/web-scraping-pages/IMDb-DQgp.html"
-
 # Loading the web page content using the `read_html()` function
 wp_content <- read_html(url)
 ```
@@ -27,35 +26,27 @@ wp_content <- read_html(url)
 ```{r}
 # Converting "10.50" into numeric
 as.numeric("10.50")
-
 # Converting the vector `c("14.59", "3.14", "55")` into numeric
 as.numeric(c("14.59", "3.14", "55"))
-
 # Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric
 readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
-
 # Removing whitespaces at the beginning and end of `" Space before and after should disappear     "`
 stringr::str_trim(" Space before and after should disappear     ")
 ```
 
 # Extracting Elements from the Header
 ```{r}
-
 # Extracting the movie's titles
 ## Finding the title CSS selector
 title_selector <- ".lister-item-header a"
-
 ## Identifying the number of elements this selector will select from Selector Gadget 
 n_title <- 30
-
 ## Extracting the movie titles combining the `html_nodes()` and `html_text()` function
 titles <- wp_content %>% 
   html_nodes(title_selector) %>% 
   html_text()
-
 ## Printing titles vector
 titles
-
 # Extracting the movie's years
 ## Using a process similar to the one we used to extract the titles
 year_selector <- ".lister-item-year"
@@ -63,35 +54,27 @@ n_year <- 30
 years <- wp_content %>% 
   html_nodes(year_selector) %>% 
   html_text()
-
 ## Converting the years from character to numeric data type
 years <- readr::parse_number(years)
-
 ## Printing years vector
 years
 ```
 
 # Extracting Movie's Features
 ```{r}
-
 # Extracting the movie's runtimes
 ## Finding the runtime CSS selector
 runtime_selector <- ".runtime"
-
 ## Identifying the number of elements this selector will select from Selector Gadget 
 n_runtime <- 30
-
 ## Extracting the movie runtimes combining the `html_nodes()` and `html_text()` function
 runtimes <- wp_content %>% 
   html_nodes(runtime_selector) %>% 
   html_text()
-
 ## Converting the runtimes from character to numeric data type
 runtimes <- readr::parse_number(runtimes)
-
 ## Printing runtimes vector
 runtimes
-
 # Extracting the movie's genres
 ## Extracting the movie genres using a similar process as previously
 genre_selector <- ".genre"
@@ -99,10 +82,8 @@ n_genre <- 30
 genres <- wp_content %>% 
   html_nodes(genre_selector) %>% 
   html_text()
-
 ## Removing whitespaces at the end of genre characters
 genres <- stringr::str_trim(genres)
-
 ## Printing genres vector
 genres
 ```
@@ -112,21 +93,16 @@ genres
 # Extracting the movie's user ratings
 ## Finding the user rating CSS selector
 user_rating_selector <- ".ratings-imdb-rating"
-
 ## Identifying the number of elements this selector will select from Selector Gadget 
 n_user_rating <- 29
-
 ## Extracting the user rating combining the `html_nodes()` and `html_attr()` function
 user_ratings <- wp_content %>% 
   html_nodes(user_rating_selector) %>% 
   html_attr("data-value")
-
 ## Converting the user rating from character to numeric data type
 user_ratings <- as.numeric(user_ratings)
-
 ## Printing user ratings vector
 user_ratings
-
 # Extracting the movie's metascores
 ## Extracting the movie metascore using a similar process as previously
 metascore_selector <- ".metascore"
@@ -134,33 +110,26 @@ n_metascore <- 25
 metascores <- wp_content %>% 
   html_nodes(metascore_selector) %>% 
   html_text()
-
 ## Removing whitespaces at the end of metascores and converting them into numeric
 metascores <- stringr::str_trim(metascores)
 metascores <- as.numeric(metascores)
-
 ## Printing metascores vector
 metascores
 ```
 
 # Extracting Movie's Votes
 ```{r}
-
 # Extracting the movie's votes
 ## Finding the vote CSS selector
 vote_selector <- ".sort-num_votes-visible :nth-child(2)"
-
 ## Identifying the number of elements this selector will select from Selector Gadget 
 n_vote <- 29
-
 ## Extracting the votes combining the `html_nodes()` and `html_text()` function
 votes <- wp_content %>% 
   html_nodes(vote_selector) %>% 
   html_text()
-
 ## Converting the vote from character to numeric data type
 votes <- readr::parse_number(votes)
-
 ## Printing votes vector
 votes
 ```
@@ -169,30 +138,22 @@ votes
 ```{r}
 # Copy-pasting the `append_vector()` in our Markdown file
 append_vector <- function(vector, inserted_indices, values){
-  
   ## Creating the current indices of the vector
   vector_current_indices <- 1:length(vector)
-  
   ## Adding `0.5` to the `inserted_indices`
   new_inserted_indices <- inserted_indices + seq(0, 0.9, length.out = length(inserted_indices))
-  
   ## Appending the `new_inserted_indices` to the current vector indices
   indices <- c(vector_current_indices, new_inserted_indices)
-  
   ## Ordering the indices
   ordered_indices <- order(indices)
-  
   ## Appending the new value to the existing vector
   new_vector <- c(vector, values)
-  
   ## Ordering the new vector wrt the ordered indices
   new_vector[ordered_indices]
 }
-
 # Using the `append_vector()` function to insert `NA` into the metascores vector after the positions 1, 1, 1, 13, and 24 and saving the result back in metascores vector
 metascores <- append_vector(metascores, c(1, 1, 1, 13, 24), NA)
 metascores
-
 # Removing the 17th element from the vectors: titles, years, runtimes, genres, and metascores
 ## Saving the result back to these vectors.
 titles <- titles[-17]
@@ -213,9 +174,8 @@ movie_df <- tibble::tibble("title" = titles,
                            "rating" = floor(user_ratings), 
                            "metascore" = metascores,
                            "vote" = votes)
-
 # Creating a boxplot that shows the number of votes against the user rating
 ggplot(data = movie_df,
        aes(x = rating, y = vote, group = rating)) +
   geom_boxplot()
-```
+```