Queer European MD passionate about IT
فهرست منبع

apis and web scraping gps

John Aoga 4 سال پیش
والد
کامیت
a45875b02b
2فایلهای تغییر یافته به همراه368 افزوده شده و 0 حذف شده
  1. 187 0
      Mission571Solutions.Rmd
  2. 181 0
      Mission572Solutions.Rmd

+ 187 - 0
Mission571Solutions.Rmd

@@ -0,0 +1,187 @@
+---
+title: 'Guided Project: New York Solar Resource Data'
+author: "Dataquest"
+date: "11/26/2020"
+output: html_document
+---
+
+# Finding the Suitable Endpoint and Parameters to Query the API
+```{r}
+# API key sent with every request
+the_key <- "" #TODO Store your API key here
+
+# URL of the solar resource endpoint (JSON output)
+url <- "https://developer.nrel.gov/api/solar/solar_resource/v1.json"
+
+# Parameters needed to request the New York City solar data:
+# the API key plus the latitude and longitude of the location
+parameters_list <- list(
+  api_key = the_key,
+  lat = 41,
+  lon = -75
+)
+```
+
+# Extracting the New York Solar Resource Data 
+```{r}
+# Loading the `httr` package
+library(httr)
+
+# Requesting the data from the API with `GET()`, sending `parameters_list` as the query
+response <- GET(url = url, query = parameters_list)
+
+# Tracking errors
+## Showing the HTTP status code of the response
+print(status_code(response))
+
+## Showing the format (content type) of the response
+print(http_type(response))
+
+# Reading the body of the response as text
+json_text <- content(response, as = "text")
+
+# Printing the text to check how it looks visually
+print(json_text)
+```
+
+# Parsing the JSON into R Object
+```{r}
+# Converting the JSON text into an R object with `jsonlite::fromJSON()`
+json_lists <- jsonlite::fromJSON(txt = json_text)
+
+# Inspecting the structure of the parsed object with `str()`
+str(object = json_lists)
+```
+
+# How to Create a Dataframe from a Complex List
+
+# Building a Dataframe from a Complex List
+```{r}
+# Pulling the `outputs` element out of the parsed response
+outputs_list <- json_lists$outputs
+
+# Taking the `monthly` element of each of the three measures in the outputs data
+avg_dni <- outputs_list$avg_dni$monthly
+avg_ghi <- outputs_list$avg_ghi$monthly
+avg_lat_tilt <- outputs_list$avg_lat_tilt$monthly
+
+# Assembling the monthly values into a dataframe with `tibble::tibble()`
+## The `month` column holds the month abbreviations: `Jan`, `Feb`,...,`Dec`
+dataframe <- tibble::tibble(
+  avg_dni = avg_dni,
+  avg_ghi = avg_ghi,
+  avg_lat_tilt = avg_lat_tilt,
+  month = month.abb
+)
+
+# Displaying the dataframe
+dataframe
+```
+
+We can see that all the columns are still lists containing one item. For future use of this dataframe, it would probably be necessary to convert these columns to numeric.
+
+# Extracting a Dataframe from a Complex List
+```{r}
+# Pulling the `outputs` element out of the parsed response
+outputs_list <- json_lists$outputs
+
+# Flattening the nested outputs list into a single vector
+simplified_outputs_list <- unlist(outputs_list)
+
+# Reshaping the vector into a matrix of 13 rows (the annual value and 12 monthly values),
+# then dropping the first row, which holds the annual values
+data_matrix <- matrix(simplified_outputs_list, nrow = 13)[-1, ]
+
+# Turning the matrix into a dataframe with `as.data.frame()`
+another_dataframe <- as.data.frame(data_matrix)
+
+# Displaying the dataframe
+another_dataframe
+```
+
+# Putting It All Together
+```{r}
+library(httr)
+library(dplyr)
+the_key <- "" #TODO Store your API key here
+
+# Creating the custom `nrel_api_json_get_df()` function, inspired by what we did in the previous missions
+## The function queries an endpoint of "https://developer.nrel.gov" and returns the monthly solar averages as a dataframe
+## The function has two parameters
+### The `endpoint` parameter represents the endpoint we need (the path appended to the base URL)
+### The `queries` parameter represents the list of API request parameters.
+## It returns a tibble with the columns `avg_dni`, `avg_ghi`, `avg_lat_tilt`, and `month`
+## It stops with an error if the request fails, if the response is not JSON,
+## or if the response does not hold one value per month for each measure
+nrel_api_json_get_df <- function(endpoint, queries = list()) {
+
+  ## Preparing the URL
+  url <- modify_url("https://developer.nrel.gov", path = endpoint)
+
+  ## Querying the API
+  response <- GET(url, query = queries)
+
+  ## Tracking errors
+  ### The HTTP status is part of the error message so it is not lost when printed output is hidden
+  if (http_error(response)) {
+    stop("Something went wrong. ", http_status(response)$message, call. = FALSE)
+  }
+
+  if (http_type(response) != "application/json") {
+    stop("API did not return json", call. = FALSE)
+  }
+
+  ## Extracting content
+  ### The encoding is stated explicitly so `content()` does not have to guess it
+  json_text <- content(response, as = "text", encoding = "UTF-8")
+
+  ## Converting content into Dataframe
+  table_lst <- jsonlite::fromJSON(json_text)
+  outputs <- table_lst$outputs
+
+  ### Helper returning the monthly values of one measure as a numeric vector
+  ### It fails with a clear message when the response does not hold one value per month
+  monthly_values <- function(measure) {
+    entry <- if (is.list(outputs)) outputs[[measure]] else NULL
+    values <- if (is.list(entry)) as.numeric(entry$monthly) else numeric(0)
+    if (length(values) != length(month.abb)) {
+      stop("API response has no monthly values for ", measure, call. = FALSE)
+    }
+    values
+  }
+
+  dataframe <- tibble::tibble(
+    avg_dni = monthly_values("avg_dni"),
+    avg_ghi = monthly_values("avg_ghi"),
+    avg_lat_tilt = monthly_values("avg_lat_tilt"),
+    month = month.abb
+  )
+
+  ## Returning the dataframe
+  dataframe
+
+}
+
+# Fetching the solar resource as a dataframe with the custom `nrel_api_json_get_df()` function
+## The `endpoint` parameter is `"api/solar/solar_resource/v1.json"`
+## The `queries` parameter is the `parameters_list` variable
+solar_resource_df <- nrel_api_json_get_df(
+  endpoint = "api/solar/solar_resource/v1.json",
+  queries = parameters_list
+)
+
+# Printing the output dataframe
+solar_resource_df
+```
+
+# Visualizing New York City Solar Resource Data
+```{r}
+# Loading the `ggplot2` and `dplyr` packages
+library(ggplot2)
+library(dplyr)
+
+# Plotting the `avg_dni` value for each month as a line with points
+solar_resource_df %>%
+  ggplot(aes(x = month, y = avg_dni, group = 1)) +
+  geom_line() +
+  geom_point() +
+  theme_bw()
+
+# Saving the plot made while `month` is not yet a factor
+ggsave(filename = "plot_avg_dni_before_factor.svg")
+
+# Turning the `month` column into a factor whose levels follow `month.abb`
+solar_resource_df <- mutate(
+  solar_resource_df,
+  month = factor(month, levels = month.abb)
+)
+
+# Plotting the `avg_dni` value for each month again, now with `month` as a factor
+solar_resource_df %>%
+  ggplot(aes(x = month, y = avg_dni, group = 1)) +
+  geom_line() +
+  geom_point() +
+  theme_bw()
+
+# Saving the plot made after the conversion
+ggsave(filename = "plot_avg_dni_after_factor.svg")
+```
+
+The x-axis of the first plot is ordered alphabetically, while the x-axis of the second plot follows the natural order of the months, from January to December. 
+
+This operation allows ordering the labels in the plot as we wish.
+

+ 181 - 0
Mission572Solutions.Rmd

@@ -0,0 +1,181 @@
+---
+title: 'Guided Project: Analyzing Movie Ratings'
+author: "Dataquest"
+date: "11/26/2020"
+output: html_document
+---
+
+# Loading the Web Page
+```{r}
+# Loading the `rvest`, `dplyr`, and `ggplot2` packages
+library(rvest)
+library(dplyr)
+library(ggplot2)
+
+# Address of the web page from which we will extract the movie data
+url <- "http://dataquestio.github.io/web-scraping-pages/Feature%20Film,%20Released%20between%202020-03-01%20and%202020-07-31%20(Sorted%20by%20Popularity%20Ascending)%20-%20IMDb.html"
+
+# Reading the web page content with the `read_html()` function
+wp_content <- read_html(x = url)
+```
+
+# String Manipulation Reminder
+```{r}
+# Converting "10.50" into numeric
+as.numeric("10.50")
+
+# Converting the vector `c("14.59", "3.14", "55")` into numeric
+as.numeric(c("14.59", "3.14", "55"))
+
+# Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric
+readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
+
+# Removing whitespace at the beginning and end of `" Space before and after should disappear     "`
+stringr::str_trim(" Space before and after should disappear     ")
+```
+
+# Extracting Elements from the Header
+```{r}
+
+# Extracting the movie's titles
+## CSS selector of the titles
+title_selector <- ".lister-item-header a"
+
+## Number of elements Selector Gadget reports for this selector
+n_title <- 30
+
+## Selecting the nodes with `html_nodes()` and reading their text with `html_text()`
+titles <- html_text(html_nodes(wp_content, title_selector))
+
+## Printing titles vector
+titles
+
+# Extracting the movie's years
+## Same process as for the titles
+year_selector <- ".lister-item-year"
+n_year <- 30
+
+## Reading the text of the nodes and converting it from character to numeric data type
+years <- readr::parse_number(html_text(html_nodes(wp_content, year_selector)))
+
+## Printing years vector
+years
+```
+
+# Extracting Movie's Features
+```{r}
+
+# Extracting the movie's runtimes
+## CSS selector of the runtimes
+runtime_selector <- ".runtime"
+
+## Number of elements Selector Gadget reports for this selector
+n_runtime <- 30
+
+## Selecting the nodes with `html_nodes()`, reading their text with `html_text()`,
+## and converting the text from character to numeric data type
+runtimes <- readr::parse_number(html_text(html_nodes(wp_content, runtime_selector)))
+
+## Printing runtimes vector
+runtimes
+
+# Extracting the movie's genres
+## Same process as for the runtimes
+genre_selector <- ".genre"
+n_genre <- 30
+
+## Reading the text of the nodes and trimming the surrounding whitespace
+genres <- stringr::str_trim(html_text(html_nodes(wp_content, genre_selector)))
+
+## Printing genres vector
+genres
+```
+
+# Extracting Movie's Ratings
+```{r}
+# Extracting the movie's user ratings
+## CSS selector of the user ratings
+user_rating_selector <- ".ratings-imdb-rating"
+
+## Number of elements Selector Gadget reports for this selector
+n_user_rating <- 29
+
+## Selecting the nodes with `html_nodes()`, reading their `data-value` attribute with `html_attr()`,
+## and converting it from character to numeric data type
+user_ratings <- as.numeric(
+  html_attr(html_nodes(wp_content, user_rating_selector), "data-value")
+)
+
+## Printing user ratings vector
+user_ratings
+
+# Extracting the movie's metascores
+## Same process as previously, this time reading the text of the nodes
+metascore_selector <- ".metascore"
+n_metascore <- 25
+
+## Trimming the surrounding whitespace and converting the metascores into numeric
+metascores <- as.numeric(
+  stringr::str_trim(html_text(html_nodes(wp_content, metascore_selector)))
+)
+
+## Printing metascores vector
+metascores
+```
+
+# Extracting Movie's Votes
+```{r}
+
+# Extracting the movie's votes
+## CSS selector of the votes
+vote_selector <- ".sort-num_votes-visible :nth-child(2)"
+
+## Number of elements Selector Gadget reports for this selector
+n_vote <- 29
+
+## Selecting the nodes with `html_nodes()`, reading their text with `html_text()`,
+## and converting the text from character to numeric data type
+votes <- readr::parse_number(html_text(html_nodes(wp_content, vote_selector)))
+
+## Printing votes vector
+votes
+```
+
+
+# Putting It All Together and Visualizing
+```{r}
+# Building a dataframe from the data we previously extracted: titles, years, runtimes, genres, user ratings, and votes.
+## Dropping the 17th element of the vectors: titles, years, runtimes, and genres
+## Keeping only the integer part of the user ratings with the `floor()` function. For example, `3.4` becomes `3`.
+movie_df <- tibble::tibble(
+  title = titles[-17],
+  year = years[-17],
+  runtime = runtimes[-17],
+  genre = genres[-17],
+  rating = floor(user_ratings),
+  vote = votes
+)
+
+# Creating a boxplot that shows the number of votes against the user rating
+movie_df %>%
+  ggplot(aes(x = rating, y = vote, group = rating)) +
+  geom_boxplot()
+```