Here are suggested solutions to the questions in the Data Cleaning With R Guided Project: Exploring NYC Schools Survey Data.

Load the packages you’ll need for your analysis

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(purrr)
library(tidyr)
library(ggplot2)

Import the data into R.

combined <- read_csv("combined.csv") 
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   DBN = col_character(),
##   school_name = col_character(),
##   `Num of SAT Test Takers` = col_integer(),
##   `SAT Critical Reading Avg. Score` = col_integer(),
##   `SAT Math Avg. Score` = col_integer(),
##   `SAT Writing Avg. Score` = col_integer(),
##   avg_sat_score = col_integer(),
##   `Total Exams Taken` = col_integer(),
##   `Number of Exams with scores 3 4 or 5` = col_integer(),
##   avg_class_size = col_integer(),
##   total_enrollment = col_integer(),
##   selfcontained_num = col_integer(),
##   `Total Cohort` = col_integer(),
##   boro = col_character()
## )
## See spec(...) for full column specifications.
survey <- read_tsv("survey_all.txt")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   dbn = col_character(),
##   bn = col_character(),
##   schoolname = col_character(),
##   studentssurveyed = col_character(),
##   schooltype = col_character(),
##   saf_p_11 = col_double(),
##   com_p_11 = col_double(),
##   eng_p_11 = col_double(),
##   aca_p_11 = col_double(),
##   saf_t_11 = col_double(),
##   com_t_11 = col_double(),
##   eng_t_11 = col_double(),
##   aca_t_11 = col_double(),
##   saf_s_11 = col_double(),
##   com_s_11 = col_double(),
##   eng_s_11 = col_double(),
##   aca_s_11 = col_double(),
##   saf_tot_11 = col_double(),
##   com_tot_11 = col_double(),
##   eng_tot_11 = col_double()
##   # ... with 175 more columns
## )
## See spec(...) for full column specifications.
survey_d75 <- read_tsv("survey_d75.txt")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   dbn = col_character(),
##   bn = col_character(),
##   schoolname = col_character(),
##   studentssurveyed = col_character(),
##   schooltype = col_character(),
##   saf_p_11 = col_double(),
##   com_p_11 = col_double(),
##   eng_p_11 = col_double(),
##   aca_p_11 = col_double(),
##   saf_t_11 = col_double(),
##   com_t_11 = col_double(),
##   eng_t_11 = col_double(),
##   aca_t_11 = col_double(),
##   saf_s_11 = col_double(),
##   com_s_11 = col_double(),
##   eng_s_11 = col_double(),
##   aca_s_11 = col_double(),
##   saf_tot_11 = col_double(),
##   com_tot_11 = col_double(),
##   eng_tot_11 = col_double()
##   # ... with 178 more columns
## )
## See spec(...) for full column specifications.

Filter survey data to include only high schools and select columns needed for analysis based on the data dictionary.

survey_select <- survey %>%
  filter(schooltype == "High School") %>%
  select(dbn:aca_tot_11)

Select columns needed for analysis from survey_d75.

survey_d75_select <- survey_d75 %>%       
  select(dbn:aca_tot_11)

Combine survey and survey_d75 data frames.

survey_total <- survey_select %>% 
  bind_rows(survey_d75_select)

Rename survey_total variable dbn to DBN so can use as key to join with the combined data frame.

survey_total <- survey_total %>%
  rename(DBN = dbn)

Join the combined and survey_total data frames. Use left_join() to keep only survey data that correspond to schools for which we have data in combined.

combined_survey <- combined %>%
  left_join(survey_total, by = "DBN")

Create a correlation matrix to look for interesting relationships between pairs of variables in combined_survey and convert it to a tibble so it’s easier to work with using tidyverse tools.

cor_mat <- combined_survey %>%    ## interesting relationshipsS
  select(avg_sat_score, saf_p_11:aca_tot_11) %>%
  cor(use = "pairwise.complete.obs")

cor_tib <- cor_mat %>%
  as_tibble(rownames = "variable")

Look for correlations of other variables with avg_sat_score that are greater than 0.25 or less than -0.25 (strong correlations).

strong_cors <- cor_tib %>%
  select(variable, avg_sat_score) %>%
  filter(avg_sat_score > 0.25 | avg_sat_score < -0.25)  

Make scatter plots of those variables with avg_sat_score to examine relationships more closely.

create_scatter <- function(x, y) {     
  ggplot(data = combined_survey) + 
    aes_string(x = x, y = y) +
    geom_point(alpha = 0.3) +
    theme(panel.background = element_rect(fill = "white"))
}

x_var <- strong_cors$variable[2:5]
y_var <- "avg_sat_score"
  
map2(x_var, y_var, create_scatter)
## [[1]]
## Warning: Removed 137 rows containing missing values (geom_point).

## 
## [[2]]
## Warning: Removed 139 rows containing missing values (geom_point).

## 
## [[3]]
## Warning: Removed 139 rows containing missing values (geom_point).

## 
## [[4]]
## Warning: Removed 137 rows containing missing values (geom_point).

Reshape the data so that you can investigate differences in student, parent, and teacher responses to survey questions.

combined_survey_gather <- combined_survey %>%                         
  gather(key = "survey_question", value = score, saf_p_11:aca_tot_11)

Use str_sub() to create new variables, response_type and question, from the survey_question variable.

combined_survey_gather <- combined_survey_gather %>%
  mutate(response_type = str_sub(survey_question, 4, 6)) %>%   
  mutate(question = str_sub(survey_question, 1, 3))

Replace response_type variable values with names “parent”, “teacher”, “student”, “total” using if_else() function.

combined_survey_gather <- combined_survey_gather %>%
  mutate(response_type = ifelse(response_type  == "_p_", "parent", 
                                ifelse(response_type == "_t_", "teacher",
                                       ifelse(response_type == "_s_", "student", 
                                              ifelse(response_type == "_to", "total", "NA")))))

Make a boxplot to see if there appear to be differences in how the three groups of responders (parents, students, and teachers) answered the four questions.

combined_survey_gather %>%
  filter(response_type != "total") %>%
  ggplot() +
  aes(x = question, y = score, fill = response_type) +
  geom_boxplot()
## Warning: Removed 1268 rows containing non-finite values (stat_boxplot).