Here are suggested solutions to the questions in the Data Cleaning With R Guided Project: Exploring NYC Schools Survey Data.
Load the packages you’ll need for your analysis
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(purrr)
library(tidyr)
library(ggplot2)
Import the data into R.
combined <- read_csv("combined.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## DBN = col_character(),
## school_name = col_character(),
## `Num of SAT Test Takers` = col_integer(),
## `SAT Critical Reading Avg. Score` = col_integer(),
## `SAT Math Avg. Score` = col_integer(),
## `SAT Writing Avg. Score` = col_integer(),
## avg_sat_score = col_integer(),
## `Total Exams Taken` = col_integer(),
## `Number of Exams with scores 3 4 or 5` = col_integer(),
## avg_class_size = col_integer(),
## total_enrollment = col_integer(),
## selfcontained_num = col_integer(),
## `Total Cohort` = col_integer(),
## boro = col_character()
## )
## See spec(...) for full column specifications.
survey <- read_tsv("survey_all.txt")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## dbn = col_character(),
## bn = col_character(),
## schoolname = col_character(),
## studentssurveyed = col_character(),
## schooltype = col_character(),
## saf_p_11 = col_double(),
## com_p_11 = col_double(),
## eng_p_11 = col_double(),
## aca_p_11 = col_double(),
## saf_t_11 = col_double(),
## com_t_11 = col_double(),
## eng_t_11 = col_double(),
## aca_t_11 = col_double(),
## saf_s_11 = col_double(),
## com_s_11 = col_double(),
## eng_s_11 = col_double(),
## aca_s_11 = col_double(),
## saf_tot_11 = col_double(),
## com_tot_11 = col_double(),
## eng_tot_11 = col_double()
## # ... with 175 more columns
## )
## See spec(...) for full column specifications.
survey_d75 <- read_tsv("survey_d75.txt")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## dbn = col_character(),
## bn = col_character(),
## schoolname = col_character(),
## studentssurveyed = col_character(),
## schooltype = col_character(),
## saf_p_11 = col_double(),
## com_p_11 = col_double(),
## eng_p_11 = col_double(),
## aca_p_11 = col_double(),
## saf_t_11 = col_double(),
## com_t_11 = col_double(),
## eng_t_11 = col_double(),
## aca_t_11 = col_double(),
## saf_s_11 = col_double(),
## com_s_11 = col_double(),
## eng_s_11 = col_double(),
## aca_s_11 = col_double(),
## saf_tot_11 = col_double(),
## com_tot_11 = col_double(),
## eng_tot_11 = col_double()
## # ... with 178 more columns
## )
## See spec(...) for full column specifications.
Filter survey
data to include only high schools and select columns needed for analysis based on the data dictionary.
survey_select <- survey %>%
filter(schooltype == "High School") %>%
select(dbn:aca_tot_11)
Select columns needed for analysis from survey_d75
.
survey_d75_select <- survey_d75 %>%
select(dbn:aca_tot_11)
Combine survey
and survey_d75
data frames.
survey_total <- survey_select %>%
bind_rows(survey_d75_select)
Rename survey_total
variable dbn
to DBN
so can use as key to join with the combined
data frame.
survey_total <- survey_total %>%
rename(DBN = dbn)
Join the combined
and survey_total
data frames. Use left_join()
to keep only survey data that correspond to schools for which we have data in combined
.
combined_survey <- combined %>%
left_join(survey_total, by = "DBN")
Create a correlation matrix to look for interesting relationships between pairs of variables in combined_survey
and convert it to a tibble so it’s easier to work with using tidyverse tools.
cor_mat <- combined_survey %>% ## interesting relationshipsS
select(avg_sat_score, saf_p_11:aca_tot_11) %>%
cor(use = "pairwise.complete.obs")
cor_tib <- cor_mat %>%
as_tibble(rownames = "variable")
Look for correlations of other variables with avg_sat_score
that are greater than 0.25 or less than -0.25 (strong correlations).
strong_cors <- cor_tib %>%
select(variable, avg_sat_score) %>%
filter(avg_sat_score > 0.25 | avg_sat_score < -0.25)
Make scatter plots of those variables with avg_sat_score
to examine relationships more closely.
create_scatter <- function(x, y) {
ggplot(data = combined_survey) +
aes_string(x = x, y = y) +
geom_point(alpha = 0.3) +
theme(panel.background = element_rect(fill = "white"))
}
x_var <- strong_cors$variable[2:5]
y_var <- "avg_sat_score"
map2(x_var, y_var, create_scatter)
## [[1]]
## Warning: Removed 137 rows containing missing values (geom_point).
##
## [[2]]
## Warning: Removed 139 rows containing missing values (geom_point).
##
## [[3]]
## Warning: Removed 139 rows containing missing values (geom_point).
##
## [[4]]
## Warning: Removed 137 rows containing missing values (geom_point).
Reshape the data so that you can investigate differences in student, parent, and teacher responses to survey questions.
combined_survey_gather <- combined_survey %>%
gather(key = "survey_question", value = score, saf_p_11:aca_tot_11)
Use str_sub()
to create new variables, response_type
and question
, from the survey_question
variable.
combined_survey_gather <- combined_survey_gather %>%
mutate(response_type = str_sub(survey_question, 4, 6)) %>%
mutate(question = str_sub(survey_question, 1, 3))
Replace response_type
variable values with names “parent”, “teacher”, “student”, “total” using if_else()
function.
combined_survey_gather <- combined_survey_gather %>%
mutate(response_type = ifelse(response_type == "_p_", "parent",
ifelse(response_type == "_t_", "teacher",
ifelse(response_type == "_s_", "student",
ifelse(response_type == "_to", "total", "NA")))))
Make a boxplot to see if there appear to be differences in how the three groups of responders (parents, students, and teachers) answered the four questions.
combined_survey_gather %>%
filter(response_type != "total") %>%
ggplot() +
aes(x = question, y = score, fill = response_type) +
geom_boxplot()
## Warning: Removed 1268 rows containing non-finite values (stat_boxplot).