For this project I downloaded the 2024 Environmental Justice Index for the full state of Illinois at census tract level. The data can be found here: https://www.atsdr.cdc.gov/place-health/php/eji/index.html.
#Load the EJI data (census-tract level, Illinois) from the CDC/ATSDR download
data <- read.csv("EJI_2024_Illinois.csv")
#Load the data dictionary with variable information.
#The first two spreadsheet rows are skipped and replaced with our own
#column names ("DATA SOURCE" keeps its space to mirror the workbook header).
col_list <- c("VARIABLE_NAME_2024", "VARIABLE_NAME_2022", "VARIABLE_DESCRIPTION_2024",
"MODULE", "DOMAIN", "DATA SOURCE", "TABLE_FIELD_CALCULATION_2024", "NOTES")
#read_excel comes from readxl — presumably loaded earlier in the document; verify
dictionary <- read_excel("EJI_DATADICTIONARY_2024.xlsx",
skip = 2,
col_names = col_list)
# Build a lookup of the 2024 estimate variables (names prefixed "E_")
# together with their plain-language descriptions.
variable_list <- dictionary %>%
  filter(grepl("^E_", VARIABLE_NAME_2024)) %>%
  select(VARIABLE_NAME_2024, VARIABLE_DESCRIPTION_2024)
print(variable_list)
## # A tibble: 54 × 2
## VARIABLE_NAME_2024 VARIABLE_DESCRIPTION_2024
## <chr> <chr>
## 1 E_TOTPOP Population estimate, 2018-2022 ACS
## 2 E_DAYPOP Adjunct variable - Estimated daytime population, LandScan…
## 3 E_MINRTY Percentage of persons who identify as Hispanic or Latino …
## 4 E_POV200 Percentage of persons with income below 200% of the feder…
## 5 E_NOHSDP Percentage of persons with no high school diploma (age 25…
## 6 E_UNEMP Percentage of persons who are unemployed
## 7 E_RENTER Percentage of housing units that are renter occupied
## 8 E_HOUBDN Percentage of households that make less than 75,000 who a…
## 9 E_UNINSUR Percentage of persons who are uninsured (i.e., have no he…
## 10 E_NOINT Percentage of persons without internet
## # ℹ 44 more rows
# Identify numeric columns (vapply is type-stable, unlike sapply, so an
# empty data frame cannot silently change the result's type)
numeric_cols <- vapply(data, is.numeric, logical(1))
# Sentinel codes such as -99 mark missing data in the EJI release; recode
# every negative value in the numeric columns to NA. replace() with an
# explicit !is.na() mask avoids relying on R's NA-skipping behavior in
# logical sub-assignment. (Assumes no legitimate negative values — true for
# the percentage/count variables used here; verify if new columns appear.)
data[ , numeric_cols] <- lapply(data[ , numeric_cols], function(x) {
  replace(x, !is.na(x) & x < 0, NA)
})
I wanted to explore a few different imputation approaches and see how they performed when compared with original, non-missing values.
# Fix the RNG state so the masked indices are reproducible
set.seed(987)
# Variables chosen for the imputation experiment
imput_list <- c("E_POV200", "E_NOHSDP", "E_RENTER", "E_TRND", "E_SWND")
# Show their descriptions from the data dictionary
dictionary[dictionary$VARIABLE_NAME_2024 %in% imput_list,
           c("VARIABLE_NAME_2024", "VARIABLE_DESCRIPTION_2024")]
## # A tibble: 5 × 2
## VARIABLE_NAME_2024 VARIABLE_DESCRIPTION_2024
## <chr> <chr>
## 1 E_POV200 Percentage of persons with income below 200% of the federa…
## 2 E_NOHSDP Percentage of persons with no high school diploma (age 25+)
## 3 E_RENTER Percentage of housing units that are renter occupied
## 4 E_SWND Average annualized frequency of strong wind events
## 5 E_TRND Average annualized frequency of tornado events
# Keep only rows complete on the study variables so every masked value has
# a known ground truth.
data <- data[complete.cases(data[ , imput_list]), ]
# Copy to receive the artificial NAs, keeping 'data' intact
data_missing <- data
# For each study variable, hide 5% of values and record the originals.
# Per-variable pieces are collected in a preallocated list and bound once,
# instead of growing a data frame with bind_rows() inside the loop.
# sample() is called with the same arguments in the same order as before,
# so the fixed seed yields identical masked indices.
n_mask <- floor(nrow(data_missing) * 0.05)
pieces <- vector("list", length(imput_list))
for (i in seq_along(imput_list)) {
  var <- imput_list[i]
  na_indices <- sample(seq_len(nrow(data_missing)), size = n_mask, replace = FALSE)
  # Record the true values before overwriting them with NA
  pieces[[i]] <- data.frame(value = data_missing[na_indices, var],
                            var = var,
                            na_indices = na_indices)
  data_missing[na_indices, var] <- NA
}
original <- bind_rows(pieces)
# Confirm the expected count of NAs per variable (type-stable vapply)
vapply(data_missing[ , imput_list], function(x) sum(is.na(x)), numeric(1))
## E_POV200 E_NOHSDP E_RENTER E_TRND E_SWND
## 162 162 162 162 162
The first step is to use the very crude approach of imputing the mean or median value.
# Per-variable mean and median of the observed (non-missing) values.
# Map over the variables and bind once, rather than growing 'mm' inside a
# loop (quadratic copying).
mm <- bind_rows(lapply(imput_list, function(var_d) {
  data_missing %>%
    summarise(
      mean1 = mean(.data[[var_d]], na.rm = TRUE),
      median1 = median(.data[[var_d]], na.rm = TRUE)
    ) %>%
    mutate(var = var_d) %>%
    select(var, mean1, median1)
}))
# Attach the candidate imputations to the held-out originals.
# The explicit key avoids relying on left_join's by-guessing.
compare_mm <- original %>%
  left_join(mm, by = "var")
#Summarize fit for each of the 5 variables.
#' Compare imputed values against held-out originals.
#'
#' @param original Numeric vector of true (held-out) values.
#' @param imputed  Numeric vector of imputed values, same length as original.
#' @param var_name Label placed in the Variable column of the result.
#' @return One-row data.frame with RMSE, R_squared (can be negative when the
#'   imputation is worse than predicting the mean), the paired t-test
#'   p-value (NA if the test cannot run), and the Pearson correlation
#'   (NA when either vector is constant, e.g. mean/median imputation).
compare_imputation <- function(original, imputed, var_name = "Variable") {
  #Check lengths
  if (length(original) != length(imputed)) {
    stop("original and imputed vectors must be of the same length.")
  }
  #Remove NA pairs
  valid_idx <- complete.cases(original, imputed)
  original <- original[valid_idx]
  imputed <- imputed[valid_idx]
  # Fail loudly instead of returning NaN-filled rows downstream
  if (length(original) == 0) {
    stop("no complete pairs of original/imputed values to compare.")
  }
  #Root mean squared error
  rmse <- sqrt(mean((original - imputed)^2))
  #R2 relative to always predicting the mean of the originals
  ss_res <- sum((original - imputed)^2)
  ss_tot <- sum((original - mean(original))^2)
  r_squared <- 1 - (ss_res / ss_tot)
  #Paired t-test for systematic bias. t.test() errors when the paired
  #differences are essentially constant, so fall back to NA in that case.
  ttest_p <- tryCatch(t.test(original, imputed, paired = TRUE)$p.value,
                      error = function(e) NA_real_)
  #Pearson correlation. A constant vector has zero variance, which makes
  #cor() warn and return NA — return NA quietly and deliberately instead.
  cor_value <- if (sd(original) == 0 || sd(imputed) == 0) {
    NA_real_
  } else {
    cor(original, imputed, method = "pearson")
  }
  #Create results table
  results <- data.frame(
    Variable = var_name,
    RMSE = round(rmse, 4),
    R_squared = round(r_squared, 4),
    Paired_ttest_p = signif(ttest_p, 4),
    Correlation = round(cor_value, 4)
  )
  return(results)
}
# Score mean imputation for each study variable and stack the rows
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_mm[compare_mm$var == var_d, ]
  compare_imputation(rows$value, rows$mean1, var_name = var_d)
})
#Combine into one final result table
final_results <- bind_rows(results_list)
print("Mean Imputation Comparison")
## [1] "Mean Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 17.3405 -0.0207 0.07004 NA
## 2 E_NOHSDP 10.0117 -0.0002 0.85550 NA
## 3 E_RENTER 23.2540 -0.0007 0.73430 NA
## 4 E_TRND 0.0478 -0.0058 0.33600 NA
## 5 E_SWND 0.6323 -0.0004 0.80400 NA
# Score median imputation the same way
results_list2 <- lapply(imput_list, function(var_d) {
  rows <- compare_mm[compare_mm$var == var_d, ]
  compare_imputation(rows$value, rows$median1, var_name = var_d)
})
final_results2 <- bind_rows(results_list2)
print("Median Imputation Comparison")
## [1] "Median Imputation Comparison"
print(final_results2)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 17.1676 -0.0004 7.975e-01 NA
## 2 E_NOHSDP 10.4421 -0.0880 2.331e-04 NA
## 3 E_RENTER 23.8723 -0.0546 3.475e-03 NA
## 4 E_TRND 0.0522 -0.2025 5.326e-08 NA
## 5 E_SWND 0.6332 -0.0030 4.893e-01 NA
#Plot the imputed values against original values
# Helper: sorted original values (black line) overlaid with the constant
# mean (dark red) and median (red) imputations, plus labelled annotations.
# mean_off / median_off nudge the text labels off the points; titles,
# labels, and offsets are passed through unchanged from the originals.
plot_mm_comparison <- function(df, var_d, title, y_lab, mean_off, median_off) {
  mean_val <- unique(df$mean1[df$var == var_d])
  median_val <- unique(df$median1[df$var == var_d])
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = mean1), size = 1, alpha = 0.8, color = "darkred") +
    geom_point(aes(x = index, y = median1), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_text(hjust = 0.5),
      axis.text.x = element_blank()
    ) +
    annotate("text", x = 150, y = mean_val + mean_off,
             label = "Mean", color = "darkred", size = 3.5) +
    annotate("text", x = 150, y = median_val + median_off,
             label = "Median", color = "red", size = 3.5)
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_mm_comparison(compare_mm, "E_POV200",
                   "Income Below 200% of the Federal Poverty Level \n Original Values Compared to Mean and Median",
                   "Percentage of Persons", 3, -3)
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_mm_comparison(compare_mm, "E_NOHSDP",
                   "No High School Diploma (age 25+) \n Original Values Compared to Mean and Median",
                   "Percentage of Persons", 3, -3)
#E_RENTER Percentage of housing units that are renter occupied
plot_mm_comparison(compare_mm, "E_RENTER",
                   "Housing Units that are Renter Occupied \n Original Values Compared to Mean and Median",
                   "Percentage of Persons", 3, -3)
#E_SWND Average annualized frequency of strong wind events
# (label offsets flipped here, as in the original, so they don't collide)
plot_mm_comparison(compare_mm, "E_SWND",
                   "Average Annualized Frequency of Strong Wind Events \n Original Values Compared to Mean and Median",
                   "Average", -0.25, 0.25)
#E_TRND Average annualized frequency of tornado events
plot_mm_comparison(compare_mm, "E_TRND",
                   "Average Annualized Frequency of Tornado Events \n Original Values Compared to Mean and Median",
                   "Average", 0.01, -0.01)
rm(compare_mm, mm, results_list, results_list2, final_results, final_results2)
My second approach was to try out linear regression. Given the large number of variables in the dataset, I selected those with correlations > 0.1.
# Linear-regression imputation: for each study variable, fit a model on the
# observed rows using every (non-identifier) variable whose pairwise
# correlation with the target exceeds 0.1 in absolute value, then predict
# the masked rows. Pieces are collected in a preallocated list and bound
# once instead of growing 'lr' inside the loop.
lr_pieces <- vector("list", length(imput_list))
for (i in seq_along(imput_list)) {
  var_d <- imput_list[i]
  # Observed rows only, identifier columns dropped.
  # ('obs_rows' rather than 'complete' so we never shadow mice::complete /
  # tidyr::complete in the workspace.)
  obs_rows <- data_missing %>%
    filter(!is.na(.data[[var_d]])) %>%
    select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
           -COUNTY, -StateDesc, -STATEABBR, -LOCATION)
  # Pairwise-complete correlation of the target with every candidate
  # (vapply is type-stable; NA_real_ keeps the result numeric throughout)
  cor_vals <- vapply(names(obs_rows), function(other_var) {
    if (other_var == var_d) return(NA_real_)
    cor(data_missing[[var_d]], data_missing[[other_var]], use = "pairwise.complete.obs")
  }, numeric(1))
  # Keep predictors with |r| > 0.1 that are not themselves study variables
  selected_predictors <- names(cor_vals)[!is.na(cor_vals) &
                                           abs(cor_vals) > 0.1 &
                                           !(names(cor_vals) %in% imput_list)]
  # reformulate() builds var_d ~ p1 + p2 + ... without string pasting
  model <- lm(reformulate(selected_predictors, response = var_d), data = obs_rows)
  # Predict the masked rows, remembering their row positions for the join
  mask_rows <- data_missing %>%
    mutate(na_indices = row_number()) %>%
    filter(is.na(.data[[var_d]]))
  lr_pieces[[i]] <- data.frame(var = var_d,
                               imputed = unname(predict(model, mask_rows)),
                               na_indices = mask_rows$na_indices)
}
lr <- bind_rows(lr_pieces)
# Attach the regression imputations to the held-out originals
# (explicit key avoids relying on left_join's by-guessing)
compare_lr <- original %>%
  left_join(lr, by = c("var", "na_indices"))
#Summarize fit for each of the 5 variables.
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_lr[compare_lr$var == var_d, ]
  compare_imputation(rows$value, rows$imputed, var_name = var_d)
})
# Combine into one final result table
final_results <- bind_rows(results_list)
print("Linear Regression Imputation Comparison")
## [1] "Linear Regression Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 2.4547 0.9795 0.78310 0.9899
## 2 E_NOHSDP 3.4046 0.8843 0.22930 0.9437
## 3 E_RENTER 4.4227 0.9638 0.13350 0.9825
## 4 E_TRND 0.0175 0.8655 0.04847 0.9364
## 5 E_SWND 0.3194 0.7447 0.13950 0.9425
#Plot the imputed values against original values
# Helper: sorted original values (black line) against the imputed values
# for the same tracts (red points). Titles contain HTML rendered by
# ggtext::element_markdown(); strings are passed through byte-for-byte.
plot_imputed_comparison <- function(df, var_d, title, y_lab) {
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = imputed), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_markdown(size = 14, hjust = 0.5),
      axis.text.x = element_blank()
    )
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_imputed_comparison(compare_lr, "E_POV200",
                        "Income Below 200% of the Federal Poverty Level <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Percentage of Persons")
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_imputed_comparison(compare_lr, "E_NOHSDP",
                        "No High School Diploma (age 25+) <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Percentage of Persons")
#E_RENTER Percentage of housing units that are renter occupied
plot_imputed_comparison(compare_lr, "E_RENTER",
                        "Housing Units that are Renter Occupied <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Percentage of Persons")
#E_SWND Average annualized frequency of strong wind events
plot_imputed_comparison(compare_lr, "E_SWND",
                        "Average Annualized Frequency of Strong Wind Events <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Average")
#E_TRND Average annualized frequency of tornado events
plot_imputed_comparison(compare_lr, "E_TRND",
                        "Average Annualized Frequency of Tornado Events <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Average")
# Clean up. results_list2 / final_results2 were already removed after the
# mean/median section, so removing them again would only raise warnings.
rm(compare_lr, lr, results_list, final_results)
Next, I tried out an out-of-the box implementation of MICE. MICE cycles through several iterations of imputation, of which I averaged the results. Using the functions as-is recruited all variables to prediction and I was curious to see how it compared to regular linear regression restricted to highly correlated variables. Clearly MICE can be modified for more accurate imputation in future projects, but this was my first taste of the package.
#Remove identifier variables that shouldn't be used for prediction
data_missing2 <- data_missing %>%
select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
-COUNTY, -StateDesc, -STATEABBR, -LOCATION)
#Run MICE (multivariate imputation by chained equations) with defaults,
#so every remaining column is recruited as a predictor.
#m = number of imputed datasets to create
#method = "pmm" = predictive mean matching
#maxit = number of iterations of chained equations
#seed fixed so the imputations are reproducible
mice_results <- mice(data_missing2,
m = 5,
method = "pmm",
maxit = 5,
seed = 987)
##
## iter imp variable
## 1 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
#List of completed datasets from MICE, one per imputation (m = 5)
completed_list <- lapply(1:5, function(i) complete(mice_results, action = i))
#Average the imputed values row-wise across the five completed datasets.
#vapply (type-stable) replaces sapply: each study variable yields one
#numeric column of length nrow(data_missing).
n_rows <- nrow(data_missing)
imputed_avg_df <- as.data.frame(
  vapply(imput_list, function(v) {
    per_run <- vapply(completed_list, function(df) df[[v]], numeric(n_rows))
    rowMeans(per_run, na.rm = TRUE)
  }, numeric(n_rows))
)
imputed_avg_df <- imputed_avg_df %>%
  mutate(na_indices = row_number())
#Reshape to long form (one block of rows per study variable, in imput_list
#order — same order as the five hand-written bind_rows blocks this
#replaces), then attach the held-out originals with explicit join keys.
compare_mc <- bind_rows(lapply(imput_list, function(v) {
  imputed_avg_df %>%
    transmute(na_indices, var = v, imputed = .data[[v]])
})) %>%
  inner_join(original, by = c("var", "na_indices"))
#Summarize fit for each of the 5 variables.
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_mc[compare_mc$var == var_d, ]
  compare_imputation(rows$value, rows$imputed, var_name = var_d)
})
#Combine into one final result table
final_results <- bind_rows(results_list)
print("MICE Imputation Comparison")
## [1] "MICE Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 35.9838 -3.3951 3.803e-55 0.3199
## 2 E_NOHSDP 16.4611 -1.7039 5.125e-34 0.1027
## 3 E_RENTER 38.1779 -1.6974 8.570e-25 0.0399
## 4 E_TRND 0.1235 -5.7245 2.667e-53 -0.0500
## 5 E_SWND 7.0612 -123.7444 3.107e-81 0.1465
#Plot the imputed values against original values
# Helper: sorted original values (black line) against the imputed values
# for the same tracts (red points). Titles contain HTML rendered by
# ggtext::element_markdown(); strings are passed through byte-for-byte.
plot_imputed_comparison <- function(df, var_d, title, y_lab) {
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = imputed), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_markdown(size = 14, hjust = 0.5),
      axis.text.x = element_blank()
    )
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_imputed_comparison(compare_mc, "E_POV200",
                        "Income Below 200% of the Federal Poverty Level <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Percentage of Persons")
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_imputed_comparison(compare_mc, "E_NOHSDP",
                        "No High School Diploma (age 25+) <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Percentage of Persons")
#E_RENTER Percentage of housing units that are renter occupied
plot_imputed_comparison(compare_mc, "E_RENTER",
                        "Housing Units that are Renter Occupied <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Percentage of Persons")
#E_SWND Average annualized frequency of strong wind events
plot_imputed_comparison(compare_mc, "E_SWND",
                        "Average Annualized Frequency of Strong Wind Events <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Average")
#E_TRND Average annualized frequency of tornado events
plot_imputed_comparison(compare_mc, "E_TRND",
                        "Average Annualized Frequency of Tornado Events <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Average")
# Clean up. mm, results_list2 and final_results2 were already removed after
# the mean/median section, so removing them again would only raise warnings.
rm(compare_mc, results_list, final_results, completed_list, data_missing2, imputed_avg_df, mice_results)
If I can favor an imputation approach just because of the name, this would be it. I feel like it should be branded with some Hot Wheels style flames.
This imputation approach steps away from building a model and focuses on identifying similar cases, much like a nearest-neighbor approach. I did choose to use the most correlated variables this time around.
# Hot-deck imputation: donate values from a similar observed tract, with
# donors ordered by the three most correlated predictors within county.
# Pieces are collected in a preallocated list and bound once.
hd_pieces <- vector("list", length(imput_list))
for (i in seq_along(imput_list)) {
  var_d <- imput_list[i]
  #Subset observed rows and remove identifiers before correlating
  corr_set <- data_missing %>%
    filter(!is.na(.data[[var_d]])) %>%
    select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
           -COUNTY, -StateDesc, -STATEABBR, -LOCATION)
  #Pairwise-complete correlations of the target with every candidate
  #(vapply is type-stable; NA_real_ keeps the result numeric)
  cor_vals <- vapply(names(corr_set), function(other_var) {
    if (other_var == var_d) return(NA_real_)
    cor(data_missing[[var_d]], data_missing[[other_var]], use = "pairwise.complete.obs")
  }, numeric(1))
  #Rank candidates by |r| once (sort() drops NAs), then keep up to 3 with
  #|r| > 0.1 that are not themselves study variables. head() avoids the
  #NA entries that a hard [1:3] produces when fewer than 3 qualify.
  ranked <- sort(abs(cor_vals), decreasing = TRUE)
  eligible <- names(ranked)[ranked > 0.1 & !(names(ranked) %in% imput_list)]
  selected_predictors <- head(eligible, 3)
  #Subset data again, but keep the County variable to be used as domain_var
  data_missing_run <- data_missing %>%
    select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
           -StateDesc, -STATEABBR, -LOCATION)
  data_imputed <- hotdeck(data_missing_run,
                          variable = var_d,
                          ord_var = selected_predictors,
                          domain_var = c("COUNTY"))
  #Record every row's (possibly imputed) value; the later join against
  #'original' picks out only the masked rows.
  hd_pieces[[i]] <- data.frame(var = var_d,
                               imputed = data_imputed[[var_d]],
                               na_indices = seq_len(nrow(data_imputed)))
}
hd <- bind_rows(hd_pieces)
# Attach the hot-deck imputations to the held-out originals
compare_hd <- left_join(original, hd)
#Summarize fit for each of the 5 variables.
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_hd[compare_hd$var == var_d, ]
  compare_imputation(rows$value, rows$imputed, var_name = var_d)
})
#Combine into one final result table
final_results <- bind_rows(results_list)
print("Hot Deck Imputation Comparison")
## [1] "Hot Deck Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 2.3539 0.9812 1.229e-07 0.9924
## 2 E_NOHSDP 2.9498 0.9132 2.105e-03 0.9600
## 3 E_RENTER 3.9287 0.9714 6.712e-05 0.9870
## 4 E_TRND 0.0248 0.7293 1.348e-01 0.8564
## 5 E_SWND 0.1043 0.9728 6.496e-02 0.9866
#Plot the imputed values against original values
# Helper: sorted original values (black line) against the imputed values
# for the same tracts (red points). Titles contain HTML rendered by
# ggtext::element_markdown(); strings are passed through byte-for-byte.
plot_imputed_comparison <- function(df, var_d, title, y_lab) {
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = imputed), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_markdown(size = 14, hjust = 0.5),
      axis.text.x = element_blank()
    )
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_imputed_comparison(compare_hd, "E_POV200",
                        "Income Below 200% of the Federal Poverty Level <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Percentage of Persons")
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_imputed_comparison(compare_hd, "E_NOHSDP",
                        "No High School Diploma (age 25+) <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Percentage of Persons")
#E_RENTER Percentage of housing units that are renter occupied
plot_imputed_comparison(compare_hd, "E_RENTER",
                        "Housing Units that are Renter Occupied <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Percentage of Persons")
#E_SWND Average annualized frequency of strong wind events
plot_imputed_comparison(compare_hd, "E_SWND",
                        "Average Annualized Frequency of Strong Wind Events <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Average")
#E_TRND Average annualized frequency of tornado events
plot_imputed_comparison(compare_hd, "E_TRND",
                        "Average Annualized Frequency of Tornado Events <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Average")
# Clean up. final_results2 no longer exists at this point, so removing it
# would only raise a warning.
rm(compare_hd, hd, results_list, final_results)
Please forgive the minimal write-up, lack of any story, and weak conclusion. This has served me well as purely an engine to expand a skillset, and I am excited to try more approaches in the future.