For this project I downloaded the 2024 Environmental Justice Index for the full state of Illinois at census tract level. The data can be found here: https://www.atsdr.cdc.gov/place-health/php/eji/index.html.
#Load the EJI data (census-tract level, Illinois) from the CDC/ATSDR download
data <- read.csv("EJI_2024_Illinois.csv")
#Load the data dictionary with variable information.
#The first two spreadsheet rows are skipped and replaced with our own
#column names ("DATA SOURCE" keeps its space to mirror the workbook header).
col_list <- c("VARIABLE_NAME_2024", "VARIABLE_NAME_2022", "VARIABLE_DESCRIPTION_2024",
"MODULE", "DOMAIN", "DATA SOURCE", "TABLE_FIELD_CALCULATION_2024", "NOTES")
#read_excel comes from readxl — presumably loaded earlier in the document; verify
dictionary <- read_excel("EJI_DATADICTIONARY_2024.xlsx",
skip = 2,
col_names = col_list)
# Build a lookup of the 2024 estimate variables (names prefixed "E_")
# together with their plain-language descriptions.
variable_list <- dictionary %>%
  filter(grepl("^E_", VARIABLE_NAME_2024)) %>%
  select(VARIABLE_NAME_2024, VARIABLE_DESCRIPTION_2024)
print(variable_list)
## # A tibble: 54 × 2
## VARIABLE_NAME_2024 VARIABLE_DESCRIPTION_2024
## <chr> <chr>
## 1 E_TOTPOP Population estimate, 2018-2022 ACS
## 2 E_DAYPOP Adjunct variable - Estimated daytime population, LandScan…
## 3 E_MINRTY Percentage of persons who identify as Hispanic or Latino …
## 4 E_POV200 Percentage of persons with income below 200% of the feder…
## 5 E_NOHSDP Percentage of persons with no high school diploma (age 25…
## 6 E_UNEMP Percentage of persons who are unemployed
## 7 E_RENTER Percentage of housing units that are renter occupied
## 8 E_HOUBDN Percentage of households that make less than 75,000 who a…
## 9 E_UNINSUR Percentage of persons who are uninsured (i.e., have no he…
## 10 E_NOINT Percentage of persons without internet
## # ℹ 44 more rows
# Identify numeric columns (vapply is type-stable, unlike sapply, so an
# empty data frame cannot silently change the result's type)
numeric_cols <- vapply(data, is.numeric, logical(1))
# Sentinel codes such as -99 mark missing data in the EJI release; recode
# every negative value in the numeric columns to NA. replace() with an
# explicit !is.na() mask avoids relying on R's NA-skipping behavior in
# logical sub-assignment. (Assumes no legitimate negative values — true for
# the percentage/count variables used here; verify if new columns appear.)
data[ , numeric_cols] <- lapply(data[ , numeric_cols], function(x) {
  replace(x, !is.na(x) & x < 0, NA)
})
I wanted to explore a few different imputation approaches and see how they performed when compared with original, non-missing values.
# Fix the RNG state so the masked indices are reproducible
set.seed(987)
# Variables chosen for the imputation experiment
imput_list <- c("E_POV200", "E_NOHSDP", "E_RENTER", "E_TRND", "E_SWND")
# Show their descriptions from the data dictionary
dictionary[dictionary$VARIABLE_NAME_2024 %in% imput_list,
           c("VARIABLE_NAME_2024", "VARIABLE_DESCRIPTION_2024")]
## # A tibble: 5 × 2
## VARIABLE_NAME_2024 VARIABLE_DESCRIPTION_2024
## <chr> <chr>
## 1 E_POV200 Percentage of persons with income below 200% of the federa…
## 2 E_NOHSDP Percentage of persons with no high school diploma (age 25+)
## 3 E_RENTER Percentage of housing units that are renter occupied
## 4 E_SWND Average annualized frequency of strong wind events
## 5 E_TRND Average annualized frequency of tornado events
# Keep only rows complete on the study variables so every masked value has
# a known ground truth.
data <- data[complete.cases(data[ , imput_list]), ]
# Copy to receive the artificial NAs, keeping 'data' intact
data_missing <- data
# For each study variable, hide 5% of values and record the originals.
# Per-variable pieces are collected in a preallocated list and bound once,
# instead of growing a data frame with bind_rows() inside the loop.
# sample() is called with the same arguments in the same order as before,
# so the fixed seed yields identical masked indices.
n_mask <- floor(nrow(data_missing) * 0.05)
pieces <- vector("list", length(imput_list))
for (i in seq_along(imput_list)) {
  var <- imput_list[i]
  na_indices <- sample(seq_len(nrow(data_missing)), size = n_mask, replace = FALSE)
  # Record the true values before overwriting them with NA
  pieces[[i]] <- data.frame(value = data_missing[na_indices, var],
                            var = var,
                            na_indices = na_indices)
  data_missing[na_indices, var] <- NA
}
original <- bind_rows(pieces)
# Confirm the expected count of NAs per variable (type-stable vapply)
vapply(data_missing[ , imput_list], function(x) sum(is.na(x)), numeric(1))
## E_POV200 E_NOHSDP E_RENTER E_TRND E_SWND
## 162 162 162 162 162
The first step is to use the very crude approach of imputing the mean or median value.
# Per-variable mean and median of the observed (non-missing) values.
# Map over the variables and bind once, rather than growing 'mm' inside a
# loop (quadratic copying).
mm <- bind_rows(lapply(imput_list, function(var_d) {
  data_missing %>%
    summarise(
      mean1 = mean(.data[[var_d]], na.rm = TRUE),
      median1 = median(.data[[var_d]], na.rm = TRUE)
    ) %>%
    mutate(var = var_d) %>%
    select(var, mean1, median1)
}))
# Attach the candidate imputations to the held-out originals.
# The explicit key avoids relying on left_join's by-guessing.
compare_mm <- original %>%
  left_join(mm, by = "var")
#Summarize fit for each of the 5 variables.
#' Compare imputed values against held-out originals.
#'
#' @param original Numeric vector of true (held-out) values.
#' @param imputed  Numeric vector of imputed values, same length as original.
#' @param var_name Label placed in the Variable column of the result.
#' @return One-row data.frame with RMSE, R_squared (can be negative when the
#'   imputation is worse than predicting the mean), the paired t-test
#'   p-value (NA if the test cannot run), and the Pearson correlation
#'   (NA when either vector is constant, e.g. mean/median imputation).
compare_imputation <- function(original, imputed, var_name = "Variable") {
  #Check lengths
  if (length(original) != length(imputed)) {
    stop("original and imputed vectors must be of the same length.")
  }
  #Remove NA pairs
  valid_idx <- complete.cases(original, imputed)
  original <- original[valid_idx]
  imputed <- imputed[valid_idx]
  # Fail loudly instead of returning NaN-filled rows downstream
  if (length(original) == 0) {
    stop("no complete pairs of original/imputed values to compare.")
  }
  #Root mean squared error
  rmse <- sqrt(mean((original - imputed)^2))
  #R2 relative to always predicting the mean of the originals
  ss_res <- sum((original - imputed)^2)
  ss_tot <- sum((original - mean(original))^2)
  r_squared <- 1 - (ss_res / ss_tot)
  #Paired t-test for systematic bias. t.test() errors when the paired
  #differences are essentially constant, so fall back to NA in that case.
  ttest_p <- tryCatch(t.test(original, imputed, paired = TRUE)$p.value,
                      error = function(e) NA_real_)
  #Pearson correlation. A constant vector has zero variance, which makes
  #cor() warn and return NA — return NA quietly and deliberately instead.
  cor_value <- if (sd(original) == 0 || sd(imputed) == 0) {
    NA_real_
  } else {
    cor(original, imputed, method = "pearson")
  }
  #Create results table
  results <- data.frame(
    Variable = var_name,
    RMSE = round(rmse, 4),
    R_squared = round(r_squared, 4),
    Paired_ttest_p = signif(ttest_p, 4),
    Correlation = round(cor_value, 4)
  )
  return(results)
}
# Score mean imputation for each study variable and stack the rows
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_mm[compare_mm$var == var_d, ]
  compare_imputation(rows$value, rows$mean1, var_name = var_d)
})
#Combine into one final result table
final_results <- bind_rows(results_list)
print("Mean Imputation Comparison")
## [1] "Mean Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 17.3405 -0.0207 0.07004 NA
## 2 E_NOHSDP 10.0117 -0.0002 0.85550 NA
## 3 E_RENTER 23.2540 -0.0007 0.73430 NA
## 4 E_TRND 0.0478 -0.0058 0.33600 NA
## 5 E_SWND 0.6323 -0.0004 0.80400 NA
# Score median imputation the same way
results_list2 <- lapply(imput_list, function(var_d) {
  rows <- compare_mm[compare_mm$var == var_d, ]
  compare_imputation(rows$value, rows$median1, var_name = var_d)
})
final_results2 <- bind_rows(results_list2)
print("Median Imputation Comparison")
## [1] "Median Imputation Comparison"
print(final_results2)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 17.1676 -0.0004 7.975e-01 NA
## 2 E_NOHSDP 10.4421 -0.0880 2.331e-04 NA
## 3 E_RENTER 23.8723 -0.0546 3.475e-03 NA
## 4 E_TRND 0.0522 -0.2025 5.326e-08 NA
## 5 E_SWND 0.6332 -0.0030 4.893e-01 NA
#Plot the imputed values against original values
# Helper: sorted original values (black line) overlaid with the constant
# mean (dark red) and median (red) imputations, plus labelled annotations.
# mean_off / median_off nudge the text labels off the points; titles,
# labels, and offsets are passed through unchanged from the originals.
plot_mm_comparison <- function(df, var_d, title, y_lab, mean_off, median_off) {
  mean_val <- unique(df$mean1[df$var == var_d])
  median_val <- unique(df$median1[df$var == var_d])
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = mean1), size = 1, alpha = 0.8, color = "darkred") +
    geom_point(aes(x = index, y = median1), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_text(hjust = 0.5),
      axis.text.x = element_blank()
    ) +
    annotate("text", x = 150, y = mean_val + mean_off,
             label = "Mean", color = "darkred", size = 3.5) +
    annotate("text", x = 150, y = median_val + median_off,
             label = "Median", color = "red", size = 3.5)
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_mm_comparison(compare_mm, "E_POV200",
                   "Income Below 200% of the Federal Poverty Level \n Original Values Compared to Mean and Median",
                   "Percentage of Persons", 3, -3)
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_mm_comparison(compare_mm, "E_NOHSDP",
                   "No High School Diploma (age 25+) \n Original Values Compared to Mean and Median",
                   "Percentage of Persons", 3, -3)
#E_RENTER Percentage of housing units that are renter occupied
plot_mm_comparison(compare_mm, "E_RENTER",
                   "Housing Units that are Renter Occupied \n Original Values Compared to Mean and Median",
                   "Percentage of Persons", 3, -3)
#E_SWND Average annualized frequency of strong wind events
# (label offsets flipped here, as in the original, so they don't collide)
plot_mm_comparison(compare_mm, "E_SWND",
                   "Average Annualized Frequency of Strong Wind Events \n Original Values Compared to Mean and Median",
                   "Average", -0.25, 0.25)
#E_TRND Average annualized frequency of tornado events
plot_mm_comparison(compare_mm, "E_TRND",
                   "Average Annualized Frequency of Tornado Events \n Original Values Compared to Mean and Median",
                   "Average", 0.01, -0.01)
rm(compare_mm, mm, results_list, results_list2, final_results, final_results2)
My second approach was to try out linear regression. Given the large number of variables in the dataset, I selected those with correlations > 0.1.
# Linear-regression imputation: for each study variable, fit a model on the
# observed rows using every (non-identifier) variable whose pairwise
# correlation with the target exceeds 0.1 in absolute value, then predict
# the masked rows. Pieces are collected in a preallocated list and bound
# once instead of growing 'lr' inside the loop.
lr_pieces <- vector("list", length(imput_list))
for (i in seq_along(imput_list)) {
  var_d <- imput_list[i]
  # Observed rows only, identifier columns dropped.
  # ('obs_rows' rather than 'complete' so we never shadow mice::complete /
  # tidyr::complete in the workspace.)
  obs_rows <- data_missing %>%
    filter(!is.na(.data[[var_d]])) %>%
    select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
           -COUNTY, -StateDesc, -STATEABBR, -LOCATION)
  # Pairwise-complete correlation of the target with every candidate
  # (vapply is type-stable; NA_real_ keeps the result numeric throughout)
  cor_vals <- vapply(names(obs_rows), function(other_var) {
    if (other_var == var_d) return(NA_real_)
    cor(data_missing[[var_d]], data_missing[[other_var]], use = "pairwise.complete.obs")
  }, numeric(1))
  # Keep predictors with |r| > 0.1 that are not themselves study variables
  selected_predictors <- names(cor_vals)[!is.na(cor_vals) &
                                           abs(cor_vals) > 0.1 &
                                           !(names(cor_vals) %in% imput_list)]
  # reformulate() builds var_d ~ p1 + p2 + ... without string pasting
  model <- lm(reformulate(selected_predictors, response = var_d), data = obs_rows)
  # Predict the masked rows, remembering their row positions for the join
  mask_rows <- data_missing %>%
    mutate(na_indices = row_number()) %>%
    filter(is.na(.data[[var_d]]))
  lr_pieces[[i]] <- data.frame(var = var_d,
                               imputed = unname(predict(model, mask_rows)),
                               na_indices = mask_rows$na_indices)
}
lr <- bind_rows(lr_pieces)
# Attach the regression imputations to the held-out originals
# (explicit key avoids relying on left_join's by-guessing)
compare_lr <- original %>%
  left_join(lr, by = c("var", "na_indices"))
#Summarize fit for each of the 5 variables.
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_lr[compare_lr$var == var_d, ]
  compare_imputation(rows$value, rows$imputed, var_name = var_d)
})
# Combine into one final result table
final_results <- bind_rows(results_list)
print("Linear Regression Imputation Comparison")
## [1] "Linear Regression Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 2.4547 0.9795 0.78310 0.9899
## 2 E_NOHSDP 3.4046 0.8843 0.22930 0.9437
## 3 E_RENTER 4.4227 0.9638 0.13350 0.9825
## 4 E_TRND 0.0175 0.8655 0.04847 0.9364
## 5 E_SWND 0.3194 0.7447 0.13950 0.9425
#Plot the imputed values against original values
# Helper: sorted original values (black line) against the imputed values
# for the same tracts (red points). Titles contain HTML rendered by
# ggtext::element_markdown(); strings are passed through byte-for-byte.
plot_imputed_comparison <- function(df, var_d, title, y_lab) {
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = imputed), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_markdown(size = 14, hjust = 0.5),
      axis.text.x = element_blank()
    )
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_imputed_comparison(compare_lr, "E_POV200",
                        "Income Below 200% of the Federal Poverty Level <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Percentage of Persons")
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_imputed_comparison(compare_lr, "E_NOHSDP",
                        "No High School Diploma (age 25+) <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Percentage of Persons")
#E_RENTER Percentage of housing units that are renter occupied
plot_imputed_comparison(compare_lr, "E_RENTER",
                        "Housing Units that are Renter Occupied <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Percentage of Persons")
#E_SWND Average annualized frequency of strong wind events
plot_imputed_comparison(compare_lr, "E_SWND",
                        "Average Annualized Frequency of Strong Wind Events <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Average")
#E_TRND Average annualized frequency of tornado events
plot_imputed_comparison(compare_lr, "E_TRND",
                        "Average Annualized Frequency of Tornado Events <br> Original Values Compared to <span style='color:red;'>Linear Regression Imputation</span> ",
                        "Average")
# Clean up. results_list2 / final_results2 were already removed after the
# mean/median section, so removing them again would only raise warnings.
rm(compare_lr, lr, results_list, final_results)
Next, I tried out an out-of-the box implementation of MICE. MICE cycles through several iterations of imputation, of which I averaged the results. Using the functions as-is recruited all variables to prediction and I was curious to see how it compared to regular linear regression restricted to highly correlated variables. Clearly MICE can be modified for more accurate imputation in future projects, but this was my first taste of the package.
#Remove identifier variables that shouldn't be used for prediction
data_missing2 <- data_missing %>%
select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
-COUNTY, -StateDesc, -STATEABBR, -LOCATION)
#Run MICE (multivariate imputation by chained equations) with defaults,
#so every remaining column is recruited as a predictor.
#m = number of imputed datasets to create
#method = "pmm" = predictive mean matching
#maxit = number of iterations of chained equations
#seed fixed so the imputations are reproducible
mice_results <- mice(data_missing2,
m = 5,
method = "pmm",
maxit = 5,
seed = 987)
##
## iter imp variable
## 1 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 1 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 2 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 3 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 4 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 1 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 2 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 3 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 4 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
## 5 5 E_POV200* E_NOHSDP* E_RENTER* E_SWND* E_TRND*
#List of completed datasets from MICE, one per imputation (m = 5)
completed_list <- lapply(1:5, function(i) complete(mice_results, action = i))
#Average the imputed values row-wise across the five completed datasets.
#vapply (type-stable) replaces sapply: each study variable yields one
#numeric column of length nrow(data_missing).
n_rows <- nrow(data_missing)
imputed_avg_df <- as.data.frame(
  vapply(imput_list, function(v) {
    per_run <- vapply(completed_list, function(df) df[[v]], numeric(n_rows))
    rowMeans(per_run, na.rm = TRUE)
  }, numeric(n_rows))
)
imputed_avg_df <- imputed_avg_df %>%
  mutate(na_indices = row_number())
#Reshape to long form (one block of rows per study variable, in imput_list
#order — same order as the five hand-written bind_rows blocks this
#replaces), then attach the held-out originals with explicit join keys.
compare_mc <- bind_rows(lapply(imput_list, function(v) {
  imputed_avg_df %>%
    transmute(na_indices, var = v, imputed = .data[[v]])
})) %>%
  inner_join(original, by = c("var", "na_indices"))
#Summarize fit for each of the 5 variables.
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_mc[compare_mc$var == var_d, ]
  compare_imputation(rows$value, rows$imputed, var_name = var_d)
})
#Combine into one final result table
final_results <- bind_rows(results_list)
print("MICE Imputation Comparison")
## [1] "MICE Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 35.9838 -3.3951 3.803e-55 0.3199
## 2 E_NOHSDP 16.4611 -1.7039 5.125e-34 0.1027
## 3 E_RENTER 38.1779 -1.6974 8.570e-25 0.0399
## 4 E_TRND 0.1235 -5.7245 2.667e-53 -0.0500
## 5 E_SWND 7.0612 -123.7444 3.107e-81 0.1465
#Plot the imputed values against original values
# Helper: sorted original values (black line) against the imputed values
# for the same tracts (red points). Titles contain HTML rendered by
# ggtext::element_markdown(); strings are passed through byte-for-byte.
plot_imputed_comparison <- function(df, var_d, title, y_lab) {
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = imputed), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_markdown(size = 14, hjust = 0.5),
      axis.text.x = element_blank()
    )
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_imputed_comparison(compare_mc, "E_POV200",
                        "Income Below 200% of the Federal Poverty Level <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Percentage of Persons")
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_imputed_comparison(compare_mc, "E_NOHSDP",
                        "No High School Diploma (age 25+) <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Percentage of Persons")
#E_RENTER Percentage of housing units that are renter occupied
plot_imputed_comparison(compare_mc, "E_RENTER",
                        "Housing Units that are Renter Occupied <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Percentage of Persons")
#E_SWND Average annualized frequency of strong wind events
plot_imputed_comparison(compare_mc, "E_SWND",
                        "Average Annualized Frequency of Strong Wind Events <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Average")
#E_TRND Average annualized frequency of tornado events
plot_imputed_comparison(compare_mc, "E_TRND",
                        "Average Annualized Frequency of Tornado Events <br> Original Values Compared to <span style='color:red;'>MICE Imputation</span> ",
                        "Average")
# Clean up. mm, results_list2 and final_results2 were already removed after
# the mean/median section, so removing them again would only raise warnings.
rm(compare_mc, results_list, final_results, completed_list, data_missing2, imputed_avg_df, mice_results)
If I can favor an imputation approach just because of the name, this would be it. I feel like it should be branded with some Hot Wheels style flames.
This imputation approach steps away from building a model and focuses on identifying similar cases, much like a nearest-neighbor approach. I did choose to use the most correlated variables this time around.
# Hot-deck imputation: donate values from a similar observed tract, with
# donors ordered by the three most correlated predictors within county.
# Pieces are collected in a preallocated list and bound once.
hd_pieces <- vector("list", length(imput_list))
for (i in seq_along(imput_list)) {
  var_d <- imput_list[i]
  #Subset observed rows and remove identifiers before correlating
  corr_set <- data_missing %>%
    filter(!is.na(.data[[var_d]])) %>%
    select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
           -COUNTY, -StateDesc, -STATEABBR, -LOCATION)
  #Pairwise-complete correlations of the target with every candidate
  #(vapply is type-stable; NA_real_ keeps the result numeric)
  cor_vals <- vapply(names(corr_set), function(other_var) {
    if (other_var == var_d) return(NA_real_)
    cor(data_missing[[var_d]], data_missing[[other_var]], use = "pairwise.complete.obs")
  }, numeric(1))
  #Rank candidates by |r| once (sort() drops NAs), then keep up to 3 with
  #|r| > 0.1 that are not themselves study variables. head() avoids the
  #NA entries that a hard [1:3] produces when fewer than 3 qualify.
  ranked <- sort(abs(cor_vals), decreasing = TRUE)
  eligible <- names(ranked)[ranked > 0.1 & !(names(ranked) %in% imput_list)]
  selected_predictors <- head(eligible, 3)
  #Subset data again, but keep the County variable to be used as domain_var
  data_missing_run <- data_missing %>%
    select(-STATEFP, -COUNTYFP, -TRACTCE, -AFFGEOID, -GEOID, -GEOID_2020,
           -StateDesc, -STATEABBR, -LOCATION)
  data_imputed <- hotdeck(data_missing_run,
                          variable = var_d,
                          ord_var = selected_predictors,
                          domain_var = c("COUNTY"))
  #Record every row's (possibly imputed) value; the later join against
  #'original' picks out only the masked rows.
  hd_pieces[[i]] <- data.frame(var = var_d,
                               imputed = data_imputed[[var_d]],
                               na_indices = seq_len(nrow(data_imputed)))
}
hd <- bind_rows(hd_pieces)
# Attach the hot-deck imputations to the held-out originals
compare_hd <- left_join(original, hd)
#Summarize fit for each of the 5 variables.
results_list <- lapply(imput_list, function(var_d) {
  rows <- compare_hd[compare_hd$var == var_d, ]
  compare_imputation(rows$value, rows$imputed, var_name = var_d)
})
#Combine into one final result table
final_results <- bind_rows(results_list)
print("Hot Deck Imputation Comparison")
## [1] "Hot Deck Imputation Comparison"
print(final_results)
## Variable RMSE R_squared Paired_ttest_p Correlation
## 1 E_POV200 2.3539 0.9812 1.229e-07 0.9924
## 2 E_NOHSDP 2.9498 0.9132 2.105e-03 0.9600
## 3 E_RENTER 3.9287 0.9714 6.712e-05 0.9870
## 4 E_TRND 0.0248 0.7293 1.348e-01 0.8564
## 5 E_SWND 0.1043 0.9728 6.496e-02 0.9866
#Plot the imputed values against original values
# Helper: sorted original values (black line) against the imputed values
# for the same tracts (red points). Titles contain HTML rendered by
# ggtext::element_markdown(); strings are passed through byte-for-byte.
plot_imputed_comparison <- function(df, var_d, title, y_lab) {
  df %>%
    filter(var == var_d) %>%
    arrange(value) %>%
    mutate(index = row_number()) %>%
    ggplot(aes(x = index, y = value)) +
    geom_line(size = 1) +
    geom_point(aes(x = index, y = imputed), size = 1, alpha = 0.8, color = "red") +
    labs(title = title,
         x = "Original Values Sorted Smallest to Largest",
         y = y_lab) +
    theme_minimal() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_markdown(size = 14, hjust = 0.5),
      axis.text.x = element_blank()
    )
}
#E_POV200 Percentage of persons with income below 200% of the federal poverty level
plot_imputed_comparison(compare_hd, "E_POV200",
                        "Income Below 200% of the Federal Poverty Level <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Percentage of Persons")
#E_NOHSDP Percentage of persons with no high school diploma (age 25+)
plot_imputed_comparison(compare_hd, "E_NOHSDP",
                        "No High School Diploma (age 25+) <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Percentage of Persons")
#E_RENTER Percentage of housing units that are renter occupied
plot_imputed_comparison(compare_hd, "E_RENTER",
                        "Housing Units that are Renter Occupied <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Percentage of Persons")
#E_SWND Average annualized frequency of strong wind events
plot_imputed_comparison(compare_hd, "E_SWND",
                        "Average Annualized Frequency of Strong Wind Events <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Average")
#E_TRND Average annualized frequency of tornado events
plot_imputed_comparison(compare_hd, "E_TRND",
                        "Average Annualized Frequency of Tornado Events <br> Original Values Compared to <span style='color:red;'>Hot Deck Imputation</span> ",
                        "Average")
# Clean up. final_results2 no longer exists at this point, so removing it
# would only raise a warning.
rm(compare_hd, hd, results_list, final_results)
Please forgive the minimal write-up, lack of any story, and weak conclusion. This has served me well as purely an engine to expand a skillset, and I am excited to try more approaches in the future.