I wanted to explore a few supervised machine learning classification models at a superficial level to get a feel for how each works. Below I have used naive Bayes, k-nearest neighbors, logistic regression, and support vector machines to run the same prediction of which patients will have long inpatient stays of 8 days or more.
I used the Basic Stand Alone (BSA) Inpatient Public Use File (PUF) containing information about 2008 CMS Medicare inpatient claims, available from the CMS website.
The full data dictionary is also available from CMS. Below are the variable definitions.
BENE_SEX_IDENT_CD This field indicates the sex of the beneficiary.
BENE_AGE_CAT_CD This categorical variable is based on the beneficiary’s age at end of the reference year (2008).
IP_CLM_BASE_DRG_CD This is a categorical variable. It represents diagnostic related groups (DRGs).
IP_CLM_ICD9_PRCDR_CD This code indicates the primary procedure (mainly surgical procedures) performed during the inpatient stay.
IP_CLM_DAYS_CD This categorical variable is based on the number of inpatient days (or length of stay) on a claim for all stays ending in 2008.
IP_DRG_QUINT_PMT_AVG This field contains the average Medicare total claim payment amount of the quintile for the payments (of a particular DRG) in the 100% Inpatient claims data for 2008.
IP_DRG_QUINT_PMT_CD This categorical field indicates the quintile value (or code) to which the actual Medicare payment amount on the claim belongs.
#load libraries
library(dplyr)
library(tidyr)
library(naivebayes)
library(caret)
library(class)
library(e1071)
#load data
# 2008 CMS BSA Inpatient Claims PUF; read.csv() assumes the file is in the
# working directory — TODO confirm the path when re-running.
ip_data <- read.csv("2008_BSA_Inpatient_Claims_PUF.csv")
#review data
# Structure check: ~588k claims, 8 columns. All predictors arrive as
# integer codes and are converted to factors further below.
str(ip_data)
## 'data.frame': 588415 obs. of 8 variables:
## $ IP_CLM_ID : chr "IP-000022CE4125DBE7" "IP-0000417F50942D90" "IP-0000865F5457AC0E" "IP-00009789630AF474" ...
## $ BENE_SEX_IDENT_CD : int 2 2 1 2 2 1 1 2 1 2 ...
## $ BENE_AGE_CAT_CD : int 4 5 1 2 2 1 3 2 1 3 ...
## $ IP_CLM_BASE_DRG_CD : int 3 199 119 128 236 123 64 284 284 83 ...
## $ IP_CLM_ICD9_PRCDR_CD: int 31 NA 54 NA 70 45 NA NA 94 37 ...
## $ IP_CLM_DAYS_CD : int 4 2 4 2 1 2 2 4 2 3 ...
## $ IP_DRG_QUINT_PMT_AVG: int 86240 3447 34878 3007 3352 2690 5234 2713 9143 23354 ...
## $ IP_DRG_QUINT_PMT_CD : int 4 2 5 2 2 1 3 2 5 5 ...
# Eyeball the first ten rows. IP_CLM_ICD9_PRCDR_CD is NA on some claims —
# presumably no procedure was recorded; verify against the data dictionary.
head(ip_data, 10)
## IP_CLM_ID BENE_SEX_IDENT_CD BENE_AGE_CAT_CD IP_CLM_BASE_DRG_CD
## 1 IP-000022CE4125DBE7 2 4 3
## 2 IP-0000417F50942D90 2 5 199
## 3 IP-0000865F5457AC0E 1 1 119
## 4 IP-00009789630AF474 2 2 128
## 5 IP-0000C9D229B79D36 2 2 236
## 6 IP-0000EDBB82D78532 1 1 123
## 7 IP-0000F4ADB85FBB06 1 3 64
## 8 IP-0000FC306F8D4062 2 2 284
## 9 IP-00010C8D321FA2A6 1 1 284
## 10 IP-0001608148125BD5 2 3 83
## IP_CLM_ICD9_PRCDR_CD IP_CLM_DAYS_CD IP_DRG_QUINT_PMT_AVG IP_DRG_QUINT_PMT_CD
## 1 31 4 86240 4
## 2 NA 2 3447 2
## 3 54 4 34878 5
## 4 NA 2 3007 2
## 5 70 1 3352 2
## 6 45 2 2690 1
## 7 NA 2 5234 3
## 8 NA 4 2713 2
## 9 94 2 9143 5
## 10 37 3 23354 5
#build new variable to flag long in-patient stays
# IP_CLM_DAYS_CD == 4 corresponds to stays of 8+ days, so the flag is 1 for
# code 4 and 0 for codes 1-3. A single total comparison replaces the original
# two-branch case_when (< 4 / == 4), which would silently produce NA for any
# unexpected code value; on codes 1-4 the result is identical.
ip_data <- ip_data %>%
  mutate(IP_long_stay_fl = if_else(IP_CLM_DAYS_CD == 4, 1, 0))
# Sanity check: codes 1-3 all map to 0, code 4 maps to 1.
table(ip_data$IP_CLM_DAYS_CD, ip_data$IP_long_stay_fl)
##
## 0 1
## 1 76025 0
## 2 261419 0
## 3 122073 0
## 4 0 128898
# Exploratory counts of the categorical predictors.
# Sex: two codes, reasonably balanced.
ip_data %>% group_by(BENE_SEX_IDENT_CD) %>% count()
## # A tibble: 2 × 2
## # Groups: BENE_SEX_IDENT_CD [2]
## BENE_SEX_IDENT_CD n
## <int> <int>
## 1 1 258217
## 2 2 330198
# Age category: six codes, all well populated.
ip_data %>% group_by(BENE_AGE_CAT_CD) %>% count()
## # A tibble: 6 × 2
## # Groups: BENE_AGE_CAT_CD [6]
## BENE_AGE_CAT_CD n
## <int> <int>
## 1 1 116080
## 2 2 77597
## 3 3 86205
## 4 4 91487
## 5 5 94759
## 6 6 122287
# 311 distinct base DRG codes — a high-cardinality factor, which is worth
# keeping in mind for the model fits below.
ip_data %>% distinct(IP_CLM_BASE_DRG_CD) %>% count()
## n
## 1 311
#set categorical variables as factors
# Predictors and outcome are integer codes; convert to factors so the models
# treat them as categorical rather than ordinal/numeric.
ip_data$BENE_SEX_IDENT_CD <- as.factor(ip_data$BENE_SEX_IDENT_CD)
ip_data$BENE_AGE_CAT_CD <- as.factor(ip_data$BENE_AGE_CAT_CD)
ip_data$IP_CLM_BASE_DRG_CD <- as.factor(ip_data$IP_CLM_BASE_DRG_CD)
ip_data$IP_long_stay_fl <- as.factor(ip_data$IP_long_stay_fl)
#set seed
# Fix the RNG so the partitions below are reproducible.
set.seed(12345)
#set test data and train data for model
# 80/20 split stratified on IP_CLM_DAYS_CD; since IP_long_stay_fl is a
# deterministic recode of IP_CLM_DAYS_CD, this also stratifies the outcome.
ip_intrain<-createDataPartition(ip_data$IP_CLM_DAYS_CD,p=0.8,list=FALSE)
ip_training<-ip_data[ip_intrain,]
ip_testing<-ip_data[-ip_intrain,]
#divide up the training data into a main training set and a validation set
# Second 80/20 split: net result is roughly 64% train / 16% validation /
# 20% test of the full data.
ip_intrain2<-createDataPartition(y=ip_training$IP_CLM_DAYS_CD,p=0.8,list=FALSE)
ip_train<-ip_training[ip_intrain2,]
ip_validation<-ip_training[-ip_intrain2,]
# Drop intermediates that are no longer needed.
rm(ip_training, ip_intrain, ip_intrain2)
#train model
# Naive Bayes with Laplace smoothing (laplace = 1) so factor levels unseen
# in training do not produce zero conditional probabilities.
ip_model_nb <- naive_bayes(IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD + IP_CLM_BASE_DRG_CD, data = ip_train, laplace = 1)
# Per-predictor conditional distribution plots.
plot(ip_model_nb)
#use model to predict validation data
# Default predict() returns the predicted class; the commented-out
# type = "prob" would return posterior probabilities instead.
val_nb1 <- predict(ip_model_nb, ip_validation) #, type = "prob"
val_nb2 <- cbind(ip_validation, val_nb1)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_nb <- table(val_nb2$IP_long_stay_fl, val_nb2$val_nb1))
##
## 0 1
## 0 69626 3896
## 1 12770 7853
#misclassification %
# 1 - accuracy: the off-diagonal share of the confusion matrix.
1- sum(diag(comp_nb))/sum(comp_nb)
## [1] 0.1770248
rm(val_nb2, comp_nb, val_nb1)
#train model
# NOTE(review): distinct() collapses duplicate predictor combinations, so
# each unique (flag, sex, age, DRG) pattern is weighted once regardless of
# how many claims share it — this changes the effective class balance.
# Presumably done to keep knn tractable on ~470k training rows; confirm
# that this weighting is intended.
ip_train_knn <- ip_train %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
ip_validation_knn <- ip_validation %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
# NOTE(review): class::knn() requires numeric input, so the factor columns
# are coerced to their underlying integer codes and distances are computed
# on those codes — e.g. DRG 119 ends up "closer" to DRG 123 than to DRG
# 284, a questionable metric for nominal codes. Verify this is acceptable.
ip_model_knn <- knn(
train = ip_train_knn[-1], # column 1 is the outcome flag, excluded here
test = ip_validation_knn[-1],
cl = ip_train_knn$IP_long_stay_fl,
k=300,
use.all = F
)
#use model to predict validation data
# Attach predictions to the de-duplicated validation patterns...
val_knn1 <- cbind(ip_validation_knn, ip_model_knn)
# ...then fan them back out to the full validation set. left_join() with no
# `by` joins on all shared columns; val_knn1 rows are distinct on those
# columns by construction, so the join cannot duplicate rows.
val_knn2 <- ip_validation %>% left_join(val_knn1)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_knn <- table(val_knn2$IP_long_stay_fl, val_knn2$ip_model_knn))
##
## 0 1
## 0 68166 5356
## 1 16512 4111
#Misclassification %
1- sum(diag(comp_knn))/sum(comp_knn)
## [1] 0.23228
rm(ip_validation_knn, val_knn1, val_knn2, ip_model_knn, comp_knn)
#train model
# Logistic regression on the same three categorical predictors.
# FIX(review): the original fit the model on ip_train_knn — the
# de-duplicated KNN design set — which discards pattern frequencies and
# skews the class balance. Train on the full training set instead.
ip_model_lr <- glm(IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD + IP_CLM_BASE_DRG_CD,
data = ip_train,
family = "binomial")
#use model to predict validation data
# predict() selects the model's columns by name, so pass the whole data
# frame rather than relying on column positions (the original used the
# fragile ip_validation[2:4]).
val_lr1a <- predict(ip_model_lr, ip_validation, type = "response")
# FIX(review): classify at the conventional 0.5 probability cutoff. The
# original thresholded at mean(val_lr1a) (~ the positive base rate), which
# inflated the positive class and drove the 52% misclassification reported
# below.
val_lr1b <- ifelse(val_lr1a > 0.5, 1, 0)
val_lr2 <- cbind(ip_validation, val_lr1b)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_lr <- table(val_lr2$IP_long_stay_fl, val_lr2$val_lr1b))
## NOTE(review): the output below was produced by the original code and
## will change once the chunk is re-run with the fixes above.
##
## 0 1
## 0 30819 42703
## 1 6648 13975
#Misclassification %
1 - sum(diag(comp_lr))/sum(comp_lr)
## [1] 0.524202
rm(comp_lr, val_lr1a, val_lr1b, val_lr2)
#train model
# Same de-duplication as in the KNN section: each unique predictor pattern
# appears once, which keeps the SVM fit tractable on this data but ignores
# pattern frequency — NOTE(review): confirm this weighting is intended.
ip_train_svm <- ip_train %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
ip_validation_svm <- ip_validation %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
# Linear-kernel C-classification SVM at the default cost (1). scale = FALSE
# skips centering/scaling — presumably because the formula interface expands
# the factors into indicator columns; confirm against e1071 docs.
ip_model_svm <- svm(IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD + IP_CLM_BASE_DRG_CD,
data = ip_train_svm,
type= "C-classification",
kernel = "linear",
scale = FALSE)
# Print the fitted model's kernel/cost summary and support-vector count.
ip_model_svm
##
## Call:
## svm(formula = IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD +
## IP_CLM_BASE_DRG_CD, data = ip_train_svm, type = "C-classification",
## kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 4942
#use model to predict validation data
# Score the de-duplicated validation patterns (column 1 is the outcome)...
val_svm1 <- predict(ip_model_svm, ip_validation_svm[-1])
val_svm1b <-cbind(ip_validation_svm, val_svm1)
# ...then fan predictions back out to the full validation set via a join on
# all shared columns, as in the KNN section.
val_svm2 <- ip_validation %>% left_join(val_svm1b)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_svm <- table(val_svm2$IP_long_stay_fl, val_svm2$val_svm1))
##
## 0 1
## 0 73180 342
## 1 19741 882
#Misclassification %
1- sum(diag(comp_svm))/sum(comp_svm)
## [1] 0.2133199
rm(ip_model_svm, ip_validation_svm, comp_svm, val_svm1,val_svm1b, val_svm2)
After the initial run with the training data, we see the following misclassification rates for each approach on the validation data: naive Bayes 17.7%, support vector machine 21.3%, k-nearest neighbors 23.2%, and logistic regression 52.4%.
As naive Bayes had the lowest misclassification rate, I used this approach for running the final testing dataset.
#use model to predict test data
# Final evaluation: score the held-out 20% test set with the winning naive
# Bayes model (lowest validation misclassification).
val_nb1 <- predict(ip_model_nb, ip_testing)
val_nb2 <- cbind(ip_testing, val_nb1)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_nb <- table(val_nb2$IP_long_stay_fl, val_nb2$val_nb1))
##
## 0 1
## 0 86930 4972
## 1 15795 9984
#Misclassification %
# ~17.6% on the test set, consistent with the 17.7% validation rate —
# no sign of overfitting to the validation split.
1- sum(diag(comp_nb))/sum(comp_nb)
## [1] 0.1764686
rm(val_nb2, comp_nb, val_nb1)
The final misclassification rate for the test data using naive Bayes was 17.65%. This was a short preview of several machine learning approaches. Next time I look forward to digging deeper into other metrics for evaluating each model's performance, as well as learning more about k-fold cross-validation.