I wanted to explore a few supervised machine learning classification models at a superficial level to get a feel for how each works. Below I have used naive Bayes, k-nearest neighbors, logistic regression, and support vector machines to run the same prediction of which patients will have long inpatient stays of 8 days or more.
I used the Basic Stand Alone (BSA) Inpatient Public Use File (PUF) containing information about 2008 CMS Medicare inpatient claims, available from the CMS website.
The full data dictionary is also available from CMS. Below are the variable definitions.
BENE_SEX_IDENT_CD This field indicates the sex of the beneficiary.
BENE_AGE_CAT_CD This categorical variable is based on the beneficiary’s age at end of the reference year (2008).
IP_CLM_BASE_DRG_CD This is a categorical variable. It represents diagnostic related groups (DRGs).
IP_CLM_ICD9_PRCDR_CD This code indicates the primary procedure (mainly surgical procedures) performed during the inpatient stay.
IP_CLM_DAYS_CD This categorical variable is based on the number of inpatient days (or length of stay) on a claim for all stays ending in 2008.
IP_DRG_QUINT_PMT_AVG This field contains the average Medicare total claim payment amount of the quintile for the payments (of a particular DRG) in the 100% Inpatient claims data for 2008.
IP_DRG_QUINT_PMT_CD This categorical field indicates the quintile value (or code) to which the actual Medicare payment amount on the claim belongs.
#load libraries
library(dplyr)
library(tidyr)
library(naivebayes)
library(caret)
library(class)
library(e1071)
#load data
# 2008 CMS BSA Inpatient Claims PUF; read.csv() assumes the file is in the
# working directory — TODO confirm the path when re-running.
ip_data <- read.csv("2008_BSA_Inpatient_Claims_PUF.csv")
#review data
# Structure check: ~588k claims, 8 columns. All predictors arrive as
# integer codes and are converted to factors further below.
str(ip_data)
## 'data.frame': 588415 obs. of 8 variables:
## $ IP_CLM_ID : chr "IP-000022CE4125DBE7" "IP-0000417F50942D90" "IP-0000865F5457AC0E" "IP-00009789630AF474" ...
## $ BENE_SEX_IDENT_CD : int 2 2 1 2 2 1 1 2 1 2 ...
## $ BENE_AGE_CAT_CD : int 4 5 1 2 2 1 3 2 1 3 ...
## $ IP_CLM_BASE_DRG_CD : int 3 199 119 128 236 123 64 284 284 83 ...
## $ IP_CLM_ICD9_PRCDR_CD: int 31 NA 54 NA 70 45 NA NA 94 37 ...
## $ IP_CLM_DAYS_CD : int 4 2 4 2 1 2 2 4 2 3 ...
## $ IP_DRG_QUINT_PMT_AVG: int 86240 3447 34878 3007 3352 2690 5234 2713 9143 23354 ...
## $ IP_DRG_QUINT_PMT_CD : int 4 2 5 2 2 1 3 2 5 5 ...
# Eyeball the first ten rows. IP_CLM_ICD9_PRCDR_CD is NA on some claims —
# presumably no procedure was recorded; verify against the data dictionary.
head(ip_data, 10)
## IP_CLM_ID BENE_SEX_IDENT_CD BENE_AGE_CAT_CD IP_CLM_BASE_DRG_CD
## 1 IP-000022CE4125DBE7 2 4 3
## 2 IP-0000417F50942D90 2 5 199
## 3 IP-0000865F5457AC0E 1 1 119
## 4 IP-00009789630AF474 2 2 128
## 5 IP-0000C9D229B79D36 2 2 236
## 6 IP-0000EDBB82D78532 1 1 123
## 7 IP-0000F4ADB85FBB06 1 3 64
## 8 IP-0000FC306F8D4062 2 2 284
## 9 IP-00010C8D321FA2A6 1 1 284
## 10 IP-0001608148125BD5 2 3 83
## IP_CLM_ICD9_PRCDR_CD IP_CLM_DAYS_CD IP_DRG_QUINT_PMT_AVG IP_DRG_QUINT_PMT_CD
## 1 31 4 86240 4
## 2 NA 2 3447 2
## 3 54 4 34878 5
## 4 NA 2 3007 2
## 5 70 1 3352 2
## 6 45 2 2690 1
## 7 NA 2 5234 3
## 8 NA 4 2713 2
## 9 94 2 9143 5
## 10 37 3 23354 5
#build new variable to flag long in-patient stays
# IP_CLM_DAYS_CD == 4 corresponds to stays of 8+ days, so the flag is 1 for
# code 4 and 0 for codes 1-3. A single total comparison replaces the original
# two-branch case_when (< 4 / == 4), which would silently produce NA for any
# unexpected code value; on codes 1-4 the result is identical.
ip_data <- ip_data %>%
  mutate(IP_long_stay_fl = if_else(IP_CLM_DAYS_CD == 4, 1, 0))
# Sanity check: codes 1-3 all map to 0, code 4 maps to 1.
table(ip_data$IP_CLM_DAYS_CD, ip_data$IP_long_stay_fl)
##
## 0 1
## 1 76025 0
## 2 261419 0
## 3 122073 0
## 4 0 128898
# Exploratory counts of the categorical predictors.
# Sex: two codes, reasonably balanced.
ip_data %>% group_by(BENE_SEX_IDENT_CD) %>% count()
## # A tibble: 2 × 2
## # Groups: BENE_SEX_IDENT_CD [2]
## BENE_SEX_IDENT_CD n
## <int> <int>
## 1 1 258217
## 2 2 330198
# Age category: six codes, all well populated.
ip_data %>% group_by(BENE_AGE_CAT_CD) %>% count()
## # A tibble: 6 × 2
## # Groups: BENE_AGE_CAT_CD [6]
## BENE_AGE_CAT_CD n
## <int> <int>
## 1 1 116080
## 2 2 77597
## 3 3 86205
## 4 4 91487
## 5 5 94759
## 6 6 122287
# 311 distinct base DRG codes — a high-cardinality factor, which is worth
# keeping in mind for the model fits below.
ip_data %>% distinct(IP_CLM_BASE_DRG_CD) %>% count()
## n
## 1 311
#set categorical variables as factors
# Predictors and outcome are integer codes; convert to factors so the models
# treat them as categorical rather than ordinal/numeric.
ip_data$BENE_SEX_IDENT_CD <- as.factor(ip_data$BENE_SEX_IDENT_CD)
ip_data$BENE_AGE_CAT_CD <- as.factor(ip_data$BENE_AGE_CAT_CD)
ip_data$IP_CLM_BASE_DRG_CD <- as.factor(ip_data$IP_CLM_BASE_DRG_CD)
ip_data$IP_long_stay_fl <- as.factor(ip_data$IP_long_stay_fl)
#set seed
# Fix the RNG so the partitions below are reproducible.
set.seed(12345)
#set test data and train data for model
# 80/20 split stratified on IP_CLM_DAYS_CD; since IP_long_stay_fl is a
# deterministic recode of IP_CLM_DAYS_CD, this also stratifies the outcome.
ip_intrain<-createDataPartition(ip_data$IP_CLM_DAYS_CD,p=0.8,list=FALSE)
ip_training<-ip_data[ip_intrain,]
ip_testing<-ip_data[-ip_intrain,]
#divide up the training data into a main training set and a validation set
# Second 80/20 split: net result is roughly 64% train / 16% validation /
# 20% test of the full data.
ip_intrain2<-createDataPartition(y=ip_training$IP_CLM_DAYS_CD,p=0.8,list=FALSE)
ip_train<-ip_training[ip_intrain2,]
ip_validation<-ip_training[-ip_intrain2,]
# Drop intermediates that are no longer needed.
rm(ip_training, ip_intrain, ip_intrain2)
#train model
# Naive Bayes with Laplace smoothing (laplace = 1) so factor levels unseen
# in training do not produce zero conditional probabilities.
ip_model_nb <- naive_bayes(IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD + IP_CLM_BASE_DRG_CD, data = ip_train, laplace = 1)
# Per-predictor conditional distribution plots.
plot(ip_model_nb)
#use model to predict validation data
# Default predict() returns the predicted class; the commented-out
# type = "prob" would return posterior probabilities instead.
val_nb1 <- predict(ip_model_nb, ip_validation) #, type = "prob"
val_nb2 <- cbind(ip_validation, val_nb1)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_nb <- table(val_nb2$IP_long_stay_fl, val_nb2$val_nb1))
##
## 0 1
## 0 69626 3896
## 1 12770 7853
#misclassification %
# 1 - accuracy: the off-diagonal share of the confusion matrix.
1- sum(diag(comp_nb))/sum(comp_nb)
## [1] 0.1770248
rm(val_nb2, comp_nb, val_nb1)
#train model
# NOTE(review): distinct() collapses duplicate predictor combinations, so
# each unique (flag, sex, age, DRG) pattern is weighted once regardless of
# how many claims share it — this changes the effective class balance.
# Presumably done to keep knn tractable on ~470k training rows; confirm
# that this weighting is intended.
ip_train_knn <- ip_train %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
ip_validation_knn <- ip_validation %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
# NOTE(review): class::knn() requires numeric input, so the factor columns
# are coerced to their underlying integer codes and distances are computed
# on those codes — e.g. DRG 119 ends up "closer" to DRG 123 than to DRG
# 284, a questionable metric for nominal codes. Verify this is acceptable.
ip_model_knn <- knn(
train = ip_train_knn[-1], # column 1 is the outcome flag, excluded here
test = ip_validation_knn[-1],
cl = ip_train_knn$IP_long_stay_fl,
k=300,
use.all = F
)
#use model to predict validation data
# Attach predictions to the de-duplicated validation patterns...
val_knn1 <- cbind(ip_validation_knn, ip_model_knn)
# ...then fan them back out to the full validation set. left_join() with no
# `by` joins on all shared columns; val_knn1 rows are distinct on those
# columns by construction, so the join cannot duplicate rows.
val_knn2 <- ip_validation %>% left_join(val_knn1)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_knn <- table(val_knn2$IP_long_stay_fl, val_knn2$ip_model_knn))
##
## 0 1
## 0 68166 5356
## 1 16512 4111
#Misclassification %
1- sum(diag(comp_knn))/sum(comp_knn)
## [1] 0.23228
rm(ip_validation_knn, val_knn1, val_knn2, ip_model_knn, comp_knn)
#train model
# Logistic regression on the same three categorical predictors.
# FIX(review): the original fit the model on ip_train_knn — the
# de-duplicated KNN design set — which discards pattern frequencies and
# skews the class balance. Train on the full training set instead.
ip_model_lr <- glm(IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD + IP_CLM_BASE_DRG_CD,
data = ip_train,
family = "binomial")
#use model to predict validation data
# predict() selects the model's columns by name, so pass the whole data
# frame rather than relying on column positions (the original used the
# fragile ip_validation[2:4]).
val_lr1a <- predict(ip_model_lr, ip_validation, type = "response")
# FIX(review): classify at the conventional 0.5 probability cutoff. The
# original thresholded at mean(val_lr1a) (~ the positive base rate), which
# inflated the positive class and drove the 52% misclassification reported
# below.
val_lr1b <- ifelse(val_lr1a > 0.5, 1, 0)
val_lr2 <- cbind(ip_validation, val_lr1b)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_lr <- table(val_lr2$IP_long_stay_fl, val_lr2$val_lr1b))
## NOTE(review): the output below was produced by the original code and
## will change once the chunk is re-run with the fixes above.
##
## 0 1
## 0 30819 42703
## 1 6648 13975
#Misclassification %
1 - sum(diag(comp_lr))/sum(comp_lr)
## [1] 0.524202
rm(comp_lr, val_lr1a, val_lr1b, val_lr2)
#train model
# Same de-duplication as in the KNN section: each unique predictor pattern
# appears once, which keeps the SVM fit tractable on this data but ignores
# pattern frequency — NOTE(review): confirm this weighting is intended.
ip_train_svm <- ip_train %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
ip_validation_svm <- ip_validation %>% select(IP_long_stay_fl, BENE_SEX_IDENT_CD, BENE_AGE_CAT_CD,IP_CLM_BASE_DRG_CD) %>% distinct()
# Linear-kernel C-classification SVM at the default cost (1). scale = FALSE
# skips centering/scaling — presumably because the formula interface expands
# the factors into indicator columns; confirm against e1071 docs.
ip_model_svm <- svm(IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD + IP_CLM_BASE_DRG_CD,
data = ip_train_svm,
type= "C-classification",
kernel = "linear",
scale = FALSE)
# Print the fitted model's kernel/cost summary and support-vector count.
ip_model_svm
##
## Call:
## svm(formula = IP_long_stay_fl ~ BENE_SEX_IDENT_CD + BENE_AGE_CAT_CD +
## IP_CLM_BASE_DRG_CD, data = ip_train_svm, type = "C-classification",
## kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 4942
#use model to predict validation data
# Score the de-duplicated validation patterns (column 1 is the outcome)...
val_svm1 <- predict(ip_model_svm, ip_validation_svm[-1])
val_svm1b <-cbind(ip_validation_svm, val_svm1)
# ...then fan predictions back out to the full validation set via a join on
# all shared columns, as in the KNN section.
val_svm2 <- ip_validation %>% left_join(val_svm1b)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_svm <- table(val_svm2$IP_long_stay_fl, val_svm2$val_svm1))
##
## 0 1
## 0 73180 342
## 1 19741 882
#Misclassification %
1- sum(diag(comp_svm))/sum(comp_svm)
## [1] 0.2133199
rm(ip_model_svm, ip_validation_svm, comp_svm, val_svm1,val_svm1b, val_svm2)
After the initial run with the training data, we see the following misclassification rates for each approach on the validation data: naive Bayes 17.7%, support vector machine 21.3%, k-nearest neighbors 23.2%, and logistic regression 52.4%.
As naive Bayes had the lowest misclassification rate, I used this approach for running the final testing dataset.
#use model to predict test data
# Final evaluation: score the held-out 20% test set with the winning naive
# Bayes model (lowest validation misclassification).
val_nb1 <- predict(ip_model_nb, ip_testing)
val_nb2 <- cbind(ip_testing, val_nb1)
#comparison of categorization
# Confusion matrix: rows = actual flag, columns = predicted flag.
(comp_nb <- table(val_nb2$IP_long_stay_fl, val_nb2$val_nb1))
##
## 0 1
## 0 86930 4972
## 1 15795 9984
#Misclassification %
# ~17.6% on the test set, consistent with the 17.7% validation rate —
# no sign of overfitting to the validation split.
1- sum(diag(comp_nb))/sum(comp_nb)
## [1] 0.1764686
rm(val_nb2, comp_nb, val_nb1)
The final misclassification rate for the test data using naive Bayes was 17.65%. This was a short preview of several machine learning approaches. Next time I look forward to digging deeper into other metrics for evaluating each model's performance, as well as learning more about k-fold cross-validation.