title | author | format | editor |
---|---|---|---|
Code Walk-Through |
Nicholas Spies, MD |
gfm |
visual |
Science is in the throes of a reproducibility crisis, and machine learning literature is especially vulnerable. In order to reduce the barrier to reviewing, appraising, and reproducing this work, we provide the code, Docker container, and anonymized data necessary to replicate this approach in a public and transparent fashion.
renv::activate()
library(targets)
library(tidyverse)
library(tidymodels)
tar_source("R/SetGlobals.R")
tar_source("R/unsupervised_pipeline.R")
# Download Inputs from FigShare, uncomment this the first time you run the code.
#download.file("https://figshare.com/ndownloader/files/41764293", "bmp_data")
data <- arrow::read_feather("bmp_data") %>% drop_na(any_of(lab_strings_bmp)) %>% slice_sample(prop = 0.01)
Once we have loaded our data, we will split it into a training set, validation set (for creating the likelihood ratio grid), and testing set.
In general, we recommend grouping the data by patient prior to assigning each patient's lab data to one of the training, validation, or testing sets. However, the anonymized data set does not contain patient IDs, so we will proceed without this step.
#split <- group_initial_split(data, prop = 0.8, group = patient_id)
split <- initial_split(data, prop = 0.8)
# Assign a held-out test set for final performance assessment.
test <- testing(split)
#We'll split the training set further into a training and validation set for the likelihood grid
train_input <- training(split) %>% initial_split(prop = 0.8)
train <- training(train_input)
validation <- testing(train_input)
# Perform calculations by row.
simulateContaminationRow <- function(input, mix_ratio, fluid){
cols <- names(fluid)[which(names(fluid) %in% names(input))]
output <- input %>%
dplyr::mutate(across(all_of(cols), ~(1 - mix_ratio) * . + fluid[cur_column()] * mix_ratio)) %>%
select(all_of(cols))
# Round output to correct digits for reporting, and re-calculate anion gap.
output %>%
mutate(across(c("sodium", "chloride", "co2_totl", "bun", "glucose"), ~round(.))) %>%
mutate(across(c("potassium_plas", "calcium"), ~round(., 1))) %>%
mutate(creatinine = round(creatinine, 2)) %>%
mutate(anion_gap = sodium - chloride - co2_totl) %>%
mutate(mix_ratio = mix_ratio)
}
# Wrap row-wise simulation across ratios and fluids
makeFullContaminationTibble <- function(input, mix_ratios, fluid, fluid_name){
input_prep =
input %>% drop_na(any_of(lab_strings)) %>%
slice_sample(n = length(mix_ratios), replace = T) %>%
mutate(across(any_of(lab_strings), ~.x, .names = "{col}_real"), mix_ratio = mix_ratios, label = fluid_name)
tmp = simulateContaminationRow(input_prep, input_prep$mix_ratio, fluid)
input_prep[,names(tmp)] <- tmp
input_prep %>%
mutate(label = fluid_name)
}
ratios <- rep(seq(0.01, 1, by = 0.01), each = 10000)
contam_sim <- map2(fluids, fluid_names, ~makeFullContaminationTibble(train, mix_ratios = ratios, fluid = .x, fluid_name = .y)) %>% bind_rows()
contam_sim %>% slice_sample(n = 10)
We recommend tuning the many hyperparameters of the UMAP model and the bandwidth of the kernel density estimation to optimize the positive predictive value and alarm rate for your own data, but we will proceed with the only set of hyperparameters used in the paper for simplicity. We will add some pre-processing steps in the form of a tidymodels recipe, but this can also be done directly by calling uwot::umap().
library(tidymodels)
library(embed)
## Hyperparameters used in paper. Tune these to your liking. Anecdotally, n_neighbors and dens_scale provided the most substantial changes on the embeddings. However, the likelihood ratio approach is relatively robust to variation in the underlying embedding, provided you can simulate the error in question.
neighbors <- 50
metric <- "cosine"
min_dist <- 0
init <- "spca"
bandwidth <- 10
connectivity <- 10
set_op_mix_ratio <- 1
dens_scale <- 1
# Uncomment this to train your own models.
# results_umap_rec <-
# recipe(train %>% drop_na(any_of(!!lab_strings_bmp)) %>% bind_rows(contam_sim %>% slice_sample(prop = 0.25, by = c(label, mix_ratio)))) %>%
# update_role(everything(), new_role = "metadata") %>%
# update_role_requirements("metadata", bake = F) %>%
# update_role(any_of(c(!!!lab_strings)), new_role = "predictor") %>%
# step_normalize(all_predictors()) %>%
# embed::step_umap(any_of(c(!!!lab_strings_bmp)), keep_original_cols = F, metric = metric, num_comp = 2, neighbors = neighbors, min_dist = min_dist,
# options = list(bandwidth = bandwidth, local_connectivity = connectivity, init = init, set_op_mix_ratio = set_op_mix_ratio, dens_scale = dens_scale, fast_sgd = F, n_threads = 32, n_sgd_threads = 32, tmpdir = "/scratch1/fs1/zaydmanm/nspies/umap/", ret_model = T, verbose = F)) %>%
# prep()
# Because training these models is resource intensive, its worth saving them at each iteration.
#pins::pin_write(model_board, results_umap_rec %>% bundle::unbundle(), name = "BMP_UMAP_model", versioned = T)
#results_umap_rec
Once we've built our models, we can apply them to new data using the code below.
### Load the model. Uncomment the download command the first time you run this.
options(timeout = max(1200, getOption("timeout")))
#download.file("https://figshare.com/ndownloader/files/41764995", "umap_model")
model <- readRDS("umap_model") %>% bundle::unbundle()
### Apply Models to Simulated Contamination, Validation and Test Sets #####
validation_umap <- bake(model, validation %>% drop_na(any_of(lab_strings_bmp)))
test_umap <- bake(model, test %>% drop_na(any_of(lab_strings_bmp)))
sim_umap <- bake(model, contam_sim %>% drop_na(any_of(lab_strings_bmp)) %>% slice_sample(n = 10, by = c(label, mix_ratio)))
validation_umap
Now that we have our embedding coordinates for our validation set and simulated contamination, we can calculate likelihood ratios for a grid of coordinates using the following two functions.
makeEnrichmentGrid <- function(real_embed, sim){
library(KernSmooth)
kde_input <-
bind_rows(
D5 = sim %>% filter(mix_ratio > 0.05 & mix_ratio < 0.25 & grepl("D5|hyper", label)),
nonD5 = sim %>% filter(mix_ratio > 0.25 & !grepl("D5|hyper", label)))
uncontam_real <- real_embed %>% filter(!contam_comment) %>% slice_sample(n = nrow(kde_input), replace = T)
min_x <- min(c(kde_input$UMAP1, uncontam_real$UMAP1))
max_x <- max(c(kde_input$UMAP1, uncontam_real$UMAP1))
min_y <- min(c(kde_input$UMAP2, uncontam_real$UMAP2))
max_y <- max(c(kde_input$UMAP2, uncontam_real$UMAP2))
kde_uncontam <-
bkde2D(
uncontam_real %>% select(UMAP1, UMAP2) %>% as.matrix(),
bandwidth = 0.2,
gridsize = c(1000, 1000),
range.x = list(c(min_x, max_x), c(min_y, max_y)))
kde_contam <-
bkde2D(
kde_input %>% select(UMAP1, UMAP2) %>% as.matrix(),
bandwidth = 0.2,
gridsize = c(1000, 1000),
range.x = list(c(min_x, max_x), c(min_y, max_y)))
grid <- kde_contam
grid$fhat <- (kde_contam$fhat + 1e-20)/(kde_uncontam$fhat + 1e-20)
grid
}
grid <- makeEnrichmentGrid(validation_umap, sim_umap)
assignEnrichmentScores <- function(input, grid = likelihood_ratio_grid){
bin_x <- cut(input$UMAP1, breaks = grid$x1, right = F) %>% as.numeric
bin_y <- cut(input$UMAP2, breaks = grid$x2, right = F) %>% as.numeric
enrichment <- map2(bin_x, bin_y, ~grid[["fhat"]][.x, .y]) %>% unlist()
input$likelihood_ratio <- enrichment
input
}
sim_ratio <- assignEnrichmentScores(sim_umap, grid) %>% drop_na()
test_ratio <- assignEnrichmentScores(test_umap, grid) %>% drop_na()
Next, we will visualize the test set's embedding. To re-make figure 1, we can run the code below.
kde <-
bkde2D(
test_ratio %>% select(UMAP1, UMAP2) %>% as.matrix(),
bandwidth = 0.2,
gridsize = c(1000, 1000))
bin_x <- cut(test_ratio$UMAP1, breaks = kde$x1, right = F) %>% as.numeric
bin_y <- cut(test_ratio$UMAP2, breaks = kde$x2, right = F) %>% as.numeric
test_ratio$kde <- map2(bin_x, bin_y, ~kde[["fhat"]][.x, .y]) %>% unlist()
gg_test_ratio <-
ggplot(test_ratio, aes(UMAP1, UMAP2, color = kde)) +
geom_point() +
scico::scale_color_scico(palette = "oslo", breaks = c(0.05, 1), end = 0.95, labels = c("Lowest", "Highest")) +
guides(alpha = F, color = guide_colorbar(title = "Patient Density", direction = "horizontal", title.position = "top", ticks = F, title.theme = element_text(size = 8, face = "bold", hjust = 0.5), label.theme = element_text(size = 8))) +
theme(plot.background = element_blank(), legend.position = c(0.25, 0.25), axis.title.y.left = element_text(size = 12, face = "bold.italic"), axis.title.x.bottom = element_text(size = 12, face = "bold.italic"))
gg_test_ratio
To make figure 2, we normalize the results in the BMPs, then color our test set embedding by those normalized results. Each facet will represent one analyte.
umap_norm <-
bind_cols(test, test_umap %>% select(UMAP1, UMAP2)) %>% mutate(across(any_of(lab_strings_bmp), ~scale(.), .names = "{col}_norm")) %>%
dplyr::select(matches("UMAP|norm")) %>%
pivot_longer(cols = matches("norm"), names_to = "Analyte", values_to = "Scaled Result") %>%
mutate(Analyte = factor(str_replace_all(Analyte, "_norm", ""), levels = c("sodium", "chloride", "glucose", "calcium", "potassium_plas", "co2_totl", "creatinine", "bun", "anion_gap")))
gg_results_by_analyte <-
ggplot(umap_norm %>% group_by(Analyte), aes(UMAP1, UMAP2, color = `Scaled Result`)) +
geom_point(shape = ".") +
scico::scale_color_scico(palette = "berlin", limits = c(-3, 3), breaks = c(-2.75, 2.65), labels = c("Lowest", "Highest"), oob = scales::squish) + guides(color = guide_colorbar(title = "Result Value", ticks = F, title.theme = element_text(size = 12, face = "bold.italic"), label.theme = element_text(size = 10, face = "bold"), title.position = "top", title.hjust = 0.5)) + facet_wrap(~Analyte, ncol = 3, labeller = labeller(Analyte = analyte_labels)) + theme(panel.spacing = unit(0, "lines"), strip.text = element_text(size = 12, face = "bold.italic"), legend.direction = "horizontal", legend.position = "bottom", legend.key.size = unit(0.25, "in"), axis.text = element_blank(), axis.title = element_blank(), axis.line = element_blank())
ggsave(here::here("../../Figures/Unsupervised/Supplemental/UMAP_by_analyte_berlin.png"), gg_results_by_analyte, width = 8, height = 8, dpi = 600)
ggsave(here::here("../../Figures/Unsupervised/Supplemental/UMAP_by_analyte.pdf"), gg_results_by_analyte, width = 8, height = 8)
Next, we'll do the same with the embeddings for the simulated IV Fluid contamation.
x2 <- quantile(sim_ratio %>% filter(mix_ratio < 0.75) %>% pluck("UMAP1"), probs = c(0.001, 0.999))
y2 <- quantile(sim_ratio %>% filter(mix_ratio < 0.75) %>% pluck("UMAP2"), probs = c(0.001, 0.999))
cap2 <- quantile(sim_ratio %>% filter(mix_ratio < 0.75) %>% pluck("likelihood_ratio"), probs = c(0.95), na.rm = T)[[1]]
gg_sim_ratio <-
list(
ggplot() + xlim(x2[[1]] - 0.25, x2[[2]] + 0.2) + ylim(y2[[1]] - 0.25, y2[[2]] + 0.1) + geom_point(data = sim_ratio %>% group_by(label, mix_ratio) %>% slice_sample(prop = 0.1) %>% mutate(likelihood_ratio = ifelse(likelihood_ratio > cap2, cap2, likelihood_ratio)), aes(UMAP1, UMAP2, color = label), size = 4, shape = 20, stroke = 0, alpha = 0.25) + scale_color_manual(values = color_map_global) + guides(alpha = F, color = guide_legend(title = "Fluid", direction = "horizontal", title.position = "top", reverse = T, ticks = F, title.theme = element_text(size = 10, face = "bold", hjust = 0.5), nrow = 2, label.theme = element_text(size = 8, face = "bold"), override.aes = list(size = 8, alpha = 1, stroke = 1, shape = 21))) + ggtitle("Fluid Types") + theme(axis.title.y.left = element_text(size = 12, face = "bold.italic")),
ggplot() + xlim(x2[[1]] - 0.25, x2[[2]] + 0.2) + ylim(y2[[1]] - 0.25, y2[[2]] + 0.1) + geom_point(data = sim_ratio %>% group_by(label, mix_ratio) %>% slice_sample(prop = 0.1) %>% mutate(likelihood_ratio = ifelse(likelihood_ratio > cap2, cap2, likelihood_ratio)), aes(UMAP1, UMAP2, color = mix_ratio), size = 4, shape = 20, stroke = 0, alpha = 0.75) + scico::scale_color_scico(direction = -1, end = 0.95, palette = "davos", breaks = c(0, 1), limits = c(0, 1), labels = c(0, 1)) + ggtitle("Contamination Severity") + guides(color = guide_colorbar(title = "Mixture Ratio", direction = "horizontal", title.position = "top", ticks = F, title.theme = element_text(size = 10, face = "bold", hjust = 0.5), label.theme = element_text(size = 8, face = "bold", hjust = 0)), fill = F),
ggplot() + xlim(x2[[1]] - 0.25, x2[[2]] + 0.2) + ylim(y2[[1]] - 0.25, y2[[2]] + 0.1) + geom_point(data = sim_ratio %>% group_by(label, mix_ratio) %>% slice_sample(prop = 0.1) %>% mutate(likelihood_ratio = ifelse(likelihood_ratio > cap2, cap2, likelihood_ratio)), aes(UMAP1, UMAP2, color = likelihood_ratio), size = 4, shape = 20, stroke = 0, alpha = 0.75) + scico::scale_color_scico(direction = 1, begin = 0.05, palette = "bilbao", trans = scales::log1p_trans(), oob = scales::squish, limits = c(0, cap2*2), breaks = c(0, cap2*2), labels = c("Lowest", "Highest")) + ggtitle("Likelihood Ratio") + guides(fill = F, color = guide_colorbar(title = "Likelihood Ratio", direction = "horizontal", title.position = "top", ticks = F, title.theme = element_text(size = 10, face = "bold", hjust = 0.5), label.theme = element_text(size = 8, face = "bold", hjust = 0))))
gg_sim_overlay <- ggpubr::ggarrange(plotlist = gg_sim_ratio, nrow = 1, ncol = 3)
gg_sim_overlay