title	author	format	editor
Code Walk-Through	Nicholas Spies, MD	gfm	visual

Code Walk-Through

Science is in the throes of a reproducibility crisis, and machine learning literature is especially vulnerable. In order to reduce the barrier to reviewing, appraising, and reproducing this work, we provide the code, Docker container, and anonymized data necessary to replicate this approach in a public and transparent fashion.

Preparation


renv::activate()
library(targets)
library(tidyverse)
library(tidymodels)

tar_source("R/SetGlobals.R")
tar_source("R/unsupervised_pipeline.R")

# Download Inputs from FigShare, uncomment this the first time you run the code. 
#download.file("https://figshare.com/ndownloader/files/41764293", "bmp_data")

data <- arrow::read_feather("bmp_data") %>% drop_na(any_of(lab_strings_bmp)) %>% slice_sample(prop = 0.01)

Once we have loaded our data, we will split it into a training set, validation set (for creating the likelihood ratio grid), and testing set.

In general, we recommend grouping the data by patient prior to assigning each patient's lab data to one of the training, validation, or testing sets. However, the anonymized data set does not contain patient IDs, so we will proceed without this step.


#split <- group_initial_split(data, prop = 0.8, group = patient_id)
split <- initial_split(data, prop = 0.8)

# Assign a held-out test set for final performance assessment.
test <- testing(split)

#We'll split the training set further into a training and validation set for the likelihood grid
train_input <- training(split) %>% initial_split(prop = 0.8)
train <- training(train_input)
validation <- testing(train_input)

Simulating IV Fluid Contamination


# Perform calculations by row.
simulateContaminationRow <- function(input, mix_ratio, fluid){
  
  cols <- names(fluid)[which(names(fluid) %in% names(input))]
  
  output <- input %>%
    dplyr::mutate(across(all_of(cols), ~(1 - mix_ratio) * . + fluid[cur_column()] * mix_ratio)) %>%
    select(all_of(cols))
  
  # Round output to correct digits for reporting, and re-calculate anion gap.
  output %>% 
    mutate(across(c("sodium", "chloride", "co2_totl", "bun", "glucose"), ~round(.))) %>%
    mutate(across(c("potassium_plas", "calcium"), ~round(., 1))) %>%
    mutate(creatinine = round(creatinine, 2)) %>% 
    mutate(anion_gap = sodium - chloride - co2_totl) %>%
    mutate(mix_ratio = mix_ratio)
  
}

# Wrap row-wise simulation across ratios and fluids
makeFullContaminationTibble <- function(input, mix_ratios, fluid, fluid_name){
  
  input_prep = 
    input %>% drop_na(any_of(lab_strings)) %>%
      slice_sample(n = length(mix_ratios), replace = T) %>% 
      mutate(across(any_of(lab_strings), ~.x, .names = "{col}_real"), mix_ratio = mix_ratios, label = fluid_name)
  
  tmp = simulateContaminationRow(input_prep, input_prep$mix_ratio, fluid)
  
  input_prep[,names(tmp)] <- tmp
  
  input_prep %>% 
    mutate(label = fluid_name)
  
}

ratios <- rep(seq(0.01, 1, by = 0.01), each = 10000)
contam_sim <- map2(fluids, fluid_names, ~makeFullContaminationTibble(train, mix_ratios = ratios, fluid = .x, fluid_name = .y)) %>% bind_rows()

contam_sim %>% slice_sample(n = 10)

Building the Unsupervised Models

We recommend tuning the many hyperparameters of the UMAP model and the bandwidth of the kernel density estimation to optimize the positive predictive value and alarm rate for your own data, but we will proceed with the only set of hyperparameters used in the paper for simplicity. We will add some pre-processing steps in the form of a tidymodels recipe, but this can also be done directly by calling uwot::umap().


library(tidymodels)
library(embed)

## Hyperparameters used in paper. Tune these to your liking. Anecdotally, n_neighbors and dens_scale provided the most substantial changes on the embeddings. However, the likelihood ratio approach is relatively robust to variation in the underlying embedding, provided you can simulate the error in question.
neighbors <- 50
metric <- "cosine"
min_dist <- 0
init <- "spca"
bandwidth <- 10
connectivity <- 10
set_op_mix_ratio <- 1
dens_scale <- 1

# Uncomment this to train your own models.
  # results_umap_rec <-     
  #   recipe(train %>% drop_na(any_of(!!lab_strings_bmp)) %>% bind_rows(contam_sim %>% slice_sample(prop = 0.25, by = c(label, mix_ratio)))) %>%
  #     update_role(everything(), new_role = "metadata") %>%
  #     update_role_requirements("metadata", bake = F) %>%
  #     update_role(any_of(c(!!!lab_strings)), new_role = "predictor") %>%
  #     step_normalize(all_predictors()) %>%
  #     embed::step_umap(any_of(c(!!!lab_strings_bmp)), keep_original_cols = F, metric = metric, num_comp = 2, neighbors = neighbors, min_dist = min_dist, 
  #                      options = list(bandwidth = bandwidth, local_connectivity = connectivity, init = init, set_op_mix_ratio = set_op_mix_ratio, dens_scale = dens_scale, fast_sgd = F, n_threads = 32, n_sgd_threads = 32, tmpdir = "/scratch1/fs1/zaydmanm/nspies/umap/", ret_model = T, verbose = F)) %>% 
  #   prep() 
 
  # Because training these models is resource intensive, its worth saving them at each iteration. 
  #pins::pin_write(model_board, results_umap_rec %>% bundle::unbundle(), name = "BMP_UMAP_model", versioned = T)
  
  
  #results_umap_rec

Applying the Models

Once we've built our models, we can apply them to new data using the code below.


### Load the model. Uncomment the download command the first time you run this. 
options(timeout = max(1200, getOption("timeout")))
#download.file("https://figshare.com/ndownloader/files/41764995", "umap_model")  
model <- readRDS("umap_model") %>% bundle::unbundle()

### Apply Models to Simulated Contamination, Validation and Test Sets #####

  validation_umap <- bake(model, validation %>% drop_na(any_of(lab_strings_bmp)))
  test_umap <- bake(model, test %>% drop_na(any_of(lab_strings_bmp)))
  sim_umap <- bake(model, contam_sim %>% drop_na(any_of(lab_strings_bmp)) %>% slice_sample(n = 10, by = c(label, mix_ratio)))
  
  validation_umap

Calculating Likelihood Ratios

Now that we have our embedding coordinates for our validation set and simulated contamination, we can calculate likelihood ratios for a grid of coordinates using the following two functions.

makeEnrichmentGrid <- function(real_embed, sim){
  
  library(KernSmooth)
  
  kde_input <- 
    bind_rows(
      D5 = sim %>% filter(mix_ratio > 0.05 & mix_ratio < 0.25 & grepl("D5|hyper", label)), 
      nonD5 = sim %>% filter(mix_ratio > 0.25 & !grepl("D5|hyper", label)))

  uncontam_real <- real_embed %>% filter(!contam_comment) %>% slice_sample(n = nrow(kde_input), replace = T)
  
  min_x <- min(c(kde_input$UMAP1, uncontam_real$UMAP1))
  max_x <- max(c(kde_input$UMAP1, uncontam_real$UMAP1))
  
  min_y <- min(c(kde_input$UMAP2, uncontam_real$UMAP2))
  max_y <- max(c(kde_input$UMAP2, uncontam_real$UMAP2))
  
  kde_uncontam <- 
    bkde2D(
      uncontam_real %>% select(UMAP1, UMAP2) %>% as.matrix(), 
      bandwidth = 0.2, 
      gridsize = c(1000, 1000),
      range.x = list(c(min_x, max_x), c(min_y, max_y)))
  
  kde_contam <- 
    bkde2D(
      kde_input %>% select(UMAP1, UMAP2) %>% as.matrix(), 
      bandwidth = 0.2, 
      gridsize = c(1000, 1000),
      range.x = list(c(min_x, max_x), c(min_y, max_y)))
  
  grid <- kde_contam
  grid$fhat <- (kde_contam$fhat + 1e-20)/(kde_uncontam$fhat + 1e-20)
  
  grid
  
}

grid <- makeEnrichmentGrid(validation_umap, sim_umap)

assignEnrichmentScores <- function(input, grid = likelihood_ratio_grid){
  
  bin_x <- cut(input$UMAP1, breaks = grid$x1, right = F) %>% as.numeric
  bin_y <- cut(input$UMAP2, breaks = grid$x2, right = F) %>% as.numeric
  
  enrichment <- map2(bin_x, bin_y, ~grid[["fhat"]][.x, .y]) %>% unlist()
  
  input$likelihood_ratio <- enrichment
  
  input
  
}

sim_ratio <- assignEnrichmentScores(sim_umap, grid) %>% drop_na()
test_ratio <- assignEnrichmentScores(test_umap, grid) %>% drop_na()

Plotting the Embeddings

Next, we will visualize the test set's embedding. To re-make figure 1, we can run the code below.


 kde <- 
   bkde2D(
     test_ratio %>% select(UMAP1, UMAP2) %>% as.matrix(), 
     bandwidth = 0.2, 
     gridsize = c(1000, 1000)) 
 
bin_x <- cut(test_ratio$UMAP1, breaks = kde$x1, right = F) %>% as.numeric
bin_y <- cut(test_ratio$UMAP2, breaks = kde$x2, right = F) %>% as.numeric
  
test_ratio$kde <- map2(bin_x, bin_y, ~kde[["fhat"]][.x, .y]) %>% unlist()

gg_test_ratio <- 
ggplot(test_ratio, aes(UMAP1, UMAP2, color = kde)) + 
geom_point() +
scico::scale_color_scico(palette = "oslo", breaks = c(0.05, 1), end = 0.95, labels = c("Lowest", "Highest")) +
guides(alpha = F, color = guide_colorbar(title = "Patient Density", direction = "horizontal", title.position = "top", ticks = F, title.theme = element_text(size = 8, face = "bold", hjust = 0.5), label.theme = element_text(size = 8))) +
theme(plot.background = element_blank(), legend.position = c(0.25, 0.25), axis.title.y.left = element_text(size = 12, face = "bold.italic"), axis.title.x.bottom = element_text(size = 12, face = "bold.italic"))

gg_test_ratio

Analyte-Level Contributions

To make figure 2, we normalize the results in the BMPs, then color our test set embedding by those normalized results. Each facet will represent one analyte.


umap_norm <-
    bind_cols(test, test_umap %>% select(UMAP1, UMAP2)) %>% mutate(across(any_of(lab_strings_bmp), ~scale(.), .names = "{col}_norm")) %>%
      dplyr::select(matches("UMAP|norm")) %>%
      pivot_longer(cols = matches("norm"), names_to = "Analyte", values_to = "Scaled Result") %>%
      mutate(Analyte = factor(str_replace_all(Analyte, "_norm", ""), levels = c("sodium", "chloride", "glucose", "calcium", "potassium_plas",  "co2_totl", "creatinine", "bun", "anion_gap")))
  
gg_results_by_analyte <-
    ggplot(umap_norm %>% group_by(Analyte), aes(UMAP1, UMAP2, color = `Scaled Result`)) +
      geom_point(shape = ".") + 
      scico::scale_color_scico(palette = "berlin", limits = c(-3, 3), breaks = c(-2.75, 2.65), labels = c("Lowest", "Highest"), oob = scales::squish) + guides(color = guide_colorbar(title = "Result Value", ticks = F, title.theme = element_text(size = 12, face = "bold.italic"), label.theme = element_text(size = 10, face = "bold"), title.position = "top", title.hjust = 0.5)) + facet_wrap(~Analyte, ncol = 3, labeller = labeller(Analyte = analyte_labels)) + theme(panel.spacing = unit(0, "lines"), strip.text = element_text(size = 12, face = "bold.italic"), legend.direction = "horizontal", legend.position = "bottom", legend.key.size = unit(0.25, "in"), axis.text = element_blank(), axis.title = element_blank(), axis.line = element_blank())
    ggsave(here::here("../../Figures/Unsupervised/Supplemental/UMAP_by_analyte_berlin.png"), gg_results_by_analyte, width = 8, height = 8, dpi = 600)
    ggsave(here::here("../../Figures/Unsupervised/Supplemental/UMAP_by_analyte.pdf"), gg_results_by_analyte, width = 8, height = 8)

Plotting Simulated Contamination

Next, we'll do the same with the embeddings for the simulated IV Fluid contamation.


  x2 <- quantile(sim_ratio %>% filter(mix_ratio < 0.75) %>% pluck("UMAP1"), probs = c(0.001, 0.999))
  y2 <- quantile(sim_ratio %>% filter(mix_ratio < 0.75) %>% pluck("UMAP2"), probs = c(0.001, 0.999))
  cap2 <- quantile(sim_ratio %>% filter(mix_ratio < 0.75) %>% pluck("likelihood_ratio"), probs = c(0.95), na.rm = T)[[1]]

 gg_sim_ratio <- 
    list(
      ggplot() + xlim(x2[[1]] - 0.25, x2[[2]] + 0.2) + ylim(y2[[1]] - 0.25, y2[[2]] + 0.1) + geom_point(data = sim_ratio %>% group_by(label, mix_ratio) %>% slice_sample(prop = 0.1) %>% mutate(likelihood_ratio = ifelse(likelihood_ratio > cap2, cap2, likelihood_ratio)), aes(UMAP1, UMAP2, color = label), size = 4, shape = 20, stroke = 0, alpha = 0.25) + scale_color_manual(values = color_map_global) + guides(alpha = F, color = guide_legend(title = "Fluid", direction = "horizontal", title.position = "top", reverse = T, ticks = F, title.theme = element_text(size = 10, face = "bold", hjust = 0.5), nrow = 2, label.theme = element_text(size = 8, face = "bold"), override.aes = list(size = 8, alpha = 1, stroke = 1, shape = 21))) + ggtitle("Fluid Types") + theme(axis.title.y.left = element_text(size = 12, face = "bold.italic")),
      ggplot() + xlim(x2[[1]] - 0.25, x2[[2]] + 0.2) + ylim(y2[[1]] - 0.25, y2[[2]] + 0.1) + geom_point(data = sim_ratio %>% group_by(label, mix_ratio) %>% slice_sample(prop = 0.1) %>% mutate(likelihood_ratio = ifelse(likelihood_ratio > cap2, cap2, likelihood_ratio)), aes(UMAP1, UMAP2, color = mix_ratio), size = 4, shape = 20, stroke = 0, alpha = 0.75) + scico::scale_color_scico(direction = -1, end = 0.95, palette = "davos", breaks = c(0, 1), limits = c(0, 1), labels = c(0, 1)) + ggtitle("Contamination Severity") + guides(color = guide_colorbar(title = "Mixture Ratio", direction = "horizontal", title.position = "top", ticks = F, title.theme = element_text(size = 10, face = "bold", hjust = 0.5), label.theme = element_text(size = 8, face = "bold", hjust = 0)), fill = F), 
        ggplot() + xlim(x2[[1]] - 0.25, x2[[2]] + 0.2) + ylim(y2[[1]] - 0.25, y2[[2]] + 0.1) + geom_point(data = sim_ratio %>% group_by(label, mix_ratio) %>% slice_sample(prop = 0.1) %>% mutate(likelihood_ratio = ifelse(likelihood_ratio > cap2, cap2, likelihood_ratio)), aes(UMAP1, UMAP2, color = likelihood_ratio), size = 4, shape = 20, stroke = 0, alpha = 0.75) + scico::scale_color_scico(direction = 1, begin = 0.05, palette = "bilbao", trans = scales::log1p_trans(), oob = scales::squish, limits = c(0, cap2*2), breaks = c(0, cap2*2), labels = c("Lowest", "Highest")) + ggtitle("Likelihood Ratio") + guides(fill = F, color = guide_colorbar(title = "Likelihood Ratio", direction = "horizontal", title.position = "top", ticks = F, title.theme = element_text(size = 10, face = "bold", hjust = 0.5), label.theme = element_text(size = 8, face = "bold", hjust = 0))))

    gg_sim_overlay <- ggpubr::ggarrange(plotlist = gg_sim_ratio, nrow = 1, ncol = 3)
    gg_sim_overlay

nspies13 / bmp_umap_paper Goto Github PK

bmp_umap_paper's Introduction

Code Walk-Through

Preparation

Simulating IV Fluid Contamination

Building the Unsupervised Models

Applying the Models

Calculating Likelihood Ratios

Plotting the Embeddings

Analyte-Level Contributions

Plotting Simulated Contamination

bmp_umap_paper's People

Contributors

Watchers

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent