From f435fa3f1133c3fc763d5ecae5c268984f8cdb7d Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 2 Oct 2024 19:11:30 -0400 Subject: [PATCH] Move max_conc into the dataframe to simplify args --- .../apps/r/calculate_interaction_zscores.R | 117 +++++++----------- 1 file changed, 45 insertions(+), 72 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index f928476c..75df759d 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -160,6 +160,11 @@ load_and_filter_data <- function(easy_results_file, sd = 3) { conc_num_factor_factor = as.factor(conc_num) ) + # Set the max concentration across the whole dataframe + max_conc <- max(df$conc_num_factor, na.rm = TRUE) + df <- df %>% + mutate(max_conc = max_conc) + return(df) } @@ -211,7 +216,7 @@ calculate_summary_stats <- function(df, variables, group_vars) { return(list(summary_stats = summary_stats, df_with_stats = df_joined)) } -calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, overlap_threshold = 2) { +calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_threshold = 2) { # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) @@ -294,16 +299,12 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove lm_means_sds <- calculations %>% group_by(across(all_of(group_vars))) %>% summarise( - mean_mean_L = mean(mean_L, na.rm = TRUE), mean_lm_L = mean(lm_Score_L, na.rm = TRUE), sd_lm_L = sd(lm_Score_L, na.rm = TRUE), - mean_mean_K = mean(mean_K, na.rm = TRUE), mean_lm_K = mean(lm_Score_K, na.rm = TRUE), sd_lm_K = sd(lm_Score_K, na.rm = TRUE), - mean_mean_r = mean(mean_r, na.rm = TRUE), mean_lm_r = mean(lm_Score_r, na.rm = TRUE), sd_lm_r = sd(lm_Score_r, na.rm = TRUE), - mean_mean_AUC = mean(mean_AUC, na.rm = TRUE), mean_lm_AUC = mean(lm_Score_AUC, na.rm = TRUE), sd_lm_AUC = sd(lm_Score_AUC, na.rm = TRUE) ) @@ -469,7 +470,12 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) { "red" } - y_mean_col <- paste0("mean_mean_", config$y_var) + y_mean_prefix <- if (!is.null(config$error_bar_params$y_mean_prefix)) { + config$error_bar_params$y_mean_prefix + } else { + "mean_" + } + y_mean_col <- paste0(y_mean_prefix, config$y_var) # Dynamically set y_sd_col based on the provided prefix in error_bar_params y_sd_prefix <- if (!is.null(config$error_bar_params$y_sd_prefix)) { @@ -482,7 +488,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) { if (!is.null(config$error_bar_params$center_point)) { plot <- plot + geom_point(aes( x = .data[[config$x_var]], - y = .data[[y_mean_col]]), + y = first(.data[[y_mean_col]])), color = error_bar_color, shape = 16) } @@ -495,14 +501,12 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) { color = error_bar_color) } else { plot <- plot + geom_errorbar(aes( - ymin = .data[[y_mean_col]] - .data[[y_sd_col]], - ymax = .data[[y_mean_col]] + .data[[y_sd_col]]), + ymin = first(.data[[y_mean_col]]) - first(.data[[y_sd_col]]), + ymax = first(.data[[y_mean_col]]) + first(.data[[y_sd_col]])), color = error_bar_color) } } - - # Convert ggplot to plotly for interactive version plotly_plot <- suppressWarnings(plotly::ggplotly(plot)) @@ -546,7 +550,7 @@ generate_scatter_plot <- function(plot, config) { size <- if (!is.null(config$size)) config$size else 1.5 position <- if (!is.null(config$position) && config$position == "jitter") { - position_jitter(width = 0.3, height = 0.1) + position_jitter(width = 0.4, height = 0.1) } else { "identity" } @@ -752,7 +756,7 @@ generate_interaction_plot_configs <- function(df, type) { for (var in names(limits_map)) { y_limits <- limits_map[[var]] - y_span <- y_limits[2] - y_limits[1] + y_span <- y_limits[2] - y_limits[1] # Common plot configuration plot_config <- list( @@ -773,15 +777,16 @@ generate_interaction_plot_configs <- function(df, type) { plot_config$error_bar <- TRUE plot_config$error_bar_params <- list( y_sd_prefix = "WT_sd_", + y_mean_prefix = "mean_", color = "red", center_point = TRUE ) plot_config$position <- "jitter" annotations <- list( - list(x = -0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="), # Slightly above y-min - list(x = -0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="), - list(x = -0.25, y = y_limits[1], label = "SM =") + list(x = 0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="), # Slightly above y-min + list(x = 0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="), + list(x = 0.25, y = y_limits[1], label = "SM =") ) # Loop over unique x values and add NG, DB, SM values at calculated y positions @@ -1067,70 +1072,35 @@ main <- function() { out_dir_qc <- file.path(exp_path, "zscores", "qc") dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) - - summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across - interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores message("Loading and filtering data for experiment: ", exp_name) df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>% update_gene_names(args$sgd_gene_list) %>% as_tibble() - # Filter rows above delta background tolerance - df_above_tolerance <- df %>% filter(DB == 1) - df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .))) # formerly X - df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero - - # Save some constants - max_conc <- max(df$conc_num_factor) - message("Calculating summary statistics before quality control") df_stats <- calculate_summary_stats( df = df, - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats - + message("Calculating summary statistics after quality control") + df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X ss <- calculate_summary_stats( df = df_na, - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) df_na_ss <- ss$summary_stats - df_na_stats <- ss$df_with_stats + df_na_stats <- ss$df_with_stats # formerly X_stats_ALL write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) # For plotting (ggplot warns on NAs) - df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(summary_vars), is.finite)) - - df_na_stats <- df_na_stats %>% - mutate( - WT_L = mean_L, - WT_K = mean_K, - WT_r = mean_r, - WT_AUC = mean_AUC, - WT_sd_L = sd_L, - WT_sd_K = sd_K, - WT_sd_r = sd_r, - WT_sd_AUC = sd_AUC - ) - - # Pull the background means and standard deviations from zero concentration for interactions - bg_stats <- df_na_stats %>% - filter(conc_num == 0) %>% - summarise( - mean_L = first(mean_L), - mean_K = first(mean_K), - mean_r = first(mean_r), - mean_AUC = first(mean_AUC), - sd_L = first(sd_L), - sd_K = first(sd_K), - sd_r = first(sd_r), - sd_AUC = first(sd_AUC) - ) + df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite)) message("Calculating summary statistics after quality control excluding zero values") + df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero df_no_zeros_stats <- calculate_summary_stats( df = df_no_zeros, - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor") )$df_with_stats @@ -1142,7 +1112,8 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats write.csv(ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) @@ -1153,7 +1124,7 @@ main <- function() { write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) - + # Each list of plots corresponds to a file l_vs_k_plot_configs <- list( plots = list( @@ -1200,6 +1171,7 @@ main <- function() { ) ) + df_above_tolerance <- df %>% filter(DB == 1) above_threshold_plot_configs <- list( plots = list( list( @@ -1226,26 +1198,26 @@ main <- function() { ) plate_analysis_plot_configs <- generate_plate_analysis_plot_configs( - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), df_before = df_stats, df_after = df_na_stats_filtered ) plate_analysis_boxplot_configs <- generate_plate_analysis_plot_configs( - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), df_before = df_stats, df_after = df_na_stats_filtered, plot_type = "box" ) plate_analysis_no_zeros_plot_configs <- generate_plate_analysis_plot_configs( - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), stages = c("after"), # Only after QC df_after = df_no_zeros_stats ) plate_analysis_no_zeros_boxplot_configs <- generate_plate_analysis_plot_configs( - variables = summary_vars, + variables = c("L", "K", "r", "AUC", "delta_bg"), stages = c("after"), # Only after QC df_after = df_no_zeros_stats, plot_type = "box" @@ -1329,7 +1301,6 @@ main <- function() { # generate_and_save_plots(config$out_dir, config$filename, config$plot_configs) # }, .options = furrr_options(seed = TRUE)) - # Process background strains bg_strains <- c("YDL227C") lapply(bg_strains, function(strain) { @@ -1349,15 +1320,17 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor")) + ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), + group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor")) summary_stats_bg <- ss_bg$summary_stats + ss_bg_stats <- ss_bg$df_with_stats write.csv(summary_stats_bg, file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")), row.names = FALSE) # Set the missing values to the highest theoretical value at each drug conc for L # Leave other values as 0 for the max/min - df_reference <- df_na_stats %>% # formerly X2_RF + df_reference <- df_bg_stats %>% # formerly X2_RF filter(OrfRep == strain) %>% filter(!is.na(L)) %>% group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% @@ -1384,10 +1357,10 @@ main <- function() { message("Calculating reference strain interaction scores") df_reference_stats <- calculate_summary_stats( df = df_reference, - variables = interaction_vars, + variables = c("L", "K", "r", "AUC"), group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") )$df_with_stats - reference_results <- calculate_interaction_scores(df_reference_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene", "num")) + reference_results <- calculate_interaction_scores(df_reference_stats, bg_stats, group_vars = c("OrfRep", "Gene", "num")) zscore_calculations_reference <- reference_results$calculations zscore_interactions_reference <- reference_results$interactions zscore_interactions_reference_joined <- reference_results$full_data @@ -1395,10 +1368,10 @@ main <- function() { message("Calculating deletion strain(s) interactions scores") df_deletion_stats <- calculate_summary_stats( df = df_deletion, - variables = interaction_vars, + variables = c("L", "K", "r", "AUC"), group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") )$df_with_stats - deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene")) + deletion_results <- calculate_interaction_scores(df_deletion_stats, bg_stats, group_vars = c("OrfRep", "Gene")) zscore_calculations <- deletion_results$calculations zscore_interactions <- deletion_results$interactions zscore_interactions_joined <- deletion_results$full_data