From 25a5c7ff2349a852ce8c0511e0b3e4e9e2dda669 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Sat, 5 Oct 2024 03:04:28 -0400 Subject: [PATCH] Flatten means and sds for error bars --- .../apps/r/calculate_interaction_zscores.R | 186 +++++++++++------- 1 file changed, 120 insertions(+), 66 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 7e8907b2..590e18b6 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -345,7 +345,7 @@ calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshol ungroup() # For interaction plot error bars - delta_means <- calculations %>% + delta_means_sds <- calculations %>% group_by(across(all_of(group_vars))) %>% summarise( mean_Delta_L = mean(Delta_L, na.rm = TRUE), @@ -356,12 +356,11 @@ calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshol sd_Delta_K = sd(Delta_K, na.rm = TRUE), sd_Delta_r = sd(Delta_r, na.rm = TRUE), sd_Delta_AUC = sd(Delta_AUC, na.rm = TRUE), - .groups = "drop" ) calculations <- calculations %>% - left_join(delta_means, by = group_vars) + left_join(delta_means_sds, by = group_vars) # Summary statistics for lm scores lm_means_sds <- calculations %>% @@ -436,13 +435,12 @@ calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshol NG = first(NG), DB = first(DB), SM = first(SM), - .groups = "drop" ) # Add overlap threshold categories based on Z-lm and Avg-Z scores interactions <- interactions %>% - filter(!is.na(Z_lm_L)) %>% + filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% mutate( Overlap = case_when( Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both", @@ -603,7 +601,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width plot <- ggplot(df, aes_mapping) + theme_publication(legend_position = config$legend_position) - # Add appropriate plot layer based on plot type + # Add appropriate plot layer or helper function based on plot type plot <- switch(config$plot_type, "scatter" = generate_scatter_plot(plot, config), "box" = generate_boxplot(plot, config), @@ -642,8 +640,19 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width # Add error bars if specified if (!is.null(config$error_bar) && config$error_bar) { - y_mean_col <- paste0("mean_", config$y_var) - y_sd_col <- paste0("sd_", config$y_var) + # Check if custom columns are provided for y_mean and y_sd, or use the defaults + y_mean_col <- if (!is.null(config$error_bar_params$y_mean_col)) { + config$error_bar_params$y_mean_col + } else { + paste0("mean_", config$y_var) + } + + y_sd_col <- if (!is.null(config$error_bar_params$y_sd_col)) { + config$error_bar_params$y_sd_col + } else { + paste0("sd_", config$y_var) + } + # If color_var is provided and no fixed error bar color is set, use aes() to map color dynamically if (!is.null(config$color_var) && is.null(config$error_bar_params$color)) { plot <- plot + geom_errorbar( @@ -651,7 +660,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width x = .data[[config$x_var]], ymin = .data[[y_mean_col]] - .data[[y_sd_col]], ymax = .data[[y_mean_col]] + .data[[y_sd_col]], - color = .data[[config$color_var]] # Dynamic color from the data + color = .data[[config$color_var]] ) ) } else { @@ -662,7 +671,17 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width ymin = .data[[y_mean_col]] - .data[[y_sd_col]], ymax = .data[[y_mean_col]] + .data[[y_sd_col]] ), - color = config$error_bar_params$color # Fixed color + color = config$error_bar_params$color + ) + } + + # Add the center point if the option is provided + if (!is.null(config$error_bar_params$center_point)) { + plot <- plot + geom_point(aes( + x = .data[[config$x_var]], + y = .data[[y_mean_col]]), + color = config$error_bar_params$color, + shape = 16 ) } } @@ -730,6 +749,10 @@ generate_scatter_plot <- function(plot, config) { size = 0.5 ) } + + if (!is.null(config$gray_points) && config$gray_points) { + plot <- plot + geom_point(shape = 3, color = "gray70", size = 1) + } # Add Smooth Line if specified if (!is.null(config$smooth) && config$smooth) { @@ -742,14 +765,17 @@ generate_scatter_plot <- function(plot, config) { slope = config$lm_line$slope, color = smooth_color ) - } else { - plot <- plot + - geom_smooth( - method = "lm", - se = FALSE, - color = smooth_color - ) } + + # For now I want to try and hardcode it + # else { + # plot <- plot + + # geom_smooth( + # method = "lm", + # se = FALSE, + # color = smooth_color + # ) + # } } # Add SD Bands if specified @@ -878,7 +904,7 @@ generate_plate_analysis_plot_configs <- function(variables, df_before = NULL, df return(list(plots = plot_configs)) } -generate_interaction_plot_configs <- function(df, type) { +generate_interaction_plot_configs <- function(df_summary, df_interaction, type) { # Define the y-limits for the plots limits_map <- list( @@ -893,7 +919,7 @@ generate_interaction_plot_configs <- function(df, type) { delta_plot_configs <- list() # Overall statistics plots - OrfRep <- first(df$OrfRep) # this should correspond to the reference strain + OrfRep <- first(df_summary$OrfRep) # this should correspond to the reference strain for (plot_type in c("scatter", "box")) { @@ -903,15 +929,15 @@ generate_interaction_plot_configs <- function(df, type) { # Common plot configuration plot_config <- list( - df = df, + df = df_summary, plot_type = plot_type, x_var = "conc_num_factor_factor", y_var = var, shape = 16, - x_label = unique(df$Drug)[1], + x_label = paste0("[", unique(df_summary$Drug)[1], "]"), coord_cartesian = y_limits, - x_breaks = unique(df$conc_num_factor_factor), - x_labels = as.character(unique(df$conc_num)) + x_breaks = unique(df_summary$conc_num_factor_factor), + x_labels = as.character(unique(df_summary$conc_num)) ) # Add specific configurations for scatter and box plots @@ -920,23 +946,25 @@ generate_interaction_plot_configs <- function(df, type) { plot_config$error_bar <- TRUE plot_config$error_bar_params <- list( color = "red", - center_point = TRUE + center_point = TRUE, + y_mean_col = paste0("mean_mean_", var), + y_sd_col = paste0("mean_sd_", var) ) plot_config$position <- "jitter" # Cannot figure out how to place these properly for discrete x-axis so let's be hacky annotations <- list( - list(x = 0.25, y = y_limits[1] + 0.1 * y_span, label = " NG:"), - list(x = 0.25, y = y_limits[1] + 0.05 * y_span, label = " DB:"), - list(x = 0.25, y = y_limits[1], label = " SM:") + list(x = 0.25, y = y_limits[1] + 0.08 * y_span, label = " NG =", size = 4), + list(x = 0.25, y = y_limits[1] + 0.04 * y_span, label = " DB =", size = 4), + list(x = 0.25, y = y_limits[1], label = " SM =", size = 4) ) - for (x_val in unique(df$conc_num_factor_factor)) { - current_df <- df %>% filter(.data[[plot_config$x_var]] == x_val) + for (x_val in unique(df_summary$conc_num_factor_factor)) { + current_df <- df_summary %>% filter(.data[[plot_config$x_var]] == x_val) annotations <- append(annotations, list( - list(x = x_val, y = y_limits[1] + 0.1 * y_span, label = first(current_df$NG, default = 0)), - list(x = x_val, y = y_limits[1] + 0.05 * y_span, label = first(current_df$DB, default = 0)), - list(x = x_val, y = y_limits[1], label = first(current_df$SM, default = 0)) + list(x = x_val, y = y_limits[1] + 0.08 * y_span, label = first(current_df$NG, default = 0), size = 4), + list(x = x_val, y = y_limits[1] + 0.04 * y_span, label = first(current_df$DB, default = 0), size = 4), + list(x = x_val, y = y_limits[1], label = first(current_df$SM, default = 0), size = 4) )) } @@ -945,7 +973,7 @@ generate_interaction_plot_configs <- function(df, type) { stats_plot_configs <- append(stats_plot_configs, list(plot_config)) } else if (plot_type == "box") { - plot_config$title <- sprintf("%s Boxplot RF for %s with SD", OrfRep, var) + plot_config$title <- sprintf("%s Box RF for %s with SD", OrfRep, var) plot_config$position <- "dodge" stats_boxplot_configs <- append(stats_boxplot_configs, list(plot_config)) @@ -967,7 +995,7 @@ generate_interaction_plot_configs <- function(df, type) { AUC = c(-6000, 6000) ) - grouped_data <- df %>% + grouped_data <- df_interaction %>% group_by(across(all_of(group_vars))) %>% group_split() @@ -1003,11 +1031,12 @@ generate_interaction_plot_configs <- function(df, type) { plot_config <- list( df = group_data, plot_type = "scatter", - x_var = "conc_num_factor_factor", + x_var = "conc_num_factor_factor", y_var = paste0("Delta_", var), - x_label = unique(group_data$Drug)[1], + x_label = paste0("[", unique(df_summary$Drug)[1], "]"), + shape = 16, title = paste(OrfRepTitle, Gene, sep = " "), - title_size = rel(1.2), + title_size = rel(1.3), coord_cartesian = y_limits, annotations = list( list(x = 1, y = y_limits[2] - 0.2 * y_span, label = paste("ZShift =", Z_Shift_value)), @@ -1021,16 +1050,17 @@ generate_interaction_plot_configs <- function(df, type) { error_bar_params = list( ymin = 0 - (2 * WT_sd_value), ymax = 0 + (2 * WT_sd_value), - color = "black" + color = "gray" ), - smooth = TRUE, x_breaks = unique(group_data$conc_num_factor_factor), x_labels = as.character(unique(group_data$conc_num)), ylim_vals = y_limits, y_filter = FALSE, + smooth = TRUE, lm_line = list( intercept = lm_intercept_value, - slope = lm_slope_value + slope = lm_slope_value, + color = "blue" ) ) delta_plot_configs <- append(delta_plot_configs, list(plot_config)) @@ -1082,7 +1112,6 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, overla fill_negative = "orange", alpha_positive = 0.3, alpha_negative = 0.3, - annotations = NULL, shape = 3, size = 0.1, y_label = y_label, @@ -1183,7 +1212,8 @@ generate_correlation_plot_configs <- function(df) { shape = 3, size = 0.5, color_var = "Overlap", - cyan_points = highlight_cyan # Include cyan points or not based on the loop + cyan_points = highlight_cyan, # include cyan points or not based on the loop + gray_points = TRUE ) plot_configs <- append(plot_configs, list(plot_config)) @@ -1226,10 +1256,10 @@ main <- function() { ) message("Calculating summary statistics before quality control") - df_stats <- calculate_summary_stats( + df_stats <- calculate_summary_stats( # formerly X_stats_ALL df = df, variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num"))$df_with_stats + group_vars = c("conc_num", "conc_num_factor_factor"))$df_with_stats frequency_delta_bg_plot_configs <- list( plots = list( @@ -1295,7 +1325,7 @@ main <- function() { ss <- calculate_summary_stats( df = df_na, variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num")) + group_vars = c("conc_num", "conc_num_factor_factor")) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats # formerly X_stats_ALL write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) @@ -1307,7 +1337,7 @@ main <- function() { df_no_zeros_stats <- calculate_summary_stats( df = df_no_zeros, variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num") + group_vars = c("conc_num", "conc_num_factor_factor") )$df_with_stats message("Filtering by 2SD of K") @@ -1318,14 +1348,15 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", - group_vars = c("conc_num"))$summary_stats + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", # formerly X_stats_BY_L_within_2SD_K + group_vars = c("conc_num", "conc_num_factor_factor"))$summary_stats write.csv(ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num")) + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", # formerly X_stats_BY_L_outside_2SD_K + group_vars = c("conc_num", "conc_num_factor_factor")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2SD_K.csv"), @@ -1435,10 +1466,10 @@ main <- function() { ) # Parallelize background and quality control plot generation - furrr::future_map(plot_configs, function(config) { - generate_and_save_plots(config$out_dir, config$filename, config$plot_configs, - page_width = config$page_width, page_height = config$page_height) - }, .options = furrr_options(seed = TRUE)) + # furrr::future_map(plot_configs, function(config) { + # generate_and_save_plots(config$out_dir, config$filename, config$plot_configs, + # page_width = config$page_width, page_height = config$page_height) + # }, .options = furrr_options(seed = TRUE)) # Loop over background strains # TODO currently only tested against one strain, if we want to do multiple strains we'll @@ -1460,7 +1491,7 @@ main <- function() { filter(!is.na(L)) message("Calculating background strain summary statistics") - ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), + ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), # formerly X_stats_BY group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor")) summary_stats_bg <- ss_bg$summary_stats df_bg_stats <- ss_bg$df_with_stats @@ -1473,7 +1504,7 @@ main <- function() { df_reference <- df_na_stats %>% # formerly X2_RF filter(OrfRep == strain) %>% filter(!is.na(L)) %>% - group_by(OrfRep, Drug, conc_num) %>% + group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1482,22 +1513,45 @@ main <- function() { ungroup() message("Calculating reference strain summary statistics") - df_reference_stats <- calculate_summary_stats( + df_reference_summary_stats <- calculate_summary_stats( # formerly X_stats_X2_RF df = df_reference, variables = c("L", "K", "r", "AUC"), - group_vars = c("OrfRep", "Gene", "Drug", "num", "conc_num", "conc_num_factor_factor") + group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor") + )$df_with_stats + + # Summarise statistics for error bars + df_reference_summary_stats <- df_reference_summary_stats %>% + group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>% + mutate( + mean_mean_L = first(mean_L), + mean_sd_L = first(sd_L), + mean_mean_K = first(mean_K), + mean_sd_K = first(sd_K), + mean_mean_r = first(mean_r), + mean_sd_r = first(sd_r), + mean_mean_AUC = first(mean_AUC), + mean_sd_AUC = first(sd_AUC), + .groups = "drop" + ) + + message("Calculating reference strain interaction summary statistics") # formerly X_stats_interaction + df_reference_interaction_stats <- calculate_summary_stats( + df = df_reference, + variables = c("L", "K", "r", "AUC"), + group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor") )$df_with_stats message("Calculating reference strain interaction scores") - results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "Drug", "num")) - df_calculations_reference <- results$calculations - df_interactions_reference <- results$interactions - df_interactions_reference_joined <- results$full_data - write.csv(df_calculations_reference, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) - write.csv(df_interactions_reference, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) + results <- calculate_interaction_scores(df_reference_interaction_stats, + df_bg_stats, group_vars = c("OrfRep", "Gene", "num", "Drug")) + df_reference_calculations <- results$calculations + df_reference_interactions <- results$interactions + df_reference_interactions_joined <- results$full_data + write.csv(df_reference_calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) + write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) message("Generating reference interaction plots") - reference_plot_configs <- generate_interaction_plot_configs(df_interactions_reference_joined, "reference") + reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference") generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16) message("Setting missing deletion values to the highest theoretical value at each drug conc for L") @@ -1512,7 +1566,7 @@ main <- function() { L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() - message("Calculating deletion strain(s) summary statistics") + message("Calculating deletion strain(s) interaction summary statistics") df_deletion_stats <- calculate_summary_stats( df = df_deletion, variables = c("L", "K", "r", "AUC"), @@ -1528,7 +1582,7 @@ main <- function() { write.csv(df_interactions, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE) message("Generating deletion interaction plots") - deletion_plot_configs <- generate_interaction_plot_configs(df_interactions_joined, "deletion") + deletion_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_interactions_joined, "deletion") generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16) message("Writing enhancer/suppressor csv files")