From 53f4bb3ca78ed927c97572eb0d5469b8bb0326fb Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 11 Sep 2024 17:25:45 -0400 Subject: [PATCH] Break apart plot files to prevent OOM --- .../apps/r/calculate_interaction_zscores.R | 230 +++++++++++------- 1 file changed, 147 insertions(+), 83 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index eb27dcfd..5b999e9c 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -8,7 +8,7 @@ suppressMessages({ library(unix) }) -options(warn = 2, max.print = 1000) +options(warn = 2) options(width = 10000) # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes) @@ -154,7 +154,7 @@ update_gene_names <- function(df, sgd_gene_list) { } # Calculate summary statistics for all variables -calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { +calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) { # Summarize the variables within the grouped data summary_stats <- df %>% @@ -171,6 +171,8 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c ), .names = "{.fn}_{.col}") ) + print(summary_stats) + # Prevent .x and .y suffix issues by renaming columns df_cleaned <- df %>% select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join @@ -342,8 +344,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la plots <- lapply(plot_configs, function(config) { df <- config$df - print(head(df)) - # Check if y_var is NULL and adjust the aes mapping aes_mapping <- if (is.null(config$y_var)) { aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var))) @@ -424,7 +424,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE) } - generate_interaction_plot_configs <- function(df, variables) { configs <- list() @@ -587,33 +586,24 @@ main <- function() { dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) - variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across + summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by + print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor", + "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB") message("Loading and filtering data") df <- load_and_process_data(args$easy_results_file, sd = exp_sd) df <- update_gene_names(df, args$sgd_gene_list) + df <- as_tibble(df) # Filter rows that are above tolerance for quality control plots df_above_tolerance <- df %>% filter(DB == 1) - # Set L, r, K, and AUC to NA for rows that are above tolerance - df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .))) + # Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance + df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .))) # Remove rows with 0 values in L df_no_zeros <- df_na %>% filter(L > 0) - - # Additional filtering for non-finite values - # Filter and print non-finite rows, then filter and keep only finite rows - df_na_filtered <- df_na %>% - { - non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) - if (nrow(non_finite_rows) > 0) { - message("Removing non-finite rows:\n") - print(head(non_finite_rows, n = 10)) - } - filter(., if_all(c(L), is.finite)) - } # Set some constants max_conc <- max(df$conc_num_factor) @@ -621,18 +611,47 @@ main <- function() { k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 message("Calculating summary statistics before quality control") - ss <- calculate_summary_stats(df, variables, group_vars = group_vars) + ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars) + df_ss <- ss$summary_stats df_stats <- ss$df_with_stats + df_filtered_stats <- df_stats %>% + { + non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) + if (nrow(non_finite_rows) > 0) { + message("Removed the following non-finite rows:") + print(non_finite_rows %>% select(any_of(print_vars)), n = 200) + } + filter(., if_all(c(L), is.finite)) + } message("Calculating summary statistics after quality control") - ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars) + ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) + # Filter out non-finite rows for plotting + df_na_filtered_stats <- df_na_stats %>% + { + non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) + if (nrow(non_finite_rows) > 0) { + message("Removed the following non-finite rows:") + print(non_finite_rows %>% select(any_of(print_vars)), n = 200) + } + filter(., if_all(c(L), is.finite)) + } message("Calculating summary statistics after quality control excluding zero values") - ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars) + ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars) df_no_zeros_stats <- ss$df_with_stats + df_no_zeros_filtered_stats <- df_no_zeros_stats %>% + { + non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) + if (nrow(non_finite_rows) > 0) { + message("Removed the following non-finite rows:") + print(non_finite_rows %>% select(any_of(print_vars)), n = 200) + } + filter(., if_all(c(L), is.finite)) + } message("Filtering by 2SD of K") df_na_within_2sd_k <- df_na_stats %>% @@ -645,19 +664,24 @@ main <- function() { ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) l_within_2sd_k_ss <- ss$summary_stats df_na_l_within_2sd_k_stats <- ss$df_with_stats - write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) + write.csv(l_within_2sd_k_ss, + file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) l_outside_2sd_k_ss <- ss$summary_stats df_na_l_outside_2sd_k_stats <- ss$df_with_stats - write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) + write.csv(l_outside_2sd_k_ss, + file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) - # Plot configurations # Each plots list corresponds to a file message("Generating QC plot configurations") l_vs_k_plots <- list( - list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", + list( + df = df, + x_var = "L", + y_var = "K", + plot_type = "scatter", title = "Raw L vs K before quality control", color_var = "conc_num", legend_position = "right" @@ -665,8 +689,13 @@ main <- function() { ) above_threshold_plots <- list( - list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter", - title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"), + list( + df = df_above_tolerance, + x_var = "L", + y_var = "K", + plot_type = "scatter", + title = paste("Raw L vs K for strains above delta background threshold of", + df_above_tolerance$delta_bg_tolerance[[1]], "or above"), color_var = "conc_num", annotations = list( x = l_half_median, @@ -679,72 +708,102 @@ main <- function() { ) frequency_delta_bg_plots <- list( - list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density", + list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density", title = "Plate analysis by Drug Conc for delta background before quality control", color_var = "conc_num", x_label = "Delta Background", y_label = "Density", error_bar = FALSE, - legend_position = "right" - ), - list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar", + legend_position = "right"), + list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar", title = "Plate analysis by Drug Conc for delta background before quality control", color_var = "conc_num", x_label = "Delta Background", y_label = "Count", error_bar = FALSE, - legend_position = "right" - ) + legend_position = "right") ) plate_analysis_plots <- list() - for (plot_type in c("scatter", "box")) { - variables <- c("L", "K", "r", "AUC", "delta_bg") - for (var in variables) { - for (stage in c("before", "after")) { - if (stage == "before") { - df_plot <- df_stats - } else { - df_plot <- df_na_stats # TODO use df_na_filtered if necessary - } - - # Set error_bar = TRUE only for scatter plots - error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) - - # Create the plot configuration - plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type, - title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), - error_bar = error_bar, color_var = "conc_num") - - plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) + for (var in summary_vars) { + for (stage in c("before", "after")) { + if (stage == "before") { + df_plot <- df_filtered_stats + } else { + df_plot <- df_na_filtered_stats } + + config <- list( + df = df_plot, + x_var = "scan", + y_var = var, + plot_type = "scatter", + title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), + error_bar = TRUE, color_var = "conc_num") + + plate_analysis_plots <- append(plate_analysis_plots, list(config)) } } - plate_analysis_no_zero_plots <- list() - for (plot_type in c("scatter", "box")) { - variables <- c("L", "K", "r", "AUC", "delta_bg") - for (var in variables) { - - # Set error_bar = TRUE only for scatter plots - error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) - - # Create the plot configuration - plot_config <- list( - df = df_no_zeros_stats, - x_var = "scan", - y_var = var, - plot_type = plot_type, - title = paste("Plate analysis by Drug Conc for", var, "after quality control"), - error_bar = error_bar, - color_var = "conc_num" - ) - plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) + plate_analysis_boxplots <- list() + for (var in summary_vars) { + for (stage in c("before", "after")) { + if (stage == "before") { + df_plot <- df_filtered_stats + } else { + df_plot <- df_na_filtered_stats } + + config <- list( + df = df_plot, + x_var = "scan", + y_var = var, + plot_type = "box", + title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), + error_bar = FALSE, color_var = "conc_num") + + plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config)) } + } + + plate_analysis_no_zeros_plots <- list() + for (var in summary_vars) { + + config <- list( + df = df_no_zeros_filtered_stats, + x_var = "scan", + y_var = var, + plot_type = "scatter", + title = paste("Plate analysis by Drug Conc for", var, "after quality control"), + error_bar = TRUE, + color_var = "conc_num") + + plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config)) + } + + plate_analysis_no_zeros_boxplots <- list() + for (var in summary_vars) { + + + # Create the plot configuration + config <- list( + df = df_no_zeros_filtered_stats, + x_var = "scan", + y_var = var, + plot_type = "box", + title = paste("Plate analysis by Drug Conc for", var, "after quality control"), + error_bar = FALSE, + color_var = "conc_num" + ) + plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config)) + } l_outside_2sd_k_plots <- list( - list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter", + list( + df = df_na_l_outside_2sd_k_stats, + x_var = "L", + y_var = "K", + plot_type = "scatter", title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", color_var = "conc_num", legend_position = "right" @@ -752,20 +811,25 @@ main <- function() { ) delta_bg_outside_2sd_k_plots <- list( - list(df = df_na_l_outside_2sd_k_stats, x_var = "delta_bg", y_var = "K", plot_type = "scatter", + list( + df = df_na_l_outside_2sd_k_stats, + x_var = "delta_bg", + y_var = "K", + plot_type = "scatter", title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", color_var = "conc_num", legend_position = "right" ) ) - # Generate and save plots for each QC step message("Generating QC plots") generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots) generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots) generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) @@ -795,7 +859,7 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - ss_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars) summary_stats_bg <- ss_bg$summary_stats # df_bg_stats <- ss_bg$df_with_stats write.csv(summary_stats_bg, @@ -835,14 +899,14 @@ main <- function() { ungroup() # Calculate interactions - variables <- c("L", "K", "r", "AUC") + interaction_vars <- c("L", "K", "r", "AUC") message("Calculating interaction scores") # print("Reference strain:") # print(head(reference_strain)) - reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables) + reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars) # print("Deletion strains:") # print(head(deletion_strains)) - deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables) + deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars) zscores_calculations_reference <- reference_results$calculations zscores_interactions_reference <- reference_results$interactions @@ -856,8 +920,8 @@ main <- function() { write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) # Create interaction plots - reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables) - deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables) + reference_plot_configs <- generate_interaction_plot_configs(df_reference, interaction_vars) + deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, interaction_vars) generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) @@ -965,7 +1029,7 @@ main <- function() { generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm", plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2)) - correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, variables) + correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars) generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm", plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2)) })