瀏覽代碼

Break apart plot files to prevent OOM

Bryan Roessler 7 月之前
父節點
當前提交
53f4bb3ca7
共有 1 個文件被更改,包括 147 次插入83 次删除
  1. 147 83
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 147 - 83
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -8,7 +8,7 @@ suppressMessages({
   library(unix)
 })
 
-options(warn = 2, max.print = 1000)
+options(warn = 2)
 options(width = 10000)
 
 # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
@@ -154,7 +154,7 @@ update_gene_names <- function(df, sgd_gene_list) {
 }
 
 # Calculate summary statistics for all variables
-calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
+calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
 
   # Summarize the variables within the grouped data
   summary_stats <- df %>%
@@ -171,6 +171,8 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
       ), .names = "{.fn}_{.col}")
     )
 
+  print(summary_stats)
+
   # Prevent .x and .y suffix issues by renaming columns
   df_cleaned <- df %>%
     select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
@@ -342,8 +344,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
   plots <- lapply(plot_configs, function(config) {
     df <- config$df
 
-    print(head(df))
-
     # Check if y_var is NULL and adjust the aes mapping
     aes_mapping <- if (is.null(config$y_var)) {
       aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
@@ -424,7 +424,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
   saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
 }
 
-
 generate_interaction_plot_configs <- function(df, variables) {
   configs <- list()
 
@@ -587,33 +586,24 @@ main <- function() {
     dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
     dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
 
-    variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
+    summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
     group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
+    print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
+      "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
     
     message("Loading and filtering data")
     df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
     df <- update_gene_names(df, args$sgd_gene_list)
+    df <- as_tibble(df)
 
     # Filter rows that are above tolerance for quality control plots
     df_above_tolerance <- df %>% filter(DB == 1)
 
-    # Set L, r, K, and AUC to NA for rows that are above tolerance
-    df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
+    # Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance
+    df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .)))
 
     # Remove rows with 0 values in L
     df_no_zeros <- df_na %>% filter(L > 0)
-
-    # Additional filtering for non-finite values
-    # Filter and print non-finite rows, then filter and keep only finite rows
-    df_na_filtered <- df_na %>%
-      {
-        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
-        if (nrow(non_finite_rows) > 0) {
-          message("Removing non-finite rows:\n")
-          print(head(non_finite_rows, n = 10))
-        }
-        filter(., if_all(c(L), is.finite))
-      }
     
     # Set some constants
     max_conc <- max(df$conc_num_factor)
@@ -621,18 +611,47 @@ main <- function() {
     k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
 
     message("Calculating summary statistics before quality control")
-    ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
+    ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
+    df_ss <- ss$summary_stats
     df_stats <- ss$df_with_stats
+    df_filtered_stats <- df_stats %>%
+      {
+        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
+        if (nrow(non_finite_rows) > 0) {
+          message("Removed the following non-finite rows:")
+          print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
+        }
+        filter(., if_all(c(L), is.finite))
+      }
 
     message("Calculating summary statistics after quality control")
-    ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
+    ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
     df_na_ss <- ss$summary_stats
     df_na_stats <- ss$df_with_stats
     write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
+    # Filter out non-finite rows for plotting
+    df_na_filtered_stats <- df_na_stats %>%
+      {
+        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
+        if (nrow(non_finite_rows) > 0) {
+          message("Removed the following non-finite rows:")
+          print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
+        }
+        filter(., if_all(c(L), is.finite))
+      }
 
     message("Calculating summary statistics after quality control excluding zero values")
-    ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
+    ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
     df_no_zeros_stats <- ss$df_with_stats
+    df_no_zeros_filtered_stats <- df_no_zeros_stats %>%
+      {
+        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
+        if (nrow(non_finite_rows) > 0) {
+          message("Removed the following non-finite rows:")
+          print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
+        }
+        filter(., if_all(c(L), is.finite))
+      }
 
     message("Filtering by 2SD of K")
     df_na_within_2sd_k <- df_na_stats %>%
@@ -645,19 +664,24 @@ main <- function() {
     ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
     l_within_2sd_k_ss <- ss$summary_stats
     df_na_l_within_2sd_k_stats <- ss$df_with_stats
-    write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
+    write.csv(l_within_2sd_k_ss,
+      file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
     
     message("Calculating summary statistics for L outside 2SD of K")
     ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
     l_outside_2sd_k_ss <- ss$summary_stats
     df_na_l_outside_2sd_k_stats <- ss$df_with_stats
-    write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
+    write.csv(l_outside_2sd_k_ss,
+      file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
 
-    # Plot configurations
     # Each plots list corresponds to a file
     message("Generating QC plot configurations")
     l_vs_k_plots <- list(
-      list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
+      list(
+        df = df,
+        x_var = "L",
+        y_var = "K",
+        plot_type = "scatter",
         title = "Raw L vs K before quality control",
         color_var = "conc_num",
         legend_position = "right"
@@ -665,8 +689,13 @@ main <- function() {
     )
 
     above_threshold_plots <- list(
-      list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
-        title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
+      list(
+        df = df_above_tolerance,
+        x_var = "L",
+        y_var = "K",
+        plot_type = "scatter",
+        title = paste("Raw L vs K for strains above delta background threshold of",
+          df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
         color_var = "conc_num",
         annotations = list(
           x = l_half_median,
@@ -679,72 +708,102 @@ main <- function() {
     )
 
     frequency_delta_bg_plots <- list(
-      list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
+      list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
         title = "Plate analysis by Drug Conc for delta background before quality control",
         color_var = "conc_num",
         x_label = "Delta Background",
         y_label = "Density",
         error_bar = FALSE,
-        legend_position = "right"
-      ),
-      list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
+        legend_position = "right"),
+      list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
         title = "Plate analysis by Drug Conc for delta background before quality control",
         color_var = "conc_num",
         x_label = "Delta Background",
         y_label = "Count",
         error_bar = FALSE,
-        legend_position = "right"
-      )
+        legend_position = "right")
     )
 
     plate_analysis_plots <- list()
-    for (plot_type in c("scatter", "box")) {
-      variables <- c("L", "K", "r", "AUC", "delta_bg")
-      for (var in variables) {
-        for (stage in c("before", "after")) {
-          if (stage == "before") {
-            df_plot <- df_stats
-          } else {
-            df_plot <- df_na_stats # TODO use df_na_filtered if necessary
-          }
-          
-          # Set error_bar = TRUE only for scatter plots
-          error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
-          
-          # Create the plot configuration
-          plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
-            title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
-            error_bar = error_bar, color_var = "conc_num")
-
-          plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
+    for (var in summary_vars) {
+      for (stage in c("before", "after")) {
+        if (stage == "before") {
+          df_plot <- df_filtered_stats
+        } else {
+          df_plot <- df_na_filtered_stats
         }
+        
+        config <- list(
+          df = df_plot,
+          x_var = "scan",
+          y_var = var,
+          plot_type = "scatter",
+          title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
+          error_bar = TRUE, color_var = "conc_num")
+
+        plate_analysis_plots <- append(plate_analysis_plots, list(config))
       }
     }
 
-    plate_analysis_no_zero_plots <- list()
-      for (plot_type in c("scatter", "box")) {
-        variables <- c("L", "K", "r", "AUC", "delta_bg")
-        for (var in variables) {
-            
-          # Set error_bar = TRUE only for scatter plots
-          error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
-          
-          # Create the plot configuration
-          plot_config <- list(
-            df = df_no_zeros_stats,
-            x_var = "scan",
-            y_var = var,
-            plot_type = plot_type,
-            title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
-            error_bar = error_bar,
-            color_var = "conc_num"
-          )
-          plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
+    plate_analysis_boxplots <- list()
+    for (var in summary_vars) {
+      for (stage in c("before", "after")) {
+        if (stage == "before") {
+          df_plot <- df_filtered_stats
+        } else {
+          df_plot <- df_na_filtered_stats
         }
+        
+        config <- list(
+          df = df_plot,
+          x_var = "scan",
+          y_var = var,
+          plot_type = "box",
+          title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
+          error_bar = FALSE, color_var = "conc_num")
+
+        plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config))
       }
+    }
+
+    plate_analysis_no_zeros_plots <- list()
+    for (var in summary_vars) {
+
+      config <- list(
+        df = df_no_zeros_filtered_stats,
+        x_var = "scan",
+        y_var = var,
+        plot_type = "scatter",
+        title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
+        error_bar = TRUE,
+        color_var = "conc_num")
+
+      plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config))
+    }
+
+    plate_analysis_no_zeros_boxplots <- list()
+    for (var in summary_vars) {
+      
+      
+      # Create the plot configuration
+      config <- list(
+        df = df_no_zeros_filtered_stats,
+        x_var = "scan",
+        y_var = var,
+        plot_type = "box",
+        title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
+        error_bar = FALSE,
+        color_var = "conc_num"
+      )
+      plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config))
+    }
 
     l_outside_2sd_k_plots <- list(
-      list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
+      list(
+        df = df_na_l_outside_2sd_k_stats,
+        x_var = "L",
+        y_var = "K",
+        plot_type = "scatter",
         title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
         color_var = "conc_num",
         legend_position = "right"
@@ -752,20 +811,25 @@ main <- function() {
     )
 
     delta_bg_outside_2sd_k_plots <- list(
-      list(df = df_na_l_outside_2sd_k_stats, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
+      list(
+        df = df_na_l_outside_2sd_k_stats,
+        x_var = "delta_bg",
+        y_var = "K",
+        plot_type = "scatter",
         title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
         color_var = "conc_num",
         legend_position = "right"
       )
     )
 
-    # Generate and save plots for each QC step
     message("Generating QC plots")
     generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
     generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
     generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
     generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
+    generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
     generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
+    generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
     generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
     generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
 
@@ -795,7 +859,7 @@ main <- function() {
       
       # Recalculate summary statistics for the background strain
       message("Calculating summary statistics for background strain")
-      ss_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
+      ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
       summary_stats_bg <- ss_bg$summary_stats
       # df_bg_stats <- ss_bg$df_with_stats
       write.csv(summary_stats_bg,
@@ -835,14 +899,14 @@ main <- function() {
         ungroup()
 
       # Calculate interactions
-      variables <- c("L", "K", "r", "AUC")
+      interaction_vars <- c("L", "K", "r", "AUC")
       message("Calculating interaction scores")
       # print("Reference strain:")
       # print(head(reference_strain))
-      reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
+      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars)
       # print("Deletion strains:")
       # print(head(deletion_strains))
-      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
+      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars)
       
       zscores_calculations_reference <- reference_results$calculations
       zscores_interactions_reference <- reference_results$interactions
@@ -856,8 +920,8 @@ main <- function() {
       write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
 
       # Create interaction plots
-      reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
-      deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
+      reference_plot_configs <- generate_interaction_plot_configs(df_reference, interaction_vars)
+      deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, interaction_vars)
       generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
       generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
 
@@ -965,7 +1029,7 @@ main <- function() {
       generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm",
         plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
 
-      correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, variables)
+      correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
       generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
         plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2))
     })