9 月之前 · 53f4bb3ca7
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -8,7 +8,7 @@ suppressMessages({
 
				   library(unix)
			
 
				 })
			
 
				 
			
 
				-options(warn = 2, max.print = 1000)
			
 
				+options(warn = 2)
			
 
				 options(width = 10000)
			
 
				 
			
 
				 # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
			
@@ -154,7 +154,7 @@ update_gene_names <- function(df, sgd_gene_list) {
 
				 }
			
 
				 
			
 
				 # Calculate summary statistics for all variables
			
 
				-calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
			
 
				+calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
			
 
				 
			
 
				   # Summarize the variables within the grouped data
			
 
				   summary_stats <- df %>%
			
@@ -171,6 +171,8 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
 
				       ), .names = "{.fn}_{.col}")
			
 
				     )
			
 
				 
			
 
				+  print(summary_stats)
			
 
				+
			
 
				   # Prevent .x and .y suffix issues by renaming columns
			
 
				   df_cleaned <- df %>%
			
 
				     select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
			
@@ -342,8 +344,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
 
				   plots <- lapply(plot_configs, function(config) {
			
 
				     df <- config$df
			
 
				 
			
 
				-    print(head(df))
			
 
				-
			
 
				     # Check if y_var is NULL and adjust the aes mapping
			
 
				     aes_mapping <- if (is.null(config$y_var)) {
			
 
				       aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
			
@@ -424,7 +424,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
 
				   saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
			
 
				 }
			
 
				 
			
 
				-
			
 
				 generate_interaction_plot_configs <- function(df, variables) {
			
 
				   configs <- list()
			
 
				 
			
@@ -587,33 +586,24 @@ main <- function() {
 
				     dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
			
 
				     dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
			
 
				 
			
 
				-    variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
			
 
				+    summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
			
 
				     group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
			
 
				+    print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
			
 
				+      "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
			
 
				     
			
 
				     message("Loading and filtering data")
			
 
				     df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
			
 
				     df <- update_gene_names(df, args$sgd_gene_list)
			
 
				+    df <- as_tibble(df)
			
 
				 
			
 
				     # Filter rows that are above tolerance for quality control plots
			
 
				     df_above_tolerance <- df %>% filter(DB == 1)
			
 
				 
			
 
				-    # Set L, r, K, and AUC to NA for rows that are above tolerance
			
 
				-    df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
			
 
				+    # Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance
			
 
				+    df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .)))
			
 
				 
			
 
				     # Remove rows with 0 values in L
			
 
				     df_no_zeros <- df_na %>% filter(L > 0)
			
 
				-
			
 
				-    # Additional filtering for non-finite values
			
 
				-    # Filter and print non-finite rows, then filter and keep only finite rows
			
 
				-    df_na_filtered <- df_na %>%
			
 
				-      {
			
 
				-        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
			
 
				-        if (nrow(non_finite_rows) > 0) {
			
 
				-          message("Removing non-finite rows:\n")
			
 
				-          print(head(non_finite_rows, n = 10))
			
 
				-        }
			
 
				-        filter(., if_all(c(L), is.finite))
			
 
				-      }
			
 
				     
			
 
				     # Set some constants
			
 
				     max_conc <- max(df$conc_num_factor)
			
@@ -621,18 +611,47 @@ main <- function() {
 
				     k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
			
 
				 
			
 
				     message("Calculating summary statistics before quality control")
			
 
				-    ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
			
 
				+    ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
			
 
				+    df_ss <- ss$summary_stats
			
 
				     df_stats <- ss$df_with_stats
			
 
				+    df_filtered_stats <- df_stats %>%
			
 
				+      {
			
 
				+        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
			
 
				+        if (nrow(non_finite_rows) > 0) {
			
 
				+          message("Removed the following non-finite rows:")
			
 
				+          print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
			
 
				+        }
			
 
				+        filter(., if_all(c(L), is.finite))
			
 
				+      }
			
 
				 
			
 
				     message("Calculating summary statistics after quality control")
			
 
				-    ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
			
 
				+    ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
			
 
				     df_na_ss <- ss$summary_stats
			
 
				     df_na_stats <- ss$df_with_stats
			
 
				     write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
			
 
				+    # Filter out non-finite rows for plotting
			
 
				+    df_na_filtered_stats <- df_na_stats %>%
			
 
				+      {
			
 
				+        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
			
 
				+        if (nrow(non_finite_rows) > 0) {
			
 
				+          message("Removed the following non-finite rows:")
			
 
				+          print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
			
 
				+        }
			
 
				+        filter(., if_all(c(L), is.finite))
			
 
				+      }
			
 
				 
			
 
				     message("Calculating summary statistics after quality control excluding zero values")
			
 
				-    ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
			
 
				+    ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
			
 
				     df_no_zeros_stats <- ss$df_with_stats
			
 
				+    df_no_zeros_filtered_stats <- df_no_zeros_stats %>%
			
 
				+      {
			
 
				+        non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
			
 
				+        if (nrow(non_finite_rows) > 0) {
			
 
				+          message("Removed the following non-finite rows:")
			
 
				+          print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
			
 
				+        }
			
 
				+        filter(., if_all(c(L), is.finite))
			
 
				+      }
			
 
				 
			
 
				     message("Filtering by 2SD of K")
			
 
				     df_na_within_2sd_k <- df_na_stats %>%
			
@@ -645,19 +664,24 @@ main <- function() {
 
				     ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
			
 
				     l_within_2sd_k_ss <- ss$summary_stats
			
 
				     df_na_l_within_2sd_k_stats <- ss$df_with_stats
			
 
				-    write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
			
 
				+    write.csv(l_within_2sd_k_ss,
			
 
				+      file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
			
 
				     
			
 
				     message("Calculating summary statistics for L outside 2SD of K")
			
 
				     ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
			
 
				     l_outside_2sd_k_ss <- ss$summary_stats
			
 
				     df_na_l_outside_2sd_k_stats <- ss$df_with_stats
			
 
				-    write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
			
 
				+    write.csv(l_outside_2sd_k_ss,
			
 
				+      file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
			
 
				 
			
 
				-    # Plot configurations
			
 
				     # Each plots list corresponds to a file
			
 
				     message("Generating QC plot configurations")
			
 
				     l_vs_k_plots <- list(
			
 
				-      list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
			
 
				+      list(
			
 
				+        df = df,
			
 
				+        x_var = "L",
			
 
				+        y_var = "K",
			
 
				+        plot_type = "scatter",
			
 
				         title = "Raw L vs K before quality control",
			
 
				         color_var = "conc_num",
			
 
				         legend_position = "right"
			
@@ -665,8 +689,13 @@ main <- function() {
 
				     )
			
 
				 
			
 
				     above_threshold_plots <- list(
			
 
				-      list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
			
 
				-        title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
			
 
				+      list(
			
 
				+        df = df_above_tolerance,
			
 
				+        x_var = "L",
			
 
				+        y_var = "K",
			
 
				+        plot_type = "scatter",
			
 
				+        title = paste("Raw L vs K for strains above delta background threshold of",
			
 
				+          df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
			
 
				         color_var = "conc_num",
			
 
				         annotations = list(
			
 
				           x = l_half_median,
			
@@ -679,72 +708,102 @@ main <- function() {
 
				     )
			
 
				 
			
 
				     frequency_delta_bg_plots <- list(
			
 
				-      list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
			
 
				+      list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
			
 
				         title = "Plate analysis by Drug Conc for delta background before quality control",
			
 
				         color_var = "conc_num",
			
 
				         x_label = "Delta Background",
			
 
				         y_label = "Density",
			
 
				         error_bar = FALSE,
			
 
				-        legend_position = "right"
			
 
				-      ),
			
 
				-      list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
			
 
				+        legend_position = "right"),
			
 
				+      list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
			
 
				         title = "Plate analysis by Drug Conc for delta background before quality control",
			
 
				         color_var = "conc_num",
			
 
				         x_label = "Delta Background",
			
 
				         y_label = "Count",
			
 
				         error_bar = FALSE,
			
 
				-        legend_position = "right"
			
 
				-      )
			
 
				+        legend_position = "right")
			
 
				     )
			
 
				 
			
 
				     plate_analysis_plots <- list()
			
 
				-    for (plot_type in c("scatter", "box")) {
			
 
				-      variables <- c("L", "K", "r", "AUC", "delta_bg")
			
 
				-      for (var in variables) {
			
 
				-        for (stage in c("before", "after")) {
			
 
				-          if (stage == "before") {
			
 
				-            df_plot <- df_stats
			
 
				-          } else {
			
 
				-            df_plot <- df_na_stats # TODO use df_na_filtered if necessary
			
 
				-          }
			
 
				-          
			
 
				-          # Set error_bar = TRUE only for scatter plots
			
 
				-          error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
			
 
				-          
			
 
				-          # Create the plot configuration
			
 
				-          plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
			
 
				-            title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
			
 
				-            error_bar = error_bar, color_var = "conc_num")
			
 
				-
			
 
				-          plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
			
 
				+    for (var in summary_vars) {
			
 
				+      for (stage in c("before", "after")) {
			
 
				+        if (stage == "before") {
			
 
				+          df_plot <- df_filtered_stats
			
 
				+        } else {
			
 
				+          df_plot <- df_na_filtered_stats
			
 
				         }
			
 
				+        
			
 
				+        config <- list(
			
 
				+          df = df_plot,
			
 
				+          x_var = "scan",
			
 
				+          y_var = var,
			
 
				+          plot_type = "scatter",
			
 
				+          title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
			
 
				+          error_bar = TRUE, color_var = "conc_num")
			
 
				+
			
 
				+        plate_analysis_plots <- append(plate_analysis_plots, list(config))
			
 
				       }
			
 
				     }
			
 
				 
			
 
				-    plate_analysis_no_zero_plots <- list()
			
 
				-      for (plot_type in c("scatter", "box")) {
			
 
				-        variables <- c("L", "K", "r", "AUC", "delta_bg")
			
 
				-        for (var in variables) {
			
 
				-            
			
 
				-          # Set error_bar = TRUE only for scatter plots
			
 
				-          error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
			
 
				-          
			
 
				-          # Create the plot configuration
			
 
				-          plot_config <- list(
			
 
				-            df = df_no_zeros_stats,
			
 
				-            x_var = "scan",
			
 
				-            y_var = var,
			
 
				-            plot_type = plot_type,
			
 
				-            title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
			
 
				-            error_bar = error_bar,
			
 
				-            color_var = "conc_num"
			
 
				-          )
			
 
				-          plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
			
 
				+    plate_analysis_boxplots <- list()
			
 
				+    for (var in summary_vars) {
			
 
				+      for (stage in c("before", "after")) {
			
 
				+        if (stage == "before") {
			
 
				+          df_plot <- df_filtered_stats
			
 
				+        } else {
			
 
				+          df_plot <- df_na_filtered_stats
			
 
				         }
			
 
				+        
			
 
				+        config <- list(
			
 
				+          df = df_plot,
			
 
				+          x_var = "scan",
			
 
				+          y_var = var,
			
 
				+          plot_type = "box",
			
 
				+          title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
			
 
				+          error_bar = FALSE, color_var = "conc_num")
			
 
				+
			
 
				+        plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config))
			
 
				       }
			
 
				+    }
			
 
				+
			
 
				+    plate_analysis_no_zeros_plots <- list()
			
 
				+    for (var in summary_vars) {
			
 
				+
			
 
				+      config <- list(
			
 
				+        df = df_no_zeros_filtered_stats,
			
 
				+        x_var = "scan",
			
 
				+        y_var = var,
			
 
				+        plot_type = "scatter",
			
 
				+        title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
			
 
				+        error_bar = TRUE,
			
 
				+        color_var = "conc_num")
			
 
				+
			
 
				+      plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config))
			
 
				+    }
			
 
				+
			
 
				+    plate_analysis_no_zeros_boxplots <- list()
			
 
				+    for (var in summary_vars) {
			
 
				+      
			
 
				+      
			
 
				+      # Create the plot configuration
			
 
				+      config <- list(
			
 
				+        df = df_no_zeros_filtered_stats,
			
 
				+        x_var = "scan",
			
 
				+        y_var = var,
			
 
				+        plot_type = "box",
			
 
				+        title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
			
 
				+        error_bar = FALSE,
			
 
				+        color_var = "conc_num"
			
 
				+      )
			
 
				+      plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config))
			
 
				+    }
			
 
				 
			
 
				     l_outside_2sd_k_plots <- list(
			
 
				-      list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
			
 
				+      list(
			
 
				+        df = df_na_l_outside_2sd_k_stats,
			
 
				+        x_var = "L",
			
 
				+        y_var = "K",
			
 
				+        plot_type = "scatter",
			
 
				         title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
			
 
				         color_var = "conc_num",
			
 
				         legend_position = "right"
			
@@ -752,20 +811,25 @@ main <- function() {
 
				     )
			
 
				 
			
 
				     delta_bg_outside_2sd_k_plots <- list(
			
 
				-      list(df = df_na_l_outside_2sd_k_stats, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
			
 
				+      list(
			
 
				+        df = df_na_l_outside_2sd_k_stats,
			
 
				+        x_var = "delta_bg",
			
 
				+        y_var = "K",
			
 
				+        plot_type = "scatter",
			
 
				         title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
			
 
				         color_var = "conc_num",
			
 
				         legend_position = "right"
			
 
				       )
			
 
				     )
			
 
				 
			
 
				-    # Generate and save plots for each QC step
			
 
				     message("Generating QC plots")
			
 
				     generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
			
 
				     generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
			
 
				     generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
			
 
				     generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
			
 
				+    generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
			
 
				     generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
			
 
				+    generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
			
 
				     generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
			
 
				     generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
			
 
				 
			
@@ -795,7 +859,7 @@ main <- function() {
 
				       
			
 
				       # Recalculate summary statistics for the background strain
			
 
				       message("Calculating summary statistics for background strain")
			
 
				-      ss_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
			
 
				+      ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
			
 
				       summary_stats_bg <- ss_bg$summary_stats
			
 
				       # df_bg_stats <- ss_bg$df_with_stats
			
 
				       write.csv(summary_stats_bg,
			
@@ -835,14 +899,14 @@ main <- function() {
 
				         ungroup()
			
 
				 
			
 
				       # Calculate interactions
			
 
				-      variables <- c("L", "K", "r", "AUC")
			
 
				+      interaction_vars <- c("L", "K", "r", "AUC")
			
 
				       message("Calculating interaction scores")
			
 
				       # print("Reference strain:")
			
 
				       # print(head(reference_strain))
			
 
				-      reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
			
 
				+      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars)
			
 
				       # print("Deletion strains:")
			
 
				       # print(head(deletion_strains))
			
 
				-      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
			
 
				+      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars)
			
 
				       
			
 
				       zscores_calculations_reference <- reference_results$calculations
			
 
				       zscores_interactions_reference <- reference_results$interactions
			
@@ -856,8 +920,8 @@ main <- function() {
 
				       write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
			
 
				 
			
 
				       # Create interaction plots
			
 
				-      reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
			
 
				-      deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
			
 
				+      reference_plot_configs <- generate_interaction_plot_configs(df_reference, interaction_vars)
			
 
				+      deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, interaction_vars)
			
 
				       generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
			
 
				       generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
			
 
				 
			
@@ -965,7 +1029,7 @@ main <- function() {
 
				       generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm",
			
 
				         plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
			
 
				 
			
 
				-      correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, variables)
			
 
				+      correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
			
 
				       generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
			
 
				         plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2))
			
 
				     })