فهرست منبع

Move max_conc into the dataframe to simplify args

Bryan Roessler 6 ماه پیش
والد
کامیت
f435fa3f11
1فایلهای تغییر یافته به همراه45 افزوده شده و 72 حذف شده
  1. 45 72
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 45 - 72
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -160,6 +160,11 @@ load_and_filter_data <- function(easy_results_file, sd = 3) {
       conc_num_factor_factor = as.factor(conc_num)
     )
 
+  # Set the max concentration across the whole dataframe
+  max_conc <- max(df$conc_num_factor, na.rm = TRUE)
+  df <- df %>%
+    mutate(max_conc = max_conc)
+
   return(df)
 }
 
@@ -211,7 +216,7 @@ calculate_summary_stats <- function(df, variables, group_vars) {
   return(list(summary_stats = summary_stats, df_with_stats = df_joined))
 }
 
-calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, overlap_threshold = 2) {
+calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_threshold = 2) {
 
   # Calculate total concentration variables
   total_conc_num <- length(unique(df$conc_num))
@@ -294,16 +299,12 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove
   lm_means_sds <- calculations %>%
     group_by(across(all_of(group_vars))) %>%
     summarise(
-      mean_mean_L = mean(mean_L, na.rm = TRUE),
       mean_lm_L = mean(lm_Score_L, na.rm = TRUE),
       sd_lm_L = sd(lm_Score_L, na.rm = TRUE),
-      mean_mean_K = mean(mean_K, na.rm = TRUE),
       mean_lm_K = mean(lm_Score_K, na.rm = TRUE),
       sd_lm_K = sd(lm_Score_K, na.rm = TRUE),
-      mean_mean_r = mean(mean_r, na.rm = TRUE),
       mean_lm_r = mean(lm_Score_r, na.rm = TRUE),
       sd_lm_r = sd(lm_Score_r, na.rm = TRUE),
-      mean_mean_AUC = mean(mean_AUC, na.rm = TRUE),
       mean_lm_AUC = mean(lm_Score_AUC, na.rm = TRUE),
       sd_lm_AUC = sd(lm_Score_AUC, na.rm = TRUE)
     )
@@ -469,7 +470,12 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) {
           "red"
         }
 
-        y_mean_col <- paste0("mean_mean_", config$y_var)
+        y_mean_prefix <- if (!is.null(config$error_bar_params$y_mean_prefix)) {
+          config$error_bar_params$y_mean_prefix
+        } else {
+          "mean_"
+        }
+        y_mean_col <- paste0(y_mean_prefix, config$y_var)
 
         # Dynamically set y_sd_col based on the provided prefix in error_bar_params
         y_sd_prefix <- if (!is.null(config$error_bar_params$y_sd_prefix)) {
@@ -482,7 +488,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) {
         if (!is.null(config$error_bar_params$center_point)) {
           plot <- plot + geom_point(aes(
             x = .data[[config$x_var]],
-            y = .data[[y_mean_col]]),
+            y = first(.data[[y_mean_col]])),
             color = error_bar_color,
             shape = 16)
         }
@@ -495,14 +501,12 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) {
             color = error_bar_color)
         } else {
           plot <- plot + geom_errorbar(aes(
-            ymin = .data[[y_mean_col]] - .data[[y_sd_col]],
-            ymax = .data[[y_mean_col]] + .data[[y_sd_col]]),
+            ymin = first(.data[[y_mean_col]]) - first(.data[[y_sd_col]]),
+            ymax = first(.data[[y_mean_col]]) + first(.data[[y_sd_col]])),
             color = error_bar_color)
         }
       }
 
-
-
       # Convert ggplot to plotly for interactive version
       plotly_plot <- suppressWarnings(plotly::ggplotly(plot))
 
@@ -546,7 +550,7 @@ generate_scatter_plot <- function(plot, config) {
   size <- if (!is.null(config$size)) config$size else 1.5
   position <-
     if (!is.null(config$position) && config$position == "jitter") {
-      position_jitter(width = 0.3, height = 0.1)
+      position_jitter(width = 0.4, height = 0.1)
     } else {
       "identity"
     }
@@ -752,7 +756,7 @@ generate_interaction_plot_configs <- function(df, type) {
 
     for (var in names(limits_map)) {
       y_limits <- limits_map[[var]]
-      y_span <- y_limits[2] - y_limits[1] 
+      y_span <- y_limits[2] - y_limits[1]
 
       # Common plot configuration
       plot_config <- list(
@@ -773,15 +777,16 @@ generate_interaction_plot_configs <- function(df, type) {
         plot_config$error_bar <- TRUE
         plot_config$error_bar_params <- list(
           y_sd_prefix = "WT_sd_",
+          y_mean_prefix = "mean_",
           color = "red",
           center_point = TRUE
         )
         plot_config$position <- "jitter"
 
       annotations <- list(
-        list(x = -0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="),  # Slightly above y-min
-        list(x = -0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="),
-        list(x = -0.25, y = y_limits[1], label = "SM =")
+        list(x = 0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="), # Slightly above y-min
+        list(x = 0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="),
+        list(x = 0.25, y = y_limits[1], label = "SM =")
       )
 
       # Loop over unique x values and add NG, DB, SM values at calculated y positions
@@ -1067,70 +1072,35 @@ main <- function() {
     out_dir_qc <- file.path(exp_path, "zscores", "qc")
     dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
     dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
-
-    summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
-    interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores
     
     message("Loading and filtering data for experiment: ", exp_name)
     df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>%
       update_gene_names(args$sgd_gene_list) %>%
       as_tibble()
 
-    # Filter rows above delta background tolerance
-    df_above_tolerance <- df %>% filter(DB == 1)
-    df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .))) # formerly X
-    df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
-    
-    # Save some constants
-    max_conc <- max(df$conc_num_factor)
-
     message("Calculating summary statistics before quality control")
     df_stats <- calculate_summary_stats(
       df = df,
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats
-
+   
     message("Calculating summary statistics after quality control")
+    df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X
     ss <- calculate_summary_stats(
       df = df_na,
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
     df_na_ss <- ss$summary_stats
-    df_na_stats <- ss$df_with_stats
+    df_na_stats <- ss$df_with_stats # formerly X_stats_ALL
     write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
     # For plotting (ggplot warns on NAs)
-    df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(summary_vars), is.finite))
-
-    df_na_stats <- df_na_stats %>%
-      mutate(
-        WT_L = mean_L,
-        WT_K = mean_K,
-        WT_r = mean_r,
-        WT_AUC = mean_AUC,
-        WT_sd_L = sd_L,
-        WT_sd_K = sd_K,
-        WT_sd_r = sd_r,
-        WT_sd_AUC = sd_AUC
-      )
-
-    # Pull the background means and standard deviations from zero concentration for interactions
-    bg_stats <- df_na_stats %>%
-      filter(conc_num == 0) %>%
-      summarise(
-        mean_L = first(mean_L),
-        mean_K = first(mean_K),
-        mean_r = first(mean_r),
-        mean_AUC = first(mean_AUC),
-        sd_L = first(sd_L),
-        sd_K = first(sd_K),
-        sd_r = first(sd_r),
-        sd_AUC = first(sd_AUC)
-      )
+    df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite))
 
     message("Calculating summary statistics after quality control excluding zero values")
+    df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
     df_no_zeros_stats <- calculate_summary_stats(
       df = df_no_zeros,
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")
     )$df_with_stats
 
@@ -1142,7 +1112,8 @@ main <- function() {
 
     message("Calculating summary statistics for L within 2SD of K")
     # TODO We're omitting the original z_max calculation, not sure if needed?
-    ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats
+    ss <- calculate_summary_stats(df_na_within_2sd_k, "L",
+      group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats
     write.csv(ss,
       file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
       row.names = FALSE)
@@ -1153,7 +1124,7 @@ main <- function() {
     write.csv(ss$summary_stats,
       file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
       row.names = FALSE)
-    
+      
     # Each list of plots corresponds to a file
     l_vs_k_plot_configs <- list(
       plots = list(
@@ -1200,6 +1171,7 @@ main <- function() {
       )
     )
 
+    df_above_tolerance <- df %>% filter(DB == 1)
     above_threshold_plot_configs <- list(
       plots = list(
         list(
@@ -1226,26 +1198,26 @@ main <- function() {
     )
 
     plate_analysis_plot_configs <- generate_plate_analysis_plot_configs(
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       df_before = df_stats,
       df_after = df_na_stats_filtered
     )
 
     plate_analysis_boxplot_configs <- generate_plate_analysis_plot_configs(
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       df_before = df_stats,
       df_after = df_na_stats_filtered,
       plot_type = "box"
     )
 
     plate_analysis_no_zeros_plot_configs <- generate_plate_analysis_plot_configs(
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       stages = c("after"),  # Only after QC
       df_after = df_no_zeros_stats
     )
 
     plate_analysis_no_zeros_boxplot_configs <- generate_plate_analysis_plot_configs(
-      variables = summary_vars,
+      variables = c("L", "K", "r", "AUC", "delta_bg"),
       stages = c("after"),  # Only after QC
       df_after = df_no_zeros_stats,
       plot_type = "box"
@@ -1329,7 +1301,6 @@ main <- function() {
     #   generate_and_save_plots(config$out_dir, config$filename, config$plot_configs)
     # }, .options = furrr_options(seed = TRUE))
 
-    # Process background strains
     bg_strains <- c("YDL227C")
     lapply(bg_strains, function(strain) {
       
@@ -1349,15 +1320,17 @@ main <- function() {
       
       # Recalculate summary statistics for the background strain
       message("Calculating summary statistics for background strain")
-      ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
+      ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), 
+        group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
       summary_stats_bg <- ss_bg$summary_stats
+      ss_bg_stats <- ss_bg$df_with_stats
       write.csv(summary_stats_bg,
         file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")),
         row.names = FALSE)
       
       # Set the missing values to the highest theoretical value at each drug conc for L
       # Leave other values as 0 for the max/min
-      df_reference <- df_na_stats %>% # formerly X2_RF
+      df_reference <- df_bg_stats %>% # formerly X2_RF
         filter(OrfRep == strain) %>%
         filter(!is.na(L)) %>%
         group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
@@ -1384,10 +1357,10 @@ main <- function() {
       message("Calculating reference strain interaction scores")
       df_reference_stats <- calculate_summary_stats(
         df = df_reference,
-        variables = interaction_vars,
+        variables = c("L", "K", "r", "AUC"),
         group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
         )$df_with_stats
-      reference_results <- calculate_interaction_scores(df_reference_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene", "num"))
+      reference_results <- calculate_interaction_scores(df_reference_stats, bg_stats, group_vars = c("OrfRep", "Gene", "num"))
       zscore_calculations_reference <- reference_results$calculations
       zscore_interactions_reference <- reference_results$interactions
       zscore_interactions_reference_joined <- reference_results$full_data
@@ -1395,10 +1368,10 @@ main <- function() {
       message("Calculating deletion strain(s) interactions scores")
       df_deletion_stats <- calculate_summary_stats(
         df = df_deletion,
-        variables = interaction_vars,
+        variables = c("L", "K", "r", "AUC"),
         group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
         )$df_with_stats
-      deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene"))
+      deletion_results <- calculate_interaction_scores(df_deletion_stats, bg_stats, group_vars = c("OrfRep", "Gene"))
       zscore_calculations <- deletion_results$calculations
       zscore_interactions <- deletion_results$interactions
       zscore_interactions_joined <- deletion_results$full_data