Jelajahi Sumber

Flatten means and sds for error bars

Bryan Roessler 7 bulan lalu
induk
melakukan
25a5c7ff23
1 mengubah file dengan 120 tambahan dan 66 penghapusan
  1. 120 66
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 120 - 66
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -345,7 +345,7 @@ calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshol
     ungroup()
 
   # For interaction plot error bars
-  delta_means <- calculations %>%
+  delta_means_sds <- calculations %>%
     group_by(across(all_of(group_vars))) %>%
     summarise(
       mean_Delta_L = mean(Delta_L, na.rm = TRUE),
@@ -356,12 +356,11 @@ calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshol
       sd_Delta_K = sd(Delta_K, na.rm = TRUE),
       sd_Delta_r = sd(Delta_r, na.rm = TRUE),
       sd_Delta_AUC = sd(Delta_AUC, na.rm = TRUE),
-
       .groups = "drop"
     )
 
   calculations <- calculations %>%
-    left_join(delta_means, by = group_vars)
+    left_join(delta_means_sds, by = group_vars)
 
   # Summary statistics for lm scores
   lm_means_sds <- calculations %>%
@@ -436,13 +435,12 @@ calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshol
       NG = first(NG),
       DB = first(DB),
       SM = first(SM),
-
       .groups = "drop"
     )
 
   # Add overlap threshold categories based on Z-lm and Avg-Z scores
   interactions <- interactions %>%
-    filter(!is.na(Z_lm_L)) %>%
+    filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
     mutate(
       Overlap = case_when(
         Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
@@ -603,7 +601,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
 
       plot <- ggplot(df, aes_mapping) + theme_publication(legend_position = config$legend_position)
 
-      # Add appropriate plot layer based on plot type
+      # Add appropriate plot layer or helper function based on plot type
       plot <- switch(config$plot_type,
         "scatter" = generate_scatter_plot(plot, config),
         "box" = generate_boxplot(plot, config),
@@ -642,8 +640,19 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
 
       # Add error bars if specified
       if (!is.null(config$error_bar) && config$error_bar) {
-        y_mean_col <- paste0("mean_", config$y_var)
-        y_sd_col <- paste0("sd_", config$y_var)
+        # Check if custom columns are provided for y_mean and y_sd, or use the defaults
+        y_mean_col <- if (!is.null(config$error_bar_params$y_mean_col)) {
+          config$error_bar_params$y_mean_col
+        } else {
+          paste0("mean_", config$y_var)
+        }
+
+        y_sd_col <- if (!is.null(config$error_bar_params$y_sd_col)) {
+          config$error_bar_params$y_sd_col
+        } else {
+          paste0("sd_", config$y_var)
+        }
+
         # If color_var is provided and no fixed error bar color is set, use aes() to map color dynamically
         if (!is.null(config$color_var) && is.null(config$error_bar_params$color)) {
           plot <- plot + geom_errorbar(
@@ -651,7 +660,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
               x = .data[[config$x_var]],
               ymin = .data[[y_mean_col]] - .data[[y_sd_col]],
               ymax = .data[[y_mean_col]] + .data[[y_sd_col]],
-              color = .data[[config$color_var]]  # Dynamic color from the data
+              color = .data[[config$color_var]]
             )
           )
         } else {
@@ -662,7 +671,17 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
               ymin = .data[[y_mean_col]] - .data[[y_sd_col]],
               ymax = .data[[y_mean_col]] + .data[[y_sd_col]]
             ),
-            color = config$error_bar_params$color  # Fixed color
+            color = config$error_bar_params$color
+          )
+        }
+
+        # Add the center point if the option is provided
+        if (!is.null(config$error_bar_params$center_point)) {
+          plot <- plot + geom_point(aes(
+            x = .data[[config$x_var]],
+            y = .data[[y_mean_col]]),
+            color = config$error_bar_params$color,
+            shape = 16
           )
         }
       }
@@ -730,6 +749,10 @@ generate_scatter_plot <- function(plot, config) {
       size = 0.5
     )
   }
+
+  if (!is.null(config$gray_points) && config$gray_points) {
+    plot <- plot + geom_point(shape = 3, color = "gray70", size = 1)
+  }
   
   # Add Smooth Line if specified
   if (!is.null(config$smooth) && config$smooth) {
@@ -742,14 +765,17 @@ generate_scatter_plot <- function(plot, config) {
           slope = config$lm_line$slope,
           color = smooth_color
         )
-    } else {
-      plot <- plot +
-        geom_smooth(
-          method = "lm",
-          se = FALSE,
-          color = smooth_color
-        )
     }
+    
+    # For now I want to try and hardcode it
+    # else {
+    #   plot <- plot +
+    #     geom_smooth(
+    #       method = "lm",
+    #       se = FALSE,
+    #       color = smooth_color
+    #     )
+    # }
   }
   
   # Add SD Bands if specified
@@ -878,7 +904,7 @@ generate_plate_analysis_plot_configs <- function(variables, df_before = NULL, df
   return(list(plots = plot_configs))
 }
 
-generate_interaction_plot_configs <- function(df, type) {
+generate_interaction_plot_configs <- function(df_summary, df_interaction, type) {
 
   # Define the y-limits for the plots
   limits_map <- list(
@@ -893,7 +919,7 @@ generate_interaction_plot_configs <- function(df, type) {
   delta_plot_configs <- list()
 
   # Overall statistics plots
-  OrfRep <- first(df$OrfRep) # this should correspond to the reference strain
+  OrfRep <- first(df_summary$OrfRep) # this should correspond to the reference strain
 
   for (plot_type in c("scatter", "box")) {
 
@@ -903,15 +929,15 @@ generate_interaction_plot_configs <- function(df, type) {
 
       # Common plot configuration
       plot_config <- list(
-        df = df,
+        df = df_summary,
         plot_type = plot_type,
         x_var = "conc_num_factor_factor",
         y_var = var,
         shape = 16,
-        x_label = unique(df$Drug)[1],
+        x_label = paste0("[", unique(df_summary$Drug)[1], "]"),
         coord_cartesian = y_limits,
-        x_breaks = unique(df$conc_num_factor_factor),
-        x_labels = as.character(unique(df$conc_num))
+        x_breaks = unique(df_summary$conc_num_factor_factor),
+        x_labels = as.character(unique(df_summary$conc_num))
       )
 
       # Add specific configurations for scatter and box plots
@@ -920,23 +946,25 @@ generate_interaction_plot_configs <- function(df, type) {
         plot_config$error_bar <- TRUE
         plot_config$error_bar_params <- list(
           color = "red",
-          center_point = TRUE
+          center_point = TRUE,
+          y_mean_col = paste0("mean_mean_", var),
+          y_sd_col = paste0("mean_sd_", var)
         )
         plot_config$position <- "jitter"
 
         # Cannot figure out how to place these properly for discrete x-axis so let's be hacky
         annotations <- list(
-          list(x = 0.25, y = y_limits[1] + 0.1 * y_span, label = "                NG:"),
-          list(x = 0.25, y = y_limits[1] + 0.05 * y_span, label = "                DB:"),
-          list(x = 0.25, y = y_limits[1], label = "                SM:")
+          list(x = 0.25, y = y_limits[1] + 0.08 * y_span, label = "                NG =", size = 4),
+          list(x = 0.25, y = y_limits[1] + 0.04 * y_span, label = "                DB =", size = 4),
+          list(x = 0.25, y = y_limits[1], label = "                SM =", size = 4)
         )
 
-        for (x_val in unique(df$conc_num_factor_factor)) {
-          current_df <- df %>% filter(.data[[plot_config$x_var]] == x_val)
+        for (x_val in unique(df_summary$conc_num_factor_factor)) {
+          current_df <- df_summary %>% filter(.data[[plot_config$x_var]] == x_val)
           annotations <- append(annotations, list(
-            list(x = x_val, y = y_limits[1] + 0.1 * y_span, label = first(current_df$NG, default = 0)),
-            list(x = x_val, y = y_limits[1] + 0.05 * y_span, label = first(current_df$DB, default = 0)),
-            list(x = x_val, y = y_limits[1], label = first(current_df$SM, default = 0))
+            list(x = x_val, y = y_limits[1] + 0.08 * y_span, label = first(current_df$NG, default = 0), size = 4),
+            list(x = x_val, y = y_limits[1] + 0.04 * y_span, label = first(current_df$DB, default = 0), size = 4),
+            list(x = x_val, y = y_limits[1], label = first(current_df$SM, default = 0), size = 4)
           ))
         }
 
@@ -945,7 +973,7 @@ generate_interaction_plot_configs <- function(df, type) {
         stats_plot_configs <- append(stats_plot_configs, list(plot_config))
 
       } else if (plot_type == "box") {
-        plot_config$title <- sprintf("%s Boxplot RF for %s with SD", OrfRep, var)
+        plot_config$title <- sprintf("%s Box RF for %s with SD", OrfRep, var)
         plot_config$position <- "dodge"
 
         stats_boxplot_configs <- append(stats_boxplot_configs, list(plot_config))
@@ -967,7 +995,7 @@ generate_interaction_plot_configs <- function(df, type) {
     AUC = c(-6000, 6000)
   )
 
-  grouped_data <- df %>%
+  grouped_data <- df_interaction %>%
     group_by(across(all_of(group_vars))) %>%
     group_split()
 
@@ -1003,11 +1031,12 @@ generate_interaction_plot_configs <- function(df, type) {
       plot_config <- list(
         df = group_data,
         plot_type = "scatter",
-        x_var = "conc_num_factor_factor",
+        x_var = "conc_num_factor_factor", 
         y_var = paste0("Delta_", var),
-        x_label = unique(group_data$Drug)[1],
+        x_label = paste0("[", unique(df_summary$Drug)[1], "]"),
+        shape = 16,
         title = paste(OrfRepTitle, Gene, sep = "      "),
-        title_size = rel(1.2),
+        title_size = rel(1.3),
         coord_cartesian = y_limits,
         annotations = list(
           list(x = 1, y = y_limits[2] - 0.2 * y_span, label = paste("ZShift =", Z_Shift_value)),
@@ -1021,16 +1050,17 @@ generate_interaction_plot_configs <- function(df, type) {
         error_bar_params = list(
           ymin = 0 - (2 * WT_sd_value),
           ymax = 0 + (2 * WT_sd_value),
-          color = "black"
+          color = "gray"
         ),
-        smooth = TRUE,
         x_breaks = unique(group_data$conc_num_factor_factor),
         x_labels = as.character(unique(group_data$conc_num)),
         ylim_vals = y_limits,
         y_filter = FALSE,
+        smooth = TRUE,
         lm_line = list(
           intercept = lm_intercept_value,
-          slope = lm_slope_value
+          slope = lm_slope_value,
+          color = "blue"
         )
       )
       delta_plot_configs <- append(delta_plot_configs, list(plot_config))
@@ -1082,7 +1112,6 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, overla
       fill_negative = "orange",
       alpha_positive = 0.3,
       alpha_negative = 0.3,
-      annotations = NULL,
       shape = 3,
       size = 0.1,
       y_label = y_label,
@@ -1183,7 +1212,8 @@ generate_correlation_plot_configs <- function(df) {
         shape = 3,
         size = 0.5,
         color_var = "Overlap",
-        cyan_points = highlight_cyan  # Include cyan points or not based on the loop
+        cyan_points = highlight_cyan, # include cyan points or not based on the loop
+        gray_points = TRUE
       )
 
       plot_configs <- append(plot_configs, list(plot_config))
@@ -1226,10 +1256,10 @@ main <- function() {
     )
 
     message("Calculating summary statistics before quality control")
-    df_stats <- calculate_summary_stats(
+    df_stats <- calculate_summary_stats( # formerly X_stats_ALL
       df = df,
       variables = c("L", "K", "r", "AUC", "delta_bg"),
-      group_vars = c("conc_num"))$df_with_stats
+      group_vars = c("conc_num", "conc_num_factor_factor"))$df_with_stats
 
     frequency_delta_bg_plot_configs <- list(
       plots = list(
@@ -1295,7 +1325,7 @@ main <- function() {
     ss <- calculate_summary_stats(
       df = df_na,
       variables = c("L", "K", "r", "AUC", "delta_bg"),
-      group_vars = c("conc_num"))
+      group_vars = c("conc_num", "conc_num_factor_factor"))
     df_na_ss <- ss$summary_stats
     df_na_stats <- ss$df_with_stats # formerly X_stats_ALL
     write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
@@ -1307,7 +1337,7 @@ main <- function() {
     df_no_zeros_stats <- calculate_summary_stats(
       df = df_no_zeros,
       variables = c("L", "K", "r", "AUC", "delta_bg"),
-      group_vars = c("conc_num")
+      group_vars = c("conc_num", "conc_num_factor_factor")
     )$df_with_stats
 
     message("Filtering by 2SD of K")
@@ -1318,14 +1348,15 @@ main <- function() {
 
     message("Calculating summary statistics for L within 2SD of K")
     # TODO We're omitting the original z_max calculation, not sure if needed?
-    ss <- calculate_summary_stats(df_na_within_2sd_k, "L",
-      group_vars = c("conc_num"))$summary_stats
+    ss <- calculate_summary_stats(df_na_within_2sd_k, "L", # formerly X_stats_BY_L_within_2SD_K
+      group_vars = c("conc_num", "conc_num_factor_factor"))$summary_stats
     write.csv(ss,
       file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"),
       row.names = FALSE)
     
     message("Calculating summary statistics for L outside 2SD of K")
-    ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num"))
+    ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", # formerly X_stats_BY_L_outside_2SD_K
+      group_vars = c("conc_num", "conc_num_factor_factor"))
     df_na_l_outside_2sd_k_stats <- ss$df_with_stats
     write.csv(ss$summary_stats,
       file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2SD_K.csv"),
@@ -1435,10 +1466,10 @@ main <- function() {
     )
 
     # Parallelize background and quality control plot generation
-    furrr::future_map(plot_configs, function(config) {
-      generate_and_save_plots(config$out_dir, config$filename, config$plot_configs,
-        page_width = config$page_width, page_height = config$page_height)
-    }, .options = furrr_options(seed = TRUE))
+    # furrr::future_map(plot_configs, function(config) {
+    #   generate_and_save_plots(config$out_dir, config$filename, config$plot_configs,
+    #     page_width = config$page_width, page_height = config$page_height)
+    # }, .options = furrr_options(seed = TRUE))
 
     # Loop over background strains
     # TODO currently only tested against one strain, if we want to do multiple strains we'll
@@ -1460,7 +1491,7 @@ main <- function() {
         filter(!is.na(L))
       
       message("Calculating background strain summary statistics")
-      ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"),
+      ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), # formerly X_stats_BY
         group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor"))
       summary_stats_bg <- ss_bg$summary_stats
       df_bg_stats <- ss_bg$df_with_stats
@@ -1473,7 +1504,7 @@ main <- function() {
       df_reference <- df_na_stats %>% # formerly X2_RF
         filter(OrfRep == strain) %>%
         filter(!is.na(L)) %>%
-        group_by(OrfRep, Drug, conc_num) %>%
+        group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>%
         mutate(
           max_l_theoretical = max(max_L, na.rm = TRUE),
           L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
@@ -1482,22 +1513,45 @@ main <- function() {
         ungroup()
 
       message("Calculating reference strain summary statistics")
-      df_reference_stats <- calculate_summary_stats(
+      df_reference_summary_stats <- calculate_summary_stats( # formerly X_stats_X2_RF
+        df = df_reference,
+        variables = c("L", "K", "r", "AUC"),
+        group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor")
+      )$df_with_stats
+
+      # Summarise statistics for error bars
+      df_reference_summary_stats <- df_reference_summary_stats %>%
+        group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>%
+        mutate(
+          mean_mean_L = first(mean_L),
+          mean_sd_L = first(sd_L),
+          mean_mean_K = first(mean_K),
+          mean_sd_K = first(sd_K),
+          mean_mean_r = first(mean_r),
+          mean_sd_r = first(sd_r),
+          mean_mean_AUC = first(mean_AUC),
+          mean_sd_AUC = first(sd_AUC),
+          .groups = "drop"
+        )
+
+      message("Calculating reference strain interaction summary statistics") # formerly X_stats_interaction
+      df_reference_interaction_stats <- calculate_summary_stats(
         df = df_reference,
         variables = c("L", "K", "r", "AUC"),
-        group_vars = c("OrfRep", "Gene", "Drug", "num", "conc_num", "conc_num_factor_factor")
+        group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor")
         )$df_with_stats
       
       message("Calculating reference strain interaction scores")
-      results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "Drug", "num"))
-      df_calculations_reference <- results$calculations
-      df_interactions_reference <- results$interactions
-      df_interactions_reference_joined <- results$full_data
-      write.csv(df_calculations_reference, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
-      write.csv(df_interactions_reference, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
+      results <- calculate_interaction_scores(df_reference_interaction_stats,
+        df_bg_stats, group_vars = c("OrfRep", "Gene", "num", "Drug"))
+      df_reference_calculations <- results$calculations
+      df_reference_interactions <- results$interactions
+      df_reference_interactions_joined <- results$full_data
+      write.csv(df_reference_calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
+      write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
 
       message("Generating reference interaction plots")
-      reference_plot_configs <- generate_interaction_plot_configs(df_interactions_reference_joined, "reference")
+      reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
       generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
 
       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
@@ -1512,7 +1566,7 @@ main <- function() {
           L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
         ungroup()
 
-      message("Calculating deletion strain(s) summary statistics")
+      message("Calculating deletion strain(s) interaction summary statistics")
       df_deletion_stats <- calculate_summary_stats(
         df = df_deletion,
         variables = c("L", "K", "r", "AUC"),
@@ -1528,7 +1582,7 @@ main <- function() {
       write.csv(df_interactions, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE)
 
       message("Generating deletion interaction plots")
-      deletion_plot_configs <- generate_interaction_plot_configs(df_interactions_joined, "deletion")
+      deletion_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_interactions_joined, "deletion")
       generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
 
       message("Writing enhancer/suppressor csv files")