Ver código fonte

Tidy some code up and make more specific

Bryan Roessler 7 meses atrás
pai
commit
cdeaf6e6ba
1 arquivos alterados com 52 adições e 37 exclusões
  1. 52 37
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 52 - 37
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -155,7 +155,7 @@ update_gene_names <- function(df, sgd_gene_list) {
   return(df)
 }
 
-calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
+calculate_summary_stats <- function(df, variables, group_vars) {
 
   summary_stats <- df %>%
     group_by(across(all_of(group_vars))) %>%
@@ -168,7 +168,9 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con
         min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
         sd = ~sd(., na.rm = TRUE),
         se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1))
-      ), .names = "{.fn}_{.col}")
+      ),
+      .names = "{.fn}_{.col}"),
+      .groups = "drop"
     )
 
   # Create a cleaned version of df that doesn't overlap with summary_stats
@@ -181,7 +183,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con
   return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
 }
 
-calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
+calculate_interaction_scores <- function(df, max_conc, variables, group_vars) {
 
   # Calculate total concentration variables
   total_conc_num <- length(unique(df$conc_num))
@@ -201,8 +203,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
     AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
   )
 
-  stats <- calculate_summary_stats(df, variables,
-    group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))$summary_stats
+  stats <- calculate_summary_stats(df,
+    variables = variables,
+    group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"
+    ))$summary_stats
 
   stats <- df %>%
     group_by(OrfRep, Gene, num) %>%
@@ -302,7 +306,8 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
       lm_slope_AUC = coef(lm_AUC)[2],
       NG = sum(NG, na.rm = TRUE),
       DB = sum(DB, na.rm = TRUE),
-      SM = sum(SM, na.rm = TRUE)
+      SM = sum(SM, na.rm = TRUE),
+      .groups = "keep"
     )
 
   num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1
@@ -402,7 +407,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
     }
 
     # Add interactive tooltips for plotly plots
-    tooltip_vars <- c("x", "y")  # default tooltip variables
+    tooltip_vars <- c("x", "y")
     if (!is.null(config$tooltip_vars)) {
       tooltip_vars <- config$tooltip_vars
     } else {
@@ -687,14 +692,11 @@ generate_interaction_plot_configs <- function(df, variables) {
   return(configs)
 }
 
-generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALSE) {
+generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, adjust = FALSE) {
   
   # Define SD bands
   sd_bands <- c(1, 2, 3)
   
-  # Define variables for Avg ZScore and Rank Avg ZScore plots
-  variables <- c("r", "L", "K", "AUC")
-  
   # Initialize list to store plot configurations
   configs <- list()
   
@@ -963,8 +965,7 @@ main <- function() {
     dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
 
     summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
-    group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
-    orf_group_vars <- c("OrfRep", "Gene", "num")
+    interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores
     print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
       "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
     
@@ -984,20 +985,29 @@ main <- function() {
     k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
 
     message("Calculating summary statistics before quality control")
-    ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
+    ss <- calculate_summary_stats(
+      df = df,
+      variables = summary_vars,
+      group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
     df_stats <- ss$df_with_stats
     message("Filtering non-finite data")
     df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE)
 
     message("Calculating summary statistics after quality control")
-    ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
+    ss <- calculate_summary_stats(
+      df = df_na,
+      variables = summary_vars,
+      group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
     df_na_ss <- ss$summary_stats
     df_na_stats <- ss$df_with_stats
     write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
     df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE)
 
     message("Calculating summary statistics after quality control excluding zero values")
-    ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
+    ss <- calculate_summary_stats(
+      df = df_no_zeros,
+      variables = summary_vars,
+      group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
     df_no_zeros_stats <- ss$df_with_stats
     df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE)
 
@@ -1011,12 +1021,16 @@ main <- function() {
     # TODO We're omitting the original z_max calculation, not sure if needed?
     ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
     # df_na_l_within_2sd_k_stats <- ss$df_with_stats
-    write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
+    write.csv(ss$summary_stats,
+      file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
+      row.names = FALSE)
     
     message("Calculating summary statistics for L outside 2SD of K")
     ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
     df_na_l_outside_2sd_k_stats <- ss$df_with_stats
-    write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
+    write.csv(ss$summary_stats,
+      file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
+      row.names = FALSE)
     
     # Each plots list corresponds to a file
     l_vs_k_plots <- list(
@@ -1183,16 +1197,16 @@ main <- function() {
       )
     )
 
-    message("Generating quality control plots")
-    generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
-    generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
-    generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
-    generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
-    generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
-    generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
-    generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
-    generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
-    generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
+    # message("Generating quality control plots")
+    # generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
+    # generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
+    # generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
+    # generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
+    # generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
+    # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
+    # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
+    # generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
+    # generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
 
     # TODO: Originally this filtered L NA's
     # Let's try to avoid for now since stats have already been calculated
@@ -1217,7 +1231,7 @@ main <- function() {
       
       # Recalculate summary statistics for the background strain
       message("Calculating summary statistics for background strain")
-      ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
+      ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
       summary_stats_bg <- ss_bg$summary_stats
       # df_bg_stats <- ss_bg$df_with_stats
       write.csv(summary_stats_bg,
@@ -1256,18 +1270,15 @@ main <- function() {
           L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
         ungroup()
 
-      message("Calculating interaction scores")
-      interaction_vars <- c("L", "K", "r", "AUC")
-
-      message("Calculating reference strain(s)")
-      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = orf_group_vars)
+      message("Calculating reference strain interaction scores")
+      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
       zscores_calculations_reference <- reference_results$calculations
       zscores_interactions_reference <- reference_results$interactions
       zscores_calculations_reference_joined <- reference_results$calculations_joined
       zscores_interactions_reference_joined <- reference_results$interactions_joined
 
-      message("Calculating deletion strain(s)")
-      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = orf_group_vars)
+      message("Calculating deletion strain(s) interactions scores")
+      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
       zscores_calculations <- deletion_results$calculations
       zscores_interactions <- deletion_results$interactions
       zscores_calculations_joined <- deletion_results$calculations_joined
@@ -1346,6 +1357,7 @@ main <- function() {
         rank = TRUE)
       rank_plot_configs <- generate_rank_plot_configs(
         df = zscores_interactions_joined_filtered,
+        variables = interaction_vars,
         is_lm = FALSE,
         adjust = TRUE
       )
@@ -1355,6 +1367,7 @@ main <- function() {
       message("Generating ranked linear model plots")
       rank_lm_plot_configs <- generate_rank_plot_configs(
         df = zscores_interactions_joined_filtered,
+        variables = interaction_vars,
         is_lm = TRUE,
         adjust = TRUE
       )
@@ -1364,7 +1377,7 @@ main <- function() {
       message("Filtering and reranking plots")
       # Formerly X_NArm
       zscores_interactions_filtered <- zscores_interactions_joined %>%
-        group_by(across(all_of(orf_group_vars))) %>%
+        group_by(across(all_of(c("OrfRep", "Gene", "num")))) %>%
         filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
         ungroup() %>%
         rowwise() %>%
@@ -1396,6 +1409,7 @@ main <- function() {
 
       rank_plot_filtered_configs <- generate_rank_plot_configs(
         df = zscores_interactions_filtered,
+        variables = interaction_vars,
         is_lm = FALSE,
         adjust = FALSE
       )
@@ -1410,6 +1424,7 @@ main <- function() {
       message("Generating filtered ranked linear model plots")
       rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
         df = zscores_interactions_filtered,
+        variables = interaction_vars,
         is_lm = TRUE,
         adjust = FALSE
       )