Browse Source

Add additional stats data frames

Bryan Roessler 7 months ago
parent
commit
cee290fd69
1 changed files with 51 additions and 46 deletions
  1. 51 46
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 51 - 46
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -30,7 +30,7 @@ parse_arguments <- function() {
       "/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
       "Experiment 1: Doxo versus HLD",
       3,
-      "/home/bryan/documents/develop/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
+      "/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
       "Experiment 2: HLD versus Doxo",
       3
     )
@@ -170,7 +170,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
         se = ~ ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1))
       ), .names = "{.fn}_{.col}")
     )
-  
+
   # Prevent .x and .y suffix issues by renaming columns
   df_cleaned <- df %>%
     select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
@@ -336,9 +336,14 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
 }
 
 generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
+
+  message("Generating html and pdf plots for: ", file_name)
+
   plots <- lapply(plot_configs, function(config) {
     df <- config$df
-    
+
+    print(head(df))
+
     # Check if y_var is NULL and adjust the aes mapping
     aes_mapping <- if (is.null(config$y_var)) {
       aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
@@ -374,7 +379,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
       } else if (config$plot_type == "density") {
         plot <- plot + geom_density()
       } else if (config$plot_type == "bar") {
-        plot <- plot + geom_bar(stat = "identity")
+        plot <- plot + geom_bar()
       } else {
         plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE)
       }
@@ -582,35 +587,15 @@ main <- function() {
     dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
     dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
     
-    # Load and process data
+    message("Loading and filtering data")
     df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
     df <- update_gene_names(df, args$sgd_gene_list)
-    
-    max_conc <- max(df$conc_num_factor)
-    
-    # QC steps and filtering
-    df_above_tolerance <- df %>% filter(DB == 1)
-
-    # Calculate the half-medians for `L` and `K` for rows above tolerance
-    L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
-    K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
 
-    # Get the number of rows that are above tolerance
-    rows_to_remove <- nrow(df_above_tolerance)
+    # Filter rows that are above tolerance for quality control plots
+    df_above_tolerance <- df %>% filter(DB == 1)
 
     # Set L, r, K, and AUC to NA for rows that are above tolerance
-    df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
-
-    # Calculate summary statistics for all strains, including both background and the deletions
-    message("Calculating summary statistics for all strains")
-    variables <- c("L", "K", "r", "AUC")
-    ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
-    summary_stats <- ss$summary_stats
-    df_na_stats <- ss$df_with_stats
-    write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
-
-    # print("Summary stats:")
-    # print(head(summary_stats), width = 200)
+    df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
 
     # Remove rows with 0 values in L
     df_no_zeros <- df_na %>% filter(L > 0)
@@ -626,34 +611,54 @@ main <- function() {
         }
         filter(., if_all(c(L), is.finite))
       }
+    
+    variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
+    group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # fields to group by
+
+    # Set some constants
+    max_conc <- max(df$conc_num_factor)
+    l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
+    k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
+
+    message("Calculating summary statistics before quality control")
+    ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
+    df_stats <- ss$df_with_stats
+
+    message("Calculating summary statistics after quality control")
+    ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
+    df_na_ss <- ss$summary_stats
+    df_na_stats <- ss$df_with_stats
+    write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
+
+    message("Calculating summary statistics after quality control excluding zero values")
+    ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
+    df_no_zeros_stats <- ss$df_with_stats
 
-    # Filter data within and outside 2SD
     message("Filtering by 2SD of K")
     df_na_within_2sd_k <- df_na_stats %>%
       filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
     df_na_outside_2sd_k <- df_na_stats %>%
       filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
 
-    # Summary statistics for within and outside 2SD of K
     message("Calculating summary statistics for L within 2SD of K")
     # TODO We're omitting the original z_max calculation, not sure if needed?
     ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
-    l_within_2sd_k_stats <- ss$summary_stats
+    l_within_2sd_k_ss <- ss$summary_stats
     df_na_l_within_2sd_k_stats <- ss$df_with_stats
+    write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
+    
     message("Calculating summary statistics for L outside 2SD of K")
     ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
-    l_outside_2sd_k_stats <- ss$summary_stats
+    l_outside_2sd_k_ss <- ss$summary_stats
     df_na_l_outside_2sd_k_stats <- ss$df_with_stats
-    # Write CSV files
-    write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
-    write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
+    write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
 
     # Plot configurations
     # Each plots list corresponds to a file
     message("Generating QC plot configurations")
     l_vs_k_plots <- list(
       list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
-        title = "Raw L vs K before QC",
+        title = "Raw L vs K before quality control",
         color_var = "conc_num",
         legend_position = "right"
       )
@@ -664,8 +669,8 @@ main <- function() {
         title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
         color_var = "conc_num",
         annotations = list(
-          x = L_half_median,
-          y = K_half_median,
+          x = l_half_median,
+          y = k_half_median,
           label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
         ),
         error_bar = FALSE,
@@ -674,16 +679,16 @@ main <- function() {
     )
 
     frequency_delta_bg_plots <- list(
-      list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
-        title = "Density plot for Delta Background by Conc All Data",
+      list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
+        title = "Plate analysis by Drug Conc for delta background before quality control",
         color_var = "conc_num",
         x_label = "Delta Background",
         y_label = "Density",
         error_bar = FALSE,
         legend_position = "right"
       ),
-      list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
-        title = "Bar plot for Delta Background by Conc All Data",
+      list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
+        title = "Plate analysis by Drug Conc for delta background before quality control",
         color_var = "conc_num",
         x_label = "Delta Background",
         y_label = "Count",
@@ -698,9 +703,9 @@ main <- function() {
       for (var in variables) {
         for (stage in c("before", "after")) {
           if (stage == "before") {
-            df_plot <- df
+            df_plot <- df_stats
           } else {
-            df_plot <- df_na # TODO use df_na_filtered if necessary
+            df_plot <- df_na_stats # TODO use df_na_filtered if necessary
           }
           
           # Set error_bar = TRUE only for scatter plots
@@ -726,7 +731,7 @@ main <- function() {
           
           # Create the plot configuration
           plot_config <- list(
-            df = df_no_zeros,
+            df = df_no_zeros_stats,
             x_var = "scan",
             y_var = var,
             plot_type = plot_type,
@@ -739,7 +744,7 @@ main <- function() {
       }
 
     l_outside_2sd_k_plots <- list(
-      list(df = df_na_l_outside_2sd_k_stats, x_var = "l", y_var = "K", plot_type = "scatter",
+      list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
         title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
         color_var = "conc_num",
         legend_position = "right"