Browse Source

Begin main() refactor

Bryan Roessler 8 tháng trước cách đây
mục cha
commit
ba7f575be1
1 tập tin đã thay đổi với 73 bổ sung63 xóa
  1. 73 63
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 73 - 63
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -9,7 +9,6 @@ suppressMessages({
 })
 
 options(warn = 2)
-options(width = 10000)
 
 # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
 soft_limit <- 30 * 1024 * 1024 * 1024
@@ -127,7 +126,9 @@ load_and_process_data <- function(easy_results_file, sd = 3) {
       SM = 0,
       OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
       conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
-      conc_num_factor = as.numeric(as.factor(conc_num)) - 1
+      conc_num_factor = as.factor(conc_num)
+      # conc_num_factor = factor(conc_num, levels = sort(unique(conc_num)))
+      # conc_num_numeric = as.numeric(conc_num_factor) - 1
     )
   
   return(df)
@@ -334,8 +335,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
       Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
       Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
     ) %>%
-    arrange(desc(Z_lm_L)) %>%
-    arrange(desc(NG))
+    arrange(desc(Z_lm_L), desc(NG))
 
   # Declare column order for output
   calculations <- stats %>%
@@ -509,7 +509,7 @@ generate_scatter_plot <- function(plot, config) {
 
   # Add x-axis customization if specified
   if (!is.null(config$x_breaks) && !is.null(config$x_labels) && !is.null(config$x_label)) {
-    plot <- plot + scale_x_continuous(
+    plot <- plot + scale_x_discrete(
       name = config$x_label,
       breaks = config$x_breaks,
       labels = config$x_labels
@@ -529,7 +529,11 @@ generate_scatter_plot <- function(plot, config) {
   # Add annotations if specified
   if (!is.null(config$annotations)) {
     for (annotation in config$annotations) {
-      plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
+      plot <- plot + annotate("text",
+        x = annotation$x,
+        y = annotation$y,
+        label = annotation$label,
+        na.rm = TRUE)
     }
   }
 
@@ -606,15 +610,6 @@ generate_interaction_plot_configs <- function(df, variables) {
     AUC = c(-6500, 6500)
   )
   
-  # Define annotation positions and labels
-  annotation_positions <- list(
-    ZShift = 45,
-    lm_ZScore = 25,
-    NG = -25,
-    DB = -35,
-    SM = -45
-  )
-  
   # Define functions to generate annotation labels
   annotation_labels <- list(
     ZShift = function(df, var) {
@@ -640,22 +635,6 @@ generate_interaction_plot_configs <- function(df, variables) {
       slope = df[[paste0("lm_slope_", variable)]]
     )
     
-    # Generate annotations
-    annotations <- lapply(names(annotation_positions), function(annotation_name) {
-      y_pos <- annotation_positions[[annotation_name]]
-      label_func <- annotation_labels[[annotation_name]]
-      if (!is.null(label_func)) {
-        label <- label_func(df, variable)
-        list(x = 1, y = y_pos, label = label)
-      } else {
-        message(paste("Warning: No annotation function found for", annotation_name))
-        NULL
-      }
-    })
-    
-    # Remove NULL annotations
-    annotations <- Filter(Negate(is.null), annotations)
-    
     # Filter the data based on y-limits and missing values
     y_var_sym <- sym(variable)
     x_var_sym <- sym("conc_num_factor")
@@ -677,6 +656,39 @@ generate_interaction_plot_configs <- function(df, variables) {
     filtered_data_list[[variable]] <- filtered_data
     out_of_range_data_list[[variable]] <- data_to_filter_out
     
+    # Calculate x and y positions for annotations based on filtered data
+    x_levels <- levels(filtered_data$conc_num_factor)
+    x_pos <- mean(seq_along(x_levels))  # Midpoint of x-axis
+
+    y_min <- min(ylim_vals)
+    y_max <- max(ylim_vals)
+    y_span <- y_max - y_min
+    
+    # Adjust y positions as fractions of y-span
+    annotation_positions <- list(
+      ZShift = y_max - 0.1 * y_span,
+      lm_ZScore = y_max - 0.2 * y_span,
+      NG = y_min + 0.2 * y_span,
+      DB = y_min + 0.1 * y_span,
+      SM = y_min + 0.05 * y_span
+    )
+    
+    # Generate annotations
+    annotations <- lapply(names(annotation_positions), function(annotation_name) {
+      y_pos <- annotation_positions[[annotation_name]]
+      label_func <- annotation_labels[[annotation_name]]
+      if (!is.null(label_func)) {
+        label <- label_func(df, variable)
+        list(x = x_pos, y = y_pos, label = label)
+      } else {
+        message(paste("Warning: No annotation function found for", annotation_name))
+        NULL
+      }
+    })
+    
+    # Remove NULL annotations
+    annotations <- Filter(Negate(is.null), annotations)
+    
     # Create scatter plot config
     configs[[length(configs) + 1]] <- list(
       df = filtered_data,
@@ -688,11 +700,11 @@ generate_interaction_plot_configs <- function(df, variables) {
       annotations = annotations,
       lm_line = lm_line,
       error_bar = TRUE,
-      x_breaks = unique(df$conc_num_factor),
-      x_labels = unique(as.character(df$conc_num)),
+      x_breaks = levels(filtered_data$conc_num_factor),
+      x_labels = levels(filtered_data$conc_num_factor),
       x_label = unique(df$Drug[1]),
       position = "jitter",
-      coord_cartesian = c(min(ylim_vals), max(ylim_vals))
+      coord_cartesian = ylim_vals  # Use the actual y-limits
     )
     
     # Create box plot config
@@ -705,10 +717,10 @@ generate_interaction_plot_configs <- function(df, variables) {
       ylim_vals = ylim_vals,
       annotations = annotations,
       error_bar = FALSE,
-      x_breaks = unique(df$conc_num_factor),
-      x_labels = unique(as.character(df$conc_num)),
+      x_breaks = unique(filtered_data$conc_num_factor),
+      x_labels = unique(as.character(filtered_data$conc_num)),
       x_label = unique(df$Drug[1]),
-      coord_cartesian = c(min(ylim_vals), max(ylim_vals))
+      coord_cartesian = ylim_vals
     )
   }
   
@@ -718,6 +730,7 @@ generate_interaction_plot_configs <- function(df, variables) {
   
   return(list(
     configs = configs,
+    filtered_data = filtered_data_df,
     out_of_range_data = out_of_range_data_df
   ))
 }
@@ -823,7 +836,7 @@ main <- function() {
     df_no_zeros <- df_na %>% filter(L > 0)
     
     # Save some constants
-    max_conc <- max(df$conc_num_factor)
+    max_conc <- max(df$conc_num)
     l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
     k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
 
@@ -901,7 +914,7 @@ main <- function() {
         plot_type = "scatter",
         delta_bg_point = TRUE,
         title = "Raw L vs K before quality control",
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         error_bar = FALSE,
         legend_position = "right"
       )
@@ -914,7 +927,7 @@ main <- function() {
         y_var = NULL,
         plot_type = "density",
         title = "Plate analysis by Drug Conc for Delta Background before quality control",
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         x_label = "Delta Background",
         y_label = "Density",
         error_bar = FALSE,
@@ -925,7 +938,7 @@ main <- function() {
         y_var = NULL,
         plot_type = "bar",
         title = "Plate analysis by Drug Conc for Delta Background before quality control",
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         x_label = "Delta Background",
         y_label = "Count",
         error_bar = FALSE,
@@ -941,7 +954,7 @@ main <- function() {
         delta_bg_point = TRUE,
         title = paste("Raw L vs K for strains above Delta Background threshold of",
           df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         position = "jitter",
         annotations = list(
           x = l_half_median,
@@ -969,7 +982,7 @@ main <- function() {
           plot_type = "scatter",
           title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
           error_bar = TRUE,
-          color_var = "conc_num",
+          color_var = "conc_num_factor",
           position = "jitter")
 
         plate_analysis_plots <- append(plate_analysis_plots, list(config))
@@ -992,7 +1005,7 @@ main <- function() {
           plot_type = "box",
           title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
           error_bar = FALSE,
-          color_var = "conc_num")
+          color_var = "conc_num_factor")
 
         plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config))
       }
@@ -1007,7 +1020,7 @@ main <- function() {
         plot_type = "scatter",
         title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
         error_bar = TRUE,
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         position = "jitter")
 
       plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config))
@@ -1022,7 +1035,7 @@ main <- function() {
         plot_type = "box",
         title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
         error_bar = FALSE,
-        color_var = "conc_num"
+        color_var = "conc_num_factor"
       )
       plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config))
     }
@@ -1035,7 +1048,7 @@ main <- function() {
         plot_type = "scatter",
         delta_bg_point = TRUE,
         title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         position = "jitter",
         legend_position = "right"
       )
@@ -1049,26 +1062,22 @@ main <- function() {
         plot_type = "scatter",
         gene_point = TRUE,
         title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
-        color_var = "conc_num",
+        color_var = "conc_num_factor",
         position = "jitter",
         legend_position = "right"
       )
     )
 
-    # message("Generating quality control plots")
-    # generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
-    # generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
-    # generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
-    # generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
-    # generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
-    # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
-    # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
-    # generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
-    # generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
-
-    # Clean up
-    rm(df, df_above_tolerance, df_no_zeros, df_no_zeros_stats, df_no_zeros_filtered_stats, ss)
-    gc()
+    message("Generating quality control plots")
+    generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
+    generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
+    generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
+    generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
+    generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
+    generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
+    generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
+    generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
+    generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
 
     # TODO: Originally this filtered L NA's
     # Let's try to avoid for now since stats have already been calculated
@@ -1160,7 +1169,8 @@ main <- function() {
       results <- generate_interaction_plot_configs(zscores_interactions_reference_joined, interaction_vars)
       if (nrow(results$out_of_range_data) > 0) {
         message("Out-of-range data:")
-        print(results$out_of_range_data)
+        print(results$out_of_range_data %>% select("OrfRep", "Gene", "num",
+          "conc_num", "conc_num_factor", config$x_var, config$y_var))
       }
       reference_plot_configs <- results$configs
       generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))