Browse Source

Move y-limit filtering from generate_and_save_plots to generate_interaction_plot_configs

Bryan Roessler 9 months ago
parent
commit
c6fec56630
1 changed files with 71 additions and 74 deletions
  1. 71 74
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 71 - 74
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -525,36 +525,17 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
       config <- group$plots[[i]]
       config <- group$plots[[i]]
       df <- config$df
       df <- config$df
 
 
-      # Filter and debug out-of-bounds data
-      if (!is.null(config$ylim_vals)) {
-        out_of_bounds <- df %>%
-          filter(
-            is.na(.data[[config$y_var]]) |
-            .data[[config$y_var]] < config$ylim_vals[1] |
-            .data[[config$y_var]] > config$ylim_vals[2]
-          )
-        if (nrow(out_of_bounds) > 0) {
-          message("Filtered ", nrow(out_of_bounds), " row(s) from '", config$title, "' because ", config$y_var,
-                  " is outside of y-limits: [", config$ylim_vals[1], ", ", config$ylim_vals[2], "]:")
-          # print(out_of_bounds %>% select(OrfRep, Gene, num, Drug, scan, Plate, Row, Col, conc_num, all_of(config$y_var)), width = 1000)
-        }
-        df <- df %>%
-          filter(
-            !is.na(.data[[config$y_var]]) &
-            .data[[config$y_var]] >= config$ylim_vals[1] &
-            .data[[config$y_var]] <= config$ylim_vals[2]
-          )
-      }
-
       # Filter NAs
       # Filter NAs
       if (!is.null(config$filter_na) && config$filter_na) {
       if (!is.null(config$filter_na) && config$filter_na) {
         df <- df %>%
         df <- df %>%
           filter(!is.na(.data[[config$y_var]]))
           filter(!is.na(.data[[config$y_var]]))
       }
       }
 
 
+      # TODO for now skip all NA plots NA data
+      # Eventually add to own or filter_na block so we can handle selectively
       if (nrow(df) == 0) {
       if (nrow(df) == 0) {
-        message("No data available after filtering for plot ", config$title)
-        next  # Skip this plot if no data is available
+        message("Insufficient data for plot:", config$title)
+        next  # skip plot if insufficient data is available
       }
       }
 
 
       aes_mapping <- if (config$plot_type == "bar") {
       aes_mapping <- if (config$plot_type == "bar") {
@@ -599,10 +580,10 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
       if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
       if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
       if (!is.null(config$coord_cartesian)) plot <- plot + coord_cartesian(ylim = config$coord_cartesian)
       if (!is.null(config$coord_cartesian)) plot <- plot + coord_cartesian(ylim = config$coord_cartesian)
 
 
-      plotly_plot <- suppressWarnings(plotly::ggplotly(plot))
+      #plotly_plot <- suppressWarnings(plotly::ggplotly(plot))
 
 
       static_plots[[i]] <- plot
       static_plots[[i]] <- plot
-      plotly_plots[[i]] <- plotly_plot
+      #plotly_plots[[i]] <- plotly_plot
     }
     }
 
 
     grid_layout <- group$grid_layout
     grid_layout <- group$grid_layout
@@ -624,16 +605,11 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
         static_plots <- c(static_plots, replicate(total_spots - num_plots, nullGrob(), simplify = FALSE))
         static_plots <- c(static_plots, replicate(total_spots - num_plots, nullGrob(), simplify = FALSE))
       }
       }
 
 
-      tryCatch({
-        grid.arrange(
-          grobs = static_plots,
-          ncol = grid_layout$ncol,
-          nrow = grid_layout$nrow
-        )
-      }, error = function(e) {
-        message("Error in grid.arrange: ", e$message)
-        print(static_plots)
-      })
+      grid.arrange(
+        grobs = static_plots,
+        ncol = grid_layout$ncol,
+        nrow = grid_layout$nrow
+      )
       
       
     } else {
     } else {
       for (plot in static_plots) {
       for (plot in static_plots) {
@@ -644,15 +620,14 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
 
 
   dev.off()
   dev.off()
 
 
-  out_html_file <- file.path(out_dir, paste0(filename, ".html"))
-  message("Saving combined HTML file: ", out_html_file)
-  htmltools::save_html(
-    htmltools::tagList(plotly_plots),
-    file = out_html_file
-  )
+  # out_html_file <- file.path(out_dir, paste0(filename, ".html"))
+  # message("Saving combined HTML file: ", out_html_file)
+  # htmltools::save_html(
+  #   htmltools::tagList(plotly_plots),
+  #   file = out_html_file
+  # )
 }
 }
 
 
-
 generate_scatter_plot <- function(plot, config) {
 generate_scatter_plot <- function(plot, config) {
 
 
   # Define the points
   # Define the points
@@ -790,7 +765,7 @@ generate_scatter_plot <- function(plot, config) {
             )
             )
           }
           }
         } else {
         } else {
-          message("Skipping linear modeling line due to y-values outside of limits.")
+          message("Skipping linear regression line due to y-values outside of limits")
         }
         }
       } else {
       } else {
         # If no y-limits are provided, proceed with the annotation
         # If no y-limits are provided, proceed with the annotation
@@ -805,7 +780,7 @@ generate_scatter_plot <- function(plot, config) {
         )
         )
       }
       }
     } else {
     } else {
-      message("Skipping linear modeling line due to missing or invalid values.")
+      message("Skipping linear regression line due to missing or invalid values")
     }
     }
   }
   }
 
 
@@ -984,7 +959,6 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
         )
         )
         plot_config$position <- "jitter"
         plot_config$position <- "jitter"
 
 
-        # Cannot figure out how to place these properly for discrete x-axis so let's be hacky
         annotations <- list(
         annotations <- list(
           list(x = 0.25, y = y_limits[1] + 0.08 * y_span, label = "                NG =", size = 4),
           list(x = 0.25, y = y_limits[1] + 0.08 * y_span, label = "                NG =", size = 4),
           list(x = 0.25, y = y_limits[1] + 0.04 * y_span, label = "                DB =", size = 4),
           list(x = 0.25, y = y_limits[1] + 0.04 * y_span, label = "                DB =", size = 4),
@@ -1045,26 +1019,50 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
     for (var in names(delta_limits_map)) {
     for (var in names(delta_limits_map)) {
       y_limits <- delta_limits_map[[var]]
       y_limits <- delta_limits_map[[var]]
       y_span <- y_limits[2] - y_limits[1]
       y_span <- y_limits[2] - y_limits[1]
+      y_var_name <- paste0("Delta_", var)
 
 
-      WT_sd_value <- first(group_data[[paste0("WT_sd_", var)]], default = 0)
-      Z_Shift_value <- round(first(group_data[[paste0("Z_Shift_", var)]], default = 0), 2)
-      Z_lm_value <- round(first(group_data[[paste0("Z_lm_", var)]], default = 0), 2)
-      R_squared_value <- round(first(group_data[[paste0("R_Squared_", var)]], default = 0), 2)
+      # Anti-filter to select out-of-bounds rows
+      out_of_bounds <- group_data %>%
+        filter(is.na(.data[[y_var_name]]) |
+        .data[[y_var_name]] < y_limits[1] |
+        .data[[y_var_name]] > y_limits[2])
+      
+      if (nrow(out_of_bounds) > 0) {
+        message(sprintf("Filtered %d row(s) from '%s' because %s is outside of y-limits: [%f, %f]",
+          nrow(out_of_bounds), OrfRepTitle, y_var_name, y_limits[1], y_limits[2]
+        ))
+      }
 
 
-      NG_value <- first(group_data$NG, default = 0)
-      DB_value <- first(group_data$DB, default = 0)
-      SM_value <- first(group_data$SM, default = 0)
+      # Do the actual filtering
+      group_data_filtered <- group_data %>%
+        filter(!is.na(.data[[y_var_name]]) &
+          .data[[y_var_name]] >= y_limits[1] &
+          .data[[y_var_name]] <= y_limits[2])
 
 
+      if (nrow(group_data_filtered) == 0) {
+        message("Insufficient data for plot: ", OrfRepTitle, " ", var)
+        next  # skip plot if insufficient data is available
+      }
+      
+      WT_sd_value <- first(group_data_filtered[[paste0("WT_sd_", var)]], default = 0)
+      Z_Shift_value <- round(first(group_data_filtered[[paste0("Z_Shift_", var)]], default = 0), 2)
+      Z_lm_value <- round(first(group_data_filtered[[paste0("Z_lm_", var)]], default = 0), 2)
+      R_squared_value <- round(first(group_data_filtered[[paste0("R_Squared_", var)]], default = 0), 2)
+      
+      NG_value <- first(group_data_filtered$NG, default = 0)
+      DB_value <- first(group_data_filtered$DB, default = 0)
+      SM_value <- first(group_data_filtered$SM, default = 0)
+      
       lm_intercept_col <- paste0("lm_intercept_", var)
       lm_intercept_col <- paste0("lm_intercept_", var)
       lm_slope_col <- paste0("lm_slope_", var)
       lm_slope_col <- paste0("lm_slope_", var)
-      lm_intercept_value <- first(group_data[[lm_intercept_col]], default = 0)
-      lm_slope_value <- first(group_data[[lm_slope_col]], default = 0)
-
+      lm_intercept_value <- first(group_data_filtered[[lm_intercept_col]], default = 0)
+      lm_slope_value <- first(group_data_filtered[[lm_slope_col]], default = 0)
+      
       plot_config <- list(
       plot_config <- list(
-        df = group_data,
+        df = group_data_filtered,
         plot_type = "scatter",
         plot_type = "scatter",
         x_var = "conc_num_factor_factor",
         x_var = "conc_num_factor_factor",
-        y_var = paste0("Delta_", var),
+        y_var = y_var_name,
         x_label = paste0("[", unique(df_summary$Drug)[1], "]"),
         x_label = paste0("[", unique(df_summary$Drug)[1], "]"),
         shape = 16,
         shape = 16,
         title = paste(OrfRepTitle, Gene, sep = "      "),
         title = paste(OrfRepTitle, Gene, sep = "      "),
@@ -1087,27 +1085,26 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
           color = "gray70",
           color = "gray70",
           linewidth = 0.5
           linewidth = 0.5
         ),
         ),
-        x_breaks = unique(group_data$conc_num_factor_factor),
-        x_labels = as.character(unique(group_data$conc_num)),
+        x_breaks = unique(group_data_filtered$conc_num_factor_factor),
+        x_labels = as.character(unique(group_data_filtered$conc_num)),
         ylim_vals = y_limits,
         ylim_vals = y_limits,
-        # filter_na = TRUE,
         lm_line = list(
         lm_line = list(
           intercept = lm_intercept_value,
           intercept = lm_intercept_value,
           slope = lm_slope_value,
           slope = lm_slope_value,
           color = "blue",
           color = "blue",
           linewidth = 0.8,
           linewidth = 0.8,
-          x_min = min(as.numeric(group_data$conc_num_factor_factor)),
-          x_max = max(as.numeric(group_data$conc_num_factor_factor))
+          x_min = min(as.numeric(group_data_filtered$conc_num_factor_factor)),
+          x_max = max(as.numeric(group_data_filtered$conc_num_factor_factor))
         )
         )
       )
       )
       delta_plot_configs <- append(delta_plot_configs, list(plot_config))
       delta_plot_configs <- append(delta_plot_configs, list(plot_config))
     }
     }
   }
   }
-
+  
   # Group delta plots in chunks of 12 per page
   # Group delta plots in chunks of 12 per page
   chunk_size <- 12
   chunk_size <- 12
   delta_plot_chunks <- split(delta_plot_configs, ceiling(seq_along(delta_plot_configs) / chunk_size))
   delta_plot_chunks <- split(delta_plot_configs, ceiling(seq_along(delta_plot_configs) / chunk_size))
-
+  
   return(c(
   return(c(
     list(list(grid_layout = list(ncol = 2), plots = stats_plot_configs)),
     list(list(grid_layout = list(ncol = 2), plots = stats_plot_configs)),
     list(list(grid_layout = list(ncol = 2), plots = stats_boxplot_configs)),
     list(list(grid_layout = list(ncol = 2), plots = stats_boxplot_configs)),
@@ -1587,16 +1584,16 @@ main <- function() {
         group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor")
         group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor")
         )$df_with_stats
         )$df_with_stats
       
       
-      # message("Calculating reference strain interaction scores")
-      # reference_results <- calculate_interaction_scores(df_reference_interaction_stats, df_bg_stats, "reference")
-      # df_reference_interactions_joined <- reference_results$full_data
-      # df_reference_interactions <- reference_results$interactions
-      # write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
-      # write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
-
-      # message("Generating reference interaction plots")
-      # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
-      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
+      message("Calculating reference strain interaction scores")
+      reference_results <- calculate_interaction_scores(df_reference_interaction_stats, df_bg_stats, "reference")
+      df_reference_interactions_joined <- reference_results$full_data
+      df_reference_interactions <- reference_results$interactions
+      write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
+      write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
+
+      message("Generating reference interaction plots")
+      reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
+      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
 
 
       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
       df_deletion <- df_na_stats %>% # formerly X2
       df_deletion <- df_na_stats %>% # formerly X2