Browse Source

Break out plot filtering

Bryan Roessler 7 months ago
parent
commit
e07baf7a37
2 changed files with 94 additions and 125 deletions
  1. 92 88
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R
  2. 2 37
      qhtcp-workflow/qhtcp-workflow

+ 92 - 88
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -1,11 +1,12 @@
 suppressMessages({
-  library(ggplot2)
-  library(plotly)
-  library(htmlwidgets)
-  library(dplyr)
-  library(ggthemes)
-  library(data.table)
-  library(unix)
+  library("ggplot2")
+  library("plotly")
+  library("htmlwidgets")
+  library("dplyr")
+  library("rlang")
+  library("ggthemes")
+  library("data.table")
+  library("unix")
 })
 
 options(warn = 2)
@@ -568,13 +569,9 @@ generate_box_plot <- function(plot, config) {
 }
 
 generate_interaction_plot_configs <- function(df, variables) {
+
   configs <- list()
-  
-  # Data frames to collect filtered data and out-of-range data
-  filtered_data_list <- list()
-  out_of_range_data_list <- list()
-  
-  # Define common y-limits for each variable
+
   limits_map <- list(
     L = c(-65, 65),
     K = c(-65, 65),
@@ -596,44 +593,22 @@ generate_interaction_plot_configs <- function(df, variables) {
     DB = function(df, var) paste("DB =", df$DB),
     SM = function(df, var) paste("SM =", df$SM)
   )
+
+  results <- filter_data_for_plots(df, variables, limits_map)
+  df_filtered <- results$df_filtered
+  lm_lines <- filtered_results$lm_lines
   
+  # Iterate over each variable to create plot configurations
   for (variable in variables) {
-    # Get y-limits for the variable
-    ylim_vals <- limits_map[[variable]]
-    
-    # Extract precomputed linear model coefficients
-    lm_line <- list(
-      intercept = df[[paste0("lm_intercept_", variable)]],
-      slope = df[[paste0("lm_slope_", variable)]]
-    )
-    
-    # Filter the data based on y-limits and missing values
-    y_var_sym <- sym(variable)
-    x_var_sym <- sym("conc_num_factor")
-    
-    # Identify missing data and out-of-range data
-    missing_data <- df %>% filter(is.na(!!x_var_sym) | is.na(!!y_var_sym))
-    out_of_range_data <- df %>% filter(
-      !is.na(!!y_var_sym) & 
-        (!!y_var_sym < min(ylim_vals, na.rm = TRUE) | !!y_var_sym > max(ylim_vals, na.rm = TRUE))
-    )
-    
-    # Combine missing data and out-of-range data
-    data_to_filter_out <- bind_rows(missing_data, out_of_range_data) %>% distinct()
-    
-    # Filtered data for plotting
-    filtered_data <- df %>% anti_join(data_to_filter_out, by = names(df))
-    
-    # Collect the filtered data and out-of-range data
-    filtered_data_list[[variable]] <- filtered_data
-    out_of_range_data_list[[variable]] <- data_to_filter_out
     
     # Calculate x and y positions for annotations based on filtered data
-    x_levels <- levels(filtered_data$conc_num_factor)
-    x_pos <- mean(seq_along(x_levels))  # Midpoint of x-axis
-
-    y_min <- min(ylim_vals)
-    y_max <- max(ylim_vals)
+    x_levels <- levels(df_filtered$conc_num_factor)
+    num_levels <- length(x_levels)
+    x_pos <- (1 + num_levels) / 2  # Midpoint of x-axis positions
+    
+    y_range <- limits_map[[variable]]
+    y_min <- min(y_range)
+    y_max <- max(y_range)
     y_span <- y_max - y_min
     
     # Adjust y positions as fractions of y-span
@@ -650,7 +625,7 @@ generate_interaction_plot_configs <- function(df, variables) {
       y_pos <- annotation_positions[[annotation_name]]
       label_func <- annotation_labels[[annotation_name]]
       if (!is.null(label_func)) {
-        label <- label_func(df, variable)
+        label <- label_func(df_filtered, variable)
         list(x = x_pos, y = y_pos, label = label)
       } else {
         message(paste("Warning: No annotation function found for", annotation_name))
@@ -663,48 +638,40 @@ generate_interaction_plot_configs <- function(df, variables) {
     
     # Create scatter plot config
     configs[[length(configs) + 1]] <- list(
-      df = filtered_data,
+      df = df_filtered,
       x_var = "conc_num_factor",
       y_var = variable,
       plot_type = "scatter",
-      title = sprintf("%s      %s", df$OrfRep[1], df$Gene[1]),
-      ylim_vals = ylim_vals,
+      title = sprintf("%s      %s", df_filtered$OrfRep[1], df_filteredGene[1]),
+      ylim_vals = y_range,
       annotations = annotations,
-      lm_line = lm_line,
+      lm_line = lm_lines[[variable]],
       error_bar = TRUE,
-      x_breaks = levels(filtered_data$conc_num_factor),
-      x_labels = levels(filtered_data$conc_num_factor),
+      x_breaks = levels(df_filtered$conc_num_factor),
+      x_labels = levels(df_filtered$conc_num_factor),
       x_label = unique(df$Drug[1]),
       position = "jitter",
-      coord_cartesian = ylim_vals  # Use the actual y-limits
+      coord_cartesian = y_range  # Use the actual y-limits
     )
     
     # Create box plot config
     configs[[length(configs) + 1]] <- list(
-      df = filtered_data,
+      df = df_filtered,
       x_var = "conc_num_factor",
       y_var = variable,
       plot_type = "box",
-      title = sprintf("%s      %s (Boxplot)", df$OrfRep[1], df$Gene[1]),
-      ylim_vals = ylim_vals,
+      title = sprintf("%s      %s (Boxplot)", df_filtered$OrfRep[1], df_filtered$Gene[1]),
+      ylim_vals = y_range,
       annotations = annotations,
       error_bar = FALSE,
-      x_breaks = unique(filtered_data$conc_num_factor),
-      x_labels = unique(as.character(filtered_data$conc_num)),
-      x_label = unique(df$Drug[1]),
-      coord_cartesian = ylim_vals
+      x_breaks = levels(df_filtered$conc_num_factor),
+      x_labels = levels(df_filtered$conc_num_factor),
+      x_label = unique(df_filtered$Drug[1]),
+      coord_cartesian = y_range
     )
   }
   
-  # Combine the filtered data and out-of-range data into data frames
-  filtered_data_df <- bind_rows(filtered_data_list, .id = "variable")
-  out_of_range_data_df <- bind_rows(out_of_range_data_list, .id = "variable")
-  
-  return(list(
-    configs = configs,
-    filtered_data = filtered_data_df,
-    out_of_range_data = out_of_range_data_df
-  ))
+  return(configs)
 }
 
 generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L", "K"), is_lm = FALSE, adjust = FALSE) {
@@ -822,6 +789,54 @@ filter_and_print_non_finite <- function(df, vars_to_check, print_vars) {
   df %>% filter(if_all(all_of(vars_to_check), is.finite))
 }
 
+filter_data_for_plots <- function(df, variables, limits_map) {
+
+  # Initialize lists to store lm lines and filtered data
+  lm_lines <- list()
+  
+  # Print out NA and out-of-range data separately
+  for (variable in variables) {
+    # Get y-limits for the variable
+    ylim_vals <- limits_map[[variable]]
+    
+    # Extract precomputed linear model coefficients
+    lm_lines[[variable]] <- list(
+      intercept = df[[paste0("lm_intercept_", variable)]],
+      slope = df[[paste0("lm_slope_", variable)]]
+    )
+    
+    # Convert variable name to symbol for dplyr
+    y_var_sym <- sym(variable)
+    
+    # Identify missing data and print it
+    missing_data <- df %>% filter(is.na(!!y_var_sym))
+    if (nrow(missing_data) > 0) {
+      message("Missing data for variable ", variable, ":")
+      print(missing_data)
+    }
+    
+    # Identify out-of-range data and print it
+    out_of_range_data <- df %>% filter(
+      !is.na(!!y_var_sym) &
+        (!!y_var_sym < min(ylim_vals, na.rm = TRUE) | !!y_var_sym > max(ylim_vals, na.rm = TRUE))
+    )
+    if (nrow(out_of_range_data) > 0) {
+      message("Out-of-range data for variable ", variable, ":")
+      print(out_of_range_data)
+    }
+  }
+
+  # Perform all filtering at once for all variables
+  df_filtered <- df %>% filter(across(all_of(variables), ~ !is.na(.))) %>%
+    filter(across(all_of(variables), ~ between(., limits_map[[cur_column()]][1], limits_map[[cur_column()]][2]), .names = "filter_{col}"))
+
+  # Return the filtered dataframe and lm lines
+  return(list(
+    df_filtered = df_filtered,
+    lm_lines = lm_lines
+  ))
+}
+
 main <- function() {
   lapply(names(args$experiments), function(exp_name) {
     exp <- args$experiments[[exp_name]]
@@ -1151,22 +1166,11 @@ main <- function() {
 
       # Create interaction plots
       message("Generating reference interaction plots")
-      results <- generate_interaction_plot_configs(zscores_interactions_reference_joined, interaction_vars)
-      if (nrow(results$out_of_range_data) > 0) {
-        message("Out-of-range data:")
-        print(results$out_of_range_data %>% select("OrfRep", "Gene", "num",
-          "conc_num", "conc_num_factor", config$x_var, config$y_var))
-      }
-      reference_plot_configs <- results$configs
+      reference_plot_configs <- generate_interaction_plot_configs(zscores_interactions_reference_joined, interaction_vars)
       generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
 
       message("Generating deletion interaction plots")
-      results <- generate_interaction_plot_configs(zscores_interactions_joined, interaction_vars)
-      if (nrow(results$out_of_range_data) > 0) {
-        message("Out-of-range data:")
-        print(results$out_of_range_data)
-      }
-      deletion_plot_configs <- results$configs
+      deletion_plot_configs <- generate_interaction_plot_configs(zscores_interactions_joined, interaction_vars)
       generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
 
       # Define conditions for enhancers and suppressors
@@ -1253,10 +1257,10 @@ main <- function() {
         ungroup() %>%
         rowwise() %>%
         mutate(
-          lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared,
-          lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared,
-          lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared,
-          lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared,
+          lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA,
+          lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA,
+          lm_R_squared_r = if (n() > 1) summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared else NA,
+          lm_R_squared_AUC = if (n() > 1) summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared else NA,
 
           Overlap = case_when(
             Z_lm_L >= 2 & Avg_Zscore_L >= 2 ~ "Deletion Enhancer Both",

+ 2 - 37
qhtcp-workflow/qhtcp-workflow

@@ -614,41 +614,6 @@ interactive_header() {
 module install_dependencies
 # @description This module will automatically install the dependencies for running QHTCP.
 #
-# If you wish to install them manually, you can use the following information to do so:
-#
-# #### System dependencies
-#
-# * R
-# * Perl
-# * Java
-# * MATLAB
-#
-# #### MacOS
-#
-# * `export HOMEBREW_BREW_GIT_REMOTE=https://github.com/Homebrew/brew`
-# * `/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"`
-# * `cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder`
-# * `brew install graphiz gd pdftk-java pandoc shdoc nano rsync coreutils`
-#
-# #### Linux DEB
-#
-# * `apt install graphviz pandoc pdftk-java libgd-dev perl shdoc nano rsync coreutils libcurl-dev openssl-dev`
-#
-# #### Linux RPM
-#
-# * `dnf install graphviz pandoc pdftk-java gd-devel perl-CPAN shdoc nano rsync coreutils libcurl-devel openssl-devel`
-#
-# #### Perl
-#
-# * `cpan -I -i File::Map ExtUtils::PkgConfig GD GO::TermFinder`
-#
-# #### R
-#
-# * `install.packages(c('BiocManager', 'ontologyIndex', 'ggrepel', 'tidyverse', 'sos', 'openxlsx', 'ggplot2', 'plyr', 'extrafont', 'gridExtra', 'gplots', 'stringr', 'plotly', 'ggthemes', 'pandoc', 'rmarkdown', 'plotly', 'htmlwidgets'), dep=TRUE)`
-# * `BiocManager::install('UCSC.utils')`
-# * `BiocManager::install('org.Sc.sgd.db')`
-#
-#
 install_dependencies() {
   debug "Running: ${FUNCNAME[0]} $*"
 
@@ -669,8 +634,8 @@ install_dependencies() {
     ExtUtils::PkgConfig IPC::Run Module::Build::Tiny GD GO::TermFinder)
   depends_r=(
     BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 
-    plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc
-    rmarkdown plotly htmlwidgets gplots gdata Hmisc)
+    dplyr rlang data.table unix gridExtra gplots stringr plotly ggthemes pandoc
+    rmarkdown htmlwidgets gdata Hmisc)
   depends_bioc=(UCSC.utils org.Sc.sgd.db)
 
   [[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars