From e07baf7a371c7dcc8477ecc6a585ccc2def6df8f Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 16 Sep 2024 13:55:00 -0400 Subject: [PATCH] Break out plot filtering --- .../apps/r/calculate_interaction_zscores.R | 180 +++++++++--------- qhtcp-workflow/qhtcp-workflow | 39 +--- 2 files changed, 94 insertions(+), 125 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 6f8de734..ec9138e6 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -1,11 +1,12 @@ suppressMessages({ - library(ggplot2) - library(plotly) - library(htmlwidgets) - library(dplyr) - library(ggthemes) - library(data.table) - library(unix) + library("ggplot2") + library("plotly") + library("htmlwidgets") + library("dplyr") + library("rlang") + library("ggthemes") + library("data.table") + library("unix") }) options(warn = 2) @@ -568,13 +569,9 @@ generate_box_plot <- function(plot, config) { } generate_interaction_plot_configs <- function(df, variables) { + configs <- list() - - # Data frames to collect filtered data and out-of-range data - filtered_data_list <- list() - out_of_range_data_list <- list() - - # Define common y-limits for each variable + limits_map <- list( L = c(-65, 65), K = c(-65, 65), @@ -596,44 +593,22 @@ generate_interaction_plot_configs <- function(df, variables) { DB = function(df, var) paste("DB =", df$DB), SM = function(df, var) paste("SM =", df$SM) ) + + results <- filter_data_for_plots(df, variables, limits_map) + df_filtered <- results$df_filtered + lm_lines <- filtered_results$lm_lines + # Iterate over each variable to create plot configurations for (variable in variables) { - # Get y-limits for the variable - ylim_vals <- limits_map[[variable]] - - # Extract precomputed linear model coefficients - lm_line <- list( - intercept = df[[paste0("lm_intercept_", variable)]], - slope = df[[paste0("lm_slope_", variable)]] - ) - - # Filter the data based on y-limits and missing values - y_var_sym <- sym(variable) - x_var_sym <- sym("conc_num_factor") - - # Identify missing data and out-of-range data - missing_data <- df %>% filter(is.na(!!x_var_sym) | is.na(!!y_var_sym)) - out_of_range_data <- df %>% filter( - !is.na(!!y_var_sym) & - (!!y_var_sym < min(ylim_vals, na.rm = TRUE) | !!y_var_sym > max(ylim_vals, na.rm = TRUE)) - ) - - # Combine missing data and out-of-range data - data_to_filter_out <- bind_rows(missing_data, out_of_range_data) %>% distinct() - - # Filtered data for plotting - filtered_data <- df %>% anti_join(data_to_filter_out, by = names(df)) - - # Collect the filtered data and out-of-range data - filtered_data_list[[variable]] <- filtered_data - out_of_range_data_list[[variable]] <- data_to_filter_out # Calculate x and y positions for annotations based on filtered data - x_levels <- levels(filtered_data$conc_num_factor) - x_pos <- mean(seq_along(x_levels)) # Midpoint of x-axis - - y_min <- min(ylim_vals) - y_max <- max(ylim_vals) + x_levels <- levels(df_filtered$conc_num_factor) + num_levels <- length(x_levels) + x_pos <- (1 + num_levels) / 2 # Midpoint of x-axis positions + + y_range <- limits_map[[variable]] + y_min <- min(y_range) + y_max <- max(y_range) y_span <- y_max - y_min # Adjust y positions as fractions of y-span @@ -650,7 +625,7 @@ generate_interaction_plot_configs <- function(df, variables) { y_pos <- annotation_positions[[annotation_name]] label_func <- annotation_labels[[annotation_name]] if (!is.null(label_func)) { - label <- label_func(df, variable) + label <- label_func(df_filtered, variable) list(x = x_pos, y = y_pos, label = label) } else { message(paste("Warning: No annotation function found for", annotation_name)) @@ -663,48 +638,40 @@ generate_interaction_plot_configs <- function(df, variables) { # Create scatter plot config configs[[length(configs) + 1]] <- list( - df = filtered_data, + df = df_filtered, x_var = "conc_num_factor", y_var = variable, plot_type = "scatter", - title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]), - ylim_vals = ylim_vals, + title = sprintf("%s %s", df_filtered$OrfRep[1], df_filteredGene[1]), + ylim_vals = y_range, annotations = annotations, - lm_line = lm_line, + lm_line = lm_lines[[variable]], error_bar = TRUE, - x_breaks = levels(filtered_data$conc_num_factor), - x_labels = levels(filtered_data$conc_num_factor), + x_breaks = levels(df_filtered$conc_num_factor), + x_labels = levels(df_filtered$conc_num_factor), x_label = unique(df$Drug[1]), position = "jitter", - coord_cartesian = ylim_vals # Use the actual y-limits + coord_cartesian = y_range # Use the actual y-limits ) # Create box plot config configs[[length(configs) + 1]] <- list( - df = filtered_data, + df = df_filtered, x_var = "conc_num_factor", y_var = variable, plot_type = "box", - title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]), - ylim_vals = ylim_vals, + title = sprintf("%s %s (Boxplot)", df_filtered$OrfRep[1], df_filtered$Gene[1]), + ylim_vals = y_range, annotations = annotations, error_bar = FALSE, - x_breaks = unique(filtered_data$conc_num_factor), - x_labels = unique(as.character(filtered_data$conc_num)), - x_label = unique(df$Drug[1]), - coord_cartesian = ylim_vals + x_breaks = levels(df_filtered$conc_num_factor), + x_labels = levels(df_filtered$conc_num_factor), + x_label = unique(df_filtered$Drug[1]), + coord_cartesian = y_range ) } - # Combine the filtered data and out-of-range data into data frames - filtered_data_df <- bind_rows(filtered_data_list, .id = "variable") - out_of_range_data_df <- bind_rows(out_of_range_data_list, .id = "variable") - - return(list( - configs = configs, - filtered_data = filtered_data_df, - out_of_range_data = out_of_range_data_df - )) + return(configs) } generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L", "K"), is_lm = FALSE, adjust = FALSE) { @@ -822,6 +789,54 @@ filter_and_print_non_finite <- function(df, vars_to_check, print_vars) { df %>% filter(if_all(all_of(vars_to_check), is.finite)) } +filter_data_for_plots <- function(df, variables, limits_map) { + + # Initialize lists to store lm lines and filtered data + lm_lines <- list() + + # Print out NA and out-of-range data separately + for (variable in variables) { + # Get y-limits for the variable + ylim_vals <- limits_map[[variable]] + + # Extract precomputed linear model coefficients + lm_lines[[variable]] <- list( + intercept = df[[paste0("lm_intercept_", variable)]], + slope = df[[paste0("lm_slope_", variable)]] + ) + + # Convert variable name to symbol for dplyr + y_var_sym <- sym(variable) + + # Identify missing data and print it + missing_data <- df %>% filter(is.na(!!y_var_sym)) + if (nrow(missing_data) > 0) { + message("Missing data for variable ", variable, ":") + print(missing_data) + } + + # Identify out-of-range data and print it + out_of_range_data <- df %>% filter( + !is.na(!!y_var_sym) & + (!!y_var_sym < min(ylim_vals, na.rm = TRUE) | !!y_var_sym > max(ylim_vals, na.rm = TRUE)) + ) + if (nrow(out_of_range_data) > 0) { + message("Out-of-range data for variable ", variable, ":") + print(out_of_range_data) + } + } + + # Perform all filtering at once for all variables + df_filtered <- df %>% filter(across(all_of(variables), ~ !is.na(.))) %>% + filter(across(all_of(variables), ~ between(., limits_map[[cur_column()]][1], limits_map[[cur_column()]][2]), .names = "filter_{col}")) + + # Return the filtered dataframe and lm lines + return(list( + df_filtered = df_filtered, + lm_lines = lm_lines + )) +} + main <- function() { lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] @@ -1151,22 +1166,11 @@ main <- function() { # Create interaction plots message("Generating reference interaction plots") - results <- generate_interaction_plot_configs(zscores_interactions_reference_joined, interaction_vars) - if (nrow(results$out_of_range_data) > 0) { - message("Out-of-range data:") - print(results$out_of_range_data %>% select("OrfRep", "Gene", "num", - "conc_num", "conc_num_factor", config$x_var, config$y_var)) - } - reference_plot_configs <- results$configs + reference_plot_configs <- generate_interaction_plot_configs(zscores_interactions_reference_joined, interaction_vars) generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) message("Generating deletion interaction plots") - results <- generate_interaction_plot_configs(zscores_interactions_joined, interaction_vars) - if (nrow(results$out_of_range_data) > 0) { - message("Out-of-range data:") - print(results$out_of_range_data) - } - deletion_plot_configs <- results$configs + deletion_plot_configs <- generate_interaction_plot_configs(zscores_interactions_joined, interaction_vars) generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) # Define conditions for enhancers and suppressors @@ -1253,10 +1257,10 @@ main <- function() { ungroup() %>% rowwise() %>% mutate( - lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared, - lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared, - lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared, - lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared, + lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA, + lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA, + lm_R_squared_r = if (n() > 1) summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared else NA, + lm_R_squared_AUC = if (n() > 1) summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared else NA, Overlap = case_when( Z_lm_L >= 2 & Avg_Zscore_L >= 2 ~ "Deletion Enhancer Both", diff --git a/qhtcp-workflow/qhtcp-workflow b/qhtcp-workflow/qhtcp-workflow index cbd9163d..608cae02 100755 --- a/qhtcp-workflow/qhtcp-workflow +++ b/qhtcp-workflow/qhtcp-workflow @@ -614,41 +614,6 @@ interactive_header() { module install_dependencies # @description This module will automatically install the dependencies for running QHTCP. # -# If you wish to install them manually, you can use the following information to do so: -# -# #### System dependencies -# -# * R -# * Perl -# * Java -# * MATLAB -# -# #### MacOS -# -# * `export HOMEBREW_BREW_GIT_REMOTE=https://github.com/Homebrew/brew` -# * `/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"` -# * `cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder` -# * `brew install graphiz gd pdftk-java pandoc shdoc nano rsync coreutils` -# -# #### Linux DEB -# -# * `apt install graphviz pandoc pdftk-java libgd-dev perl shdoc nano rsync coreutils libcurl-dev openssl-dev` -# -# #### Linux RPM -# -# * `dnf install graphviz pandoc pdftk-java gd-devel perl-CPAN shdoc nano rsync coreutils libcurl-devel openssl-devel` -# -# #### Perl -# -# * `cpan -I -i File::Map ExtUtils::PkgConfig GD GO::TermFinder` -# -# #### R -# -# * `install.packages(c('BiocManager', 'ontologyIndex', 'ggrepel', 'tidyverse', 'sos', 'openxlsx', 'ggplot2', 'plyr', 'extrafont', 'gridExtra', 'gplots', 'stringr', 'plotly', 'ggthemes', 'pandoc', 'rmarkdown', 'plotly', 'htmlwidgets'), dep=TRUE)` -# * `BiocManager::install('UCSC.utils')` -# * `BiocManager::install('org.Sc.sgd.db')` -# -# install_dependencies() { debug "Running: ${FUNCNAME[0]} $*" @@ -669,8 +634,8 @@ install_dependencies() { ExtUtils::PkgConfig IPC::Run Module::Build::Tiny GD GO::TermFinder) depends_r=( BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 - plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc - rmarkdown plotly htmlwidgets gplots gdata Hmisc) + dplyr rlang data.table unix gridExtra gplots stringr plotly ggthemes pandoc + rmarkdown htmlwidgets gdata Hmisc) depends_bioc=(UCSC.utils org.Sc.sgd.db) [[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars