From dc5501ea5da3327dddad9295696e487ade93fed9 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 9 Sep 2024 19:20:45 -0400 Subject: [PATCH] Improve generate_and_save_plots() --- .../apps/r/calculate_interaction_zscores5.R | 446 ++++++++++++------ 1 file changed, 290 insertions(+), 156 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 2760f67e..07d068cd 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -143,13 +143,10 @@ update_gene_names <- function(df, sgd_gene_list) { # Create a named vector for mapping ORF to GeneName gene_map <- setNames(genes$V5, genes$V4) - # Vectorized match to find the GeneName from gene_map mapped_genes <- gene_map[df$ORF] - # Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found) updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes) - # Ensure Gene is not left blank or incorrectly updated to "OCT1" df <- df %>% mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes)) @@ -313,104 +310,102 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } -generate_plot <- function(df, x_var, y_var = NULL, plot_type = "scatter", color_var = "conc_num", - title = "", x_label = NULL, y_label = NULL, ylim_vals = NULL, annotations = NULL) { - - plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) - - if (!is.null(y_var)) { - plot <- plot + aes(y = !!sym(y_var)) +generate_and_save_plots <- function(output_dir, file_name, plot_configs) { + + `%||%` <- function(a, b) if (!is.null(a)) a else b + + # Generalized plot generation + plots <- lapply(plot_configs, function(config) { - # Add scatter points with summary stats if `y_var` is present - y_mean_col <- paste0("mean_", y_var) - y_sd_col <- paste0("sd_", y_var) - - plot <- plot + geom_point(shape = 3, size = 0.2) + - geom_errorbar(aes( - ymin = !!sym(y_mean_col) - !!sym(y_sd_col), - ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), - width = 0.1) + - geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) - } + # Use the dataframe from the plot configuration + df <- config$df - # Add the specified plot type - plot <- switch(plot_type, - "box" = plot + geom_boxplot(), - "density" = plot + geom_density(), - "bar" = plot + geom_bar(stat = "identity"), - plot + geom_point() + geom_smooth(method = "lm", se = FALSE) # Default is scatter - ) + # Initialize the basic ggplot + plot <- ggplot(df, aes(x = if (config$plot_type == "box") as.factor(!!sym(config$x_var)) else !!sym(config$x_var), + color = as.factor(!!sym(config$color_var)))) - if (!is.null(ylim_vals)) { - plot <- plot + coord_cartesian(ylim = ylim_vals) - } + # Handle specific y_var cases, like "delta_bg" + if (!is.null(config$y_var)) { + plot <- plot + aes(y = !!sym(config$y_var)) - plot <- plot + ggtitle(title) + theme_publication() - - if (!is.null(x_label)) plot <- plot + xlab(x_label) - if (!is.null(y_label)) plot <- plot + ylab(y_label) + y_mean_col <- paste0("mean_", config$y_var) + y_sd_col <- paste0("sd_", config$y_var) - if (!is.null(annotations)) { - for (annotation in annotations) { - plot <- plot + annotate("text", - x = annotation$x, - y = annotation$y, - label = annotation$label) + # Special case for "delta_bg" to add jitter + if (config$y_var == "delta_bg" && config$plot_type == "scatter") { + plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") + + geom_errorbar(aes( + ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), + width = 0.1) + + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) + } else if (config$error_bar %||% FALSE) { + plot <- plot + + geom_point(shape = 3, size = 0.2) + + geom_errorbar(aes( + ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), + width = 0.1) + + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) + + stat_summary(aes( + ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), + fun.data = "identity", geom = "errorbar", width = 0.1) + + stat_summary(aes(y = !!sym(y_mean_col)), + fun.data = "identity", geom = "point", size = 0.6) + } } - } - return(plot) -} - -generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) { - plots <- list() - - if (nrow(df) == 0) { - message("The dataframe is empty, skipping plots") - return() - } - - message("Generating plots for dataframe") - - for (config in plot_configs) { - plot <- generate_plot( - df = df, - x_var = config$x_var, - y_var = config$y_var, - plot_type = config$plot_type, - title = config$title, - ylim_vals = config$ylim_vals, - annotations = config$annotations + # Switch plot type (scatter, box, density, bar) + plot <- switch(config$plot_type, + "box" = plot + geom_boxplot(), + "density" = plot + geom_density(), + "bar" = plot + geom_bar(stat = "identity"), + plot + geom_point() + geom_smooth(method = "lm", se = FALSE) # Default is scatter ) - plots[[paste0(config$y_var, "_", config$plot_type)]] <- plot - } + # Apply y-axis limits if provided + if (!is.null(config$ylim_vals)) { + plot <- plot + coord_cartesian(ylim = config$ylim_vals) + } - # Save plots to PDF and HTML - save_plots(file_prefix, plots, output_dir) -} + # Add title and publication theme with custom legend position + legend_position <- config$legend_position %||% "bottom" # Default to bottom if not specified + plot <- plot + ggtitle(config$title) + theme_publication(legend_position = legend_position) -# Standardized saving of plots -save_plots <- function(file_name, plot_list, output_dir) { + # Add x and y axis labels if provided + if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label) + if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label) + + # Add custom annotations if provided + if (!is.null(config$annotations)) { + for (annotation in config$annotations) { + plot <- plot + annotate("text", + x = annotation$x, + y = annotation$y, + label = annotation$label) + } + } + + return(plot) + }) + + # Save plots to PDF pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) - lapply(plot_list, print) + lapply(plots, print) dev.off() - lapply(names(plot_list), function(plot_name) { - pgg <- tryCatch({ - suppressWarnings(ggplotly(plot_list[[plot_name]]) %>% - layout(legend = list(orientation = "h"))) - }, error = function(e) { - message("Error generating plot: ", plot_name) - return(NULL) - }) - - if (!is.null(pgg)) { - saveWidget(pgg, - file = file.path(output_dir, paste0(file_name, "_", plot_name, ".html")), - selfcontained = TRUE) - } + # Convert plots to plotly and combine them using subplot + plotly_plots <- lapply(plots, function(plot) { + suppressWarnings(ggplotly(plot) %>% + layout(legend = list(orientation = "h"))) }) + + # Combine all plotly plots into a single interactive layout using subplot + combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05) + + # Save the combined HTML file + saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE) } interaction_plot_configs <- function(df, variables) { @@ -493,42 +488,6 @@ adjust_missing_and_rank <- function(df, variables) { return(df) } -generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "", - x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") { - plot_list <- list() - - # Iterate over each y variable and generate the corresponding plot - for (var in y_vars) { - plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(var), color = as.factor(!!sym(color_var)))) + - ggtitle(paste(title_prefix, var)) + - theme_publication() - - if (plot_type == "scatter") { - plot <- plot + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) - - if (!is.null(ylim_vals)) { - plot <- plot + coord_cartesian(ylim = ylim_vals) - } - } else if (plot_type == "box") { - plot <- plot + geom_boxplot() - } else if (plot_type == "density") { - plot <- plot + geom_density() - } else if (plot_type == "bar") { - plot <- plot + geom_bar() - } - - if (!is.null(x_label)) plot <- plot + xlab(x_label) - if (!is.null(y_label)) plot <- plot + ylab(y_label) - - plot_list[[var]] <- plot - } - - # Save plots to PDF and HTML - save_plots(file_prefix, plot_list, output_dir) -} - main <- function() { lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] @@ -540,7 +499,7 @@ main <- function() { dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) # Load and process data - df <- load_and_process_data(args$easy_results_file, exp_sd) + df <- load_and_process_data(args$easy_results_file, sd = exp_sd) df <- update_gene_names(df, args$sgd_gene_list) max_conc <- max(df$conc_num_factor) @@ -583,40 +542,223 @@ main <- function() { df_na %>% filter(if_all(c(L), is.finite)) } - # Define the plot configurations for each QC step - qc_plot_configs <- list( - list(x_var = "L", y_var = "K", plot_type = "scatter", title = "Raw L vs K before QC", ylim_vals = NULL), - list(x_var = "delta_bg", y_var = NULL, plot_type = "density", title = "Density plot for Delta Background", ylim_vals = NULL), - list(x_var = "as.factor(delta_bg)", y_var = NULL, plot_type = "bar", title = "Bar plot for Delta Background", ylim_vals = NULL) + # Plot configurations + # Each list corresponds to a group of plots in the same file + raw_l_vs_k_plots <- list( + list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", + title = "Raw L vs K before QC", + color_var = "conc_num", + legend_position = "right") ) + above_threshold_plots <- list( + list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter", + title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"), + color_var = "conc_num", + annotations = list( + list( + x = L_half_median, + y = K_half_median, + label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance)) + ) + ), + error_bar = FALSE, + legend_position = "right" + ) + ) + + frequency_delta_bg_plots <- list( + list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density", + title = "Density plot for Delta Background by Conc All Data", + color_var = "conc_num", + x_label = "Delta Background", + y_label = "Density", + error_bar = FALSE, + legend_position = "right" + ), + list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar", + title = "Bar plot for Delta Background by Conc All Data", + color_var = "conc_num", + x_label = "Delta Background", + y_label = "Count", + error_bar = FALSE, + legend_position = "right" + ) + ) + + plate_analysis_plots <- list() + for (plot_type in c("scatter", "box")) { + variables <- c("L", "K", "r", "AUC", "delta_bg") + for (var in variables) { + for (stage in c("before", "after")) { + if (stage == "before") { + df_plot <- df + } else { + df_plot <- df_na # TODO use df_na_filtered if necessary + } + + # Set error_bar = TRUE only for scatter plots + error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) + + # Create the plot configuration + plot_config <- list( + df = df_plot, + x_var = "scan", + y_var = var, + plot_type = plot_type, + title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), + error_bar = error_bar, + color_var = "conc_num" + ) + plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) + } + } + } + + plate_analysis_no_zero_plots <- list() + for (plot_type in c("scatter", "box")) { + variables <- c("L", "K", "r", "AUC", "delta_bg") + for (var in variables) { + + # Set error_bar = TRUE only for scatter plots + error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) + + # Create the plot configuration + plot_config <- list( + df = df_no_zeros, + x_var = "scan", + y_var = var, + plot_type = plot_type, + title = paste("Plate analysis by Drug Conc for", var, "after quality control"), + error_bar = error_bar, + color_var = "conc_num" + ) + plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) + } + } + + + + + + + delta_bg_plot_configs <- list( + list(x_var = "delta_bg", y_var = NULL, plot_type = "density", + title = paste("Raw L vs K for strains above delta background threshold of", delta_bg_tolerance, "or above"), + ylim_vals = NULL, + annotate("text", x = L_half_median, y = K_half_median, + label = paste("Strains above delta background tolerance = ", nrow(df_above_tolerance))) + ) + + + + ) + + before_qc_configs <- list( + list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter", + title = "Plate analysis by Drug Conc for Delta Background before QC", + error_bar = TRUE, color_var = "conc_num"), + list(x_var = "scan", y_var = "delta_bg", plot_type = "box", + title = "Plate analysis by Drug Conc for Delta Background before QC", + error_bar = FALSE, color_var = "conc_num") + ) + + + + + + + + + above_delta_bg_tolerance <- list( + + + + + + + + + ) + + + + + # Print quality control graphs before removing data due to contamination and + # adjusting missing data to max theoretical values + before_qc_plot_configs <- list( + # Plate analysis is a quality check to identify plate effects containing anomalies + + ) + + # list(x_var = "delta_bg", y_var = NULL, plot_type = "density", + # title = "Density plot for Delta Background", + # ylim_vals = NULL), + # list(x_var = "as.factor(delta_bg)", y_var = NULL, plot_type = "bar", + # title = "Bar plot for Delta Background", + # ylim_vals = NULL) + above_tolerance_plot_configs <- list( - list(x_var = "L", y_var = "K", plot_type = "scatter", title = "L vs K for Rows Above Delta Background Threshold", ylim_vals = NULL) + list(x_var = "L", y_var = "K", plot_type = "scatter", + title = "Raw L vs K for strains above delta background threshold", ylim_vals = NULL) ) after_qc_plot_configs <- list( - list(x_var = "L", y_var = "K", plot_type = "scatter", title = "L vs K After QC Filtering", ylim_vals = NULL) + list(x_var = "L", y_var = "K", plot_type = "scatter", + title = "L vs K After QC Filtering", ylim_vals = NULL) ) no_zeros_plot_configs <- list( - list(x_var = "scan", y_var = "L", plot_type = "scatter", title = "L vs Scan for Non-Zero L Values"), - list(x_var = "scan", y_var = "K", plot_type = "scatter", title = "K vs Scan for Non-Zero L Values"), - list(x_var = "scan", y_var = "r", plot_type = "scatter", title = "r vs Scan for Non-Zero L Values"), - list(x_var = "scan", y_var = "AUC", plot_type = "scatter", title = "AUC vs Scan for Non-Zero L Values"), - list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter", title = "Delta background vs Scan for Non-Zero L Values"), - list(x_var = "as.factor(scan)", y_var = "L", plot_type = "box", title = "L vs Scan for Non-Zero L Values"), - list(x_var = "as.factor(scan)", y_var = "K", plot_type = "box", title = "K vs Scan for Non-Zero L Values"), - list(x_var = "as.factor(scan)", y_var = "r", plot_type = "box", title = "r vs Scan for Non-Zero L Values"), - list(x_var = "as.factor(scan)", y_var = "AUC", plot_type = "box", title = "AUC vs Scan for Non-Zero L Values"), - list(x_var = "as.factor(scan)", y_var = "delta_bg", plot_type = "box", title = "Delta background vs Scan for Non-Zero L Values") + list(x_var = "scan", y_var = "L", plot_type = "scatter", + title = "Plate analysis by Drug Conc for L after quality control", + file_name = "Plate_Analysis_L_afterQC_Z", error_bar = TRUE, color_var = "conc_num"), + list(x_var = "scan", y_var = "K", plot_type = "scatter", + title = "Plate analysis by Drug Conc for K after quality control", + file_name = "Plate_Analysis_K_afterQC_Z", error_bar = TRUE, color_var = "conc_num"), + list(x_var = "scan", y_var = "r", plot_type = "scatter", + title = "Plate analysis by Drug Conc for r after quality control", + file_name = "Plate_Analysis_r_afterQC_Z", error_bar = TRUE, color_var = "conc_num"), + list(x_var = "scan", y_var = "AUC", plot_type = "scatter", + title = "Plate analysis by Drug Conc for AUC after quality control", + file_name = "Plate_Analysis_AUC_afterQC_Z", error_bar = TRUE, color_var = "conc_num"), + list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter", + title = "Plate analysis by Drug Conc for Delta_Backgrd after quality control", + file_name = "Plate_Analysis_Delta_Backgrd_afterQC_Z", error_bar = TRUE, color_var = "conc_num"), + + # Box plots + list(x_var = "as.factor(scan)", y_var = "L", plot_type = "box", + title = "Plate analysis by Drug Conc for L after quality control", + file_name = "Plate_Analysis_L_Box_afterQC_Z"), + list(x_var = "as.factor(scan)", y_var = "K", plot_type = "box", + title = "Plate analysis by Drug Conc for K after quality control", + file_name = "Plate_Analysis_K_Box_afterQC_Z"), + list(x_var = "as.factor(scan)", y_var = "r", plot_type = "box", + title = "Plate analysis by Drug Conc for r after quality control", + file_name = "Plate_Analysis_r_Box_afterQC_Z"), + list(x_var = "as.factor(scan)", y_var = "AUC", plot_type = "box", + title = "Plate analysis by Drug Conc for AUC after quality control", + file_name = "Plate_Analysis_AUC_Box_afterQC_Z"), + list(x_var = "as.factor(scan)", y_var = "delta_bg", plot_type = "box", + title = "Plate analysis by Drug Conc for Delta_Backgrd after quality control", + file_name = "Plate_Analysis_Delta_Backgrd_Box_afterQC_Z") ) + # Generate and save plots for each QC step message("Generating QC plots") - generate_and_save_plots(df, out_dir_qc, "Before_QC", qc_plot_configs) - generate_and_save_plots(df_above_tolerance, out_dir_qc, "Above_Tolerance", above_tolerance_plot_configs) - generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", after_qc_plot_configs) - generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", no_zeros_plot_configs) + generate_and_save_plots(out_dir_qc, "L_vs_K_before_QC", raw_l_vs_k_plots) + generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) + generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + + + + + generate_and_save_plots(df, out_dir_qc, "raw_L_vs_K_before_QC.pdf", delta_bg_plots) + generate_and_save_plots(df, out_dir_qc, "plate_analysis", before_qc_plot_configs) + generate_and_save_plots(df_above_tolerance, out_dir_qc, above_tolerance_plot_configs) + generate_and_save_plots(df_na_filtered, out_dir_qc, after_qc_plot_configs) + generate_and_save_plots(df_no_zeros, out_dir_qc, "plate_analysis_no_zeros", no_zeros_plot_configs) # Clean up rm(df, df_above_tolerance, df_no_zeros) @@ -713,9 +855,9 @@ main <- function() { # Create interaction plots deletion_plot_configs <- interaction_plot_configs(df_reference, variables) - generate_and_save_plots(zscores_calculations, out_dir, "RF_InteractionPlots", deletion_plot_configs) + generate_and_save_plots(zscores_calculations, out_dir, deletion_plot_configs) deletion_plot_configs <- interaction_plot_configs(df_deletion, variables) - generate_and_save_plots(zscores_calculations, out_dir, "InteractionPlots", deletion_plot_configs) + generate_and_save_plots(zscores_calculations, out_dir, deletion_plot_configs) # Define conditions for enhancers and suppressors # TODO Add to study config file? @@ -765,14 +907,6 @@ main <- function() { file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_K_lm.csv"), row.names = FALSE) write.csv(suppressors_lm_K, file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE) - - # Interaction plots for reference strain - variables <- c("L", "K", "r", "AUC") - reference_plot_configs <- interaction_plot_configs(df_reference, variables) - generate_and_save_plots(zscores_calculation_reference, out_dir, "RF_InteractionPlots", reference_plot_configs) - - deletion_plot_configs <- interaction_plot_configs(df_deletion, variables) - generate_and_save_plots(zscores_calculations, out_dir, "InteractionPlots", deletion_plot_configs) # Correlation plots lm_list <- list( @@ -786,7 +920,7 @@ main <- function() { lm_summaries <- lapply(lm_list, summary) correlation_plot_config <- correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries) - generate_and_save_plots(zscores_interactions_filtered, output_dir, "CorrelationPlots", correlation_plot_config) + generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_config) # Generate ranked plots rank_plot_config <- list( @@ -796,7 +930,7 @@ main <- function() { list(x_var = "AUC_Rank", y_var = "Avg_Zscore_AUC", plot_type = "scatter", title = "Rank vs Avg Z score for AUC") ) # Generate and save rank plots using the existing plotting framework - generate_and_save_plots(zscores_interactions_filtered, output_dir, "RankPlots", rank_plot_config) + generate_and_save_plots(zscores_interactions_filtered, output_dir, rank_plot_config) }) }) }