From dc5dea8bfc540ffb55800838b5f3c36a8b8e35fc Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Fri, 23 Aug 2024 22:43:41 -0400 Subject: [PATCH] Add more output to calculate_interaction_zscores.R --- .../apps/r/calculate_interaction_zscores.R | 221 +++++++++++++----- 1 file changed, 168 insertions(+), 53 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores.R b/workflow/apps/r/calculate_interaction_zscores.R index dbc5e9d2..af30aa3a 100644 --- a/workflow/apps/r/calculate_interaction_zscores.R +++ b/workflow/apps/r/calculate_interaction_zscores.R @@ -1,10 +1,10 @@ suppressMessages({ - library("ggplot2") - library("plotly") - library("htmlwidgets") - library("dplyr") - library("ggthemes") - library("data.table") + library(ggplot2) + library(plotly) + library(htmlwidgets) + library(dplyr) + library(ggthemes) + library(data.table) }) # Constants for configuration @@ -14,10 +14,9 @@ BASE_SIZE <- 14 options(warn = 2, max.print = 100) -# Function to parse arguments parse_arguments <- function() { - if (interactive()) { - args <- c( + args <- if (interactive()) { + c( "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", 3, "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", @@ -28,13 +27,13 @@ parse_arguments <- function() { "Experiment 2: HLD versus Doxo" ) } else { - args <- commandArgs(trailingOnly = TRUE) + commandArgs(trailingOnly = TRUE) } - + paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE) names <- args[seq(6, length(args), by = 2)] experiments <- setNames(paths, names) - + list( out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), sd = as.numeric(args[2]), @@ -46,7 +45,6 @@ parse_arguments <- function() { args <- parse_arguments() -# Ensure main output directories exist dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) @@ -109,26 +107,20 @@ sgd_genes <- function(sgd_gene_list) { genes <- sgd_genes(args$sgd_gene_list) load_and_preprocess_data <- function(easy_results_file, genes) { - # Attempt to read the file and handle any errors that occur df <- tryCatch({ read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) }, error = function(e) { stop("Error reading file: ", easy_results_file, "\n", e$message) }) - # Filter out non-data rows - df <- df %>% - filter(!.[[1]] %in% c("", "Scan")) - - # Convert specified columns to numeric numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd") - df[numeric_columns[numeric_columns %in% colnames(df)]] <- - lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric) + df[numeric_columns[numeric_columns %in% colnames(df)]] <- + suppressWarnings(lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)) - # Further filter and mutate the data df <- df %>% - filter(!(Scan %in% c("", "Scan"))) %>% - {if ("Conc" %in% colnames(.)) filter(., Conc != "0ug/mL") else .} %>% + filter(!.[[1]] %in% c("", "Scan")) %>% + filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>% + filter(Drug != "BMH21") %>% mutate( L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}, AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}, @@ -138,40 +130,43 @@ load_and_preprocess_data <- function(easy_results_file, genes) { GeneName = vapply(ORF, function(orf) { gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName) ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name) - }, character(1)) - ) + }, character(1)), + OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep) + ) # %>% + # mutate(across(everything(), ~na_if(., ""))) - # Check if the dataframe is empty after filtering if (nrow(df) == 0) warning("Dataframe is empty after filtering") - + return(df) } - -# Plot creation and publication create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") { + if (!("Scan" %in% colnames(df))) { + warning("'Scan' column is not present in the data. Skipping this plot.") + return(NULL) + } + plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot filtered_df <- df[is.finite(df[[var]]), ] - p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) + + p <- ggplot(filtered_df, aes(Scan, !!sym(var), color = as.factor(conc_num))) + plot_func(shape = 3, size = 0.2, position = "jitter") + stat_summary(fun = mean, geom = "point", size = 0.6) + stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") + ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) + theme_publication() - + pdf_path <- file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")) pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) print(p) dev.off() - + html_path <- sub(".pdf$", ".html", pdf_path) pgg <- suppressWarnings(ggplotly(p, tooltip = c("L", "K", "ORF", "Gene", "delta_bg", "GeneName")) %>% - layout(legend = list(orientation = "h"))) + layout(legend = list(orientation = "h"))) saveWidget(pgg, html_path, selfcontained = TRUE) } -# Summary statistics publication publish_summary_stats <- function(df, variables, out_dir) { stats_list <- lapply(variables, function(var) { df %>% @@ -205,13 +200,45 @@ publish_interaction_scores <- function(df, out_dir) { r_rank = rank(mean_r), auc_rank = rank(mean_AUC) ) - + fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE) fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank), file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE) + + # Additional enhancer and suppressor calculations and outputs + deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ] + deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ] + + deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ] + deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ] + + deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ] + deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ] + +deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ] + deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ] + + deletion_enhancers_and_suppressors_L <- interaction_scores[ + interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ] + deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[ + !is.na(deletion_enhancers_and_suppressors_L$OrfRep), ] + + deletion_enhancers_and_suppressors_K <- interaction_scores[ + interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ] + deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[ + !is.na(deletion_enhancers_and_suppressors_K$OrfRep), ] + + # Write CSV files with updated filenames + fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE) + fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE) + fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE) + fwrite(deletion_suppressors_K, file.path(out_dir, "deletion_suppressors_k.csv"), row.names = FALSE) + fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE) + fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE) + + return(interaction_scores) # Return the updated data frame with rank columns } -# Z-scores publication publish_zscores <- function(df, out_dir) { zscores <- df %>% dplyr::mutate( @@ -220,40 +247,128 @@ publish_zscores <- function(df, out_dir) { zscore_r = scale(r, center = TRUE, scale = TRUE), zscore_AUC = scale(AUC, center = TRUE, scale = TRUE) ) - + fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE) } -# QC generation and publication generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) { variables <- c("L", "K", "r", "AUC", "delta_bg") - lapply(variables, create_and_publish_plot, df = df, plot_type = "scatter", out_dir_qc = out_dir_qc) + + # Pre-QC plots + lapply(variables, function(var) { + if (var %in% colnames(df)) { + create_and_publish_plot(df, var, "scatter", out_dir_qc) + } + }) + + # Filter data based on delta background tolerance for Post-QC analysis + df_post_qc <- df %>% + mutate(across(all_of(variables), + ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .))) + + # Post-QC plots + lapply(variables, function(var) { + if (var %in% colnames(df_post_qc)) { + create_and_publish_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc") + } + }) + + # For plots specifically for data above the tolerance threshold delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ] - lapply(variables, create_and_publish_plot, df = delta_bg_above_tolerance, - plot_type = "scatter", out_dir_qc = out_dir_qc, suffix = "_above_tolerance") + lapply(variables, function(var) { + if (var %in% colnames(delta_bg_above_tolerance)) { + create_and_publish_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance") + } + }) } -# Process experiments -process_experiment <- function(exp_name, exp_dir, sgd_genes, output_dir) { +# Create rank plots +create_rank_plots <- function(interaction_scores, out_dir) { + rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank") + + lapply(rank_vars, function(rank_var) { + p <- ggplot(interaction_scores, aes(x = !!sym(rank_var))) + + geom_bar() + + ggtitle(paste("Rank Distribution for", rank_var)) + + theme_publication() + + pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf")) + pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) + print(p) + dev.off() + + # Generate HTML output + html_path <- sub(".pdf$", ".html", pdf_path) + pgg <- suppressWarnings(ggplotly(p) %>% + layout(legend = list(orientation = "h"))) + saveWidget(pgg, html_path, selfcontained = TRUE) + }) +} + + +create_correlation_plot <- function(interaction_scores, out_dir) { + # Check for non-finite values and remove them from the dataset + interaction_scores <- interaction_scores %>% + filter_all(all_vars(is.finite(.))) + + # Generate correlation plots for each pair of variables + pairs <- list( + c("mean_L", "mean_K"), + c("mean_L", "mean_r"), + c("mean_L", "mean_AUC"), + c("mean_K", "mean_r"), + c("mean_K", "mean_AUC"), + c("mean_r", "mean_AUC") + ) + + lapply(pairs, function(vars) { + p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + ggtitle(paste("Correlation between", vars[1], "and", vars[2])) + + theme_publication() + + pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf")) + pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) + print(p) + dev.off() + + # Generate HTML output + html_path <- sub(".pdf$", ".html", pdf_path) + pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>% + layout(legend = list(orientation = "h"))) + saveWidget(pgg, html_path, selfcontained = TRUE) + }) +} + + +process_experiment <- function(exp_name, exp_dir, genes, output_dir) { out_dir <- file.path(exp_dir, "zscores") out_dir_qc <- file.path(out_dir, "qc") dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) dir.create(out_dir_qc, showWarnings = FALSE) - - data <- load_and_preprocess_data(args$easy_results_file, sgd_genes) - + + data <- load_and_preprocess_data(args$easy_results_file, genes) + + # Calculate delta background tolerance delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE) - + + # Generate and publish QC plots (both pre-QC and post-QC) generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc) - + + # Process and publish summary stats, interaction scores, and z-scores variables <- c("L", "K", "r", "AUC", "delta_bg") publish_summary_stats(data, variables, out_dir) - publish_interaction_scores(data, out_dir) + interaction_scores <- publish_interaction_scores(data, out_dir) publish_zscores(data, out_dir) - + + # Generate rank plots and correlation plots + create_rank_plots(interaction_scores, out_dir) + create_correlation_plot(interaction_scores, out_dir) + output_file <- file.path(out_dir, "zscores_interaction.csv") fwrite(data, output_file, row.names = FALSE) - + return(output_file) } @@ -267,7 +382,7 @@ if (length(processed_files) > 1) { combined_data <- Reduce(function(x, y) { merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE) }, processed_files) - + combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv") fwrite(combined_data, combined_output_file, row.names = FALSE) }