From d52903f8fabf89df54dca90eac3e27af7fd26549 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 4 Sep 2024 02:03:23 -0400 Subject: [PATCH] Before old plotting removal --- .../apps/r/calculate_interaction_zscores5.R | 587 ++++++++++-------- workflow/qhtcp-workflow | 22 +- 2 files changed, 327 insertions(+), 282 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 5cfefd9f..d010ae43 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -163,102 +163,6 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) { - # Start the ggplot with just the x_var and color_var - plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) + - ggtitle(title) + - theme_publication() - - # If y_var is not NULL, add it to the plot aesthetics - if (!is.null(y_var)) { - plot <- plot + aes(y = !!sym(y_var)) - } - - # Handle different plot types - plot <- switch(plot_type, - "scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6), - "box" = plot + geom_boxplot(), - "density" = plot + geom_density(), - "bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity")) - ) - - # Add optional labels if provided - if (!is.null(x_label)) plot <- plot + xlab(x_label) - if (!is.null(y_label)) plot <- plot + ylab(y_label) - - return(plot) -} - - - - -generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) { - plots <- list() - - if (nrow(df) == 0) { - message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots") - return() - } - - message("Generating plots for \"", deparse(substitute(df)), "\" dataframe") - - for (var in variables) { - scatter_plot <- - generate_plot(df, x_var = "scan", y_var = var, plot_type = "scatter", - title = paste(prefix, "Scatter Plot for", var)) - boxplot <- - generate_plot(df, x_var = "scan", y_var = var, plot_type = "box", - title = paste(prefix, "Box Plot for", var)) - - plots[[paste0(var, "_scatter")]] <- scatter_plot - plots[[paste0(var, "_box")]] <- boxplot - } - - if (include_qc) { - plots[["Raw_L_vs_K"]] <- - generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter", - title = "Raw L vs K before QC") - plots[["Delta_bg_Density"]] <- - generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num", - title = "Density plot for Delta Background by Conc All Data") - plots[["Delta_bg_Bar"]] <- - generate_plot(df, x_var = "delta_bg", plot_type = "bar", - title = "Bar plot for Delta Background by Conc All Data") - } - - save_plots(prefix, plots, output_dir) -} - -# Ensure all plots are saved and printed to PDF -save_plots <- function(file_name, plot_list, output_dir) { - # Save to PDF - pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) - lapply(plot_list, function(plot) { - print(plot) - }) - dev.off() - - # Save to HTML with horizontal legend orientation - lapply(names(plot_list), function(plot_name) { - message("Generating plot: ", plot_name) - pgg <- tryCatch({ - suppressWarnings(ggplotly(plot_list[[plot_name]]) %>% - layout(legend = list(orientation = "h"))) - }, error = function(e) { - message("Error in plot: ", plot_name, "\n", e) - return(NULL) - }) - if (!is.null(pgg)) { - saveWidget(pgg, - file = file.path(output_dir, - paste0(file_name, "_", plot_name, ".html")), - selfcontained = TRUE) - } - }) -} - # Process strains (deletion and reference) process_strains <- function(df) { df_strains <- data.frame() # Initialize an empty dataframe to store results @@ -295,7 +199,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c max = ~max(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), sd = ~sd(.x, na.rm = TRUE), - se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) + se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) # TODO why - 1? ), .names = "{.col}_{.fn}")) return(summary_stats) @@ -305,48 +209,70 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_ # Pull the background means print("Calculating background means") - l_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean) - k_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean) + L_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean) + K_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean) r_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_mean) - auc_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean) + AUC_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean) + + L_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_sd) + K_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_sd) + r_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_sd) + AUC_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_sd) - # Calculate all necessary statistics and shifts in one step + # Calculate summary statistics and shifts print("Calculating interaction scores part 1") interaction_scores_all <- df %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% summarise( N = n(), - across(all_of(variables), list(mean = mean, sd = sd), na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE), - raw_shift_l = mean(L, na.rm = TRUE) - l_mean_bg, - raw_shift_k = mean(K, na.rm = TRUE) - k_mean_bg, - raw_shift_r = mean(r, na.rm = TRUE) - r_mean_bg, - raw_shift_auc = mean(AUC, na.rm = TRUE) - auc_mean_bg, - z_shift_l = raw_shift_l / L_sd[1], - z_shift_k = raw_shift_k / K_sd[1], - z_shift_r = raw_shift_r / r_sd[1], - z_shift_auc = raw_shift_auc / AUC_sd[1], - exp_l = l_mean_bg + raw_shift_l, - exp_k = k_mean_bg + raw_shift_k, - exp_r = r_mean_bg + raw_shift_r, - exp_auc = auc_mean_bg + raw_shift_auc, - delta_l = mean(L, na.rm = TRUE) - exp_l, - delta_k = mean(K, na.rm = TRUE) - exp_k, - delta_r = mean(r, na.rm = TRUE) - exp_r, - delta_auc = mean(AUC, na.rm = TRUE) - exp_auc, - zscore_l = delta_l / L_sd, - zscore_k = delta_k / K_sd, - zscore_r = delta_r / r_sd, - zscore_auc = delta_auc / AUC_sd, - sum_z_score_l = sum(zscore_l, na.rm = TRUE), - avg_zscore_l = sum_z_score_l / (length(variables) - sum(NG, na.rm = TRUE)), - sum_z_score_k = sum(zscore_k, na.rm = TRUE), - avg_zscore_k = sum_z_score_k / (length(variables) - sum(NG, na.rm = TRUE)), - sum_z_score_r = sum(zscore_r, na.rm = TRUE), - avg_zscore_r = sum_z_score_r / (length(variables) - sum(NG, na.rm = TRUE)), - sum_z_score_auc = sum(zscore_auc, na.rm = TRUE), - avg_zscore_auc = sum_z_score_auc / (length(variables) - sum(NG, na.rm = TRUE)) + L_mean = mean(L), + L_median = median(L), + L_sd = sd(L), + L_se = L_sd / sqrt(N), + K_mean = mean(K), + K_median = median(K), + K_sd = sd(K), + K_se = K_sd / sqrt(N), + r_mean = mean(r), + r_median = median(r), + r_sd = sd(r), + r_se = r_sd / sqrt(N), + AUC_mean = mean(AUC), + AUC_median = median(AUC), + AUC_sd = sd(AUC), + AUC_se = AUC_sd / sqrt(N), + NG = sum(NG), + DB = sum(DB), + SM = sum(SM), + Raw_Shift_L = L_mean - L_mean_bg[[1]], + Raw_Shift_K = K_mean - K_mean_bg[[1]], + Raw_Shift_r = r_mean - r_mean_bg[[1]], + Raw_Shift_AUC = AUC_mean - AUC_mean_bg[[1]], + Z_Shift_L = Raw_Shift_L / L_sd[[0]], + Z_Shift_K = Raw_Shift_K / K_sd[[0]], + Z_Shift_r = Raw_Shift_r / r_sd[[0]], + Z_Shift_AUC = Raw_Shift_AUC / AUC_sd[[0]], + WT_l = df$L_mean, + WT_K = df$K_mean, + WT_r = df$r_mean, + WT_AUC = df$AUC_mean, + WT_sd_l = L_sd_bg[[1]], + WT_sd_K = K_sd_bg[[1]], + WT_sd_r = r_sd_bg[[1]], + WT_sd_AUC = AUC_sd_bg[[1]], + Exp_L = L_mean_bg[[1]] + Raw_Shift_L, + Exp_K = K_mean_bg[[1]] + Raw_Shift_K, + Exp_r = r_mean_bg[[1]] + Raw_Shift_r, + Exp_AUC = AUC_mean_bg[[1]] + Raw_Shift_AUC, + Delta_L = ifelse(NG == 1, mean(L) - WT_l, Delta_L), + Delta_L = ifelse(SM == 1, mean(L) - WT_l, Delta_L), + Delta_K = ifelse(NG == 1, mean(K) - WT_K, Delta_K), + Delta_r = ifelse(NG == 1, mean(r) - WT_r, Delta_r), + Delta_AUC = ifelse(NG == 1, mean(AUC) - WT_AUC, Delta_AUC), + Zscore_L = Delta_L / WT_sd_l, + Zscore_K = Delta_K / WT_sd_K, + Zscore_r = Delta_r / WT_sd_r, + Zscore_AUC = Delta_AUC / WT_sd_AUC, ) %>% ungroup() @@ -355,10 +281,10 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_ interaction_scores <- interaction_scores_all %>% group_by(across(all_of(group_vars))) %>% summarise( - lm_l = list(lm(delta_l ~ conc_num_factor)), - lm_k = list(lm(delta_k ~ conc_num_factor)), - lm_r = list(lm(delta_r ~ conc_num_factor)), - lm_auc = list(lm(delta_auc ~ conc_num_factor)), + lm_l = list(lm(Delta_L ~ conc_num_factor)), + lm_k = list(lm(Delta_K ~ conc_num_factor)), + lm_r = list(lm(Delta_r ~ conc_num_factor)), + lm_auc = list(lm(Delta_AUC ~ conc_num_factor)), lm_Score_L = max_conc * coef(lm_l[[1]])[2] + coef(lm_l[[1]])[1], lm_Score_K = max_conc * coef(lm_k[[1]])[2] + coef(lm_k[[1]])[1], lm_Score_r = max_conc * coef(lm_r[[1]])[2] + coef(lm_r[[1]])[1], @@ -367,66 +293,192 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_ R_Squared_K = summary(lm_k[[1]])$r.squared, R_Squared_r = summary(lm_r[[1]])$r.squared, R_Squared_AUC = summary(lm_auc[[1]])$r.squared, - Sum_Z_Score_L = sum(zscore_l, na.rm = TRUE), - Avg_Zscore_L = avg_zscore_l[1], - Sum_Z_Score_K = sum(zscore_k, na.rm = TRUE), - Avg_Zscore_K = avg_zscore_k[1], - Sum_Z_Score_r = sum(zscore_r, na.rm = TRUE), - Avg_Zscore_r = avg_zscore_r[1], - Sum_Z_Score_AUC = sum(zscore_auc, na.rm = TRUE), - Avg_Zscore_AUC = avg_zscore_auc[1], - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) + Sum_Z_Score_L = sum(Zscore_L), + Avg_Zscore_L = Sum_Z_Score_L / (length(variables) - sum(NG)), + Sum_Z_Score_K = sum(Zscore_K), + Avg_Zscore_K = Sum_Z_Score_K / (length(variables) - sum(NG)), + Sum_Z_Score_r = sum(Zscore_r), + Avg_Zscore_r = Sum_Z_Score_r / (length(variables) - sum(NG)), + Sum_Z_Score_AUC = sum(Zscore_AUC), + Avg_Zscore_AUC = Sum_Z_Score_AUC / (length(variables) - sum(NG)), + NG = sum(NG), + DB = sum(DB), + SM = sum(SM) ) %>% ungroup() - # Ensure interaction_scores has only the specified columns - interaction_scores <- interaction_scores %>% - select(Gene, raw_shift_l, z_shift_l, lm_Score_L, Z_lm_L = lm_Score_L, R_Squared_L, sum_z_score_l, avg_zscore_l, - raw_shift_k, z_shift_k, lm_Score_K, Z_lm_K = lm_Score_K, R_Squared_K, sum_z_score_k, avg_zscore_k, - raw_shift_r, z_shift_r, lm_Score_r, Z_lm_r = lm_Score_r, R_Squared_r, sum_z_score_r, avg_zscore_r, - raw_shift_auc, z_shift_auc, lm_Score_AUC, Z_lm_AUC = lm_Score_AUC, R_Squared_AUC, sum_z_score_auc, avg_zscore_auc, - NG, DB, SM) - return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } -generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") { - variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC") - WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500) + +# generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") { +# variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC") +# WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500) - plot_list <- lapply(seq_along(variables), function(i) { - var <- variables[i] - WT_sd <- WT_sds[[i]] +# plot_list <- lapply(seq_along(variables), function(i) { +# var <- variables[i] +# WT_sd <- WT_sds[[i]] - ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-WT_sd, WT_sd)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) + - ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) + - annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) + - annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) + - annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) + - annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) + - scale_x_continuous( - name = unique(df_calculations$Drug[1]), - breaks = unique(df_calculations$conc_num_factor), - labels = unique(as.character(df_calculations$conc_num))) + - theme_publication() - }) +# ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) + +# geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + +# coord_cartesian(ylim = c(-WT_sd, WT_sd)) + +# geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) + +# ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) + +# annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) + +# annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) + +# annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) + +# annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) + +# annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) + +# scale_x_continuous( +# name = unique(df_calculations$Drug[1]), +# breaks = unique(df_calculations$conc_num_factor), +# labels = unique(as.character(df_calculations$conc_num))) + +# theme_publication() +# }) - save_plots(file_prefix, plot_list, output_dir) +# save_plots(file_prefix, plot_list, output_dir) +# } + +# generate_summary_plots <- function(df, output_dir) { +# variables <- c("L", "K", "r", "AUC") +# plot_list <- lapply(variables, function(var) { +# generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var)) +# }) + +# save_plots("Summary_Plots", plot_list, output_dir) +# } + + +# # Generate ranked plots for a specific metric +# generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) { +# ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) + +# ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) + +# xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) + +# annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) + +# annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) + +# geom_hline(yintercept = c(-sd_threshold, sd_threshold)) + +# geom_point(size = 0.1, shape = 3) + +# theme_publication() +# } + +# # Generate and save all ranked plots +# generate_and_save_ranked_plots <- function(df, output_dir, prefix) { +# rank_metrics <- list( +# list("L_Rank", "Avg_Zscore_L", "L"), +# list("K_Rank", "Avg_Zscore_K", "K"), +# list("r_Rank", "Avg_Zscore_r", "r"), +# list("AUC_Rank", "Avg_Zscore_AUC", "AUC"), +# list("L_Rank_lm", "Z_lm_L", "L"), +# list("K_Rank_lm", "Z_lm_K", "K"), +# list("r_Rank_lm", "Z_lm_r", "r"), +# list("AUC_Rank_lm", "Z_lm_AUC", "AUC") +# ) + +# pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE) + +# for (sd_threshold in c(1, 2, 3)) { +# for (metric in rank_metrics) { +# plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]]) +# print(plot) +# } +# } + +# dev.off() +# } + + + + +generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", + title, x_label = NULL, y_label = NULL, ylim_vals = NULL) { + plot <- ggplot(df, aes_string(x = x_var, color = color_var)) + + if (!is.null(y_var)) { + plot <- plot + aes_string(y = y_var) + } + + # Set up the plot based on the requested plot type + plot <- switch(plot_type, + "scatter" = plot + geom_point() + geom_smooth(method = "lm", se = FALSE), + "box" = plot + geom_boxplot(), + "density" = plot + geom_density(), + "bar" = plot + geom_bar(stat = "identity"), + plot # Default: return the plot as is + ) + + if (!is.null(ylim_vals)) { + plot <- plot + coord_cartesian(ylim = ylim_vals) + } + + # Add titles and labels if available + plot <- plot + ggtitle(title) + theme_publication() + if (!is.null(x_label)) plot <- plot + xlab(x_label) + if (!is.null(y_label)) plot <- plot + ylab(y_label) + + return(plot) } -generate_summary_plots <- function(df, output_dir) { - variables <- c("L", "K", "r", "AUC") - plot_list <- lapply(variables, function(var) { - generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var)) - }) + +generate_and_save_plots <- function(df, output_dir, file_prefix, variables, plot_type = "scatter", include_qc = FALSE, ylim_vals = NULL) { + plots <- list() - save_plots("Summary_Plots", plot_list, output_dir) + if (nrow(df) == 0) { + message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots") + return() + } + + message("Generating plots for \"", deparse(substitute(df)), "\" dataframe") + + # Create plots for the given variables + for (var in variables) { + plot <- generate_plot( + df = df, + x_var = "scan", + y_var = var, + plot_type = plot_type, + title = paste(file_prefix, "Plot for", var), + ylim_vals = ylim_vals + ) + plots[[paste0(var, "_", plot_type)]] <- plot + } + + # Include additional QC plots if requested + if (include_qc) { + plots[["Raw_L_vs_K"]] <- generate_plot(df, "L", "K", "scatter", "Raw L vs K before QC") + plots[["Delta_bg_Density"]] <- generate_plot(df, "delta_bg", NULL, "density", "Density plot for Delta Background") + plots[["Delta_bg_Bar"]] <- generate_plot(df, "delta_bg", NULL, "bar", "Bar plot for Delta Background") + } + + save_plots(file_prefix, plots, output_dir) +} + + +# Ensure all plots are saved and printed to PDF +save_plots <- function(file_name, plot_list, output_dir) { + # Save to PDF + pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) + lapply(plot_list, function(plot) { + print(plot) + }) + dev.off() + + # Save to HTML with horizontal legend orientation + lapply(names(plot_list), function(plot_name) { + message("Generating plot: ", plot_name) + pgg <- tryCatch({ + suppressWarnings(ggplotly(plot_list[[plot_name]]) %>% + layout(legend = list(orientation = "h"))) + }, error = function(e) { + message("Error in plot: ", plot_name, "\n", e) + return(NULL) + }) + if (!is.null(pgg)) { + saveWidget(pgg, + file = file.path(output_dir, + paste0(file_name, "_", plot_name, ".html")), + selfcontained = TRUE) + } + }) } @@ -472,42 +524,6 @@ adjust_missing_and_rank <- function(df) { return(df) } -# Generate ranked plots for a specific metric -generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) { - ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) + - ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) + - xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) + - annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) + - annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) + - geom_hline(yintercept = c(-sd_threshold, sd_threshold)) + - geom_point(size = 0.1, shape = 3) + - theme_publication() -} - -# Generate and save all ranked plots -generate_and_save_ranked_plots <- function(df, output_dir, prefix) { - rank_metrics <- list( - list("L_Rank", "Avg_Zscore_L", "L"), - list("K_Rank", "Avg_Zscore_K", "K"), - list("r_Rank", "Avg_Zscore_r", "r"), - list("AUC_Rank", "Avg_Zscore_AUC", "AUC"), - list("L_Rank_lm", "Z_lm_L", "L"), - list("K_Rank_lm", "Z_lm_K", "K"), - list("r_Rank_lm", "Z_lm_r", "r"), - list("AUC_Rank_lm", "Z_lm_AUC", "AUC") - ) - - pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE) - - for (sd_threshold in c(1, 2, 3)) { - for (metric in rank_metrics) { - plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]]) - print(plot) - } - } - - dev.off() -} # Function to create and save all ranked plots create_ranked_plots <- function(df, output_dir) { @@ -520,8 +536,45 @@ create_ranked_plots <- function(df, output_dir) { generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM") } + +generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "", + x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") { + plot_list <- list() + + # Iterate over each y variable and generate the corresponding plot + for (var in y_vars) { + plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(var), color = as.factor(!!sym(color_var)))) + + ggtitle(paste(title_prefix, var)) + + theme_publication() + + if (plot_type == "scatter") { + plot <- plot + + geom_point() + + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + + if (!is.null(ylim_vals)) { + plot <- plot + coord_cartesian(ylim = ylim_vals) + } + } else if (plot_type == "box") { + plot <- plot + geom_boxplot() + } else if (plot_type == "density") { + plot <- plot + geom_density() + } else if (plot_type == "bar") { + plot <- plot + geom_bar(stat = "identity") + } + + if (!is.null(x_label)) plot <- plot + xlab(x_label) + if (!is.null(y_label)) plot <- plot + ylab(y_label) + + plot_list[[var]] <- plot + } + + # Save plots to PDF and HTML + save_plots(file_prefix, plot_list, output_dir) +} + + main <- function() { - # Applying to all experiments lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] exp_path <- exp$path @@ -537,34 +590,28 @@ main <- function() { max_conc <- max(df$conc_num_factor) - # QC - # Filter the df above sd tolerance for raw L vs K plot + # QC steps and filtering df_above_tolerance <- df %>% filter(DB == 1) - - # Set vars above the delta background tolerance to NA df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x))) - - # Remove rows where L is 0 df_no_zeros <- df_na %>% filter(L > 0) - - # Flag and remove non-finite data, printing affected rows + df_na_filtered <- df_na %>% - filter(if_any(c(L), ~ !is.finite(.))) %>% # Add L, r, AUC, K as needed for debugging + filter(if_any(c(L), ~ !is.finite(.))) %>% { if (nrow(.) > 0) { message("Removing non-finite rows:\n") print(head(., n = 10)) } - df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging + df_na %>% filter(if_all(c(L), is.finite)) } - - # Generate QC PDFs and HTMLs - # message("Generating QC plots") - # variables <- c("L", "K", "r", "AUC", "delta_bg") - # generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) - # generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) - # generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) - # generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) + + # Generate and save QC plots using the new generalized function + message("Generating QC plots") + variables <- c("L", "K", "r", "AUC", "delta_bg") + generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) + generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) + generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) + generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) rm(df, df_above_tolerance, df_no_zeros) @@ -577,38 +624,36 @@ main <- function() { # Originally this filtered L NA's - # Filter data within 2SD + # Filter data within and outside 2SD within_2sd_k <- stats_joined %>% filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd)) - - # Filter data outside 2SD outside_2sd_k <- stats_joined %>% filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd)) - message("Calculating summary statistics for L within 2SD of K") + # Summary statistics for within and outside 2SD of K l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + + # Write CSV files + write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) + write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) + + message("Calculating summary statistics for L within 2SD of K") cols_to_remove <- names(l_within_2sd_k) cols_to_keep <- c("conc_num", "conc_num_factor") within_2sd_k_clean <- within_2sd_k %>% select(-all_of(setdiff(cols_to_remove, cols_to_keep))) l_within_2sd_k_joined <- within_2sd_k_clean %>% left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor")) - write.csv(l_within_2sd_k, - file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), - row.names = FALSE) - + message("Calculating summary statistics for for L outside 2SD of K") - l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) cols_to_remove <- names(l_outside_2sd_k) cols_to_keep <- c("conc_num", "conc_num_factor") outside_2sd_k_clean <- outside_2sd_k %>% select(-all_of(setdiff(cols_to_remove, cols_to_keep))) l_outside_2sd_k_joined <- outside_2sd_k_clean %>% left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor")) - write.csv(l_outside_2sd_k, - file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), - row.names = FALSE) - + # Process background strains bg_strains <- c("YDL227C") lapply(bg_strains, function(strain) { @@ -646,18 +691,14 @@ main <- function() { filter(OrfRep != strain) %>% mutate(SM = 0) - message("Processing reference strain") reference_strain <- process_strains(l_within_2sd_k_joined) - message("Processing deletion strains") deletion_strains <- process_strains(l_within_2sd_k_joined) # TODO we may need to add "num" to grouping vars # Calculate interactions variables <- c("L", "K", "r", "AUC") - message("Calculating reference interaction scores") reference_results <- calculate_interaction_scores(stats_joined, reference_strain, max_conc, variables) - message("Calculating deletion interaction scores") deletion_results <- calculate_interaction_scores(stats_joined, deletion_strains, max_conc, variables) zscores_calculations_reference <- reference_results$zscores_calculations @@ -666,11 +707,11 @@ main <- function() { zscores_interactions <- deletion_results$zscores_interactions # TODO: I don't know if we need this? - zscores_interactions <- zscores_interactions %>% - arrange(desc(Z_lm_L)) %>% - arrange(desc(NG)) + # zscores_interactions <- zscores_interactions %>% + # arrange(desc(Z_lm_L)) %>% + # arrange(desc(NG)) - message("Writing zscores csv files") + # Writing Z-Scores to file write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) @@ -678,10 +719,11 @@ main <- function() { # Define conditions for enhancers and suppressors # TODO Add to study config file? - enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= 2 - suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -2 - enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= 2 - suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -2 + threshold <- 2 + enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold + suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold + enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold + suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold # Subset data enhancers_L <- zscores_interactions[enhancer_condition_L, ] @@ -707,10 +749,11 @@ main <- function() { file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv"), row.names = FALSE) # Handle linear model based enhancers and suppressors - enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= 2, ] - suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -2, ] - enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= 2, ] - suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -2, ] + lm_threshold <- 2 + enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ] + suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ] + enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ] + suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ] # Save linear model based enhancers and suppressors message("Writing linear model enhancer/suppressor csv files") @@ -724,9 +767,11 @@ main <- function() { file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE) # Generate plots for interaction scores - generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir) - generate_rf_plots(zscores_calculations, zscores_interactions, out_dir) - + # generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir) + # generate_rf_plots(zscores_calculations, zscores_interactions, out_dir) + generate_and_save_plots(zscores_calculations_reference, out_dir, "Reference_Calculations", variables) + generate_and_save_plots(zscores_calculations, out_dir, "Deletion_Calculations", variables) + # Apply filtering to remove NA values before further analysis zscores_interactions_filtered <- zscores_interactions %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) diff --git a/workflow/qhtcp-workflow b/workflow/qhtcp-workflow index c2a8f0ce..e516d682 100755 --- a/workflow/qhtcp-workflow +++ b/workflow/qhtcp-workflow @@ -785,7 +785,7 @@ init_project() { debug "Running: ${FUNCNAME[0]}" # Create a project scans dir - [[ -d $PROJECT_SCANS_DIR ]] || (execute mkdir -p "$PROJECT_SCANS_DIR" || return 1) + [[ -d $PROJECT_SCANS_DIR ]] || { execute mkdir -p "$PROJECT_SCANS_DIR" || return 1; } # TODO eventually copy scans from robot but for now let's pause and wait for transfer echo "In the future we will copy scans from robot here" @@ -1561,7 +1561,7 @@ join_interaction_zscores() { "${EXP_PATHS_AND_NAMES[@]}" for f in "${out_files[@]}"; do - [[ -f $f ]] || (echo "$f does not exist"; return 1) + [[ -f $f ]] || { echo "$f does not exist"; return 1; } done } @@ -1606,7 +1606,7 @@ r_gta() { "${5:-"$GTA_OUT_DIR"}" \ "${@:6}" # future arguments - [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) + [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; } } @@ -1779,7 +1779,7 @@ r_add_shift_values() { rm -f "$STUDY_RESULTS_DIR/REMcHeatmaps/"*.pdf # TODO why? out_file="${4:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}" - [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) + [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; } } @@ -1815,7 +1815,7 @@ r_create_heat_maps() { pdfs=(REMcHeatmaps/*.pdf) execute pdftk "${pdfs[@]}" output "$out_file" out_file="$2/compiledREMcHeatmaps.pdf" - [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) + [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; } } @@ -1845,7 +1845,7 @@ r_heat_maps_homology() { pdfs=("${1:-"$STUDY_RESULTS_DIR/homology"}"/*.pdf) execute pdftk "${pdfs[@]}" output "$out_file" - [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) + [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; } } @@ -1869,7 +1869,7 @@ py_gtf_dcon() { "$2/" \ "${@:3}" # future arguments out_file="$2/1-0-0-finaltable.csv" - [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) + [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; } } @@ -1930,7 +1930,7 @@ py_gtf_concat() { debug "Running: ${FUNCNAME[0]} $*" script="$APPS_DIR/python/concatGTFResults.py" execute "$PYTHON" "$script" "$1/" "$2" - [[ -f $2 ]] || (echo "$2 does not exist"; return 1) + [[ -f $2 ]] || { echo "$2 does not exist"; return 1; } } @@ -1983,7 +1983,7 @@ handle_results_dir() { results_dir="${results_dirs[0]}" else # Print results dirs - [[ ${#results_dirs[@]} -gt 0 ]] || (err "No ${2}s found"; return 1) + [[ ${#results_dirs[@]} -gt 0 ]] || { err "No ${2}s found"; return 1; } print_in_columns "results_dirs" # Results selection prompt cat <<-EOF @@ -2091,7 +2091,7 @@ handle_config() { # Declare a nameref to refer to the actual array using the variable name declare -n config_array_ref="$config_array_name" - [[ -z ${!config_array_ref[*]} ]] && (err "No $config_array_name array found in $config_file" && return 1) + [[ -z ${!config_array_ref[*]} ]] && { err "No $config_array_name array found in $config_file" && return 1; } for key in "${!config_array_ref[@]}"; do IFS=',' read -r main_key sub_key <<< "$key" @@ -2298,7 +2298,7 @@ main() { if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then echo "Module ${MODULES[$i]} is not available, removing" read -r -p "Enter replacement module name: " module - ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || (echo "RTFM"; return 1) + ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || { echo "RTFM"; return 1; } MODULES[i]="$module" fi unset module