From a5c38ec47c650a4742a5de6c411209b39c7cc13f Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Tue, 10 Sep 2024 02:16:59 -0400 Subject: [PATCH] Cleanup interaction scores --- .../apps/r/calculate_interaction_zscores5.R | 199 +++++++++++------- 1 file changed, 124 insertions(+), 75 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 2bcd7882..004a16cf 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -180,20 +180,21 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) } -# Calculate interaction scores + calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 - # Pull the background means and standard deviations from 0 concentration + # Pull the background means and standard deviations from zero concentration bg_means <- list( L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(), K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(), r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(), AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() ) + bg_sd <- list( L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(), K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(), @@ -201,56 +202,58 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() ) - # Calculate NG, DB, SM, N interaction_scores <- df %>% + mutate( + WT_L = df$mean_L, + WT_K = df$mean_K, + WT_r = df$mean_r, + WT_AUC = df$mean_AUC, + WT_sd_L = df$sd_L, + WT_sd_K = df$sd_K, + WT_sd_r = df$sd_r, + WT_sd_AUC = df$sd_AUC + ) %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% - mutate( - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE), - N = sum(!is.na(L)) - ) + mutate( + N = sum(!is.na(L)), + NG = sum(NG, na.rm = TRUE), + DB = sum(DB, na.rm = TRUE), + SM = sum(SM, na.rm = TRUE), + across(all_of(variables), list( + mean = ~mean(., na.rm = TRUE), + median = ~median(., na.rm = TRUE), + max = ~max(., na.rm = TRUE), + min = ~min(., na.rm = TRUE), + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) + ), .names = "{.fn}_{.col}") + ) %>% + ungroup() - # Calculate Raw_Shift and Z_Shift for each variable interaction_scores <- interaction_scores %>% mutate( - Raw_Shift_L = mean_L - bg_means$L, - Raw_Shift_K = mean_K - bg_means$K, - Raw_Shift_r = mean_r - bg_means$r, - Raw_Shift_AUC = mean_AUC - bg_means$AUC, - Z_Shift_L = Raw_Shift_L / bg_sd$L, - Z_Shift_K = Raw_Shift_K / bg_sd$K, - Z_Shift_r = Raw_Shift_r / bg_sd$r, - Z_Shift_AUC = Raw_Shift_AUC / bg_sd$AUC + Raw_Shift_L = mean_L[[1]] - bg_means$L, + Raw_Shift_K = mean_K[[1]] - bg_means$K, + Raw_Shift_r = mean_r[[1]] - bg_means$r, + Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC, + Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]], + Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]], + Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]], + Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]] ) - # Calculate WT and WT_sd for each variable - interaction_scores <- interaction_scores %>% - mutate( - WT_L = mean(mean_L, na.rm = TRUE), - WT_K = mean(mean_K, na.rm = TRUE), - WT_r = mean(mean_r, na.rm = TRUE), - WT_AUC = mean(mean_AUC, na.rm = TRUE), - WT_sd_L = sd(mean_L, na.rm = TRUE), - WT_sd_K = sd(mean_K, na.rm = TRUE), - WT_sd_r = sd(mean_r, na.rm = TRUE), - WT_sd_AUC = sd(mean_AUC, na.rm = TRUE) - ) - - # Calculate Exp and Delta for each variable interaction_scores <- interaction_scores %>% mutate( Exp_L = WT_L + Raw_Shift_L, - Delta_L = WT_L - Exp_L, + Delta_L = mean_L - Exp_L, Exp_K = WT_K + Raw_Shift_K, - Delta_K = WT_K - Exp_K, + Delta_K = mean_K - Exp_K, Exp_r = WT_r + Raw_Shift_r, - Delta_r = WT_r - Exp_r, + Delta_r = mean_r - Exp_r, Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_AUC = WT_AUC - Exp_AUC + Delta_AUC = mean_AUC - Exp_AUC ) - # Adjust Delta for NG and SM conditions interaction_scores <- interaction_scores %>% mutate( Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), @@ -258,17 +261,38 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) - ) %>% - ungroup() + ) - # Calculate linear models and lm_Score for each variable + # Calculate Z-scores for each variable + interaction_scores <- interaction_scores %>% + mutate( + Zscore_L = Delta_L / WT_sd_L, + Zscore_K = Delta_K / WT_sd_K, + Zscore_r = Delta_r / WT_sd_r, + Zscore_AUC = Delta_AUC / WT_sd_AUC + ) + + # Calculate linear models + interaction_scores <- interaction_scores %>% + mutate( + lm_L = lm(Delta_L ~ conc_num_factor), + lm_K = lm(Delta_K ~ conc_num_factor), + lm_r = lm(Delta_r ~ conc_num_factor), + lm_AUC = Dlm(Delta_AUC ~ conc_num_factor) + ) + + # Calculate linear models and interaction scores interaction_scores_all <- interaction_scores %>% group_by(across(all_of(group_vars))) %>% mutate( - lm_Score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1], - lm_Score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1], - lm_Score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1], - lm_Score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1] + lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1], + lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1], + lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1], + lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], + r_squared_L = summary(lm_L)$r.squared, + r_squared_K = summary(lm_K)$r.squared, + r_squared_r = summary(lm_r)$r.squared, + r_squared_AUC = summary(lm_AUC)$r.squared ) # Calculate Z_lm for each variable @@ -277,16 +301,11 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), - Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) - ) - - # Calculate Average Z-scores for each variable - interaction_scores_all <- interaction_scores_all %>% - mutate( - Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs, - Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs, - Avg_Zscore_r = sum(Z_Shift_r, na.rm = TRUE) / (total_conc_num - 1), - Avg_Zscore_AUC = sum(Z_Shift_AUC, na.rm = TRUE) / (total_conc_num - 1) + Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE), + Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / num_non_removed_concs, + Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / num_non_removed_concs, + Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1), + Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1) ) # Arrange results by Z_lm_L and NG @@ -298,8 +317,6 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } - -# Main function generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) { `%||%` <- function(a, b) if (!is.null(a)) a else b @@ -308,9 +325,39 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la plots <- lapply(plot_configs, function(config) { df <- config$df - plot <- ggplot(df, aes(x = if (config$plot_type == "box") as.factor(!!sym(config$x_var)) else !!sym(config$x_var), - color = as.factor(!!sym(config$color_var)))) + plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var)))) + # Rank plots with SD annotations + if (config$plot_type == "rank") { + plot <- plot + geom_point(size = 0.1, shape = 3) + + # Adding SD bands + if (!is.null(config$sd_band)) { + for (i in seq_len(config$sd_band)) { + plot <- plot + + annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) + + annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) + + geom_hline(yintercept = c(-i, i), color = "gray") + } + } + + # Optionally add enhancer/suppressor count annotations + if (!is.null(config$enhancer_label)) { + plot <- plot + annotate("text", x = config$enhancer_label$x, + y = config$enhancer_label$y, label = config$enhancer_label$label) + + annotate("text", x = config$suppressor_label$x, + y = config$suppressor_label$y, label = config$suppressor_label$label) + } + } + + # Correlation plots + if (config$plot_type == "correlation") { + plot <- plot + geom_point(shape = 3, color = "gray70") + + geom_smooth(method = "lm", color = "tomato3") + + annotate("text", x = 0, y = 0, label = config$correlation_text) + } + + # General scatter/boxplot/density handling if (!is.null(config$y_var)) { plot <- plot + aes(y = !!sym(config$y_var)) @@ -330,23 +377,27 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) } } - + + # Plot type selection plot <- switch(config$plot_type, "box" = plot + geom_boxplot(), "density" = plot + geom_density(), "bar" = plot + geom_bar(stat = "identity"), plot + geom_point() + geom_smooth(method = "lm", se = FALSE)) + # Adding y-limits if provided if (!is.null(config$ylim_vals)) { plot <- plot + coord_cartesian(ylim = config$ylim_vals) } - + + # Setting legend position, titles, and labels legend_position <- config$legend_position %||% "bottom" - plot <- plot + ggtitle(config$title) + theme_publication(legend_position = legend_position) + plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position) if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label) if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label) + # Adding text annotations if provided if (!is.null(config$annotations)) { for (annotation in config$annotations) { plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label) @@ -393,7 +444,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la } } - interaction_plot_configs <- function(df, variables) { plot_configs <- list() @@ -463,7 +513,6 @@ interaction_plot_configs <- function(df, variables) { return(plot_configs) } - correlation_plot_configs <- function(df, lm_list, lm_summaries) { lapply(seq_along(lm_list), function(i) { r_squared <- round(lm_summaries[[i]]$r.squared, 3) @@ -479,18 +528,20 @@ correlation_plot_configs <- function(df, lm_list, lm_summaries) { # Adjust missing values and calculate ranks adjust_missing_and_rank <- function(df, variables) { - + + # Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables df <- df %>% mutate(across(all_of(variables), list( - Avg_Zscore = if_else(is.na(.Avg_Zscore), 0.001, .Avg_Zscore), - Z_lm = if_else(is.na(.Z_lm), 0.001, .Z_lm), - Rank = rank(.Avg_Zscore), - Rank_lm = rank(.Z_lm), - ), "{.fn}_{.col}")) - + Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))), + Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))), + Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))), + Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column()))) + ), .names = "{fn}_{col}")) + return(df) } + main <- function() { lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] @@ -566,9 +617,7 @@ main <- function() { write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) - - # Plot configurations - # Each list corresponds to a group of plots in the same file + # Plots # Print quality control graphs before removing data due to contamination and # adjusting missing data to max theoretical values @@ -842,8 +891,8 @@ main <- function() { ) lm_summaries <- lapply(lm_list, summary) - correlation_plot_config <- correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries) - generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_config) + correlation_plot_configs <- correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries) + generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs) # Generate ranked plots rank_plot_config <- list(