From 919058fc3a15c987b0806f44dfd0555f2d6e8926 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 4 Sep 2024 19:38:05 -0400 Subject: [PATCH] Add generate_interaction_plots() --- .../apps/r/calculate_interaction_zscores5.R | 278 +++++++++++------- 1 file changed, 176 insertions(+), 102 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 2eeafbae..6ecbb378 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -194,118 +194,138 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise(across(all_of(variables), list( + N = length(.x), mean = ~mean(.x, na.rm = TRUE), median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), sd = ~sd(.x, na.rm = TRUE), - se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) # TODO why - 1? - ), .names = "{.col}_{.fn}")) + se = sd / sqrt(N - 1), # TODO why - 1? + z_max = (max - mean) / sd + ), .names = "{.fn}_{.col}")) return(summary_stats) } + calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { + # Calculate total concentration variables + total_conc_num <- length(unique(df$conc_num)) + num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 + # Pull the background means print("Calculating background means") - L_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean) - K_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean) - r_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_mean) - AUC_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean) + bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first() + bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first() + bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first() + bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() - L_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_sd) - K_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_sd) - r_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_sd) - AUC_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_sd) + bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first() + bg_sd_K <- df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first() + bg_sd_r <- df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first() + bg_sd_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() # Calculate summary statistics and shifts print("Calculating interaction scores part 1") - interaction_scores_all <- df %>% + interaction_scores <- df %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% - summarise( - N = n(), - L_mean = mean(L), - L_median = median(L), - L_sd = sd(L), - L_se = L_sd / sqrt(N), - K_mean = mean(K), - K_median = median(K), - K_sd = sd(K), - K_se = K_sd / sqrt(N), - r_mean = mean(r), - r_median = median(r), - r_sd = sd(r), - r_se = r_sd / sqrt(N), - AUC_mean = mean(AUC), - AUC_median = median(AUC), - AUC_sd = sd(AUC), - AUC_se = AUC_sd / sqrt(N), - NG = sum(NG), - DB = sum(DB), - SM = sum(SM), - Raw_Shift_L = L_mean - L_mean_bg[[1]], - Raw_Shift_K = K_mean - K_mean_bg[[1]], - Raw_Shift_r = r_mean - r_mean_bg[[1]], - Raw_Shift_AUC = AUC_mean - AUC_mean_bg[[1]], - Z_Shift_L = Raw_Shift_L / L_sd[[0]], - Z_Shift_K = Raw_Shift_K / K_sd[[0]], - Z_Shift_r = Raw_Shift_r / r_sd[[0]], - Z_Shift_AUC = Raw_Shift_AUC / AUC_sd[[0]], - WT_l = df$L_mean, - WT_K = df$K_mean, - WT_r = df$r_mean, - WT_AUC = df$AUC_mean, - WT_sd_l = L_sd_bg[[1]], - WT_sd_K = K_sd_bg[[1]], - WT_sd_r = r_sd_bg[[1]], - WT_sd_AUC = AUC_sd_bg[[1]], - Exp_L = L_mean_bg[[1]] + Raw_Shift_L, - Exp_K = K_mean_bg[[1]] + Raw_Shift_K, - Exp_r = r_mean_bg[[1]] + Raw_Shift_r, - Exp_AUC = AUC_mean_bg[[1]] + Raw_Shift_AUC, - Delta_L = ifelse(NG == 1, mean(L) - WT_l, Delta_L), - Delta_L = ifelse(SM == 1, mean(L) - WT_l, Delta_L), - Delta_K = ifelse(NG == 1, mean(K) - WT_K, Delta_K), - Delta_r = ifelse(NG == 1, mean(r) - WT_r, Delta_r), - Delta_AUC = ifelse(NG == 1, mean(AUC) - WT_AUC, Delta_AUC), - Zscore_L = Delta_L / WT_sd_l, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC, - ) %>% - ungroup() + summarise( + NG = sum(NG), + DB = sum(DB), + SM = sum(SM) + ) %>% + summarise(across(all_of(variables), list( + mean = ~mean(.x, na.rm = TRUE), + median = ~median(.x, na.rm = TRUE), + max = ~max(.x, na.rm = TRUE), + min = ~min(.x, na.rm = TRUE), + sd = ~sd(.x, na.rm = TRUE), + se = ~sd(.x, na.rm = TRUE) / sqrt(N - 1) # TODO why - 1? + ), .names = "{.fn}_{.col}")) %>% + summarise( + Raw_Shift_L = mean_L[[1]] - bg_L, + Raw_Shift_K = mean_K[[1]] - bg_K, + Raw_Shift_r = mean_r[[1]] - bg_r, + Raw_Shift_AUC = mean_AUC[[1]] - bg_AUC, + Z_Shift_L = Raw_Shift_L[[1]] / bg_sd_L, + Z_Shift_K = Raw_Shift_K[[1]] / bg_sd_K, + Z_Shift_r = Raw_Shift_r[[1]] / bg_sd_r, + Z_Shift_AUC = Raw_Shift_AUC[[1]] / bg_sd_AUC, + WT_L = mean_L, + WT_K = mean_K, + WT_r = mean_r, + WT_AUC = mean_AUC, + WT_sd_L = sd_L, + WT_sd_K = sd_K, + WT_sd_r = sd_r, + WT_sd_AUC = sd_AUC, + Exp_L = WT_L + Raw_Shift_L, + Exp_K = WT_K + Raw_Shift_K, + Exp_r = WT_r + Raw_Shift_r, + Exp_AUC = WT_AUC + Raw_Shift_AUC, + Delta_L = mean_L - Exp_L, + Delta_K = mean_K - Exp_K, + Delta_r = mean_r - Exp_r, + Delta_AUC = mean_AUC - Exp_AUC, + Delta_L = ifelse(NG == 1, mean_L - WT_L, Delta_L), # disregard shift for no growth values in Z score calculation + Delta_L = ifelse(SM == 1, mean_L - WT_L, Delta_L), # disregard shift for set to max values in Z score calculation + Delta_K = ifelse(NG == 1, mean_K - WT_K, Delta_K), + Delta_r = ifelse(NG == 1, mean_r - WT_r, Delta_r), + Delta_AUC = ifelse(NG == 1, mean_AUC - WT_AUC, Delta_AUC), + Zscore_L = Delta_L / WT_sd_L, + Zscore_K = Delta_K / WT_sd_K, + Zscore_r = Delta_r / WT_sd_r, + Zscore_AUC = Delta_AUC / WT_sd_AUC, + ) %>% + ungroup() - # Calculate linear models and interaction scores + # Calculate linear models and interaction scores per gene print("Calculating interaction scores part 2") - interaction_scores <- interaction_scores_all %>% + interaction_scores_all <- interaction_scores %>% group_by(across(all_of(group_vars))) %>% summarise( - lm_l = list(lm(Delta_L ~ conc_num_factor)), - lm_k = list(lm(Delta_K ~ conc_num_factor)), - lm_r = list(lm(Delta_r ~ conc_num_factor)), - lm_auc = list(lm(Delta_AUC ~ conc_num_factor)), - lm_Score_L = max_conc * coef(lm_l[[1]])[2] + coef(lm_l[[1]])[1], - lm_Score_K = max_conc * coef(lm_k[[1]])[2] + coef(lm_k[[1]])[1], - lm_Score_r = max_conc * coef(lm_r[[1]])[2] + coef(lm_r[[1]])[1], - lm_Score_AUC = max_conc * coef(lm_auc[[1]])[2] + coef(lm_auc[[1]])[1], - R_Squared_L = summary(lm_l[[1]])$r.squared, - R_Squared_K = summary(lm_k[[1]])$r.squared, - R_Squared_r = summary(lm_r[[1]])$r.squared, - R_Squared_AUC = summary(lm_auc[[1]])$r.squared, - Sum_Z_Score_L = sum(Zscore_L), - Avg_Zscore_L = Sum_Z_Score_L / (length(variables) - sum(NG)), - Sum_Z_Score_K = sum(Zscore_K), - Avg_Zscore_K = Sum_Z_Score_K / (length(variables) - sum(NG)), - Sum_Z_Score_r = sum(Zscore_r), - Avg_Zscore_r = Sum_Z_Score_r / (length(variables) - sum(NG)), - Sum_Z_Score_AUC = sum(Zscore_AUC), - Avg_Zscore_AUC = Sum_Z_Score_AUC / (length(variables) - sum(NG)), + lm_L = lm(Delta_L ~ conc_num_factor), + lm_K = lm(Delta_K ~ conc_num_factor), + lm_r = lm(Delta_r ~ conc_num_factor), + lm_AUC = lm(Delta_AUC ~ conc_num_factor), + lm_score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1], + lm_score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1], + lm_score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1], + lm_score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], + lm_sd_L = sd(lm_score_L), + lm_sd_K = sd(lm_score_K), + lm_sd_r = sd(lm_score_r), + lm_sd_AUC = sd(lm_score_AUC), + lm_mean_L = mean(lm_score_L), + lm_mean_K = mean(lm_score_K), + lm_mean_r = mean(lm_score_r), + lm_mean_AUC = mean(lm_score_AUC), + Z_lm_L = (lm_score_L - lm_mean_L) / lm_sd_L, + Z_lm_K = (lm_score_K - lm_mean_K) / lm_sd_K, + Z_lm_r = (lm_score_r - lm_mean_r) / lm_sd_r, + Z_lm_AUC = (lm_score_AUC - lm_mean_AUC) / lm_sd_AUC, + r_squared_L = summary(lm_L)$r.squared, + r_squared_K = summary(lm_K)$r.squared, + r_squared_r = summary(lm_r)$r.squared, + r_squared_AUC = summary(lm_AUC)$r.squared, + Sum_Zscore_L = sum(Zscore_L), + Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, + Sum_Zscore_K = sum(Zscore_K), + Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, + Sum_Zscore_r = sum(Zscore_r), + Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), + Sum_Zscore_AUC = sum(Zscore_AUC), + Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), NG = sum(NG), DB = sum(DB), SM = sum(SM) ) %>% ungroup() + + interactions_scores_all <- interactions_scores_all %>% + arrange(desc(Z_lm_L)) %>% + arrange(desc(NG)) return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } @@ -315,7 +335,7 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_ title, x_label = NULL, y_label = NULL, ylim_vals = NULL) { # Use tidy evaluation with aes() and !!sym() for dynamic column names - plot <- ggplot(df, aes(x = !!sym(x_var), color = !!sym(color_var))) + plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) if (!is.null(y_var)) { plot <- plot + aes(y = !!sym(y_var)) @@ -407,6 +427,53 @@ save_plots <- function(file_name, plot_list, output_dir) { } + +generate_interaction_plots <- function(df, output_file) { + message("Generating interaction plots") + + # Variables to be plotted + variables <- c("L", "K", "r", "AUC") + ylims <- list( + L = c(0, 160), + K = c(-20, 160), + r = c(0, 1), + AUC = c(0, 12500) + ) + + plot_list <- list() + + # Generate plots for each variable using the existing plotting function + for (var in variables) { + plot <- generate_plot( + df = df, + x_var = "conc_num_factor", + y_var = var, + plot_type = "scatter", + title = paste("Scatter RF for", var, "with SD"), + ylim_vals = ylims[[var]] + ) + + annotate("text", x = -0.25, y = ifelse(var == "L", 10, ifelse(var == "K", -5, 0.9)), label = "NG") + + annotate("text", x = -0.25, y = ifelse(var == "L", 5, ifelse(var == "K", -12.5, 0.8)), label = "DB") + + annotate("text", x = -0.25, y = ifelse(var == "L", 0, ifelse(var == "K", -20, 0.7)), label = "SM") + + annotate("text", x = unique(df$conc_num_factor), y = ifelse(var == "L", 10, ifelse(var == "K", -5, 0.9)), label = df$NG) + + annotate("text", x = unique(df$conc_num_factor), y = ifelse(var == "L", 5, ifelse(var == "K", -12.5, 0.8)), label = df$DB) + + annotate("text", x = unique(df$conc_num_factor), y = ifelse(var == "L", 0, ifelse(var == "K", -20, 0.7)), label = df$SM) + + plot_list[[var]] <- plot + } + + # Save plots in a PDF + pdf(output_file, width = 16, height = 16) + grid.arrange( + plot_list$L, plot_list$K, + plot_list$r, plot_list$AUC, + ncol = 2, nrow = 2 + ) + dev.off() +} + + + generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) { lm_summaries <- lapply(lm_list, summary) plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC", @@ -531,21 +598,27 @@ main <- function() { } # Generate and save QC plots using the new generalized function - message("Generating QC plots") - variables <- c("L", "K", "r", "AUC", "delta_bg") - generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) - generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) - generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) - generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) + # message("Generating QC plots") + # variables <- c("L", "K", "r", "AUC", "delta_bg") + # generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) + # generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) + # generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) + # generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) rm(df, df_above_tolerance, df_no_zeros) # Calculate summary statistics message("Calculating summary statistics for all strains") variables <- c("L", "K", "r", "AUC") - stats <- calculate_summary_stats(df_na, variables, group_vars = c("conc_num", "conc_num_factor")) + stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - stats_joined <- left_join(df_na, stats, by = c("conc_num", "conc_num_factor")) + stats_joined <- left_join(df_na, stats, by = c("OrfRep", "conc_num", "conc_num_factor")) + + # Create interaction plots + generate_interaction_plots(stats_joined, output_file = file.path(output_dir, "InteractionPlots.pdf")) + + print("stats:") + print(head(stats)) # Originally this filtered L NA's @@ -603,7 +676,7 @@ main <- function() { write.csv(stats_bg, file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor")) + # stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor")) # Filter reference and deletion strains # Formerly X2_RF (reference strain) @@ -616,13 +689,14 @@ main <- function() { filter(OrfRep != strain) %>% mutate(SM = 0) - reference_strain <- process_strains(l_within_2sd_k_joined) - deletion_strains <- process_strains(l_within_2sd_k_joined) + reference_strain <- process_strains(df_reference) # TODO double-check + deletion_strains <- process_strains(df_deletion) # TODO double-check # TODO we may need to add "num" to grouping vars # Calculate interactions variables <- c("L", "K", "r", "AUC") + # We are recalculating some of the data here reference_results <- calculate_interaction_scores(stats_joined, reference_strain, max_conc, variables) deletion_results <- calculate_interaction_scores(stats_joined, deletion_strains, max_conc, variables) @@ -631,17 +705,17 @@ main <- function() { zscores_calculations <- deletion_results$zscores_calculations zscores_interactions <- deletion_results$zscores_interactions - # TODO: I don't know if we need this? - # zscores_interactions <- zscores_interactions %>% - # arrange(desc(Z_lm_L)) %>% - # arrange(desc(NG)) - # Writing Z-Scores to file write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) + + + + + # Define conditions for enhancers and suppressors # TODO Add to study config file? threshold <- 2