From e1f57ff7d7b1cf887a5775f228a83f8bbfc1bc28 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Sun, 1 Sep 2024 17:33:40 -0400 Subject: [PATCH] Add more informative output --- .../apps/r/calculate_interaction_zscores5.R | 115 +++++++++--------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 828f3cb8..17a02ff5 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -127,7 +127,7 @@ load_and_process_data <- function(easy_results_file, sd = 3) { OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), conc_num_factor = as.numeric(as.factor(conc_num)) - 1) - + return(df) } @@ -137,7 +137,7 @@ update_gene_names <- function(df, sgd_gene_list) { genes <- read.delim(file = sgd_gene_list, quote = "", header = FALSE, colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) - + # Create a named vector for mapping ORF to GeneName gene_map <- setNames(genes$V5, genes$V4) @@ -150,7 +150,7 @@ update_gene_names <- function(df, sgd_gene_list) { # Ensure Gene is not left blank or incorrectly updated to "OCT1" df <- df %>% mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes)) - + return(df) } @@ -193,21 +193,21 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_ generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) { plots <- list() - + if (nrow(df) == 0) { message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots") return() } - + message("Generating plots for \"", deparse(substitute(df)), "\" dataframe") for (var in variables) { scatter_plot <- generate_plot(df, x_var = "scan", y_var = var, plot_type = "scatter", - title = paste(prefix, "Scatter Plot for", var)) + title = paste(prefix, "Scatter Plot for", var)) boxplot <- generate_plot(df, x_var = "scan", y_var = var, plot_type = "box", - title = paste(prefix, "Box Plot for", var)) + title = paste(prefix, "Box Plot for", var)) plots[[paste0(var, "_scatter")]] <- scatter_plot plots[[paste0(var, "_box")]] <- boxplot @@ -216,13 +216,13 @@ generate_and_save_plots <- function(df, output_dir, prefix, variables, include_q if (include_qc) { plots[["Raw_L_vs_K"]] <- generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter", - title = "Raw L vs K before QC") + title = "Raw L vs K before QC") plots[["Delta_bg_Density"]] <- generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num", - title = "Density plot for Delta Background by Conc All Data") + title = "Density plot for Delta Background by Conc All Data") plots[["Delta_bg_Bar"]] <- generate_plot(df, x_var = "delta_bg", plot_type = "bar", - title = "Bar plot for Delta Background by Conc All Data") + title = "Bar plot for Delta Background by Conc All Data") } save_plots(prefix, plots, output_dir) @@ -233,6 +233,10 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise(across(all_of(variables), list( + N = ~{ + message("Calculating summary statistics for ", cur_column()) + n() + }, mean = ~mean(.x, na.rm = TRUE), median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE), @@ -284,11 +288,10 @@ save_plots <- function(file_name, plot_list, output_dir) { message("Error in plot: ", plot_name, "\n", e) return(NULL) }) - if (!is.null(pgg)) { saveWidget(pgg, file = file.path(output_dir, - paste0(file_name, "_", plot_name, ".html")), + paste0(file_name, "_", plot_name, ".html")), selfcontained = TRUE) } }) @@ -325,13 +328,13 @@ process_strains <- function(df, l_within_2sd_k, strain, output_dir) { } df_strains <- bind_rows(df_strains, df_temp) # Append the results of this concentration } - + return(df_strains) } -calculate_interaction_scores <- function(df, df_stats_by_l, df_stats_by_k, - df_stats_by_r, df_stats_by_auc, background_means, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { - +calculate_interaction_scores <- function(df, df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc, + background_means, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { + # Calculate all necessary statistics and shifts in one step interaction_scores_all <- df %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% @@ -400,7 +403,7 @@ calculate_interaction_scores <- function(df, df_stats_by_l, df_stats_by_k, SM = sum(SM, na.rm = TRUE) ) %>% ungroup() - + # Ensure interaction_scores has only the specified columns interaction_scores <- interaction_scores %>% select(Gene, raw_shift_l, z_shift_l, lm_Score_L, Z_lm_L = lm_Score_L, R_Squared_L, sum_z_score_l, avg_zscore_l, @@ -436,7 +439,7 @@ generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file labels = unique(as.character(df_calculations$conc_num))) + theme_publication() }) - + save_plots(file_prefix, plot_list, output_dir) } @@ -445,7 +448,7 @@ generate_summary_plots <- function(df, output_dir) { plot_list <- lapply(variables, function(var) { generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var)) }) - + save_plots("Summary_Plots", plot_list, output_dir) } @@ -453,7 +456,7 @@ generate_summary_plots <- function(df, output_dir) { generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) { lm_summaries <- lapply(lm_list, summary) plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC", - "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC") + "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC") plot_list <- lapply(seq_along(lm_list), function(i) { ggplot(df_na_rm, aes_string(x = names(lm_list)[i][1], y = names(lm_list)[i][2])) + @@ -463,7 +466,7 @@ generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) { annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[i]]$r.squared, 3))) + theme_Publication_legend_right() }) - + save_plots("Correlation_CPPs", plot_list, output_dir) } @@ -488,7 +491,7 @@ adjust_missing_and_rank <- function(df) { r_Rank_lm = rank(Z_lm_r), AUC_Rank_lm = rank(Z_lm_AUC) ) - + return(df) } @@ -556,7 +559,7 @@ main <- function() { df <- update_gene_names(df, args$sgd_gene_list) max_conc <- max(df$conc_num_factor) - + # QC # Filter the df above sd tolerance df_above_tolerance <- df %>% filter(DB == 1) @@ -570,31 +573,31 @@ main <- function() { K = ifelse(DB == 1, NA, K) ) df_no_zeros <- df_na %>% filter(L > 0) - + # Generate QC PDFs and HTMLs message("Generating QC plots") variables <- c("L", "K", "r", "AUC", "delta_bg") - generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) - generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) - generate_and_save_plots(df_na, out_dir_qc, "After_QC", variables) - generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) - + # generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) + # generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) + # generate_and_save_plots(df_na, out_dir_qc, "After_QC", variables) + # generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) + # Calculate summary statistics message("Calculating summary statistics for all strains") variables <- c("L", "K", "r", "AUC") summary_stats <- calculate_summary_stats(df_na, variables, group_vars = c("conc_num", "conc_num_factor")) write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - + # Calculate the pre-background stats once stats_by_l <- summary_stats %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") stats_by_k <- summary_stats %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") stats_by_r <- summary_stats %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") stats_by_auc <- summary_stats %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") - + # Process background strains background_strains <- c("YDL227C") lapply(background_strains, function(strain) { - + message("Processing background strain: ", strain) # Handle missing data by setting zero values to NA @@ -608,7 +611,7 @@ main <- function() { AUC = if_else(AUC == 0, NA, AUC) ) %>% filter(!is.na(L)) - + # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") stats_background <- calculate_summary_stats(df_background, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) @@ -619,12 +622,12 @@ main <- function() { write.csv(stats_background, file = file.path(output_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - + # Filter L values within and outside 2SD of K results_2sd <- calculate_l_2sd_of_k(df_background, stats_by_k_background) within_2sd_k <- results_2sd$within_2sd_k outside_2sd_k <- results_2sd$outside_2sd_k - + # Calculate summary statistics for L within and outside 2SD of K message("Calculating summary statistics for for L within 2SD of K") l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) @@ -636,20 +639,20 @@ main <- function() { write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) - + message("Generating Raw_L_vs_K_for_strains_outside_2sd_k plots") generate_and_save_plots(outside_2sd_k, out_dir, "Raw_L_vs_K_for_strains_outside_2sd_k") - + message("Calculating background means") background_means <- calculate_background_means(stats_by_l_background, stats_by_k_background, stats_by_r_background, stats_by_auc_background) - + # Filter reference and deletion strains # Formerly X2_RF (reference strain) df_reference <- df_na %>% filter(OrfRep == strain) %>% mutate(SM = 0) - + # Formerly X2 (deletion strains) df_deletion <- df_na %>% filter(OrfRep != strain) %>% @@ -659,7 +662,7 @@ main <- function() { reference_strain <- process_strains(df_reference, l_within_2sd_k, strain, out_dir) message("Processing deletion strains") deletion_strains <- process_strains(df_deletion, l_within_2sd_k, strain, out_dir) - + # Deprecated # Change OrfRep to include the reference strain, gene, and Num so each RF gets its own score # reference_strain <- reference_strain %>% @@ -668,7 +671,7 @@ main <- function() { # This is synonymous with the legacy OrfRep mutation # Use group_by in functions in lieu of mutating OrfRep # default_group_vars <- c("OrfRep", "Gene", "num") - + # Calculate interactions variables <- c("L", "K", "r", "AUC") message("Calculating reference interaction scores") @@ -677,58 +680,58 @@ main <- function() { message("Calculating deletion interaction scores") deletion_results <- calculate_interaction_scores(deletion_strains, stats_by_l, stats_by_k, stats_by_r, stats_by_auc, background_means, max_conc, variables) - + zscores_calculations_reference <- reference_results$zscores_calculations zscores_interactions_reference <- reference_results$zscores_interactions zscores_calculations <- deletion_results$zscores_calculations zscores_interactions <- deletion_results$zscores_interactions - + # TODO: I don't know if we need this? zscores_interactions <- zscores_interactions %>% arrange(desc(Z_lm_L)) %>% arrange(desc(NG)) - + message("Writing zscores csv files") write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) - + # Define conditions for enhancers and suppressors enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= 2 suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -2 enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= 2 suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -2 - + # Subset data enhancers_L <- zscores_interactions[enhancer_condition_L, ] suppressors_L <- zscores_interactions[suppressor_condition_L, ] enhancers_K <- zscores_interactions[enhancer_condition_K, ] suppressors_K <- zscores_interactions[suppressor_condition_K, ] - + # Save enhancers and suppressors message("Writing enhancer/suppressor csv files") write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_L.csv"), row.names = FALSE) write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_L.csv"), row.names = FALSE) write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_K.csv"), row.names = FALSE) write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K.csv"), row.names = FALSE) - + # Combine conditions for enhancers and suppressors enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ] enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ] - + # Save combined enhancers and suppressors write.csv(enhancers_and_suppressors_L, file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L.csv"), row.names = FALSE) write.csv(enhancers_and_suppressors_K, file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv"), row.names = FALSE) - + # Handle linear model based enhancers and suppressors enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= 2, ] suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -2, ] enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= 2, ] suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -2, ] - + # Save linear model based enhancers and suppressors message("Writing linear model enhancer/suppressor csv files") write.csv(enhancers_lm_L, @@ -739,15 +742,15 @@ main <- function() { file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_K_lm.csv"), row.names = FALSE) write.csv(suppressors_lm_K, file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE) - + # Generate plots for interaction scores generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir) generate_rf_plots(zscores_calculations, zscores_interactions, out_dir) - + # Apply filtering to remove NA values before further analysis zscores_interactions_filtered <- zscores_interactions %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) - + # Generate summary and correlation plots generate_summary_plots(df, out_dir) lm_list <- list( @@ -758,10 +761,10 @@ main <- function() { lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered), lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered) ) - + # Generate cpp correlation plots generate_cpp_correlation_plots(zscores_interactions_filtered, lm_list, out_dir) - + # Generate ranked plots create_ranked_plots(zscores_interactions_filtered, out_dir) })