diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 1fa651ff..3dafd1b2 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -155,7 +155,7 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) { +calculate_summary_stats <- function(df, variables, group_vars) { summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% @@ -168,7 +168,9 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), sd = ~sd(., na.rm = TRUE), se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)) - ), .names = "{.fn}_{.col}") + ), + .names = "{.fn}_{.col}"), + .groups = "drop" ) # Create a cleaned version of df that doesn't overlap with summary_stats @@ -181,7 +183,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) } -calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { +calculate_interaction_scores <- function(df, max_conc, variables, group_vars) { # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) @@ -201,8 +203,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() ) - stats <- calculate_summary_stats(df, variables, - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))$summary_stats + stats <- calculate_summary_stats(df, + variables = variables, + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor" + ))$summary_stats stats <- df %>% group_by(OrfRep, Gene, num) %>% @@ -302,7 +306,8 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c lm_slope_AUC = coef(lm_AUC)[2], NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) + SM = sum(SM, na.rm = TRUE), + .groups = "keep" ) num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1 @@ -402,7 +407,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la } # Add interactive tooltips for plotly plots - tooltip_vars <- c("x", "y") # default tooltip variables + tooltip_vars <- c("x", "y") if (!is.null(config$tooltip_vars)) { tooltip_vars <- config$tooltip_vars } else { @@ -687,14 +692,11 @@ generate_interaction_plot_configs <- function(df, variables) { return(configs) } -generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALSE) { +generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, adjust = FALSE) { # Define SD bands sd_bands <- c(1, 2, 3) - # Define variables for Avg ZScore and Rank Avg ZScore plots - variables <- c("r", "L", "K", "AUC") - # Initialize list to store plot configurations configs <- list() @@ -963,8 +965,7 @@ main <- function() { dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across - group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by - orf_group_vars <- c("OrfRep", "Gene", "num") + interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor", "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB") @@ -984,20 +985,29 @@ main <- function() { k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 message("Calculating summary statistics before quality control") - ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars) + ss <- calculate_summary_stats( + df = df, + variables = summary_vars, + group_vars = c("OrfRep", "conc_num", "conc_num_factor")) df_stats <- ss$df_with_stats message("Filtering non-finite data") df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE) message("Calculating summary statistics after quality control") - ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars) + ss <- calculate_summary_stats( + df = df_na, + variables = summary_vars, + group_vars = c("OrfRep", "conc_num", "conc_num_factor")) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE) message("Calculating summary statistics after quality control excluding zero values") - ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars) + ss <- calculate_summary_stats( + df = df_no_zeros, + variables = summary_vars, + group_vars = c("OrfRep", "conc_num", "conc_num_factor")) df_no_zeros_stats <- ss$df_with_stats df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE) @@ -1011,12 +1021,16 @@ main <- function() { # TODO We're omitting the original z_max calculation, not sure if needed? ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) # df_na_l_within_2sd_k_stats <- ss$df_with_stats - write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) + write.csv(ss$summary_stats, + file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), + row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats - write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) + write.csv(ss$summary_stats, + file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), + row.names = FALSE) # Each plots list corresponds to a file l_vs_k_plots <- list( @@ -1183,16 +1197,16 @@ main <- function() { ) ) - message("Generating quality control plots") - generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) - generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) - generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) - generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) - generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots) - generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) - generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots) - generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) - generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) + # message("Generating quality control plots") + # generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) + # generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) + # generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) + # generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + # generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots) + # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) + # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots) + # generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) + # generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) # TODO: Originally this filtered L NA's # Let's try to avoid for now since stats have already been calculated @@ -1217,7 +1231,7 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars) + ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) summary_stats_bg <- ss_bg$summary_stats # df_bg_stats <- ss_bg$df_with_stats write.csv(summary_stats_bg, @@ -1256,18 +1270,15 @@ main <- function() { L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() - message("Calculating interaction scores") - interaction_vars <- c("L", "K", "r", "AUC") - - message("Calculating reference strain(s)") - reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = orf_group_vars) + message("Calculating reference strain interaction scores") + reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num")) zscores_calculations_reference <- reference_results$calculations zscores_interactions_reference <- reference_results$interactions zscores_calculations_reference_joined <- reference_results$calculations_joined zscores_interactions_reference_joined <- reference_results$interactions_joined - message("Calculating deletion strain(s)") - deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = orf_group_vars) + message("Calculating deletion strain(s) interactions scores") + deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num")) zscores_calculations <- deletion_results$calculations zscores_interactions <- deletion_results$interactions zscores_calculations_joined <- deletion_results$calculations_joined @@ -1346,6 +1357,7 @@ main <- function() { rank = TRUE) rank_plot_configs <- generate_rank_plot_configs( df = zscores_interactions_joined_filtered, + variables = interaction_vars, is_lm = FALSE, adjust = TRUE ) @@ -1355,6 +1367,7 @@ main <- function() { message("Generating ranked linear model plots") rank_lm_plot_configs <- generate_rank_plot_configs( df = zscores_interactions_joined_filtered, + variables = interaction_vars, is_lm = TRUE, adjust = TRUE ) @@ -1364,7 +1377,7 @@ main <- function() { message("Filtering and reranking plots") # Formerly X_NArm zscores_interactions_filtered <- zscores_interactions_joined %>% - group_by(across(all_of(orf_group_vars))) %>% + group_by(across(all_of(c("OrfRep", "Gene", "num")))) %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% ungroup() %>% rowwise() %>% @@ -1396,6 +1409,7 @@ main <- function() { rank_plot_filtered_configs <- generate_rank_plot_configs( df = zscores_interactions_filtered, + variables = interaction_vars, is_lm = FALSE, adjust = FALSE ) @@ -1410,6 +1424,7 @@ main <- function() { message("Generating filtered ranked linear model plots") rank_plot_lm_filtered_configs <- generate_rank_plot_configs( df = zscores_interactions_filtered, + variables = interaction_vars, is_lm = TRUE, adjust = FALSE )