From 971fb801944ee6ecbc762249181b6cac6428b82d Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Sun, 15 Sep 2024 17:21:21 -0400 Subject: [PATCH] Remove existing df calls for single dplyr --- .../apps/r/calculate_interaction_zscores.R | 68 ++++++------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 913aea91..a9890c84 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -805,6 +805,17 @@ generate_correlation_plot_configs <- function(df, variables) { return(configs) } +filter_and_print_non_finite <- function(df, vars_to_check, print_vars) { + non_finite_rows <- df %>% filter(if_any(all_of(vars_to_check), ~ !is.finite(.))) + + if (nrow(non_finite_rows) > 0) { + message("Removing the following non-finite rows:") + print(non_finite_rows %>% select(all_of(print_vars)), n = 200) + } + + df %>% filter(if_all(all_of(vars_to_check), is.finite)) +} + main <- function() { lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] @@ -821,18 +832,14 @@ main <- function() { print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor", "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB") - message("Loading and filtering data") - df <- load_and_process_data(args$easy_results_file, sd = exp_sd) - df <- update_gene_names(df, args$sgd_gene_list) - df <- as_tibble(df) + message("Loading and filtering data for experiment: ", exp_name) + df <- load_and_process_data(args$easy_results_file, sd = exp_sd) %>% + update_gene_names(args$sgd_gene_list) %>% + as_tibble() - # Filter rows that are above tolerance for quality control plots + # Quality Control: Filter rows above tolerance df_above_tolerance <- df %>% filter(DB == 1) - - # Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .))) - - # Remove rows with 0 values in L df_no_zeros <- df_na %>% filter(L > 0) # Save some constants @@ -842,17 +849,8 @@ main <- function() { message("Calculating summary statistics before quality control") ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars) - # df_ss <- ss$summary_stats df_stats <- ss$df_with_stats - df_filtered_stats <- df_stats %>% - { - non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) - if (nrow(non_finite_rows) > 0) { - message("Filtering out the following non-finite rows:") - print(non_finite_rows %>% select(any_of(print_vars)), n = 200) - } - filter(., if_all(c(L), is.finite)) - } + df_filtered_stats <- filter_and_print_non_finite(df_stats, "L", print_vars) message("Calculating summary statistics after quality control") ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars) @@ -860,28 +858,12 @@ main <- function() { df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) # Filter out non-finite rows for plotting - df_na_filtered_stats <- df_na_stats %>% - { - non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) - if (nrow(non_finite_rows) > 0) { - message("Removed the following non-finite rows:") - print(non_finite_rows %>% select(any_of(print_vars)), n = 200) - } - filter(., if_all(c(L), is.finite)) - } + df_na_filtered_stats <- filter_and_print_non_finite(df_na_stats, "L", print_vars) message("Calculating summary statistics after quality control excluding zero values") ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars) df_no_zeros_stats <- ss$df_with_stats - df_no_zeros_filtered_stats <- df_no_zeros_stats %>% - { - non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.))) - if (nrow(non_finite_rows) > 0) { - message("Removed the following non-finite rows:") - print(non_finite_rows %>% select(any_of(print_vars)), n = 200) - } - filter(., if_all(c(L), is.finite)) - } + df_no_zeros_filtered_stats <- filter_and_print_non_finite(df_no_zeros_stats, "L", print_vars) message("Filtering by 2SD of K") df_na_within_2sd_k <- df_na_stats %>% @@ -892,18 +874,12 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_within_2sd_k_ss <- ss$summary_stats - df_na_l_within_2sd_k_stats <- ss$df_with_stats - write.csv(l_within_2sd_k_ss, - file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) + write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_outside_2sd_k_ss <- ss$summary_stats - df_na_l_outside_2sd_k_stats <- ss$df_with_stats - write.csv(l_outside_2sd_k_ss, - file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) - + write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) + # Each plots list corresponds to a file message("Generating quality control plot configurations") l_vs_k_plots <- list(