From 3f2d62937133117f5eae90974ff0c5e137debac7 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 16 Sep 2024 15:33:25 -0400 Subject: [PATCH] Absorb finite filter --- .../apps/r/calculate_interaction_zscores.R | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 3374eb50..f59ea4e9 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -579,7 +579,7 @@ generate_interaction_plot_configs <- function(df, variables) { AUC = c(-6500, 6500) ) - df_filtered <- filter_data_for_plots(df, variables, missing = TRUE, limits_map) + df_filtered <- filter_data(df, variables, missing = TRUE, limits_map) # Define annotation label functions generate_annotation_labels <- function(df, var, annotation_name) { @@ -672,7 +672,7 @@ generate_interaction_plot_configs <- function(df, variables) { generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FALSE) { - df_filtered <- filter_data_for_plots(df, variables, missing = TRUE) + df_filtered <- filter_data(df, variables, missing = TRUE) for (var in variables) { avg_zscore_col <- paste0("Avg_Zscore_", var) @@ -776,22 +776,20 @@ generate_correlation_plot_configs <- function(df, variables) { return(configs) } -filter_and_print_non_finite <- function(df, vars_to_check, print_vars) { - non_finite_rows <- df %>% filter(if_any(all_of(vars_to_check), ~ !is.finite(.))) - - if (nrow(non_finite_rows) > 0) { - message("Filtering non-finite rows:") - print(non_finite_rows %>% select(all_of(print_vars)), n = 200) - } - - df %>% filter(if_all(all_of(vars_to_check), is.finite)) -} - -filter_data_for_plots <- function(df, variables, missing = FALSE, limits_map = NULL) { +filter_data <- function(df, variables, nf = FALSE, missing = FALSE, limits_map = NULL) { # Loop through each variable to filter and print missing/out-of-range data for (variable in variables) { y_var_sym <- sym(variable) + + if (nf) { + non_finite <- df %>% filter(!is.finite(!!y_var_sym)) + if (nrow(non_finite) > 0) { + message("Non-finite rows for variable ", variable, ":") + print(non_finite) + } + df <- df %>% filter(is.finite(!!y_var_sym)) + } # Filter missing data if (missing) { @@ -844,7 +842,7 @@ main <- function() { update_gene_names(args$sgd_gene_list) %>% as_tibble() - # Quality Control: Filter rows above tolerance + # Filter rows above delta background tolerance df_above_tolerance <- df %>% filter(DB == 1) df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .))) df_no_zeros <- df_na %>% filter(L > 0) @@ -857,20 +855,20 @@ main <- function() { message("Calculating summary statistics before quality control") ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars) df_stats <- ss$df_with_stats - df_filtered_stats <- filter_and_print_non_finite(df_stats, "L", print_vars) + message("Filtering non-finite data") + df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE) message("Calculating summary statistics after quality control") ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) - # Filter out non-finite rows for plotting - df_na_filtered_stats <- filter_and_print_non_finite(df_na_stats, "L", print_vars) + df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE) message("Calculating summary statistics after quality control excluding zero values") ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars) df_no_zeros_stats <- ss$df_with_stats - df_no_zeros_filtered_stats <- filter_and_print_non_finite(df_no_zeros_stats, "L", print_vars) + df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE) message("Filtering by 2SD of K") df_na_within_2sd_k <- df_na_stats %>%