From d5141d04c4a550e8fe353cd059f8f59db288f8f7 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 16 Sep 2024 21:10:22 -0400 Subject: [PATCH] Refactor data filtering --- .../apps/r/calculate_interaction_zscores.R | 43 ++++--------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 945cd74d..cc65a84e 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -155,9 +155,7 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -# Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) { - # Summarize the variables within the grouped data summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% @@ -166,19 +164,18 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con across(all_of(variables), list( mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE), - max = ~ ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), + max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), + min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), sd = ~sd(., na.rm = TRUE), - se = ~ ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)) + se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)) ), .names = "{.fn}_{.col}") ) - # print(summary_stats) - - # Prevent .x and .y suffix issues by renaming columns + # Create a cleaned version of df that doesn't overlap with summary_stats + cols_to_exclude <- unique(c(variables, group_vars)) df_cleaned <- df %>% - select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join - + select(-any_of(cols_to_exclude)) + # Join the stats back to the original dataframe df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars) @@ -205,22 +202,8 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() ) - stats <- df %>% - group_by(OrfRep, Gene, num, conc_num, conc_num_factor) %>% - summarise( - N = sum(!is.na(L)), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE), - across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), - sd = ~sd(., na.rm = TRUE), - se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA) - ), .names = "{.fn}_{.col}") - ) + stats <- calculate_summary_stats(df, variables, + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))$summary_stats stats <- df %>% group_by(OrfRep, Gene, num) %>% @@ -873,14 +856,6 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL avg_zscore_cols <- paste0("Avg_Zscore_", variables) z_lm_cols <- paste0("Z_lm_", variables) - # # Optional: Validate that the expected columns exist in the dataframe - # expected_cols <- c(avg_zscore_cols, z_lm_cols, variables) - # missing_cols <- setdiff(expected_cols, names(df)) - # if (length(missing_cols) > 0) { - # stop("The following expected columns are missing from the dataframe: ", - # paste(missing_cols, collapse = ", ")) - # } - # Adjust NAs if 'adjust' is TRUE if (adjust) { if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns.")