From 5ba669f880a89467846e66146038cede8c8820f2 Mon Sep 17 00:00:00 2001
From: Bryan Roessler
Date: Thu, 5 Sep 2024 20:55:54 -0400
Subject: [PATCH] Replace existing summary stats in df

---
NOTE(review): line breaks were restored from a whitespace-collapsed copy of
this patch; leading indentation inside the hunks is inferred -- verify against
the target file (or apply with --ignore-whitespace).

 .../apps/r/calculate_interaction_zscores5.R    | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R
index e287b62d..a8a1e5ca 100644
--- a/workflow/apps/r/calculate_interaction_zscores5.R
+++ b/workflow/apps/r/calculate_interaction_zscores5.R
@@ -195,20 +195,22 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
         min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
         sd = ~sd(., na.rm = TRUE),
         se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA)
-        # TODO: not in original stats but better to do here than in calculate_interactions?
-        # z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA,
-        #   (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score
       ), .names = "{.fn}_{.col}")
     )
 
-  # Join the summary stats back to the original dataframe
-  df_with_stats <- left_join(df, summary_stats, by = group_vars)
+  # Get the column names from the summary_stats dataframe (excluding the group_vars)
+  stat_columns <- setdiff(names(summary_stats), group_vars)
+
+  # Remove existing stats columns from df if they already exist
+  df_cleaned <- df %>% select(-any_of(stat_columns))
+
+  # Join the summary stats back to the cleaned original dataframe
+  df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
 
   # Return both the summary stats and the updated dataframe
   return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
 }
 
-
 # Calculate interaction scores
 calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
 
@@ -651,9 +653,9 @@ main <- function() {
 
   # Filter data within and outside 2SD
   message("Filtering by 2SD of K")
-  df_na_within_2sd_k <- df_na %>%
+  df_na_within_2sd_k <- df_na_stats %>%
     filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
-  df_na_outside_2sd_k <- df_na %>%
+  df_na_outside_2sd_k <- df_na_stats %>%
     filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
 
   # Summary statistics for within and outside 2SD of K