From d91a38004ac9c656b078751f031c1e297a3d5623 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Sun, 1 Sep 2024 22:38:59 -0400 Subject: [PATCH] Refactor out individual stats dataframes --- .../apps/r/calculate_interaction_zscores5.R | 70 +++++++++---------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 17b17ecd..7a377dc1 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -283,7 +283,7 @@ process_strains <- function(df, l_within_2sd_k, strain) { if (concentration > 0) { max_l_theoretical <- l_within_2sd_k %>% filter(conc_num_factor == concentration) %>% - pull(max_L) + pull(L_max) df_temp <- df_temp %>% mutate( L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L), @@ -297,14 +297,13 @@ process_strains <- function(df, l_within_2sd_k, strain) { return(df_strains) } -calculate_interaction_scores <- function(df, df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc, - max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { +calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { - # Calculate background means - L_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(L_mean) - K_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(K_mean) + # Pull the background means + l_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(L_mean) + k_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(K_mean) r_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(r_mean) - AUC_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(AUC_mean) + auc_mean_bg <- df %>% filter(conc_num_factor == 0) %>% pull(AUC_mean) # Calculate all necessary statistics and shifts in one step interaction_scores_all <- df %>% @@ -314,26 +313,26 @@ calculate_interaction_scores <- function(df, df_stats_by_l, df_stats_by_k, df_st across(all_of(variables), list(mean = mean, sd = sd), na.rm = TRUE), NG = sum(NG, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), - raw_shift_l = mean(L, na.rm = TRUE) - L_mean_bg, - raw_shift_k = mean(K, na.rm = TRUE) - K_mean_bg, + raw_shift_l = mean(L, na.rm = TRUE) - l_mean_bg, + raw_shift_k = mean(K, na.rm = TRUE) - k_mean_bg, raw_shift_r = mean(r, na.rm = TRUE) - r_mean_bg, - raw_shift_auc = mean(AUC, na.rm = TRUE) - AUC_mean_bg, - z_shift_l = raw_shift_l / df_stats_by_l$L_sd[1], - z_shift_k = raw_shift_k / df_stats_by_k$K_sd[1], - z_shift_r = raw_shift_r / df_stats_by_r$r_sd[1], - z_shift_auc = raw_shift_auc / df_stats_by_auc$AUC_sd[1], - exp_l = L_mean_bg + raw_shift_l, - exp_k = K_mean_bg + raw_shift_k, + raw_shift_auc = mean(AUC, na.rm = TRUE) - auc_mean_bg, + z_shift_l = raw_shift_l / L_sd[1], + z_shift_k = raw_shift_k / K_sd[1], + z_shift_r = raw_shift_r / r_sd[1], + z_shift_auc = raw_shift_auc / AUC_sd[1], + exp_l = l_mean_bg + raw_shift_l, + exp_k = k_mean_bg + raw_shift_k, exp_r = r_mean_bg + raw_shift_r, - exp_auc = AUC_mean_bg + raw_shift_auc, + exp_auc = auc_mean_bg + raw_shift_auc, delta_l = mean(L, na.rm = TRUE) - exp_l, delta_k = mean(K, na.rm = TRUE) - exp_k, delta_r = mean(r, na.rm = TRUE) - exp_r, delta_auc = mean(AUC, na.rm = TRUE) - exp_auc, - zscore_l = delta_l / df_stats_by_l$L_sd, - zscore_k = delta_k / df_stats_by_k$K_sd, - zscore_r = delta_r / df_stats_by_r$r_sd, - zscore_auc = delta_auc / df_stats_by_auc$AUC_sd, + zscore_l = delta_l / L_sd, + zscore_k = delta_k / K_sd, + zscore_r = delta_r / r_sd, + zscore_auc = delta_auc / AUC_sd, sum_z_score_l = sum(zscore_l, na.rm = TRUE), avg_zscore_l = sum_z_score_l / (length(variables) - sum(NG, na.rm = TRUE)), sum_z_score_k = sum(zscore_k, na.rm = TRUE), @@ -574,24 +573,21 @@ main <- function() { stats_joined <- left_join(df_na, stats, by = c("conc_num", "conc_num_factor")) # Create separate dataframes for each variable (we'll use later for plotting) - stats_by_l <- stats_joined %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_k <- stats_joined %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_r <- stats_joined %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_auc <- stats_joined %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") + # stats_by_l <- stats_joined %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") + # stats_by_k <- stats_joined %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") + # stats_by_r <- stats_joined %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") + # stats_by_auc <- stats_joined %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") # Originally this filtered L NA's # I've removed that filtering for now since it didn't seem right but may need to add it back in later # str(stats_by_k) - stats_by_k_joined <- left_join(df_na, stats_by_k, by = c("conc_num", "conc_num_factor")) - - str(stats_by_k_joined) # Filter data within 2SD - within_2sd_k <- stats_by_k %>% + within_2sd_k <- stats_joined %>% filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd)) # Filter data outside 2SD - outside_2sd_k <- stats_by_k %>% + outside_2sd_k <- stats_joined %>% filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd)) # Calculate summary statistics for L within and outside 2SD of K @@ -627,10 +623,10 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) - stats_by_l_bg <- stats_bg %>% select(starts_with("L_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") - stats_by_k_bg <- stats_bg %>% select(starts_with("K_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") - stats_by_r_bg <- stats_bg %>% select(starts_with("r_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") - stats_by_auc_bg <- stats_bg %>% select(starts_with("AUC_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") + # stats_by_l_bg <- stats_bg %>% select(starts_with("L_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") + # stats_by_k_bg <- stats_bg %>% select(starts_with("K_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") + # stats_by_r_bg <- stats_bg %>% select(starts_with("r_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") + # stats_by_auc_bg <- stats_bg %>% select(starts_with("AUC_"), "OrfRep", "Gene", "conc_num", "conc_num_factor") write.csv(stats_bg, file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) @@ -666,11 +662,9 @@ main <- function() { # Calculate interactions variables <- c("L", "K", "r", "AUC") message("Calculating reference interaction scores") - reference_results <- calculate_interaction_scores(reference_strain, stats_by_l, - stats_by_k, stats_by_r, stats_by_auc, max_conc, variables) + reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables) message("Calculating deletion interaction scores") - deletion_results <- calculate_interaction_scores(deletion_strains, stats_by_l, - stats_by_k, stats_by_r, stats_by_auc, max_conc, variables) + deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables) zscores_calculations_reference <- reference_results$zscores_calculations zscores_interactions_reference <- reference_results$zscores_interactions