From fcc6cdd2d4d21617666b76d3f1dc627b865550c5 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Fri, 6 Sep 2024 01:22:15 -0400 Subject: [PATCH] Pre sd/se fixes --- .../apps/r/calculate_interaction_zscores5.R | 174 +++++++++++------- 1 file changed, 105 insertions(+), 69 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index a261e8ee..a25102ec 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -183,6 +183,10 @@ process_strains <- function(df) { # Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { + # Replace 0 values with NA + df <- df %>% + mutate(across(all_of(variables), ~ifelse(. == 0, NA, .))) + # Calculate summary statistics, including a single N based on L summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% @@ -193,44 +197,50 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c median = ~median(., na.rm = TRUE), max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), - sd = ~sd(., na.rm = TRUE), - se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA) + sd = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE), 0), # If N == 1, sd is set to 0 + se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), 0) # If N == 1, se is set to 0 ), .names = "{.fn}_{.col}") ) - # Get the column names from the summary_stats dataframe (excluding the group_vars) - stat_columns <- setdiff(names(summary_stats), group_vars) - # Remove existing stats columns from df if they already exist + stat_columns <- setdiff(names(summary_stats), group_vars) df_cleaned <- df %>% select(-any_of(stat_columns)) - + # Join the summary stats back to the cleaned original dataframe df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars) - - # Return both the summary stats and the updated dataframe + return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) } # Calculate interaction scores calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { + if (nrow(df) == 0) { + message("Dataframe is empty after filtering") + return(NULL) # Or handle the empty dataframe case as needed + } + # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 - # Pull the background means + # Pull the background means and standard deviations print("Calculating background means") - bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first() - bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first() - bg_r <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first() - bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() + print(head(df)) + bg_means <- list( + L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(), + K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(), + r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(), + AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() + ) + bg_sd <- list( + L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(), + K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(), + r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(), + AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() + ) - bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first() - bg_sd_K <- df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first() - bg_sd_r <- df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first() - bg_sd_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() - - # First mutate block to calculate NG, DB, SM + # First mutate block to calculate NG, DB, SM, and N print("Calculating interaction scores part 1") interaction_scores <- df %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% @@ -238,38 +248,59 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), - N = ~sum(!is.na(L)), # Count of non-NA values + N = sum(!is.na(L)) # Count of non-NA values in L ) - - # Second mutate block to calculate variables and Delta using NG + + # Calculate Raw_Shift and Z_Shift for each variable + print("Calculating Raw_Shift and Z_Shift") interaction_scores <- interaction_scores %>% - mutate(across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(N - 1), # Standard error calculation - Raw_Shift = ~mean(., na.rm = TRUE) - case_when( - cur_column() == "L" ~ bg_L, - cur_column() == "K" ~ bg_K, - cur_column() == "r" ~ bg_r, - cur_column() == "AUC" ~ bg_AUC - ), - Z_Shift = ~ Raw_Shift / case_when( - cur_column() == "L" ~ bg_sd_L, - cur_column() == "K" ~ bg_sd_K, - cur_column() == "r" ~ bg_sd_r, - cur_column() == "AUC" ~ bg_sd_AUC - ), - WT = ~mean(., na.rm = TRUE), - WT_sd = ~sd(., na.rm = TRUE), - Exp = ~ WT + Raw_Shift, - Delta = ~ WT - Exp, - Delta = if_else(NG == 1, mean - WT, Delta) - ), .names = "{.fn}_{.col}")) %>% mutate( - Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set to max values in Z score calculation + Raw_Shift_L = mean_L - bg_means$L, + Raw_Shift_K = mean_K - bg_means$K, + Raw_Shift_r = mean_r - bg_means$r, + Raw_Shift_AUC = mean_AUC - bg_means$AUC, + Z_Shift_L = Raw_Shift_L / bg_sd$L, + Z_Shift_K = Raw_Shift_K / bg_sd$K, + Z_Shift_r = Raw_Shift_r / bg_sd$r, + Z_Shift_AUC = Raw_Shift_AUC / bg_sd$AUC + ) + + # Calculate WT and WT_sd for each variable + print("Calculating WT and WT_sd") + interaction_scores <- interaction_scores %>% + mutate( + WT_L = mean(mean_L, na.rm = TRUE), + WT_K = mean(mean_K, na.rm = TRUE), + WT_r = mean(mean_r, na.rm = TRUE), + WT_AUC = mean(mean_AUC, na.rm = TRUE), + WT_sd_L = sd(mean_L, na.rm = TRUE), + WT_sd_K = sd(mean_K, na.rm = TRUE), + WT_sd_r = sd(mean_r, na.rm = TRUE), + WT_sd_AUC = sd(mean_AUC, na.rm = TRUE) + ) + + # Calculate Exp and Delta for each variable + print("Calculating Exp and Delta") + interaction_scores <- interaction_scores %>% + mutate( + Exp_L = WT_L + Raw_Shift_L, + Delta_L = WT_L - Exp_L, + Exp_K = WT_K + Raw_Shift_K, + Delta_K = WT_K - Exp_K, + Exp_r = WT_r + Raw_Shift_r, + Delta_r = WT_r - Exp_r, + Exp_AUC = WT_AUC + Raw_Shift_AUC, + Delta_AUC = WT_AUC - Exp_AUC + ) + + # Final adjustment to Delta for NG and SM conditions + interaction_scores <- interaction_scores %>% + mutate( + Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), + Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), + Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), + Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), + Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set-to-max values in Z score calculation ) %>% ungroup() @@ -277,29 +308,29 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c print("Calculating interaction scores part 2") interaction_scores_all <- interaction_scores %>% group_by(across(all_of(group_vars))) %>% - mutate(across(all_of(variables), list( - lm_score = ~ max_conc * lm(Delta ~ conc_num_factor)$coefficients[2] + lm(Delta ~ conc_num_factor)$coefficients[1], - lm_sd = ~ sd(lm_score), - lm_mean = ~ mean(lm_score), - Z_lm = ~ (lm_score - lm_mean) / lm_sd, - Sum_Zscore = ~ sum(Zscore, na.rm = TRUE) - ), .names = "{.fn}_{.col}")) %>% + mutate(lm_score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1], + lm_score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1], + lm_score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1], + lm_score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]) %>% mutate( - Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, - Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, - Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), - Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1) - ) %>% + Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs, + Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs, + Avg_Zscore_r = sum(Z_Shift_r, na.rm = TRUE) / (total_conc_num - 1), + Avg_Zscore_AUC = sum(Z_Shift_AUC, na.rm = TRUE) / (total_conc_num - 1) + ) + + # Arrange the results by Z_lm_L and NG + interaction_scores_all <- interaction_scores_all %>% + arrange(desc(lm_score_L)) %>% + arrange(desc(NG)) %>% ungroup() - interaction_scores_all <- interaction_scores_all %>% - arrange(desc(Z_lm_L)) %>% - arrange(desc(NG)) - return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } + + interaction_plot_configs <- function(df, variable) { ylim_vals <- switch(variable, "L" = c(-65, 65), @@ -650,7 +681,7 @@ main <- function() { write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) print("Summary stats:") - print(head(summary_stats)) + print(head(summary_stats), width = 200) # TODO: Originally this filtered L NA's # Let's try to avoid for now since stats have already been calculated @@ -702,17 +733,17 @@ main <- function() { file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - print("Background summary stats:") - print(head(summary_stats_bg)) + #print("Background summary stats:") + #print(head(summary_stats_bg)) # Filter reference and deletion strains # Formerly X2_RF (reference strain) - df_reference <- df_bg_stats %>% + df_reference <- df_na_stats %>% filter(OrfRep == strain) %>% mutate(SM = 0) # Formerly X2 (deletion strains) - df_deletion <- df_bg_stats %>% + df_deletion <- df_na_stats %>% filter(OrfRep != strain) %>% mutate(SM = 0) @@ -722,7 +753,12 @@ main <- function() { # Calculate interactions variables <- c("L", "K", "r", "AUC") # We are recalculating some of the data here + message("Calculating interaction scores") + print("Reference strain:") + print(head(reference_strain)) reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables) + print("Deletion strains:") + print(head(deletion_strains)) deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables) zscores_calculations_reference <- reference_results$zscores_calculations