From 5f1331f9a527feee0cce27db17ddbb1ab2d28a51 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 18 Sep 2024 22:14:53 -0400 Subject: [PATCH] Workaround bogus sd and se calculations --- .../apps/r/calculate_interaction_zscores.R | 154 +++++++++--------- 1 file changed, 78 insertions(+), 76 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 3da34a34..9ab59d19 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -163,26 +163,46 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -calculate_summary_stats <- function(df, variables, group_vars) { - +calculate_summary_stats <- function(df, variables, group_vars, no_sd = FALSE) { summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise( N = n(), - across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~ ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(N) - 1 # TODO needs comment for explanation - ), .names = "{.fn}_{.col}"), + across( + all_of(variables), + list( + mean = ~mean(., na.rm = TRUE), + median = ~median(., na.rm = TRUE), + max = ~ ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), + min = ~ ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)) + ), + .names = "{.fn}_{.col}" + ), .groups = "drop" ) + + if (!no_sd) { + sd_se_stats <- df %>% + group_by(across(all_of(group_vars))) %>% + summarise( + across( + all_of(variables), + list( + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(N - 1) + ), + .names = "{.fn}_{.col}" + ), + .groups = "drop" + ) + + summary_stats <- left_join(summary_stats, sd_se_stats, by = group_vars) + } # Create a cleaned version of df that doesn't overlap with summary_stats cleaned_df <- df %>% select(-any_of(setdiff(intersect(names(df), names(summary_stats)), group_vars))) + df_joined <- left_join(cleaned_df, summary_stats, by = group_vars) return(list(summary_stats = summary_stats, df_with_stats = df_joined)) @@ -209,35 +229,32 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " AUC = df %>% filter(conc_num == 0) %>% pull(sd_AUC) %>% first() ) - # Calculate per spot - # sd and se calculations are invalid when grouping at this level - interaction_ss <- calculate_summary_stats( + calculations <- calculate_summary_stats( df = df, variables = variables, - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"), + no_sd = TRUE )$df_with_stats - # Load original WT data - stats <- df %>% + calculations <- calculations %>% group_by(across(all_of(group_vars))) %>% mutate( - WT_L = mean_L, - WT_K = mean_K, - WT_r = mean_r, - WT_AUC = mean_AUC, - WT_sd_L = sd_L, - WT_sd_K = sd_K, - WT_sd_r = sd_r, - WT_sd_AUC = sd_AUC - ) - - cleaned_df <- stats %>% - select(-any_of(setdiff(intersect(names(stats), names(interaction_ss)), - c(group_vars, "sd_L", "sd_K", "sd_r", "sd_AUC", "se_L", "se_K", "se_r", "se_AUC")))) - stats <- left_join(cleaned_df, interaction_ss, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) - - stats <- stats %>% - mutate( + sd_L = df$sd_L, + sd_K = df$sd_K, + sd_r = df$sd_r, + sd_AUC = df$sd_AUC, + se_L = df$se_L, + se_K = df$se_K, + se_r = df$se_r, + se_AUC = df$se_AUC, + WT_L = bg_means$L, + WT_K = bg_means$K, + WT_r = bg_means$r, + WT_AUC = bg_means$AUC, + WT_sd_L = bg_sd$L, + WT_sd_K = bg_sd$K, + WT_sd_r = bg_sd$r, + WT_sd_AUC = bg_sd$AUC, Raw_Shift_L = first(mean_L) - bg_means$L, Raw_Shift_K = first(mean_K) - bg_means$K, Raw_Shift_r = first(mean_r) - bg_means$r, @@ -245,11 +262,7 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " Z_Shift_L = first(Raw_Shift_L) / bg_sd$L, Z_Shift_K = first(Raw_Shift_K) / bg_sd$K, Z_Shift_r = first(Raw_Shift_r) / bg_sd$r, - Z_Shift_AUC = first(Raw_Shift_AUC) / bg_sd$AUC - ) - - stats <- stats %>% - mutate( + Z_Shift_AUC = first(Raw_Shift_AUC) / bg_sd$AUC, Exp_L = WT_L + Raw_Shift_L, Exp_K = WT_K + Raw_Shift_K, Exp_r = WT_r + Raw_Shift_r, @@ -257,11 +270,7 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " Delta_L = mean_L - Exp_L, Delta_K = mean_K - Exp_K, Delta_r = mean_r - Exp_r, - Delta_AUC = mean_AUC - Exp_AUC - ) - - stats <- stats %>% - mutate( + Delta_AUC = mean_AUC - Exp_AUC, Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), @@ -269,21 +278,7 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) ) - stats <- stats %>% - mutate( - Zscore_L = Delta_L / WT_sd_L, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - # Fit linear models and calculate interaction scores - gene_lm_L <- lm(formula = Delta_L ~ conc_num_factor, data = df) - gene_lm_K <- lm(formula = Delta_K ~ conc_num_factor, data = df) - gene_lm_r <- lm(formula = Delta_r ~ conc_num_factor, data = df) - gene_lm_AUC <- lm(formula = Delta_AUC ~ conc_num_factor, data = df) - - interactions <- stats %>% + interactions <- calculations %>% group_by(across(all_of(group_vars))) %>% mutate( OrfRep = first(OrfRep), @@ -291,6 +286,7 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " num = first(num), conc_num = first(conc_num), conc_num_factor = first(conc_num_factor), + N = n(), Raw_Shift_L = first(Raw_Shift_L), Raw_Shift_K = first(Raw_Shift_K), Raw_Shift_r = first(Raw_Shift_r), @@ -299,6 +295,22 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " Z_Shift_K = first(Z_Shift_K), Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), + Zscore_L = Delta_L / WT_sd_L, + Zscore_K = Delta_K / WT_sd_K, + Zscore_r = Delta_r / WT_sd_r, + Zscore_AUC = Delta_AUC / WT_sd_AUC, + Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), + Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), + Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), + Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE), + Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, + Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, + Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), + Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), + gene_lm_L = lm(formula = Delta_L ~ conc_num_factor, data = df), + gene_lm_K = lm(formula = Delta_K ~ conc_num_factor, data = df), + gene_lm_r = lm(formula = Delta_r ~ conc_num_factor, data = df), + gene_lm_AUC = lm(formula = Delta_AUC ~ conc_num_factor, data = df), lm_Score_L = max_conc * (gene_lm_L$coefficients[2]) + gene_lm_L$coefficients[1], lm_Score_K = max_conc * (gene_lm_K$coefficients[2]) + gene_lm_K$coefficients[1], lm_Score_r = max_conc * (gene_lm_r$coefficients[2]) + gene_lm_r$coefficients[1], @@ -307,7 +319,8 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " R_Squared_K = summary(gene_lm_K)$r.squared, R_Squared_r = summary(gene_lm_r)$r.squared, R_Squared_AUC = summary(gene_lm_AUC)$r.squared, - lm_intercept_L = coef(lm(Zscore_L ~ conc_num))[1], + + lm_intercept_L = coef(lm(Zscore_L ~ conc_num_factor))[1], lm_slope_L = coef(lm(Zscore_L ~ conc_num))[2], lm_intercept_K = coef(lm(Zscore_K ~ conc_num))[1], lm_slope_K = coef(lm(Zscore_K ~ conc_num))[2], @@ -315,31 +328,20 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " lm_slope_r = coef(lm(Zscore_r ~ conc_num))[2], lm_intercept_AUC = coef(lm(Zscore_AUC ~ conc_num))[1], lm_slope_AUC = coef(lm(Zscore_AUC ~ conc_num))[2], - Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), - Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), - Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), - Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE), + + Z_lm_L = (lm_Score_L - mean_L) / sd(lm_Score_L, na.rm = TRUE), + Z_lm_K = (lm_Score_K - mean_K) / sd(lm_Score_K, na.rm = TRUE), + Z_lm_r = (lm_Score_r - mean_r) / sd(lm_Score_r, na.rm = TRUE), + Z_lm_AUC = (lm_Score_AUC - mean_AUC) / sd(lm_Score_AUC, na.rm = TRUE), NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1 - ) - - interactions <- interactions %>% - mutate( - Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, - Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, - Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), - Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), - Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), - Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), - Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), - Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) ) %>% arrange(desc(Z_lm_L), desc(NG)) # Declare column order for output - calculations <- stats %>% + calculations <- calculations %>% select( "OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "N", "mean_L", "mean_K", "mean_r", "mean_AUC",