From 74eace8cde4510c5c1704ddf42812fc745ea6444 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Fri, 13 Sep 2024 14:37:31 -0400 Subject: [PATCH] Simplify calculate_interaction_scores() --- .../apps/r/calculate_interaction_zscores.R | 146 +++++++----------- 1 file changed, 59 insertions(+), 87 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 3c189244..5aa61fcc 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -215,7 +215,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c WT_sd_r = sd_r, WT_sd_AUC = sd_AUC ) %>% - group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% + group_by(OrfRep, Gene, num, conc_num, conc_num_factor) %>% mutate( N = sum(!is.na(L)), NG = sum(NG, na.rm = TRUE), @@ -229,18 +229,20 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c sd = ~sd(., na.rm = TRUE), se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA) ), .names = "{.fn}_{.col}") - ) + ) %>% + ungroup() stats <- stats %>% + group_by(OrfRep, Gene, num) %>% mutate( - Raw_Shift_L = mean_L[[1]] - bg_means$L, - Raw_Shift_K = mean_K[[1]] - bg_means$K, - Raw_Shift_r = mean_r[[1]] - bg_means$r, - Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC, - Z_Shift_L = Raw_Shift_L[[1]] / bg_sd$L, - Z_Shift_K = Raw_Shift_K[[1]] / bg_sd$K, - Z_Shift_r = Raw_Shift_r[[1]] / bg_sd$r, - Z_Shift_AUC = Raw_Shift_AUC[[1]] / bg_sd$AUC + Raw_Shift_L = first(mean_L) - bg_means$L, + Raw_Shift_K = first(mean_K) - bg_means$K, + Raw_Shift_r = first(mean_r) - bg_means$r, + Raw_Shift_AUC = first(mean_AUC) - bg_means$AUC, + Z_Shift_L = first(Raw_Shift_L) / bg_sd$L, + Z_Shift_K = first(Raw_Shift_K) / bg_sd$K, + Z_Shift_r = first(Raw_Shift_r) / bg_sd$r, + Z_Shift_AUC = first(Raw_Shift_AUC) / bg_sd$AUC ) stats <- stats %>% @@ -270,111 +272,81 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c Zscore_K = Delta_K / WT_sd_K, Zscore_r = Delta_r / WT_sd_r, Zscore_AUC = Delta_AUC / WT_sd_AUC - ) %>% - ungroup() - - # Create linear models with error handling for missing/insufficient data - # This part is a PITA so best to contain it in its own function - calculate_lm_values <- function(y, x) { - if (length(unique(x)) > 1 && sum(!is.na(y)) > 1) { - # Suppress warnings only for perfect fits or similar issues - model <- suppressWarnings(lm(y ~ x)) - coefficients <- coef(model) - r_squared <- tryCatch({ - summary(model)$r.squared - }, warning = function(w) { - NA # Set r-squared to NA if there's a warning - }) - return(list(intercept = coefficients[1], slope = coefficients[2], r_squared = r_squared)) - } else { - return(list(intercept = NA, slope = NA, r_squared = NA)) - } - } - - lms <- stats %>% - group_by(across(all_of(group_vars))) %>% - reframe( - lm_L = list(calculate_lm_values(Delta_L, conc_num_factor)), - lm_K = list(calculate_lm_values(Delta_K, conc_num_factor)), - lm_r = list(calculate_lm_values(Delta_r, conc_num_factor)), - lm_AUC = list(calculate_lm_values(Delta_AUC, conc_num_factor)) ) - lms <- lms %>% - mutate( - lm_L_intercept = sapply(lm_L, `[[`, "intercept"), - lm_L_slope = sapply(lm_L, `[[`, "slope"), - lm_L_r_squared = sapply(lm_L, `[[`, "r_squared"), - lm_K_intercept = sapply(lm_K, `[[`, "intercept"), - lm_K_slope = sapply(lm_K, `[[`, "slope"), - lm_K_r_squared = sapply(lm_K, `[[`, "r_squared"), - lm_r_intercept = sapply(lm_r, `[[`, "intercept"), - lm_r_slope = sapply(lm_r, `[[`, "slope"), - lm_r_r_squared = sapply(lm_r, `[[`, "r_squared"), - lm_AUC_intercept = sapply(lm_AUC, `[[`, "intercept"), - lm_AUC_slope = sapply(lm_AUC, `[[`, "slope"), - lm_AUC_r_squared = sapply(lm_AUC, `[[`, "r_squared") - ) %>% - select(-lm_L, -lm_K, -lm_r, -lm_AUC) - stats <- stats %>% - left_join(lms, by = group_vars) %>% mutate( - lm_Score_L = lm_L_slope * max_conc + lm_L_intercept, - lm_Score_K = lm_K_slope * max_conc + lm_K_intercept, - lm_Score_r = lm_r_slope * max_conc + lm_r_intercept, - lm_Score_AUC = lm_AUC_slope * max_conc + lm_AUC_intercept, - R_Squared_L = lm_L_r_squared, - R_Squared_K = lm_K_r_squared, - R_Squared_r = lm_r_r_squared, - R_Squared_AUC = lm_AUC_r_squared, Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) ) + # Calculate linear models and store in own df for now + lms <- stats %>% + reframe( + L = lm(Delta_L ~ conc_num_factor), + K = lm(Delta_K ~ conc_num_factor), + r = lm(Delta_r ~ conc_num_factor), + AUC = lm(Delta_AUC ~ conc_num_factor) + ) + stats <- stats %>% mutate( Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), + lm_Score_L = max_conc * coef(lms$L)[2] + coef(lms$L)[1], + lm_Score_K = max_conc * coef(lms$K)[2] + coef(lms$K)[1], + lm_Score_r = max_conc * coef(lms$r)[2] + coef(lms$r)[1], + lm_Score_AUC = max_conc * coef(lms$AUC)[2] + coef(lms$AUC)[1], + R_Squared_L = summary(lms$L)$r.squared, + R_Squared_K = summary(lms$K)$r.squared, + R_Squared_r = summary(lms$r)$r.squared, + R_Squared_AUC = summary(lms$AUC)$r.squared + ) + + stats <- stats %>% + mutate( Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) - ) %>% - ungroup() + ) # Declare column order for output calculations <- stats %>% - select("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", - "mean_L", "mean_K", "mean_r", "mean_AUC", - "median_L", "median_K", "median_r", "median_AUC", - "sd_L", "sd_K", "sd_r", "sd_AUC", - "se_L", "se_K", "se_r", "se_AUC", - "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", - "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", - "WT_L", "WT_K", "WT_r", "WT_AUC", - "WT_sd_L", "WT_sd_K", "WT_sd_r", "WT_sd_AUC", - "Exp_L", "Exp_K", "Exp_r", "Exp_AUC", - "Delta_L", "Delta_K", "Delta_r", "Delta_AUC", - "Zscore_L", "Zscore_K", "Zscore_r", "Zscore_AUC", - "NG", "SM", "DB") + select( + "OrfRep", "Gene", "num", "conc_num", "conc_num_factor", + "mean_L", "mean_K", "mean_r", "mean_AUC", + "median_L", "median_K", "median_r", "median_AUC", + "sd_L", "sd_K", "sd_r", "sd_AUC", + "se_L", "se_K", "se_r", "se_AUC", + "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", + "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", + "WT_L", "WT_K", "WT_r", "WT_AUC", + "WT_sd_L", "WT_sd_K", "WT_sd_r", "WT_sd_AUC", + "Exp_L", "Exp_K", "Exp_r", "Exp_AUC", + "Delta_L", "Delta_K", "Delta_r", "Delta_AUC", + "Zscore_L", "Zscore_K", "Zscore_r", "Zscore_AUC", + "NG", "SM", "DB") interactions <- stats %>% - select("OrfRep", "Gene", "num", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_AUC", "Raw_Shift_r", - "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", - "lm_Score_L", "lm_Score_K", "lm_Score_AUC", "lm_Score_r", - "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC", - "Sum_Zscore_L", "Sum_Zscore_K", "Sum_Zscore_r", "Sum_Zscore_AUC", - "Avg_Zscore_L", "Avg_Zscore_K", "Avg_Zscore_r", "Avg_Zscore_AUC", - "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC", - "NG", "SM", "DB") %>% + select( + "OrfRep", "Gene", "num", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_AUC", "Raw_Shift_r", + "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", + "lm_Score_L", "lm_Score_K", "lm_Score_AUC", "lm_Score_r", + "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC", + "Sum_Zscore_L", "Sum_Zscore_K", "Sum_Zscore_r", "Sum_Zscore_AUC", + "Avg_Zscore_L", "Avg_Zscore_K", "Avg_Zscore_r", "Avg_Zscore_AUC", + "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC", + "NG", "SM", "DB") %>% arrange(desc(lm_Score_L)) %>% arrange(desc(NG)) + print(df, n = 1) + print(calculations, n = 1) df <- df %>% select(-any_of(setdiff(names(calculations), group_vars))) df <- left_join(df, calculations, by = group_vars) # df <- df %>% select(-any_of(setdiff(names(interactions), group_vars)))