diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 83c34abd..cdfa77cb 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -256,79 +256,66 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove Zscore_L = Delta_L / WT_sd_L, Zscore_K = Delta_K / WT_sd_K, Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - # Fit linear models per group - lm_results <- calculations %>% - nest() %>% - mutate( - # Fit linear models - gene_lm_L = map(data, ~ lm(Delta_L ~ conc_num_factor, data = .x)), - gene_lm_K = map(data, ~ lm(Delta_K ~ conc_num_factor, data = .x)), - gene_lm_r = map(data, ~ lm(Delta_r ~ conc_num_factor, data = .x)), - gene_lm_AUC = map(data, ~ lm(Delta_AUC ~ conc_num_factor, data = .x)), - - # Extract coefficients using purrr::map_dbl - lm_intercept_L = map_dbl(gene_lm_L, ~ coef(.x)[1]), - lm_slope_L = map_dbl(gene_lm_L, ~ coef(.x)[2]), - lm_intercept_K = map_dbl(gene_lm_K, ~ coef(.x)[1]), - lm_slope_K = map_dbl(gene_lm_K, ~ coef(.x)[2]), - lm_intercept_r = map_dbl(gene_lm_r, ~ coef(.x)[1]), - lm_slope_r = map_dbl(gene_lm_r, ~ coef(.x)[2]), - lm_intercept_AUC = map_dbl(gene_lm_AUC, ~ coef(.x)[1]), - lm_slope_AUC = map_dbl(gene_lm_AUC, ~ coef(.x)[2]), - - # Calculate R-squared values for Delta_ models - R_Squared_L = map_dbl(gene_lm_L, ~ summary(.x)$r.squared), - R_Squared_K = map_dbl(gene_lm_K, ~ summary(.x)$r.squared), - R_Squared_r = map_dbl(gene_lm_r, ~ summary(.x)$r.squared), - R_Squared_AUC = map_dbl(gene_lm_AUC, ~ summary(.x)$r.squared), - - # Calculate lm_Score_* based on coefficients - lm_Score_L = max_conc * lm_slope_L + lm_intercept_L, - lm_Score_K = max_conc * lm_slope_K + lm_intercept_K, - lm_Score_r = max_conc * lm_slope_r + lm_intercept_r, - lm_Score_AUC = max_conc * lm_slope_AUC + lm_intercept_AUC + Zscore_AUC = Delta_AUC / WT_sd_AUC, ) %>% - select(-data, -starts_with("gene_lm_")) %>% + group_modify(~ { + lm_L <- lm(Delta_L ~ conc_num_factor, data = .x) + lm_K <- lm(Delta_K ~ conc_num_factor, data = .x) + lm_r <- lm(Delta_r ~ conc_num_factor, data = .x) + lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = .x) + + # Return coefficients and R-squared values + .x %>% + mutate( + lm_intercept_L = coef(lm_L)[1], + lm_slope_L = coef(lm_L)[2], + R_Squared_L = summary(lm_L)$r.squared, + lm_Score_L = max_conc * lm_slope_L + lm_intercept_L, + + lm_intercept_K = coef(lm_K)[1], + lm_slope_K = coef(lm_K)[2], + R_Squared_K = summary(lm_K)$r.squared, + lm_Score_K = max_conc * lm_slope_K + lm_intercept_K, + + lm_intercept_r = coef(lm_r)[1], + lm_slope_r = coef(lm_r)[2], + R_Squared_r = summary(lm_r)$r.squared, + lm_Score_r = max_conc * lm_slope_r + lm_intercept_r, + + lm_intercept_AUC = coef(lm_AUC)[1], + lm_slope_AUC = coef(lm_AUC)[2], + R_Squared_AUC = summary(lm_AUC)$r.squared, + lm_Score_AUC = max_conc * lm_slope_AUC + lm_intercept_AUC + ) + }) %>% ungroup() - - # Merge lm_results back into calculations - calculations <- calculations %>% - left_join(lm_results, by = group_vars) - + # Calculate overall mean and SD for lm_Score_* variables - gene_lm_means <- lm_results %>% + lm_means_sds <- calculations %>% + group_by(across(all_of(group_vars))) %>% summarise( - L = mean(lm_Score_L, na.rm = TRUE), - K = mean(lm_Score_K, na.rm = TRUE), - r = mean(lm_Score_r, na.rm = TRUE), - AUC = mean(lm_Score_AUC, na.rm = TRUE) - ) - - gene_lm_sds <- lm_results %>% - summarise( - L = sd(lm_Score_L, na.rm = TRUE), - K = sd(lm_Score_K, na.rm = TRUE), - r = sd(lm_Score_r, na.rm = TRUE), - AUC = sd(lm_Score_AUC, na.rm = TRUE) + L_mean = mean(lm_Score_L, na.rm = TRUE), + L_sd = sd(lm_Score_L, na.rm = TRUE), + K_mean = mean(lm_Score_K, na.rm = TRUE), + K_sd = sd(lm_Score_K, na.rm = TRUE), + r_mean = mean(lm_Score_r, na.rm = TRUE), + r_sd = sd(lm_Score_r, na.rm = TRUE), + AUC_mean = mean(lm_Score_AUC, na.rm = TRUE), + AUC_sd = sd(lm_Score_AUC, na.rm = TRUE) ) # Calculate gene Z-scores calculations <- calculations %>% mutate( - Z_lm_L = (lm_Score_L - gene_lm_means$L) / gene_lm_sds$L, - Z_lm_K = (lm_Score_K - gene_lm_means$K) / gene_lm_sds$K, - Z_lm_r = (lm_Score_r - gene_lm_means$r) / gene_lm_sds$r, - Z_lm_AUC = (lm_Score_AUC - gene_lm_means$AUC) / gene_lm_sds$AUC + Z_lm_L = (lm_Score_L - lm_means_sds$L_mean) / lm_means_sds$L_sd, + Z_lm_K = (lm_Score_K - lm_means_sds$K_mean) / lm_means_sds$K_sd, + Z_lm_r = (lm_Score_r - lm_means_sds$r_mean) / lm_means_sds$r_sd, + Z_lm_AUC = (lm_Score_AUC - lm_means_sds$AUC_mean) / lm_means_sds$AUC_sd ) # Build summary stats (interactions) interactions <- calculations %>% - group_by(across(all_of(group_vars))) %>% summarise( - # Calculate average Z-scores Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs), Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(num_non_removed_concs), @@ -352,16 +339,12 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), - # Include NG, DB, SM NG = first(NG), DB = first(DB), SM = first(SM) ) %>% arrange(desc(Z_lm_L), desc(NG)) %>% - ungroup() - - # Calculate overlap - interactions <- interactions %>% + ungroup() %>% mutate( Overlap = case_when( Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both", @@ -375,7 +358,7 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove TRUE ~ "No Effect" ) ) - + # Fit correlation models between Z_lm_* and Avg_Zscore_* (same-variable) correlation_lms_same <- list( L = lm(Z_lm_L ~ Avg_Zscore_L, data = interactions),