From ce324529203ec4a30b96c58afe6751608c254fb0 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 2 Oct 2024 06:08:46 -0400 Subject: [PATCH] More nteraction score data structure improvements --- .../apps/r/calculate_interaction_zscores.R | 95 ++++++------------- 1 file changed, 29 insertions(+), 66 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index cdfa77cb..5359d61b 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -256,15 +256,15 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove Zscore_L = Delta_L / WT_sd_L, Zscore_K = Delta_K / WT_sd_K, Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC, + Zscore_AUC = Delta_AUC / WT_sd_AUC ) %>% group_modify(~ { + # Perform linear models lm_L <- lm(Delta_L ~ conc_num_factor, data = .x) lm_K <- lm(Delta_K ~ conc_num_factor, data = .x) lm_r <- lm(Delta_r ~ conc_num_factor, data = .x) lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = .x) - - # Return coefficients and R-squared values + .x %>% mutate( lm_intercept_L = coef(lm_L)[1], @@ -290,31 +290,34 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove }) %>% ungroup() - # Calculate overall mean and SD for lm_Score_* variables + # Continue with the rest of the function as before lm_means_sds <- calculations %>% group_by(across(all_of(group_vars))) %>% summarise( - L_mean = mean(lm_Score_L, na.rm = TRUE), - L_sd = sd(lm_Score_L, na.rm = TRUE), - K_mean = mean(lm_Score_K, na.rm = TRUE), - K_sd = sd(lm_Score_K, na.rm = TRUE), - r_mean = mean(lm_Score_r, na.rm = TRUE), - r_sd = sd(lm_Score_r, na.rm = TRUE), - AUC_mean = mean(lm_Score_AUC, na.rm = TRUE), - AUC_sd = sd(lm_Score_AUC, na.rm = TRUE) + mean_lm_L = mean(lm_Score_L, na.rm = TRUE), + sd_lm_L = sd(lm_Score_L, na.rm = TRUE), + mean_lm_K = mean(lm_Score_K, na.rm = TRUE), + sd_lm_K = sd(lm_Score_K, na.rm = TRUE), + mean_lm_r = mean(lm_Score_r, na.rm = TRUE), + sd_lm_r = sd(lm_Score_r, na.rm = TRUE), + mean_lm_AUC = mean(lm_Score_AUC, na.rm = TRUE), + sd_lm_AUC = sd(lm_Score_AUC, na.rm = TRUE) ) - # Calculate gene Z-scores + # Continue with gene Z-scores and interactions calculations <- calculations %>% + left_join(lm_means_sds, by = group_vars) %>% + group_by(across(all_of(group_vars))) %>% mutate( - Z_lm_L = (lm_Score_L - lm_means_sds$L_mean) / lm_means_sds$L_sd, - Z_lm_K = (lm_Score_K - lm_means_sds$K_mean) / lm_means_sds$K_sd, - Z_lm_r = (lm_Score_r - lm_means_sds$r_mean) / lm_means_sds$r_sd, - Z_lm_AUC = (lm_Score_AUC - lm_means_sds$AUC_mean) / lm_means_sds$AUC_sd + Z_lm_L = (lm_Score_L - mean_lm_L) / sd_lm_L, + Z_lm_K = (lm_Score_K - mean_lm_K) / sd_lm_K, + Z_lm_r = (lm_Score_r - mean_lm_r) / sd_lm_r, + Z_lm_AUC = (lm_Score_AUC - mean_lm_AUC) / sd_lm_AUC ) # Build summary stats (interactions) interactions <- calculations %>% + group_by(across(all_of(group_vars))) %>% summarise( Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs), Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), @@ -359,54 +362,14 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove ) ) - # Fit correlation models between Z_lm_* and Avg_Zscore_* (same-variable) - correlation_lms_same <- list( - L = lm(Z_lm_L ~ Avg_Zscore_L, data = interactions), - K = lm(Z_lm_K ~ Avg_Zscore_K, data = interactions), - r = lm(Z_lm_r ~ Avg_Zscore_r, data = interactions), - AUC = lm(Z_lm_AUC ~ Avg_Zscore_AUC, data = interactions) - ) - - # Extract correlation statistics for same-variable correlations - correlation_stats_same <- map(correlation_lms_same, ~ { - list( - intercept = coef(.x)[1], - slope = coef(.x)[2], - r_squared = summary(.x)$r.squared - ) - }) - - # Fit additional correlation models between different Z_lm_* variables - correlation_lms_diff <- list( - L_vs_K = lm(Z_lm_K ~ Z_lm_L, data = interactions), - L_vs_r = lm(Z_lm_r ~ Z_lm_L, data = interactions), - L_vs_AUC = lm(Z_lm_AUC ~ Z_lm_L, data = interactions), - K_vs_r = lm(Z_lm_r ~ Z_lm_K, data = interactions), - K_vs_AUC = lm(Z_lm_AUC ~ Z_lm_K, data = interactions), - r_vs_AUC = lm(Z_lm_AUC ~ Z_lm_r, data = interactions) - ) - - # Extract correlation statistics for different-variable correlations - correlation_stats_diff <- map(correlation_lms_diff, ~ { - list( - intercept = coef(.x)[1], - slope = coef(.x)[2], - r_squared = summary(.x)$r.squared - ) - }) - - # Combine all correlation stats - correlation_stats <- c(correlation_stats_same, correlation_stats_diff) - - # Prepare full_data by merging interactions back into calculations + # Return full data and correlation stats full_data <- calculations %>% left_join(interactions, by = group_vars) return(list( calculations = calculations, interactions = interactions, - full_data = full_data, - correlation_stats = correlation_stats + full_data = full_data )) } @@ -709,7 +672,7 @@ generate_interaction_plot_configs <- function(df, type) { if (type == "reference") { group_vars <- c("OrfRep", "Gene", "num") df <- df %>% - mutate(OrfRepCombined = do.call(paste, c(across(all_of(group_vars)), sep = "_"))) + mutate(OrfRepCombined = paste(!!!syms(group_vars), sep = "_")) } else if (type == "deletion") { group_vars <- c("OrfRep", "Gene") df <- df %>% @@ -753,7 +716,7 @@ generate_interaction_plot_configs <- function(df, type) { x_breaks = unique(df$conc_num_factor_factor), x_labels = as.character(unique(df$conc_num)), position = "jitter", - smooth = TRUE, + smooth = TRUE, lm_line = list( intercept = mean(df[[lm_intercept_col]], na.rm = TRUE), slope = mean(df[[lm_slope_col]], na.rm = TRUE) @@ -1251,9 +1214,9 @@ main <- function() { ) # Generating quality control plots in parallel - furrr::future_map(plot_configs, function(config) { - generate_and_save_plots(config$out_dir, config$filename, config$plot_configs) - }, .options = furrr_options(seed = TRUE)) + # furrr::future_map(plot_configs, function(config) { + # generate_and_save_plots(config$out_dir, config$filename, config$plot_configs) + # }, .options = furrr_options(seed = TRUE)) # Process background strains bg_strains <- c("YDL227C") @@ -1311,7 +1274,7 @@ main <- function() { df_reference_stats <- calculate_summary_stats( df = df_reference, variables = interaction_vars, - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor") + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") )$df_with_stats reference_results <- calculate_interaction_scores(df_reference_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene", "num")) zscore_calculations_reference <- reference_results$calculations @@ -1322,7 +1285,7 @@ main <- function() { df_deletion_stats <- calculate_summary_stats( df = df_deletion, variables = interaction_vars, - group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor", "conc_num_factor_factor") + group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") )$df_with_stats deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene")) zscore_calculations <- deletion_results$calculations