From c7ad7246eed082573a711dd3998bef8675d119c0 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Tue, 24 Sep 2024 20:02:02 -0400 Subject: [PATCH] Redo interaction score groupings --- .../apps/r/calculate_interaction_zscores.R | 186 +++++++++--------- 1 file changed, 98 insertions(+), 88 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 9ab59d19..7ff5a570 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -163,41 +163,26 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -calculate_summary_stats <- function(df, variables, group_vars, no_sd = FALSE) { +calculate_summary_stats <- function(df, variables, group_vars) { + summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise( N = n(), across( - all_of(variables), + all_of(variables), list( mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE), - max = ~ ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)) + max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), + min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(N - 1) # TODO non-standard SE, needs explanation ), .names = "{.fn}_{.col}" ), .groups = "drop" ) - - if (!no_sd) { - sd_se_stats <- df %>% - group_by(across(all_of(group_vars))) %>% - summarise( - across( - all_of(variables), - list( - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(N - 1) - ), - .names = "{.fn}_{.col}" - ), - .groups = "drop" - ) - - summary_stats <- left_join(summary_stats, sd_se_stats, by = group_vars) - } # Create a cleaned version of df that doesn't overlap with summary_stats cleaned_df <- df %>% @@ -232,21 +217,18 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " calculations <- calculate_summary_stats( df = df, variables = variables, - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"), - no_sd = TRUE + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") )$df_with_stats calculations <- calculations %>% group_by(across(all_of(group_vars))) %>% mutate( - sd_L = df$sd_L, - sd_K = df$sd_K, - sd_r = df$sd_r, - sd_AUC = df$sd_AUC, - se_L = df$se_L, - se_K = df$se_K, - se_r = df$se_r, - se_AUC = df$se_AUC, + NG = sum(NG, na.rm = TRUE), + DB = sum(DB, na.rm = TRUE), + SM = sum(SM, na.rm = TRUE), + num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, + + # Store the background data WT_L = bg_means$L, WT_K = bg_means$K, WT_r = bg_means$r, @@ -275,70 +257,84 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), - Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) - ) + Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L), + # Calculate Z-scores + Zscore_L = Delta_L / WT_sd_L, + Zscore_K = Delta_K / WT_sd_K, + Zscore_r = Delta_r / WT_sd_r, + Zscore_AUC = Delta_AUC / WT_sd_AUC, + + # Fit linear models and store in list-columns + gene_lm_L = list(lm(Delta_L ~ conc_num_factor, data = cur_data())), + gene_lm_K = list(lm(Delta_K ~ conc_num_factor, data = cur_data())), + gene_lm_r = list(lm(Delta_r ~ conc_num_factor, data = cur_data())), + gene_lm_AUC = list(lm(Delta_AUC ~ conc_num_factor, data = cur_data())), + + # Extract coefficients using purrr::map_dbl + lm_intercept_L = map_dbl(gene_lm_L, ~ coef(.x)[1]), + lm_slope_L = map_dbl(gene_lm_L, ~ coef(.x)[2]), + lm_intercept_K = map_dbl(gene_lm_K, ~ coef(.x)[1]), + lm_slope_K = map_dbl(gene_lm_K, ~ coef(.x)[2]), + lm_intercept_r = map_dbl(gene_lm_r, ~ coef(.x)[1]), + lm_slope_r = map_dbl(gene_lm_r, ~ coef(.x)[2]), + lm_intercept_AUC = map_dbl(gene_lm_AUC, ~ coef(.x)[1]), + lm_slope_AUC = map_dbl(gene_lm_AUC, ~ coef(.x)[2]), + + # Calculate lm_Score_* based on coefficients + lm_Score_L = max_conc * lm_slope_L + lm_intercept_L, + lm_Score_K = max_conc * lm_slope_K + lm_intercept_K, + lm_Score_r = max_conc * lm_slope_r + lm_intercept_r, + lm_Score_AUC = max_conc * lm_slope_AUC + lm_intercept_AUC, + + # Calculate R-squared values + R_Squared_L = map_dbl(gene_lm_L, ~ summary(.x)$r.squared), + R_Squared_K = map_dbl(gene_lm_K, ~ summary(.x)$r.squared), + R_Squared_r = map_dbl(gene_lm_r, ~ summary(.x)$r.squared), + R_Squared_AUC = map_dbl(gene_lm_AUC, ~ summary(.x)$r.squared), + + # Calculate Z_lm_* Scores + Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), + Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), + Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), + Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) + ) %>% + ungroup() + + # Summarize some of the stats interactions <- calculations %>% group_by(across(all_of(group_vars))) %>% mutate( OrfRep = first(OrfRep), Gene = first(Gene), num = first(num), - conc_num = first(conc_num), - conc_num_factor = first(conc_num_factor), - N = n(), + + # Calculate raw shifts Raw_Shift_L = first(Raw_Shift_L), Raw_Shift_K = first(Raw_Shift_K), Raw_Shift_r = first(Raw_Shift_r), Raw_Shift_AUC = first(Raw_Shift_AUC), + + # Calculate Z-shifts Z_Shift_L = first(Z_Shift_L), Z_Shift_K = first(Z_Shift_K), Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), - Zscore_L = Delta_L / WT_sd_L, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC, + + # Sum Z-scores Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE), + + # Calculate Average Z-scores Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), - Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), - gene_lm_L = lm(formula = Delta_L ~ conc_num_factor, data = df), - gene_lm_K = lm(formula = Delta_K ~ conc_num_factor, data = df), - gene_lm_r = lm(formula = Delta_r ~ conc_num_factor, data = df), - gene_lm_AUC = lm(formula = Delta_AUC ~ conc_num_factor, data = df), - lm_Score_L = max_conc * (gene_lm_L$coefficients[2]) + gene_lm_L$coefficients[1], - lm_Score_K = max_conc * (gene_lm_K$coefficients[2]) + gene_lm_K$coefficients[1], - lm_Score_r = max_conc * (gene_lm_r$coefficients[2]) + gene_lm_r$coefficients[1], - lm_Score_AUC = max_conc * (gene_lm_AUC$coefficients[2]) + gene_lm_AUC$coefficients[1], - R_Squared_L = summary(gene_lm_L)$r.squared, - R_Squared_K = summary(gene_lm_K)$r.squared, - R_Squared_r = summary(gene_lm_r)$r.squared, - R_Squared_AUC = summary(gene_lm_AUC)$r.squared, - - lm_intercept_L = coef(lm(Zscore_L ~ conc_num_factor))[1], - lm_slope_L = coef(lm(Zscore_L ~ conc_num))[2], - lm_intercept_K = coef(lm(Zscore_K ~ conc_num))[1], - lm_slope_K = coef(lm(Zscore_K ~ conc_num))[2], - lm_intercept_r = coef(lm(Zscore_r ~ conc_num))[1], - lm_slope_r = coef(lm(Zscore_r ~ conc_num))[2], - lm_intercept_AUC = coef(lm(Zscore_AUC ~ conc_num))[1], - lm_slope_AUC = coef(lm(Zscore_AUC ~ conc_num))[2], - - Z_lm_L = (lm_Score_L - mean_L) / sd(lm_Score_L, na.rm = TRUE), - Z_lm_K = (lm_Score_K - mean_K) / sd(lm_Score_K, na.rm = TRUE), - Z_lm_r = (lm_Score_r - mean_r) / sd(lm_Score_r, na.rm = TRUE), - Z_lm_AUC = (lm_Score_AUC - mean_AUC) / sd(lm_Score_AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE), - num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1 + Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1) ) %>% - arrange(desc(Z_lm_L), desc(NG)) + arrange(desc(Z_lm_L), desc(NG)) %>% + ungroup() # Declare column order for output calculations <- calculations %>% @@ -356,17 +352,33 @@ calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", " "Delta_L", "Delta_K", "Delta_r", "Delta_AUC", "Zscore_L", "Zscore_K", "Zscore_r", "Zscore_AUC", "NG", "SM", "DB") + + interactions <- interactions %>% + select( + "OrfRep", "Gene", "num", "NG", "DB", "SM", + "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", + "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", + "Sum_Z_Score_L", "Sum_Z_Score_K", "Sum_Z_Score_r", "Sum_Z_Score_AUC", + "Avg_Zscore_L", "Avg_Zscore_K", "Avg_Zscore_r", "Avg_Zscore_AUC", + "lm_Score_L", "lm_Score_K", "lm_Score_r", "lm_Score_AUC", + "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC", + "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC") - cleaned_df <- df %>% - select(-any_of(setdiff(intersect(names(df), names(calculations)), group_vars))) - calculations_joined <- left_join(cleaned_df, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + # cleaned_df <- df %>% + # select(-any_of(setdiff(intersect(names(df), names(calculations)), group_vars))) + + # calculations_joined <- left_join(cleaned_df, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) cleaned_df <- df %>% select(-any_of(setdiff(intersect(names(df), names(interactions)), group_vars))) + interactions_joined <- left_join(cleaned_df, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) - return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined, - calculations_joined = calculations_joined)) + return(list( + calculations = calculations, + interactions = interactions, + interactions_joined = interactions_joined + )) } generate_and_save_plots <- function(out_dir, filename, plot_configs, grid_layout = NULL) { @@ -380,18 +392,16 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, grid_layout config <- plot_configs[[i]] df <- config$df - aes_mapping <- if (is.null(config$color_var)) { - if (is.null(config$y_var)) { + aes_mapping <- if (config$plot_type == "bar" || config$plot_type == "density") { + if (is.null(config$color_var)) { aes(x = .data[[config$x_var]]) } else { - aes(x = .data[[config$x_var]], y = .data[[config$y_var]]) - } - } else { - if (is.null(config$y_var)) { aes(x = .data[[config$x_var]], color = as.factor(.data[[config$color_var]])) - } else { - aes(x = .data[[config$x_var]], y = .data[[config$y_var]], color = as.factor(.data[[config$color_var]])) } + } else if (is.null(config$color_var)) { + aes(x = .data[[config$x_var]], y = .data[[config$y_var]]) + } else { + aes(x = .data[[config$x_var]], y = .data[[config$y_var]], color = as.factor(.data[[config$color_var]])) } # Start building the plot with aes_mapping @@ -1331,14 +1341,14 @@ main <- function() { reference_results <- calculate_interaction_scores(reference_strain, max_conc) zscores_calculations_reference <- reference_results$calculations zscores_interactions_reference <- reference_results$interactions - zscores_calculations_reference_joined <- reference_results$calculations_joined + # zscores_calculations_reference_joined <- reference_results$calculations_joined zscores_interactions_reference_joined <- reference_results$interactions_joined message("Calculating deletion strain(s) interactions scores") deletion_results <- calculate_interaction_scores(deletion_strains, max_conc) zscores_calculations <- deletion_results$calculations zscores_interactions <- deletion_results$interactions - zscores_calculations_joined <- deletion_results$calculations_joined + # zscores_calculations_joined <- deletion_results$calculations_joined zscores_interactions_joined <- deletion_results$interactions_joined # Writing Z-Scores to file