From 20735171924421288b54f768f294ba1c5c5fa4d0 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 7 Oct 2024 01:20:36 -0400 Subject: [PATCH] Add correlation lms to calculate_interaction_scores() --- .../apps/r/calculate_interaction_zscores.R | 232 +++++++++++------- 1 file changed, 143 insertions(+), 89 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 9fe74dd4..fc6a5666 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -217,6 +217,22 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) group_vars <- c("OrfRep", "Gene", "Drug") } + # This is a helper function to perform a linear regression and extract coefficients + perform_lm <- function(data, response, predictor, max_conc) { + valid_data <- data %>% filter(!is.na(.data[[response]])) + if (nrow(valid_data) > 1) { + model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data) + list( + intercept = coef(model)[1], + slope = coef(model)[2], + r_squared = summary(model)$r.squared, + score = max_conc * coef(model)[2] + coef(model)[1] + ) + } else { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } + } + # Calculate WT statistics from df_bg wt_stats <- df_bg %>% group_by(across(all_of(bg_group_vars))) %>% @@ -300,41 +316,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) ungroup() %>% # Ungroup before group_modify group_by(across(all_of(group_vars))) %>% group_modify(~ { + lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc) + lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc) + lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc) + lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc) - # Filter each column for valid data or else linear modeling will fail - valid_data_L <- .x %>% filter(!is.na(Delta_L)) - valid_data_K <- .x %>% filter(!is.na(Delta_K)) - valid_data_r <- .x %>% filter(!is.na(Delta_r)) - valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC)) - - # Perform linear modeling - lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL - lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL - lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL - lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL - - # Extract coefficients for calculations and plotting .x %>% mutate( - lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA, - lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA, - R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA, - lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA, + lm_intercept_L = lm_L$intercept, + lm_slope_L = lm_L$slope, + R_Squared_L = lm_L$r_squared, + lm_Score_L = lm_L$score, - lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA, - lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA, - R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA, - lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA, + lm_intercept_K = lm_K$intercept, + lm_slope_K = lm_K$slope, + R_Squared_K = lm_K$r_squared, + lm_Score_K = lm_K$score, - lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA, - lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA, - R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA, - lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA, + lm_intercept_r = lm_r$intercept, + lm_slope_r = lm_r$slope, + R_Squared_r = lm_r$r_squared, + lm_Score_r = lm_r$score, - lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA, - lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA, - R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA, - lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA + lm_intercept_AUC = lm_AUC$intercept, + lm_slope_AUC = lm_AUC$slope, + R_Squared_AUC = lm_AUC$r_squared, + lm_Score_AUC = lm_AUC$score ) }) %>% ungroup() @@ -404,55 +411,116 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), - # R Squared values - R_Squared_L = first(R_Squared_L), - R_Squared_K = first(R_Squared_K), - R_Squared_r = first(R_Squared_r), - R_Squared_AUC = first(R_Squared_AUC), - - # lm intercepts - lm_intercept_L = first(lm_intercept_L), - lm_intercept_K = first(lm_intercept_K), - lm_intercept_r = first(lm_intercept_r), - lm_intercept_AUC = first(lm_intercept_AUC), - - # lm slopes - lm_slope_L = first(lm_slope_L), - lm_slope_K = first(lm_slope_K), - lm_slope_r = first(lm_slope_r), - lm_slope_AUC = first(lm_slope_AUC), - # NG, DB, SM values NG = first(NG), DB = first(DB), SM = first(SM), .groups = "drop" - ) + ) %>% + arrange(desc(Z_lm_L), desc(NG)) - # Add overlap threshold categories based on Z-lm and Avg-Z scores - interactions <- interactions %>% - filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% - mutate( - Overlap = case_when( - Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both", - Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both", - Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only", - Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only", - Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only", - Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only", - Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore", - Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", - TRUE ~ "No Effect" - ), + # Deletion data ranking and linear modeling + if (type == "deletion") { + interactions <- interactions %>% + mutate( + Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L), + Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K), + Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r), + Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC), + Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L), + Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K), + Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r), + Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC) + ) %>% + mutate( + Rank_L = rank(Avg_Zscore_L_adjusted), + Rank_K = rank(Avg_Zscore_K_adjusted), + Rank_r = rank(Avg_Zscore_r_adjusted), + Rank_AUC = rank(Avg_Zscore_AUC_adjusted), + Rank_lm_L = rank(Z_lm_L_adjusted), + Rank_lm_K = rank(Z_lm_K_adjusted), + Rank_lm_r = rank(Z_lm_r_adjusted), + Rank_lm_AUC = rank(Z_lm_AUC_adjusted) + ) %>% + mutate( + lm_R_squared_rank_L = summary(lm(Rank_lm_L ~ Rank_L, data = .))$r.squared, + lm_R_squared_rank_K = summary(lm(Rank_lm_K ~ Rank_K, data = .))$r.squared, + lm_R_squared_rank_r = summary(lm(Rank_lm_r ~ Rank_r, data = .))$r.squared, + lm_R_squared_rank_AUC = summary(lm(Rank_lm_AUC ~ Rank_AUC, data = .))$r.squared + ) + + # Add overlap threshold categories based on Z-lm and Avg-Z scores + interactions <- interactions %>% + filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% + mutate( + Overlap = case_when( + Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both", + Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both", + Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only", + Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only", + Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only", + Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only", + Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore", + Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", + TRUE ~ "No Effect" + )) %>% + group_modify(~ { - # For correlation plots - lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared, - lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared, - lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared, - lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared - ) + # For rank plots + lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc) + lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc) + lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc) + lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc) - # Creating the final calculations and interactions dataframes with only required columns for csv output + # For correlation plots + Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc) + Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc) + Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc) + Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc) + Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc) + Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc) + + .x %>% + mutate( + # For rank plots + lm_R_squared_L = lm_L$r.squared, + lm_R_squared_K = lm_K$r.squared, + lm_R_squared_r = lm_r$r.squared, + lm_R_squared_AUC = lm_AUC$r.squared, + + # For correlation plots + Z_lm_R_squared_K_L = Z_lm_K_L$r.squared, + Z_lm_R_squared_r_L = Z_lm_r_L$r.squared, + Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared, + Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared, + Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared, + Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared, + + Z_lm_intercept_K_L = Z_lm_K_L$intercept, + Z_lm_intercept_r_L = Z_lm_r_L$intercept, + Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept, + Z_lm_intercept_r_K = Z_lm_R_r_K$intercept, + Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept, + Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept, + + Z_lm_slope_K_L = Z_lm_K_L$slope, + Z_lm_slope_r_L = Z_lm_r_L$slope, + Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope, + Z_lm_slope_r_K = Z_lm_R_r_K$slope, + Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope, + Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope, + + Z_lm_Score_K_L = Z_lm_K_L$score, + Z_lm_Score_r_L = Z_lm_r_L$score, + Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score, + Z_lm_Score_r_K = Z_lm_R_r_K$score, + Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score, + Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score + ) + }) + } # end deletion-specific block + + # Create the final calculations and interactions dataframes with only required columns for csv output df_calculations <- calculations %>% select( all_of(group_vars), @@ -484,7 +552,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap ) - # Join calculations and interactions to avoid dimension mismatch + # Avoid column collision on left join for overlapping variables calculations_no_overlap <- calculations %>% select(-any_of(c("DB", "NG", "SM", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", @@ -1091,22 +1159,12 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type) )) } -generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, filter_na = FALSE, overlap_color = FALSE) { +generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, overlap_color = FALSE) { sd_bands <- c(1, 2, 3) plot_configs <- list() variables <- c("L", "K") - # Adjust (if necessary) and rank columns - for (variable in variables) { - if (adjust) { - df[[paste0("Avg_Zscore_", variable)]] <- ifelse(is.na(df[[paste0("Avg_Zscore_", variable)]]), 0.001, df[[paste0("Avg_Zscore_", variable)]]) - df[[paste0("Z_lm_", variable)]] <- ifelse(is.na(df[[paste0("Z_lm_", variable)]]), 0.001, df[[paste0("Z_lm_", variable)]]) - } - df[[paste0("Rank_", variable)]] <- rank(df[[paste0("Avg_Zscore_", variable)]], na.last = "keep") - df[[paste0("Rank_lm_", variable)]] <- rank(df[[paste0("Z_lm_", variable)]], na.last = "keep") - } - # Helper function to create a rank plot configuration create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) { num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE) @@ -1578,9 +1636,9 @@ main <- function() { write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) - message("Generating reference interaction plots") - reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference") - generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16) + # message("Generating reference interaction plots") + # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference") + # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16) message("Setting missing deletion values to the highest theoretical value at each drug conc for L") df_deletion <- df_na_stats %>% # formerly X2 @@ -1648,7 +1706,6 @@ main <- function() { # rank_plot_configs <- generate_rank_plot_configs( # df_interactions, # is_lm = FALSE, - # adjust = TRUE # ) # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs, # page_width = 18, page_height = 12) @@ -1657,7 +1714,6 @@ main <- function() { # rank_lm_plot_configs <- generate_rank_plot_configs( # df_interactions, # is_lm = TRUE, - # adjust = TRUE # ) # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs, # page_width = 18, page_height = 12) @@ -1666,7 +1722,6 @@ main <- function() { # rank_plot_filtered_configs <- generate_rank_plot_configs( # df_interactions, # is_lm = FALSE, - # adjust = FALSE, # filter_na = TRUE, # overlap_color = TRUE # ) @@ -1677,7 +1732,6 @@ main <- function() { # rank_plot_lm_filtered_configs <- generate_rank_plot_configs( # df_interactions, # is_lm = TRUE, - # adjust = FALSE, # filter_na = TRUE, # overlap_color = TRUE # )