From 52381426b25076ddd169ff2d0f2c69405754fff3 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 7 Oct 2024 04:47:30 -0400 Subject: [PATCH] Add correlation and rank coefficients to calculations --- .../apps/r/calculate_interaction_zscores.R | 198 +++++++++++------- 1 file changed, 117 insertions(+), 81 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index fc6a5666..6e878831 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -219,9 +219,9 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) # This is a helper function to perform a linear regression and extract coefficients perform_lm <- function(data, response, predictor, max_conc) { - valid_data <- data %>% filter(!is.na(.data[[response]])) + valid_data <- data %>% filter(!is.na(data[[response]])) if (nrow(valid_data) > 1) { - model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data) + model <- lm(data[[response]] ~ data[[predictor]], data = valid_data) list( intercept = coef(model)[1], slope = coef(model)[2], @@ -286,8 +286,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) NG_sum = sum(NG, na.rm = TRUE), DB_sum = sum(DB, na.rm = TRUE), SM_sum = sum(SM, na.rm = TRUE), - num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, - + # Expected values Exp_L = WT_L + Raw_Shift_L, Exp_K = WT_K + Raw_Shift_K, @@ -374,10 +373,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) lm_mean_r = mean(lm_Score_r, na.rm = TRUE), lm_sd_r = sd(lm_Score_r, na.rm = TRUE), lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE), - lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE) - ) %>% - # Calculate Z-lm scores - mutate( + lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE), Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L, Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K, Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, @@ -388,10 +384,24 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) interactions <- calculations %>% group_by(across(all_of(group_vars))) %>% summarise( - Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs), - Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), - Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1), - Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1), + + num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, + + Sum_Z_Score_L = sum(Zscore_L, na.rm = TRUE), + Sum_Z_Score_K = sum(Zscore_K, na.rm = TRUE), + Sum_Z_Score_r = sum(Zscore_r, na.rm = TRUE), + Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE), + + Avg_Zscore_L = Sum_Z_Score_L / first(num_non_removed_concs), + Avg_Zscore_K = Sum_Z_Score_K / first(num_non_removed_concs), + Avg_Zscore_r = Sum_Z_Score_r / first(num_non_removed_concs), + Avg_Zscore_AUC = Sum_Z_Score_AUC / first(num_non_removed_concs), + + # R_Squared + R_Squared_L = first(R_Squared_L), + R_Squared_K = first(R_Squared_K), + R_Squared_r = first(R_Squared_r), + R_Squared_AUC = first(R_Squared_AUC), # Interaction Z-scores Z_lm_L = first(Z_lm_L), @@ -411,13 +421,19 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), + # Gene-Gene Interaction + lm_Score_L = first(lm_Score_L), + lm_Score_K = first(lm_Score_K), + lm_Score_r = first(lm_Score_r), + lm_Score_AUC = first(lm_Score_AUC), + # NG, DB, SM values - NG = first(NG), - DB = first(DB), - SM = first(SM), + NG_sum_int = sum(NG), + DB_sum_int = sum(DB), + SM_sum_int = sum(SM), .groups = "drop" ) %>% - arrange(desc(Z_lm_L), desc(NG)) + arrange(desc(Z_lm_L), desc(NG_sum_int)) # Deletion data ranking and linear modeling if (type == "deletion") { @@ -463,61 +479,78 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore", Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", TRUE ~ "No Effect" - )) %>% - group_modify(~ { - + ), + # For rank plots - lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc) - lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc) - lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc) - lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc) + lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc), + lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc), + lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc), + lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc), # For correlation plots - Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc) - Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc) - Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc) - Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc) - Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc) - Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc) + Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc), + Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc), + Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc), + Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc), + Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc), + Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc), - .x %>% - mutate( - # For rank plots - lm_R_squared_L = lm_L$r.squared, - lm_R_squared_K = lm_K$r.squared, - lm_R_squared_r = lm_r$r.squared, - lm_R_squared_AUC = lm_AUC$r.squared, + # Extract coefficients and statistics for each model + lm_rank_intercept_L = lm_L$intercept, + lm_rank_slope_L = lm_L$slope, + R_Squared_L = lm_L$r_squared, + lm_Score_L = lm_L$score, - # For correlation plots - Z_lm_R_squared_K_L = Z_lm_K_L$r.squared, - Z_lm_R_squared_r_L = Z_lm_r_L$r.squared, - Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared, - Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared, - Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared, - Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared, + lm_intercept_K = lm_K$intercept, + lm_slope_K = lm_K$slope, + R_Squared_K = lm_K$r_squared, + lm_Score_K = lm_K$score, - Z_lm_intercept_K_L = Z_lm_K_L$intercept, - Z_lm_intercept_r_L = Z_lm_r_L$intercept, - Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept, - Z_lm_intercept_r_K = Z_lm_R_r_K$intercept, - Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept, - Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept, + lm_intercept_r = lm_r$intercept, + lm_slope_r = lm_r$slope, + R_Squared_r = lm_r$r_squared, + lm_Score_r = lm_r$score, - Z_lm_slope_K_L = Z_lm_K_L$slope, - Z_lm_slope_r_L = Z_lm_r_L$slope, - Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope, - Z_lm_slope_r_K = Z_lm_R_r_K$slope, - Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope, - Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope, + lm_intercept_AUC = lm_AUC$intercept, + lm_slope_AUC = lm_AUC$slope, + R_Squared_AUC = lm_AUC$r_squared, + lm_Score_AUC = lm_AUC$score, - Z_lm_Score_K_L = Z_lm_K_L$score, - Z_lm_Score_r_L = Z_lm_r_L$score, - Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score, - Z_lm_Score_r_K = Z_lm_R_r_K$score, - Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score, - Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score - ) - }) + Z_lm_intercept_K_L = Z_lm_K_L$intercept, + Z_lm_slope_K_L = Z_lm_K_L$slope, + Z_lm_R_squared_K_L = Z_lm_K_L$r_squared, + Z_lm_Score_K_L = Z_lm_K_L$score, + + Z_lm_intercept_r_L = Z_lm_r_L$intercept, + Z_lm_slope_r_L = Z_lm_r_L$slope, + Z_lm_R_squared_r_L = Z_lm_r_L$r_squared, + Z_lm_Score_r_L = Z_lm_r_L$score, + + Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept, + Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope, + Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared, + Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score, + + Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept, + Z_lm_slope_R_r_K = Z_lm_R_r_K$slope, + Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared, + Z_lm_Score_R_r_K = Z_lm_R_r_K$score, + + Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept, + Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope, + Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared, + Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score, + + Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept, + Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope, + Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared, + Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score + ) %>% + # Drop the lm objects and keep only the lm results + select( + -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L, + -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r + ) } # end deletion-specific block # Create the final calculations and interactions dataframes with only required columns for csv output @@ -536,35 +569,38 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) Exp_L, Exp_K, Exp_r, Exp_AUC, Delta_L, Delta_K, Delta_r, Delta_AUC, mean_Delta_L, mean_Delta_K, mean_Delta_r, mean_Delta_AUC, - Zscore_L, Zscore_K, Zscore_r, Zscore_AUC - ) + Zscore_L, Zscore_K, Zscore_r, Zscore_AUC, + NG_sum, DB_sum, SM_sum + ) %>% + rename(NG = NG_sum, DB = DB_sum, SM = SM_sum) df_interactions <- interactions %>% select( all_of(group_vars), - NG, DB, SM, Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC, + Sum_Z_Score_L, Sum_Z_Score_K, Sum_Z_Score_r, Sum_Z_Score_AUC, Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC, Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC, Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC, - lm_R_squared_L, lm_R_squared_K, lm_R_squared_r, lm_R_squared_AUC, - lm_intercept_L, lm_intercept_K, lm_intercept_r, lm_intercept_AUC, - lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap - ) + lm_Score_L, lm_Score_K, lm_Score_r, lm_Score_AUC, + R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC, + NG_sum_int, DB_sum_int, SM_sum_int + ) %>% + rename(NG = NG_sum_int, DB = DB_sum_int, SM = SM_sum_int) # Avoid column collision on left join for overlapping variables calculations_no_overlap <- calculations %>% select(-any_of(c("DB", "NG", "SM", + # Don't need these anywhere so easier to remove "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", + "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC", + # we need these for the interactions but the original code has the same names in both datasets "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", - "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC", - "lm_R_squared_L", "lm_R_squared_K", "lm_R_squared_r", "lm_R_squared_AUC", - "lm_intercept_L", "lm_intercept_K", "lm_intercept_r", "lm_intercept_AUC", - "lm_slope_L", "lm_slope_K", "lm_slope_r", "lm_slope_AUC" + "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC" ))) full_data <- calculations_no_overlap %>% - left_join(df_interactions, by = group_vars) + left_join(interactions, by = group_vars) # Return final dataframes return(list( @@ -1239,7 +1275,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove generate_correlation_plot_configs <- function(df, df_reference) { # Define relationships for different-variable correlations relationships <- list( - list(x = "L", y = "K"), + list(x = "L", y = "K"), # x-var is predictor, y-var is reponse list(x = "L", y = "r"), list(x = "L", y = "AUC"), list(x = "K", y = "r"), @@ -1261,17 +1297,17 @@ generate_correlation_plot_configs <- function(df, df_reference) { for (highlight_cyan in highlight_cyan_options) { for (rel in relationships) { # Extract relevant variable names for Z_lm values - x_var <- paste0("Z_lm_", rel$x) - y_var <- paste0("Z_lm_", rel$y) + x_var <- paste0("Z_lm_", rel$x) # predictor + y_var <- paste0("Z_lm_", rel$y) # response # Find the max and min of both dataframes for printing linear regression line xmin <- min(c(min(df[[x_var]], na.rm = TRUE), min(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE) xmax <- max(c(max(df[[x_var]], na.rm = TRUE), max(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE) # Extract the R-squared, intercept, and slope from the df (first value) - intercept <- df[[paste0("lm_intercept_", rel$x)]][1] - slope <- df[[paste0("lm_slope_", rel$x)]][1] - r_squared <- df[[paste0("lm_R_squared_", rel$x)]][1] + intercept <- df[[paste0("Z_lm_intercept_", rel$y, "_", rel$x)]][1] + slope <- df[[paste0("Z_lm_slope_", rel$y, "_", rel$x)]][1] + r_squared <- df[[paste0("Z_lm_R_squared_", rel$y, "_", rel$x)]][1] # Generate the label for the plot plot_label <- paste("Interaction", rel$x, "vs.", rel$y)