Add correlation and rank coefficients to calculations

2024-10-07 04:47:30 -04:00
parent 2073517192
commit 52381426b2
1 changed files with 117 additions and 81 deletions
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -219,9 +219,9 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
  # This is a helper function to perform a linear regression and extract coefficients
  perform_lm <- function(data, response, predictor, max_conc) {
-    valid_data <- data %>% filter(!is.na(.data[[response]]))
+    valid_data <- data %>% filter(!is.na(data[[response]]))
    if (nrow(valid_data) > 1) {
-      model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
+      model <- lm(data[[response]] ~ data[[predictor]], data = valid_data)
      list(
        intercept = coef(model)[1],
        slope = coef(model)[2],
@@ -286,7 +286,6 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      NG_sum = sum(NG, na.rm = TRUE),
      DB_sum = sum(DB, na.rm = TRUE),
      SM_sum = sum(SM, na.rm = TRUE),
      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
      # Expected values
      Exp_L = WT_L + Raw_Shift_L,
@@ -374,10 +373,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
      lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
      lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
-      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE)
+      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
    ) %>%
    # Calculate Z-lm scores
    mutate(
      Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
      Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
      Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
@@ -388,10 +384,24 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
  interactions <- calculations %>%
    group_by(across(all_of(group_vars))) %>%
    summarise(
-      Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs),
+
-      Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs),
+      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
-      Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1),
+
-      Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1),
+      Sum_Z_Score_L = sum(Zscore_L, na.rm = TRUE),
      Sum_Z_Score_K = sum(Zscore_K, na.rm = TRUE),
      Sum_Z_Score_r = sum(Zscore_r, na.rm = TRUE),
      Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE),
      Avg_Zscore_L = Sum_Z_Score_L / first(num_non_removed_concs),
      Avg_Zscore_K = Sum_Z_Score_K / first(num_non_removed_concs),
      Avg_Zscore_r = Sum_Z_Score_r / first(num_non_removed_concs),
      Avg_Zscore_AUC = Sum_Z_Score_AUC / first(num_non_removed_concs),
      # R_Squared
      R_Squared_L = first(R_Squared_L),
      R_Squared_K = first(R_Squared_K),
      R_Squared_r = first(R_Squared_r),
      R_Squared_AUC = first(R_Squared_AUC),
      # Interaction Z-scores
      Z_lm_L = first(Z_lm_L),
@@ -411,13 +421,19 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      Z_Shift_r = first(Z_Shift_r),
      Z_Shift_AUC = first(Z_Shift_AUC),
      # Gene-Gene Interaction
      lm_Score_L = first(lm_Score_L),
      lm_Score_K = first(lm_Score_K),
      lm_Score_r = first(lm_Score_r),
      lm_Score_AUC = first(lm_Score_AUC),
      # NG, DB, SM values
-      NG = first(NG),
+      NG_sum_int = sum(NG),
-      DB = first(DB),
+      DB_sum_int = sum(DB),
-      SM = first(SM),
+      SM_sum_int = sum(SM),
      .groups = "drop"
    ) %>%
-    arrange(desc(Z_lm_L), desc(NG))
+    arrange(desc(Z_lm_L), desc(NG_sum_int))
  # Deletion data ranking and linear modeling
  if (type == "deletion") {
@@ -463,61 +479,78 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
          TRUE ~ "No Effect"
-        )) %>%
+        ),
      group_modify(~ {
        # For rank plots
-        lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
+        lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc),
-        lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
+        lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc),
-        lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
+        lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc),
-        lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
+        lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc),
        # For correlation plots
-        Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
+        Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc),
-        Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
+        Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc),
-        Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
+        Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc),
-        Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
+        Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc),
-        Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
+        Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc),
-        Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
+        Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc),
-        .x %>%
+        # Extract coefficients and statistics for each model
-          mutate(
+        lm_rank_intercept_L = lm_L$intercept,
-            # For rank plots
+        lm_rank_slope_L = lm_L$slope,
-            lm_R_squared_L = lm_L$r.squared,
+        R_Squared_L = lm_L$r_squared,
-            lm_R_squared_K = lm_K$r.squared,
+        lm_Score_L = lm_L$score,
            lm_R_squared_r = lm_r$r.squared,
            lm_R_squared_AUC = lm_AUC$r.squared,
-            # For correlation plots
+        lm_intercept_K = lm_K$intercept,
-            Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
+        lm_slope_K = lm_K$slope,
-            Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
+        R_Squared_K = lm_K$r_squared,
-            Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
+        lm_Score_K = lm_K$score,
-            Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
+
-            Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
+        lm_intercept_r = lm_r$intercept,
-            Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
+        lm_slope_r = lm_r$slope,
        R_Squared_r = lm_r$r_squared,
        lm_Score_r = lm_r$score,
        lm_intercept_AUC = lm_AUC$intercept,
        lm_slope_AUC = lm_AUC$slope,
        R_Squared_AUC = lm_AUC$r_squared,
        lm_Score_AUC = lm_AUC$score,
        Z_lm_intercept_K_L = Z_lm_K_L$intercept,
            Z_lm_intercept_r_L = Z_lm_r_L$intercept,
            Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
            Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
            Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
            Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
        Z_lm_slope_K_L = Z_lm_K_L$slope,
-            Z_lm_slope_r_L = Z_lm_r_L$slope,
+        Z_lm_R_squared_K_L = Z_lm_K_L$r_squared,
            Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
            Z_lm_slope_r_K = Z_lm_R_r_K$slope,
            Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
            Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
        Z_lm_Score_K_L = Z_lm_K_L$score,
        Z_lm_intercept_r_L = Z_lm_r_L$intercept,
        Z_lm_slope_r_L = Z_lm_r_L$slope,
        Z_lm_R_squared_r_L = Z_lm_r_L$r_squared,
        Z_lm_Score_r_L = Z_lm_r_L$score,
-            Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
+
-            Z_lm_Score_r_K = Z_lm_R_r_K$score,
+        Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept,
-            Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
+        Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope,
-            Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
+        Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared,
        Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score,
        Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept,
        Z_lm_slope_R_r_K = Z_lm_R_r_K$slope,
        Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared,
        Z_lm_Score_R_r_K = Z_lm_R_r_K$score,
        Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept,
        Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope,
        Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared,
        Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score,
        Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept,
        Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope,
        Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared,
        Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score
      ) %>%
      # Drop the lm objects and keep only the lm results
      select(
        -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L,
        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r
      )
      })
  } # end deletion-specific block
  # Create the final calculations and interactions dataframes with only required columns for csv output
@@ -536,35 +569,38 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      Exp_L, Exp_K, Exp_r, Exp_AUC,
      Delta_L, Delta_K, Delta_r, Delta_AUC,
      mean_Delta_L, mean_Delta_K, mean_Delta_r, mean_Delta_AUC,
-      Zscore_L, Zscore_K, Zscore_r, Zscore_AUC
+      Zscore_L, Zscore_K, Zscore_r, Zscore_AUC,
-    )
+      NG_sum, DB_sum, SM_sum
    ) %>%
      rename(NG = NG_sum, DB = DB_sum, SM = SM_sum)
  df_interactions <- interactions %>%
    select(
      all_of(group_vars),
      NG, DB, SM,
      Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC,
      Sum_Z_Score_L, Sum_Z_Score_K, Sum_Z_Score_r, Sum_Z_Score_AUC,
      Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC,
      Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC,
      Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC,
-      lm_R_squared_L, lm_R_squared_K, lm_R_squared_r, lm_R_squared_AUC,
+      lm_Score_L, lm_Score_K, lm_Score_r, lm_Score_AUC,
-      lm_intercept_L, lm_intercept_K, lm_intercept_r, lm_intercept_AUC,
+      R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC,
-      lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
+      NG_sum_int, DB_sum_int, SM_sum_int
-    )
+    ) %>%
      rename(NG = NG_sum_int, DB = DB_sum_int, SM = SM_sum_int)
  # Avoid column collision on left join for overlapping variables
  calculations_no_overlap <- calculations %>%
    select(-any_of(c("DB", "NG", "SM",
      # Don't need these anywhere so easier to remove
      "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
      "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC",
      # we need these for the interactions but the original code has the same names in both datasets
      "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC",
-      "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC",
+      "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC"
      "lm_R_squared_L", "lm_R_squared_K", "lm_R_squared_r", "lm_R_squared_AUC",
      "lm_intercept_L", "lm_intercept_K", "lm_intercept_r", "lm_intercept_AUC",
      "lm_slope_L", "lm_slope_K", "lm_slope_r", "lm_slope_AUC"
    )))
  full_data <- calculations_no_overlap %>%
-    left_join(df_interactions, by = group_vars)
+    left_join(interactions, by = group_vars)
  # Return final dataframes
  return(list(
@@ -1239,7 +1275,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
 generate_correlation_plot_configs <- function(df, df_reference) {
  # Define relationships for different-variable correlations
  relationships <- list(
-    list(x = "L", y = "K"),
+    list(x = "L", y = "K"), # x-var is predictor, y-var is reponse
    list(x = "L", y = "r"),
    list(x = "L", y = "AUC"),
    list(x = "K", y = "r"),
@@ -1261,17 +1297,17 @@ generate_correlation_plot_configs <- function(df, df_reference) {
  for (highlight_cyan in highlight_cyan_options) {
    for (rel in relationships) {
      # Extract relevant variable names for Z_lm values
-      x_var <- paste0("Z_lm_", rel$x)
+      x_var <- paste0("Z_lm_", rel$x) # predictor
-      y_var <- paste0("Z_lm_", rel$y)
+      y_var <- paste0("Z_lm_", rel$y) # response
      # Find the max and min of both dataframes for printing linear regression line
      xmin <- min(c(min(df[[x_var]], na.rm = TRUE), min(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
      xmax <- max(c(max(df[[x_var]], na.rm = TRUE), max(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
      # Extract the R-squared, intercept, and slope from the df (first value)
-      intercept <- df[[paste0("lm_intercept_", rel$x)]][1]
+      intercept <- df[[paste0("Z_lm_intercept_", rel$y, "_", rel$x)]][1]
-      slope <- df[[paste0("lm_slope_", rel$x)]][1]
+      slope <- df[[paste0("Z_lm_slope_", rel$y, "_", rel$x)]][1]
-      r_squared <- df[[paste0("lm_R_squared_", rel$x)]][1]
+      r_squared <- df[[paste0("Z_lm_R_squared_", rel$y, "_", rel$x)]][1]
      # Generate the label for the plot
      plot_label <- paste("Interaction", rel$x, "vs.", rel$y)