Quellcode durchsuchen

Add correlation and rank coefficients to calculations

Bryan Roessler vor 6 Monaten
Ursprung
Commit
52381426b2
1 geänderte Dateien mit 122 neuen und 86 gelöschten Zeilen
  1. 122 86
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 122 - 86
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -219,9 +219,9 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
   # This is a helper function to perform a linear regression and extract coefficients
   perform_lm <- function(data, response, predictor, max_conc) {
-    valid_data <- data %>% filter(!is.na(.data[[response]]))
+    valid_data <- data %>% filter(!is.na(data[[response]]))
     if (nrow(valid_data) > 1) {
-      model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
+      model <- lm(data[[response]] ~ data[[predictor]], data = valid_data)
       list(
         intercept = coef(model)[1],
         slope = coef(model)[2],
@@ -286,8 +286,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       NG_sum = sum(NG, na.rm = TRUE),
       DB_sum = sum(DB, na.rm = TRUE),
       SM_sum = sum(SM, na.rm = TRUE),
-      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
-      
+
       # Expected values
       Exp_L = WT_L + Raw_Shift_L,
       Exp_K = WT_K + Raw_Shift_K,
@@ -374,10 +373,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
       lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
       lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
-      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE)
-    ) %>%
-    # Calculate Z-lm scores
-    mutate(
+      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
       Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
       Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
       Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
@@ -388,10 +384,24 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
   interactions <- calculations %>%
     group_by(across(all_of(group_vars))) %>%
     summarise(
-      Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs),
-      Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs),
-      Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1),
-      Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1),
+
+      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
+
+      Sum_Z_Score_L = sum(Zscore_L, na.rm = TRUE),
+      Sum_Z_Score_K = sum(Zscore_K, na.rm = TRUE),
+      Sum_Z_Score_r = sum(Zscore_r, na.rm = TRUE),
+      Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE),
+
+      Avg_Zscore_L = Sum_Z_Score_L / first(num_non_removed_concs),
+      Avg_Zscore_K = Sum_Z_Score_K / first(num_non_removed_concs),
+      Avg_Zscore_r = Sum_Z_Score_r / first(num_non_removed_concs),
+      Avg_Zscore_AUC = Sum_Z_Score_AUC / first(num_non_removed_concs),
+
+      # R_Squared
+      R_Squared_L = first(R_Squared_L),
+      R_Squared_K = first(R_Squared_K),
+      R_Squared_r = first(R_Squared_r),
+      R_Squared_AUC = first(R_Squared_AUC),
       
       # Interaction Z-scores
       Z_lm_L = first(Z_lm_L),
@@ -411,13 +421,19 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       Z_Shift_r = first(Z_Shift_r),
       Z_Shift_AUC = first(Z_Shift_AUC),
 
+      # Gene-Gene Interaction
+      lm_Score_L = first(lm_Score_L),
+      lm_Score_K = first(lm_Score_K),
+      lm_Score_r = first(lm_Score_r),
+      lm_Score_AUC = first(lm_Score_AUC),
+
       # NG, DB, SM values
-      NG = first(NG),
-      DB = first(DB),
-      SM = first(SM),
+      NG_sum_int = sum(NG),
+      DB_sum_int = sum(DB),
+      SM_sum_int = sum(SM),
       .groups = "drop"
     ) %>%
-    arrange(desc(Z_lm_L), desc(NG))
+    arrange(desc(Z_lm_L), desc(NG_sum_int))
 
   # Deletion data ranking and linear modeling
   if (type == "deletion") {
@@ -463,61 +479,78 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
           Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
           Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
           TRUE ~ "No Effect"
-        )) %>%
-      group_modify(~ {
-      
+        ),
+        
         # For rank plots
-        lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
-        lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
-        lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
-        lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
+        lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc),
+        lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc),
+        lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc),
+        lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc),
 
         # For correlation plots
-        Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
-        Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
-        Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
-        Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
-        Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
-        Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
-
-        .x %>%
-          mutate(
-            # For rank plots
-            lm_R_squared_L = lm_L$r.squared,
-            lm_R_squared_K = lm_K$r.squared,
-            lm_R_squared_r = lm_r$r.squared,
-            lm_R_squared_AUC = lm_AUC$r.squared,
-
-            # For correlation plots
-            Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
-            Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
-            Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
-            Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
-            Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
-            Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
-
-            Z_lm_intercept_K_L = Z_lm_K_L$intercept,
-            Z_lm_intercept_r_L = Z_lm_r_L$intercept,
-            Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
-            Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
-            Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
-            Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
-
-            Z_lm_slope_K_L = Z_lm_K_L$slope,
-            Z_lm_slope_r_L = Z_lm_r_L$slope,
-            Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
-            Z_lm_slope_r_K = Z_lm_R_r_K$slope,
-            Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
-            Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
-
-            Z_lm_Score_K_L = Z_lm_K_L$score,
-            Z_lm_Score_r_L = Z_lm_r_L$score,
-            Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
-            Z_lm_Score_r_K = Z_lm_R_r_K$score,
-            Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
-            Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
-          )
-      })
+        Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc),
+        Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc),
+        Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc),
+        Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc),
+        Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc),
+        Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc),
+
+        # Extract coefficients and statistics for each model
+        lm_rank_intercept_L = lm_L$intercept,
+        lm_rank_slope_L = lm_L$slope,
+        R_Squared_L = lm_L$r_squared,
+        lm_Score_L = lm_L$score,
+
+        lm_intercept_K = lm_K$intercept,
+        lm_slope_K = lm_K$slope,
+        R_Squared_K = lm_K$r_squared,
+        lm_Score_K = lm_K$score,
+
+        lm_intercept_r = lm_r$intercept,
+        lm_slope_r = lm_r$slope,
+        R_Squared_r = lm_r$r_squared,
+        lm_Score_r = lm_r$score,
+
+        lm_intercept_AUC = lm_AUC$intercept,
+        lm_slope_AUC = lm_AUC$slope,
+        R_Squared_AUC = lm_AUC$r_squared,
+        lm_Score_AUC = lm_AUC$score,
+
+        Z_lm_intercept_K_L = Z_lm_K_L$intercept,
+        Z_lm_slope_K_L = Z_lm_K_L$slope,
+        Z_lm_R_squared_K_L = Z_lm_K_L$r_squared,
+        Z_lm_Score_K_L = Z_lm_K_L$score,
+
+        Z_lm_intercept_r_L = Z_lm_r_L$intercept,
+        Z_lm_slope_r_L = Z_lm_r_L$slope,
+        Z_lm_R_squared_r_L = Z_lm_r_L$r_squared,
+        Z_lm_Score_r_L = Z_lm_r_L$score,
+
+        Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept,
+        Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope,
+        Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared,
+        Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score,
+
+        Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept,
+        Z_lm_slope_R_r_K = Z_lm_R_r_K$slope,
+        Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared,
+        Z_lm_Score_R_r_K = Z_lm_R_r_K$score,
+
+        Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept,
+        Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope,
+        Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared,
+        Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score,
+
+        Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept,
+        Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope,
+        Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared,
+        Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score
+      ) %>%
+      # Drop the lm objects and keep only the lm results
+      select(
+        -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L,
+        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r
+      )
   } # end deletion-specific block
 
   # Create the final calculations and interactions dataframes with only required columns for csv output
@@ -536,35 +569,38 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       Exp_L, Exp_K, Exp_r, Exp_AUC,
       Delta_L, Delta_K, Delta_r, Delta_AUC,
       mean_Delta_L, mean_Delta_K, mean_Delta_r, mean_Delta_AUC,
-      Zscore_L, Zscore_K, Zscore_r, Zscore_AUC
-    )
+      Zscore_L, Zscore_K, Zscore_r, Zscore_AUC,
+      NG_sum, DB_sum, SM_sum
+    ) %>%
+      rename(NG = NG_sum, DB = DB_sum, SM = SM_sum)
 
   df_interactions <- interactions %>%
     select(
       all_of(group_vars),
-      NG, DB, SM,
       Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC,
+      Sum_Z_Score_L, Sum_Z_Score_K, Sum_Z_Score_r, Sum_Z_Score_AUC,
       Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC,
       Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC,
       Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC,
-      lm_R_squared_L, lm_R_squared_K, lm_R_squared_r, lm_R_squared_AUC,
-      lm_intercept_L, lm_intercept_K, lm_intercept_r, lm_intercept_AUC,
-      lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
-    )
+      lm_Score_L, lm_Score_K, lm_Score_r, lm_Score_AUC,
+      R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC,
+      NG_sum_int, DB_sum_int, SM_sum_int
+    ) %>%
+      rename(NG = NG_sum_int, DB = DB_sum_int, SM = SM_sum_int)
 
   # Avoid column collision on left join for overlapping variables
   calculations_no_overlap <- calculations %>%
     select(-any_of(c("DB", "NG", "SM",
+      # Don't need these anywhere so easier to remove
       "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
+      "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC",
+      # we need these for the interactions but the original code has the same names in both datasets
       "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC",
-      "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC",
-      "lm_R_squared_L", "lm_R_squared_K", "lm_R_squared_r", "lm_R_squared_AUC",
-      "lm_intercept_L", "lm_intercept_K", "lm_intercept_r", "lm_intercept_AUC",
-      "lm_slope_L", "lm_slope_K", "lm_slope_r", "lm_slope_AUC"
+      "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC"
     )))
 
   full_data <- calculations_no_overlap %>%
-    left_join(df_interactions, by = group_vars)
+    left_join(interactions, by = group_vars)
 
   # Return final dataframes
   return(list(
@@ -1239,7 +1275,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
 generate_correlation_plot_configs <- function(df, df_reference) {
   # Define relationships for different-variable correlations
   relationships <- list(
-    list(x = "L", y = "K"),
+    list(x = "L", y = "K"), # x-var is predictor, y-var is reponse
     list(x = "L", y = "r"),
     list(x = "L", y = "AUC"),
     list(x = "K", y = "r"),
@@ -1261,17 +1297,17 @@ generate_correlation_plot_configs <- function(df, df_reference) {
   for (highlight_cyan in highlight_cyan_options) {
     for (rel in relationships) {
       # Extract relevant variable names for Z_lm values
-      x_var <- paste0("Z_lm_", rel$x)
-      y_var <- paste0("Z_lm_", rel$y)
+      x_var <- paste0("Z_lm_", rel$x) # predictor
+      y_var <- paste0("Z_lm_", rel$y) # response
 
       # Find the max and min of both dataframes for printing linear regression line
       xmin <- min(c(min(df[[x_var]], na.rm = TRUE), min(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
       xmax <- max(c(max(df[[x_var]], na.rm = TRUE), max(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
 
       # Extract the R-squared, intercept, and slope from the df (first value)
-      intercept <- df[[paste0("lm_intercept_", rel$x)]][1]
-      slope <- df[[paste0("lm_slope_", rel$x)]][1]
-      r_squared <- df[[paste0("lm_R_squared_", rel$x)]][1]
+      intercept <- df[[paste0("Z_lm_intercept_", rel$y, "_", rel$x)]][1]
+      slope <- df[[paste0("Z_lm_slope_", rel$y, "_", rel$x)]][1]
+      r_squared <- df[[paste0("Z_lm_R_squared_", rel$y, "_", rel$x)]][1]
 
       # Generate the label for the plot
       plot_label <- paste("Interaction", rel$x, "vs.", rel$y)