8 miesięcy temu · 52381426b2
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -219,9 +219,9 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				 
			
 
				   # This is a helper function to perform a linear regression and extract coefficients
			
 
				   perform_lm <- function(data, response, predictor, max_conc) {
			
 
				-    valid_data <- data %>% filter(!is.na(.data[[response]]))
			
 
				+    valid_data <- data %>% filter(!is.na(data[[response]]))
			
 
				     if (nrow(valid_data) > 1) {
			
 
				-      model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
			
 
				+      model <- lm(data[[response]] ~ data[[predictor]], data = valid_data)
			
 
				       list(
			
 
				         intercept = coef(model)[1],
			
 
				         slope = coef(model)[2],
			
@@ -286,8 +286,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				       NG_sum = sum(NG, na.rm = TRUE),
			
 
				       DB_sum = sum(DB, na.rm = TRUE),
			
 
				       SM_sum = sum(SM, na.rm = TRUE),
			
 
				-      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
			
 
				-      
			
 
				+
			
 
				       # Expected values
			
 
				       Exp_L = WT_L + Raw_Shift_L,
			
 
				       Exp_K = WT_K + Raw_Shift_K,
			
@@ -374,10 +373,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				       lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
			
 
				       lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
			
 
				       lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
			
 
				-      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE)
			
 
				-    ) %>%
			
 
				-    # Calculate Z-lm scores
			
 
				-    mutate(
			
 
				+      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
			
 
				       Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
			
 
				       Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
			
 
				       Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
			
@@ -388,10 +384,24 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				   interactions <- calculations %>%
			
 
				     group_by(across(all_of(group_vars))) %>%
			
 
				     summarise(
			
 
				-      Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs),
			
 
				-      Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs),
			
 
				-      Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1),
			
 
				-      Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1),
			
 
				+
			
 
				+      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
			
 
				+
			
 
				+      Sum_Z_Score_L = sum(Zscore_L, na.rm = TRUE),
			
 
				+      Sum_Z_Score_K = sum(Zscore_K, na.rm = TRUE),
			
 
				+      Sum_Z_Score_r = sum(Zscore_r, na.rm = TRUE),
			
 
				+      Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE),
			
 
				+
			
 
				+      Avg_Zscore_L = Sum_Z_Score_L / first(num_non_removed_concs),
			
 
				+      Avg_Zscore_K = Sum_Z_Score_K / first(num_non_removed_concs),
			
 
				+      Avg_Zscore_r = Sum_Z_Score_r / first(num_non_removed_concs),
			
 
				+      Avg_Zscore_AUC = Sum_Z_Score_AUC / first(num_non_removed_concs),
			
 
				+
			
 
				+      # R_Squared
			
 
				+      R_Squared_L = first(R_Squared_L),
			
 
				+      R_Squared_K = first(R_Squared_K),
			
 
				+      R_Squared_r = first(R_Squared_r),
			
 
				+      R_Squared_AUC = first(R_Squared_AUC),
			
 
				       
			
 
				       # Interaction Z-scores
			
 
				       Z_lm_L = first(Z_lm_L),
			
@@ -411,13 +421,19 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				       Z_Shift_r = first(Z_Shift_r),
			
 
				       Z_Shift_AUC = first(Z_Shift_AUC),
			
 
				 
			
 
				+      # Gene-Gene Interaction
			
 
				+      lm_Score_L = first(lm_Score_L),
			
 
				+      lm_Score_K = first(lm_Score_K),
			
 
				+      lm_Score_r = first(lm_Score_r),
			
 
				+      lm_Score_AUC = first(lm_Score_AUC),
			
 
				+
			
 
				       # NG, DB, SM values
			
 
				-      NG = first(NG),
			
 
				-      DB = first(DB),
			
 
				-      SM = first(SM),
			
 
				+      NG_sum_int = sum(NG),
			
 
				+      DB_sum_int = sum(DB),
			
 
				+      SM_sum_int = sum(SM),
			
 
				       .groups = "drop"
			
 
				     ) %>%
			
 
				-    arrange(desc(Z_lm_L), desc(NG))
			
 
				+    arrange(desc(Z_lm_L), desc(NG_sum_int))
			
 
				 
			
 
				   # Deletion data ranking and linear modeling
			
 
				   if (type == "deletion") {
			
@@ -463,61 +479,78 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				           Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
			
 
				           Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
			
 
				           TRUE ~ "No Effect"
			
 
				-        )) %>%
			
 
				-      group_modify(~ {
			
 
				-      
			
 
				+        ),
			
 
				+        
			
 
				         # For rank plots
			
 
				-        lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
			
 
				-        lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
			
 
				-        lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
			
 
				-        lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
			
 
				+        lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc),
			
 
				+        lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc),
			
 
				+        lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc),
			
 
				+        lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc),
			
 
				 
			
 
				         # For correlation plots
			
 
				-        Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
			
 
				-        Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
			
 
				-        Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
			
 
				-        Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
			
 
				-        Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
			
 
				-        Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
			
 
				-
			
 
				-        .x %>%
			
 
				-          mutate(
			
 
				-            # For rank plots
			
 
				-            lm_R_squared_L = lm_L$r.squared,
			
 
				-            lm_R_squared_K = lm_K$r.squared,
			
 
				-            lm_R_squared_r = lm_r$r.squared,
			
 
				-            lm_R_squared_AUC = lm_AUC$r.squared,
			
 
				-
			
 
				-            # For correlation plots
			
 
				-            Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
			
 
				-            Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
			
 
				-            Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
			
 
				-            Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
			
 
				-            Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
			
 
				-            Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
			
 
				-
			
 
				-            Z_lm_intercept_K_L = Z_lm_K_L$intercept,
			
 
				-            Z_lm_intercept_r_L = Z_lm_r_L$intercept,
			
 
				-            Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
			
 
				-            Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
			
 
				-            Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
			
 
				-            Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
			
 
				-
			
 
				-            Z_lm_slope_K_L = Z_lm_K_L$slope,
			
 
				-            Z_lm_slope_r_L = Z_lm_r_L$slope,
			
 
				-            Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
			
 
				-            Z_lm_slope_r_K = Z_lm_R_r_K$slope,
			
 
				-            Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
			
 
				-            Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
			
 
				-
			
 
				-            Z_lm_Score_K_L = Z_lm_K_L$score,
			
 
				-            Z_lm_Score_r_L = Z_lm_r_L$score,
			
 
				-            Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
			
 
				-            Z_lm_Score_r_K = Z_lm_R_r_K$score,
			
 
				-            Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
			
 
				-            Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
			
 
				-          )
			
 
				-      })
			
 
				+        Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc),
			
 
				+        Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc),
			
 
				+        Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc),
			
 
				+        Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc),
			
 
				+        Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc),
			
 
				+        Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc),
			
 
				+
			
 
				+        # Extract coefficients and statistics for each model
			
 
				+        lm_rank_intercept_L = lm_L$intercept,
			
 
				+        lm_rank_slope_L = lm_L$slope,
			
 
				+        R_Squared_L = lm_L$r_squared,
			
 
				+        lm_Score_L = lm_L$score,
			
 
				+
			
 
				+        lm_intercept_K = lm_K$intercept,
			
 
				+        lm_slope_K = lm_K$slope,
			
 
				+        R_Squared_K = lm_K$r_squared,
			
 
				+        lm_Score_K = lm_K$score,
			
 
				+
			
 
				+        lm_intercept_r = lm_r$intercept,
			
 
				+        lm_slope_r = lm_r$slope,
			
 
				+        R_Squared_r = lm_r$r_squared,
			
 
				+        lm_Score_r = lm_r$score,
			
 
				+
			
 
				+        lm_intercept_AUC = lm_AUC$intercept,
			
 
				+        lm_slope_AUC = lm_AUC$slope,
			
 
				+        R_Squared_AUC = lm_AUC$r_squared,
			
 
				+        lm_Score_AUC = lm_AUC$score,
			
 
				+
			
 
				+        Z_lm_intercept_K_L = Z_lm_K_L$intercept,
			
 
				+        Z_lm_slope_K_L = Z_lm_K_L$slope,
			
 
				+        Z_lm_R_squared_K_L = Z_lm_K_L$r_squared,
			
 
				+        Z_lm_Score_K_L = Z_lm_K_L$score,
			
 
				+
			
 
				+        Z_lm_intercept_r_L = Z_lm_r_L$intercept,
			
 
				+        Z_lm_slope_r_L = Z_lm_r_L$slope,
			
 
				+        Z_lm_R_squared_r_L = Z_lm_r_L$r_squared,
			
 
				+        Z_lm_Score_r_L = Z_lm_r_L$score,
			
 
				+
			
 
				+        Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept,
			
 
				+        Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope,
			
 
				+        Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared,
			
 
				+        Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score,
			
 
				+
			
 
				+        Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept,
			
 
				+        Z_lm_slope_R_r_K = Z_lm_R_r_K$slope,
			
 
				+        Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared,
			
 
				+        Z_lm_Score_R_r_K = Z_lm_R_r_K$score,
			
 
				+
			
 
				+        Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept,
			
 
				+        Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope,
			
 
				+        Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared,
			
 
				+        Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score,
			
 
				+
			
 
				+        Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept,
			
 
				+        Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope,
			
 
				+        Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared,
			
 
				+        Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score
			
 
				+      ) %>%
			
 
				+      # Drop the lm objects and keep only the lm results
			
 
				+      select(
			
 
				+        -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L,
			
 
				+        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r
			
 
				+      )
			
 
				   } # end deletion-specific block
			
 
				 
			
 
				   # Create the final calculations and interactions dataframes with only required columns for csv output
			
@@ -536,35 +569,38 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				       Exp_L, Exp_K, Exp_r, Exp_AUC,
			
 
				       Delta_L, Delta_K, Delta_r, Delta_AUC,
			
 
				       mean_Delta_L, mean_Delta_K, mean_Delta_r, mean_Delta_AUC,
			
 
				-      Zscore_L, Zscore_K, Zscore_r, Zscore_AUC
			
 
				-    )
			
 
				+      Zscore_L, Zscore_K, Zscore_r, Zscore_AUC,
			
 
				+      NG_sum, DB_sum, SM_sum
			
 
				+    ) %>%
			
 
				+      rename(NG = NG_sum, DB = DB_sum, SM = SM_sum)
			
 
				 
			
 
				   df_interactions <- interactions %>%
			
 
				     select(
			
 
				       all_of(group_vars),
			
 
				-      NG, DB, SM,
			
 
				       Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC,
			
 
				+      Sum_Z_Score_L, Sum_Z_Score_K, Sum_Z_Score_r, Sum_Z_Score_AUC,
			
 
				       Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC,
			
 
				       Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC,
			
 
				       Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC,
			
 
				-      lm_R_squared_L, lm_R_squared_K, lm_R_squared_r, lm_R_squared_AUC,
			
 
				-      lm_intercept_L, lm_intercept_K, lm_intercept_r, lm_intercept_AUC,
			
 
				-      lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
			
 
				-    )
			
 
				+      lm_Score_L, lm_Score_K, lm_Score_r, lm_Score_AUC,
			
 
				+      R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC,
			
 
				+      NG_sum_int, DB_sum_int, SM_sum_int
			
 
				+    ) %>%
			
 
				+      rename(NG = NG_sum_int, DB = DB_sum_int, SM = SM_sum_int)
			
 
				 
			
 
				   # Avoid column collision on left join for overlapping variables
			
 
				   calculations_no_overlap <- calculations %>%
			
 
				     select(-any_of(c("DB", "NG", "SM",
			
 
				+      # Don't need these anywhere so easier to remove
			
 
				       "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
			
 
				+      "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC",
			
 
				+      # we need these for the interactions but the original code has the same names in both datasets
			
 
				       "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC",
			
 
				-      "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC",
			
 
				-      "lm_R_squared_L", "lm_R_squared_K", "lm_R_squared_r", "lm_R_squared_AUC",
			
 
				-      "lm_intercept_L", "lm_intercept_K", "lm_intercept_r", "lm_intercept_AUC",
			
 
				-      "lm_slope_L", "lm_slope_K", "lm_slope_r", "lm_slope_AUC"
			
 
				+      "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC"
			
 
				     )))
			
 
				 
			
 
				   full_data <- calculations_no_overlap %>%
			
 
				-    left_join(df_interactions, by = group_vars)
			
 
				+    left_join(interactions, by = group_vars)
			
 
				 
			
 
				   # Return final dataframes
			
 
				   return(list(
			
@@ -1239,7 +1275,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
 
				 generate_correlation_plot_configs <- function(df, df_reference) {
			
 
				   # Define relationships for different-variable correlations
			
 
				   relationships <- list(
			
 
				-    list(x = "L", y = "K"),
			
 
				+    list(x = "L", y = "K"), # x-var is predictor, y-var is reponse
			
 
				     list(x = "L", y = "r"),
			
 
				     list(x = "L", y = "AUC"),
			
 
				     list(x = "K", y = "r"),
			
@@ -1261,17 +1297,17 @@ generate_correlation_plot_configs <- function(df, df_reference) {
 
				   for (highlight_cyan in highlight_cyan_options) {
			
 
				     for (rel in relationships) {
			
 
				       # Extract relevant variable names for Z_lm values
			
 
				-      x_var <- paste0("Z_lm_", rel$x)
			
 
				-      y_var <- paste0("Z_lm_", rel$y)
			
 
				+      x_var <- paste0("Z_lm_", rel$x) # predictor
			
 
				+      y_var <- paste0("Z_lm_", rel$y) # response
			
 
				 
			
 
				       # Find the max and min of both dataframes for printing linear regression line
			
 
				       xmin <- min(c(min(df[[x_var]], na.rm = TRUE), min(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
			
 
				       xmax <- max(c(max(df[[x_var]], na.rm = TRUE), max(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
			
 
				 
			
 
				       # Extract the R-squared, intercept, and slope from the df (first value)
			
 
				-      intercept <- df[[paste0("lm_intercept_", rel$x)]][1]
			
 
				-      slope <- df[[paste0("lm_slope_", rel$x)]][1]
			
 
				-      r_squared <- df[[paste0("lm_R_squared_", rel$x)]][1]
			
 
				+      intercept <- df[[paste0("Z_lm_intercept_", rel$y, "_", rel$x)]][1]
			
 
				+      slope <- df[[paste0("Z_lm_slope_", rel$y, "_", rel$x)]][1]
			
 
				+      r_squared <- df[[paste0("Z_lm_R_squared_", rel$y, "_", rel$x)]][1]
			
 
				 
			
 
				       # Generate the label for the plot
			
 
				       plot_label <- paste("Interaction", rel$x, "vs.", rel$y)