vor 9 Monaten · 2073517192
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -217,6 +217,22 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				     group_vars <- c("OrfRep", "Gene", "Drug")
			
 
				   }
			
 
				 
			
 
				+  # This is a helper function to perform a linear regression and extract coefficients
			
 
				+  perform_lm <- function(data, response, predictor, max_conc) {
			
 
				+    valid_data <- data %>% filter(!is.na(.data[[response]]))
			
 
				+    if (nrow(valid_data) > 1) {
			
 
				+      model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
			
 
				+      list(
			
 
				+        intercept = coef(model)[1],
			
 
				+        slope = coef(model)[2],
			
 
				+        r_squared = summary(model)$r.squared,
			
 
				+        score = max_conc * coef(model)[2] + coef(model)[1]
			
 
				+      )
			
 
				+    } else {
			
 
				+      list(intercept = NA, slope = NA, r_squared = NA, score = NA)
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				   # Calculate WT statistics from df_bg
			
 
				   wt_stats <- df_bg %>%
			
 
				     group_by(across(all_of(bg_group_vars))) %>%
			
@@ -300,41 +316,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				     ungroup() %>%  # Ungroup before group_modify
			
 
				     group_by(across(all_of(group_vars))) %>%
			
 
				     group_modify(~ {
			
 
				+      lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
			
 
				+      lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
			
 
				+      lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
			
 
				+      lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
			
 
				 
			
 
				-      # Filter each column for valid data or else linear modeling will fail
			
 
				-      valid_data_L <- .x %>% filter(!is.na(Delta_L))
			
 
				-      valid_data_K <- .x %>% filter(!is.na(Delta_K))
			
 
				-      valid_data_r <- .x %>% filter(!is.na(Delta_r))
			
 
				-      valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC))
			
 
				-      
			
 
				-      # Perform linear modeling
			
 
				-      lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL
			
 
				-      lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL
			
 
				-      lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL
			
 
				-      lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL
			
 
				-
			
 
				-      # Extract coefficients for calculations and plotting
			
 
				       .x %>%
			
 
				         mutate(
			
 
				-          lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA,
			
 
				-          lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA,
			
 
				-          R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA,
			
 
				-          lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA,
			
 
				-
			
 
				-          lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA,
			
 
				-          lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA,
			
 
				-          R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA,
			
 
				-          lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA,
			
 
				-
			
 
				-          lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA,
			
 
				-          lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA,
			
 
				-          R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA,
			
 
				-          lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA,
			
 
				-
			
 
				-          lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA,
			
 
				-          lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA,
			
 
				-          R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA,
			
 
				-          lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA
			
 
				+          lm_intercept_L = lm_L$intercept,
			
 
				+          lm_slope_L = lm_L$slope,
			
 
				+          R_Squared_L = lm_L$r_squared,
			
 
				+          lm_Score_L = lm_L$score,
			
 
				+
			
 
				+          lm_intercept_K = lm_K$intercept,
			
 
				+          lm_slope_K = lm_K$slope,
			
 
				+          R_Squared_K = lm_K$r_squared,
			
 
				+          lm_Score_K = lm_K$score,
			
 
				+
			
 
				+          lm_intercept_r = lm_r$intercept,
			
 
				+          lm_slope_r = lm_r$slope,
			
 
				+          R_Squared_r = lm_r$r_squared,
			
 
				+          lm_Score_r = lm_r$score,
			
 
				+
			
 
				+          lm_intercept_AUC = lm_AUC$intercept,
			
 
				+          lm_slope_AUC = lm_AUC$slope,
			
 
				+          R_Squared_AUC = lm_AUC$r_squared,
			
 
				+          lm_Score_AUC = lm_AUC$score
			
 
				         )
			
 
				     }) %>%
			
 
				     ungroup()
			
@@ -404,55 +411,116 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				       Z_Shift_r = first(Z_Shift_r),
			
 
				       Z_Shift_AUC = first(Z_Shift_AUC),
			
 
				 
			
 
				-      # R Squared values
			
 
				-      R_Squared_L = first(R_Squared_L),
			
 
				-      R_Squared_K = first(R_Squared_K),
			
 
				-      R_Squared_r = first(R_Squared_r),
			
 
				-      R_Squared_AUC = first(R_Squared_AUC),
			
 
				-
			
 
				-      # lm intercepts
			
 
				-      lm_intercept_L = first(lm_intercept_L),
			
 
				-      lm_intercept_K = first(lm_intercept_K),
			
 
				-      lm_intercept_r = first(lm_intercept_r),
			
 
				-      lm_intercept_AUC = first(lm_intercept_AUC),
			
 
				-
			
 
				-      # lm slopes
			
 
				-      lm_slope_L = first(lm_slope_L),
			
 
				-      lm_slope_K = first(lm_slope_K),
			
 
				-      lm_slope_r = first(lm_slope_r),
			
 
				-      lm_slope_AUC = first(lm_slope_AUC),
			
 
				-
			
 
				       # NG, DB, SM values
			
 
				       NG = first(NG),
			
 
				       DB = first(DB),
			
 
				       SM = first(SM),
			
 
				       .groups = "drop"
			
 
				-    )
			
 
				+    ) %>%
			
 
				+    arrange(desc(Z_lm_L), desc(NG))
			
 
				+
			
 
				+  # Deletion data ranking and linear modeling
			
 
				+  if (type == "deletion") {
			
 
				+    interactions <- interactions %>%
			
 
				+      mutate(
			
 
				+        Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
			
 
				+        Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
			
 
				+        Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
			
 
				+        Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
			
 
				+        Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
			
 
				+        Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
			
 
				+        Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
			
 
				+        Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC)
			
 
				+      ) %>%
			
 
				+      mutate(
			
 
				+        Rank_L = rank(Avg_Zscore_L_adjusted),
			
 
				+        Rank_K = rank(Avg_Zscore_K_adjusted),
			
 
				+        Rank_r = rank(Avg_Zscore_r_adjusted),
			
 
				+        Rank_AUC = rank(Avg_Zscore_AUC_adjusted),
			
 
				+        Rank_lm_L = rank(Z_lm_L_adjusted),
			
 
				+        Rank_lm_K = rank(Z_lm_K_adjusted),
			
 
				+        Rank_lm_r = rank(Z_lm_r_adjusted),
			
 
				+        Rank_lm_AUC = rank(Z_lm_AUC_adjusted)
			
 
				+      ) %>%
			
 
				+      mutate(
			
 
				+        lm_R_squared_rank_L = summary(lm(Rank_lm_L ~ Rank_L, data = .))$r.squared,
			
 
				+        lm_R_squared_rank_K = summary(lm(Rank_lm_K ~ Rank_K, data = .))$r.squared,
			
 
				+        lm_R_squared_rank_r = summary(lm(Rank_lm_r ~ Rank_r, data = .))$r.squared,
			
 
				+        lm_R_squared_rank_AUC = summary(lm(Rank_lm_AUC ~ Rank_AUC, data = .))$r.squared
			
 
				+      )
			
 
				 
			
 
				-  # Add overlap threshold categories based on Z-lm and Avg-Z scores
			
 
				-  interactions <- interactions %>%
			
 
				-    filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
			
 
				-    mutate(
			
 
				-      Overlap = case_when(
			
 
				-        Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
			
 
				-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
			
 
				-        Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
			
 
				-        Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
			
 
				-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
			
 
				-        Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
			
 
				-        Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
			
 
				-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
			
 
				-        TRUE ~ "No Effect"
			
 
				-      ),
			
 
				+    # Add overlap threshold categories based on Z-lm and Avg-Z scores
			
 
				+    interactions <- interactions %>%
			
 
				+      filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
			
 
				+      mutate(
			
 
				+        Overlap = case_when(
			
 
				+          Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
			
 
				+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
			
 
				+          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
			
 
				+          Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
			
 
				+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
			
 
				+          Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
			
 
				+          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
			
 
				+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
			
 
				+          TRUE ~ "No Effect"
			
 
				+        )) %>%
			
 
				+      group_modify(~ {
			
 
				       
			
 
				-      # For correlation plots
			
 
				-      lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared,
			
 
				-      lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared,
			
 
				-      lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared,
			
 
				-      lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared
			
 
				-    )
			
 
				+        # For rank plots
			
 
				+        lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
			
 
				+        lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
			
 
				+        lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
			
 
				+        lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
			
 
				+
			
 
				+        # For correlation plots
			
 
				+        Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
			
 
				+        Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
			
 
				+        Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
			
 
				+        Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
			
 
				+        Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
			
 
				+        Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
			
 
				+
			
 
				+        .x %>%
			
 
				+          mutate(
			
 
				+            # For rank plots
			
 
				+            lm_R_squared_L = lm_L$r.squared,
			
 
				+            lm_R_squared_K = lm_K$r.squared,
			
 
				+            lm_R_squared_r = lm_r$r.squared,
			
 
				+            lm_R_squared_AUC = lm_AUC$r.squared,
			
 
				+
			
 
				+            # For correlation plots
			
 
				+            Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
			
 
				+            Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
			
 
				+            Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
			
 
				+            Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
			
 
				+            Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
			
 
				+            Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
			
 
				+
			
 
				+            Z_lm_intercept_K_L = Z_lm_K_L$intercept,
			
 
				+            Z_lm_intercept_r_L = Z_lm_r_L$intercept,
			
 
				+            Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
			
 
				+            Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
			
 
				+            Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
			
 
				+            Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
			
 
				+
			
 
				+            Z_lm_slope_K_L = Z_lm_K_L$slope,
			
 
				+            Z_lm_slope_r_L = Z_lm_r_L$slope,
			
 
				+            Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
			
 
				+            Z_lm_slope_r_K = Z_lm_R_r_K$slope,
			
 
				+            Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
			
 
				+            Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
			
 
				+
			
 
				+            Z_lm_Score_K_L = Z_lm_K_L$score,
			
 
				+            Z_lm_Score_r_L = Z_lm_r_L$score,
			
 
				+            Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
			
 
				+            Z_lm_Score_r_K = Z_lm_R_r_K$score,
			
 
				+            Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
			
 
				+            Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
			
 
				+          )
			
 
				+      })
			
 
				+  } # end deletion-specific block
			
 
				 
			
 
				-  # Creating the final calculations and interactions dataframes with only required columns for csv output
			
 
				+  # Create the final calculations and interactions dataframes with only required columns for csv output
			
 
				   df_calculations <- calculations %>%
			
 
				     select(
			
 
				       all_of(group_vars),
			
@@ -484,7 +552,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
 
				       lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
			
 
				     )
			
 
				 
			
 
				-  # Join calculations and interactions to avoid dimension mismatch
			
 
				+  # Avoid column collision on left join for overlapping variables
			
 
				   calculations_no_overlap <- calculations %>%
			
 
				     select(-any_of(c("DB", "NG", "SM",
			
 
				       "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
			
@@ -1091,22 +1159,12 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
 
				   ))
			
 
				 }
			
 
				 
			
 
				-generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, filter_na = FALSE, overlap_color = FALSE) {
			
 
				+generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, overlap_color = FALSE) {
			
 
				   sd_bands <- c(1, 2, 3)
			
 
				   plot_configs <- list()
			
 
				   
			
 
				   variables <- c("L", "K")
			
 
				 
			
 
				-  # Adjust (if necessary) and rank columns
			
 
				-  for (variable in variables) {
			
 
				-    if (adjust) {
			
 
				-      df[[paste0("Avg_Zscore_", variable)]] <- ifelse(is.na(df[[paste0("Avg_Zscore_", variable)]]), 0.001, df[[paste0("Avg_Zscore_", variable)]])
			
 
				-      df[[paste0("Z_lm_", variable)]] <- ifelse(is.na(df[[paste0("Z_lm_", variable)]]), 0.001, df[[paste0("Z_lm_", variable)]])
			
 
				-    }
			
 
				-    df[[paste0("Rank_", variable)]] <- rank(df[[paste0("Avg_Zscore_", variable)]], na.last = "keep")
			
 
				-    df[[paste0("Rank_lm_", variable)]] <- rank(df[[paste0("Z_lm_", variable)]], na.last = "keep")
			
 
				-  }
			
 
				-
			
 
				   # Helper function to create a rank plot configuration
			
 
				   create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) {
			
 
				     num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE)
			
@@ -1578,9 +1636,9 @@ main <- function() {
 
				       write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
			
 
				       write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
			
 
				 
			
 
				-      message("Generating reference interaction plots")
			
 
				-      reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
			
 
				-      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
			
 
				+      # message("Generating reference interaction plots")
			
 
				+      # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
			
 
				+      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
			
 
				 
			
 
				       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
			
 
				       df_deletion <- df_na_stats %>% # formerly X2
			
@@ -1648,7 +1706,6 @@ main <- function() {
 
				       # rank_plot_configs <- generate_rank_plot_configs(
			
 
				       #   df_interactions,
			
 
				       #   is_lm = FALSE,
			
 
				-      #   adjust = TRUE
			
 
				       # )
			
 
				       # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs,
			
 
				       #   page_width = 18, page_height = 12)
			
@@ -1657,7 +1714,6 @@ main <- function() {
 
				       # rank_lm_plot_configs <- generate_rank_plot_configs(
			
 
				       #   df_interactions,
			
 
				       #   is_lm = TRUE,
			
 
				-      #   adjust = TRUE
			
 
				       # )
			
 
				       # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs,
			
 
				       #   page_width = 18, page_height = 12)
			
@@ -1666,7 +1722,6 @@ main <- function() {
 
				       # rank_plot_filtered_configs <- generate_rank_plot_configs(
			
 
				       #   df_interactions,
			
 
				       #   is_lm = FALSE,
			
 
				-      #   adjust = FALSE,
			
 
				       #   filter_na = TRUE,
			
 
				       #   overlap_color = TRUE
			
 
				       # )
			
@@ -1677,7 +1732,6 @@ main <- function() {
 
				       # rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
			
 
				       #   df_interactions,
			
 
				       #   is_lm = TRUE,
			
 
				-      #   adjust = FALSE,
			
 
				       #   filter_na = TRUE,
			
 
				       #   overlap_color = TRUE
			
 
				       # )