Quellcode durchsuchen

Add correlation lms to calculate_interaction_scores()

Bryan Roessler vor 6 Monaten
Ursprung
Commit
2073517192
1 geänderte Dateien mit 146 neuen und 92 gelöschten Zeilen
  1. 146 92
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 146 - 92
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -217,6 +217,22 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
     group_vars <- c("OrfRep", "Gene", "Drug")
   }
 
+  # This is a helper function to perform a linear regression and extract coefficients
+  perform_lm <- function(data, response, predictor, max_conc) {
+    valid_data <- data %>% filter(!is.na(.data[[response]]))
+    if (nrow(valid_data) > 1) {
+      model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
+      list(
+        intercept = coef(model)[1],
+        slope = coef(model)[2],
+        r_squared = summary(model)$r.squared,
+        score = max_conc * coef(model)[2] + coef(model)[1]
+      )
+    } else {
+      list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+    }
+  }
+
   # Calculate WT statistics from df_bg
   wt_stats <- df_bg %>%
     group_by(across(all_of(bg_group_vars))) %>%
@@ -300,41 +316,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
     ungroup() %>%  # Ungroup before group_modify
     group_by(across(all_of(group_vars))) %>%
     group_modify(~ {
+      lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
+      lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
+      lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
+      lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
 
-      # Filter each column for valid data or else linear modeling will fail
-      valid_data_L <- .x %>% filter(!is.na(Delta_L))
-      valid_data_K <- .x %>% filter(!is.na(Delta_K))
-      valid_data_r <- .x %>% filter(!is.na(Delta_r))
-      valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC))
-      
-      # Perform linear modeling
-      lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL
-      lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL
-      lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL
-      lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL
-
-      # Extract coefficients for calculations and plotting
       .x %>%
         mutate(
-          lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA,
-          lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA,
-          R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA,
-          lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA,
-
-          lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA,
-          lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA,
-          R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA,
-          lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA,
-
-          lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA,
-          lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA,
-          R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA,
-          lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA,
-
-          lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA,
-          lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA,
-          R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA,
-          lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA
+          lm_intercept_L = lm_L$intercept,
+          lm_slope_L = lm_L$slope,
+          R_Squared_L = lm_L$r_squared,
+          lm_Score_L = lm_L$score,
+
+          lm_intercept_K = lm_K$intercept,
+          lm_slope_K = lm_K$slope,
+          R_Squared_K = lm_K$r_squared,
+          lm_Score_K = lm_K$score,
+
+          lm_intercept_r = lm_r$intercept,
+          lm_slope_r = lm_r$slope,
+          R_Squared_r = lm_r$r_squared,
+          lm_Score_r = lm_r$score,
+
+          lm_intercept_AUC = lm_AUC$intercept,
+          lm_slope_AUC = lm_AUC$slope,
+          R_Squared_AUC = lm_AUC$r_squared,
+          lm_Score_AUC = lm_AUC$score
         )
     }) %>%
     ungroup()
@@ -404,55 +411,116 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       Z_Shift_r = first(Z_Shift_r),
       Z_Shift_AUC = first(Z_Shift_AUC),
 
-      # R Squared values
-      R_Squared_L = first(R_Squared_L),
-      R_Squared_K = first(R_Squared_K),
-      R_Squared_r = first(R_Squared_r),
-      R_Squared_AUC = first(R_Squared_AUC),
-
-      # lm intercepts
-      lm_intercept_L = first(lm_intercept_L),
-      lm_intercept_K = first(lm_intercept_K),
-      lm_intercept_r = first(lm_intercept_r),
-      lm_intercept_AUC = first(lm_intercept_AUC),
-
-      # lm slopes
-      lm_slope_L = first(lm_slope_L),
-      lm_slope_K = first(lm_slope_K),
-      lm_slope_r = first(lm_slope_r),
-      lm_slope_AUC = first(lm_slope_AUC),
-
       # NG, DB, SM values
       NG = first(NG),
       DB = first(DB),
       SM = first(SM),
       .groups = "drop"
-    )
+    ) %>%
+    arrange(desc(Z_lm_L), desc(NG))
+
+  # Deletion data ranking and linear modeling
+  if (type == "deletion") {
+    interactions <- interactions %>%
+      mutate(
+        Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
+        Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
+        Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
+        Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
+        Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
+        Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
+        Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
+        Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC)
+      ) %>%
+      mutate(
+        Rank_L = rank(Avg_Zscore_L_adjusted),
+        Rank_K = rank(Avg_Zscore_K_adjusted),
+        Rank_r = rank(Avg_Zscore_r_adjusted),
+        Rank_AUC = rank(Avg_Zscore_AUC_adjusted),
+        Rank_lm_L = rank(Z_lm_L_adjusted),
+        Rank_lm_K = rank(Z_lm_K_adjusted),
+        Rank_lm_r = rank(Z_lm_r_adjusted),
+        Rank_lm_AUC = rank(Z_lm_AUC_adjusted)
+      ) %>%
+      mutate(
+        lm_R_squared_rank_L = summary(lm(Rank_lm_L ~ Rank_L, data = .))$r.squared,
+        lm_R_squared_rank_K = summary(lm(Rank_lm_K ~ Rank_K, data = .))$r.squared,
+        lm_R_squared_rank_r = summary(lm(Rank_lm_r ~ Rank_r, data = .))$r.squared,
+        lm_R_squared_rank_AUC = summary(lm(Rank_lm_AUC ~ Rank_AUC, data = .))$r.squared
+      )
 
-  # Add overlap threshold categories based on Z-lm and Avg-Z scores
-  interactions <- interactions %>%
-    filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
-    mutate(
-      Overlap = case_when(
-        Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
-        Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
-        Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
-        Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
-        Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
-        TRUE ~ "No Effect"
-      ),
+    # Add overlap threshold categories based on Z-lm and Avg-Z scores
+    interactions <- interactions %>%
+      filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
+      mutate(
+        Overlap = case_when(
+          Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
+          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
+          Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
+          Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
+          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
+          TRUE ~ "No Effect"
+        )) %>%
+      group_modify(~ {
       
-      # For correlation plots
-      lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared,
-      lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared,
-      lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared,
-      lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared
-    )
+        # For rank plots
+        lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
+        lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
+        lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
+        lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
+
+        # For correlation plots
+        Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
+        Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
+        Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
+        Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
+        Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
+        Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
+
+        .x %>%
+          mutate(
+            # For rank plots
+            lm_R_squared_L = lm_L$r.squared,
+            lm_R_squared_K = lm_K$r.squared,
+            lm_R_squared_r = lm_r$r.squared,
+            lm_R_squared_AUC = lm_AUC$r.squared,
+
+            # For correlation plots
+            Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
+            Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
+            Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
+            Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
+            Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
+            Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
+
+            Z_lm_intercept_K_L = Z_lm_K_L$intercept,
+            Z_lm_intercept_r_L = Z_lm_r_L$intercept,
+            Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
+            Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
+            Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
+            Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
+
+            Z_lm_slope_K_L = Z_lm_K_L$slope,
+            Z_lm_slope_r_L = Z_lm_r_L$slope,
+            Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
+            Z_lm_slope_r_K = Z_lm_R_r_K$slope,
+            Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
+            Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
+
+            Z_lm_Score_K_L = Z_lm_K_L$score,
+            Z_lm_Score_r_L = Z_lm_r_L$score,
+            Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
+            Z_lm_Score_r_K = Z_lm_R_r_K$score,
+            Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
+            Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
+          )
+      })
+  } # end deletion-specific block
 
-  # Creating the final calculations and interactions dataframes with only required columns for csv output
+  # Create the final calculations and interactions dataframes with only required columns for csv output
   df_calculations <- calculations %>%
     select(
       all_of(group_vars),
@@ -484,7 +552,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
     )
 
-  # Join calculations and interactions to avoid dimension mismatch
+  # Avoid column collision on left join for overlapping variables
   calculations_no_overlap <- calculations %>%
     select(-any_of(c("DB", "NG", "SM",
       "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
@@ -1091,22 +1159,12 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
   ))
 }
 
-generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, filter_na = FALSE, overlap_color = FALSE) {
+generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, overlap_color = FALSE) {
   sd_bands <- c(1, 2, 3)
   plot_configs <- list()
   
   variables <- c("L", "K")
 
-  # Adjust (if necessary) and rank columns
-  for (variable in variables) {
-    if (adjust) {
-      df[[paste0("Avg_Zscore_", variable)]] <- ifelse(is.na(df[[paste0("Avg_Zscore_", variable)]]), 0.001, df[[paste0("Avg_Zscore_", variable)]])
-      df[[paste0("Z_lm_", variable)]] <- ifelse(is.na(df[[paste0("Z_lm_", variable)]]), 0.001, df[[paste0("Z_lm_", variable)]])
-    }
-    df[[paste0("Rank_", variable)]] <- rank(df[[paste0("Avg_Zscore_", variable)]], na.last = "keep")
-    df[[paste0("Rank_lm_", variable)]] <- rank(df[[paste0("Z_lm_", variable)]], na.last = "keep")
-  }
-
   # Helper function to create a rank plot configuration
   create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) {
     num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE)
@@ -1578,9 +1636,9 @@ main <- function() {
       write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
       write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
 
-      message("Generating reference interaction plots")
-      reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
-      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
+      # message("Generating reference interaction plots")
+      # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
+      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
 
       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
       df_deletion <- df_na_stats %>% # formerly X2
@@ -1648,7 +1706,6 @@ main <- function() {
       # rank_plot_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   is_lm = FALSE,
-      #   adjust = TRUE
       # )
       # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs,
       #   page_width = 18, page_height = 12)
@@ -1657,7 +1714,6 @@ main <- function() {
       # rank_lm_plot_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   is_lm = TRUE,
-      #   adjust = TRUE
       # )
       # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs,
       #   page_width = 18, page_height = 12)
@@ -1666,7 +1722,6 @@ main <- function() {
       # rank_plot_filtered_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   is_lm = FALSE,
-      #   adjust = FALSE,
       #   filter_na = TRUE,
       #   overlap_color = TRUE
       # )
@@ -1677,7 +1732,6 @@ main <- function() {
       # rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   is_lm = TRUE,
-      #   adjust = FALSE,
       #   filter_na = TRUE,
       #   overlap_color = TRUE
       # )