Browse Source

Add correlation lms to calculate_interaction_scores()

Bryan Roessler 8 months ago
parent
commit
2073517192
1 changed files with 146 additions and 92 deletions
  1. 146 92
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 146 - 92
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -217,6 +217,22 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
     group_vars <- c("OrfRep", "Gene", "Drug")
     group_vars <- c("OrfRep", "Gene", "Drug")
   }
   }
 
 
+  # This is a helper function to perform a linear regression and extract coefficients
+  perform_lm <- function(data, response, predictor, max_conc) {
+    valid_data <- data %>% filter(!is.na(.data[[response]]))
+    if (nrow(valid_data) > 1) {
+      model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
+      list(
+        intercept = coef(model)[1],
+        slope = coef(model)[2],
+        r_squared = summary(model)$r.squared,
+        score = max_conc * coef(model)[2] + coef(model)[1]
+      )
+    } else {
+      list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+    }
+  }
+
   # Calculate WT statistics from df_bg
   # Calculate WT statistics from df_bg
   wt_stats <- df_bg %>%
   wt_stats <- df_bg %>%
     group_by(across(all_of(bg_group_vars))) %>%
     group_by(across(all_of(bg_group_vars))) %>%
@@ -300,41 +316,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
     ungroup() %>%  # Ungroup before group_modify
     ungroup() %>%  # Ungroup before group_modify
     group_by(across(all_of(group_vars))) %>%
     group_by(across(all_of(group_vars))) %>%
     group_modify(~ {
     group_modify(~ {
+      lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
+      lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
+      lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
+      lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
 
 
-      # Filter each column for valid data or else linear modeling will fail
-      valid_data_L <- .x %>% filter(!is.na(Delta_L))
-      valid_data_K <- .x %>% filter(!is.na(Delta_K))
-      valid_data_r <- .x %>% filter(!is.na(Delta_r))
-      valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC))
-      
-      # Perform linear modeling
-      lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL
-      lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL
-      lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL
-      lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL
-
-      # Extract coefficients for calculations and plotting
       .x %>%
       .x %>%
         mutate(
         mutate(
-          lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA,
-          lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA,
-          R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA,
-          lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA,
-
-          lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA,
-          lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA,
-          R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA,
-          lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA,
-
-          lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA,
-          lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA,
-          R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA,
-          lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA,
-
-          lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA,
-          lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA,
-          R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA,
-          lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA
+          lm_intercept_L = lm_L$intercept,
+          lm_slope_L = lm_L$slope,
+          R_Squared_L = lm_L$r_squared,
+          lm_Score_L = lm_L$score,
+
+          lm_intercept_K = lm_K$intercept,
+          lm_slope_K = lm_K$slope,
+          R_Squared_K = lm_K$r_squared,
+          lm_Score_K = lm_K$score,
+
+          lm_intercept_r = lm_r$intercept,
+          lm_slope_r = lm_r$slope,
+          R_Squared_r = lm_r$r_squared,
+          lm_Score_r = lm_r$score,
+
+          lm_intercept_AUC = lm_AUC$intercept,
+          lm_slope_AUC = lm_AUC$slope,
+          R_Squared_AUC = lm_AUC$r_squared,
+          lm_Score_AUC = lm_AUC$score
         )
         )
     }) %>%
     }) %>%
     ungroup()
     ungroup()
@@ -404,55 +411,116 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       Z_Shift_r = first(Z_Shift_r),
       Z_Shift_r = first(Z_Shift_r),
       Z_Shift_AUC = first(Z_Shift_AUC),
       Z_Shift_AUC = first(Z_Shift_AUC),
 
 
-      # R Squared values
-      R_Squared_L = first(R_Squared_L),
-      R_Squared_K = first(R_Squared_K),
-      R_Squared_r = first(R_Squared_r),
-      R_Squared_AUC = first(R_Squared_AUC),
-
-      # lm intercepts
-      lm_intercept_L = first(lm_intercept_L),
-      lm_intercept_K = first(lm_intercept_K),
-      lm_intercept_r = first(lm_intercept_r),
-      lm_intercept_AUC = first(lm_intercept_AUC),
-
-      # lm slopes
-      lm_slope_L = first(lm_slope_L),
-      lm_slope_K = first(lm_slope_K),
-      lm_slope_r = first(lm_slope_r),
-      lm_slope_AUC = first(lm_slope_AUC),
-
       # NG, DB, SM values
       # NG, DB, SM values
       NG = first(NG),
       NG = first(NG),
       DB = first(DB),
       DB = first(DB),
       SM = first(SM),
       SM = first(SM),
       .groups = "drop"
       .groups = "drop"
-    )
+    ) %>%
+    arrange(desc(Z_lm_L), desc(NG))
+
+  # Deletion data ranking and linear modeling
+  if (type == "deletion") {
+    interactions <- interactions %>%
+      mutate(
+        Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
+        Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
+        Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
+        Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
+        Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
+        Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
+        Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
+        Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC)
+      ) %>%
+      mutate(
+        Rank_L = rank(Avg_Zscore_L_adjusted),
+        Rank_K = rank(Avg_Zscore_K_adjusted),
+        Rank_r = rank(Avg_Zscore_r_adjusted),
+        Rank_AUC = rank(Avg_Zscore_AUC_adjusted),
+        Rank_lm_L = rank(Z_lm_L_adjusted),
+        Rank_lm_K = rank(Z_lm_K_adjusted),
+        Rank_lm_r = rank(Z_lm_r_adjusted),
+        Rank_lm_AUC = rank(Z_lm_AUC_adjusted)
+      ) %>%
+      mutate(
+        lm_R_squared_rank_L = summary(lm(Rank_lm_L ~ Rank_L, data = .))$r.squared,
+        lm_R_squared_rank_K = summary(lm(Rank_lm_K ~ Rank_K, data = .))$r.squared,
+        lm_R_squared_rank_r = summary(lm(Rank_lm_r ~ Rank_r, data = .))$r.squared,
+        lm_R_squared_rank_AUC = summary(lm(Rank_lm_AUC ~ Rank_AUC, data = .))$r.squared
+      )
 
 
-  # Add overlap threshold categories based on Z-lm and Avg-Z scores
-  interactions <- interactions %>%
-    filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
-    mutate(
-      Overlap = case_when(
-        Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
-        Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
-        Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
-        Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
-        Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
-        Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
-        TRUE ~ "No Effect"
-      ),
+    # Add overlap threshold categories based on Z-lm and Avg-Z scores
+    interactions <- interactions %>%
+      filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
+      mutate(
+        Overlap = case_when(
+          Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
+          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
+          Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
+          Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
+          Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
+          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
+          TRUE ~ "No Effect"
+        )) %>%
+      group_modify(~ {
       
       
-      # For correlation plots
-      lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared,
-      lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared,
-      lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared,
-      lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared
-    )
+        # For rank plots
+        lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
+        lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
+        lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
+        lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
+
+        # For correlation plots
+        Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
+        Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
+        Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
+        Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
+        Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
+        Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
+
+        .x %>%
+          mutate(
+            # For rank plots
+            lm_R_squared_L = lm_L$r.squared,
+            lm_R_squared_K = lm_K$r.squared,
+            lm_R_squared_r = lm_r$r.squared,
+            lm_R_squared_AUC = lm_AUC$r.squared,
+
+            # For correlation plots
+            Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
+            Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
+            Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
+            Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
+            Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
+            Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
+
+            Z_lm_intercept_K_L = Z_lm_K_L$intercept,
+            Z_lm_intercept_r_L = Z_lm_r_L$intercept,
+            Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
+            Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
+            Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
+            Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
+
+            Z_lm_slope_K_L = Z_lm_K_L$slope,
+            Z_lm_slope_r_L = Z_lm_r_L$slope,
+            Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
+            Z_lm_slope_r_K = Z_lm_R_r_K$slope,
+            Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
+            Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
+
+            Z_lm_Score_K_L = Z_lm_K_L$score,
+            Z_lm_Score_r_L = Z_lm_r_L$score,
+            Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
+            Z_lm_Score_r_K = Z_lm_R_r_K$score,
+            Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
+            Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
+          )
+      })
+  } # end deletion-specific block
 
 
-  # Creating the final calculations and interactions dataframes with only required columns for csv output
+  # Create the final calculations and interactions dataframes with only required columns for csv output
   df_calculations <- calculations %>%
   df_calculations <- calculations %>%
     select(
     select(
       all_of(group_vars),
       all_of(group_vars),
@@ -484,7 +552,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
       lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
     )
     )
 
 
-  # Join calculations and interactions to avoid dimension mismatch
+  # Avoid column collision on left join for overlapping variables
   calculations_no_overlap <- calculations %>%
   calculations_no_overlap <- calculations %>%
     select(-any_of(c("DB", "NG", "SM",
     select(-any_of(c("DB", "NG", "SM",
       "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
       "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
@@ -1091,22 +1159,12 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
   ))
   ))
 }
 }
 
 
-generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, filter_na = FALSE, overlap_color = FALSE) {
+generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, overlap_color = FALSE) {
   sd_bands <- c(1, 2, 3)
   sd_bands <- c(1, 2, 3)
   plot_configs <- list()
   plot_configs <- list()
   
   
   variables <- c("L", "K")
   variables <- c("L", "K")
 
 
-  # Adjust (if necessary) and rank columns
-  for (variable in variables) {
-    if (adjust) {
-      df[[paste0("Avg_Zscore_", variable)]] <- ifelse(is.na(df[[paste0("Avg_Zscore_", variable)]]), 0.001, df[[paste0("Avg_Zscore_", variable)]])
-      df[[paste0("Z_lm_", variable)]] <- ifelse(is.na(df[[paste0("Z_lm_", variable)]]), 0.001, df[[paste0("Z_lm_", variable)]])
-    }
-    df[[paste0("Rank_", variable)]] <- rank(df[[paste0("Avg_Zscore_", variable)]], na.last = "keep")
-    df[[paste0("Rank_lm_", variable)]] <- rank(df[[paste0("Z_lm_", variable)]], na.last = "keep")
-  }
-
   # Helper function to create a rank plot configuration
   # Helper function to create a rank plot configuration
   create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) {
   create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) {
     num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE)
     num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE)
@@ -1578,9 +1636,9 @@ main <- function() {
       write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
       write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
       write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
       write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
 
 
-      message("Generating reference interaction plots")
-      reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
-      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
+      # message("Generating reference interaction plots")
+      # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
+      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
 
 
       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
       message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
       df_deletion <- df_na_stats %>% # formerly X2
       df_deletion <- df_na_stats %>% # formerly X2
@@ -1648,7 +1706,6 @@ main <- function() {
       # rank_plot_configs <- generate_rank_plot_configs(
       # rank_plot_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   df_interactions,
       #   is_lm = FALSE,
       #   is_lm = FALSE,
-      #   adjust = TRUE
       # )
       # )
       # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs,
       # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs,
       #   page_width = 18, page_height = 12)
       #   page_width = 18, page_height = 12)
@@ -1657,7 +1714,6 @@ main <- function() {
       # rank_lm_plot_configs <- generate_rank_plot_configs(
       # rank_lm_plot_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   df_interactions,
       #   is_lm = TRUE,
       #   is_lm = TRUE,
-      #   adjust = TRUE
       # )
       # )
       # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs,
       # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs,
       #   page_width = 18, page_height = 12)
       #   page_width = 18, page_height = 12)
@@ -1666,7 +1722,6 @@ main <- function() {
       # rank_plot_filtered_configs <- generate_rank_plot_configs(
       # rank_plot_filtered_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   df_interactions,
       #   is_lm = FALSE,
       #   is_lm = FALSE,
-      #   adjust = FALSE,
       #   filter_na = TRUE,
       #   filter_na = TRUE,
       #   overlap_color = TRUE
       #   overlap_color = TRUE
       # )
       # )
@@ -1677,7 +1732,6 @@ main <- function() {
       # rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
       # rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
       #   df_interactions,
       #   df_interactions,
       #   is_lm = TRUE,
       #   is_lm = TRUE,
-      #   adjust = FALSE,
       #   filter_na = TRUE,
       #   filter_na = TRUE,
       #   overlap_color = TRUE
       #   overlap_color = TRUE
       # )
       # )