Calculate ranks step-wise

2024-10-08 11:59:24 -04:00
parent d456bcdadd
commit bff7e7cfb4
1 changed files with 88 additions and 56 deletions
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -222,7 +222,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)

  perform_lm <- function(response, predictor, max_conc) {
    if (all(is.na(response)) || all(is.na(predictor)) ||
-      length(response[!is.na(response)]) == 0 || length(predictor[!is.na(predictor)]) == 0) {
+      length(response[!is.na(response)]) < 2 || length(predictor[!is.na(predictor)]) < 2) {
      return(list(intercept = NA, slope = NA, r_squared = NA, score = NA))
    } else {
      fit <- lm(response ~ predictor)
@@ -337,52 +337,54 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(total_conc_num - 1),
      Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / first(total_conc_num - 1),

-      # Perform gene-gene linear modeling
+      # Perform per-gene linear modeling
      lm_L = list(perform_lm(Delta_L, conc_num_factor, max_conc)),
      lm_K = list(perform_lm(Delta_K, conc_num_factor, max_conc)),
      lm_r = list(perform_lm(Delta_r, conc_num_factor, max_conc)),
      lm_AUC = list(perform_lm(Delta_AUC, conc_num_factor, max_conc)),
-
-      # Extract coefficients and statistics for each model
-      lm_intercept_L = lm_L[[1]]$intercept,
-      lm_slope_L = lm_L[[1]]$slope,
-      R_Squared_L = lm_L[[1]]$r_squared,
-      lm_Score_L = lm_L[[1]]$score,
-      lm_intercept_K = lm_K[[1]]$intercept,
-      lm_slope_K = lm_K[[1]]$slope,
-      R_Squared_K = lm_K[[1]]$r_squared,
-      lm_Score_K = lm_K[[1]]$score,
-      lm_intercept_r = lm_r[[1]]$intercept,
-      lm_slope_r = lm_r[[1]]$slope,
-      R_Squared_r = lm_r[[1]]$r_squared,
-      lm_Score_r = lm_r[[1]]$score,
-      lm_intercept_AUC = lm_AUC[[1]]$intercept,
-      lm_slope_AUC = lm_AUC[[1]]$slope,
-      R_Squared_AUC = lm_AUC[[1]]$r_squared,
-      lm_Score_AUC = lm_AUC[[1]]$score,
-
      .groups = "drop"
    ) %>%
-    select(-c(lm_L, lm_K, lm_r, lm_AUC)) # drop linear models since we have coefficients
-
-  # Summary statistics for lm scores
-  interactions <- interactions %>%
-    group_by(across(all_of(group_vars))) %>%
    mutate(
-      lm_mean_L = mean(lm_Score_L, na.rm = TRUE),
-      lm_sd_L = sd(lm_Score_L, na.rm = TRUE),
-      lm_mean_K = mean(lm_Score_K, na.rm = TRUE),
-      lm_sd_K = sd(lm_Score_K, na.rm = TRUE),
-      lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
-      lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
-      lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
-      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
-      Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
-      Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
-      Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
-      Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC
+      lm_intercept_L = map_dbl(lm_L, "intercept"),
+      lm_slope_L = map_dbl(lm_L, "slope"),
+      R_Squared_L = map_dbl(lm_L, "r_squared"),
+      lm_Score_L = map_dbl(lm_L, "score"),
+      
+      lm_intercept_K = map_dbl(lm_K, "intercept"),
+      lm_slope_K = map_dbl(lm_K, "slope"),
+      R_Squared_K = map_dbl(lm_K, "r_squared"),
+      lm_Score_K = map_dbl(lm_K, "score"),
+      
+      lm_intercept_r = map_dbl(lm_r, "intercept"),
+      lm_slope_r = map_dbl(lm_r, "slope"),
+      R_Squared_r = map_dbl(lm_r, "r_squared"),
+      lm_Score_r = map_dbl(lm_r, "score"),
+      
+      lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
+      lm_slope_AUC = map_dbl(lm_AUC, "slope"),
+      R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
+      lm_Score_AUC = map_dbl(lm_AUC, "score")
+    ) %>%
+    select(-c(lm_L, lm_K, lm_r, lm_AUC)) # drop linear models since we now have coefficients
+
+  lm_mean_L <- mean(interactions$lm_Score_L, na.rm = TRUE)
+  lm_sd_L <- sd(interactions$lm_Score_L, na.rm = TRUE)
+  lm_mean_K <- mean(interactions$lm_Score_K, na.rm = TRUE)
+  lm_sd_K <- sd(interactions$lm_Score_K, na.rm = TRUE)
+  lm_mean_r <- mean(interactions$lm_Score_r, na.rm = TRUE)
+  lm_sd_r <- sd(interactions$lm_Score_r, na.rm = TRUE)
+  lm_mean_AUC <- mean(interactions$lm_Score_AUC, na.rm = TRUE)
+  lm_sd_AUC <- sd(interactions$lm_Score_AUC, na.rm = TRUE)
+
+  interactions <- interactions %>%
+    mutate(
+      Z_lm_L = if (!is.na(lm_sd_L) && lm_sd_L != 0) (lm_Score_L - lm_mean_L) / lm_sd_L else NA_real_,
+      Z_lm_K = if (!is.na(lm_sd_K) && lm_sd_K != 0) (lm_Score_K - lm_mean_K) / lm_sd_K else NA_real_,
+      Z_lm_r = if (!is.na(lm_sd_r) && lm_sd_r != 0) (lm_Score_r - lm_mean_r) / lm_sd_r else NA_real_,
+      Z_lm_AUC = if (!is.na(lm_sd_AUC) && lm_sd_AUC != 0) (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC else NA_real_
    ) %>%
    arrange(desc(Z_lm_L), desc(NG))
+  

  # Deletion data ranking and linear modeling
  # Initialize this variable so we can return it as NULL for reference
@@ -408,11 +410,12 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
        Rank_lm_L = rank(Z_lm_adjusted_L),
        Rank_lm_K = rank(Z_lm_adjusted_K),
        Rank_lm_r = rank(Z_lm_adjusted_r),
-        Rank_lm_AUC = rank(Z_lm_adjusted_AUC),
-        Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
-        Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
-        Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
-        Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
+        Rank_lm_AUC = rank(Z_lm_adjusted_AUC)
+        # Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
+        # Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
+        # Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
+        # Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
+        
      )

    # We are filtering invalid Z_lm scores so this must be in its own df
@@ -443,10 +446,10 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
        Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
        Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
        Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
-        # Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
-        # Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
-        # Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
-        # Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
+        Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
+        Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
+        Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
+        Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
        Correlation_lm_L = list(perform_lm(Z_lm_L, Avg_Zscore_L, max_conc)),
        Correlation_lm_K = list(perform_lm(Z_lm_K, Avg_Zscore_K, max_conc)),
        Correlation_lm_r = list(perform_lm(Z_lm_r, Avg_Zscore_r, max_conc)),
@@ -497,7 +500,11 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
        Correlation_lm_slope_AUC_r = Correlation_lm_AUC_r[[1]]$slope,
        Correlation_lm_R_Squared_AUC_r = Correlation_lm_AUC_r[[1]]$r_squared,
        Correlation_lm_Score_AUC_r = Correlation_lm_AUC_r[[1]]$score
-      )
+      ) %>%
+        select(-c(Rank_lm_lm_L, Rank_lm_lm_K, Rank_lm_lm_r, Rank_lm_lm_AUC,
+          Correlation_lm_L, Correlation_lm_K, Correlation_lm_r, Correlation_lm_AUC,
+          Correlation_lm_K_L, Correlation_lm_r_L, Correlation_lm_AUC_L,
+          Correlation_lm_r_K, Correlation_lm_AUC_K, Correlation_lm_AUC_r))
  }

  # Create the final calculations and interactions dataframes with specific columns for csv output
@@ -625,6 +632,23 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
        )
      }

+      # Add annotations if specified
+      if (!is.null(config$annotations)) {
+        for (annotation in config$annotations) {
+          plot <- plot +
+            annotate(
+              "text",
+              x = ifelse(is.null(annotation$x), 0, annotation$x),
+              y = ifelse(is.null(annotation$y), Inf, annotation$y),
+              label = annotation$label,
+              hjust = ifelse(is.null(annotation$hjust), 0.5, annotation$hjust),
+              vjust = ifelse(is.null(annotation$vjust), 1, annotation$vjust),
+              size = ifelse(is.null(annotation$size), 3, annotation$size),
+              color = ifelse(is.null(annotation$color), "black", annotation$color)
+            )
+        }
+      }
+
      if (!is.null(config$grid_lines)) {
        plot <- plot + theme(
          panel.grid.major = if (config$grid_lines$major) element_line() else element_blank(),
@@ -783,6 +807,14 @@ generate_scatter_plot <- function(plot, config) {
    # Extract necessary values
    intercept <- config$lm_line$intercept # required
    slope <- config$lm_line$slope # required
+
+    message("----- Generating lm_line -----")
+    message("Intercept: ", intercept)
+    message("Slope: ", slope)
+    message("Class: ", class(config$df[[config$x_var]]))
+    message("------------------------------")
+    
+
    xmin <- ifelse(!is.null(config$lm_line$xmin), config$lm_line$xmin, min(as.numeric(config$df[[config$x_var]])))
    xmax <- ifelse(!is.null(config$lm_line$xmax), config$lm_line$xmax, max(as.numeric(config$df[[config$x_var]])))
    color <- ifelse(!is.null(config$lm_line$color), config$lm_line$color, "blue")
@@ -1078,7 +1110,7 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
      lm_slope_col <- paste0("lm_slope_", var)
      lm_intercept <- first(group_data_filtered[[lm_intercept_col]])
      lm_slope <- first(group_data_filtered[[lm_slope_col]])
-      
+
      plot_config <- list(
        df = group_data_filtered,
        plot_type = "scatter",
@@ -1183,7 +1215,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
  # Generate plots for each variable
  for (variable in variables) {
    rank_var <- if (is_lm) paste0("Rank_lm_", variable) else paste0("Rank_", variable)
-    zscore_var <- if (is_lm) paste0("Z_lm_", variable) else paste0("Avg_Zscore_", variable)
+    zscore_var <- if (is_lm) paste0("Z_lm_adjusted_", variable) else paste0("Avg_Zscore_adjusted_", variable)
    y_label <- if (is_lm) paste("Int Z score", variable) else paste("Avg Z score", variable)
    
    # Loop through SD bands
@@ -1614,9 +1646,9 @@ main <- function() {
      write.csv(calculations_reference_csv, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
      write.csv(interactions_reference_csv, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)

-      message("Generating reference interaction plots")
-      reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
-      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
+      # message("Generating reference interaction plots")
+      # reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
+      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)

      message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
      deletion_adjusted <- stats_na %>% # formerly X2
@@ -1647,9 +1679,9 @@ main <- function() {
      write.csv(calculations_csv, file = file.path(out_dir, "zscore_calculations.csv"), row.names = FALSE)
      write.csv(interactions_csv, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE)

-      message("Generating deletion interaction plots")
-      deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
-      generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
+      # message("Generating deletion interaction plots")
+      # deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
+      # generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)

      message("Writing enhancer/suppressor csv files")
      interaction_threshold <- 2  # TODO add to study config?