Calculate ranks step-wise

2024-10-08 11:59:24 -04:00
parent d456bcdadd
commit bff7e7cfb4
1 changed files with 88 additions and 56 deletions
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -222,7 +222,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
  perform_lm <- function(response, predictor, max_conc) {
    if (all(is.na(response)) || all(is.na(predictor)) ||
-      length(response[!is.na(response)]) == 0 || length(predictor[!is.na(predictor)]) == 0) {
+      length(response[!is.na(response)]) < 2 || length(predictor[!is.na(predictor)]) < 2) {
      return(list(intercept = NA, slope = NA, r_squared = NA, score = NA))
    } else {
      fit <- lm(response ~ predictor)
@@ -337,53 +337,55 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(total_conc_num - 1),
      Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / first(total_conc_num - 1),
-      # Perform gene-gene linear modeling
+      # Perform per-gene linear modeling
      lm_L = list(perform_lm(Delta_L, conc_num_factor, max_conc)),
      lm_K = list(perform_lm(Delta_K, conc_num_factor, max_conc)),
      lm_r = list(perform_lm(Delta_r, conc_num_factor, max_conc)),
      lm_AUC = list(perform_lm(Delta_AUC, conc_num_factor, max_conc)),
      # Extract coefficients and statistics for each model
      lm_intercept_L = lm_L[[1]]$intercept,
      lm_slope_L = lm_L[[1]]$slope,
      R_Squared_L = lm_L[[1]]$r_squared,
      lm_Score_L = lm_L[[1]]$score,
      lm_intercept_K = lm_K[[1]]$intercept,
      lm_slope_K = lm_K[[1]]$slope,
      R_Squared_K = lm_K[[1]]$r_squared,
      lm_Score_K = lm_K[[1]]$score,
      lm_intercept_r = lm_r[[1]]$intercept,
      lm_slope_r = lm_r[[1]]$slope,
      R_Squared_r = lm_r[[1]]$r_squared,
      lm_Score_r = lm_r[[1]]$score,
      lm_intercept_AUC = lm_AUC[[1]]$intercept,
      lm_slope_AUC = lm_AUC[[1]]$slope,
      R_Squared_AUC = lm_AUC[[1]]$r_squared,
      lm_Score_AUC = lm_AUC[[1]]$score,
      .groups = "drop"
    ) %>%
    select(-c(lm_L, lm_K, lm_r, lm_AUC)) # drop linear models since we have coefficients
  # Summary statistics for lm scores
  interactions <- interactions %>%
    group_by(across(all_of(group_vars))) %>%
    mutate(
-      lm_mean_L = mean(lm_Score_L, na.rm = TRUE),
+      lm_intercept_L = map_dbl(lm_L, "intercept"),
-      lm_sd_L = sd(lm_Score_L, na.rm = TRUE),
+      lm_slope_L = map_dbl(lm_L, "slope"),
-      lm_mean_K = mean(lm_Score_K, na.rm = TRUE),
+      R_Squared_L = map_dbl(lm_L, "r_squared"),
-      lm_sd_K = sd(lm_Score_K, na.rm = TRUE),
+      lm_Score_L = map_dbl(lm_L, "score"),
-      lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
+      
-      lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
+      lm_intercept_K = map_dbl(lm_K, "intercept"),
-      lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
+      lm_slope_K = map_dbl(lm_K, "slope"),
-      lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
+      R_Squared_K = map_dbl(lm_K, "r_squared"),
-      Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
+      lm_Score_K = map_dbl(lm_K, "score"),
-      Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
+      
-      Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
+      lm_intercept_r = map_dbl(lm_r, "intercept"),
-      Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC
+      lm_slope_r = map_dbl(lm_r, "slope"),
      R_Squared_r = map_dbl(lm_r, "r_squared"),
      lm_Score_r = map_dbl(lm_r, "score"),
      lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
      lm_slope_AUC = map_dbl(lm_AUC, "slope"),
      R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
      lm_Score_AUC = map_dbl(lm_AUC, "score")
    ) %>%
    select(-c(lm_L, lm_K, lm_r, lm_AUC)) # drop linear models since we now have coefficients
  lm_mean_L <- mean(interactions$lm_Score_L, na.rm = TRUE)
  lm_sd_L <- sd(interactions$lm_Score_L, na.rm = TRUE)
  lm_mean_K <- mean(interactions$lm_Score_K, na.rm = TRUE)
  lm_sd_K <- sd(interactions$lm_Score_K, na.rm = TRUE)
  lm_mean_r <- mean(interactions$lm_Score_r, na.rm = TRUE)
  lm_sd_r <- sd(interactions$lm_Score_r, na.rm = TRUE)
  lm_mean_AUC <- mean(interactions$lm_Score_AUC, na.rm = TRUE)
  lm_sd_AUC <- sd(interactions$lm_Score_AUC, na.rm = TRUE)
  interactions <- interactions %>%
    mutate(
      Z_lm_L = if (!is.na(lm_sd_L) && lm_sd_L != 0) (lm_Score_L - lm_mean_L) / lm_sd_L else NA_real_,
      Z_lm_K = if (!is.na(lm_sd_K) && lm_sd_K != 0) (lm_Score_K - lm_mean_K) / lm_sd_K else NA_real_,
      Z_lm_r = if (!is.na(lm_sd_r) && lm_sd_r != 0) (lm_Score_r - lm_mean_r) / lm_sd_r else NA_real_,
      Z_lm_AUC = if (!is.na(lm_sd_AUC) && lm_sd_AUC != 0) (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC else NA_real_
    ) %>%
    arrange(desc(Z_lm_L), desc(NG))
  # Deletion data ranking and linear modeling
  # Initialize this variable so we can return it as NULL for reference
  # We don't do the following calculations for the reference strain
@@ -408,11 +410,12 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
        Rank_lm_L = rank(Z_lm_adjusted_L),
        Rank_lm_K = rank(Z_lm_adjusted_K),
        Rank_lm_r = rank(Z_lm_adjusted_r),
-        Rank_lm_AUC = rank(Z_lm_adjusted_AUC),
+        Rank_lm_AUC = rank(Z_lm_adjusted_AUC)
-        Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
+        # Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
-        Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
+        # Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
-        Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
+        # Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
-        Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
+        # Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
      )
    # We are filtering invalid Z_lm scores so this must be in its own df
@@ -443,10 +446,10 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
        Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
        Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
        Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
-        # Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
+        Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
-        # Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
+        Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
-        # Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
+        Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
-        # Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
+        Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
        Correlation_lm_L = list(perform_lm(Z_lm_L, Avg_Zscore_L, max_conc)),
        Correlation_lm_K = list(perform_lm(Z_lm_K, Avg_Zscore_K, max_conc)),
        Correlation_lm_r = list(perform_lm(Z_lm_r, Avg_Zscore_r, max_conc)),
@@ -497,7 +500,11 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
        Correlation_lm_slope_AUC_r = Correlation_lm_AUC_r[[1]]$slope,
        Correlation_lm_R_Squared_AUC_r = Correlation_lm_AUC_r[[1]]$r_squared,
        Correlation_lm_Score_AUC_r = Correlation_lm_AUC_r[[1]]$score
-      )
+      ) %>%
        select(-c(Rank_lm_lm_L, Rank_lm_lm_K, Rank_lm_lm_r, Rank_lm_lm_AUC,
          Correlation_lm_L, Correlation_lm_K, Correlation_lm_r, Correlation_lm_AUC,
          Correlation_lm_K_L, Correlation_lm_r_L, Correlation_lm_AUC_L,
          Correlation_lm_r_K, Correlation_lm_AUC_K, Correlation_lm_AUC_r))
  }
  # Create the final calculations and interactions dataframes with specific columns for csv output
@@ -625,6 +632,23 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
        )
      }
      # Add annotations if specified
      if (!is.null(config$annotations)) {
        for (annotation in config$annotations) {
          plot <- plot +
            annotate(
              "text",
              x = ifelse(is.null(annotation$x), 0, annotation$x),
              y = ifelse(is.null(annotation$y), Inf, annotation$y),
              label = annotation$label,
              hjust = ifelse(is.null(annotation$hjust), 0.5, annotation$hjust),
              vjust = ifelse(is.null(annotation$vjust), 1, annotation$vjust),
              size = ifelse(is.null(annotation$size), 3, annotation$size),
              color = ifelse(is.null(annotation$color), "black", annotation$color)
            )
        }
      }
      if (!is.null(config$grid_lines)) {
        plot <- plot + theme(
          panel.grid.major = if (config$grid_lines$major) element_line() else element_blank(),
@@ -783,6 +807,14 @@ generate_scatter_plot <- function(plot, config) {
    # Extract necessary values
    intercept <- config$lm_line$intercept # required
    slope <- config$lm_line$slope # required
    message("----- Generating lm_line -----")
    message("Intercept: ", intercept)
    message("Slope: ", slope)
    message("Class: ", class(config$df[[config$x_var]]))
    message("------------------------------")
    xmin <- ifelse(!is.null(config$lm_line$xmin), config$lm_line$xmin, min(as.numeric(config$df[[config$x_var]])))
    xmax <- ifelse(!is.null(config$lm_line$xmax), config$lm_line$xmax, max(as.numeric(config$df[[config$x_var]])))
    color <- ifelse(!is.null(config$lm_line$color), config$lm_line$color, "blue")
@@ -1183,7 +1215,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
  # Generate plots for each variable
  for (variable in variables) {
    rank_var <- if (is_lm) paste0("Rank_lm_", variable) else paste0("Rank_", variable)
-    zscore_var <- if (is_lm) paste0("Z_lm_", variable) else paste0("Avg_Zscore_", variable)
+    zscore_var <- if (is_lm) paste0("Z_lm_adjusted_", variable) else paste0("Avg_Zscore_adjusted_", variable)
    y_label <- if (is_lm) paste("Int Z score", variable) else paste("Avg Z score", variable)
    # Loop through SD bands
@@ -1614,9 +1646,9 @@ main <- function() {
      write.csv(calculations_reference_csv, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
      write.csv(interactions_reference_csv, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
-      message("Generating reference interaction plots")
+      # message("Generating reference interaction plots")
-      reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
+      # reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
-      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
+      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
      message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
      deletion_adjusted <- stats_na %>% # formerly X2
@@ -1647,9 +1679,9 @@ main <- function() {
      write.csv(calculations_csv, file = file.path(out_dir, "zscore_calculations.csv"), row.names = FALSE)
      write.csv(interactions_csv, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE)
-      message("Generating deletion interaction plots")
+      # message("Generating deletion interaction plots")
-      deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
+      # deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
-      generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
+      # generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
      message("Writing enhancer/suppressor csv files")
      interaction_threshold <- 2  # TODO add to study config?