ソースを参照

Move ranks into rank_plots function

Bryan Roessler 7 ヶ月 前
コミット
fbbe351cbb
1 ファイル変更48 行追加59 行削除
  1. 48 59
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 48 - 59
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -118,7 +118,7 @@ scale_colour_publication <- function(...) {
 }
 
 # Load the initial dataframe from the easy_results_file
-load_and_process_data <- function(easy_results_file, sd = 3) {
+load_and_filter_data <- function(easy_results_file, sd = 3) {
   df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
 
   df <- df %>%
@@ -656,7 +656,7 @@ generate_interaction_plot_configs <- function(df, variables) {
     AUC = c(-6500, 6500)
   )
 
-  df_filtered <- process_data(df, variables, filter_na = TRUE, limits_map = limits_map)
+  df_filtered <- filter_data(df, variables, filter_na = TRUE, limits_map = limits_map)
 
   # Define annotation label functions
   generate_annotation_labels <- function(df, var, annotation_name) {
@@ -747,11 +747,34 @@ generate_interaction_plot_configs <- function(df, variables) {
   return(configs)
 }
 
-generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, overlap_color = FALSE) {
+generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, overlap_color = FALSE) {
   
   sd_bands <- c(1, 2, 3)
+
+  avg_zscore_cols <- paste0("Avg_Zscore_", variables)
+  z_lm_cols <- paste0("Z_lm_", variables)
+  rank_avg_zscore_cols <- paste0("Rank_", variables)
+  rank_z_lm_cols <- paste0("Rank_lm_", variables)
   
   configs <- list()
+
+  if (adjust) {
+    message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns for ranks")
+    df <- df %>%
+      mutate(
+        across(all_of(avg_zscore_cols), ~ifelse(is.na(.), 0.001, .)),
+        across(all_of(z_lm_cols), ~ifelse(is.na(.), 0.001, .))
+      )
+  }
+
+  message("Calculating ranks for Avg_Zscore and Z_lm columns")
+  rank_col_mapping <- setNames(rank_avg_zscore_cols, avg_zscore_cols)
+  df_ranked <- df %>%
+    mutate(across(all_of(avg_zscore_cols), ~rank(., na.last = "keep"), .names = "{rank_col_mapping[.col]}"))
+
+  rank_lm_col_mapping <- setNames(rank_z_lm_cols, z_lm_cols)
+  df_ranked <- df_ranked %>%
+    mutate(across(all_of(z_lm_cols), ~rank(., na.last = "keep"), .names = "{rank_lm_col_mapping[.col]}"))
   
   # SD-based plots for L and K
   for (variable in c("L", "K")) {
@@ -768,12 +791,12 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, ov
     
     for (sd_band in sd_bands) {
       
-      num_enhancers <- sum(df_filtered[[zscore_var]] >= sd_band, na.rm = TRUE)
-      num_suppressors <- sum(df_filtered[[zscore_var]] <= -sd_band, na.rm = TRUE)
+      num_enhancers <- sum(df_ranked[[zscore_var]] >= sd_band, na.rm = TRUE)
+      num_suppressors <- sum(df_ranked[[zscore_var]] <= -sd_band, na.rm = TRUE)
       
       # Annotated plot configuration
       configs[[length(configs) + 1]] <- list(
-        df = df_filtered,
+        df = df_ranked,
         x_var = rank_var,
         y_var = zscore_var,
         plot_type = "scatter",
@@ -785,14 +808,14 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, ov
         alpha_negative = 0.3,
         annotations = list(
           list(
-            x = median(df_filtered[[rank_var]], na.rm = TRUE),
+            x = median(df_ranked[[rank_var]], na.rm = TRUE),
             y = 10,
             label = paste("Deletion Enhancers =", num_enhancers),
             hjust = 0.5,
             vjust = 1
           ),
           list(
-            x = median(df_filtered[[rank_var]], na.rm = TRUE),
+            x = median(df_ranked[[rank_var]], na.rm = TRUE),
             y = -10,
             label = paste("Deletion Suppressors =", num_suppressors),
             hjust = 0.5,
@@ -808,7 +831,7 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, ov
       
       # Non-Annotated Plot Configuration
       configs[[length(configs) + 1]] <- list(
-        df = df_filtered,
+        df = df_ranked,
         x_var = rank_var,
         y_var = zscore_var,
         plot_type = "scatter",
@@ -849,30 +872,29 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, ov
         rectangles <- NULL
       }
 
-      # Fit linear model
-      lm_model <- lm(as.formula(paste(y_var, "~", x_var)), data = df_filtered)
-      lm_summary <- summary(lm_model)
+      # Fit the linear model
+      lm_model <- lm(as.formula(paste(y_var, "~", x_var)), data = df_ranked)
       
       # Extract intercept and slope from the model coefficients
       intercept <- coef(lm_model)[1]
       slope <- coef(lm_model)[2]
         
       configs[[length(configs) + 1]] <- list(
-        df = df_filtered,
+        df = df_ranked,
         x_var = x_var,
         y_var = y_var,
         plot_type = "scatter",
         title = title,
         annotations = list(
           list(
-            x = median(df_filtered[[rank_var]], na.rm = TRUE),
+            x = median(df_ranked[[rank_var]], na.rm = TRUE),
             y = 10,
             label = paste("Deletion Enhancers =", num_enhancers),
             hjust = 0.5,
             vjust = 1
           ),
           list(
-            x = median(df_filtered[[rank_var]], na.rm = TRUE),
+            x = median(df_ranked[[rank_var]], na.rm = TRUE),
             y = -10,
             label = paste("Deletion Suppressors =", num_suppressors),
             hjust = 0.5,
@@ -955,8 +977,7 @@ generate_correlation_plot_configs <- function(df) {
   return(configs)
 }
 
-process_data <- function(df, variables, filter_nf = FALSE, filter_na = FALSE, adjust = FALSE,
-  rank = FALSE, limits_map = NULL) {
+filter_data <- function(df, variables, filter_nf = FALSE, filter_na = FALSE, limits_map = NULL) {
 
   avg_zscore_cols <- paste0("Avg_Zscore_", variables)
   z_lm_cols <- paste0("Z_lm_", variables)
@@ -986,29 +1007,6 @@ process_data <- function(df, variables, filter_nf = FALSE, filter_na = FALSE, ad
     }
   }
 
-  if (adjust) {
-    message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns for ranks")
-    df <- df %>%
-      mutate(
-        across(all_of(avg_zscore_cols), ~ifelse(is.na(.), 0.001, .)),
-        across(all_of(z_lm_cols), ~ifelse(is.na(.), 0.001, .))
-      )
-  }
-
-  # Calculate and add rank columns
-  # TODO probably should be moved to separate function
-  if (rank) {
-    message("Calculating ranks for Avg_Zscore and Z_lm columns")
-
-    rank_col_mapping <- setNames(rank_avg_zscore_cols, avg_zscore_cols)
-    df <- df %>%
-      mutate(across(all_of(avg_zscore_cols), ~rank(., na.last = "keep"), .names = "{rank_col_mapping[.col]}"))
-
-    rank_lm_col_mapping <- setNames(rank_z_lm_cols, z_lm_cols)
-    df <- df %>%
-      mutate(across(all_of(z_lm_cols), ~rank(., na.last = "keep"), .names = "{rank_lm_col_mapping[.col]}"))
-  }
-
   return(df)
 }
 
@@ -1028,7 +1026,7 @@ main <- function() {
       "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
     
     message("Loading and filtering data for experiment: ", exp_name)
-    df <- load_and_process_data(args$easy_results_file, sd = exp_sd) %>%
+    df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>%
       update_gene_names(args$sgd_gene_list) %>%
       as_tibble()
 
@@ -1399,25 +1397,21 @@ main <- function() {
         file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
 
       message("Generating rank plots")
-      # Formerly InteractionScores_AdjustMissing
-      zscores_interactions_joined_ranked <- process_data(
-        df = zscores_interactions_joined,
-        variables = interaction_vars,
-        adjust = TRUE,
-        rank = TRUE)
       rank_plot_configs <- generate_rank_plot_configs(
-        df = zscores_interactions_joined_ranked,
+        df = zscores_interactions_joined,
         variables = interaction_vars,
-        is_lm = FALSE
+        is_lm = FALSE,
+        adjust = TRUE
       )
       generate_and_save_plots(out_dir = out_dir, filename = "RankPlots",
         plot_configs = rank_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
 
       message("Generating ranked linear model plots")
       rank_lm_plot_configs <- generate_rank_plot_configs(
-        df = zscores_interactions_joined_ranked,
+        df = zscores_interactions_joined,
         variables = interaction_vars,
-        is_lm = TRUE
+        is_lm = TRUE,
+        adjust = TRUE
       )
       generate_and_save_plots(out_dir = out_dir, filename = "RankPlots_lm",
         plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
@@ -1444,21 +1438,15 @@ main <- function() {
           lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared
         )
 
-      # Re-rank
-      zscores_interactions_filtered_ranked <- process_data(
-        df = zscores_interactions_filtered,
-        variables = interaction_vars,
-        rank = TRUE
-      )
-
+      message("Generating filtered ranked plots")
       rank_plot_filtered_configs <- generate_rank_plot_configs(
         df = zscores_interactions_filtered_ranked,
         variables = interaction_vars,
         is_lm = FALSE,
+        adjust = FALSE,
         overlap_color = TRUE
       )
 
-      message("Generating filtered ranked plots")
       generate_and_save_plots(
         out_dir = out_dir,
         filename = "RankPlots_na_rm",
@@ -1470,6 +1458,7 @@ main <- function() {
         df = zscores_interactions_filtered_ranked,
         variables = interaction_vars,
         is_lm = TRUE,
+        adjust = FALSE,
         overlap_color = TRUE
       )
       generate_and_save_plots(