Sfoglia il codice sorgente

Improve rank plot filtering and plot config generation

Bryan Roessler 8 mesi fa
parent
commit
1dfb5d5084
1 file modificato con 46 aggiunte e 48 eliminazioni
  1. 46 48
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 46 - 48
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -707,16 +707,16 @@ generate_interaction_plot_configs <- function(df, variables) {
   ))
 }
 
-generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L", "K"), is_lm = FALSE) {
+generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L", "K"), is_lm = FALSE, adjust = FALSE) {
 
   # Adjust missing values and compute ranks for each interaction variable
-  for (var in interaction_vars) {
-    avg_zscore_col <- paste0("Avg_Zscore_", var)
-    z_lm_col <- paste0("Z_lm_", var)
-    rank_col <- paste0("Rank_", var)
-    rank_lm_col <- paste0("Rank_lm_", var)
-    
-    if (all(c(avg_zscore_col, z_lm_col) %in% names(df))) {
+  if (adjust) {
+    for (var in interaction_vars) {
+      avg_zscore_col <- paste0("Avg_Zscore_", var)
+      z_lm_col <- paste0("Z_lm_", var)
+      rank_col <- paste0("Rank_", var)
+      rank_lm_col <- paste0("Rank_lm_", var)
+      
       # Replace NA with 0.001 for interaction variables
       df[[avg_zscore_col]] <- if_else(is.na(df[[avg_zscore_col]]), 0.001, df[[avg_zscore_col]])
       df[[z_lm_col]] <- if_else(is.na(df[[z_lm_col]]), 0.001, df[[z_lm_col]])
@@ -724,8 +724,7 @@ generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L",
       # Compute ranks for interaction variables
       df[[rank_col]] <- rank(df[[avg_zscore_col]], na.last = "keep")
       df[[rank_lm_col]] <- rank(df[[z_lm_col]], na.last = "keep")
-    } else {
-      warning(paste("Columns", avg_zscore_col, "or", z_lm_col, "not found in the data frame"))
+
     }
   }
   
@@ -779,8 +778,7 @@ generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L",
         enhancer_label = NULL,
         suppressor_label = NULL,
         shape = 3,
-        size = 0.1,
-        position = "jitter"
+        size = 0.1
       )
     }
   }
@@ -1226,18 +1224,12 @@ main <- function() {
         file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
 
       message("Generating rank plots")
-      # Generate rank plot configurations and adjust the dataframe
-      zscores_interactions_adjusted <- generate_rank_plot_configs(
-        df = zscores_interactions,
-        interaction_vars = interaction_vars,
-        is_lm = FALSE
-      )$adjusted_df
-
       # Generate rank plots for L and K using standard ranks
       rank_plot_configs <- generate_rank_plot_configs(
-        df = zscores_interactions_adjusted,
+        df = zscores_interactions,
         interaction_vars = interaction_vars,
-        is_lm = FALSE
+        is_lm = FALSE,
+        adjust = TRUE
       )$plot_configs
 
       # Save the generated rank plots for L and K
@@ -1246,9 +1238,10 @@ main <- function() {
 
       # Generate rank plots for L and K using linear model (`lm`) ranks
       rank_lm_plot_configs <- generate_rank_plot_configs(
-        df = zscores_interactions_adjusted,
+        df = zscores_interactions,
         interaction_vars = interaction_vars,
-        is_lm = TRUE
+        is_lm = TRUE,
+        adjust = TRUE
       )$plot_configs
 
       # Save the linear model based rank plots for L and K
@@ -1256,23 +1249,20 @@ main <- function() {
         plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
       
       message("Filtering and regenerating rank plots")
-      # Formerly X_NArm
+      # Filter rows where either Z_lm_L or Avg_Zscore_L is not NA
       zscores_interactions_filtered <- zscores_interactions %>%
         group_by(across(all_of(orf_group_vars))) %>%
-          filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L))
-      
-      # Final filtered correlation calculations and plots
-      lm_results <- zscores_interactions_filtered %>%
-        summarise(
-          lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA,
-          lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA,
-          lm_R_squared_r = if (n() > 1) summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared else NA,
-          lm_R_squared_AUC = if (n() > 1) summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared else NA
-        )
+        filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
+        ungroup()
 
+      # Final filtered correlation calculations and Overlap column
       zscores_interactions_filtered <- zscores_interactions_filtered %>%
-        left_join(lm_results, by = orf_group_vars) %>%
+        rowwise() %>%
         mutate(
+          lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA,
+          lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA,
+          lm_R_squared_r = if (n() > 1) summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared else NA,
+          lm_R_squared_AUC = if (n() > 1) summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared else NA,
           Overlap = case_when(
             Z_lm_L >= 2 & Avg_Zscore_L >= 2 ~ "Deletion Enhancer Both",
             Z_lm_L <= -2 & Avg_Zscore_L <= -2 ~ "Deletion Suppressor Both",
@@ -1285,24 +1275,32 @@ main <- function() {
         ) %>%
         ungroup()
 
-      rank_plot_configs <- c(
-        generate_rank_plot_configs(zscores_interactions_filtered, "Rank_L", "Avg_Zscore_L", "L"),
-        generate_rank_plot_configs(zscores_interactions_filtered, "Rank_K", "Avg_Zscore_K", "K")
-      )
-      generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots",
-        plot_configs = rank_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
+      message("Generating filtered rank plots")
+      rank_plot_filtered_configs <- generate_rank_plot_configs(
+        df = zscores_interactions_filtered,
+        interaction_vars = interaction_vars,
+        is_lm = FALSE,
+        adjust = FALSE
+      )$plot_configs
+      generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_na_rm",
+        plot_configs = rank_plot_filtered_configs,
+        grid_layout = list(ncol = 3, nrow = 2))
           
-      rank_lm_plot_configs <- c(
-        generate_rank_plot_configs(zscores_interactions_filtered, "Rank_lm_L", "Z_lm_L", "L", is_lm = TRUE),
-        generate_rank_plot_configs(zscores_interactions_filtered, "Rank_lm_K", "Z_lm_K", "K", is_lm = TRUE)
-      )
-      generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm",
-        plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
+      rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
+        df = zscores_interactions_filtered,
+        interaction_vars = interaction_vars,
+        is_lm = TRUE,
+        adjust = FALSE
+      )$plot_configs
+      generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm_na_rm",
+        plot_configs = rank_plot_lm_filtered_configs,
+        grid_layout = list(ncol = 3, nrow = 2))
 
       message("Generating correlation plots")
       correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
       generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
-        plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2))
+        plot_configs = correlation_plot_configs,
+        grid_layout = list(ncol = 2, nrow = 2))
     })
   })
 }