Bladeren bron

Refactor data filtering

Bryan Roessler 7 maanden geleden
bovenliggende
commit
200df19922
1 gewijzigde bestanden met toevoegingen van 112 en 44 verwijderingen
  1. 112 44
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 112 - 44
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -793,9 +793,12 @@ generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALS
   return(configs)
 }
 
-generate_correlation_plot_configs <- function(df, variables) {
+generate_correlation_plot_configs <- function(df) {
+
   configs <- list()
 
+  variables <- c("r", "L", "K", "AUC")
+
   for (variable in variables) {
     z_lm_var <- paste0("Z_lm_", variable)
     avg_zscore_var <- paste0("Avg_Zscore_", variable)
@@ -821,47 +824,94 @@ generate_correlation_plot_configs <- function(df, variables) {
 }
 
 filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE,
-  limits_map = NULL, verbose = TRUE) {
+  rank = FALSE, limits_map = NULL, verbose = TRUE) {
   
-  for (variable in variables) {
-
-    avg_zscore_col <- paste0("Avg_Zscore_", var)
-    z_lm_col <- paste0("Z_lm_", var)
-    rank_col <- paste0("Rank_", var)
-    rank_lm_col <- paste0("Rank_lm_", var)
-
-    if (adjust) {
-      message("Replacing NA with 0.001 for interaction variables")
-      df[[avg_zscore_col]] <- if_else(is.na(df[[avg_zscore_col]]), 0.001, df[[avg_zscore_col]])
-      df[[z_lm_col]] <- if_else(is.na(df[[z_lm_col]]), 0.001, df[[z_lm_col]])
+  # Precompute column names for efficiency
+  avg_zscore_cols <- paste0("Avg_Zscore_", variables)
+  z_lm_cols <- paste0("Z_lm_", variables)
+  
+  if (adjust) {
+    if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns")
+    df <- df %>%
+      mutate(
+        across(all_of(avg_zscore_cols), ~ replace_na(., 0.001)),
+        across(all_of(z_lm_cols), ~ replace_na(., 0.001))
+      )
+  }
+  
+  # Filter non-finite values
+  if (nf) {
+    non_finite_df <- df %>%
+      filter(across(all_of(variables), ~ !is.finite(.)))
+    
+    if (verbose && nrow(non_finite_df) > 0) {
+      message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":")
+      print(non_finite_df)
     }
-
-    if (nf) {
-      non_finite <- df %>% filter(!is.finite(.data[[variable]]))
-      if (verbose && nrow(non_finite) > 0) {
-        message("Non-finite rows for variable ", variable, ":")
-        print(non_finite)
-      }
-      df <- df %>% filter(is.finite(.data[[variable]]))
+    
+    # Keep only rows where all specified variables are finite
+    df <- df %>%
+      filter(across(all_of(variables), ~ is.finite(.)))
+  }
+  
+  # Filter missing malues
+  if (missing) {
+    missing_df <- df %>%
+      filter(across(all_of(variables), ~ is.na(.)))
+    
+    if (verbose && nrow(missing_df) > 0) {
+      message("Missing data for variables ", paste(variables, collapse = ", "), ":")
+      print(missing_df)
     }
     
-    if (missing) {
-      missing_data <- df %>% filter(is.na(.data[[variable]]))
-      if (verbose && nrow(missing_data) > 0) {
-        message("Missing data for variable ", variable, ":")
-        print(missing_data)
+    # Keep only rows where all specified variables are not missing
+    df <- df %>%
+      filter(across(all_of(variables), ~ !is.na(.)))
+  }
+  
+  # Filter data outside of y-limits (for plotting)
+  if (!is.null(limits_map)) {
+    for (variable in names(limits_map)) {
+      if (variable %in% variables) {
+        ylim_vals <- limits_map[[variable]]
+        
+        # Identify out-of-range data
+        out_of_range_df <- df %>%
+          filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
+        
+        if (verbose && nrow(out_of_range_df) > 0) {
+          message("Out-of-range data for variable ", variable, ":")
+          print(out_of_range_df)
+        }
+        
+        # Keep only rows within the specified limits
+        df <- df %>%
+          filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2])
       }
-      df <- df %>% filter(!is.na(.data[[variable]]))
     }
+  }
+
+  if (rank) {
+    if (verbose) message("Calculating rank columns for variables: ", paste(variables, collapse = ", "))
     
-    if (!is.null(limits_map) && !is.null(limits_map[[variable]])) {
-      ylim_vals <- limits_map[[variable]]
-      out_of_range_data <- df %>% filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
-      if (verbose && nrow(out_of_range_data) > 0) {
-        message("Out-of-range data for variable ", variable, ":")
-        print(out_of_range_data)
-      }
-      df <- df %>% filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2])
+    # Create Rank and Rank_lm columns using mutate and across
+    df <- df %>%
+      mutate(
+        # Rank based on Avg_Zscore_
+        across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_{.col}"),
+        # Rank_lm based on Z_lm_
+        across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_{.col}")
+      )
+    
+    # Rename the newly created rank columns to match desired names
+    for (variable in variables) {
+      old_rank_col <- paste0("Rank_Avg_Zscore_", variable)
+      new_rank_col <- paste0("Rank_", variable)
+      df <- df %>% rename(!!new_rank_col := !!sym(old_rank_col))
+      
+      old_rank_lm_col <- paste0("Rank_lm_Z_lm_", variable)
+      new_rank_lm_col <- paste0("Rank_lm_", variable)
+      df <- df %>% rename(!!new_rank_lm_col := !!sym(old_rank_lm_col))
     }
   }
   
@@ -1254,10 +1304,14 @@ main <- function() {
         file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
 
       message("Generating rank plots")
-      zscores_interactions_joined_filtered <- filter_data(zscores_interactions_joined, variables, missing = TRUE, adjust = TRUE)
+      zscores_interactions_joined_filtered <- filter_data(
+        zscores_interactions_joined,
+        variables,
+        missing = TRUE,
+        adjust = TRUE,
+        rank = TRUE)
       rank_plot_configs <- generate_rank_plot_configs(
         df = zscores_interactions_joined_filtered,
-        variables = interaction_vars,
         is_lm = FALSE,
         adjust = TRUE
       )
@@ -1267,7 +1321,6 @@ main <- function() {
       message("Generating ranked linear model plots")
       rank_lm_plot_configs <- generate_rank_plot_configs(
         df = zscores_interactions_joined_filtered,
-        variables = interaction_vars,
         is_lm = TRUE,
         adjust = TRUE
       )
@@ -1276,7 +1329,7 @@ main <- function() {
 
       message("Filtering and reranking plots")
       # Formerly X_NArm
-      zscores_interactions_filtered <- zscores_interactions %>%
+      zscores_interactions_filtered <- zscores_interactions_joined %>%
         group_by(across(all_of(orf_group_vars))) %>%
         filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
         ungroup() %>%
@@ -1299,13 +1352,24 @@ main <- function() {
         ) %>%
         ungroup()
 
-      message("Generating filtered ranked plots")
+      # Re-rank
+      zscores_interactions_filtered <- filter_data(
+        df = zscores_interactions_filtered,
+        variables = interaction_vars,
+        missing = TRUE, # TODO what I'm currently having issues with
+        rank = TRUE
+      )
+
       rank_plot_filtered_configs <- generate_rank_plot_configs(
         df = zscores_interactions_filtered,
         is_lm = FALSE,
         adjust = FALSE
       )
-      generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_na_rm",
+
+      message("Generating filtered ranked plots")
+      generate_and_save_plots(
+        output_dir = out_dir,
+        file_name = "RankPlots_na_rm",
         plot_configs = rank_plot_filtered_configs,
         grid_layout = list(ncol = 3, nrow = 2))
 
@@ -1315,13 +1379,17 @@ main <- function() {
         is_lm = TRUE,
         adjust = FALSE
       )
-      generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm_na_rm",
+      generate_and_save_plots(
+        output_dir = out_dir,
+        file_name = "RankPlots_lm_na_rm",
         plot_configs = rank_plot_lm_filtered_configs,
         grid_layout = list(ncol = 3, nrow = 2))
 
       message("Generating correlation plots")
-      correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
-      generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
+      correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered)
+      generate_and_save_plots(
+        output_dir = out_dir,
+        file_name = "Avg_Zscore_vs_lm_NA_rm",
         plot_configs = correlation_plot_configs,
         grid_layout = list(ncol = 2, nrow = 2))
     })