Browse Source

Vectorized data filtering

Bryan Roessler 7 months ago
parent
commit
4fd3453bd6
1 changed files with 40 additions and 23 deletions
  1. 40 23
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 40 - 23
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -826,12 +826,21 @@ generate_correlation_plot_configs <- function(df) {
 filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE,
   rank = FALSE, limits_map = NULL, verbose = TRUE) {
   
-  # Precompute column names for efficiency
+  # Precompute Column Names for Efficiency
   avg_zscore_cols <- paste0("Avg_Zscore_", variables)
   z_lm_cols <- paste0("Z_lm_", variables)
   
+  # # Optional: Validate that the expected columns exist in the dataframe
+  # expected_cols <- c(avg_zscore_cols, z_lm_cols, variables)
+  # missing_cols <- setdiff(expected_cols, names(df))
+  # if (length(missing_cols) > 0) {
+  #   stop("The following expected columns are missing from the dataframe: ",
+  #        paste(missing_cols, collapse = ", "))
+  # }
+  
+  # Adjust NAs if 'adjust' is TRUE
   if (adjust) {
-    if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns")
+    if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns.")
     df <- df %>%
       mutate(
         across(all_of(avg_zscore_cols), ~ replace_na(., 0.001)),
@@ -839,10 +848,13 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
       )
   }
   
-  # Filter non-finite values
+  # Filter Non-Finite Values if 'nf' is TRUE
   if (nf) {
+    if (verbose) message("Filtering non-finite values for variables: ", paste(variables, collapse = ", "))
+    
+    # Identify non-finite rows for logging
     non_finite_df <- df %>%
-      filter(across(all_of(variables), ~ !is.finite(.)))
+      filter(if_any(all_of(variables), ~ !is.finite(.)))
     
     if (verbose && nrow(non_finite_df) > 0) {
       message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":")
@@ -851,13 +863,16 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
     
     # Keep only rows where all specified variables are finite
     df <- df %>%
-      filter(across(all_of(variables), ~ is.finite(.)))
+      filter(if_all(all_of(variables), ~ is.finite(.)))
   }
   
-  # Filter missing malues
+  # Filter Missing Values if 'missing' is TRUE
   if (missing) {
+    if (verbose) message("Filtering missing values for variables: ", paste(variables, collapse = ", "))
+    
+    # Identify missing rows for logging
     missing_df <- df %>%
-      filter(across(all_of(variables), ~ is.na(.)))
+      filter(if_any(all_of(variables), ~ is.na(.)))
     
     if (verbose && nrow(missing_df) > 0) {
       message("Missing data for variables ", paste(variables, collapse = ", "), ":")
@@ -866,16 +881,18 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
     
     # Keep only rows where all specified variables are not missing
     df <- df %>%
-      filter(across(all_of(variables), ~ !is.na(.)))
+      filter(if_all(all_of(variables), ~ !is.na(.)))
   }
   
-  # Filter data outside of y-limits (for plotting)
+  # Apply Limits from 'limits_map' if Provided
   if (!is.null(limits_map)) {
     for (variable in names(limits_map)) {
       if (variable %in% variables) {
         ylim_vals <- limits_map[[variable]]
         
-        # Identify out-of-range data
+        if (verbose) message("Applying limits for variable ", variable, ": [", ylim_vals[1], ", ", ylim_vals[2], "].")
+        
+        # Identify out-of-range data for logging
         out_of_range_df <- df %>%
           filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
         
@@ -890,7 +907,8 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
       }
     }
   }
-
+  
+  # Calculate Rank Columns if 'rank' is TRUE
   if (rank) {
     if (verbose) message("Calculating rank columns for variables: ", paste(variables, collapse = ", "))
     
@@ -898,21 +916,20 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
     df <- df %>%
       mutate(
         # Rank based on Avg_Zscore_
-        across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_{.col}"),
+        across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_Avg_Zscore_{.col}"),
         # Rank_lm based on Z_lm_
-        across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_{.col}")
+        across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_Z_lm_{.col}")
       )
     
-    # Rename the newly created rank columns to match desired names
-    for (variable in variables) {
-      old_rank_col <- paste0("Rank_Avg_Zscore_", variable)
-      new_rank_col <- paste0("Rank_", variable)
-      df <- df %>% rename(!!new_rank_col := !!sym(old_rank_col))
-      
-      old_rank_lm_col <- paste0("Rank_lm_Z_lm_", variable)
-      new_rank_lm_col <- paste0("Rank_lm_", variable)
-      df <- df %>% rename(!!new_rank_lm_col := !!sym(old_rank_lm_col))
-    }
+    # Prepare a named vector for renaming columns: new_name = old_name
+    rename_vector <- c(
+      setNames(paste0("Rank_", variables), paste0("Rank_Avg_Zscore_", avg_zscore_cols)),
+      setNames(paste0("Rank_lm_", variables), paste0("Rank_lm_Z_lm_", z_lm_cols))
+    )
+    
+    # Rename the rank columns in a single step
+    df <- df %>%
+      rename(!!!rename_vector)
   }
   
   return(df)