Vectorized data filtering

2024-09-16 20:18:38 -04:00
parent 200df19922
commit 4fd3453bd6
1 changed files with 40 additions and 23 deletions
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -826,12 +826,21 @@ generate_correlation_plot_configs <- function(df) {
 filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE,
  rank = FALSE, limits_map = NULL, verbose = TRUE) {
  
-  # Precompute column names for efficiency
+  # Precompute Column Names for Efficiency
  avg_zscore_cols <- paste0("Avg_Zscore_", variables)
  z_lm_cols <- paste0("Z_lm_", variables)
  
+  # # Optional: Validate that the expected columns exist in the dataframe
+  # expected_cols <- c(avg_zscore_cols, z_lm_cols, variables)
+  # missing_cols <- setdiff(expected_cols, names(df))
+  # if (length(missing_cols) > 0) {
+  #   stop("The following expected columns are missing from the dataframe: ",
+  #        paste(missing_cols, collapse = ", "))
+  # }
+  
+  # Adjust NAs if 'adjust' is TRUE
  if (adjust) {
-    if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns")
+    if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns.")
    df <- df %>%
      mutate(
        across(all_of(avg_zscore_cols), ~ replace_na(., 0.001)),
@@ -839,10 +848,13 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
      )
  }
  
-  # Filter non-finite values
+  # Filter Non-Finite Values if 'nf' is TRUE
  if (nf) {
+    if (verbose) message("Filtering non-finite values for variables: ", paste(variables, collapse = ", "))
+    
+    # Identify non-finite rows for logging
    non_finite_df <- df %>%
-      filter(across(all_of(variables), ~ !is.finite(.)))
+      filter(if_any(all_of(variables), ~ !is.finite(.)))
    
    if (verbose && nrow(non_finite_df) > 0) {
      message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":")
@@ -851,13 +863,16 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
    
    # Keep only rows where all specified variables are finite
    df <- df %>%
-      filter(across(all_of(variables), ~ is.finite(.)))
+      filter(if_all(all_of(variables), ~ is.finite(.)))
  }
  
-  # Filter missing malues
+  # Filter Missing Values if 'missing' is TRUE
  if (missing) {
+    if (verbose) message("Filtering missing values for variables: ", paste(variables, collapse = ", "))
+    
+    # Identify missing rows for logging
    missing_df <- df %>%
-      filter(across(all_of(variables), ~ is.na(.)))
+      filter(if_any(all_of(variables), ~ is.na(.)))
    
    if (verbose && nrow(missing_df) > 0) {
      message("Missing data for variables ", paste(variables, collapse = ", "), ":")
@@ -866,16 +881,18 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
    
    # Keep only rows where all specified variables are not missing
    df <- df %>%
-      filter(across(all_of(variables), ~ !is.na(.)))
+      filter(if_all(all_of(variables), ~ !is.na(.)))
  }
  
-  # Filter data outside of y-limits (for plotting)
+  # Apply Limits from 'limits_map' if Provided
  if (!is.null(limits_map)) {
    for (variable in names(limits_map)) {
      if (variable %in% variables) {
        ylim_vals <- limits_map[[variable]]
        
-        # Identify out-of-range data
+        if (verbose) message("Applying limits for variable ", variable, ": [", ylim_vals[1], ", ", ylim_vals[2], "].")
+        
+        # Identify out-of-range data for logging
        out_of_range_df <- df %>%
          filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
        
@@ -891,6 +908,7 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
    }
  }
  
+  # Calculate Rank Columns if 'rank' is TRUE
  if (rank) {
    if (verbose) message("Calculating rank columns for variables: ", paste(variables, collapse = ", "))
    
@@ -898,21 +916,20 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
    df <- df %>%
      mutate(
        # Rank based on Avg_Zscore_
-        across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_{.col}"),
+        across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_Avg_Zscore_{.col}"),
        # Rank_lm based on Z_lm_
-        across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_{.col}")
+        across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_Z_lm_{.col}")
      )
    
-    # Rename the newly created rank columns to match desired names
-    for (variable in variables) {
-      old_rank_col <- paste0("Rank_Avg_Zscore_", variable)
-      new_rank_col <- paste0("Rank_", variable)
-      df <- df %>% rename(!!new_rank_col := !!sym(old_rank_col))
+    # Prepare a named vector for renaming columns: new_name = old_name
+    rename_vector <- c(
+      setNames(paste0("Rank_", variables), paste0("Rank_Avg_Zscore_", avg_zscore_cols)),
+      setNames(paste0("Rank_lm_", variables), paste0("Rank_lm_Z_lm_", z_lm_cols))
+    )
    
-      old_rank_lm_col <- paste0("Rank_lm_Z_lm_", variable)
-      new_rank_lm_col <- paste0("Rank_lm_", variable)
-      df <- df %>% rename(!!new_rank_lm_col := !!sym(old_rank_lm_col))
-    }
+    # Rename the rank columns in a single step
+    df <- df %>%
+      rename(!!!rename_vector)
  }
  
  return(df)