Vectorized data filtering
This commit is contained in:
@@ -826,12 +826,21 @@ generate_correlation_plot_configs <- function(df) {
|
||||
filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE,
|
||||
rank = FALSE, limits_map = NULL, verbose = TRUE) {
|
||||
|
||||
# Precompute column names for efficiency
|
||||
# Precompute Column Names for Efficiency
|
||||
avg_zscore_cols <- paste0("Avg_Zscore_", variables)
|
||||
z_lm_cols <- paste0("Z_lm_", variables)
|
||||
|
||||
# # Optional: Validate that the expected columns exist in the dataframe
|
||||
# expected_cols <- c(avg_zscore_cols, z_lm_cols, variables)
|
||||
# missing_cols <- setdiff(expected_cols, names(df))
|
||||
# if (length(missing_cols) > 0) {
|
||||
# stop("The following expected columns are missing from the dataframe: ",
|
||||
# paste(missing_cols, collapse = ", "))
|
||||
# }
|
||||
|
||||
# Adjust NAs if 'adjust' is TRUE
|
||||
if (adjust) {
|
||||
if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns")
|
||||
if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns.")
|
||||
df <- df %>%
|
||||
mutate(
|
||||
across(all_of(avg_zscore_cols), ~ replace_na(., 0.001)),
|
||||
@@ -839,10 +848,13 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
|
||||
)
|
||||
}
|
||||
|
||||
# Filter non-finite values
|
||||
# Filter Non-Finite Values if 'nf' is TRUE
|
||||
if (nf) {
|
||||
if (verbose) message("Filtering non-finite values for variables: ", paste(variables, collapse = ", "))
|
||||
|
||||
# Identify non-finite rows for logging
|
||||
non_finite_df <- df %>%
|
||||
filter(across(all_of(variables), ~ !is.finite(.)))
|
||||
filter(if_any(all_of(variables), ~ !is.finite(.)))
|
||||
|
||||
if (verbose && nrow(non_finite_df) > 0) {
|
||||
message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":")
|
||||
@@ -851,13 +863,16 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
|
||||
|
||||
# Keep only rows where all specified variables are finite
|
||||
df <- df %>%
|
||||
filter(across(all_of(variables), ~ is.finite(.)))
|
||||
filter(if_all(all_of(variables), ~ is.finite(.)))
|
||||
}
|
||||
|
||||
# Filter missing malues
|
||||
# Filter Missing Values if 'missing' is TRUE
|
||||
if (missing) {
|
||||
if (verbose) message("Filtering missing values for variables: ", paste(variables, collapse = ", "))
|
||||
|
||||
# Identify missing rows for logging
|
||||
missing_df <- df %>%
|
||||
filter(across(all_of(variables), ~ is.na(.)))
|
||||
filter(if_any(all_of(variables), ~ is.na(.)))
|
||||
|
||||
if (verbose && nrow(missing_df) > 0) {
|
||||
message("Missing data for variables ", paste(variables, collapse = ", "), ":")
|
||||
@@ -866,16 +881,18 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
|
||||
|
||||
# Keep only rows where all specified variables are not missing
|
||||
df <- df %>%
|
||||
filter(across(all_of(variables), ~ !is.na(.)))
|
||||
filter(if_all(all_of(variables), ~ !is.na(.)))
|
||||
}
|
||||
|
||||
# Filter data outside of y-limits (for plotting)
|
||||
# Apply Limits from 'limits_map' if Provided
|
||||
if (!is.null(limits_map)) {
|
||||
for (variable in names(limits_map)) {
|
||||
if (variable %in% variables) {
|
||||
ylim_vals <- limits_map[[variable]]
|
||||
|
||||
# Identify out-of-range data
|
||||
if (verbose) message("Applying limits for variable ", variable, ": [", ylim_vals[1], ", ", ylim_vals[2], "].")
|
||||
|
||||
# Identify out-of-range data for logging
|
||||
out_of_range_df <- df %>%
|
||||
filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
|
||||
|
||||
@@ -891,6 +908,7 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
|
||||
}
|
||||
}
|
||||
|
||||
# Calculate Rank Columns if 'rank' is TRUE
|
||||
if (rank) {
|
||||
if (verbose) message("Calculating rank columns for variables: ", paste(variables, collapse = ", "))
|
||||
|
||||
@@ -898,21 +916,20 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL
|
||||
df <- df %>%
|
||||
mutate(
|
||||
# Rank based on Avg_Zscore_
|
||||
across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_{.col}"),
|
||||
across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_Avg_Zscore_{.col}"),
|
||||
# Rank_lm based on Z_lm_
|
||||
across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_{.col}")
|
||||
across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_Z_lm_{.col}")
|
||||
)
|
||||
|
||||
# Rename the newly created rank columns to match desired names
|
||||
for (variable in variables) {
|
||||
old_rank_col <- paste0("Rank_Avg_Zscore_", variable)
|
||||
new_rank_col <- paste0("Rank_", variable)
|
||||
df <- df %>% rename(!!new_rank_col := !!sym(old_rank_col))
|
||||
# Prepare a named vector for renaming columns: new_name = old_name
|
||||
rename_vector <- c(
|
||||
setNames(paste0("Rank_", variables), paste0("Rank_Avg_Zscore_", avg_zscore_cols)),
|
||||
setNames(paste0("Rank_lm_", variables), paste0("Rank_lm_Z_lm_", z_lm_cols))
|
||||
)
|
||||
|
||||
old_rank_lm_col <- paste0("Rank_lm_Z_lm_", variable)
|
||||
new_rank_lm_col <- paste0("Rank_lm_", variable)
|
||||
df <- df %>% rename(!!new_rank_lm_col := !!sym(old_rank_lm_col))
|
||||
}
|
||||
# Rename the rank columns in a single step
|
||||
df <- df %>%
|
||||
rename(!!!rename_vector)
|
||||
}
|
||||
|
||||
return(df)
|
||||
|
||||
Reference in New Issue
Block a user