Procházet zdrojové kódy

Extend plot filtering

Bryan Roessler před 7 měsíci
rodič
revize
4045b31543
1 změnil soubory, kde provedl 39 přidání a 36 odebrání
  1. 39 36
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 39 - 36
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -579,7 +579,7 @@ generate_interaction_plot_configs <- function(df, variables) {
     AUC = c(-6500, 6500)
   )
 
-  df_filtered <- filter_data_for_plots(df, variables, limits_map)$filtered_data
+  df_filtered <- filter_data_for_plots(df, variables, missing = TRUE, limits_map)
 
   # Define annotation label functions
   generate_annotation_labels <- function(df, var, annotation_name) {
@@ -670,10 +670,11 @@ generate_interaction_plot_configs <- function(df, variables) {
   return(configs)
 }
 
+generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FALSE) {
 
-generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L", "K"), is_lm = FALSE, adjust = FALSE) {
+  df_filtered <- filter_data_for_plots(df, variables, missing = TRUE)
 
-  for (var in interaction_vars) {
+  for (var in variables) {
     avg_zscore_col <- paste0("Avg_Zscore_", var)
     z_lm_col <- paste0("Z_lm_", var)
     rank_col <- paste0("Rank_", var)
@@ -694,7 +695,7 @@ generate_rank_plot_configs <- function(df, interaction_vars, rank_vars = c("L",
   configs <- list()
   
   # Generate plot configurations for rank variables (L and K) with sd bands
-  for (var in rank_vars) {
+  for (var in c("L", "K")) {
     if (is_lm) {
       rank_var <- paste0("Rank_lm_", var)
       zscore_var <- paste0("Z_lm_", var)
@@ -779,53 +780,55 @@ filter_and_print_non_finite <- function(df, vars_to_check, print_vars) {
   non_finite_rows <- df %>% filter(if_any(all_of(vars_to_check), ~ !is.finite(.)))
   
   if (nrow(non_finite_rows) > 0) {
-    message("Removing the following non-finite rows:")
+    message("Filtering non-finite rows:")
     print(non_finite_rows %>% select(all_of(print_vars)), n = 200)
   }
   
   df %>% filter(if_all(all_of(vars_to_check), is.finite))
 }
 
-filter_data_for_plots <- function(df, variables, limits_map = NULL) {
+filter_data_for_plots <- function(df, variables, missing = TRUE, limits_map = NULL) {
 
-  # Initialize lists to store lm lines and filtered data
+  # Initialize lists to store lm lines
   lm_lines <- list()
   
-  # Print out NA and out-of-range data separately
+  # Check for missing and out-of-range data
   for (variable in variables) {
-    # Get y-limits for the variable
-    ylim_vals <- limits_map[[variable]]
-    
-    # Extract precomputed linear model coefficients
-    lm_lines[[variable]] <- list(
-      intercept = df[[paste0("lm_intercept_", variable)]],
-      slope = df[[paste0("lm_slope_", variable)]]
-    )
-    
-    # Convert variable name to symbol for dplyr
     y_var_sym <- sym(variable)
-    
-    # Identify missing data and print it
-    missing_data <- df %>% filter(is.na(!!y_var_sym))
-    if (nrow(missing_data) > 0) {
-      message("Missing data for variable ", variable, ":")
-      print(missing_data)
+
+    # Print missing data if requested
+    if (missing) {
+      missing_data <- df %>% filter(is.na(!!y_var_sym))
+      if (nrow(missing_data) > 0) {
+        message("Filtering missing data for variable ", variable, " for plotting:")
+        print(head(missing_data, 10))  # Print only the first 10 rows to avoid too much output
+      }
     }
-    
-    # Identify out-of-range data and print it
-    out_of_range_data <- df %>% filter(
-      !is.na(!!y_var_sym) &
-        (!!y_var_sym < min(ylim_vals, na.rm = TRUE) | !!y_var_sym > max(ylim_vals, na.rm = TRUE))
-    )
-    if (nrow(out_of_range_data) > 0) {
-      message("Out-of-range data for variable ", variable, ":")
-      print(out_of_range_data)
+
+    # Print out-of-range data if limits_map is provided
+    if (!is.null(limits_map)) {
+      ylim_vals <- limits_map[[variable]]
+      out_of_range_data <- df %>% filter(
+        !is.na(!!y_var_sym) &
+          (!!y_var_sym < min(ylim_vals, na.rm = TRUE) | !!y_var_sym > max(ylim_vals, na.rm = TRUE))
+      )
+      if (nrow(out_of_range_data) > 0) {
+        message("Filtering out-of-range data for variable ", variable, " for plotting:")
+        print(head(out_of_range_data, 10))  # Print only the first 10 rows
+      }
     }
   }
 
-  # Perform all filtering at once for all variables
-  df_filtered <- df %>% filter(across(all_of(variables), ~ !is.na(.))) %>%
-    filter(across(all_of(variables), ~ between(., limits_map[[cur_column()]][1], limits_map[[cur_column()]][2]), .names = "filter_{col}"))
+  # Apply filtering across all variables in one step using if_any and if_all
+  if (!is.null(limits_map)) {
+    df_filtered <- df %>%
+      filter(if_all(all_of(variables), ~ !is.na(.))) %>%
+      filter(if_all(all_of(variables),
+        ~ between(., limits_map[[cur_column()]][1], limits_map[[cur_column()]][2])
+      ))
+  } else {
+    df_filtered <- df %>% filter(if_all(all_of(variables), ~ !is.na(.)))
+  }
 
   return(df_filtered)
 }