diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 2d1877e5..9cbf0184 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -793,9 +793,12 @@ generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALS return(configs) } -generate_correlation_plot_configs <- function(df, variables) { +generate_correlation_plot_configs <- function(df) { + configs <- list() + variables <- c("r", "L", "K", "AUC") + for (variable in variables) { z_lm_var <- paste0("Z_lm_", variable) avg_zscore_var <- paste0("Avg_Zscore_", variable) @@ -821,47 +824,94 @@ generate_correlation_plot_configs <- function(df, variables) { } filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE, - limits_map = NULL, verbose = TRUE) { + rank = FALSE, limits_map = NULL, verbose = TRUE) { - for (variable in variables) { - - avg_zscore_col <- paste0("Avg_Zscore_", var) - z_lm_col <- paste0("Z_lm_", var) - rank_col <- paste0("Rank_", var) - rank_lm_col <- paste0("Rank_lm_", var) - - if (adjust) { - message("Replacing NA with 0.001 for interaction variables") - df[[avg_zscore_col]] <- if_else(is.na(df[[avg_zscore_col]]), 0.001, df[[avg_zscore_col]]) - df[[z_lm_col]] <- if_else(is.na(df[[z_lm_col]]), 0.001, df[[z_lm_col]]) - } - - if (nf) { - non_finite <- df %>% filter(!is.finite(.data[[variable]])) - if (verbose && nrow(non_finite) > 0) { - message("Non-finite rows for variable ", variable, ":") - print(non_finite) - } - df <- df %>% filter(is.finite(.data[[variable]])) + # Precompute column names for efficiency + avg_zscore_cols <- paste0("Avg_Zscore_", variables) + z_lm_cols <- paste0("Z_lm_", variables) + + if (adjust) { + if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns") + df <- df %>% + mutate( + across(all_of(avg_zscore_cols), ~ replace_na(., 0.001)), + across(all_of(z_lm_cols), ~ replace_na(., 0.001)) + ) + } + + # Filter non-finite values + if (nf) { + non_finite_df <- df %>% + filter(across(all_of(variables), ~ !is.finite(.))) + + if (verbose && nrow(non_finite_df) > 0) { + message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":") + print(non_finite_df) } - if (missing) { - missing_data <- df %>% filter(is.na(.data[[variable]])) - if (verbose && nrow(missing_data) > 0) { - message("Missing data for variable ", variable, ":") - print(missing_data) - } - df <- df %>% filter(!is.na(.data[[variable]])) + # Keep only rows where all specified variables are finite + df <- df %>% + filter(across(all_of(variables), ~ is.finite(.))) + } + + # Filter missing malues + if (missing) { + missing_df <- df %>% + filter(across(all_of(variables), ~ is.na(.))) + + if (verbose && nrow(missing_df) > 0) { + message("Missing data for variables ", paste(variables, collapse = ", "), ":") + print(missing_df) } - if (!is.null(limits_map) && !is.null(limits_map[[variable]])) { - ylim_vals <- limits_map[[variable]] - out_of_range_data <- df %>% filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2]) - if (verbose && nrow(out_of_range_data) > 0) { - message("Out-of-range data for variable ", variable, ":") - print(out_of_range_data) + # Keep only rows where all specified variables are not missing + df <- df %>% + filter(across(all_of(variables), ~ !is.na(.))) + } + + # Filter data outside of y-limits (for plotting) + if (!is.null(limits_map)) { + for (variable in names(limits_map)) { + if (variable %in% variables) { + ylim_vals <- limits_map[[variable]] + + # Identify out-of-range data + out_of_range_df <- df %>% + filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2]) + + if (verbose && nrow(out_of_range_df) > 0) { + message("Out-of-range data for variable ", variable, ":") + print(out_of_range_df) + } + + # Keep only rows within the specified limits + df <- df %>% + filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2]) } - df <- df %>% filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2]) + } + } + + if (rank) { + if (verbose) message("Calculating rank columns for variables: ", paste(variables, collapse = ", ")) + + # Create Rank and Rank_lm columns using mutate and across + df <- df %>% + mutate( + # Rank based on Avg_Zscore_ + across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_{.col}"), + # Rank_lm based on Z_lm_ + across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_{.col}") + ) + + # Rename the newly created rank columns to match desired names + for (variable in variables) { + old_rank_col <- paste0("Rank_Avg_Zscore_", variable) + new_rank_col <- paste0("Rank_", variable) + df <- df %>% rename(!!new_rank_col := !!sym(old_rank_col)) + + old_rank_lm_col <- paste0("Rank_lm_Z_lm_", variable) + new_rank_lm_col <- paste0("Rank_lm_", variable) + df <- df %>% rename(!!new_rank_lm_col := !!sym(old_rank_lm_col)) } } @@ -1254,10 +1304,14 @@ main <- function() { file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE) message("Generating rank plots") - zscores_interactions_joined_filtered <- filter_data(zscores_interactions_joined, variables, missing = TRUE, adjust = TRUE) + zscores_interactions_joined_filtered <- filter_data( + zscores_interactions_joined, + variables, + missing = TRUE, + adjust = TRUE, + rank = TRUE) rank_plot_configs <- generate_rank_plot_configs( df = zscores_interactions_joined_filtered, - variables = interaction_vars, is_lm = FALSE, adjust = TRUE ) @@ -1267,7 +1321,6 @@ main <- function() { message("Generating ranked linear model plots") rank_lm_plot_configs <- generate_rank_plot_configs( df = zscores_interactions_joined_filtered, - variables = interaction_vars, is_lm = TRUE, adjust = TRUE ) @@ -1276,7 +1329,7 @@ main <- function() { message("Filtering and reranking plots") # Formerly X_NArm - zscores_interactions_filtered <- zscores_interactions %>% + zscores_interactions_filtered <- zscores_interactions_joined %>% group_by(across(all_of(orf_group_vars))) %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% ungroup() %>% @@ -1299,13 +1352,24 @@ main <- function() { ) %>% ungroup() - message("Generating filtered ranked plots") + # Re-rank + zscores_interactions_filtered <- filter_data( + df = zscores_interactions_filtered, + variables = interaction_vars, + missing = TRUE, # TODO what I'm currently having issues with + rank = TRUE + ) + rank_plot_filtered_configs <- generate_rank_plot_configs( df = zscores_interactions_filtered, is_lm = FALSE, adjust = FALSE ) - generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_na_rm", + + message("Generating filtered ranked plots") + generate_and_save_plots( + output_dir = out_dir, + file_name = "RankPlots_na_rm", plot_configs = rank_plot_filtered_configs, grid_layout = list(ncol = 3, nrow = 2)) @@ -1315,13 +1379,17 @@ main <- function() { is_lm = TRUE, adjust = FALSE ) - generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm_na_rm", + generate_and_save_plots( + output_dir = out_dir, + file_name = "RankPlots_lm_na_rm", plot_configs = rank_plot_lm_filtered_configs, grid_layout = list(ncol = 3, nrow = 2)) message("Generating correlation plots") - correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars) - generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm", + correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered) + generate_and_save_plots( + output_dir = out_dir, + file_name = "Avg_Zscore_vs_lm_NA_rm", plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2)) })