diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index d1bfce85..4f9a4899 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -669,7 +669,7 @@ generate_interaction_plot_configs <- function(df, variables) { AUC = c(-6500, 6500) ) - df_filtered <- filter_data(df, variables, missing = TRUE, limits_map = limits_map) + df_filtered <- process_data(df, variables, filter_na = TRUE, limits_map = limits_map) # Define annotation label functions generate_annotation_labels <- function(df, var, annotation_name) { @@ -836,7 +836,7 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { if (plot_type == "Avg_Zscore_vs_lm") { x_var <- paste0("Avg_Zscore_", variable) y_var <- paste0("Z_lm_", variable) - title_suffix <- paste("Avg Zscore vs lm", variable) + title <- paste("Avg Zscore vs lm", variable) rectangles <- list( list(xmin = -2, xmax = 2, ymin = -2, ymax = 2, fill = NA, color = "grey20", alpha = 0.1 @@ -845,9 +845,12 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { } else { x_var <- paste0("Rank_", variable) y_var <- paste0("Rank_lm_", variable) - title_suffix <- paste("Rank Avg Zscore vs lm", variable) + title <- paste("Rank Avg Zscore vs lm", variable) rectangles <- NULL } + + print(summary(df_filtered[[x_var]])) + print(summary(df_filtered[[y_var]])) # Fit linear model lm_model <- lm(as.formula(paste(y_var, "~", x_var)), data = df_filtered) @@ -862,7 +865,7 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { x_var = x_var, y_var = y_var, plot_type = "scatter", - title = title_suffix, + title = title, annotations = list( list( x = 0, @@ -936,36 +939,26 @@ generate_correlation_plot_configs <- function(df) { return(configs) } -filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE, - rank = FALSE, limits_map = NULL, verbose = TRUE) { +process_data <- function(df, variables, filter_nf = FALSE, filter_na = FALSE, adjust = FALSE, + rank = FALSE, limits_map = NULL) { avg_zscore_cols <- paste0("Avg_Zscore_", variables) z_lm_cols <- paste0("Z_lm_", variables) - # Step 1: Adjust NAs to 0.001 for linear model (if adjust = TRUE) - if (adjust) { - if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns") - df <- df %>% - mutate( - across(all_of(avg_zscore_cols), ~ ifelse(is.na(.), 0.001, .)), - across(all_of(z_lm_cols), ~ ifelse(is.na(.), 0.001, .)) - ) - } - - # Filter non-finite values - if (nf) { + if (filter_nf) { + message("Filtering non-finite values") df <- df %>% filter(if_all(all_of(variables), ~ is.finite(.))) } - # Filter missing values - if (missing) { + if (filter_na) { + message("Filtering NA values") df <- df %>% filter(if_all(all_of(variables), ~ !is.na(.))) } - # Apply limits from 'limits_map' if provided if (!is.null(limits_map)) { + message("Filtering data outside y-limits (for plotting)") for (variable in names(limits_map)) { if (variable %in% variables) { ylim_vals <- limits_map[[variable]] @@ -975,19 +968,23 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL } } + if (adjust) { + message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns for ranks") + df <- df %>% + mutate( + across(all_of(avg_zscore_cols), ~ ifelse(is.na(.), 0.001, .)), + across(all_of(z_lm_cols), ~ ifelse(is.na(.), 0.001, .)) + ) + } + # Calculate and add rank columns + # TODO probably should be moved to separate function if (rank) { - if (verbose) message("Calculating ranks for variable(s): ", paste(variables, collapse = ", ")) - - for (variable in variables) { - avg_zscore_col <- paste0("Avg_Zscore_", variable) - rank_col <- paste0("Rank_", variable) - df[[rank_col]] <- rank(df[[avg_zscore_col]], na.last = "keep") - - z_lm_col <- paste0("Z_lm_", variable) - rank_lm_col <- paste0("Rank_lm_", variable) - df[[rank_lm_col]] <- rank(df[[z_lm_col]], na.last = "keep") - } + message("Calculating ranks for Avg_Zscore_ and Z_lm_ columns") + df <- df %>% + mutate(across(all_of(avg_zscore_cols), rank, .names = "Rank_{.col}", na.last = "keep")) + df <- df %>% + mutate(across(all_of(z_lm_cols), rank, .names = "Rank_lm_{.col}", na.last = "keep")) } return(df) @@ -1030,7 +1027,7 @@ main <- function() { group_vars = c("conc_num", "conc_num_factor")) df_stats <- ss$df_with_stats message("Filtering non-finite data") - df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE) + df_filtered_stats <- process_data(df_stats, c("L"), filter_nf = TRUE) message("Calculating summary statistics after quality control") ss <- calculate_summary_stats( @@ -1040,7 +1037,7 @@ main <- function() { df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) - df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE) + df_na_filtered_stats <- process_data(df_na_stats, c("L"), filter_nf = TRUE) message("Calculating summary statistics after quality control excluding zero values") ss <- calculate_summary_stats( @@ -1048,7 +1045,7 @@ main <- function() { variables = summary_vars, group_vars = c("conc_num", "conc_num_factor")) df_no_zeros_stats <- ss$df_with_stats - df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE) + df_no_zeros_filtered_stats <- process_data(df_no_zeros_stats, c("L"), filter_nf = TRUE) message("Filtering by 2SD of K") df_na_within_2sd_k <- df_na_stats %>% @@ -1337,14 +1334,14 @@ main <- function() { file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE) message("Generating rank plots") - zscores_interactions_joined_filtered <- filter_data( + # Formerly InteractionScores_AdjustMissing + zscores_interactions_joined_ranked <- process_data( df = zscores_interactions_joined, variables = interaction_vars, - missing = TRUE, adjust = TRUE, rank = TRUE) rank_plot_configs <- generate_rank_plot_configs( - df = zscores_interactions_joined_filtered, + df = zscores_interactions_joined_ranked, variables = interaction_vars, is_lm = FALSE ) @@ -1353,7 +1350,7 @@ main <- function() { message("Generating ranked linear model plots") rank_lm_plot_configs <- generate_rank_plot_configs( - df = zscores_interactions_joined_filtered, + df = zscores_interactions_joined_ranked, variables = interaction_vars, is_lm = TRUE ) @@ -1361,17 +1358,10 @@ main <- function() { plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2)) message("Filtering and reranking plots") - # Filter out rows where both Z_lm_L and Avg_Zscore_L are NA - zscores_interactions_filtered <- zscores_interactions_joined %>% - filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) - # Formerly X_NArm - zscores_interactions_filtered <- zscores_interactions_filtered %>% + zscores_interactions_filtered <- zscores_interactions_joined %>% + filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% mutate( - lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA, - lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA, - lm_R_squared_r = if (n() > 1) summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared else NA, - lm_R_squared_AUC = if (n() > 1) summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared else NA, Overlap = case_when( Z_lm_L >= 2 & Avg_Zscore_L >= 2 ~ "Deletion Enhancer Both", Z_lm_L <= -2 & Avg_Zscore_L <= -2 ~ "Deletion Suppressor Both", @@ -1382,14 +1372,18 @@ main <- function() { Z_lm_L >= 2 & Avg_Zscore_L <= -2 ~ "Deletion Enhancer lm, Deletion Suppressor Avg Z score", Z_lm_L <= -2 & Avg_Zscore_L >= 2 ~ "Deletion Suppressor lm, Deletion Enhancer Avg Z score", TRUE ~ "No Effect" - ) + ), + lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared, + lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared, + lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared, + lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared ) # Re-rank - zscores_interactions_filtered <- filter_data( + zscores_interactions_filtered <- process_data( df = zscores_interactions_filtered, variables = interaction_vars, - missing = TRUE, # TODO what I'm currently having issues with + filter_na = TRUE, # TODO what I'm currently having issues with rank = TRUE )