From 0836dc70d2cbdf1b9d480d1b94a7f691d552c668 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 25 Sep 2024 01:08:32 -0400 Subject: [PATCH] Improve NA filtering --- .../apps/r/calculate_interaction_zscores.R | 75 ++++++++----------- 1 file changed, 32 insertions(+), 43 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index bec28dd6..987aa1a3 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -134,10 +134,9 @@ load_and_process_data <- function(easy_results_file, sd = 3) { DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0), SM = 0, OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded? - conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)) - ) %>% - mutate( - conc_num_factor = as.factor(match(conc_num, sort(unique(conc_num))) - 1) + conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), + conc_num_factor = factor(as.numeric(factor(conc_num)) - 1), + conc_num_factor_num = as.numeric(conc_num_factor) ) return(df) @@ -169,8 +168,7 @@ calculate_summary_stats <- function(df, variables, group_vars) { group_by(across(all_of(group_vars))) %>% summarise( N = n(), - across( - all_of(variables), + across(all_of(variables), list( mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE), @@ -208,10 +206,10 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, variables = c(" num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, # Calculate raw data - Raw_Shift_L = first(mean_L) - bg_stats$L, - Raw_Shift_K = first(mean_K) - bg_stats$K, - Raw_Shift_r = first(mean_r) - bg_stats$r, - Raw_Shift_AUC = first(mean_AUC) - bg_stats$AUC, + Raw_Shift_L = first(mean_L) - bg_stats$mean_L, + Raw_Shift_K = first(mean_K) - bg_stats$mean_K, + Raw_Shift_r = first(mean_r) - bg_stats$mean_r, + Raw_Shift_AUC = first(mean_AUC) - bg_stats$mean_AUC, Z_Shift_L = first(Raw_Shift_L) / bg_stats$sd_L, Z_Shift_K = first(Raw_Shift_K) / bg_stats$sd_K, Z_Shift_r = first(Raw_Shift_r) / bg_stats$sd_r, @@ -237,10 +235,10 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, variables = c(" Zscore_AUC = Delta_AUC / WT_sd_AUC, # Fit linear models and store in list-columns - gene_lm_L = list(lm(Delta_L ~ conc_num_factor, data = pick(everything()))), - gene_lm_K = list(lm(Delta_K ~ conc_num_factor, data = pick(everything()))), - gene_lm_r = list(lm(Delta_r ~ conc_num_factor, data = pick(everything()))), - gene_lm_AUC = list(lm(Delta_AUC ~ conc_num_factor, data = pick(everything()))), + gene_lm_L = list(lm(Delta_L ~ conc_num_factor_num, data = pick(everything()))), + gene_lm_K = list(lm(Delta_K ~ conc_num_factor_num, data = pick(everything()))), + gene_lm_r = list(lm(Delta_r ~ conc_num_factor_num, data = pick(everything()))), + gene_lm_AUC = list(lm(Delta_AUC ~ conc_num_factor_num, data = pick(everything()))), # Extract coefficients using purrr::map_dbl lm_intercept_L = map_dbl(gene_lm_L, ~ coef(.x)[1]), @@ -1040,7 +1038,7 @@ main <- function() { df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero # Save some constants - max_conc <- max(as.numeric(df$conc_num_factor)) + max_conc <- max(df$conc_num_factor_num) l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 @@ -1050,7 +1048,6 @@ main <- function() { variables = summary_vars, group_vars = c("conc_num", "conc_num_factor"))$df_with_stats message("Filtering non-finite data") - # df_filtered_stats <- process_data(df_stats, c("L"), filter_nf = TRUE) message("Calculating summary statistics after quality control") ss <- calculate_summary_stats( @@ -1060,9 +1057,7 @@ main <- function() { df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) - # df_na_filtered_stats <- process_data(df_na_stats, c("L"), filter_nf = TRUE) - # Create background (WT) data columns df_na_stats <- df_na_stats %>% mutate( WT_L = mean_L, @@ -1079,10 +1074,10 @@ main <- function() { bg_stats <- df_na_stats %>% filter(conc_num == 0) %>% summarise( - L = first(mean_L), - K = first(mean_K), - r = first(mean_r), - AUC = first(mean_AUC), + mean_L = first(mean_L), + mean_K = first(mean_K), + mean_r = first(mean_r), + mean_AUC = first(mean_AUC), sd_L = first(sd_L), sd_K = first(sd_K), sd_r = first(sd_r), @@ -1090,12 +1085,11 @@ main <- function() { ) message("Calculating summary statistics after quality control excluding zero values") - ss <- calculate_summary_stats( + df_no_zeros_stats <- calculate_summary_stats( df = df_no_zeros, variables = summary_vars, - group_vars = c("conc_num", "conc_num_factor")) - df_no_zeros_stats <- ss$df_with_stats - # df_no_zeros_filtered_stats <- process_data(df_no_zeros_stats, c("L"), filter_nf = TRUE) + group_vars = c("conc_num", "conc_num_factor") + )$df_with_stats message("Filtering by 2SD of K") df_na_within_2sd_k <- df_na_stats %>% @@ -1105,9 +1099,8 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - # df_na_l_within_2sd_k_stats <- ss$df_with_stats - write.csv(ss$summary_stats, + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))$summary_stats + write.csv(ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) @@ -1293,28 +1286,24 @@ main <- function() { file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - # Filter reference and deletion strains - df_reference <- df_na_stats %>% # formerly X2_RF - filter(OrfRep == strain) %>% - mutate(SM = 0) - - df_deletion <- df_na_stats %>% # formerly X2 - filter(OrfRep != strain) %>% - mutate(SM = 0) - # Set the missing values to the highest theoretical value at each drug conc for L # Leave other values as 0 for the max/min - reference_strain <- df_reference %>% + df_reference <- df_na_stats %>% # formerly X2_RF + filter(OrfRep == strain) %>% + filter(!is.na(L)) %>% group_by(conc_num, conc_num_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), - SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), + SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, 0), L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() # Ditto for deletion strains - deletion_strains <- df_deletion %>% + df_deletion <- df_na_stats %>% # formerly X2 + filter(OrfRep != strain) %>% + filter(!is.na(L)) %>% + mutate(SM = 0) %>% group_by(conc_num, conc_num_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), @@ -1325,7 +1314,7 @@ main <- function() { message("Calculating reference strain interaction scores") df_reference_stats <- calculate_summary_stats( - df = reference_strain, + df = df_reference, variables = interaction_vars, group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") )$df_with_stats @@ -1336,7 +1325,7 @@ main <- function() { message("Calculating deletion strain(s) interactions scores") df_deletion_stats <- calculate_summary_stats( - df = deletion_strains, + df = df_deletion, variables = interaction_vars, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") )$df_with_stats