diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 75df759d..c5350880 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -216,7 +216,16 @@ calculate_summary_stats <- function(df, variables, group_vars) { return(list(summary_stats = summary_stats, df_with_stats = df_joined)) } -calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_threshold = 2) { +calculate_interaction_scores <- function(df, bg_df, group_vars, overlap_threshold = 2) { + + bg_df_selected <- bg_df %>% + select(OrfRep, conc_num, conc_num_factor, conc_num_factor_factor, + mean_L, mean_K, mean_r, mean_AUC, sd_L, sd_K, sd_r, sd_AUC + ) + + df <- df %>% + left_join(bg_df_selected, by = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"), + suffix = c("", "_bg")) # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) @@ -229,16 +238,26 @@ calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_thres DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, + + # Assign WT values from the background data + WT_L = mean_L_bg, + WT_K = mean_K_bg, + WT_r = mean_r_bg, + WT_AUC = mean_AUC_bg, + WT_sd_L = sd_L_bg, + WT_sd_K = sd_K_bg, + WT_sd_r = sd_r_bg, + WT_sd_AUC = sd_AUC_bg, # Calculate raw data - Raw_Shift_L = first(mean_L) - bg_stats$mean_L, - Raw_Shift_K = first(mean_K) - bg_stats$mean_K, - Raw_Shift_r = first(mean_r) - bg_stats$mean_r, - Raw_Shift_AUC = first(mean_AUC) - bg_stats$mean_AUC, - Z_Shift_L = Raw_Shift_L / bg_stats$sd_L, - Z_Shift_K = Raw_Shift_K / bg_stats$sd_K, - Z_Shift_r = Raw_Shift_r / bg_stats$sd_r, - Z_Shift_AUC = Raw_Shift_AUC / bg_stats$sd_AUC, + Raw_Shift_L = first(mean_L) - first(mean_L_bg), + Raw_Shift_K = first(mean_K) - first(mean_K_bg), + Raw_Shift_r = first(mean_r) - first(mean_r_bg), + Raw_Shift_AUC = first(mean_AUC) - first(mean_AUC_bg), + Z_Shift_L = Raw_Shift_L / first(sd_L_bg), + Z_Shift_K = Raw_Shift_K / first(sd_K_bg), + Z_Shift_r = Raw_Shift_r / first(sd_r_bg), + Z_Shift_AUC = Raw_Shift_AUC / first(sd_AUC_bg), # Expected values Exp_L = WT_L + Raw_Shift_L, @@ -1073,59 +1092,12 @@ main <- function() { dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) + # Each list of plots corresponds to a separate file message("Loading and filtering data for experiment: ", exp_name) df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>% update_gene_names(args$sgd_gene_list) %>% as_tibble() - message("Calculating summary statistics before quality control") - df_stats <- calculate_summary_stats( - df = df, - variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats - - message("Calculating summary statistics after quality control") - df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X - ss <- calculate_summary_stats( - df = df_na, - variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) - df_na_ss <- ss$summary_stats - df_na_stats <- ss$df_with_stats # formerly X_stats_ALL - write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) - # For plotting (ggplot warns on NAs) - df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite)) - - message("Calculating summary statistics after quality control excluding zero values") - df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero - df_no_zeros_stats <- calculate_summary_stats( - df = df_no_zeros, - variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor") - )$df_with_stats - - message("Filtering by 2SD of K") - df_na_within_2sd_k <- df_na_stats %>% - filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) - df_na_outside_2sd_k <- df_na_stats %>% - filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) - - message("Calculating summary statistics for L within 2SD of K") - # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats - write.csv(ss, - file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), - row.names = FALSE) - - message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) - df_na_l_outside_2sd_k_stats <- ss$df_with_stats - write.csv(ss$summary_stats, - file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), - row.names = FALSE) - - # Each list of plots corresponds to a file l_vs_k_plot_configs <- list( plots = list( list( @@ -1142,6 +1114,12 @@ main <- function() { ) ) + message("Calculating summary statistics before quality control") + df_stats <- calculate_summary_stats( + df = df, + variables = c("L", "K", "r", "AUC", "delta_bg"), + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats + frequency_delta_bg_plot_configs <- list( plots = list( list( @@ -1171,7 +1149,9 @@ main <- function() { ) ) + message("Filtering rows above delta background tolerance for plotting") df_above_tolerance <- df %>% filter(DB == 1) + above_threshold_plot_configs <- list( plots = list( list( @@ -1196,6 +1176,49 @@ main <- function() { ) ) ) + + message("Setting rows above delta background tolerance to NA") + df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X + + message("Calculating summary statistics across all strains") + ss <- calculate_summary_stats( + df = df_na, + variables = c("L", "K", "r", "AUC", "delta_bg"), + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) + df_na_ss <- ss$summary_stats + df_na_stats <- ss$df_with_stats # formerly X_stats_ALL + write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) + # This can help bypass missing values ggplot warnings during testing + df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite)) + + message("Calculating summary statistics excluding zero values") + df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero + df_no_zeros_stats <- calculate_summary_stats( + df = df_no_zeros, + variables = c("L", "K", "r", "AUC", "delta_bg"), + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor") + )$df_with_stats + + message("Filtering by 2SD of K") + df_na_within_2sd_k <- df_na_stats %>% + filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) + df_na_outside_2sd_k <- df_na_stats %>% + filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) + + message("Calculating summary statistics for L within 2SD of K") + # TODO We're omitting the original z_max calculation, not sure if needed? + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats + write.csv(ss, + file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), + row.names = FALSE) + + message("Calculating summary statistics for L outside 2SD of K") + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) + df_na_l_outside_2sd_k_stats <- ss$df_with_stats + write.csv(ss$summary_stats, + file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), + row.names = FALSE) plate_analysis_plot_configs <- generate_plate_analysis_plot_configs( variables = c("L", "K", "r", "AUC", "delta_bg"), @@ -1303,7 +1326,6 @@ main <- function() { bg_strains <- c("YDL227C") lapply(bg_strains, function(strain) { - message("Processing background strain: ", strain) # Handle missing data by setting zero values to NA @@ -1318,19 +1340,18 @@ main <- function() { ) %>% filter(!is.na(L)) - # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), + ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor")) summary_stats_bg <- ss_bg$summary_stats - ss_bg_stats <- ss_bg$df_with_stats - write.csv(summary_stats_bg, + df_bg_stats <- ss_bg$df_with_stats + write.csv( + summary_stats_bg, file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")), row.names = FALSE) - # Set the missing values to the highest theoretical value at each drug conc for L - # Leave other values as 0 for the max/min - df_reference <- df_bg_stats %>% # formerly X2_RF + message("Setting missing reference values to the highest theoretical value at each drug conc for L") + df_reference <- df_na_stats %>% # formerly X2_RF filter(OrfRep == strain) %>% filter(!is.na(L)) %>% group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% @@ -1341,11 +1362,21 @@ main <- function() { L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() - # Ditto for deletion strains + message("Calculating reference strain interaction scores") + df_reference_stats <- calculate_summary_stats( + df = df_reference, + variables = c("L", "K", "r", "AUC"), + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") + )$df_with_stats + reference_results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "num")) + zscore_calculations_reference <- reference_results$calculations + zscore_interactions_reference <- reference_results$interactions + zscore_interactions_reference_joined <- reference_results$full_data + + message("Setting missing deletion values to the highest theoretical value at each drug conc for L") df_deletion <- df_na_stats %>% # formerly X2 filter(OrfRep != strain) %>% filter(!is.na(L)) %>% - mutate(SM = 0) %>% group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), @@ -1354,24 +1385,13 @@ main <- function() { L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() - message("Calculating reference strain interaction scores") - df_reference_stats <- calculate_summary_stats( - df = df_reference, - variables = c("L", "K", "r", "AUC"), - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") - )$df_with_stats - reference_results <- calculate_interaction_scores(df_reference_stats, bg_stats, group_vars = c("OrfRep", "Gene", "num")) - zscore_calculations_reference <- reference_results$calculations - zscore_interactions_reference <- reference_results$interactions - zscore_interactions_reference_joined <- reference_results$full_data - message("Calculating deletion strain(s) interactions scores") df_deletion_stats <- calculate_summary_stats( df = df_deletion, variables = c("L", "K", "r", "AUC"), group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") )$df_with_stats - deletion_results <- calculate_interaction_scores(df_deletion_stats, bg_stats, group_vars = c("OrfRep", "Gene")) + deletion_results <- calculate_interaction_scores(df_deletion_stats, df_bg_stats, group_vars = c("OrfRep", "Gene")) zscore_calculations <- deletion_results$calculations zscore_interactions <- deletion_results$interactions zscore_interactions_joined <- deletion_results$full_data