diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 07d068cd..8a8e2959 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -154,20 +154,6 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -# Process either deletion and or reference strain(s) -process_strains <- function(df) { - message("Processing strains") - - df %>% - group_by(conc_num) %>% - mutate( - max_l_theoretical = max(max_L, na.rm = TRUE), - L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), - SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), - L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% - ungroup() -} - # Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { df <- df %>% @@ -202,9 +188,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c total_conc_num <- length(unique(df$conc_num)) num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 - # Pull the background means and standard deviations - print("Calculating background means") - print(head(df)) + # Pull the background means and standard deviations from 0 concentration bg_means <- list( L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(), K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(), @@ -218,19 +202,17 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() ) - # First mutate block to calculate NG, DB, SM, and N - print("Calculating interaction scores part 1") + # Calculate NG, DB, SM, N interaction_scores <- df %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% mutate( NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), - N = sum(!is.na(L)) # Count of non-NA values in L + N = sum(!is.na(L)) ) # Calculate Raw_Shift and Z_Shift for each variable - print("Calculating Raw_Shift and Z_Shift") interaction_scores <- interaction_scores %>% mutate( Raw_Shift_L = mean_L - bg_means$L, @@ -244,7 +226,6 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c ) # Calculate WT and WT_sd for each variable - print("Calculating WT and WT_sd") interaction_scores <- interaction_scores %>% mutate( WT_L = mean(mean_L, na.rm = TRUE), @@ -258,7 +239,6 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c ) # Calculate Exp and Delta for each variable - print("Calculating Exp and Delta") interaction_scores <- interaction_scores %>% mutate( Exp_L = WT_L + Raw_Shift_L, @@ -271,29 +251,38 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c Delta_AUC = WT_AUC - Exp_AUC ) - # Final adjustment to Delta for NG and SM conditions + # Adjust Delta for NG and SM conditions interaction_scores <- interaction_scores %>% mutate( Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), - Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set-to-max values in Z score calculation + Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) ) %>% ungroup() - print("Interaction scores:") - print(head(interaction_scores)) - - # Calculate linear models and interaction scores per gene - print("Calculating interaction scores part 2") + # Calculate linear models and lm_Score for each variable interaction_scores_all <- interaction_scores %>% group_by(across(all_of(group_vars))) %>% mutate( - lm_score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1], - lm_score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1], - lm_score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1], - lm_score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]) %>% + lm_Score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1], + lm_Score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1], + lm_Score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1], + lm_Score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1] + ) + + # Calculate Z_lm for each variable + interaction_scores_all <- interaction_scores_all %>% + mutate( + Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), + Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), + Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), + Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) + ) + + # Calculate Average Z-scores for each variable + interaction_scores_all <- interaction_scores_all %>% mutate( Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs, Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs, @@ -301,15 +290,16 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c Avg_Zscore_AUC = sum(Z_Shift_AUC, na.rm = TRUE) / (total_conc_num - 1) ) - # Arrange the results by Z_lm_L and NG + # Arrange results by Z_lm_L and NG interaction_scores_all <- interaction_scores_all %>% - arrange(desc(lm_score_L)) %>% + arrange(desc(lm_Score_L)) %>% arrange(desc(NG)) %>% ungroup() return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } + generate_and_save_plots <- function(output_dir, file_name, plot_configs) { `%||%` <- function(a, b) if (!is.null(a)) a else b @@ -340,19 +330,14 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs) { width = 0.1) + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) } else if (config$error_bar %||% FALSE) { + # Directly use geom_point and geom_errorbar with pre-calculated values plot <- plot + geom_point(shape = 3, size = 0.2) + geom_errorbar(aes( ymin = !!sym(y_mean_col) - !!sym(y_sd_col), ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + - geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) + - stat_summary(aes( - ymin = !!sym(y_mean_col) - !!sym(y_sd_col), - ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), - fun.data = "identity", geom = "errorbar", width = 0.1) + - stat_summary(aes(y = !!sym(y_mean_col)), - fun.data = "identity", geom = "point", size = 0.6) + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) } } @@ -440,20 +425,21 @@ interaction_plot_configs <- function(df, variables) { ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM)) ) - # Define the configuration for each variable (plot type, limits, labels) - plot_configs[[variable]] <- list( - x_var = "conc_num_factor", + # Append a new plot configuration for each variable + plot_configs[[length(plot_configs) + 1]] <- list( + df = df, + x_var = "Conc_Num_Factor", y_var = delta_var, plot_type = "scatter", title = paste(df$OrfRep[1], df$Gene[1], sep = " "), ylim_vals = ylim_vals, annotations = annotations, error_bar = list( - ymin = 0 - (2 * df[[wt_sd_col]]), - ymax = 0 + (2 * df[[wt_sd_col]]) + ymin = 0 - (2 * df[[wt_sd_col]][1]), + ymax = 0 + (2 * df[[wt_sd_col]][1]) ), - x_breaks = unique(df$conc_num_factor), - x_labels = unique(as.character(df$conc_num)), + x_breaks = unique(df$Conc_Num_Factor), + x_labels = unique(as.character(df$Conc_Num)), x_label = unique(df$Drug[1]) ) } @@ -461,6 +447,9 @@ interaction_plot_configs <- function(df, variables) { return(plot_configs) } + + + correlation_plot_configs <- function(df, lm_list, lm_summaries) { lapply(seq_along(lm_list), function(i) { r_squared <- round(lm_summaries[[i]]$r.squared, 3) @@ -542,9 +531,31 @@ main <- function() { df_na %>% filter(if_all(c(L), is.finite)) } + # Filter data within and outside 2SD + message("Filtering by 2SD of K") + df_na_within_2sd_k <- df_na_stats %>% + filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) + df_na_outside_2sd_k <- df_na_stats %>% + filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) + + # Summary statistics for within and outside 2SD of K + message("Calculating summary statistics for L within 2SD of K") + # TODO We're omitting the original z_max calculation, not sure if needed? + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + l_within_2sd_k_stats <- ss$summary_stats + df_na_l_within_2sd_k_stats <- ss$df_with_stats + message("Calculating summary statistics for L outside 2SD of K") + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + l_outside_2sd_k_stats <- ss$summary_stats + df_na_l_outside_2sd_k_stats <- ss$df_with_stats + # Write CSV files + write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) + write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) + + # Plot configurations # Each list corresponds to a group of plots in the same file - raw_l_vs_k_plots <- list( + l_vs_k_plots <- list( list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", title = "Raw L vs K before QC", color_var = "conc_num", @@ -637,59 +648,33 @@ main <- function() { } } - - - - - - delta_bg_plot_configs <- list( - list(x_var = "delta_bg", y_var = NULL, plot_type = "density", - title = paste("Raw L vs K for strains above delta background threshold of", delta_bg_tolerance, "or above"), - ylim_vals = NULL, - annotate("text", x = L_half_median, y = K_half_median, - label = paste("Strains above delta background tolerance = ", nrow(df_above_tolerance))) - ) - - - + l_outside_2sd_k_plots <- list( + list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter", + title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", + color_var = "conc_num", + legend_position = "right") ) - before_qc_configs <- list( - list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter", - title = "Plate analysis by Drug Conc for Delta Background before QC", - error_bar = TRUE, color_var = "conc_num"), - list(x_var = "scan", y_var = "delta_bg", plot_type = "box", - title = "Plate analysis by Drug Conc for Delta Background before QC", - error_bar = FALSE, color_var = "conc_num") + delta_bg_outside_2sd_k_plots <- list( + list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter", + title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", + color_var = "conc_num", + legend_position = "right") ) - - - - - - - - above_delta_bg_tolerance <- list( - - - - - - - - - ) - - + # Generate and save plots for each QC step + message("Generating QC plots") + generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) + generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) + generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) + generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) + generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) # Print quality control graphs before removing data due to contamination and # adjusting missing data to max theoretical values - before_qc_plot_configs <- list( - # Plate analysis is a quality check to identify plate effects containing anomalies - - ) # list(x_var = "delta_bg", y_var = NULL, plot_type = "density", # title = "Density plot for Delta Background", @@ -744,21 +729,16 @@ main <- function() { ) - # Generate and save plots for each QC step - message("Generating QC plots") - generate_and_save_plots(out_dir_qc, "L_vs_K_before_QC", raw_l_vs_k_plots) - generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) - generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) - generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + - generate_and_save_plots(df, out_dir_qc, "raw_L_vs_K_before_QC.pdf", delta_bg_plots) - generate_and_save_plots(df, out_dir_qc, "plate_analysis", before_qc_plot_configs) - generate_and_save_plots(df_above_tolerance, out_dir_qc, above_tolerance_plot_configs) - generate_and_save_plots(df_na_filtered, out_dir_qc, after_qc_plot_configs) - generate_and_save_plots(df_no_zeros, out_dir_qc, "plate_analysis_no_zeros", no_zeros_plot_configs) + # generate_and_save_plots(df, out_dir_qc, "raw_L_vs_K_before_QC.pdf", delta_bg_plots) + # generate_and_save_plots(df, out_dir_qc, "plate_analysis", before_qc_plot_configs) + # generate_and_save_plots(df_above_tolerance, out_dir_qc, above_tolerance_plot_configs) + # generate_and_save_plots(df_na_filtered, out_dir_qc, after_qc_plot_configs) + # generate_and_save_plots(df_no_zeros, out_dir_qc, "plate_analysis_no_zeros", no_zeros_plot_configs) # Clean up rm(df, df_above_tolerance, df_no_zeros) @@ -766,26 +746,6 @@ main <- function() { # TODO: Originally this filtered L NA's # Let's try to avoid for now since stats have already been calculated - # Filter data within and outside 2SD - message("Filtering by 2SD of K") - df_na_within_2sd_k <- df_na_stats %>% - filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) - df_na_outside_2sd_k <- df_na_stats %>% - filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) - - # Summary statistics for within and outside 2SD of K - message("Calculating summary statistics for L within 2SD of K") - # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_within_2sd_k_stats <- ss$summary_stats - df_na_l_within_2sd_k_stats <- ss$df_with_stats - message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_outside_2sd_k_stats <- ss$summary_stats - df_na_l_outside_2sd_k_stats <- ss$df_with_stats - # Write CSV files - write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) - write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) # Process background strains bg_strains <- c("YDL227C") @@ -818,7 +778,7 @@ main <- function() { #print(head(summary_stats_bg)) # Filter reference and deletion strains - # Formerly X2_RF (reference strain) + # Formerly X2_RF (reference strains) df_reference <- df_na_stats %>% filter(OrfRep == strain) %>% mutate(SM = 0) @@ -827,13 +787,30 @@ main <- function() { df_deletion <- df_na_stats %>% filter(OrfRep != strain) %>% mutate(SM = 0) - - reference_strain <- process_strains(df_reference) # TODO double-check - deletion_strains <- process_strains(df_deletion) # TODO double-check - + + # Set the missing values to the highest theoretical value at each drug conc for L + # Leave other values as 0 for the max/min + reference_strain <- df_reference %>% + group_by(conc_num) %>% + mutate( + max_l_theoretical = max(max_L, na.rm = TRUE), + L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), + SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), + L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% + ungroup() + + # Ditto for deletion strains + deletion_strains <- df_deletion %>% + group_by(conc_num) %>% + mutate( + max_l_theoretical = max(max_L, na.rm = TRUE), + L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), + SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), + L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% + ungroup() + # Calculate interactions variables <- c("L", "K", "r", "AUC") - # We are recalculating some of the data here message("Calculating interaction scores") print("Reference strain:") print(head(reference_strain))