diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index a7f95279..c165b0f1 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -30,7 +30,7 @@ parse_arguments <- function() { "/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", "Experiment 1: Doxo versus HLD", 3, - "/home/bryan/documents/develop/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", + "/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", "Experiment 2: HLD versus Doxo", 3 ) @@ -170,7 +170,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c se = ~ ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)) ), .names = "{.fn}_{.col}") ) - + # Prevent .x and .y suffix issues by renaming columns df_cleaned <- df %>% select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join @@ -336,9 +336,14 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c } generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) { + + message("Generating html and pdf plots for: ", file_name) + plots <- lapply(plot_configs, function(config) { df <- config$df - + + print(head(df)) + # Check if y_var is NULL and adjust the aes mapping aes_mapping <- if (is.null(config$y_var)) { aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var))) @@ -374,7 +379,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la } else if (config$plot_type == "density") { plot <- plot + geom_density() } else if (config$plot_type == "bar") { - plot <- plot + geom_bar(stat = "identity") + plot <- plot + geom_bar() } else { plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE) } @@ -582,35 +587,15 @@ main <- function() { dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) - # Load and process data + message("Loading and filtering data") df <- load_and_process_data(args$easy_results_file, sd = exp_sd) df <- update_gene_names(df, args$sgd_gene_list) - - max_conc <- max(df$conc_num_factor) - - # QC steps and filtering + + # Filter rows that are above tolerance for quality control plots df_above_tolerance <- df %>% filter(DB == 1) - # Calculate the half-medians for `L` and `K` for rows above tolerance - L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 - K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 - - # Get the number of rows that are above tolerance - rows_to_remove <- nrow(df_above_tolerance) - # Set L, r, K, and AUC to NA for rows that are above tolerance - df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .))) - - # Calculate summary statistics for all strains, including both background and the deletions - message("Calculating summary statistics for all strains") - variables <- c("L", "K", "r", "AUC") - ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) - summary_stats <- ss$summary_stats - df_na_stats <- ss$df_with_stats - write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - - # print("Summary stats:") - # print(head(summary_stats), width = 200) + df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .))) # Remove rows with 0 values in L df_no_zeros <- df_na %>% filter(L > 0) @@ -626,34 +611,54 @@ main <- function() { } filter(., if_all(c(L), is.finite)) } + + variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across + group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # fields to group by + + # Set some constants + max_conc <- max(df$conc_num_factor) + l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 + k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 + + message("Calculating summary statistics before quality control") + ss <- calculate_summary_stats(df, variables, group_vars = group_vars) + df_stats <- ss$df_with_stats + + message("Calculating summary statistics after quality control") + ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars) + df_na_ss <- ss$summary_stats + df_na_stats <- ss$df_with_stats + write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) + + message("Calculating summary statistics after quality control excluding zero values") + ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars) + df_no_zeros_stats <- ss$df_with_stats - # Filter data within and outside 2SD message("Filtering by 2SD of K") df_na_within_2sd_k <- df_na_stats %>% filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) df_na_outside_2sd_k <- df_na_stats %>% filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) - # Summary statistics for within and outside 2SD of K message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_within_2sd_k_stats <- ss$summary_stats + l_within_2sd_k_ss <- ss$summary_stats df_na_l_within_2sd_k_stats <- ss$df_with_stats + write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) + message("Calculating summary statistics for L outside 2SD of K") ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_outside_2sd_k_stats <- ss$summary_stats + l_outside_2sd_k_ss <- ss$summary_stats df_na_l_outside_2sd_k_stats <- ss$df_with_stats - # Write CSV files - write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) - write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) + write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) # Plot configurations # Each plots list corresponds to a file message("Generating QC plot configurations") l_vs_k_plots <- list( list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", - title = "Raw L vs K before QC", + title = "Raw L vs K before quality control", color_var = "conc_num", legend_position = "right" ) @@ -664,8 +669,8 @@ main <- function() { title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"), color_var = "conc_num", annotations = list( - x = L_half_median, - y = K_half_median, + x = l_half_median, + y = k_half_median, label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance)) ), error_bar = FALSE, @@ -674,16 +679,16 @@ main <- function() { ) frequency_delta_bg_plots <- list( - list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density", - title = "Density plot for Delta Background by Conc All Data", + list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density", + title = "Plate analysis by Drug Conc for delta background before quality control", color_var = "conc_num", x_label = "Delta Background", y_label = "Density", error_bar = FALSE, legend_position = "right" ), - list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar", - title = "Bar plot for Delta Background by Conc All Data", + list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar", + title = "Plate analysis by Drug Conc for delta background before quality control", color_var = "conc_num", x_label = "Delta Background", y_label = "Count", @@ -698,9 +703,9 @@ main <- function() { for (var in variables) { for (stage in c("before", "after")) { if (stage == "before") { - df_plot <- df + df_plot <- df_stats } else { - df_plot <- df_na # TODO use df_na_filtered if necessary + df_plot <- df_na_stats # TODO use df_na_filtered if necessary } # Set error_bar = TRUE only for scatter plots @@ -726,7 +731,7 @@ main <- function() { # Create the plot configuration plot_config <- list( - df = df_no_zeros, + df = df_no_zeros_stats, x_var = "scan", y_var = var, plot_type = plot_type, @@ -739,7 +744,7 @@ main <- function() { } l_outside_2sd_k_plots <- list( - list(df = df_na_l_outside_2sd_k_stats, x_var = "l", y_var = "K", plot_type = "scatter", + list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter", title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", color_var = "conc_num", legend_position = "right"