|
@@ -30,7 +30,7 @@ parse_arguments <- function() {
|
|
|
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
|
|
"Experiment 1: Doxo versus HLD",
|
|
|
3,
|
|
|
- "/home/bryan/documents/develop/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
|
|
+ "/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
|
|
"Experiment 2: HLD versus Doxo",
|
|
|
3
|
|
|
)
|
|
@@ -170,7 +170,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
|
|
|
se = ~ ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1))
|
|
|
), .names = "{.fn}_{.col}")
|
|
|
)
|
|
|
-
|
|
|
+
|
|
|
# Prevent .x and .y suffix issues by renaming columns
|
|
|
df_cleaned <- df %>%
|
|
|
select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
|
|
@@ -336,9 +336,14 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
|
|
}
|
|
|
|
|
|
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
|
|
|
+
|
|
|
+ message("Generating html and pdf plots for: ", file_name)
|
|
|
+
|
|
|
plots <- lapply(plot_configs, function(config) {
|
|
|
df <- config$df
|
|
|
-
|
|
|
+
|
|
|
+ print(head(df))
|
|
|
+
|
|
|
# Check if y_var is NULL and adjust the aes mapping
|
|
|
aes_mapping <- if (is.null(config$y_var)) {
|
|
|
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
|
|
@@ -374,7 +379,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
|
|
} else if (config$plot_type == "density") {
|
|
|
plot <- plot + geom_density()
|
|
|
} else if (config$plot_type == "bar") {
|
|
|
- plot <- plot + geom_bar(stat = "identity")
|
|
|
+ plot <- plot + geom_bar()
|
|
|
} else {
|
|
|
plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE)
|
|
|
}
|
|
@@ -582,35 +587,15 @@ main <- function() {
|
|
|
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
- # Load and process data
|
|
|
+ message("Loading and filtering data")
|
|
|
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
|
|
df <- update_gene_names(df, args$sgd_gene_list)
|
|
|
-
|
|
|
- max_conc <- max(df$conc_num_factor)
|
|
|
-
|
|
|
- # QC steps and filtering
|
|
|
- df_above_tolerance <- df %>% filter(DB == 1)
|
|
|
-
|
|
|
- # Calculate the half-medians for `L` and `K` for rows above tolerance
|
|
|
- L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
|
|
- K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
|
|
|
|
- # Get the number of rows that are above tolerance
|
|
|
- rows_to_remove <- nrow(df_above_tolerance)
|
|
|
+ # Filter rows that are above tolerance for quality control plots
|
|
|
+ df_above_tolerance <- df %>% filter(DB == 1)
|
|
|
|
|
|
# Set L, r, K, and AUC to NA for rows that are above tolerance
|
|
|
- df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
|
|
|
-
|
|
|
- # Calculate summary statistics for all strains, including both background and the deletions
|
|
|
- message("Calculating summary statistics for all strains")
|
|
|
- variables <- c("L", "K", "r", "AUC")
|
|
|
- ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
|
- summary_stats <- ss$summary_stats
|
|
|
- df_na_stats <- ss$df_with_stats
|
|
|
- write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
|
|
|
-
|
|
|
- # print("Summary stats:")
|
|
|
- # print(head(summary_stats), width = 200)
|
|
|
+ df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
|
|
|
|
|
|
# Remove rows with 0 values in L
|
|
|
df_no_zeros <- df_na %>% filter(L > 0)
|
|
@@ -626,34 +611,54 @@ main <- function() {
|
|
|
}
|
|
|
filter(., if_all(c(L), is.finite))
|
|
|
}
|
|
|
+
|
|
|
+ variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
|
|
|
+ group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # fields to group by
|
|
|
+
|
|
|
+ # Set some constants
|
|
|
+ max_conc <- max(df$conc_num_factor)
|
|
|
+ l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
|
|
+ k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
|
+
|
|
|
+ message("Calculating summary statistics before quality control")
|
|
|
+ ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
|
|
|
+ df_stats <- ss$df_with_stats
|
|
|
+
|
|
|
+ message("Calculating summary statistics after quality control")
|
|
|
+ ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
|
|
|
+ df_na_ss <- ss$summary_stats
|
|
|
+ df_na_stats <- ss$df_with_stats
|
|
|
+ write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
|
+
|
|
|
+ message("Calculating summary statistics after quality control excluding zero values")
|
|
|
+ ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
|
|
|
+ df_no_zeros_stats <- ss$df_with_stats
|
|
|
|
|
|
- # Filter data within and outside 2SD
|
|
|
message("Filtering by 2SD of K")
|
|
|
df_na_within_2sd_k <- df_na_stats %>%
|
|
|
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
|
|
df_na_outside_2sd_k <- df_na_stats %>%
|
|
|
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
|
|
|
|
|
- # Summary statistics for within and outside 2SD of K
|
|
|
message("Calculating summary statistics for L within 2SD of K")
|
|
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
|
|
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
- l_within_2sd_k_stats <- ss$summary_stats
|
|
|
+ l_within_2sd_k_ss <- ss$summary_stats
|
|
|
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
|
|
+ write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
|
|
+
|
|
|
message("Calculating summary statistics for L outside 2SD of K")
|
|
|
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
- l_outside_2sd_k_stats <- ss$summary_stats
|
|
|
+ l_outside_2sd_k_ss <- ss$summary_stats
|
|
|
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
|
|
- # Write CSV files
|
|
|
- write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
|
|
|
- write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
|
|
|
+ write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
|
|
|
|
|
# Plot configurations
|
|
|
# Each plots list corresponds to a file
|
|
|
message("Generating QC plot configurations")
|
|
|
l_vs_k_plots <- list(
|
|
|
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
|
|
- title = "Raw L vs K before QC",
|
|
|
+ title = "Raw L vs K before quality control",
|
|
|
color_var = "conc_num",
|
|
|
legend_position = "right"
|
|
|
)
|
|
@@ -664,8 +669,8 @@ main <- function() {
|
|
|
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
|
|
color_var = "conc_num",
|
|
|
annotations = list(
|
|
|
- x = L_half_median,
|
|
|
- y = K_half_median,
|
|
|
+ x = l_half_median,
|
|
|
+ y = k_half_median,
|
|
|
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
|
|
|
),
|
|
|
error_bar = FALSE,
|
|
@@ -674,16 +679,16 @@ main <- function() {
|
|
|
)
|
|
|
|
|
|
frequency_delta_bg_plots <- list(
|
|
|
- list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
|
|
- title = "Density plot for Delta Background by Conc All Data",
|
|
|
+ list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
|
|
+ title = "Plate analysis by Drug Conc for delta background before quality control",
|
|
|
color_var = "conc_num",
|
|
|
x_label = "Delta Background",
|
|
|
y_label = "Density",
|
|
|
error_bar = FALSE,
|
|
|
legend_position = "right"
|
|
|
),
|
|
|
- list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
|
|
- title = "Bar plot for Delta Background by Conc All Data",
|
|
|
+ list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
|
|
+ title = "Plate analysis by Drug Conc for delta background before quality control",
|
|
|
color_var = "conc_num",
|
|
|
x_label = "Delta Background",
|
|
|
y_label = "Count",
|
|
@@ -698,9 +703,9 @@ main <- function() {
|
|
|
for (var in variables) {
|
|
|
for (stage in c("before", "after")) {
|
|
|
if (stage == "before") {
|
|
|
- df_plot <- df
|
|
|
+ df_plot <- df_stats
|
|
|
} else {
|
|
|
- df_plot <- df_na # TODO use df_na_filtered if necessary
|
|
|
+ df_plot <- df_na_stats # TODO use df_na_filtered if necessary
|
|
|
}
|
|
|
|
|
|
# Set error_bar = TRUE only for scatter plots
|
|
@@ -726,7 +731,7 @@ main <- function() {
|
|
|
|
|
|
# Create the plot configuration
|
|
|
plot_config <- list(
|
|
|
- df = df_no_zeros,
|
|
|
+ df = df_no_zeros_stats,
|
|
|
x_var = "scan",
|
|
|
y_var = var,
|
|
|
plot_type = plot_type,
|
|
@@ -739,7 +744,7 @@ main <- function() {
|
|
|
}
|
|
|
|
|
|
l_outside_2sd_k_plots <- list(
|
|
|
- list(df = df_na_l_outside_2sd_k_stats, x_var = "l", y_var = "K", plot_type = "scatter",
|
|
|
+ list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
|
|
|
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
|
|
color_var = "conc_num",
|
|
|
legend_position = "right"
|