Add additional stats data frames

This commit is contained in:
2024-09-11 15:18:54 -04:00
parent 2eccb30b31
commit cee290fd69

View File

@@ -30,7 +30,7 @@ parse_arguments <- function() {
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
3,
"/home/bryan/documents/develop/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo",
3
)
@@ -336,9 +336,14 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
}
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
message("Generating html and pdf plots for: ", file_name)
plots <- lapply(plot_configs, function(config) {
df <- config$df
print(head(df))
# Check if y_var is NULL and adjust the aes mapping
aes_mapping <- if (is.null(config$y_var)) {
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
@@ -374,7 +379,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
} else if (config$plot_type == "density") {
plot <- plot + geom_density()
} else if (config$plot_type == "bar") {
plot <- plot + geom_bar(stat = "identity")
plot <- plot + geom_bar()
} else {
plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE)
}
@@ -582,35 +587,15 @@ main <- function() {
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
# Load and process data
message("Loading and filtering data")
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
df <- update_gene_names(df, args$sgd_gene_list)
max_conc <- max(df$conc_num_factor)
# QC steps and filtering
# Filter rows that are above tolerance for quality control plots
df_above_tolerance <- df %>% filter(DB == 1)
# Calculate the half-medians for `L` and `K` for rows above tolerance
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
# Get the number of rows that are above tolerance
rows_to_remove <- nrow(df_above_tolerance)
# Set L, r, K, and AUC to NA for rows that are above tolerance
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
# Calculate summary statistics for all strains, including both background and the deletions
message("Calculating summary statistics for all strains")
variables <- c("L", "K", "r", "AUC")
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats <- ss$summary_stats
df_na_stats <- ss$df_with_stats
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
# print("Summary stats:")
# print(head(summary_stats), width = 200)
df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
# Remove rows with 0 values in L
df_no_zeros <- df_na %>% filter(L > 0)
@@ -627,33 +612,53 @@ main <- function() {
filter(., if_all(c(L), is.finite))
}
# Filter data within and outside 2SD
variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # fields to group by
# Set some constants
max_conc <- max(df$conc_num_factor)
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
message("Calculating summary statistics before quality control")
ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
df_stats <- ss$df_with_stats
message("Calculating summary statistics after quality control")
ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
df_na_ss <- ss$summary_stats
df_na_stats <- ss$df_with_stats
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
message("Calculating summary statistics after quality control excluding zero values")
ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
df_no_zeros_stats <- ss$df_with_stats
message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na_stats %>%
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
df_na_outside_2sd_k <- df_na_stats %>%
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
# Summary statistics for within and outside 2SD of K
message("Calculating summary statistics for L within 2SD of K")
# TODO: the original z_max calculation is omitted here; confirm whether it is still needed
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_within_2sd_k_stats <- ss$summary_stats
l_within_2sd_k_ss <- ss$summary_stats
df_na_l_within_2sd_k_stats <- ss$df_with_stats
write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k_stats <- ss$summary_stats
l_outside_2sd_k_ss <- ss$summary_stats
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
# Write CSV files
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
# Plot configurations
# Each list of plot configurations below corresponds to one output file
message("Generating QC plot configurations")
l_vs_k_plots <- list(
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC",
title = "Raw L vs K before quality control",
color_var = "conc_num",
legend_position = "right"
)
@@ -664,8 +669,8 @@ main <- function() {
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
color_var = "conc_num",
annotations = list(
x = L_half_median,
y = K_half_median,
x = l_half_median,
y = k_half_median,
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
),
error_bar = FALSE,
@@ -674,16 +679,16 @@ main <- function() {
)
frequency_delta_bg_plots <- list(
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
title = "Density plot for Delta Background by Conc All Data",
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
title = "Plate analysis by Drug Conc for delta background before quality control",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Density",
error_bar = FALSE,
legend_position = "right"
),
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data",
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
title = "Plate analysis by Drug Conc for delta background before quality control",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Count",
@@ -698,9 +703,9 @@ main <- function() {
for (var in variables) {
for (stage in c("before", "after")) {
if (stage == "before") {
df_plot <- df
df_plot <- df_stats
} else {
df_plot <- df_na # TODO use df_na_filtered if necessary
df_plot <- df_na_stats # TODO use df_na_filtered if necessary
}
# Set error_bar = TRUE only for scatter plots
@@ -726,7 +731,7 @@ main <- function() {
# Create the plot configuration
plot_config <- list(
df = df_no_zeros,
df = df_no_zeros_stats,
x_var = "scan",
y_var = var,
plot_type = plot_type,
@@ -739,7 +744,7 @@ main <- function() {
}
l_outside_2sd_k_plots <- list(
list(df = df_na_l_outside_2sd_k_stats, x_var = "l", y_var = "K", plot_type = "scatter",
list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"