Add additional stats data frames
This commit is contained in:
@@ -30,7 +30,7 @@ parse_arguments <- function() {
|
||||
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
||||
"Experiment 1: Doxo versus HLD",
|
||||
3,
|
||||
"/home/bryan/documents/develop/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
||||
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
||||
"Experiment 2: HLD versus Doxo",
|
||||
3
|
||||
)
|
||||
@@ -336,9 +336,14 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
}
|
||||
|
||||
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
|
||||
|
||||
message("Generating html and pdf plots for: ", file_name)
|
||||
|
||||
plots <- lapply(plot_configs, function(config) {
|
||||
df <- config$df
|
||||
|
||||
print(head(df))
|
||||
|
||||
# Check if y_var is NULL and adjust the aes mapping
|
||||
aes_mapping <- if (is.null(config$y_var)) {
|
||||
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
|
||||
@@ -374,7 +379,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
||||
} else if (config$plot_type == "density") {
|
||||
plot <- plot + geom_density()
|
||||
} else if (config$plot_type == "bar") {
|
||||
plot <- plot + geom_bar(stat = "identity")
|
||||
plot <- plot + geom_bar()
|
||||
} else {
|
||||
plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE)
|
||||
}
|
||||
@@ -582,35 +587,15 @@ main <- function() {
|
||||
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
# Load and process data
|
||||
message("Loading and filtering data")
|
||||
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
||||
df <- update_gene_names(df, args$sgd_gene_list)
|
||||
|
||||
max_conc <- max(df$conc_num_factor)
|
||||
|
||||
# QC steps and filtering
|
||||
# Filter rows that are above tolerance for quality control plots
|
||||
df_above_tolerance <- df %>% filter(DB == 1)
|
||||
|
||||
# Calculate the half-medians for `L` and `K` for rows above tolerance
|
||||
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
||||
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
||||
|
||||
# Get the number of rows that are above tolerance
|
||||
rows_to_remove <- nrow(df_above_tolerance)
|
||||
|
||||
# Set L, r, K, and AUC to NA for rows that are above tolerance
|
||||
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
|
||||
|
||||
# Calculate summary statistics for all strains, including both background and the deletions
|
||||
message("Calculating summary statistics for all strains")
|
||||
variables <- c("L", "K", "r", "AUC")
|
||||
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
||||
summary_stats <- ss$summary_stats
|
||||
df_na_stats <- ss$df_with_stats
|
||||
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
|
||||
|
||||
# print("Summary stats:")
|
||||
# print(head(summary_stats), width = 200)
|
||||
df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
|
||||
|
||||
# Remove rows with 0 values in L
|
||||
df_no_zeros <- df_na %>% filter(L > 0)
|
||||
@@ -627,33 +612,53 @@ main <- function() {
|
||||
filter(., if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
# Filter data within and outside 2SD
|
||||
variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
|
||||
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # fields to group by
|
||||
|
||||
# Set some constants
|
||||
max_conc <- max(df$conc_num_factor)
|
||||
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
||||
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
||||
|
||||
message("Calculating summary statistics before quality control")
|
||||
ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
|
||||
df_stats <- ss$df_with_stats
|
||||
|
||||
message("Calculating summary statistics after quality control")
|
||||
ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
|
||||
df_na_ss <- ss$summary_stats
|
||||
df_na_stats <- ss$df_with_stats
|
||||
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||
|
||||
message("Calculating summary statistics after quality control excluding zero values")
|
||||
ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
|
||||
df_no_zeros_stats <- ss$df_with_stats
|
||||
|
||||
message("Filtering by 2SD of K")
|
||||
df_na_within_2sd_k <- df_na_stats %>%
|
||||
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
||||
df_na_outside_2sd_k <- df_na_stats %>%
|
||||
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
||||
|
||||
# Summary statistics for within and outside 2SD of K
|
||||
message("Calculating summary statistics for L within 2SD of K")
|
||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_within_2sd_k_stats <- ss$summary_stats
|
||||
l_within_2sd_k_ss <- ss$summary_stats
|
||||
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||
write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
||||
|
||||
message("Calculating summary statistics for L outside 2SD of K")
|
||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_outside_2sd_k_stats <- ss$summary_stats
|
||||
l_outside_2sd_k_ss <- ss$summary_stats
|
||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||
# Write CSV files
|
||||
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
|
||||
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
|
||||
write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
||||
|
||||
# Plot configurations
|
||||
# Each plots list corresponds to a file
|
||||
message("Generating QC plot configurations")
|
||||
l_vs_k_plots <- list(
|
||||
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
title = "Raw L vs K before QC",
|
||||
title = "Raw L vs K before quality control",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
)
|
||||
@@ -664,8 +669,8 @@ main <- function() {
|
||||
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
||||
color_var = "conc_num",
|
||||
annotations = list(
|
||||
x = L_half_median,
|
||||
y = K_half_median,
|
||||
x = l_half_median,
|
||||
y = k_half_median,
|
||||
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
|
||||
),
|
||||
error_bar = FALSE,
|
||||
@@ -674,16 +679,16 @@ main <- function() {
|
||||
)
|
||||
|
||||
frequency_delta_bg_plots <- list(
|
||||
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
title = "Density plot for Delta Background by Conc All Data",
|
||||
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
title = "Plate analysis by Drug Conc for delta background before quality control",
|
||||
color_var = "conc_num",
|
||||
x_label = "Delta Background",
|
||||
y_label = "Density",
|
||||
error_bar = FALSE,
|
||||
legend_position = "right"
|
||||
),
|
||||
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
||||
title = "Bar plot for Delta Background by Conc All Data",
|
||||
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
||||
title = "Plate analysis by Drug Conc for delta background before quality control",
|
||||
color_var = "conc_num",
|
||||
x_label = "Delta Background",
|
||||
y_label = "Count",
|
||||
@@ -698,9 +703,9 @@ main <- function() {
|
||||
for (var in variables) {
|
||||
for (stage in c("before", "after")) {
|
||||
if (stage == "before") {
|
||||
df_plot <- df
|
||||
df_plot <- df_stats
|
||||
} else {
|
||||
df_plot <- df_na # TODO use df_na_filtered if necessary
|
||||
df_plot <- df_na_stats # TODO use df_na_filtered if necessary
|
||||
}
|
||||
|
||||
# Set error_bar = TRUE only for scatter plots
|
||||
@@ -726,7 +731,7 @@ main <- function() {
|
||||
|
||||
# Create the plot configuration
|
||||
plot_config <- list(
|
||||
df = df_no_zeros,
|
||||
df = df_no_zeros_stats,
|
||||
x_var = "scan",
|
||||
y_var = var,
|
||||
plot_type = plot_type,
|
||||
@@ -739,7 +744,7 @@ main <- function() {
|
||||
}
|
||||
|
||||
l_outside_2sd_k_plots <- list(
|
||||
list(df = df_na_l_outside_2sd_k_stats, x_var = "l", y_var = "K", plot_type = "scatter",
|
||||
list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
|
||||
Reference in New Issue
Block a user