Add additional stats data frames
This commit is contained in:
@@ -30,7 +30,7 @@ parse_arguments <- function() {
|
|||||||
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
||||||
"Experiment 1: Doxo versus HLD",
|
"Experiment 1: Doxo versus HLD",
|
||||||
3,
|
3,
|
||||||
"/home/bryan/documents/develop/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
"/home/bryan/documents/develop/hartmanlab/qhtcp-workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
||||||
"Experiment 2: HLD versus Doxo",
|
"Experiment 2: HLD versus Doxo",
|
||||||
3
|
3
|
||||||
)
|
)
|
||||||
@@ -336,9 +336,14 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
|||||||
}
|
}
|
||||||
|
|
||||||
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
|
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
|
||||||
|
|
||||||
|
message("Generating html and pdf plots for: ", file_name)
|
||||||
|
|
||||||
plots <- lapply(plot_configs, function(config) {
|
plots <- lapply(plot_configs, function(config) {
|
||||||
df <- config$df
|
df <- config$df
|
||||||
|
|
||||||
|
print(head(df))
|
||||||
|
|
||||||
# Check if y_var is NULL and adjust the aes mapping
|
# Check if y_var is NULL and adjust the aes mapping
|
||||||
aes_mapping <- if (is.null(config$y_var)) {
|
aes_mapping <- if (is.null(config$y_var)) {
|
||||||
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
|
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
|
||||||
@@ -374,7 +379,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
|||||||
} else if (config$plot_type == "density") {
|
} else if (config$plot_type == "density") {
|
||||||
plot <- plot + geom_density()
|
plot <- plot + geom_density()
|
||||||
} else if (config$plot_type == "bar") {
|
} else if (config$plot_type == "bar") {
|
||||||
plot <- plot + geom_bar(stat = "identity")
|
plot <- plot + geom_bar()
|
||||||
} else {
|
} else {
|
||||||
plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE)
|
plot <- plot + geom_point(shape = 3) + geom_smooth(method = "lm", se = FALSE)
|
||||||
}
|
}
|
||||||
@@ -582,35 +587,15 @@ main <- function() {
|
|||||||
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
# Load and process data
|
message("Loading and filtering data")
|
||||||
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
||||||
df <- update_gene_names(df, args$sgd_gene_list)
|
df <- update_gene_names(df, args$sgd_gene_list)
|
||||||
|
|
||||||
max_conc <- max(df$conc_num_factor)
|
# Filter rows that are above tolerance for quality control plots
|
||||||
|
|
||||||
# QC steps and filtering
|
|
||||||
df_above_tolerance <- df %>% filter(DB == 1)
|
df_above_tolerance <- df %>% filter(DB == 1)
|
||||||
|
|
||||||
# Calculate the half-medians for `L` and `K` for rows above tolerance
|
|
||||||
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
|
||||||
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
||||||
|
|
||||||
# Get the number of rows that are above tolerance
|
|
||||||
rows_to_remove <- nrow(df_above_tolerance)
|
|
||||||
|
|
||||||
# Set L, r, K, and AUC to NA for rows that are above tolerance
|
# Set L, r, K, and AUC to NA for rows that are above tolerance
|
||||||
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
|
df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
|
||||||
|
|
||||||
# Calculate summary statistics for all strains, including both background and the deletions
|
|
||||||
message("Calculating summary statistics for all strains")
|
|
||||||
variables <- c("L", "K", "r", "AUC")
|
|
||||||
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
||||||
summary_stats <- ss$summary_stats
|
|
||||||
df_na_stats <- ss$df_with_stats
|
|
||||||
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
|
|
||||||
|
|
||||||
# print("Summary stats:")
|
|
||||||
# print(head(summary_stats), width = 200)
|
|
||||||
|
|
||||||
# Remove rows with 0 values in L
|
# Remove rows with 0 values in L
|
||||||
df_no_zeros <- df_na %>% filter(L > 0)
|
df_no_zeros <- df_na %>% filter(L > 0)
|
||||||
@@ -627,33 +612,53 @@ main <- function() {
|
|||||||
filter(., if_all(c(L), is.finite))
|
filter(., if_all(c(L), is.finite))
|
||||||
}
|
}
|
||||||
|
|
||||||
# Filter data within and outside 2SD
|
variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
|
||||||
|
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # fields to group by
|
||||||
|
|
||||||
|
# Set some constants
|
||||||
|
max_conc <- max(df$conc_num_factor)
|
||||||
|
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
||||||
|
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
||||||
|
|
||||||
|
message("Calculating summary statistics before quality control")
|
||||||
|
ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
|
||||||
|
df_stats <- ss$df_with_stats
|
||||||
|
|
||||||
|
message("Calculating summary statistics after quality control")
|
||||||
|
ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
|
||||||
|
df_na_ss <- ss$summary_stats
|
||||||
|
df_na_stats <- ss$df_with_stats
|
||||||
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||||
|
|
||||||
|
message("Calculating summary statistics after quality control excluding zero values")
|
||||||
|
ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
|
||||||
|
df_no_zeros_stats <- ss$df_with_stats
|
||||||
|
|
||||||
message("Filtering by 2SD of K")
|
message("Filtering by 2SD of K")
|
||||||
df_na_within_2sd_k <- df_na_stats %>%
|
df_na_within_2sd_k <- df_na_stats %>%
|
||||||
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
||||||
df_na_outside_2sd_k <- df_na_stats %>%
|
df_na_outside_2sd_k <- df_na_stats %>%
|
||||||
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
||||||
|
|
||||||
# Summary statistics for within and outside 2SD of K
|
|
||||||
message("Calculating summary statistics for L within 2SD of K")
|
message("Calculating summary statistics for L within 2SD of K")
|
||||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||||
l_within_2sd_k_stats <- ss$summary_stats
|
l_within_2sd_k_ss <- ss$summary_stats
|
||||||
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||||
|
write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
||||||
|
|
||||||
message("Calculating summary statistics for L outside 2SD of K")
|
message("Calculating summary statistics for L outside 2SD of K")
|
||||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||||
l_outside_2sd_k_stats <- ss$summary_stats
|
l_outside_2sd_k_ss <- ss$summary_stats
|
||||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||||
# Write CSV files
|
write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
||||||
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
|
|
||||||
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
|
|
||||||
|
|
||||||
# Plot configurations
|
# Plot configurations
|
||||||
# Each plots list corresponds to a file
|
# Each plots list corresponds to a file
|
||||||
message("Generating QC plot configurations")
|
message("Generating QC plot configurations")
|
||||||
l_vs_k_plots <- list(
|
l_vs_k_plots <- list(
|
||||||
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||||
title = "Raw L vs K before QC",
|
title = "Raw L vs K before quality control",
|
||||||
color_var = "conc_num",
|
color_var = "conc_num",
|
||||||
legend_position = "right"
|
legend_position = "right"
|
||||||
)
|
)
|
||||||
@@ -664,8 +669,8 @@ main <- function() {
|
|||||||
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
||||||
color_var = "conc_num",
|
color_var = "conc_num",
|
||||||
annotations = list(
|
annotations = list(
|
||||||
x = L_half_median,
|
x = l_half_median,
|
||||||
y = K_half_median,
|
y = k_half_median,
|
||||||
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
|
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
|
||||||
),
|
),
|
||||||
error_bar = FALSE,
|
error_bar = FALSE,
|
||||||
@@ -674,16 +679,16 @@ main <- function() {
|
|||||||
)
|
)
|
||||||
|
|
||||||
frequency_delta_bg_plots <- list(
|
frequency_delta_bg_plots <- list(
|
||||||
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||||
title = "Density plot for Delta Background by Conc All Data",
|
title = "Plate analysis by Drug Conc for delta background before quality control",
|
||||||
color_var = "conc_num",
|
color_var = "conc_num",
|
||||||
x_label = "Delta Background",
|
x_label = "Delta Background",
|
||||||
y_label = "Density",
|
y_label = "Density",
|
||||||
error_bar = FALSE,
|
error_bar = FALSE,
|
||||||
legend_position = "right"
|
legend_position = "right"
|
||||||
),
|
),
|
||||||
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
||||||
title = "Bar plot for Delta Background by Conc All Data",
|
title = "Plate analysis by Drug Conc for delta background before quality control",
|
||||||
color_var = "conc_num",
|
color_var = "conc_num",
|
||||||
x_label = "Delta Background",
|
x_label = "Delta Background",
|
||||||
y_label = "Count",
|
y_label = "Count",
|
||||||
@@ -698,9 +703,9 @@ main <- function() {
|
|||||||
for (var in variables) {
|
for (var in variables) {
|
||||||
for (stage in c("before", "after")) {
|
for (stage in c("before", "after")) {
|
||||||
if (stage == "before") {
|
if (stage == "before") {
|
||||||
df_plot <- df
|
df_plot <- df_stats
|
||||||
} else {
|
} else {
|
||||||
df_plot <- df_na # TODO use df_na_filtered if necessary
|
df_plot <- df_na_stats # TODO use df_na_filtered if necessary
|
||||||
}
|
}
|
||||||
|
|
||||||
# Set error_bar = TRUE only for scatter plots
|
# Set error_bar = TRUE only for scatter plots
|
||||||
@@ -726,7 +731,7 @@ main <- function() {
|
|||||||
|
|
||||||
# Create the plot configuration
|
# Create the plot configuration
|
||||||
plot_config <- list(
|
plot_config <- list(
|
||||||
df = df_no_zeros,
|
df = df_no_zeros_stats,
|
||||||
x_var = "scan",
|
x_var = "scan",
|
||||||
y_var = var,
|
y_var = var,
|
||||||
plot_type = plot_type,
|
plot_type = plot_type,
|
||||||
@@ -739,7 +744,7 @@ main <- function() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
l_outside_2sd_k_plots <- list(
|
l_outside_2sd_k_plots <- list(
|
||||||
list(df = df_na_l_outside_2sd_k_stats, x_var = "l", y_var = "K", plot_type = "scatter",
|
list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||||
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||||
color_var = "conc_num",
|
color_var = "conc_num",
|
||||||
legend_position = "right"
|
legend_position = "right"
|
||||||
|
|||||||
Reference in New Issue
Block a user