Break apart plot files to prevent OOM
This commit is contained in:
@@ -8,7 +8,7 @@ suppressMessages({
|
||||
library(unix)
|
||||
})
|
||||
|
||||
options(warn = 2, max.print = 1000)
|
||||
options(warn = 2)
|
||||
options(width = 10000)
|
||||
|
||||
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
|
||||
@@ -154,7 +154,7 @@ update_gene_names <- function(df, sgd_gene_list) {
|
||||
}
|
||||
|
||||
# Calculate summary statistics for all variables
|
||||
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
|
||||
calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
|
||||
|
||||
# Summarize the variables within the grouped data
|
||||
summary_stats <- df %>%
|
||||
@@ -171,6 +171,8 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
|
||||
), .names = "{.fn}_{.col}")
|
||||
)
|
||||
|
||||
print(summary_stats)
|
||||
|
||||
# Prevent .x and .y suffix issues by renaming columns
|
||||
df_cleaned <- df %>%
|
||||
select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
|
||||
@@ -342,8 +344,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
||||
plots <- lapply(plot_configs, function(config) {
|
||||
df <- config$df
|
||||
|
||||
print(head(df))
|
||||
|
||||
# Check if y_var is NULL and adjust the aes mapping
|
||||
aes_mapping <- if (is.null(config$y_var)) {
|
||||
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
|
||||
@@ -424,7 +424,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
||||
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
|
||||
}
|
||||
|
||||
|
||||
generate_interaction_plot_configs <- function(df, variables) {
|
||||
configs <- list()
|
||||
|
||||
@@ -587,52 +586,72 @@ main <- function() {
|
||||
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
|
||||
summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
|
||||
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
|
||||
print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
|
||||
"delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
|
||||
|
||||
message("Loading and filtering data")
|
||||
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
||||
df <- update_gene_names(df, args$sgd_gene_list)
|
||||
df <- as_tibble(df)
|
||||
|
||||
# Filter rows that are above tolerance for quality control plots
|
||||
df_above_tolerance <- df %>% filter(DB == 1)
|
||||
|
||||
# Set L, r, K, and AUC to NA for rows that are above tolerance
|
||||
df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
|
||||
# Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance
|
||||
df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .)))
|
||||
|
||||
# Remove rows with 0 values in L
|
||||
df_no_zeros <- df_na %>% filter(L > 0)
|
||||
|
||||
# Additional filtering for non-finite values
|
||||
# Filter and print non-finite rows, then filter and keep only finite rows
|
||||
df_na_filtered <- df_na %>%
|
||||
{
|
||||
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
||||
if (nrow(non_finite_rows) > 0) {
|
||||
message("Removing non-finite rows:\n")
|
||||
print(head(non_finite_rows, n = 10))
|
||||
}
|
||||
filter(., if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
# Set some constants
|
||||
max_conc <- max(df$conc_num_factor)
|
||||
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
||||
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
||||
|
||||
message("Calculating summary statistics before quality control")
|
||||
ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
|
||||
ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
|
||||
df_ss <- ss$summary_stats
|
||||
df_stats <- ss$df_with_stats
|
||||
df_filtered_stats <- df_stats %>%
|
||||
{
|
||||
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
||||
if (nrow(non_finite_rows) > 0) {
|
||||
message("Removed the following non-finite rows:")
|
||||
print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
|
||||
}
|
||||
filter(., if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
message("Calculating summary statistics after quality control")
|
||||
ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
|
||||
ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
|
||||
df_na_ss <- ss$summary_stats
|
||||
df_na_stats <- ss$df_with_stats
|
||||
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||
# Filter out non-finite rows for plotting
|
||||
df_na_filtered_stats <- df_na_stats %>%
|
||||
{
|
||||
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
||||
if (nrow(non_finite_rows) > 0) {
|
||||
message("Removed the following non-finite rows:")
|
||||
print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
|
||||
}
|
||||
filter(., if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
message("Calculating summary statistics after quality control excluding zero values")
|
||||
ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
|
||||
ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
|
||||
df_no_zeros_stats <- ss$df_with_stats
|
||||
df_no_zeros_filtered_stats <- df_no_zeros_stats %>%
|
||||
{
|
||||
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
||||
if (nrow(non_finite_rows) > 0) {
|
||||
message("Removed the following non-finite rows:")
|
||||
print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
|
||||
}
|
||||
filter(., if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
message("Filtering by 2SD of K")
|
||||
df_na_within_2sd_k <- df_na_stats %>%
|
||||
@@ -645,19 +664,24 @@ main <- function() {
|
||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_within_2sd_k_ss <- ss$summary_stats
|
||||
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||
write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
||||
write.csv(l_within_2sd_k_ss,
|
||||
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
||||
|
||||
message("Calculating summary statistics for L outside 2SD of K")
|
||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_outside_2sd_k_ss <- ss$summary_stats
|
||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||
write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
||||
write.csv(l_outside_2sd_k_ss,
|
||||
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
||||
|
||||
# Plot configurations
|
||||
# Each plots list corresponds to a file
|
||||
message("Generating QC plot configurations")
|
||||
l_vs_k_plots <- list(
|
||||
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
list(
|
||||
df = df,
|
||||
x_var = "L",
|
||||
y_var = "K",
|
||||
plot_type = "scatter",
|
||||
title = "Raw L vs K before quality control",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
@@ -665,8 +689,13 @@ main <- function() {
|
||||
)
|
||||
|
||||
above_threshold_plots <- list(
|
||||
list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
||||
list(
|
||||
df = df_above_tolerance,
|
||||
x_var = "L",
|
||||
y_var = "K",
|
||||
plot_type = "scatter",
|
||||
title = paste("Raw L vs K for strains above delta background threshold of",
|
||||
df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
||||
color_var = "conc_num",
|
||||
annotations = list(
|
||||
x = l_half_median,
|
||||
@@ -679,72 +708,102 @@ main <- function() {
|
||||
)
|
||||
|
||||
frequency_delta_bg_plots <- list(
|
||||
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
title = "Plate analysis by Drug Conc for delta background before quality control",
|
||||
color_var = "conc_num",
|
||||
x_label = "Delta Background",
|
||||
y_label = "Density",
|
||||
error_bar = FALSE,
|
||||
legend_position = "right"
|
||||
),
|
||||
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
||||
legend_position = "right"),
|
||||
list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
||||
title = "Plate analysis by Drug Conc for delta background before quality control",
|
||||
color_var = "conc_num",
|
||||
x_label = "Delta Background",
|
||||
y_label = "Count",
|
||||
error_bar = FALSE,
|
||||
legend_position = "right"
|
||||
)
|
||||
legend_position = "right")
|
||||
)
|
||||
|
||||
plate_analysis_plots <- list()
|
||||
for (plot_type in c("scatter", "box")) {
|
||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
for (var in variables) {
|
||||
for (var in summary_vars) {
|
||||
for (stage in c("before", "after")) {
|
||||
if (stage == "before") {
|
||||
df_plot <- df_stats
|
||||
df_plot <- df_filtered_stats
|
||||
} else {
|
||||
df_plot <- df_na_stats # TODO use df_na_filtered if necessary
|
||||
df_plot <- df_na_filtered_stats
|
||||
}
|
||||
|
||||
# Set error_bar = TRUE only for scatter plots
|
||||
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
|
||||
|
||||
# Create the plot configuration
|
||||
plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
|
||||
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
||||
error_bar = error_bar, color_var = "conc_num")
|
||||
|
||||
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
plate_analysis_no_zero_plots <- list()
|
||||
for (plot_type in c("scatter", "box")) {
|
||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
for (var in variables) {
|
||||
|
||||
# Set error_bar = TRUE only for scatter plots
|
||||
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
|
||||
|
||||
# Create the plot configuration
|
||||
plot_config <- list(
|
||||
df = df_no_zeros_stats,
|
||||
config <- list(
|
||||
df = df_plot,
|
||||
x_var = "scan",
|
||||
y_var = var,
|
||||
plot_type = plot_type,
|
||||
plot_type = "scatter",
|
||||
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
||||
error_bar = TRUE, color_var = "conc_num")
|
||||
|
||||
plate_analysis_plots <- append(plate_analysis_plots, list(config))
|
||||
}
|
||||
}
|
||||
|
||||
plate_analysis_boxplots <- list()
|
||||
for (var in summary_vars) {
|
||||
for (stage in c("before", "after")) {
|
||||
if (stage == "before") {
|
||||
df_plot <- df_filtered_stats
|
||||
} else {
|
||||
df_plot <- df_na_filtered_stats
|
||||
}
|
||||
|
||||
config <- list(
|
||||
df = df_plot,
|
||||
x_var = "scan",
|
||||
y_var = var,
|
||||
plot_type = "box",
|
||||
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
||||
error_bar = FALSE, color_var = "conc_num")
|
||||
|
||||
plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config))
|
||||
}
|
||||
}
|
||||
|
||||
plate_analysis_no_zeros_plots <- list()
|
||||
for (var in summary_vars) {
|
||||
|
||||
config <- list(
|
||||
df = df_no_zeros_filtered_stats,
|
||||
x_var = "scan",
|
||||
y_var = var,
|
||||
plot_type = "scatter",
|
||||
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
|
||||
error_bar = error_bar,
|
||||
error_bar = TRUE,
|
||||
color_var = "conc_num")
|
||||
|
||||
plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config))
|
||||
}
|
||||
|
||||
plate_analysis_no_zeros_boxplots <- list()
|
||||
for (var in summary_vars) {
|
||||
|
||||
|
||||
# Create the plot configuration
|
||||
config <- list(
|
||||
df = df_no_zeros_filtered_stats,
|
||||
x_var = "scan",
|
||||
y_var = var,
|
||||
plot_type = "box",
|
||||
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
|
||||
error_bar = FALSE,
|
||||
color_var = "conc_num"
|
||||
)
|
||||
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
|
||||
}
|
||||
plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config))
|
||||
}
|
||||
|
||||
l_outside_2sd_k_plots <- list(
|
||||
list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
list(
|
||||
df = df_na_l_outside_2sd_k_stats,
|
||||
x_var = "L",
|
||||
y_var = "K",
|
||||
plot_type = "scatter",
|
||||
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
@@ -752,20 +811,25 @@ main <- function() {
|
||||
)
|
||||
|
||||
delta_bg_outside_2sd_k_plots <- list(
|
||||
list(df = df_na_l_outside_2sd_k_stats, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
|
||||
list(
|
||||
df = df_na_l_outside_2sd_k_stats,
|
||||
x_var = "delta_bg",
|
||||
y_var = "K",
|
||||
plot_type = "scatter",
|
||||
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
)
|
||||
)
|
||||
|
||||
# Generate and save plots for each QC step
|
||||
message("Generating QC plots")
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
||||
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
|
||||
|
||||
@@ -795,7 +859,7 @@ main <- function() {
|
||||
|
||||
# Recalculate summary statistics for the background strain
|
||||
message("Calculating summary statistics for background strain")
|
||||
ss_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
||||
ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
|
||||
summary_stats_bg <- ss_bg$summary_stats
|
||||
# df_bg_stats <- ss_bg$df_with_stats
|
||||
write.csv(summary_stats_bg,
|
||||
@@ -835,14 +899,14 @@ main <- function() {
|
||||
ungroup()
|
||||
|
||||
# Calculate interactions
|
||||
variables <- c("L", "K", "r", "AUC")
|
||||
interaction_vars <- c("L", "K", "r", "AUC")
|
||||
message("Calculating interaction scores")
|
||||
# print("Reference strain:")
|
||||
# print(head(reference_strain))
|
||||
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
|
||||
reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars)
|
||||
# print("Deletion strains:")
|
||||
# print(head(deletion_strains))
|
||||
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
|
||||
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars)
|
||||
|
||||
zscores_calculations_reference <- reference_results$calculations
|
||||
zscores_interactions_reference <- reference_results$interactions
|
||||
@@ -856,8 +920,8 @@ main <- function() {
|
||||
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
|
||||
|
||||
# Create interaction plots
|
||||
reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
|
||||
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
|
||||
reference_plot_configs <- generate_interaction_plot_configs(df_reference, interaction_vars)
|
||||
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, interaction_vars)
|
||||
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
|
||||
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
|
||||
|
||||
@@ -965,7 +1029,7 @@ main <- function() {
|
||||
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm",
|
||||
plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
|
||||
|
||||
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, variables)
|
||||
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
|
||||
generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
|
||||
plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2))
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user