Break apart plot files to prevent OOM

This commit is contained in:
2024-09-11 17:25:45 -04:00
parent f4581d7fcb
commit 53f4bb3ca7

View File

@@ -8,7 +8,7 @@ suppressMessages({
library(unix) library(unix)
}) })
options(warn = 2, max.print = 1000) options(warn = 2)
options(width = 10000) options(width = 10000)
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes) # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
@@ -154,7 +154,7 @@ update_gene_names <- function(df, sgd_gene_list) {
} }
# Calculate summary statistics for all variables # Calculate summary statistics for all variables
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
# Summarize the variables within the grouped data # Summarize the variables within the grouped data
summary_stats <- df %>% summary_stats <- df %>%
@@ -171,6 +171,8 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
), .names = "{.fn}_{.col}") ), .names = "{.fn}_{.col}")
) )
print(summary_stats)
# Prevent .x and .y suffix issues by renaming columns # Prevent .x and .y suffix issues by renaming columns
df_cleaned <- df %>% df_cleaned <- df %>%
select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
@@ -342,8 +344,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
plots <- lapply(plot_configs, function(config) { plots <- lapply(plot_configs, function(config) {
df <- config$df df <- config$df
print(head(df))
# Check if y_var is NULL and adjust the aes mapping # Check if y_var is NULL and adjust the aes mapping
aes_mapping <- if (is.null(config$y_var)) { aes_mapping <- if (is.null(config$y_var)) {
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var))) aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
@@ -424,7 +424,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE) saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
} }
generate_interaction_plot_configs <- function(df, variables) { generate_interaction_plot_configs <- function(df, variables) {
configs <- list() configs <- list()
@@ -587,33 +586,24 @@ main <- function() {
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
"delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
message("Loading and filtering data") message("Loading and filtering data")
df <- load_and_process_data(args$easy_results_file, sd = exp_sd) df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
df <- update_gene_names(df, args$sgd_gene_list) df <- update_gene_names(df, args$sgd_gene_list)
df <- as_tibble(df)
# Filter rows that are above tolerance for quality control plots # Filter rows that are above tolerance for quality control plots
df_above_tolerance <- df %>% filter(DB == 1) df_above_tolerance <- df %>% filter(DB == 1)
# Set L, r, K, and AUC to NA for rows that are above tolerance # Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance
df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .))) df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .)))
# Remove rows with 0 values in L # Remove rows with 0 values in L
df_no_zeros <- df_na %>% filter(L > 0) df_no_zeros <- df_na %>% filter(L > 0)
# Additional filtering for non-finite values
# Filter and print non-finite rows, then filter and keep only finite rows
df_na_filtered <- df_na %>%
{
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
if (nrow(non_finite_rows) > 0) {
message("Removing non-finite rows:\n")
print(head(non_finite_rows, n = 10))
}
filter(., if_all(c(L), is.finite))
}
# Set some constants # Set some constants
max_conc <- max(df$conc_num_factor) max_conc <- max(df$conc_num_factor)
@@ -621,18 +611,47 @@ main <- function() {
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
message("Calculating summary statistics before quality control") message("Calculating summary statistics before quality control")
ss <- calculate_summary_stats(df, variables, group_vars = group_vars) ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
df_ss <- ss$summary_stats
df_stats <- ss$df_with_stats df_stats <- ss$df_with_stats
df_filtered_stats <- df_stats %>%
{
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
if (nrow(non_finite_rows) > 0) {
message("Removed the following non-finite rows:")
print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
}
filter(., if_all(c(L), is.finite))
}
message("Calculating summary statistics after quality control") message("Calculating summary statistics after quality control")
ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars) ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
df_na_ss <- ss$summary_stats df_na_ss <- ss$summary_stats
df_na_stats <- ss$df_with_stats df_na_stats <- ss$df_with_stats
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
# Filter out non-finite rows for plotting
df_na_filtered_stats <- df_na_stats %>%
{
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
if (nrow(non_finite_rows) > 0) {
message("Removed the following non-finite rows:")
print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
}
filter(., if_all(c(L), is.finite))
}
message("Calculating summary statistics after quality control excluding zero values") message("Calculating summary statistics after quality control excluding zero values")
ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars) ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
df_no_zeros_stats <- ss$df_with_stats df_no_zeros_stats <- ss$df_with_stats
df_no_zeros_filtered_stats <- df_no_zeros_stats %>%
{
non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
if (nrow(non_finite_rows) > 0) {
message("Removed the following non-finite rows:")
print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
}
filter(., if_all(c(L), is.finite))
}
message("Filtering by 2SD of K") message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na_stats %>% df_na_within_2sd_k <- df_na_stats %>%
@@ -645,19 +664,24 @@ main <- function() {
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_within_2sd_k_ss <- ss$summary_stats l_within_2sd_k_ss <- ss$summary_stats
df_na_l_within_2sd_k_stats <- ss$df_with_stats df_na_l_within_2sd_k_stats <- ss$df_with_stats
write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) write.csv(l_within_2sd_k_ss,
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
message("Calculating summary statistics for L outside 2SD of K") message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k_ss <- ss$summary_stats l_outside_2sd_k_ss <- ss$summary_stats
df_na_l_outside_2sd_k_stats <- ss$df_with_stats df_na_l_outside_2sd_k_stats <- ss$df_with_stats
write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) write.csv(l_outside_2sd_k_ss,
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
# Plot configurations
# Each plots list corresponds to a file # Each plots list corresponds to a file
message("Generating QC plot configurations") message("Generating QC plot configurations")
l_vs_k_plots <- list( l_vs_k_plots <- list(
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", list(
df = df,
x_var = "L",
y_var = "K",
plot_type = "scatter",
title = "Raw L vs K before quality control", title = "Raw L vs K before quality control",
color_var = "conc_num", color_var = "conc_num",
legend_position = "right" legend_position = "right"
@@ -665,8 +689,13 @@ main <- function() {
) )
above_threshold_plots <- list( above_threshold_plots <- list(
list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter", list(
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"), df = df_above_tolerance,
x_var = "L",
y_var = "K",
plot_type = "scatter",
title = paste("Raw L vs K for strains above delta background threshold of",
df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
color_var = "conc_num", color_var = "conc_num",
annotations = list( annotations = list(
x = l_half_median, x = l_half_median,
@@ -679,72 +708,102 @@ main <- function() {
) )
frequency_delta_bg_plots <- list( frequency_delta_bg_plots <- list(
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density", list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
title = "Plate analysis by Drug Conc for delta background before quality control", title = "Plate analysis by Drug Conc for delta background before quality control",
color_var = "conc_num", color_var = "conc_num",
x_label = "Delta Background", x_label = "Delta Background",
y_label = "Density", y_label = "Density",
error_bar = FALSE, error_bar = FALSE,
legend_position = "right" legend_position = "right"),
), list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
title = "Plate analysis by Drug Conc for delta background before quality control", title = "Plate analysis by Drug Conc for delta background before quality control",
color_var = "conc_num", color_var = "conc_num",
x_label = "Delta Background", x_label = "Delta Background",
y_label = "Count", y_label = "Count",
error_bar = FALSE, error_bar = FALSE,
legend_position = "right" legend_position = "right")
)
) )
plate_analysis_plots <- list() plate_analysis_plots <- list()
for (plot_type in c("scatter", "box")) { for (var in summary_vars) {
variables <- c("L", "K", "r", "AUC", "delta_bg") for (stage in c("before", "after")) {
for (var in variables) { if (stage == "before") {
for (stage in c("before", "after")) { df_plot <- df_filtered_stats
if (stage == "before") { } else {
df_plot <- df_stats df_plot <- df_na_filtered_stats
} else {
df_plot <- df_na_stats # TODO use df_na_filtered if necessary
}
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = error_bar, color_var = "conc_num")
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
} }
config <- list(
df = df_plot,
x_var = "scan",
y_var = var,
plot_type = "scatter",
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = TRUE, color_var = "conc_num")
plate_analysis_plots <- append(plate_analysis_plots, list(config))
} }
} }
plate_analysis_no_zero_plots <- list() plate_analysis_boxplots <- list()
for (plot_type in c("scatter", "box")) { for (var in summary_vars) {
variables <- c("L", "K", "r", "AUC", "delta_bg") for (stage in c("before", "after")) {
for (var in variables) { if (stage == "before") {
df_plot <- df_filtered_stats
# Set error_bar = TRUE only for scatter plots } else {
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) df_plot <- df_na_filtered_stats
# Create the plot configuration
plot_config <- list(
df = df_no_zeros_stats,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
error_bar = error_bar,
color_var = "conc_num"
)
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
} }
config <- list(
df = df_plot,
x_var = "scan",
y_var = var,
plot_type = "box",
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = FALSE, color_var = "conc_num")
plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config))
} }
}
plate_analysis_no_zeros_plots <- list()
for (var in summary_vars) {
config <- list(
df = df_no_zeros_filtered_stats,
x_var = "scan",
y_var = var,
plot_type = "scatter",
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
error_bar = TRUE,
color_var = "conc_num")
plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config))
}
plate_analysis_no_zeros_boxplots <- list()
for (var in summary_vars) {
# Create the plot configuration
config <- list(
df = df_no_zeros_filtered_stats,
x_var = "scan",
y_var = var,
plot_type = "box",
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
error_bar = FALSE,
color_var = "conc_num"
)
plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config))
}
l_outside_2sd_k_plots <- list( l_outside_2sd_k_plots <- list(
list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter", list(
df = df_na_l_outside_2sd_k_stats,
x_var = "L",
y_var = "K",
plot_type = "scatter",
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num", color_var = "conc_num",
legend_position = "right" legend_position = "right"
@@ -752,20 +811,25 @@ main <- function() {
) )
delta_bg_outside_2sd_k_plots <- list( delta_bg_outside_2sd_k_plots <- list(
list(df = df_na_l_outside_2sd_k_stats, x_var = "delta_bg", y_var = "K", plot_type = "scatter", list(
df = df_na_l_outside_2sd_k_stats,
x_var = "delta_bg",
y_var = "K",
plot_type = "scatter",
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num", color_var = "conc_num",
legend_position = "right" legend_position = "right"
) )
) )
# Generate and save plots for each QC step
message("Generating QC plots") message("Generating QC plots")
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
@@ -795,7 +859,7 @@ main <- function() {
# Recalculate summary statistics for the background strain # Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain") message("Calculating summary statistics for background strain")
ss_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
summary_stats_bg <- ss_bg$summary_stats summary_stats_bg <- ss_bg$summary_stats
# df_bg_stats <- ss_bg$df_with_stats # df_bg_stats <- ss_bg$df_with_stats
write.csv(summary_stats_bg, write.csv(summary_stats_bg,
@@ -835,14 +899,14 @@ main <- function() {
ungroup() ungroup()
# Calculate interactions # Calculate interactions
variables <- c("L", "K", "r", "AUC") interaction_vars <- c("L", "K", "r", "AUC")
message("Calculating interaction scores") message("Calculating interaction scores")
# print("Reference strain:") # print("Reference strain:")
# print(head(reference_strain)) # print(head(reference_strain))
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables) reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars)
# print("Deletion strains:") # print("Deletion strains:")
# print(head(deletion_strains)) # print(head(deletion_strains))
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables) deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars)
zscores_calculations_reference <- reference_results$calculations zscores_calculations_reference <- reference_results$calculations
zscores_interactions_reference <- reference_results$interactions zscores_interactions_reference <- reference_results$interactions
@@ -856,8 +920,8 @@ main <- function() {
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
# Create interaction plots # Create interaction plots
reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables) reference_plot_configs <- generate_interaction_plot_configs(df_reference, interaction_vars)
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables) deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, interaction_vars)
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
@@ -965,7 +1029,7 @@ main <- function() {
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm", generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm",
plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2)) plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, variables) correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm", generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2)) plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2))
}) })