|
@@ -8,7 +8,7 @@ suppressMessages({
|
|
|
library(unix)
|
|
|
})
|
|
|
|
|
|
-options(warn = 2, max.print = 1000)
|
|
|
+options(warn = 2)
|
|
|
options(width = 10000)
|
|
|
|
|
|
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
|
|
@@ -154,7 +154,7 @@ update_gene_names <- function(df, sgd_gene_list) {
|
|
|
}
|
|
|
|
|
|
# Calculate summary statistics for all variables
|
|
|
-calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
|
|
|
+calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
|
|
|
|
|
|
# Summarize the variables within the grouped data
|
|
|
summary_stats <- df %>%
|
|
@@ -171,6 +171,8 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
|
|
|
), .names = "{.fn}_{.col}")
|
|
|
)
|
|
|
|
|
|
+ print(summary_stats)
|
|
|
+
|
|
|
# Prevent .x and .y suffix issues by renaming columns
|
|
|
df_cleaned <- df %>%
|
|
|
select(-any_of(setdiff(names(summary_stats), group_vars))) # Avoid duplicate columns in the final join
|
|
@@ -342,8 +344,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
|
|
plots <- lapply(plot_configs, function(config) {
|
|
|
df <- config$df
|
|
|
|
|
|
- print(head(df))
|
|
|
-
|
|
|
# Check if y_var is NULL and adjust the aes mapping
|
|
|
aes_mapping <- if (is.null(config$y_var)) {
|
|
|
aes(x = !!sym(config$x_var), color = as.factor(!!sym(config$color_var)))
|
|
@@ -424,7 +424,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
|
|
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
|
|
|
}
|
|
|
|
|
|
-
|
|
|
generate_interaction_plot_configs <- function(df, variables) {
|
|
|
configs <- list()
|
|
|
|
|
@@ -587,33 +586,24 @@ main <- function() {
|
|
|
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
|
|
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
- variables <- c("L", "K", "r", "AUC") # fields to filter and calculate across
|
|
|
+ summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
|
|
|
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
|
|
|
+ print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
|
|
|
+ "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
|
|
|
|
|
|
message("Loading and filtering data")
|
|
|
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
|
|
df <- update_gene_names(df, args$sgd_gene_list)
|
|
|
+ df <- as_tibble(df)
|
|
|
|
|
|
# Filter rows that are above tolerance for quality control plots
|
|
|
df_above_tolerance <- df %>% filter(DB == 1)
|
|
|
|
|
|
- # Set L, r, K, and AUC to NA for rows that are above tolerance
|
|
|
- df_na <- df %>% mutate(across(all_of(variables), ~ ifelse(DB == 1, NA, .)))
|
|
|
+ # Set L, r, K, AUC (and delta_bg?) to NA for rows that are above tolerance
|
|
|
+ df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .)))
|
|
|
|
|
|
# Remove rows with 0 values in L
|
|
|
df_no_zeros <- df_na %>% filter(L > 0)
|
|
|
-
|
|
|
- # Additional filtering for non-finite values
|
|
|
- # Filter and print non-finite rows, then filter and keep only finite rows
|
|
|
- df_na_filtered <- df_na %>%
|
|
|
- {
|
|
|
- non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
|
|
- if (nrow(non_finite_rows) > 0) {
|
|
|
- message("Removing non-finite rows:\n")
|
|
|
- print(head(non_finite_rows, n = 10))
|
|
|
- }
|
|
|
- filter(., if_all(c(L), is.finite))
|
|
|
- }
|
|
|
|
|
|
# Set some constants
|
|
|
max_conc <- max(df$conc_num_factor)
|
|
@@ -621,18 +611,47 @@ main <- function() {
|
|
|
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
|
|
|
|
message("Calculating summary statistics before quality control")
|
|
|
- ss <- calculate_summary_stats(df, variables, group_vars = group_vars)
|
|
|
+ ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
|
|
|
+ df_ss <- ss$summary_stats
|
|
|
df_stats <- ss$df_with_stats
|
|
|
+ df_filtered_stats <- df_stats %>%
|
|
|
+ {
|
|
|
+ non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
|
|
+ if (nrow(non_finite_rows) > 0) {
|
|
|
+ message("Removed the following non-finite rows:")
|
|
|
+ print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
|
|
|
+ }
|
|
|
+ filter(., if_all(c(L), is.finite))
|
|
|
+ }
|
|
|
|
|
|
message("Calculating summary statistics after quality control")
|
|
|
- ss <- calculate_summary_stats(df_na, variables, group_vars = group_vars)
|
|
|
+ ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
|
|
|
df_na_ss <- ss$summary_stats
|
|
|
df_na_stats <- ss$df_with_stats
|
|
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
|
+ # Filter out non-finite rows for plotting
|
|
|
+ df_na_filtered_stats <- df_na_stats %>%
|
|
|
+ {
|
|
|
+ non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
|
|
+ if (nrow(non_finite_rows) > 0) {
|
|
|
+ message("Removed the following non-finite rows:")
|
|
|
+ print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
|
|
|
+ }
|
|
|
+ filter(., if_all(c(L), is.finite))
|
|
|
+ }
|
|
|
|
|
|
message("Calculating summary statistics after quality control excluding zero values")
|
|
|
- ss <- calculate_summary_stats(df_no_zeros, variables, group_vars = group_vars)
|
|
|
+ ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
|
|
|
df_no_zeros_stats <- ss$df_with_stats
|
|
|
+ df_no_zeros_filtered_stats <- df_no_zeros_stats %>%
|
|
|
+ {
|
|
|
+ non_finite_rows <- filter(., if_any(c(L), ~ !is.finite(.)))
|
|
|
+ if (nrow(non_finite_rows) > 0) {
|
|
|
+ message("Removed the following non-finite rows:")
|
|
|
+ print(non_finite_rows %>% select(any_of(print_vars)), n = 200)
|
|
|
+ }
|
|
|
+ filter(., if_all(c(L), is.finite))
|
|
|
+ }
|
|
|
|
|
|
message("Filtering by 2SD of K")
|
|
|
df_na_within_2sd_k <- df_na_stats %>%
|
|
@@ -645,19 +664,24 @@ main <- function() {
|
|
|
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
l_within_2sd_k_ss <- ss$summary_stats
|
|
|
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
|
|
- write.csv(l_within_2sd_k_ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
|
|
+ write.csv(l_within_2sd_k_ss,
|
|
|
+ file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
|
|
|
|
|
message("Calculating summary statistics for L outside 2SD of K")
|
|
|
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
l_outside_2sd_k_ss <- ss$summary_stats
|
|
|
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
|
|
- write.csv(l_outside_2sd_k_ss, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
|
|
+ write.csv(l_outside_2sd_k_ss,
|
|
|
+ file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
|
|
|
|
|
- # Plot configurations
|
|
|
# Each plots list corresponds to a file
|
|
|
message("Generating QC plot configurations")
|
|
|
l_vs_k_plots <- list(
|
|
|
- list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
|
|
+ list(
|
|
|
+ df = df,
|
|
|
+ x_var = "L",
|
|
|
+ y_var = "K",
|
|
|
+ plot_type = "scatter",
|
|
|
title = "Raw L vs K before quality control",
|
|
|
color_var = "conc_num",
|
|
|
legend_position = "right"
|
|
@@ -665,8 +689,13 @@ main <- function() {
|
|
|
)
|
|
|
|
|
|
above_threshold_plots <- list(
|
|
|
- list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
|
|
|
- title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
|
|
+ list(
|
|
|
+ df = df_above_tolerance,
|
|
|
+ x_var = "L",
|
|
|
+ y_var = "K",
|
|
|
+ plot_type = "scatter",
|
|
|
+ title = paste("Raw L vs K for strains above delta background threshold of",
|
|
|
+ df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
|
|
color_var = "conc_num",
|
|
|
annotations = list(
|
|
|
x = l_half_median,
|
|
@@ -679,72 +708,102 @@ main <- function() {
|
|
|
)
|
|
|
|
|
|
frequency_delta_bg_plots <- list(
|
|
|
- list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
|
|
+ list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
|
|
title = "Plate analysis by Drug Conc for delta background before quality control",
|
|
|
color_var = "conc_num",
|
|
|
x_label = "Delta Background",
|
|
|
y_label = "Density",
|
|
|
error_bar = FALSE,
|
|
|
- legend_position = "right"
|
|
|
- ),
|
|
|
- list(df = df_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
|
|
+ legend_position = "right"),
|
|
|
+ list(df = df_filtered_stats, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
|
|
title = "Plate analysis by Drug Conc for delta background before quality control",
|
|
|
color_var = "conc_num",
|
|
|
x_label = "Delta Background",
|
|
|
y_label = "Count",
|
|
|
error_bar = FALSE,
|
|
|
- legend_position = "right"
|
|
|
- )
|
|
|
+ legend_position = "right")
|
|
|
)
|
|
|
|
|
|
plate_analysis_plots <- list()
|
|
|
- for (plot_type in c("scatter", "box")) {
|
|
|
- variables <- c("L", "K", "r", "AUC", "delta_bg")
|
|
|
- for (var in variables) {
|
|
|
- for (stage in c("before", "after")) {
|
|
|
- if (stage == "before") {
|
|
|
- df_plot <- df_stats
|
|
|
- } else {
|
|
|
- df_plot <- df_na_stats # TODO use df_na_filtered if necessary
|
|
|
- }
|
|
|
-
|
|
|
- # Set error_bar = TRUE only for scatter plots
|
|
|
- error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
|
|
|
-
|
|
|
- # Create the plot configuration
|
|
|
- plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
|
|
|
- title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
|
|
- error_bar = error_bar, color_var = "conc_num")
|
|
|
-
|
|
|
- plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
|
|
|
+ for (var in summary_vars) {
|
|
|
+ for (stage in c("before", "after")) {
|
|
|
+ if (stage == "before") {
|
|
|
+ df_plot <- df_filtered_stats
|
|
|
+ } else {
|
|
|
+ df_plot <- df_na_filtered_stats
|
|
|
}
|
|
|
+
|
|
|
+ config <- list(
|
|
|
+ df = df_plot,
|
|
|
+ x_var = "scan",
|
|
|
+ y_var = var,
|
|
|
+ plot_type = "scatter",
|
|
|
+ title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
|
|
+ error_bar = TRUE, color_var = "conc_num")
|
|
|
+
|
|
|
+ plate_analysis_plots <- append(plate_analysis_plots, list(config))
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- plate_analysis_no_zero_plots <- list()
|
|
|
- for (plot_type in c("scatter", "box")) {
|
|
|
- variables <- c("L", "K", "r", "AUC", "delta_bg")
|
|
|
- for (var in variables) {
|
|
|
-
|
|
|
- # Set error_bar = TRUE only for scatter plots
|
|
|
- error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
|
|
|
-
|
|
|
- # Create the plot configuration
|
|
|
- plot_config <- list(
|
|
|
- df = df_no_zeros_stats,
|
|
|
- x_var = "scan",
|
|
|
- y_var = var,
|
|
|
- plot_type = plot_type,
|
|
|
- title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
|
|
|
- error_bar = error_bar,
|
|
|
- color_var = "conc_num"
|
|
|
- )
|
|
|
- plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
|
|
|
+ plate_analysis_boxplots <- list()
|
|
|
+ for (var in summary_vars) {
|
|
|
+ for (stage in c("before", "after")) {
|
|
|
+ if (stage == "before") {
|
|
|
+ df_plot <- df_filtered_stats
|
|
|
+ } else {
|
|
|
+ df_plot <- df_na_filtered_stats
|
|
|
}
|
|
|
+
|
|
|
+ config <- list(
|
|
|
+ df = df_plot,
|
|
|
+ x_var = "scan",
|
|
|
+ y_var = var,
|
|
|
+ plot_type = "box",
|
|
|
+ title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
|
|
+ error_bar = FALSE, color_var = "conc_num")
|
|
|
+
|
|
|
+ plate_analysis_boxplots <- append(plate_analysis_boxplots, list(config))
|
|
|
}
|
|
|
+ }
|
|
|
+
|
|
|
+ plate_analysis_no_zeros_plots <- list()
|
|
|
+ for (var in summary_vars) {
|
|
|
+
|
|
|
+ config <- list(
|
|
|
+ df = df_no_zeros_filtered_stats,
|
|
|
+ x_var = "scan",
|
|
|
+ y_var = var,
|
|
|
+ plot_type = "scatter",
|
|
|
+ title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
|
|
|
+ error_bar = TRUE,
|
|
|
+ color_var = "conc_num")
|
|
|
+
|
|
|
+ plate_analysis_no_zeros_plots <- append(plate_analysis_no_zeros_plots, list(config))
|
|
|
+ }
|
|
|
+
|
|
|
+ plate_analysis_no_zeros_boxplots <- list()
|
|
|
+ for (var in summary_vars) {
|
|
|
+
|
|
|
+
|
|
|
+ # Create the plot configuration
|
|
|
+ config <- list(
|
|
|
+ df = df_no_zeros_filtered_stats,
|
|
|
+ x_var = "scan",
|
|
|
+ y_var = var,
|
|
|
+ plot_type = "box",
|
|
|
+ title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
|
|
|
+ error_bar = FALSE,
|
|
|
+ color_var = "conc_num"
|
|
|
+ )
|
|
|
+ plate_analysis_no_zeros_boxplots <- append(plate_analysis_no_zeros_boxplots, list(config))
|
|
|
+ }
|
|
|
|
|
|
l_outside_2sd_k_plots <- list(
|
|
|
- list(df = df_na_l_outside_2sd_k_stats, x_var = "L", y_var = "K", plot_type = "scatter",
|
|
|
+ list(
|
|
|
+ df = df_na_l_outside_2sd_k_stats,
|
|
|
+ x_var = "L",
|
|
|
+ y_var = "K",
|
|
|
+ plot_type = "scatter",
|
|
|
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
|
|
color_var = "conc_num",
|
|
|
legend_position = "right"
|
|
@@ -752,20 +811,25 @@ main <- function() {
|
|
|
)
|
|
|
|
|
|
delta_bg_outside_2sd_k_plots <- list(
|
|
|
- list(df = df_na_l_outside_2sd_k_stats, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
|
|
|
+ list(
|
|
|
+ df = df_na_l_outside_2sd_k_stats,
|
|
|
+ x_var = "delta_bg",
|
|
|
+ y_var = "K",
|
|
|
+ plot_type = "scatter",
|
|
|
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
|
|
|
color_var = "conc_num",
|
|
|
legend_position = "right"
|
|
|
)
|
|
|
)
|
|
|
|
|
|
- # Generate and save plots for each QC step
|
|
|
message("Generating QC plots")
|
|
|
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
|
|
|
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
|
|
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
|
|
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
|
|
+ generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
|
|
|
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
|
|
|
+ generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
|
|
|
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
|
|
|
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
|
|
|
|
|
@@ -795,7 +859,7 @@ main <- function() {
|
|
|
|
|
|
# Recalculate summary statistics for the background strain
|
|
|
message("Calculating summary statistics for background strain")
|
|
|
- ss_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
|
+ ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
|
|
|
summary_stats_bg <- ss_bg$summary_stats
|
|
|
# df_bg_stats <- ss_bg$df_with_stats
|
|
|
write.csv(summary_stats_bg,
|
|
@@ -835,14 +899,14 @@ main <- function() {
|
|
|
ungroup()
|
|
|
|
|
|
# Calculate interactions
|
|
|
- variables <- c("L", "K", "r", "AUC")
|
|
|
+ interaction_vars <- c("L", "K", "r", "AUC")
|
|
|
message("Calculating interaction scores")
|
|
|
# print("Reference strain:")
|
|
|
# print(head(reference_strain))
|
|
|
- reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
|
|
|
+ reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars)
|
|
|
# print("Deletion strains:")
|
|
|
# print(head(deletion_strains))
|
|
|
- deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
|
|
|
+ deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars)
|
|
|
|
|
|
zscores_calculations_reference <- reference_results$calculations
|
|
|
zscores_interactions_reference <- reference_results$interactions
|
|
@@ -856,8 +920,8 @@ main <- function() {
|
|
|
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
|
|
|
|
|
|
# Create interaction plots
|
|
|
- reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
|
|
|
- deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
|
|
|
+ reference_plot_configs <- generate_interaction_plot_configs(df_reference, interaction_vars)
|
|
|
+ deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, interaction_vars)
|
|
|
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
|
|
|
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
|
|
|
|
|
@@ -965,7 +1029,7 @@ main <- function() {
|
|
|
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm",
|
|
|
plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2))
|
|
|
|
|
|
- correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, variables)
|
|
|
+ correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
|
|
|
generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
|
|
|
plot_configs = correlation_plot_configs, grid_layout = list(ncol = 2, nrow = 2))
|
|
|
})
|