251 lines
9.2 KiB
R
251 lines
9.2 KiB
R
suppressMessages({
|
|
library("ggplot2")
|
|
library("plotly")
|
|
library("htmlwidgets")
|
|
library("dplyr")
|
|
library("ggthemes")
|
|
})
|
|
|
|
# Constants for configuration
|
|
PLOT_WIDTH <- 14
|
|
PLOT_HEIGHT <- 9
|
|
BASE_SIZE <- 14
|
|
# BACKUP_SUFFIX <- paste0(".backup_", Sys.Date())
|
|
|
|
options(warn = 2, max.print = 100)
|
|
|
|
# Parse arguments
|
|
parse_arguments <- function() {
|
|
if (interactive()) {
|
|
args <- c(
|
|
"1",
|
|
"3",
|
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
|
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2/results_std.txt",
|
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1/zscores"
|
|
)
|
|
} else {
|
|
args <- commandArgs(trailingOnly = TRUE)
|
|
}
|
|
list(
|
|
exp_number = as.numeric(args[1]),
|
|
delta_bg_factor = as.numeric(args[2]),
|
|
study_info_file = normalizePath(file.path(args[3]), mustWork = FALSE),
|
|
sgd_gene_list = normalizePath(file.path(args[4]), mustWork = FALSE),
|
|
input_file = normalizePath(file.path(args[5]), mustWork = FALSE),
|
|
out_dir = normalizePath(file.path(args[6]), mustWork = FALSE),
|
|
out_dir_qc = normalizePath(file.path(args[6], "qc"), mustWork = FALSE)
|
|
)
|
|
}
|
|
|
|
args <- parse_arguments()
|
|
dir.create(args$out_dir, showWarnings = FALSE)
|
|
dir.create(args$out_dir_qc, showWarnings = FALSE)
|
|
|
|
# Define themes and scales
|
|
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
|
theme_foundation(base_size = base_size, base_family = base_family) +
|
|
theme(
|
|
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
|
text = element_text(),
|
|
panel.background = element_rect(colour = NA),
|
|
plot.background = element_rect(colour = NA),
|
|
panel.border = element_rect(colour = NA),
|
|
axis.title = element_text(face = "bold", size = rel(1)),
|
|
axis.title.y = element_text(angle = 90, vjust = 2),
|
|
axis.title.x = element_text(vjust = -0.2),
|
|
axis.line = element_line(colour = "black"),
|
|
panel.grid.major = element_line(colour = "#f0f0f0"),
|
|
panel.grid.minor = element_blank(),
|
|
legend.key = element_rect(colour = NA),
|
|
legend.position = "bottom",
|
|
legend.direction = "horizontal",
|
|
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
|
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
|
strip.text = element_text(face = "bold")
|
|
)
|
|
}
|
|
|
|
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
|
|
theme_publication(base_size, base_family) +
|
|
theme(
|
|
legend.position = "right",
|
|
legend.direction = "vertical",
|
|
legend.key.size = unit(0.5, "cm"),
|
|
legend.spacing = unit(0, "cm"),
|
|
legend.title = element_text(face = "italic")
|
|
)
|
|
}
|
|
|
|
scale_fill_publication <- function(...) {
|
|
discrete_scale("fill", "Publication", manual_pal(values = c(
|
|
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
|
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
|
)), ...)
|
|
}
|
|
|
|
scale_colour_publication <- function(...) {
|
|
discrete_scale("colour", "Publication", manual_pal(values = c(
|
|
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
|
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
|
)), ...)
|
|
}
|
|
|
|
|
|
# Load and preprocess data
|
|
load_and_preprocess_data <- function(input_file, genes) {
|
|
df <- tryCatch({
|
|
read.delim(input_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
|
}, error = function(e) {
|
|
stop("Error reading file: ", input_file, "\n", e$message)
|
|
})
|
|
|
|
# Filter out rows where the first column is empty or "Scan"
|
|
df <- df[!df[[1]] %in% c("", "Scan"), ]
|
|
|
|
# Convert specific columns to numeric
|
|
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
|
df[numeric_columns[numeric_columns %in% colnames(df)]] <- lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)
|
|
|
|
# Handle missing critical columns with warnings
|
|
df$L <- if ("l" %in% colnames(df)) df$l else {warning("Missing column: l"); NA}
|
|
df$AUC <- if ("AUC96" %in% colnames(df)) df$AUC96 else {warning("Missing column: AUC96"); NA}
|
|
df$conc_num <- if ("Conc" %in% colnames(df)) as.numeric(gsub("[^0-9\\.]", "", df$Conc)) else NA
|
|
df$delta_bg <- if (all(c("X1stBackgrd", "LstBackgrd") %in% colnames(df))) df$LstBackgrd - df$X1stBackgrd else {warning("Missing columns for delta_bg calculation"); NA}
|
|
|
|
# Replace ORF IDs with Gene Names using the SGD gene list
|
|
df$GeneName <- vapply(df$ORF, function(orf) {
|
|
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
|
|
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
|
|
}, character(1))
|
|
|
|
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
|
|
|
return(df)
|
|
}
|
|
|
|
|
|
df <- load_and_preprocess_data(args$input_file)
|
|
|
|
delta_bg_tolerance <- mean(df$delta_bg, na.rm = TRUE) + args$delta_bg_factor * sd(df$delta_bg, na.rm = TRUE)
|
|
message("Exp", args$exp_number, ": The delta_bg_factor is: ", args$delta_bg_factor)
|
|
message("Exp", args$exp_number, ": The delta_bg_tolerance is ", delta_bg_tolerance)
|
|
|
|
df <- df %>%
|
|
mutate(OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep)) %>%
|
|
filter(!Gene %in% c("BLANK", "Blank", "blank"), Drug != "BMH21")
|
|
|
|
# Create plot
|
|
create_plot <- function(df, var, plot_type) {
|
|
filtered_df <- df %>% filter(is.finite(.data[[var]]))
|
|
p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
|
|
ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) +
|
|
theme_publication()
|
|
|
|
if (plot_type == "scatter") {
|
|
p <- p +
|
|
geom_point(shape = 3, size = 0.2, position = "jitter") +
|
|
stat_summary(fun = mean, geom = "point", size = 0.6) +
|
|
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar")
|
|
} else {
|
|
p <- p + geom_boxplot()
|
|
}
|
|
|
|
return(p)
|
|
}
|
|
|
|
# Publish plot to PDF and HTML (Plotly)
|
|
publish_plot <- function(plot, plot_path) {
|
|
# if (file.exists(plot_path)) {
|
|
# file.rename(plot_path, paste0(plot_path, BACKUP_SUFFIX))
|
|
# }
|
|
|
|
pdf(plot_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
|
print(plot)
|
|
dev.off()
|
|
|
|
pgg <- suppressWarnings(ggplotly(plot,
|
|
tooltip = c("L", "K", "ORF", "Gene", "delta_bg")) %>%
|
|
layout(legend = list(orientation = "h")))
|
|
saveWidget(pgg, sub(".pdf$", ".html", plot_path), selfcontained = TRUE)
|
|
}
|
|
|
|
# Publish multiple plots
|
|
publish_multiple_plots <- function(df, variables, plot_type, out_dir_qc, suffix = "") {
|
|
lapply(variables, function(var) {
|
|
plot <- create_plot(df, var, plot_type)
|
|
publish_plot(plot, file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")))
|
|
})
|
|
}
|
|
|
|
# Calculate and publish summary statistics
|
|
publish_summary_stats <- function(df, variables, out_dir) {
|
|
stats_list <- lapply(variables, function(var) {
|
|
df %>%
|
|
group_by(OrfRep, conc_num) %>%
|
|
summarize(
|
|
N = n(),
|
|
mean_val = mean(.data[[var]], na.rm = TRUE),
|
|
sd_val = sd(.data[[var]], na.rm = TRUE),
|
|
se_val = sd_val / sqrt(N)
|
|
)
|
|
})
|
|
summary_stats_df <- bind_rows(stats_list, .id = "variable")
|
|
write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
}
|
|
|
|
# Calculate and publish interaction scores
|
|
publish_interaction_scores <- function(df, out_dir) {
|
|
interaction_scores <- df %>%
|
|
group_by(OrfRep) %>%
|
|
summarize(
|
|
mean_L = mean(L, na.rm = TRUE),
|
|
mean_K = mean(K, na.rm = TRUE),
|
|
mean_r = mean(r, na.rm = TRUE),
|
|
mean_AUC = mean(AUC, na.rm = TRUE),
|
|
delta_bg_mean = mean(delta_bg, na.rm = TRUE),
|
|
delta_bg_sd = sd(delta_bg, na.rm = TRUE)
|
|
) %>%
|
|
mutate(
|
|
l_rank = rank(mean_L),
|
|
k_rank = rank(mean_K),
|
|
r_rank = rank(mean_r),
|
|
auc_rank = rank(mean_AUC)
|
|
)
|
|
|
|
write.csv(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
|
write.csv(interaction_scores %>%
|
|
arrange(l_rank, k_rank), file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
|
}
|
|
|
|
# Publish z-scores
|
|
publish_zscores <- function(df, out_dir) {
|
|
zscores <- df %>%
|
|
mutate(
|
|
zscore_L = scale(L, center = TRUE, scale = TRUE),
|
|
zscore_K = scale(K, center = TRUE, scale = TRUE),
|
|
zscore_r = scale(r, center = TRUE, scale = TRUE),
|
|
zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
|
|
)
|
|
|
|
write.csv(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
|
}
|
|
|
|
# QC plot generation and publication
|
|
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
|
publish_multiple_plots(df, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc)
|
|
|
|
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
|
publish_multiple_plots(delta_bg_above_tolerance, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc, "_above_tolerance")
|
|
}
|
|
|
|
# Run QC plot generation and publication
|
|
generate_and_publish_qc(df, delta_bg_tolerance, args$out_dir_qc)
|
|
|
|
# Run summary statistics, interaction score, and z-score calculations and publications
|
|
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
|
publish_summary_stats(df, variables, args$out_dir)
|
|
publish_interaction_scores(df, args$out_dir)
|
|
publish_zscores(df, args$out_dir)
|