Files
hartman-server/workflow/.old/apps/r/interactions.R
2024-08-23 04:01:30 -04:00

251 lines
9.2 KiB
R

suppressMessages({
library("ggplot2")
library("plotly")
library("htmlwidgets")
library("dplyr")
library("ggthemes")
})
# Constants for configuration
PLOT_WIDTH <- 14
PLOT_HEIGHT <- 9
BASE_SIZE <- 14
# BACKUP_SUFFIX <- paste0(".backup_", Sys.Date())
options(warn = 2, max.print = 100)
# Parse arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"1",
"3",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2/results_std.txt",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1/zscores"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
list(
exp_number = as.numeric(args[1]),
delta_bg_factor = as.numeric(args[2]),
study_info_file = normalizePath(file.path(args[3]), mustWork = FALSE),
sgd_gene_list = normalizePath(file.path(args[4]), mustWork = FALSE),
input_file = normalizePath(file.path(args[5]), mustWork = FALSE),
out_dir = normalizePath(file.path(args[6]), mustWork = FALSE),
out_dir_qc = normalizePath(file.path(args[6], "qc"), mustWork = FALSE)
)
}
args <- parse_arguments()
dir.create(args$out_dir, showWarnings = FALSE)
dir.create(args$out_dir_qc, showWarnings = FALSE)
# Define themes and scales
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
theme_publication(base_size, base_family) +
theme(
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Load and preprocess data
load_and_preprocess_data <- function(input_file, genes) {
df <- tryCatch({
read.delim(input_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
}, error = function(e) {
stop("Error reading file: ", input_file, "\n", e$message)
})
# Filter out rows where the first column is empty or "Scan"
df <- df[!df[[1]] %in% c("", "Scan"), ]
# Convert specific columns to numeric
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
df[numeric_columns[numeric_columns %in% colnames(df)]] <- lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)
# Handle missing critical columns with warnings
df$L <- if ("l" %in% colnames(df)) df$l else {warning("Missing column: l"); NA}
df$AUC <- if ("AUC96" %in% colnames(df)) df$AUC96 else {warning("Missing column: AUC96"); NA}
df$conc_num <- if ("Conc" %in% colnames(df)) as.numeric(gsub("[^0-9\\.]", "", df$Conc)) else NA
df$delta_bg <- if (all(c("X1stBackgrd", "LstBackgrd") %in% colnames(df))) df$LstBackgrd - df$X1stBackgrd else {warning("Missing columns for delta_bg calculation"); NA}
# Replace ORF IDs with Gene Names using the SGD gene list
df$GeneName <- vapply(df$ORF, function(orf) {
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
}, character(1))
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
return(df)
}
df <- load_and_preprocess_data(args$input_file)
delta_bg_tolerance <- mean(df$delta_bg, na.rm = TRUE) + args$delta_bg_factor * sd(df$delta_bg, na.rm = TRUE)
message("Exp", args$exp_number, ": The delta_bg_factor is: ", args$delta_bg_factor)
message("Exp", args$exp_number, ": The delta_bg_tolerance is ", delta_bg_tolerance)
df <- df %>%
mutate(OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep)) %>%
filter(!Gene %in% c("BLANK", "Blank", "blank"), Drug != "BMH21")
# Create plot
create_plot <- function(df, var, plot_type) {
filtered_df <- df %>% filter(is.finite(.data[[var]]))
p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) +
theme_publication()
if (plot_type == "scatter") {
p <- p +
geom_point(shape = 3, size = 0.2, position = "jitter") +
stat_summary(fun = mean, geom = "point", size = 0.6) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar")
} else {
p <- p + geom_boxplot()
}
return(p)
}
# Publish plot to PDF and HTML (Plotly)
publish_plot <- function(plot, plot_path) {
# if (file.exists(plot_path)) {
# file.rename(plot_path, paste0(plot_path, BACKUP_SUFFIX))
# }
pdf(plot_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
print(plot)
dev.off()
pgg <- suppressWarnings(ggplotly(plot,
tooltip = c("L", "K", "ORF", "Gene", "delta_bg")) %>%
layout(legend = list(orientation = "h")))
saveWidget(pgg, sub(".pdf$", ".html", plot_path), selfcontained = TRUE)
}
# Publish multiple plots
publish_multiple_plots <- function(df, variables, plot_type, out_dir_qc, suffix = "") {
lapply(variables, function(var) {
plot <- create_plot(df, var, plot_type)
publish_plot(plot, file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")))
})
}
# Calculate and publish summary statistics
publish_summary_stats <- function(df, variables, out_dir) {
stats_list <- lapply(variables, function(var) {
df %>%
group_by(OrfRep, conc_num) %>%
summarize(
N = n(),
mean_val = mean(.data[[var]], na.rm = TRUE),
sd_val = sd(.data[[var]], na.rm = TRUE),
se_val = sd_val / sqrt(N)
)
})
summary_stats_df <- bind_rows(stats_list, .id = "variable")
write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
}
# Calculate and publish interaction scores
publish_interaction_scores <- function(df, out_dir) {
interaction_scores <- df %>%
group_by(OrfRep) %>%
summarize(
mean_L = mean(L, na.rm = TRUE),
mean_K = mean(K, na.rm = TRUE),
mean_r = mean(r, na.rm = TRUE),
mean_AUC = mean(AUC, na.rm = TRUE),
delta_bg_mean = mean(delta_bg, na.rm = TRUE),
delta_bg_sd = sd(delta_bg, na.rm = TRUE)
) %>%
mutate(
l_rank = rank(mean_L),
k_rank = rank(mean_K),
r_rank = rank(mean_r),
auc_rank = rank(mean_AUC)
)
write.csv(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
write.csv(interaction_scores %>%
arrange(l_rank, k_rank), file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
}
# Publish z-scores
publish_zscores <- function(df, out_dir) {
zscores <- df %>%
mutate(
zscore_L = scale(L, center = TRUE, scale = TRUE),
zscore_K = scale(K, center = TRUE, scale = TRUE),
zscore_r = scale(r, center = TRUE, scale = TRUE),
zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
)
write.csv(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
}
# QC plot generation and publication
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
publish_multiple_plots(df, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc)
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
publish_multiple_plots(delta_bg_above_tolerance, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc, "_above_tolerance")
}
# Run QC plot generation and publication
generate_and_publish_qc(df, delta_bg_tolerance, args$out_dir_qc)
# Run summary statistics, interaction score, and z-score calculations and publications
variables <- c("L", "K", "r", "AUC", "delta_bg")
publish_summary_stats(df, variables, args$out_dir)
publish_interaction_scores(df, args$out_dir)
publish_zscores(df, args$out_dir)