Implement calculate_interaction_zscores.R
This commit is contained in:
@@ -1,10 +1,10 @@
|
|||||||
suppressMessages({
|
suppressMessages({
|
||||||
if (!require("ggplot2")) stop("Package ggplot2 is required but not installed.")
|
library("ggplot2")
|
||||||
if (!require("plotly")) stop("Package plotly is required but not installed.")
|
library("plotly")
|
||||||
if (!require("htmlwidgets")) stop("Package htmlwidgets is required but not installed.")
|
library("htmlwidgets")
|
||||||
if (!require("dplyr")) stop("Package dplyr is required but not installed.")
|
library("dplyr")
|
||||||
if (!require("ggthemes")) stop("Package ggthemes is required but not installed.")
|
library("ggthemes")
|
||||||
if (!require("plyr")) stop("Package plyr is required but not installed.")
|
library("data.table")
|
||||||
})
|
})
|
||||||
|
|
||||||
# Constants for configuration
|
# Constants for configuration
|
||||||
@@ -14,9 +14,10 @@ BASE_SIZE <- 14
|
|||||||
|
|
||||||
options(warn = 2, max.print = 100)
|
options(warn = 2, max.print = 100)
|
||||||
|
|
||||||
|
# Function to parse arguments
|
||||||
parse_arguments <- function() {
|
parse_arguments <- function() {
|
||||||
args <- if (interactive()) {
|
if (interactive()) {
|
||||||
c(
|
args <- c(
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
|
||||||
3,
|
3,
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
||||||
@@ -24,14 +25,10 @@ parse_arguments <- function() {
|
|||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
||||||
"Experiment 1: Doxo versus HLD",
|
"Experiment 1: Doxo versus HLD",
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
||||||
"Experiment 2: HLD versus Doxo",
|
"Experiment 2: HLD versus Doxo"
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp3",
|
|
||||||
"Experiment 3: HLD versus WT",
|
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp4",
|
|
||||||
"Experiment 4: Doxo versus WT"
|
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
commandArgs(trailingOnly = TRUE)
|
args <- commandArgs(trailingOnly = TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE)
|
paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE)
|
||||||
@@ -49,7 +46,9 @@ parse_arguments <- function() {
|
|||||||
|
|
||||||
args <- parse_arguments()
|
args <- parse_arguments()
|
||||||
|
|
||||||
dir.create(args$out_dir, showWarnings = FALSE)
|
# Ensure main output directories exist
|
||||||
|
dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
||||||
|
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
||||||
|
|
||||||
# Define themes and scales
|
# Define themes and scales
|
||||||
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
||||||
@@ -110,18 +109,26 @@ sgd_genes <- function(sgd_gene_list) {
|
|||||||
genes <- sgd_genes(args$sgd_gene_list)
|
genes <- sgd_genes(args$sgd_gene_list)
|
||||||
|
|
||||||
load_and_preprocess_data <- function(easy_results_file, genes) {
|
load_and_preprocess_data <- function(easy_results_file, genes) {
|
||||||
|
# Attempt to read the file and handle any errors that occur
|
||||||
df <- tryCatch({
|
df <- tryCatch({
|
||||||
read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
||||||
}, error = function(e) {
|
}, error = function(e) {
|
||||||
stop("Error reading file: ", easy_results_file, "\n", e$message)
|
stop("Error reading file: ", easy_results_file, "\n", e$message)
|
||||||
}) %>%
|
})
|
||||||
filter(!.[[1]] %in% c("", "Scan")) # Fixed syntax
|
|
||||||
|
# Filter out non-data rows
|
||||||
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
|
||||||
df[numeric_columns[numeric_columns %in% colnames(df)]] <-
|
|
||||||
lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)
|
|
||||||
|
|
||||||
df <- df %>%
|
df <- df %>%
|
||||||
|
filter(!.[[1]] %in% c("", "Scan"))
|
||||||
|
|
||||||
|
# Convert specified columns to numeric
|
||||||
|
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
||||||
|
df[numeric_columns[numeric_columns %in% colnames(df)]] <-
|
||||||
|
lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)
|
||||||
|
|
||||||
|
# Further filter and mutate the data
|
||||||
|
df <- df %>%
|
||||||
|
filter(!(Scan %in% c("", "Scan"))) %>%
|
||||||
|
{if ("Conc" %in% colnames(.)) filter(., Conc != "0ug/mL") else .} %>%
|
||||||
mutate(
|
mutate(
|
||||||
L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA},
|
L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA},
|
||||||
AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA},
|
AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA},
|
||||||
@@ -131,17 +138,20 @@ load_and_preprocess_data <- function(easy_results_file, genes) {
|
|||||||
GeneName = vapply(ORF, function(orf) {
|
GeneName = vapply(ORF, function(orf) {
|
||||||
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
|
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
|
||||||
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
|
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
|
||||||
}, character(1)) # Ensures a character vector is returned
|
}, character(1))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Check if the dataframe is empty after filtering
|
||||||
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
||||||
|
|
||||||
return(df)
|
return(df)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Plot creation and publication
|
||||||
create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
|
create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
|
||||||
plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot
|
plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot
|
||||||
filtered_df <- filter(df, is.finite(.data[[var]]))
|
filtered_df <- df[is.finite(df[[var]]), ]
|
||||||
|
|
||||||
p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
|
p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
|
||||||
plot_func(shape = 3, size = 0.2, position = "jitter") +
|
plot_func(shape = 3, size = 0.2, position = "jitter") +
|
||||||
@@ -161,27 +171,27 @@ create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "")
|
|||||||
saveWidget(pgg, html_path, selfcontained = TRUE)
|
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Summary statistics publication
|
||||||
publish_summary_stats <- function(df, variables, out_dir) {
|
publish_summary_stats <- function(df, variables, out_dir) {
|
||||||
stats_list <- lapply(variables, function(var) {
|
stats_list <- lapply(variables, function(var) {
|
||||||
df %>%
|
df %>%
|
||||||
dplyr::group_by(OrfRep, conc_num) %>%
|
dplyr::group_by(OrfRep, conc_num) %>%
|
||||||
dplyr::summarize(
|
dplyr::summarize(
|
||||||
N = dplyr::n(), # Ensure that the correct version of n() is used
|
N = dplyr::n(),
|
||||||
mean_val = mean(.data[[var]], na.rm = TRUE),
|
mean_val = mean(.data[[var]], na.rm = TRUE),
|
||||||
sd_val = sd(.data[[var]], na.rm = TRUE),
|
sd_val = sd(.data[[var]], na.rm = TRUE),
|
||||||
se_val = sd_val / sqrt(N)
|
se_val = sd_val / sqrt(N)
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
summary_stats_df <- dplyr::bind_rows(stats_list, .id = "variable")
|
summary_stats_df <- dplyr::bind_rows(stats_list, .id = "variable")
|
||||||
write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
fwrite(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Interaction scores publication
|
||||||
publish_interaction_scores <- function(df, out_dir) {
|
publish_interaction_scores <- function(df, out_dir) {
|
||||||
interaction_scores <- df %>%
|
interaction_scores <- df %>%
|
||||||
group_by(OrfRep) %>%
|
dplyr::group_by(OrfRep) %>%
|
||||||
summarize(
|
dplyr::summarize(
|
||||||
mean_L = mean(L, na.rm = TRUE),
|
mean_L = mean(L, na.rm = TRUE),
|
||||||
mean_K = mean(K, na.rm = TRUE),
|
mean_K = mean(K, na.rm = TRUE),
|
||||||
mean_r = mean(r, na.rm = TRUE),
|
mean_r = mean(r, na.rm = TRUE),
|
||||||
@@ -189,147 +199,75 @@ publish_interaction_scores <- function(df, out_dir) {
|
|||||||
delta_bg_mean = mean(delta_bg, na.rm = TRUE),
|
delta_bg_mean = mean(delta_bg, na.rm = TRUE),
|
||||||
delta_bg_sd = sd(delta_bg, na.rm = TRUE)
|
delta_bg_sd = sd(delta_bg, na.rm = TRUE)
|
||||||
) %>%
|
) %>%
|
||||||
mutate(
|
dplyr::mutate(
|
||||||
l_rank = rank(mean_L),
|
l_rank = rank(mean_L),
|
||||||
k_rank = rank(mean_K),
|
k_rank = rank(mean_K),
|
||||||
r_rank = rank(mean_r),
|
r_rank = rank(mean_r),
|
||||||
auc_rank = rank(mean_AUC)
|
auc_rank = rank(mean_AUC)
|
||||||
)
|
)
|
||||||
|
|
||||||
write.csv(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
||||||
write.csv(arrange(interaction_scores, l_rank, k_rank),
|
fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
|
||||||
file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Z-scores publication
|
||||||
publish_zscores <- function(df, out_dir) {
|
publish_zscores <- function(df, out_dir) {
|
||||||
zscores <- df %>%
|
zscores <- df %>%
|
||||||
mutate(
|
dplyr::mutate(
|
||||||
zscore_L = scale(L, center = TRUE, scale = TRUE),
|
zscore_L = scale(L, center = TRUE, scale = TRUE),
|
||||||
zscore_K = scale(K, center = TRUE, scale = TRUE),
|
zscore_K = scale(K, center = TRUE, scale = TRUE),
|
||||||
zscore_r = scale(r, center = TRUE, scale = TRUE),
|
zscore_r = scale(r, center = TRUE, scale = TRUE),
|
||||||
zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
|
zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
|
||||||
)
|
)
|
||||||
|
|
||||||
write.csv(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# QC generation and publication
|
||||||
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
||||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||||
lapply(variables, create_and_publish_plot, df = df, plot_type = "scatter", out_dir_qc = out_dir_qc)
|
lapply(variables, create_and_publish_plot, df = df, plot_type = "scatter", out_dir_qc = out_dir_qc)
|
||||||
delta_bg_above_tolerance <- filter(df, delta_bg >= delta_bg_tolerance)
|
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
||||||
lapply(variables, create_and_publish_plot, df = delta_bg_above_tolerance,
|
lapply(variables, create_and_publish_plot, df = delta_bg_above_tolerance,
|
||||||
plot_type = "scatter", out_dir_qc = out_dir_qc, suffix = "_above_tolerance")
|
plot_type = "scatter", out_dir_qc = out_dir_qc, suffix = "_above_tolerance")
|
||||||
}
|
}
|
||||||
|
|
||||||
process_exp_dir <- function(exp_dir, exp_name, genes, easy_results_file) {
|
# Process experiments
|
||||||
|
process_experiment <- function(exp_name, exp_dir, sgd_genes, output_dir) {
|
||||||
out_dir <- file.path(exp_dir, "zscores")
|
out_dir <- file.path(exp_dir, "zscores")
|
||||||
out_dir_qc <- file.path(exp_dir, "qc")
|
out_dir_qc <- file.path(out_dir, "qc")
|
||||||
dir.create(out_dir, showWarnings = FALSE)
|
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
|
||||||
dir.create(out_dir_qc, showWarnings = FALSE)
|
dir.create(out_dir_qc, showWarnings = FALSE)
|
||||||
df <- load_and_preprocess_data(easy_results_file, genes)
|
|
||||||
delta_bg_tolerance <- mean(df$delta_bg, na.rm = TRUE) + 3 * sd(df$delta_bg, na.rm = TRUE)
|
|
||||||
|
|
||||||
generate_and_publish_qc(df, delta_bg_tolerance, out_dir_qc)
|
data <- load_and_preprocess_data(args$easy_results_file, sgd_genes)
|
||||||
|
|
||||||
|
delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE)
|
||||||
|
|
||||||
|
generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc)
|
||||||
|
|
||||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||||
publish_summary_stats(df, variables, out_dir)
|
publish_summary_stats(data, variables, out_dir)
|
||||||
publish_interaction_scores(df, out_dir)
|
publish_interaction_scores(data, out_dir)
|
||||||
publish_zscores(df, out_dir)
|
publish_zscores(data, out_dir)
|
||||||
|
|
||||||
list(
|
output_file <- file.path(out_dir, "zscores_interaction.csv")
|
||||||
zscores_file = file.path(out_dir, "zscores_interaction.csv"),
|
fwrite(data, output_file, row.names = FALSE)
|
||||||
exp_name = exp_name
|
|
||||||
)
|
return(output_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
processed_experiments <- lapply(names(args$experiments), function(exp_name) {
|
# Process all experiments
|
||||||
process_exp_dir(args$experiments[[exp_name]], exp_name, genes, args$easy_results_file)
|
processed_files <- lapply(names(args$experiments), function(exp_name) {
|
||||||
|
process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir)
|
||||||
})
|
})
|
||||||
|
|
||||||
zscores_files <- sapply(processed_experiments, `[[`, "zscores_file")
|
# Combine results from all experiments if multiple experiments exist
|
||||||
exp_names <- sapply(processed_experiments, `[[`, "exp_name")
|
if (length(processed_files) > 1) {
|
||||||
|
combined_data <- Reduce(function(x, y) {
|
||||||
combine_zscores <- function(zscores_files) {
|
merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE)
|
||||||
if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
|
}, processed_files)
|
||||||
|
|
||||||
joined_data <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
|
combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
|
||||||
for (file in zscores_files[-1]) {
|
fwrite(combined_data, combined_output_file, row.names = FALSE)
|
||||||
temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
|
|
||||||
joined_data <- plyr::join(joined_data, temp_data, by = "OrfRep", type = "full")
|
|
||||||
}
|
|
||||||
joined_data
|
|
||||||
}
|
}
|
||||||
|
|
||||||
process_combined_zscores <- function(joined_data, sd, out_dir, exp_names) {
|
|
||||||
ordered_data <- joined_data %>%
|
|
||||||
select(contains("OrfRep"), matches("Gene"),
|
|
||||||
contains("z_lm_k"), contains("z_shift_k"),
|
|
||||||
contains("z_lm_l"), contains("z_shift_l")) %>%
|
|
||||||
arrange(contains("OrfRep"))
|
|
||||||
|
|
||||||
clean_headers <- function(data, suffixes) {
|
|
||||||
suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
|
|
||||||
select(data, -all_of(suffixes_to_remove))
|
|
||||||
}
|
|
||||||
|
|
||||||
headSel <- clean_headers(ordered_data, length(zscores_files) - 1)
|
|
||||||
headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
|
|
||||||
|
|
||||||
fill_na_in_columns <- function(data) {
|
|
||||||
for (header in colnames(data)) {
|
|
||||||
if (grepl("Shift", header)) {
|
|
||||||
data[[header]][is.na(data[[header]])] <- 0.001
|
|
||||||
} else if (grepl("Z_lm_", header)) {
|
|
||||||
data[[header]][is.na(data[[header]])] <- 0.0001
|
|
||||||
}
|
|
||||||
}
|
|
||||||
data
|
|
||||||
}
|
|
||||||
|
|
||||||
headSel <- fill_na_in_columns(headSel)
|
|
||||||
|
|
||||||
filter_by_sd <- function(data, sd) {
|
|
||||||
if (sd == 0) return(data)
|
|
||||||
|
|
||||||
z_lm_cols <- select(data, contains("z_lm_"))
|
|
||||||
filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
|
|
||||||
data[filter_vector, ]
|
|
||||||
}
|
|
||||||
|
|
||||||
REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), sd)
|
|
||||||
shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), sd)
|
|
||||||
|
|
||||||
reorder_columns <- function(data1, data2) {
|
|
||||||
combined_data <- data1
|
|
||||||
for (i in 3:ncol(data1)) {
|
|
||||||
combined_data <- cbind(combined_data, data2[i], data1[i])
|
|
||||||
}
|
|
||||||
combined_data
|
|
||||||
}
|
|
||||||
|
|
||||||
combI <- reorder_columns(headSel2, shiftOnly)
|
|
||||||
|
|
||||||
write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
|
|
||||||
write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
|
|
||||||
|
|
||||||
relabel_headers <- function(headers, exp_names) {
|
|
||||||
new_labels <- headers
|
|
||||||
for (i in seq_along(headers)) {
|
|
||||||
suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
|
|
||||||
if (suffix %in% seq_along(exp_names)) {
|
|
||||||
exp_name <- exp_names[as.numeric(suffix)]
|
|
||||||
new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
new_labels
|
|
||||||
}
|
|
||||||
|
|
||||||
colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), exp_names)
|
|
||||||
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), exp_names)
|
|
||||||
|
|
||||||
write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
|
|
||||||
write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
joined_data <- combine_zscores(zscores_files)
|
|
||||||
process_combined_zscores(joined_data, args$sd, args$out_dir, exp_names)
|
|
||||||
|
|||||||
@@ -1,238 +0,0 @@
|
|||||||
suppressMessages({
|
|
||||||
library("ggplot2")
|
|
||||||
library("plotly")
|
|
||||||
library("htmlwidgets")
|
|
||||||
library("dplyr")
|
|
||||||
library("ggthemes")
|
|
||||||
})
|
|
||||||
|
|
||||||
# Constants for configuration
|
|
||||||
PLOT_WIDTH <- 14
|
|
||||||
PLOT_HEIGHT <- 9
|
|
||||||
BASE_SIZE <- 14
|
|
||||||
# BACKUP_SUFFIX <- paste0(".backup_", Sys.Date())
|
|
||||||
|
|
||||||
options(warn = 2, max.print = 100)
|
|
||||||
|
|
||||||
# Parse arguments
|
|
||||||
parse_arguments <- function() {
|
|
||||||
if (interactive()) {
|
|
||||||
args <- c(
|
|
||||||
"1",
|
|
||||||
"3",
|
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
|
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2/results_std.txt",
|
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1/zscores"
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
args <- commandArgs(trailingOnly = TRUE)
|
|
||||||
}
|
|
||||||
list(
|
|
||||||
exp_number = as.numeric(args[1]),
|
|
||||||
delta_bg_factor = as.numeric(args[2]),
|
|
||||||
study_info_file = normalizePath(file.path(args[3]), mustWork = FALSE),
|
|
||||||
sgd_gene_list = normalizePath(file.path(args[4]), mustWork = FALSE),
|
|
||||||
input_file = normalizePath(file.path(args[5]), mustWork = FALSE),
|
|
||||||
out_dir = normalizePath(file.path(args[6]), mustWork = FALSE),
|
|
||||||
out_dir_qc = normalizePath(file.path(args[6], "qc"), mustWork = FALSE)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
args <- parse_arguments()
|
|
||||||
dir.create(args$out_dir, showWarnings = FALSE)
|
|
||||||
dir.create(args$out_dir_qc, showWarnings = FALSE)
|
|
||||||
|
|
||||||
# Define themes and scales
|
|
||||||
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
|
||||||
theme_foundation(base_size = base_size, base_family = base_family) +
|
|
||||||
theme(
|
|
||||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
|
||||||
text = element_text(),
|
|
||||||
panel.background = element_rect(colour = NA),
|
|
||||||
plot.background = element_rect(colour = NA),
|
|
||||||
panel.border = element_rect(colour = NA),
|
|
||||||
axis.title = element_text(face = "bold", size = rel(1)),
|
|
||||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
|
||||||
axis.title.x = element_text(vjust = -0.2),
|
|
||||||
axis.line = element_line(colour = "black"),
|
|
||||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
|
||||||
panel.grid.minor = element_blank(),
|
|
||||||
legend.key = element_rect(colour = NA),
|
|
||||||
legend.position = "bottom",
|
|
||||||
legend.direction = "horizontal",
|
|
||||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
|
||||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
|
||||||
strip.text = element_text(face = "bold")
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
|
|
||||||
theme_publication(base_size, base_family) +
|
|
||||||
theme(
|
|
||||||
legend.position = "right",
|
|
||||||
legend.direction = "vertical",
|
|
||||||
legend.key.size = unit(0.5, "cm"),
|
|
||||||
legend.spacing = unit(0, "cm"),
|
|
||||||
legend.title = element_text(face = "italic")
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
scale_fill_publication <- function(...) {
|
|
||||||
discrete_scale("fill", "Publication", manual_pal(values = c(
|
|
||||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
|
||||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
|
||||||
)), ...)
|
|
||||||
}
|
|
||||||
|
|
||||||
scale_colour_publication <- function(...) {
|
|
||||||
discrete_scale("colour", "Publication", manual_pal(values = c(
|
|
||||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
|
||||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
|
||||||
)), ...)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Load and preprocess data
|
|
||||||
load_and_preprocess_data <- function(input_file) {
|
|
||||||
df <- tryCatch({
|
|
||||||
read.delim(input_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
|
||||||
}, error = function(e) {
|
|
||||||
stop("Error reading file: ", input_file, "\n", e$message)
|
|
||||||
})
|
|
||||||
|
|
||||||
df <- df[!df[[1]] %in% c("", "Scan"), ]
|
|
||||||
|
|
||||||
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
|
||||||
df[numeric_columns[numeric_columns %in% colnames(df)]] <- lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)
|
|
||||||
|
|
||||||
df$L <- if ("l" %in% colnames(df)) df$l else {warning("Missing column: l"); NA}
|
|
||||||
df$AUC <- if ("AUC96" %in% colnames(df)) df$AUC96 else {warning("Missing column: AUC96"); NA}
|
|
||||||
df$conc_num <- if ("Conc" %in% colnames(df)) as.numeric(gsub("[^0-9\\.]", "", df$Conc)) else NA
|
|
||||||
df$delta_bg <- if (all(c("X1stBackgrd", "LstBackgrd") %in% colnames(df))) df$LstBackgrd - df$X1stBackgrd else {warning("Missing columns for delta_bg calculation"); NA}
|
|
||||||
|
|
||||||
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
|
||||||
df
|
|
||||||
}
|
|
||||||
|
|
||||||
df <- load_and_preprocess_data(args$input_file)
|
|
||||||
|
|
||||||
delta_bg_tolerance <- mean(df$delta_bg, na.rm = TRUE) + args$delta_bg_factor * sd(df$delta_bg, na.rm = TRUE)
|
|
||||||
message("Exp", args$exp_number, ": The delta_bg_factor is: ", args$delta_bg_factor)
|
|
||||||
message("Exp", args$exp_number, ": The delta_bg_tolerance is ", delta_bg_tolerance)
|
|
||||||
|
|
||||||
df <- df %>%
|
|
||||||
mutate(OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep)) %>%
|
|
||||||
filter(!Gene %in% c("BLANK", "Blank", "blank"), Drug != "BMH21")
|
|
||||||
|
|
||||||
# Create plot
|
|
||||||
create_plot <- function(df, var, plot_type) {
|
|
||||||
filtered_df <- df %>% filter(is.finite(.data[[var]]))
|
|
||||||
p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
|
|
||||||
ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) +
|
|
||||||
theme_publication()
|
|
||||||
|
|
||||||
if (plot_type == "scatter") {
|
|
||||||
p <- p +
|
|
||||||
geom_point(shape = 3, size = 0.2, position = "jitter") +
|
|
||||||
stat_summary(fun = mean, geom = "point", size = 0.6) +
|
|
||||||
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar")
|
|
||||||
} else {
|
|
||||||
p <- p + geom_boxplot()
|
|
||||||
}
|
|
||||||
|
|
||||||
return(p)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Publish plot to PDF and HTML (Plotly)
|
|
||||||
publish_plot <- function(plot, plot_path) {
|
|
||||||
# if (file.exists(plot_path)) {
|
|
||||||
# file.rename(plot_path, paste0(plot_path, BACKUP_SUFFIX))
|
|
||||||
# }
|
|
||||||
|
|
||||||
pdf(plot_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
|
||||||
print(plot)
|
|
||||||
dev.off()
|
|
||||||
|
|
||||||
pgg <- suppressWarnings(ggplotly(plot,
|
|
||||||
tooltip = c("L", "K", "ORF", "Gene", "delta_bg")) %>%
|
|
||||||
layout(legend = list(orientation = "h")))
|
|
||||||
saveWidget(pgg, sub(".pdf$", ".html", plot_path), selfcontained = TRUE)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Publish multiple plots
|
|
||||||
publish_multiple_plots <- function(df, variables, plot_type, out_dir_qc, suffix = "") {
|
|
||||||
lapply(variables, function(var) {
|
|
||||||
plot <- create_plot(df, var, plot_type)
|
|
||||||
publish_plot(plot, file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
# Calculate and publish summary statistics
|
|
||||||
publish_summary_stats <- function(df, variables, out_dir) {
|
|
||||||
stats_list <- lapply(variables, function(var) {
|
|
||||||
df %>%
|
|
||||||
group_by(OrfRep, conc_num) %>%
|
|
||||||
summarize(
|
|
||||||
N = n(),
|
|
||||||
mean_val = mean(.data[[var]], na.rm = TRUE),
|
|
||||||
sd_val = sd(.data[[var]], na.rm = TRUE),
|
|
||||||
se_val = sd_val / sqrt(N)
|
|
||||||
)
|
|
||||||
})
|
|
||||||
summary_stats_df <- bind_rows(stats_list, .id = "variable")
|
|
||||||
write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Calculate and publish interaction scores
|
|
||||||
publish_interaction_scores <- function(df, out_dir) {
|
|
||||||
interaction_scores <- df %>%
|
|
||||||
group_by(OrfRep) %>%
|
|
||||||
summarize(
|
|
||||||
mean_L = mean(L, na.rm = TRUE),
|
|
||||||
mean_K = mean(K, na.rm = TRUE),
|
|
||||||
mean_r = mean(r, na.rm = TRUE),
|
|
||||||
mean_AUC = mean(AUC, na.rm = TRUE),
|
|
||||||
delta_bg_mean = mean(delta_bg, na.rm = TRUE),
|
|
||||||
delta_bg_sd = sd(delta_bg, na.rm = TRUE)
|
|
||||||
) %>%
|
|
||||||
mutate(
|
|
||||||
l_rank = rank(mean_L),
|
|
||||||
k_rank = rank(mean_K),
|
|
||||||
r_rank = rank(mean_r),
|
|
||||||
auc_rank = rank(mean_AUC)
|
|
||||||
)
|
|
||||||
|
|
||||||
write.csv(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
|
||||||
write.csv(interaction_scores %>%
|
|
||||||
arrange(l_rank, k_rank), file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Publish z-scores
|
|
||||||
publish_zscores <- function(df, out_dir) {
|
|
||||||
zscores <- df %>%
|
|
||||||
mutate(
|
|
||||||
zscore_L = scale(L, center = TRUE, scale = TRUE),
|
|
||||||
zscore_K = scale(K, center = TRUE, scale = TRUE),
|
|
||||||
zscore_r = scale(r, center = TRUE, scale = TRUE),
|
|
||||||
zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
|
|
||||||
)
|
|
||||||
|
|
||||||
write.csv(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
# QC plot generation and publication
|
|
||||||
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
|
||||||
publish_multiple_plots(df, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc)
|
|
||||||
|
|
||||||
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
|
||||||
publish_multiple_plots(delta_bg_above_tolerance, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc, "_above_tolerance")
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run QC plot generation and publication
|
|
||||||
generate_and_publish_qc(df, delta_bg_tolerance, args$out_dir_qc)
|
|
||||||
|
|
||||||
# Run summary statistics, interaction score, and z-score calculations and publications
|
|
||||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
|
||||||
publish_summary_stats(df, variables, args$out_dir)
|
|
||||||
publish_interaction_scores(df, args$out_dir)
|
|
||||||
publish_zscores(df, args$out_dir)
|
|
||||||
@@ -7,7 +7,7 @@ parse_arguments <- function() {
|
|||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
|
||||||
2,
|
2,
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -427,6 +427,25 @@ execute() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
is_directory_empty() {
|
||||||
|
local dir="$1"
|
||||||
|
|
||||||
|
# Check if the directory exists and is a directory
|
||||||
|
if [ ! -d "$dir" ]; then
|
||||||
|
echo "Directory does not exist or is not a directory"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Iterate over the files and directories inside the specified directory
|
||||||
|
for _ in "$dir"/*; do
|
||||||
|
# If we find at least one entry, it's not empty
|
||||||
|
return 1
|
||||||
|
done
|
||||||
|
|
||||||
|
# If the loop completes without finding any entries, the directory is empty
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
# @description Backup one or more files to an incremented .bk file
|
# @description Backup one or more files to an incremented .bk file
|
||||||
#
|
#
|
||||||
# **TODO**
|
# **TODO**
|
||||||
@@ -1481,8 +1500,10 @@ calculate_interaction_zscores() {
|
|||||||
EOF
|
EOF
|
||||||
|
|
||||||
declare script="$APPS_DIR/r/calculate_interaction_zscores.R"
|
declare script="$APPS_DIR/r/calculate_interaction_zscores.R"
|
||||||
declare combined_out_path="${1:-"$STUDY_RESULTS_DIR/zscores"}"
|
declare -a out_paths=("${1:-"$STUDY_RESULTS_DIR/zscores"}")
|
||||||
declare -a out_paths=("${EXP_PATHS[@]}/zscores" "$combined_out_path")
|
for path in "${EXP_PATHS[@]}"; do
|
||||||
|
out_paths+=("${path}/zscores")
|
||||||
|
done
|
||||||
|
|
||||||
# TODO we'll need to change this behaviour after testing
|
# TODO we'll need to change this behaviour after testing
|
||||||
# we can test for existence and then choose to skip or rerun later
|
# we can test for existence and then choose to skip or rerun later
|
||||||
@@ -1493,7 +1514,7 @@ calculate_interaction_zscores() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
execute "$RSCRIPT" "$script" \
|
execute "$RSCRIPT" "$script" \
|
||||||
"${1:-"$out_path"}" \
|
"${1:-"$STUDY_RESULTS_DIR"}" \
|
||||||
"${2:-3}" \
|
"${2:-3}" \
|
||||||
"${3:-"$APPS_DIR/r/SGD_features.tab"}" \
|
"${3:-"$APPS_DIR/r/SGD_features.tab"}" \
|
||||||
"${4:-"$EASY_RESULTS_DIR/results_std.txt"}" \
|
"${4:-"$EASY_RESULTS_DIR/results_std.txt"}" \
|
||||||
|
|||||||
Reference in New Issue
Block a user