Add more output to calculate_interaction_zscores.R
This commit is contained in:
@@ -1,10 +1,10 @@
|
|||||||
suppressMessages({
|
suppressMessages({
|
||||||
library("ggplot2")
|
library(ggplot2)
|
||||||
library("plotly")
|
library(plotly)
|
||||||
library("htmlwidgets")
|
library(htmlwidgets)
|
||||||
library("dplyr")
|
library(dplyr)
|
||||||
library("ggthemes")
|
library(ggthemes)
|
||||||
library("data.table")
|
library(data.table)
|
||||||
})
|
})
|
||||||
|
|
||||||
# Constants for configuration
|
# Constants for configuration
|
||||||
@@ -14,10 +14,9 @@ BASE_SIZE <- 14
|
|||||||
|
|
||||||
options(warn = 2, max.print = 100)
|
options(warn = 2, max.print = 100)
|
||||||
|
|
||||||
# Function to parse arguments
|
|
||||||
parse_arguments <- function() {
|
parse_arguments <- function() {
|
||||||
if (interactive()) {
|
args <- if (interactive()) {
|
||||||
args <- c(
|
c(
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
|
||||||
3,
|
3,
|
||||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
||||||
@@ -28,7 +27,7 @@ parse_arguments <- function() {
|
|||||||
"Experiment 2: HLD versus Doxo"
|
"Experiment 2: HLD versus Doxo"
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
args <- commandArgs(trailingOnly = TRUE)
|
commandArgs(trailingOnly = TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE)
|
paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE)
|
||||||
@@ -46,7 +45,6 @@ parse_arguments <- function() {
|
|||||||
|
|
||||||
args <- parse_arguments()
|
args <- parse_arguments()
|
||||||
|
|
||||||
# Ensure main output directories exist
|
|
||||||
dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
||||||
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
||||||
|
|
||||||
@@ -109,26 +107,20 @@ sgd_genes <- function(sgd_gene_list) {
|
|||||||
genes <- sgd_genes(args$sgd_gene_list)
|
genes <- sgd_genes(args$sgd_gene_list)
|
||||||
|
|
||||||
load_and_preprocess_data <- function(easy_results_file, genes) {
|
load_and_preprocess_data <- function(easy_results_file, genes) {
|
||||||
# Attempt to read the file and handle any errors that occur
|
|
||||||
df <- tryCatch({
|
df <- tryCatch({
|
||||||
read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
||||||
}, error = function(e) {
|
}, error = function(e) {
|
||||||
stop("Error reading file: ", easy_results_file, "\n", e$message)
|
stop("Error reading file: ", easy_results_file, "\n", e$message)
|
||||||
})
|
})
|
||||||
|
|
||||||
# Filter out non-data rows
|
|
||||||
df <- df %>%
|
|
||||||
filter(!.[[1]] %in% c("", "Scan"))
|
|
||||||
|
|
||||||
# Convert specified columns to numeric
|
|
||||||
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
||||||
df[numeric_columns[numeric_columns %in% colnames(df)]] <-
|
df[numeric_columns[numeric_columns %in% colnames(df)]] <-
|
||||||
lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)
|
suppressWarnings(lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric))
|
||||||
|
|
||||||
# Further filter and mutate the data
|
|
||||||
df <- df %>%
|
df <- df %>%
|
||||||
filter(!(Scan %in% c("", "Scan"))) %>%
|
filter(!.[[1]] %in% c("", "Scan")) %>%
|
||||||
{if ("Conc" %in% colnames(.)) filter(., Conc != "0ug/mL") else .} %>%
|
filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>%
|
||||||
|
filter(Drug != "BMH21") %>%
|
||||||
mutate(
|
mutate(
|
||||||
L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA},
|
L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA},
|
||||||
AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA},
|
AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA},
|
||||||
@@ -138,22 +130,26 @@ load_and_preprocess_data <- function(easy_results_file, genes) {
|
|||||||
GeneName = vapply(ORF, function(orf) {
|
GeneName = vapply(ORF, function(orf) {
|
||||||
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
|
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
|
||||||
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
|
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
|
||||||
}, character(1))
|
}, character(1)),
|
||||||
)
|
OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep)
|
||||||
|
) # %>%
|
||||||
|
# mutate(across(everything(), ~na_if(., "")))
|
||||||
|
|
||||||
# Check if the dataframe is empty after filtering
|
|
||||||
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
||||||
|
|
||||||
return(df)
|
return(df)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# Plot creation and publication
|
|
||||||
create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
|
create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
|
||||||
|
if (!("Scan" %in% colnames(df))) {
|
||||||
|
warning("'Scan' column is not present in the data. Skipping this plot.")
|
||||||
|
return(NULL)
|
||||||
|
}
|
||||||
|
|
||||||
plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot
|
plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot
|
||||||
filtered_df <- df[is.finite(df[[var]]), ]
|
filtered_df <- df[is.finite(df[[var]]), ]
|
||||||
|
|
||||||
p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
|
p <- ggplot(filtered_df, aes(Scan, !!sym(var), color = as.factor(conc_num))) +
|
||||||
plot_func(shape = 3, size = 0.2, position = "jitter") +
|
plot_func(shape = 3, size = 0.2, position = "jitter") +
|
||||||
stat_summary(fun = mean, geom = "point", size = 0.6) +
|
stat_summary(fun = mean, geom = "point", size = 0.6) +
|
||||||
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") +
|
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") +
|
||||||
@@ -171,7 +167,6 @@ create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "")
|
|||||||
saveWidget(pgg, html_path, selfcontained = TRUE)
|
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Summary statistics publication
|
|
||||||
publish_summary_stats <- function(df, variables, out_dir) {
|
publish_summary_stats <- function(df, variables, out_dir) {
|
||||||
stats_list <- lapply(variables, function(var) {
|
stats_list <- lapply(variables, function(var) {
|
||||||
df %>%
|
df %>%
|
||||||
@@ -209,9 +204,41 @@ publish_interaction_scores <- function(df, out_dir) {
|
|||||||
fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
||||||
fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
|
fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
|
||||||
file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
||||||
|
|
||||||
|
# Additional enhancer and suppressor calculations and outputs
|
||||||
|
deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ]
|
||||||
|
deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ]
|
||||||
|
|
||||||
|
deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ]
|
||||||
|
deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ]
|
||||||
|
|
||||||
|
deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ]
|
||||||
|
deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ]
|
||||||
|
|
||||||
|
deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ]
|
||||||
|
deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ]
|
||||||
|
|
||||||
|
deletion_enhancers_and_suppressors_L <- interaction_scores[
|
||||||
|
interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ]
|
||||||
|
deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[
|
||||||
|
!is.na(deletion_enhancers_and_suppressors_L$OrfRep), ]
|
||||||
|
|
||||||
|
deletion_enhancers_and_suppressors_K <- interaction_scores[
|
||||||
|
interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ]
|
||||||
|
deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[
|
||||||
|
!is.na(deletion_enhancers_and_suppressors_K$OrfRep), ]
|
||||||
|
|
||||||
|
# Write CSV files with updated filenames
|
||||||
|
fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE)
|
||||||
|
fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE)
|
||||||
|
fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE)
|
||||||
|
fwrite(deletion_suppressors_K, file.path(out_dir, "deletion_suppressors_k.csv"), row.names = FALSE)
|
||||||
|
fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE)
|
||||||
|
fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE)
|
||||||
|
|
||||||
|
return(interaction_scores) # Return the updated data frame with rank columns
|
||||||
}
|
}
|
||||||
|
|
||||||
# Z-scores publication
|
|
||||||
publish_zscores <- function(df, out_dir) {
|
publish_zscores <- function(df, out_dir) {
|
||||||
zscores <- df %>%
|
zscores <- df %>%
|
||||||
dplyr::mutate(
|
dplyr::mutate(
|
||||||
@@ -224,33 +251,121 @@ publish_zscores <- function(df, out_dir) {
|
|||||||
fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
# QC generation and publication
|
|
||||||
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
||||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||||
lapply(variables, create_and_publish_plot, df = df, plot_type = "scatter", out_dir_qc = out_dir_qc)
|
|
||||||
|
# Pre-QC plots
|
||||||
|
lapply(variables, function(var) {
|
||||||
|
if (var %in% colnames(df)) {
|
||||||
|
create_and_publish_plot(df, var, "scatter", out_dir_qc)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# Filter data based on delta background tolerance for Post-QC analysis
|
||||||
|
df_post_qc <- df %>%
|
||||||
|
mutate(across(all_of(variables),
|
||||||
|
~ ifelse(delta_bg >= delta_bg_tolerance, NA, .)))
|
||||||
|
|
||||||
|
# Post-QC plots
|
||||||
|
lapply(variables, function(var) {
|
||||||
|
if (var %in% colnames(df_post_qc)) {
|
||||||
|
create_and_publish_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
# For plots specifically for data above the tolerance threshold
|
||||||
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
||||||
lapply(variables, create_and_publish_plot, df = delta_bg_above_tolerance,
|
lapply(variables, function(var) {
|
||||||
plot_type = "scatter", out_dir_qc = out_dir_qc, suffix = "_above_tolerance")
|
if (var %in% colnames(delta_bg_above_tolerance)) {
|
||||||
|
create_and_publish_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance")
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
# Process experiments
|
# Create rank plots
|
||||||
process_experiment <- function(exp_name, exp_dir, sgd_genes, output_dir) {
|
create_rank_plots <- function(interaction_scores, out_dir) {
|
||||||
|
rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
|
||||||
|
|
||||||
|
lapply(rank_vars, function(rank_var) {
|
||||||
|
p <- ggplot(interaction_scores, aes(x = !!sym(rank_var))) +
|
||||||
|
geom_bar() +
|
||||||
|
ggtitle(paste("Rank Distribution for", rank_var)) +
|
||||||
|
theme_publication()
|
||||||
|
|
||||||
|
pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf"))
|
||||||
|
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||||
|
print(p)
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
# Generate HTML output
|
||||||
|
html_path <- sub(".pdf$", ".html", pdf_path)
|
||||||
|
pgg <- suppressWarnings(ggplotly(p) %>%
|
||||||
|
layout(legend = list(orientation = "h")))
|
||||||
|
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
create_correlation_plot <- function(interaction_scores, out_dir) {
|
||||||
|
# Check for non-finite values and remove them from the dataset
|
||||||
|
interaction_scores <- interaction_scores %>%
|
||||||
|
filter_all(all_vars(is.finite(.)))
|
||||||
|
|
||||||
|
# Generate correlation plots for each pair of variables
|
||||||
|
pairs <- list(
|
||||||
|
c("mean_L", "mean_K"),
|
||||||
|
c("mean_L", "mean_r"),
|
||||||
|
c("mean_L", "mean_AUC"),
|
||||||
|
c("mean_K", "mean_r"),
|
||||||
|
c("mean_K", "mean_AUC"),
|
||||||
|
c("mean_r", "mean_AUC")
|
||||||
|
)
|
||||||
|
|
||||||
|
lapply(pairs, function(vars) {
|
||||||
|
p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) +
|
||||||
|
geom_point() +
|
||||||
|
geom_smooth(method = "lm", se = FALSE) +
|
||||||
|
ggtitle(paste("Correlation between", vars[1], "and", vars[2])) +
|
||||||
|
theme_publication()
|
||||||
|
|
||||||
|
pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf"))
|
||||||
|
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||||
|
print(p)
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
# Generate HTML output
|
||||||
|
html_path <- sub(".pdf$", ".html", pdf_path)
|
||||||
|
pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>%
|
||||||
|
layout(legend = list(orientation = "h")))
|
||||||
|
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
|
||||||
out_dir <- file.path(exp_dir, "zscores")
|
out_dir <- file.path(exp_dir, "zscores")
|
||||||
out_dir_qc <- file.path(out_dir, "qc")
|
out_dir_qc <- file.path(out_dir, "qc")
|
||||||
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
|
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
|
||||||
dir.create(out_dir_qc, showWarnings = FALSE)
|
dir.create(out_dir_qc, showWarnings = FALSE)
|
||||||
|
|
||||||
data <- load_and_preprocess_data(args$easy_results_file, sgd_genes)
|
data <- load_and_preprocess_data(args$easy_results_file, genes)
|
||||||
|
|
||||||
|
# Calculate delta background tolerance
|
||||||
delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE)
|
delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE)
|
||||||
|
|
||||||
|
# Generate and publish QC plots (both pre-QC and post-QC)
|
||||||
generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc)
|
generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc)
|
||||||
|
|
||||||
|
# Process and publish summary stats, interaction scores, and z-scores
|
||||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||||
publish_summary_stats(data, variables, out_dir)
|
publish_summary_stats(data, variables, out_dir)
|
||||||
publish_interaction_scores(data, out_dir)
|
interaction_scores <- publish_interaction_scores(data, out_dir)
|
||||||
publish_zscores(data, out_dir)
|
publish_zscores(data, out_dir)
|
||||||
|
|
||||||
|
# Generate rank plots and correlation plots
|
||||||
|
create_rank_plots(interaction_scores, out_dir)
|
||||||
|
create_correlation_plot(interaction_scores, out_dir)
|
||||||
|
|
||||||
output_file <- file.path(out_dir, "zscores_interaction.csv")
|
output_file <- file.path(out_dir, "zscores_interaction.csv")
|
||||||
fwrite(data, output_file, row.names = FALSE)
|
fwrite(data, output_file, row.names = FALSE)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user