Rollup for calculation_interaction_zscores refactor
This commit is contained in:
@@ -7,13 +7,13 @@ suppressMessages({
|
||||
library(data.table)
|
||||
})
|
||||
|
||||
options(warn = 2, max.print = 1000)
|
||||
|
||||
# Constants for configuration
|
||||
PLOT_WIDTH <- 14
|
||||
PLOT_HEIGHT <- 9
|
||||
BASE_SIZE <- 14
|
||||
|
||||
options(warn = 2, max.print = 100)
|
||||
|
||||
parse_arguments <- function() {
|
||||
args <- if (interactive()) {
|
||||
c(
|
||||
@@ -29,11 +29,11 @@ parse_arguments <- function() {
|
||||
} else {
|
||||
commandArgs(trailingOnly = TRUE)
|
||||
}
|
||||
|
||||
|
||||
paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE)
|
||||
names <- args[seq(6, length(args), by = 2)]
|
||||
experiments <- setNames(paths, names)
|
||||
|
||||
|
||||
list(
|
||||
out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
|
||||
sd = as.numeric(args[2]),
|
||||
@@ -49,7 +49,7 @@ dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
||||
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
||||
|
||||
# Define themes and scales
|
||||
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
||||
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans", legend_position = "bottom") {
|
||||
theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
@@ -64,25 +64,14 @@ theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "bottom",
|
||||
legend.direction = "horizontal",
|
||||
legend.position = legend_position,
|
||||
legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
}
|
||||
|
||||
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
|
||||
theme_publication(base_size, base_family) +
|
||||
theme(
|
||||
legend.position = "right",
|
||||
legend.direction = "vertical",
|
||||
legend.key.size = unit(0.5, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic")
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_publication <- function(...) {
|
||||
discrete_scale("fill", "Publication", manual_pal(values = c(
|
||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
@@ -100,58 +89,91 @@ scale_colour_publication <- function(...) {
|
||||
# Load SGD gene list
|
||||
sgd_genes <- function(sgd_gene_list) {
|
||||
read.delim(file = sgd_gene_list, quote = "", header = FALSE,
|
||||
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>%
|
||||
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>%
|
||||
dplyr::rename(ORF = V4, GeneName = V5)
|
||||
}
|
||||
|
||||
genes <- sgd_genes(args$sgd_gene_list)
|
||||
|
||||
# Define the adjust_values function
|
||||
adjust_values <- function(x, delta_bg, tolerance) {
|
||||
valid_x <- x[x != 0 & !is.na(x)]
|
||||
max_x <- ifelse(length(valid_x) == 0, NA_real_, max(valid_x, na.rm = TRUE))
|
||||
|
||||
x_adjusted <- ifelse(x == 0 & !is.na(x), max_x, x)
|
||||
|
||||
# Adjust only if delta_bg is valid and above tolerance
|
||||
x_adjusted <- ifelse(!is.na(delta_bg) & delta_bg >= tolerance, NA_real_, x_adjusted)
|
||||
|
||||
return(x_adjusted)
|
||||
}
|
||||
|
||||
|
||||
|
||||
load_and_preprocess_data <- function(easy_results_file, genes) {
|
||||
df <- tryCatch({
|
||||
df <-
|
||||
read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
||||
}, error = function(e) {
|
||||
stop("Error reading file: ", easy_results_file, "\n", e$message)
|
||||
})
|
||||
|
||||
# Convert specific columns to numeric
|
||||
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
|
||||
df[numeric_columns[numeric_columns %in% colnames(df)]] <-
|
||||
suppressWarnings(lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric))
|
||||
|
||||
# Apply initial filters and mutating steps as in the original script
|
||||
# Apply further transformations
|
||||
df <- df %>%
|
||||
mutate(L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}) %>%
|
||||
mutate(AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}) %>%
|
||||
filter(!.[[1]] %in% c("", "Scan")) %>% # Filter out empty or Scan rows
|
||||
filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>% # Remove blank genes
|
||||
filter(Drug != "BMH21") %>% # Filter out specific drugs if necessary
|
||||
filter(!is.na(ORF) & ORF != "") %>% # Ensure ORF is not NA or empty
|
||||
mutate(OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep)) %>%
|
||||
mutate(conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc))) %>% # Extract numeric drug concentrations
|
||||
filter(!is.na(LstBackgrd) & !is.na(X1stBackgrd)) %>% # Ensure finite background values
|
||||
mutate(delta_bg = LstBackgrd - X1stBackgrd) %>% # Calculate delta background
|
||||
filter(!is.na(L) & !is.na(K) & !is.na(r) & !is.na(AUC96)) %>% # Ensure finite measurements
|
||||
mutate(GeneName = vapply(ORF, function(orf) {
|
||||
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
|
||||
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
|
||||
}, character(1)))
|
||||
filter(
|
||||
!is.na(ORF) & ORF != "",
|
||||
!ORF %in% c("Blank"),
|
||||
!is.na(Gene) & !Gene %in% c("BLANK", "Blank", "blank"),
|
||||
Drug != "BMH21",
|
||||
!is.na(Scan),
|
||||
!is.na(l), !is.na(K),
|
||||
!is.na(r), !is.na(AUC96)
|
||||
) %>%
|
||||
mutate(
|
||||
Col = suppressWarnings(as.numeric(Col)),
|
||||
Row = suppressWarnings(as.numeric(Row)),
|
||||
L = suppressWarnings(as.numeric(l)),
|
||||
K = suppressWarnings(as.numeric(K)),
|
||||
r = suppressWarnings(as.numeric(r)),
|
||||
Scan = suppressWarnings(as.numeric(Scan)),
|
||||
AUC = suppressWarnings(as.numeric(AUC96)),
|
||||
LstBackgrd = suppressWarnings(as.numeric(LstBackgrd)),
|
||||
X1stBackgrd = suppressWarnings(as.numeric(X1stBackgrd)),
|
||||
delta_bg = ifelse(is.na(LstBackgrd) | is.na(X1stBackgrd), NA_real_, LstBackgrd - X1stBackgrd),
|
||||
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + 3 * sd(delta_bg, na.rm = TRUE),
|
||||
OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep),
|
||||
OrfRepUnique = paste(OrfRep, Gene, Num., sep = "_"),
|
||||
conc_num = suppressWarnings(as.numeric(gsub("[^0-9\\.]", "", Conc))),
|
||||
DB = as.integer(delta_bg >= delta_bg_tolerance),
|
||||
SM = 0,
|
||||
NG = as.integer(l == 0 & !is.na(l)),
|
||||
L_adjusted = adjust_values(L, delta_bg, delta_bg_tolerance),
|
||||
r_adjusted = adjust_values(r, delta_bg, delta_bg_tolerance),
|
||||
AUC_adjusted = adjust_values(AUC, delta_bg, delta_bg_tolerance),
|
||||
K_adjusted = adjust_values(K, delta_bg, delta_bg_tolerance),
|
||||
max_L_adjusted = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)),
|
||||
SM = ifelse(!is.na(max_L_adjusted) & L_adjusted == max_L_adjusted, 1, SM)
|
||||
)
|
||||
|
||||
if (nrow(df) == 0) warning("Dataframe is empty after filtering")
|
||||
# Efficiently handle the assignment of Gene names
|
||||
gene_map <- setNames(genes$GeneName, genes$ORF)
|
||||
df$Gene <- ifelse(df$OrfRep != "YDL227C", gene_map[df$ORF], df$Gene)
|
||||
df$Gene <- ifelse(df$Gene == "" | df$Gene == "OCT1", df$OrfRep, df$Gene)
|
||||
|
||||
return(df)
|
||||
}
|
||||
|
||||
create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
|
||||
if (!("Scan" %in% colnames(df))) {
|
||||
warning("'Scan' column is not present in the data. Skipping this plot.")
|
||||
return(NULL)
|
||||
}
|
||||
|
||||
plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot
|
||||
filtered_df <- df[is.finite(df[[var]]), ]
|
||||
|
||||
p <- ggplot(filtered_df, aes(Scan, !!sym(var), color = as.factor(conc_num))) +
|
||||
plot_func(shape = 3, size = 0.2, position = "jitter") +
|
||||
|
||||
|
||||
generate_plot <- function(df, var, plot_type = "scatter", out_dir_qc, suffix = "") {
|
||||
|
||||
# Filter out non-finite values before plotting
|
||||
df <- df %>%
|
||||
filter(is.finite(!!sym(var)))
|
||||
|
||||
plot_func <- switch(plot_type,
|
||||
scatter = geom_point(shape = 3, size = 0.2, position = "jitter"),
|
||||
boxplot = geom_boxplot())
|
||||
|
||||
p <- ggplot(df, aes(Scan, !!sym(var), color = as.factor(conc_num))) +
|
||||
plot_func +
|
||||
stat_summary(fun = mean, geom = "point", size = 0.6) +
|
||||
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") +
|
||||
ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) +
|
||||
@@ -161,184 +183,233 @@ create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "")
|
||||
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||
print(p)
|
||||
dev.off()
|
||||
|
||||
|
||||
html_path <- sub(".pdf$", ".html", pdf_path)
|
||||
pgg <- suppressWarnings(ggplotly(p, tooltip = c("L", "K", "ORF", "Gene", "delta_bg", "GeneName")) %>%
|
||||
pgg <- suppressWarnings(ggplotly(p) %>%
|
||||
layout(legend = list(orientation = "h")))
|
||||
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||
}
|
||||
|
||||
publish_summary_stats <- function(df, variables, out_dir) {
|
||||
stats_list <- lapply(variables, function(var) {
|
||||
df %>%
|
||||
dplyr::group_by(OrfRep, conc_num) %>%
|
||||
dplyr::summarize(
|
||||
N = dplyr::n(),
|
||||
mean_val = mean(.data[[var]], na.rm = TRUE),
|
||||
sd_val = sd(.data[[var]], na.rm = TRUE),
|
||||
se_val = sd_val / sqrt(N)
|
||||
)
|
||||
})
|
||||
summary_stats_df <- dplyr::bind_rows(stats_list, .id = "variable")
|
||||
fwrite(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||
}
|
||||
|
||||
# Compute Interaction Scores
|
||||
compute_interaction_scores <- function(df) {
|
||||
# Calculate raw shifts
|
||||
raw_shifts <- df %>%
|
||||
group_by(OrfRep) %>%
|
||||
reframe(
|
||||
Raw_Shift_L = mean(L, na.rm = TRUE),
|
||||
Raw_Shift_K = mean(K, na.rm = TRUE),
|
||||
Raw_Shift_r = mean(r, na.rm = TRUE),
|
||||
Raw_Shift_AUC = mean(AUC, na.rm = TRUE),
|
||||
NG = n()
|
||||
)
|
||||
|
||||
# Calculate Z-scores
|
||||
z_scores <- df %>%
|
||||
group_by(OrfRep) %>%
|
||||
reframe(
|
||||
Z_Shift_L = mean(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Z_Shift_K = mean(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Z_Shift_r = mean(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Z_Shift_AUC = mean(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE)
|
||||
)
|
||||
|
||||
# Linear Model Scores and R-Squared
|
||||
lm_scores <- df %>%
|
||||
group_by(OrfRep) %>%
|
||||
reframe(
|
||||
lm_Score_L = coef(lm(L ~ delta_bg, data = .))[2],
|
||||
R_Squared_L = summary(lm(L ~ delta_bg, data = .))$r.squared,
|
||||
lm_Score_K = coef(lm(K ~ delta_bg, data = .))[2],
|
||||
R_Squared_K = summary(lm(K ~ delta_bg, data = .))$r.squared,
|
||||
lm_Score_r = coef(lm(r ~ delta_bg, data = .))[2],
|
||||
R_Squared_r = summary(lm(r ~ delta_bg, data = .))$r.squared,
|
||||
lm_Score_AUC = coef(lm(AUC ~ delta_bg, data = .))[2],
|
||||
R_Squared_AUC = summary(lm(AUC ~ delta_bg, data = .))$r.squared
|
||||
)
|
||||
|
||||
# Calculate Sum and Average Z-scores
|
||||
sum_avg_z_scores <- df %>%
|
||||
group_by(OrfRep) %>%
|
||||
reframe(
|
||||
Sum_Z_Score_L = sum(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Avg_Zscore_L = mean(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Sum_Z_Score_K = sum(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Avg_Zscore_K = mean(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Sum_Z_Score_r = sum(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Avg_Zscore_r = mean(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Sum_Z_Score_AUC = sum(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Avg_Zscore_AUC = mean(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE)
|
||||
)
|
||||
|
||||
# Combine all calculations
|
||||
interaction_scores <- raw_shifts %>%
|
||||
left_join(z_scores, by = "OrfRep") %>%
|
||||
left_join(lm_scores, by = "OrfRep") %>%
|
||||
left_join(sum_avg_z_scores, by = "OrfRep") %>%
|
||||
mutate(
|
||||
l_rank = rank(-Raw_Shift_L),
|
||||
k_rank = rank(-Raw_Shift_K),
|
||||
r_rank = rank(-Raw_Shift_r),
|
||||
auc_rank = rank(-Raw_Shift_AUC)
|
||||
)
|
||||
|
||||
return(interaction_scores)
|
||||
}
|
||||
|
||||
|
||||
publish_interaction_scores <- function(df, out_dir) {
|
||||
interaction_scores <- compute_interaction_scores(df)
|
||||
|
||||
# Additional enhancer and suppressor calculations and outputs
|
||||
deletion_enhancers_L <- interaction_scores[interaction_scores$Raw_Shift_L >= 2, ]
|
||||
deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ]
|
||||
|
||||
deletion_enhancers_K <- interaction_scores[interaction_scores$Raw_Shift_K <= -2, ]
|
||||
deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ]
|
||||
|
||||
deletion_suppressors_L <- interaction_scores[interaction_scores$Raw_Shift_L <= -2, ]
|
||||
deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ]
|
||||
|
||||
deletion_suppressors_K <- interaction_scores[interaction_scores$Raw_Shift_K >= 2, ]
|
||||
deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ]
|
||||
|
||||
deletion_enhancers_and_suppressors_L <- interaction_scores[
|
||||
interaction_scores$Raw_Shift_L >= 2 | interaction_scores$Raw_Shift_L <= -2, ]
|
||||
deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[
|
||||
!is.na(deletion_enhancers_and_suppressors_L$OrfRep), ]
|
||||
|
||||
deletion_enhancers_and_suppressors_K <- interaction_scores[
|
||||
interaction_scores$Raw_Shift_K >= 2 | interaction_scores$Raw_Shift_K <= -2, ]
|
||||
deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[
|
||||
!is.na(deletion_enhancers_and_suppressors_K$OrfRep), ]
|
||||
|
||||
# Write CSV files with updated filenames
|
||||
fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
||||
fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
|
||||
file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
||||
fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE)
|
||||
fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE)
|
||||
fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE)
|
||||
fwrite(deletion_suppressors_K, file.path(out_dir, "deletion_suppressors_k.csv"), row.names = FALSE)
|
||||
fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE)
|
||||
fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE)
|
||||
|
||||
return(interaction_scores) # Return the updated data frame with all calculated columns
|
||||
}
|
||||
|
||||
publish_zscores <- function(df, out_dir) {
|
||||
zscores <- df %>%
|
||||
dplyr::mutate(
|
||||
zscore_L = scale(Raw_Shift_L, center = TRUE, scale = TRUE),
|
||||
zscore_K = scale(Raw_Shift_K, center = TRUE, scale = TRUE),
|
||||
zscore_r = scale(Raw_Shift_r, center = TRUE, scale = TRUE),
|
||||
zscore_AUC = scale(Raw_Shift_AUC, center = TRUE, scale = TRUE)
|
||||
)
|
||||
|
||||
fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
||||
}
|
||||
|
||||
|
||||
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
||||
generate_and_publish_qc <- function(df, out_dir_qc) {
|
||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
|
||||
# Pre-QC plots
|
||||
lapply(variables, function(var) {
|
||||
# Access delta_bg_tolerance from the dataframe
|
||||
delta_bg_tolerance <- df$delta_bg_tolerance[1]
|
||||
|
||||
for (var in variables) {
|
||||
if (var %in% colnames(df)) {
|
||||
create_and_publish_plot(df, var, "scatter", out_dir_qc)
|
||||
generate_plot(df, var, "scatter", out_dir_qc)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
# Filter data based on delta background tolerance for Post-QC analysis
|
||||
df_post_qc <- df %>%
|
||||
mutate(across(all_of(variables),
|
||||
~ ifelse(delta_bg >= delta_bg_tolerance, NA, .)))
|
||||
mutate(across(all_of(variables), ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .)))
|
||||
|
||||
# Post-QC plots
|
||||
lapply(variables, function(var) {
|
||||
for (var in variables) {
|
||||
if (var %in% colnames(df_post_qc)) {
|
||||
create_and_publish_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc")
|
||||
generate_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
# For plots specifically for data above the tolerance threshold
|
||||
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
||||
lapply(variables, function(var) {
|
||||
for (var in variables) {
|
||||
if (var %in% colnames(delta_bg_above_tolerance)) {
|
||||
create_and_publish_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance")
|
||||
generate_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
df_finite <- df %>% filter(is.finite(delta_bg))
|
||||
delta_bg_frequency_plot <- ggplot(df_finite, aes(delta_bg, color = as.factor(conc_num))) +
|
||||
geom_density() +
|
||||
ggtitle("Density plot for Delta Background by Conc All Data") +
|
||||
theme_publication(legend_position = "right")
|
||||
|
||||
delta_bg_bar_plot <- ggplot(df_finite, aes(delta_bg, color = as.factor(conc_num))) +
|
||||
geom_bar() +
|
||||
ggtitle("Bar plot for Delta Background by Conc All Data") +
|
||||
theme_publication(legend_position = "right")
|
||||
|
||||
pdf(file = file.path(out_dir_qc, "frequency_delta_background.pdf"), width = 12, height = 8)
|
||||
print(delta_bg_frequency_plot)
|
||||
print(delta_bg_bar_plot)
|
||||
dev.off()
|
||||
|
||||
saveWidget(ggplotly(delta_bg_frequency_plot),
|
||||
file = file.path(out_dir_qc, "frequency_delta_background_density.html"), selfcontained = TRUE)
|
||||
saveWidget(ggplotly(delta_bg_bar_plot),
|
||||
file = file.path(out_dir_qc, "frequency_delta_background_bar.html"), selfcontained = TRUE)
|
||||
|
||||
df_no_zero <- df %>% filter(L > 0 & is.finite(L))
|
||||
|
||||
scatter_plots <- list()
|
||||
box_plots <- list()
|
||||
for (var in variables) {
|
||||
if (var %in% colnames(df_no_zero)) {
|
||||
scatter_plots[[var]] <- ggplot(df_no_zero, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
|
||||
geom_point(shape = 3, size = 0.2) +
|
||||
stat_summary(fun = mean, geom = "point", size = 0.6) +
|
||||
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") +
|
||||
ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) +
|
||||
theme_publication()
|
||||
|
||||
box_plots[[var]] <- ggplot(df_no_zero, aes(as.factor(Scan), .data[[var]], color = as.factor(conc_num))) +
|
||||
geom_boxplot() +
|
||||
ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) +
|
||||
theme_publication()
|
||||
}
|
||||
}
|
||||
|
||||
pdf(file = file.path(out_dir_qc, "plate_analysis_no_zeros.pdf"), width = 14, height = 9)
|
||||
lapply(scatter_plots, print)
|
||||
dev.off()
|
||||
|
||||
pdf(file = file.path(out_dir_qc, "plate_analysis_no_zeros_boxplots.pdf"), width = 18, height = 9)
|
||||
lapply(box_plots, print)
|
||||
dev.off()
|
||||
|
||||
for (var in names(scatter_plots)) {
|
||||
html_path <- file.path(out_dir_qc, paste0("plate_analysis_no_zeros_", var, ".html"))
|
||||
|
||||
pgg <- suppressWarnings(ggplotly(scatter_plots[[var]]) %>%
|
||||
layout(legend = list(orientation = "h")))
|
||||
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||
}
|
||||
|
||||
for (var in names(box_plots)) {
|
||||
html_path <- file.path(out_dir_qc, paste0("plate_analysis_no_zeros_boxplots_", var, ".html"))
|
||||
|
||||
pgg <- suppressWarnings(ggplotly(box_plots[[var]]) %>%
|
||||
layout(legend = list(orientation = "h")))
|
||||
saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||
}
|
||||
}
|
||||
|
||||
create_rank_plots <- function(interaction_scores, out_dir) {
|
||||
compute_experiment_summary_stats <- function(df, out_dir, background_strain = NULL) {
|
||||
if (!is.null(background_strain)) {
|
||||
df <- df %>% filter(OrfRep == background_strain)
|
||||
}
|
||||
|
||||
summary_stats <- df %>%
|
||||
group_by(OrfRep, conc_num) %>%
|
||||
summarize(
|
||||
N = n(),
|
||||
mean_L = mean(L, na.rm = TRUE),
|
||||
median_L = median(L, na.rm = TRUE),
|
||||
max_L = max(L, na.rm = TRUE),
|
||||
min_L = min(L, na.rm = TRUE),
|
||||
sd_L = sd(L, na.rm = TRUE),
|
||||
se_L = sd_L / sqrt(N),
|
||||
mean_K = mean(K, na.rm = TRUE),
|
||||
median_K = median(K, na.rm = TRUE),
|
||||
max_K = max(K, na.rm = TRUE),
|
||||
min_K = min(K, na.rm = TRUE),
|
||||
sd_K = sd(K, na.rm = TRUE),
|
||||
se_K = sd_K / sqrt(N),
|
||||
mean_r = mean(r, na.rm = TRUE),
|
||||
median_r = median(r, na.rm = TRUE),
|
||||
max_r = max(r, na.rm = TRUE),
|
||||
min_r = min(r, na.rm = TRUE),
|
||||
sd_r = sd(r, na.rm = TRUE),
|
||||
se_r = sd_r / sqrt(N),
|
||||
mean_AUC = mean(AUC, na.rm = TRUE),
|
||||
median_AUC = median(AUC, na.rm = TRUE),
|
||||
max_AUC = max(AUC, na.rm = TRUE),
|
||||
min_AUC = min(AUC, na.rm = TRUE),
|
||||
sd_AUC = sd(AUC, na.rm = TRUE),
|
||||
se_AUC = sd_AUC / sqrt(N)
|
||||
)
|
||||
|
||||
if (is.null(background_strain)) {
|
||||
fwrite(summary_stats, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||
} else {
|
||||
fwrite(summary_stats, file.path(out_dir, "summary_stats_background_strains.csv"), row.names = FALSE)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
filter_and_calculate_sd_k_stats <- function(df, out_dir_qc) {
|
||||
df_cleaned <- df %>%
|
||||
filter(is.finite(L_adjusted) & is.finite(K_adjusted))
|
||||
|
||||
df_within_2SD_K <- df_cleaned %>%
|
||||
filter(abs(K_adjusted - mean(K_adjusted, na.rm = TRUE)) <= 2 * sd(K_adjusted, na.rm = TRUE)) %>%
|
||||
group_by(conc_num) %>%
|
||||
summarize(
|
||||
N = n(),
|
||||
mean_L = mean(L_adjusted, na.rm = TRUE),
|
||||
max_L = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)),
|
||||
sd_L = sd(L_adjusted, na.rm = TRUE)
|
||||
)
|
||||
fwrite(df_within_2SD_K, file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"))
|
||||
|
||||
df_outside_2SD_K <- df_cleaned %>%
|
||||
filter(abs(K_adjusted - mean(K_adjusted, na.rm = TRUE)) > 2 * sd(K_adjusted, na.rm = TRUE)) %>%
|
||||
group_by(conc_num) %>%
|
||||
summarize(
|
||||
N = n(),
|
||||
mean_L = mean(L_adjusted, na.rm = TRUE),
|
||||
max_L = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)),
|
||||
sd_L = sd(L_adjusted, na.rm = TRUE)
|
||||
)
|
||||
fwrite(df_outside_2SD_K, file.path(out_dir_qc, "max_observed_L_vals_for_spots_outside_2SD_K.csv"))
|
||||
|
||||
p_raw_before <- ggplot(df_cleaned,
|
||||
aes(x = L_adjusted, y = K_adjusted, color = as.factor(conc_num))) +
|
||||
geom_point() +
|
||||
ggtitle("Raw L vs K before QC") +
|
||||
theme_publication(legend_position = "right")
|
||||
|
||||
pdf(file.path(out_dir_qc, "raw_L_vs_K_before_QC.pdf"))
|
||||
print(p_raw_before)
|
||||
dev.off()
|
||||
|
||||
saveWidget(ggplotly(p_raw_before), file.path(out_dir_qc, "raw_L_vs_K_before_QC.html"))
|
||||
|
||||
p_above_threshold <- ggplot(df_cleaned %>% filter(delta_bg > delta_bg_tolerance),
|
||||
aes(x = L_adjusted, y = K_adjusted, color = as.factor(conc_num))) +
|
||||
geom_point() +
|
||||
ggtitle("Raw L vs K for strains above delta background threshold") +
|
||||
theme_publication(legend_position = "right")
|
||||
|
||||
pdf(file.path(out_dir_qc, "raw_L_vs_K_for_strains_above_delta_bg_threshold.pdf"))
|
||||
print(p_above_threshold)
|
||||
dev.off()
|
||||
|
||||
saveWidget(ggplotly(p_above_threshold), file.path(out_dir_qc, "raw_L_vs_K_for_strains_above_delta_bg_threshold.html"))
|
||||
}
|
||||
|
||||
compute_rf_interaction_scores <- function(df_rf, output_dir) {
|
||||
lm_sd_L <- sd(df_rf$lm_Score_L, na.rm = TRUE)
|
||||
lm_sd_K <- sd(df_rf$lm_Score_K, na.rm = TRUE)
|
||||
lm_sd_r <- sd(df_rf$lm_Score_r, na.rm = TRUE)
|
||||
lm_sd_AUC <- sd(df_rf$lm_Score_AUC, na.rm = TRUE)
|
||||
|
||||
lm_mean_L <- mean(df_rf$lm_Score_L, na.rm = TRUE)
|
||||
lm_mean_K <- mean(df_rf$lm_Score_K, na.rm = TRUE)
|
||||
lm_mean_r <- mean(df_rf$lm_Score_r, na.rm = TRUE)
|
||||
lm_mean_AUC <- mean(df_rf$lm_Score_AUC, na.rm = TRUE)
|
||||
|
||||
df_rf <- df_rf %>%
|
||||
mutate(
|
||||
Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
|
||||
Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
|
||||
Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
|
||||
Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC
|
||||
)
|
||||
|
||||
df_rf <- df_rf %>%
|
||||
arrange(desc(Z_lm_L), desc(NG))
|
||||
|
||||
fwrite(df_rf, file.path(output_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
||||
|
||||
return(df_rf)
|
||||
}
|
||||
|
||||
create_rank_plots <- function(interaction_scores_ranks, out_dir) {
|
||||
rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
|
||||
|
||||
lapply(rank_vars, function(rank_var) {
|
||||
p <- ggplot(interaction_scores, aes(x = !!sym(rank_var))) +
|
||||
p <- ggplot(interaction_scores_ranks, aes(x = !!sym(rank_var))) +
|
||||
geom_bar() +
|
||||
ggtitle(paste("Rank Distribution for", rank_var)) +
|
||||
theme_publication()
|
||||
@@ -347,8 +418,7 @@ create_rank_plots <- function(interaction_scores, out_dir) {
|
||||
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||
print(p)
|
||||
dev.off()
|
||||
|
||||
# Generate HTML output
|
||||
|
||||
html_path <- sub(".pdf$", ".html", pdf_path)
|
||||
pgg <- suppressWarnings(ggplotly(p) %>%
|
||||
layout(legend = list(orientation = "h")))
|
||||
@@ -357,11 +427,9 @@ create_rank_plots <- function(interaction_scores, out_dir) {
|
||||
}
|
||||
|
||||
create_correlation_plot <- function(interaction_scores, out_dir) {
|
||||
# Check for non-finite values and remove them from the dataset
|
||||
interaction_scores <- interaction_scores %>%
|
||||
filter_all(all_vars(is.finite(.)))
|
||||
|
||||
# Generate correlation plots for each pair of variables
|
||||
|
||||
pairs <- list(
|
||||
c("Raw_Shift_L", "Raw_Shift_K"),
|
||||
c("Raw_Shift_L", "Raw_Shift_r"),
|
||||
@@ -370,7 +438,7 @@ create_correlation_plot <- function(interaction_scores, out_dir) {
|
||||
c("Raw_Shift_K", "Raw_Shift_AUC"),
|
||||
c("Raw_Shift_r", "Raw_Shift_AUC")
|
||||
)
|
||||
|
||||
|
||||
lapply(pairs, function(vars) {
|
||||
p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) +
|
||||
geom_point() +
|
||||
@@ -382,8 +450,7 @@ create_correlation_plot <- function(interaction_scores, out_dir) {
|
||||
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||
print(p)
|
||||
dev.off()
|
||||
|
||||
# Generate HTML output
|
||||
|
||||
html_path <- sub(".pdf$", ".html", pdf_path)
|
||||
pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>%
|
||||
layout(legend = list(orientation = "h")))
|
||||
@@ -397,260 +464,124 @@ process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
|
||||
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
|
||||
dir.create(out_dir_qc, showWarnings = FALSE)
|
||||
|
||||
# Load and preprocess the data
|
||||
data <- load_and_preprocess_data(args$easy_results_file, genes)
|
||||
|
||||
# Calculate delta background tolerance
|
||||
delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE)
|
||||
# Generate and publish QC plots
|
||||
generate_and_publish_qc(data, out_dir_qc)
|
||||
|
||||
# Generate and publish QC plots (both pre-QC and post-QC)
|
||||
generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc)
|
||||
# Calculate and save summary statistics
|
||||
compute_experiment_summary_stats(data, out_dir)
|
||||
compute_experiment_summary_stats(data, out_dir, background_strain = "YDL227C")
|
||||
|
||||
InteractionScores_RF <- data %>%
|
||||
select(OrfRepUnique) %>%
|
||||
distinct() %>%
|
||||
mutate(
|
||||
Gene = NA,
|
||||
Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA,
|
||||
R_Squared_L = NA, Sum_Z_Score_L = NA, Avg_Zscore_L = NA,
|
||||
Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA,
|
||||
R_Squared_K = NA, Sum_Z_Score_K = NA, Avg_Zscore_K = NA,
|
||||
Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA,
|
||||
R_Squared_r = NA, Sum_Z_Score_r = NA, Avg_Zscore_r = NA,
|
||||
Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA,
|
||||
R_Squared_AUC = NA, Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA,
|
||||
NG = NA, SM = NA
|
||||
)
|
||||
|
||||
for (i in seq_along(unique(InteractionScores_RF$OrfRepUnique))) {
|
||||
Gene_Sel <- unique(InteractionScores_RF$OrfRepUnique)[i]
|
||||
X_Gene_Sel <- data[data$OrfRepUnique == Gene_Sel,]
|
||||
|
||||
X_stats_interaction <- X_Gene_Sel %>%
|
||||
group_by(OrfRepUnique, Gene) %>%
|
||||
summarize(
|
||||
Raw_Shift_L = mean(L_adjusted, na.rm = TRUE),
|
||||
Raw_Shift_K = mean(K_adjusted, na.rm = TRUE),
|
||||
Raw_Shift_r = mean(r_adjusted, na.rm = TRUE),
|
||||
Raw_Shift_AUC = mean(AUC_adjusted, na.rm = TRUE),
|
||||
Z_Shift_L = mean(scale(L_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Z_Shift_K = mean(scale(K_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Z_Shift_r = mean(scale(r_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
Z_Shift_AUC = mean(scale(AUC_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
|
||||
lm_Score_L = coef(lm(L_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
|
||||
R_Squared_L = summary(lm(L_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
|
||||
lm_Score_K = coef(lm(K_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
|
||||
R_Squared_K = summary(lm(K_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
|
||||
lm_Score_r = coef(lm(r_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
|
||||
R_Squared_r = summary(lm(r_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
|
||||
lm_Score_AUC = coef(lm(AUC_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
|
||||
R_Squared_AUC = summary(lm(AUC_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
|
||||
NG = sum(NG, na.rm = TRUE),
|
||||
DB = sum(DB, na.rm = TRUE),
|
||||
SM = sum(SM, na.rm = TRUE)
|
||||
)
|
||||
|
||||
InteractionScores_RF <- InteractionScores_RF %>%
|
||||
mutate(
|
||||
Gene = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Gene[1], Gene),
|
||||
Raw_Shift_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_L[1], Raw_Shift_L),
|
||||
Z_Shift_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_L[1], Z_Shift_L),
|
||||
lm_Score_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_L[1], lm_Score_L),
|
||||
R_Squared_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_L[1], R_Squared_L),
|
||||
Sum_Z_Score_L = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_L, na.rm = TRUE), Sum_Z_Score_L),
|
||||
Avg_Zscore_L = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_L, na.rm = TRUE), Avg_Zscore_L),
|
||||
Raw_Shift_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_K[1], Raw_Shift_K),
|
||||
Z_Shift_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_K[1], Z_Shift_K),
|
||||
lm_Score_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_K[1], lm_Score_K),
|
||||
R_Squared_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_K[1], R_Squared_K),
|
||||
Sum_Z_Score_K = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_K, na.rm = TRUE), Sum_Z_Score_K),
|
||||
Avg_Zscore_K = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_K, na.rm = TRUE), Avg_Zscore_K),
|
||||
Raw_Shift_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_r[1], Raw_Shift_r),
|
||||
Z_Shift_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_r[1], Z_Shift_r),
|
||||
lm_Score_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_r[1], lm_Score_r),
|
||||
R_Squared_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_r[1], R_Squared_r),
|
||||
Sum_Z_Score_r = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_r, na.rm = TRUE), Sum_Z_Score_r),
|
||||
Avg_Zscore_r = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_r, na.rm = TRUE), Avg_Zscore_r),
|
||||
Raw_Shift_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_AUC[1], Raw_Shift_AUC),
|
||||
Z_Shift_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_AUC[1], Z_Shift_AUC),
|
||||
lm_Score_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_AUC[1], lm_Score_AUC),
|
||||
R_Squared_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_AUC[1], R_Squared_AUC),
|
||||
Sum_Z_Score_AUC = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_AUC, na.rm = TRUE), Sum_Z_Score_AUC),
|
||||
Avg_Zscore_AUC = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_AUC, na.rm = TRUE), Avg_Zscore_AUC),
|
||||
NG = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$NG[1], NG),
|
||||
DB = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$DB[1], DB),
|
||||
SM = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$SM[1], SM)
|
||||
)
|
||||
}
|
||||
|
||||
fwrite(InteractionScores_RF, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
||||
|
||||
generate_and_publish_qc(data, out_dir_qc)
|
||||
|
||||
filter_and_calculate_sd_k_stats(data, out_dir_qc)
|
||||
|
||||
# Process and publish summary stats, interaction scores, and z-scores
|
||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
publish_summary_stats(data, variables, out_dir)
|
||||
interaction_scores <- publish_interaction_scores(data, out_dir)
|
||||
publish_zscores(interaction_scores, out_dir) # Now writing interaction_scores, not original data
|
||||
compute_experiment_summary_stats(data, variables, out_dir)
|
||||
compute_experiment_summary_stats(data, variables, out_dir, background_strain = "YDL227C")
|
||||
|
||||
# Generate rank plots and correlation plots
|
||||
create_rank_plots(interaction_scores, out_dir)
|
||||
create_correlation_plot(interaction_scores, out_dir)
|
||||
compute_rf_interaction_scores(InteractionScores_RF, out_dir)
|
||||
|
||||
publish_scores(InteractionScores_RF, out_dir)
|
||||
|
||||
create_rank_plots(InteractionScores_RF, out_dir)
|
||||
create_correlation_plot(InteractionScores_RF, out_dir)
|
||||
|
||||
output_file <- file.path(out_dir, "zscores_interaction.csv")
|
||||
fwrite(interaction_scores, output_file, row.names = FALSE) # Write interaction_scores here
|
||||
fwrite(InteractionScores_RF, output_file, row.names = FALSE)
|
||||
|
||||
return(output_file)
|
||||
}
|
||||
|
||||
# Process all experiments
|
||||
processed_files <- lapply(names(args$experiments), function(exp_name) {
|
||||
process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir)
|
||||
})
|
||||
|
||||
# Combine results from all experiments if multiple experiments exist
|
||||
if (length(processed_files) > 1) {
|
||||
combined_data <- Reduce(function(x, y) {
|
||||
merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE)
|
||||
merge(fread(x), fread(y), by = "OrfRepUnique", all = TRUE)
|
||||
}, processed_files)
|
||||
|
||||
combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
|
||||
fwrite(combined_data, combined_output_file, row.names = FALSE)
|
||||
|
||||
combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
|
||||
fwrite(combined_data, combined_output_file, row.names = FALSE)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# publish_interaction_scores <- function(df, out_dir) {
|
||||
# interaction_scores <- df %>%
|
||||
# dplyr::group_by(OrfRep) %>%
|
||||
# dplyr::summarize(
|
||||
# mean_L = mean(L, na.rm = TRUE),
|
||||
# mean_K = mean(K, na.rm = TRUE),
|
||||
# mean_r = mean(r, na.rm = TRUE),
|
||||
# mean_AUC = mean(AUC, na.rm = TRUE),
|
||||
# delta_bg_mean = mean(delta_bg, na.rm = TRUE),
|
||||
# delta_bg_sd = sd(delta_bg, na.rm = TRUE)
|
||||
# ) %>%
|
||||
# dplyr::mutate(
|
||||
# l_rank = rank(mean_L),
|
||||
# k_rank = rank(mean_K),
|
||||
# r_rank = rank(mean_r),
|
||||
# auc_rank = rank(mean_AUC)
|
||||
# )
|
||||
|
||||
# fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
|
||||
# fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
|
||||
# file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
|
||||
|
||||
# # Additional enhancer and suppressor calculations and outputs
|
||||
# deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ]
|
||||
# deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ]
|
||||
|
||||
# deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ]
|
||||
# deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ]
|
||||
|
||||
# deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ]
|
||||
# deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ]
|
||||
|
||||
# deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ]
|
||||
# deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ]
|
||||
|
||||
# deletion_enhancers_and_suppressors_L <- interaction_scores[
|
||||
# interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ]
|
||||
# deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[
|
||||
# !is.na(deletion_enhancers_and_suppressors_L$OrfRep), ]
|
||||
|
||||
# deletion_enhancers_and_suppressors_K <- interaction_scores[
|
||||
# interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ]
|
||||
# deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[
|
||||
# !is.na(deletion_enhancers_and_suppressors_K$OrfRep), ]
|
||||
|
||||
# # Write CSV files with updated filenames
|
||||
# fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE)
|
||||
# fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE)
|
||||
# fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE)
|
||||
# fwrite(deletion_suppressors_K, file.path(out_dir, "deletion_suppressors_k.csv"), row.names = FALSE)
|
||||
# fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE)
|
||||
# fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE)
|
||||
|
||||
# return(interaction_scores) # Return the updated data frame with rank columns
|
||||
# }
|
||||
|
||||
|
||||
|
||||
# publish_zscores <- function(df, out_dir) {
|
||||
# zscores <- df %>%
|
||||
# dplyr::mutate(
|
||||
# zscore_L = scale(L, center = TRUE, scale = TRUE),
|
||||
# zscore_K = scale(K, center = TRUE, scale = TRUE),
|
||||
# zscore_r = scale(r, center = TRUE, scale = TRUE),
|
||||
# zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
|
||||
# )
|
||||
|
||||
# fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
|
||||
# }
|
||||
|
||||
# generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
|
||||
# variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
|
||||
# # Pre-QC plots
|
||||
# lapply(variables, function(var) {
|
||||
# if (var %in% colnames(df)) {
|
||||
# create_and_publish_plot(df, var, "scatter", out_dir_qc)
|
||||
# }
|
||||
# })
|
||||
|
||||
# # Filter data based on delta background tolerance for Post-QC analysis
|
||||
# df_post_qc <- df %>%
|
||||
# mutate(across(all_of(variables),
|
||||
# ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .)))
|
||||
|
||||
# # Post-QC plots
|
||||
# lapply(variables, function(var) {
|
||||
# if (var %in% colnames(df_post_qc)) {
|
||||
# create_and_publish_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc")
|
||||
# }
|
||||
# })
|
||||
|
||||
# # For plots specifically for data above the tolerance threshold
|
||||
# delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
|
||||
# lapply(variables, function(var) {
|
||||
# if (var %in% colnames(delta_bg_above_tolerance)) {
|
||||
# create_and_publish_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance")
|
||||
# }
|
||||
# })
|
||||
# }
|
||||
|
||||
|
||||
|
||||
# # Create rank plots
|
||||
# create_rank_plots <- function(interaction_scores, out_dir) {
|
||||
# rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
|
||||
|
||||
# lapply(rank_vars, function(rank_var) {
|
||||
# p <- ggplot(interaction_scores, aes(x = !!sym(rank_var))) +
|
||||
# geom_bar() +
|
||||
# ggtitle(paste("Rank Distribution for", rank_var)) +
|
||||
# theme_publication()
|
||||
|
||||
# pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf"))
|
||||
# pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||
# print(p)
|
||||
# dev.off()
|
||||
|
||||
# # Generate HTML output
|
||||
# html_path <- sub(".pdf$", ".html", pdf_path)
|
||||
# pgg <- suppressWarnings(ggplotly(p) %>%
|
||||
# layout(legend = list(orientation = "h")))
|
||||
# saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||
# })
|
||||
# }
|
||||
|
||||
# create_correlation_plot <- function(interaction_scores, out_dir) {
|
||||
# # Check for non-finite values and remove them from the dataset
|
||||
# interaction_scores <- interaction_scores %>%
|
||||
# filter_all(all_vars(is.finite(.)))
|
||||
|
||||
# # Generate correlation plots for each pair of variables
|
||||
# pairs <- list(
|
||||
# c("mean_L", "mean_K"),
|
||||
# c("mean_L", "mean_r"),
|
||||
# c("mean_L", "mean_AUC"),
|
||||
# c("mean_K", "mean_r"),
|
||||
# c("mean_K", "mean_AUC"),
|
||||
# c("mean_r", "mean_AUC")
|
||||
# )
|
||||
|
||||
# lapply(pairs, function(vars) {
|
||||
# p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) +
|
||||
# geom_point() +
|
||||
# geom_smooth(method = "lm", se = FALSE) +
|
||||
# ggtitle(paste("Correlation between", vars[1], "and", vars[2])) +
|
||||
# theme_publication()
|
||||
|
||||
# pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf"))
|
||||
# pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||
# print(p)
|
||||
# dev.off()
|
||||
|
||||
# # Generate HTML output
|
||||
# html_path <- sub(".pdf$", ".html", pdf_path)
|
||||
# pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>%
|
||||
# layout(legend = list(orientation = "h")))
|
||||
# saveWidget(pgg, html_path, selfcontained = TRUE)
|
||||
# })
|
||||
# }
|
||||
|
||||
|
||||
# process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
|
||||
# out_dir <- file.path(exp_dir, "zscores")
|
||||
# out_dir_qc <- file.path(out_dir, "qc")
|
||||
# dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
|
||||
# dir.create(out_dir_qc, showWarnings = FALSE)
|
||||
|
||||
# # Load and preprocess the data
|
||||
# data <- load_and_preprocess_data(args$easy_results_file, genes)
|
||||
|
||||
# # Calculate delta background tolerance
|
||||
# delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE)
|
||||
|
||||
# # Generate and publish QC plots (both pre-QC and post-QC)
|
||||
# generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc)
|
||||
|
||||
# # Process and publish summary stats, interaction scores, and z-scores
|
||||
# variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
# publish_summary_stats(data, variables, out_dir)
|
||||
# interaction_scores <- publish_interaction_scores(data, out_dir)
|
||||
# publish_zscores(data, out_dir)
|
||||
|
||||
# # Generate rank plots and correlation plots
|
||||
# create_rank_plots(interaction_scores, out_dir)
|
||||
# create_correlation_plot(interaction_scores, out_dir)
|
||||
|
||||
# output_file <- file.path(out_dir, "zscores_interaction.csv")
|
||||
# fwrite(data, output_file, row.names = FALSE)
|
||||
|
||||
# return(output_file)
|
||||
# }
|
||||
|
||||
# # Process all experiments
|
||||
# processed_files <- lapply(names(args$experiments), function(exp_name) {
|
||||
# process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir)
|
||||
# })
|
||||
|
||||
# # Combine results from all experiments if multiple experiments exist
|
||||
# if (length(processed_files) > 1) {
|
||||
# combined_data <- Reduce(function(x, y) {
|
||||
# merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE)
|
||||
# }, processed_files)
|
||||
|
||||
# combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
|
||||
# fwrite(combined_data, combined_output_file, row.names = FALSE)
|
||||
# }
|
||||
Reference in New Issue
Block a user