Cleanup calculate_interaction_zscores files

This commit is contained in:
2024-09-10 13:36:16 -04:00
parent a8cf3c5325
commit 84563ca8b9
6 changed files with 860 additions and 6237 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,945 +0,0 @@
suppressMessages({
library(ggplot2)
library(plotly)
library(htmlwidgets)
library(dplyr)
library(ggthemes)
library(data.table)
library(unix)
})
options(warn = 2, max.print = 1000)
options(width = 10000)
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
soft_limit <- 30 * 1024 * 1024 * 1024
hard_limit <- 30 * 1024 * 1024 * 1024
rlimit_as(soft_limit, hard_limit)
# Constants for configuration
plot_width <- 14
plot_height <- 9
base_size <- 14
parse_arguments <- function() {
args <- if (interactive()) {
c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
3,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo",
3
)
} else {
commandArgs(trailingOnly = TRUE)
}
# Extract paths, names, and standard deviations
paths <- args[seq(4, length(args), by = 3)]
names <- args[seq(5, length(args), by = 3)]
sds <- as.numeric(args[seq(6, length(args), by = 3)])
# Normalize paths
normalized_paths <- normalizePath(paths, mustWork = FALSE)
# Create named list of experiments
experiments <- list()
for (i in seq_along(paths)) {
experiments[[names[i]]] <- list(
path = normalized_paths[i],
sd = sds[i]
)
}
list(
out_dir = normalizePath(args[1], mustWork = FALSE),
sgd_gene_list = normalizePath(args[2], mustWork = FALSE),
easy_results_file = normalizePath(args[3], mustWork = FALSE),
experiments = experiments
)
}
args <- parse_arguments()
# Should we keep output in exp dirs or combine in the study output dir?
# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
# Define themes and scales
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
theme_foundation <- ggplot2::theme_grey(base_size = base_size, base_family = base_family)
theme_foundation %+replace%
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = legend_position,
legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Load the initial dataframe from the easy_results_file
load_and_process_data <- function(easy_results_file, sd = 3) {
df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
df <- df %>%
filter(!(.[[1]] %in% c("", "Scan"))) %>%
filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>%
# Rename columns
rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>%
mutate(
across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric),
delta_bg = last_bg - first_bg,
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)),
NG = if_else(L == 0 & !is.na(L), 1, 0),
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
SM = 0,
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
conc_num_factor = as.numeric(as.factor(conc_num)) - 1
)
return(df)
}
# Update Gene names using the SGD gene list
update_gene_names <- function(df, sgd_gene_list) {
# Load SGD gene list
genes <- read.delim(file = sgd_gene_list,
quote = "", header = FALSE,
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11)))
# Create a named vector for mapping ORF to GeneName
gene_map <- setNames(genes$V5, genes$V4)
# Vectorized match to find the GeneName from gene_map
mapped_genes <- gene_map[df$ORF]
# Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found)
updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes)
# Ensure Gene is not left blank or incorrectly updated to "OCT1"
df <- df %>%
mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes))
return(df)
}
# Calculate summary statistics for all variables
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
df <- df %>%
mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .)))
summary_stats <- df %>%
group_by(across(all_of(group_vars))) %>%
summarise(
N = sum(!is.na(L)),
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
)
df_cleaned <- df %>%
select(-any_of(names(summary_stats)))
df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
}
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
# Calculate total concentration variables
total_conc_num <- length(unique(df$conc_num))
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
# Pull the background means and standard deviations from zero concentration
bg_means <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
)
bg_sd <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
)
interaction_scores <- df %>%
mutate(
WT_L = df$mean_L,
WT_K = df$mean_K,
WT_r = df$mean_r,
WT_AUC = df$mean_AUC,
WT_sd_L = df$sd_L,
WT_sd_K = df$sd_K,
WT_sd_r = df$sd_r,
WT_sd_AUC = df$sd_AUC
) %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
mutate(
N = sum(!is.na(L)),
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE),
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
) %>%
ungroup()
interaction_scores <- interaction_scores %>%
group_by(across(all_of(group_vars))) %>%
mutate(
Raw_Shift_L = mean_L[[1]] - bg_means$L,
Raw_Shift_K = mean_K[[1]] - bg_means$K,
Raw_Shift_r = mean_r[[1]] - bg_means$r,
Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC,
Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]],
Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]],
Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]],
Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]]
)
interaction_scores <- interaction_scores %>%
mutate(
Exp_L = WT_L + Raw_Shift_L,
Delta_L = mean_L - Exp_L,
Exp_K = WT_K + Raw_Shift_K,
Delta_K = mean_K - Exp_K,
Exp_r = WT_r + Raw_Shift_r,
Delta_r = mean_r - Exp_r,
Exp_AUC = WT_AUC + Raw_Shift_AUC,
Delta_AUC = mean_AUC - Exp_AUC
)
interaction_scores <- interaction_scores %>%
mutate(
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
)
# Calculate linear models and interaction scores
interaction_scores <- interaction_scores %>%
mutate(
lm_L = lm(Delta_L ~ conc_num_factor),
lm_K = lm(Delta_K ~ conc_num_factor),
lm_r = lm(Delta_r ~ conc_num_factor),
lm_AUC = lm(Delta_AUC ~ conc_num_factor),
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC
)
interaction_scores <- interaction_scores %>%
mutate(
Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE)
)
interaction_scores_all <- interaction_scores %>%
mutate(
Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1),
lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
r_squared_L = summary(lm_L)$r.squared,
r_squared_K = summary(lm_K)$r.squared,
r_squared_r = summary(lm_r)$r.squared,
r_squared_AUC = summary(lm_AUC)$r.squared
)
# Calculate Z_lm for each variable
interaction_scores_all <- interaction_scores_all %>%
mutate(
Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE),
Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE),
Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
)
# Arrange results by Z_lm_L and NG
interaction_scores_all <- interaction_scores_all %>%
arrange(desc(lm_Score_L)) %>%
arrange(desc(NG)) %>%
ungroup()
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
}
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
`%||%` <- function(a, b) if (!is.null(a)) a else b
# Generalized plot generation
plots <- lapply(plot_configs, function(config) {
df <- config$df
plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var))))
# Rank plots with SD annotations
if (config$plot_type == "rank") {
plot <- plot + geom_point(size = 0.1, shape = 3)
# Adding SD bands
if (!is.null(config$sd_band)) {
for (i in seq_len(config$sd_band)) {
plot <- plot +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) +
geom_hline(yintercept = c(-i, i), color = "gray")
}
}
# Optionally add enhancer/suppressor count annotations
if (!is.null(config$enhancer_label)) {
plot <- plot + annotate("text", x = config$enhancer_label$x,
y = config$enhancer_label$y, label = config$enhancer_label$label) +
annotate("text", x = config$suppressor_label$x,
y = config$suppressor_label$y, label = config$suppressor_label$label)
}
}
# Correlation plots
if (config$plot_type == "correlation") {
plot <- plot + geom_point(shape = 3, color = "gray70") +
geom_smooth(method = "lm", color = "tomato3") +
annotate("text", x = 0, y = 0, label = config$correlation_text)
}
# General scatter/boxplot/density handling
if (!is.null(config$y_var)) {
plot <- plot + aes(y = !!sym(config$y_var))
y_mean_col <- paste0("mean_", config$y_var)
y_sd_col <- paste0("sd_", config$y_var)
if (config$y_var == "delta_bg" && config$plot_type == "scatter") {
plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
} else if (config$error_bar %||% FALSE) {
plot <- plot +
geom_point(shape = 3, size = 0.2) +
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
}
}
# Plot type selection
plot <- switch(config$plot_type,
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = "identity"),
plot + geom_point() + geom_smooth(method = "lm", se = FALSE))
# Adding y-limits if provided
if (!is.null(config$ylim_vals)) {
plot <- plot + coord_cartesian(ylim = config$ylim_vals)
}
# Setting legend position, titles, and labels
legend_position <- config$legend_position %||% "bottom"
plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position)
if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label)
if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
# Adding text annotations if provided
if (!is.null(config$annotations)) {
for (annotation in config$annotations) {
plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
}
}
return(plot)
})
# If grid_layout is provided, arrange plots in a grid and save in a single PDF
if (!is.null(grid_layout)) {
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
# Loop through plots in chunks defined by ncol and nrow
for (start_idx in seq(1, length(plots), by = grid_layout$ncol * grid_layout$nrow)) {
end_idx <- min(start_idx + grid_layout$ncol * grid_layout$nrow - 1, length(plots))
plot_subset <- plots[start_idx:end_idx]
# Arrange plots in a grid
do.call(grid.arrange, c(plot_subset, ncol = grid_layout$ncol, nrow = grid_layout$nrow))
}
dev.off()
# Save as HTML (optional)
plotly_plots <- lapply(plots, function(plot) {
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
})
combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow, margin = 0.05)
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, "_grid.html")), selfcontained = TRUE)
} else {
# Save individual plots to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plots, print)
dev.off()
# Convert plots to plotly and save as HTML
plotly_plots <- lapply(plots, function(plot) {
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
})
combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05)
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
}
}
generate_interaction_plot_configs <- function(df, variables) {
plot_configs <- list()
for (variable in variables) {
# Define the y-limits based on the variable being plotted
ylim_vals <- switch(variable,
"L" = c(-65, 65),
"K" = c(-65, 65),
"r" = c(-0.65, 0.65),
"AUC" = c(-6500, 6500)
)
# Dynamically generate the column names for standard deviation and delta
wt_sd_col <- paste0("WT_sd_", variable)
delta_var <- paste0("Delta_", variable)
z_shift <- paste0("Z_Shift_", variable)
z_lm <- paste0("Z_lm_", variable)
# Set annotations for ZShift, Z lm Score, NG, DB, SM
annotations <- list(
list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45,
ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))),
list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25,
ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))),
list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25,
ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)),
list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35,
ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)),
list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45,
ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM))
)
# Add scatter plot configuration for this variable
plot_configs[[length(plot_configs) + 1]] <- list(
df = df,
x_var = "conc_num_factor",
y_var = delta_var,
plot_type = "scatter",
title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]),
ylim_vals = ylim_vals,
annotations = annotations,
error_bar = list(
ymin = 0 - (2 * df[[wt_sd_col]][1]),
ymax = 0 + (2 * df[[wt_sd_col]][1])
),
x_breaks = unique(df$conc_num_factor),
x_labels = unique(as.character(df$conc_num)),
x_label = unique(df$Drug[1])
)
# Add box plot configuration for this variable
plot_configs[[length(plot_configs) + 1]] <- list(
df = df,
x_var = "conc_num_factor",
y_var = variable,
plot_type = "box",
title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]),
ylim_vals = ylim_vals,
annotations = annotations,
error_bar = FALSE, # Boxplots typically don't need error bars
x_breaks = unique(df$conc_num_factor),
x_labels = unique(as.character(df$conc_num)),
x_label = unique(df$Drug[1])
)
}
return(plot_configs)
}
generate_rank_plot_configs <- function(df, rank_var, zscore_var, label_prefix) {
configs <- list()
for (sd_band in c(1, 2, 3)) {
# Annotated version
configs[[length(configs) + 1]] <- list(
df = df,
x_var = rank_var,
y_var = zscore_var,
plot_type = "rank",
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
sd_band = sd_band,
enhancer_label = list(
x = nrow(df) / 2, y = 10,
label = paste("Deletion Enhancers =", nrow(df[df[[zscore_var]] >= sd_band, ]))
),
suppressor_label = list(
x = nrow(df) / 2, y = -10,
label = paste("Deletion Suppressors =", nrow(df[df[[zscore_var]] <= -sd_band, ]))
)
)
# Non-annotated version
configs[[length(configs) + 1]] <- list(
df = df,
x_var = rank_var,
y_var = zscore_var,
plot_type = "rank",
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
sd_band = sd_band
)
}
return(configs)
}
generate_correlation_plot_configs <- function(df, lm_list, lm_summaries) {
lapply(seq_along(lm_list), function(i) {
r_squared <- round(lm_summaries[[i]]$r.squared, 3)
list(
x_var = names(lm_list)[i][1],
y_var = names(lm_list)[i][2],
plot_type = "scatter",
title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]),
annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared)))
)
})
}
# Adjust missing values and calculate ranks
adjust_missing_and_rank <- function(df, variables) {
# Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables
df <- df %>%
mutate(across(all_of(variables), list(
Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))),
Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))),
Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))),
Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column())))
), .names = "{fn}_{col}"))
return(df)
}
main <- function() {
lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]]
exp_path <- exp$path
exp_sd <- exp$sd
out_dir <- file.path(exp_path, "zscores")
out_dir_qc <- file.path(exp_path, "zscores", "qc")
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
# Load and process data
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
df <- update_gene_names(df, args$sgd_gene_list)
max_conc <- max(df$conc_num_factor)
# QC steps and filtering
df_above_tolerance <- df %>% filter(DB == 1)
# Calculate the half-medians for `L` and `K` for rows above tolerance
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
# Get the number of rows that are above tolerance
rows_to_remove <- nrow(df_above_tolerance)
# Set L, r, K, and AUC to NA for rows that are above tolerance
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
# Calculate summary statistics for all strains, including both background and the deletions
message("Calculating summary statistics for all strains")
variables <- c("L", "K", "r", "AUC")
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats <- ss$summary_stats
df_na_stats <- ss$df_with_stats
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
print("Summary stats:")
print(head(summary_stats), width = 200)
# Remove rows with 0 values in L
df_no_zeros <- df_na %>% filter(L > 0)
# Additional filtering for non-finite values
df_na_filtered <- df_na %>%
filter(if_any(c(L), ~ !is.finite(.))) %>%
{
if (nrow(.) > 0) {
message("Removing non-finite rows:\n")
print(head(., n = 10))
}
df_na %>% filter(if_all(c(L), is.finite))
}
# Filter data within and outside 2SD
message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na_stats %>%
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
df_na_outside_2sd_k <- df_na_stats %>%
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
# Summary statistics for within and outside 2SD of K
message("Calculating summary statistics for L within 2SD of K")
# TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_within_2sd_k_stats <- ss$summary_stats
df_na_l_within_2sd_k_stats <- ss$df_with_stats
message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k_stats <- ss$summary_stats
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
# Write CSV files
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
# Plots
# Print quality control graphs before removing data due to contamination and
# adjusting missing data to max theoretical values
l_vs_k_plots <- list(
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC",
color_var = "conc_num",
legend_position = "right"
)
)
above_threshold_plots <- list(
list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
color_var = "conc_num",
annotations = list(
list(
x = L_half_median,
y = K_half_median,
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
)
),
error_bar = FALSE,
legend_position = "right"
)
)
frequency_delta_bg_plots <- list(
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
title = "Density plot for Delta Background by Conc All Data",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Density",
error_bar = FALSE,
legend_position = "right"
),
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Count",
error_bar = FALSE,
legend_position = "right"
)
)
plate_analysis_plots <- list()
for (plot_type in c("scatter", "box")) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
for (var in variables) {
for (stage in c("before", "after")) {
if (stage == "before") {
df_plot <- df
} else {
df_plot <- df_na # TODO use df_na_filtered if necessary
}
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = error_bar, color_var = "conc_num")
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
}
}
}
plate_analysis_no_zero_plots <- list()
for (plot_type in c("scatter", "box")) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
for (var in variables) {
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(
df = df_no_zeros,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
error_bar = error_bar,
color_var = "conc_num"
)
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
}
}
l_outside_2sd_k_plots <- list(
list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter",
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"
)
)
delta_bg_outside_2sd_k_plots <- list(
list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"
)
)
# Generate and save plots for each QC step
message("Generating QC plots")
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
# Clean up
rm(df, df_above_tolerance, df_no_zeros)
# TODO: Originally this filtered L NA's
# Let's try to avoid for now since stats have already been calculated
# Process background strains
bg_strains <- c("YDL227C")
lapply(bg_strains, function(strain) {
message("Processing background strain: ", strain)
# Handle missing data by setting zero values to NA
# and then removing any rows with NA in L col
df_bg <- df_na %>%
filter(OrfRep == strain) %>%
mutate(
L = if_else(L == 0, NA, L),
K = if_else(K == 0, NA, K),
r = if_else(r == 0, NA, r),
AUC = if_else(AUC == 0, NA, AUC)
) %>%
filter(!is.na(L))
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
ss <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats_bg <- ss$summary_stats
df_bg_stats <- ss$df_with_stats
write.csv(summary_stats_bg,
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
row.names = FALSE)
# Filter reference and deletion strains
# Formerly X2_RF (reference strains)
df_reference <- df_na_stats %>%
filter(OrfRep == strain) %>%
mutate(SM = 0)
# Formerly X2 (deletion strains)
df_deletion <- df_na_stats %>%
filter(OrfRep != strain) %>%
mutate(SM = 0)
# Set the missing values to the highest theoretical value at each drug conc for L
# Leave other values as 0 for the max/min
reference_strain <- df_reference %>%
group_by(conc_num) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup()
# Ditto for deletion strains
deletion_strains <- df_deletion %>%
group_by(conc_num) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup()
# Calculate interactions
variables <- c("L", "K", "r", "AUC")
message("Calculating interaction scores")
print("Reference strain:")
print(head(reference_strain))
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
print("Deletion strains:")
print(head(deletion_strains))
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
zscores_calculations_reference <- reference_results$zscores_calculations
zscores_interactions_reference <- reference_results$zscores_interactions
zscores_calculations <- deletion_results$zscores_calculations
zscores_interactions <- deletion_results$zscores_interactions
# Writing Z-Scores to file
write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
# Create interaction plots
reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
# Define conditions for enhancers and suppressors
# TODO Add to study config file?
threshold <- 2
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
# Subset data
enhancers_L <- zscores_interactions[enhancer_condition_L, ]
suppressors_L <- zscores_interactions[suppressor_condition_L, ]
enhancers_K <- zscores_interactions[enhancer_condition_K, ]
suppressors_K <- zscores_interactions[suppressor_condition_K, ]
# Save enhancers and suppressors
message("Writing enhancer/suppressor csv files")
write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L.csv"), row.names = FALSE)
write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L.csv"), row.names = FALSE)
write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K.csv"), row.names = FALSE)
write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K.csv"), row.names = FALSE)
# Combine conditions for enhancers and suppressors
enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ]
enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ]
# Save combined enhancers and suppressors
write.csv(enhancers_and_suppressors_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_L.csv"), row.names = FALSE)
write.csv(enhancers_and_suppressors_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_K.csv"), row.names = FALSE)
# Handle linear model based enhancers and suppressors
lm_threshold <- 2
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
# Save linear model based enhancers and suppressors
message("Writing linear model enhancer/suppressor csv files")
write.csv(enhancers_lm_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L_lm.csv"), row.names = FALSE)
write.csv(suppressors_lm_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L_lm.csv"), row.names = FALSE)
write.csv(enhancers_lm_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K_lm.csv"), row.names = FALSE)
write.csv(suppressors_lm_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
zscores_interactions_adjusted <- adjust_missing_and_rank(zscores_interactions)
# Generate all rank plot configurations for L and K
rank_plot_configs <- c(
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_L", "Avg_Zscore_L", "L"),
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_K", "Avg_Zscore_K", "K")
)
# Generate and save rank plots
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots",
plot_configs = rank_plot_config, grid_layout = list(ncol = 3, nrow = 2))
# # Correlation plots
# lm_list <- list(
# lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_r ~ Z_lm_K, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered)
# )
lm_summaries <- lapply(lm_list, summary)
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries)
generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs)
})
})
}
main()