Relocate project

This commit is contained in:
2024-09-10 13:23:18 -04:00
parent 3833f75184
commit a8cf3c5325
6220 changed files with 4859 additions and 69 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env Rscript
# Load the openxlsx package
library(openxlsx)
args <- commandArgs(TRUE)
outDir <- args[1]
# Function to combine CSV and TXT files into a workbook with named sheets
combineFilesToWorkbook <- function(file_list, output_file) {
# Create a new workbook
wb <- createWorkbook()
# Loop through the file list
for (i in seq_along(file_list)) {
file <- file_list[i]
# Extract the sheet name from the file name based on the extension
ext <- tools::file_ext(file)
sheet_name <- tools::file_path_sans_ext(basename(file))
# Read the data based on the file extension
if (ext %in% c("csv", "txt")) {
if (ext == "csv") {
data <- read.csv(file)
} else if (ext == "txt") {
data <- readLines(file)
}
# Create a new sheet in the workbook with the extracted sheet name
addWorksheet(wb, sheetName = sheet_name)
# Write the data to the sheet
writeData(wb, sheet = sheet_name, x = data)
}
}
# Save the workbook as an Excel file
saveWorkbook(wb, output_file)
}
Component <- file.path(outDir, "Component", "ComponentResults.txt")
Function <- file.path(outDir, "Function", "FunctionResults.txt")
Process <- file.path(outDir, "Process", "ProcessResults.txt")
# Specify the list of input files (both CSV and TXT)
file_list <- c(Component, Process, Function)
# Specify the output file name
output_file <- file.path(outDir, "GTFCombined.xlsx")
# Call the function to combine the files into a workbook with named sheets
combineFilesToWorkbook(file_list, output_file)

View File

@@ -0,0 +1,895 @@
# This R script performs GTA L and K Pairwise Compares for user specified pairs of Experiments
library("ggplot2")
library("plotly")
library("htmlwidgets")
library("extrafont")
library("grid")
library("ggthemes")
args <- commandArgs(TRUE)
exp1_name <- args[1]
exp1_file <- args[2]
exp2_name <- args[3]
exp2_file <- args[4]
output_dir <- args[5]
pairDirL <- file.path(output_dir, paste0("PairwiseCompareL_", exp1_name, "-", exp2_name))
pairDirK <- file.path(output_dir, paste0("PairwiseCompareK_", exp1_name, "-", exp2_name))
# Pairwise L
# outputPlotly <- "../GTAresults/PairwiseCompareL/" #"/GTAresults/PairwiseCompareL/"
# Theme elements for plots
theme_Publication <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(
values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...)
}
scale_colour_Publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(
values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...)
}
X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE)
X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE)
X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2"))
gg <- ggplot(data = X, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Ontology_Avg_X1,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(
aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2),
color = "grey20",
size = 0.25,
alpha = 0.1,
inherit.aes = FALSE,
fill = NA
) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html"))
htmlwidgets::saveWidget(pgg, fname)
# ID aggravators and alleviators, regardless of whether they meet 2SD threshold
X1_Specific_Aggravators <- X[which(X$Z_lm_L_Avg_X1 >= 2 & X$Z_lm_L_Avg_X2 < 2), ]
X1_Specific_Alleviators <- X[which(X$Z_lm_L_Avg_X1 <= -2 & X$Z_lm_L_Avg_X2 > -2), ]
X2_Specific_Aggravators <- X[which(X$Z_lm_L_Avg_X2 >= 2 & X$Z_lm_L_Avg_X1 < 2), ]
X2_Specific_Alleviators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm_L_Avg_X1 > -2), ]
Overlap_Aggravators <- X[which(X$Z_lm_L_Avg_X1 >= 2 & X$Z_lm_L_Avg_X2 >= 2), ]
Overlap_Alleviators <- X[which(X$Z_lm_L_Avg_X1 <= -2 & X$Z_lm_L_Avg_X2 <= -2), ]
X2_Specific_Aggravators_X1_Alleviatiors <- X[which(X$Z_lm_L_Avg_X2 >= 2 & X$Z_lm_L_Avg_X1 <= -2), ]
X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm_L_Avg_X1 >= 2), ]
X$Overlap_Avg <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
gg <- ggplot(data = X, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(
aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2),
color = "grey20",
size = 0.25,
alpha = 0.1,
inherit.aes = FALSE,
fill = NA
) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ]
#3
gg <- ggplot(data = x_rem2_gene, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
htmlwidgets::saveWidget(pgg, fname)
#4
X_overlap_nothresold <- X[!(is.na(X$Overlap_Avg)), ]
gg <- ggplot(data = X_overlap_nothresold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
# Only output GTA terms where average score is still above 2 after subtracting the SD
# Z1 will ID aggravators, Z2 alleviators
Z1 <- X
Z1$L_Subtract_SD_X1 <- Z1$Z_lm_L_Avg_X1 - Z1$Z_lm_L_SD_X1
Z1$L_Subtract_SD_X2 <- Z1$Z_lm_L_Avg_X2 - Z1$Z_lm_L_SD_X2
Z2 <- X
Z2$L_Subtract_SD_X1 <- Z1$Z_lm_L_Avg_X1 + Z1$Z_lm_L_SD_X1
Z2$L_Subtract_SD_X2 <- Z1$Z_lm_L_Avg_X2 + Z1$Z_lm_L_SD_X2
X1_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 < 2), ]
X1_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X1 <= -2 & Z2$L_Subtract_SD_X2 > -2), ]
X2_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z1$L_Subtract_SD_X1 < 2), ]
X2_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 > -2), ]
Overlap_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 >= 2), ]
Overlap_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ]
X$Overlap <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
#5
X_abovethreshold <- X[!(is.na(X$Overlap)), ]
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html"))
htmlwidgets::saveWidget(pgg, fname)
#6
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")),
width = 20,
height = 20
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html"))
htmlwidgets::saveWidget(pgg, fname)
X_abovethreshold$X1_Rank <- NA
X_abovethreshold$X1_Rank <- rank(-X_abovethreshold$Z_lm_L_Avg_X1, ties.method = "random")
X_abovethreshold$X2_Rank <- NA
X_abovethreshold$X2_Rank <- rank(-X_abovethreshold$Z_lm_L_Avg_X2, ties.method = "random")
#7
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html"))
htmlwidgets::saveWidget(pgg, fname)
#8
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html"))
htmlwidgets::saveWidget(pgg, fname)
write.csv(
x = X,
file.path(pairDirL, paste0("All_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)
write.csv(
x = X_abovethreshold,
file = file.path(pairDirL, paste0("AboveThreshold_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)
# Begin Pairwsise K
# Theme elements for plots
theme_Publication <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
library("scales")
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold",
size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE)
X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE)
#1
X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2"))
gg <- ggplot(data = X, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Ontology_Avg_X1,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html"))
htmlwidgets::saveWidget(pgg, fname)
#2
# ID aggravators and alleviators, regardless of whether they meet 2SD threshold
X1_Specific_Aggravators <- X[which(X$Z_lm_K_Avg_X1 >= 2 & X$Z_lm_K_Avg_X2 < 2), ]
X1_Specific_Alleviators <- X[which(X$Z_lm_K_Avg_X1 <= -2 & X$Z_lm_K_Avg_X2 > -2), ]
X2_Specific_Aggravators <- X[which(X$Z_lm_K_Avg_X2 >= 2 & X$Z_lm_K_Avg_X1 < 2), ]
X2_Specific_Alleviators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm_K_Avg_X1 > -2), ]
Overlap_Aggravators <- X[which(X$Z_lm_K_Avg_X1 >= 2 & X$Z_lm_K_Avg_X2 >= 2), ]
Overlap_Alleviators <- X[which(X$Z_lm_K_Avg_X1 <= -2 & X$Z_lm_K_Avg_X2 <= -2), ]
X2_Specific_Aggravators_X1_Alleviatiors <- X[which(X$Z_lm_K_Avg_X2 >= 2 & X$Z_lm_K_Avg_X1 <= -2), ]
X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm_K_Avg_X1 >= 2), ]
X$Overlap_Avg <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <-
"Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <-
"Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
gg <- ggplot(data = X, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#2
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
#3
x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ]
plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
gg <- ggplot(data = x_rem2_gene, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
htmlwidgets::saveWidget(pgg, fname)
#4
X_overlap_nothresold <- X[!(is.na(X$Overlap_Avg)), ]
gg <- ggplot(data = X_overlap_nothresold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
#5
# Only output GTA terms where average score is still above 2 after subtracting the SD
# Z1 will ID aggravators, Z2 alleviators
Z1 <- X
Z1$L_Subtract_SD_X1 <- Z1$Z_lm_K_Avg_X1 - Z1$Z_lm_K_SD_X1
Z1$L_Subtract_SD_X2 <- Z1$Z_lm_K_Avg_X2 - Z1$Z_lm_K_SD_X2
Z2 <- X
Z2$L_Subtract_SD_X1 <- Z1$Z_lm_K_Avg_X1 + Z1$Z_lm_K_SD_X1
Z2$L_Subtract_SD_X2 <- Z1$Z_lm_K_Avg_X2 + Z1$Z_lm_K_SD_X2
X1_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 < 2), ]
X1_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X1 <= -2 & Z2$L_Subtract_SD_X2 > -2), ]
X2_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z1$L_Subtract_SD_X1 < 2), ]
X2_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 > -2), ]
Overlap_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 >= 2), ]
Overlap_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ]
X$Overlap <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <-
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <-
"Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <-
"Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
X_abovethreshold <- X[!(is.na(X$Overlap)), ]
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html"))
htmlwidgets::saveWidget(pgg, fname)
#6
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")),
width = 20,
height = 20
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html"))
htmlwidgets::saveWidget(pgg, fname)
#7
X_abovethreshold$X1_Rank <- NA
X_abovethreshold$X1_Rank <- rank(-X_abovethreshold$Z_lm_K_Avg_X1, ties.method = "random")
X_abovethreshold$X2_Rank <- NA
X_abovethreshold$X2_Rank <- rank(-X_abovethreshold$Z_lm_K_Avg_X2, ties.method = "random")
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html"))
htmlwidgets::saveWidget(pgg, fname)
#8
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html"))
htmlwidgets::saveWidget(pgg, fname)
write.csv(
x = X,
file = file.path(pairDirK, paste0("All_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)
write.csv(
x = X_abovethreshold,
file = file.path(pairDirK, paste0("AboveThreshold_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,810 @@
#!/usr/bin/env Rscript
# Makes heat maps of multiple experiments
#
# Updated 240724 Bryan C Roessler to improve file operations and portability
# I tried to leave as much logic intact as possible, just feeding in vars in a better way
# NOTE: The script now has 7 required arguments and a variable number of input experiments
# @arg $1 string StudyInfo.csv file
# @arg $2 string gene_ontology_edit.obo file
# @arg $3 string go_terms.tab file
# @arg $4 string All_SGD_GOTerms_for_QHTCPtk.csv
# @arg $5 string base directory
# @arg $6 string output directory
library("ontologyIndex")
library("ggplot2")
library("RColorBrewer")
library("grid")
library("ggthemes")
# library("plotly")
# library("htmlwidgets")
library("extrafont")
library("stringr")
library("org.Sc.sgd.db")
library("ggrepel")
library("gplots")
# Load arguments
args <- commandArgs(TRUE)
study_info_file <- args[1]
ontology_file <- args[2]
sgd_terms_tfile <- args[3]
all_sgd_terms_csv <- args[4]
base_dir <- args[5]
output_dir <- args[6]
study_nums <- args[7:length(args)]
# Import standard tables used in Sean's code That should be copied to each ExpStudy
labels <- read.csv(file = study_info_file, stringsAsFactors = FALSE)
Ontology <- get_ontology(file = ontology_file, propagate_relationships = "is_a", extract_tags = "minimal")
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS) # all ORFs associated with GO term
Terms <- read.delim(file = sgd_terms_tfile,
header = FALSE,
quote = "",
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")
)
XX3 <- read.csv(file = all_sgd_terms_csv, stringsAsFactors = FALSE, header = TRUE)
XX3[, 1] <- paste0("GO:", formatC(XX3[, 1], width = 7, flag = "0"))
XX3[, 2] <- gsub(pattern = " ", replacement = "_", x = XX3[, 2])
XX3[, 2] <- gsub(pattern = "/", replacement = "_", x = XX3[, 2])
# Load input files
for (study_num in study_nums) {
input_file <- file.path(base_dir, paste("Exp", study_num), zscores, "zscores_interaction.csv")
if (file.exists(input_file)) {
assign(paste(X, study_num), read.csv(file = input_file, stringsAsFactors = FALSE, header = TRUE))
assign(paste(Name, study_num), labels[study_num, 2])
}
}
for (study_num in study_nums) {
eval(paste("function", study_num))
}
if (length(study_nums) > 0) {
X1$ORF <- X1$OrfRep
X1$ORF <- gsub("_1", "", x = X1$ORF)
X1$ORF <- gsub("_2", "", x = X1$ORF)
X1$ORF <- gsub("_3", "", x = X1$ORF)
X1$ORF <- gsub("_4", "", x = X1$ORF)
X1$Score_L <- "No Effect"
try(X1[is.na(X1$Z_lm_L), ]$Score_L <- "No Growth")
try(X1[!is.na(X1$Z_lm_L) & X1$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X1[!is.na(X1$Z_lm_L) & X1$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X1$Score_K <- "No Effect"
try(X1[is.na(X1$Z_lm_K), ]$Score_K <- "No Growth")
try(X1[!is.na(X1$Z_lm_K) & X1$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X1[!is.na(X1$Z_lm_K) & X1$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X1 for K and L
X1[is.na(X1$Z_lm_L), ]$Z_lm_L <- 0.001
X1[is.na(X1$Z_lm_K), ]$Z_lm_K <- 0.001
X1$Rank_L <- rank(X1$Z_lm_L)
X1$Rank_K <- rank(X1$Z_lm_K)
X1 <- X1[order(X1$OrfRep, decreasing = FALSE), ]
colnames(X1) <- paste0(colnames(X1), "_X1")
}
if (length(study_nums) > 1) {
X2$ORF <- X2$OrfRep
X2$ORF <- gsub("_1", "", x = X2$ORF)
X2$ORF <- gsub("_2", "", x = X2$ORF)
X2$ORF <- gsub("_3", "", x = X2$ORF)
X2$ORF <- gsub("_4", "", x = X2$ORF)
X2$Score_L <- "No Effect"
try(X2[is.na(X2$Z_lm_L), ]$Score_L <- "No Growth")
try(X2[!is.na(X2$Z_lm_L) & X2$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X2[!is.na(X2$Z_lm_L) & X2$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X2$Score_K <- "No Effect"
try(X2[is.na(X2$Z_lm_K), ]$Score_K <- "No Growth")
try(X2[!is.na(X2$Z_lm_K) & X2$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X2[!is.na(X2$Z_lm_K) & X2$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
#express the na data as 0.001 in X2
X2[is.na(X2$Z_lm_L), ]$Z_lm_L <- 0.001
X2[is.na(X2$Z_lm_K), ]$Z_lm_K <- 0.001
X2$Rank_L <- rank(X2$Z_lm_L)
X2$Rank_K <- rank(X2$Z_lm_K)
X2 <- X2[order(X2$OrfRep, decreasing = FALSE), ]
colnames(X2) <- paste0(colnames(X2), "_X2")
X <- cbind(X1, X2)
}
if (length(study_nums) > 2) {
X3$ORF <- X3$OrfRep
X3$ORF <- gsub("_1", "", x = X3$ORF)
X3$ORF <- gsub("_2", "", x = X3$ORF)
X3$ORF <- gsub("_3", "", x = X3$ORF)
X3$ORF <- gsub("_4", "", x = X3$ORF)
X3$Score_L <- "No Effect"
try(X3[is.na(X3$Z_lm_L), ]$Score_L <- "No Growth")
try(X3[!is.na(X3$Z_lm_L) & X3$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X3[!is.na(X3$Z_lm_L) & X3$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X3$Score_K <- "No Effect"
try(X3[is.na(X3$Z_lm_K), ]$Score_K <- "No Growth")
try(X3[!is.na(X3$Z_lm_K) & X3$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X3[!is.na(X3$Z_lm_K) & X3$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X3
X3[is.na(X3$Z_lm_L), ]$Z_lm_L <- 0.001
X3[is.na(X3$Z_lm_K), ]$Z_lm_K <- 0.001
X3$Rank_L <- rank(X3$Z_lm_L)
X3$Rank_K <- rank(X3$Z_lm_K)
X3 <- X3[order(X3$OrfRep, decreasing = FALSE), ]
colnames(X3) <- paste0(colnames(X3), "_X3")
X <- cbind(X, X3)
}
if (length(study_nums) > 3) {
X4$ORF <- X4$OrfRep
X4$ORF <- gsub("_1", "", x = X4$ORF)
X4$ORF <- gsub("_2", "", x = X4$ORF)
X4$ORF <- gsub("_3", "", x = X4$ORF)
X4$ORF <- gsub("_4", "", x = X4$ORF)
X4$Score_L <- "No Effect"
try(X4[is.na(X4$Z_lm_L), ]$Score_L <- "No Growth")
try(X4[!is.na(X4$Z_lm_L) & X4$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X4[!is.na(X4$Z_lm_L) & X4$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X4$Score_K <- "No Effect"
try(X4[is.na(X4$Z_lm_K), ]$Score_K <- "No Growth")
try(X4[!is.na(X4$Z_lm_K) & X4$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X4[!is.na(X4$Z_lm_K) & X4$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X4
X4[is.na(X4$Z_lm_L), ]$Z_lm_L <- 0.001
X4[is.na(X4$Z_lm_K), ]$Z_lm_K <- 0.001
X4$Rank_L <- rank(X4$Z_lm_L)
X4$Rank_K <- rank(X4$Z_lm_K)
X4 <- X4[order(X4$OrfRep, decreasing = FALSE), ]
colnames(X4) <- paste0(colnames(X4), "_X4")
X <- cbind(X, X4)
}
if (length(study_nums) > 4) {
X5$ORF <- X5$OrfRep
X5$ORF <- gsub("_1", "", x = X5$ORF)
X5$ORF <- gsub("_2", "", x = X5$ORF)
X5$ORF <- gsub("_3", "", x = X5$ORF)
X5$ORF <- gsub("_4", "", x = X5$ORF)
X5$Score_L <- "No Effect"
try(X5[is.na(X5$Z_lm_L), ]$Score_L <- "No Growth")
try(X5[!is.na(X5$Z_lm_L) & X5$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X5[!is.na(X5$Z_lm_L) & X5$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X5$Score_K <- "No Effect"
try(X5[is.na(X5$Z_lm_K), ]$Score_K <- "No Growth")
try(X5[!is.na(X5$Z_lm_K) & X5$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X5[!is.na(X5$Z_lm_K) & X5$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X5
X5[is.na(X5$Z_lm_L), ]$Z_lm_L <- 0.001
X5[is.na(X5$Z_lm_K), ]$Z_lm_K <- 0.001
X5$Rank_L <- rank(X5$Z_lm_L)
X5$Rank_K <- rank(X5$Z_lm_K)
X5 <- X5[order(X5$OrfRep, decreasing = FALSE), ]
colnames(X5) <- paste0(colnames(X5), "_X5")
X <- cbind(X, X5)
}
X$ORF <- X$OrfRep_X1
if (length(study_nums) > 1) {
X$ORF <- gsub("_1", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2"]
X_heatmap <- X_heatmap[, c(10, 1, 4, 5, 8, 9, 2, 3, 6, 7)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
if (length(study_nums) > 2) {
X$ORF <- gsub("_1", "", x = X$ORF)
X$ORF <- gsub("_2", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3"]
# Reorder columns
X_heatmap <- X_heatmap[, c(14, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
if (length(study_nums) > 3) {
X$ORF <- gsub("_1", "", x = X$ORF)
X$ORF <- gsub("_2", "", x = X$ORF)
X$ORF <- gsub("_3", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
try(X[X$Gene_X4 == "", ]$Gene_X4 <- X[X$Gene_X4 == "", ]$OrfRep_X4)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
colnames(X) == "Z_Shift_K_X4" | colnames(X) == "Z_lm_K_X4" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3" |
colnames(X) == "Z_Shift_L_X4" | colnames(X) == "Z_lm_L_X4"]
# Reorder columns
X_heatmap <- X_heatmap[, c(18, 1, 4, 5, 8, 9, 12, 13, 16, 17, 2, 3, 6, 7, 10, 11, 14, 15)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X4", replacement = Name4, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
if (length(study_nums) > 4) {
X$ORF <- gsub("_1", "", x = X$ORF)
X$ORF <- gsub("_2", "", x = X$ORF)
X$ORF <- gsub("_3", "", x = X$ORF)
X$ORF <- gsub("_4", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
try(X[X$Gene_X4 == "", ]$Gene_X4 <- X[X$Gene_X4 == "", ]$OrfRep_X4)
try(X[X$Gene_X5 == "", ]$Gene_X5 <- X[X$Gene_X5 == "", ]$OrfRep_X5)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
colnames(X) == "Z_Shift_K_X4" | colnames(X) == "Z_lm_K_X4" |
colnames(X) == "Z_Shift_K_X5" | colnames(X) == "Z_lm_K_X5" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3" |
colnames(X) == "Z_Shift_L_X4" | colnames(X) == "Z_lm_L_X4" |
colnames(X) == "Z_Shift_L_X5" | colnames(X) == "Z_lm_L_X5"]
# Reorder columns
X_heatmap <- X_heatmap[, c(22, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X4", replacement = Name4, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X5", replacement = Name5, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
# Theme elements for plots
theme_Publication <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
library(scales)
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
Ontology <- get_ontology(file = ontology_file, propagate_relationships = "is_a", extract_tags = "minimal")
print(Ontology)
# All ORFs associated with GO term
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
# Terms is the GO term list jwr moved up to TAABLES
Terms <- read.delim(file = sgd_terms_tfile,
header = FALSE,
quote = "",
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")
)
colormapbreaks <- c(-12, -10, -8, -6, -4, -2, 2, 4, 6, 8, 10, 12)
for (s in 1:dim(XX3)[1]) {
# Ontology <-
# get_ontology(file = "Documents/Hartman_Lab/SGD_Downloads/gene_ontology_edit.obo",
# propagate_relationships = "is_a", extract_tags = "minimal")
# Ontology_Everything <-
# get_ontology(file = "Documents/Hartman_Lab/SGD_Downloads/gene_ontology_edit.obo",
# propagate_relationships = "is_a", extract_tags = "everything")
#
# GO_ID_Arg <- "GO:0006325"
GO_ID_Arg_loop <- as.character(XX3[s, 1])
GOTerm_parent <- get_descendants(Ontology, roots = GO_ID_Arg_loop)
# GOTerm_parent <- get_descendants(Ontology,roots = "GO:0006325")
# Only make plots if parent term has fewer than 500 children
Parent_Size <- length(as.vector(GO2ALLORFs[GO_ID_Arg_loop][[1]]))
if (length(GOTerm_parent) > 100) {
next()
}
Parent_Size <- length(as.vector(GO2ALLORFs[GO_ID_Arg_loop][[1]])) # TODO why twice?
if (Parent_Size < 2) {
next()
}
if (Parent_Size > 2000) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 45,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.5, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 1000 && Parent_Size <= 2000) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 35,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 500 && Parent_Size <= 1000) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 30,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 200 && Parent_Size <= 500) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 25,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 100 && Parent_Size <= 200) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 20,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 60 && Parent_Size <= 100) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 15,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 30 && Parent_Size <= 60) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 10,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 3 && Parent_Size <= 30) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 7,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size == 2) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 7,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env Rscript
# This script will add the shift data to the finalTable.csv file
#
# May want to reorder columns in excel before making heatmaps - otherwise all the shift data will be plotted next to each other.
library(plyr)
library(dplyr)
library(sos)
args <- commandArgs(TRUE)
if (length(args) >= 1) {
finalTable <- file.path(args[1])
} else {
finalTable <- "REMcRdy_lm_only.csv-finalTable.csv" # for legacy workflow
}
if (length(args) >= 2) {
shiftFile <- file.path(args[2])
} else {
shiftFile <- "Shift_only.csv" # for legacy workflow
}
if (length(args) >= 3) {
studyInfo <- file.path(args[3])
} else {
studyInfo <- "../Code/StudyInfo.csv" # for legacy workflow
}
if (length(args) >= 4) {
output <- file.path(args[4])
} else {
output <- "REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
}
# Read in the REMc finalTable data
X <- data.frame(read.csv(file = finalTable, header = TRUE, stringsAsFactors = FALSE))
# Read in the shift data From ../JoinInteractions
Y <- data.frame(read.csv(file = shiftFile, header = TRUE, stringsAsFactors = FALSE))
Labels <- read.delim(studyInfo, skip = 0, as.is = TRUE, row.names = 1, strip.white = TRUE)
# Determine the number of cols - needed to create the correct number of new cols
Xcolnum <- length(X[1, ])
ADDnum <- Xcolnum + length(Y[1, ]) - 2
# Create new columns filled with NAs to be filled with data
Xtemp <- X
Xtemp[, (Xcolnum + 1):ADDnum] <- NA
# Match the orf names in each row to a orf name in the shift data file and then add the shift data to the finalTable file
shiftTbl < - as.data.frame(matrix(nrow = 1, ncol = length(Y) - 2)) #the df shiftTbl must be initialized before for loop
for (i in 1:length(X[, 1])) {
Shiftrownum <- match(X[i, 2], Y[, 1])
shiftTbl[i, ] <- Y[Shiftrownum, 3:length(Y[1, ])]
Xtemp[i, (Xcolnum + 1):ADDnum] <- Y[Shiftrownum, 3:length(Y[1, ])]
}
headerX <- colnames(Xtemp)
headerY <- colnames(Y)
shfHdr <- headerY[3:length(headerY)]
combTbl <- X[, 1:3]
lmTbl <- select(Xtemp, contains("Z_lm")) #X[,(4:Xcolnum-2)]
shiftTbl <- select(Xtemp, contains("V"))
clustTbl <- select(Xtemp, contains("cluster."))
# Give the new column names the same names as in the shift file
Xcols <- colnames(X)
Ycols <- colnames(Y)[3:length(Y[1, ])]
newCols <- c(Xcols[1:Xcolnum], Ycols)
# Reorder columns for generating heatmaps
combI <- combTbl #Starting Template orf, Genename columns
headersRemc <- newCols #colnames(X)
newHeaders <- newCols[1:3]
lmHdr <- colnames(lmTbl) #newCols[4:(length(Xcols)-2)]
clstHdr <- colnames(clustTbl) #select(newCols, contains('cluster.')) #newCols[3+length(lmHdr):2]
intLvHdr <- vector()
#Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
for (i in 1:(length(shiftTbl[1, ]))) {
combI <- cbind.data.frame(combI, shiftTbl[i])
combI <- cbind.data.frame(combI, lmTbl[i])
intLvHdrx <- c(shfHdr[i], lmHdr[i])
intLvHdr <- c(intLvHdr, intLvHdrx)
}
combIHdr <- c(colnames(combTbl), intLvHdr, clstHdr)
combI <- cbind.data.frame(combI, clustTbl)
colnames(combI) <- combIHdr
write.csv(combI, file = output, row.names = FALSE)

View File

@@ -0,0 +1,587 @@
suppressMessages({
library(ggplot2)
library(plotly)
library(htmlwidgets)
library(dplyr)
library(ggthemes)
library(data.table)
})
options(warn = 2, max.print = 1000)
# Constants for configuration
PLOT_WIDTH <- 14
PLOT_HEIGHT <- 9
BASE_SIZE <- 14
parse_arguments <- function() {
args <- if (interactive()) {
c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
3,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo"
)
} else {
commandArgs(trailingOnly = TRUE)
}
paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE)
names <- args[seq(6, length(args), by = 2)]
experiments <- setNames(paths, names)
list(
out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
sd = as.numeric(args[2]),
sgd_gene_list = normalizePath(file.path(args[3]), mustWork = FALSE),
easy_results_file = normalizePath(file.path(args[4]), mustWork = FALSE),
experiments = experiments
)
}
args <- parse_arguments()
dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
# Define themes and scales
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans", legend_position = "bottom") {
theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = legend_position,
legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Load SGD gene list
sgd_genes <- function(sgd_gene_list) {
read.delim(file = sgd_gene_list, quote = "", header = FALSE,
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>%
dplyr::rename(ORF = V4, GeneName = V5)
}
genes <- sgd_genes(args$sgd_gene_list)
# Define the adjust_values function
adjust_values <- function(x, delta_bg, tolerance) {
valid_x <- x[x != 0 & !is.na(x)]
max_x <- ifelse(length(valid_x) == 0, NA_real_, max(valid_x, na.rm = TRUE))
x_adjusted <- ifelse(x == 0 & !is.na(x), max_x, x)
# Adjust only if delta_bg is valid and above tolerance
x_adjusted <- ifelse(!is.na(delta_bg) & delta_bg >= tolerance, NA_real_, x_adjusted)
return(x_adjusted)
}
load_and_preprocess_data <- function(easy_results_file, genes) {
df <-
read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
# Apply further transformations
df <- df %>%
filter(
!is.na(ORF) & ORF != "",
!ORF %in% c("Blank"),
!is.na(Gene) & !Gene %in% c("BLANK", "Blank", "blank"),
Drug != "BMH21",
!is.na(Scan),
!is.na(l), !is.na(K),
!is.na(r), !is.na(AUC96)
) %>%
mutate(
Col = suppressWarnings(as.numeric(Col)),
Row = suppressWarnings(as.numeric(Row)),
L = suppressWarnings(as.numeric(l)),
K = suppressWarnings(as.numeric(K)),
r = suppressWarnings(as.numeric(r)),
Scan = suppressWarnings(as.numeric(Scan)),
AUC = suppressWarnings(as.numeric(AUC96)),
LstBackgrd = suppressWarnings(as.numeric(LstBackgrd)),
X1stBackgrd = suppressWarnings(as.numeric(X1stBackgrd)),
delta_bg = ifelse(is.na(LstBackgrd) | is.na(X1stBackgrd), NA_real_, LstBackgrd - X1stBackgrd),
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + 3 * sd(delta_bg, na.rm = TRUE),
OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep),
OrfRepUnique = paste(OrfRep, Gene, Num., sep = "_"),
conc_num = suppressWarnings(as.numeric(gsub("[^0-9\\.]", "", Conc))),
DB = as.integer(delta_bg >= delta_bg_tolerance),
SM = 0,
NG = as.integer(l == 0 & !is.na(l)),
L_adjusted = adjust_values(L, delta_bg, delta_bg_tolerance),
r_adjusted = adjust_values(r, delta_bg, delta_bg_tolerance),
AUC_adjusted = adjust_values(AUC, delta_bg, delta_bg_tolerance),
K_adjusted = adjust_values(K, delta_bg, delta_bg_tolerance),
max_L_adjusted = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)),
SM = ifelse(!is.na(max_L_adjusted) & L_adjusted == max_L_adjusted, 1, SM)
)
# Efficiently handle the assignment of Gene names
gene_map <- setNames(genes$GeneName, genes$ORF)
df$Gene <- ifelse(df$OrfRep != "YDL227C", gene_map[df$ORF], df$Gene)
df$Gene <- ifelse(df$Gene == "" | df$Gene == "OCT1", df$OrfRep, df$Gene)
return(df)
}
generate_plot <- function(df, var, plot_type = "scatter", out_dir_qc, suffix = "") {
# Filter out non-finite values before plotting
df <- df %>%
filter(is.finite(!!sym(var)))
plot_func <- switch(plot_type,
scatter = geom_point(shape = 3, size = 0.2, position = "jitter"),
boxplot = geom_boxplot())
p <- ggplot(df, aes(Scan, !!sym(var), color = as.factor(conc_num))) +
plot_func +
stat_summary(fun = mean, geom = "point", size = 0.6) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") +
ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) +
theme_publication()
pdf_path <- file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf"))
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
print(p)
dev.off()
html_path <- sub(".pdf$", ".html", pdf_path)
pgg <- suppressWarnings(ggplotly(p) %>%
layout(legend = list(orientation = "h")))
saveWidget(pgg, html_path, selfcontained = TRUE)
}
generate_and_publish_qc <- function(df, out_dir_qc) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
# Access delta_bg_tolerance from the dataframe
delta_bg_tolerance <- df$delta_bg_tolerance[1]
for (var in variables) {
if (var %in% colnames(df)) {
generate_plot(df, var, "scatter", out_dir_qc)
}
}
df_post_qc <- df %>%
mutate(across(all_of(variables), ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .)))
for (var in variables) {
if (var %in% colnames(df_post_qc)) {
generate_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc")
}
}
delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
for (var in variables) {
if (var %in% colnames(delta_bg_above_tolerance)) {
generate_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance")
}
}
df_finite <- df %>% filter(is.finite(delta_bg))
delta_bg_frequency_plot <- ggplot(df_finite, aes(delta_bg, color = as.factor(conc_num))) +
geom_density() +
ggtitle("Density plot for Delta Background by Conc All Data") +
theme_publication(legend_position = "right")
delta_bg_bar_plot <- ggplot(df_finite, aes(delta_bg, color = as.factor(conc_num))) +
geom_bar() +
ggtitle("Bar plot for Delta Background by Conc All Data") +
theme_publication(legend_position = "right")
pdf(file = file.path(out_dir_qc, "frequency_delta_background.pdf"), width = 12, height = 8)
print(delta_bg_frequency_plot)
print(delta_bg_bar_plot)
dev.off()
saveWidget(ggplotly(delta_bg_frequency_plot),
file = file.path(out_dir_qc, "frequency_delta_background_density.html"), selfcontained = TRUE)
saveWidget(ggplotly(delta_bg_bar_plot),
file = file.path(out_dir_qc, "frequency_delta_background_bar.html"), selfcontained = TRUE)
df_no_zero <- df %>% filter(L > 0 & is.finite(L))
scatter_plots <- list()
box_plots <- list()
for (var in variables) {
if (var %in% colnames(df_no_zero)) {
scatter_plots[[var]] <- ggplot(df_no_zero, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
geom_point(shape = 3, size = 0.2) +
stat_summary(fun = mean, geom = "point", size = 0.6) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") +
ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) +
theme_publication()
box_plots[[var]] <- ggplot(df_no_zero, aes(as.factor(Scan), .data[[var]], color = as.factor(conc_num))) +
geom_boxplot() +
ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) +
theme_publication()
}
}
pdf(file = file.path(out_dir_qc, "plate_analysis_no_zeros.pdf"), width = 14, height = 9)
lapply(scatter_plots, print)
dev.off()
pdf(file = file.path(out_dir_qc, "plate_analysis_no_zeros_boxplots.pdf"), width = 18, height = 9)
lapply(box_plots, print)
dev.off()
for (var in names(scatter_plots)) {
html_path <- file.path(out_dir_qc, paste0("plate_analysis_no_zeros_", var, ".html"))
pgg <- suppressWarnings(ggplotly(scatter_plots[[var]]) %>%
layout(legend = list(orientation = "h")))
saveWidget(pgg, html_path, selfcontained = TRUE)
}
for (var in names(box_plots)) {
html_path <- file.path(out_dir_qc, paste0("plate_analysis_no_zeros_boxplots_", var, ".html"))
pgg <- suppressWarnings(ggplotly(box_plots[[var]]) %>%
layout(legend = list(orientation = "h")))
saveWidget(pgg, html_path, selfcontained = TRUE)
}
}
compute_experiment_summary_stats <- function(df, out_dir, background_strain = NULL) {
if (!is.null(background_strain)) {
df <- df %>% filter(OrfRep == background_strain)
}
summary_stats <- df %>%
group_by(OrfRep, conc_num) %>%
summarize(
N = n(),
mean_L = mean(L, na.rm = TRUE),
median_L = median(L, na.rm = TRUE),
max_L = max(L, na.rm = TRUE),
min_L = min(L, na.rm = TRUE),
sd_L = sd(L, na.rm = TRUE),
se_L = sd_L / sqrt(N),
mean_K = mean(K, na.rm = TRUE),
median_K = median(K, na.rm = TRUE),
max_K = max(K, na.rm = TRUE),
min_K = min(K, na.rm = TRUE),
sd_K = sd(K, na.rm = TRUE),
se_K = sd_K / sqrt(N),
mean_r = mean(r, na.rm = TRUE),
median_r = median(r, na.rm = TRUE),
max_r = max(r, na.rm = TRUE),
min_r = min(r, na.rm = TRUE),
sd_r = sd(r, na.rm = TRUE),
se_r = sd_r / sqrt(N),
mean_AUC = mean(AUC, na.rm = TRUE),
median_AUC = median(AUC, na.rm = TRUE),
max_AUC = max(AUC, na.rm = TRUE),
min_AUC = min(AUC, na.rm = TRUE),
sd_AUC = sd(AUC, na.rm = TRUE),
se_AUC = sd_AUC / sqrt(N)
)
if (is.null(background_strain)) {
fwrite(summary_stats, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
} else {
fwrite(summary_stats, file.path(out_dir, "summary_stats_background_strains.csv"), row.names = FALSE)
}
}
filter_and_calculate_sd_k_stats <- function(df, out_dir_qc) {
df_cleaned <- df %>%
filter(is.finite(L_adjusted) & is.finite(K_adjusted))
df_within_2SD_K <- df_cleaned %>%
filter(abs(K_adjusted - mean(K_adjusted, na.rm = TRUE)) <= 2 * sd(K_adjusted, na.rm = TRUE)) %>%
group_by(conc_num) %>%
summarize(
N = n(),
mean_L = mean(L_adjusted, na.rm = TRUE),
max_L = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)),
sd_L = sd(L_adjusted, na.rm = TRUE)
)
fwrite(df_within_2SD_K, file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"))
df_outside_2SD_K <- df_cleaned %>%
filter(abs(K_adjusted - mean(K_adjusted, na.rm = TRUE)) > 2 * sd(K_adjusted, na.rm = TRUE)) %>%
group_by(conc_num) %>%
summarize(
N = n(),
mean_L = mean(L_adjusted, na.rm = TRUE),
max_L = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)),
sd_L = sd(L_adjusted, na.rm = TRUE)
)
fwrite(df_outside_2SD_K, file.path(out_dir_qc, "max_observed_L_vals_for_spots_outside_2SD_K.csv"))
p_raw_before <- ggplot(df_cleaned,
aes(x = L_adjusted, y = K_adjusted, color = as.factor(conc_num))) +
geom_point() +
ggtitle("Raw L vs K before QC") +
theme_publication(legend_position = "right")
pdf(file.path(out_dir_qc, "raw_L_vs_K_before_QC.pdf"))
print(p_raw_before)
dev.off()
saveWidget(ggplotly(p_raw_before), file.path(out_dir_qc, "raw_L_vs_K_before_QC.html"))
p_above_threshold <- ggplot(df_cleaned %>% filter(delta_bg > delta_bg_tolerance),
aes(x = L_adjusted, y = K_adjusted, color = as.factor(conc_num))) +
geom_point() +
ggtitle("Raw L vs K for strains above delta background threshold") +
theme_publication(legend_position = "right")
pdf(file.path(out_dir_qc, "raw_L_vs_K_for_strains_above_delta_bg_threshold.pdf"))
print(p_above_threshold)
dev.off()
saveWidget(ggplotly(p_above_threshold), file.path(out_dir_qc, "raw_L_vs_K_for_strains_above_delta_bg_threshold.html"))
}
compute_rf_interaction_scores <- function(df_rf, output_dir) {
lm_sd_L <- sd(df_rf$lm_Score_L, na.rm = TRUE)
lm_sd_K <- sd(df_rf$lm_Score_K, na.rm = TRUE)
lm_sd_r <- sd(df_rf$lm_Score_r, na.rm = TRUE)
lm_sd_AUC <- sd(df_rf$lm_Score_AUC, na.rm = TRUE)
lm_mean_L <- mean(df_rf$lm_Score_L, na.rm = TRUE)
lm_mean_K <- mean(df_rf$lm_Score_K, na.rm = TRUE)
lm_mean_r <- mean(df_rf$lm_Score_r, na.rm = TRUE)
lm_mean_AUC <- mean(df_rf$lm_Score_AUC, na.rm = TRUE)
df_rf <- df_rf %>%
mutate(
Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC
)
df_rf <- df_rf %>%
arrange(desc(Z_lm_L), desc(NG))
fwrite(df_rf, file.path(output_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
return(df_rf)
}
create_rank_plots <- function(interaction_scores_ranks, out_dir) {
rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
lapply(rank_vars, function(rank_var) {
p <- ggplot(interaction_scores_ranks, aes(x = !!sym(rank_var))) +
geom_bar() +
ggtitle(paste("Rank Distribution for", rank_var)) +
theme_publication()
pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf"))
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
print(p)
dev.off()
html_path <- sub(".pdf$", ".html", pdf_path)
pgg <- suppressWarnings(ggplotly(p) %>%
layout(legend = list(orientation = "h")))
saveWidget(pgg, html_path, selfcontained = TRUE)
})
}
create_correlation_plot <- function(interaction_scores, out_dir) {
interaction_scores <- interaction_scores %>%
filter_all(all_vars(is.finite(.)))
pairs <- list(
c("Raw_Shift_L", "Raw_Shift_K"),
c("Raw_Shift_L", "Raw_Shift_r"),
c("Raw_Shift_L", "Raw_Shift_AUC"),
c("Raw_Shift_K", "Raw_Shift_r"),
c("Raw_Shift_K", "Raw_Shift_AUC"),
c("Raw_Shift_r", "Raw_Shift_AUC")
)
lapply(pairs, function(vars) {
p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle(paste("Correlation between", vars[1], "and", vars[2])) +
theme_publication()
pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf"))
pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
print(p)
dev.off()
html_path <- sub(".pdf$", ".html", pdf_path)
pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>%
layout(legend = list(orientation = "h")))
saveWidget(pgg, html_path, selfcontained = TRUE)
})
}
process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
out_dir <- file.path(exp_dir, "zscores")
out_dir_qc <- file.path(out_dir, "qc")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(out_dir_qc, showWarnings = FALSE)
data <- load_and_preprocess_data(args$easy_results_file, genes)
# Generate and publish QC plots
generate_and_publish_qc(data, out_dir_qc)
# Calculate and save summary statistics
compute_experiment_summary_stats(data, out_dir)
compute_experiment_summary_stats(data, out_dir, background_strain = "YDL227C")
InteractionScores_RF <- data %>%
select(OrfRepUnique) %>%
distinct() %>%
mutate(
Gene = NA,
Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA,
R_Squared_L = NA, Sum_Z_Score_L = NA, Avg_Zscore_L = NA,
Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA,
R_Squared_K = NA, Sum_Z_Score_K = NA, Avg_Zscore_K = NA,
Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA,
R_Squared_r = NA, Sum_Z_Score_r = NA, Avg_Zscore_r = NA,
Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA,
R_Squared_AUC = NA, Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA,
NG = NA, SM = NA
)
for (i in seq_along(unique(InteractionScores_RF$OrfRepUnique))) {
Gene_Sel <- unique(InteractionScores_RF$OrfRepUnique)[i]
X_Gene_Sel <- data[data$OrfRepUnique == Gene_Sel,]
X_stats_interaction <- X_Gene_Sel %>%
group_by(OrfRepUnique, Gene) %>%
summarize(
Raw_Shift_L = mean(L_adjusted, na.rm = TRUE),
Raw_Shift_K = mean(K_adjusted, na.rm = TRUE),
Raw_Shift_r = mean(r_adjusted, na.rm = TRUE),
Raw_Shift_AUC = mean(AUC_adjusted, na.rm = TRUE),
Z_Shift_L = mean(scale(L_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
Z_Shift_K = mean(scale(K_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
Z_Shift_r = mean(scale(r_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
Z_Shift_AUC = mean(scale(AUC_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
lm_Score_L = coef(lm(L_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
R_Squared_L = summary(lm(L_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
lm_Score_K = coef(lm(K_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
R_Squared_K = summary(lm(K_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
lm_Score_r = coef(lm(r_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
R_Squared_r = summary(lm(r_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
lm_Score_AUC = coef(lm(AUC_adjusted ~ delta_bg, data = X_Gene_Sel))[2],
R_Squared_AUC = summary(lm(AUC_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared,
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE)
)
InteractionScores_RF <- InteractionScores_RF %>%
mutate(
Gene = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Gene[1], Gene),
Raw_Shift_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_L[1], Raw_Shift_L),
Z_Shift_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_L[1], Z_Shift_L),
lm_Score_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_L[1], lm_Score_L),
R_Squared_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_L[1], R_Squared_L),
Sum_Z_Score_L = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_L, na.rm = TRUE), Sum_Z_Score_L),
Avg_Zscore_L = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_L, na.rm = TRUE), Avg_Zscore_L),
Raw_Shift_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_K[1], Raw_Shift_K),
Z_Shift_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_K[1], Z_Shift_K),
lm_Score_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_K[1], lm_Score_K),
R_Squared_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_K[1], R_Squared_K),
Sum_Z_Score_K = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_K, na.rm = TRUE), Sum_Z_Score_K),
Avg_Zscore_K = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_K, na.rm = TRUE), Avg_Zscore_K),
Raw_Shift_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_r[1], Raw_Shift_r),
Z_Shift_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_r[1], Z_Shift_r),
lm_Score_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_r[1], lm_Score_r),
R_Squared_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_r[1], R_Squared_r),
Sum_Z_Score_r = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_r, na.rm = TRUE), Sum_Z_Score_r),
Avg_Zscore_r = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_r, na.rm = TRUE), Avg_Zscore_r),
Raw_Shift_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_AUC[1], Raw_Shift_AUC),
Z_Shift_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_AUC[1], Z_Shift_AUC),
lm_Score_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_AUC[1], lm_Score_AUC),
R_Squared_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_AUC[1], R_Squared_AUC),
Sum_Z_Score_AUC = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_AUC, na.rm = TRUE), Sum_Z_Score_AUC),
Avg_Zscore_AUC = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_AUC, na.rm = TRUE), Avg_Zscore_AUC),
NG = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$NG[1], NG),
DB = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$DB[1], DB),
SM = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$SM[1], SM)
)
}
fwrite(InteractionScores_RF, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
generate_and_publish_qc(data, out_dir_qc)
filter_and_calculate_sd_k_stats(data, out_dir_qc)
variables <- c("L", "K", "r", "AUC", "delta_bg")
compute_experiment_summary_stats(data, variables, out_dir)
compute_experiment_summary_stats(data, variables, out_dir, background_strain = "YDL227C")
compute_rf_interaction_scores(InteractionScores_RF, out_dir)
publish_scores(InteractionScores_RF, out_dir)
create_rank_plots(InteractionScores_RF, out_dir)
create_correlation_plot(InteractionScores_RF, out_dir)
output_file <- file.path(out_dir, "zscores_interaction.csv")
fwrite(InteractionScores_RF, output_file, row.names = FALSE)
return(output_file)
}
processed_files <- lapply(names(args$experiments), function(exp_name) {
process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir)
})
if (length(processed_files) > 1) {
combined_data <- Reduce(function(x, y) {
merge(fread(x), fread(y), by = "OrfRepUnique", all = TRUE)
}, processed_files)
combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
fwrite(combined_data, combined_output_file, row.names = FALSE)
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,945 @@
suppressMessages({
library(ggplot2)
library(plotly)
library(htmlwidgets)
library(dplyr)
library(ggthemes)
library(data.table)
library(unix)
})
options(warn = 2, max.print = 1000)
options(width = 10000)
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
soft_limit <- 30 * 1024 * 1024 * 1024
hard_limit <- 30 * 1024 * 1024 * 1024
rlimit_as(soft_limit, hard_limit)
# Constants for configuration
plot_width <- 14
plot_height <- 9
base_size <- 14
parse_arguments <- function() {
args <- if (interactive()) {
c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
3,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo",
3
)
} else {
commandArgs(trailingOnly = TRUE)
}
# Extract paths, names, and standard deviations
paths <- args[seq(4, length(args), by = 3)]
names <- args[seq(5, length(args), by = 3)]
sds <- as.numeric(args[seq(6, length(args), by = 3)])
# Normalize paths
normalized_paths <- normalizePath(paths, mustWork = FALSE)
# Create named list of experiments
experiments <- list()
for (i in seq_along(paths)) {
experiments[[names[i]]] <- list(
path = normalized_paths[i],
sd = sds[i]
)
}
list(
out_dir = normalizePath(args[1], mustWork = FALSE),
sgd_gene_list = normalizePath(args[2], mustWork = FALSE),
easy_results_file = normalizePath(args[3], mustWork = FALSE),
experiments = experiments
)
}
args <- parse_arguments()
# Should we keep output in exp dirs or combine in the study output dir?
# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
# Define themes and scales
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
theme_foundation <- ggplot2::theme_grey(base_size = base_size, base_family = base_family)
theme_foundation %+replace%
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = legend_position,
legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Load the initial dataframe from the easy_results_file
load_and_process_data <- function(easy_results_file, sd = 3) {
df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
df <- df %>%
filter(!(.[[1]] %in% c("", "Scan"))) %>%
filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>%
# Rename columns
rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>%
mutate(
across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric),
delta_bg = last_bg - first_bg,
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)),
NG = if_else(L == 0 & !is.na(L), 1, 0),
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
SM = 0,
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
conc_num_factor = as.numeric(as.factor(conc_num)) - 1
)
return(df)
}
# Update Gene names using the SGD gene list
update_gene_names <- function(df, sgd_gene_list) {
# Load SGD gene list
genes <- read.delim(file = sgd_gene_list,
quote = "", header = FALSE,
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11)))
# Create a named vector for mapping ORF to GeneName
gene_map <- setNames(genes$V5, genes$V4)
# Vectorized match to find the GeneName from gene_map
mapped_genes <- gene_map[df$ORF]
# Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found)
updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes)
# Ensure Gene is not left blank or incorrectly updated to "OCT1"
df <- df %>%
mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes))
return(df)
}
# Calculate summary statistics for all variables
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
df <- df %>%
mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .)))
summary_stats <- df %>%
group_by(across(all_of(group_vars))) %>%
summarise(
N = sum(!is.na(L)),
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
)
df_cleaned <- df %>%
select(-any_of(names(summary_stats)))
df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
}
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
# Calculate total concentration variables
total_conc_num <- length(unique(df$conc_num))
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
# Pull the background means and standard deviations from zero concentration
bg_means <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
)
bg_sd <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
)
interaction_scores <- df %>%
mutate(
WT_L = df$mean_L,
WT_K = df$mean_K,
WT_r = df$mean_r,
WT_AUC = df$mean_AUC,
WT_sd_L = df$sd_L,
WT_sd_K = df$sd_K,
WT_sd_r = df$sd_r,
WT_sd_AUC = df$sd_AUC
) %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
mutate(
N = sum(!is.na(L)),
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE),
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
) %>%
ungroup()
interaction_scores <- interaction_scores %>%
group_by(across(all_of(group_vars))) %>%
mutate(
Raw_Shift_L = mean_L[[1]] - bg_means$L,
Raw_Shift_K = mean_K[[1]] - bg_means$K,
Raw_Shift_r = mean_r[[1]] - bg_means$r,
Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC,
Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]],
Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]],
Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]],
Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]]
)
interaction_scores <- interaction_scores %>%
mutate(
Exp_L = WT_L + Raw_Shift_L,
Delta_L = mean_L - Exp_L,
Exp_K = WT_K + Raw_Shift_K,
Delta_K = mean_K - Exp_K,
Exp_r = WT_r + Raw_Shift_r,
Delta_r = mean_r - Exp_r,
Exp_AUC = WT_AUC + Raw_Shift_AUC,
Delta_AUC = mean_AUC - Exp_AUC
)
interaction_scores <- interaction_scores %>%
mutate(
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
)
# Calculate linear models and interaction scores
interaction_scores <- interaction_scores %>%
mutate(
lm_L = lm(Delta_L ~ conc_num_factor),
lm_K = lm(Delta_K ~ conc_num_factor),
lm_r = lm(Delta_r ~ conc_num_factor),
lm_AUC = lm(Delta_AUC ~ conc_num_factor),
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC
)
interaction_scores <- interaction_scores %>%
mutate(
Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE)
)
interaction_scores_all <- interaction_scores %>%
mutate(
Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1),
lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
r_squared_L = summary(lm_L)$r.squared,
r_squared_K = summary(lm_K)$r.squared,
r_squared_r = summary(lm_r)$r.squared,
r_squared_AUC = summary(lm_AUC)$r.squared
)
# Calculate Z_lm for each variable
interaction_scores_all <- interaction_scores_all %>%
mutate(
Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE),
Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE),
Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
)
# Arrange results by Z_lm_L and NG
interaction_scores_all <- interaction_scores_all %>%
arrange(desc(lm_Score_L)) %>%
arrange(desc(NG)) %>%
ungroup()
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
}
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
`%||%` <- function(a, b) if (!is.null(a)) a else b
# Generalized plot generation
plots <- lapply(plot_configs, function(config) {
df <- config$df
plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var))))
# Rank plots with SD annotations
if (config$plot_type == "rank") {
plot <- plot + geom_point(size = 0.1, shape = 3)
# Adding SD bands
if (!is.null(config$sd_band)) {
for (i in seq_len(config$sd_band)) {
plot <- plot +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) +
geom_hline(yintercept = c(-i, i), color = "gray")
}
}
# Optionally add enhancer/suppressor count annotations
if (!is.null(config$enhancer_label)) {
plot <- plot + annotate("text", x = config$enhancer_label$x,
y = config$enhancer_label$y, label = config$enhancer_label$label) +
annotate("text", x = config$suppressor_label$x,
y = config$suppressor_label$y, label = config$suppressor_label$label)
}
}
# Correlation plots
if (config$plot_type == "correlation") {
plot <- plot + geom_point(shape = 3, color = "gray70") +
geom_smooth(method = "lm", color = "tomato3") +
annotate("text", x = 0, y = 0, label = config$correlation_text)
}
# General scatter/boxplot/density handling
if (!is.null(config$y_var)) {
plot <- plot + aes(y = !!sym(config$y_var))
y_mean_col <- paste0("mean_", config$y_var)
y_sd_col <- paste0("sd_", config$y_var)
if (config$y_var == "delta_bg" && config$plot_type == "scatter") {
plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
} else if (config$error_bar %||% FALSE) {
plot <- plot +
geom_point(shape = 3, size = 0.2) +
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
}
}
# Plot type selection
plot <- switch(config$plot_type,
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = "identity"),
plot + geom_point() + geom_smooth(method = "lm", se = FALSE))
# Adding y-limits if provided
if (!is.null(config$ylim_vals)) {
plot <- plot + coord_cartesian(ylim = config$ylim_vals)
}
# Setting legend position, titles, and labels
legend_position <- config$legend_position %||% "bottom"
plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position)
if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label)
if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
# Adding text annotations if provided
if (!is.null(config$annotations)) {
for (annotation in config$annotations) {
plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
}
}
return(plot)
})
# If grid_layout is provided, arrange plots in a grid and save in a single PDF
if (!is.null(grid_layout)) {
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
# Loop through plots in chunks defined by ncol and nrow
for (start_idx in seq(1, length(plots), by = grid_layout$ncol * grid_layout$nrow)) {
end_idx <- min(start_idx + grid_layout$ncol * grid_layout$nrow - 1, length(plots))
plot_subset <- plots[start_idx:end_idx]
# Arrange plots in a grid
do.call(grid.arrange, c(plot_subset, ncol = grid_layout$ncol, nrow = grid_layout$nrow))
}
dev.off()
# Save as HTML (optional)
plotly_plots <- lapply(plots, function(plot) {
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
})
combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow, margin = 0.05)
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, "_grid.html")), selfcontained = TRUE)
} else {
# Save individual plots to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plots, print)
dev.off()
# Convert plots to plotly and save as HTML
plotly_plots <- lapply(plots, function(plot) {
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
})
combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05)
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
}
}
generate_interaction_plot_configs <- function(df, variables) {
plot_configs <- list()
for (variable in variables) {
# Define the y-limits based on the variable being plotted
ylim_vals <- switch(variable,
"L" = c(-65, 65),
"K" = c(-65, 65),
"r" = c(-0.65, 0.65),
"AUC" = c(-6500, 6500)
)
# Dynamically generate the column names for standard deviation and delta
wt_sd_col <- paste0("WT_sd_", variable)
delta_var <- paste0("Delta_", variable)
z_shift <- paste0("Z_Shift_", variable)
z_lm <- paste0("Z_lm_", variable)
# Set annotations for ZShift, Z lm Score, NG, DB, SM
annotations <- list(
list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45,
ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))),
list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25,
ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))),
list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25,
ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)),
list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35,
ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)),
list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45,
ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM))
)
# Add scatter plot configuration for this variable
plot_configs[[length(plot_configs) + 1]] <- list(
df = df,
x_var = "conc_num_factor",
y_var = delta_var,
plot_type = "scatter",
title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]),
ylim_vals = ylim_vals,
annotations = annotations,
error_bar = list(
ymin = 0 - (2 * df[[wt_sd_col]][1]),
ymax = 0 + (2 * df[[wt_sd_col]][1])
),
x_breaks = unique(df$conc_num_factor),
x_labels = unique(as.character(df$conc_num)),
x_label = unique(df$Drug[1])
)
# Add box plot configuration for this variable
plot_configs[[length(plot_configs) + 1]] <- list(
df = df,
x_var = "conc_num_factor",
y_var = variable,
plot_type = "box",
title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]),
ylim_vals = ylim_vals,
annotations = annotations,
error_bar = FALSE, # Boxplots typically don't need error bars
x_breaks = unique(df$conc_num_factor),
x_labels = unique(as.character(df$conc_num)),
x_label = unique(df$Drug[1])
)
}
return(plot_configs)
}
generate_rank_plot_configs <- function(df, rank_var, zscore_var, label_prefix) {
configs <- list()
for (sd_band in c(1, 2, 3)) {
# Annotated version
configs[[length(configs) + 1]] <- list(
df = df,
x_var = rank_var,
y_var = zscore_var,
plot_type = "rank",
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
sd_band = sd_band,
enhancer_label = list(
x = nrow(df) / 2, y = 10,
label = paste("Deletion Enhancers =", nrow(df[df[[zscore_var]] >= sd_band, ]))
),
suppressor_label = list(
x = nrow(df) / 2, y = -10,
label = paste("Deletion Suppressors =", nrow(df[df[[zscore_var]] <= -sd_band, ]))
)
)
# Non-annotated version
configs[[length(configs) + 1]] <- list(
df = df,
x_var = rank_var,
y_var = zscore_var,
plot_type = "rank",
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
sd_band = sd_band
)
}
return(configs)
}
generate_correlation_plot_configs <- function(df, lm_list, lm_summaries) {
lapply(seq_along(lm_list), function(i) {
r_squared <- round(lm_summaries[[i]]$r.squared, 3)
list(
x_var = names(lm_list)[i][1],
y_var = names(lm_list)[i][2],
plot_type = "scatter",
title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]),
annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared)))
)
})
}
# Adjust missing values and calculate ranks
adjust_missing_and_rank <- function(df, variables) {
# Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables
df <- df %>%
mutate(across(all_of(variables), list(
Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))),
Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))),
Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))),
Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column())))
), .names = "{fn}_{col}"))
return(df)
}
main <- function() {
lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]]
exp_path <- exp$path
exp_sd <- exp$sd
out_dir <- file.path(exp_path, "zscores")
out_dir_qc <- file.path(exp_path, "zscores", "qc")
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
# Load and process data
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
df <- update_gene_names(df, args$sgd_gene_list)
max_conc <- max(df$conc_num_factor)
# QC steps and filtering
df_above_tolerance <- df %>% filter(DB == 1)
# Calculate the half-medians for `L` and `K` for rows above tolerance
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
# Get the number of rows that are above tolerance
rows_to_remove <- nrow(df_above_tolerance)
# Set L, r, K, and AUC to NA for rows that are above tolerance
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
# Calculate summary statistics for all strains, including both background and the deletions
message("Calculating summary statistics for all strains")
variables <- c("L", "K", "r", "AUC")
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats <- ss$summary_stats
df_na_stats <- ss$df_with_stats
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
print("Summary stats:")
print(head(summary_stats), width = 200)
# Remove rows with 0 values in L
df_no_zeros <- df_na %>% filter(L > 0)
# Additional filtering for non-finite values
df_na_filtered <- df_na %>%
filter(if_any(c(L), ~ !is.finite(.))) %>%
{
if (nrow(.) > 0) {
message("Removing non-finite rows:\n")
print(head(., n = 10))
}
df_na %>% filter(if_all(c(L), is.finite))
}
# Filter data within and outside 2SD
message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na_stats %>%
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
df_na_outside_2sd_k <- df_na_stats %>%
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
# Summary statistics for within and outside 2SD of K
message("Calculating summary statistics for L within 2SD of K")
# TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_within_2sd_k_stats <- ss$summary_stats
df_na_l_within_2sd_k_stats <- ss$df_with_stats
message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k_stats <- ss$summary_stats
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
# Write CSV files
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
# Plots
# Print quality control graphs before removing data due to contamination and
# adjusting missing data to max theoretical values
l_vs_k_plots <- list(
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC",
color_var = "conc_num",
legend_position = "right"
)
)
above_threshold_plots <- list(
list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
color_var = "conc_num",
annotations = list(
list(
x = L_half_median,
y = K_half_median,
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
)
),
error_bar = FALSE,
legend_position = "right"
)
)
frequency_delta_bg_plots <- list(
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
title = "Density plot for Delta Background by Conc All Data",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Density",
error_bar = FALSE,
legend_position = "right"
),
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Count",
error_bar = FALSE,
legend_position = "right"
)
)
plate_analysis_plots <- list()
for (plot_type in c("scatter", "box")) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
for (var in variables) {
for (stage in c("before", "after")) {
if (stage == "before") {
df_plot <- df
} else {
df_plot <- df_na # TODO use df_na_filtered if necessary
}
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = error_bar, color_var = "conc_num")
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
}
}
}
plate_analysis_no_zero_plots <- list()
for (plot_type in c("scatter", "box")) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
for (var in variables) {
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(
df = df_no_zeros,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
error_bar = error_bar,
color_var = "conc_num"
)
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
}
}
l_outside_2sd_k_plots <- list(
list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter",
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"
)
)
delta_bg_outside_2sd_k_plots <- list(
list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"
)
)
# Generate and save plots for each QC step
message("Generating QC plots")
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
# Clean up
rm(df, df_above_tolerance, df_no_zeros)
# TODO: Originally this filtered L NA's
# Let's try to avoid for now since stats have already been calculated
# Process background strains
bg_strains <- c("YDL227C")
lapply(bg_strains, function(strain) {
message("Processing background strain: ", strain)
# Handle missing data by setting zero values to NA
# and then removing any rows with NA in L col
df_bg <- df_na %>%
filter(OrfRep == strain) %>%
mutate(
L = if_else(L == 0, NA, L),
K = if_else(K == 0, NA, K),
r = if_else(r == 0, NA, r),
AUC = if_else(AUC == 0, NA, AUC)
) %>%
filter(!is.na(L))
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
ss <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats_bg <- ss$summary_stats
df_bg_stats <- ss$df_with_stats
write.csv(summary_stats_bg,
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
row.names = FALSE)
# Filter reference and deletion strains
# Formerly X2_RF (reference strains)
df_reference <- df_na_stats %>%
filter(OrfRep == strain) %>%
mutate(SM = 0)
# Formerly X2 (deletion strains)
df_deletion <- df_na_stats %>%
filter(OrfRep != strain) %>%
mutate(SM = 0)
# Set the missing values to the highest theoretical value at each drug conc for L
# Leave other values as 0 for the max/min
reference_strain <- df_reference %>%
group_by(conc_num) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup()
# Ditto for deletion strains
deletion_strains <- df_deletion %>%
group_by(conc_num) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup()
# Calculate interactions
variables <- c("L", "K", "r", "AUC")
message("Calculating interaction scores")
print("Reference strain:")
print(head(reference_strain))
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
print("Deletion strains:")
print(head(deletion_strains))
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
zscores_calculations_reference <- reference_results$zscores_calculations
zscores_interactions_reference <- reference_results$zscores_interactions
zscores_calculations <- deletion_results$zscores_calculations
zscores_interactions <- deletion_results$zscores_interactions
# Writing Z-Scores to file
write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
# Create interaction plots
reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
# Define conditions for enhancers and suppressors
# TODO Add to study config file?
threshold <- 2
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
# Subset data
enhancers_L <- zscores_interactions[enhancer_condition_L, ]
suppressors_L <- zscores_interactions[suppressor_condition_L, ]
enhancers_K <- zscores_interactions[enhancer_condition_K, ]
suppressors_K <- zscores_interactions[suppressor_condition_K, ]
# Save enhancers and suppressors
message("Writing enhancer/suppressor csv files")
write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L.csv"), row.names = FALSE)
write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L.csv"), row.names = FALSE)
write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K.csv"), row.names = FALSE)
write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K.csv"), row.names = FALSE)
# Combine conditions for enhancers and suppressors
enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ]
enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ]
# Save combined enhancers and suppressors
write.csv(enhancers_and_suppressors_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_L.csv"), row.names = FALSE)
write.csv(enhancers_and_suppressors_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_K.csv"), row.names = FALSE)
# Handle linear model based enhancers and suppressors
lm_threshold <- 2
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
# Save linear model based enhancers and suppressors
message("Writing linear model enhancer/suppressor csv files")
write.csv(enhancers_lm_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L_lm.csv"), row.names = FALSE)
write.csv(suppressors_lm_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L_lm.csv"), row.names = FALSE)
write.csv(enhancers_lm_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K_lm.csv"), row.names = FALSE)
write.csv(suppressors_lm_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
zscores_interactions_adjusted <- adjust_missing_and_rank(zscores_interactions)
# Generate all rank plot configurations for L and K
rank_plot_configs <- c(
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_L", "Avg_Zscore_L", "L"),
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_K", "Avg_Zscore_K", "K")
)
# Generate and save rank plots
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots",
plot_configs = rank_plot_config, grid_layout = list(ncol = 3, nrow = 2))
# # Correlation plots
# lm_list <- list(
# lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_r ~ Z_lm_K, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered)
# )
lm_summaries <- lapply(lm_list, summary)
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries)
generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs)
})
})
}
main()

View File

@@ -0,0 +1,209 @@
suppressMessages({
library("ggplot2")
library("plotly")
library("htmlwidgets")
library("extrafont")
library("grid")
library("ggthemes")
})
# Constants for configuration
PLOT_WIDTH <- 12
PLOT_HEIGHT <- 8
BASE_SIZE <- 14
options(warn = 2, max.print = 100)
# Parse arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"Exp1",
"/path/to/exp1_file.csv",
"Exp2",
"/path/to/exp2_file.csv",
"/path/to/output_dir"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
list(
exp1_name = args[1],
exp1_file = normalizePath(args[2], mustWork = TRUE),
exp2_name = args[3],
exp2_file = normalizePath(args[4], mustWork = TRUE),
output_dir = normalizePath(args[5], mustWork = FALSE)
)
}
args <- parse_arguments()
# Create output directories if they don't exist
pairDirL <- file.path(args$output_dir, paste0("PairwiseCompareL_", args$exp1_name, "-", args$exp2_name))
pairDirK <- file.path(args$output_dir, paste0("PairwiseCompareK_", args$exp1_name, "-", args$exp2_name))
dir.create(pairDirL, showWarnings = FALSE, recursive = TRUE)
dir.create(pairDirK, showWarnings = FALSE, recursive = TRUE)
# Define themes and scales
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
theme_publication(base_size, base_family) +
theme(
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Read and merge data
load_and_merge_data <- function(file1, file2) {
df1 <- read.csv(file = file1, stringsAsFactors = FALSE, header = TRUE)
df2 <- read.csv(file = file2, stringsAsFactors = FALSE, header = TRUE)
merge(df1, df2, by = "Term_Avg", all = TRUE, suffixes = c("_df1", "_df2"))
}
# Generate a plot and save to PDF and HTML
generate_plot <- function(data, x_var, y_var, color_var, title, file_path, theme_function) {
gg <- ggplot(data = data, aes_string(
x = x_var,
y = y_var,
color = color_var
)) +
xlab(paste("GO Term Avg lm Z for", args$exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", args$exp2_name)) +
ggtitle(title) +
theme_function()
pdf(file.path(file_path, ".pdf"), width = PLOT_WIDTH, height = PLOT_HEIGHT)
print(gg)
dev.off()
pgg <- ggplotly(gg)
htmlwidgets::saveWidget(pgg, file.path(file_path, ".html"))
}
# Identify and annotate specific interactions
annotate_interactions <- function(df, exp1_name, exp2_name, suffix) {
df$Overlap_Avg <- NA
interactions <- list(
"df1_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] < 2),
"df1_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] > -2),
"df2_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] < 2),
"df2_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] > -2),
"Overlap_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2),
"Overlap_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2),
"df2_Aggravators_df1_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2),
"df2_Alleviators_df1_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2)
)
annotation <- list(
df1_Specific_Aggravators = paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"),
df1_Specific_Alleviators = paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"),
df2_Specific_Aggravators = paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"),
df2_Specific_Alleviators = paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"),
Overlap_Aggravators = "Overlapping_Deletion_Enhancers",
Overlap_Alleviators = "Overlapping_Deletion_Suppressors",
df2_Aggravators_df1_Alleviators = paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"),
df2_Alleviators_df1_Aggravators = paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")
)
for (key in names(interactions)) {
try(df$Overlap_Avg[interactions[[key]]] <- annotation[[key]])
}
df
}
# Rank and filter data
rank_and_filter_data <- function(df, suffix) {
z1 <- df
z1[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df1")]] - z1[[paste0("Z_lm_", suffix, "_SD_df1")]]
z1[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df2")]] - z1[[paste0("Z_lm_", suffix, "_SD_df2")]]
z2 <- df
z2[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df1")]] + z2[[paste0("Z_lm_", suffix, "_SD_df1")]]
z2[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df2")]] + z2[[paste0("Z_lm_", suffix, "_SD_df2")]]
df_above_threshold <- df[!is.na(df$Overlap_Avg), ]
df_above_threshold$df1_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df1")]], ties.method = "random")
df_above_threshold$df2_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df2")]], ties.method = "random")
list(z1 = z1, z2 = z2, df_above_threshold = df_above_threshold)
}
# Main execution function for a pairwise comparison
run_pairwise_comparison <- function(suffix, dir) {
df <- load_and_merge_data(args$exp1_file, args$exp2_file)
# Generate initial ontology-based plot
generate_plot(df,
paste0("Z_lm_", suffix, "_Avg_df1"), paste0("Z_lm_", suffix, "_Avg_df2"), "Ontology_Avg_df1",
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name),
file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOntology")),
theme_publication_legend_right)
# Annotate interactions and generate overlap-based plot
df <- annotate_interactions(df, args$exp1_name, args$exp2_name, suffix)
ranks <- rank_and_filter_data(df, suffix)
generate_plot(df,
paste0("Z_lm_", suffix, "_Avg_df1"),
paste0("Z_lm_", suffix, "_Avg_df2"),
"Overlap_Avg",
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name),
file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap")),
theme_publication_legend_right)
generate_plot(ranks$df_above_threshold,
paste0("Z_lm_", suffix, "_Avg_df1"),
paste0("Z_lm_", suffix, "_Avg_df2"),
"Overlap_Avg",
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name, "Above Threshold"),
file.path(dir,
paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap_AboveThreshold")),
theme_publication_legend_right)
# Save CSV files
write.csv(df, file.path(dir,
paste0("All_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")),
row.names = FALSE)
write.csv(ranks$df_above_threshold,
file.path(dir, paste0("AboveThreshold_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")),
row.names = FALSE)
}
# Execute Pairwise L and Pairwise K comparisons
run_pairwise_comparison("L", pairDirL)
run_pairwise_comparison("K", pairDirK)

View File

@@ -0,0 +1,305 @@
#!/usr/bin/env Rscript
# This script will make heatmaps for the REMc analysis
# need to give the input "finalTable.csv" file after running REMc generated by eclipse
library(RColorBrewer)
library(gplots)
args <- commandArgs(TRUE)
# Set output dir
if (length(args) >= 1) {
input_finalTable <- file.path(args[1])
} else {
input_finalTable <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
}
if (length(args) >= 2) {
outDir <- file.path(args[2])
} else {
outDir <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
}
hmapfile <- data.frame(read.csv(file = input_finalTable, header = TRUE, sep = ",", stringsAsFactors = FALSE))
# set NAs to NA
hmapfile[hmapfile == -100] <- NA
hmapfile[hmapfile == 100] <- NA
hmapfile[hmapfile == 0.001] <- NA
hmapfile[hmapfile == -0.001] <- NA
# select the number of rows based on the number of genes
num_total_genes <- length(hmapfile[, 1])
# Break out the cluster names so each part of the cluster origin can be accessed
# line below removed because it adds to many genes to clusters when going past 1-0-10
# since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
# hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin)
hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
#use tail(x,n) for accessing the outward most cluster
clust_rounds <- 0
for (i in 1:num_total_genes) {
if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
clust_rounds <- length(hmapfile$cluster.origin[[i]])
}
}
unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
unique_clusts <- unique_clusts[unique_clusts != " "]
# Select only the unique cluster names
unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
num_unique_clusts <- length(unique_clusts)
# Base the color key on a statistical analysis of the L and K data
# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
# scale() will calculate the mean and standard deviation of the entire vector,
# then "scale" each element by those values by subtracting the mean and dividing by the sd.
# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
# change so that the L data is multiplied to be on the same scale as the K data
KEY_MIN <- 0
KEY_MAX <- 0
K_MIN <- 0
L_MAX <- 0
KcolumnValues <- vector()
LcolumnValues <- vector()
for (i in 4:(length(hmapfile[1, ]) - 2)){
if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
KcolumnValues <- append(KcolumnValues, i)
}
if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
LcolumnValues <- append(LcolumnValues, i)
}
}
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2]
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2]
# Z scores are
L_MAX <- 12
K_MIN <- -12
# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
# if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){
# print("FOUND SHIFT VALUES")
# hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
# }
# KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
# KEY_MIN <- as.numeric(K_MIN)
KEY_MAX <- as.numeric(L_MAX)
KEY_MIN <- as.numeric(K_MIN)
print(KEY_MIN)
print(L_MAX)
# print(L_Multiplier)
colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6), KEY_MIN * (2 / 6),
KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6), KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
# print(colormapbreaks)
# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
# However since also using this to amend the first part.
# Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
# then could just use these values and create spaces only between interaction values
# possibly could get rid of redundant shift values if we don't want to view these
# could we pool all the shift data/average it?
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
# ev_repeat = rep("white",length(even_columns))
# ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
# middle_col <- (length(hmapfile[1,]) - 5)/2
# ev_repeat[(middle_col/2)] <- "black"
# print(ev_repeat)
}
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
print("NO SHIFT VALS FOUND")
}
# FOR THIS SCRIPT ONLY (rap tem hu script)
# even_columns <- c(2,5,7,10,12,15,17)
# m <- 0
colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 2)])
# print(colnames_edit)
for (i in 1:length(colnames_edit)) {
if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
colnames_edit[i] <- ""
colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
# INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
# print(length(unlist(INT_store)))
# if(length(unlist(INT_store)) == 4){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ")
# }
# if(length(unlist(INT_store)) == 3){
#
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ")
# }
# if(length(unlist(INT_store)) == 5){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ")
# }
# if(length(unlist(INT_store)) == 6){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ")
# }
}
}
print(colnames_edit)
# break()
# colnames_edit[5] <- "TEM HLEG K"
# colnames_edit[10] <- "TEM HL K"
# colnames_edit[15] <- "TEM HLEG L"
# colnames_edit[20] <- "TEM HL L"
# Create the heatmaps
for (i in 1:num_unique_clusts) {
cluster <- unique_clusts[i]
cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
cluster_length <- length(cluster_data[, 1])
if (cluster_length != 1) {
X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 2)])
if (cluster_length >= 2001) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 20, width = 15)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132, 132),
xlab = "Type of Media", ylab = "Gene Name",
# cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
# ColSideColors=ev_repeat,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
# abline(v=0.5467,col="black")
dev.off()
}
if (cluster_length >= 201 && cluster_length <= 2000) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 15, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
# abline(v=0.5316,col="black")
dev.off()
}
if (cluster_length >= 150 && cluster_length <= 200) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length >= 101 && cluster_length <= 149) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
#zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length >= 60 && cluster_length <= 100) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length <= 59 && cluster_length >= 30) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 9, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length <= 29) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 7, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA,
distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
}
# print(paste("FINISHED", "CLUSTER",cluster,sep=" "))
}

View File

@@ -0,0 +1,364 @@
#!/usr/bin/env Rscript
library("RColorBrewer")
library("gplots")
library("tidyverse")
args <- commandArgs(TRUE)
# Define the output path for the heatmaps - create this folder first - in linux terminal in the working folder use > mkdir filename_heatmaps
output_path <- file.path(Args[1])
# Need to give the input "finalTable.csv" file after running REMc generated by eclipse
final_table <- file.path(args[2])
# Give the damp_list.txt as the third argument - will color the gene names differently
damps <- file.path(Args[3])
damp_list <- read.delim(file = damps, header = FALSE, stringsAsFactors = FALSE)
# Give the yeast human homology mapping as the fourth argument - will add the genes to the finalTable and use info for heatmaps
map_file <- file.path(Args[4])
mapping <- read.csv(file = map_file, stringsAsFactors = FALSE)
# Read in finalTablewithShift
hmapfile <- data.frame(read.csv(file = final_table, header = TRUE, sep = ",", stringsAsFactors = FALSE))
# Map the finalTable to the human homolog file
hmapfile_map <- hmapfile
# Match using OrfRep after dropping the _1 _2 _3 _4
# But need to also account for some older files have ORF as column name rather than OrfRep in finalTable file
if (colnames(hmapfile_map)[2] == "OrfRep") {
try(hmapfile_map$ORFMatch <- hmapfile_map$OrfRep)
}
if (colnames(hmapfile_map)[2] == "ORF") {
try(hmapfile_map$ORFMatch <- hmapfile_map$ORF)
}
hmapfile_map$ORFMatch <- gsub("_1", "", x = hmapfile_map$ORFMatch)
hmapfile_map$ORFMatch <- gsub("_2", "", x = hmapfile_map$ORFMatch)
hmapfile_map$ORFMatch <- gsub("_3", "", x = hmapfile_map$ORFMatch)
hmapfile_map$ORFMatch <- gsub("_4", "", x = hmapfile_map$ORFMatch)
# Join the hmapfile using
hmapfile_w_homolog <- full_join(hmapfile_map, mapping, by = c("ORFMatch" = "ensembl_gene_id"))
# Remove matches that are not from the finalTable
hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$likelihood) == FASLE, ]
# Write csv with all info from mapping file
write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologAll.csv")), row.names = FALSE)
# Remove the non matches and output another mapping file - this is also one used to make heatmaps
hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$external_gene_name_Human) == FALSE, ]
write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologMatchesOnly.csv"), row.names = FALSE))
# Add human gene name to the Gene column
hmapfile_w_homolog$Gene <- paste(hmapfile_w_homolog$Gene, hmapfile_w_homolog$external_gene_name_Human, sep = "/")
# Only keep the finalTable file columns and the homology info
hmap_len <- dim(hmapfile)[2]
hmapfile_w_homolog_remake <-
cbind(hmapfile_w_homolog[, 1:hmap_len], hsapiens_homolog_orthology_type = hmapfile_w_homolog$hsapiens_homolog_orthology_type)
hmapfile <- hmapfile_w_homolog_remake
# Set NAs to NA
hmapfile[hmapfile == -100] <- NA
hmapfile[hmapfile == 100] <- NA
hmapfile[hmapfile == 0.001] <- NA
hmapfile[hmapfile == -0.001] <- NA
# Select the number of rows based on the number of genes
num_total_genes <- length(hmapfile[, 1])
# Break out the cluster names so each part of the cluster origin can be accessed
# Line below removed because it adds to many genes to clusters when going past 1-0-10
# since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
# hmapfile$cluster.origin = gsub(" ","",x = hmapfile$cluster.origin)
hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
# use tail(x,n) for accessing the outward most cluster
clust_rounds <- 0
for (i in 1:num_total_genes) {
if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
clust_rounds <- length(hmapfile$cluster.origin[[i]])
}
}
unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
unique_clusts <- unique_clusts[unique_clusts != " "]
#select only the unique cluster names
unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
num_unique_clusts <- length(unique_clusts)
# Base the color key on a statistical analysis of the L and K data
# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
# scale() will calculate the mean and standard deviation of the entire vector
# then "scale" each element by those values by subtracting the mean and dividing by the sd
# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
# Change so that the L data is multiplied to be on the same scale as the K data
KEY_MIN <- 0
KEY_MAX <- 0
K_MIN <- 0
L_MAX <- 0
KcolumnValues <- vector()
LcolumnValues <- vector()
for (i in 4:(length(hmapfile[1, ]) - 3)){
if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
KcolumnValues <- append(KcolumnValues, i)
}
if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
LcolumnValues <- append(LcolumnValues, i)
}
}
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[2]
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm = TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm = TRUE)[2]
# Z scores are
L_MAX <- 12
K_MIN <- -12
# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
# if(grepl("SHIFT",colnames(hmapfile)[4],fixed = TRUE) == TRUE){
# print("FOUND SHIFT VALUES")
# hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
# }
#KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
#KEY_MIN <- as.numeric(K_MIN)
KEY_MAX <- as.numeric(L_MAX)
KEY_MIN <- as.numeric(K_MIN)
print(KEY_MIN)
print(L_MAX)
#print(L_Multiplier)
colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6),
KEY_MIN * (2 / 6), KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6),
KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
# print(colormapbreaks)
# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
# However since also using this to amend the first part.
# Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
# then could just use these values and create spaces only between interaction values
# possibly could get rid of redundant shift values if we don't want to view these
# could we pool all the shift data/average it?
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
# ev_repeat = rep("white",length(even_columns))
# ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
# middle_col <- (length(hmapfile[1,]) - 5)/2
# ev_repeat[(middle_col/2)] <- "black"
# print(ev_repeat)
}
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
print("NO SHIFT VALS FOUND")
}
# for this script only (rap tem hu script)
# even_columns <- c(2,5,7,10,12,15,17)
# m <- 0
colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 3)])
colnames(damp_list)[1] <- "ORF"
hmapfile$damps <- "YKO"
colnames(hmapfile)[2] <- "ORF"
try(hmapfile[hmapfile$ORF %in% damp_list$ORF, ]$damps <- "YKD")
# X <- X[order(X$damps,decreasing = TRUE),]
hmapfile$color2 <- NA
try(hmapfile[hmapfile$damps == "YKO", ]$color2 <- "black")
try(hmapfile[hmapfile$damps == "YKD", ]$color2 <- "red")
hmapfile$color <- NA
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_many2many", ]$color <- "#F8766D")
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2many", ]$color <- "#00BA38")
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2one", ]$color <- "#619CFF")
# print(colnames_edit)
for (i in 1:length(colnames_edit)) {
if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
colnames_edit[i] <- ""
colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
# INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
# print(length(unlist(INT_store)))
# if(length(unlist(INT_store)) == 4){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep = " ")
# }
# if(length(unlist(INT_store)) == 3){
#
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep = " ")
# }
# if(length(unlist(INT_store)) == 5){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep = " ")
# }
# if(length(unlist(INT_store)) == 6){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep = " ")
# }
}
}
print(colnames_edit)
# break()
# colnames_edit[5] <- "TEM HLEG K"
# colnames_edit[10] <- "TEM HL K"
# colnames_edit[15] <- "TEM HLEG L"
# colnames_edit[20] <- "TEM HL L"
# Create the heatmaps
for (i in 1:num_unique_clusts) {
cluster <- unique_clusts[i]
cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
cluster_length <- length(cluster_data[, 1])
if (cluster_length != 1) {
X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 6)])
if (cluster_length >= 2001) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 20, width = 15)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
# cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
# ColSideColors = ev_repeat,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
# abline(v = 0.5467,col = "black")
dev.off()
}
if (cluster_length >= 201 && cluster_length <= 2000) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 15, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
# abline(v = 0.5316,col = "black")
dev.off()
}
if (cluster_length >= 150 && cluster_length <= 200) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length >= 101 && cluster_length <= 149) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length >= 60 && cluster_length <= 100) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length <= 59 && cluster_length >= 30) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 9, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length <= 29) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 7, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA,
distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
}
# print(paste("FINISHED", "CLUSTER",cluster,sep = " "))
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env R
# GTA (GoTermAveraging)
# Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
#
# Updated 240724 Bryan C Roessler to improve file operations and portability
# NOTE: The script now has 2 additional OPTIONAL arguments:
# 1. Path to SGD terms file (go.terms.tab)
# 2. Path to SGD features file (gene_association.sgd)
library("stringr")
library("org.Sc.sgd.db")
library("plyr")
# Parse arguments
args <- commandArgs(TRUE)
exp_name <- args[1]
if (length(args) >= 2) {
zscores_file <- args[2]
} else {
zscores_file <- "zscores/zscores_interaction.csv" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
if (length(args) >= 3) {
sgd_terms_file <- args[3]
} else {
sgd_terms_file <- "go_terms.tab"
}
if (length(args) >= 4) {
sgd_features_file <- args[4]
} else {
sgd_features_file <- "gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
if (length(args) >= 5) {
output_dir <- args[5]
} else {
output_dir <- "../../out/gta" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
# # Set SGDgeneList file path
# if (length(args) > 4) {
# SGDgeneList <- args[4]
# } else {
# SGDgeneList <- "../Code/SGD_features.tab"
# Begin for loop for experiments in this study
# ZScores_Interaction.csv
for (m in 1:length(zscores_file)) {
# zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1]
X <- read.csv(file = zscores_file[m], stringsAsFactors = FALSE, header = TRUE)
if (colnames(X)[1] == "OrfRep") {
colnames(X)[1] <- "ORF"
}
#Terms is the GO term list
Terms <- read.delim(file = sgd_terms_file, header = FALSE, quote = "",
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition"))
#all ORFs associated with GO term
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
#Gene_Association is the gene association to GO term file
Gene_Association <- read.delim(sgd_features_file, skip = 8, header = FALSE, quote = "",
col.names = c("Database", "Database_Object_ID", "Database_Object_Symbol", "NOT", "GO_ID",
"Database_Reference", "Evidence", "With_or_From", "Aspect", "Database_Object_Name",
"Database_Object_Synonym", "Database_Object_Type", "taxon", "Date", "Assigned_By", "OtherInfo", "Empty"
)
)
#Get the ORF names associated with each gene/GO term
Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym), "\\|", 2)[, 1]
#Get the numeric GO ID for matching
Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID), "\\:", 2)[, 2])
#Get all unique GO terms
GO_Terms <- unique(Gene_Association$GO_ID)
#Create a character vector with just the ColNames of the input file to store the scores for each GO term
Col_Names_X <- colnames(X)
#Create a data_frame with header from input_file
GO_Term_Averages <- X[0, ]
#Fill table with NAs same length as number of GO terms
GO_Term_Averages[1:length(GO_Terms), ] <- NA
#Change the first and second col names to GO_ID and Term
colnames(GO_Term_Averages)[1] <- "GO_ID"
colnames(GO_Term_Averages)[2] <- "Term"
# Create new columns
GO_Term_Averages$Ontology <- NA
GO_Term_Averages$NumGenes <- NA
GO_Term_Averages$AllPossibleGenes <- NA
GO_Term_Averages$Genes <- NA
GO_Term_Averages$ORFs <- NA
# Create a data.frame for the standard deviation info
GO_Term_SD <- X[0, ]
GO_Term_SD[1:length(GO_Terms), ] <- NA
colnames(GO_Term_SD)[1] <- "GO_ID"
colnames(GO_Term_SD)[2] <- "Term"
# Loop for each GO term to get an average L and K Z score
for (i in 1:length(GO_Terms)) {
# Get the GO_Term
ID <- GO_Terms[i]
# Get data.frame for all genes associated to the GO Term
ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID, ]
# Get a vector of just the gene names
ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]])
if (length(unique(ID_AllGenes_vector)) > 4000) {
next()
}
# Get the GO term character description where numeric Terms ID matches GO_Term's ID
GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric, ]$GO_Term[1])
# Get the Z scores for all genes in the GO_ID
Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector, ]
# Get the Gene names and ORFs for the term
GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene), collapse = " | ")
GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF), collapse = " | ")
# Dataframe to report the averages for a GO term
# Get the GO ID
GO_Term_Averages$GO_ID[i] <- as.character(ID)
# Get the term name
GO_Term_Averages$Term[i] <- GO_Description_Term
# Get total number of genes annotated to the Term that we have in our library
GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF))
# Get total number of genes annotated to the Term in SGD
GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector))
# Get the ontology of the term
GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1])
# Calculate the average score for every column
for (j in 3:length(X[1, ])) {
GO_Term_Averages[i, j] <- mean(Zscores_For_ID[, j], na.rm = TRUE)
# GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
}
# Also calculate same values for the SD
GO_Term_SD$GO_ID[i] <- as.character(ID)
# Get the term name
GO_Term_SD$Term[i] <- GO_Description_Term
# Calculate column scores for SD
for (j in 3:length(X[1, ])) {
GO_Term_SD[i, j] <- sd(Zscores_For_ID[, j], na.rm = TRUE)
# GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
}
}
# Add either _Avg or _SD depending on if the calculated score is an average or SD
colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages), "Avg", sep = "_")
colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD), "SD", sep = "_")
# Combine the averages with the SDs to make one big data.frame
X2 <- cbind(GO_Term_Averages, GO_Term_SD)
# Test[ , order(names(test))]
X2 <- X2[, order(names(X2))]
X2 <- X2[!is.na(X2$Z_lm_L_Avg), ]
# Create output file
write.csv(X2, file.path(output_dir, expName[m], "Average_GOTerms_All.csv"), row.names = FALSE)
# Remove NAs
X3 <- X2[!is.na(X2$Z_lm_L_Avg), ]
# Identify redundant GO terms
for (i in 1:length(X3[, 1])) {
# Loop through each GO term - get term
GO_term_ID <- as.character(X3$GO_ID_Avg[i])
# Get term in the X3
X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID, ]
# Get anywhere that has the same number K_Avg value
X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp, ]
if (length(X3_Temp2[, 1]) > 1) {
if (length(unique(X3_Temp2$Genes_Avg)) == 1) {
X3_Temp2 <- X3_Temp2[1, ]
}
}
if (i == 1) {
Y <- X3_Temp2
}
if (i > 1) {
Y <- rbind(Y, X3_Temp2)
}
}
Y1 <- unique(Y)
write.csv(Y1, file.path(output_dir, exp_name, "Average_GOTerms_All_NonRedundantTerms.csv"), row.names = FALSE)
Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2, ]
Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg), ]
write.csv(Y2, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L.csv"), row.names = FALSE)
Y3 <- Y2[Y2$NumGenes_Avg > 2, ]
write.csv(Y3, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv"), row.names = FALSE)
Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2, ]
Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg), ]
write.csv(Y4, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K.csv"), row.names = FALSE)
Y5 <- Y4[Y4$NumGenes_Avg > 2, ]
write.csv(Y5, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv"), row.names = FALSE)
}

View File

@@ -0,0 +1,375 @@
library("dplyr")
# Function to parse and set arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
2,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
list(
out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
sd = as.numeric(args[2]),
study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
input_dirs = args[4:length(args)]
)
}
args <- parse_arguments()
# Create an array for the zscores files
get_zscores_files <- function(dirs) {
files <- sapply(dirs, function(exp) {
file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
if (file.exists(file_path)) file_path else NULL
}, simplify = TRUE, USE.NAMES = FALSE)
# Filter out NULL entries
files[!sapply(files, is.null)]
}
zscores_files <- get_zscores_files(args$input_dirs)
sprintf("The SD value is: %d", args$sd)
# Ensure there are enough files to compare
if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
# Function to join zscores files
join_zscores_files <- function(files) {
joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
for (file in files[-1]) {
temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
joined_data <- join(joined_data, temp_data, by = "OrfRep")
}
joined_data
}
# Load and join zscores files
joined_data <- join_zscores_files(zscores_files)
# Order and select columns
order_and_select_columns <- function(data) {
ordered_data <- data[, order(colnames(data))]
selected_headers <- select(ordered_data,
contains("OrfRep"), matches("Gene"),
contains("z_lm_k"), contains("z_shift_k"),
contains("z_lm_l"), contains("z_shift_l"))
selected_headers
}
selected_headers <- order_and_select_columns(joined_data)
# Remove redundant columns like "Gene.1"
clean_headers <- function(data, suffixes) {
suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
select(data, -all_of(suffixes_to_remove))
}
headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
# Fill NA values in Shift and Z_lm columns
fill_na_in_columns <- function(data) {
for (header in colnames(data)) {
if (grepl("Shift", header)) {
data[[header]][is.na(data[[header]])] <- 0.001
} else if (grepl("Z_lm_", header)) {
data[[header]][is.na(data[[header]])] <- 0.0001
}
}
data
}
headSel <- fill_na_in_columns(headSel)
# Filter based on standard deviation
filter_by_sd <- function(data, sd) {
if (sd == 0) return(data)
z_lm_cols <- select(data, contains("z_lm_"))
filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
data[filter_vector, ]
}
REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)
# Reorder columns to interleave Z_lm and Shift data
reorder_columns <- function(data1, data2) {
combined_data <- data1
for (i in 3:ncol(data1)) {
combined_data <- cbind(combined_data, data2[i], data1[i])
}
combined_data
}
combI <- reorder_columns(headSel2, shiftOnly)
# Write output files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# Relabel headers using experiment names from StudyInfo.csv
relabel_headers <- function(headers, labels) {
new_labels <- headers
for (i in seq_along(headers)) {
suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
if (suffix %in% 1:3) {
exp_name <- labels[as.numeric(suffix), 2]
new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
}
}
new_labels
}
LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)
# Save relabeled files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# Save updated parameters
LabelStd[, 4] <- args$sd
write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
write.csv(LabelStd, file = args$study_info, row.names = FALSE)
# library("plyr")
# library("sos")
# library("dplyr")
# # Function to parse and set arguments
# parse_arguments <- function() {
# if (interactive()) {
# args <- c(
# "./", # Default out_dir
# "2", # Default SD value
# "../Code/StudyInfo.csv" # Default study_info path
# )
# } else {
# args <- commandArgs(trailingOnly = TRUE)
# }
# list(
# out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
# sd = as.numeric(args[2]),
# study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
# input_dirs = args[4:length(args)]
# )
# }
# args <- parse_arguments()
# # Create an array for the zscores files
# zscores_files <- sapply(args$input_dirs, function(study) {
# file_path <- file.path(study, "zscores", "zscores_interaction.csv")
# if (file.exists(file_path)) file_path else NULL
# }, simplify = TRUE, USE.NAMES = FALSE)
# # Filter out NULL entries
# zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
# sprintf("The SD value is: %d", args$sd)
# # print(args$input_dirs)
# # TODO this is better handled in a loop in case you want to compare more experiments?
# # The input is already designed for this
# # Read in the files for your experiment and
# # Join the two files at a time as a function of how many inputFile
# # list the larger file first ? in this example X2 has the larger number of genes
# # If X1 has a larger number of genes, switch the order of X1 and X2
# if (length(zscores_files) < 2) {
# print("Note enough Exps to compare, skipping join")
# stop("Exiting script")
# }
# if (length(zscores_files) >= 2) {
# X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
# X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
# X <- join(X1, X2, by = "OrfRep")
# OBH <- X[, order(colnames(X))] # OrderByHeader
# headers <- select(OBH, contains("OrfRep"), matches("Gene"),
# contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
# headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
# headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
# }
# if (length(zscores_files) >= 3) {
# X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
# X <- join(X, X3, by = "OrfRep")
# headSel <- select(headers, -"Gene.1", -"Gene.2")
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
# }
# if (length(zscores_files) >= 4) {
# X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
# X <- join(X, X4, by = "OrfRep")
# headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
# }
# # print(headers)
# # headSel$contains("Z_Shift") %>% replace_na(0.001)
# headers <- colnames(headSel)
# # print(headers)
# i <- 0
# for (i in 1:length(headers)) {
# if (grepl("Shift", headers[i])) {
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
# }
# if (grepl("Z_lm_", headers[i])) {
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
# }
# }
# # 2SD option code to exclude Z_lm values less than 2 standard Deviations
# REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
# shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
# # Using Text search grepl to relabel headers
# REMcRdyHdr <- colnames(REMcRdy)
# REMcRdyLabels <- "asdf"
# shftHdr <- colnames(shiftOnly)
# shiftLabels <- "asdf"
# shiftLabels[1:2] <- shftHdr[1:2]
# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
# for (i in 3:(length(shftHdr))) {
# if (i == 3) {
# shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
# REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
# }
# if (i == 5) {
# shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
# REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
# }
# if (i == 7) {
# shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
# REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
# }
# if (grepl(".1", shftHdr[i], fixed = true)) {
# shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
# }
# if (grepl(".2", shftHdr[i], fixed = true)) {
# shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
# }
# if (grepl(".3", shftHdr[i], fixed = true)) {
# shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
# }
# }
# for (i in 3:(length(REMcRdyLabels))) {
# j <- as.integer(i)
# REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
# shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
# }
# colnames(shiftOnly) <- shiftLabels
# colnames(REMcRdy) <- REMcRdyLabels
# combI <- headSel2 # starting Template orf, Genename columns
# # headersRemc<-colnames(REMcRdy)
# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
# for (i in 3:length(colnames(REMcRdy))) {
# combI <- cbind.data.frame(combI, shiftOnly[i])
# combI <- cbind.data.frame(combI, REMcRdy[i])
# }
# Vec1 <- NA
# Vec2 <- NA
# Vec3 <- NA
# Vec4 <- NA
# Vec5 <- NA
# Vec6 <- NA
# Vec7 <- NA
# Vec8 <- NA
# if (length(REMcRdy) == 6) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
# }
# if (length(REMcRdy) == 8) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# Vec5 <- abs(REMcRdy[, 7]) >= sd
# Vec6 <- abs(REMcRdy[, 8]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
# }
# if (length(REMcRdy) == 10) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# Vec5 <- abs(REMcRdy[, 7]) >= sd
# Vec6 <- abs(REMcRdy[, 8]) >= sd
# Vec7 <- abs(REMcRdy[, 9]) >= sd
# Vec8 <- abs(REMcRdy[, 10]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
# }
# if (sd != 0) {
# REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
# shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
# }
# if (sd == 0) {
# REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
# shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
# }
# # R places hidden "" around the header names. The following
# # is intended to remove those quote so that the "" do not blow up the Java REMc.
# # Use ,quote=F in the write.csv statement to fix R output file.
# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
# # print(sd)
# LabelStd[, 4] <- as.numeric(sd)
# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
# write.csv(LabelStd, file = study_info, row.names = FALSE)

View File

@@ -0,0 +1,113 @@
suppressMessages({
library("dplyr")
library("data.table")
library("readr")
library("stringr")
})
# Function to parse arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
3, # sd value
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
out_dir <- normalizePath(args[1], mustWork = FALSE)
sd <- as.numeric(args[2])
paths <- normalizePath(args[seq(3, length(args), by = 2)], mustWork = FALSE)
names <- args[seq(4, length(args), by = 2)]
experiments <- setNames(paths, names)
list(
out_dir = out_dir,
sd = sd,
experiments = experiments
)
}
args <- parse_arguments()
# Ensure main output directory exists
dir.create(args$out_dir, showWarnings = FALSE, recursive = TRUE)
# Function to read and combine z-score interaction files
combine_zscores <- function(experiments, out_dir) {
combined_data <- lapply(names(experiments), function(exp_name) {
exp_dir <- experiments[[exp_name]]
zscore_file <- file.path(exp_dir, "zscores", "zscores_interaction.csv")
if (!file.exists(zscore_file)) {
stop("Z-score file does not exist for ", exp_name, " at ", zscore_file)
}
message("Reading z-score file for ", exp_name, " from ", zscore_file)
data <- fread(zscore_file)
data$Experiment <- exp_name
return(data)
}) %>%
bind_rows()
combined_output_file <- file.path(out_dir, "combined_zscores.csv")
fwrite(combined_data, combined_output_file, row.names = FALSE)
message("Combined z-score file saved to: ", combined_output_file)
}
# Function to read and combine summary statistics files
combine_summary_stats <- function(experiments, out_dir) {
combined_stats <- lapply(names(experiments), function(exp_name) {
exp_dir <- experiments[[exp_name]]
summary_file <- file.path(exp_dir, "zscores", "summary_stats_all_strains.csv")
if (!file.exists(summary_file)) {
stop("Summary stats file does not exist for ", exp_name, " at ", summary_file)
}
message("Reading summary stats file for ", exp_name, " from ", summary_file)
data <- fread(summary_file)
data$Experiment <- exp_name
return(data)
}) %>%
bind_rows()
combined_output_file <- file.path(out_dir, "combined_summary_stats.csv")
fwrite(combined_stats, combined_output_file, row.names = FALSE)
message("Combined summary stats file saved to: ", combined_output_file)
}
# Function to generate final summary report
generate_final_report <- function(out_dir) {
combined_zscores <- file.path(out_dir, "combined_zscores.csv")
combined_stats <- file.path(out_dir, "combined_summary_stats.csv")
if (!file.exists(combined_zscores) || !file.exists(combined_stats)) {
stop("Combined z-scores or summary stats files do not exist.")
}
zscores_data <- fread(combined_zscores)
stats_data <- fread(combined_stats)
message("Merging z-score and summary stats data...")
final_report <- merge(zscores_data, stats_data, by = c("OrfRep", "Experiment"), all = TRUE, allow.cartesian = TRUE)
final_report_file <- file.path(out_dir, "final_combined_report.csv")
fwrite(final_report, final_report_file, row.names = FALSE)
message("Final combined report saved to: ", final_report_file)
}
# Process all experiments and generate outputs
combine_zscores(args$experiments, args$out_dir)
combine_summary_stats(args$experiments, args$out_dir)
generate_final_report(args$out_dir)