Squashed initial commit

This commit is contained in:
2024-09-10 13:47:29 -04:00
commit 8ebb6ad265
6221 changed files with 2512206 additions and 0 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env Rscript
# Load the openxlsx package
library(openxlsx)
args <- commandArgs(TRUE)
outDir <- args[1]
# Function to combine CSV and TXT files into a workbook with named sheets
combineFilesToWorkbook <- function(file_list, output_file) {
# Create a new workbook
wb <- createWorkbook()
# Loop through the file list
for (i in seq_along(file_list)) {
file <- file_list[i]
# Extract the sheet name from the file name based on the extension
ext <- tools::file_ext(file)
sheet_name <- tools::file_path_sans_ext(basename(file))
# Read the data based on the file extension
if (ext %in% c("csv", "txt")) {
if (ext == "csv") {
data <- read.csv(file)
} else if (ext == "txt") {
data <- readLines(file)
}
# Create a new sheet in the workbook with the extracted sheet name
addWorksheet(wb, sheetName = sheet_name)
# Write the data to the sheet
writeData(wb, sheet = sheet_name, x = data)
}
}
# Save the workbook as an Excel file
saveWorkbook(wb, output_file)
}
Component <- file.path(outDir, "Component", "ComponentResults.txt")
Function <- file.path(outDir, "Function", "FunctionResults.txt")
Process <- file.path(outDir, "Process", "ProcessResults.txt")
# Specify the list of input files (both CSV and TXT)
file_list <- c(Component, Process, Function)
# Specify the output file name
output_file <- file.path(outDir, "GTFCombined.xlsx")
# Call the function to combine the files into a workbook with named sheets
combineFilesToWorkbook(file_list, output_file)

View File

@@ -0,0 +1,895 @@
# This R script performs GTA L and K Pairwise Compares for user specified pairs of Experiments
library("ggplot2")
library("plotly")
library("htmlwidgets")
library("extrafont")
library("grid")
library("ggthemes")
args <- commandArgs(TRUE)
exp1_name <- args[1]
exp1_file <- args[2]
exp2_name <- args[3]
exp2_file <- args[4]
output_dir <- args[5]
pairDirL <- file.path(output_dir, paste0("PairwiseCompareL_", exp1_name, "-", exp2_name))
pairDirK <- file.path(output_dir, paste0("PairwiseCompareK_", exp1_name, "-", exp2_name))
# Pairwise L
# outputPlotly <- "../GTAresults/PairwiseCompareL/" #"/GTAresults/PairwiseCompareL/"
# Theme elements for plots
theme_Publication <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(
values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...)
}
scale_colour_Publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(
values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...)
}
X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE)
X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE)
X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2"))
gg <- ggplot(data = X, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Ontology_Avg_X1,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(
aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2),
color = "grey20",
size = 0.25,
alpha = 0.1,
inherit.aes = FALSE,
fill = NA
) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html"))
htmlwidgets::saveWidget(pgg, fname)
# ID aggravators and alleviators, regardless of whether they meet 2SD threshold
X1_Specific_Aggravators <- X[which(X$Z_lm_L_Avg_X1 >= 2 & X$Z_lm_L_Avg_X2 < 2), ]
X1_Specific_Alleviators <- X[which(X$Z_lm_L_Avg_X1 <= -2 & X$Z_lm_L_Avg_X2 > -2), ]
X2_Specific_Aggravators <- X[which(X$Z_lm_L_Avg_X2 >= 2 & X$Z_lm_L_Avg_X1 < 2), ]
X2_Specific_Alleviators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm_L_Avg_X1 > -2), ]
Overlap_Aggravators <- X[which(X$Z_lm_L_Avg_X1 >= 2 & X$Z_lm_L_Avg_X2 >= 2), ]
Overlap_Alleviators <- X[which(X$Z_lm_L_Avg_X1 <= -2 & X$Z_lm_L_Avg_X2 <= -2), ]
X2_Specific_Aggravators_X1_Alleviatiors <- X[which(X$Z_lm_L_Avg_X2 >= 2 & X$Z_lm_L_Avg_X1 <= -2), ]
X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm_L_Avg_X1 >= 2), ]
X$Overlap_Avg <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
gg <- ggplot(data = X, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(
aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2),
color = "grey20",
size = 0.25,
alpha = 0.1,
inherit.aes = FALSE,
fill = NA
) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ]
#3
gg <- ggplot(data = x_rem2_gene, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
htmlwidgets::saveWidget(pgg, fname)
#4
X_overlap_nothresold <- X[!(is.na(X$Overlap_Avg)), ]
gg <- ggplot(data = X_overlap_nothresold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
# Only output GTA terms where average score is still above 2 after subtracting the SD
# Z1 will ID aggravators, Z2 alleviators
Z1 <- X
Z1$L_Subtract_SD_X1 <- Z1$Z_lm_L_Avg_X1 - Z1$Z_lm_L_SD_X1
Z1$L_Subtract_SD_X2 <- Z1$Z_lm_L_Avg_X2 - Z1$Z_lm_L_SD_X2
Z2 <- X
Z2$L_Subtract_SD_X1 <- Z1$Z_lm_L_Avg_X1 + Z1$Z_lm_L_SD_X1
Z2$L_Subtract_SD_X2 <- Z1$Z_lm_L_Avg_X2 + Z1$Z_lm_L_SD_X2
X1_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 < 2), ]
X1_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X1 <= -2 & Z2$L_Subtract_SD_X2 > -2), ]
X2_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z1$L_Subtract_SD_X1 < 2), ]
X2_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 > -2), ]
Overlap_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 >= 2), ]
Overlap_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ]
X$Overlap <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
#5
X_abovethreshold <- X[!(is.na(X$Overlap)), ]
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html"))
htmlwidgets::saveWidget(pgg, fname)
#6
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")),
width = 20,
height = 20
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html"))
htmlwidgets::saveWidget(pgg, fname)
X_abovethreshold$X1_Rank <- NA
X_abovethreshold$X1_Rank <- rank(-X_abovethreshold$Z_lm_L_Avg_X1, ties.method = "random")
X_abovethreshold$X2_Rank <- NA
X_abovethreshold$X2_Rank <- rank(-X_abovethreshold$Z_lm_L_Avg_X2, ties.method = "random")
#7
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html"))
htmlwidgets::saveWidget(pgg, fname)
#8
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_L_Avg_X1,
y = Z_lm_L_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_L_SD_X1,
SD_2 = Z_lm_L_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html"))
htmlwidgets::saveWidget(pgg, fname)
write.csv(
x = X,
file.path(pairDirL, paste0("All_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)
write.csv(
x = X_abovethreshold,
file = file.path(pairDirL, paste0("AboveThreshold_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)
# Begin Pairwsise K
# Theme elements for plots
theme_Publication <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
library("scales")
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold",
size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE)
X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE)
#1
X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2"))
gg <- ggplot(data = X, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Ontology_Avg_X1,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html"))
htmlwidgets::saveWidget(pgg, fname)
#2
# ID aggravators and alleviators, regardless of whether they meet 2SD threshold
X1_Specific_Aggravators <- X[which(X$Z_lm_K_Avg_X1 >= 2 & X$Z_lm_K_Avg_X2 < 2), ]
X1_Specific_Alleviators <- X[which(X$Z_lm_K_Avg_X1 <= -2 & X$Z_lm_K_Avg_X2 > -2), ]
X2_Specific_Aggravators <- X[which(X$Z_lm_K_Avg_X2 >= 2 & X$Z_lm_K_Avg_X1 < 2), ]
X2_Specific_Alleviators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm_K_Avg_X1 > -2), ]
Overlap_Aggravators <- X[which(X$Z_lm_K_Avg_X1 >= 2 & X$Z_lm_K_Avg_X2 >= 2), ]
Overlap_Alleviators <- X[which(X$Z_lm_K_Avg_X1 <= -2 & X$Z_lm_K_Avg_X2 <= -2), ]
X2_Specific_Aggravators_X1_Alleviatiors <- X[which(X$Z_lm_K_Avg_X2 >= 2 & X$Z_lm_K_Avg_X1 <= -2), ]
X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm_K_Avg_X1 >= 2), ]
X$Overlap_Avg <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <-
"Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <-
"Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
gg <- ggplot(data = X, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#2
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
#3
x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ]
plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
gg <- ggplot(data = x_rem2_gene, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
htmlwidgets::saveWidget(pgg, fname)
#4
X_overlap_nothresold <- X[!(is.na(X$Overlap_Avg)), ]
gg <- ggplot(data = X_overlap_nothresold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap_Avg,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html"))
htmlwidgets::saveWidget(pgg, fname)
#5
# Only output GTA terms where average score is still above 2 after subtracting the SD
# Z1 will ID aggravators, Z2 alleviators
Z1 <- X
Z1$L_Subtract_SD_X1 <- Z1$Z_lm_K_Avg_X1 - Z1$Z_lm_K_SD_X1
Z1$L_Subtract_SD_X2 <- Z1$Z_lm_K_Avg_X2 - Z1$Z_lm_K_SD_X2
Z2 <- X
Z2$L_Subtract_SD_X1 <- Z1$Z_lm_K_Avg_X1 + Z1$Z_lm_K_SD_X1
Z2$L_Subtract_SD_X2 <- Z1$Z_lm_K_Avg_X2 + Z1$Z_lm_K_SD_X2
X1_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 < 2), ]
X1_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X1 <= -2 & Z2$L_Subtract_SD_X2 > -2), ]
X2_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z1$L_Subtract_SD_X1 < 2), ]
X2_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 > -2), ]
Overlap_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 >= 2), ]
Overlap_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z2$L_Subtract_SD_X1 <= -2), ]
X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ]
X$Overlap <- NA
try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <-
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <-
"Overlapping_Deletion_Suppressors")
try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <-
"Overlapping_Deletion_Enhancers")
try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <-
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
X_abovethreshold <- X[!(is.na(X$Overlap)), ]
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")),
width = 12,
height = 8
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html"))
htmlwidgets::saveWidget(pgg, fname)
#6
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")),
width = 20,
height = 20
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html"))
htmlwidgets::saveWidget(pgg, fname)
#7
X_abovethreshold$X1_Rank <- NA
X_abovethreshold$X1_Rank <- rank(-X_abovethreshold$Z_lm_K_Avg_X1, ties.method = "random")
X_abovethreshold$X2_Rank <- NA
X_abovethreshold$X2_Rank <- rank(-X_abovethreshold$Z_lm_K_Avg_X2, ties.method = "random")
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html"))
htmlwidgets::saveWidget(pgg, fname)
#8
gg <- ggplot(data = X_abovethreshold, aes(
x = Z_lm_K_Avg_X1,
y = Z_lm_K_Avg_X2,
color = Overlap,
Term = Term_Avg,
Genes = Genes_Avg_X1,
NumGenes = NumGenes_Avg_X1,
AllPossibleGenes = AllPossibleGenes_Avg_X1,
SD_1 = Z_lm_K_SD_X1,
SD_2 = Z_lm_K_SD_X2
)) +
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3, size = 3) +
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
theme_Publication_legend_right()
pdf(
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")),
width = 15,
height = 15
)
gg
dev.off()
pgg <- ggplotly(gg)
#pgg
fname <-
file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html"))
htmlwidgets::saveWidget(pgg, fname)
write.csv(
x = X,
file = file.path(pairDirK, paste0("All_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)
write.csv(
x = X_abovethreshold,
file = file.path(pairDirK, paste0("AboveThreshold_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
row.names = FALSE
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,810 @@
#!/usr/bin/env Rscript
# Makes heat maps of multiple experiments
#
# Updated 240724 Bryan C Roessler to improve file operations and portability
# I tried to leave as much logic intact as possible, just feeding in vars in a better way
# NOTE: The script now has 7 required arguments and a variable number of input experiments
# @arg $1 string StudyInfo.csv file
# @arg $2 string gene_ontology_edit.obo file
# @arg $3 string go_terms.tab file
# @arg $4 string All_SGD_GOTerms_for_QHTCPtk.csv
# @arg $5 string base directory
# @arg $6 string output directory
library("ontologyIndex")
library("ggplot2")
library("RColorBrewer")
library("grid")
library("ggthemes")
# library("plotly")
# library("htmlwidgets")
library("extrafont")
library("stringr")
library("org.Sc.sgd.db")
library("ggrepel")
library("gplots")
# Load arguments
args <- commandArgs(TRUE)
study_info_file <- args[1]
ontology_file <- args[2]
sgd_terms_tfile <- args[3]
all_sgd_terms_csv <- args[4]
base_dir <- args[5]
output_dir <- args[6]
study_nums <- args[7:length(args)]
# Import standard tables used in Sean's code That should be copied to each ExpStudy
labels <- read.csv(file = study_info_file, stringsAsFactors = FALSE)
Ontology <- get_ontology(file = ontology_file, propagate_relationships = "is_a", extract_tags = "minimal")
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS) # all ORFs associated with GO term
Terms <- read.delim(file = sgd_terms_tfile,
header = FALSE,
quote = "",
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")
)
XX3 <- read.csv(file = all_sgd_terms_csv, stringsAsFactors = FALSE, header = TRUE)
XX3[, 1] <- paste0("GO:", formatC(XX3[, 1], width = 7, flag = "0"))
XX3[, 2] <- gsub(pattern = " ", replacement = "_", x = XX3[, 2])
XX3[, 2] <- gsub(pattern = "/", replacement = "_", x = XX3[, 2])
# Load input files
for (study_num in study_nums) {
input_file <- file.path(base_dir, paste("Exp", study_num), zscores, "zscores_interaction.csv")
if (file.exists(input_file)) {
assign(paste(X, study_num), read.csv(file = input_file, stringsAsFactors = FALSE, header = TRUE))
assign(paste(Name, study_num), labels[study_num, 2])
}
}
for (study_num in study_nums) {
eval(paste("function", study_num))
}
if (length(study_nums) > 0) {
X1$ORF <- X1$OrfRep
X1$ORF <- gsub("_1", "", x = X1$ORF)
X1$ORF <- gsub("_2", "", x = X1$ORF)
X1$ORF <- gsub("_3", "", x = X1$ORF)
X1$ORF <- gsub("_4", "", x = X1$ORF)
X1$Score_L <- "No Effect"
try(X1[is.na(X1$Z_lm_L), ]$Score_L <- "No Growth")
try(X1[!is.na(X1$Z_lm_L) & X1$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X1[!is.na(X1$Z_lm_L) & X1$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X1$Score_K <- "No Effect"
try(X1[is.na(X1$Z_lm_K), ]$Score_K <- "No Growth")
try(X1[!is.na(X1$Z_lm_K) & X1$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X1[!is.na(X1$Z_lm_K) & X1$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X1 for K and L
X1[is.na(X1$Z_lm_L), ]$Z_lm_L <- 0.001
X1[is.na(X1$Z_lm_K), ]$Z_lm_K <- 0.001
X1$Rank_L <- rank(X1$Z_lm_L)
X1$Rank_K <- rank(X1$Z_lm_K)
X1 <- X1[order(X1$OrfRep, decreasing = FALSE), ]
colnames(X1) <- paste0(colnames(X1), "_X1")
}
if (length(study_nums) > 1) {
X2$ORF <- X2$OrfRep
X2$ORF <- gsub("_1", "", x = X2$ORF)
X2$ORF <- gsub("_2", "", x = X2$ORF)
X2$ORF <- gsub("_3", "", x = X2$ORF)
X2$ORF <- gsub("_4", "", x = X2$ORF)
X2$Score_L <- "No Effect"
try(X2[is.na(X2$Z_lm_L), ]$Score_L <- "No Growth")
try(X2[!is.na(X2$Z_lm_L) & X2$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X2[!is.na(X2$Z_lm_L) & X2$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X2$Score_K <- "No Effect"
try(X2[is.na(X2$Z_lm_K), ]$Score_K <- "No Growth")
try(X2[!is.na(X2$Z_lm_K) & X2$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X2[!is.na(X2$Z_lm_K) & X2$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
#express the na data as 0.001 in X2
X2[is.na(X2$Z_lm_L), ]$Z_lm_L <- 0.001
X2[is.na(X2$Z_lm_K), ]$Z_lm_K <- 0.001
X2$Rank_L <- rank(X2$Z_lm_L)
X2$Rank_K <- rank(X2$Z_lm_K)
X2 <- X2[order(X2$OrfRep, decreasing = FALSE), ]
colnames(X2) <- paste0(colnames(X2), "_X2")
X <- cbind(X1, X2)
}
if (length(study_nums) > 2) {
X3$ORF <- X3$OrfRep
X3$ORF <- gsub("_1", "", x = X3$ORF)
X3$ORF <- gsub("_2", "", x = X3$ORF)
X3$ORF <- gsub("_3", "", x = X3$ORF)
X3$ORF <- gsub("_4", "", x = X3$ORF)
X3$Score_L <- "No Effect"
try(X3[is.na(X3$Z_lm_L), ]$Score_L <- "No Growth")
try(X3[!is.na(X3$Z_lm_L) & X3$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X3[!is.na(X3$Z_lm_L) & X3$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X3$Score_K <- "No Effect"
try(X3[is.na(X3$Z_lm_K), ]$Score_K <- "No Growth")
try(X3[!is.na(X3$Z_lm_K) & X3$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X3[!is.na(X3$Z_lm_K) & X3$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X3
X3[is.na(X3$Z_lm_L), ]$Z_lm_L <- 0.001
X3[is.na(X3$Z_lm_K), ]$Z_lm_K <- 0.001
X3$Rank_L <- rank(X3$Z_lm_L)
X3$Rank_K <- rank(X3$Z_lm_K)
X3 <- X3[order(X3$OrfRep, decreasing = FALSE), ]
colnames(X3) <- paste0(colnames(X3), "_X3")
X <- cbind(X, X3)
}
if (length(study_nums) > 3) {
X4$ORF <- X4$OrfRep
X4$ORF <- gsub("_1", "", x = X4$ORF)
X4$ORF <- gsub("_2", "", x = X4$ORF)
X4$ORF <- gsub("_3", "", x = X4$ORF)
X4$ORF <- gsub("_4", "", x = X4$ORF)
X4$Score_L <- "No Effect"
try(X4[is.na(X4$Z_lm_L), ]$Score_L <- "No Growth")
try(X4[!is.na(X4$Z_lm_L) & X4$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X4[!is.na(X4$Z_lm_L) & X4$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X4$Score_K <- "No Effect"
try(X4[is.na(X4$Z_lm_K), ]$Score_K <- "No Growth")
try(X4[!is.na(X4$Z_lm_K) & X4$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X4[!is.na(X4$Z_lm_K) & X4$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X4
X4[is.na(X4$Z_lm_L), ]$Z_lm_L <- 0.001
X4[is.na(X4$Z_lm_K), ]$Z_lm_K <- 0.001
X4$Rank_L <- rank(X4$Z_lm_L)
X4$Rank_K <- rank(X4$Z_lm_K)
X4 <- X4[order(X4$OrfRep, decreasing = FALSE), ]
colnames(X4) <- paste0(colnames(X4), "_X4")
X <- cbind(X, X4)
}
if (length(study_nums) > 4) {
X5$ORF <- X5$OrfRep
X5$ORF <- gsub("_1", "", x = X5$ORF)
X5$ORF <- gsub("_2", "", x = X5$ORF)
X5$ORF <- gsub("_3", "", x = X5$ORF)
X5$ORF <- gsub("_4", "", x = X5$ORF)
X5$Score_L <- "No Effect"
try(X5[is.na(X5$Z_lm_L), ]$Score_L <- "No Growth")
try(X5[!is.na(X5$Z_lm_L) & X5$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
try(X5[!is.na(X5$Z_lm_L) & X5$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
X5$Score_K <- "No Effect"
try(X5[is.na(X5$Z_lm_K), ]$Score_K <- "No Growth")
try(X5[!is.na(X5$Z_lm_K) & X5$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
try(X5[!is.na(X5$Z_lm_K) & X5$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
# Express the na data as 0.001 in X5
X5[is.na(X5$Z_lm_L), ]$Z_lm_L <- 0.001
X5[is.na(X5$Z_lm_K), ]$Z_lm_K <- 0.001
X5$Rank_L <- rank(X5$Z_lm_L)
X5$Rank_K <- rank(X5$Z_lm_K)
X5 <- X5[order(X5$OrfRep, decreasing = FALSE), ]
colnames(X5) <- paste0(colnames(X5), "_X5")
X <- cbind(X, X5)
}
X$ORF <- X$OrfRep_X1
if (length(study_nums) > 1) {
X$ORF <- gsub("_1", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2"]
X_heatmap <- X_heatmap[, c(10, 1, 4, 5, 8, 9, 2, 3, 6, 7)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
if (length(study_nums) > 2) {
X$ORF <- gsub("_1", "", x = X$ORF)
X$ORF <- gsub("_2", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3"]
# Reorder columns
X_heatmap <- X_heatmap[, c(14, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
if (length(study_nums) > 3) {
X$ORF <- gsub("_1", "", x = X$ORF)
X$ORF <- gsub("_2", "", x = X$ORF)
X$ORF <- gsub("_3", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
try(X[X$Gene_X4 == "", ]$Gene_X4 <- X[X$Gene_X4 == "", ]$OrfRep_X4)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
colnames(X) == "Z_Shift_K_X4" | colnames(X) == "Z_lm_K_X4" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3" |
colnames(X) == "Z_Shift_L_X4" | colnames(X) == "Z_lm_L_X4"]
# Reorder columns
X_heatmap <- X_heatmap[, c(18, 1, 4, 5, 8, 9, 12, 13, 16, 17, 2, 3, 6, 7, 10, 11, 14, 15)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X4", replacement = Name4, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
if (length(study_nums) > 4) {
X$ORF <- gsub("_1", "", x = X$ORF)
X$ORF <- gsub("_2", "", x = X$ORF)
X$ORF <- gsub("_3", "", x = X$ORF)
X$ORF <- gsub("_4", "", x = X$ORF)
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
try(X[X$Gene_X4 == "", ]$Gene_X4 <- X[X$Gene_X4 == "", ]$OrfRep_X4)
try(X[X$Gene_X5 == "", ]$Gene_X5 <- X[X$Gene_X5 == "", ]$OrfRep_X5)
X_heatmap <-
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
colnames(X) == "Z_Shift_K_X4" | colnames(X) == "Z_lm_K_X4" |
colnames(X) == "Z_Shift_K_X5" | colnames(X) == "Z_lm_K_X5" |
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3" |
colnames(X) == "Z_Shift_L_X4" | colnames(X) == "Z_lm_L_X4" |
colnames(X) == "Z_Shift_L_X5" | colnames(X) == "Z_lm_L_X5"]
# Reorder columns
X_heatmap <- X_heatmap[, c(22, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19)]
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X4", replacement = Name4, colnames(X_heatmap))
colnames(X_heatmap) <- gsub(pattern = "X5", replacement = Name5, colnames(X_heatmap))
colnames(X_heatmap)[2] <- "Gene"
}
# Theme elements for plots
theme_Publication <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
library(scales)
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
(theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"),
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
)
}
scale_fill_Publication <- function(...) {
discrete_scale(
"fill",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
scale_colour_Publication <- function(...) {
discrete_scale(
"colour",
"Publication",
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
...
)
}
Ontology <- get_ontology(file = ontology_file, propagate_relationships = "is_a", extract_tags = "minimal")
print(Ontology)
# All ORFs associated with GO term
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
# Terms is the GO term list jwr moved up to TAABLES
Terms <- read.delim(file = sgd_terms_tfile,
header = FALSE,
quote = "",
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")
)
colormapbreaks <- c(-12, -10, -8, -6, -4, -2, 2, 4, 6, 8, 10, 12)
for (s in 1:dim(XX3)[1]) {
# Ontology <-
# get_ontology(file = "Documents/Hartman_Lab/SGD_Downloads/gene_ontology_edit.obo",
# propagate_relationships = "is_a", extract_tags = "minimal")
# Ontology_Everything <-
# get_ontology(file = "Documents/Hartman_Lab/SGD_Downloads/gene_ontology_edit.obo",
# propagate_relationships = "is_a", extract_tags = "everything")
#
# GO_ID_Arg <- "GO:0006325"
GO_ID_Arg_loop <- as.character(XX3[s, 1])
GOTerm_parent <- get_descendants(Ontology, roots = GO_ID_Arg_loop)
# GOTerm_parent <- get_descendants(Ontology,roots = "GO:0006325")
# Only make plots if parent term has fewer than 500 children
Parent_Size <- length(as.vector(GO2ALLORFs[GO_ID_Arg_loop][[1]]))
if (length(GOTerm_parent) > 100) {
next()
}
Parent_Size <- length(as.vector(GO2ALLORFs[GO_ID_Arg_loop][[1]])) # TODO why twice?
if (Parent_Size < 2) {
next()
}
if (Parent_Size > 2000) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 45,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.5, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 1000 && Parent_Size <= 2000) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 35,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 500 && Parent_Size <= 1000) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 30,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 200 && Parent_Size <= 500) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 25,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 100 && Parent_Size <= 200) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 20,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 60 && Parent_Size <= 100) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 15,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 30 && Parent_Size <= 60) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 10,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size >= 3 && Parent_Size <= 30) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 7,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
if (Parent_Size == 2) {
pdf(
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
width = 12,
height = 7,
onefile = TRUE
)
for (i in 1:length(GOTerm_parent)) {
GO_Term <- GOTerm_parent[i]
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
if (dim(Genes_Annotated_to_Term)[1] > 2) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
try(heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
ylab = "Gene",
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = GO_Term_Name,
# ColSideColors = ev_repeat,
labRow = as.character(Genes_Annotated_to_Term$Gene)
))
}
}
dev.off()
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env Rscript
# This script will add the shift data to the finalTable.csv file
#
# May want to reorder columns in excel before making heatmaps - otherwise all the shift data will be plotted next to each other.
library(plyr)
library(dplyr)
library(sos)
args <- commandArgs(TRUE)
if (length(args) >= 1) {
finalTable <- file.path(args[1])
} else {
finalTable <- "REMcRdy_lm_only.csv-finalTable.csv" # for legacy workflow
}
if (length(args) >= 2) {
shiftFile <- file.path(args[2])
} else {
shiftFile <- "Shift_only.csv" # for legacy workflow
}
if (length(args) >= 3) {
studyInfo <- file.path(args[3])
} else {
studyInfo <- "../Code/StudyInfo.csv" # for legacy workflow
}
if (length(args) >= 4) {
output <- file.path(args[4])
} else {
output <- "REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
}
# Read in the REMc finalTable data
X <- data.frame(read.csv(file = finalTable, header = TRUE, stringsAsFactors = FALSE))
# Read in the shift data From ../JoinInteractions
Y <- data.frame(read.csv(file = shiftFile, header = TRUE, stringsAsFactors = FALSE))
Labels <- read.delim(studyInfo, skip = 0, as.is = TRUE, row.names = 1, strip.white = TRUE)
# Determine the number of cols - needed to create the correct number of new cols
Xcolnum <- length(X[1, ])
ADDnum <- Xcolnum + length(Y[1, ]) - 2
# Create new columns filled with NAs to be filled with data
Xtemp <- X
Xtemp[, (Xcolnum + 1):ADDnum] <- NA
# Match the orf names in each row to a orf name in the shift data file and then add the shift data to the finalTable file
shiftTbl < - as.data.frame(matrix(nrow = 1, ncol = length(Y) - 2)) #the df shiftTbl must be initialized before for loop
for (i in 1:length(X[, 1])) {
Shiftrownum <- match(X[i, 2], Y[, 1])
shiftTbl[i, ] <- Y[Shiftrownum, 3:length(Y[1, ])]
Xtemp[i, (Xcolnum + 1):ADDnum] <- Y[Shiftrownum, 3:length(Y[1, ])]
}
headerX <- colnames(Xtemp)
headerY <- colnames(Y)
shfHdr <- headerY[3:length(headerY)]
combTbl <- X[, 1:3]
lmTbl <- select(Xtemp, contains("Z_lm")) #X[,(4:Xcolnum-2)]
shiftTbl <- select(Xtemp, contains("V"))
clustTbl <- select(Xtemp, contains("cluster."))
# Give the new column names the same names as in the shift file
Xcols <- colnames(X)
Ycols <- colnames(Y)[3:length(Y[1, ])]
newCols <- c(Xcols[1:Xcolnum], Ycols)
# Reorder columns for generating heatmaps
combI <- combTbl #Starting Template orf, Genename columns
headersRemc <- newCols #colnames(X)
newHeaders <- newCols[1:3]
lmHdr <- colnames(lmTbl) #newCols[4:(length(Xcols)-2)]
clstHdr <- colnames(clustTbl) #select(newCols, contains('cluster.')) #newCols[3+length(lmHdr):2]
intLvHdr <- vector()
#Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
for (i in 1:(length(shiftTbl[1, ]))) {
combI <- cbind.data.frame(combI, shiftTbl[i])
combI <- cbind.data.frame(combI, lmTbl[i])
intLvHdrx <- c(shfHdr[i], lmHdr[i])
intLvHdr <- c(intLvHdr, intLvHdrx)
}
combIHdr <- c(colnames(combTbl), intLvHdr, clstHdr)
combI <- cbind.data.frame(combI, clustTbl)
colnames(combI) <- combIHdr
write.csv(combI, file = output, row.names = FALSE)

View File

@@ -0,0 +1,945 @@
suppressMessages({
library(ggplot2)
library(plotly)
library(htmlwidgets)
library(dplyr)
library(ggthemes)
library(data.table)
library(unix)
})
options(warn = 2, max.print = 1000)
options(width = 10000)
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
soft_limit <- 30 * 1024 * 1024 * 1024
hard_limit <- 30 * 1024 * 1024 * 1024
rlimit_as(soft_limit, hard_limit)
# Constants for configuration
plot_width <- 14
plot_height <- 9
base_size <- 14
parse_arguments <- function() {
args <- if (interactive()) {
c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
3,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo",
3
)
} else {
commandArgs(trailingOnly = TRUE)
}
# Extract paths, names, and standard deviations
paths <- args[seq(4, length(args), by = 3)]
names <- args[seq(5, length(args), by = 3)]
sds <- as.numeric(args[seq(6, length(args), by = 3)])
# Normalize paths
normalized_paths <- normalizePath(paths, mustWork = FALSE)
# Create named list of experiments
experiments <- list()
for (i in seq_along(paths)) {
experiments[[names[i]]] <- list(
path = normalized_paths[i],
sd = sds[i]
)
}
list(
out_dir = normalizePath(args[1], mustWork = FALSE),
sgd_gene_list = normalizePath(args[2], mustWork = FALSE),
easy_results_file = normalizePath(args[3], mustWork = FALSE),
experiments = experiments
)
}
args <- parse_arguments()
# Should we keep output in exp dirs or combine in the study output dir?
# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
# Define themes and scales
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
theme_foundation <- ggplot2::theme_grey(base_size = base_size, base_family = base_family)
theme_foundation %+replace%
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.title.y = element_text(angle = 90, vjust = 2),
axis.title.x = element_text(vjust = -0.2),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = legend_position,
legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Load the initial dataframe from the easy_results_file
load_and_process_data <- function(easy_results_file, sd = 3) {
df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
df <- df %>%
filter(!(.[[1]] %in% c("", "Scan"))) %>%
filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>%
# Rename columns
rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>%
mutate(
across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric),
delta_bg = last_bg - first_bg,
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)),
NG = if_else(L == 0 & !is.na(L), 1, 0),
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
SM = 0,
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
conc_num_factor = as.numeric(as.factor(conc_num)) - 1
)
return(df)
}
# Update Gene names using the SGD gene list
update_gene_names <- function(df, sgd_gene_list) {
# Load SGD gene list
genes <- read.delim(file = sgd_gene_list,
quote = "", header = FALSE,
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11)))
# Create a named vector for mapping ORF to GeneName
gene_map <- setNames(genes$V5, genes$V4)
# Vectorized match to find the GeneName from gene_map
mapped_genes <- gene_map[df$ORF]
# Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found)
updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes)
# Ensure Gene is not left blank or incorrectly updated to "OCT1"
df <- df %>%
mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes))
return(df)
}
# Calculate summary statistics for all variables
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
df <- df %>%
mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .)))
summary_stats <- df %>%
group_by(across(all_of(group_vars))) %>%
summarise(
N = sum(!is.na(L)),
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
)
df_cleaned <- df %>%
select(-any_of(names(summary_stats)))
df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
}
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
# Calculate total concentration variables
total_conc_num <- length(unique(df$conc_num))
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
# Pull the background means and standard deviations from zero concentration
bg_means <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
)
bg_sd <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
)
interaction_scores <- df %>%
mutate(
WT_L = df$mean_L,
WT_K = df$mean_K,
WT_r = df$mean_r,
WT_AUC = df$mean_AUC,
WT_sd_L = df$sd_L,
WT_sd_K = df$sd_K,
WT_sd_r = df$sd_r,
WT_sd_AUC = df$sd_AUC
) %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
mutate(
N = sum(!is.na(L)),
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE),
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
) %>%
ungroup()
interaction_scores <- interaction_scores %>%
group_by(across(all_of(group_vars))) %>%
mutate(
Raw_Shift_L = mean_L[[1]] - bg_means$L,
Raw_Shift_K = mean_K[[1]] - bg_means$K,
Raw_Shift_r = mean_r[[1]] - bg_means$r,
Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC,
Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]],
Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]],
Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]],
Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]]
)
interaction_scores <- interaction_scores %>%
mutate(
Exp_L = WT_L + Raw_Shift_L,
Delta_L = mean_L - Exp_L,
Exp_K = WT_K + Raw_Shift_K,
Delta_K = mean_K - Exp_K,
Exp_r = WT_r + Raw_Shift_r,
Delta_r = mean_r - Exp_r,
Exp_AUC = WT_AUC + Raw_Shift_AUC,
Delta_AUC = mean_AUC - Exp_AUC
)
interaction_scores <- interaction_scores %>%
mutate(
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
)
# Calculate linear models and interaction scores
interaction_scores <- interaction_scores %>%
mutate(
lm_L = lm(Delta_L ~ conc_num_factor),
lm_K = lm(Delta_K ~ conc_num_factor),
lm_r = lm(Delta_r ~ conc_num_factor),
lm_AUC = lm(Delta_AUC ~ conc_num_factor),
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC
)
interaction_scores <- interaction_scores %>%
mutate(
Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE)
)
interaction_scores_all <- interaction_scores %>%
mutate(
Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1),
lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
r_squared_L = summary(lm_L)$r.squared,
r_squared_K = summary(lm_K)$r.squared,
r_squared_r = summary(lm_r)$r.squared,
r_squared_AUC = summary(lm_AUC)$r.squared
)
# Calculate Z_lm for each variable
interaction_scores_all <- interaction_scores_all %>%
mutate(
Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE),
Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE),
Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
)
# Arrange results by Z_lm_L and NG
interaction_scores_all <- interaction_scores_all %>%
arrange(desc(lm_Score_L)) %>%
arrange(desc(NG)) %>%
ungroup()
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
}
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
`%||%` <- function(a, b) if (!is.null(a)) a else b
# Generalized plot generation
plots <- lapply(plot_configs, function(config) {
df <- config$df
plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var))))
# Rank plots with SD annotations
if (config$plot_type == "rank") {
plot <- plot + geom_point(size = 0.1, shape = 3)
# Adding SD bands
if (!is.null(config$sd_band)) {
for (i in seq_len(config$sd_band)) {
plot <- plot +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) +
geom_hline(yintercept = c(-i, i), color = "gray")
}
}
# Optionally add enhancer/suppressor count annotations
if (!is.null(config$enhancer_label)) {
plot <- plot + annotate("text", x = config$enhancer_label$x,
y = config$enhancer_label$y, label = config$enhancer_label$label) +
annotate("text", x = config$suppressor_label$x,
y = config$suppressor_label$y, label = config$suppressor_label$label)
}
}
# Correlation plots
if (config$plot_type == "correlation") {
plot <- plot + geom_point(shape = 3, color = "gray70") +
geom_smooth(method = "lm", color = "tomato3") +
annotate("text", x = 0, y = 0, label = config$correlation_text)
}
# General scatter/boxplot/density handling
if (!is.null(config$y_var)) {
plot <- plot + aes(y = !!sym(config$y_var))
y_mean_col <- paste0("mean_", config$y_var)
y_sd_col <- paste0("sd_", config$y_var)
if (config$y_var == "delta_bg" && config$plot_type == "scatter") {
plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
} else if (config$error_bar %||% FALSE) {
plot <- plot +
geom_point(shape = 3, size = 0.2) +
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
}
}
# Plot type selection
plot <- switch(config$plot_type,
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = "identity"),
plot + geom_point() + geom_smooth(method = "lm", se = FALSE))
# Adding y-limits if provided
if (!is.null(config$ylim_vals)) {
plot <- plot + coord_cartesian(ylim = config$ylim_vals)
}
# Setting legend position, titles, and labels
legend_position <- config$legend_position %||% "bottom"
plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position)
if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label)
if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
# Adding text annotations if provided
if (!is.null(config$annotations)) {
for (annotation in config$annotations) {
plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
}
}
return(plot)
})
# If grid_layout is provided, arrange plots in a grid and save in a single PDF
if (!is.null(grid_layout)) {
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
# Loop through plots in chunks defined by ncol and nrow
for (start_idx in seq(1, length(plots), by = grid_layout$ncol * grid_layout$nrow)) {
end_idx <- min(start_idx + grid_layout$ncol * grid_layout$nrow - 1, length(plots))
plot_subset <- plots[start_idx:end_idx]
# Arrange plots in a grid
do.call(grid.arrange, c(plot_subset, ncol = grid_layout$ncol, nrow = grid_layout$nrow))
}
dev.off()
# Save as HTML (optional)
plotly_plots <- lapply(plots, function(plot) {
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
})
combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow, margin = 0.05)
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, "_grid.html")), selfcontained = TRUE)
} else {
# Save individual plots to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plots, print)
dev.off()
# Convert plots to plotly and save as HTML
plotly_plots <- lapply(plots, function(plot) {
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
})
combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05)
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
}
}
generate_interaction_plot_configs <- function(df, variables) {
plot_configs <- list()
for (variable in variables) {
# Define the y-limits based on the variable being plotted
ylim_vals <- switch(variable,
"L" = c(-65, 65),
"K" = c(-65, 65),
"r" = c(-0.65, 0.65),
"AUC" = c(-6500, 6500)
)
# Dynamically generate the column names for standard deviation and delta
wt_sd_col <- paste0("WT_sd_", variable)
delta_var <- paste0("Delta_", variable)
z_shift <- paste0("Z_Shift_", variable)
z_lm <- paste0("Z_lm_", variable)
# Set annotations for ZShift, Z lm Score, NG, DB, SM
annotations <- list(
list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45,
ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))),
list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25,
ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))),
list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25,
ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)),
list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35,
ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)),
list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45,
ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM))
)
# Add scatter plot configuration for this variable
plot_configs[[length(plot_configs) + 1]] <- list(
df = df,
x_var = "conc_num_factor",
y_var = delta_var,
plot_type = "scatter",
title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]),
ylim_vals = ylim_vals,
annotations = annotations,
error_bar = list(
ymin = 0 - (2 * df[[wt_sd_col]][1]),
ymax = 0 + (2 * df[[wt_sd_col]][1])
),
x_breaks = unique(df$conc_num_factor),
x_labels = unique(as.character(df$conc_num)),
x_label = unique(df$Drug[1])
)
# Add box plot configuration for this variable
plot_configs[[length(plot_configs) + 1]] <- list(
df = df,
x_var = "conc_num_factor",
y_var = variable,
plot_type = "box",
title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]),
ylim_vals = ylim_vals,
annotations = annotations,
error_bar = FALSE, # Boxplots typically don't need error bars
x_breaks = unique(df$conc_num_factor),
x_labels = unique(as.character(df$conc_num)),
x_label = unique(df$Drug[1])
)
}
return(plot_configs)
}
generate_rank_plot_configs <- function(df, rank_var, zscore_var, label_prefix) {
configs <- list()
for (sd_band in c(1, 2, 3)) {
# Annotated version
configs[[length(configs) + 1]] <- list(
df = df,
x_var = rank_var,
y_var = zscore_var,
plot_type = "rank",
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
sd_band = sd_band,
enhancer_label = list(
x = nrow(df) / 2, y = 10,
label = paste("Deletion Enhancers =", nrow(df[df[[zscore_var]] >= sd_band, ]))
),
suppressor_label = list(
x = nrow(df) / 2, y = -10,
label = paste("Deletion Suppressors =", nrow(df[df[[zscore_var]] <= -sd_band, ]))
)
)
# Non-annotated version
configs[[length(configs) + 1]] <- list(
df = df,
x_var = rank_var,
y_var = zscore_var,
plot_type = "rank",
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
sd_band = sd_band
)
}
return(configs)
}
generate_correlation_plot_configs <- function(df, lm_list, lm_summaries) {
lapply(seq_along(lm_list), function(i) {
r_squared <- round(lm_summaries[[i]]$r.squared, 3)
list(
x_var = names(lm_list)[i][1],
y_var = names(lm_list)[i][2],
plot_type = "scatter",
title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]),
annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared)))
)
})
}
# Adjust missing values and calculate ranks
adjust_missing_and_rank <- function(df, variables) {
# Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables
df <- df %>%
mutate(across(all_of(variables), list(
Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))),
Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))),
Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))),
Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column())))
), .names = "{fn}_{col}"))
return(df)
}
main <- function() {
lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]]
exp_path <- exp$path
exp_sd <- exp$sd
out_dir <- file.path(exp_path, "zscores")
out_dir_qc <- file.path(exp_path, "zscores", "qc")
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
# Load and process data
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
df <- update_gene_names(df, args$sgd_gene_list)
max_conc <- max(df$conc_num_factor)
# QC steps and filtering
df_above_tolerance <- df %>% filter(DB == 1)
# Calculate the half-medians for `L` and `K` for rows above tolerance
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
# Get the number of rows that are above tolerance
rows_to_remove <- nrow(df_above_tolerance)
# Set L, r, K, and AUC to NA for rows that are above tolerance
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
# Calculate summary statistics for all strains, including both background and the deletions
message("Calculating summary statistics for all strains")
variables <- c("L", "K", "r", "AUC")
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats <- ss$summary_stats
df_na_stats <- ss$df_with_stats
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
print("Summary stats:")
print(head(summary_stats), width = 200)
# Remove rows with 0 values in L
df_no_zeros <- df_na %>% filter(L > 0)
# Additional filtering for non-finite values
df_na_filtered <- df_na %>%
filter(if_any(c(L), ~ !is.finite(.))) %>%
{
if (nrow(.) > 0) {
message("Removing non-finite rows:\n")
print(head(., n = 10))
}
df_na %>% filter(if_all(c(L), is.finite))
}
# Filter data within and outside 2SD
message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na_stats %>%
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
df_na_outside_2sd_k <- df_na_stats %>%
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
# Summary statistics for within and outside 2SD of K
message("Calculating summary statistics for L within 2SD of K")
# TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_within_2sd_k_stats <- ss$summary_stats
df_na_l_within_2sd_k_stats <- ss$df_with_stats
message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k_stats <- ss$summary_stats
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
# Write CSV files
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
# Plots
# Print quality control graphs before removing data due to contamination and
# adjusting missing data to max theoretical values
l_vs_k_plots <- list(
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC",
color_var = "conc_num",
legend_position = "right"
)
)
above_threshold_plots <- list(
list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
color_var = "conc_num",
annotations = list(
list(
x = L_half_median,
y = K_half_median,
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
)
),
error_bar = FALSE,
legend_position = "right"
)
)
frequency_delta_bg_plots <- list(
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
title = "Density plot for Delta Background by Conc All Data",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Density",
error_bar = FALSE,
legend_position = "right"
),
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data",
color_var = "conc_num",
x_label = "Delta Background",
y_label = "Count",
error_bar = FALSE,
legend_position = "right"
)
)
plate_analysis_plots <- list()
for (plot_type in c("scatter", "box")) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
for (var in variables) {
for (stage in c("before", "after")) {
if (stage == "before") {
df_plot <- df
} else {
df_plot <- df_na # TODO use df_na_filtered if necessary
}
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = error_bar, color_var = "conc_num")
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
}
}
}
plate_analysis_no_zero_plots <- list()
for (plot_type in c("scatter", "box")) {
variables <- c("L", "K", "r", "AUC", "delta_bg")
for (var in variables) {
# Set error_bar = TRUE only for scatter plots
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
# Create the plot configuration
plot_config <- list(
df = df_no_zeros,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
error_bar = error_bar,
color_var = "conc_num"
)
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
}
}
l_outside_2sd_k_plots <- list(
list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter",
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"
)
)
delta_bg_outside_2sd_k_plots <- list(
list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num",
legend_position = "right"
)
)
# Generate and save plots for each QC step
message("Generating QC plots")
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
# Clean up
rm(df, df_above_tolerance, df_no_zeros)
# TODO: Originally this filtered L NA's
# Let's try to avoid for now since stats have already been calculated
# Process background strains
bg_strains <- c("YDL227C")
lapply(bg_strains, function(strain) {
message("Processing background strain: ", strain)
# Handle missing data by setting zero values to NA
# and then removing any rows with NA in L col
df_bg <- df_na %>%
filter(OrfRep == strain) %>%
mutate(
L = if_else(L == 0, NA, L),
K = if_else(K == 0, NA, K),
r = if_else(r == 0, NA, r),
AUC = if_else(AUC == 0, NA, AUC)
) %>%
filter(!is.na(L))
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
ss <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats_bg <- ss$summary_stats
df_bg_stats <- ss$df_with_stats
write.csv(summary_stats_bg,
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
row.names = FALSE)
# Filter reference and deletion strains
# Formerly X2_RF (reference strains)
df_reference <- df_na_stats %>%
filter(OrfRep == strain) %>%
mutate(SM = 0)
# Formerly X2 (deletion strains)
df_deletion <- df_na_stats %>%
filter(OrfRep != strain) %>%
mutate(SM = 0)
# Set the missing values to the highest theoretical value at each drug conc for L
# Leave other values as 0 for the max/min
reference_strain <- df_reference %>%
group_by(conc_num) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup()
# Ditto for deletion strains
deletion_strains <- df_deletion %>%
group_by(conc_num) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup()
# Calculate interactions
variables <- c("L", "K", "r", "AUC")
message("Calculating interaction scores")
print("Reference strain:")
print(head(reference_strain))
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
print("Deletion strains:")
print(head(deletion_strains))
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
zscores_calculations_reference <- reference_results$zscores_calculations
zscores_interactions_reference <- reference_results$zscores_interactions
zscores_calculations <- deletion_results$zscores_calculations
zscores_interactions <- deletion_results$zscores_interactions
# Writing Z-Scores to file
write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
# Create interaction plots
reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
# Define conditions for enhancers and suppressors
# TODO Add to study config file?
threshold <- 2
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
# Subset data
enhancers_L <- zscores_interactions[enhancer_condition_L, ]
suppressors_L <- zscores_interactions[suppressor_condition_L, ]
enhancers_K <- zscores_interactions[enhancer_condition_K, ]
suppressors_K <- zscores_interactions[suppressor_condition_K, ]
# Save enhancers and suppressors
message("Writing enhancer/suppressor csv files")
write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L.csv"), row.names = FALSE)
write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L.csv"), row.names = FALSE)
write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K.csv"), row.names = FALSE)
write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K.csv"), row.names = FALSE)
# Combine conditions for enhancers and suppressors
enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ]
enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ]
# Save combined enhancers and suppressors
write.csv(enhancers_and_suppressors_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_L.csv"), row.names = FALSE)
write.csv(enhancers_and_suppressors_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_K.csv"), row.names = FALSE)
# Handle linear model based enhancers and suppressors
lm_threshold <- 2
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
# Save linear model based enhancers and suppressors
message("Writing linear model enhancer/suppressor csv files")
write.csv(enhancers_lm_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L_lm.csv"), row.names = FALSE)
write.csv(suppressors_lm_L,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L_lm.csv"), row.names = FALSE)
write.csv(enhancers_lm_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K_lm.csv"), row.names = FALSE)
write.csv(suppressors_lm_K,
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
zscores_interactions_adjusted <- adjust_missing_and_rank(zscores_interactions)
# Generate all rank plot configurations for L and K
rank_plot_configs <- c(
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_L", "Avg_Zscore_L", "L"),
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_K", "Avg_Zscore_K", "K")
)
# Generate and save rank plots
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots",
plot_configs = rank_plot_config, grid_layout = list(ncol = 3, nrow = 2))
# # Correlation plots
# lm_list <- list(
# lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_L, data = zscores_interactions_filtered),
# lm(Z_lm_r ~ Z_lm_K, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered),
# lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered)
# )
lm_summaries <- lapply(lm_list, summary)
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries)
generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs)
})
})
}
main()

View File

@@ -0,0 +1,209 @@
suppressMessages({
library("ggplot2")
library("plotly")
library("htmlwidgets")
library("extrafont")
library("grid")
library("ggthemes")
})
# Constants for configuration
PLOT_WIDTH <- 12
PLOT_HEIGHT <- 8
BASE_SIZE <- 14
options(warn = 2, max.print = 100)
# Parse arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"Exp1",
"/path/to/exp1_file.csv",
"Exp2",
"/path/to/exp2_file.csv",
"/path/to/output_dir"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
list(
exp1_name = args[1],
exp1_file = normalizePath(args[2], mustWork = TRUE),
exp2_name = args[3],
exp2_file = normalizePath(args[4], mustWork = TRUE),
output_dir = normalizePath(args[5], mustWork = FALSE)
)
}
args <- parse_arguments()
# Create output directories if they don't exist
pairDirL <- file.path(args$output_dir, paste0("PairwiseCompareL_", args$exp1_name, "-", args$exp2_name))
pairDirK <- file.path(args$output_dir, paste0("PairwiseCompareK_", args$exp1_name, "-", args$exp2_name))
dir.create(pairDirL, showWarnings = FALSE, recursive = TRUE)
dir.create(pairDirK, showWarnings = FALSE, recursive = TRUE)
# Define themes and scales
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
theme_foundation(base_size = base_size, base_family = base_family) +
theme(
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold", size = rel(1)),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "#f0f0f0"),
panel.grid.minor = element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.2, "cm"),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
strip.text = element_text(face = "bold")
)
}
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
theme_publication(base_size, base_family) +
theme(
legend.position = "right",
legend.direction = "vertical",
legend.key.size = unit(0.5, "cm")
)
}
scale_fill_publication <- function(...) {
discrete_scale("fill", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
scale_colour_publication <- function(...) {
discrete_scale("colour", "Publication", manual_pal(values = c(
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
)), ...)
}
# Read and merge data
load_and_merge_data <- function(file1, file2) {
df1 <- read.csv(file = file1, stringsAsFactors = FALSE, header = TRUE)
df2 <- read.csv(file = file2, stringsAsFactors = FALSE, header = TRUE)
merge(df1, df2, by = "Term_Avg", all = TRUE, suffixes = c("_df1", "_df2"))
}
# Generate a plot and save to PDF and HTML
generate_plot <- function(data, x_var, y_var, color_var, title, file_path, theme_function) {
gg <- ggplot(data = data, aes_string(
x = x_var,
y = y_var,
color = color_var
)) +
xlab(paste("GO Term Avg lm Z for", args$exp1_name)) +
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
geom_point(shape = 3) +
ylab(paste("GO Term Avg lm Z for", args$exp2_name)) +
ggtitle(title) +
theme_function()
pdf(file.path(file_path, ".pdf"), width = PLOT_WIDTH, height = PLOT_HEIGHT)
print(gg)
dev.off()
pgg <- ggplotly(gg)
htmlwidgets::saveWidget(pgg, file.path(file_path, ".html"))
}
# Identify and annotate specific interactions
annotate_interactions <- function(df, exp1_name, exp2_name, suffix) {
df$Overlap_Avg <- NA
interactions <- list(
"df1_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] < 2),
"df1_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] > -2),
"df2_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] < 2),
"df2_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] > -2),
"Overlap_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2),
"Overlap_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2),
"df2_Aggravators_df1_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2),
"df2_Alleviators_df1_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2)
)
annotation <- list(
df1_Specific_Aggravators = paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"),
df1_Specific_Alleviators = paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"),
df2_Specific_Aggravators = paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"),
df2_Specific_Alleviators = paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"),
Overlap_Aggravators = "Overlapping_Deletion_Enhancers",
Overlap_Alleviators = "Overlapping_Deletion_Suppressors",
df2_Aggravators_df1_Alleviators = paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"),
df2_Alleviators_df1_Aggravators = paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")
)
for (key in names(interactions)) {
try(df$Overlap_Avg[interactions[[key]]] <- annotation[[key]])
}
df
}
# Rank and filter data
rank_and_filter_data <- function(df, suffix) {
z1 <- df
z1[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df1")]] - z1[[paste0("Z_lm_", suffix, "_SD_df1")]]
z1[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df2")]] - z1[[paste0("Z_lm_", suffix, "_SD_df2")]]
z2 <- df
z2[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df1")]] + z2[[paste0("Z_lm_", suffix, "_SD_df1")]]
z2[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df2")]] + z2[[paste0("Z_lm_", suffix, "_SD_df2")]]
df_above_threshold <- df[!is.na(df$Overlap_Avg), ]
df_above_threshold$df1_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df1")]], ties.method = "random")
df_above_threshold$df2_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df2")]], ties.method = "random")
list(z1 = z1, z2 = z2, df_above_threshold = df_above_threshold)
}
# Main execution function for a pairwise comparison
run_pairwise_comparison <- function(suffix, dir) {
df <- load_and_merge_data(args$exp1_file, args$exp2_file)
# Generate initial ontology-based plot
generate_plot(df,
paste0("Z_lm_", suffix, "_Avg_df1"), paste0("Z_lm_", suffix, "_Avg_df2"), "Ontology_Avg_df1",
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name),
file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOntology")),
theme_publication_legend_right)
# Annotate interactions and generate overlap-based plot
df <- annotate_interactions(df, args$exp1_name, args$exp2_name, suffix)
ranks <- rank_and_filter_data(df, suffix)
generate_plot(df,
paste0("Z_lm_", suffix, "_Avg_df1"),
paste0("Z_lm_", suffix, "_Avg_df2"),
"Overlap_Avg",
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name),
file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap")),
theme_publication_legend_right)
generate_plot(ranks$df_above_threshold,
paste0("Z_lm_", suffix, "_Avg_df1"),
paste0("Z_lm_", suffix, "_Avg_df2"),
"Overlap_Avg",
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name, "Above Threshold"),
file.path(dir,
paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap_AboveThreshold")),
theme_publication_legend_right)
# Save CSV files
write.csv(df, file.path(dir,
paste0("All_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")),
row.names = FALSE)
write.csv(ranks$df_above_threshold,
file.path(dir, paste0("AboveThreshold_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")),
row.names = FALSE)
}
# Execute Pairwise L and Pairwise K comparisons
run_pairwise_comparison("L", pairDirL)
run_pairwise_comparison("K", pairDirK)

View File

@@ -0,0 +1,305 @@
#!/usr/bin/env Rscript
# This script will make heatmaps for the REMc analysis
# need to give the input "finalTable.csv" file after running REMc generated by eclipse
library(RColorBrewer)
library(gplots)
args <- commandArgs(TRUE)
# Set output dir
if (length(args) >= 1) {
input_finalTable <- file.path(args[1])
} else {
input_finalTable <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
}
if (length(args) >= 2) {
outDir <- file.path(args[2])
} else {
outDir <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
}
hmapfile <- data.frame(read.csv(file = input_finalTable, header = TRUE, sep = ",", stringsAsFactors = FALSE))
# set NAs to NA
hmapfile[hmapfile == -100] <- NA
hmapfile[hmapfile == 100] <- NA
hmapfile[hmapfile == 0.001] <- NA
hmapfile[hmapfile == -0.001] <- NA
# select the number of rows based on the number of genes
num_total_genes <- length(hmapfile[, 1])
# Break out the cluster names so each part of the cluster origin can be accessed
# line below removed because it adds to many genes to clusters when going past 1-0-10
# since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
# hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin)
hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
#use tail(x,n) for accessing the outward most cluster
clust_rounds <- 0
for (i in 1:num_total_genes) {
if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
clust_rounds <- length(hmapfile$cluster.origin[[i]])
}
}
unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
unique_clusts <- unique_clusts[unique_clusts != " "]
# Select only the unique cluster names
unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
num_unique_clusts <- length(unique_clusts)
# Base the color key on a statistical analysis of the L and K data
# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
# scale() will calculate the mean and standard deviation of the entire vector,
# then "scale" each element by those values by subtracting the mean and dividing by the sd.
# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
# change so that the L data is multiplied to be on the same scale as the K data
KEY_MIN <- 0
KEY_MAX <- 0
K_MIN <- 0
L_MAX <- 0
KcolumnValues <- vector()
LcolumnValues <- vector()
for (i in 4:(length(hmapfile[1, ]) - 2)){
if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
KcolumnValues <- append(KcolumnValues, i)
}
if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
LcolumnValues <- append(LcolumnValues, i)
}
}
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2]
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2]
# Z scores are
L_MAX <- 12
K_MIN <- -12
# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
# if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){
# print("FOUND SHIFT VALUES")
# hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
# }
# KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
# KEY_MIN <- as.numeric(K_MIN)
KEY_MAX <- as.numeric(L_MAX)
KEY_MIN <- as.numeric(K_MIN)
print(KEY_MIN)
print(L_MAX)
# print(L_Multiplier)
colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6), KEY_MIN * (2 / 6),
KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6), KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
# print(colormapbreaks)
# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
# However since also using this to amend the first part.
# Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
# then could just use these values and create spaces only between interaction values
# possibly could get rid of redundant shift values if we don't want to view these
# could we pool all the shift data/average it?
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
# ev_repeat = rep("white",length(even_columns))
# ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
# middle_col <- (length(hmapfile[1,]) - 5)/2
# ev_repeat[(middle_col/2)] <- "black"
# print(ev_repeat)
}
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
print("NO SHIFT VALS FOUND")
}
# FOR THIS SCRIPT ONLY (rap tem hu script)
# even_columns <- c(2,5,7,10,12,15,17)
# m <- 0
colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 2)])
# print(colnames_edit)
for (i in 1:length(colnames_edit)) {
if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
colnames_edit[i] <- ""
colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
# INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
# print(length(unlist(INT_store)))
# if(length(unlist(INT_store)) == 4){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ")
# }
# if(length(unlist(INT_store)) == 3){
#
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ")
# }
# if(length(unlist(INT_store)) == 5){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ")
# }
# if(length(unlist(INT_store)) == 6){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ")
# }
}
}
print(colnames_edit)
# break()
# colnames_edit[5] <- "TEM HLEG K"
# colnames_edit[10] <- "TEM HL K"
# colnames_edit[15] <- "TEM HLEG L"
# colnames_edit[20] <- "TEM HL L"
# Create the heatmaps
for (i in 1:num_unique_clusts) {
cluster <- unique_clusts[i]
cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
cluster_length <- length(cluster_data[, 1])
if (cluster_length != 1) {
X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 2)])
if (cluster_length >= 2001) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 20, width = 15)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132, 132),
xlab = "Type of Media", ylab = "Gene Name",
# cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
# ColSideColors=ev_repeat,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
# abline(v=0.5467,col="black")
dev.off()
}
if (cluster_length >= 201 && cluster_length <= 2000) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 15, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
# abline(v=0.5316,col="black")
dev.off()
}
if (cluster_length >= 150 && cluster_length <= 200) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length >= 101 && cluster_length <= 149) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
#zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length >= 60 && cluster_length <= 100) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length <= 59 && cluster_length >= 30) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 9, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
if (cluster_length <= 29) {
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 7, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA,
distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim=c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
)
dev.off()
}
}
# print(paste("FINISHED", "CLUSTER",cluster,sep=" "))
}

View File

@@ -0,0 +1,364 @@
#!/usr/bin/env Rscript
library("RColorBrewer")
library("gplots")
library("tidyverse")
args <- commandArgs(TRUE)
# Define the output path for the heatmaps - create this folder first - in linux terminal in the working folder use > mkdir filename_heatmaps
output_path <- file.path(Args[1])
# Need to give the input "finalTable.csv" file after running REMc generated by eclipse
final_table <- file.path(args[2])
# Give the damp_list.txt as the third argument - will color the gene names differently
damps <- file.path(Args[3])
damp_list <- read.delim(file = damps, header = FALSE, stringsAsFactors = FALSE)
# Give the yeast human homology mapping as the fourth argument - will add the genes to the finalTable and use info for heatmaps
map_file <- file.path(Args[4])
mapping <- read.csv(file = map_file, stringsAsFactors = FALSE)
# Read in finalTablewithShift
hmapfile <- data.frame(read.csv(file = final_table, header = TRUE, sep = ",", stringsAsFactors = FALSE))
# Map the finalTable to the human homolog file
hmapfile_map <- hmapfile
# Match using OrfRep after dropping the _1 _2 _3 _4
# But need to also account for some older files have ORF as column name rather than OrfRep in finalTable file
if (colnames(hmapfile_map)[2] == "OrfRep") {
try(hmapfile_map$ORFMatch <- hmapfile_map$OrfRep)
}
if (colnames(hmapfile_map)[2] == "ORF") {
try(hmapfile_map$ORFMatch <- hmapfile_map$ORF)
}
hmapfile_map$ORFMatch <- gsub("_1", "", x = hmapfile_map$ORFMatch)
hmapfile_map$ORFMatch <- gsub("_2", "", x = hmapfile_map$ORFMatch)
hmapfile_map$ORFMatch <- gsub("_3", "", x = hmapfile_map$ORFMatch)
hmapfile_map$ORFMatch <- gsub("_4", "", x = hmapfile_map$ORFMatch)
# Join the hmapfile using
hmapfile_w_homolog <- full_join(hmapfile_map, mapping, by = c("ORFMatch" = "ensembl_gene_id"))
# Remove matches that are not from the finalTable
hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$likelihood) == FASLE, ]
# Write csv with all info from mapping file
write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologAll.csv")), row.names = FALSE)
# Remove the non matches and output another mapping file - this is also one used to make heatmaps
hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$external_gene_name_Human) == FALSE, ]
write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologMatchesOnly.csv"), row.names = FALSE))
# Add human gene name to the Gene column
hmapfile_w_homolog$Gene <- paste(hmapfile_w_homolog$Gene, hmapfile_w_homolog$external_gene_name_Human, sep = "/")
# Only keep the finalTable file columns and the homology info
hmap_len <- dim(hmapfile)[2]
hmapfile_w_homolog_remake <-
cbind(hmapfile_w_homolog[, 1:hmap_len], hsapiens_homolog_orthology_type = hmapfile_w_homolog$hsapiens_homolog_orthology_type)
hmapfile <- hmapfile_w_homolog_remake
# Set NAs to NA
hmapfile[hmapfile == -100] <- NA
hmapfile[hmapfile == 100] <- NA
hmapfile[hmapfile == 0.001] <- NA
hmapfile[hmapfile == -0.001] <- NA
# Select the number of rows based on the number of genes
num_total_genes <- length(hmapfile[, 1])
# Break out the cluster names so each part of the cluster origin can be accessed
# Line below removed because it adds to many genes to clusters when going past 1-0-10
# since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
# hmapfile$cluster.origin = gsub(" ","",x = hmapfile$cluster.origin)
hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
# use tail(x,n) for accessing the outward most cluster
clust_rounds <- 0
for (i in 1:num_total_genes) {
if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
clust_rounds <- length(hmapfile$cluster.origin[[i]])
}
}
unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
unique_clusts <- unique_clusts[unique_clusts != " "]
#select only the unique cluster names
unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
num_unique_clusts <- length(unique_clusts)
# Base the color key on a statistical analysis of the L and K data
# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
# scale() will calculate the mean and standard deviation of the entire vector
# then "scale" each element by those values by subtracting the mean and dividing by the sd
# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
# Change so that the L data is multiplied to be on the same scale as the K data
KEY_MIN <- 0
KEY_MAX <- 0
K_MIN <- 0
L_MAX <- 0
KcolumnValues <- vector()
LcolumnValues <- vector()
for (i in 4:(length(hmapfile[1, ]) - 3)){
if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
KcolumnValues <- append(KcolumnValues, i)
}
if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
LcolumnValues <- append(LcolumnValues, i)
}
}
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[2]
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm = TRUE)[4]
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm = TRUE)[2]
# Z scores are
L_MAX <- 12
K_MIN <- -12
# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
# if(grepl("SHIFT",colnames(hmapfile)[4],fixed = TRUE) == TRUE){
# print("FOUND SHIFT VALUES")
# hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
# }
#KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
#KEY_MIN <- as.numeric(K_MIN)
KEY_MAX <- as.numeric(L_MAX)
KEY_MIN <- as.numeric(K_MIN)
print(KEY_MIN)
print(L_MAX)
#print(L_Multiplier)
colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6),
KEY_MIN * (2 / 6), KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6),
KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
# print(colormapbreaks)
# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
# However since also using this to amend the first part.
# Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
# then could just use these values and create spaces only between interaction values
# possibly could get rid of redundant shift values if we don't want to view these
# could we pool all the shift data/average it?
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
# ev_repeat = rep("white",length(even_columns))
# ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
# middle_col <- (length(hmapfile[1,]) - 5)/2
# ev_repeat[(middle_col/2)] <- "black"
# print(ev_repeat)
}
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
print("NO SHIFT VALS FOUND")
}
# for this script only (rap tem hu script)
# even_columns <- c(2,5,7,10,12,15,17)
# m <- 0
colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 3)])
colnames(damp_list)[1] <- "ORF"
hmapfile$damps <- "YKO"
colnames(hmapfile)[2] <- "ORF"
try(hmapfile[hmapfile$ORF %in% damp_list$ORF, ]$damps <- "YKD")
# X <- X[order(X$damps,decreasing = TRUE),]
hmapfile$color2 <- NA
try(hmapfile[hmapfile$damps == "YKO", ]$color2 <- "black")
try(hmapfile[hmapfile$damps == "YKD", ]$color2 <- "red")
hmapfile$color <- NA
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_many2many", ]$color <- "#F8766D")
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2many", ]$color <- "#00BA38")
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2one", ]$color <- "#619CFF")
# print(colnames_edit)
for (i in 1:length(colnames_edit)) {
if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
colnames_edit[i] <- ""
colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
# INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
# print(length(unlist(INT_store)))
# if(length(unlist(INT_store)) == 4){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep = " ")
# }
# if(length(unlist(INT_store)) == 3){
#
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep = " ")
# }
# if(length(unlist(INT_store)) == 5){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep = " ")
# }
# if(length(unlist(INT_store)) == 6){
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep = " ")
# }
}
}
print(colnames_edit)
# break()
# colnames_edit[5] <- "TEM HLEG K"
# colnames_edit[10] <- "TEM HL K"
# colnames_edit[15] <- "TEM HLEG L"
# colnames_edit[20] <- "TEM HL L"
# Create the heatmaps
for (i in 1:num_unique_clusts) {
cluster <- unique_clusts[i]
cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
cluster_length <- length(cluster_data[, 1])
if (cluster_length != 1) {
X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 6)])
if (cluster_length >= 2001) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 20, width = 15)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
# cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
# ColSideColors = ev_repeat,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
# abline(v = 0.5467,col = "black")
dev.off()
}
if (cluster_length >= 201 && cluster_length <= 2000) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 15, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
# abline(v = 0.5316,col = "black")
dev.off()
}
if (cluster_length >= 150 && cluster_length <= 200) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length >= 101 && cluster_length <= 149) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length >= 60 && cluster_length <= 100) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 12, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length <= 59 && cluster_length >= 30) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 9, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
if (cluster_length <= 29) {
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
pdf(file = mypath, height = 7, width = 12)
heatmap.2(
x = X0,
Rowv = TRUE, Colv = NA,
distfun = dist, hclustfun = hclust,
dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
# zlim = c(-132,132),
xlab = "Type of Media", ylab = "Gene Name",
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
na.color = "red", col = brewer.pal(11, "PuOr"),
main = cluster,
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
)
dev.off()
}
}
# print(paste("FINISHED", "CLUSTER",cluster,sep = " "))
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env R
# GTA (GoTermAveraging)
# Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
#
# Updated 240724 Bryan C Roessler to improve file operations and portability
# NOTE: The script now has 2 additional OPTIONAL arguments:
# 1. Path to SGD terms file (go.terms.tab)
# 2. Path to SGD features file (gene_association.sgd)
library("stringr")
library("org.Sc.sgd.db")
library("plyr")
# Parse arguments
args <- commandArgs(TRUE)
exp_name <- args[1]
if (length(args) >= 2) {
zscores_file <- args[2]
} else {
zscores_file <- "zscores/zscores_interaction.csv" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
if (length(args) >= 3) {
sgd_terms_file <- args[3]
} else {
sgd_terms_file <- "go_terms.tab"
}
if (length(args) >= 4) {
sgd_features_file <- args[4]
} else {
sgd_features_file <- "gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
if (length(args) >= 5) {
output_dir <- args[5]
} else {
output_dir <- "../../out/gta" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
# # Set SGDgeneList file path
# if (length(args) > 4) {
# SGDgeneList <- args[4]
# } else {
# SGDgeneList <- "../Code/SGD_features.tab"
# Begin for loop for experiments in this study
# ZScores_Interaction.csv
for (m in 1:length(zscores_file)) {
# zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1]
X <- read.csv(file = zscores_file[m], stringsAsFactors = FALSE, header = TRUE)
if (colnames(X)[1] == "OrfRep") {
colnames(X)[1] <- "ORF"
}
#Terms is the GO term list
Terms <- read.delim(file = sgd_terms_file, header = FALSE, quote = "",
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition"))
#all ORFs associated with GO term
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
#Gene_Association is the gene association to GO term file
Gene_Association <- read.delim(sgd_features_file, skip = 8, header = FALSE, quote = "",
col.names = c("Database", "Database_Object_ID", "Database_Object_Symbol", "NOT", "GO_ID",
"Database_Reference", "Evidence", "With_or_From", "Aspect", "Database_Object_Name",
"Database_Object_Synonym", "Database_Object_Type", "taxon", "Date", "Assigned_By", "OtherInfo", "Empty"
)
)
#Get the ORF names associated with each gene/GO term
Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym), "\\|", 2)[, 1]
#Get the numeric GO ID for matching
Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID), "\\:", 2)[, 2])
#Get all unique GO terms
GO_Terms <- unique(Gene_Association$GO_ID)
#Create a character vector with just the ColNames of the input file to store the scores for each GO term
Col_Names_X <- colnames(X)
#Create a data_frame with header from input_file
GO_Term_Averages <- X[0, ]
#Fill table with NAs same length as number of GO terms
GO_Term_Averages[1:length(GO_Terms), ] <- NA
#Change the first and second col names to GO_ID and Term
colnames(GO_Term_Averages)[1] <- "GO_ID"
colnames(GO_Term_Averages)[2] <- "Term"
# Create new columns
GO_Term_Averages$Ontology <- NA
GO_Term_Averages$NumGenes <- NA
GO_Term_Averages$AllPossibleGenes <- NA
GO_Term_Averages$Genes <- NA
GO_Term_Averages$ORFs <- NA
# Create a data.frame for the standard deviation info
GO_Term_SD <- X[0, ]
GO_Term_SD[1:length(GO_Terms), ] <- NA
colnames(GO_Term_SD)[1] <- "GO_ID"
colnames(GO_Term_SD)[2] <- "Term"
# Loop for each GO term to get an average L and K Z score
for (i in 1:length(GO_Terms)) {
# Get the GO_Term
ID <- GO_Terms[i]
# Get data.frame for all genes associated to the GO Term
ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID, ]
# Get a vector of just the gene names
ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]])
if (length(unique(ID_AllGenes_vector)) > 4000) {
next()
}
# Get the GO term character description where numeric Terms ID matches GO_Term's ID
GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric, ]$GO_Term[1])
# Get the Z scores for all genes in the GO_ID
Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector, ]
# Get the Gene names and ORFs for the term
GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene), collapse = " | ")
GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF), collapse = " | ")
# Dataframe to report the averages for a GO term
# Get the GO ID
GO_Term_Averages$GO_ID[i] <- as.character(ID)
# Get the term name
GO_Term_Averages$Term[i] <- GO_Description_Term
# Get total number of genes annotated to the Term that we have in our library
GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF))
# Get total number of genes annotated to the Term in SGD
GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector))
# Get the ontology of the term
GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1])
# Calculate the average score for every column
for (j in 3:length(X[1, ])) {
GO_Term_Averages[i, j] <- mean(Zscores_For_ID[, j], na.rm = TRUE)
# GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
}
# Also calculate same values for the SD
GO_Term_SD$GO_ID[i] <- as.character(ID)
# Get the term name
GO_Term_SD$Term[i] <- GO_Description_Term
# Calculate column scores for SD
for (j in 3:length(X[1, ])) {
GO_Term_SD[i, j] <- sd(Zscores_For_ID[, j], na.rm = TRUE)
# GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
}
}
# Add either _Avg or _SD depending on if the calculated score is an average or SD
colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages), "Avg", sep = "_")
colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD), "SD", sep = "_")
# Combine the averages with the SDs to make one big data.frame
X2 <- cbind(GO_Term_Averages, GO_Term_SD)
# Test[ , order(names(test))]
X2 <- X2[, order(names(X2))]
X2 <- X2[!is.na(X2$Z_lm_L_Avg), ]
# Create output file
write.csv(X2, file.path(output_dir, expName[m], "Average_GOTerms_All.csv"), row.names = FALSE)
# Remove NAs
X3 <- X2[!is.na(X2$Z_lm_L_Avg), ]
# Identify redundant GO terms
for (i in 1:length(X3[, 1])) {
# Loop through each GO term - get term
GO_term_ID <- as.character(X3$GO_ID_Avg[i])
# Get term in the X3
X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID, ]
# Get anywhere that has the same number K_Avg value
X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp, ]
if (length(X3_Temp2[, 1]) > 1) {
if (length(unique(X3_Temp2$Genes_Avg)) == 1) {
X3_Temp2 <- X3_Temp2[1, ]
}
}
if (i == 1) {
Y <- X3_Temp2
}
if (i > 1) {
Y <- rbind(Y, X3_Temp2)
}
}
Y1 <- unique(Y)
write.csv(Y1, file.path(output_dir, exp_name, "Average_GOTerms_All_NonRedundantTerms.csv"), row.names = FALSE)
Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2, ]
Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg), ]
write.csv(Y2, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L.csv"), row.names = FALSE)
Y3 <- Y2[Y2$NumGenes_Avg > 2, ]
write.csv(Y3, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv"), row.names = FALSE)
Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2, ]
Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg), ]
write.csv(Y4, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K.csv"), row.names = FALSE)
Y5 <- Y4[Y4$NumGenes_Avg > 2, ]
write.csv(Y5, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv"), row.names = FALSE)
}

View File

@@ -0,0 +1,375 @@
library("dplyr")
# Function to parse and set arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
2,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
list(
out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
sd = as.numeric(args[2]),
study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
input_dirs = args[4:length(args)]
)
}
args <- parse_arguments()
# Create an array for the zscores files
get_zscores_files <- function(dirs) {
files <- sapply(dirs, function(exp) {
file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
if (file.exists(file_path)) file_path else NULL
}, simplify = TRUE, USE.NAMES = FALSE)
# Filter out NULL entries
files[!sapply(files, is.null)]
}
zscores_files <- get_zscores_files(args$input_dirs)
sprintf("The SD value is: %d", args$sd)
# Ensure there are enough files to compare
if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
# Function to join zscores files
join_zscores_files <- function(files) {
joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
for (file in files[-1]) {
temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
joined_data <- join(joined_data, temp_data, by = "OrfRep")
}
joined_data
}
# Load and join zscores files
joined_data <- join_zscores_files(zscores_files)
# Order and select columns
order_and_select_columns <- function(data) {
ordered_data <- data[, order(colnames(data))]
selected_headers <- select(ordered_data,
contains("OrfRep"), matches("Gene"),
contains("z_lm_k"), contains("z_shift_k"),
contains("z_lm_l"), contains("z_shift_l"))
selected_headers
}
selected_headers <- order_and_select_columns(joined_data)
# Remove redundant columns like "Gene.1"
clean_headers <- function(data, suffixes) {
suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
select(data, -all_of(suffixes_to_remove))
}
headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
# Fill NA values in Shift and Z_lm columns
fill_na_in_columns <- function(data) {
for (header in colnames(data)) {
if (grepl("Shift", header)) {
data[[header]][is.na(data[[header]])] <- 0.001
} else if (grepl("Z_lm_", header)) {
data[[header]][is.na(data[[header]])] <- 0.0001
}
}
data
}
headSel <- fill_na_in_columns(headSel)
# Filter based on standard deviation
filter_by_sd <- function(data, sd) {
if (sd == 0) return(data)
z_lm_cols <- select(data, contains("z_lm_"))
filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
data[filter_vector, ]
}
REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)
# Reorder columns to interleave Z_lm and Shift data
reorder_columns <- function(data1, data2) {
combined_data <- data1
for (i in 3:ncol(data1)) {
combined_data <- cbind(combined_data, data2[i], data1[i])
}
combined_data
}
combI <- reorder_columns(headSel2, shiftOnly)
# Write output files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# Relabel headers using experiment names from StudyInfo.csv
relabel_headers <- function(headers, labels) {
new_labels <- headers
for (i in seq_along(headers)) {
suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
if (suffix %in% 1:3) {
exp_name <- labels[as.numeric(suffix), 2]
new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
}
}
new_labels
}
LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)
# Save relabeled files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# Save updated parameters
LabelStd[, 4] <- args$sd
write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
write.csv(LabelStd, file = args$study_info, row.names = FALSE)
# library("plyr")
# library("sos")
# library("dplyr")
# # Function to parse and set arguments
# parse_arguments <- function() {
# if (interactive()) {
# args <- c(
# "./", # Default out_dir
# "2", # Default SD value
# "../Code/StudyInfo.csv" # Default study_info path
# )
# } else {
# args <- commandArgs(trailingOnly = TRUE)
# }
# list(
# out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
# sd = as.numeric(args[2]),
# study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
# input_dirs = args[4:length(args)]
# )
# }
# args <- parse_arguments()
# # Create an array for the zscores files
# zscores_files <- sapply(args$input_dirs, function(study) {
# file_path <- file.path(study, "zscores", "zscores_interaction.csv")
# if (file.exists(file_path)) file_path else NULL
# }, simplify = TRUE, USE.NAMES = FALSE)
# # Filter out NULL entries
# zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
# sprintf("The SD value is: %d", args$sd)
# # print(args$input_dirs)
# # TODO this is better handled in a loop in case you want to compare more experiments?
# # The input is already designed for this
# # Read in the files for your experiment and
# # Join the two files at a time as a function of how many inputFile
# # list the larger file first ? in this example X2 has the larger number of genes
# # If X1 has a larger number of genes, switch the order of X1 and X2
# if (length(zscores_files) < 2) {
# print("Note enough Exps to compare, skipping join")
# stop("Exiting script")
# }
# if (length(zscores_files) >= 2) {
# X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
# X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
# X <- join(X1, X2, by = "OrfRep")
# OBH <- X[, order(colnames(X))] # OrderByHeader
# headers <- select(OBH, contains("OrfRep"), matches("Gene"),
# contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
# headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
# headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
# }
# if (length(zscores_files) >= 3) {
# X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
# X <- join(X, X3, by = "OrfRep")
# headSel <- select(headers, -"Gene.1", -"Gene.2")
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
# }
# if (length(zscores_files) >= 4) {
# X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
# X <- join(X, X4, by = "OrfRep")
# headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
# }
# # print(headers)
# # headSel$contains("Z_Shift") %>% replace_na(0.001)
# headers <- colnames(headSel)
# # print(headers)
# i <- 0
# for (i in 1:length(headers)) {
# if (grepl("Shift", headers[i])) {
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
# }
# if (grepl("Z_lm_", headers[i])) {
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
# }
# }
# # 2SD option code to exclude Z_lm values less than 2 standard Deviations
# REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
# shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
# # Using Text search grepl to relabel headers
# REMcRdyHdr <- colnames(REMcRdy)
# REMcRdyLabels <- "asdf"
# shftHdr <- colnames(shiftOnly)
# shiftLabels <- "asdf"
# shiftLabels[1:2] <- shftHdr[1:2]
# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
# for (i in 3:(length(shftHdr))) {
# if (i == 3) {
# shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
# REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
# }
# if (i == 5) {
# shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
# REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
# }
# if (i == 7) {
# shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
# REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
# }
# if (grepl(".1", shftHdr[i], fixed = true)) {
# shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
# }
# if (grepl(".2", shftHdr[i], fixed = true)) {
# shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
# }
# if (grepl(".3", shftHdr[i], fixed = true)) {
# shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
# }
# }
# for (i in 3:(length(REMcRdyLabels))) {
# j <- as.integer(i)
# REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
# shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
# }
# colnames(shiftOnly) <- shiftLabels
# colnames(REMcRdy) <- REMcRdyLabels
# combI <- headSel2 # starting Template orf, Genename columns
# # headersRemc<-colnames(REMcRdy)
# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
# for (i in 3:length(colnames(REMcRdy))) {
# combI <- cbind.data.frame(combI, shiftOnly[i])
# combI <- cbind.data.frame(combI, REMcRdy[i])
# }
# Vec1 <- NA
# Vec2 <- NA
# Vec3 <- NA
# Vec4 <- NA
# Vec5 <- NA
# Vec6 <- NA
# Vec7 <- NA
# Vec8 <- NA
# if (length(REMcRdy) == 6) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
# }
# if (length(REMcRdy) == 8) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# Vec5 <- abs(REMcRdy[, 7]) >= sd
# Vec6 <- abs(REMcRdy[, 8]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
# }
# if (length(REMcRdy) == 10) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# Vec5 <- abs(REMcRdy[, 7]) >= sd
# Vec6 <- abs(REMcRdy[, 8]) >= sd
# Vec7 <- abs(REMcRdy[, 9]) >= sd
# Vec8 <- abs(REMcRdy[, 10]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
# }
# if (sd != 0) {
# REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
# shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
# }
# if (sd == 0) {
# REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
# shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
# }
# # R places hidden "" around the header names. The following
# # is intended to remove those quote so that the "" do not blow up the Java REMc.
# # Use ,quote=F in the write.csv statement to fix R output file.
# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
# # print(sd)
# LabelStd[, 4] <- as.numeric(sd)
# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
# write.csv(LabelStd, file = study_info, row.names = FALSE)

View File

@@ -0,0 +1,113 @@
suppressMessages({
library("dplyr")
library("data.table")
library("readr")
library("stringr")
})
# Function to parse arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
3, # sd value
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
"Experiment 1: Doxo versus HLD",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
"Experiment 2: HLD versus Doxo"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
out_dir <- normalizePath(args[1], mustWork = FALSE)
sd <- as.numeric(args[2])
paths <- normalizePath(args[seq(3, length(args), by = 2)], mustWork = FALSE)
names <- args[seq(4, length(args), by = 2)]
experiments <- setNames(paths, names)
list(
out_dir = out_dir,
sd = sd,
experiments = experiments
)
}
args <- parse_arguments()
# Ensure main output directory exists
dir.create(args$out_dir, showWarnings = FALSE, recursive = TRUE)
# Function to read and combine z-score interaction files
combine_zscores <- function(experiments, out_dir) {
combined_data <- lapply(names(experiments), function(exp_name) {
exp_dir <- experiments[[exp_name]]
zscore_file <- file.path(exp_dir, "zscores", "zscores_interaction.csv")
if (!file.exists(zscore_file)) {
stop("Z-score file does not exist for ", exp_name, " at ", zscore_file)
}
message("Reading z-score file for ", exp_name, " from ", zscore_file)
data <- fread(zscore_file)
data$Experiment <- exp_name
return(data)
}) %>%
bind_rows()
combined_output_file <- file.path(out_dir, "combined_zscores.csv")
fwrite(combined_data, combined_output_file, row.names = FALSE)
message("Combined z-score file saved to: ", combined_output_file)
}
# Function to read and combine summary statistics files
combine_summary_stats <- function(experiments, out_dir) {
combined_stats <- lapply(names(experiments), function(exp_name) {
exp_dir <- experiments[[exp_name]]
summary_file <- file.path(exp_dir, "zscores", "summary_stats_all_strains.csv")
if (!file.exists(summary_file)) {
stop("Summary stats file does not exist for ", exp_name, " at ", summary_file)
}
message("Reading summary stats file for ", exp_name, " from ", summary_file)
data <- fread(summary_file)
data$Experiment <- exp_name
return(data)
}) %>%
bind_rows()
combined_output_file <- file.path(out_dir, "combined_summary_stats.csv")
fwrite(combined_stats, combined_output_file, row.names = FALSE)
message("Combined summary stats file saved to: ", combined_output_file)
}
# Function to generate final summary report
generate_final_report <- function(out_dir) {
combined_zscores <- file.path(out_dir, "combined_zscores.csv")
combined_stats <- file.path(out_dir, "combined_summary_stats.csv")
if (!file.exists(combined_zscores) || !file.exists(combined_stats)) {
stop("Combined z-scores or summary stats files do not exist.")
}
zscores_data <- fread(combined_zscores)
stats_data <- fread(combined_stats)
message("Merging z-score and summary stats data...")
final_report <- merge(zscores_data, stats_data, by = c("OrfRep", "Experiment"), all = TRUE, allow.cartesian = TRUE)
final_report_file <- file.path(out_dir, "final_combined_report.csv")
fwrite(final_report, final_report_file, row.names = FALSE)
message("Final combined report saved to: ", final_report_file)
}
# Process all experiments and generate outputs
combine_zscores(args$experiments, args$out_dir)
combine_summary_stats(args$experiments, args$out_dir)
generate_final_report(args$out_dir)