Squashed initial commit
This commit is contained in:
1
qhtcp-workflow/apps/r/170503_DAmPs_Only.txt
Normal file
1
qhtcp-workflow/apps/r/170503_DAmPs_Only.txt
Normal file
File diff suppressed because one or more lines are too long
8418
qhtcp-workflow/apps/r/All_SGD_GOTerms_for_QHTCPtk.csv
Normal file
8418
qhtcp-workflow/apps/r/All_SGD_GOTerms_for_QHTCPtk.csv
Normal file
File diff suppressed because it is too large
Load Diff
54
qhtcp-workflow/apps/r/CompileGTF.R
Normal file
54
qhtcp-workflow/apps/r/CompileGTF.R
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env Rscript
|
||||
|
||||
# Load the openxlsx package
|
||||
library(openxlsx)
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
outDir <- args[1]
|
||||
|
||||
# Function to combine CSV and TXT files into a workbook with named sheets
|
||||
combineFilesToWorkbook <- function(file_list, output_file) {
|
||||
# Create a new workbook
|
||||
wb <- createWorkbook()
|
||||
|
||||
# Loop through the file list
|
||||
for (i in seq_along(file_list)) {
|
||||
file <- file_list[i]
|
||||
|
||||
# Extract the sheet name from the file name based on the extension
|
||||
ext <- tools::file_ext(file)
|
||||
sheet_name <- tools::file_path_sans_ext(basename(file))
|
||||
|
||||
# Read the data based on the file extension
|
||||
if (ext %in% c("csv", "txt")) {
|
||||
if (ext == "csv") {
|
||||
data <- read.csv(file)
|
||||
} else if (ext == "txt") {
|
||||
data <- readLines(file)
|
||||
}
|
||||
|
||||
# Create a new sheet in the workbook with the extracted sheet name
|
||||
addWorksheet(wb, sheetName = sheet_name)
|
||||
|
||||
# Write the data to the sheet
|
||||
writeData(wb, sheet = sheet_name, x = data)
|
||||
}
|
||||
}
|
||||
|
||||
# Save the workbook as an Excel file
|
||||
saveWorkbook(wb, output_file)
|
||||
}
|
||||
|
||||
Component <- file.path(outDir, "Component", "ComponentResults.txt")
|
||||
Function <- file.path(outDir, "Function", "FunctionResults.txt")
|
||||
Process <- file.path(outDir, "Process", "ProcessResults.txt")
|
||||
|
||||
# Specify the list of input files (both CSV and TXT)
|
||||
file_list <- c(Component, Process, Function)
|
||||
|
||||
# Specify the output file name
|
||||
output_file <- file.path(outDir, "GTFCombined.xlsx")
|
||||
|
||||
# Call the function to combine the files into a workbook with named sheets
|
||||
combineFilesToWorkbook(file_list, output_file)
|
||||
895
qhtcp-workflow/apps/r/PairwiseLK.R
Normal file
895
qhtcp-workflow/apps/r/PairwiseLK.R
Normal file
@@ -0,0 +1,895 @@
|
||||
# This R script performs GTA L and K Pairwise Compares for user specified pairs of Experiments
|
||||
|
||||
library("ggplot2")
|
||||
library("plotly")
|
||||
library("htmlwidgets")
|
||||
library("extrafont")
|
||||
library("grid")
|
||||
library("ggthemes")
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
exp1_name <- args[1]
|
||||
exp1_file <- args[2]
|
||||
exp2_name <- args[3]
|
||||
exp2_file <- args[4]
|
||||
output_dir <- args[5]
|
||||
|
||||
pairDirL <- file.path(output_dir, paste0("PairwiseCompareL_", exp1_name, "-", exp2_name))
|
||||
pairDirK <- file.path(output_dir, paste0("PairwiseCompareK_", exp1_name, "-", exp2_name))
|
||||
|
||||
# Pairwise L
|
||||
# outputPlotly <- "../GTAresults/PairwiseCompareL/" #"/GTAresults/PairwiseCompareL/"
|
||||
|
||||
# Theme elements for plots
|
||||
theme_Publication <- function(base_size = 14, base_family = "sans") {
|
||||
(theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.text = element_text(),
|
||||
axis.line = element_line(colour = "black"),
|
||||
axis.ticks = element_line(),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "bottom",
|
||||
legend.direction = "horizontal",
|
||||
legend.key.size = unit(0.2, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
|
||||
(theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.text = element_text(),
|
||||
axis.line = element_line(colour = "black"),
|
||||
axis.ticks = element_line(),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "right",
|
||||
legend.direction = "vertical",
|
||||
legend.key.size = unit(0.5, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_Publication <- function(...) {
|
||||
discrete_scale("fill", "Publication", manual_pal(
|
||||
values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...)
|
||||
}
|
||||
|
||||
scale_colour_Publication <- function(...) {
|
||||
discrete_scale("colour", "Publication", manual_pal(
|
||||
values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...)
|
||||
}
|
||||
|
||||
X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE)
|
||||
X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE)
|
||||
|
||||
X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2"))
|
||||
gg <- ggplot(data = X, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Ontology_Avg_X1,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(
|
||||
aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2),
|
||||
color = "grey20",
|
||||
size = 0.25,
|
||||
alpha = 0.1,
|
||||
inherit.aes = FALSE,
|
||||
fill = NA
|
||||
) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
# ID aggravators and alleviators, regardless of whether they meet 2SD threshold
|
||||
X1_Specific_Aggravators <- X[which(X$Z_lm_L_Avg_X1 >= 2 & X$Z_lm_L_Avg_X2 < 2), ]
|
||||
X1_Specific_Alleviators <- X[which(X$Z_lm_L_Avg_X1 <= -2 & X$Z_lm_L_Avg_X2 > -2), ]
|
||||
X2_Specific_Aggravators <- X[which(X$Z_lm_L_Avg_X2 >= 2 & X$Z_lm_L_Avg_X1 < 2), ]
|
||||
X2_Specific_Alleviators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm_L_Avg_X1 > -2), ]
|
||||
Overlap_Aggravators <- X[which(X$Z_lm_L_Avg_X1 >= 2 & X$Z_lm_L_Avg_X2 >= 2), ]
|
||||
Overlap_Alleviators <- X[which(X$Z_lm_L_Avg_X1 <= -2 & X$Z_lm_L_Avg_X2 <= -2), ]
|
||||
X2_Specific_Aggravators_X1_Alleviatiors <- X[which(X$Z_lm_L_Avg_X2 >= 2 & X$Z_lm_L_Avg_X1 <= -2), ]
|
||||
X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm_L_Avg_X1 >= 2), ]
|
||||
X$Overlap_Avg <- NA
|
||||
|
||||
try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Enhancers")
|
||||
try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Suppressors")
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
|
||||
|
||||
gg <- ggplot(data = X, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap_Avg,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(
|
||||
aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2),
|
||||
color = "grey20",
|
||||
size = 0.25,
|
||||
alpha = 0.1,
|
||||
inherit.aes = FALSE,
|
||||
fill = NA
|
||||
) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ]
|
||||
|
||||
#3
|
||||
gg <- ggplot(data = x_rem2_gene, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap_Avg,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#4
|
||||
X_overlap_nothresold <- X[!(is.na(X$Overlap_Avg)), ]
|
||||
gg <- ggplot(data = X_overlap_nothresold, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap_Avg,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
# Only output GTA terms where average score is still above 2 after subtracting the SD
|
||||
# Z1 will ID aggravators, Z2 alleviators
|
||||
Z1 <- X
|
||||
Z1$L_Subtract_SD_X1 <- Z1$Z_lm_L_Avg_X1 - Z1$Z_lm_L_SD_X1
|
||||
Z1$L_Subtract_SD_X2 <- Z1$Z_lm_L_Avg_X2 - Z1$Z_lm_L_SD_X2
|
||||
Z2 <- X
|
||||
Z2$L_Subtract_SD_X1 <- Z1$Z_lm_L_Avg_X1 + Z1$Z_lm_L_SD_X1
|
||||
Z2$L_Subtract_SD_X2 <- Z1$Z_lm_L_Avg_X2 + Z1$Z_lm_L_SD_X2
|
||||
X1_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 < 2), ]
|
||||
X1_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X1 <= -2 & Z2$L_Subtract_SD_X2 > -2), ]
|
||||
X2_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z1$L_Subtract_SD_X1 < 2), ]
|
||||
X2_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 > -2), ]
|
||||
Overlap_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 >= 2), ]
|
||||
Overlap_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 <= -2), ]
|
||||
X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z2$L_Subtract_SD_X1 <= -2), ]
|
||||
X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ]
|
||||
X$Overlap <- NA
|
||||
|
||||
try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Enhancers")
|
||||
try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Suppressors")
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <-
|
||||
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <-
|
||||
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
|
||||
|
||||
#5
|
||||
X_abovethreshold <- X[!(is.na(X$Overlap)), ]
|
||||
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#6
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3, size = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")),
|
||||
width = 20,
|
||||
height = 20
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
X_abovethreshold$X1_Rank <- NA
|
||||
X_abovethreshold$X1_Rank <- rank(-X_abovethreshold$Z_lm_L_Avg_X1, ties.method = "random")
|
||||
X_abovethreshold$X2_Rank <- NA
|
||||
X_abovethreshold$X2_Rank <- rank(-X_abovethreshold$Z_lm_L_Avg_X2, ties.method = "random")
|
||||
|
||||
#7
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3, size = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")),
|
||||
width = 15,
|
||||
height = 15
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
|
||||
fname <-
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#8
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_L_Avg_X1,
|
||||
y = Z_lm_L_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_L_SD_X1,
|
||||
SD_2 = Z_lm_L_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3, size = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")),
|
||||
width = 15,
|
||||
height = 15
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <-
|
||||
file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
write.csv(
|
||||
x = X,
|
||||
file.path(pairDirL, paste0("All_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
|
||||
row.names = FALSE
|
||||
)
|
||||
|
||||
write.csv(
|
||||
x = X_abovethreshold,
|
||||
file = file.path(pairDirL, paste0("AboveThreshold_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
|
||||
row.names = FALSE
|
||||
)
|
||||
|
||||
# Begin Pairwsise K
|
||||
# Theme elements for plots
|
||||
theme_Publication <- function(base_size = 14, base_family = "sans") {
|
||||
(theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.text = element_text(),
|
||||
axis.line = element_line(colour = "black"),
|
||||
axis.ticks = element_line(),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "bottom",
|
||||
legend.direction = "horizontal",
|
||||
legend.key.size = unit(0.2, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_Publication <- function(...) {
|
||||
library("scales")
|
||||
discrete_scale(
|
||||
"fill",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
scale_colour_Publication <- function(...) {
|
||||
discrete_scale(
|
||||
"colour",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
|
||||
(theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold",
|
||||
size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.text = element_text(),
|
||||
axis.line = element_line(colour = "black"),
|
||||
axis.ticks = element_line(),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "right",
|
||||
legend.direction = "vertical",
|
||||
legend.key.size = unit(0.5, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_Publication <- function(...) {
|
||||
discrete_scale(
|
||||
"fill",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
scale_colour_Publication <- function(...) {
|
||||
discrete_scale(
|
||||
"colour",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE)
|
||||
X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE)
|
||||
|
||||
#1
|
||||
X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2"))
|
||||
|
||||
gg <- ggplot(data = X, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Ontology_Avg_X1,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#2
|
||||
# ID aggravators and alleviators, regardless of whether they meet 2SD threshold
|
||||
X1_Specific_Aggravators <- X[which(X$Z_lm_K_Avg_X1 >= 2 & X$Z_lm_K_Avg_X2 < 2), ]
|
||||
X1_Specific_Alleviators <- X[which(X$Z_lm_K_Avg_X1 <= -2 & X$Z_lm_K_Avg_X2 > -2), ]
|
||||
X2_Specific_Aggravators <- X[which(X$Z_lm_K_Avg_X2 >= 2 & X$Z_lm_K_Avg_X1 < 2), ]
|
||||
X2_Specific_Alleviators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm_K_Avg_X1 > -2), ]
|
||||
Overlap_Aggravators <- X[which(X$Z_lm_K_Avg_X1 >= 2 & X$Z_lm_K_Avg_X2 >= 2), ]
|
||||
Overlap_Alleviators <- X[which(X$Z_lm_K_Avg_X1 <= -2 & X$Z_lm_K_Avg_X2 <= -2), ]
|
||||
X2_Specific_Aggravators_X1_Alleviatiors <- X[which(X$Z_lm_K_Avg_X2 >= 2 & X$Z_lm_K_Avg_X1 <= -2), ]
|
||||
X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm_K_Avg_X1 >= 2), ]
|
||||
X$Overlap_Avg <- NA
|
||||
|
||||
try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
"Overlapping_Deletion_Suppressors")
|
||||
try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <-
|
||||
"Overlapping_Deletion_Enhancers")
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <-
|
||||
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
|
||||
|
||||
plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
|
||||
gg <- ggplot(data = X, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap_Avg,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
|
||||
#2
|
||||
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#3
|
||||
x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ]
|
||||
plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
|
||||
gg <- ggplot(data = x_rem2_gene, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap_Avg,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#4
|
||||
X_overlap_nothresold <- X[!(is.na(X$Overlap_Avg)), ]
|
||||
gg <- ggplot(data = X_overlap_nothresold, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap_Avg,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
|
||||
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#5
|
||||
# Only output GTA terms where average score is still above 2 after subtracting the SD
|
||||
# Z1 will ID aggravators, Z2 alleviators
|
||||
Z1 <- X
|
||||
Z1$L_Subtract_SD_X1 <- Z1$Z_lm_K_Avg_X1 - Z1$Z_lm_K_SD_X1
|
||||
Z1$L_Subtract_SD_X2 <- Z1$Z_lm_K_Avg_X2 - Z1$Z_lm_K_SD_X2
|
||||
Z2 <- X
|
||||
Z2$L_Subtract_SD_X1 <- Z1$Z_lm_K_Avg_X1 + Z1$Z_lm_K_SD_X1
|
||||
Z2$L_Subtract_SD_X2 <- Z1$Z_lm_K_Avg_X2 + Z1$Z_lm_K_SD_X2
|
||||
X1_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 < 2), ]
|
||||
X1_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X1 <= -2 & Z2$L_Subtract_SD_X2 > -2), ]
|
||||
X2_Specific_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z1$L_Subtract_SD_X1 < 2), ]
|
||||
X2_Specific_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 > -2), ]
|
||||
Overlap_Aggravators2 <- Z1[which(Z1$L_Subtract_SD_X1 >= 2 & Z1$L_Subtract_SD_X2 >= 2), ]
|
||||
Overlap_Alleviators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z2$L_Subtract_SD_X1 <= -2), ]
|
||||
X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & Z2$L_Subtract_SD_X1 <= -2), ]
|
||||
X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ]
|
||||
X$Overlap <- NA
|
||||
|
||||
try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <-
|
||||
paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <-
|
||||
paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <-
|
||||
paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <-
|
||||
paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <-
|
||||
"Overlapping_Deletion_Suppressors")
|
||||
try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <-
|
||||
"Overlapping_Deletion_Enhancers")
|
||||
try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <-
|
||||
paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_"))
|
||||
try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <-
|
||||
paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"))
|
||||
|
||||
X_abovethreshold <- X[!(is.na(X$Overlap)), ]
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")),
|
||||
width = 12,
|
||||
height = 8
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#6
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3, size = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")),
|
||||
width = 20,
|
||||
height = 20
|
||||
)
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#7
|
||||
X_abovethreshold$X1_Rank <- NA
|
||||
X_abovethreshold$X1_Rank <- rank(-X_abovethreshold$Z_lm_K_Avg_X1, ties.method = "random")
|
||||
X_abovethreshold$X2_Rank <- NA
|
||||
X_abovethreshold$X2_Rank <- rank(-X_abovethreshold$Z_lm_K_Avg_X2, ties.method = "random")
|
||||
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3, size = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")),
|
||||
width = 15,
|
||||
height = 15
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <-
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
#8
|
||||
gg <- ggplot(data = X_abovethreshold, aes(
|
||||
x = Z_lm_K_Avg_X1,
|
||||
y = Z_lm_K_Avg_X2,
|
||||
color = Overlap,
|
||||
Term = Term_Avg,
|
||||
Genes = Genes_Avg_X1,
|
||||
NumGenes = NumGenes_Avg_X1,
|
||||
AllPossibleGenes = AllPossibleGenes_Avg_X1,
|
||||
SD_1 = Z_lm_K_SD_X1,
|
||||
SD_2 = Z_lm_K_SD_X2
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", exp1_name)) +
|
||||
geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3, size = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", exp2_name)) +
|
||||
ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) +
|
||||
theme_Publication_legend_right()
|
||||
|
||||
pdf(
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")),
|
||||
width = 15,
|
||||
height = 15
|
||||
)
|
||||
|
||||
gg
|
||||
dev.off()
|
||||
pgg <- ggplotly(gg)
|
||||
#pgg
|
||||
fname <-
|
||||
file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html"))
|
||||
htmlwidgets::saveWidget(pgg, fname)
|
||||
|
||||
write.csv(
|
||||
x = X,
|
||||
file = file.path(pairDirK, paste0("All_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
|
||||
row.names = FALSE
|
||||
)
|
||||
|
||||
write.csv(
|
||||
x = X_abovethreshold,
|
||||
file = file.path(pairDirK, paste0("AboveThreshold_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")),
|
||||
row.names = FALSE
|
||||
)
|
||||
5839
qhtcp-workflow/apps/r/SGD_features.tab
Normal file
5839
qhtcp-workflow/apps/r/SGD_features.tab
Normal file
File diff suppressed because it is too large
Load Diff
810
qhtcp-workflow/apps/r/TSHeatmaps5dev2.R
Normal file
810
qhtcp-workflow/apps/r/TSHeatmaps5dev2.R
Normal file
@@ -0,0 +1,810 @@
|
||||
#!/usr/bin/env Rscript
|
||||
# Makes heat maps of multiple experiments
|
||||
#
|
||||
# Updated 240724 Bryan C Roessler to improve file operations and portability
|
||||
# I tried to leave as much logic intact as possible, just feeding in vars in a better way
|
||||
# NOTE: The script now has 7 required arguments and a variable number of input experiments
|
||||
# @arg $1 string StudyInfo.csv file
|
||||
# @arg $2 string gene_ontology_edit.obo file
|
||||
# @arg $3 string go_terms.tab file
|
||||
# @arg $4 string All_SGD_GOTerms_for_QHTCPtk.csv
|
||||
# @arg $5 string base directory
|
||||
# @arg $6 string output directory
|
||||
|
||||
library("ontologyIndex")
|
||||
library("ggplot2")
|
||||
library("RColorBrewer")
|
||||
library("grid")
|
||||
library("ggthemes")
|
||||
# library("plotly")
|
||||
# library("htmlwidgets")
|
||||
library("extrafont")
|
||||
library("stringr")
|
||||
library("org.Sc.sgd.db")
|
||||
library("ggrepel")
|
||||
library("gplots")
|
||||
|
||||
# Load arguments
|
||||
args <- commandArgs(TRUE)
|
||||
study_info_file <- args[1]
|
||||
ontology_file <- args[2]
|
||||
sgd_terms_tfile <- args[3]
|
||||
all_sgd_terms_csv <- args[4]
|
||||
base_dir <- args[5]
|
||||
output_dir <- args[6]
|
||||
study_nums <- args[7:length(args)]
|
||||
|
||||
# Import standard tables used in Sean's code That should be copied to each ExpStudy
|
||||
labels <- read.csv(file = study_info_file, stringsAsFactors = FALSE)
|
||||
Ontology <- get_ontology(file = ontology_file, propagate_relationships = "is_a", extract_tags = "minimal")
|
||||
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS) # all ORFs associated with GO term
|
||||
Terms <- read.delim(file = sgd_terms_tfile,
|
||||
header = FALSE,
|
||||
quote = "",
|
||||
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")
|
||||
)
|
||||
XX3 <- read.csv(file = all_sgd_terms_csv, stringsAsFactors = FALSE, header = TRUE)
|
||||
XX3[, 1] <- paste0("GO:", formatC(XX3[, 1], width = 7, flag = "0"))
|
||||
XX3[, 2] <- gsub(pattern = " ", replacement = "_", x = XX3[, 2])
|
||||
XX3[, 2] <- gsub(pattern = "/", replacement = "_", x = XX3[, 2])
|
||||
|
||||
# Load input files
|
||||
for (study_num in study_nums) {
|
||||
input_file <- file.path(base_dir, paste("Exp", study_num), zscores, "zscores_interaction.csv")
|
||||
if (file.exists(input_file)) {
|
||||
assign(paste(X, study_num), read.csv(file = input_file, stringsAsFactors = FALSE, header = TRUE))
|
||||
assign(paste(Name, study_num), labels[study_num, 2])
|
||||
}
|
||||
}
|
||||
|
||||
for (study_num in study_nums) {
|
||||
eval(paste("function", study_num))
|
||||
}
|
||||
|
||||
if (length(study_nums) > 0) {
|
||||
X1$ORF <- X1$OrfRep
|
||||
X1$ORF <- gsub("_1", "", x = X1$ORF)
|
||||
X1$ORF <- gsub("_2", "", x = X1$ORF)
|
||||
X1$ORF <- gsub("_3", "", x = X1$ORF)
|
||||
X1$ORF <- gsub("_4", "", x = X1$ORF)
|
||||
|
||||
X1$Score_L <- "No Effect"
|
||||
try(X1[is.na(X1$Z_lm_L), ]$Score_L <- "No Growth")
|
||||
try(X1[!is.na(X1$Z_lm_L) & X1$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
|
||||
try(X1[!is.na(X1$Z_lm_L) & X1$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
|
||||
|
||||
X1$Score_K <- "No Effect"
|
||||
try(X1[is.na(X1$Z_lm_K), ]$Score_K <- "No Growth")
|
||||
try(X1[!is.na(X1$Z_lm_K) & X1$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
|
||||
try(X1[!is.na(X1$Z_lm_K) & X1$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
|
||||
|
||||
# Express the na data as 0.001 in X1 for K and L
|
||||
X1[is.na(X1$Z_lm_L), ]$Z_lm_L <- 0.001
|
||||
X1[is.na(X1$Z_lm_K), ]$Z_lm_K <- 0.001
|
||||
X1$Rank_L <- rank(X1$Z_lm_L)
|
||||
X1$Rank_K <- rank(X1$Z_lm_K)
|
||||
X1 <- X1[order(X1$OrfRep, decreasing = FALSE), ]
|
||||
colnames(X1) <- paste0(colnames(X1), "_X1")
|
||||
}
|
||||
|
||||
if (length(study_nums) > 1) {
|
||||
X2$ORF <- X2$OrfRep
|
||||
X2$ORF <- gsub("_1", "", x = X2$ORF)
|
||||
X2$ORF <- gsub("_2", "", x = X2$ORF)
|
||||
X2$ORF <- gsub("_3", "", x = X2$ORF)
|
||||
X2$ORF <- gsub("_4", "", x = X2$ORF)
|
||||
|
||||
X2$Score_L <- "No Effect"
|
||||
try(X2[is.na(X2$Z_lm_L), ]$Score_L <- "No Growth")
|
||||
try(X2[!is.na(X2$Z_lm_L) & X2$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
|
||||
try(X2[!is.na(X2$Z_lm_L) & X2$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
|
||||
|
||||
X2$Score_K <- "No Effect"
|
||||
try(X2[is.na(X2$Z_lm_K), ]$Score_K <- "No Growth")
|
||||
try(X2[!is.na(X2$Z_lm_K) & X2$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
|
||||
try(X2[!is.na(X2$Z_lm_K) & X2$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
|
||||
#express the na data as 0.001 in X2
|
||||
X2[is.na(X2$Z_lm_L), ]$Z_lm_L <- 0.001
|
||||
X2[is.na(X2$Z_lm_K), ]$Z_lm_K <- 0.001
|
||||
|
||||
X2$Rank_L <- rank(X2$Z_lm_L)
|
||||
X2$Rank_K <- rank(X2$Z_lm_K)
|
||||
|
||||
X2 <- X2[order(X2$OrfRep, decreasing = FALSE), ]
|
||||
colnames(X2) <- paste0(colnames(X2), "_X2")
|
||||
X <- cbind(X1, X2)
|
||||
}
|
||||
|
||||
if (length(study_nums) > 2) {
|
||||
X3$ORF <- X3$OrfRep
|
||||
X3$ORF <- gsub("_1", "", x = X3$ORF)
|
||||
X3$ORF <- gsub("_2", "", x = X3$ORF)
|
||||
X3$ORF <- gsub("_3", "", x = X3$ORF)
|
||||
X3$ORF <- gsub("_4", "", x = X3$ORF)
|
||||
|
||||
X3$Score_L <- "No Effect"
|
||||
try(X3[is.na(X3$Z_lm_L), ]$Score_L <- "No Growth")
|
||||
try(X3[!is.na(X3$Z_lm_L) & X3$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
|
||||
try(X3[!is.na(X3$Z_lm_L) & X3$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
|
||||
|
||||
X3$Score_K <- "No Effect"
|
||||
try(X3[is.na(X3$Z_lm_K), ]$Score_K <- "No Growth")
|
||||
try(X3[!is.na(X3$Z_lm_K) & X3$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
|
||||
try(X3[!is.na(X3$Z_lm_K) & X3$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
|
||||
|
||||
# Express the na data as 0.001 in X3
|
||||
X3[is.na(X3$Z_lm_L), ]$Z_lm_L <- 0.001
|
||||
X3[is.na(X3$Z_lm_K), ]$Z_lm_K <- 0.001
|
||||
X3$Rank_L <- rank(X3$Z_lm_L)
|
||||
X3$Rank_K <- rank(X3$Z_lm_K)
|
||||
X3 <- X3[order(X3$OrfRep, decreasing = FALSE), ]
|
||||
colnames(X3) <- paste0(colnames(X3), "_X3")
|
||||
X <- cbind(X, X3)
|
||||
}
|
||||
|
||||
if (length(study_nums) > 3) {
|
||||
X4$ORF <- X4$OrfRep
|
||||
X4$ORF <- gsub("_1", "", x = X4$ORF)
|
||||
X4$ORF <- gsub("_2", "", x = X4$ORF)
|
||||
X4$ORF <- gsub("_3", "", x = X4$ORF)
|
||||
X4$ORF <- gsub("_4", "", x = X4$ORF)
|
||||
|
||||
X4$Score_L <- "No Effect"
|
||||
try(X4[is.na(X4$Z_lm_L), ]$Score_L <- "No Growth")
|
||||
try(X4[!is.na(X4$Z_lm_L) & X4$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
|
||||
try(X4[!is.na(X4$Z_lm_L) & X4$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
|
||||
|
||||
X4$Score_K <- "No Effect"
|
||||
try(X4[is.na(X4$Z_lm_K), ]$Score_K <- "No Growth")
|
||||
try(X4[!is.na(X4$Z_lm_K) & X4$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
|
||||
try(X4[!is.na(X4$Z_lm_K) & X4$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
|
||||
|
||||
# Express the na data as 0.001 in X4
|
||||
X4[is.na(X4$Z_lm_L), ]$Z_lm_L <- 0.001
|
||||
X4[is.na(X4$Z_lm_K), ]$Z_lm_K <- 0.001
|
||||
X4$Rank_L <- rank(X4$Z_lm_L)
|
||||
X4$Rank_K <- rank(X4$Z_lm_K)
|
||||
X4 <- X4[order(X4$OrfRep, decreasing = FALSE), ]
|
||||
colnames(X4) <- paste0(colnames(X4), "_X4")
|
||||
X <- cbind(X, X4)
|
||||
}
|
||||
|
||||
if (length(study_nums) > 4) {
|
||||
X5$ORF <- X5$OrfRep
|
||||
X5$ORF <- gsub("_1", "", x = X5$ORF)
|
||||
X5$ORF <- gsub("_2", "", x = X5$ORF)
|
||||
X5$ORF <- gsub("_3", "", x = X5$ORF)
|
||||
X5$ORF <- gsub("_4", "", x = X5$ORF)
|
||||
|
||||
X5$Score_L <- "No Effect"
|
||||
try(X5[is.na(X5$Z_lm_L), ]$Score_L <- "No Growth")
|
||||
try(X5[!is.na(X5$Z_lm_L) & X5$Z_lm_L >= 2, ]$Score_L <- "Deletion Enhancer")
|
||||
try(X5[!is.na(X5$Z_lm_L) & X5$Z_lm_L <= -2, ]$Score_L <- "Deletion Suppressor")
|
||||
|
||||
X5$Score_K <- "No Effect"
|
||||
try(X5[is.na(X5$Z_lm_K), ]$Score_K <- "No Growth")
|
||||
try(X5[!is.na(X5$Z_lm_K) & X5$Z_lm_K >= 2, ]$Score_K <- "Deletion Suppressor")
|
||||
try(X5[!is.na(X5$Z_lm_K) & X5$Z_lm_K <= -2, ]$Score_K <- "Deletion Enhancer")
|
||||
|
||||
# Express the na data as 0.001 in X5
|
||||
X5[is.na(X5$Z_lm_L), ]$Z_lm_L <- 0.001
|
||||
X5[is.na(X5$Z_lm_K), ]$Z_lm_K <- 0.001
|
||||
X5$Rank_L <- rank(X5$Z_lm_L)
|
||||
X5$Rank_K <- rank(X5$Z_lm_K)
|
||||
X5 <- X5[order(X5$OrfRep, decreasing = FALSE), ]
|
||||
colnames(X5) <- paste0(colnames(X5), "_X5")
|
||||
X <- cbind(X, X5)
|
||||
}
|
||||
|
||||
X$ORF <- X$OrfRep_X1
|
||||
|
||||
if (length(study_nums) > 1) {
|
||||
X$ORF <- gsub("_1", "", x = X$ORF)
|
||||
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
|
||||
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
|
||||
X_heatmap <-
|
||||
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
|
||||
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
|
||||
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
|
||||
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
|
||||
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2"]
|
||||
|
||||
X_heatmap <- X_heatmap[, c(10, 1, 4, 5, 8, 9, 2, 3, 6, 7)]
|
||||
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
|
||||
colnames(X_heatmap)[2] <- "Gene"
|
||||
}
|
||||
|
||||
if (length(study_nums) > 2) {
|
||||
X$ORF <- gsub("_1", "", x = X$ORF)
|
||||
X$ORF <- gsub("_2", "", x = X$ORF)
|
||||
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
|
||||
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
|
||||
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
|
||||
|
||||
X_heatmap <-
|
||||
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
|
||||
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
|
||||
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
|
||||
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
|
||||
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
|
||||
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
|
||||
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3"]
|
||||
|
||||
# Reorder columns
|
||||
X_heatmap <- X_heatmap[, c(14, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11)]
|
||||
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
|
||||
colnames(X_heatmap)[2] <- "Gene"
|
||||
}
|
||||
|
||||
if (length(study_nums) > 3) {
|
||||
X$ORF <- gsub("_1", "", x = X$ORF)
|
||||
X$ORF <- gsub("_2", "", x = X$ORF)
|
||||
X$ORF <- gsub("_3", "", x = X$ORF)
|
||||
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
|
||||
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
|
||||
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
|
||||
try(X[X$Gene_X4 == "", ]$Gene_X4 <- X[X$Gene_X4 == "", ]$OrfRep_X4)
|
||||
|
||||
X_heatmap <-
|
||||
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
|
||||
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
|
||||
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
|
||||
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
|
||||
colnames(X) == "Z_Shift_K_X4" | colnames(X) == "Z_lm_K_X4" |
|
||||
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
|
||||
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
|
||||
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3" |
|
||||
colnames(X) == "Z_Shift_L_X4" | colnames(X) == "Z_lm_L_X4"]
|
||||
|
||||
# Reorder columns
|
||||
X_heatmap <- X_heatmap[, c(18, 1, 4, 5, 8, 9, 12, 13, 16, 17, 2, 3, 6, 7, 10, 11, 14, 15)]
|
||||
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X4", replacement = Name4, colnames(X_heatmap))
|
||||
colnames(X_heatmap)[2] <- "Gene"
|
||||
}
|
||||
|
||||
if (length(study_nums) > 4) {
|
||||
X$ORF <- gsub("_1", "", x = X$ORF)
|
||||
X$ORF <- gsub("_2", "", x = X$ORF)
|
||||
X$ORF <- gsub("_3", "", x = X$ORF)
|
||||
X$ORF <- gsub("_4", "", x = X$ORF)
|
||||
try(X[X$Gene_X1 == "", ]$Gene_X1 <- X[X$Gene_X1 == "", ]$OrfRep_X1)
|
||||
try(X[X$Gene_X2 == "", ]$Gene_X2 <- X[X$Gene_X2 == "", ]$OrfRep_X2)
|
||||
try(X[X$Gene_X3 == "", ]$Gene_X3 <- X[X$Gene_X3 == "", ]$OrfRep_X3)
|
||||
try(X[X$Gene_X4 == "", ]$Gene_X4 <- X[X$Gene_X4 == "", ]$OrfRep_X4)
|
||||
try(X[X$Gene_X5 == "", ]$Gene_X5 <- X[X$Gene_X5 == "", ]$OrfRep_X5)
|
||||
|
||||
X_heatmap <-
|
||||
X[colnames(X) == "ORF" | colnames(X) == "Gene_X1" |
|
||||
colnames(X) == "Z_Shift_K_X1" | colnames(X) == "Z_lm_K_X1" |
|
||||
colnames(X) == "Z_Shift_K_X2" | colnames(X) == "Z_lm_K_X2" |
|
||||
colnames(X) == "Z_Shift_K_X3" | colnames(X) == "Z_lm_K_X3" |
|
||||
colnames(X) == "Z_Shift_K_X4" | colnames(X) == "Z_lm_K_X4" |
|
||||
colnames(X) == "Z_Shift_K_X5" | colnames(X) == "Z_lm_K_X5" |
|
||||
colnames(X) == "Z_Shift_L_X1" | colnames(X) == "Z_lm_L_X1" |
|
||||
colnames(X) == "Z_Shift_L_X2" | colnames(X) == "Z_lm_L_X2" |
|
||||
colnames(X) == "Z_Shift_L_X3" | colnames(X) == "Z_lm_L_X3" |
|
||||
colnames(X) == "Z_Shift_L_X4" | colnames(X) == "Z_lm_L_X4" |
|
||||
colnames(X) == "Z_Shift_L_X5" | colnames(X) == "Z_lm_L_X5"]
|
||||
|
||||
# Reorder columns
|
||||
X_heatmap <- X_heatmap[, c(22, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19)]
|
||||
colnames(X_heatmap) <- gsub(pattern = "X1", replacement = Name1, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X2", replacement = Name2, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X3", replacement = Name3, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X4", replacement = Name4, colnames(X_heatmap))
|
||||
colnames(X_heatmap) <- gsub(pattern = "X5", replacement = Name5, colnames(X_heatmap))
|
||||
colnames(X_heatmap)[2] <- "Gene"
|
||||
}
|
||||
|
||||
# Theme elements for plots
|
||||
theme_Publication <- function(base_size = 14, base_family = "sans") {
|
||||
(theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.text = element_text(),
|
||||
axis.line = element_line(colour = "black"),
|
||||
axis.ticks = element_line(),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "bottom",
|
||||
legend.direction = "horizontal",
|
||||
legend.key.size = unit(0.2, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_Publication <- function(...) {
|
||||
library(scales)
|
||||
discrete_scale(
|
||||
"fill",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
scale_colour_Publication <- function(...) {
|
||||
discrete_scale(
|
||||
"colour",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") {
|
||||
(theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.text = element_text(),
|
||||
axis.line = element_line(colour = "black"),
|
||||
axis.ticks = element_line(),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = "right",
|
||||
legend.direction = "vertical",
|
||||
legend.key.size = unit(0.5, "cm"),
|
||||
legend.spacing = unit(0, "cm"),
|
||||
legend.title = element_text(face = "italic"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_Publication <- function(...) {
|
||||
discrete_scale(
|
||||
"fill",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
scale_colour_Publication <- function(...) {
|
||||
discrete_scale(
|
||||
"colour",
|
||||
"Publication",
|
||||
manual_pal(values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")),
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
Ontology <- get_ontology(file = ontology_file, propagate_relationships = "is_a", extract_tags = "minimal")
|
||||
print(Ontology)
|
||||
|
||||
# All ORFs associated with GO term
|
||||
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
|
||||
|
||||
# Terms is the GO term list jwr moved up to TAABLES
|
||||
Terms <- read.delim(file = sgd_terms_tfile,
|
||||
header = FALSE,
|
||||
quote = "",
|
||||
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")
|
||||
)
|
||||
|
||||
colormapbreaks <- c(-12, -10, -8, -6, -4, -2, 2, 4, 6, 8, 10, 12)
|
||||
|
||||
for (s in 1:dim(XX3)[1]) {
|
||||
# Ontology <-
|
||||
# get_ontology(file = "Documents/Hartman_Lab/SGD_Downloads/gene_ontology_edit.obo",
|
||||
# propagate_relationships = "is_a", extract_tags = "minimal")
|
||||
# Ontology_Everything <-
|
||||
# get_ontology(file = "Documents/Hartman_Lab/SGD_Downloads/gene_ontology_edit.obo",
|
||||
# propagate_relationships = "is_a", extract_tags = "everything")
|
||||
#
|
||||
# GO_ID_Arg <- "GO:0006325"
|
||||
GO_ID_Arg_loop <- as.character(XX3[s, 1])
|
||||
GOTerm_parent <- get_descendants(Ontology, roots = GO_ID_Arg_loop)
|
||||
# GOTerm_parent <- get_descendants(Ontology,roots = "GO:0006325")
|
||||
# Only make plots if parent term has fewer than 500 children
|
||||
Parent_Size <- length(as.vector(GO2ALLORFs[GO_ID_Arg_loop][[1]]))
|
||||
|
||||
if (length(GOTerm_parent) > 100) {
|
||||
next()
|
||||
}
|
||||
|
||||
Parent_Size <- length(as.vector(GO2ALLORFs[GO_ID_Arg_loop][[1]])) # TODO why twice?
|
||||
|
||||
if (Parent_Size < 2) {
|
||||
next()
|
||||
}
|
||||
|
||||
if (Parent_Size > 2000) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 45,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.5, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 1000 && Parent_Size <= 2000) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 35,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.6, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 500 && Parent_Size <= 1000) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 30,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.6, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 200 && Parent_Size <= 500) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 25,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 100 && Parent_Size <= 200) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 20,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 60 && Parent_Size <= 100) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 15,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 30 && Parent_Size <= 60) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 10,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size >= 3 && Parent_Size <= 30) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 7,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
|
||||
if (Parent_Size == 2) {
|
||||
|
||||
pdf(
|
||||
file = file.path(output_dir, paste0(XX3[s, 2], ".pdf")),
|
||||
width = 12,
|
||||
height = 7,
|
||||
onefile = TRUE
|
||||
)
|
||||
|
||||
for (i in 1:length(GOTerm_parent)) {
|
||||
GO_Term <- GOTerm_parent[i]
|
||||
GO_Term_Num <- as.integer(str_split_fixed(as.character(GO_Term), "\\:", 2)[, 2])
|
||||
GO_Term_Name <- as.character(Terms[Terms$GO_ID == GO_Term_Num, ]$GO_Term)
|
||||
# Genes_Annotated_to_Term <- Gene_Association[Gene_Association$GO_ID == GO_Term, ]
|
||||
All_Genes_Annotated_to_Term <- as.vector(GO2ALLORFs[GO_Term][[1]])
|
||||
Genes_Annotated_to_Term <- X_heatmap[X_heatmap$ORF %in% All_Genes_Annotated_to_Term, ]
|
||||
X0 <- as.matrix(Genes_Annotated_to_Term[, 3:dim(Genes_Annotated_to_Term)[2]])
|
||||
if (dim(Genes_Annotated_to_Term)[1] > 2) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
if (dim(Genes_Annotated_to_Term)[1] <= 2 && dim(Genes_Annotated_to_Term)[1] > 0) {
|
||||
try(heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "none", cexCol = 0.7, cexRow = 0.7, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = c(2, 4, 6), sepcolor = "white", offsetCol = 0.1,
|
||||
ylab = "Gene",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.5, key = TRUE,
|
||||
keysize = 0.5, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = GO_Term_Name,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(Genes_Annotated_to_Term$Gene)
|
||||
))
|
||||
}
|
||||
}
|
||||
dev.off()
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
90
qhtcp-workflow/apps/r/addShiftVals.R
Normal file
90
qhtcp-workflow/apps/r/addShiftVals.R
Normal file
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env Rscript
|
||||
# This script will add the shift data to the finalTable.csv file
|
||||
#
|
||||
# May want to reorder columns in excel before making heatmaps - otherwise all the shift data will be plotted next to each other.
|
||||
|
||||
library(plyr)
|
||||
library(dplyr)
|
||||
library(sos)
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
if (length(args) >= 1) {
|
||||
finalTable <- file.path(args[1])
|
||||
} else {
|
||||
finalTable <- "REMcRdy_lm_only.csv-finalTable.csv" # for legacy workflow
|
||||
}
|
||||
|
||||
if (length(args) >= 2) {
|
||||
shiftFile <- file.path(args[2])
|
||||
} else {
|
||||
shiftFile <- "Shift_only.csv" # for legacy workflow
|
||||
}
|
||||
|
||||
if (length(args) >= 3) {
|
||||
studyInfo <- file.path(args[3])
|
||||
} else {
|
||||
studyInfo <- "../Code/StudyInfo.csv" # for legacy workflow
|
||||
}
|
||||
|
||||
if (length(args) >= 4) {
|
||||
output <- file.path(args[4])
|
||||
} else {
|
||||
output <- "REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
|
||||
}
|
||||
|
||||
# Read in the REMc finalTable data
|
||||
X <- data.frame(read.csv(file = finalTable, header = TRUE, stringsAsFactors = FALSE))
|
||||
# Read in the shift data From ../JoinInteractions
|
||||
Y <- data.frame(read.csv(file = shiftFile, header = TRUE, stringsAsFactors = FALSE))
|
||||
Labels <- read.delim(studyInfo, skip = 0, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
||||
|
||||
# Determine the number of cols - needed to create the correct number of new cols
|
||||
Xcolnum <- length(X[1, ])
|
||||
ADDnum <- Xcolnum + length(Y[1, ]) - 2
|
||||
|
||||
# Create new columns filled with NAs to be filled with data
|
||||
Xtemp <- X
|
||||
Xtemp[, (Xcolnum + 1):ADDnum] <- NA
|
||||
|
||||
# Match the orf names in each row to a orf name in the shift data file and then add the shift data to the finalTable file
|
||||
shiftTbl < - as.data.frame(matrix(nrow = 1, ncol = length(Y) - 2)) #the df shiftTbl must be initialized before for loop
|
||||
|
||||
for (i in 1:length(X[, 1])) {
|
||||
Shiftrownum <- match(X[i, 2], Y[, 1])
|
||||
shiftTbl[i, ] <- Y[Shiftrownum, 3:length(Y[1, ])]
|
||||
Xtemp[i, (Xcolnum + 1):ADDnum] <- Y[Shiftrownum, 3:length(Y[1, ])]
|
||||
}
|
||||
headerX <- colnames(Xtemp)
|
||||
headerY <- colnames(Y)
|
||||
shfHdr <- headerY[3:length(headerY)]
|
||||
combTbl <- X[, 1:3]
|
||||
lmTbl <- select(Xtemp, contains("Z_lm")) #X[,(4:Xcolnum-2)]
|
||||
shiftTbl <- select(Xtemp, contains("V"))
|
||||
clustTbl <- select(Xtemp, contains("cluster."))
|
||||
|
||||
# Give the new column names the same names as in the shift file
|
||||
Xcols <- colnames(X)
|
||||
Ycols <- colnames(Y)[3:length(Y[1, ])]
|
||||
newCols <- c(Xcols[1:Xcolnum], Ycols)
|
||||
|
||||
# Reorder columns for generating heatmaps
|
||||
combI <- combTbl #Starting Template orf, Genename columns
|
||||
headersRemc <- newCols #colnames(X)
|
||||
newHeaders <- newCols[1:3]
|
||||
lmHdr <- colnames(lmTbl) #newCols[4:(length(Xcols)-2)]
|
||||
clstHdr <- colnames(clustTbl) #select(newCols, contains('cluster.')) #newCols[3+length(lmHdr):2]
|
||||
|
||||
intLvHdr <- vector()
|
||||
#Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
|
||||
for (i in 1:(length(shiftTbl[1, ]))) {
|
||||
combI <- cbind.data.frame(combI, shiftTbl[i])
|
||||
combI <- cbind.data.frame(combI, lmTbl[i])
|
||||
intLvHdrx <- c(shfHdr[i], lmHdr[i])
|
||||
intLvHdr <- c(intLvHdr, intLvHdrx)
|
||||
}
|
||||
|
||||
combIHdr <- c(colnames(combTbl), intLvHdr, clstHdr)
|
||||
combI <- cbind.data.frame(combI, clustTbl)
|
||||
colnames(combI) <- combIHdr
|
||||
write.csv(combI, file = output, row.names = FALSE)
|
||||
945
qhtcp-workflow/apps/r/calculate_interaction_zscores.R
Normal file
945
qhtcp-workflow/apps/r/calculate_interaction_zscores.R
Normal file
@@ -0,0 +1,945 @@
|
||||
suppressMessages({
|
||||
library(ggplot2)
|
||||
library(plotly)
|
||||
library(htmlwidgets)
|
||||
library(dplyr)
|
||||
library(ggthemes)
|
||||
library(data.table)
|
||||
library(unix)
|
||||
})
|
||||
|
||||
options(warn = 2, max.print = 1000)
|
||||
options(width = 10000)
|
||||
|
||||
# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
|
||||
soft_limit <- 30 * 1024 * 1024 * 1024
|
||||
hard_limit <- 30 * 1024 * 1024 * 1024
|
||||
rlimit_as(soft_limit, hard_limit)
|
||||
|
||||
# Constants for configuration
|
||||
plot_width <- 14
|
||||
plot_height <- 9
|
||||
base_size <- 14
|
||||
|
||||
parse_arguments <- function() {
|
||||
args <- if (interactive()) {
|
||||
c(
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD",
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt",
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
||||
"Experiment 1: Doxo versus HLD",
|
||||
3,
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
||||
"Experiment 2: HLD versus Doxo",
|
||||
3
|
||||
)
|
||||
} else {
|
||||
commandArgs(trailingOnly = TRUE)
|
||||
}
|
||||
|
||||
# Extract paths, names, and standard deviations
|
||||
paths <- args[seq(4, length(args), by = 3)]
|
||||
names <- args[seq(5, length(args), by = 3)]
|
||||
sds <- as.numeric(args[seq(6, length(args), by = 3)])
|
||||
|
||||
# Normalize paths
|
||||
normalized_paths <- normalizePath(paths, mustWork = FALSE)
|
||||
|
||||
# Create named list of experiments
|
||||
experiments <- list()
|
||||
for (i in seq_along(paths)) {
|
||||
experiments[[names[i]]] <- list(
|
||||
path = normalized_paths[i],
|
||||
sd = sds[i]
|
||||
)
|
||||
}
|
||||
|
||||
list(
|
||||
out_dir = normalizePath(args[1], mustWork = FALSE),
|
||||
sgd_gene_list = normalizePath(args[2], mustWork = FALSE),
|
||||
easy_results_file = normalizePath(args[3], mustWork = FALSE),
|
||||
experiments = experiments
|
||||
)
|
||||
}
|
||||
|
||||
args <- parse_arguments()
|
||||
|
||||
# Should we keep output in exp dirs or combine in the study output dir?
|
||||
# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
||||
# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
||||
|
||||
# Define themes and scales
|
||||
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
|
||||
theme_foundation <- ggplot2::theme_grey(base_size = base_size, base_family = base_family)
|
||||
|
||||
theme_foundation %+replace%
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
text = element_text(),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.title.y = element_text(angle = 90, vjust = 2),
|
||||
axis.title.x = element_text(vjust = -0.2),
|
||||
axis.line = element_line(colour = "black"),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.key = element_rect(colour = NA),
|
||||
legend.position = legend_position,
|
||||
legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_publication <- function(...) {
|
||||
discrete_scale("fill", "Publication", manual_pal(values = c(
|
||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
||||
)), ...)
|
||||
}
|
||||
|
||||
scale_colour_publication <- function(...) {
|
||||
discrete_scale("colour", "Publication", manual_pal(values = c(
|
||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
||||
)), ...)
|
||||
}
|
||||
|
||||
# Load the initial dataframe from the easy_results_file
|
||||
load_and_process_data <- function(easy_results_file, sd = 3) {
|
||||
df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
||||
|
||||
df <- df %>%
|
||||
filter(!(.[[1]] %in% c("", "Scan"))) %>%
|
||||
filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>%
|
||||
# Rename columns
|
||||
rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>%
|
||||
mutate(
|
||||
across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric),
|
||||
delta_bg = last_bg - first_bg,
|
||||
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)),
|
||||
NG = if_else(L == 0 & !is.na(L), 1, 0),
|
||||
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
|
||||
SM = 0,
|
||||
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
|
||||
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
|
||||
conc_num_factor = as.numeric(as.factor(conc_num)) - 1
|
||||
)
|
||||
|
||||
return(df)
|
||||
}
|
||||
|
||||
# Update Gene names using the SGD gene list
|
||||
update_gene_names <- function(df, sgd_gene_list) {
|
||||
# Load SGD gene list
|
||||
genes <- read.delim(file = sgd_gene_list,
|
||||
quote = "", header = FALSE,
|
||||
colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11)))
|
||||
|
||||
# Create a named vector for mapping ORF to GeneName
|
||||
gene_map <- setNames(genes$V5, genes$V4)
|
||||
# Vectorized match to find the GeneName from gene_map
|
||||
mapped_genes <- gene_map[df$ORF]
|
||||
# Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found)
|
||||
updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes)
|
||||
# Ensure Gene is not left blank or incorrectly updated to "OCT1"
|
||||
df <- df %>%
|
||||
mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes))
|
||||
|
||||
return(df)
|
||||
}
|
||||
|
||||
# Calculate summary statistics for all variables
|
||||
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
|
||||
df <- df %>%
|
||||
mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .)))
|
||||
|
||||
summary_stats <- df %>%
|
||||
group_by(across(all_of(group_vars))) %>%
|
||||
summarise(
|
||||
N = sum(!is.na(L)),
|
||||
across(all_of(variables), list(
|
||||
mean = ~mean(., na.rm = TRUE),
|
||||
median = ~median(., na.rm = TRUE),
|
||||
max = ~max(., na.rm = TRUE),
|
||||
min = ~min(., na.rm = TRUE),
|
||||
sd = ~sd(., na.rm = TRUE),
|
||||
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
|
||||
), .names = "{.fn}_{.col}")
|
||||
)
|
||||
|
||||
df_cleaned <- df %>%
|
||||
select(-any_of(names(summary_stats)))
|
||||
|
||||
df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
|
||||
|
||||
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
|
||||
}
|
||||
|
||||
|
||||
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
|
||||
|
||||
# Calculate total concentration variables
|
||||
total_conc_num <- length(unique(df$conc_num))
|
||||
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
|
||||
|
||||
# Pull the background means and standard deviations from zero concentration
|
||||
bg_means <- list(
|
||||
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
|
||||
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
|
||||
r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(),
|
||||
AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
|
||||
)
|
||||
|
||||
bg_sd <- list(
|
||||
L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(),
|
||||
K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(),
|
||||
r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(),
|
||||
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
|
||||
)
|
||||
|
||||
interaction_scores <- df %>%
|
||||
mutate(
|
||||
WT_L = df$mean_L,
|
||||
WT_K = df$mean_K,
|
||||
WT_r = df$mean_r,
|
||||
WT_AUC = df$mean_AUC,
|
||||
WT_sd_L = df$sd_L,
|
||||
WT_sd_K = df$sd_K,
|
||||
WT_sd_r = df$sd_r,
|
||||
WT_sd_AUC = df$sd_AUC
|
||||
) %>%
|
||||
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
|
||||
mutate(
|
||||
N = sum(!is.na(L)),
|
||||
NG = sum(NG, na.rm = TRUE),
|
||||
DB = sum(DB, na.rm = TRUE),
|
||||
SM = sum(SM, na.rm = TRUE),
|
||||
across(all_of(variables), list(
|
||||
mean = ~mean(., na.rm = TRUE),
|
||||
median = ~median(., na.rm = TRUE),
|
||||
max = ~max(., na.rm = TRUE),
|
||||
min = ~min(., na.rm = TRUE),
|
||||
sd = ~sd(., na.rm = TRUE),
|
||||
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
|
||||
), .names = "{.fn}_{.col}")
|
||||
) %>%
|
||||
ungroup()
|
||||
|
||||
interaction_scores <- interaction_scores %>%
|
||||
group_by(across(all_of(group_vars))) %>%
|
||||
mutate(
|
||||
Raw_Shift_L = mean_L[[1]] - bg_means$L,
|
||||
Raw_Shift_K = mean_K[[1]] - bg_means$K,
|
||||
Raw_Shift_r = mean_r[[1]] - bg_means$r,
|
||||
Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC,
|
||||
Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]],
|
||||
Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]],
|
||||
Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]],
|
||||
Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]]
|
||||
)
|
||||
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
Exp_L = WT_L + Raw_Shift_L,
|
||||
Delta_L = mean_L - Exp_L,
|
||||
Exp_K = WT_K + Raw_Shift_K,
|
||||
Delta_K = mean_K - Exp_K,
|
||||
Exp_r = WT_r + Raw_Shift_r,
|
||||
Delta_r = mean_r - Exp_r,
|
||||
Exp_AUC = WT_AUC + Raw_Shift_AUC,
|
||||
Delta_AUC = mean_AUC - Exp_AUC
|
||||
)
|
||||
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
|
||||
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
|
||||
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
|
||||
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
|
||||
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
|
||||
)
|
||||
|
||||
# Calculate linear models and interaction scores
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
lm_L = lm(Delta_L ~ conc_num_factor),
|
||||
lm_K = lm(Delta_K ~ conc_num_factor),
|
||||
lm_r = lm(Delta_r ~ conc_num_factor),
|
||||
lm_AUC = lm(Delta_AUC ~ conc_num_factor),
|
||||
Zscore_L = Delta_L / WT_sd_L,
|
||||
Zscore_K = Delta_K / WT_sd_K,
|
||||
Zscore_r = Delta_r / WT_sd_r,
|
||||
Zscore_AUC = Delta_AUC / WT_sd_AUC
|
||||
)
|
||||
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
|
||||
Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
|
||||
Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
|
||||
Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE)
|
||||
)
|
||||
|
||||
interaction_scores_all <- interaction_scores %>%
|
||||
mutate(
|
||||
Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
|
||||
Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
|
||||
Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
|
||||
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1),
|
||||
lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
|
||||
lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
|
||||
lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
|
||||
lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
|
||||
r_squared_L = summary(lm_L)$r.squared,
|
||||
r_squared_K = summary(lm_K)$r.squared,
|
||||
r_squared_r = summary(lm_r)$r.squared,
|
||||
r_squared_AUC = summary(lm_AUC)$r.squared
|
||||
)
|
||||
|
||||
# Calculate Z_lm for each variable
|
||||
interaction_scores_all <- interaction_scores_all %>%
|
||||
mutate(
|
||||
Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE),
|
||||
Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE),
|
||||
Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
|
||||
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
|
||||
)
|
||||
|
||||
# Arrange results by Z_lm_L and NG
|
||||
interaction_scores_all <- interaction_scores_all %>%
|
||||
arrange(desc(lm_Score_L)) %>%
|
||||
arrange(desc(NG)) %>%
|
||||
ungroup()
|
||||
|
||||
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
|
||||
}
|
||||
|
||||
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
|
||||
|
||||
`%||%` <- function(a, b) if (!is.null(a)) a else b
|
||||
|
||||
# Generalized plot generation
|
||||
plots <- lapply(plot_configs, function(config) {
|
||||
|
||||
df <- config$df
|
||||
plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var))))
|
||||
|
||||
# Rank plots with SD annotations
|
||||
if (config$plot_type == "rank") {
|
||||
plot <- plot + geom_point(size = 0.1, shape = 3)
|
||||
|
||||
# Adding SD bands
|
||||
if (!is.null(config$sd_band)) {
|
||||
for (i in seq_len(config$sd_band)) {
|
||||
plot <- plot +
|
||||
annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) +
|
||||
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) +
|
||||
geom_hline(yintercept = c(-i, i), color = "gray")
|
||||
}
|
||||
}
|
||||
|
||||
# Optionally add enhancer/suppressor count annotations
|
||||
if (!is.null(config$enhancer_label)) {
|
||||
plot <- plot + annotate("text", x = config$enhancer_label$x,
|
||||
y = config$enhancer_label$y, label = config$enhancer_label$label) +
|
||||
annotate("text", x = config$suppressor_label$x,
|
||||
y = config$suppressor_label$y, label = config$suppressor_label$label)
|
||||
}
|
||||
}
|
||||
|
||||
# Correlation plots
|
||||
if (config$plot_type == "correlation") {
|
||||
plot <- plot + geom_point(shape = 3, color = "gray70") +
|
||||
geom_smooth(method = "lm", color = "tomato3") +
|
||||
annotate("text", x = 0, y = 0, label = config$correlation_text)
|
||||
}
|
||||
|
||||
# General scatter/boxplot/density handling
|
||||
if (!is.null(config$y_var)) {
|
||||
plot <- plot + aes(y = !!sym(config$y_var))
|
||||
|
||||
y_mean_col <- paste0("mean_", config$y_var)
|
||||
y_sd_col <- paste0("sd_", config$y_var)
|
||||
|
||||
if (config$y_var == "delta_bg" && config$plot_type == "scatter") {
|
||||
plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
|
||||
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
|
||||
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
|
||||
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
|
||||
} else if (config$error_bar %||% FALSE) {
|
||||
plot <- plot +
|
||||
geom_point(shape = 3, size = 0.2) +
|
||||
geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
|
||||
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
|
||||
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
|
||||
}
|
||||
}
|
||||
|
||||
# Plot type selection
|
||||
plot <- switch(config$plot_type,
|
||||
"box" = plot + geom_boxplot(),
|
||||
"density" = plot + geom_density(),
|
||||
"bar" = plot + geom_bar(stat = "identity"),
|
||||
plot + geom_point() + geom_smooth(method = "lm", se = FALSE))
|
||||
|
||||
# Adding y-limits if provided
|
||||
if (!is.null(config$ylim_vals)) {
|
||||
plot <- plot + coord_cartesian(ylim = config$ylim_vals)
|
||||
}
|
||||
|
||||
# Setting legend position, titles, and labels
|
||||
legend_position <- config$legend_position %||% "bottom"
|
||||
plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position)
|
||||
|
||||
if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label)
|
||||
if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
|
||||
|
||||
# Adding text annotations if provided
|
||||
if (!is.null(config$annotations)) {
|
||||
for (annotation in config$annotations) {
|
||||
plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
|
||||
}
|
||||
}
|
||||
|
||||
return(plot)
|
||||
})
|
||||
|
||||
# If grid_layout is provided, arrange plots in a grid and save in a single PDF
|
||||
if (!is.null(grid_layout)) {
|
||||
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
|
||||
|
||||
# Loop through plots in chunks defined by ncol and nrow
|
||||
for (start_idx in seq(1, length(plots), by = grid_layout$ncol * grid_layout$nrow)) {
|
||||
end_idx <- min(start_idx + grid_layout$ncol * grid_layout$nrow - 1, length(plots))
|
||||
plot_subset <- plots[start_idx:end_idx]
|
||||
|
||||
# Arrange plots in a grid
|
||||
do.call(grid.arrange, c(plot_subset, ncol = grid_layout$ncol, nrow = grid_layout$nrow))
|
||||
}
|
||||
|
||||
dev.off()
|
||||
|
||||
# Save as HTML (optional)
|
||||
plotly_plots <- lapply(plots, function(plot) {
|
||||
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
|
||||
})
|
||||
combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow, margin = 0.05)
|
||||
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, "_grid.html")), selfcontained = TRUE)
|
||||
|
||||
} else {
|
||||
# Save individual plots to PDF
|
||||
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
|
||||
lapply(plots, print)
|
||||
dev.off()
|
||||
|
||||
# Convert plots to plotly and save as HTML
|
||||
plotly_plots <- lapply(plots, function(plot) {
|
||||
suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h")))
|
||||
})
|
||||
combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05)
|
||||
saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE)
|
||||
}
|
||||
}
|
||||
|
||||
generate_interaction_plot_configs <- function(df, variables) {
|
||||
plot_configs <- list()
|
||||
|
||||
for (variable in variables) {
|
||||
# Define the y-limits based on the variable being plotted
|
||||
ylim_vals <- switch(variable,
|
||||
"L" = c(-65, 65),
|
||||
"K" = c(-65, 65),
|
||||
"r" = c(-0.65, 0.65),
|
||||
"AUC" = c(-6500, 6500)
|
||||
)
|
||||
|
||||
# Dynamically generate the column names for standard deviation and delta
|
||||
wt_sd_col <- paste0("WT_sd_", variable)
|
||||
delta_var <- paste0("Delta_", variable)
|
||||
z_shift <- paste0("Z_Shift_", variable)
|
||||
z_lm <- paste0("Z_lm_", variable)
|
||||
|
||||
# Set annotations for ZShift, Z lm Score, NG, DB, SM
|
||||
annotations <- list(
|
||||
list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45,
|
||||
ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))),
|
||||
list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25,
|
||||
ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))),
|
||||
list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25,
|
||||
ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)),
|
||||
list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35,
|
||||
ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)),
|
||||
list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45,
|
||||
ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM))
|
||||
)
|
||||
|
||||
# Add scatter plot configuration for this variable
|
||||
plot_configs[[length(plot_configs) + 1]] <- list(
|
||||
df = df,
|
||||
x_var = "conc_num_factor",
|
||||
y_var = delta_var,
|
||||
plot_type = "scatter",
|
||||
title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]),
|
||||
ylim_vals = ylim_vals,
|
||||
annotations = annotations,
|
||||
error_bar = list(
|
||||
ymin = 0 - (2 * df[[wt_sd_col]][1]),
|
||||
ymax = 0 + (2 * df[[wt_sd_col]][1])
|
||||
),
|
||||
x_breaks = unique(df$conc_num_factor),
|
||||
x_labels = unique(as.character(df$conc_num)),
|
||||
x_label = unique(df$Drug[1])
|
||||
)
|
||||
|
||||
# Add box plot configuration for this variable
|
||||
plot_configs[[length(plot_configs) + 1]] <- list(
|
||||
df = df,
|
||||
x_var = "conc_num_factor",
|
||||
y_var = variable,
|
||||
plot_type = "box",
|
||||
title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]),
|
||||
ylim_vals = ylim_vals,
|
||||
annotations = annotations,
|
||||
error_bar = FALSE, # Boxplots typically don't need error bars
|
||||
x_breaks = unique(df$conc_num_factor),
|
||||
x_labels = unique(as.character(df$conc_num)),
|
||||
x_label = unique(df$Drug[1])
|
||||
)
|
||||
}
|
||||
|
||||
return(plot_configs)
|
||||
}
|
||||
|
||||
generate_rank_plot_configs <- function(df, rank_var, zscore_var, label_prefix) {
|
||||
configs <- list()
|
||||
|
||||
for (sd_band in c(1, 2, 3)) {
|
||||
# Annotated version
|
||||
configs[[length(configs) + 1]] <- list(
|
||||
df = df,
|
||||
x_var = rank_var,
|
||||
y_var = zscore_var,
|
||||
plot_type = "rank",
|
||||
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
|
||||
sd_band = sd_band,
|
||||
enhancer_label = list(
|
||||
x = nrow(df) / 2, y = 10,
|
||||
label = paste("Deletion Enhancers =", nrow(df[df[[zscore_var]] >= sd_band, ]))
|
||||
),
|
||||
suppressor_label = list(
|
||||
x = nrow(df) / 2, y = -10,
|
||||
label = paste("Deletion Suppressors =", nrow(df[df[[zscore_var]] <= -sd_band, ]))
|
||||
)
|
||||
)
|
||||
|
||||
# Non-annotated version
|
||||
configs[[length(configs) + 1]] <- list(
|
||||
df = df,
|
||||
x_var = rank_var,
|
||||
y_var = zscore_var,
|
||||
plot_type = "rank",
|
||||
title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"),
|
||||
sd_band = sd_band
|
||||
)
|
||||
}
|
||||
|
||||
return(configs)
|
||||
}
|
||||
|
||||
generate_correlation_plot_configs <- function(df, lm_list, lm_summaries) {
|
||||
lapply(seq_along(lm_list), function(i) {
|
||||
r_squared <- round(lm_summaries[[i]]$r.squared, 3)
|
||||
list(
|
||||
x_var = names(lm_list)[i][1],
|
||||
y_var = names(lm_list)[i][2],
|
||||
plot_type = "scatter",
|
||||
title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]),
|
||||
annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared)))
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
# Adjust missing values and calculate ranks
|
||||
adjust_missing_and_rank <- function(df, variables) {
|
||||
|
||||
# Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables
|
||||
df <- df %>%
|
||||
mutate(across(all_of(variables), list(
|
||||
Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))),
|
||||
Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))),
|
||||
Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))),
|
||||
Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column())))
|
||||
), .names = "{fn}_{col}"))
|
||||
|
||||
return(df)
|
||||
}
|
||||
|
||||
main <- function() {
|
||||
lapply(names(args$experiments), function(exp_name) {
|
||||
exp <- args$experiments[[exp_name]]
|
||||
exp_path <- exp$path
|
||||
exp_sd <- exp$sd
|
||||
out_dir <- file.path(exp_path, "zscores")
|
||||
out_dir_qc <- file.path(exp_path, "zscores", "qc")
|
||||
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
# Load and process data
|
||||
df <- load_and_process_data(args$easy_results_file, sd = exp_sd)
|
||||
df <- update_gene_names(df, args$sgd_gene_list)
|
||||
|
||||
max_conc <- max(df$conc_num_factor)
|
||||
|
||||
# QC steps and filtering
|
||||
df_above_tolerance <- df %>% filter(DB == 1)
|
||||
|
||||
# Calculate the half-medians for `L` and `K` for rows above tolerance
|
||||
L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
||||
K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
||||
|
||||
# Get the number of rows that are above tolerance
|
||||
rows_to_remove <- nrow(df_above_tolerance)
|
||||
|
||||
# Set L, r, K, and AUC to NA for rows that are above tolerance
|
||||
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
|
||||
|
||||
# Calculate summary statistics for all strains, including both background and the deletions
|
||||
message("Calculating summary statistics for all strains")
|
||||
variables <- c("L", "K", "r", "AUC")
|
||||
ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
||||
summary_stats <- ss$summary_stats
|
||||
df_na_stats <- ss$df_with_stats
|
||||
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
|
||||
|
||||
print("Summary stats:")
|
||||
print(head(summary_stats), width = 200)
|
||||
|
||||
# Remove rows with 0 values in L
|
||||
df_no_zeros <- df_na %>% filter(L > 0)
|
||||
|
||||
# Additional filtering for non-finite values
|
||||
df_na_filtered <- df_na %>%
|
||||
filter(if_any(c(L), ~ !is.finite(.))) %>%
|
||||
{
|
||||
if (nrow(.) > 0) {
|
||||
message("Removing non-finite rows:\n")
|
||||
print(head(., n = 10))
|
||||
}
|
||||
df_na %>% filter(if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
# Filter data within and outside 2SD
|
||||
message("Filtering by 2SD of K")
|
||||
df_na_within_2sd_k <- df_na_stats %>%
|
||||
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
||||
df_na_outside_2sd_k <- df_na_stats %>%
|
||||
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
||||
|
||||
# Summary statistics for within and outside 2SD of K
|
||||
message("Calculating summary statistics for L within 2SD of K")
|
||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_within_2sd_k_stats <- ss$summary_stats
|
||||
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||
message("Calculating summary statistics for L outside 2SD of K")
|
||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_outside_2sd_k_stats <- ss$summary_stats
|
||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||
# Write CSV files
|
||||
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
|
||||
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
|
||||
|
||||
# Plots
|
||||
|
||||
# Print quality control graphs before removing data due to contamination and
|
||||
# adjusting missing data to max theoretical values
|
||||
l_vs_k_plots <- list(
|
||||
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
title = "Raw L vs K before QC",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
)
|
||||
)
|
||||
|
||||
above_threshold_plots <- list(
|
||||
list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"),
|
||||
color_var = "conc_num",
|
||||
annotations = list(
|
||||
list(
|
||||
x = L_half_median,
|
||||
y = K_half_median,
|
||||
label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance))
|
||||
)
|
||||
),
|
||||
error_bar = FALSE,
|
||||
legend_position = "right"
|
||||
)
|
||||
)
|
||||
|
||||
frequency_delta_bg_plots <- list(
|
||||
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
title = "Density plot for Delta Background by Conc All Data",
|
||||
color_var = "conc_num",
|
||||
x_label = "Delta Background",
|
||||
y_label = "Density",
|
||||
error_bar = FALSE,
|
||||
legend_position = "right"
|
||||
),
|
||||
list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar",
|
||||
title = "Bar plot for Delta Background by Conc All Data",
|
||||
color_var = "conc_num",
|
||||
x_label = "Delta Background",
|
||||
y_label = "Count",
|
||||
error_bar = FALSE,
|
||||
legend_position = "right"
|
||||
)
|
||||
)
|
||||
|
||||
plate_analysis_plots <- list()
|
||||
for (plot_type in c("scatter", "box")) {
|
||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
for (var in variables) {
|
||||
for (stage in c("before", "after")) {
|
||||
if (stage == "before") {
|
||||
df_plot <- df
|
||||
} else {
|
||||
df_plot <- df_na # TODO use df_na_filtered if necessary
|
||||
}
|
||||
|
||||
# Set error_bar = TRUE only for scatter plots
|
||||
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
|
||||
|
||||
# Create the plot configuration
|
||||
plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type,
|
||||
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
|
||||
error_bar = error_bar, color_var = "conc_num")
|
||||
|
||||
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
plate_analysis_no_zero_plots <- list()
|
||||
for (plot_type in c("scatter", "box")) {
|
||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||
for (var in variables) {
|
||||
|
||||
# Set error_bar = TRUE only for scatter plots
|
||||
error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE)
|
||||
|
||||
# Create the plot configuration
|
||||
plot_config <- list(
|
||||
df = df_no_zeros,
|
||||
x_var = "scan",
|
||||
y_var = var,
|
||||
plot_type = plot_type,
|
||||
title = paste("Plate analysis by Drug Conc for", var, "after quality control"),
|
||||
error_bar = error_bar,
|
||||
color_var = "conc_num"
|
||||
)
|
||||
plate_analysis_plots <- append(plate_analysis_plots, list(plot_config))
|
||||
}
|
||||
}
|
||||
|
||||
l_outside_2sd_k_plots <- list(
|
||||
list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter",
|
||||
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
)
|
||||
)
|
||||
|
||||
delta_bg_outside_2sd_k_plots <- list(
|
||||
list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
|
||||
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right"
|
||||
)
|
||||
)
|
||||
|
||||
# Generate and save plots for each QC step
|
||||
message("Generating QC plots")
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
||||
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
|
||||
|
||||
# Clean up
|
||||
rm(df, df_above_tolerance, df_no_zeros)
|
||||
|
||||
# TODO: Originally this filtered L NA's
|
||||
# Let's try to avoid for now since stats have already been calculated
|
||||
|
||||
# Process background strains
|
||||
bg_strains <- c("YDL227C")
|
||||
lapply(bg_strains, function(strain) {
|
||||
|
||||
message("Processing background strain: ", strain)
|
||||
|
||||
# Handle missing data by setting zero values to NA
|
||||
# and then removing any rows with NA in L col
|
||||
df_bg <- df_na %>%
|
||||
filter(OrfRep == strain) %>%
|
||||
mutate(
|
||||
L = if_else(L == 0, NA, L),
|
||||
K = if_else(K == 0, NA, K),
|
||||
r = if_else(r == 0, NA, r),
|
||||
AUC = if_else(AUC == 0, NA, AUC)
|
||||
) %>%
|
||||
filter(!is.na(L))
|
||||
|
||||
# Recalculate summary statistics for the background strain
|
||||
message("Calculating summary statistics for background strain")
|
||||
ss <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
||||
summary_stats_bg <- ss$summary_stats
|
||||
df_bg_stats <- ss$df_with_stats
|
||||
write.csv(summary_stats_bg,
|
||||
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
||||
row.names = FALSE)
|
||||
|
||||
# Filter reference and deletion strains
|
||||
# Formerly X2_RF (reference strains)
|
||||
df_reference <- df_na_stats %>%
|
||||
filter(OrfRep == strain) %>%
|
||||
mutate(SM = 0)
|
||||
|
||||
# Formerly X2 (deletion strains)
|
||||
df_deletion <- df_na_stats %>%
|
||||
filter(OrfRep != strain) %>%
|
||||
mutate(SM = 0)
|
||||
|
||||
# Set the missing values to the highest theoretical value at each drug conc for L
|
||||
# Leave other values as 0 for the max/min
|
||||
reference_strain <- df_reference %>%
|
||||
group_by(conc_num) %>%
|
||||
mutate(
|
||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
|
||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||
ungroup()
|
||||
|
||||
# Ditto for deletion strains
|
||||
deletion_strains <- df_deletion %>%
|
||||
group_by(conc_num) %>%
|
||||
mutate(
|
||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
|
||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||
ungroup()
|
||||
|
||||
# Calculate interactions
|
||||
variables <- c("L", "K", "r", "AUC")
|
||||
message("Calculating interaction scores")
|
||||
print("Reference strain:")
|
||||
print(head(reference_strain))
|
||||
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
|
||||
print("Deletion strains:")
|
||||
print(head(deletion_strains))
|
||||
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
|
||||
|
||||
zscores_calculations_reference <- reference_results$zscores_calculations
|
||||
zscores_interactions_reference <- reference_results$zscores_interactions
|
||||
zscores_calculations <- deletion_results$zscores_calculations
|
||||
zscores_interactions <- deletion_results$zscores_interactions
|
||||
|
||||
# Writing Z-Scores to file
|
||||
write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
|
||||
write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
|
||||
write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
|
||||
write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE)
|
||||
|
||||
# Create interaction plots
|
||||
reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables)
|
||||
deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables)
|
||||
generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
|
||||
generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3))
|
||||
|
||||
# Define conditions for enhancers and suppressors
|
||||
# TODO Add to study config file?
|
||||
threshold <- 2
|
||||
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
|
||||
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
|
||||
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
|
||||
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
|
||||
|
||||
# Subset data
|
||||
enhancers_L <- zscores_interactions[enhancer_condition_L, ]
|
||||
suppressors_L <- zscores_interactions[suppressor_condition_L, ]
|
||||
enhancers_K <- zscores_interactions[enhancer_condition_K, ]
|
||||
suppressors_K <- zscores_interactions[suppressor_condition_K, ]
|
||||
|
||||
# Save enhancers and suppressors
|
||||
message("Writing enhancer/suppressor csv files")
|
||||
write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L.csv"), row.names = FALSE)
|
||||
write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L.csv"), row.names = FALSE)
|
||||
write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K.csv"), row.names = FALSE)
|
||||
write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K.csv"), row.names = FALSE)
|
||||
|
||||
# Combine conditions for enhancers and suppressors
|
||||
enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ]
|
||||
enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ]
|
||||
|
||||
# Save combined enhancers and suppressors
|
||||
write.csv(enhancers_and_suppressors_L,
|
||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_L.csv"), row.names = FALSE)
|
||||
write.csv(enhancers_and_suppressors_K,
|
||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_K.csv"), row.names = FALSE)
|
||||
|
||||
# Handle linear model based enhancers and suppressors
|
||||
lm_threshold <- 2
|
||||
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
|
||||
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
|
||||
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
|
||||
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
|
||||
|
||||
# Save linear model based enhancers and suppressors
|
||||
message("Writing linear model enhancer/suppressor csv files")
|
||||
write.csv(enhancers_lm_L,
|
||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L_lm.csv"), row.names = FALSE)
|
||||
write.csv(suppressors_lm_L,
|
||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L_lm.csv"), row.names = FALSE)
|
||||
write.csv(enhancers_lm_K,
|
||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K_lm.csv"), row.names = FALSE)
|
||||
write.csv(suppressors_lm_K,
|
||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
|
||||
|
||||
zscores_interactions_adjusted <- adjust_missing_and_rank(zscores_interactions)
|
||||
|
||||
# Generate all rank plot configurations for L and K
|
||||
rank_plot_configs <- c(
|
||||
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_L", "Avg_Zscore_L", "L"),
|
||||
generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_K", "Avg_Zscore_K", "K")
|
||||
)
|
||||
|
||||
# Generate and save rank plots
|
||||
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots",
|
||||
plot_configs = rank_plot_config, grid_layout = list(ncol = 3, nrow = 2))
|
||||
|
||||
# # Correlation plots
|
||||
# lm_list <- list(
|
||||
# lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered),
|
||||
# lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered),
|
||||
# lm(Z_lm_AUC ~ Z_lm_L, data = zscores_interactions_filtered),
|
||||
# lm(Z_lm_r ~ Z_lm_K, data = zscores_interactions_filtered),
|
||||
# lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered),
|
||||
# lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered)
|
||||
# )
|
||||
|
||||
lm_summaries <- lapply(lm_list, summary)
|
||||
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries)
|
||||
generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs)
|
||||
})
|
||||
})
|
||||
}
|
||||
main()
|
||||
209
qhtcp-workflow/apps/r/calculate_pairwise_lk.R
Normal file
209
qhtcp-workflow/apps/r/calculate_pairwise_lk.R
Normal file
@@ -0,0 +1,209 @@
|
||||
suppressMessages({
|
||||
library("ggplot2")
|
||||
library("plotly")
|
||||
library("htmlwidgets")
|
||||
library("extrafont")
|
||||
library("grid")
|
||||
library("ggthemes")
|
||||
})
|
||||
|
||||
# Constants for configuration
|
||||
PLOT_WIDTH <- 12
|
||||
PLOT_HEIGHT <- 8
|
||||
BASE_SIZE <- 14
|
||||
|
||||
options(warn = 2, max.print = 100)
|
||||
|
||||
# Parse arguments
|
||||
parse_arguments <- function() {
|
||||
if (interactive()) {
|
||||
args <- c(
|
||||
"Exp1",
|
||||
"/path/to/exp1_file.csv",
|
||||
"Exp2",
|
||||
"/path/to/exp2_file.csv",
|
||||
"/path/to/output_dir"
|
||||
)
|
||||
} else {
|
||||
args <- commandArgs(trailingOnly = TRUE)
|
||||
}
|
||||
list(
|
||||
exp1_name = args[1],
|
||||
exp1_file = normalizePath(args[2], mustWork = TRUE),
|
||||
exp2_name = args[3],
|
||||
exp2_file = normalizePath(args[4], mustWork = TRUE),
|
||||
output_dir = normalizePath(args[5], mustWork = FALSE)
|
||||
)
|
||||
}
|
||||
|
||||
args <- parse_arguments()
|
||||
|
||||
# Create output directories if they don't exist
|
||||
pairDirL <- file.path(args$output_dir, paste0("PairwiseCompareL_", args$exp1_name, "-", args$exp2_name))
|
||||
pairDirK <- file.path(args$output_dir, paste0("PairwiseCompareK_", args$exp1_name, "-", args$exp2_name))
|
||||
dir.create(pairDirL, showWarnings = FALSE, recursive = TRUE)
|
||||
dir.create(pairDirK, showWarnings = FALSE, recursive = TRUE)
|
||||
|
||||
# Define themes and scales
|
||||
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
|
||||
theme_foundation(base_size = base_size, base_family = base_family) +
|
||||
theme(
|
||||
plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
|
||||
panel.background = element_rect(colour = NA),
|
||||
plot.background = element_rect(colour = NA),
|
||||
panel.border = element_rect(colour = NA),
|
||||
axis.title = element_text(face = "bold", size = rel(1)),
|
||||
axis.line = element_line(colour = "black"),
|
||||
panel.grid.major = element_line(colour = "#f0f0f0"),
|
||||
panel.grid.minor = element_blank(),
|
||||
legend.position = "bottom",
|
||||
legend.direction = "horizontal",
|
||||
legend.key.size = unit(0.2, "cm"),
|
||||
plot.margin = unit(c(10, 5, 5, 5), "mm"),
|
||||
strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
|
||||
strip.text = element_text(face = "bold")
|
||||
)
|
||||
}
|
||||
|
||||
theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
|
||||
theme_publication(base_size, base_family) +
|
||||
theme(
|
||||
legend.position = "right",
|
||||
legend.direction = "vertical",
|
||||
legend.key.size = unit(0.5, "cm")
|
||||
)
|
||||
}
|
||||
|
||||
scale_fill_publication <- function(...) {
|
||||
discrete_scale("fill", "Publication", manual_pal(values = c(
|
||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
||||
)), ...)
|
||||
}
|
||||
|
||||
scale_colour_publication <- function(...) {
|
||||
discrete_scale("colour", "Publication", manual_pal(values = c(
|
||||
"#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
|
||||
"#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
|
||||
)), ...)
|
||||
}
|
||||
|
||||
# Read and merge data
|
||||
load_and_merge_data <- function(file1, file2) {
|
||||
df1 <- read.csv(file = file1, stringsAsFactors = FALSE, header = TRUE)
|
||||
df2 <- read.csv(file = file2, stringsAsFactors = FALSE, header = TRUE)
|
||||
merge(df1, df2, by = "Term_Avg", all = TRUE, suffixes = c("_df1", "_df2"))
|
||||
}
|
||||
|
||||
# Generate a plot and save to PDF and HTML
|
||||
generate_plot <- function(data, x_var, y_var, color_var, title, file_path, theme_function) {
|
||||
gg <- ggplot(data = data, aes_string(
|
||||
x = x_var,
|
||||
y = y_var,
|
||||
color = color_var
|
||||
)) +
|
||||
xlab(paste("GO Term Avg lm Z for", args$exp1_name)) +
|
||||
geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) +
|
||||
geom_point(shape = 3) +
|
||||
ylab(paste("GO Term Avg lm Z for", args$exp2_name)) +
|
||||
ggtitle(title) +
|
||||
theme_function()
|
||||
|
||||
pdf(file.path(file_path, ".pdf"), width = PLOT_WIDTH, height = PLOT_HEIGHT)
|
||||
print(gg)
|
||||
dev.off()
|
||||
|
||||
pgg <- ggplotly(gg)
|
||||
htmlwidgets::saveWidget(pgg, file.path(file_path, ".html"))
|
||||
}
|
||||
|
||||
# Identify and annotate specific interactions
|
||||
annotate_interactions <- function(df, exp1_name, exp2_name, suffix) {
|
||||
df$Overlap_Avg <- NA
|
||||
interactions <- list(
|
||||
"df1_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] < 2),
|
||||
"df1_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] > -2),
|
||||
"df2_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] < 2),
|
||||
"df2_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] > -2),
|
||||
"Overlap_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2),
|
||||
"Overlap_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2),
|
||||
"df2_Aggravators_df1_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2),
|
||||
"df2_Alleviators_df1_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2)
|
||||
)
|
||||
annotation <- list(
|
||||
df1_Specific_Aggravators = paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"),
|
||||
df1_Specific_Alleviators = paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"),
|
||||
df2_Specific_Aggravators = paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"),
|
||||
df2_Specific_Alleviators = paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"),
|
||||
Overlap_Aggravators = "Overlapping_Deletion_Enhancers",
|
||||
Overlap_Alleviators = "Overlapping_Deletion_Suppressors",
|
||||
df2_Aggravators_df1_Alleviators = paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"),
|
||||
df2_Alleviators_df1_Aggravators = paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")
|
||||
)
|
||||
for (key in names(interactions)) {
|
||||
try(df$Overlap_Avg[interactions[[key]]] <- annotation[[key]])
|
||||
}
|
||||
df
|
||||
}
|
||||
|
||||
# Rank and filter data
|
||||
rank_and_filter_data <- function(df, suffix) {
|
||||
z1 <- df
|
||||
z1[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df1")]] - z1[[paste0("Z_lm_", suffix, "_SD_df1")]]
|
||||
z1[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df2")]] - z1[[paste0("Z_lm_", suffix, "_SD_df2")]]
|
||||
|
||||
z2 <- df
|
||||
z2[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df1")]] + z2[[paste0("Z_lm_", suffix, "_SD_df1")]]
|
||||
z2[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df2")]] + z2[[paste0("Z_lm_", suffix, "_SD_df2")]]
|
||||
|
||||
df_above_threshold <- df[!is.na(df$Overlap_Avg), ]
|
||||
df_above_threshold$df1_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df1")]], ties.method = "random")
|
||||
df_above_threshold$df2_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df2")]], ties.method = "random")
|
||||
|
||||
list(z1 = z1, z2 = z2, df_above_threshold = df_above_threshold)
|
||||
}
|
||||
|
||||
# Main execution function for a pairwise comparison
|
||||
run_pairwise_comparison <- function(suffix, dir) {
|
||||
df <- load_and_merge_data(args$exp1_file, args$exp2_file)
|
||||
|
||||
# Generate initial ontology-based plot
|
||||
generate_plot(df,
|
||||
paste0("Z_lm_", suffix, "_Avg_df1"), paste0("Z_lm_", suffix, "_Avg_df2"), "Ontology_Avg_df1",
|
||||
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name),
|
||||
file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOntology")),
|
||||
theme_publication_legend_right)
|
||||
|
||||
# Annotate interactions and generate overlap-based plot
|
||||
df <- annotate_interactions(df, args$exp1_name, args$exp2_name, suffix)
|
||||
ranks <- rank_and_filter_data(df, suffix)
|
||||
|
||||
generate_plot(df,
|
||||
paste0("Z_lm_", suffix, "_Avg_df1"),
|
||||
paste0("Z_lm_", suffix, "_Avg_df2"),
|
||||
"Overlap_Avg",
|
||||
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name),
|
||||
file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap")),
|
||||
theme_publication_legend_right)
|
||||
|
||||
generate_plot(ranks$df_above_threshold,
|
||||
paste0("Z_lm_", suffix, "_Avg_df1"),
|
||||
paste0("Z_lm_", suffix, "_Avg_df2"),
|
||||
"Overlap_Avg",
|
||||
paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name, "Above Threshold"),
|
||||
file.path(dir,
|
||||
paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap_AboveThreshold")),
|
||||
theme_publication_legend_right)
|
||||
|
||||
# Save CSV files
|
||||
write.csv(df, file.path(dir,
|
||||
paste0("All_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")),
|
||||
row.names = FALSE)
|
||||
write.csv(ranks$df_above_threshold,
|
||||
file.path(dir, paste0("AboveThreshold_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")),
|
||||
row.names = FALSE)
|
||||
}
|
||||
|
||||
# Execute Pairwise L and Pairwise K comparisons
|
||||
run_pairwise_comparison("L", pairDirL)
|
||||
run_pairwise_comparison("K", pairDirK)
|
||||
305
qhtcp-workflow/apps/r/createHeatMaps.R
Normal file
305
qhtcp-workflow/apps/r/createHeatMaps.R
Normal file
@@ -0,0 +1,305 @@
|
||||
#!/usr/bin/env Rscript
|
||||
# This script will make heatmaps for the REMc analysis
|
||||
# need to give the input "finalTable.csv" file after running REMc generated by eclipse
|
||||
|
||||
library(RColorBrewer)
|
||||
library(gplots)
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
# Set output dir
|
||||
if (length(args) >= 1) {
|
||||
input_finalTable <- file.path(args[1])
|
||||
} else {
|
||||
input_finalTable <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
|
||||
}
|
||||
|
||||
if (length(args) >= 2) {
|
||||
outDir <- file.path(args[2])
|
||||
} else {
|
||||
outDir <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
|
||||
}
|
||||
|
||||
hmapfile <- data.frame(read.csv(file = input_finalTable, header = TRUE, sep = ",", stringsAsFactors = FALSE))
|
||||
# set NAs to NA
|
||||
hmapfile[hmapfile == -100] <- NA
|
||||
hmapfile[hmapfile == 100] <- NA
|
||||
hmapfile[hmapfile == 0.001] <- NA
|
||||
hmapfile[hmapfile == -0.001] <- NA
|
||||
|
||||
# select the number of rows based on the number of genes
|
||||
num_total_genes <- length(hmapfile[, 1])
|
||||
|
||||
# Break out the cluster names so each part of the cluster origin can be accessed
|
||||
# line below removed because it adds to many genes to clusters when going past 1-0-10
|
||||
# since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
|
||||
# hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin)
|
||||
|
||||
hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
|
||||
hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
|
||||
#use tail(x,n) for accessing the outward most cluster
|
||||
|
||||
clust_rounds <- 0
|
||||
for (i in 1:num_total_genes) {
|
||||
if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
|
||||
clust_rounds <- length(hmapfile$cluster.origin[[i]])
|
||||
}
|
||||
}
|
||||
|
||||
unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
|
||||
unique_clusts <- unique_clusts[unique_clusts != " "]
|
||||
|
||||
# Select only the unique cluster names
|
||||
unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
|
||||
num_unique_clusts <- length(unique_clusts)
|
||||
|
||||
# Base the color key on a statistical analysis of the L and K data
|
||||
# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
|
||||
# scale() will calculate the mean and standard deviation of the entire vector,
|
||||
# then "scale" each element by those values by subtracting the mean and dividing by the sd.
|
||||
# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
|
||||
# change so that the L data is multiplied to be on the same scale as the K data
|
||||
|
||||
KEY_MIN <- 0
|
||||
KEY_MAX <- 0
|
||||
K_MIN <- 0
|
||||
L_MAX <- 0
|
||||
KcolumnValues <- vector()
|
||||
LcolumnValues <- vector()
|
||||
|
||||
for (i in 4:(length(hmapfile[1, ]) - 2)){
|
||||
if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
|
||||
KcolumnValues <- append(KcolumnValues, i)
|
||||
}
|
||||
if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
|
||||
LcolumnValues <- append(LcolumnValues, i)
|
||||
}
|
||||
}
|
||||
|
||||
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4]
|
||||
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2]
|
||||
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4]
|
||||
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2]
|
||||
|
||||
# Z scores are
|
||||
L_MAX <- 12
|
||||
K_MIN <- -12
|
||||
|
||||
# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
|
||||
# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
|
||||
|
||||
# if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){
|
||||
# print("FOUND SHIFT VALUES")
|
||||
# hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
|
||||
# }
|
||||
|
||||
# KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
|
||||
# KEY_MIN <- as.numeric(K_MIN)
|
||||
|
||||
KEY_MAX <- as.numeric(L_MAX)
|
||||
KEY_MIN <- as.numeric(K_MIN)
|
||||
|
||||
print(KEY_MIN)
|
||||
print(L_MAX)
|
||||
# print(L_Multiplier)
|
||||
|
||||
colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6), KEY_MIN * (2 / 6),
|
||||
KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6), KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
|
||||
# print(colormapbreaks)
|
||||
|
||||
# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
|
||||
# However since also using this to amend the first part.
|
||||
# Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
|
||||
# then could just use these values and create spaces only between interaction values
|
||||
# possibly could get rid of redundant shift values if we don't want to view these
|
||||
# could we pool all the shift data/average it?
|
||||
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
|
||||
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
|
||||
# ev_repeat = rep("white",length(even_columns))
|
||||
# ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
|
||||
# middle_col <- (length(hmapfile[1,]) - 5)/2
|
||||
# ev_repeat[(middle_col/2)] <- "black"
|
||||
# print(ev_repeat)
|
||||
}
|
||||
|
||||
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
|
||||
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
|
||||
print("NO SHIFT VALS FOUND")
|
||||
}
|
||||
|
||||
# FOR THIS SCRIPT ONLY (rap tem hu script)
|
||||
# even_columns <- c(2,5,7,10,12,15,17)
|
||||
|
||||
# m <- 0
|
||||
colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 2)])
|
||||
# print(colnames_edit)
|
||||
for (i in 1:length(colnames_edit)) {
|
||||
if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
|
||||
colnames_edit[i] <- ""
|
||||
colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
|
||||
try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
|
||||
|
||||
# INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
|
||||
# print(length(unlist(INT_store)))
|
||||
# if(length(unlist(INT_store)) == 4){
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ")
|
||||
# }
|
||||
# if(length(unlist(INT_store)) == 3){
|
||||
#
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ")
|
||||
# }
|
||||
# if(length(unlist(INT_store)) == 5){
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ")
|
||||
# }
|
||||
# if(length(unlist(INT_store)) == 6){
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ")
|
||||
# }
|
||||
}
|
||||
}
|
||||
print(colnames_edit)
|
||||
# break()
|
||||
# colnames_edit[5] <- "TEM HLEG K"
|
||||
# colnames_edit[10] <- "TEM HL K"
|
||||
# colnames_edit[15] <- "TEM HLEG L"
|
||||
# colnames_edit[20] <- "TEM HL L"
|
||||
|
||||
# Create the heatmaps
|
||||
for (i in 1:num_unique_clusts) {
|
||||
cluster <- unique_clusts[i]
|
||||
cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
|
||||
cluster_length <- length(cluster_data[, 1])
|
||||
if (cluster_length != 1) {
|
||||
X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 2)])
|
||||
if (cluster_length >= 2001) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 20, width = 15)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim=c(-132, 132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
# cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
|
||||
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
# ColSideColors=ev_repeat,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
# abline(v=0.5467,col="black")
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 201 && cluster_length <= 2000) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 15, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim=c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
|
||||
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
# abline(v=0.5316,col="black")
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 150 && cluster_length <= 200) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 12, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim=c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 101 && cluster_length <= 149) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, mypath, height = 12, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
#zlim=c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 60 && cluster_length <= 100) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 12, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim=c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length <= 59 && cluster_length >= 30) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 9, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim=c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length <= 29) {
|
||||
mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 7, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA,
|
||||
distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim=c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
}
|
||||
# print(paste("FINISHED", "CLUSTER",cluster,sep=" "))
|
||||
}
|
||||
364
qhtcp-workflow/apps/r/createHeatMapsHomology.R
Normal file
364
qhtcp-workflow/apps/r/createHeatMapsHomology.R
Normal file
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env Rscript
|
||||
|
||||
library("RColorBrewer")
|
||||
library("gplots")
|
||||
library("tidyverse")
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
# Define the output path for the heatmaps - create this folder first - in linux terminal in the working folder use > mkdir filename_heatmaps
|
||||
output_path <- file.path(Args[1])
|
||||
|
||||
# Need to give the input "finalTable.csv" file after running REMc generated by eclipse
|
||||
final_table <- file.path(args[2])
|
||||
|
||||
# Give the damp_list.txt as the third argument - will color the gene names differently
|
||||
damps <- file.path(Args[3])
|
||||
damp_list <- read.delim(file = damps, header = FALSE, stringsAsFactors = FALSE)
|
||||
|
||||
# Give the yeast human homology mapping as the fourth argument - will add the genes to the finalTable and use info for heatmaps
|
||||
map_file <- file.path(Args[4])
|
||||
mapping <- read.csv(file = map_file, stringsAsFactors = FALSE)
|
||||
|
||||
# Read in finalTablewithShift
|
||||
hmapfile <- data.frame(read.csv(file = final_table, header = TRUE, sep = ",", stringsAsFactors = FALSE))
|
||||
|
||||
# Map the finalTable to the human homolog file
|
||||
hmapfile_map <- hmapfile
|
||||
# Match using OrfRep after dropping the _1 _2 _3 _4
|
||||
# But need to also account for some older files have ORF as column name rather than OrfRep in finalTable file
|
||||
if (colnames(hmapfile_map)[2] == "OrfRep") {
|
||||
try(hmapfile_map$ORFMatch <- hmapfile_map$OrfRep)
|
||||
}
|
||||
if (colnames(hmapfile_map)[2] == "ORF") {
|
||||
try(hmapfile_map$ORFMatch <- hmapfile_map$ORF)
|
||||
}
|
||||
hmapfile_map$ORFMatch <- gsub("_1", "", x = hmapfile_map$ORFMatch)
|
||||
hmapfile_map$ORFMatch <- gsub("_2", "", x = hmapfile_map$ORFMatch)
|
||||
hmapfile_map$ORFMatch <- gsub("_3", "", x = hmapfile_map$ORFMatch)
|
||||
hmapfile_map$ORFMatch <- gsub("_4", "", x = hmapfile_map$ORFMatch)
|
||||
|
||||
# Join the hmapfile using
|
||||
hmapfile_w_homolog <- full_join(hmapfile_map, mapping, by = c("ORFMatch" = "ensembl_gene_id"))
|
||||
|
||||
# Remove matches that are not from the finalTable
|
||||
hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$likelihood) == FASLE, ]
|
||||
|
||||
# Write csv with all info from mapping file
|
||||
write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologAll.csv")), row.names = FALSE)
|
||||
|
||||
# Remove the non matches and output another mapping file - this is also one used to make heatmaps
|
||||
hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$external_gene_name_Human) == FALSE, ]
|
||||
write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologMatchesOnly.csv"), row.names = FALSE))
|
||||
|
||||
# Add human gene name to the Gene column
|
||||
hmapfile_w_homolog$Gene <- paste(hmapfile_w_homolog$Gene, hmapfile_w_homolog$external_gene_name_Human, sep = "/")
|
||||
|
||||
# Only keep the finalTable file columns and the homology info
|
||||
hmap_len <- dim(hmapfile)[2]
|
||||
|
||||
hmapfile_w_homolog_remake <-
|
||||
cbind(hmapfile_w_homolog[, 1:hmap_len], hsapiens_homolog_orthology_type = hmapfile_w_homolog$hsapiens_homolog_orthology_type)
|
||||
hmapfile <- hmapfile_w_homolog_remake
|
||||
|
||||
# Set NAs to NA
|
||||
hmapfile[hmapfile == -100] <- NA
|
||||
hmapfile[hmapfile == 100] <- NA
|
||||
hmapfile[hmapfile == 0.001] <- NA
|
||||
hmapfile[hmapfile == -0.001] <- NA
|
||||
|
||||
# Select the number of rows based on the number of genes
|
||||
num_total_genes <- length(hmapfile[, 1])
|
||||
|
||||
# Break out the cluster names so each part of the cluster origin can be accessed
|
||||
# Line below removed because it adds to many genes to clusters when going past 1-0-10
|
||||
# since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
|
||||
# hmapfile$cluster.origin = gsub(" ","",x = hmapfile$cluster.origin)
|
||||
|
||||
hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
|
||||
hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
|
||||
# use tail(x,n) for accessing the outward most cluster
|
||||
|
||||
clust_rounds <- 0
|
||||
for (i in 1:num_total_genes) {
|
||||
if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
|
||||
clust_rounds <- length(hmapfile$cluster.origin[[i]])
|
||||
}
|
||||
}
|
||||
|
||||
unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
|
||||
unique_clusts <- unique_clusts[unique_clusts != " "]
|
||||
|
||||
#select only the unique cluster names
|
||||
unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
|
||||
num_unique_clusts <- length(unique_clusts)
|
||||
|
||||
# Base the color key on a statistical analysis of the L and K data
|
||||
# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
|
||||
# scale() will calculate the mean and standard deviation of the entire vector
|
||||
# then "scale" each element by those values by subtracting the mean and dividing by the sd
|
||||
|
||||
# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
|
||||
|
||||
# Change so that the L data is multiplied to be on the same scale as the K data
|
||||
|
||||
KEY_MIN <- 0
|
||||
KEY_MAX <- 0
|
||||
K_MIN <- 0
|
||||
L_MAX <- 0
|
||||
KcolumnValues <- vector()
|
||||
LcolumnValues <- vector()
|
||||
|
||||
for (i in 4:(length(hmapfile[1, ]) - 3)){
|
||||
if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
|
||||
KcolumnValues <- append(KcolumnValues, i)
|
||||
}
|
||||
if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
|
||||
LcolumnValues <- append(LcolumnValues, i)
|
||||
}
|
||||
}
|
||||
|
||||
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[4]
|
||||
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[2]
|
||||
# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm = TRUE)[4]
|
||||
# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm = TRUE)[2]
|
||||
|
||||
# Z scores are
|
||||
L_MAX <- 12
|
||||
K_MIN <- -12
|
||||
|
||||
# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
|
||||
# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
|
||||
|
||||
# if(grepl("SHIFT",colnames(hmapfile)[4],fixed = TRUE) == TRUE){
|
||||
# print("FOUND SHIFT VALUES")
|
||||
# hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
|
||||
# }
|
||||
|
||||
#KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
|
||||
#KEY_MIN <- as.numeric(K_MIN)
|
||||
KEY_MAX <- as.numeric(L_MAX)
|
||||
KEY_MIN <- as.numeric(K_MIN)
|
||||
|
||||
print(KEY_MIN)
|
||||
print(L_MAX)
|
||||
#print(L_Multiplier)
|
||||
|
||||
colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6),
|
||||
KEY_MIN * (2 / 6), KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6),
|
||||
KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
|
||||
# print(colormapbreaks)
|
||||
|
||||
# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
|
||||
# However since also using this to amend the first part.
|
||||
# Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
|
||||
# then could just use these values and create spaces only between interaction values
|
||||
# possibly could get rid of redundant shift values if we don't want to view these
|
||||
# could we pool all the shift data/average it?
|
||||
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
|
||||
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
|
||||
# ev_repeat = rep("white",length(even_columns))
|
||||
# ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
|
||||
# middle_col <- (length(hmapfile[1,]) - 5)/2
|
||||
# ev_repeat[(middle_col/2)] <- "black"
|
||||
# print(ev_repeat)
|
||||
}
|
||||
|
||||
if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
|
||||
even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
|
||||
print("NO SHIFT VALS FOUND")
|
||||
}
|
||||
|
||||
# for this script only (rap tem hu script)
|
||||
# even_columns <- c(2,5,7,10,12,15,17)
|
||||
|
||||
# m <- 0
|
||||
colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 3)])
|
||||
|
||||
colnames(damp_list)[1] <- "ORF"
|
||||
hmapfile$damps <- "YKO"
|
||||
colnames(hmapfile)[2] <- "ORF"
|
||||
try(hmapfile[hmapfile$ORF %in% damp_list$ORF, ]$damps <- "YKD")
|
||||
# X <- X[order(X$damps,decreasing = TRUE),]
|
||||
hmapfile$color2 <- NA
|
||||
try(hmapfile[hmapfile$damps == "YKO", ]$color2 <- "black")
|
||||
try(hmapfile[hmapfile$damps == "YKD", ]$color2 <- "red")
|
||||
|
||||
hmapfile$color <- NA
|
||||
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_many2many", ]$color <- "#F8766D")
|
||||
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2many", ]$color <- "#00BA38")
|
||||
try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2one", ]$color <- "#619CFF")
|
||||
|
||||
# print(colnames_edit)
|
||||
for (i in 1:length(colnames_edit)) {
|
||||
if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
|
||||
colnames_edit[i] <- ""
|
||||
colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
|
||||
try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
|
||||
|
||||
# INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
|
||||
# print(length(unlist(INT_store)))
|
||||
# if(length(unlist(INT_store)) == 4){
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep = " ")
|
||||
# }
|
||||
# if(length(unlist(INT_store)) == 3){
|
||||
#
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep = " ")
|
||||
# }
|
||||
# if(length(unlist(INT_store)) == 5){
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep = " ")
|
||||
# }
|
||||
# if(length(unlist(INT_store)) == 6){
|
||||
# colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep = " ")
|
||||
# }
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
print(colnames_edit)
|
||||
# break()
|
||||
# colnames_edit[5] <- "TEM HLEG K"
|
||||
# colnames_edit[10] <- "TEM HL K"
|
||||
# colnames_edit[15] <- "TEM HLEG L"
|
||||
# colnames_edit[20] <- "TEM HL L"
|
||||
|
||||
# Create the heatmaps
|
||||
for (i in 1:num_unique_clusts) {
|
||||
cluster <- unique_clusts[i]
|
||||
cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
|
||||
cluster_length <- length(cluster_data[, 1])
|
||||
if (cluster_length != 1) {
|
||||
X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 6)])
|
||||
if (cluster_length >= 2001) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 20, width = 15)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
# cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
|
||||
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
# ColSideColors = ev_repeat,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
# abline(v = 0.5467,col = "black")
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 201 && cluster_length <= 2000) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 15, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
|
||||
keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
# abline(v = 0.5316,col = "black")
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 150 && cluster_length <= 200) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 12, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 101 && cluster_length <= 149) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 12, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length >= 60 && cluster_length <= 100) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 12, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length <= 59 && cluster_length >= 30) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 9, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
if (cluster_length <= 29) {
|
||||
mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
|
||||
pdf(file = mypath, height = 7, width = 12)
|
||||
heatmap.2(
|
||||
x = X0,
|
||||
Rowv = TRUE, Colv = NA,
|
||||
distfun = dist, hclustfun = hclust,
|
||||
dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
|
||||
breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
|
||||
# zlim = c(-132,132),
|
||||
xlab = "Type of Media", ylab = "Gene Name",
|
||||
cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
|
||||
keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
|
||||
na.color = "red", col = brewer.pal(11, "PuOr"),
|
||||
main = cluster,
|
||||
labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
|
||||
)
|
||||
dev.off()
|
||||
}
|
||||
}
|
||||
# print(paste("FINISHED", "CLUSTER",cluster,sep = " "))
|
||||
}
|
||||
42887
qhtcp-workflow/apps/r/go_terms.tab
Normal file
42887
qhtcp-workflow/apps/r/go_terms.tab
Normal file
File diff suppressed because it is too large
Load Diff
196
qhtcp-workflow/apps/r/gtaTemplate.R
Normal file
196
qhtcp-workflow/apps/r/gtaTemplate.R
Normal file
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env R
|
||||
# GTA (GoTermAveraging)
|
||||
# Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
|
||||
#
|
||||
# Updated 240724 Bryan C Roessler to improve file operations and portability
|
||||
# NOTE: The script now has 2 additional OPTIONAL arguments:
|
||||
# 1. Path to SGD terms file (go.terms.tab)
|
||||
# 2. Path to SGD features file (gene_association.sgd)
|
||||
|
||||
library("stringr")
|
||||
library("org.Sc.sgd.db")
|
||||
library("plyr")
|
||||
|
||||
# Parse arguments
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
exp_name <- args[1]
|
||||
|
||||
if (length(args) >= 2) {
|
||||
zscores_file <- args[2]
|
||||
} else {
|
||||
zscores_file <- "zscores/zscores_interaction.csv" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
|
||||
}
|
||||
|
||||
if (length(args) >= 3) {
|
||||
sgd_terms_file <- args[3]
|
||||
} else {
|
||||
sgd_terms_file <- "go_terms.tab"
|
||||
}
|
||||
|
||||
if (length(args) >= 4) {
|
||||
sgd_features_file <- args[4]
|
||||
} else {
|
||||
sgd_features_file <- "gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
|
||||
}
|
||||
|
||||
if (length(args) >= 5) {
|
||||
output_dir <- args[5]
|
||||
} else {
|
||||
output_dir <- "../../out/gta" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
|
||||
}
|
||||
|
||||
# # Set SGDgeneList file path
|
||||
# if (length(args) > 4) {
|
||||
# SGDgeneList <- args[4]
|
||||
# } else {
|
||||
# SGDgeneList <- "../Code/SGD_features.tab"
|
||||
|
||||
# Begin for loop for experiments in this study
|
||||
# ZScores_Interaction.csv
|
||||
for (m in 1:length(zscores_file)) {
|
||||
|
||||
# zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1]
|
||||
X <- read.csv(file = zscores_file[m], stringsAsFactors = FALSE, header = TRUE)
|
||||
|
||||
if (colnames(X)[1] == "OrfRep") {
|
||||
colnames(X)[1] <- "ORF"
|
||||
}
|
||||
|
||||
#Terms is the GO term list
|
||||
Terms <- read.delim(file = sgd_terms_file, header = FALSE, quote = "",
|
||||
col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition"))
|
||||
#all ORFs associated with GO term
|
||||
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
|
||||
#Gene_Association is the gene association to GO term file
|
||||
Gene_Association <- read.delim(sgd_features_file, skip = 8, header = FALSE, quote = "",
|
||||
col.names = c("Database", "Database_Object_ID", "Database_Object_Symbol", "NOT", "GO_ID",
|
||||
"Database_Reference", "Evidence", "With_or_From", "Aspect", "Database_Object_Name",
|
||||
"Database_Object_Synonym", "Database_Object_Type", "taxon", "Date", "Assigned_By", "OtherInfo", "Empty"
|
||||
)
|
||||
)
|
||||
#Get the ORF names associated with each gene/GO term
|
||||
Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym), "\\|", 2)[, 1]
|
||||
#Get the numeric GO ID for matching
|
||||
Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID), "\\:", 2)[, 2])
|
||||
#Get all unique GO terms
|
||||
GO_Terms <- unique(Gene_Association$GO_ID)
|
||||
#Create a character vector with just the ColNames of the input file to store the scores for each GO term
|
||||
Col_Names_X <- colnames(X)
|
||||
#Create a data_frame with header from input_file
|
||||
GO_Term_Averages <- X[0, ]
|
||||
#Fill table with NAs same length as number of GO terms
|
||||
GO_Term_Averages[1:length(GO_Terms), ] <- NA
|
||||
#Change the first and second col names to GO_ID and Term
|
||||
colnames(GO_Term_Averages)[1] <- "GO_ID"
|
||||
colnames(GO_Term_Averages)[2] <- "Term"
|
||||
|
||||
# Create new columns
|
||||
GO_Term_Averages$Ontology <- NA
|
||||
GO_Term_Averages$NumGenes <- NA
|
||||
GO_Term_Averages$AllPossibleGenes <- NA
|
||||
GO_Term_Averages$Genes <- NA
|
||||
GO_Term_Averages$ORFs <- NA
|
||||
|
||||
# Create a data.frame for the standard deviation info
|
||||
GO_Term_SD <- X[0, ]
|
||||
GO_Term_SD[1:length(GO_Terms), ] <- NA
|
||||
colnames(GO_Term_SD)[1] <- "GO_ID"
|
||||
colnames(GO_Term_SD)[2] <- "Term"
|
||||
|
||||
# Loop for each GO term to get an average L and K Z score
|
||||
for (i in 1:length(GO_Terms)) {
|
||||
# Get the GO_Term
|
||||
ID <- GO_Terms[i]
|
||||
# Get data.frame for all genes associated to the GO Term
|
||||
ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID, ]
|
||||
# Get a vector of just the gene names
|
||||
ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]])
|
||||
if (length(unique(ID_AllGenes_vector)) > 4000) {
|
||||
next()
|
||||
}
|
||||
# Get the GO term character description where numeric Terms ID matches GO_Term's ID
|
||||
GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric, ]$GO_Term[1])
|
||||
# Get the Z scores for all genes in the GO_ID
|
||||
Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector, ]
|
||||
# Get the Gene names and ORFs for the term
|
||||
GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene), collapse = " | ")
|
||||
GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF), collapse = " | ")
|
||||
|
||||
# Dataframe to report the averages for a GO term
|
||||
# Get the GO ID
|
||||
GO_Term_Averages$GO_ID[i] <- as.character(ID)
|
||||
# Get the term name
|
||||
GO_Term_Averages$Term[i] <- GO_Description_Term
|
||||
# Get total number of genes annotated to the Term that we have in our library
|
||||
GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF))
|
||||
# Get total number of genes annotated to the Term in SGD
|
||||
GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector))
|
||||
# Get the ontology of the term
|
||||
GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1])
|
||||
|
||||
# Calculate the average score for every column
|
||||
for (j in 3:length(X[1, ])) {
|
||||
GO_Term_Averages[i, j] <- mean(Zscores_For_ID[, j], na.rm = TRUE)
|
||||
# GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
|
||||
}
|
||||
|
||||
# Also calculate same values for the SD
|
||||
GO_Term_SD$GO_ID[i] <- as.character(ID)
|
||||
# Get the term name
|
||||
GO_Term_SD$Term[i] <- GO_Description_Term
|
||||
|
||||
# Calculate column scores for SD
|
||||
for (j in 3:length(X[1, ])) {
|
||||
GO_Term_SD[i, j] <- sd(Zscores_For_ID[, j], na.rm = TRUE)
|
||||
# GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
|
||||
}
|
||||
}
|
||||
# Add either _Avg or _SD depending on if the calculated score is an average or SD
|
||||
colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages), "Avg", sep = "_")
|
||||
colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD), "SD", sep = "_")
|
||||
# Combine the averages with the SDs to make one big data.frame
|
||||
X2 <- cbind(GO_Term_Averages, GO_Term_SD)
|
||||
# Test[ , order(names(test))]
|
||||
X2 <- X2[, order(names(X2))]
|
||||
X2 <- X2[!is.na(X2$Z_lm_L_Avg), ]
|
||||
# Create output file
|
||||
write.csv(X2, file.path(output_dir, expName[m], "Average_GOTerms_All.csv"), row.names = FALSE)
|
||||
# Remove NAs
|
||||
X3 <- X2[!is.na(X2$Z_lm_L_Avg), ]
|
||||
|
||||
# Identify redundant GO terms
|
||||
for (i in 1:length(X3[, 1])) {
|
||||
# Loop through each GO term - get term
|
||||
GO_term_ID <- as.character(X3$GO_ID_Avg[i])
|
||||
# Get term in the X3
|
||||
X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID, ]
|
||||
# Get anywhere that has the same number K_Avg value
|
||||
X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp, ]
|
||||
if (length(X3_Temp2[, 1]) > 1) {
|
||||
if (length(unique(X3_Temp2$Genes_Avg)) == 1) {
|
||||
X3_Temp2 <- X3_Temp2[1, ]
|
||||
}
|
||||
}
|
||||
|
||||
if (i == 1) {
|
||||
Y <- X3_Temp2
|
||||
}
|
||||
if (i > 1) {
|
||||
Y <- rbind(Y, X3_Temp2)
|
||||
}
|
||||
}
|
||||
|
||||
Y1 <- unique(Y)
|
||||
write.csv(Y1, file.path(output_dir, exp_name, "Average_GOTerms_All_NonRedundantTerms.csv"), row.names = FALSE)
|
||||
Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2, ]
|
||||
Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg), ]
|
||||
write.csv(Y2, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L.csv"), row.names = FALSE)
|
||||
Y3 <- Y2[Y2$NumGenes_Avg > 2, ]
|
||||
write.csv(Y3, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv"), row.names = FALSE)
|
||||
Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2, ]
|
||||
Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg), ]
|
||||
write.csv(Y4, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K.csv"), row.names = FALSE)
|
||||
Y5 <- Y4[Y4$NumGenes_Avg > 2, ]
|
||||
write.csv(Y5, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv"), row.names = FALSE)
|
||||
}
|
||||
375
qhtcp-workflow/apps/r/joinInteractExps.R
Normal file
375
qhtcp-workflow/apps/r/joinInteractExps.R
Normal file
@@ -0,0 +1,375 @@
|
||||
library("dplyr")
|
||||
|
||||
# Function to parse and set arguments
|
||||
parse_arguments <- function() {
|
||||
if (interactive()) {
|
||||
args <- c(
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
|
||||
2,
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
|
||||
)
|
||||
} else {
|
||||
args <- commandArgs(trailingOnly = TRUE)
|
||||
}
|
||||
|
||||
list(
|
||||
out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
|
||||
sd = as.numeric(args[2]),
|
||||
study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
|
||||
input_dirs = args[4:length(args)]
|
||||
)
|
||||
}
|
||||
|
||||
args <- parse_arguments()
|
||||
|
||||
# Create an array for the zscores files
|
||||
get_zscores_files <- function(dirs) {
|
||||
files <- sapply(dirs, function(exp) {
|
||||
file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
|
||||
if (file.exists(file_path)) file_path else NULL
|
||||
}, simplify = TRUE, USE.NAMES = FALSE)
|
||||
|
||||
# Filter out NULL entries
|
||||
files[!sapply(files, is.null)]
|
||||
}
|
||||
|
||||
zscores_files <- get_zscores_files(args$input_dirs)
|
||||
sprintf("The SD value is: %d", args$sd)
|
||||
|
||||
# Ensure there are enough files to compare
|
||||
if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
|
||||
|
||||
# Function to join zscores files
|
||||
join_zscores_files <- function(files) {
|
||||
joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
|
||||
for (file in files[-1]) {
|
||||
temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
|
||||
joined_data <- join(joined_data, temp_data, by = "OrfRep")
|
||||
}
|
||||
joined_data
|
||||
}
|
||||
|
||||
# Load and join zscores files
|
||||
joined_data <- join_zscores_files(zscores_files)
|
||||
|
||||
# Order and select columns
|
||||
order_and_select_columns <- function(data) {
|
||||
ordered_data <- data[, order(colnames(data))]
|
||||
selected_headers <- select(ordered_data,
|
||||
contains("OrfRep"), matches("Gene"),
|
||||
contains("z_lm_k"), contains("z_shift_k"),
|
||||
contains("z_lm_l"), contains("z_shift_l"))
|
||||
selected_headers
|
||||
}
|
||||
|
||||
selected_headers <- order_and_select_columns(joined_data)
|
||||
|
||||
# Remove redundant columns like "Gene.1"
|
||||
clean_headers <- function(data, suffixes) {
|
||||
suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
|
||||
select(data, -all_of(suffixes_to_remove))
|
||||
}
|
||||
|
||||
headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
|
||||
headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
|
||||
|
||||
# Fill NA values in Shift and Z_lm columns
|
||||
fill_na_in_columns <- function(data) {
|
||||
for (header in colnames(data)) {
|
||||
if (grepl("Shift", header)) {
|
||||
data[[header]][is.na(data[[header]])] <- 0.001
|
||||
} else if (grepl("Z_lm_", header)) {
|
||||
data[[header]][is.na(data[[header]])] <- 0.0001
|
||||
}
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
headSel <- fill_na_in_columns(headSel)
|
||||
|
||||
# Filter based on standard deviation
|
||||
filter_by_sd <- function(data, sd) {
|
||||
if (sd == 0) return(data)
|
||||
|
||||
z_lm_cols <- select(data, contains("z_lm_"))
|
||||
filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
|
||||
data[filter_vector, ]
|
||||
}
|
||||
|
||||
REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
|
||||
shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)
|
||||
|
||||
# Reorder columns to interleave Z_lm and Shift data
|
||||
reorder_columns <- function(data1, data2) {
|
||||
combined_data <- data1
|
||||
for (i in 3:ncol(data1)) {
|
||||
combined_data <- cbind(combined_data, data2[i], data1[i])
|
||||
}
|
||||
combined_data
|
||||
}
|
||||
|
||||
combI <- reorder_columns(headSel2, shiftOnly)
|
||||
|
||||
# Write output files
|
||||
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
|
||||
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
|
||||
|
||||
# Relabel headers using experiment names from StudyInfo.csv
|
||||
relabel_headers <- function(headers, labels) {
|
||||
new_labels <- headers
|
||||
for (i in seq_along(headers)) {
|
||||
suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
|
||||
if (suffix %in% 1:3) {
|
||||
exp_name <- labels[as.numeric(suffix), 2]
|
||||
new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
|
||||
}
|
||||
}
|
||||
new_labels
|
||||
}
|
||||
|
||||
LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
|
||||
colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
|
||||
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)
|
||||
|
||||
# Save relabeled files
|
||||
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
|
||||
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
|
||||
|
||||
# Save updated parameters
|
||||
LabelStd[, 4] <- args$sd
|
||||
write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
|
||||
write.csv(LabelStd, file = args$study_info, row.names = FALSE)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# library("plyr")
|
||||
# library("sos")
|
||||
# library("dplyr")
|
||||
|
||||
# # Function to parse and set arguments
|
||||
# parse_arguments <- function() {
|
||||
# if (interactive()) {
|
||||
# args <- c(
|
||||
# "./", # Default out_dir
|
||||
# "2", # Default SD value
|
||||
# "../Code/StudyInfo.csv" # Default study_info path
|
||||
# )
|
||||
# } else {
|
||||
# args <- commandArgs(trailingOnly = TRUE)
|
||||
# }
|
||||
|
||||
# list(
|
||||
# out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
|
||||
# sd = as.numeric(args[2]),
|
||||
# study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
|
||||
# input_dirs = args[4:length(args)]
|
||||
# )
|
||||
# }
|
||||
|
||||
# args <- parse_arguments()
|
||||
|
||||
# # Create an array for the zscores files
|
||||
# zscores_files <- sapply(args$input_dirs, function(study) {
|
||||
# file_path <- file.path(study, "zscores", "zscores_interaction.csv")
|
||||
# if (file.exists(file_path)) file_path else NULL
|
||||
# }, simplify = TRUE, USE.NAMES = FALSE)
|
||||
|
||||
# # Filter out NULL entries
|
||||
# zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
|
||||
|
||||
# sprintf("The SD value is: %d", args$sd)
|
||||
# # print(args$input_dirs)
|
||||
|
||||
# # TODO this is better handled in a loop in case you want to compare more experiments?
|
||||
# # The input is already designed for this
|
||||
# # Read in the files for your experiment and
|
||||
# # Join the two files at a time as a function of how many inputFile
|
||||
# # list the larger file first ? in this example X2 has the larger number of genes
|
||||
# # If X1 has a larger number of genes, switch the order of X1 and X2
|
||||
# if (length(zscores_files) < 2) {
|
||||
# print("Note enough Exps to compare, skipping join")
|
||||
# stop("Exiting script")
|
||||
# }
|
||||
|
||||
# if (length(zscores_files) >= 2) {
|
||||
# X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
|
||||
# X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
|
||||
# X <- join(X1, X2, by = "OrfRep")
|
||||
# OBH <- X[, order(colnames(X))] # OrderByHeader
|
||||
# headers <- select(OBH, contains("OrfRep"), matches("Gene"),
|
||||
# contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
|
||||
# headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
|
||||
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
|
||||
# headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
|
||||
# }
|
||||
|
||||
# if (length(zscores_files) >= 3) {
|
||||
# X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
|
||||
# X <- join(X, X3, by = "OrfRep")
|
||||
# headSel <- select(headers, -"Gene.1", -"Gene.2")
|
||||
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
|
||||
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
|
||||
# }
|
||||
|
||||
# if (length(zscores_files) >= 4) {
|
||||
# X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
|
||||
# X <- join(X, X4, by = "OrfRep")
|
||||
# headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
|
||||
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
|
||||
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
|
||||
# }
|
||||
|
||||
# # print(headers)
|
||||
# # headSel$contains("Z_Shift") %>% replace_na(0.001)
|
||||
# headers <- colnames(headSel)
|
||||
# # print(headers)
|
||||
# i <- 0
|
||||
# for (i in 1:length(headers)) {
|
||||
# if (grepl("Shift", headers[i])) {
|
||||
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
|
||||
# }
|
||||
# if (grepl("Z_lm_", headers[i])) {
|
||||
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
|
||||
# }
|
||||
# }
|
||||
|
||||
# # 2SD option code to exclude Z_lm values less than 2 standard Deviations
|
||||
# REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
|
||||
# shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
|
||||
|
||||
# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
|
||||
# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
|
||||
|
||||
# # Using Text search grepl to relabel headers
|
||||
# REMcRdyHdr <- colnames(REMcRdy)
|
||||
# REMcRdyLabels <- "asdf"
|
||||
# shftHdr <- colnames(shiftOnly)
|
||||
# shiftLabels <- "asdf"
|
||||
# shiftLabels[1:2] <- shftHdr[1:2]
|
||||
# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
|
||||
|
||||
# for (i in 3:(length(shftHdr))) {
|
||||
# if (i == 3) {
|
||||
# shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
|
||||
# REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
|
||||
# }
|
||||
# if (i == 5) {
|
||||
# shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
|
||||
# REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
|
||||
# }
|
||||
# if (i == 7) {
|
||||
# shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
|
||||
# REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
|
||||
# }
|
||||
# if (grepl(".1", shftHdr[i], fixed = true)) {
|
||||
# shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
|
||||
# REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
|
||||
# }
|
||||
# if (grepl(".2", shftHdr[i], fixed = true)) {
|
||||
# shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
|
||||
# REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
|
||||
# }
|
||||
# if (grepl(".3", shftHdr[i], fixed = true)) {
|
||||
# shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
|
||||
# REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
|
||||
# }
|
||||
# }
|
||||
|
||||
# for (i in 3:(length(REMcRdyLabels))) {
|
||||
# j <- as.integer(i)
|
||||
# REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
|
||||
# shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
|
||||
# }
|
||||
|
||||
# colnames(shiftOnly) <- shiftLabels
|
||||
# colnames(REMcRdy) <- REMcRdyLabels
|
||||
|
||||
# combI <- headSel2 # starting Template orf, Genename columns
|
||||
|
||||
# # headersRemc<-colnames(REMcRdy)
|
||||
# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
|
||||
# for (i in 3:length(colnames(REMcRdy))) {
|
||||
# combI <- cbind.data.frame(combI, shiftOnly[i])
|
||||
# combI <- cbind.data.frame(combI, REMcRdy[i])
|
||||
# }
|
||||
|
||||
# Vec1 <- NA
|
||||
# Vec2 <- NA
|
||||
# Vec3 <- NA
|
||||
# Vec4 <- NA
|
||||
# Vec5 <- NA
|
||||
# Vec6 <- NA
|
||||
# Vec7 <- NA
|
||||
# Vec8 <- NA
|
||||
|
||||
# if (length(REMcRdy) == 6) {
|
||||
# Vec1 <- abs(REMcRdy[, 3]) >= sd
|
||||
# Vec2 <- abs(REMcRdy[, 4]) >= sd
|
||||
# Vec3 <- abs(REMcRdy[, 5]) >= sd
|
||||
# Vec4 <- abs(REMcRdy[, 6]) >= sd
|
||||
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4
|
||||
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
|
||||
# REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
|
||||
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
|
||||
# shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
|
||||
# }
|
||||
|
||||
# if (length(REMcRdy) == 8) {
|
||||
# Vec1 <- abs(REMcRdy[, 3]) >= sd
|
||||
# Vec2 <- abs(REMcRdy[, 4]) >= sd
|
||||
# Vec3 <- abs(REMcRdy[, 5]) >= sd
|
||||
# Vec4 <- abs(REMcRdy[, 6]) >= sd
|
||||
# Vec5 <- abs(REMcRdy[, 7]) >= sd
|
||||
# Vec6 <- abs(REMcRdy[, 8]) >= sd
|
||||
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
|
||||
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
|
||||
# REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
|
||||
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
|
||||
# shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
|
||||
# }
|
||||
|
||||
# if (length(REMcRdy) == 10) {
|
||||
# Vec1 <- abs(REMcRdy[, 3]) >= sd
|
||||
# Vec2 <- abs(REMcRdy[, 4]) >= sd
|
||||
# Vec3 <- abs(REMcRdy[, 5]) >= sd
|
||||
# Vec4 <- abs(REMcRdy[, 6]) >= sd
|
||||
# Vec5 <- abs(REMcRdy[, 7]) >= sd
|
||||
# Vec6 <- abs(REMcRdy[, 8]) >= sd
|
||||
# Vec7 <- abs(REMcRdy[, 9]) >= sd
|
||||
# Vec8 <- abs(REMcRdy[, 10]) >= sd
|
||||
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
|
||||
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
|
||||
# REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
|
||||
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
|
||||
# shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
|
||||
# }
|
||||
|
||||
# if (sd != 0) {
|
||||
# REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
|
||||
# shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
|
||||
# }
|
||||
|
||||
# if (sd == 0) {
|
||||
# REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
|
||||
# shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
|
||||
# }
|
||||
|
||||
# # R places hidden "" around the header names. The following
|
||||
# # is intended to remove those quote so that the "" do not blow up the Java REMc.
|
||||
# # Use ,quote=F in the write.csv statement to fix R output file.
|
||||
# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
|
||||
# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
|
||||
# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
|
||||
# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
|
||||
|
||||
# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
|
||||
# # print(sd)
|
||||
# LabelStd[, 4] <- as.numeric(sd)
|
||||
# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
|
||||
# write.csv(LabelStd, file = study_info, row.names = FALSE)
|
||||
113
qhtcp-workflow/apps/r/join_interaction_zscores.R
Normal file
113
qhtcp-workflow/apps/r/join_interaction_zscores.R
Normal file
@@ -0,0 +1,113 @@
|
||||
suppressMessages({
|
||||
library("dplyr")
|
||||
library("data.table")
|
||||
library("readr")
|
||||
library("stringr")
|
||||
})
|
||||
|
||||
# Function to parse arguments
|
||||
parse_arguments <- function() {
|
||||
if (interactive()) {
|
||||
args <- c(
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
|
||||
3, # sd value
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1",
|
||||
"Experiment 1: Doxo versus HLD",
|
||||
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2",
|
||||
"Experiment 2: HLD versus Doxo"
|
||||
)
|
||||
} else {
|
||||
args <- commandArgs(trailingOnly = TRUE)
|
||||
}
|
||||
|
||||
out_dir <- normalizePath(args[1], mustWork = FALSE)
|
||||
sd <- as.numeric(args[2])
|
||||
paths <- normalizePath(args[seq(3, length(args), by = 2)], mustWork = FALSE)
|
||||
names <- args[seq(4, length(args), by = 2)]
|
||||
experiments <- setNames(paths, names)
|
||||
|
||||
list(
|
||||
out_dir = out_dir,
|
||||
sd = sd,
|
||||
experiments = experiments
|
||||
)
|
||||
}
|
||||
|
||||
args <- parse_arguments()
|
||||
|
||||
# Ensure main output directory exists
|
||||
dir.create(args$out_dir, showWarnings = FALSE, recursive = TRUE)
|
||||
|
||||
# Function to read and combine z-score interaction files
|
||||
combine_zscores <- function(experiments, out_dir) {
|
||||
combined_data <- lapply(names(experiments), function(exp_name) {
|
||||
exp_dir <- experiments[[exp_name]]
|
||||
zscore_file <- file.path(exp_dir, "zscores", "zscores_interaction.csv")
|
||||
|
||||
if (!file.exists(zscore_file)) {
|
||||
stop("Z-score file does not exist for ", exp_name, " at ", zscore_file)
|
||||
}
|
||||
|
||||
message("Reading z-score file for ", exp_name, " from ", zscore_file)
|
||||
data <- fread(zscore_file)
|
||||
data$Experiment <- exp_name
|
||||
|
||||
return(data)
|
||||
}) %>%
|
||||
bind_rows()
|
||||
|
||||
combined_output_file <- file.path(out_dir, "combined_zscores.csv")
|
||||
fwrite(combined_data, combined_output_file, row.names = FALSE)
|
||||
|
||||
message("Combined z-score file saved to: ", combined_output_file)
|
||||
}
|
||||
|
||||
# Function to read and combine summary statistics files
|
||||
combine_summary_stats <- function(experiments, out_dir) {
|
||||
combined_stats <- lapply(names(experiments), function(exp_name) {
|
||||
exp_dir <- experiments[[exp_name]]
|
||||
summary_file <- file.path(exp_dir, "zscores", "summary_stats_all_strains.csv")
|
||||
|
||||
if (!file.exists(summary_file)) {
|
||||
stop("Summary stats file does not exist for ", exp_name, " at ", summary_file)
|
||||
}
|
||||
|
||||
message("Reading summary stats file for ", exp_name, " from ", summary_file)
|
||||
data <- fread(summary_file)
|
||||
data$Experiment <- exp_name
|
||||
|
||||
return(data)
|
||||
}) %>%
|
||||
bind_rows()
|
||||
|
||||
combined_output_file <- file.path(out_dir, "combined_summary_stats.csv")
|
||||
fwrite(combined_stats, combined_output_file, row.names = FALSE)
|
||||
|
||||
message("Combined summary stats file saved to: ", combined_output_file)
|
||||
}
|
||||
|
||||
# Function to generate final summary report
|
||||
generate_final_report <- function(out_dir) {
|
||||
combined_zscores <- file.path(out_dir, "combined_zscores.csv")
|
||||
combined_stats <- file.path(out_dir, "combined_summary_stats.csv")
|
||||
|
||||
if (!file.exists(combined_zscores) || !file.exists(combined_stats)) {
|
||||
stop("Combined z-scores or summary stats files do not exist.")
|
||||
}
|
||||
|
||||
zscores_data <- fread(combined_zscores)
|
||||
stats_data <- fread(combined_stats)
|
||||
|
||||
message("Merging z-score and summary stats data...")
|
||||
final_report <- merge(zscores_data, stats_data, by = c("OrfRep", "Experiment"), all = TRUE, allow.cartesian = TRUE)
|
||||
|
||||
final_report_file <- file.path(out_dir, "final_combined_report.csv")
|
||||
fwrite(final_report, final_report_file, row.names = FALSE)
|
||||
|
||||
message("Final combined report saved to: ", final_report_file)
|
||||
}
|
||||
|
||||
# Process all experiments and generate outputs
|
||||
combine_zscores(args$experiments, args$out_dir)
|
||||
combine_summary_stats(args$experiments, args$out_dir)
|
||||
generate_final_report(args$out_dir)
|
||||
Reference in New Issue
Block a user