Add gta module

This commit is contained in:
2024-07-24 00:06:56 -04:00
parent b049d58e79
commit 7b8ce3e7fd
3 changed files with 259 additions and 174 deletions

View File

@@ -1,59 +1,70 @@
#GTA (GoTermAveraging) Starting (Working Directory is /Code) All paths relative to /Code
#Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
#Loops thru the number of experiments involved in study. JWR
Wstudy= getwd()
if (file.exists('../Exp1/ZScores/ZScores_Interaction.csv')){
inputFile <- '../Exp1/ZScores/ZScores_Interaction.csv'
expName= "Exp1"
dir.create("../GTAresults/Exp1")
}
if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
inputFile[2] <- '../Exp2/ZScores/ZScores_Interaction.csv'
expName[2]= "Exp2"
dir.create("../GTAresults/Exp2")
}
if (file.exists('../Exp3/ZScores/ZScores_Interaction.csv')){
inputFile[3] <- '../Exp3/ZScores/ZScores_Interaction.csv'
expName[3]= "Exp3"
dir.create("../GTAresults/Exp3")
}
if (file.exists('../Exp4/ZScores/ZScores_Interaction.csv')){
inputFile[4] <- '../Exp4/ZScores/ZScores_Interaction.csv'
expName[4]= "Exp4"
dir.create("../GTAresults/Exp4")
}
outputPathGTA= "../GTAresults"
#dir.create(outPathGTA)
#!/usr/bin/env R
# GTA (GoTermAveraging)
# Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
#
# Updated 240724 Bryan C Roessler to improve file operations and portability
# NOTE: The script now has 2 additional OPTIONAL arguments:
# 1. Path to SGD terms file (go.terms.tab)
# 2. Path to SGD features file (gene_association.sgd)
library("stringr")
library("org.Sc.sgd.db")
library("plyr")
#build in command args to apply this code to a given !!results sheet
SGD_Terms_file <- "../Code/go_terms.tab" #ArgsScore[2]
#https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
SGD_features_file <- "../Code/gene_association.sgd" #ArgsScore[3]
# Parse arguments
args <- commandArgs(TRUE)
#R and Rstudio have issues: The for loop(189) seemed to fail to evaluate the paste (or paste0) to build the inputFile inside the for loop and bail as the second loop began. This crude fix below, seems to have alleviated the failure to loop problem at least for now. Also for some annoying reason, the underscores between word are sometimes not shown when they exist. No ryme of reason!!!
exp_name <- args[1]
if (length(args) > 2) {
zscores_file <- args[2]
} else {
zscores_file <- "ZScores/ZScores_Interaction.csv" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
if (length(args) > 3) {
sgd_terms_file <- args[3]
} else {
sgd_terms_file <- "../Code/go_terms.tab"
}
if (length(args) > 4) {
sgd_features_file <- args[4]
} else {
sgd_features_file <- "../Code/gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
if (length(args) > 5) {
output_dir <- args[5]
} else {
output_dir <- "../GTAresults" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
}
# # Set SGDgeneList file path
# if (length(args) > 4) {
# SGDgeneList <- args[4]
# } else {
# SGDgeneList <- "../Code/SGD_features.tab"
#Begin for loop for experiments in this study-----------------ZScores_Interaction.csv
for(m in 1:length(inputFile)){
for(m in 1:length(zscores_file)){
#inputFile <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1]
X <- read.csv(file = inputFile[m],stringsAsFactors=FALSE,header = TRUE)
#zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1]
X <- read.csv(file = zscores_file[m],stringsAsFactors=FALSE,header = TRUE)
if(colnames(X)[1] == "OrfRep"){
colnames(X)[1] <- "ORF"
}
#Terms is the GO term list
Terms <- read.delim(file = SGD_Terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
Terms <- read.delim(file = sgd_terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
#all ORFs associated with GO term
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
#Gene_Association is the gene association to GO term file
Gene_Association <- read.delim(SGD_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
Gene_Association <- read.delim(sgd_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
#Get the ORF names associated with each gene/GO term
Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1]
#Get the numeric GO ID for matching
@@ -140,7 +151,7 @@ for(m in 1:length(inputFile)){
X2 <- X2[,order(names(X2))]
X2 <- X2[!is.na(X2$Z_lm_L_Avg),]
#create output file
write.csv(X2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
write.csv(X2,file=paste(output_dir,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
#remove NAs
X3 <- X2[!is.na(X2$Z_lm_L_Avg),]
#identify redundant GO terms
@@ -167,21 +178,21 @@ for(m in 1:length(inputFile)){
}
Y1 <- unique(Y)
write.csv(Y1,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
write.csv(Y1,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,]
Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),]
write.csv(Y2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
write.csv(Y2,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
Y3 <- Y2[Y2$NumGenes_Avg > 2,]
write.csv(Y3,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
write.csv(Y3,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,]
Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),]
write.csv(Y4,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
write.csv(Y4,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
Y5 <- Y4[Y4$NumGenes_Avg > 2,]
write.csv(Y5,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
write.csv(Y5,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
#End of 'for loop'
}