Files
hartman-server/workflow/.old/ScriptTemplates/GTAtemplate.R

189 lines
8.4 KiB
R

#GTA (GoTermAveraging) Starting (Working Directory is /Code) All paths relative to /Code
#Currently must run for each Experiment. _Future could loop thru the number of experiments involved in study. JWR
Wstudy= getwd()
if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
inputFile <- '../Exp1/ZScores/ZScores_Interaction.csv'
expName= "Exp1"
dir.create("../GTAresults/Exp1")
}
if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
inputFile[2] <- '../Exp2/ZScores/ZScores_Interaction.csv'
expName[2]= "Exp2"
dir.create("../GTAresults/Exp2")
}
if (file.exists('../Exp3/ZScores/ZScores_Interaction.csv')){
inputFile[3] <- '../Exp3/ZScores/ZScores_Interaction.csv'
expName[3]= "Exp3"
dir.create("../GTAresults/Exp3")
}
if (file.exists('../Exp4/ZScores/ZScores_Interaction.csv')){
inputFile[4] <- '../Exp4/ZScores/ZScores_Interaction.csv'
expName[4]= "Exp4"
dir.create("../GTAresults/Exp4")
}
outputPathGTA= "../GTAresults"
#dir.create(outPathGTA)
library("stringr")
library("org.Sc.sgd.db")
library("plyr")
#build in command args to apply this code to a given !!results sheet
SGD_Terms_file <- "../Code/go_terms.tab" #ArgsScore[2]
#https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
SGD_features_file <- "../Code/gene_association.sgd" #ArgsScore[3]
#R and Rstudio have issues: The for loop(189) seemed to fail to evaluate the paste (or paste0) to build the inputFile inside the for loop and bail as the second loop began. This crude fix below, seems to have alleviated the failure to loop problem at least for now. Also for some annoying reason, the underscores between word are sometimes not shown when they exist. No ryme of reason!!!
#Begin for loop for experiments in this study-----------------ZScores_Interaction.csv
for(m in 1:length(inputFile)){
#inputFile <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1]
X <- read.csv(file = inputFile[m],stringsAsFactors=FALSE,header = TRUE)
if(colnames(X)[1] == "OrfRep"){
colnames(X)[1] <- "ORF"
}
#Terms is the GO term list
Terms <- read.delim(file = SGD_Terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
#all ORFs associated with GO term
GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
#Gene_Association is the gene association to GO term file
Gene_Association <- read.delim(SGD_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
#Get the ORF names associated with each gene/GO term
Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1]
#Get the numeric GO ID for matching
Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID),"\\:",2)[,2])
#get all unique GO terms
GO_Terms <- unique(Gene_Association$GO_ID)
#create a character vector with just the ColNames of the input file to store the scores for each GO term
Col_Names_X <- colnames(X)
#create a data_frame with header from input_file
GO_Term_Averages <- X[0,]
#fill table with NAs same length as number of GO terms
GO_Term_Averages[1:length(GO_Terms),] <- NA
#change the first and second col names to GO_ID and Term
colnames(GO_Term_Averages)[1] <- "GO_ID"
colnames(GO_Term_Averages)[2] <- "Term"
#create new columns for Ontology, number genes (used to calculate the avg score), all possible genes in the GO term, and print genes/ORFs used
GO_Term_Averages$Ontology <- NA
GO_Term_Averages$NumGenes <- NA
GO_Term_Averages$AllPossibleGenes <- NA
GO_Term_Averages$Genes <- NA
GO_Term_Averages$ORFs <- NA
#create a data.frame for the standard deviation info
GO_Term_SD <- X[0,]
GO_Term_SD[1:length(GO_Terms),] <- NA
colnames(GO_Term_SD)[1] <- "GO_ID"
colnames(GO_Term_SD)[2] <- "Term"
#Loop for each GO term to get an average L and K Z score
for(i in 1:length(GO_Terms)){
#get the GO_Term
ID <- GO_Terms[i]
#Get data.frame for all genes associated to the GO Term
ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID,]
#get a vector of just the gene names
ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]])
if(length(unique(ID_AllGenes_vector)) > 4000){
next()
}
#get the GO term character description where numeric Terms ID matches GO_Term's ID
GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric,]$GO_Term[1])
#get the Z scores for all genes in the GO_ID
Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector,]
#get the Gene names and ORFs for the term
GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene),collapse=" | ")
GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF),collapse=" | ")
#dataframe to report the averages for a GO term
#get the GO ID
GO_Term_Averages$GO_ID[i] <- as.character(ID)
#get the term name
GO_Term_Averages$Term[i] <- GO_Description_Term
#get total number of genes annotated to the Term that we have in our library
GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF))
#get total number of genes annotated to the Term in SGD
GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector))
#get the ontology of the term
GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1])
#calculate the average score for every column
for(j in 3:length(X[1,])){
GO_Term_Averages[i,j] <- mean(Zscores_For_ID[,j],na.rm = TRUE)
#GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
}
#also calculate same values for the SD
GO_Term_SD$GO_ID[i] <- as.character(ID)
#get the term name
GO_Term_SD$Term[i] <- GO_Description_Term
#calculate column scores for SD
for(j in 3:length(X[1,])){
GO_Term_SD[i,j] <- sd(Zscores_For_ID[,j],na.rm = TRUE)
#GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
}
}
#add either _Avg or _SD depending on if the calculated score is an average or SD
colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages),"Avg", sep = "_")
colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD),"SD", sep = "_")
#combine the averages with the SDs to make one big data.frame
X2 <- cbind(GO_Term_Averages,GO_Term_SD)
#test[ , order(names(test))]
X2 <- X2[,order(names(X2))]
X2 <- X2[!is.na(X2$Z_lm_L_Avg),]
#create output file
write.csv(X2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
#remove NAs
X3 <- X2[!is.na(X2$Z_lm_L_Avg),]
#identify redundant GO terms
for(i in 1:length(X3[,1])){
#loop through each GO term - get term
GO_term_ID <- as.character(X3$GO_ID_Avg[i])
#get term in the X3
X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID,]
#get anywhere that has the same number K_Avg value
X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp,]
if(length(X3_Temp2[,1]) > 1){
if(length(unique(X3_Temp2$Genes_Avg)) == 1){
X3_Temp2 <- X3_Temp2[1,]
}
}
if(i == 1){
Y <- X3_Temp2
}
if(i > 1){
Y <- rbind(Y,X3_Temp2)
}
}
Y1 <- unique(Y)
write.csv(Y1,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,]
Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),]
write.csv(Y2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
Y3 <- Y2[Y2$NumGenes_Avg > 2,]
write.csv(Y3,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,]
Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),]
write.csv(Y4,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
Y5 <- Y4[Y4$NumGenes_Avg > 2,]
write.csv(Y5,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
#End of 'for loop'
}