#!/usr/bin/env R # GTA (GoTermAveraging) # Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes # # Updated 240724 Bryan C Roessler to improve file operations and portability # NOTE: The script now has 2 additional OPTIONAL arguments: # 1. Path to SGD terms file (go.terms.tab) # 2. Path to SGD features file (gene_association.sgd) library("stringr") library("org.Sc.sgd.db") library("plyr") # Parse arguments args <- commandArgs(TRUE) exp_name <- args[1] if (length(args) > 2) { zscores_file <- args[2] } else { zscores_file <- "ZScores/ZScores_Interaction.csv" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd } if (length(args) > 3) { sgd_terms_file <- args[3] } else { sgd_terms_file <- "../Code/go_terms.tab" } if (length(args) > 4) { sgd_features_file <- args[4] } else { sgd_features_file <- "../Code/gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd } if (length(args) > 5) { output_dir <- args[5] } else { output_dir <- "../GTAresults" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd } # # Set SGDgeneList file path # if (length(args) > 4) { # SGDgeneList <- args[4] # } else { # SGDgeneList <- "../Code/SGD_features.tab" #Begin for loop for experiments in this study-----------------ZScores_Interaction.csv for(m in 1:length(zscores_file)){ #zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1] X <- read.csv(file = zscores_file[m],stringsAsFactors=FALSE,header = TRUE) if(colnames(X)[1] == "OrfRep"){ colnames(X)[1] <- "ORF" } #Terms is the GO term list Terms <- read.delim(file = sgd_terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition")) #all ORFs associated with GO term GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS) #Gene_Association is the gene association to GO term file Gene_Association <- read.delim(sgd_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty")) #Get the ORF names associated with each gene/GO term Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1] #Get the numeric GO ID for matching Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID),"\\:",2)[,2]) #get all unique GO terms GO_Terms <- unique(Gene_Association$GO_ID) #create a character vector with just the ColNames of the input file to store the scores for each GO term Col_Names_X <- colnames(X) #create a data_frame with header from input_file GO_Term_Averages <- X[0,] #fill table with NAs same length as number of GO terms GO_Term_Averages[1:length(GO_Terms),] <- NA #change the first and second col names to GO_ID and Term colnames(GO_Term_Averages)[1] <- "GO_ID" colnames(GO_Term_Averages)[2] <- "Term" #create new columns for Ontology, number genes (used to calculate the avg score), all possible genes in the GO term, and print genes/ORFs used GO_Term_Averages$Ontology <- NA GO_Term_Averages$NumGenes <- NA GO_Term_Averages$AllPossibleGenes <- NA GO_Term_Averages$Genes <- NA GO_Term_Averages$ORFs <- NA #create a data.frame for the standard deviation info GO_Term_SD <- X[0,] GO_Term_SD[1:length(GO_Terms),] <- NA colnames(GO_Term_SD)[1] <- "GO_ID" colnames(GO_Term_SD)[2] <- "Term" #Loop for each GO term to get an average L and K Z score for(i in 1:length(GO_Terms)){ #get the GO_Term ID <- GO_Terms[i] #Get data.frame for all genes associated to the GO Term ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID,] #get a vector of just the gene names ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]]) if(length(unique(ID_AllGenes_vector)) > 4000){ next() } #get the GO term character description where numeric Terms ID matches GO_Term's ID GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric,]$GO_Term[1]) #get the Z scores for all genes in the GO_ID Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector,] #get the Gene names and ORFs for the term GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene),collapse=" | ") GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF),collapse=" | ") #dataframe to report the averages for a GO term #get the GO ID GO_Term_Averages$GO_ID[i] <- as.character(ID) #get the term name GO_Term_Averages$Term[i] <- GO_Description_Term #get total number of genes annotated to the Term that we have in our library GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF)) #get total number of genes annotated to the Term in SGD GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector)) #get the ontology of the term GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1]) #calculate the average score for every column for(j in 3:length(X[1,])){ GO_Term_Averages[i,j] <- mean(Zscores_For_ID[,j],na.rm = TRUE) #GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])]) } #also calculate same values for the SD GO_Term_SD$GO_ID[i] <- as.character(ID) #get the term name GO_Term_SD$Term[i] <- GO_Description_Term #calculate column scores for SD for(j in 3:length(X[1,])){ GO_Term_SD[i,j] <- sd(Zscores_For_ID[,j],na.rm = TRUE) #GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])]) } } #add either _Avg or _SD depending on if the calculated score is an average or SD colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages),"Avg", sep = "_") colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD),"SD", sep = "_") #combine the averages with the SDs to make one big data.frame X2 <- cbind(GO_Term_Averages,GO_Term_SD) #test[ , order(names(test))] X2 <- X2[,order(names(X2))] X2 <- X2[!is.na(X2$Z_lm_L_Avg),] #create output file write.csv(X2,file=paste(output_dir,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE) #remove NAs X3 <- X2[!is.na(X2$Z_lm_L_Avg),] #identify redundant GO terms for(i in 1:length(X3[,1])){ #loop through each GO term - get term GO_term_ID <- as.character(X3$GO_ID_Avg[i]) #get term in the X3 X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID,] #get anywhere that has the same number K_Avg value X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp,] if(length(X3_Temp2[,1]) > 1){ if(length(unique(X3_Temp2$Genes_Avg)) == 1){ X3_Temp2 <- X3_Temp2[1,] } } if(i == 1){ Y <- X3_Temp2 } if(i > 1){ Y <- rbind(Y,X3_Temp2) } } Y1 <- unique(Y) write.csv(Y1,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE) Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,] Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),] write.csv(Y2,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE) Y3 <- Y2[Y2$NumGenes_Avg > 2,] write.csv(Y3,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE) Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,] Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),] write.csv(Y4,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE) Y5 <- Y4[Y4$NumGenes_Avg > 2,] write.csv(Y5,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE) #End of 'for loop' }