hartman-server/workflow/.old/ScriptTemplates/GTAtemplate.R

#GTA (GoTermAveraging) Starting (Working Directory is /Code) All paths relative to /Code

#Currently must run for each Experiment.  _Future could loop thru the number of experiments involved in study. JWR
         Wstudy= getwd()
         if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
           inputFile <- '../Exp1/ZScores/ZScores_Interaction.csv'
           expName= "Exp1"
           dir.create("../GTAresults/Exp1")
         }
         if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
           inputFile[2] <- '../Exp2/ZScores/ZScores_Interaction.csv'
           expName[2]= "Exp2"
           dir.create("../GTAresults/Exp2")
         }
         if (file.exists('../Exp3/ZScores/ZScores_Interaction.csv')){
           inputFile[3] <- '../Exp3/ZScores/ZScores_Interaction.csv'
           expName[3]= "Exp3"
           dir.create("../GTAresults/Exp3")
         }
         if (file.exists('../Exp4/ZScores/ZScores_Interaction.csv')){
           inputFile[4] <- '../Exp4/ZScores/ZScores_Interaction.csv'
           expName[4]= "Exp4"
           dir.create("../GTAresults/Exp4")
         }

outputPathGTA= "../GTAresults"
        #dir.create(outPathGTA)

library("stringr")
library("org.Sc.sgd.db")
library("plyr")
#build in command args to apply this code to a given !!results sheet

SGD_Terms_file <- "../Code/go_terms.tab"  #ArgsScore[2]
        #https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
SGD_features_file <- "../Code/gene_association.sgd"  #ArgsScore[3]

#R and Rstudio have issues: The for loop(189) seemed to fail to evaluate the paste (or paste0) to build the inputFile inside the for loop and bail as the second loop began. This crude fix below, seems to have alleviated the failure to loop problem at least for now. Also for some annoying reason, the underscores between word are sometimes not shown when they exist. No ryme of reason!!!


#Begin for loop for experiments in this study-----------------ZScores_Interaction.csv
for(m in 1:length(inputFile)){

  #inputFile <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="")  #ArgsScore[1]
  X <- read.csv(file = inputFile[m],stringsAsFactors=FALSE,header = TRUE)

  if(colnames(X)[1] == "OrfRep"){
    colnames(X)[1] <- "ORF"
  }

  #Terms is the GO term list
  Terms <- read.delim(file = SGD_Terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
  #all ORFs associated with GO term
  GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
  #Gene_Association is the gene association to GO term file
  Gene_Association <- read.delim(SGD_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
  #Get the ORF names associated with each gene/GO term
  Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1]
  #Get the numeric GO ID for matching
  Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID),"\\:",2)[,2])
  #get all unique GO terms
  GO_Terms <- unique(Gene_Association$GO_ID)
  #create a character vector with just the ColNames of the input file to store the scores for each GO term
  Col_Names_X <- colnames(X)
  #create a data_frame with header from input_file
  GO_Term_Averages <- X[0,]
  #fill table with NAs same length as number of GO terms
  GO_Term_Averages[1:length(GO_Terms),] <- NA
  #change the first and second col names to GO_ID and Term
  colnames(GO_Term_Averages)[1] <- "GO_ID"
  colnames(GO_Term_Averages)[2] <- "Term"

  #create new columns for Ontology, number genes (used to calculate the avg score), all possible genes in the GO term, and print genes/ORFs used
  GO_Term_Averages$Ontology <- NA
  GO_Term_Averages$NumGenes <- NA
  GO_Term_Averages$AllPossibleGenes <- NA
  GO_Term_Averages$Genes <- NA
  GO_Term_Averages$ORFs <- NA

  #create a data.frame for the standard deviation info
  GO_Term_SD <- X[0,]
  GO_Term_SD[1:length(GO_Terms),] <- NA
  colnames(GO_Term_SD)[1] <- "GO_ID"
  colnames(GO_Term_SD)[2] <- "Term"

  #Loop for each GO term to get an average L and K Z score
  for(i in 1:length(GO_Terms)){
    #get the GO_Term
    ID <- GO_Terms[i]
    #Get data.frame for all genes associated to the GO Term
    ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID,]
    #get a vector of just the gene names
    ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]])
    if(length(unique(ID_AllGenes_vector)) > 4000){
      next()
    }
    #get the GO term character description where numeric Terms ID matches GO_Term's ID
    GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric,]$GO_Term[1])
    #get the Z scores for all genes in the GO_ID
    Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector,]
    #get the Gene names and ORFs for the term
    GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene),collapse=" | ")
    GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF),collapse=" | ")

    #dataframe to report the averages for a GO term
    #get the GO ID
    GO_Term_Averages$GO_ID[i] <- as.character(ID)
    #get the term name
    GO_Term_Averages$Term[i] <- GO_Description_Term
    #get total number of genes annotated to the Term that we have in our library
    GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF))
    #get total number of genes annotated to the Term in SGD
    GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector))
    #get the ontology of the term
    GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1])

    #calculate the average score for every column
    for(j in 3:length(X[1,])){
      GO_Term_Averages[i,j] <- mean(Zscores_For_ID[,j],na.rm = TRUE)
      #GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
    }

    #also calculate same values for the SD
    GO_Term_SD$GO_ID[i] <- as.character(ID)
    #get the term name
    GO_Term_SD$Term[i] <- GO_Description_Term

    #calculate column scores for SD
    for(j in 3:length(X[1,])){
      GO_Term_SD[i,j] <- sd(Zscores_For_ID[,j],na.rm = TRUE)
      #GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
    }
  }
  #add either _Avg or _SD depending on if the calculated score is an average or SD
  colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages),"Avg", sep = "_")
  colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD),"SD", sep = "_")
  #combine the averages with the SDs to make one big data.frame
  X2 <- cbind(GO_Term_Averages,GO_Term_SD)
  #test[ , order(names(test))]
  X2 <- X2[,order(names(X2))]
  X2 <- X2[!is.na(X2$Z_lm_L_Avg),]
  #create output file
  write.csv(X2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
  #remove NAs
  X3 <- X2[!is.na(X2$Z_lm_L_Avg),]
  #identify redundant GO terms

  for(i in 1:length(X3[,1])){
    #loop through each GO term - get term
    GO_term_ID <- as.character(X3$GO_ID_Avg[i])
    #get term in the X3
    X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID,]
    #get anywhere that has the same number K_Avg value
    X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp,]
    if(length(X3_Temp2[,1]) > 1){
      if(length(unique(X3_Temp2$Genes_Avg)) == 1){
        X3_Temp2 <- X3_Temp2[1,]
      }
    }

    if(i == 1){
      Y <- X3_Temp2
    }
    if(i > 1){
      Y <- rbind(Y,X3_Temp2)
    }
  }
  Y1 <- unique(Y)

  write.csv(Y1,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)

  Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,]
  Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),]
  write.csv(Y2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)

  Y3 <- Y2[Y2$NumGenes_Avg > 2,]
  write.csv(Y3,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)

  Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,]
  Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),]
  write.csv(Y4,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)

  Y5 <- Y4[Y4$NumGenes_Avg > 2,]
  write.csv(Y5,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)

  #End of 'for loop'
}