Organize old files

2024-07-29 18:12:57 -04:00
parent d2a5d9e69f
commit 39672a6ca0
98 changed files with 57004 additions and 19259 deletions
--- a/workflow/.old/apps/r/ScriptTemplates/GTAtemplate.R
+++ b/workflow/.old/apps/r/ScriptTemplates/GTAtemplate.R
@@ -0,0 +1,188 @@
+#GTA (GoTermAveraging) Starting (Working Directory is /Code) All paths relative to /Code
+
+#Currently must run for each Experiment.  _Future could loop thru the number of experiments involved in study. JWR
+         Wstudy= getwd()
+         if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
+           inputFile <- '../Exp1/ZScores/ZScores_Interaction.csv'
+           expName= "Exp1"
+           dir.create("../GTAresults/Exp1")
+         }
+         if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
+           inputFile[2] <- '../Exp2/ZScores/ZScores_Interaction.csv'
+           expName[2]= "Exp2"
+           dir.create("../GTAresults/Exp2")
+         }
+         if (file.exists('../Exp3/ZScores/ZScores_Interaction.csv')){
+           inputFile[3] <- '../Exp3/ZScores/ZScores_Interaction.csv'
+           expName[3]= "Exp3"
+           dir.create("../GTAresults/Exp3")
+         }
+         if (file.exists('../Exp4/ZScores/ZScores_Interaction.csv')){
+           inputFile[4] <- '../Exp4/ZScores/ZScores_Interaction.csv'
+           expName[4]= "Exp4"
+           dir.create("../GTAresults/Exp4")
+         }
+         
+outputPathGTA= "../GTAresults"
+        #dir.create(outPathGTA)
+
+library("stringr")
+library("org.Sc.sgd.db")
+library("plyr")
+#build in command args to apply this code to a given !!results sheet
+
+SGD_Terms_file <- "../Code/go_terms.tab"  #ArgsScore[2]
+        #https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
+SGD_features_file <- "../Code/gene_association.sgd"  #ArgsScore[3]
+
+#R and Rstudio have issues: The for loop(189) seemed to fail to evaluate the paste (or paste0) to build the inputFile inside the for loop and bail as the second loop began. This crude fix below, seems to have alleviated the failure to loop problem at least for now. Also for some annoying reason, the underscores between word are sometimes not shown when they exist. No ryme of reason!!!
+
+
+#Begin for loop for experiments in this study-----------------ZScores_Interaction.csv
+for(m in 1:length(inputFile)){
+  
+  #inputFile <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="")  #ArgsScore[1]
+  X <- read.csv(file = inputFile[m],stringsAsFactors=FALSE,header = TRUE)
+  
+  if(colnames(X)[1] == "OrfRep"){
+    colnames(X)[1] <- "ORF"
+  }
+  
+  #Terms is the GO term list
+  Terms <- read.delim(file = SGD_Terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
+  #all ORFs associated with GO term
+  GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
+  #Gene_Association is the gene association to GO term file
+  Gene_Association <- read.delim(SGD_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
+  #Get the ORF names associated with each gene/GO term
+  Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1]
+  #Get the numeric GO ID for matching
+  Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID),"\\:",2)[,2])
+  #get all unique GO terms
+  GO_Terms <- unique(Gene_Association$GO_ID)
+  #create a character vector with just the ColNames of the input file to store the scores for each GO term
+  Col_Names_X <- colnames(X)
+  #create a data_frame with header from input_file
+  GO_Term_Averages <- X[0,]
+  #fill table with NAs same length as number of GO terms
+  GO_Term_Averages[1:length(GO_Terms),] <- NA
+  #change the first and second col names to GO_ID and Term
+  colnames(GO_Term_Averages)[1] <- "GO_ID"
+  colnames(GO_Term_Averages)[2] <- "Term"
+  
+  #create new columns for Ontology, number genes (used to calculate the avg score), all possible genes in the GO term, and print genes/ORFs used
+  GO_Term_Averages$Ontology <- NA
+  GO_Term_Averages$NumGenes <- NA
+  GO_Term_Averages$AllPossibleGenes <- NA
+  GO_Term_Averages$Genes <- NA
+  GO_Term_Averages$ORFs <- NA
+  
+  #create a data.frame for the standard deviation info 
+  GO_Term_SD <- X[0,]
+  GO_Term_SD[1:length(GO_Terms),] <- NA
+  colnames(GO_Term_SD)[1] <- "GO_ID"
+  colnames(GO_Term_SD)[2] <- "Term"
+  
+  #Loop for each GO term to get an average L and K Z score
+  for(i in 1:length(GO_Terms)){
+    #get the GO_Term
+    ID <- GO_Terms[i]
+    #Get data.frame for all genes associated to the GO Term
+    ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID,]
+    #get a vector of just the gene names
+    ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]])
+    if(length(unique(ID_AllGenes_vector)) > 4000){
+      next()
+    }
+    #get the GO term character description where numeric Terms ID matches GO_Term's ID
+    GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric,]$GO_Term[1])
+    #get the Z scores for all genes in the GO_ID
+    Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector,]
+    #get the Gene names and ORFs for the term
+    GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene),collapse=" | ")
+    GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF),collapse=" | ")
+    
+    #dataframe to report the averages for a GO term
+    #get the GO ID
+    GO_Term_Averages$GO_ID[i] <- as.character(ID)
+    #get the term name
+    GO_Term_Averages$Term[i] <- GO_Description_Term
+    #get total number of genes annotated to the Term that we have in our library
+    GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF))
+    #get total number of genes annotated to the Term in SGD
+    GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector))
+    #get the ontology of the term
+    GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1])
+    
+    #calculate the average score for every column
+    for(j in 3:length(X[1,])){
+      GO_Term_Averages[i,j] <- mean(Zscores_For_ID[,j],na.rm = TRUE)
+      #GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
+    }
+    
+    #also calculate same values for the SD
+    GO_Term_SD$GO_ID[i] <- as.character(ID)
+    #get the term name
+    GO_Term_SD$Term[i] <- GO_Description_Term
+    
+    #calculate column scores for SD
+    for(j in 3:length(X[1,])){
+      GO_Term_SD[i,j] <- sd(Zscores_For_ID[,j],na.rm = TRUE)
+      #GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])])
+    }
+  }
+  #add either _Avg or _SD depending on if the calculated score is an average or SD
+  colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages),"Avg", sep = "_")
+  colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD),"SD", sep = "_")
+  #combine the averages with the SDs to make one big data.frame
+  X2 <- cbind(GO_Term_Averages,GO_Term_SD)
+  #test[ , order(names(test))]
+  X2 <- X2[,order(names(X2))]
+  X2 <- X2[!is.na(X2$Z_lm_L_Avg),]
+  #create output file
+  write.csv(X2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
+  #remove NAs
+  X3 <- X2[!is.na(X2$Z_lm_L_Avg),]
+  #identify redundant GO terms
+  
+  for(i in 1:length(X3[,1])){
+    #loop through each GO term - get term
+    GO_term_ID <- as.character(X3$GO_ID_Avg[i])
+    #get term in the X3
+    X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID,]
+    #get anywhere that has the same number K_Avg value
+    X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp,]
+    if(length(X3_Temp2[,1]) > 1){
+      if(length(unique(X3_Temp2$Genes_Avg)) == 1){
+        X3_Temp2 <- X3_Temp2[1,]
+      }
+    }
+    
+    if(i == 1){
+      Y <- X3_Temp2
+    }
+    if(i > 1){
+      Y <- rbind(Y,X3_Temp2)
+    }
+  }
+  Y1 <- unique(Y)
+  
+  write.csv(Y1,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
+  
+  Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,]
+  Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),]
+  write.csv(Y2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
+  
+  Y3 <- Y2[Y2$NumGenes_Avg > 2,]
+  write.csv(Y3,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
+  
+  Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,]
+  Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),]
+  write.csv(Y4,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
+  
+  Y5 <- Y4[Y4$NumGenes_Avg > 2,]
+  write.csv(Y5,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
+  
+  #End of 'for loop'
+}
+