Add gta module

2024-07-24 00:06:56 -04:00
parent b049d58e79
commit 7b8ce3e7fd
3 changed files with 259 additions and 174 deletions
--- a/workflow/templates/qhtcp/Code/GTAtemplate.R
+++ b/workflow/templates/qhtcp/Code/GTAtemplate.R
@@ -1,59 +1,70 @@
-#GTA (GoTermAveraging) Starting (Working Directory is /Code) All paths relative to /Code
-#Your output  may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
-#Loops thru the number of experiments involved in study. JWR
-Wstudy= getwd()
-if (file.exists('../Exp1/ZScores/ZScores_Interaction.csv')){
-  inputFile <- '../Exp1/ZScores/ZScores_Interaction.csv'
-  expName= "Exp1"
-  dir.create("../GTAresults/Exp1")
-}
-if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
-  inputFile[2] <- '../Exp2/ZScores/ZScores_Interaction.csv'
-  expName[2]= "Exp2"
-  dir.create("../GTAresults/Exp2")
-}
-if (file.exists('../Exp3/ZScores/ZScores_Interaction.csv')){
-  inputFile[3] <- '../Exp3/ZScores/ZScores_Interaction.csv'
-  expName[3]= "Exp3"
-  dir.create("../GTAresults/Exp3")
-}
-if (file.exists('../Exp4/ZScores/ZScores_Interaction.csv')){
-  inputFile[4] <- '../Exp4/ZScores/ZScores_Interaction.csv'
-  expName[4]= "Exp4"
-  dir.create("../GTAresults/Exp4")
-}
-         
-outputPathGTA= "../GTAresults"
-        #dir.create(outPathGTA)
+#!/usr/bin/env Rscript
+# GTA (GoTermAveraging)
+# Your output may not be reproducible because org.Sc.sgd.db is loaded from the Bioconductor R library, which changes over time
+#
+# Updated 240724 Bryan C Roessler to improve file operations and portability
+# NOTE: Positional arguments (2-5 are OPTIONAL and fall back to defaults):
+#   1. Experiment name; 2. Path to Z-scores file; 3. Path to SGD terms file (go_terms.tab)
+#   4. Path to SGD features file (gene_association.sgd); 5. Output directory

 library("stringr")
 library("org.Sc.sgd.db")
 library("plyr")
-#build in command args to apply this code to a given !!results sheet

-SGD_Terms_file <- "../Code/go_terms.tab"  #ArgsScore[2]
-        #https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
-SGD_features_file <- "../Code/gene_association.sgd"  #ArgsScore[3]
+# Parse arguments
+args <- commandArgs(TRUE)

-#R and Rstudio have issues: The for loop(189) seemed to fail to evaluate the paste (or paste0) to build the inputFile inside the for loop and bail as the second loop began. This crude fix below, seems to have alleviated the failure to loop problem at least for now. Also for some annoying reason, the underscores between word are sometimes not shown when they exist. No ryme of reason!!!
+exp_name <- args[1] # Required arg 1: experiment name, used as the output subdirectory
+if (is.na(exp_name)) stop("Missing required argument: experiment name", call. = FALSE)
+if (length(args) >= 2) { # Optional arg 2: args[N] is usable as soon as N arguments were supplied
+  zscores_file <- args[2]
+} else {
+  zscores_file <- "ZScores/ZScores_Interaction.csv"
+}
+
+if (length(args) >= 3) { # Optional arg 3: GO terms table
+  sgd_terms_file <- args[3]
+} else {
+  sgd_terms_file <- "../Code/go_terms.tab"
+}
+
+if (length(args) >= 4) { # Optional arg 4: gene association file
+  sgd_features_file <- args[4]
+} else {
+  sgd_features_file <- "../Code/gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
+}
+
+if (length(args) >= 5) { # Optional arg 5: root directory for results
+  output_dir <- args[5]
+} else {
+  output_dir <- "../GTAresults"
+}
+
+# Make sure the per-experiment results directory exists before the CSVs are written
+dir.create(file.path(output_dir, exp_name), recursive = TRUE, showWarnings = FALSE)
+# # Set SGDgeneList file path
+# if (length(args) > 4) {
+#   SGDgeneList <- args[4]
+# } else {
+#   SGDgeneList <- "../Code/SGD_features.tab"


 #Begin for loop for experiments in this study-----------------ZScores_Interaction.csv
-for(m in 1:length(inputFile)){
+for(m in 1:length(zscores_file)){
  
-  #inputFile <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="")  #ArgsScore[1]
-  X <- read.csv(file = inputFile[m],stringsAsFactors=FALSE,header = TRUE)
+  #zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="")  #ArgsScore[1]
+  X <- read.csv(file = zscores_file[m],stringsAsFactors=FALSE,header = TRUE)
  
  if(colnames(X)[1] == "OrfRep"){
    colnames(X)[1] <- "ORF"
  }
  
  #Terms is the GO term list
-  Terms <- read.delim(file = SGD_Terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
+  Terms <- read.delim(file = sgd_terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
  #all ORFs associated with GO term
  GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
  #Gene_Association is the gene association to GO term file
-  Gene_Association <- read.delim(SGD_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
+  Gene_Association <- read.delim(sgd_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
  #Get the ORF names associated with each gene/GO term
  Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1]
  #Get the numeric GO ID for matching
@@ -140,7 +151,7 @@ for(m in 1:length(inputFile)){
  X2 <- X2[,order(names(X2))]
  X2 <- X2[!is.na(X2$Z_lm_L_Avg),]
  #create output file
-  write.csv(X2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
+  write.csv(X2,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_All.csv",sep=""),row.names=FALSE) # exp_name: the per-experiment expName vector no longer exists
  #remove NAs
  X3 <- X2[!is.na(X2$Z_lm_L_Avg),]
  #identify redundant GO terms
@@ -167,21 +178,21 @@ for(m in 1:length(inputFile)){
  }
  Y1 <- unique(Y)
  
-  write.csv(Y1,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
+  write.csv(Y1,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
  
  Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,]
  Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),]
-  write.csv(Y2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
+  write.csv(Y2,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
  
  Y3 <- Y2[Y2$NumGenes_Avg > 2,]
-  write.csv(Y3,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
+  write.csv(Y3,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
  
  Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,]
  Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),]
-  write.csv(Y4,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
+  write.csv(Y4,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
  
  Y5 <- Y4[Y4$NumGenes_Avg > 2,]
-  write.csv(Y5,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
+  write.csv(Y5,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
  
  #End of 'for loop'
 }