#!/usr/bin/env R # GTA (GoTermAveraging) # Your output may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes # # Updated 240724 Bryan C Roessler to improve file operations and portability # NOTE: The script now has 2 additional OPTIONAL arguments: # 1. Path to SGD terms file (go.terms.tab) # 2. Path to SGD features file (gene_association.sgd) library("stringr") library("org.Sc.sgd.db") library("plyr") # Parse arguments args <- commandArgs(TRUE) exp_name <- args[1] if (length(args) >= 2) { zscores_file <- args[2] } else { zscores_file <- "zscores/zscores_interaction.csv" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd } if (length(args) >= 3) { sgd_terms_file <- args[3] } else { sgd_terms_file <- "go_terms.tab" } if (length(args) >= 4) { sgd_features_file <- args[4] } else { sgd_features_file <- "gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd } if (length(args) >= 5) { output_dir <- args[5] } else { output_dir <- "../../out/gta" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd } # # Set SGDgeneList file path # if (length(args) > 4) { # SGDgeneList <- args[4] # } else { # SGDgeneList <- "../Code/SGD_features.tab" # Begin for loop for experiments in this study # ZScores_Interaction.csv for (m in 1:length(zscores_file)) { # zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="") #ArgsScore[1] X <- read.csv(file = zscores_file[m], stringsAsFactors = FALSE, header = TRUE) if (colnames(X)[1] == "OrfRep") { colnames(X)[1] <- "ORF" } #Terms is the GO term list Terms <- read.delim(file = sgd_terms_file, header = FALSE, quote = "", col.names = c("GO_ID", "GO_Term", "GO_Aspect", "GO_Term_Definition")) #all ORFs associated with GO term GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS) #Gene_Association is the gene association to GO term file Gene_Association <- read.delim(sgd_features_file, skip = 8, header = FALSE, quote = "", col.names = c("Database", "Database_Object_ID", "Database_Object_Symbol", "NOT", "GO_ID", "Database_Reference", "Evidence", "With_or_From", "Aspect", "Database_Object_Name", "Database_Object_Synonym", "Database_Object_Type", "taxon", "Date", "Assigned_By", "OtherInfo", "Empty" ) ) #Get the ORF names associated with each gene/GO term Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym), "\\|", 2)[, 1] #Get the numeric GO ID for matching Gene_Association$GO_ID_Numeric <- as.integer(str_split_fixed(as.character(Gene_Association$GO_ID), "\\:", 2)[, 2]) #Get all unique GO terms GO_Terms <- unique(Gene_Association$GO_ID) #Create a character vector with just the ColNames of the input file to store the scores for each GO term Col_Names_X <- colnames(X) #Create a data_frame with header from input_file GO_Term_Averages <- X[0, ] #Fill table with NAs same length as number of GO terms GO_Term_Averages[1:length(GO_Terms), ] <- NA #Change the first and second col names to GO_ID and Term colnames(GO_Term_Averages)[1] <- "GO_ID" colnames(GO_Term_Averages)[2] <- "Term" # Create new columns GO_Term_Averages$Ontology <- NA GO_Term_Averages$NumGenes <- NA GO_Term_Averages$AllPossibleGenes <- NA GO_Term_Averages$Genes <- NA GO_Term_Averages$ORFs <- NA # Create a data.frame for the standard deviation info GO_Term_SD <- X[0, ] GO_Term_SD[1:length(GO_Terms), ] <- NA colnames(GO_Term_SD)[1] <- "GO_ID" colnames(GO_Term_SD)[2] <- "Term" # Loop for each GO term to get an average L and K Z score for (i in 1:length(GO_Terms)) { # Get the GO_Term ID <- GO_Terms[i] # Get data.frame for all genes associated to the GO Term ID_AllGenes <- Gene_Association[Gene_Association$GO_ID == ID, ] # Get a vector of just the gene names ID_AllGenes_vector <- as.vector(GO2ALLORFs[as.character(ID)][[1]]) if (length(unique(ID_AllGenes_vector)) > 4000) { next() } # Get the GO term character description where numeric Terms ID matches GO_Term's ID GO_Description_Term <- as.character(Terms[Terms$GO_ID %in% ID_AllGenes$GO_ID_Numeric, ]$GO_Term[1]) # Get the Z scores for all genes in the GO_ID Zscores_For_ID <- X[X$ORF %in% ID_AllGenes_vector, ] # Get the Gene names and ORFs for the term GO_Term_Averages$Genes[i] <- paste(unique(Zscores_For_ID$Gene), collapse = " | ") GO_Term_Averages$ORFs[i] <- paste(unique(Zscores_For_ID$ORF), collapse = " | ") # Dataframe to report the averages for a GO term # Get the GO ID GO_Term_Averages$GO_ID[i] <- as.character(ID) # Get the term name GO_Term_Averages$Term[i] <- GO_Description_Term # Get total number of genes annotated to the Term that we have in our library GO_Term_Averages$NumGenes[i] <- length(unique(Zscores_For_ID$ORF)) # Get total number of genes annotated to the Term in SGD GO_Term_Averages$AllPossibleGenes[i] <- length(unique(ID_AllGenes_vector)) # Get the ontology of the term GO_Term_Averages$Ontology[i] <- as.character(ID_AllGenes$Aspect[1]) # Calculate the average score for every column for (j in 3:length(X[1, ])) { GO_Term_Averages[i, j] <- mean(Zscores_For_ID[, j], na.rm = TRUE) # GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])]) } # Also calculate same values for the SD GO_Term_SD$GO_ID[i] <- as.character(ID) # Get the term name GO_Term_SD$Term[i] <- GO_Description_Term # Calculate column scores for SD for (j in 3:length(X[1, ])) { GO_Term_SD[i, j] <- sd(Zscores_For_ID[, j], na.rm = TRUE) # GO_Scores <- colMeans(Zscores_For_ID[,3:length(X[1,])]) } } # Add either _Avg or _SD depending on if the calculated score is an average or SD colnames(GO_Term_Averages) <- paste(colnames(GO_Term_Averages), "Avg", sep = "_") colnames(GO_Term_SD) <- paste(colnames(GO_Term_SD), "SD", sep = "_") # Combine the averages with the SDs to make one big data.frame X2 <- cbind(GO_Term_Averages, GO_Term_SD) # Test[ , order(names(test))] X2 <- X2[, order(names(X2))] X2 <- X2[!is.na(X2$Z_lm_L_Avg), ] # Create output file write.csv(X2, file.path(output_dir, expName[m], "Average_GOTerms_All.csv"), row.names = FALSE) # Remove NAs X3 <- X2[!is.na(X2$Z_lm_L_Avg), ] # Identify redundant GO terms for (i in 1:length(X3[, 1])) { # Loop through each GO term - get term GO_term_ID <- as.character(X3$GO_ID_Avg[i]) # Get term in the X3 X3_Temp <- X3[X3$GO_ID_Avg == GO_term_ID, ] # Get anywhere that has the same number K_Avg value X3_Temp2 <- X3[X3$Z_lm_K_Avg %in% X3_Temp, ] if (length(X3_Temp2[, 1]) > 1) { if (length(unique(X3_Temp2$Genes_Avg)) == 1) { X3_Temp2 <- X3_Temp2[1, ] } } if (i == 1) { Y <- X3_Temp2 } if (i > 1) { Y <- rbind(Y, X3_Temp2) } } Y1 <- unique(Y) write.csv(Y1, file.path(output_dir, exp_name, "Average_GOTerms_All_NonRedundantTerms.csv"), row.names = FALSE) Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2, ] Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg), ] write.csv(Y2, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L.csv"), row.names = FALSE) Y3 <- Y2[Y2$NumGenes_Avg > 2, ] write.csv(Y3, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv"), row.names = FALSE) Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2, ] Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg), ] write.csv(Y4, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K.csv"), row.names = FALSE) Y5 <- Y4[Y4$NumGenes_Avg > 2, ] write.csv(Y5, file.path(output_dir, exp_name, "Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv"), row.names = FALSE) }