#!/usr/bin/env python
"""Concatenate batch GO Term Finder (GTF) results into a single list table.

The inputs are the ``*.txt.tsv`` result files generated by the batch GTF perl
code (Chris Johnson, U of Tulsa).

Usage: python Concatenate_GTF_results.py data_dir output_file
"""

import glob
import os
import sys


def list_files(directory):
    """Return the paths of all ``*.txt.tsv`` files directly inside *directory*.

    The order is whatever ``glob.glob`` yields; callers that need a stable
    order must sort the result themselves.
    """
    return glob.glob(os.path.join(directory, '*.txt.tsv'))


def concatenate_gtf_results(data_dir, output_file):
    """Concatenate every GTF result file in *data_dir* into *output_file*.

    Input files are processed in sorted path order.  For each file the first
    line (the column labels) is written, followed by every non-blank line of
    the file.  Lines are stripped of surrounding whitespace, including
    trailing tabs, before being written, so the label line repeats once per
    input file.

    Args:
        data_dir: Directory holding the ``*.txt.tsv`` result files.
        output_file: Path of the file to create (an existing file is
            overwritten).

    Raises:
        OSError: If the output file or any input file cannot be opened.
    """
    # If the output lives in data_dir and matches the pattern, it must not be
    # read back as an input while it is being rewritten.
    output_path = os.path.abspath(output_file)
    files = sorted(
        path for path in list_files(data_dir)
        if os.path.abspath(path) != output_path
    )

    # The context manager closes the output even if reading an input fails.
    with open(output_file, 'w') as output:
        for file_path in files:
            with open(file_path, 'r') as f:
                # Label line of this file; an empty file yields a blank line.
                output.write(f.readline().strip() + '\n')
                for line in f:
                    # strip() already removes tabs along with other whitespace.
                    line = line.strip()
                    if line:
                        output.write(line + '\n')


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 when the argument count is wrong.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        print('Usage: python Concatenate_GTF_results.py data_dir output_file')
        return 1
    concatenate_gtf_results(args[1], args[2])
    return 0


if __name__ == '__main__':
    sys.exit(main())


# Old version (kept for reference; indentation reconstructed)
# def list_files(directory):
#     """Return a list of all files in the given directory."""
#     return glob.glob(os.path.join(directory, '*.txt.tsv'))
#
# try:
#     data_file_Path = sys.argv[1]
#     output_file_Path = sys.argv[2]
# except:
#     print ('Usage: python Concatenate_GTF_results.py /datasetPath /outputFilePath_and_Name')
#     print ('Data file not found, error in given directory')
#     sys.exit(1)
# try:
#     output = open(output_file_Path, 'w')
# except OSError:
#     print ('output file error')
#
# # get all the GTF result files in given directory
# File_list = []
# File_list = list_files(data_file_Path)
# File_list.sort()
# i = 0
# for file in File_list:
#     #parse the file names given in absolute path
#     file_name = file.strip().split('/')[-1]
#     file_name = file_name.rstrip('.txt.tsv')
#     # function to read tsv files from a given directory
#     #open the file
#     data = open(file,'r')
#     #reading the label line
#     labelLine = data.readline()
#     label = labelLine.strip().split('\t')
#     #write the label
#     #updates2010July26: update following label writing code
#     if i == 0:
#         # output.write('cluster origin')
#         for element in label:
#             output.write(element)
#             output.write('\t')
#         i = i + 1
#     #updates2010July26 End
#     #switch to the next line
#     output.write('\n')
#     #read the GO terms
#     GOTermLines = data.readlines()
#     for GOTerm in GOTermLines:
#         GOTerm = GOTerm.strip().strip('\t')
#         if GOTerm != '':
#             #updates2010July26: remove the code to write the first column 'REMc cluster ID'
#             #output.write(file_name)
#             #output.write('\t')
#             ##updates2010July26 update end
#             output.write(GOTerm + '\n')
#             #output.write('\n')
# output.close()