123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
- #!/usr/bin/env python
# Concatenate the batch GO Term Finder (GTF) results (.tsv files) generated by the batch GTF Perl code (Chris Johnson, U of Tulsa) into a single list table.
import glob
import os
import sys
def list_files(directory):
    """Return the paths of the ``*.txt.tsv`` files directly inside *directory*.

    Only entries whose names match ``*.txt.tsv`` are included; the search is
    not recursive. Each path is *directory* joined with the matching entry
    name. The order is whatever ``glob`` yields, so callers that need a
    stable order must sort the result themselves.
    """
    pattern = os.path.join(directory, '*.txt.tsv')
    return list(glob.iglob(pattern))
def concatenate_gtf_results(data_dir, output_file):
    """Concatenate the GTF result tables found in *data_dir* into one file.

    Every path returned by ``list_files(data_dir)`` is processed in sorted
    order. For each input file the first line is written as a header and
    every remaining non-blank line is written after it; all lines have their
    surrounding whitespace (including tabs) removed. The header is therefore
    repeated once per input file. Blank lines are never written.

    Args:
        data_dir: Directory that holds the result files to combine.
        output_file: Path of the combined file; it is overwritten if it
            already exists.

    Raises:
        OSError: If the output file or any input file cannot be opened.
    """
    # If the output lives in data_dir and matches the input pattern it must
    # not be treated as an input: reading a file while appending to it can
    # duplicate content or never terminate.
    out_path = os.path.realpath(output_file)
    files = sorted(
        path for path in list_files(data_dir)
        if os.path.realpath(path) != out_path
    )

    # Context managers guarantee both handles are closed even when a read or
    # write fails part-way through.
    with open(output_file, 'w') as output:
        for file_path in files:
            with open(file_path, 'r') as f:
                # Splitting on tabs and re-joining with tabs is an identity
                # operation, so the stripped header line is written directly.
                header = f.readline().strip()
                if header:
                    output.write(header + '\n')
                for line in f:
                    # str.strip() with no argument already removes tabs.
                    line = line.strip()
                    if line:
                        output.write(line + '\n')
if __name__ == '__main__':
    # Command-line entry point. Exactly two arguments are expected: the
    # directory holding the result files and the path of the combined output.
    if len(sys.argv) != 3:
        # Passing a string to sys.exit prints it to stderr and exits with
        # status 1, so the usage text does not pollute stdout.
        sys.exit('Usage: python Concatenate_GTF_results.py data_dir output_file')
    concatenate_gtf_results(sys.argv[1], sys.argv[2])
- # Old version
- # def list_files(directory):
- # """Return a list of all files in the given directory."""
- # return glob.glob(os.path.join(directory, '*.txt.tsv'))
- # try:
- # data_file_Path = sys.argv[1]
- # output_file_Path = sys.argv[2]
- # except:
- # print ('Usage: python Concatenate_GTF_results.py /datasetPath /outputFilePath_and_Name')
- # print ('Data file not found, error in given directory')
- # sys.exit(1)
- # try:
- # output = open(output_file_Path, 'w')
- # except OSError:
- # print ('output file error')
- # # get all the GTF result files in given directory
- # File_list = []
- # File_list = list_files(data_file_Path)
- # File_list.sort()
- # i = 0
- # for file in File_list:
- # #parse the file names given in absolute path
- # file_name = file.strip().split('/')[-1]
- # file_name = file_name.rstrip('.txt.tsv')
- # # function to read tsv files from a given directory
- # #open the file
- # data = open(file,'r')
- # #reading the label line
- # labelLine = data.readline()
- # label = labelLine.strip().split('\t')
- # #write the label
- # #updates2010July26: update following label writing code
- # if i == 0:
- # # output.write('cluster origin')
- # for element in label:
- # output.write(element)
- # output.write('\t')
- # i = i + 1
- # #updates2010July26 End
- # #switch to the next line
- # output.write('\n')
- # #read the GO terms
- # GOTermLines = data.readlines()
- # for GOTerm in GOTermLines:
- # GOTerm = GOTerm.strip().strip('\t')
- # if GOTerm != '':
- # #updates2010July26: remove the code to write the first column 'REMc cluster ID'
- # #output.write(file_name)
- # #output.write('\t')
- # ##updates2010July26 update end
- # output.write(GOTerm + '\n')
- # #output.write('\n')
- # output.close()
|