concatGTFResults.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. #!/usr/bin/env python
  2. # Concatenate the batch GO Term Finder (GTF) results (.tsv files) produced by the batch GTF Perl code (Chris Johnson, U of Tulsa) into a single table.
import glob
import os
import sys
  5. def list_files(directory):
  6. """Return a list of all files in the given directory."""
  7. return glob.glob(os.path.join(directory, '*.txt.tsv'))
  8. def concatenate_gtf_results(data_dir, output_file):
  9. """Concatenate the GTF results into a single file."""
  10. output = open(output_file, 'w')
  11. files = list_files(data_dir)
  12. files.sort()
  13. for file_path in files:
  14. file_name = os.path.basename(file_path).rstrip('.txt.tsv')
  15. with open(file_path, 'r') as f:
  16. labels = f.readline().strip().split('\t')
  17. output.write('\t'.join(labels) + '\n')
  18. for line in f:
  19. line = line.strip().strip('\t')
  20. if line:
  21. output.write(line + '\n')
  22. output.close()
  23. if __name__ == '__main__':
  24. if len(sys.argv) != 3:
  25. print('Usage: python Concatenate_GTF_results.py data_dir output_file')
  26. sys.exit(1)
  27. data_dir = sys.argv[1]
  28. output_file = sys.argv[2]
  29. concatenate_gtf_results(data_dir, output_file)
  30. # Old version (commented out; superseded by concatenate_gtf_results above)
  31. # def list_files(directory):
  32. # """Return a list of all files in the given directory."""
  33. # return glob.glob(os.path.join(directory, '*.txt.tsv'))
  34. # try:
  35. # data_file_Path = sys.argv[1]
  36. # output_file_Path = sys.argv[2]
  37. # except:
  38. # print ('Usage: python Concatenate_GTF_results.py /datasetPath /outputFilePath_and_Name')
  39. # print ('Data file not found, error in given directory')
  40. # sys.exit(1)
  41. # try:
  42. # output = open(output_file_Path, 'w')
  43. # except OSError:
  44. # print ('output file error')
  45. # # get all the GTF result files in given directory
  46. # File_list = []
  47. # File_list = list_files(data_file_Path)
  48. # File_list.sort()
  49. # i = 0
  50. # for file in File_list:
  51. # #parse the file names given in absolute path
  52. # file_name = file.strip().split('/')[-1]
  53. # file_name = file_name.rstrip('.txt.tsv')
  54. # # function to read tsv files from a given directory
  55. # #open the file
  56. # data = open(file,'r')
  57. # #reading the label line
  58. # labelLine = data.readline()
  59. # label = labelLine.strip().split('\t')
  60. # #write the label
  61. # #updates2010July26: update following label writing code
  62. # if i == 0:
  63. # # output.write('cluster origin')
  64. # for element in label:
  65. # output.write(element)
  66. # output.write('\t')
  67. # i = i + 1
  68. # #updates2010July26 End
  69. # #switch to the next line
  70. # output.write('\n')
  71. # #read the GO terms
  72. # GOTermLines = data.readlines()
  73. # for GOTerm in GOTermLines:
  74. # GOTerm = GOTerm.strip().strip('\t')
  75. # if GOTerm != '':
  76. # #updates2010July26: remove the code to write the first column 'REMc cluster ID'
  77. # #output.write(file_name)
  78. # #output.write('\t')
  79. # ##updates2010July26 update end
  80. # output.write(GOTerm + '\n')
  81. # #output.write('\n')
  82. # output.close()