#!/usr/bin/env python
# dCon.py
"""
Reads the REMc "-finalTable.csv" output file and makes a series of subdatasets
that reflect the pedigree structure of the way clusters break up.
"""
import glob
import os
import string
import sys
from collections import defaultdict
  10. def reading_single_file(file_path):
  11. """
  12. Reads a file and generates a list of gene names.
  13. """
  14. with open(file_path, 'r') as data:
  15. attribute_line = data.readline().strip()
  16. attributes = attribute_line.split(',')
  17. gene_list = []
  18. for dataline in data:
  19. dataline = dataline.strip()
  20. elements = dataline.split(',')
  21. gene_list.append(elements[1])
  22. return gene_list
  23. def writing_cluster_orf_list(list, output_dir, real_cluster_ori_name):
  24. """
  25. Writes a list of ORF names into a file in hierarchical series.
  26. """
  27. outfile_path = os.path.join(output_dir, f"{real_cluster_ori_name}.txt")
  28. with open(outfile_path, 'w') as outfile:
  29. for orf in list:
  30. outfile.write(orf.strip())
  31. outfile.write('\n')
  32. def writing_cluster_results(attributes, orf_list, dic, output_dir, real_cluster_ori_name):
  33. """
  34. Writes clusters information into a series of files.
  35. """
  36. outfile_path = os.path.join(output_dir, f"{real_cluster_ori_name}-finaltable.csv")
  37. with open(outfile_path, 'w') as outfile:
  38. outfile.write(attributes)
  39. outfile.write('\n')
  40. for orf in orf_list:
  41. outfile.write(dic[orf.strip()].strip())
  42. outfile.write('\n')
  43. def writing_ext_final_table(attributes, dic, ori_name_col_num, output_dir, output_file_name):
  44. """
  45. Writes the cluster name extensive final table.
  46. """
  47. outfile_path = os.path.join(output_dir, f"{output_file_name}-oriExtFinalTable.csv")
  48. with open(outfile_path, 'w') as outfile:
  49. outfile.write(attributes)
  50. outfile.write('\n')
  51. for orf in dic:
  52. elements = dic[orf].split(',')
  53. ori_name_list = elements[int(ori_name_col_num) - 1].strip().split(';')
  54. for ori_name in ori_name_list:
  55. elements.append(ori_name.strip())
  56. outfile.write(','.join(elements))
  57. outfile.write('\n')
  58. def main():
  59. """
  60. Main function to parse the REMc -finalTable.csv output file.
  61. """
  62. try:
  63. data_file_path = sys.argv[1]
  64. output_path = sys.argv[2]
  65. except IndexError:
  66. print('Usage: python parse_clustering_result_to_Pedigree_Dataset_and_genelist.py '
  67. '/datasetPath/datasetfilename cluster_origin_column_num output_path_name')
  68. print('Data file not found')
  69. sys.exit(1)
  70. try:
  71. with open(data_file_path, 'r') as data:
  72. attribute_line = data.readline().strip()
  73. attributes = attribute_line.split(',')
  74. orig_col = len(attributes) - 1
  75. data_dict = {}
  76. for dataline in data:
  77. dataline = dataline.strip()
  78. elements = dataline.split(',')
  79. data_dict[str.upper(elements[1].strip())] = ','.join(elements).upper()
  80. except FileNotFoundError:
  81. print('Input file does not exist')
  82. sys.exit(1)
  83. file_dict = {}
  84. for orf in data_dict:
  85. line = data_dict[orf].split(',')
  86. cluster_origin = line[int(orig_col) - 1].strip()
  87. cluster_identifier = cluster_origin.split(';')[0:-1]
  88. for i, identifier in enumerate(cluster_identifier):
  89. identifier = identifier.strip()
  90. if identifier not in file_dict:
  91. file_dict[identifier] = line[1]
  92. else:
  93. file_dict[identifier] = f"{file_dict[identifier]},{line[1]}"
  94. input_file_identifier = os.path.basename(data_file_path).split('.csv')[0]
  95. output_dir = os.path.join(output_path, input_file_identifier)
  96. os.makedirs(output_dir, exist_ok=True)
  97. # Writing the extensive ori name finaltable
  98. writing_ext_final_table(attribute_line, data_dict, orig_col, output_dir, input_file_identifier)
  99. # Writing the genelist files
  100. for cluster_name in file_dict:
  101. writing_cluster_orf_list(file_dict[cluster_name].split(','), output_dir, cluster_name)
  102. # Writing the cluster result files
  103. writing_cluster_results(attribute_line, file_dict[cluster_name].split(','), data_dict,
  104. output_dir, cluster_name)
  105. if __name__ == '__main__':
  106. main()