#!/usr/bin/env python
"""Parse a REMc "-finalTable.csv" output file into pedigree sub-datasets.

For each cluster identifier found in the origin column, this script writes
a gene list file (<cluster>.txt), a per-cluster final table
(<cluster>-finaltable.csv), and one origin-extended final table
(<dataset>-oriExtFinalTable.csv) into an output folder named after the
input dataset.

The origin column (OrigCol) is determined automatically from the header
(last-but-one 1-based column) instead of being passed as an argument
(the former sys.argv[2] was removed). JWR 22_0816

Usage:
    python parse_clustering_result_to_Pedigree_Dataset_and_genelist.py \
        /datasetPath/datasetfilename output_path_name
"""
import os
import sys
import string
import glob


def read_file(file_path):
    """Return the list of gene identifiers (second CSV column) of *file_path*.

    The first line is treated as a header and skipped.
    (Helper retained from the original script; not used by main().)
    """
    with open(file_path, 'r') as handle:
        handle.readline()  # skip the header line
        return [line.strip().split(',')[1] for line in handle if line.strip()]


def write_cluster_orf_list(orf_list, output_dir, cluster_name):
    """Write one ORF per line to <output_dir>/<cluster_name>.txt."""
    cluster_file_path = os.path.join(output_dir, f"{cluster_name}.txt")
    with open(cluster_file_path, 'w') as outfile:
        for orf in orf_list:
            outfile.write(orf.strip() + '\n')


def write_cluster_results(attributes, orf_list, data_dict, output_directory,
                          cluster_name):
    """Write the full data rows of one cluster's ORFs to a CSV file.

    Args:
        attributes: list of header column names.
        orf_list: ORF identifiers belonging to the cluster.
        data_dict: maps ORF identifier -> its full comma-joined data row.
        output_directory: directory to write into.
        cluster_name: used to name <cluster_name>-finaltable.csv.
    """
    file_path = os.path.join(output_directory, f"{cluster_name}-finaltable.csv")
    with open(file_path, 'w') as output_file:
        output_file.write(','.join(attributes) + '\n')
        for orf in orf_list:
            output_file.write(data_dict[orf.strip()].strip() + '\n')


def write_extended_final_table(attributes, data, ori_name_column_number,
                               output_directory, output_file_name):
    """Write an origin-extended final table.

    Each data row is emitted once per cluster-origin name found in the
    (1-based) origin column, with that single origin name appended as an
    extra trailing column.
    """
    output_file_path = os.path.join(
        output_directory, f"{output_file_name}-oriExtFinalTable.csv")
    with open(output_file_path, 'w') as output_file:
        output_file.write(','.join(attributes) + '\n')
        for orf in data:
            elements = data[orf].split(',')
            ori_name_list = elements[int(ori_name_column_number) - 1].strip().split(';')
            for ori_name in ori_name_list:
                ori_name = ori_name.strip()
                if not ori_name:
                    # trailing ';' yields an empty fragment -- skip it
                    continue
                # one output row per origin name (do NOT accumulate appends)
                output_file.write(','.join(elements + [ori_name]) + '\n')


def build_cluster_membership(data_dict, orig_col):
    """Map each cluster identifier to a comma-joined string of its ORFs.

    Args:
        data_dict: maps ORF identifier -> its full comma-joined data row.
        orig_col: 1-based index of the cluster-origin column.

    Returns:
        dict mapping UPPER-cased cluster identifier -> "ORF1,ORF2,..."
    """
    file_dic = {}
    for orf in data_dict:
        line = data_dict[orf].split(',')
        cluster_origin = line[int(orig_col) - 1].strip()
        # drop the last ';'-separated fragment (trailing empty piece),
        # matching the original parsing convention
        for identifier in cluster_origin.split(';')[0:-1]:
            upper_identifier = identifier.strip().upper()
            if upper_identifier not in file_dic:
                file_dic[upper_identifier] = line[1]
            else:
                file_dic[upper_identifier] += ',' + line[1]
    return file_dic


def main():
    """Command-line entry point: argv[1]=data file, argv[2]=output path."""
    try:
        data_file_path = sys.argv[1]
        output_path = sys.argv[2]
    except IndexError:
        print('Usage: python parse_clustering_result_to_Pedigree_Dataset_and_genelist.py /datasetPath/datasetfilename cluster_origin_column_num output_path_name')
        print('Data file not found')
        sys.exit(1)

    try:
        data = open(data_file_path, 'r')
    except OSError:
        print('input file does not exists')
        sys.exit(1)

    with data:
        # first the title line is read and kept
        attributes = data.readline().strip().split(',')
        # origin column determined automatically from the header width
        orig_col = len(attributes) - 1
        print('OrigCol is', str(orig_col))
        # then the data: map uppercase ORF id -> uppercase full row
        data_dict = {}
        for data_line in data:
            line_elements = data_line.strip().split(',')
            orf_identifier = line_elements[1].strip().upper()
            data_dict[orf_identifier] = ','.join(line_elements).upper()

    file_dic = build_cluster_membership(data_dict, orig_col)

    # dataset identifier = input filename without its .csv extension
    input_file_identifier = os.path.basename(data_file_path.strip()).split('.csv')[0]

    output_dir = str(output_path) + str(input_file_identifier)
    try:
        os.mkdir(output_dir)
    except OSError:
        print('dir exists')

    # write the extended origin-name final table
    write_extended_final_table(attributes, data_dict, str(orig_col),
                               output_dir, input_file_identifier)

    # write the gene-list and per-cluster result files
    for cluster_name in file_dic:
        cluster_orfs = file_dic[cluster_name].split(',')
        write_cluster_orf_list(cluster_orfs, output_dir, cluster_name)
        write_cluster_results(attributes, cluster_orfs, data_dict,
                              output_dir, cluster_name)


if __name__ == '__main__':
    main()