# join_interactions.py
import os
import sys

import numpy as np
import pandas as pd
  5. # Function to parse and set arguments
  6. def parse_arguments():
  7. if len(sys.argv) == 1: # Interactive mode
  8. args = [
  9. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
  10. 2,
  11. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
  12. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
  13. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
  14. ]
  15. else:
  16. args = sys.argv[1:]
  17. return {
  18. "out_dir": os.path.abspath(args[0]),
  19. "sd": float(args[1]),
  20. "study_info": os.path.abspath(args[2]),
  21. "input_dirs": args[3:]
  22. }
  23. args = parse_arguments()
  24. # Create an array for the zscores files
  25. def get_zscores_files(dirs):
  26. files = [os.path.join(study, "zscores", "zscores_interaction.csv")
  27. for study in dirs if os.path.exists(os.path.join(study, "zscores", "zscores_interaction.csv"))]
  28. return files
  29. zscores_files = get_zscores_files(args['input_dirs'])
  30. print(f"The SD value is: {args['sd']}")
  31. # Ensure there are enough files to compare
  32. if len(zscores_files) < 2:
  33. sys.exit("Not enough experiments to compare, exiting script")
  34. # Function to join zscores files
  35. def join_zscores_files(files):
  36. joined_data = pd.read_csv(files[0])
  37. for file in files[1:]:
  38. temp_data = pd.read_csv(file)
  39. joined_data = pd.merge(joined_data, temp_data, on="OrfRep", how="outer")
  40. return joined_data
  41. # Load and join zscores files
  42. joined_data = join_zscores_files(zscores_files)
  43. # Order and select columns
  44. def order_and_select_columns(data):
  45. ordered_data = data[sorted(data.columns)]
  46. selected_headers = ordered_data.filter(regex="OrfRep|Gene|z_lm_k|z_shift_k|z_lm_l|z_shift_l")
  47. return selected_headers
  48. selected_headers = order_and_select_columns(joined_data)
  49. # Remove redundant columns like "Gene.1"
  50. def clean_headers(data, suffixes):
  51. suffixes_to_remove = [f"Gene.{i}" for i in range(1, suffixes+1)]
  52. return data.drop(columns=suffixes_to_remove, errors='ignore')
  53. headSel = clean_headers(selected_headers, len(zscores_files) - 1)
  54. headSel2 = clean_headers(joined_data.filter(regex="OrfRep|Gene"), len(zscores_files) - 1)
  55. # Fill NA values in Shift and Z_lm columns
  56. def fill_na_in_columns(data):
  57. for column in data.columns:
  58. if "Shift" in column:
  59. data[column].fillna(0.001, inplace=True)
  60. elif "Z_lm_" in column:
  61. data[column].fillna(0.0001, inplace=True)
  62. return data
  63. headSel = fill_na_in_columns(headSel)
  64. # Filter based on standard deviation
  65. def filter_by_sd(data, sd):
  66. if sd == 0:
  67. return data
  68. z_lm_cols = data.filter(regex="z_lm_")
  69. filter_vector = z_lm_cols.abs().ge(sd).any(axis=1)
  70. return data[filter_vector]
  71. REMcRdy = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_lm_"), args['sd'])
  72. shiftOnly = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_shift"), args['sd'])
  73. # Reorder columns to interleave Z_lm and Shift data
  74. def reorder_columns(data1, data2):
  75. combined_data = data1.copy()
  76. for i in range(2, data1.shape[1]):
  77. combined_data.insert(2 * i - 1, data2.columns[i], data2.iloc[:, i])
  78. return combined_data
  79. combI = reorder_columns(headSel2, shiftOnly)
  80. # Write output files
  81. REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"), index=False, quotechar=False)
  82. shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"), index=False, quotechar=False)
  83. # Relabel headers using experiment names from StudyInfo.csv
  84. def relabel_headers(headers, labels):
  85. new_labels = headers.copy()
  86. for i, header in enumerate(headers):
  87. suffix = header.split('.')[-1]
  88. if suffix.isdigit() and int(suffix) in range(1, 4):
  89. exp_name = labels.iloc[int(suffix) - 1, 1]
  90. new_labels[i] = header.replace(f".{suffix}", f"_{exp_name}")
  91. return new_labels
  92. LabelStd = pd.read_csv(args['study_info'])
  93. shiftOnly.columns = relabel_headers(shiftOnly.columns, LabelStd)
  94. REMcRdy.columns = relabel_headers(REMcRdy.columns, LabelStd)
  95. # Save relabeled files
  96. REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"), index=False, quotechar=False)
  97. shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"), index=False, quotechar=False)
  98. # Save updated parameters
  99. LabelStd.iloc[:, 3] = args['sd']
  100. LabelStd.to_csv(os.path.join(args['out_dir'], "parameters.csv"), index=False)
  101. LabelStd.to_csv(args['study_info'], index=False)