Squashed initial commit

qhtcp-workflow/apps/python/join_interactions.py (new file, 125 lines)

@@ -0,0 +1,125 @@
import csv  # needed for csv.QUOTE_NONE when writing unquoted output
import os
import sys

import numpy as np
import pandas as pd

# Function to parse and set arguments
def parse_arguments():
    if len(sys.argv) == 1:  # Interactive mode: fall back to hard-coded example inputs
        args = [
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
            2,
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2",
        ]
    else:
        args = sys.argv[1:]

    return {
        "out_dir": os.path.abspath(args[0]),
        "sd": float(args[1]),
        "study_info": os.path.abspath(args[2]),
        "input_dirs": args[3:],
    }


args = parse_arguments()
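
# Example invocation (hypothetical paths, shown for illustration only):
#   python join_interactions.py /path/to/out 2 /path/to/out/StudyInfo.csv /path/to/out/Exp1 /path/to/out/Exp2
# Positional arguments: output directory, SD cutoff, StudyInfo.csv, then two or more
# experiment directories.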


# Build the list of per-experiment zscores files
def get_zscores_files(dirs):
    files = [os.path.join(study, "zscores", "zscores_interaction.csv")
             for study in dirs
             if os.path.exists(os.path.join(study, "zscores", "zscores_interaction.csv"))]
    return files


zscores_files = get_zscores_files(args['input_dirs'])
print(f"The SD value is: {args['sd']}")

# Ensure there are enough files to compare
if len(zscores_files) < 2:
    sys.exit("Not enough experiments to compare, exiting script")


# Function to join zscores files on OrfRep with an outer merge
def join_zscores_files(files):
    joined_data = pd.read_csv(files[0])
    for i, file in enumerate(files[1:], start=1):
        temp_data = pd.read_csv(file)
        # Suffix duplicated columns ".1", ".2", ... so the cleanup and relabeling steps
        # below can find them (pandas would otherwise use "_x"/"_y")
        joined_data = pd.merge(joined_data, temp_data, on="OrfRep", how="outer",
                               suffixes=("", f".{i}"))
    return joined_data


# Load and join zscores files
joined_data = join_zscores_files(zscores_files)
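
# After the outer merge on "OrfRep", duplicated column names carry numeric suffixes in the
# style of R's make.unique: for example, the second experiment's "z_lm_k" becomes "z_lm_k.1"
# and its "Gene" becomes "Gene.1", which is what the cleanup and relabeling below expect.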


# Order and select columns
def order_and_select_columns(data):
    ordered_data = data[sorted(data.columns)]
    selected_headers = ordered_data.filter(regex="OrfRep|Gene|z_lm_k|z_shift_k|z_lm_l|z_shift_l")
    return selected_headers


selected_headers = order_and_select_columns(joined_data)


# Remove redundant columns like "Gene.1"
def clean_headers(data, suffixes):
    suffixes_to_remove = [f"Gene.{i}" for i in range(1, suffixes + 1)]
    return data.drop(columns=suffixes_to_remove, errors='ignore')


headSel = clean_headers(selected_headers, len(zscores_files) - 1)
headSel2 = clean_headers(joined_data.filter(regex="OrfRep|Gene"), len(zscores_files) - 1)


# Fill NA values in shift and z_lm columns
def fill_na_in_columns(data):
    for column in data.columns:
        # Column names are lowercase (e.g. "z_shift_k", "z_lm_k"), so match case-insensitively
        if "shift" in column.lower():
            data[column] = data[column].fillna(0.001)
        elif "z_lm_" in column.lower():
            data[column] = data[column].fillna(0.0001)
    return data


headSel = fill_na_in_columns(headSel)


# Filter rows by the standard-deviation cutoff
def filter_by_sd(data, sd):
    if sd == 0:
        return data
    score_cols = data.filter(regex="z_lm_")
    if score_cols.empty:
        # The shift-only table has no z_lm columns; threshold on its z_shift columns
        # instead (assumption: otherwise every row of that table would be dropped)
        score_cols = data.filter(regex="z_shift")
    filter_vector = score_cols.abs().ge(sd).any(axis=1)
    return data[filter_vector]


REMcRdy = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_lm_"), args['sd'])
shiftOnly = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_shift"), args['sd'])
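
# Worked example of the SD cutoff (illustrative values only): with sd = 2, a row whose
# z_lm columns hold 1.3 and -2.4 is kept because |-2.4| >= 2 in at least one column,
# while a row holding 0.8 and -1.1 is dropped. A cutoff of 0 disables filtering entirely.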


# Reorder columns to interleave z_lm and shift data
def reorder_columns(data1, data2):
    combined_data = data1.copy()
    for i in range(2, data1.shape[1]):
        combined_data.insert(2 * i - 1, data2.columns[i], data2.iloc[:, i])
    return combined_data


combI = reorder_columns(headSel2, shiftOnly)
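
# Illustration (hypothetical columns): with data1 = [OrfRep, Gene, Gene.1, Gene.2] and
# data2 = [OrfRep, Gene, z_shift_k, z_shift_k.1], i runs over 2 and 3, inserting
# z_shift_k at position 3 and z_shift_k.1 at position 5, giving
# [OrfRep, Gene, Gene.1, z_shift_k, Gene.2, z_shift_k.1].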


# Write output files; csv.QUOTE_NONE keeps fields unquoted
# (quotechar=False is not a valid to_csv argument)
REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"),
               index=False, quoting=csv.QUOTE_NONE)
shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"),
                 index=False, quoting=csv.QUOTE_NONE)


# Relabel headers using experiment names from StudyInfo.csv
def relabel_headers(headers, labels):
    new_labels = list(headers)  # a pandas Index is immutable, so work on a list copy
    for i, header in enumerate(headers):
        suffix = header.split('.')[-1]
        if suffix.isdigit() and int(suffix) in range(1, 4):
            exp_name = labels.iloc[int(suffix) - 1, 1]
            new_labels[i] = header.replace(f".{suffix}", f"_{exp_name}")
    return new_labels


LabelStd = pd.read_csv(args['study_info'])

shiftOnly.columns = relabel_headers(shiftOnly.columns, LabelStd)
REMcRdy.columns = relabel_headers(REMcRdy.columns, LabelStd)
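
# Example of the relabeling (hypothetical experiment name): if the second column of
# StudyInfo.csv's first data row is "Exp1_Doxo", a header such as "z_shift_k.1" is
# rewritten to "z_shift_k_Exp1_Doxo"; unsuffixed headers from the first experiment
# are left unchanged.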


# Save relabeled files
REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"),
               index=False, quoting=csv.QUOTE_NONE)
shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"),
                 index=False, quoting=csv.QUOTE_NONE)

# Save updated parameters
LabelStd.iloc[:, 3] = args['sd']
LabelStd.to_csv(os.path.join(args['out_dir'], "parameters.csv"), index=False)
LabelStd.to_csv(args['study_info'], index=False)