Squashed initial commit

qhtcp-workflow/apps/python/join_interactions.py (new file, 125 lines)

@@ -0,0 +1,125 @@
import csv  # needed for csv.QUOTE_NONE when writing unquoted output
import os
import sys

import numpy as np
import pandas as pd

# Function to parse and set arguments
def parse_arguments():
    if len(sys.argv) == 1:  # Interactive mode: fall back to hard-coded example inputs
        args = [
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
            2,
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2",
        ]
    else:
        args = sys.argv[1:]

    return {
        "out_dir": os.path.abspath(args[0]),
        "sd": float(args[1]),
        "study_info": os.path.abspath(args[2]),
        "input_dirs": args[3:],
    }


args = parse_arguments()
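
# Example invocation (hypothetical paths, shown for illustration only):
#   python join_interactions.py /path/to/out 2 /path/to/out/StudyInfo.csv /path/to/out/Exp1 /path/to/out/Exp2
# Positional arguments: output directory, SD cutoff, StudyInfo.csv, then two or more
# experiment directories.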


# Build the list of per-experiment zscores files
def get_zscores_files(dirs):
    files = [os.path.join(study, "zscores", "zscores_interaction.csv")
             for study in dirs
             if os.path.exists(os.path.join(study, "zscores", "zscores_interaction.csv"))]
    return files


zscores_files = get_zscores_files(args['input_dirs'])
print(f"The SD value is: {args['sd']}")

# Ensure there are enough files to compare
if len(zscores_files) < 2:
    sys.exit("Not enough experiments to compare, exiting script")


# Function to join zscores files on OrfRep with an outer merge
def join_zscores_files(files):
    joined_data = pd.read_csv(files[0])
    for i, file in enumerate(files[1:], start=1):
        temp_data = pd.read_csv(file)
        # Suffix duplicated columns ".1", ".2", ... so the cleanup and relabeling steps
        # below can find them (pandas would otherwise use "_x"/"_y")
        joined_data = pd.merge(joined_data, temp_data, on="OrfRep", how="outer",
                               suffixes=("", f".{i}"))
    return joined_data


# Load and join zscores files
joined_data = join_zscores_files(zscores_files)
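
# After the outer merge on "OrfRep", duplicated column names carry numeric suffixes in the
# style of R's make.unique: for example, the second experiment's "z_lm_k" becomes "z_lm_k.1"
# and its "Gene" becomes "Gene.1", which is what the cleanup and relabeling below expect.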


# Order and select columns
def order_and_select_columns(data):
    ordered_data = data[sorted(data.columns)]
    selected_headers = ordered_data.filter(regex="OrfRep|Gene|z_lm_k|z_shift_k|z_lm_l|z_shift_l")
    return selected_headers


selected_headers = order_and_select_columns(joined_data)


# Remove redundant columns like "Gene.1"
def clean_headers(data, suffixes):
    suffixes_to_remove = [f"Gene.{i}" for i in range(1, suffixes + 1)]
    return data.drop(columns=suffixes_to_remove, errors='ignore')


headSel = clean_headers(selected_headers, len(zscores_files) - 1)
headSel2 = clean_headers(joined_data.filter(regex="OrfRep|Gene"), len(zscores_files) - 1)


# Fill NA values in shift and z_lm columns
def fill_na_in_columns(data):
    for column in data.columns:
        # Column names are lowercase (e.g. "z_shift_k", "z_lm_k"), so match case-insensitively
        if "shift" in column.lower():
            data[column] = data[column].fillna(0.001)
        elif "z_lm_" in column.lower():
            data[column] = data[column].fillna(0.0001)
    return data


headSel = fill_na_in_columns(headSel)


# Filter rows by the standard-deviation cutoff
def filter_by_sd(data, sd):
    if sd == 0:
        return data
    score_cols = data.filter(regex="z_lm_")
    if score_cols.empty:
        # The shift-only table has no z_lm columns; threshold on its z_shift columns
        # instead (assumption: otherwise every row of that table would be dropped)
        score_cols = data.filter(regex="z_shift")
    filter_vector = score_cols.abs().ge(sd).any(axis=1)
    return data[filter_vector]


REMcRdy = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_lm_"), args['sd'])
shiftOnly = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_shift"), args['sd'])
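
# Worked example of the SD cutoff (illustrative values only): with sd = 2, a row whose
# z_lm columns hold 1.3 and -2.4 is kept because |-2.4| >= 2 in at least one column,
# while a row holding 0.8 and -1.1 is dropped. A cutoff of 0 disables filtering entirely.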


# Reorder columns to interleave z_lm and shift data
def reorder_columns(data1, data2):
    combined_data = data1.copy()
    for i in range(2, data1.shape[1]):
        combined_data.insert(2 * i - 1, data2.columns[i], data2.iloc[:, i])
    return combined_data


combI = reorder_columns(headSel2, shiftOnly)
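
# Illustration (hypothetical columns): with data1 = [OrfRep, Gene, Gene.1, Gene.2] and
# data2 = [OrfRep, Gene, z_shift_k, z_shift_k.1], i runs over 2 and 3, inserting
# z_shift_k at position 3 and z_shift_k.1 at position 5, giving
# [OrfRep, Gene, Gene.1, z_shift_k, Gene.2, z_shift_k.1].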


# Write output files; csv.QUOTE_NONE keeps fields unquoted
# (quotechar=False is not a valid to_csv argument)
REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"),
               index=False, quoting=csv.QUOTE_NONE)
shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"),
                 index=False, quoting=csv.QUOTE_NONE)


# Relabel headers using experiment names from StudyInfo.csv
def relabel_headers(headers, labels):
    new_labels = list(headers)  # a pandas Index is immutable, so work on a list copy
    for i, header in enumerate(headers):
        suffix = header.split('.')[-1]
        if suffix.isdigit() and int(suffix) in range(1, 4):
            exp_name = labels.iloc[int(suffix) - 1, 1]
            new_labels[i] = header.replace(f".{suffix}", f"_{exp_name}")
    return new_labels


LabelStd = pd.read_csv(args['study_info'])

shiftOnly.columns = relabel_headers(shiftOnly.columns, LabelStd)
REMcRdy.columns = relabel_headers(REMcRdy.columns, LabelStd)
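
# Example of the relabeling (hypothetical experiment name): if the second column of
# StudyInfo.csv's first data row is "Exp1_Doxo", a header such as "z_shift_k.1" is
# rewritten to "z_shift_k_Exp1_Doxo"; unsuffixed headers from the first experiment
# are left unchanged.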


# Save relabeled files
REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"),
               index=False, quoting=csv.QUOTE_NONE)
shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"),
                 index=False, quoting=csv.QUOTE_NONE)

# Save updated parameters
LabelStd.iloc[:, 3] = args['sd']
LabelStd.to_csv(os.path.join(args['out_dir'], "parameters.csv"), index=False)
LabelStd.to_csv(args['study_info'], index=False)