import pandas as pd
import os
import sys
import numpy as np


# Function to parse and set arguments
def parse_arguments():
    """Build the run configuration from ``sys.argv``.

    With no command-line arguments (interactive mode) a built-in set of
    default paths is used. Otherwise the positional arguments are:
    output directory, SD threshold, StudyInfo CSV path, then one or more
    experiment directories.

    Returns:
        dict with keys ``out_dir`` (absolute path), ``sd`` (float),
        ``study_info`` (absolute path) and ``input_dirs`` (list of the
        remaining arguments, unmodified).

    Raises:
        SystemExit: if fewer than three arguments are supplied.
        ValueError: if the SD argument cannot be converted to float.
    """
    if len(sys.argv) == 1:  # Interactive mode
        args = [
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
            2,
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
            "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2",
        ]
    else:
        args = sys.argv[1:]

    # Fail with a readable message instead of a bare IndexError below.
    if len(args) < 3:
        sys.exit("Usage: <out_dir> <sd> <study_info_csv> <exp_dir> [<exp_dir> ...]")

    return {
        "out_dir": os.path.abspath(args[0]),
        "sd": float(args[1]),
        "study_info": os.path.abspath(args[2]),
        "input_dirs": args[3:],
    }


# Create an array for the zscores files
def get_zscores_files(dirs):
    """Return the ``zscores/zscores_interaction.csv`` path of each directory.

    Directories in which that file does not exist are skipped silently, so
    the result may be shorter than ``dirs``; order is preserved.
    """
    candidates = (
        os.path.join(study, "zscores", "zscores_interaction.csv") for study in dirs
    )
    return [path for path in candidates if os.path.exists(path)]


# Function to join zscores files
def join_zscores_files(files):
    """Outer-join all z-score tables on ``OrfRep``.

    Columns of the first file keep their names. Every non-key column of the
    k-th additional file (k = 1, 2, ...) is renamed to ``<name>.<k>`` before
    merging. This yields the ``Gene.1`` / ``z_lm_k.1`` style names that
    ``clean_headers`` and ``relabel_headers`` look for, and avoids the
    duplicate ``_x``/``_y`` suffix collision pandas raises once more than
    three files with the same columns are merged.

    Args:
        files: sequence of paths (or file-like objects) readable by
            ``pandas.read_csv``; must contain at least one entry.

    Returns:
        The merged DataFrame.
    """
    joined_data = pd.read_csv(files[0])
    for position, file in enumerate(files[1:], start=1):
        temp_data = pd.read_csv(file)
        temp_data = temp_data.rename(
            columns={
                column: f"{column}.{position}"
                for column in temp_data.columns
                if column != "OrfRep"
            }
        )
        joined_data = pd.merge(joined_data, temp_data, on="OrfRep", how="outer")
    return joined_data


# Order and select columns
def order_and_select_columns(data):
    """Sort the columns by name and keep only the identifier/z-score ones.

    A column is kept when its name matches the regex
    ``OrfRep|Gene|z_lm_k|z_shift_k|z_lm_l|z_shift_l`` (case-sensitive).
    """
    ordered_data = data[sorted(data.columns)]
    selected_headers = ordered_data.filter(
        regex="OrfRep|Gene|z_lm_k|z_shift_k|z_lm_l|z_shift_l"
    )
    return selected_headers


# Remove redundant columns like "Gene.1"
def clean_headers(data, suffixes):
    """Drop the columns ``Gene.1`` .. ``Gene.<suffixes>`` if present.

    Missing columns are ignored; a new DataFrame is returned.
    """
    suffixes_to_remove = [f"Gene.{i}" for i in range(1, suffixes + 1)]
    return data.drop(columns=suffixes_to_remove, errors='ignore')


# Fill NA values in Shift and Z_lm columns
def fill_na_in_columns(data):
    """Replace missing values in shift and z_lm columns, in place.

    Columns whose name contains ``shift`` get 0.001; otherwise columns whose
    name contains ``z_lm_`` get 0.0001. Matching is case-insensitive because
    the columns selected upstream are lowercase (``z_shift_k``, ``z_lm_k``)
    and a case-sensitive test for ``Shift`` / ``Z_lm_`` would never match.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in data.columns:
        lowered = str(column).lower()
        if "shift" in lowered:
            # Assign back instead of chained ``fillna(inplace=True)``, which
            # may operate on a temporary and leave ``data`` untouched.
            data[column] = data[column].fillna(0.001)
        elif "z_lm_" in lowered:
            data[column] = data[column].fillna(0.0001)
    return data


# Filter based on standard deviation
def filter_by_sd(data, sd):
    """Keep rows where any ``z_lm_`` column has absolute value >= ``sd``.

    When ``sd`` is 0 the input is returned unchanged. If ``data`` has no
    column matching ``z_lm_`` and ``sd`` is non-zero, no row qualifies and
    the result is empty.
    """
    if sd == 0:
        return data
    z_lm_cols = data.filter(regex="z_lm_")
    filter_vector = z_lm_cols.abs().ge(sd).any(axis=1)
    return data[filter_vector]


# Reorder columns to interleave Z_lm and Shift data
def reorder_columns(data1, data2):
    """Insert columns of ``data2`` into a copy of ``data1``.

    For each ``i`` from 2 up to (but excluding) the column count of
    ``data1``, column ``i`` of ``data2`` is inserted at position
    ``2 * i - 1``. When ``data1`` has two or fewer columns the copy is
    returned unchanged.

    NOTE(review): the index ranges come from ``data1`` while the columns are
    taken from ``data2``; confirm this is the intended interleaving.
    """
    combined_data = data1.copy()
    for i in range(2, data1.shape[1]):
        combined_data.insert(2 * i - 1, data2.columns[i], data2.iloc[:, i])
    return combined_data


# Relabel headers using experiment names from StudyInfo.csv
def relabel_headers(headers, labels):
    """Replace a trailing ``.N`` in each header with ``_<experiment name>``.

    ``N`` must be a positive integer not larger than the number of rows in
    ``labels``; the name is read from row ``N - 1``, second column. Headers
    without such a suffix are returned unchanged.

    Args:
        headers: iterable of column names (a list or a pandas Index).
        labels: DataFrame with the experiment name in its second column.

    Returns:
        A new list of header names; the input is not modified.

    NOTE(review): ``.N`` is mapped to row N of ``labels``; verify this
    matches the order in which experiment files are joined.
    """
    label_count = len(labels)
    new_labels = []
    for header in headers:
        # rpartition only touches the final ".N", unlike str.replace which
        # would also rewrite an identical ".N" earlier in the name.
        stem, dot, suffix = str(header).rpartition(".")
        if dot and suffix.isdecimal() and 1 <= int(suffix) <= label_count:
            exp_name = labels.iloc[int(suffix) - 1, 1]
            new_labels.append(f"{stem}_{exp_name}")
        else:
            new_labels.append(header)
    return new_labels


# Run the pipeline only when executed as a script so that importing this
# module (e.g. for testing the helpers) has no side effects.
if __name__ == "__main__":
    args = parse_arguments()

    zscores_files = get_zscores_files(args['input_dirs'])
    print(f"The SD value is: {args['sd']}")

    # Ensure there are enough files to compare
    if len(zscores_files) < 2:
        sys.exit("Not enough experiments to compare, exiting script")

    # Load and join zscores files
    joined_data = join_zscores_files(zscores_files)

    selected_headers = order_and_select_columns(joined_data)

    headSel = clean_headers(selected_headers, len(zscores_files) - 1)
    headSel2 = clean_headers(
        joined_data.filter(regex="OrfRep|Gene"), len(zscores_files) - 1
    )

    headSel = fill_na_in_columns(headSel)

    # Apply the SD filter while the z_lm columns are still present, then
    # split. Filtering the shift-only frame directly would find no z_lm
    # columns and therefore drop every row whenever sd is non-zero.
    sd_filtered = filter_by_sd(headSel, args['sd'])
    REMcRdy = sd_filtered.filter(regex="OrfRep|Gene|z_lm_")
    shiftOnly = sd_filtered.filter(regex="OrfRep|Gene|z_shift")

    combI = reorder_columns(headSel2, shiftOnly)

    # Write output files
    # ``quotechar=False`` is rejected by the csv writer (it must be a
    # one-character string); the default minimal quoting only adds quotes
    # when a field actually needs them.
    REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"), index=False)
    shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"), index=False)

    LabelStd = pd.read_csv(args['study_info'])
    shiftOnly.columns = relabel_headers(shiftOnly.columns, LabelStd)
    REMcRdy.columns = relabel_headers(REMcRdy.columns, LabelStd)

    # Save relabeled files (overwrites the files written above)
    REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"), index=False)
    shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"), index=False)

    # Save updated parameters: the SD value is stored in the fourth column of
    # every row, and the study info file itself is rewritten in place.
    LabelStd.iloc[:, 3] = args['sd']
    LabelStd.to_csv(os.path.join(args['out_dir'], "parameters.csv"), index=False)
    LabelStd.to_csv(args['study_info'], index=False)