diff --git a/workflow/apps/r/joinInteractExps.R b/workflow/apps/r/joinInteractExps.R index fc85d2ca..65e46187 100644 --- a/workflow/apps/r/joinInteractExps.R +++ b/workflow/apps/r/joinInteractExps.R @@ -26,8 +26,8 @@ args <- parse_arguments() # Create an array for the zscores files get_zscores_files <- function(dirs) { - files <- sapply(dirs, function(study) { - file_path <- file.path(study, "zscores", "zscores_interaction.csv") + files <- sapply(dirs, function(exp) { + file_path <- file.path(exp, "zscores", "zscores_interaction.csv") if (file.exists(file_path)) file_path else NULL }, simplify = TRUE, USE.NAMES = FALSE) diff --git a/workflow/apps/r/join_interaction_zscores.R b/workflow/apps/r/join_interaction_zscores.R new file mode 100644 index 00000000..8343d040 --- /dev/null +++ b/workflow/apps/r/join_interaction_zscores.R @@ -0,0 +1,113 @@ +suppressMessages({ + library("dplyr") + library("data.table") + library("readr") + library("stringr") +}) + +# Function to parse arguments +parse_arguments <- function() { + if (interactive()) { + args <- c( + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD", + 3, # sd value + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", + "Experiment 1: Doxo versus HLD", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", + "Experiment 2: HLD versus Doxo" + ) + } else { + args <- commandArgs(trailingOnly = TRUE) + } + + out_dir <- normalizePath(args[1], mustWork = FALSE) + sd <- as.numeric(args[2]) + paths <- normalizePath(args[seq(3, length(args), by = 2)], mustWork = FALSE) + names <- args[seq(4, length(args), by = 2)] + experiments <- setNames(paths, names) + + list( + out_dir = out_dir, + sd = sd, + experiments = experiments + ) +} + +args <- parse_arguments() + +# Ensure main output directory exists +dir.create(args$out_dir, showWarnings = FALSE, recursive = TRUE) + +# Function to read and combine z-score interaction files +combine_zscores <- function(experiments, out_dir) { + combined_data <- lapply(names(experiments), function(exp_name) { + exp_dir <- experiments[[exp_name]] + zscore_file <- file.path(exp_dir, "zscores", "zscores_interaction.csv") + + if (!file.exists(zscore_file)) { + stop("Z-score file does not exist for ", exp_name, " at ", zscore_file) + } + + message("Reading z-score file for ", exp_name, " from ", zscore_file) + data <- fread(zscore_file) + data$Experiment <- exp_name + + return(data) + }) %>% + bind_rows() + + combined_output_file <- file.path(out_dir, "combined_zscores.csv") + fwrite(combined_data, combined_output_file, row.names = FALSE) + + message("Combined z-score file saved to: ", combined_output_file) +} + +# Function to read and combine summary statistics files +combine_summary_stats <- function(experiments, out_dir) { + combined_stats <- lapply(names(experiments), function(exp_name) { + exp_dir <- experiments[[exp_name]] + summary_file <- file.path(exp_dir, "zscores", "summary_stats_all_strains.csv") + + if (!file.exists(summary_file)) { + stop("Summary stats file does not exist for ", exp_name, " at ", summary_file) + } + + message("Reading summary stats file for ", exp_name, " from ", summary_file) + data <- fread(summary_file) + data$Experiment <- exp_name + + return(data) + }) %>% + bind_rows() + + combined_output_file <- file.path(out_dir, "combined_summary_stats.csv") + fwrite(combined_stats, combined_output_file, row.names = FALSE) + + message("Combined summary stats file saved to: ", combined_output_file) +} + +# Function to generate final summary report +generate_final_report <- function(out_dir) { + combined_zscores <- file.path(out_dir, "combined_zscores.csv") + combined_stats <- file.path(out_dir, "combined_summary_stats.csv") + + if (!file.exists(combined_zscores) || !file.exists(combined_stats)) { + stop("Combined z-scores or summary stats files do not exist.") + } + + zscores_data <- fread(combined_zscores) + stats_data <- fread(combined_stats) + + message("Merging z-score and summary stats data...") + final_report <- merge(zscores_data, stats_data, by = c("OrfRep", "Experiment"), all = TRUE, allow.cartesian = TRUE) + + final_report_file <- file.path(out_dir, "final_combined_report.csv") + fwrite(final_report, final_report_file, row.names = FALSE) + + message("Final combined report saved to: ", final_report_file) +} + +# Process all experiments and generate outputs +combine_zscores(args$experiments, args$out_dir) +combine_summary_stats(args$experiments, args$out_dir) +generate_final_report(args$out_dir) diff --git a/workflow/qhtcp-workflow b/workflow/qhtcp-workflow index 5683d036..0e4962c4 100755 --- a/workflow/qhtcp-workflow +++ b/workflow/qhtcp-workflow @@ -1296,6 +1296,7 @@ qhtcp() { # Run R interactions script on all studies calculate_interaction_zscores \ + && join_interaction_zscores \ && remc \ && gtf \ && gta @@ -1316,7 +1317,6 @@ remc() { # If any wrappers fail the rest will not run, this is fundamental to module design # Remove leading && to run regardless - #r_join_interactions \ java_extract \ && r_add_shift_values \ && r_create_heat_maps \ @@ -1522,7 +1522,50 @@ calculate_interaction_zscores() { "${EXP_PATHS_AND_NAMES[@]}" [[ -f "$out_path/zscores_interaction.csv" ]] || (echo "$out_path/zscores_interaction.csv does not exist"; return 1) - ((DEBUG)) && declare -p && exit # when the going gets rough +} + + +wrapper join_interaction_zscores +# shellcheck disable=SC2120 +# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv +# +# TODO +# +# * Needs more loops to reduce verbosity +# +# INPUT +# +# * /out/PROJECT/STUDY/exp#/zscores/zscores_interaction.csv +# +# OUTPUT +# +# * combined_zscores.csv (REMcRdy_lm_only.csv) +# * combined_summary_stats (Shift_only.csv) +# * final_combined_report (parameters.csv) +# +# @arg $1 string output directory +# @arg $2 string sd value (default: 2) +# @arg $3 array pairs of experiment paths and names +join_interaction_zscores() { + debug "Running: ${FUNCNAME[0]} $*" + declare script="$APPS_DIR/r/join_interaction_zscores.R" + declare -a out_files=( + "${1:-$STUDY_RESULTS_DIR}/combined_zscores.csv" + "${1:-$STUDY_RESULTS_DIR}/combined_summary_stats.csv" + "${1:-$STUDY_RESULTS_DIR}/final_combined_report.csv" + ) + + # ((DEBUG)) && declare -p # when the going gets tough + + execute "$RSCRIPT" "$script" \ + "${1:-$STUDY_RESULTS_DIR}" \ + "${2:-2}" \ + "${@:3:}" \ + "${EXP_PATHS_AND_NAMES[@]}" + + for f in "${out_files[@]}"; do + [[ -f $f ]] || (echo "$f does not exist"; return 1) + done } @@ -1671,52 +1714,6 @@ r_gta_heatmaps() { } -# wrapper r_join_interactions -# # shellcheck disable=SC2120 -# # @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv -# # -# # TODO -# # -# # * Needs more loops to reduce verbosity -# # -# # INPUT -# # -# # * study info file -# # * /out/PROJECT/STUDY/exp#/zscores/zscores_interaction.csv -# # -# # OUTPUT -# # -# # * REMcRdy_lm_only.csv -# # * Shift_only.csv -# # * parameters.csv -# # -# # @arg $1 string output directory (required) -# # @arg $2 string sd value (default: 2) (required) -# # @arg $3 string study info file (required) -# # @arg $4 array studies (required) -# r_join_interactions() { -# debug "Running: ${FUNCNAME[0]} $*" -# declare script="$APPS_DIR/r/joinInteractExps.R" -# declare -a out_files=( -# "${1:-$STUDY_RESULTS_DIR}/REMcRdy_lm_only.csv" -# "${1:-$STUDY_RESULTS_DIR}/Shift_only.csv" -# "${1:-$STUDY_RESULTS_DIR}/parameters.csv" -# ) - -# # ((DEBUG)) && declare -p # when the going gets tough - -# execute "$RSCRIPT" "$script" \ -# "${1:-$STUDY_RESULTS_DIR}" \ -# "${2:-2}" \ -# "${3:-$STUDY_INFO_FILE}" \ -# "${@:4:-${EXP_PATHS[@]}}" - -# for f in "${out_files[@]}"; do -# [[ -f $f ]] || (echo "$f does not exist"; return 1) -# done -# } - - wrapper java_extract # shellcheck disable=SC2120 # @description Jingyu's REMc java utility @@ -1727,14 +1724,14 @@ wrapper java_extract # # INPUT # -# * REMcRdy_lm_only.csv +# * study_dir/combined_zscores.csv (REMcRdy_lm_only.csv) # # OUTPUT # -# * REMcRdy_lm_only.csv-finalTable.csv +# * study_dir/combined_zscores_final.csv (REMcRdy_lm_only.csv-finalTable.csv) # # @arg $1 string output directory -# @arg $2 string REMcRdy_lm_only.csv +# @arg $2 string combined_zscores.csv # @arg $3 string GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab # @arg $4 string ORF_List_Without_DAmPs.txt # @exitcode 0 if expected output file exists @@ -1743,14 +1740,14 @@ java_extract() { debug "Running: ${FUNCNAME[0]}" classpath="$APPS_DIR/java/weka-clustering/weka-clustering.jar" - output_file="${1:-$STUDY_RESULTS_DIR}/REMcRdy_lm_only.csv-finalTable.csv" + output_file="${1:-$STUDY_RESULTS_DIR}/combined_zscores_final.csv" [[ -f $output_file ]] && backup "$output_file" java_cmd=( "$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain - "${2:-"$STUDY_RESULTS_DIR/REMcRdy_lm_only.csv"}" + "${2:-"$STUDY_RESULTS_DIR/combined_zscores.csv"}" "${3:-"$APPS_DIR/java/GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"}" "${4:-"$APPS_DIR/java/ORF_List_Without_DAmPs.txt"}" 1 @@ -2297,9 +2294,9 @@ main() { # Sanitize wrappers for i in "${!WRAPPERS[@]}"; do - IFS=',' read -ra args <<< "$wrapper" # load the wrapper and args + IFS=',' read -ra args <<< "${WRAPPERS[$i]}" # load the wrapper and args if ! [[ " ${ALL_WRAPPERS[*]} " =~ [[:space:]]${args[0]}[[:space:]] ]]; then - echo "Wrapper ${WRAPPERS[$i]} is not available, removing" + echo "Wrapper ${args[0]} is not available, removing" unset "WRAPPERS[$i]" fi done