Some more refactoring and cleanup

2024-08-02 18:48:09 -04:00
parent 5ca6ef8f01
commit b5aaf9ffb4
5 changed files with 45 additions and 2770 deletions
--- a/workflow/qhtcp-workflow
+++ b/workflow/qhtcp-workflow
@@ -74,7 +74,7 @@ print_help() {
 		  script-run-workflow [[OPTION] [VALUE]]...

 		  Some options (--project, --include, --exclude) can be passed multiple times or
-		  by using comma deliminated strings (see EXAMPLES below)
+		  by using comma-separated strings (see EXAMPLES below)

 		OPTIONS:
 		  --project, -p PROJECT
@@ -117,6 +117,7 @@ print_help() {
 		  script-run-workflow --module=${ALL_MODULES[0]},${ALL_MODULES[1]}
 		  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
 		  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --module=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
+		  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT --submodule ${ALL_SUBMODULES[2]} \"/path/to/genefile.txt,/path/to/output/dir\" --submodule ${ALL_SUBMODULES[3]} \"/path/to/sgofile\"
 	EOF
 }

@@ -326,7 +327,7 @@ print_header() {
  # Let user choose project(s)
  if [[ -z ${PROJECTS[*]} ]]; then
    num=$((${#projects[@]}))
-    echo "Enter comma delimited project #'s (from list) to analyze"
+    echo "Enter a comma-separated list of project numbers to analyze"
    read -r -p "Or hit Enter to add a new project" response
    [[ -z $response ]] && ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}")
    ((YES)) || read -r -p "Hit enter for default ($num): " response
@@ -346,7 +347,7 @@ print_header() {
  done

  if [[ -z ${MODULES[*]} && -z ${EXCLUDE_MODULES[*]} ]]; then
-    echo "Enter module #'s to run (by #, comma delimited)"
+    echo "Enter a comma-separated list of modules to run"
    ((YES)) || read -r -p "Hit Enter for all (default) or '0' for none: " response
    if [[ -n $response && $response -ne 0 ]]; then
      IFS=',' read -ra arr <<< "$response"
@@ -359,7 +360,8 @@ print_header() {

  if [[ -z ${MODULES[*]} && -z ${EXCLUDE_MODULES[*]} && -z ${SUBMODULES[*]} ]]; then
    while :; do
-      echo "Enter a submodule followed by its arguments as a case delimited string in quotes"
+      echo "Enter a submodule followed by its arguments as a comma-separated string"
+      echo "Quote your string if there are any whitespaces"
      echo "Example: ${ALL_SUBMODULES[0]} \"arg1,arg2,arg3...\""
      ((YES)) || read -r -p "Or hit Enter to continue: " response
      [[ -z $response ]] && break
@@ -520,6 +522,7 @@ init_project() {
    if ask "You can edit this file in the qhtcp module"; then
      cat <<-EOF > "$STUDY_INFO_FILE"
 				"ExpNumb","ExpLabel","BackgroundSD","ZscoreJoinSD","AnalysisBy"
+				
 			EOF
    fi
  fi
@@ -943,15 +946,10 @@ qhtcp() {


 module remc
-# @section GTF
-# @description GTF module for QHTCP
-# TODO which components of remc can be parallelized?
-# The submodules in remc really like to be run from the REMc dir
-# so we pop in and out for now
-# NOTE the remc modules could use some love
-#   * Don't cd within scripts, it's confusing
-#   * Use arguments to pass configuration variables
-#     * This allows us to abstract the program away in script-run-workflow and treat it like a module
+# @section remc
+# @description remc module for QHTCP
+# TODO 
+#   * Which components can be parallelized?
 # @arg $1 string studyInfo file
 remc() {
  debug "Running: ${FUNCNAME[0]} $*"
@@ -992,14 +990,14 @@ module gtf
 gtf() {
  debug "Running: ${FUNCNAME[0]}"
  gtf_out_dir="${1:-$QHTCP_PROJECT_DIR/out/gtf}"
+  gene_association_sgd="${2:-"$APPS_DIR/r/gene_association.sgd"}"
+  gene_ontology_obo="${3:-"$APPS_DIR/r/gene_ontology_edit.obo"}"
+  orf_list="${4:-"$APPS_DIR/r/ORF_List_Without_DAmPs.txt"}"
+
  process_dir="$gtf_out_dir/process"
  function_dir="$gtf_out_dir/function"
  component_dir="$gtf_out_dir/component"

-  gene_association_sgd="${2:-"$APPS_DIR/perl/gene_association.sgd"}"
-  gene_ontology_obo="${3:-"$APPS_DIR/perl/gene_ontology_edit.obo"}"
-  orf_list="${4:-"$APPS_DIR/perl/ORF_List_Without_DAmPs.txt"}"
-
  py_gtf_dcon \
    "$process_dir" \
    "$gtf_out_dir"
@@ -1040,26 +1038,26 @@ module gta
 # TODO
 #   * 
 #   * 
-# @set GTA_OUT_DIR string The GTA output results dir
-# @set all_sgd_terms_csv string The all_SGD_GOTerms_for_QHTCPtk.csv file
-# @set sgd_terms_tfile string The go_terms.tab file
-# @set sgd_features_file string The gene_association.sgd file
-# @set gene_ontology_file string The gene_ontology_edit.obo file 
-# @set zscores_file string The ZScores_interaction.csv file
+# @arg $1 string output directory
+# @arg $2 string gene_association.sgd
+# @arg $3 string gene_ontology_edit.obo
+# @arg $4 string go_terms.tab
+# @arg $5 string All_SGD_GOTerms_for_QHTCPtk.csv
+# @arg $6 string zscores_interaction.csv
 gta() {
  debug "Running: ${FUNCNAME[0]}"

-  GTA_OUT_DIR="$QHTCP_PROJECT_DIR/gta"
-  all_sgd_terms_csv="$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv" 
-  sgd_terms_tfile="$APPS_DIR/r/go_terms.tab"
-  sgd_features_file="$APPS_DIR/r/gene_association.sgd"
-  gene_ontology_file="$APPS_DIR/r/gene_ontology_edit.obo"
-  zscores_file="zscores/zscores_interaction.csv"
-
+  gta_out_dir="${1:-"$QHTCP_PROJECT_DIR/gta"}"
+  gene_association_sgd="${2:-"$APPS_DIR/r/gene_association.sgd"}"
+  gene_ontology_obo="${3:-"$APPS_DIR/r/gene_ontology_edit.obo"}"
+  sgd_terms_tfile="${4:-"$APPS_DIR/r/go_terms.tab"}"
+  all_sgd_terms_csv="${5:-"$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"}" 
+  zscores_file="${6:-"$gta_out_dir/zscores/zscores_interaction.csv"}" # TODO This could be wrong, it could be in main results
+    
  # Sets STUDIES_NUM and NUM_STUDIES
  get_studies "$STUDY_INFO_FILE"

-  [[ -d $GTA_OUT_DIR ]] || mkdir "$GTA_OUT_DIR"
+  [[ -d $gta_out_dir ]] || mkdir "$gta_out_dir"

  # Loop over the array and create pairwise arrays
  for ((i=0; i<${#STUDIES_NUMS[@]}; i++)); do
@@ -1088,13 +1086,13 @@ gta() {
  for s in "${STUDIES_NUMS[@]}"; do
    zscores_file="$QHTCP_PROJECT_DIR/Exp$s/$zscores_file"
    if [[ -f $zscores_file ]]; then
-      mkdir "$GTA_OUT_DIR/Exp$s"
+      mkdir "$gta_out_dir/Exp$s"
      r_gta \
        "Exp$s" \
        "$zscores_file" \
        "$sgd_terms_tfile" \
-        "$sgd_features_file" \
-        "$GTA_OUT_DIR"
+        "$gene_association_sgd" \
+        "$gta_out_dir"
    fi
  done

@@ -1102,7 +1100,7 @@ gta() {
  for combo in "${study_combos[@]}"; do
    # Split on comma and assign to array
    IFS=',' read -ra studies <<< "$combo"
-    r_gta_pairwiselk "${studies[0]}" "${studies[1]}" "$STUDY_INFO_FILE" "$GTA_OUT_DIR"
+    r_gta_pairwiselk "${studies[0]}" "${studies[1]}" "$STUDY_INFO_FILE" "$gta_out_dir"
  done

  # All studies
@@ -1110,7 +1108,7 @@ gta() {
  # are required
  r_gta_heatmaps \
    "$STUDY_INFO_FILE" \
-    "$gene_ontology_file" \
+    "$gene_ontology_obo" \
    "$sgd_terms_tfile" \
    "$all_sgd_terms_csv" \
    "$zscores_file" \
@@ -1121,14 +1119,12 @@ gta() {


 # @section Submodules
-# @description Submodules provide functionality to modules and should be reusable
-# A submodule only runs by default if called by a module
-# Use a submodule for:
-#   * Calling external scripts
-#   * Performing repetitive tasks
-#   * Generalizing code
-#   * Functions you do not want to perform by default (submodules should be called modules)
-#   * Should not call cd or pushd (let module dictate)
+# @description Submodules are shell wrappers for workflow components in external languages.
+# Submodules:
+#   * Allow scripts to be called by the main workflow script using input
+#     and output arguments as a translation mechanism.
+#   * Only run by default if called by a module.
+#     * Can also be called directly, with their arguments given as a comma-separated string


 submodule r_gta
@@ -1320,6 +1316,10 @@ r_interactions() {

 submodule r_join_interactions
 # @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
+# Output files:
+#   * REMcRdy_lm_only.csv
+#   * Shift_only.csv
+#   * parameters.csv
 # @arg $1 string The output directory
 # @arg $2 string The sd value
 # @arg $3 string The studyInfo file
@@ -1552,19 +1552,15 @@ get_studies() {

 # submodule choose_easy_results_dir #
 # # @description Chooses an EASY scans directory if the information is undefined
-# # TODO: Standardize EASY output, it's hard to understand
+# # TODO Standardize EASY output, it's hard to understand
 # # TODO eventually we could run this on multiple results dirs simultaneously with some refactoring
 # # @exitcode 0 if successfully choose an EASY results dir
 # # @set EASY_RESULTS_DIR string The working EASY output directory
 # choose_easy_results_dir() {
 #   debug "Running: ${FUNCNAME[0]}"
-
-  
-
 #   # Always backup existing output
 #   # This would happen if you ran the same experiment twice in one day, for instance
 #   [[ -d $EASY_RESULTS_DIR ]] && backup "$EASY_RESULTS_DIR"
-
 #   if [[ ! -d $EASY_RESULTS_DIR ]]; then
 #     debug "mkdir $EASY_RESULTS_DIR"
 #     mkdir "$EASY_RESULTS_DIR"
@@ -1572,7 +1568,6 @@ get_studies() {
 #     err "Could not create $EASY_RESULTS_DIR"
 #     return 0
 #   fi
-
 #   # echo "Hit enter to use the default EASY results directory: $default_easy_results_dir"
 #   # if ! (( YES )); then
 #   #   read -r -p "Or enter a custom directory name, example: $PROJECT" dirname