hartman-server/workflow/script-run-workflow

#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
#
# This is a code scratchpad for organizing the Hartman Lab Server workflow
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
#
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @option -p<value> | --project=<value> Include one or more projects in the analysis
# @option -i<value> | --include=<value> Include one or more modules in the analysis (default: all modules)
# @option -x<value> | --exclude=<value> Exclude one or more modules in the analysis
# @option -y | --yes | --auto Assume yes answer to all questions (non-interactive mode)
# @option -d | --debug Turn on extra debugging output
# @option -h | --help Print help message and exit (overrides other options)

# Debug switch read by debug(): a non-zero value enables "Debug: ..." output
DEBUG=1 # Turn debugging ON by default during development
# Enable bash extended pattern matching (extglob) for the rest of the script
shopt -s extglob

# @section Libraries
# @description Change these variables to use different libraries
# Each interpreter keeps the value already present in the environment and
# otherwise falls back to the bare command name
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"

# @section Help
# @description Print a helpful message
# Loads the dependency arrays with `install_dependencies --get-depends` and
# prints usage, options, registered modules/submodules, dependencies and
# examples to stdout.
# Reads PROJECT_PREFIX, ALL_MODULES, ALL_SUBMODULES and the depends_* arrays.
print_help() {
  debug "Running: ${FUNCNAME[0]}"

  install_dependencies --get-depends # Loads the dependency arrays

  # The here-doc body must stay TAB-indented: <<- strips leading tabs only.
  # Examples use --include because getopt has no --module option.
  cat <<-EOF
		USAGE:
		  script-run-workflow [[OPTION] [VALUE]]...

		  Some options (--project, --include, --exclude) can be passed multiple times or
		  by using comma deliminated strings (see EXAMPLES below)

		OPTIONS:
		  --project, -p PROJECT
		    PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT_NAME
		  --include, -i MODULE
		    See MODULES section below for list of available modules
		    If no --include is specified, all modules are run
		  --exclude, -x MODULE
		    See MODULES section below for list of modules to exclude
		  --yes, -y, --auto
		    Always answer yes to questions (non-interactive mode)
		  --debug, -d
		    Print extra debugging info
		  --help, -h
		    Print this help message and exit

		MODULES:
		  ${ALL_MODULES[*]}

		SUBMODULES:
		  ${ALL_SUBMODULES[*]}

		DEPENDENCIES:
		  deb: ${depends_deb[*]}
		  rpm: ${depends_rpm[*]}
		  brew: ${depends_brew[*]}
		  perl: ${depends_perl[*]}
		  R: ${depends_r[*]}
		  BiocManager: ${depends_bioc[*]}

		EXAMPLES:
		  script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
		  script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
		  script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
		  script-run-workflow --include=${ALL_MODULES[0]},${ALL_MODULES[1]}
		  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
		  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --include=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
	EOF
}


# @section User Input
# @description Creates array and switches from user input
# Parses the command line with getopt and fills the global PROJECTS,
# INCLUDE_MODULES and EXCLUDE_MODULES arrays plus the YES and DEBUG switches.
# List options may be repeated and/or comma separated; every value is
# appended, so earlier values are never overwritten.
# Exits 1 on invalid options and exits 0 after printing help.
# @arg $@ string The script arguments
parse_input() {
  debug "Running: ${FUNCNAME[0]}" "$@"

  local long_opts="project:,include:,exclude:,yes,auto,debug,help"
  local short_opts="+p:i:x:yhd"
  local input
  local -a values

  if ! input=$(getopt -o "$short_opts" -l "$long_opts" -- "$@"); then
    err "Incorrect options provided"
    exit 1
  fi

  # getopt emits a shell-quoted, normalized argument list
  eval set -- "$input"
  while (( $# )); do
    case $1 in
      --project|-p)
        shift
        # Splitting on commas also covers the single-value case
        IFS=',' read -ra values <<< "$1"
        PROJECTS+=("${values[@]}")
        ;;
      --include|-i)
        shift
        IFS=',' read -ra values <<< "$1"
        INCLUDE_MODULES+=("${values[@]}")
        ;;
      --exclude|-x)
        shift
        IFS=',' read -ra values <<< "$1"
        EXCLUDE_MODULES+=("${values[@]}")
        ;;
      --yes|-y|--auto)
        declare -g YES=1
        ;;
      --debug|-d)
        declare -g DEBUG=1
        ;;
      --help|-h)
        print_help
        exit 0
        ;;
      --)
        shift
        break
        ;;
    esac
    shift
  done
}

# @section Helper functions
# @internal
# Register a module: record its name in ALL_MODULES and declare a global
# associative array that carries the same name
# @arg $1 string Module name
module() {
  local module_name=$1
  debug "Adding $module_name module"
  ALL_MODULES+=("$module_name")
  declare -gA "$module_name"
}
# Register a submodule: record its name in ALL_SUBMODULES and declare a global
# associative array that carries the same name
# @arg $1 string Submodule name
submodule() {
  local submodule_name=$1
  debug "Adding $submodule_name submodule"
  ALL_SUBMODULES+=("$submodule_name")
  declare -gA "$submodule_name"
}
# This function will only work if users have an actual name registered on the server
# TODO for now just use username
# user_initials() {
#   user_record="$(getent passwd "$(whoami)")"
#   user_gecos_field="$(echo "$user_record" | cut -d ':' -f 5)"
#   user_full_name="$(echo "$user_gecos_field" | cut -d ',' -f 1)"
#   last="${user_full_name#* }"
#   echo "${user_full_name:0:1}${last:0:1}"
# }
# Ask a yes/no question built from all arguments
# Returns 0 immediately when YES is non-zero, otherwise reads one line from
# stdin and returns 0 only for "y" or "yes" (case-insensitive)
ask() {
  local answer
  if (( YES )); then
    return 0
  fi
  read -r -p "$* [y/N]: " answer
  case "${answer,,}" in
    y|yes) return 0 ;;
    *) return 1 ;;
  esac
}
# Print all arguments as a single "Error: ..." line on stderr
err() {
  printf 'Error: %s\n' "$*" >&2
}
# Prompt for a project name and store the reply in the global PROJECT
ask_pn() {
  local prompt="Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): "
  declare -g PROJECT
  read -r -p "$prompt" PROJECT
}
# Print all arguments as a single "Debug: ..." line on stdout when DEBUG is
# non-zero; returns non-zero without printing otherwise
debug() {
  (( DEBUG )) || return
  printf 'Debug: %s\n' "$*"
}


# @section Modules
# @description A module contains a cohesive set of actions/experiments to run on a project
# Use a module when:
#   * Building a new type of analysis
#   * Combining submodules
#
#

module install_dependencies
# @description Installs dependencies for the workflow
# Populates the global depends_* arrays, then installs system packages
# (dnf, apt or Homebrew), perl CPAN modules and R/Bioconductor packages.
# Returns 1 when the user declines the detected platform.
# @arg $1 string Pass --get-depends to only load the depends_* arrays
install_dependencies() {
  debug "Running: ${FUNCNAME[0]}" "$@"

  # Dependency arrays
  depends_rpm=(graphviz pandoc pdftk-java gd-devel shdoc)
  depends_deb=(graphviz pandoc pdftk-java libgd-dev shdoc)
  depends_brew=(graphviz pandoc gd pdftk-java shdoc)
  depends_perl=(File::Map ExtUtils::PkgConfig GD GO::TermFinder)
  depends_r=(BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2
    plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown)
  depends_bioc=(org.Sc.sgd.db)

  [[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars

  local r_pkgs bioc_pkgs

  # Install system-wide dependencies
  echo "Installing system dependencies"
  case "$(uname -s)" in
    Linux*|CYGWIN*|MINGW*)
      ask "Detected Linux platform, continue?" || return 1
      echo "You may be prompted for your sudo password to install system packages"
      if hash dnf &>/dev/null; then
        sudo dnf install "${depends_rpm[@]}"
      elif hash apt &>/dev/null; then
        sudo apt install "${depends_deb[@]}"
      else
        echo "No supported package manager (dnf, apt) found, please install dependencies manually"
      fi
      ;;
    Darwin*)
      ask "Detected Mac platform, continue?" || return 1
      # Only bootstrap Homebrew when it is not already installed
      if ! hash brew &>/dev/null; then
        export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
        curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
      fi
      brew install "${depends_brew[@]}"
      ;;
    *)
      echo "Your system could not be detected, please install dependencies manually"
      ;;
  esac

  # Install perl CPAN modules
  echo "Installing perl CPAN modules"
  debug "cpan" "${depends_perl[@]}"
  cpan "${depends_perl[@]}"

  # Install R packages
  echo "Installing R packages"

  # Render each array as the inside of an R c(...) vector: "a", "b", "c"
  printf -v r_pkgs '"%s", ' "${depends_r[@]}"
  r_pkgs="${r_pkgs%, }"
  printf -v bioc_pkgs '"%s", ' "${depends_bioc[@]}"
  bioc_pkgs="${bioc_pkgs%, }"

  debug "Rscript -e install.packages(c($r_pkgs), dep=TRUE, repos=\"https://cloud.r-project.org\")"
  Rscript -e "install.packages(c($r_pkgs), dep=TRUE, repos=\"https://cloud.r-project.org\")"
  Rscript -e "BiocManager::install(c($bioc_pkgs))"
}


module init_job
# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# Creates $SCAN_DIR and its MasterPlateFiles directory, then touches empty
# DrugMedia and MasterPlate spreadsheets named after $PROJECT.
# Sets the globals DRUG_MEDIA_FILE and MASTER_PLATE_FILE.
# Returns 0 when the user declines to re-initialize an existing directory and
# 1 when the user declines a new one or a directory cannot be created.
# TODO Copy over source image directories from robot - are these also named by the ExpJobs name?
init_job() {
  debug "Running: ${FUNCNAME[0]}"

  local f

  if [[ -d $SCAN_DIR ]]; then
    ask "$SCAN_DIR already exists, re-initialize?" || return 0
  else
    ask "Initialize a new project at $SCAN_DIR?" || return 1
    mkdir -p "$SCAN_DIR" || return 1
  fi

  mkdir -p "$SCAN_DIR/MasterPlateFiles" || return 1

  DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT.xls"
  MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT.xls"

  # TODO Where are the actual templates?
  # Quoted so that paths containing whitespace are not split into words
  for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
    touch "$f"
  done
}


module easy
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and ‘MasterPlateFiles’ folder are the inputs for image analysis with EASY software.
# EASY will automatically generate a ‘Results’ directory (within the ExpJobs/‘ExperimentJob’ folder) w/ timestamp and an optional short description provided by the user (Fig.2).
# The ‘Results’ directory is created and entered, using the “File >> New Experiment” dropdown in EASY.
# Multiple ‘Results’ files may be created (and uniquely named) within an ‘ExperimentJob’ folder.
#
# Works inside $SCAN_DIR: optionally launches MATLAB, then collects every
# Results* entry together with its PrintResults files and prints the list.
# Returns 1 when $SCAN_DIR cannot be entered or no Results* entry exists.
easy() {
  debug "Running: ${FUNCNAME[0]}"
  EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"

  pushd "$SCAN_DIR" || return 1

  # Launch graphical matlab if the user wants
  # NOTE(review): matlab -r normally takes a statement rather than a file path - confirm this starts EASY
  ask "Start EASY in MATLAB? This requires a GUI." && matlab -nosplash -r "$EASY"

  # glob EASY output and make sure it exists
  # nullglob stays on for all globs below so unmatched patterns vanish
  shopt -s nullglob
  EASY_RESULTS_DIRS=( Results* )
  if (( ${#EASY_RESULTS_DIRS[@]} == 0 )); then
    shopt -u nullglob
    echo "Missing EASY output"
    popd || return 1
    return 1
  fi

  declare -a EASY_OUT_ARRAY
  for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
    [[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"
    EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
    [[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"
    EASY_PRINT_RESULTS_FILES=(
      "$EASY_PRINT_RESULTS_DIR/!!"*
      "$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
      "$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
    )
    EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
  done
  shopt -u nullglob

  echo "EASY OUTPUT ARRAY: " "${EASY_OUT_ARRAY[@]}"

  # Leave the caller's working directory as we found it
  popd || return 1
}


module ezview
# @section EZView
# @description Sets the global EZVIEW_DIR and prints it on stdout
ezview() {
  debug "Running: ${FUNCNAME[0]}"
  EZVIEW_DIR="/mnt/data/EZVIEW"
  printf '%s\n' "$EZVIEW_DIR"
}


module qhtcp
# @section QHTCP
# @description Main QHTCP module (functional rewrite of REMcMaster3.sh)
# Copies the qhtcp template into the project's study directory, writes
# Code/StudyInfo.csv, then runs every submodule in order from the REMc
# directory. The studies root defaults to /mnt/data/StudiesQHTCP and can be
# overridden with the STUDIES_DIR environment variable.
# Sets the globals TEMPLATE_DIR and QHTCP_DIR.
# Returns 1 when PROJECT is empty, the backup or template copy fails, the REMc
# directory cannot be entered, or any submodule fails (later submodules are
# then skipped).
qhtcp() {
  debug "Running: ${FUNCNAME[0]}"

  # An empty PROJECT would make QHTCP_DIR the studies root itself
  if [[ -z $PROJECT ]]; then
    echo "PROJECT is not set"
    return 1
  fi

  TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
  QHTCP_DIR="${STUDIES_DIR:-/mnt/data/StudiesQHTCP}/$PROJECT"

  # Our list of submodules (functions) to run for this module
  # Put these in the appropriate order of operations
  local -a submodules=(
    r_join_interact
    java_extract
    r_add_shift_values
    r_heat_maps_zscores
    r_heat_maps_homology
    py_gtf
    r_compile_gtf
  )
  local s rc=0

  while [[ -d $QHTCP_DIR ]]; do
    echo "A project already exists at $QHTCP_DIR"
    ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
    if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
      if ! mv "$QHTCP_DIR" "$QHTCP_DIR.bk"; then
        echo "Backup unsuccessful, exiting"
        return 1
      fi
    fi
  done

  # Copy template to QHTCP project directory
  if ! rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
    echo "Could not copy the $TEMPLATE_DIR template to $QHTCP_DIR"
    return 1
  fi
  echo "New project created at $QHTCP_DIR"

  # Create StudyInfo.csv
  # Right now this is identical to the template but we can change it later
  cat <<-EOF > "$QHTCP_DIR/Code/StudyInfo.csv"
		ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy
		1,ExpName1,NA,NA,UserInitials
		2,ExpName2,NA,NA,UserInitials
		3,ExpName3,NA,NA,UserInitials
		4,ExpName4,NA,NA,UserInitials
	EOF

  # Enter REMc directory to run the scripts there
  pushd "$QHTCP_DIR/REMc" || return 1

  # Run each submodule; later steps consume earlier outputs, so stop on failure
  for s in "${submodules[@]}"; do
    if ! "$s"; then
      echo "Submodule $s failed, aborting the qhtcp module"
      rc=1
      break
    fi
  done

  popd || return 1
  return "$rc"
}


# @section Submodules
# @description Submodules provide functionality to modules and are reusable between modules
# Use a submodule when:
#   * Calling external scripts
#   * Performing repetitive tasks
#   *
#

submodule r_join_interact
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
# Runs the R script in the current directory and returns 1 as soon as one of
# the two expected output files is missing.
r_join_interact() {
  debug "Running: ${FUNCNAME[0]}"
  local f
  local -a expected=("REMcRdy_lm_only.csv" "Shift_only.csv")

  echo "Rscript JoinInteractExps3dev.R"
  Rscript JoinInteractExps3dev.R

  # Return from the function itself; a (subshell) return would be lost
  for f in "${expected[@]}"; do
    if [[ ! -f $f ]]; then
      echo "$f does not exist"
      return 1
    fi
  done
}


submodule java_extract
# @description Jingyu's REMc java utility using file input file REMcRdy_lm_only.csv
# and output REMcRdy_lm_only.csv-finalTable.csv
# Any existing output is first renamed to <output>.bk; returns 1 when the
# output file is missing after the java run.
java_extract() {
  debug "Running: ${FUNCNAME[0]}"
  out_file="REMcRdy_lm_only.csv-finalTable.csv"
  classpath="jingyuJava_1_7_extractLib.jar"

  # Keep the previous final table around as a .bk copy
  if [[ -f $out_file ]]; then
    mv "$out_file" "$out_file.bk"
  fi

  # JVM invocation first, then the program's positional arguments
  java_cmd=("$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain)
  java_cmd+=(
    "REMcRdy_lm_only.csv"
    "GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
    "ORF_List_Without_DAmPs.txt" 1 true true
  )

  echo "${java_cmd[@]}"
  "${java_cmd[@]}"

  if [[ ! -f $out_file ]]; then
    echo "$out_file does not exist"
    return 1
  fi
}


submodule r_add_shift_values
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
# Deletes every PDF in REMcHeatmaps/ after the R run; returns 1 when
# REMcHeatmaps/REMcWithShift.csv is missing.
r_add_shift_values() {
  debug "Running: ${FUNCNAME[0]}"
  out_file="REMcHeatmaps/REMcWithShift.csv"

  echo "Rscript AddShiftVals2.R"
  Rscript AddShiftVals2.R

  rm -f "REMcHeatmaps/"*.pdf

  if [[ -f $out_file ]]; then
    return 0
  fi
  echo "$out_file does not exist"
  return 1
}


submodule r_heat_maps_zscores
# @description Execute REMcHeatmaps_zscores.R
# Runs the R script, then merges every PDF in REMcHeatmaps/ into
# REMcHeatmaps/compiledREMcHeatmaps.pdf with pdftk.
# Returns 1 when no PDFs were produced or the compiled PDF is missing.
r_heat_maps_zscores() {
  debug "Running: ${FUNCNAME[0]}"
  local out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"
  local pdf
  local -a pdfs=()

  echo "Rscript REMcHeatmaps_zscores.R"
  Rscript REMcHeatmaps_zscores.R

  # Drop a compiled PDF left by an earlier run: it must not be merged into
  # itself, and a stale copy must not mask a failed pdftk run
  rm -f "$out_file"

  # The -f test also discards the literal pattern left by an unmatched glob
  for pdf in REMcHeatmaps/*.pdf; do
    [[ -f $pdf ]] && pdfs+=("$pdf")
  done
  if (( ${#pdfs[@]} == 0 )); then
    echo "No heat map PDFs found in REMcHeatmaps"
    return 1
  fi

  echo "pdftk ${pdfs[*]} output $out_file"
  pdftk "${pdfs[@]}" output "$out_file"

  if [[ ! -f $out_file ]]; then
    echo "$out_file does not exist"
    return 1
  fi
}


submodule r_heat_maps_homology
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
# Copies REMcHeatmaps/REMcWithShift.csv into REMcHeatmapsWithHomology, clears
# old Homology output, runs the R script from that directory, then merges the
# resulting PDFs into Homology/compiledREMcHomologyHeatmaps.pdf with pdftk.
# Returns 1 when the input copy fails, the work directory cannot be entered,
# no PDFs were produced, or the compiled PDF is missing.
r_heat_maps_homology() {
  debug "Running: ${FUNCNAME[0]}"
  local work_dir="REMcHeatmapsWithHomology"
  local source_file="REMcHeatmaps/REMcWithShift.csv"
  local target_file="$work_dir/REMcWithShift.csv"
  local out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"
  local pdf
  local -a pdfs=()

  echo "rsync --archive $source_file $target_file"
  if ! rsync --archive "$source_file" "$target_file"; then
    echo "Could not copy $source_file to $target_file"
    return 1
  fi

  # Clean old output; -f keeps a first run (nothing to delete yet) quiet
  rm -f "$work_dir/Homology/"*.{pdf,csv}

  pushd "$work_dir" || return 1
  Rscript \
    REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
    REMcWithShift.csv \
    Homology \
    17_0503_DAmPs_Only.txt \
    Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
  popd || return 1

  # The -f test discards the literal pattern left by an unmatched glob
  for pdf in "$work_dir"/Homology/*.pdf; do
    [[ -f $pdf ]] && pdfs+=("$pdf")
  done
  if (( ${#pdfs[@]} == 0 )); then
    echo "No homology heat map PDFs found in $work_dir/Homology"
    return 1
  fi

  pdftk "${pdfs[@]}" output "$out_file"

  if [[ ! -f $out_file ]]; then
    echo "$out_file does not exist"
    return 1
  fi
}


submodule py_gtf
# @description Perform GTF
# Runs DconJG2.py on REMcRdy_lm_only.csv-finalTable.csv, copies the resulting
# REMcRdy_lm_only directory into GTF/Function and GTF/Component, then
# processes the Process, Function and Component directories in parallel.
# Waits for all three background jobs before returning so that later
# submodules see complete output.
# Returns 1 when the DconJG2.py output is missing or any directory fails.
py_gtf() {
  debug "Running: ${FUNCNAME[0]}"
  local process_dir="GTF/Process"
  local function_dir="GTF/Function"
  local component_dir="GTF/Component"
  local in_file="REMcRdy_lm_only.csv-finalTable.csv"
  local final_table="$process_dir/REMcRdy_lm_only/1-0-0-finaltable.csv"
  local set1="ORF_List_Without_DAmPs.txt"
  local d pid rc=0
  local -a pids=()

  echo "$PYTHON DconJG2.py $in_file $process_dir/"
  "$PYTHON" DconJG2.py "$in_file" "$process_dir/"
  if [[ ! -f $final_table ]]; then
    echo "$final_table does not exist"
    return 1
  fi
  rsync -a "$process_dir/REMcRdy_lm_only" "$function_dir/"
  rsync -a "$process_dir/REMcRdy_lm_only" "$component_dir/"

  # @description Analyze every term file in one GTF directory and concatenate the results
  # @arg $1 string directory name
  # @arg $2 string results file name, relative to $1
  _process() {
    debug "Running: ${FUNCNAME[0]}" "$@"
    # Kept in a local under its own name so helpers that assign a global
    # out_file cannot redirect where the results are written
    local results_file=$2
    local term_file
    local -a set2

    pushd "$1" || return 1

    shopt -s nullglob
    set2=(REMcRdy_lm_only/*.txt)
    shopt -u nullglob

    for term_file in "${set2[@]}"; do
      pl_analyze "$set1" "$term_file"
      pl_terms2tsv "$term_file"
    done

    # Concat the process ontology outputs from the /REMcReady_lm_only folder
    echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $results_file"
    echo "TODO: Concatenate_GTF_results.py should be translated to bash"
    "$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$results_file"

    if [[ ! -f $results_file ]]; then
      echo "$results_file does not exist"
      popd || return 1
      return 1
    fi
    popd || return 1
  }

  # Perform operations in each directory
  for d in "$process_dir" "$function_dir" "$component_dir"; do
    # Use the dirname to create each Results filename
    _process "$d" "${d##*/}Results.txt" & # parallelize
    pids+=("$!")
  done

  # Barrier: collect every job and remember whether any of them failed
  for pid in "${pids[@]}"; do
    wait "$pid" || rc=1
  done
  return "$rc"
}


submodule pl_analyze
# @description Perl analyze submodule
# This seems weird to me because we're just overwriting the same data for all set2 members
# https://metacpan.org/dist/GO-TermFinder/view/examples/analyze.pl
# Is there a reason you need a custom version and not the original from cpan?
# All variables are local so the caller's own out_file is left untouched.
# @arg $1 string Set 1
# @arg $2 string Set 2
pl_analyze() {
  local script="analyze_v2.pl"
  local an="gene_association.sgd"
  # NOTE(review): passed to -o; the .obo name suggests an ontology file - confirm
  local ontology="gene_ontology_edit.obo"
  debug "$PERL $script -an $an -as P -o $ontology -b $1 $2"
  "$PERL" "$script" -an "$an" -as P -o "$ontology" -b "$1" "$2"
}


submodule pl_terms2tsv
# @description Perl terms2tsv submodule
# Probably should be translated to shell/python
# Runs terms2tsv_v4.pl on "$1.terms" and writes its stdout to "$1.tsv".
# The script name is kept local so it does not leak into the caller's scope.
# @arg $1 string Set 2
pl_terms2tsv() {
  local script="terms2tsv_v4.pl"
  debug "$PERL $script $1.terms > $1.tsv"
  "$PERL" "$script" "$1.terms" > "$1.tsv"
}


submodule documentation
# @section Documentation
# @description Generates markdown documentation from this script using shdoc
# Writes documentation.md in the current directory and prints the same
# markdown to stdout. shdoc runs once; returns 1 when it fails.
documentation() {
  debug "Running: ${FUNCNAME[0]}"

  # Create markdown file
  shdoc < "$SCRIPT" > documentation.md || return 1

  # Print markdown to stdout
  cat documentation.md
}


submodule r_compile_gtf
# @description Compile GTF in R
# Runs CompileGTF.R from the current directory; the exit status is Rscript's.
r_compile_gtf() {
  debug "Running: ${FUNCNAME[0]}"
  echo "Rscript CompileGTF.R"
  Rscript CompileGTF.R
}


# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overridden by env
# Resolves the script location, parses the arguments, validates project and
# module names (asking once for a replacement of each invalid name), then
# offers to run every selected module for every project.
# Returns 1 when a replacement project or module name is still invalid.
# @arg $@ string The script arguments
main() {
  debug "Running: ${FUNCNAME[0]}" "$@"

  local i m

  # Where are we located?
  SCRIPT=$(realpath -s "${BASH_SOURCE[0]}")
  SCRIPT_DIR=$(dirname "$SCRIPT")

  # Set the automatic project directory prefix
  PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"
  san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; } # sanitizer regex for prefix
  # Literal (non-regex) membership test against the registered module names
  is_module() { [[ -n $1 && " ${ALL_MODULES[*]} " == *" $1 "* ]]; }

  declare -ag PROJECTS=() # this array will hold all of the projects for this run

  parse_input "$@" # parse arguments with getopt

  # Prompt user for the PROJECT if we still don't have one
  if [[ ${#PROJECTS[@]} -eq 0 ]]; then
    ask_pn
    PROJECTS+=("$PROJECT")
  fi

  for i in "${!PROJECTS[@]}"; do
    if ! san "${PROJECTS[i]}"; then
      echo "Project name ${PROJECTS[i]} is invalid"
      echo "Enter a replacement"
      ask_pn
      # Braces, not a subshell: the return must leave main itself
      san "$PROJECT" || { echo "RTFM"; return 1; }
      PROJECTS[i]="$PROJECT"
    fi
  done

  SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else

  # If we don't catch with getopt or env, run all
  if [[ ${#INCLUDE_MODULES[@]} -eq 0 ]]; then
    MODULES=("${ALL_MODULES[@]}")
  else
    MODULES=("${INCLUDE_MODULES[@]}")
  fi

  # Exclude modules from --exclude
  arr=()
  for m in "${MODULES[@]}"; do
    [[ " ${EXCLUDE_MODULES[*]} " == *" $m "* ]] || arr+=("$m")
  done
  MODULES=("${arr[@]}")
  unset arr

  # Sanitize MODULES
  for i in "${!MODULES[@]}"; do
    if ! is_module "${MODULES[i]}"; then
      echo "Module ${MODULES[i]} not in the module list"
      echo "Available modules: ${ALL_MODULES[*]}"
      read -r -p "Enter replacement module name: " MODULE
      is_module "$MODULE" || { echo "RTFM"; return 1; }
      MODULES[i]="$MODULE"
    fi
  done

  # Loop over projects
  for PROJECT in "${PROJECTS[@]}"; do
    SCAN_DIR="$SCANS_DIR/$PROJECT"

    # Run selected modules
    for m in "${MODULES[@]}"; do
      ask "Run $m" && "$m"
    done
  done

}

main "$@"

exit $?