First attempt at script-run-workflow

2024-07-21 23:50:24 -04:00
parent 628356652d
commit 06dd700680
290 changed files with 5524411 additions and 0 deletions

workflow/script-run-workflow (new executable file, 531 lines)

@@ -0,0 +1,531 @@
#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
# This is currently a code scratchpad for organizing the Hartman Lab Server workflow
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @arg $1 string A project name
#
shopt -s extglob
# @section Libraries
# @description Change these variables to use different libraries
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"
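# Example override at invocation time (illustrative paths/versions):
#   JAVA=/opt/java/bin/java PYTHON=python3.12 PERL=/usr/bin/perl ./script-run-workflow --help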
# @section Help
# @description Print a helpful message
print_help() {
echo "Running: ${FUNCNAME[0]}"
cat <<-EOF
USAGE:
script-run-workflow [[OPTION] [VALUE]]...
Some options (--project, --include, --exclude) can be passed multiple times for batch operations.
OPTIONS:
--project, -p PROJECT
PROJECT should follow the pattern ${PROJECT_PREFIX}_UNIQUE_PROJECT_NAME
--include, -i MODULE
See MODULES section below for list of available modules
If no --include is specified, all modules are run
--exclude, -x MODULE
See MODULES section below for list of available modules
--yes, -y, --auto
Always answer yes to questions (non-interactive mode)
--debug, -d
Print extra debugging info
--help, -h
Print this help message and exit
MODULES:
${ALL_MODULES[*]}
DEPENDENCIES:
binaries (system): graphviz pandoc pdftk-java gd-devel
perl (cpan): File::Map ExtUtils::PkgConfig GD GO::TermFinder
R (default): BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown
R (BiocManager): org.Sc.sgd.db
EXAMPLES:
script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
EOF
}
# @section User Input
# @description Creates array and switches from user input
parse_input() {
echo "Running: ${FUNCNAME[0]}" "$@"
long_opts="project:,include:,exclude:,yes,auto,debug,help"
#long_opts+="restorefile:,betapass:,"
short_opts="+p:i:x:yhd"
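# Note: the leading '+' tells GNU getopt to stop scanning at the first non-option argument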
if input=$(getopt -o $short_opts -l $long_opts -- "$@"); then
eval set -- "$input"
while true; do
case $1 in
--project|-p)
shift
declare -ga PROJECT_NAMES+=("$1")
;;
--include|-i)
shift
declare -ga MODULES+=("$1")
;;
--exclude|-x)
shift
declare -ga EXCLUDE_MODULES+=("$1")
;;
--yes|-y|--auto)
declare -g YES=1
;;
--debug|-d)
declare -g DEBUG=1
;;
--help|-h)
print_help; exit 0
;;
--)
shift
break
;;
esac
shift
done
else
err "Incorrect options provided"; exit 1
fi
}
# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# TODO Copy over source image directories from robot - are these also named by the ExpJobs name?
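# Example of the layout init_job creates, using an illustrative project name "jdoe_24_07_21_MYPROJECT":
#   $SCANS_DIR/jdoe_24_07_21_MYPROJECT/
#   └── MasterPlateFiles/
#       ├── DrugMedia_jdoe_24_07_21_MYPROJECT.xls
#       └── MasterPlate_jdoe_24_07_21_MYPROJECT.xls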
init_job() {
echo "Running: ${FUNCNAME[0]}"
if [[ -d $SCAN_DIR ]]; then
ask "$SCAN_DIR already exists, re-initialize?" || return 0
else
ask "Initialize a new project at $SCAN_DIR?" || return 1
mkdir -p "$SCAN_DIR"
fi
[[ -d $SCAN_DIR/MasterPlateFiles ]] || mkdir -p "$SCAN_DIR/MasterPlateFiles"
DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT_NAME.xls"
MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT_NAME.xls"
# TODO Where are the actual templates?
for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
touch "$f"
done
}
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and MasterPlateFiles folders are the inputs for image analysis with the EASY software.
# EASY will automatically generate a Results directory (within the ExpJobs/ExperimentJob folder) with a timestamp and an optional short description provided by the user (Fig.2).
# The Results directory is created and entered using the “File >> New Experiment” dropdown in EASY.
# Multiple Results files may be created (and uniquely named) within an ExperimentJob folder.
easy() {
echo "Running: ${FUNCNAME[0]}"
EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"
pushd "$SCAN_DIR" || return 1
# Launch graphical matlab if the user wants
ask "Start EASY in MATLAB? This requires a GUI." && matlab -nosplash -r "$EASY"
# glob EASY output and make sure it exists
shopt -s nullglob
EASY_RESULTS_DIRS=( Results* )
shopt -u nullglob
[[ ${#EASY_RESULTS_DIRS[@]} -ge 1 ]] || { echo "Missing EASY output" >&2; exit 1; }
declare -a EASY_OUT_ARRAY
for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
[[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"
EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
[[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"
EASY_PRINT_RESULTS_FILES=(
"$EASY_PRINT_RESULTS_DIR/!!"*
"$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
"$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
)
EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
done
echo "EASY OUTPUT ARRAY: " "${EASY_OUT_ARRAY[@]}"
}
# @section EZView
ezview() {
echo "Running: ${FUNCNAME[0]}"
EZVIEW_DIR="/mnt/data/EZVIEW"
echo "$EZVIEW_DIR"
}
# @section StudiesQHTCP
# @description This section is derived from the earliest work of Jinyu Guo and therefore uses Perl scripts. Until these two Perl scripts are ported to an integrated R or Python script, one is constrained to the rather crude copy-paste and shell-script approach inherent in the original procedures. The two Perl scripts, analyze_v2.pl and terms2tsv_v4.pl, were written in 2003 by Gavin Sherlock for the SGD gene ontology system and require the corresponding Perl module installations. They also require that the gene_ontology_edit.obo and SGD_features.tab files used in ../Code be included here. Without rewriting the code, one must compromise on directory convenience.
# @description Main loop for qhtcp modules (rewrite of REMcMaster3.sh)
qhtcp() {
echo "Running: ${FUNCNAME[0]}"
TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
QHTCP_DIR="/mnt/data/StudiesQHTCP/$PROJECT_NAME"
while [[ -d $QHTCP_DIR ]]; do
echo "A project already exists at $QHTCP_DIR"
ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
mv "$QHTCP_DIR" "$QHTCP_DIR.bk" || (echo "Backup unsuccessful, exiting"; exit 1)
fi
done
# Copy template to QHTCP project directory
if rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
echo "New project created at $QHTCP_DIR"
fi
# Enter REMc directory to run the scripts there
pushd "$QHTCP_DIR/REMc" || return 1
r_join_interact &&
java_jingyu_extract &&
r_add_shift_values &&
r_heat_maps_zscores &&
r_heat_maps_homology &&
py_gtf &&
r_compile_gtf
popd || return 1
}
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
r_join_interact() {
echo "Running: ${FUNCNAME[0]}"
echo "Rscript JoinInteractExps3dev.R"
Rscript JoinInteractExps3dev.R
out_file="REMcRdy_lm_only.csv"
out_file2="Shift_only.csv"
for f in "$out_file" "$out_file2"; do
[[ -f $f ]] || { echo "$f does not exist" >&2; return 1; }
done
}
# @description Jingyu's REMc java utility using file input file REMcRdy_lm_only.csv
# and output REMcRdy_lm_only.csv-finalTable.csv
java_jingyu_extract() {
echo "Running: ${FUNCNAME[0]}"
classpath="jingyuJava_1_7_extractLib.jar"
out_file="REMcRdy_lm_only.csv-finalTable.csv"
# backup REMcRdy_lm_only.csv-finalTable.csv
[[ -f $out_file ]] && mv "$out_file" "$out_file.bk"
java_cmd=(
"$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain
"REMcRdy_lm_only.csv"
"GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
"ORF_List_Without_DAmPs.txt" 1 true true
)
echo "${java_cmd[@]}"
"${java_cmd[@]}"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
r_add_shift_values() {
echo "Running: ${FUNCNAME[0]}"
out_file="REMcHeatmaps/REMcWithShift.csv"
echo "Rscript AddShiftVals2.R"
Rscript AddShiftVals2.R
rm -f "REMcHeatmaps/"*.pdf
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Execute REMcHeatmaps_zscores.R
r_heat_maps_zscores() {
echo "Running: ${FUNCNAME[0]}"
out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"
echo "Rscript REMcHeatmaps_zscores.R"
Rscript REMcHeatmaps_zscores.R
pdfs=(REMcHeatmaps/*.pdf)
echo "pdftk ${pdfs[*]} output $out_file"
pdftk "${pdfs[@]}" output "$out_file"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
r_heat_maps_homology() {
echo "Running: ${FUNCNAME[0]}"
work_dir="REMcHeatmapsWithHomology"
source_file="REMcHeatmaps/REMcWithShift.csv"
target_file="$work_dir/REMcWithShift.csv"
out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"
echo "rsync --archive $source_file $target_file"
rsync --archive "$source_file" "$target_file"
# Clean old output
rm "$work_dir/Homology/"*.{pdf,csv}
pushd "$work_dir" || return 1
Rscript \
REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
REMcWithShift.csv \
Homology \
17_0503_DAmPs_Only.txt \
Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
popd || return 1
pdfs=("$work_dir"/Homology/*.pdf)
pdftk "${pdfs[@]}" output "$out_file"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Perform GTF
py_gtf() {
echo "Running: ${FUNCNAME[0]}"
process_dir="GTF/Process"
function_dir="GTF/Function"
component_dir="GTF/Component"
in_file="REMcRdy_lm_only.csv-finalTable.csv"
out_file="$process_dir/REMcRdy_lm_only/1-0-0-finaltable.csv"
echo "$PYTHON DconJG2.py $in_file $process_dir/"
"$PYTHON" DconJG2.py "$in_file" "$process_dir/"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
unset out_file
rsync -a "$process_dir/REMcRdy_lm_only" GTF/Function/
rsync -a "$process_dir/REMcRdy_lm_only" GTF/Component/
# @description Run the GO term-finder Perl scripts on each cluster file in an ontology directory, then concatenate the results
# @arg $1 string directory name
_process() {
echo "Running: ${FUNCNAME[0]}" "$@"
pushd "$1" || return 1
shopt -s nullglob
set2=(REMcRdy_lm_only/*.txt)
shopt -u nullglob
for s in "${set2[@]}"; do
echo "$PERL analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b $set1 $s"
echo "TODO: analyze_v2.pl should be translated"
"$PERL" analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b "$set1" "$s"
echo "$PERL terms2tsv_v4.pl $s.terms > $s.tsv"
echo "TODO: terms2tsv_v4.pl should be translated"
"$PERL" terms2tsv_v4.pl "$s.terms" > "$s.tsv"
done
# Concat the process ontology outputs from the /REMcReady_lm_only folder
echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $out_file"
echo "TODO: Concatenate_GTF_results.py should be translated to bash"
"$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$out_file"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
popd || return 1
}
# Perform operations in each directory
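# set1 and out_file are (re)set before each launch; the backgrounded _process subshell
# inherits the values current at fork time, so each ontology gets its own Results filename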
for d in "$process_dir" "$function_dir" "$component_dir"; do
set1="ORF_List_Without_DAmPs.txt"
out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename
_process "$d" & # parallelize
done
wait # wait for the parallelized _process jobs to finish before returning
}
# @description Compile GTF in R
r_compile_gtf() {
echo "Running: ${FUNCNAME[0]}"
echo "Rscript CompileGTF.R"
Rscript CompileGTF.R
}
# @description Installs dependencies for the workflow
install_dependencies() {
echo "Running: ${FUNCNAME[0]}"
# Install system-wide dependencies
echo "Installing system dependencies"
case "$(uname -s)" in
Linux*|CYGWIN*|MINGW*)
ask "Detected Linux platform, continue?" || return 1
echo "You may be prompted for your sudo password to install system packages"
if hash dnf &>/dev/null; then
sudo dnf install graphviz pandoc pdftk-java gd-devel
elif hash apt &>/dev/null; then
sudo apt install graphviz pandoc pdftk-java libgd-dev
fi
;;
Darwin*)
ask "Detected Mac platform, continue?" || return 1
export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
brew install graphviz
brew install gd
brew install pdftk-java
brew install pandoc
cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
;;
*)
echo "Your system could not be detected, please install dependencies manually"
;;
esac
# Install perl CPAN modules
echo "Installing perl CPAN modules"
echo "cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder"
cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
# Install R packages
echo "Installing R packages"
Rscript -e 'install.packages(c(
"BiocManager",
"ontologyIndex",
"ggrepel",
"tidyverse",
"sos",
"openxlsx",
"ggplot2",
"plyr",
"extrafont",
"gridExtra",
"gplots",
"stringr",
"plotly",
"ggthemes",
"pandoc",
"rmarkdown"
), dependencies=TRUE, repos="https://cloud.r-project.org")'
Rscript -e 'BiocManager::install("org.Sc.sgd.db")'
}
# @internal
ask() {
declare response
(( YES )) && return 0
read -r -p "$* [y/N]: " response
[[ ${response,,} =~ ^(yes|y)$ ]]
}
# @internal
err() { echo "Error: $*" >&2; }
# @internal
ask_pn() {
declare -g PROJECT_NAME
read -r -p "Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): " PROJECT_NAME
}
# @internal
debug() { (( DEBUG )) && echo "Debug: $*"; }
# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overriden by env
main() {
echo "Running: ${FUNCNAME[0]}" "$@"
# Where are we located?
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Set the automatic project directory prefix
PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"
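# e.g. "jdoe_24_07_21" for user jdoe on 2024-07-21 (illustrative username)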
# When adding a module, it also should be added to this list
ALL_MODULES=(
install_dependencies
init_job
easy
ezview
qhtcp
)
declare -ga PROJECT_NAMES=() # this array will hold all of the projects to run (global, so parse_input's declare -g appends target it)
[[ $# -eq 1 ]] && PROJECT_NAMES+=("$1") # easy way to run on single dir
[[ $# -ge 2 ]] && parse_input "$@" # parse arguments with getopt
# Prompt user for the PROJECT_NAME if we still don't have one
if [[ ${#PROJECT_NAMES[@]} -eq 0 ]]; then # still allows for environment overrides
ask_pn
PROJECT_NAMES+=("$PROJECT_NAME")
fi
# Sanitize PROJECT_NAMES
# This regex should match PROJECT_PREFIX
san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; }
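# e.g. san "jdoe_24_07_21_MYPROJECT" succeeds; san "MYPROJECT" fails (illustrative names)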
for i in "${!PROJECT_NAMES[@]}"; do
if ! san "${PROJECT_NAMES[i]}"; then
echo "Project name ${PROJECT_NAMES[i]} is invalid"
echo "Enter a replacement"
ask_pn
san "$PROJECT_NAME" || { echo "RTFM"; return 1; }
PROJECT_NAMES[i]="$PROJECT_NAME"
fi
done
SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else
# If we don't catch with getopt or env, run all
[[ ${#MODULES[@]} -eq 0 ]] && MODULES=("${ALL_MODULES[@]}")
# Exclude modules overrides include
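# Membership test below: the list is space-padded so a module only matches a whole space-delimited entry in EXCLUDE_MODULES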
arr=()
for m in "${MODULES[@]}"; do
[[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${m}[[:space:]] ]] || arr+=("$m")
done
MODULES=("${arr[@]}")
unset arr
# Sanitize MODULES
for i in "${!MODULES[@]}"; do
if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
echo "Module $m not in the module list"
echo "Available modules: ${ALL_MODULES[*]}"
read -r -p "Enter replacement name: " MODULE
! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULE}[[:space:]] ]] || (echo "RTFM"; return 1)
MODULES[i]="$MODULE"
fi
done
# Loop over projects
for PROJECT_NAME in "${PROJECT_NAMES[@]}"; do
SCAN_DIR="$SCANS_DIR/$PROJECT_NAME"
# Run selected modules
for m in "${MODULES[@]}"; do
ask "Run $m" && "$m"
done
done
}
main "$@"
exit $?