Files
hartman-server/workflow/script-run-workflow
2024-07-22 16:33:35 -04:00

656 lines
20 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
#
# This is a code scratchpad for organizing the Hartman Lab Server workflow
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
#
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @option -p<value> | --project=<value> Include one or more projects in the analysis
# @option -i<value> | --include=<value> Include one or more modules in the analysis (default: all modules)
# @option -x<value> | --exclude=<value> Exclude one or more modules in the analysis
# @option -y | --yes | --auto Assume yes answer to all questions (non-interactive mode)
# @option -d | --debug Turn on extra debugging output
# @option -h | --help Print help message and exit (overrides other options)
# Debugging output is switched ON by default while the workflow is under development
DEBUG=1
# Enable the extended pattern-matching operators for the whole script
shopt -s extglob
# @section Libraries
# @description Change these variables to use different libraries
# Each interpreter keeps the caller's value when one is already set (and non-empty)
: "${JAVA:=java}"
: "${PYTHON:=python3}"
: "${PERL:=perl}"
# @section Help
# @description Print a helpful message
# Prints the usage text to stdout.
# Reads ALL_MODULES, ALL_SUBMODULES, PROJECT_PREFIX and the depends_* arrays,
# which are (re)loaded here through install_dependencies --get-depends.
# The EXAMPLES use --include because parse_input does not accept a --module option.
print_help() {
  debug "Running: ${FUNCNAME[0]}"
  install_dependencies --get-depends # Loads the dependency arrays
  # The here-document body is kept flush-left on purpose: <<- only strips tabs
  cat <<-EOF
USAGE:
script-run-workflow [[OPTION] [VALUE]]...
Some options (--project, --include, --exclude) can be passed multiple times or
by using comma delimited strings (see EXAMPLES below)
OPTIONS:
--project, -p PROJECT
PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT_NAME
--include, -i MODULE
See MODULES section below for list of available modules
If no --include is specified, all modules are run
--exclude, -x MODULE
See MODULES section below for list of modules to exclude
--yes, -y, --auto
Always answer yes to questions (non-interactive mode)
--debug, -d
Print extra debugging info
--help, -h
Print this help message and exit
MODULES:
${ALL_MODULES[*]}
SUBMODULES:
${ALL_SUBMODULES[*]}
DEPENDENCIES:
deb: ${depends_deb[@]}
rpm: ${depends_rpm[@]}
brew: ${depends_brew[@]}
perl: ${depends_perl[@]}
R: ${depends_r[@]}
BiocManager: ${depends_bioc[@]}
EXAMPLES:
script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
script-run-workflow --include=${ALL_MODULES[0]},${ALL_MODULES[1]}
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --include=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
EOF
}
# @section User Input
# @description Creates array and switches from user input
# Parses the command line with (enhanced) getopt and fills the global state:
#   PROJECTS, INCLUDE_MODULES, EXCLUDE_MODULES - appended to, one element per value
#   YES, DEBUG                                 - set to 1 by their switches
# --project/--include/--exclude accept a single value or a comma-separated list;
# either form is appended to what earlier occurrences of the option collected.
# Exits the script with status 1 on invalid options and 0 after printing --help.
parse_input() {
  debug "Running: ${FUNCNAME[0]}" "$@"
  local long_opts="project:,include:,exclude:,yes,auto,debug,help"
  local short_opts="+p:i:x:yhd"
  local input
  local -a values
  if ! input=$(getopt -o "$short_opts" -l "$long_opts" -- "$@"); then
    err "Incorrect options provided"; exit 1
  fi
  # getopt emits a shell-quoted argument list; eval re-parses it into "$@"
  eval set -- "$input"
  while (( $# )); do
    case $1 in
      --project|-p)
        shift
        IFS=',' read -ra values <<< "$1"
        PROJECTS+=("${values[@]}")
        ;;
      --include|-i)
        shift
        IFS=',' read -ra values <<< "$1"
        INCLUDE_MODULES+=("${values[@]}")
        ;;
      --exclude|-x)
        shift
        IFS=',' read -ra values <<< "$1"
        EXCLUDE_MODULES+=("${values[@]}")
        ;;
      --yes|-y|--auto)
        declare -g YES=1
        ;;
      --debug|-d)
        declare -g DEBUG=1
        ;;
      --help|-h)
        print_help; exit 0
        ;;
      --)
        shift
        break
        ;;
      *)
        # Not expected after getopt normalization; fail loudly instead of looping
        err "Unhandled option: $1"; exit 1
        ;;
    esac
    shift
  done
  return 0
}
# @section Helper functions
# @internal
# Registers a module: appends its name to ALL_MODULES and declares a global
# associative array carrying the same name.
# $1 - module name
module() {
  local mod_name=$1
  debug "Adding ${mod_name} module"
  ALL_MODULES+=("${mod_name}")
  declare -g -A "${mod_name}"
}
# Registers a submodule: appends its name to ALL_SUBMODULES and declares a
# global associative array carrying the same name.
# $1 - submodule name
submodule() {
  local sub_name=$1
  debug "Adding ${sub_name} submodule"
  ALL_SUBMODULES+=("${sub_name}")
  declare -g -A "${sub_name}"
}
# This function will only work if users have an actual name registered on the server
# TODO for now just use username
# user_initials() {
# user_record="$(getent passwd "$(whoami)")"
# user_gecos_field="$(echo "$user_record" | cut -d ':' -f 5)"
# user_full_name="$(echo "$user_gecos_field" | cut -d ',' -f 1)"
# last="${user_full_name#* }"
# echo "${user_full_name:0:1}${last:0:1}"
# }
# Asks a yes/no question on the terminal.
# Arguments: the question text (all arguments are joined into the prompt)
# Returns:   0 when YES is non-zero (non-interactive mode) or the reply is
#            "y"/"yes" in any letter case; 1 for every other reply
ask() {
  local reply
  if (( YES )); then
    return 0
  fi
  read -r -p "$* [y/N]: " reply
  case "${reply,,}" in
    y|yes) return 0 ;;
    *) return 1 ;;
  esac
}
# Writes "Error: <arguments>" to stderr
err() {
  printf 'Error: %s\n' "$*" >&2
}
# Prompts for a full project name and stores the reply in the global PROJECT.
# The prompt shows ${PROJECT_PREFIX} as an example of the expected prefix.
ask_pn() {
  local prompt="Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): "
  declare -g PROJECT
  read -r -p "${prompt}" PROJECT
}
# Prints "Debug: <arguments>" to stdout when DEBUG is non-zero.
# Always returns 0: the previous `(( DEBUG )) && echo` form returned 1 whenever
# debugging was off, which made every silent debug call look like a failure.
debug() {
  if (( DEBUG )); then
    echo "Debug: $*"
  fi
  return 0
}
# @section Modules
# @description A module contains a cohesive set of actions/experiments to run on a project
# Use a module when:
# * Building a new type of analysis
# * Combining submodules
#
#
module install_dependencies
# @description Installs dependencies for the workflow
# @arg $1 string Pass --get-depends to only load the depends_* arrays and return
install_dependencies() {
  debug "Running: ${FUNCNAME[0]}" "$@"
  local r_pkgs bioc_pkgs r_cmd
  # Dependency arrays (deliberately global: print_help reads them)
  depends_rpm=(graphviz pandoc pdftk-java gd-devel shdoc)
  depends_deb=(graphviz pandoc pdftk-java libgd-dev shdoc)
  depends_brew=(graphviz pandoc gd pdftk-java shdoc)
  depends_perl=(File::Map ExtUtils::PkgConfig GD GO::TermFinder)
  depends_r=(BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2
    plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown)
  depends_bioc=(org.Sc.sgd.db)
  [[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars
  # Install system-wide dependencies
  echo "Installing system dependencies"
  case "$(uname -s)" in
    Linux*|CYGWIN*|MINGW*)
      ask "Detected Linux platform, continue?" || return 1
      echo "You may be prompted for your sudo password to install system packages"
      if hash dnf &>/dev/null; then
        sudo dnf install "${depends_rpm[@]}"
      elif hash apt &>/dev/null; then
        sudo apt install "${depends_deb[@]}"
      else
        echo "No supported package manager (dnf, apt) found, please install dependencies manually"
      fi
      ;;
    Darwin*)
      ask "Detected Mac platform, continue?" || return 1
      # Only bootstrap Homebrew when it is not installed yet
      if ! hash brew &>/dev/null; then
        export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
        curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh|bash
      fi
      brew install "${depends_brew[@]}"
      ;;
    *)
      echo "Your system could not be detected, please install dependencies manually"
      ;;
  esac
  # Install perl CPAN modules
  echo "Installing perl CPAN modules"
  debug "cpan" "${depends_perl[@]}"
  cpan "${depends_perl[@]}"
  # Install R packages
  echo "Installing R packages"
  # Render each array as the inside of an R character vector: "a", "b", "c"
  printf -v r_pkgs '"%s", ' "${depends_r[@]}"
  r_pkgs="${r_pkgs%, }"
  printf -v bioc_pkgs '"%s", ' "${depends_bioc[@]}"
  bioc_pkgs="${bioc_pkgs%, }"
  r_cmd="install.packages(c($r_pkgs), dep=TRUE, repos=\"https://cloud.r-project.org\")"
  debug "Rscript -e $r_cmd"
  Rscript -e "$r_cmd"
  # Every Bioconductor package in depends_bioc is installed, not just the first
  Rscript -e "BiocManager::install(c($bioc_pkgs))"
}
module init_job
# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# Reads SCAN_DIR and PROJECT; sets DRUG_MEDIA_FILE and MASTER_PLATE_FILE and
# creates both as empty placeholder files under $SCAN_DIR/MasterPlateFiles.
# Returns 0 when the user declines to re-initialize an existing directory and
# 1 when the user declines to create a new one or a directory/file cannot be created.
# TODO Copy over source image directories from robot - are these also named by the ExpJobs name?
init_job() {
  debug "Running: ${FUNCNAME[0]}"
  local f
  if [[ -d $SCAN_DIR ]]; then
    ask "$SCAN_DIR already exists, re-initialize?" || return 0
  else
    ask "Initialize a new project at $SCAN_DIR?" || return 1
    mkdir -p "$SCAN_DIR" || return 1
  fi
  mkdir -p "$SCAN_DIR/MasterPlateFiles" || return 1
  DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT.xls"
  MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT.xls"
  # TODO Where are the actual templates?
  # Quoted so that paths containing whitespace are not split into several files
  for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
    touch "$f" || return 1
  done
}
module easy
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and MasterPlateFiles folder are the inputs for image analysis with EASY software.
# EASY will automatically generate a Results directory (within the ExpJobs/ExperimentJob folder) w/ timestamp and an optional short description provided by the user (Fig.2).
# The Results directory is created and entered, using the "File >> New Experiment" dropdown in EASY.
# Multiple Results files may be created (and uniquely named) within an ExperimentJob folder.
# Works inside $SCAN_DIR and restores the previous directory before returning.
# Returns 1 when $SCAN_DIR cannot be entered or contains no Results* entries.
easy() {
  debug "Running: ${FUNCNAME[0]}"
  local -a EASY_OUT_ARRAY=()
  EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"
  pushd "$SCAN_DIR" || return 1
  # Launch graphical matlab if the user wants
  # NOTE(review): -r is handed the path of the .m file as-is; confirm that MATLAB
  # accepts a path here rather than a statement such as run('<path>')
  ask "Start EASY in MATLAB? This requires a GUI." && matlab -nosplash -r "$EASY"
  # glob EASY output and make sure it exists
  # nullglob stays on for every glob below so unmatched patterns vanish instead
  # of being recorded as literal strings
  shopt -s nullglob
  EASY_RESULTS_DIRS=( Results* )
  if (( ${#EASY_RESULTS_DIRS[@]} == 0 )); then
    shopt -u nullglob
    echo "Missing EASY output"
    popd || return 1
    return 1
  fi
  for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
    [[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"
    EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
    [[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"
    EASY_PRINT_RESULTS_FILES=(
      "$EASY_PRINT_RESULTS_DIR/!!"*
      "$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
      "$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
    )
    EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
  done
  shopt -u nullglob
  echo "EASY OUTPUT ARRAY: " "${EASY_OUT_ARRAY[@]}"
  popd || return 1
}
module ezview
# @section EZView
# Sets the global EZVIEW_DIR and prints it on stdout
ezview() {
  debug "Running: ${FUNCNAME[0]}"
  EZVIEW_DIR="/mnt/data/EZVIEW"
  printf '%s\n' "${EZVIEW_DIR}"
}
module qhtcp
# @section QHTCP
# @description Main QHTCP module (functional rewrite of REMcMaster3.sh)
# Reads SCRIPT_DIR and PROJECT. The studies root defaults to
# /mnt/data/StudiesQHTCP and can be overridden with the STUDIES_DIR variable.
# Returns 1 when the backup, the template copy or any submodule fails; the
# remaining submodules are skipped because each step consumes the previous output.
qhtcp() {
  debug "Running: ${FUNCNAME[0]}"
  local s
  TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
  QHTCP_DIR="${STUDIES_DIR:-/mnt/data/StudiesQHTCP}/$PROJECT"
  # Our list of submodules (functions) to run for this module
  # Put these in the appropriate order of operations
  submodules=(
    r_join_interact
    java_extract
    r_add_shift_values
    r_heat_maps_zscores
    r_heat_maps_homology
    py_gtf
    r_compile_gtf
  )
  # Keeps asking until the user picks either "update" or "back up and start fresh"
  while [[ -d $QHTCP_DIR ]]; do
    echo "A project already exists at $QHTCP_DIR"
    ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
    if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
      if ! mv "$QHTCP_DIR" "$QHTCP_DIR.bk"; then
        echo "Backup unsuccessful, exiting"
        return 1
      fi
    fi
  done
  # Copy template to QHTCP project directory
  if ! rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
    err "Could not copy $TEMPLATE_DIR to $QHTCP_DIR"
    return 1
  fi
  echo "New project created at $QHTCP_DIR"
  # Create StudyInfo.csv
  # Right now this is identical to the template but we can change it later
  cat <<-EOF > "$QHTCP_DIR/Code/StudyInfo.csv"
ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy
1,ExpName1,NA,NA,UserInitials
2,ExpName2,NA,NA,UserInitials
3,ExpName3,NA,NA,UserInitials
4,ExpName4,NA,NA,UserInitials
EOF
  # Enter REMc directory to run the scripts there
  pushd "$QHTCP_DIR/REMc" || return 1
  # Run each submodule, stopping at the first failure
  for s in "${submodules[@]}"; do
    if ! "$s"; then
      err "Submodule $s failed, aborting the qhtcp module"
      popd || return 1
      return 1
    fi
  done
  popd || return 1
}
# @section Submodules
# @description Submodules provide functionality to modules and are reusable between modules
# Use a submodule when:
# * Calling external scripts
# * Performing repetitive tasks
# *
#
submodule r_join_interact
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
# Runs the R script in the current directory, then verifies both output files.
# Returns 1 as soon as one of them is missing.
r_join_interact() {
  debug "Running: ${FUNCNAME[0]}"
  local out_file="REMcRdy_lm_only.csv"
  local out_file2="Shift_only.csv"
  local f
  echo "Rscript JoinInteractExps3dev.R"
  Rscript JoinInteractExps3dev.R
  for f in "$out_file" "$out_file2"; do
    # A plain `return` here: inside ( ... ) it would only leave the subshell
    if [[ ! -f $f ]]; then
      echo "$f does not exist"
      return 1
    fi
  done
}
submodule java_extract
# @description Jingyu's REMc java utility using file input file REMcRdy_lm_only.csv
# and output REMcRdy_lm_only.csv-finalTable.csv
# An existing output file is renamed to <name>.bk before the run.
# Returns 1 when the output file is absent afterwards.
java_extract() {
  debug "Running: ${FUNCNAME[0]}"
  classpath="jingyuJava_1_7_extractLib.jar"
  out_file="REMcRdy_lm_only.csv-finalTable.csv"
  # backup REMcRdy_lm_only.csv-finalTable.csv
  if [[ -f $out_file ]]; then
    mv "$out_file" "$out_file.bk"
  fi
  # JVM invocation first, then the positional arguments of ExecMain
  java_cmd=("$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain)
  java_cmd+=("REMcRdy_lm_only.csv")
  java_cmd+=("GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab")
  java_cmd+=("ORF_List_Without_DAmPs.txt" 1 true true)
  echo "${java_cmd[@]}"
  "${java_cmd[@]}"
  if [[ -f $out_file ]]; then
    return 0
  fi
  echo "$out_file does not exist"
  return 1
}
submodule r_add_shift_values
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
# Also deletes every PDF in REMcHeatmaps/ after the R script has run.
# Returns 1 when REMcHeatmaps/REMcWithShift.csv is absent afterwards.
r_add_shift_values() {
  debug "Running: ${FUNCNAME[0]}"
  out_file="REMcHeatmaps/REMcWithShift.csv"
  echo "Rscript AddShiftVals2.R"
  Rscript AddShiftVals2.R
  rm -f "REMcHeatmaps/"*.pdf
  if [[ ! -f $out_file ]]; then
    echo "$out_file does not exist"
    return 1
  fi
}
submodule r_heat_maps_zscores
# @description Execute REMcHeatmaps_zscores.R
# Merges every PDF found in REMcHeatmaps/ into compiledREMcHeatmaps.pdf with pdftk.
# A compiled PDF left over from an earlier run is never used as an input.
# Returns 1 when no PDFs were produced, pdftk fails, or the compiled file is missing.
r_heat_maps_zscores() {
  debug "Running: ${FUNCNAME[0]}"
  local out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"
  local -a pdfs=()
  local pdf
  echo "Rscript REMcHeatmaps_zscores.R"
  Rscript REMcHeatmaps_zscores.R
  for pdf in REMcHeatmaps/*.pdf; do
    # -f drops the literal pattern left behind by a glob that matched nothing
    [[ -f $pdf && $pdf != "$out_file" ]] && pdfs+=("$pdf")
  done
  if (( ${#pdfs[@]} == 0 )); then
    echo "No heat map PDFs found in REMcHeatmaps"
    return 1
  fi
  echo "pdftk ${pdfs[*]} output $out_file"
  pdftk "${pdfs[@]}" output "$out_file" || return 1
  if [[ ! -f $out_file ]]; then
    echo "$out_file does not exist"
    return 1
  fi
}
submodule r_heat_maps_homology
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
# Copies REMcWithShift.csv into REMcHeatmapsWithHomology/, clears the old
# Homology output, runs the R script from inside that directory and merges the
# resulting PDFs with pdftk.
# Returns 1 when the work directory cannot be entered, no PDFs were produced,
# pdftk fails, or the compiled file is missing.
r_heat_maps_homology() {
  debug "Running: ${FUNCNAME[0]}"
  local work_dir="REMcHeatmapsWithHomology"
  local source_file="REMcHeatmaps/REMcWithShift.csv"
  local target_file="$work_dir/REMcWithShift.csv"
  local out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"
  local -a pdfs=()
  local pdf
  echo "rsync --archive $source_file $target_file"
  rsync --archive "$source_file" "$target_file"
  # Clean old output; -f keeps a first run (nothing to delete yet) quiet
  rm -f -- "$work_dir/Homology/"*.pdf "$work_dir/Homology/"*.csv
  pushd "$work_dir" || return 1
  Rscript \
    REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
    REMcWithShift.csv \
    Homology \
    17_0503_DAmPs_Only.txt \
    Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
  popd || return 1
  for pdf in "$work_dir"/Homology/*.pdf; do
    # -f drops the literal pattern left behind by a glob that matched nothing
    [[ -f $pdf && $pdf != "$out_file" ]] && pdfs+=("$pdf")
  done
  if (( ${#pdfs[@]} == 0 )); then
    echo "No homology heat map PDFs found in $work_dir/Homology"
    return 1
  fi
  pdftk "${pdfs[@]}" output "$out_file" || return 1
  if [[ ! -f $out_file ]]; then
    echo "$out_file does not exist"
    return 1
  fi
}
submodule py_gtf
# @description Perform GTF
# Runs DconJG2.py, copies its REMcRdy_lm_only output into the Function and
# Component directories, then processes the three GTF directories in parallel.
# Waits for all three background jobs before returning.
# Returns 1 when DconJG2.py produced no output or any directory failed.
py_gtf() {
  debug "Running: ${FUNCNAME[0]}"
  local process_dir="GTF/Process"
  local function_dir="GTF/Function"
  local component_dir="GTF/Component"
  local in_file="REMcRdy_lm_only.csv-finalTable.csv"
  local dcon_out="$process_dir/REMcRdy_lm_only/1-0-0-finaltable.csv"
  local set1="ORF_List_Without_DAmPs.txt"
  local d pid rc=0
  local -a pids=()
  echo "$PYTHON DconJG2.py $in_file $process_dir/"
  "$PYTHON" DconJG2.py "$in_file" "$process_dir/"
  if [[ ! -f $dcon_out ]]; then
    echo "$dcon_out does not exist"
    return 1
  fi
  rsync -a "$process_dir/REMcRdy_lm_only" "$function_dir/"
  rsync -a "$process_dir/REMcRdy_lm_only" "$component_dir/"
  # Analyzes every REMcRdy_lm_only/*.txt set inside one GTF directory and
  # concatenates the results.
  # $1 - directory name
  # $2 - name of the concatenated results file, relative to $1
  # The results name is held in a local because pl_analyze may assign out_file.
  _process() {
    debug "Running: ${FUNCNAME[0]}" "$@"
    local results_file=$2
    local set_file
    local -a set2
    pushd "$1" || return 1
    shopt -s nullglob
    set2=(REMcRdy_lm_only/*.txt)
    shopt -u nullglob
    for set_file in "${set2[@]}"; do
      pl_analyze "$set1" "$set_file"
      # pl_terms2tsv already writes "$set_file.tsv"; the identical perl call
      # that used to follow it here was redundant
      pl_terms2tsv "$set_file"
    done
    # Concat the process ontology outputs from the /REMcReady_lm_only folder
    echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $results_file"
    echo "TODO: Concatenate_GTF_results.py should be translated to bash"
    "$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$results_file"
    if [[ ! -f $results_file ]]; then
      echo "$results_file does not exist"
      popd || return 1
      return 1
    fi
    popd || return 1
  }
  # Perform operations in each directory
  for d in "$process_dir" "$function_dir" "$component_dir"; do
    # Use the dirname to create each Results filename
    _process "$d" "${d##*/}Results.txt" & # parallelize
    pids+=("$!")
  done
  # Block until every directory is done so the next step sees complete output
  for pid in "${pids[@]}"; do
    wait "$pid" || rc=1
  done
  return "$rc"
}
submodule pl_analyze
# @description Perl analyze submodule
# This seems weird to me because we're just overwriting the same data for all set2 members
# https://metacpan.org/dist/GO-TermFinder/view/examples/analyze.pl
# Is there a reason you need a custom version and not the original from cpan?
# All working variables are local so callers' variables (notably out_file) are untouched.
# @arg $1 string Set 1
# @arg $2 string Set 2
pl_analyze() {
  local script="analyze_v2.pl"
  local an="gene_association.sgd"
  # NOTE(review): passed with -o; presumably the ontology (.obo) input rather
  # than an output file - confirm against analyze_v2.pl
  local ontology="gene_ontology_edit.obo"
  debug "$PERL $script -an $an -as P -o $ontology -b $1 $2"
  "$PERL" "$script" -an "$an" -as P -o "$ontology" -b "$1" "$2"
}
submodule pl_terms2tsv
# @description Perl terms2tsv submodule
# Probably should be translated to shell/python
# Runs terms2tsv_v4.pl on "$1.terms" and writes its stdout to "$1.tsv".
# The script name is held in a local so no global is overwritten.
# @arg $1 string Set 2
pl_terms2tsv() {
  local script="terms2tsv_v4.pl"
  debug "$PERL $script $1.terms > $1.tsv"
  "$PERL" "$script" "$1.terms" > "$1.tsv"
}
submodule documentation
# @section Documentation
# @description Generates markdown documentation from this script using shdoc
# shdoc reads $SCRIPT twice: once for the terminal, once for documentation.md
# in the current directory.
documentation() {
  debug "Running: ${FUNCNAME[0]}"
  local markdown_file="documentation.md"
  # Print markdown to stdout
  shdoc < "${SCRIPT}"
  # Create markdown file
  shdoc < "${SCRIPT}" > "${markdown_file}"
}
submodule r_compile_gtf
# @description Compile GTF in R
# Registered like the other qhtcp steps so it is listed in ALL_SUBMODULES.
# Returns the exit status of Rscript.
r_compile_gtf() {
  debug "Running: ${FUNCNAME[0]}"
  echo "Rscript CompileGTF.R"
  Rscript CompileGTF.R
}
# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overridden by env
# Returns 1 when a project or module name is still invalid after one
# replacement prompt, or when any module that was run returned non-zero.
main() {
  debug "Running: ${FUNCNAME[0]}" "$@"
  local i m rc=0
  local -a arr=()
  # Where are we located?
  SCRIPT=$(realpath -s "${BASH_SOURCE[0]}")
  SCRIPT_DIR=$(dirname "$SCRIPT")
  # Set the automatic project directory prefix
  PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"
  san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; } # sanitizer regex for prefix
  # Exact-match membership test: _in_list NEEDLE [ITEM]...
  _in_list() {
    local needle=$1 item
    shift
    for item in "$@"; do
      [[ $item == "$needle" ]] && return 0
    done
    return 1
  }
  declare -ag PROJECTS=() # this array will hold all of the projects for this run
  parse_input "$@" # parse arguments with getopt
  # Prompt user for the PROJECT if we still don't have one
  if [[ ${#PROJECTS[@]} -eq 0 ]]; then
    ask_pn
    PROJECTS+=("$PROJECT")
  fi
  for i in "${!PROJECTS[@]}"; do
    if ! san "${PROJECTS[i]}"; then
      echo "Project name ${PROJECTS[i]} is invalid"
      echo "Enter a replacement"
      ask_pn
      # Plain return: inside ( ... ) it would only leave the subshell
      if ! san "$PROJECT"; then
        echo "RTFM"
        return 1
      fi
      PROJECTS[i]="$PROJECT"
    fi
  done
  SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else
  # If we don't catch with getopt or env, run all
  if [[ ${#INCLUDE_MODULES[@]} -eq 0 ]]; then
    MODULES=("${ALL_MODULES[@]}")
  else
    MODULES=("${INCLUDE_MODULES[@]}")
  fi
  # Exclude modules from --exclude
  for m in "${MODULES[@]}"; do
    _in_list "$m" "${EXCLUDE_MODULES[@]}" || arr+=("$m")
  done
  MODULES=("${arr[@]}")
  # Sanitize MODULES
  for i in "${!MODULES[@]}"; do
    if ! _in_list "${MODULES[i]}" "${ALL_MODULES[@]}"; then
      echo "Module ${MODULES[i]} not in the module list"
      echo "Available modules: ${ALL_MODULES[*]}"
      read -r -p "Enter replacement module name: " MODULE
      # Give up when the replacement is not a known module either
      if ! _in_list "$MODULE" "${ALL_MODULES[@]}"; then
        echo "RTFM"
        return 1
      fi
      MODULES[i]="$MODULE"
    fi
  done
  # Loop over projects
  for PROJECT in "${PROJECTS[@]}"; do
    SCAN_DIR="$SCANS_DIR/$PROJECT"
    # Run selected modules
    for m in "${MODULES[@]}"; do
      if ask "Run $m"; then
        # Keep going with the remaining modules, but remember the failure
        "$m" || { err "Module $m failed for project $PROJECT"; rc=1; }
      fi
    done
  done
  return "$rc"
}
# Entry point: hand the command-line arguments to main and exit with its status
main "$@"
exit "$?"