First attempt at script-run-workflow

2024-07-21 23:50:24 -04:00
parent 628356652d
commit 06dd700680
290 changed files with 5524411 additions and 0 deletions

workflow/script-run-workflow (new executable file, 531 lines)

@@ -0,0 +1,531 @@
#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
# This is currently a code scratchpad for organizing the Hartman Lab Server workflow
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @arg $1 string A project name
#
shopt -s extglob
# @section Libraries
# @description Change these variables to use different libraries
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"
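# Example override at invocation time (illustrative paths/versions):
#   JAVA=/opt/java/bin/java PYTHON=python3.12 PERL=/usr/bin/perl ./script-run-workflow --help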
# @section Help
# @description Print a helpful message
print_help() {
echo "Running: ${FUNCNAME[0]}"
cat <<-EOF
USAGE:
script-run-workflow [[OPTION] [VALUE]]...
Some options (--project, --include, --exclude) can be passed multiple times for batch operations.
OPTIONS:
--project, -p PROJECT
PROJECT should follow the pattern ${PROJECT_PREFIX}_UNIQUE_PROJECT_NAME
--include, -i MODULE
See MODULES section below for list of available modules
If no --include is specified, all modules are run
--exclude, -x MODULE
See MODULES section below for list of available modules
--yes, -y, --auto
Always answer yes to questions (non-interactive mode)
--debug, -d
Print extra debugging info
--help, -h
Print this help message and exit
MODULES:
${ALL_MODULES[*]}
DEPENDENCIES:
binaries (system): graphviz pandoc pdftk-java gd-devel
perl (cpan): File::Map ExtUtils::PkgConfig GD GO::TermFinder
R (default): BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown
R (BiocManager): org.Sc.sgd.db
EXAMPLES:
script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
EOF
}
# @section User Input
# @description Creates array and switches from user input
parse_input() {
echo "Running: ${FUNCNAME[0]}" "$@"
long_opts="project:,include:,exclude:,yes,auto,debug,help"
#long_opts+="restorefile:,betapass:,"
short_opts="+p:i:x:yhd"
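# Note: the leading '+' tells GNU getopt to stop scanning at the first non-option argument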
if input=$(getopt -o $short_opts -l $long_opts -- "$@"); then
eval set -- "$input"
while true; do
case $1 in
--project|-p)
shift
declare -ga PROJECT_NAMES+=("$1")
;;
--include|-i)
shift
declare -ga MODULES+=("$1")
;;
--exclude|-x)
shift
declare -ga EXCLUDE_MODULES+=("$1")
;;
--yes|-y|--auto)
declare -g YES=1
;;
--debug|-d)
declare -g DEBUG=1
;;
--help|-h)
print_help; exit 0
;;
--)
shift
break
;;
esac
shift
done
else
err "Incorrect options provided"; exit 1
fi
}
# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# TODO Copy over source image directories from robot - are these also named by the ExpJobs name?
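# Example of the layout init_job creates, using an illustrative project name "jdoe_24_07_21_MYPROJECT":
#   $SCANS_DIR/jdoe_24_07_21_MYPROJECT/
#   └── MasterPlateFiles/
#       ├── DrugMedia_jdoe_24_07_21_MYPROJECT.xls
#       └── MasterPlate_jdoe_24_07_21_MYPROJECT.xls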
init_job() {
echo "Running: ${FUNCNAME[0]}"
if [[ -d $SCAN_DIR ]]; then
ask "$SCAN_DIR already exists, re-initialize?" || return 0
else
ask "Initialize a new project at $SCAN_DIR?" || return 1
mkdir -p "$SCAN_DIR"
fi
[[ -d $SCAN_DIR/MasterPlateFiles ]] || mkdir -p "$SCAN_DIR/MasterPlateFiles"
DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT_NAME.xls"
MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT_NAME.xls"
# TODO Where are the actual templates?
for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
touch "$f"
done
}
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and MasterPlateFiles folders are the inputs for image analysis with the EASY software.
# EASY will automatically generate a Results directory (within the ExpJobs/ExperimentJob folder) with a timestamp and an optional short description provided by the user (Fig.2).
# The Results directory is created and entered using the “File >> New Experiment” dropdown in EASY.
# Multiple Results files may be created (and uniquely named) within an ExperimentJob folder.
easy() {
echo "Running: ${FUNCNAME[0]}"
EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"
pushd "$SCAN_DIR" || return 1
# Launch graphical matlab if the user wants
ask "Start EASY in MATLAB? This requires a GUI." && matlab -nosplash -r "$EASY"
# glob EASY output and make sure it exists
shopt -s nullglob
EASY_RESULTS_DIRS=( Results* )
shopt -u nullglob
[[ ${#EASY_RESULTS_DIRS[@]} -ge 1 ]] || { echo "Missing EASY output" >&2; exit 1; }
declare -a EASY_OUT_ARRAY
for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
[[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"
EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
[[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"
EASY_PRINT_RESULTS_FILES=(
"$EASY_PRINT_RESULTS_DIR/!!"*
"$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
"$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
)
EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
done
echo "EASY OUTPUT ARRAY: " "${EASY_OUT_ARRAY[@]}"
}
# @section EZView
ezview() {
echo "Running: ${FUNCNAME[0]}"
EZVIEW_DIR="/mnt/data/EZVIEW"
echo "$EZVIEW_DIR"
}
# @section StudiesQHTCP
# @description This section is derived from the earliest work of Jinyu Guo and therefore uses Perl scripts. Until these two Perl scripts are ported to an integrated R or Python script, one is constrained to the rather crude copy-paste and shell-script approach inherent in the original procedures. The two Perl scripts, analyze_v2.pl and terms2tsv_v4.pl, were written in 2003 by Gavin Sherlock for the SGD gene ontology system and require the corresponding Perl module installations. They also require that the gene_ontology_edit.obo and SGD_features.tab files used in ../Code be included here. Without rewriting the code, one must compromise on directory convenience.
# @description Main loop for qhtcp modules (rewrite of REMcMaster3.sh)
qhtcp() {
echo "Running: ${FUNCNAME[0]}"
TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
QHTCP_DIR="/mnt/data/StudiesQHTCP/$PROJECT_NAME"
while [[ -d $QHTCP_DIR ]]; do
echo "A project already exists at $QHTCP_DIR"
ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
mv "$QHTCP_DIR" "$QHTCP_DIR.bk" || (echo "Backup unsuccessful, exiting"; exit 1)
fi
done
# Copy template to QHTCP project directory
if rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
echo "New project created at $QHTCP_DIR"
fi
# Enter REMc directory to run the scripts there
pushd "$QHTCP_DIR/REMc" || return 1
r_join_interact &&
java_jingyu_extract &&
r_add_shift_values &&
r_heat_maps_zscores &&
r_heat_maps_homology &&
py_gtf &&
r_compile_gtf
popd || return 1
}
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
r_join_interact() {
echo "Running: ${FUNCNAME[0]}"
echo "Rscript JoinInteractExps3dev.R"
Rscript JoinInteractExps3dev.R
out_file="REMcRdy_lm_only.csv"
out_file2="Shift_only.csv"
for f in "$out_file" "$out_file2"; do
[[ -f $f ]] || { echo "$f does not exist" >&2; return 1; }
done
}
# @description Jingyu's REMc java utility using file input file REMcRdy_lm_only.csv
# and output REMcRdy_lm_only.csv-finalTable.csv
java_jingyu_extract() {
echo "Running: ${FUNCNAME[0]}"
classpath="jingyuJava_1_7_extractLib.jar"
out_file="REMcRdy_lm_only.csv-finalTable.csv"
# backup REMcRdy_lm_only.csv-finalTable.csv
[[ -f $out_file ]] && mv "$out_file" "$out_file.bk"
java_cmd=(
"$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain
"REMcRdy_lm_only.csv"
"GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
"ORF_List_Without_DAmPs.txt" 1 true true
)
echo "${java_cmd[@]}"
"${java_cmd[@]}"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
r_add_shift_values() {
echo "Running: ${FUNCNAME[0]}"
out_file="REMcHeatmaps/REMcWithShift.csv"
echo "Rscript AddShiftVals2.R"
Rscript AddShiftVals2.R
rm -f "REMcHeatmaps/"*.pdf
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Execute REMcHeatmaps_zscores.R
r_heat_maps_zscores() {
echo "Running: ${FUNCNAME[0]}"
out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"
echo "Rscript REMcHeatmaps_zscores.R"
Rscript REMcHeatmaps_zscores.R
pdfs=(REMcHeatmaps/*.pdf)
echo "pdftk ${pdfs[*]} output $out_file"
pdftk "${pdfs[@]}" output "$out_file"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
r_heat_maps_homology() {
echo "Running: ${FUNCNAME[0]}"
work_dir="REMcHeatmapsWithHomology"
source_file="REMcHeatmaps/REMcWithShift.csv"
target_file="$work_dir/REMcWithShift.csv"
out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"
echo "rsync --archive $source_file $target_file"
rsync --archive "$source_file" "$target_file"
# Clean old output
rm "$work_dir/Homology/"*.{pdf,csv}
pushd "$work_dir" || return 1
Rscript \
REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
REMcWithShift.csv \
Homology \
17_0503_DAmPs_Only.txt \
Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
popd || return 1
pdfs=("$work_dir"/Homology/*.pdf)
pdftk "${pdfs[@]}" output "$out_file"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
}
# @description Perform GTF
py_gtf() {
echo "Running: ${FUNCNAME[0]}"
process_dir="GTF/Process"
function_dir="GTF/Function"
component_dir="GTF/Component"
in_file="REMcRdy_lm_only.csv-finalTable.csv"
out_file="$process_dir/REMcRdy_lm_only/1-0-0-finaltable.csv"
echo "$PYTHON DconJG2.py $in_file $process_dir/"
"$PYTHON" DconJG2.py "$in_file" "$process_dir/"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
unset out_file
rsync -a "$process_dir/REMcRdy_lm_only" GTF/Function/
rsync -a "$process_dir/REMcRdy_lm_only" GTF/Component/
# @description Run the GO term-finder Perl scripts on each cluster file in an ontology directory, then concatenate the results
# @arg $1 string directory name
_process() {
echo "Running: ${FUNCNAME[0]}" "$@"
pushd "$1" || return 1
shopt -s nullglob
set2=(REMcRdy_lm_only/*.txt)
shopt -u nullglob
for s in "${set2[@]}"; do
echo "$PERL analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b $set1 $s"
echo "TODO: analyze_v2.pl should be translated"
"$PERL" analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b "$set1" "$s"
echo "$PERL terms2tsv_v4.pl $s.terms > $s.tsv"
echo "TODO: terms2tsv_v4.pl should be translated"
"$PERL" terms2tsv_v4.pl "$s.terms" > "$s.tsv"
done
# Concat the process ontology outputs from the /REMcReady_lm_only folder
echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $out_file"
echo "TODO: Concatenate_GTF_results.py should be translated to bash"
"$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$out_file"
[[ -f $out_file ]] || { echo "$out_file does not exist" >&2; return 1; }
popd || return 1
}
# Perform operations in each directory
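# set1 and out_file are (re)set before each launch; the backgrounded _process subshell
# inherits the values current at fork time, so each ontology gets its own Results filename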
for d in "$process_dir" "$function_dir" "$component_dir"; do
set1="ORF_List_Without_DAmPs.txt"
out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename
_process "$d" & # parallelize
done
wait # wait for the parallelized _process jobs to finish before returning
}
# @description Compile GTF in R
r_compile_gtf() {
echo "Running: ${FUNCNAME[0]}"
echo "Rscript CompileGTF.R"
Rscript CompileGTF.R
}
# @description Installs dependencies for the workflow
install_dependencies() {
echo "Running: ${FUNCNAME[0]}"
# Install system-wide dependencies
echo "Installing system dependencies"
case "$(uname -s)" in
Linux*|CYGWIN*|MINGW*)
ask "Detected Linux platform, continue?" || return 1
echo "You may be prompted for your sudo password to install system packages"
if hash dnf &>/dev/null; then
sudo dnf install graphviz pandoc pdftk-java gd-devel
elif hash apt &>/dev/null; then
sudo apt install graphviz pandoc pdftk-java libgd-dev
fi
;;
Darwin*)
ask "Detected Mac platform, continue?" || return 1
export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
brew install graphviz
brew install gd
brew install pdftk-java
brew install pandoc
cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
;;
*)
echo "Your system could not be detected, please install dependencies manually"
;;
esac
# Install perl CPAN modules
echo "Installing perl CPAN modules"
echo "cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder"
cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
# Install R packages
echo "Installing R packages"
Rscript -e 'install.packages(c(
"BiocManager",
"ontologyIndex",
"ggrepel",
"tidyverse",
"sos",
"openxlsx",
"ggplot2",
"plyr",
"extrafont",
"gridExtra",
"gplots",
"stringr",
"plotly",
"ggthemes",
"pandoc",
"rmarkdown"
), dependencies=TRUE, repos="https://cloud.r-project.org")'
Rscript -e 'BiocManager::install("org.Sc.sgd.db")'
}
# @internal
ask() {
declare response
(( YES )) && return 0
read -r -p "$* [y/N]: " response
[[ ${response,,} =~ ^(yes|y)$ ]]
}
# @internal
err() { echo "Error: $*" >&2; }
# @internal
ask_pn() {
declare -g PROJECT_NAME
read -r -p "Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): " PROJECT_NAME
}
# @internal
debug() { (( DEBUG )) && echo "Debug: $*"; }
# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overriden by env
main() {
echo "Running: ${FUNCNAME[0]}" "$@"
# Where are we located?
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Set the automatic project directory prefix
PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"
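# e.g. "jdoe_24_07_21" for user jdoe on 2024-07-21 (illustrative username)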
# When adding a module, it also should be added to this list
ALL_MODULES=(
install_dependencies
init_job
easy
ezview
qhtcp
)
declare -ga PROJECT_NAMES=() # this array will hold all of the projects to run (global, so parse_input's declare -g appends target it)
[[ $# -eq 1 ]] && PROJECT_NAMES+=("$1") # easy way to run on single dir
[[ $# -ge 2 ]] && parse_input "$@" # parse arguments with getopt
# Prompt user for the PROJECT_NAME if we still don't have one
if [[ ${#PROJECT_NAMES[@]} -eq 0 ]]; then # still allows for environment overrides
ask_pn
PROJECT_NAMES+=("$PROJECT_NAME")
fi
# Sanitize PROJECT_NAMES
# This regex should match PROJECT_PREFIX
san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; }
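# e.g. san "jdoe_24_07_21_MYPROJECT" succeeds; san "MYPROJECT" fails (illustrative names)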
for i in "${!PROJECT_NAMES[@]}"; do
if ! san "${PROJECT_NAMES[i]}"; then
echo "Project name ${PROJECT_NAMES[i]} is invalid"
echo "Enter a replacement"
ask_pn
san "$PROJECT_NAME" || { echo "RTFM"; return 1; }
PROJECT_NAMES[i]="$PROJECT_NAME"
fi
done
SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else
# If we don't catch with getopt or env, run all
[[ ${#MODULES[@]} -eq 0 ]] && MODULES=("${ALL_MODULES[@]}")
# Exclude modules overrides include
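# Membership test below: the list is space-padded so a module only matches a whole space-delimited entry in EXCLUDE_MODULES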
arr=()
for m in "${MODULES[@]}"; do
[[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${m}[[:space:]] ]] || arr+=("$m")
done
MODULES=("${arr[@]}")
unset arr
# Sanitize MODULES
for i in "${!MODULES[@]}"; do
if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
echo "Module $m not in the module list"
echo "Available modules: ${ALL_MODULES[*]}"
read -r -p "Enter replacement name: " MODULE
! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULE}[[:space:]] ]] || (echo "RTFM"; return 1)
MODULES[i]="$MODULE"
fi
done
# Loop over projects
for PROJECT_NAME in "${PROJECT_NAMES[@]}"; do
SCAN_DIR="$SCANS_DIR/$PROJECT_NAME"
# Run selected modules
for m in "${MODULES[@]}"; do
ask "Run $m" && "$m"
done
done
}
main "$@"
exit $?