Files
hartman-server/workflow/script-run-workflow

532 lines
17 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
# This is currently a code scratchpad for organizing the Hartman Lab Server workflow
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @arg $1 string A project name
#
# Enable extended glob patterns (e.g. !(...), +(...)) for the whole script
shopt -s extglob
# @section Libraries
# @description Change these variables to use different libraries
# Each interpreter may be overridden from the environment,
# e.g. JAVA=/opt/jdk/bin/java ./script-run-workflow
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"
# @section Help
# @description Print a helpful message
# Print usage information.
# BUGFIX: the help text and examples referred to a "--module" option that
# parse_input does not accept; the real option is --include/-i, so the
# examples as written would have been rejected by getopt.
# Globals (read): PROJECT_PREFIX, ALL_MODULES
print_help() {
  echo "Running: ${FUNCNAME[0]}"
  cat <<EOF
USAGE:
script-run-workflow [[OPTION] [VALUE]]...
Some options (--project, --include) can be passed multiple times for batch operations.
OPTIONS:
--project, -p PROJECT
PROJECT should follow the pattern ${PROJECT_PREFIX}_UNIQUE_PROJECT_NAME
--include, -i MODULE
See MODULES section below for list of available modules
If no --include is specified, all modules are run
--exclude, -x MODULE
See MODULES section below for list of available modules
--yes, -y, --auto
Always answer yes to questions (non-interactive mode)
--debug, -d
Print extra debugging info
--help, -h
Print this help message and exit
MODULES:
${ALL_MODULES[*]}
DEPENDENCIES:
binaries (system): graphviz pandoc pdftk-java gd-devel
perl (cpan): File::Map ExtUtils::PkgConfig GD GO::TermFinder
R (default): BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown
R (BiocManager):
EXAMPLES:
script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
EOF
}
# @section User Input
# @description Creates array and switches from user input
# Parse command-line options with getopt(1) into global state.
# Globals (written): PROJECT_NAMES, MODULES, EXCLUDE_MODULES (arrays),
#                    YES, DEBUG (flags)
# Exits 0 on --help, exits 1 on malformed options.
parse_input() {
  echo "Running: ${FUNCNAME[0]}" "$@"
  # Locals so the option strings no longer leak into the global namespace
  local long_opts="project:,include:,exclude:,yes,auto,debug,help"
  #long_opts+="restorefile:,betapass:,"
  local short_opts="+p:i:x:yhd"
  local input
  # Quote the option strings (shellcheck SC2086); they are getopt syntax,
  # never meant to be word-split.
  if input=$(getopt -o "$short_opts" -l "$long_opts" -- "$@"); then
    eval set -- "$input"
    # Ensure the arrays exist at global scope before appending
    declare -ga PROJECT_NAMES MODULES EXCLUDE_MODULES
    while true; do
      case "$1" in
        --project|-p)
          shift
          PROJECT_NAMES+=("$1")
          ;;
        --include|-i)
          shift
          MODULES+=("$1")
          ;;
        --exclude|-x)
          shift
          EXCLUDE_MODULES+=("$1")
          ;;
        --yes|-y|--auto)
          declare -g YES=1
          ;;
        --debug|-d)
          declare -g DEBUG=1
          ;;
        --help|-h)
          print_help; exit 0
          ;;
        --)
          shift
          break
          ;;
        *)
          # Defensive default: getopt should never emit anything else,
          # but fail loudly rather than loop forever if it does.
          err "Unhandled option: $1"; return 1
          ;;
      esac
      shift
    done
  else
    err "Incorrect options provided"; exit 1
  fi
}
# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# TODO Copy over source image directories from robot - are these also named by the ExpJobs name?
# Initialize a new ExpJobs project under $SCAN_DIR.
# Creates MasterPlateFiles/ and touches the DrugMedia/MasterPlate templates.
# Globals (read): SCAN_DIR, PROJECT_NAME
# Globals (written): DRUG_MEDIA_FILE, MASTER_PLATE_FILE
init_job() {
  echo "Running: ${FUNCNAME[0]}"
  if [[ -d $SCAN_DIR ]]; then
    ask "$SCAN_DIR already exists, re-initialize?" || return 0
  else
    ask "Initialize a new project at $SCAN_DIR?" || return 1
    mkdir -p "$SCAN_DIR"
  fi
  mkdir -p "$SCAN_DIR/MasterPlateFiles"
  DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT_NAME.xls"
  MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT_NAME.xls"
  # TODO Where are the actual templates?
  # BUGFIX: the list was unquoted, so paths containing spaces were split
  # into multiple bogus filenames.
  local f
  for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
    touch "$f"
  done
}
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and MasterPlateFiles folder are the inputs for image analysis with EASY software.
# EASY will automatically generate a Results directory (within the ExpJobs/ExperimentJob folder) w/ timestamp and an optional short description provided by the user (Fig.2).
# The Results directory is created and entered, using the “File >> New Experiment” dropdown in EASY.
# Multiple Results files may be created (and uniquely named) within an ExperimentJob folder.
# Run the EASY image-analysis step inside $SCAN_DIR and collect its outputs.
# Optionally launches MATLAB (GUI) for the user, then globs the Results*
# directories EASY produced into EASY_OUT_ARRAY.
# Globals (read): SCAN_DIR; (written): EASY, EASY_OUT_ARRAY and friends.
easy() {
  echo "Running: ${FUNCNAME[0]}"
  EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"
  pushd "$SCAN_DIR" || return 1
  # Launch graphical matlab if the user wants
  if ask "Start EASY in MATLAB? This requires a GUI."; then
    matlab -nosplash -r "$EASY"
  fi
  # Glob EASY output (Results*) and make sure at least one directory exists
  shopt -s nullglob
  EASY_RESULTS_DIRS=( Results* )
  shopt -u nullglob
  # BUGFIX: ${#arr} is the string length of element 0, not the element
  # count — use ${#arr[@]}. Also '(echo; exit 1)' only exited a subshell
  # and never aborted the function; return instead (restoring the cwd).
  if [[ ${#EASY_RESULTS_DIRS[@]} -lt 1 ]]; then
    echo "Missing EASY output"
    popd || return 1
    return 1
  fi
  declare -a EASY_OUT_ARRAY
  for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
    [[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"
    EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
    [[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"
    # NOTE(review): these globs run without nullglob, so unmatched patterns
    # remain literal — preserved from the original; confirm it is intended.
    EASY_PRINT_RESULTS_FILES=(
      "$EASY_PRINT_RESULTS_DIR/!!"*
      "$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
      "$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
    )
    EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
  done
  echo "EASY OUTPUT ARRAY: " "${EASY_OUT_ARRAY[@]}"
  # BUGFIX: restore the caller's working directory (pushd was never popped)
  popd || return 1
}
# @section EZView
# Report the location of the EZView installation.
# Publishes the path in the global EZVIEW_DIR and echoes it for the caller.
ezview() {
  echo "Running: ${FUNCNAME[0]}"
  # Fixed install location on the lab server
  declare -g EZVIEW_DIR="/mnt/data/EZVIEW"
  printf '%s\n' "$EZVIEW_DIR"
}
# @section StudiesQHTCP
# @description This section is derived from the earliest work of Jinyu Guo. As such it uses Perl scripts. Without porting these two Perl scripts into a new integrated R script or Python script, one is constrained to use the rather crude copy-paste and shell script approach inherent in the original procedures. These two Perl scripts are analyze_v2.pl and terms2tsv_v4.pl, which were written in 2003 by Gavin Sherlock for the SGD gene ontology system and require Perl installations of such files. These also require that the gene_ontology_edit.obo and SGD_features.tab files used in the ../Code also be included here. Without rewriting the code, one must compromise directory convenience.
# @description Main loop for qhtcp modules (rewrite of REMcMaster3.sh)
# Create/refresh the StudiesQHTCP project from the template, then run the
# REMc pipeline stages in order inside its REMc directory.
# Globals (read): SCRIPT_DIR, PROJECT_NAME
qhtcp() {
  echo "Running: ${FUNCNAME[0]}"
  TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
  QHTCP_DIR="/mnt/data/StudiesQHTCP/$PROJECT_NAME"
  # Re-prompt until the user either accepts an in-place update (break) or
  # agrees to a backup (after the mv the directory no longer exists and
  # the loop condition fails).
  while [[ -d $QHTCP_DIR ]]; do
    echo "A project already exists at $QHTCP_DIR"
    ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
    if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
      # BUGFIX: '(echo; exit 1)' only exited a subshell, so a failed backup
      # was silently ignored; use a brace group and return instead.
      mv "$QHTCP_DIR" "$QHTCP_DIR.bk" || { echo "Backup unsuccessful, exiting"; return 1; }
    fi
  done
  # Copy template to QHTCP project directory
  if rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
    echo "New project created at $QHTCP_DIR"
  fi
  # Enter REMc directory to run the scripts there
  pushd "$QHTCP_DIR/REMc" || return 1
  # BUGFIX: capture the pipeline's status; the original returned popd's
  # status, masking any stage failure.
  local rv=0
  r_join_interact &&
    java_jingyu_extract &&
    r_add_shift_values &&
    r_heat_maps_zscores &&
    r_heat_maps_homology &&
    py_gtf &&
    r_compile_gtf || rv=$?
  popd || return 1
  return "$rv"
}
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
# Fails (returns 1) if either expected output file is missing afterwards.
r_join_interact() {
  echo "Running: ${FUNCNAME[0]}"
  echo "Rscript JoinInteractExps3dev.R"
  Rscript JoinInteractExps3dev.R
  local f
  for f in "REMcRdy_lm_only.csv" "Shift_only.csv"; do
    # BUGFIX: '(echo; return 1)' returned from a subshell, so a missing
    # output never failed the function; use a brace group instead.
    [[ -f $f ]] || { echo "$f does not exist"; return 1; }
  done
}
# @description Jingyu's REMc java utility using file input file REMcRdy_lm_only.csv
# and output REMcRdy_lm_only.csv-finalTable.csv
# @description Jingyu's REMc java utility using input file REMcRdy_lm_only.csv
# and output REMcRdy_lm_only.csv-finalTable.csv
# Globals (read): JAVA
java_jingyu_extract() {
  echo "Running: ${FUNCNAME[0]}"
  local classpath="jingyuJava_1_7_extractLib.jar"
  local out_file="REMcRdy_lm_only.csv-finalTable.csv"
  # Back up any previous final table before regenerating it
  [[ -f $out_file ]] && mv "$out_file" "$out_file.bk"
  local -a java_cmd=(
    "$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain
    "REMcRdy_lm_only.csv"
    "GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
    "ORF_List_Without_DAmPs.txt" 1 true true
  )
  echo "${java_cmd[@]}"
  "${java_cmd[@]}"
  # BUGFIX: brace group instead of subshell so the failure propagates
  [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
r_add_shift_values() {
  echo "Running: ${FUNCNAME[0]}"
  local out_file="REMcHeatmaps/REMcWithShift.csv"
  echo "Rscript AddShiftVals2.R"
  Rscript AddShiftVals2.R
  # Remove stale heatmap PDFs before they are regenerated downstream
  rm -f "REMcHeatmaps/"*.pdf
  # BUGFIX: brace group instead of subshell so the failure propagates
  [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
# @description Execute REMcHeatmaps_zscores.R
# @description Execute REMcHeatmaps_zscores.R, then merge every generated
# per-cluster PDF into one compiled PDF with pdftk.
r_heat_maps_zscores() {
  echo "Running: ${FUNCNAME[0]}"
  local out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"
  echo "Rscript REMcHeatmaps_zscores.R"
  Rscript REMcHeatmaps_zscores.R
  local -a pdfs=(REMcHeatmaps/*.pdf)
  echo "pdftk ${pdfs[*]} output $out_file"
  pdftk "${pdfs[@]}" output "$out_file"
  # BUGFIX: brace group instead of subshell so the failure propagates
  [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R and
# compile its Homology PDFs into a single document.
r_heat_maps_homology() {
  echo "Running: ${FUNCNAME[0]}"
  local work_dir="REMcHeatmapsWithHomology"
  local source_file="REMcHeatmaps/REMcWithShift.csv"
  local target_file="$work_dir/REMcWithShift.csv"
  local out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"
  echo "rsync --archive $source_file $target_file"
  rsync --archive "$source_file" "$target_file"
  # Clean old output; BUGFIX: -f keeps rm quiet when there is nothing to
  # remove (the unmatched brace-expanded globs previously made rm error).
  rm -f "$work_dir/Homology/"*.{pdf,csv}
  pushd "$work_dir" || return 1
  Rscript \
    REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
    REMcWithShift.csv \
    Homology \
    17_0503_DAmPs_Only.txt \
    Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
  popd || return 1
  local -a pdfs=("$work_dir"/Homology/*.pdf)
  pdftk "${pdfs[@]}" output "$out_file"
  # BUGFIX: brace group instead of subshell so the failure propagates
  [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
# @description Perform GTF
# @description Run the GO Term Finder (GTF) over the Process, Function and
# Component ontologies in parallel, then check each Results file.
# Globals (read): PYTHON, PERL; set1/out_file are set per-iteration and
# inherited by each backgrounded _process job.
py_gtf() {
  echo "Running: ${FUNCNAME[0]}"
  local process_dir="GTF/Process"
  local function_dir="GTF/Function"
  local component_dir="GTF/Component"
  local in_file="REMcRdy_lm_only.csv-finalTable.csv"
  out_file="$process_dir/REMcRdy_lm_only/1-0-0-finaltable.csv"
  echo "$PYTHON DconJG2.py $in_file $process_dir/"
  "$PYTHON" DconJG2.py "$in_file" "$process_dir/"
  # BUGFIX: '(echo; return 1)' returned from a subshell and the function
  # carried on; use a brace group so the failure propagates.
  [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
  unset out_file
  rsync -a "$process_dir/REMcRdy_lm_only" GTF/Function/
  rsync -a "$process_dir/REMcRdy_lm_only" GTF/Component/
  # Run analyze/terms2tsv over one ontology directory ($1) and concatenate
  # the per-cluster outputs into $out_file.
  # @arg $1 string directory name
  _process() {
    echo "Running: ${FUNCNAME[0]}" "$@"
    pushd "$1" || return 1
    shopt -s nullglob
    local -a set2=(REMcRdy_lm_only/*.txt)
    shopt -u nullglob
    local s
    for s in "${set2[@]}"; do
      echo "$PERL analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b $set1 $s"
      echo "TODO: analyze_v2.pl should be translated"
      "$PERL" analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b "$set1" "$s"
      echo "$PERL terms2tsv_v4.pl $s.terms > $s.tsv"
      echo "TODO: terms2tsv_v4.pl should be translated"
      "$PERL" terms2tsv_v4.pl "$s.terms" > "$s.tsv"
    done
    # Concat the process ontology outputs from the /REMcReady_lm_only folder
    echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $out_file"
    echo "TODO: Concatenate_GTF_results.py should be translated to bash"
    "$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$out_file"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
    popd || return 1
  }
  # Perform operations in each directory, in parallel
  local d pid rv=0
  local -a pids=()
  for d in "$process_dir" "$function_dir" "$component_dir"; do
    set1="ORF_List_Without_DAmPs.txt"
    out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename
    _process "$d" & # parallelize; the child inherits set1/out_file as set now
    pids+=("$!")
  done
  # BUGFIX: the original never wait(1)ed, so the function returned while
  # the jobs were still running and their failures were lost.
  for pid in "${pids[@]}"; do
    wait "$pid" || rv=1
  done
  return "$rv"
}
# @description Compile GTF in R
# @description Compile GTF in R
# Announces and then runs the CompileGTF.R script in the current directory.
r_compile_gtf() {
  echo "Running: ${FUNCNAME[0]}"
  local r_script="CompileGTF.R"
  echo "Rscript $r_script"
  Rscript "$r_script"
}
# @description Installs dependencies for the workflow
# @description Installs system, Perl (CPAN), and R dependencies for the workflow
install_dependencies() {
  echo "Running: ${FUNCNAME[0]}"
  # Install system-wide dependencies
  echo "Installing system dependencies"
  case "$(uname -s)" in
    Linux*|CYGWIN*|MINGW*)
      ask "Detected Linux platform, continue?" || return 1
      echo "You may be prompted for your sudo password to install system packages"
      if hash dnf &>/dev/null; then
        sudo dnf install graphviz pandoc pdftk-java gd-devel
      elif hash apt &>/dev/null; then
        sudo apt install graphviz pandoc pdftk-java libgd-dev
      fi
      ;;
    Darwin*)
      ask "Detected Mac platform, continue?" || return 1
      export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
      curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
      # BUGFIX: package name was misspelled "graphiz"
      brew install graphviz
      brew install gd
      brew install pdftk-java
      brew install pandoc
      cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
      ;;
    *)
      echo "Your system could not be detected, please install dependencies manually"
      ;;
  esac
  # Install perl CPAN modules
  echo "Installing perl CPAN modules"
  echo "cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder"
  cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
  # Install R packages
  echo "Installing R packages"
  # BUGFIX: the original R expression was a syntax error — it used
  # backslash line-continuations inside the single-quoted -e string (R has
  # no backslash continuation) and was missing the commas between most
  # package names. Plain newlines inside the call are valid R.
  Rscript -e 'install.packages(c(
    "BiocManager",
    "ontologyIndex",
    "ggrepel",
    "tidyverse",
    "sos",
    "openxlsx",
    "ggplot2",
    "plyr",
    "extrafont",
    "gridExtra",
    "gplots",
    "stringr",
    "plotly",
    "ggthemes",
    "pandoc",
    "rmarkdown"
  ), dep=TRUE, repos="https://cloud.r-project.org")'
  Rscript -e 'BiocManager::install("org.Sc.sgd.db")'
}
# @internal
# @internal
# Ask a yes/no question on stdin; returns 0 for yes, 1 otherwise.
# Honors the global YES flag (non-interactive mode) by answering yes.
ask() {
  # Auto-confirm when --yes/--auto was given
  if (( YES )); then
    return 0
  fi
  local answer
  read -r -p "$* [y/N]: " answer
  case "${answer,,}" in
    y|yes) return 0 ;;
    *) return 1 ;;
  esac
}
# @internal
# @internal Emit an error message on stderr
err() { printf 'Error: %s\n' "$*" >&2; }
# @internal
# @internal
# Prompt for a full project name and store it in the global PROJECT_NAME.
ask_pn() {
  declare -g PROJECT_NAME
  local prompt="Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): "
  read -r -p "$prompt" PROJECT_NAME
}
# @internal
# @internal Print a debug message when DEBUG=1.
# BUGFIX: always return 0 — the original returned 1 whenever DEBUG was
# unset/0, so a trailing 'debug' call made its caller appear to fail.
debug() { if (( DEBUG )); then echo "Debug: $*"; fi; return 0; }
# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overriden by env
# @description The main loop of script-run-workflow
# Parses arguments, validates project names and module selections, then
# runs each selected module for each project.
# Most variables in main() are user configurable or can be overriden by env
main() {
  echo "Running: ${FUNCNAME[0]}" "$@"
  # Where are we located?
  SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  # Set the automatic project directory prefix
  PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"
  # When adding a module, it also should be added to this list
  ALL_MODULES=(
    install_dependencies
    init_job
    easy
    ezview
    qhtcp
  )
  # BUGFIX: 'declare -a' inside a function makes the array LOCAL, which
  # shadowed the globals parse_input writes with 'declare -g' — the parsed
  # projects were lost. A plain assignment targets the global scope.
  PROJECT_NAMES=()
  # BUGFIX: a lone option argument (e.g. --help) was treated as a project
  # name; only a lone NON-option argument is project-name shorthand.
  if [[ $# -eq 1 && $1 != -* ]]; then
    PROJECT_NAMES+=("$1")
  elif [[ $# -ge 1 ]]; then
    parse_input "$@" # parse arguments with getopt
  fi
  # Prompt user for the PROJECT_NAME if we still don't have one
  if [[ ${#PROJECT_NAMES[@]} -eq 0 ]]; then # still allows for environment overrides
    ask_pn
    PROJECT_NAMES+=("$PROJECT_NAME")
  fi
  # Sanitize PROJECT_NAMES
  # This regex should match PROJECT_PREFIX (user_yy_mm_dd_NAME)
  san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; }
  local i
  for i in "${!PROJECT_NAMES[@]}"; do
    # BUGFIX: the original indexed the scalar PROJECT_NAME instead of the
    # PROJECT_NAMES array, so invalid names were never caught or replaced;
    # and '( ...; return 1)' returned from a subshell, not from main.
    if ! san "${PROJECT_NAMES[i]}"; then
      echo "Project name ${PROJECT_NAMES[i]} is invalid"
      echo "Enter a replacement"
      ask_pn
      san "$PROJECT_NAME" || { echo "RTFM"; return 1; }
      PROJECT_NAMES[i]="$PROJECT_NAME"
    fi
  done
  SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else
  # If we don't catch with getopt or env, run all
  [[ ${#MODULES[@]} -eq 0 ]] && MODULES=("${ALL_MODULES[@]}")
  # Exclude modules overrides include
  local -a kept=()
  local m
  for m in "${MODULES[@]}"; do
    [[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${m}[[:space:]] ]] || kept+=("$m")
  done
  MODULES=("${kept[@]}")
  # Sanitize MODULES
  for i in "${!MODULES[@]}"; do
    if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
      # BUGFIX: report the offending module (not a stale $m from the
      # previous loop), accept the replacement when it IS in the list
      # (the original test was inverted), and use a brace group so the
      # 'return 1' actually leaves main.
      echo "Module ${MODULES[i]} not in the module list"
      echo "Available modules: ${ALL_MODULES[*]}"
      read -r -p "Enter replacement name: " MODULE
      [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULE}[[:space:]] ]] || { echo "RTFM"; return 1; }
      MODULES[i]="$MODULE"
    fi
  done
  # Loop over projects
  for PROJECT_NAME in "${PROJECT_NAMES[@]}"; do
    SCAN_DIR="$SCANS_DIR/$PROJECT_NAME"
    # Run selected modules
    for m in "${MODULES[@]}"; do
      ask "Run $m" && "$m"
    done
  done
}
# Entry point: run the whole workflow, forwarding all CLI arguments
main "$@"
# Propagate main's status as the script's exit status
exit $?