#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
# This is currently a code scratchpad for organizing the Hartman Lab Server workflow
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @arg $1 string A project name
# shopt -s extglob

# @section Interpreters
# @description Override these variables to use different interpreter binaries
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"

# @section Help
# @description Print a helpful message
print_help() {
    echo "Running: ${FUNCNAME[0]}"
    cat <<-EOF
USAGE: script-run-workflow [[OPTION] [VALUE]]...

Some options (--project, --include) can be passed multiple times for batch operations.

OPTIONS:
  --project, -p PROJECT
      PROJECT should follow the pattern ${PROJECT_PREFIX}_UNIQUE_PROJECT_NAME
  --include, -i MODULE
      See the MODULES section below for the list of available modules
      If no --include is specified, all modules are run
  --exclude, -x MODULE
      See the MODULES section below for the list of available modules
  --yes, -y, --auto
      Always answer yes to questions (non-interactive mode)
  --debug, -d
      Print extra debugging info
  --help, -h
      Print this help message and exit

MODULES: ${ALL_MODULES[*]}

DEPENDENCIES:
  binaries (system): graphviz pandoc pdftk-java gd-devel
  perl (cpan): File::Map ExtUtils::PkgConfig GD GO::TermFinder
  R (default): BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown
  R (BiocManager): org.Sc.sgd.db

EXAMPLES:
  script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
  script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
  script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
EOF
}

# @section User Input
# @description Creates arrays and switches from user input
parse_input() {
    echo "Running: ${FUNCNAME[0]}" "$@"

    long_opts="project:,include:,exclude:,yes,auto,debug,help"
    #long_opts+="restorefile:,betapass:,"
    short_opts="+p:i:x:yhd"

    if input=$(getopt -o "$short_opts" -l "$long_opts" -- "$@"); then
        eval set -- "$input"
        while true; do
            case $1 in
                --project|-p)
                    shift
                    declare -ga PROJECT_NAMES+=("$1")
                    ;;
                --include|-i)
                    shift
                    declare -ga MODULES+=("$1")
                    ;;
                --exclude|-x)
                    shift
                    declare -ga EXCLUDE_MODULES+=("$1")
                    ;;
                --yes|-y|--auto)
                    declare -g YES=1
                    ;;
                --debug|-d)
                    declare -g DEBUG=1
                    ;;
                --help|-h)
                    print_help
                    exit 0
                    ;;
                --)
                    shift
                    break
                    ;;
            esac
            shift
        done
    else
        err "Incorrect options provided"
        exit 1
    fi
}

# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# TODO Copy over source image directories from the robot - are these also named by the ExpJobs name?
init_job() {
    echo "Running: ${FUNCNAME[0]}"

    if [[ -d $SCAN_DIR ]]; then
        ask "$SCAN_DIR already exists, re-initialize?" || return 0
    else
        ask "Initialize a new project at $SCAN_DIR?" || return 1
        mkdir -p "$SCAN_DIR"
    fi

    [[ -d $SCAN_DIR/MasterPlateFiles ]] || mkdir -p "$SCAN_DIR/MasterPlateFiles"

    DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT_NAME.xls"
    MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT_NAME.xls"

    # TODO Where are the actual templates?
    for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
        touch "$f"
    done
}
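# A possible answer to the TODO above, sketched here rather than wired in: if
# DrugMedia/MasterPlate templates are kept alongside the script (the path and
# filenames below are assumptions, not confirmed locations), copy them instead
# of touching empty .xls files. Hypothetical helper, not yet called by init_job.
init_job_from_templates() {
    # Assumed template location and filenames; adjust once the real templates are located
    local template_dir="${MASTERPLATE_TEMPLATE_DIR:-$SCRIPT_DIR/templates/MasterPlateFiles}"
    if [[ -f $template_dir/DrugMedia.xls && -f $template_dir/MasterPlate.xls ]]; then
        rsync --archive --update "$template_dir/DrugMedia.xls" "$DRUG_MEDIA_FILE"
        rsync --archive --update "$template_dir/MasterPlate.xls" "$MASTER_PLATE_FILE"
    else
        # Fall back to empty placeholders, matching init_job's current behavior
        touch "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"
    fi
}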
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and 'MasterPlateFiles' folders are the inputs for image analysis with the EASY software.
# EASY automatically generates a 'Results' directory (within the ExpJobs/'ExperimentJob' folder) with a timestamp
# and an optional short description provided by the user (Fig. 2).
# The 'Results' directory is created and entered using the "File >> New Experiment" dropdown in EASY.
# Multiple 'Results' directories may be created (and uniquely named) within an 'ExperimentJob' folder.
easy() {
    echo "Running: ${FUNCNAME[0]}"

    EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"

    pushd "$SCAN_DIR" || return 1

    # Launch graphical MATLAB if the user wants
    ask "Start EASY in MATLAB? This requires a GUI." && matlab -nosplash -r "$EASY"

    # Glob EASY output and make sure it exists
    shopt -s nullglob
    EASY_RESULTS_DIRS=( Results* )
    shopt -u nullglob
    [[ ${#EASY_RESULTS_DIRS[@]} -ge 1 ]] || { echo "Missing EASY output"; return 1; }

    declare -a EASY_OUT_ARRAY
    for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
        [[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"

        EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
        [[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"

        EASY_PRINT_RESULTS_FILES=(
            "$EASY_PRINT_RESULTS_DIR/!!"*
            "$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
            "$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
        )
        EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
    done

    echo "EASY OUTPUT ARRAY: " "${EASY_OUT_ARRAY[@]}"

    popd || return 1
}

# @section EZView
# @description Placeholder for the EZview module (stub)
ezview() {
    echo "Running: ${FUNCNAME[0]}"

    EZVIEW_DIR="/mnt/data/EZVIEW"
    echo "$EZVIEW_DIR"
}
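# ezview() above is only a stub. A possible implementation, assuming EZview is a
# MATLAB app launched the same way as EASY; the entry-point filename below is an
# assumption, not a confirmed path.
ezview_launch() {
    local ezview_entry="$EZVIEW_DIR/EzviewConsole.m" # hypothetical entry point
    pushd "$SCAN_DIR" || return 1
    ask "Start EZview in MATLAB? This requires a GUI." && matlab -nosplash -r "$ezview_entry"
    popd || return 1
}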
# @section StudiesQHTCP
# @description This section is derived from the earliest work of Jingyu Guo and therefore uses Perl scripts.
# Without porting these two Perl scripts into an integrated R or Python script, one is constrained to the
# rather crude copy-paste and shell scripting inherent in the original procedures. The two Perl scripts,
# analyze_v2.pl and terms2tsv_v4.pl, were written in 2003 by Gavin Sherlock for the SGD gene ontology system
# and require a Perl installation with the associated modules. They also require that the
# gene_ontology_edit.obo and SGD_features.tab files used in ../Code be included here.
# Without rewriting the code, one must compromise on directory convenience.

# @description Main loop for the qhtcp modules (rewrite of REMcMaster3.sh)
qhtcp() {
    echo "Running: ${FUNCNAME[0]}"

    TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
    QHTCP_DIR="/mnt/data/StudiesQHTCP/$PROJECT_NAME"

    while [[ -d $QHTCP_DIR ]]; do
        echo "A project already exists at $QHTCP_DIR"
        ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
        if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
            mv "$QHTCP_DIR" "$QHTCP_DIR.bk" || { echo "Backup unsuccessful, exiting"; return 1; }
        fi
    done

    # Copy template to QHTCP project directory
    if rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
        echo "New project created at $QHTCP_DIR"
    fi

    # Enter the REMc directory to run the scripts there
    pushd "$QHTCP_DIR/REMc" || return 1

    r_join_interact &&
        java_jingyu_extract &&
        r_add_shift_values &&
        r_heat_maps_zscores &&
        r_heat_maps_homology &&
        py_gtf &&
        r_compile_gtf

    popd || return 1
}

# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
r_join_interact() {
    echo "Running: ${FUNCNAME[0]}"

    echo "Rscript JoinInteractExps3dev.R"
    Rscript JoinInteractExps3dev.R

    out_file="REMcRdy_lm_only.csv"
    out_file2="Shift_only.csv"
    for f in "$out_file" "$out_file2"; do
        [[ -f $f ]] || { echo "$f does not exist"; return 1; }
    done
}

# @description Jingyu's REMc java utility, using input file REMcRdy_lm_only.csv
# and producing REMcRdy_lm_only.csv-finalTable.csv
java_jingyu_extract() {
    echo "Running: ${FUNCNAME[0]}"

    classpath="jingyuJava_1_7_extractLib.jar"
    out_file="REMcRdy_lm_only.csv-finalTable.csv"

    # Back up any existing REMcRdy_lm_only.csv-finalTable.csv
    [[ -f $out_file ]] && mv "$out_file" "$out_file.bk"

    java_cmd=(
        "$JAVA" -Xms512m -Xmx2048m
        -Dfile.encoding=UTF-8
        -classpath "$classpath"
        ExecMain
        "REMcRdy_lm_only.csv"
        "GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
        "ORF_List_Without_DAmPs.txt"
        1 true true
    )
    echo "${java_cmd[@]}"
    "${java_cmd[@]}"

    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
r_add_shift_values() {
    echo "Running: ${FUNCNAME[0]}"

    out_file="REMcHeatmaps/REMcWithShift.csv"

    echo "Rscript AddShiftVals2.R"
    Rscript AddShiftVals2.R

    rm -f "REMcHeatmaps/"*.pdf

    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

# @description Execute REMcHeatmaps_zscores.R
r_heat_maps_zscores() {
    echo "Running: ${FUNCNAME[0]}"

    out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"

    echo "Rscript REMcHeatmaps_zscores.R"
    Rscript REMcHeatmaps_zscores.R

    pdfs=(REMcHeatmaps/*.pdf)
    echo "pdftk ${pdfs[*]} output $out_file"
    pdftk "${pdfs[@]}" output "$out_file"

    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
r_heat_maps_homology() {
    echo "Running: ${FUNCNAME[0]}"

    work_dir="REMcHeatmapsWithHomology"
    source_file="REMcHeatmaps/REMcWithShift.csv"
    target_file="$work_dir/REMcWithShift.csv"
    out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"

    echo "rsync --archive $source_file $target_file"
    rsync --archive "$source_file" "$target_file"

    # Clean old output
    rm -f "$work_dir/Homology/"*.{pdf,csv}

    pushd "$work_dir" || return 1
    Rscript \
        REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
        REMcWithShift.csv \
        Homology \
        17_0503_DAmPs_Only.txt \
        Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
    popd || return 1

    pdfs=("$work_dir"/Homology/*.pdf)
    pdftk "${pdfs[@]}" output "$out_file"

    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
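# The r_* steps above repeat the same "announce, run Rscript, verify an output
# file" pattern. A possible shared helper, sketched here for future cleanup and
# not yet wired into the functions above (name is hypothetical):
run_r_step() {
    # Usage: run_r_step SCRIPT.R EXPECTED_OUTPUT [EXPECTED_OUTPUT...]
    local script="$1"; shift
    echo "Rscript $script"
    Rscript "$script"
    local f
    for f in "$@"; do
        [[ -f $f ]] || { echo "$f does not exist"; return 1; }
    done
}
# e.g. r_join_interact could reduce to:
#   run_r_step JoinInteractExps3dev.R REMcRdy_lm_only.csv Shift_only.csv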
# @description Perform GTF
py_gtf() {
    echo "Running: ${FUNCNAME[0]}"

    process_dir="GTF/Process"
    function_dir="GTF/Function"
    component_dir="GTF/Component"
    in_file="REMcRdy_lm_only.csv-finalTable.csv"
    out_file="$process_dir/REMcRdy_lm_only/1-0-0-finaltable.csv"

    echo "$PYTHON DconJG2.py $in_file $process_dir/"
    "$PYTHON" DconJG2.py "$in_file" "$process_dir/"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
    unset out_file

    rsync -a "$process_dir/REMcRdy_lm_only" GTF/Function/
    rsync -a "$process_dir/REMcRdy_lm_only" GTF/Component/

    # @description Run the analyze_v2.pl/terms2tsv_v4.pl scripts in one ontology directory
    # @arg $1 string directory name
    _process() {
        echo "Running: ${FUNCNAME[0]}" "$@"

        pushd "$1" || return 1

        shopt -s nullglob
        set2=(REMcRdy_lm_only/*.txt)
        shopt -u nullglob

        for s in "${set2[@]}"; do
            echo "$PERL analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b $set1 $s"
            echo "TODO: analyze_v2.pl should be translated"
            "$PERL" analyze_v2.pl -an gene_association.sgd -as P -o gene_ontology_edit.obo -b "$set1" "$s"

            echo "$PERL terms2tsv_v4.pl $s.terms > $s.tsv"
            echo "TODO: terms2tsv_v4.pl should be translated"
            "$PERL" terms2tsv_v4.pl "$s.terms" > "$s.tsv"
        done

        # Concatenate the ontology outputs from the REMcRdy_lm_only folder
        echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $out_file"
        echo "TODO: Concatenate_GTF_results.py should be translated to bash"
        "$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$out_file"
        [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }

        popd || return 1
    }

    # Perform operations in each ontology directory
    for d in "$process_dir" "$function_dir" "$component_dir"; do
        set1="ORF_List_Without_DAmPs.txt"
        out_file="${d##*/}Results.txt" # use the dirname to create each Results filename
        _process "$d" & # parallelize
    done
    wait # wait for the parallel _process jobs before returning
}

# @description Compile GTF in R
r_compile_gtf() {
    echo "Running: ${FUNCNAME[0]}"

    echo "Rscript CompileGTF.R"
    Rscript CompileGTF.R
}

# @description Installs dependencies for the workflow
install_dependencies() {
    echo "Running: ${FUNCNAME[0]}"

    # Install system-wide dependencies
    echo "Installing system dependencies"
    case "$(uname -s)" in
        Linux*|CYGWIN*|MINGW*)
            ask "Detected Linux platform, continue?" || return 1
            echo "You may be prompted for your sudo password to install system packages"
            if hash dnf &>/dev/null; then
                sudo dnf install graphviz pandoc pdftk-java gd-devel
            elif hash apt &>/dev/null; then
                sudo apt install graphviz pandoc pdftk-java libgd-dev
            fi
            ;;
        Darwin*)
            ask "Detected Mac platform, continue?" || return 1
            export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
            curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
            brew install graphviz
            brew install gd
            brew install pdftk-java
            brew install pandoc
            ;;
        *)
            echo "Your system could not be detected, please install dependencies manually"
            ;;
    esac

    # Install perl CPAN modules
    echo "Installing perl CPAN modules"
    echo "cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder"
    cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder

    # Install R packages
    echo "Installing R packages"
    Rscript -e 'install.packages(c(
        "BiocManager",
        "ontologyIndex",
        "ggrepel",
        "tidyverse",
        "sos",
        "openxlsx",
        "ggplot2",
        "plyr",
        "extrafont",
        "gridExtra",
        "gplots",
        "stringr",
        "plotly",
        "ggthemes",
        "pandoc",
        "rmarkdown"
    ), dep=TRUE, repos="https://cloud.r-project.org")'
    Rscript -e 'BiocManager::install("org.Sc.sgd.db")'
}

# @internal
ask() {
    declare response
    (( YES )) && return 0
    read -r -p "$* [y/N]: " response
    [[ ${response,,} =~ ^(yes|y)$ ]]
}

# @internal
err() { echo "Error: $*" >&2; }
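# A possible companion to install_dependencies (hypothetical, not called
# anywhere yet): verify that the interpreters configured at the top of the
# script, plus the other tools the modules invoke, are actually on PATH.
check_interpreters() {
    local bin missing=0
    for bin in "$JAVA" "$PYTHON" "$PERL" Rscript matlab pdftk; do
        hash "$bin" &>/dev/null || { err "$bin not found in PATH"; missing=1; }
    done
    return "$missing"
}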
# @internal
ask_pn() {
    declare -g PROJECT_NAME
    read -r -p "Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): " PROJECT_NAME
}

# @internal
debug() { (( DEBUG )) && echo "Debug: $*"; }

# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overridden by env
main() {
    echo "Running: ${FUNCNAME[0]}" "$@"

    # Where are we located?
    SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

    # Set the automatic project directory prefix
    PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"

    # When adding a module, it also should be added to this list
    ALL_MODULES=(
        install_dependencies
        init_job
        easy
        ezview
        qhtcp
    )

    declare -ga PROJECT_NAMES=() # this array will hold all of the projects to run
    [[ $# -eq 1 ]] && PROJECT_NAMES+=("$1") # easy way to run on a single project
    [[ $# -ge 2 ]] && parse_input "$@" # parse arguments with getopt

    # Prompt the user for a PROJECT_NAME if we still don't have one
    if [[ ${#PROJECT_NAMES[@]} -eq 0 ]]; then # still allows for environment overrides
        ask_pn
        PROJECT_NAMES+=("$PROJECT_NAME")
    fi

    # Sanitize PROJECT_NAMES
    # This regex should match PROJECT_PREFIX
    san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; }
    for i in "${!PROJECT_NAMES[@]}"; do
        if ! san "${PROJECT_NAMES[i]}"; then
            echo "Project name ${PROJECT_NAMES[i]} is invalid"
            echo "Enter a replacement"
            ask_pn
            san "$PROJECT_NAME" || { echo "RTFM"; return 1; }
            PROJECT_NAMES[i]="$PROJECT_NAME"
        fi
    done

    SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else

    # If we don't catch modules with getopt or env, run them all
    [[ ${#MODULES[@]} -eq 0 ]] && MODULES=("${ALL_MODULES[@]}")

    # Excluded modules override included ones
    arr=()
    for m in "${MODULES[@]}"; do
        [[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${m}[[:space:]] ]] || arr+=("$m")
    done
    MODULES=("${arr[@]}")
    unset arr

    # Sanitize MODULES
    for i in "${!MODULES[@]}"; do
        if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
            echo "Module ${MODULES[i]} not in the module list"
            echo "Available modules: ${ALL_MODULES[*]}"
            read -r -p "Enter replacement name: " MODULE
            [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULE}[[:space:]] ]] || { echo "RTFM"; return 1; }
            MODULES[i]="$MODULE"
        fi
    done

    # Loop over projects
    for PROJECT_NAME in "${PROJECT_NAMES[@]}"; do
        SCAN_DIR="$SCANS_DIR/$PROJECT_NAME"
        # Run the selected modules
        for m in "${MODULES[@]}"; do
            ask "Run $m?" && "$m"
        done
    done
}

main "$@"
exit $?