#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
#
# This is a flexible yet opinionated analysis workflow for the Hartman Lab
# It contains a mixture of code/pseudocode and shouldn't be run until this message is removed
#
# Allow indirect functions
# shellcheck disable=SC2317
#
# shdoc info
# @name HartmanLabWorkflow
# @brief One script to rule them all (see: xkcd #927)
# @description Executes the Hartman Lab image analysis workflow
# @option -p | --project= Include one or more projects in the analysis
# @option -i | --include= Include one or more modules in the analysis (default: all modules)
# @option -x | --exclude= Exclude one or more modules from the analysis
# @option -m | --markdown Generate the shdoc markdown file for this program
# @option -y | --yes | --auto Assume a yes answer to all questions (non-interactive mode)
# @option -d | --debug Turn on extra debugging output
# @option -h | --help Print help message and exit (overrides other options)

DEBUG=1 # Turn debugging ON by default during development
shopt -s extglob

# @section Libraries
# @description Change these variables to use different interpreters
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"

# @section Help
# @description Print a helpful message
print_help() {
    debug "Running: ${FUNCNAME[0]}"
    install_dependencies --get-depends # Loads the dependency arrays
    cat <<-EOF
USAGE:
  script-run-workflow [[OPTION] [VALUE]]...

  Some options (--project, --include, --exclude) can be passed multiple times
  or as comma-delimited strings (see EXAMPLES below)

OPTIONS:
  --project, -p PROJECT
      PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT_NAME
  --include, -i MODULE
      See the MODULES section below for the list of available modules
      If no --include is specified, all modules are run
  --exclude, -x MODULE
      See the MODULES section below for the list of modules to exclude
  --markdown, -m
      Generate the shdoc markdown file for this program
  --yes, -y, --auto
      Always answer yes to questions (non-interactive mode)
  --debug, -d
      Print extra debugging info
  --help, -h
      Print this help message and exit

MODULES:
  ${ALL_MODULES[*]}

SUBMODULES:
  ${ALL_SUBMODULES[*]}

DEPENDENCIES:
  deb: ${depends_deb[*]}
  rpm: ${depends_rpm[*]}
  brew: ${depends_brew[*]}
  perl: ${depends_perl[*]}
  R: ${depends_r[*]}
  BiocManager: ${depends_bioc[*]}

EXAMPLES:
  script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
  script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
  script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
  script-run-workflow --include=${ALL_MODULES[0]},${ALL_MODULES[1]}
  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
  script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --include=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
EOF
}

# @section User Input
# @description Creates arrays and switches from user input
parse_input() {
    debug "Running: ${FUNCNAME[0]}" "$@"

    declare long_opts="project:,include:,exclude:,markdown,yes,auto,debug,help"
    declare short_opts="+p:i:x:mydh"
    declare -a _vals

    if input=$(getopt -o "$short_opts" -l "$long_opts" -- "$@"); then
        eval set -- "$input"
        while true; do
            case $1 in
                --project|-p)
                    shift
                    IFS=',' read -ra _vals <<< "$1" # split comma-delimited values (no-op for a single value)
                    PROJECTS+=("${_vals[@]}")
                    ;;
                --include|-i)
                    shift
                    IFS=',' read -ra _vals <<< "$1"
                    INCLUDE_MODULES+=("${_vals[@]}")
                    ;;
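                # For reference, the splitting above turns a comma-delimited
                # argument into multiple array entries (module names here are
                # only illustrative):
                #   --include easy,qhtcp  ->  INCLUDE_MODULES=(easy qhtcp)
                #   --include easy        ->  INCLUDE_MODULES=(easy)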
                --exclude|-x)
                    shift
                    IFS=',' read -ra _vals <<< "$1"
                    EXCLUDE_MODULES+=("${_vals[@]}")
                    ;;
                --markdown|-m)
                    documentation
                    ;;
                --yes|-y|--auto)
                    declare -g YES=1
                    ;;
                --debug|-d)
                    declare -g DEBUG=1
                    ;;
                --help|-h)
                    print_help
                    exit 0
                    ;;
                --)
                    shift
                    break
                    ;;
            esac
            shift
        done
    else
        err "Incorrect options provided"
        exit 1
    fi
}

# @section Helper functions
# @internal
module() {
    debug "Adding $1 module"
    ALL_MODULES+=("$1")
    declare -gA "$1"
}

submodule() {
    debug "Adding $1 submodule"
    ALL_SUBMODULES+=("$1")
    declare -gA "$1"
}

# This function will only work if users have an actual name registered on the server
# TODO for now just use the username
# user_initials() {
#     user_record="$(getent passwd "$(whoami)")"
#     user_gecos_field="$(echo "$user_record" | cut -d ':' -f 5)"
#     user_full_name="$(echo "$user_gecos_field" | cut -d ',' -f 1)"
#     last="${user_full_name#* }"
#     echo "${user_full_name:0:1}${last:0:1}"
# }

ask() {
    declare response
    (( YES )) && return 0
    read -r -p "$* [y/N]: " response
    [[ ${response,,} =~ ^(yes|y)$ ]]
}

err() { echo "Error: $*" >&2; }

ask_pn() {
    declare -g PROJECT
    read -r -p "Enter a full project name (ex. ${PROJECT_PREFIX}_PROJECT_NAME): " PROJECT
}

debug() { (( DEBUG )) && echo "Debug: $*"; }

# @section Modules
# @description A module contains a cohesive set of actions/experiments to run on a project
# Use a module when:
# * Building a new type of analysis from scratch
# * Generating project directories
# * Combining other modules and submodules
#
# module install_dependencies
# @description Installs dependencies for the workflow
install_dependencies() {
    debug "Running: ${FUNCNAME[0]}" "$@"

    # Dependency arrays
    depends_rpm=(graphviz pandoc pdftk-java gd-devel shdoc)
    depends_deb=(graphviz pandoc pdftk-java libgd-dev shdoc)
    depends_brew=(graphviz pandoc gd pdftk-java shdoc)
    depends_perl=(File::Map ExtUtils::PkgConfig GD GO::TermFinder)
    depends_r=(BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2 plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown)
    depends_bioc=(org.Sc.sgd.db)
    [[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars

    # Install system-wide dependencies
    echo "Installing system dependencies"
    case "$(uname -s)" in
        Linux*|CYGWIN*|MINGW*)
            ask "Detected Linux platform, continue?" || return 1
            echo "You may be prompted for your sudo password to install system packages"
            if hash dnf &>/dev/null; then
                sudo dnf install "${depends_rpm[@]}"
            elif hash apt &>/dev/null; then
                sudo apt install "${depends_deb[@]}"
            fi
            ;;
        Darwin*)
            ask "Detected Mac platform, continue?" || return 1
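            # Assumption: the upstream Homebrew install script below is
            # interactive by default; for unattended runs (e.g. with --yes)
            # its documented NONINTERACTIVE=1 variable could be used instead,
            # roughly:
            #   NONINTERACTIVE=1 bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"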
            export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
            curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
            brew install "${depends_brew[@]}"
            ;;
        *)
            echo "Your system could not be detected, please install dependencies manually"
            ;;
    esac

    # Install perl CPAN modules
    echo "Installing perl CPAN modules"
    debug "cpan" "${depends_perl[@]}"
    cpan "${depends_perl[@]}"

    # Install R packages
    echo "Installing R packages"
    # Build a quoted, comma-separated package list for install.packages()
    depends_r_str=$(printf '"%s", ' "${depends_r[@]}")
    depends_r_str="${depends_r_str%, }" # strip the trailing comma and space
    debug "Rscript -e install.packages(c($depends_r_str), dep=TRUE, repos=\"https://cloud.r-project.org\")"
    Rscript -e "install.packages(c($depends_r_str), dep=TRUE, repos=\"https://cloud.r-project.org\")"
    Rscript -e "BiocManager::install(\"${depends_bioc[0]}\")"
}

module init_job
# @section Initialize a new job in the scans directory
# @description Create a new ExpJobs project
# TODO Copy over source image directories from the robot - are these also named by the ExpJobs name?
init_job() {
    debug "Running: ${FUNCNAME[0]}"
    if [[ -d $SCAN_DIR ]]; then
        ask "$SCAN_DIR already exists, re-initialize?" || return 0
    else
        ask "Initialize a new project at $SCAN_DIR?" || return 1
        mkdir -p "$SCAN_DIR"
    fi
    [[ -d $SCAN_DIR/MasterPlateFiles ]] || mkdir -p "$SCAN_DIR/MasterPlateFiles"
    DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT.xls"
    MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT.xls"
    # TODO Where are the actual templates?
    for f in "$DRUG_MEDIA_FILE" "$MASTER_PLATE_FILE"; do
        touch "$f"
    done
}

module easy
# @section EASY
# @description Start an EASY analysis
# The QHTCPImageFolders and 'MasterPlateFiles' folder are the inputs for image analysis with the EASY software.
# EASY automatically generates a 'Results' directory (within the ExpJobs/'ExperimentJob' folder) with a timestamp and an optional short description provided by the user (Fig. 2).
# The 'Results' directory is created and entered using the "File >> New Experiment" dropdown in EASY.
# Multiple 'Results' directories may be created (and uniquely named) within an 'ExperimentJob' folder.
easy() {
    debug "Running: ${FUNCNAME[0]}"
    EASY="/mnt/data/EASY/EasyDev2024/BU/EASY240430AppExported/EstartConsole.m"
    pushd "$SCAN_DIR" || return 1
    # Launch graphical MATLAB if the user wants
    ask "Start EASY in MATLAB? This requires a GUI." && matlab -nosplash -r "$EASY"
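    # Assumption: MATLAB's -r flag expects a MATLAB statement rather than a
    # file path, so launching the exported console app may instead need
    # something like (wrapper call illustrative, path unchanged):
    #   matlab -nosplash -r "run('$EASY')"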

    # Glob the EASY output and make sure it exists
    shopt -s nullglob
    EASY_RESULTS_DIRS=( Results* )
    shopt -u nullglob
    [[ ${#EASY_RESULTS_DIRS[@]} -ge 1 ]] || { echo "Missing EASY output"; popd; return 1; }
    declare -a EASY_OUT_ARRAY
    for EASY_RESULTS_DIR in "${EASY_RESULTS_DIRS[@]}"; do
        [[ -d $EASY_RESULTS_DIR ]] && echo "Found EASY Results directory: $EASY_RESULTS_DIR"
        EASY_PRINT_RESULTS_DIR="$EASY_RESULTS_DIR/PrintResults"
        [[ -d $EASY_PRINT_RESULTS_DIR ]] && echo "Found EASY PrintResults directory: $EASY_PRINT_RESULTS_DIR"
        EASY_PRINT_RESULTS_FILES=(
            "$EASY_PRINT_RESULTS_DIR/!!"*
            "$EASY_PRINT_RESULTS_DIR"/NoGrowth_*.txt
            "$EASY_PRINT_RESULTS_DIR"/GrowthOnly_*.txt
        )
        EASY_OUT_ARRAY+=("$EASY_RESULTS_DIR" "$EASY_PRINT_RESULTS_DIR" "${EASY_PRINT_RESULTS_FILES[@]}")
    done
    echo "EASY OUTPUT ARRAY:" "${EASY_OUT_ARRAY[@]}"
    popd || return 1
}

module ezview
# @section EZView
ezview() {
    debug "Running: ${FUNCNAME[0]}"
    EZVIEW_DIR="/mnt/data/EZVIEW"
    echo "$EZVIEW_DIR"
}

module qhtcp
# @section QHTCP
# @description Main QHTCP module (functional rewrite of REMcMaster3.sh)
qhtcp() {
    debug "Running: ${FUNCNAME[0]}"
    TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
    QHTCP_DIR="/mnt/data/StudiesQHTCP/$PROJECT"

    # Our list of submodules (functions) to run for this module
    # Put these in the appropriate order of operations
    declare -a submodules=(
        r_join_interact
        java_extract
        r_add_shift_values
        r_heat_maps_zscores
        r_heat_maps_homology
        gtf
        r_compile_gtf
    )

    while [[ -d $QHTCP_DIR ]]; do
        echo "A project already exists at $QHTCP_DIR"
        ask "Safely update $QHTCP_DIR from the $TEMPLATE_DIR template?" && break
        if ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk and start fresh?"; then
            mv "$QHTCP_DIR" "$QHTCP_DIR.bk" || { echo "Backup unsuccessful, exiting"; return 1; }
        fi
    done

    # Copy the template to the QHTCP project directory
    if rsync --archive --update "$TEMPLATE_DIR"/ "$QHTCP_DIR"; then
        echo "New project created at $QHTCP_DIR"
    fi

    # Create StudyInfo.csv
    # Right now this is identical to the template but we can change it later
    cat <<-EOF > "$QHTCP_DIR/Code/StudyInfo.csv"
ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy
1,ExpName1,NA,NA,UserInitials
2,ExpName2,NA,NA,UserInitials
3,ExpName3,NA,NA,UserInitials
4,ExpName4,NA,NA,UserInitials
EOF

    # Enter the REMc directory to run the scripts there
    pushd "$QHTCP_DIR/REMc" || return 1
    # Run each submodule
    for s in "${submodules[@]}"; do "$s"; done
    popd || return 1
}

module gtf
# @section GTF
# @description GTF module for QHTCP
gtf() {
    debug "Running: ${FUNCNAME[0]}"
    process_dir="GTF/Process"
    function_dir="GTF/Function"
    component_dir="GTF/Component"
    py_gtf "$process_dir"
    # Perform operations in each directory in parallel
    for d in "$process_dir" "$function_dir" "$component_dir"; do
        rsync -a "$process_dir/REMcRdy_lm_only" "$d"/
        out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename
        pl_gtf "$d" "$out_file" & # parallelize
    done
    wait # don't return until the parallel pl_gtf jobs have finished
}

# @section Submodules
# @description Submodules provide functionality to modules and are reusable between modules
# Use a submodule when:
# * Calling external scripts
# * Performing repetitive tasks
#
submodule r_join_interact
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
r_join_interact() {
    debug "Running: ${FUNCNAME[0]}"
    echo "Rscript JoinInteractExps3dev.R"
    Rscript JoinInteractExps3dev.R
    out_file="REMcRdy_lm_only.csv"
    out_file2="Shift_only.csv"
    for f in "$out_file" "$out_file2"; do
        [[ -f $f ]] || { echo "$f does not exist"; return 1; }
    done
}
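
# Note: the submodules below all end with the same "did the expected output
# appear" check; a shared helper along these lines could express it once
# (sketch only, not wired into the functions below):
#   require_file() { [[ -f $1 ]] || { echo "$1 does not exist"; return 1; }; }
#   require_file "REMcRdy_lm_only.csv-finalTable.csv" || return 1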

submodule java_extract
# @description Jingyu's REMc java utility that takes REMcRdy_lm_only.csv as input
# and outputs REMcRdy_lm_only.csv-finalTable.csv
java_extract() {
    debug "Running: ${FUNCNAME[0]}"
    classpath="jingyuJava_1_7_extractLib.jar"
    out_file="REMcRdy_lm_only.csv-finalTable.csv"
    # Back up any existing REMcRdy_lm_only.csv-finalTable.csv
    [[ -f $out_file ]] && mv "$out_file" "$out_file.bk"
    java_cmd=(
        "$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8
        -classpath "$classpath" ExecMain
        "REMcRdy_lm_only.csv"
        "GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
        "ORF_List_Without_DAmPs.txt"
        1 true true
    )
    echo "${java_cmd[@]}"
    "${java_cmd[@]}"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

submodule r_add_shift_values
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
r_add_shift_values() {
    debug "Running: ${FUNCNAME[0]}"
    out_file="REMcHeatmaps/REMcWithShift.csv"
    echo "Rscript AddShiftVals2.R"
    Rscript AddShiftVals2.R
    rm -f "REMcHeatmaps/"*.pdf
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

submodule r_heat_maps_zscores
# @description Execute REMcHeatmaps_zscores.R
r_heat_maps_zscores() {
    debug "Running: ${FUNCNAME[0]}"
    out_file="REMcHeatmaps/compiledREMcHeatmaps.pdf"
    echo "Rscript REMcHeatmaps_zscores.R"
    Rscript REMcHeatmaps_zscores.R
    pdfs=(REMcHeatmaps/*.pdf)
    echo "pdftk ${pdfs[*]} output $out_file"
    pdftk "${pdfs[@]}" output "$out_file"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

submodule r_heat_maps_homology
# @description Execute REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R
r_heat_maps_homology() {
    debug "Running: ${FUNCNAME[0]}"
    work_dir="REMcHeatmapsWithHomology"
    source_file="REMcHeatmaps/REMcWithShift.csv"
    target_file="$work_dir/REMcWithShift.csv"
    out_file="$work_dir/Homology/compiledREMcHomologyHeatmaps.pdf"
    echo "rsync --archive $source_file $target_file"
    rsync --archive "$source_file" "$target_file"
    # Clean old output
    rm -f "$work_dir/Homology/"*.{pdf,csv}
    pushd "$work_dir" || return 1
    Rscript \
        REMcHeatmaps_Z_lm_wDAmPs_andHomology_221212.R \
        REMcWithShift.csv \
        Homology \
        17_0503_DAmPs_Only.txt \
        Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv
    popd || return 1
    pdfs=("$work_dir"/Homology/*.pdf)
    pdftk "${pdfs[@]}" output "$out_file"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

submodule py_gtf
# @description Perform the python portion of GTF
# @arg $1 string Directory to process
py_gtf() {
    debug "Running: ${FUNCNAME[0]}"
    in_file="REMcRdy_lm_only.csv-finalTable.csv"
    out_file="$1/REMcRdy_lm_only/1-0-0-finaltable.csv"
    debug "$PYTHON DconJG2.py $in_file $1/"
    "$PYTHON" DconJG2.py "$in_file" "$1/"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}

submodule pl_gtf
# @description Perl module for GTF
# @arg $1 string working directory
# @arg $2 string output file
pl_gtf() {
    debug "Running: ${FUNCNAME[0]}" "$@"
    set1="ORF_List_Without_DAmPs.txt"
    pushd "$1" || return 1
    shopt -s nullglob
    set2=(REMcRdy_lm_only/*.txt)
    shopt -u nullglob
    for s in "${set2[@]}"; do
        debug "pl_analyze $set1 $s"
        pl_analyze "$set1" "$s"
        debug "pl_terms2tsv $s"
        pl_terms2tsv "$s"
    done
    # Concatenate the process ontology outputs from the REMcRdy_lm_only folder
    echo "$PYTHON Concatenate_GTF_results.py REMcRdy_lm_only/ $2"
    echo "TODO: Concatenate_GTF_results.py should be translated to bash"
    "$PYTHON" Concatenate_GTF_results.py REMcRdy_lm_only/ "$2"
    if [[ ! -f $2 ]]; then
        echo "$2 does not exist"
        popd
        return 1
    fi
    popd || return 1
}
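
# For orientation: each cluster file passed to pl_analyze/pl_terms2tsv is
# expected to gain two sibling outputs (naming inferred from the pl_terms2tsv
# definition below, so treat as illustrative):
#   REMcRdy_lm_only/<cluster>.txt.terms  (written by analyze_v2.pl)
#   REMcRdy_lm_only/<cluster>.txt.tsv    (written by terms2tsv_v4.pl)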

submodule pl_analyze
# @description Perl analyze submodule
# This seems weird to me because we're just overwriting the same data for all set2 members
# https://metacpan.org/dist/GO-TermFinder/view/examples/analyze.pl
# Is there a reason you need a custom version and not the original from cpan?
# @arg $1 string Set 1
# @arg $2 string Set 2
pl_analyze() {
    script="analyze_v2.pl"
    an="gene_association.sgd"
    out_file="gene_ontology_edit.obo"
    debug "$PERL $script -an $an -as P -o $out_file -b $1 $2"
    "$PERL" "$script" -an "$an" -as P -o "$out_file" -b "$1" "$2"
}

submodule pl_terms2tsv
# @description Perl terms2tsv submodule
# Probably should be translated to shell/python
# @arg $1 string Set 2
pl_terms2tsv() {
    script="terms2tsv_v4.pl"
    debug "$PERL $script $1.terms > $1.tsv"
    "$PERL" "$script" "$1.terms" > "$1.tsv"
}

submodule r_compile_gtf
# @description Compile GTF in R
r_compile_gtf() {
    debug "Running: ${FUNCNAME[0]}"
    echo "Rscript CompileGTF.R"
    Rscript CompileGTF.R
}

submodule documentation
# @section Documentation
# @description Generates markdown documentation from this script using shdoc
documentation() {
    debug "Running: ${FUNCNAME[0]}"
    # Print markdown to stdout
    (( DEBUG )) && shdoc < "$SCRIPT"
    # Create the markdown file
    shdoc < "$SCRIPT" > documentation.md
}

# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overridden by env
main() {
    debug "Running: ${FUNCNAME[0]}" "$@"

    # Where are we located?
    SCRIPT=$(realpath -s "${BASH_SOURCE[0]}")
    SCRIPT_DIR=$(dirname "$SCRIPT")

    # Set the automatic project directory prefix
    PROJECT_PREFIX="$(whoami)_$(date +%y_%m_%d)"
    san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; } # sanitizer regex for the prefix

    declare -ag PROJECTS=() # this array will hold all of the projects for this run

    parse_input "$@" # parse arguments with getopt

    # Prompt the user for the PROJECT if we still don't have one
    if [[ ${#PROJECTS[@]} -eq 0 ]]; then # still allows for environment overrides
        ask_pn
        PROJECTS+=("$PROJECT")
    fi

    for i in "${!PROJECTS[@]}"; do
        if ! san "${PROJECTS[i]}"; then
            echo "Project name ${PROJECTS[i]} is invalid"
            echo "Enter a replacement"
            ask_pn
            san "$PROJECT" || { echo "RTFM"; return 1; }
            PROJECTS[i]="$PROJECT"
        fi
    done

    SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" # TODO propose changing this to something else

    # If we don't catch it with getopt or env, run all modules
    if [[ ${#INCLUDE_MODULES[@]} -eq 0 ]]; then
        MODULES=("${ALL_MODULES[@]}")
    else
        MODULES=("${INCLUDE_MODULES[@]}")
    fi

    # Exclude modules from --exclude
    arr=()
    for m in "${MODULES[@]}"; do
        [[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${m}[[:space:]] ]] || arr+=("$m")
    done
    MODULES=("${arr[@]}")
    unset arr

    # Sanitize MODULES
    for i in "${!MODULES[@]}"; do
        if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
            echo "Module ${MODULES[i]} is not in the module list"
            echo "Available modules: ${ALL_MODULES[*]}"
            read -r -p "Enter replacement module name: " MODULE
            [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULE}[[:space:]] ]] || { echo "RTFM"; return 1; }
            MODULES[i]="$MODULE"
        fi
    done

    # Loop over projects
    for PROJECT in "${PROJECTS[@]}"; do
        SCAN_DIR="$SCANS_DIR/$PROJECT"
        # Run the selected modules
        for m in "${MODULES[@]}"; do
            ask "Run $m?" && "$m"
        done
    done
}

main "$@"
exit $?