#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
#
# Allow indirect functions
# shellcheck disable=SC2317
#
# @name Hartman Lab QHTCP Workflow
# @brief An opinionated yet flexible QHTCP analysis framework for the Hartman Lab.
#
# @description
#
# See the [User Input](#user-input) section for getting started.
#
# Insert a general description of Q-HTCP and the Q-HTCP process here.
shopt -s extglob # Turn on extended globbing
DEBUG=1 # Turn debugging ON by default during development
# @description Use `--help` to print the help message.
# @internal
print_help() {
debug "Running: ${FUNCNAME[0]}"
install_dependencies --get-depends # Loads the dependency arrays
cat <<-EOF
USAGE:
script-run-workflow [[OPTION] [VALUE]]...
Some options (--project, --module, --submodule, --nomodule) can be passed
multiple times or by using comma-separated strings (see EXAMPLES below)
OPTIONS:
--project, -p PROJECT
PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT_NAME
--module, -m MODULE[,MODULE...]
See MODULES section below for list of available modules
If no --module is specified, all modules are run
--submodule, -s SUBMODULE "[ARG1],[ARG2]..." (a string of comma-delimited arguments)
See SUBMODULES section below for the list of available submodules
See documentation for submodule argument usage
--nomodule, -n MODULE[,MODULE...]
See MODULES and SUBMODULES section below for list of modules to exclude
--markdown
Generate the shdoc markdown README.md file for this program
--yes, -y, --auto
Always answer yes to questions (non-interactive mode)
--debug, -d
Print extra debugging info
--help, -h
Print this help message and exit
MODULES:
${ALL_MODULES[*]}
SUBMODULES:
${ALL_SUBMODULES[*]}
DEPENDENCIES:
deb: ${depends_deb[@]}
rpm: ${depends_rpm[@]}
brew: ${depends_brew[@]}
perl: ${depends_perl[@]}
R: ${depends_r[@]}
BiocManager: ${depends_bioc[@]}
EXAMPLES:
script-run-workflow --module ${ALL_MODULES[0]} --module ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --module ${ALL_MODULES[0]} --module ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --module ${ALL_MODULES[1]} --module ${ALL_MODULES[2]} --yes
script-run-workflow --module=${ALL_MODULES[0]},${ALL_MODULES[1]}
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --module=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT --submodule ${ALL_SUBMODULES[2]} \"/path/to/genefile.txt,/path/to/output/dir\" --submodule ${ALL_SUBMODULES[3]} \"/path/to/sgofile\"
EOF
}
# @section Notes
# @description
#
# ### TO-DO
#
# * Variable scoping is horrible right now
# * I wrote this sequentially and tried to keep track the best I could
# * Local vars have a higher likelihood of being lower case, global vars are UPPER
# * See MODULE specific TODOs below
#
# ### General guidelines for writing external scripts
#
# * External scripts must be modular enough to handle input and output from multiple directories
# * Don't cd in scripts (if you must, do it in a subshell!)
# * Pass variables
# * Pass options
# * Pass arguments
#
# ## Project layout
#
# &nbsp;&nbsp;**qhtcp-workflow/**
#
# &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**scans/**
#
# * This directory contains raw image data and image analysis results for the entire collection of Q-HTCP experiments.
# * Subdirectories within "scans" should represent a single Q-HTCP study and be named using the following convention: yyyymmdd_username_experimentDescription
# * Each subdirectory contains the Raw Image Folders for that study.
# * Each Raw Image Folder contains a series of N folders with successive integer labels 1 to N, each folder containing the time series of images for a single cell array.
# * It also contains a user-supplied subfolder, which must be named "MasterPlateFiles" and must contain two Excel files, one named 'DrugMedia_experimentDescription' and the other named 'MasterPlate_experimentDescription'.
# * If the standard MasterPlate_Template file is being used, there is no need to customize the name.
# * If the template is modified, it is recommended to rename and describe it accordingly - a useful convention is to give the MP files the same experimentDescription as the experiment
# * The 'MasterPlate_' file contains the associated cell array information (culture IDs for all of the cell arrays in the experiment) while the 'DrugMedia_' file contains information about the media that the cell array is printed to.
# * Together they encapsulate and define the experimental design.
# * The QHTCPImageFolders and 'MasterPlateFiles' folder are the inputs for image analysis with EASY software.
# * As further described below, EASY will automatically generate a 'Results' directory (within the ExpJobs/'ExperimentJob' folder) with a name that consists of a system-generated timestamp and an optional short description provided by the user (Fig.2). The 'Results' directory is created and entered, using the "File >> New Experiment" dropdown in EASY. Multiple 'Results' files may be created (and uniquely named) within an 'ExperimentJob' folder.
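#
# An illustrative skeleton of a single study directory (names are examples only):
#
#     scans/
#     └── 20240115_jdoe_experimentDescription/
#         ├── 1/ … N/                                  (one folder of time-series images per cell array)
#         └── MasterPlateFiles/
#             ├── DrugMedia_experimentDescription.xls
#             └── MasterPlate_experimentDescription.xls
#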
# &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**apps/easy/**
#
# * This directory contains the GUI-enabled MATLAB software to accomplish image analysis and growth curve fitting.
# * EASY analyzes Q-HTCP image data within an 'ExperimentJob' folder (described above); each cell array has its own folder containing its entire time series of images.
# * EASY analysis produces image quantification data and growth curve fitting results for each cell array; these results are subsequently assembled into a single file and labeled, using information contained in the 'MasterPlate_' and 'DrugMedia_' files in the 'MasterPlateFiles' subdirectory.
# * The final files (named '!!ResultsStd_.txt' or '!!ResultsELr_.txt') are produced in a subdirectory that EASY creates within the 'ExpJob#' folder, named '/ResultsTimeStampDesc/PrintResults' (Fig. 2).
# * The /EASY directory is simply where the latest EASY version resides (additional versions in development or legacy versions may also be stored there).
# * The raw data inputs and result outputs for EASY are kept in the 'ExpJobs' directory.
# * EASY also outputs a '.mat' file that is stored in the 'matResults' folder and is named with the TimeStamp and user-provided name appended to the 'Results' folder name when 'New Experiment' is executed from the 'File' Dropdown menu in the EASY console.
# &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**apps/ezview/**
#
# * This directory contains the GUI-enabled MATLAB software to conveniently and efficiently mine the raw cell array image data for a Q-HTCP experiment.
# * It takes the Results.m file (created by EASY software) as an input and permits the user to navigate through the raw image data and growth curve results for the experiment.
# * The /EZview directory provides a place for storing the latest EZview version (as well as other EZview versions).
# * The /EZview provides a GUI for examining the EASY results as provided in the …/matResults/… .mat file.
#
# &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**Master Plates**
#
# * This optional folder is a convenient place to store copies of the 'MasterPlate_' and a 'DrugMedia_' file templates, along with previously used files that may have been modified and could be reused or further modified to enable future analyses.
# * These two file types are required in the 'MasterPlateFiles' folder, which catalogs experimental information specific to individual Jobs in the ExpJobs folder, as described further below.
#
#
# shdoc decorators for parse_input:
#
#
# @description
# `--project`, `--module`, `--nomodule`, and `--submodule` can be passed multiple times or with a comma-separated string
# @option -p<value> | --project=<value> One or more projects to analyze, can be passed multiple times or with a comma-separated string
# @option -m<value> | --module=<value> One or more modules to run (default: all), can be passed multiple times or with a comma-separated string
# @option -s<value> | --submodule=<value> Requires two arguments: the name of the submodule and its arguments, can be passed multiple times
# @option -n<value> | --nomodule=<value> One or more modules (default: none) to exclude from the analysis
# @option --markdown Generate the shdoc markdown file for this program
# @option -y | --yes | --auto Assume yes answer to all questions (non-interactive mode)
# @option -d | --debug Turn on extra debugging output
# @option -h | --help Print help message and exit (overrides other options)
# @set PROJECTS array List of projects to cycle through
# @set MODULES array List of modules to run on each project
# @set SUBMODULES array List of submodules and their arguments to run on each project
# @set EXCLUDE_MODULES array List of modules not to run on each project
# @set DEBUG int Turn debugging on
# @set YES int Turn assume yes on
parse_input() {
debug "Running: ${FUNCNAME[0]} $*"
long_opts="project:,module:,submodule:,nomodule:,markdown,yes,auto,debug,help"
short_opts="+p:m:s:n:ydh"
if input=$(getopt -o $short_opts -l $long_opts -- "$@"); then
eval set -- "$input"
while true; do
case $1 in
--project|-p)
shift
IFS=',' read -ra arr <<< "$1"
PROJECTS+=("${arr[@]}") # append so the option can be passed multiple times
;;
--module|-m)
shift
IFS=',' read -ra arr <<< "$1"
MODULES+=("${arr[@]}") # append so the option can be passed multiple times
;;
--submodule|-s)
shift
SUBMODULES+=("$1") # the submodule name
shift
SUBMODULES+=("$1") # its comma-delimited argument string
;;
--nomodule|-n)
shift
IFS=',' read -ra arr <<< "$1"
EXCLUDE_MODULES+=("${arr[@]}")
;;
--markdown)
documentation; exit 0 # TODO disable the exit after development
;;
--yes|-y|--auto)
declare -g YES=1
;;
--debug|-d)
declare -g DEBUG=1
;;
--help|-h)
print_help; exit 0
;;
--)
shift
break
;;
esac
shift
done
else
err "Incorrect options provided"; exit 1
fi
}
# @section Modules
# @description
#
# A module contains a cohesive set of actions/experiments to run on a project
#
# Use a module to:
#
# * Build a new type of analysis from scratch
# * Generate project directories
# * Group multiple submodules (and modules) into a larger task
# * Dictate the ordering of multiple submodules
# * Competently handle pushd and popd for their submodules if they do not reside in the SCANS/PROJECT_DIR
# * Call their submodules with the appropriate arguments
#
# @description
module() {
debug "Adding $1 module"
ALL_MODULES+=("$1")
declare -gA "$1"
}
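# Example: registering a new module follows the same pattern used throughout
# this script ("module <name>" followed by a function of the same name);
# the module name here is hypothetical:
#
#   module my_analysis
#   my_analysis() {
#     debug "Running: ${FUNCNAME[0]}"
#     # ...actions to run against the current project...
#   }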
# @description Ask the user a yes/no question
# @arg $1 string The question to ask
# @exitcode 0 If yes
# @exitcode 1 If no
# @internal
ask() {
declare response
(( YES )) && return 0
read -r -p "$* [y/N]: " response
[[ ${response,,} =~ ^(yes|y)$ ]]
}
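# Example usage (as in the init_project module below):
#   ask "(Re)-Initialize a project at $PROJECT_SCANS_DIR?" || return 1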
err() { echo "Error: $*" >&2; }
ask_pn() {
unset PROJECT
declare -ag ADD_PROJECTS
example_pn="${PROJECT_PREFIX}_$(random_three_words)"
cat <<-EOF
Enter a new or existing project name
If entering a new project, use the suggested prefix: ${PROJECT_PREFIX}_
You may choose any combination of words/characters following the prefix, but be sensible.
Make it descriptive and avoid spaces and special characters.
EOF
trys=3 # give the user up to 3 tries to enter a valid project name
for ((i=1; i<=trys; i++)); do
read -r -p "Enter a new or existing project name or hit Enter for default ($example_pn): " response
if [[ -z $response ]]; then
ADD_PROJECTS+=("$example_pn")
break
else
if sanitize_pn "$response"; then
ADD_PROJECTS+=("$response")
echo "$response successfully added as a project"
i=1 # resetting trys counter in case user wants to add more than 3 projects
else
err "Invalid project name: $response"
echo "Retrying ($i of $trys)"
fi
fi
done
[[ -n ${ADD_PROJECTS[*]} ]] # succeed only if at least one project was added
}
debug() { (( DEBUG )) && echo "Debug: $*"; }
# Not super portable but nice to have
random_three_words() {
local -a arr
adjectives=(
"adorable" "adventurous" "agile" "amazing" "angry" "beautiful" "bold" "brave" "bright" "calm"
"charming" "cheerful" "courageous" "creative" "delicate" "elegant" "energetic" "exciting" "fast" "friendly"
"gentle" "happy" "healthy" "helpful" "honest" "humble" "intelligent" "jovial" "kind" "lively"
"lovable" "magnificent" "mellow" "modest" "noble" "outgoing" "passionate" "peaceful" "powerful" "quick"
"radiant" "reliable" "resourceful" "respectful" "shy" "smart" "strong" "sweet" "tender" "thoughtful"
"timid" "unique" "upbeat" "vibrant" "warm" "wise" "wonderful" "youthful" "zealous" "eager"
"friendly" "generous" "imaginative" "independent" "inspired" "joyful" "luminous" "mysterious" "playful" "serene"
"spontaneous" "steady" "spirited" "stylish" "tough" "understanding" "vivid" "zany" "bold" "calm"
"dynamic" "innovative" "proud" "reliable" "sincere" "strong" "talented" "trustworthy" "vivid" "zealous"
)
participles=(
"abandoning" "absorbing" "accelerating" "achieving" "acquiring" "admiring" "advising" "agreeing"
"allowing" "analyzing" "appearing" "applying" "arguing" "assembling" "assisting" "attracting"
"believing" "browsing" "calculating" "calling" "caring" "celebrating" "cleaning" "climbing"
"coaching" "collecting" "combining" "communicating" "competing" "confessing" "considering"
"cooking" "correcting" "creating" "debating" "defining" "delivering" "designing" "discussing"
"driving" "enjoying" "exploring" "feeling" "finishing" "fixing" "forming" "gathering" "growing"
"guiding" "happening" "helping" "hoping" "improving" "increasing" "influencing" "involving"
"learning" "leading" "looking" "managing" "measuring" "moving" "noticing" "observing" "offering"
"organizing" "performing" "preparing" "presenting" "producing" "protecting" "questioning"
"recommending" "recovering" "running" "saving" "searching" "seeing" "sharing" "solving"
"starting" "studying" "succeeding" "supporting" "teaching" "thinking" "understanding" "using"
"validating" "waiting" "working" "writing"
)
animals=(
"antelope" "baboon" "badger" "bat" "bear" "beaver" "bison" "booby" "buffalo" "bull"
"camel" "cat" "cheetah" "chicken" "chimpanzee" "clam" "cobra" "cougar" "cow" "crab"
"crane" "crocodile" "crow" "deer" "dog" "dolphin" "dove" "duck" "eagle" "echidna"
"eel" "elephant" "emu" "falcon" "ferret" "fish" "flamingo" "fox" "frog" "gazelle"
"giraffe" "goat" "goose" "gorilla" "hare" "hawk" "hedgehog" "hippo" "horse" "hyena"
"iguana" "impala" "jaguar" "kangaroo" "koala" "lion" "llama" "lobster" "lynx" "macaw"
"manatee" "mole" "monkey" "moose" "mouse" "mule" "octopus" "okapi" "opossum" "ostrich"
"otter" "owl" "panda" "panther" "parrot" "penguin" "pig" "platypus" "porcupine" "quail"
"rabbit" "rat" "raven" "reindeer" "rhinoceros" "robin" "salmon" "seal" "shark" "sheep"
"shrimp" "skunk" "sloth" "snail" "snake" "sparrow" "spider" "squid" "squirrel" "starling"
"stingray" "swan" "tapir" "tiger" "toad" "toucan" "turtle" "vulture" "walrus" "wolverine"
"wolf" "wombat" "zebra"
)
arr+=(
"$(shuf -n1 -e "${adjectives[@]}")"
"$(shuf -n1 -e "${participles[@]}")"
"$(shuf -n1 -e "${animals[@]}")"
)
printf "%s_" "${arr[@]}" | sed 's/_$//'
}
# @description Backup one or more files to an incremented .bk file
# @exitcode backup iterator max 255
# @internal
backup() {
debug "Running: ${FUNCNAME[0]} $*"
for f in "$@"; do
[[ -e $f ]] || continue
count=1
while [[ -f $f.bk.$count ]]; do
count=$((count + 1))
done
echo "Backing up $f to $f.bk.$count"
debug "rsync -a $f $f.bk.$count"
rsync -a "$f" "$f.bk.$count"
done
}
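# Example: repeated runs create incrementing backups (illustrative file name):
#   backup StudyInfo.csv   # creates StudyInfo.csv.bk.1, then StudyInfo.csv.bk.2, ...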
# @description Prints a helpful message at program start
# @internal
interactive_header() {
debug "Running: ${FUNCNAME[0]}"
cat <<-'EOF'
_ _ _ _ _
| | | | | | | | | |
| |__| | __ _ _ __| |_ _ __ ___ __ _ _ __ | | __ _| |__
| __ |/ _` | `__| __| `_ ` _ \ / _` | `_ \ | | / _` | `_ \
| | | | (_| | | | |_| | | | | | (_| | | | | | |___| (_| | |_) |
|_| |_|\__,_|_| \__|_| |_| |_|\__,_|_| |_| |______\__,_|_.__/
___ _ _ _____ ____ ____
/ _ \| | | |_ _/ ___| _ \
| | | | |_| | | || | | |_) |
| |_| | _ | | || |___| __/
\__\_|_| |_| |_| \____|_|
EOF
echo ""
echo "Using scans directory: $SCANS_DIR"
echo "Using output directory: $OUT_DIR"
echo "Change the SCANS_DIR or OUT_DIR environment variable(s) to override"
echo "Example: SCANS_DIR=/path/to/scans OUT_DIR=/path/to/out ./qhtcp-workflow $*"
echo ""
# Ask to make our custom R library the default
if [[ -z $R_LIBS_USER || $R_LIBS_USER != "$HOME/R/$SCRIPT_NAME" ]]; then
echo "This script uses a local R library at $HOME/R/$SCRIPT_NAME"
echo "You can install the R dependencies to this library using the install_dependencies module"
if ((YES)) || ask "Would you like to make this R library the default for your user?"; then
line="export R_LIBS_USER=$HOME/R/$SCRIPT_NAME"
if ! grep -qF "$line" ~/.bashrc; then
echo "Adding $line to your .bashrc"
echo "If you use a different shell, update your R_LIBS_USER environment variable accordingly"
echo "$line" >> ~/.bashrc
fi
fi
declare -gx R_LIBS_USER="$HOME/R/$SCRIPT_NAME"
else
debug "R_LIBS_USER already set to $HOME/R/$SCRIPT_NAME"
fi
echo ""
echo "Available Modules:"
for i in "${!ALL_MODULES[@]}"; do
printf "%d. %s\n" "$((i+1))" "${ALL_MODULES[i]}"
done
echo ""
echo "Available Submodules:"
for i in "${!ALL_SUBMODULES[@]}"; do
printf "%d. %s\n" "$((i+1))" "${ALL_SUBMODULES[i]}"
done
echo ""
# Gather and list projects from SCANS_DIR
shopt -s nullglob
projects=("$SCANS_DIR"/*/)
shopt -u nullglob
if [[ ${#projects[@]} -eq 0 ]]; then
echo "No projects found in $SCANS_DIR"
ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}")
else
echo "Available Projects:"
projects=("${projects[@]%/}") # strip comma first!
projects=("${projects[@]##*/}")
for i in "${!projects[@]}"; do
printf "%d. %s\n" "$((i+1))" "${projects[i]}"
done
fi
echo ""
# Let user choose or add project(s)
if [[ -z ${PROJECTS[*]} ]]; then
num=${#projects[@]}
if [[ $num -eq 0 ]]; then
ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}")
else
echo "Enter a comma-separated list of project numbers to analyze"
((YES)) || read -r -p "Enter \"new\" to add a new project or hit Enter for default ($num): " response
if [[ $response == "new" ]]; then
ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}")
else
[[ -z $response ]] && response=$num
IFS=',' read -ra arr <<< "$response"
for i in "${arr[@]}"; do
PROJECTS+=("${projects[$((i-1))]}")
done
fi
fi
unset response arr i
fi
# Sanitize project names
for i in "${!PROJECTS[@]}"; do
if ! sanitize_pn "${PROJECTS[i]}"; then
echo "Project name ${PROJECTS[i]} is invalid"
ask_pn && unset "PROJECTS[i]" && PROJECTS+=("${ADD_PROJECTS[@]}")
fi
done
if [[ -z ${MODULES[*]} && -z ${EXCLUDE_MODULES[*]} ]]; then
echo "Enter a comma-separated list of modules to run"
((YES)) || read -r -p "Hit Enter for all (default) or '0' for none: " response
if [[ -z $response ]]; then
MODULES=("${ALL_MODULES[@]}")
elif [[ $response -eq 0 ]]; then
EXCLUDE_MODULES=("${ALL_MODULES[@]}")
else
IFS=',' read -ra arr <<< "$response"
for i in "${arr[@]}"; do
MODULES+=("${ALL_MODULES[$((i-1))]}")
done
fi
unset response arr i
fi
if [[ -z ${MODULES[*]} && -z ${EXCLUDE_MODULES[*]} && -z ${SUBMODULES[*]} ]]; then
while :; do
echo "Enter a submodule followed by its arguments as a comma-separated string"
echo "Quote your string if there are any whitespaces"
echo "Example: ${ALL_SUBMODULES[0]} \"arg1,arg2,arg3...\""
((YES)) || read -r -p "Or hit Enter to continue: " response
[[ -z $response ]] && break
IFS=' ' read -ra arr <<< "$response"
if [[ ! ${#arr[@]} -eq 2 ]]; then
err "The second argument is required and may be an empty string, \"\""
else
SUBMODULES+=("${arr[@]}")
fi
unset response arr i
done
fi
# cat <<-EOF
# Available modules: ${ALL_MODULES[*]}
# Available submodules: ${ALL_SUBMODULES[*]}
# EOF
}
module install_dependencies
# @description This module will automatically install the dependencies for running QHTCP.
#
# If you wish to install them manually, you can use the following information to do so:
#
# #### System dependencies
#
# * R
# * Perl
# * Java
# * MATLAB
#
# #### MacOS
#
# * `export HOMEBREW_BREW_GIT_REMOTE=https://github.com/Homebrew/brew`
# * `/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"`
# * `cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder`
# * `brew install graphviz gd pdftk-java pandoc shdoc nano rsync coreutils`
#
# #### Linux DEB
#
# * `apt install graphviz pandoc pdftk-java libgd-dev perl shdoc nano rsync coreutils libcurl-dev openssl-dev`
#
# #### Linux RPM
#
# * `dnf install graphviz pandoc pdftk-java gd-devel perl-CPAN shdoc nano rsync coreutils libcurl-devel openssl-devel`
#
# #### Perl
#
# * `cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder`
#
# #### R
#
# * `install.packages(c('BiocManager', 'ontologyIndex', 'ggrepel', 'tidyverse', 'sos', 'openxlsx', 'ggplot2', 'plyr', 'extrafont', 'gridExtra', 'gplots', 'stringr', 'plotly', 'ggthemes', 'pandoc', 'rmarkdown', 'htmlwidgets'), dep=TRUE)`
# * `BiocManager::install('UCSC.utils')`
# * `BiocManager::install('org.Sc.sgd.db')`
#
#
install_dependencies() {
debug "Running: ${FUNCNAME[0]} $*"
# Dependency arrays
depends_rpm=(graphviz pandoc pdftk-java gd-devel perl-CPAN shdoc nano rsync coreutils libcurl-devel openssl-devel)
depends_deb=(graphviz pandoc pdftk-java libgd-dev perl shdoc nano rsync coreutils libcurl-dev openssl-dev)
depends_brew=(graphviz pandoc gd pdftk-java shdoc nano perl rsync coreutils)
depends_perl=(File::Map Sub::Uplevel ExtUtils::Config ExtUtils::PkgConfig Module::Build::Tiny IPC::Run GD GO::TermFinder)
depends_r=(BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2
plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown
htmlwidgets)
depends_bioc=(UCSC.utils org.Sc.sgd.db)
[[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars
# Install system-wide dependencies
echo "Installing system dependencies"
echo "You may be prompted for your sudo password to install packages using your system package manager"
case "$(uname -s)" in
Linux*|CYGWIN*|MINGW*)
if hash dnf &>/dev/null; then
ask "Detected Linux RPM platform, continue?" || return 1
sudo dnf install "${depends_rpm[@]}"
elif hash apt &>/dev/null; then
ask "Detected Linux DEB platform, continue?" || return 1
sudo apt install "${depends_deb[@]}"
else
echo "Sorry, your Linux platform is not supported for automatic dependency installation"
echo "You will need to resolve dependencies manually"
fi
;;
Darwin*)
ask "Detected Mac platform, continue?" || return 1
export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh|bash
brew install "${depends_brew[@]}"
;;
*)
echo "Your system could not be detected, please install dependencies manually"
;;
esac
# Install perl CPAN modules
echo "Installing perl CPAN modules"
echo "It is recommended to use the local::lib perl library if prompted"
debug "cpan -I -i ${depends_perl[*]}"
cpan -I -i "${depends_perl[@]}"
# Install R packages
echo "Installing R packages"
depends_r_str=""
depends_r_to_string() {
for d in "${depends_r[@]}"; do
depends_r_str+="$d\", \""
done
depends_r_str="${depends_r_str::-3}" # strip last , " (comma and quote)
}
depends_r_to_string
# Install R packages
for d in "${depends_r[@]}"; do
debug "$RSCRIPT -e \"if (!require(\"$d\", quietly = TRUE)) {install.packages(\"$d\", dep=TRUE, lib=\"$R_LIBS_USER\", repos=\"https://cloud.r-project.org\")}\""
"$RSCRIPT" -e "if (!require(\"$d\", quietly = TRUE)) {install.packages(\"$d\", dep=TRUE, lib=\"$R_LIBS_USER\", repos=\"https://cloud.r-project.org\")}"
done
# Install Bioc packages
for d in "${depends_bioc[@]}"; do
debug "$RSCRIPT -e \"BiocManager::install(\"$d\", lib=\"$R_LIBS_USER\")\""
"$RSCRIPT" -e "BiocManager::install(\"$d\", lib=\"$R_LIBS_USER\")"
done
hash "$MATLAB" &>/dev/null || echo "You will also need MATLAB installed for GUI modules"
}
module init_project
# @description This function creates and initializes project directories
#
# This module:
#
# * Initializes a project directory in the scans directory
#
# TODO
#
# * Copy over source image directories from robot - are these also named by the ExpJobs name?
# * Suggest renaming ExpJobs to something like "scans" or "images"
# * MasterPlate_ file **should not** be an xlsx file, no portability
#
# NOTES
#
# * Copy over the images from the robot and then DO NOT TOUCH that directory except to copy from it
# * Write-protect (read-only) if we need to
# * Copy data from scans/images directory to the project working dir and then begin analysis
# * You may think...but doesn't that 2x data?
# * No, btrfs subvolume uses reflinks, only data that is altered will be duplicated
# * Most of the data are static images that are not written to, so the data is deduplicated
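#
# For example, on btrfs a reflink copy shares file extents with the source
# until a file is modified (illustrative paths; cp is GNU coreutils):
#
#     cp -r --reflink=auto scans/20240115_jdoe_exp project/images
#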
#
init_project() {
debug "Running: ${FUNCNAME[0]}"
# We handle this in main() and pushd to it
# But do it one more time in case this is run as a module
ask "(Re)-Initialize a project at $PROJECT_SCANS_DIR?" || return 1
[[ -d $PROJECT_SCANS_DIR ]] || mkdir -p "$PROJECT_SCANS_DIR" || return 1
# Create the study info file
if [[ ! -f $STUDY_INFO_FILE ]]; then
echo "Create a default study info file?";
if ask "You can edit this file in the qhtcp module"; then
cat <<-EOF > "$STUDY_INFO_FILE"
"ExpNumb","ExpLabel","BackgroundSD","ZscoreJoinSD","AnalysisBy"
EOF
fi
fi
# Write skeleton files in csv
# If we have to convert to xlsx later, so be it
echo "In the future we will copy the DrugMedia file from robot here"
# cat <<-EOF > "$DRUG_MEDIA_FILE"
# EOF
# TODO here we'll copy scans from robot but for now let's pause and wait for transfer
echo "In the future we will copy the MasterPlate file from robot here"
# cat <<-EOF > "$MASTER_PLATE_FILE"
# EOF
# TODO here we'll copy scans from robot but for now let's pause and wait for transfer
echo "In the future we will copy scans from robot here"
read -r -p "Hit <Enter> to continue: "
}
module easy
# @description
# Run the EASY matlab program
#
# TODO
#
# * Don't create output in the scans folder, put it in an output directory
# * The !!Results output files need standardized naming
# * The input MasterPlate and DrugMedia sheets need to be converted to something standard like csv/tsv
# * This would allow them to be created programmatically as well
#
# NOTES
#
# * I've modularized EASY to fit into this workflow but there may be things broken (especially in "stand-alone" mode)
# * The scans/images and 'MasterPlateFiles' folder are the inputs for image analysis with EASY software.
# * EASY will automatically generate a 'Results' directory (within the ExpJobs/'ExperimentJob' folder) w/ timestamp and an optional short description provided by the user (Fig.2).
# * The 'Results' directory is created and entered, using the "File >> New Experiment" dropdown in EASY.
# * Multiple 'Results' files may be created (and uniquely named) within an 'ExperimentJob' folder.
#
# INSTRUCTIONS
#
# * This program should handle the relevant directory and file creation and load the correct project into EASY
#
# #### Pin-tool mapping
#
# * Select at least two images from your experiment (or another experiment) to place in a 'PTmapFiles' folder.
# * Sometimes an experiment doesn't have a complete set of quality spots for producing a pin tool map that will be used to start the spot search process.
# * In this case the folder for Master Plate 1 (MP 1) is almost good but has a slight problem.
# * At P13 the spot is extended. We might use this one but would like to include others that are more centered if possible.
# * The other plates with higher drug concentrations could be used, but since a recent experiment has a better reference plate image, we will add it to the set of images to produce the pin tool map.
#
# ![Bad Pin Map](docs/imgs/easy/1-bad-pin-map.png "See the problem in Column 1, Row 13?")
#
# * We will find a good image from another experiment
#
# ![Good Pin Map](docs/imgs/easy/2-good-pin-map.png "Nice pin map")
#
# * We now have some images to generate a composite centered map for EASY to search and find the nucleation area of each spot as it forms.
# * Click the Run menu tab.
# * A drop down list of options is presented.
# * Click the first item → [Plate Map Pintool].
#
# ![Open PTmaps](docs/imgs/easy/3-open-ptmaps-dir.png "Where to find PTmaps")
#
# * Open the PTmapFiles folder.
# * Then click on the .bmp files you wish to include to make the pin tool map.
# * Click the Open button.
#
# ![Open BMPs](docs/imgs/easy/4-select-bmps.png "Open BMPs")
#
# * A warning dialog box may appear.
# * This is nothing to be concerned about.
# * Click OK and continue.
#
# ![Direct Map](docs/imgs/easy/5-direct-map.png "Map view, very pretty")
#
# * 'Retry' takes you back so that you can select different .bmp files from which to create the map.
# * In this case the spots from the two images are well aligned and give coverage to all the spots, so we do not have to add new images.
# * Remember, this map is just a good guess as to where to start looking for each spot not where it will focus to capture intensities.
# * Click 'Open' again.
#
# ![Open BMPs](docs/imgs/easy/4-select-bmps.png "Open BMPs")
#
# * We can now shift these values to get a better 'hard' start for this search.
# * Maybe we can move this search point to the left a bit by decreasing the 'Initial Col Position' slightly to 120 and clicking continue.
#
# ![Open BMPs](docs/imgs/easy/6-shift-values.png "Open BMPs")
#
# * Even though the first result image using 126 may have given a good map, we will use the improved second initiation point by clicking the 'Continue/Finish' button.
#
# ![Shifted map](docs/imgs/easy/7-shifted-map.png "See the dots are redder and centered")
#
# * Note that the red “hot” spot is now well centered in each composite image spot.
# * We can now click 'Continue / Finish' to proceed.
# * The coordinates and parameters will be stored in the results folder 'PTmats'.
# * This is where the .mat files containing localization data for the next stage of the work are stored.
# * The EASY GUI will come back.
# * Now click 'Run' → 'Image CurveFit ComboAnalysis'.
# * This will perform the image quantification and then generate the curve fits for each plate selected.
# * Typically we pick only one plate at a time for one or two master plates.
# * The software will present the final image of the search if only 1 master plate is selected.
# * If multiple plates are selected, no search images will be presented or stored as figures.
# * However all the position data for every spot at every time point will be stored.
# * This large data trove is used by EZview to produce the image click-on hot spot maps and the photo strips.
#
# ![Curve analysis](docs/imgs/easy/8-curve-analysis.png "Curve analysis")
#
# * Note the 'Select Files' dialog.
# * It allows the user to select the specific .bmp files to use.
# * This can be useful if there are bad ones that need to be removed from the folder due to contamination.
# * If all are good we can select them all and click 'Open' to run the process.
# * There are other parameters that can be selected.
# * For now we will continue and come back to those later.
#
# ![Select all BMPs](docs/imgs/easy/9-select-all-dialog.png "Select all BMPs")
#
# ![Contamination view](docs/imgs/easy/10-contamination-view.png "Contamination view")
#
# * The search focus of each spot at the end of the run is presented for examination
# * Notice that these have floated and locked in to a position determined on the fly, at the point where the initial growth has reached maturity.
# * This prevents a late-onset jump to a contamination site.
# * If we found that we need to adjust our pin tool map or make other modifications, we can do that and rerun these single runs until we are satisfied.
#
# ![Search focus](docs/imgs/easy/11-search-focus.png "Search focus")
#
# * Next we will run the entire experiment by clicking on all the other master plates from the list box.
#
# ![Run experiment](docs/imgs/easy/12-run-experiment.png "Run experiment")
#
# * Depending on the number of master plates and the number of time point images taken for each, this next step can take a while.
# * Click continue and do something else while the computer works.
# * When the work is complete the EASY GUI will reappear without the master plate list.
# * Now look in the /Results* /PrintResults folder to check that all the plates have run and produced data.
#
# ![Experiment complete](docs/imgs/easy/13-complete-experiment.png "Complete experiment")
#
# * This is a legacy print copy of data, but is still useful to check that all the quantification was completed successfully.
#
# ![Check results](docs/imgs/easy/14-check-results.png "Check results")
#
# #### Generate Reports
#
# * Generate a MPDM.mat file from the Excel master plate and drug media sheets that the user prepared as part of the experiment preparation.
# * These sheets must conform to certain format rules.
# * It is best when creating these to use a working copy as a template and replace the data with that of the current experiment.
# * See Master Plate and Drug-Media Plate topic for details.
# * Click on the 'GenReports' menu tab; a drop-down menu is presented. Click the first item, 'DrugMediaMP Generate .mat'.
# * This will take you to the /MasterPlateFiles folder within the experiment currently being analyzed.
# * Do as the dialog box instructs. Select the Master Plate Excel file first.
# * Important note: These files (both for master plates and drug-medias) must be generated or converted to the Excel 95 version to be read in Linux.
# * This can be done on either a Windows or an Apple machine running Excel.
#
#
# ![Navigate to MasterPlateFiles](docs/imgs/easy/15-generate-mpdm-mat1.png "Navigate to MasterPlateFiles")
#
# ![Create a new MPDM.mat file](docs/imgs/easy/16-generate-mpdm-mat2.png "Create a new MPDM.mat file")
#
# ![Navigate to MasterPlateFiles](docs/imgs/easy/17-generate-mpdm-mat3.png "Navigate to MasterPlateFiles")
#
# ![Click OK](docs/imgs/easy/18-generate-mpdm-mat4.png "Click OK")
#
# * A message dialog pops up.
# * Click 'OK'.
#
# ![Navigate to MasterPlateFiles](docs/imgs/easy/19-generate-mpdm-mat5.png "Navigate to MasterPlateFiles")
#
# * Next click on the 'GenReports' menu tab and the second item in the drop down list 'ResultsDB Generate'.
#
# ![Generate Reports](docs/imgs/easy/20-gen-reports.png "Generate Reports")
#
# * A dialog box with options appears.
# * The default is 'Both'.
# * 'Res' produces only a standard result sheet in the current experiment's /Results*/PrintResults folder.
# * 'DB' produces only a special file for convenient upload to databases.
# * This file has no blank rows separating the plates and combines the raw data for each line item into a 'blob' as this is a convenient way to store data of variant lengths in a single database field.
# * The concatenation of data for each row takes a while but is useful for uploading data.
# * Typically 'Both' is the preferred option, however, if one needs to review the results quickly, this provides that option.
#
# * We can open the !!Results MI 16_0919 yor1-1 copy.txt text file using LibreOffice to review the results.
#
# ![Results file](docs/imgs/easy/21-results-file.png "Results file")
#
# * We can do likewise with the !!Dbase_MI 16_0919_yor1-2 copy.txt text file.
#
# ![Db file](docs/imgs/easy/22-dbase-file.png "Db file")
#
# * Note that there are no headers or empty rows.
# * Since LibreOffice may corrupt the text files, it is advisable to only read them and refrain from any 'Save' options presented.
#
# #### Master Plate and Drug Media Spreadsheets
#
# * The Master Plate and Drug-Media spreadsheets correlate the collected and calculated data with the definitions of the cultures, drugs, and media involved in producing the experimental data.
# * These spreadsheets have a very specific format that was established at the beginning of our work.
# * To maintain compatibility over the years, we maintain that format.
# * To begin with, our system can be used with Windows, Linux, and Apple operating systems.
# * To accommodate these OSes, the spreadsheets must use the older Excel 95 format, which is cross-compatible with MATLAB on all three major OSes.
# * Windows is more tolerant, but to avoid problems producing results reports, ALWAYS use the Excel 95 format for your spreadsheets.
# * Do not remove any header rows. They can be modified, with the exception of the triple hash (###).
# * Do not change the number or order of the columns.
# * Place a 'space' in unused or empty spreadsheet cells.
# * Empty cells can cause problems for some software utilities.
# * It is best to make this a standard practice.
# * Avoid using special characters.
# * Depending on the OS and software utility (especially database utilities), these can be problematic.
# * Certain 'date' associated entries such as Oct1 or OCT1 will be interpreted by Excel as a date and automatically formatted as such.
# * Do not use Oct1 (which is a yeast gene name); use Oct1_ or its ORF name instead.
# * When creating a Master Plate spreadsheet, it is best to start with a working spreadsheet template and adjust it to your descriptive data.
# * Be sure the ### mark is always present in the first column of the header row for each plate.
# * This is an important convention, as it is used to define a new plate's set of entry data.
# * Each plate is expected to have 384 rows of data correlated with the 384 wells of the source master plates.
# * These have a particular order, proceeding through all 24 columns of each row before moving to the next row.
# * Gene names and ORF name entries should be as short as possible (4-5 character max if possible) as these are used repeatedly as part of concatenated descriptors.
# * The 'Replicate' field and the 'Specifics' fields can be used for additional information.
# * The 'Replicate' field was originally designed to allow the user to sort replicates but it can be used for other relevant information.
# * The 'Specifics' field was created to handle special cases where the liquid media in which cultures were grown on a single source plate was selectively varied.
# * This gives the researcher a way to sort by growth media as well as gene or ORF name.
# * It can also be used to sort other properties instead of modifying the gene name field.
# * Thoughtful experiment design and layout are important for the successful analysis of the resultant data.
# * It is typically a good idea to create at least one reference full plate and make that plate the first source master plate.
# * Typically we give those reference cultures the 'Gene Name' RF1.
# * Traditionally we also made a second full reference plate with its cultures labeled RF2.
# * More recently some researchers have gone to dispersing RF1 control reference cultures throughout the source master plates series in addition to the first full source master plate.
# * The EZview software has been updated accordingly to find these references and perform associated calculations.
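#
# A structural sketch of a MasterPlate_ sheet (contents illustrative; follow a working template for the exact columns):
#
#     ###  <plate 1 header row; '###' must appear in the first column>
#     <384 data rows, one per well, ordered across all 24 columns of each row>
#     ###  <plate 2 header row>
#     <384 data rows>
#     ...
#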
#
# ![Master Plate file](docs/imgs/easy/23-mp-file.png "Master Plate file")
#
# * There are a number of fields on the spreadsheet which in this case were left empty.
# * This spreadsheet format was created originally with studies of whole yeast genome SGA modifications incorporated.
# * Therefore all fields may not be relevant.
# * However, when ever relevant it is strongly advised to fill in all the appropriate data.
# * The Drug-Media spreadsheet defines the perturbation components of each type of agar plate that the source master plates are printed on.
# * Again, format adherence is essential.
# * There is a '1' in the first column, second row (cell A2).
# * This is a legacy convention going back to early use.
# * It is still necessary and should not be deleted.
# * The header row must not be deleted.
# * A triple hash (###) must be placed in the cell below the last entry in the Drug field (Column 2).
# * Again, insert a 'space' in each unused or empty cell in each field.
# * Again, avoid special characters, which may cause problems if not during experiment quantification then in subsequent analysis utilities.
# * A utility looking for a text field may end up reading a null and respond inappropriately.
# * As with the master plate Excel sheet, it is a good idea to use a working copy of an existing Drug-Media spreadsheet and adapt it to one's needs.
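#
# A structural sketch of a Drug-Media sheet (contents illustrative; adapt a working copy for the exact layout):
#
#     <header row; do not delete>
#     1  <first drug/media entry>     (the '1' in cell A2 is a required legacy marker)
#     <additional drug/media entries>
#        ###                          (triple hash in the cell below the last Drug entry, Column 2)
#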
#
# ![Drug Media file](docs/imgs/easy/24-dm-file.png "Drug Media file")
#
#
# To analyze a new Q-HTCP experiment:
#
# * Open the EASY Software.
# * Open 'EASYconsole.m' with MATLAB
# * Click the Run icon (play button)
# * When prompted, click "Change Folder" (do not select "Add to Path").
# * In the pop-up display, select from the 'File' dropdown: 'New Experiment'.
# * From the pop-up, choose where to save the new file.
# * Navigate to the relevant job in the ExpJobs folder, name the file accordingly, and click 'save'.
# * The newly created .mat file in the newly created Results folder will automatically be loaded.
# * The file name will then be automatically appended by the code with the current date information (e.g. 'A1.mat' will become 'Results2023-07-19A1')
# * If the experiment has already been created, it can be reloaded by clicking 'Load Experiment' instead of 'New Experiment' and selecting the relevant results
# * In the pop-up display, click on the 'Run' dropdown menu and select 'Image CurveFit ComboAnalysis'.
# * In the updated pop-up, choose/highlight all desired image folders for analysis (this is generally all of the folders, since only the ones that need analysis should be there) and then click on 'continue'.
# * As the program is running, updates will periodically appear in the Command Window; there will be an initial pause at "Before call to NIscanIntens…..".
# * When the curve fitting is finished, the EASY console will pop back up.
# * Check to see the completed analysis results in the newly created 'PrintResults' Folder, inside of the 'Results' Folder.
# * Other folders ('CFfigs', 'figs', 'Fotos') are created for later optional use and will be empty.
# * **NOTE:** The image analysis is completed independent of labeling the data (strains, media type, etc.). Labeling happens next with the 'GenReports' function.
# * Click on the 'GenReports' dropdown and select 'DrugMediaMP Generate .mat'
# * **NOTE:** The 'MasterPlate' and 'DrugMedia' files have very specific formats and should be completed from a template.
# * The Masterplate file must be exact (it must contain all and only the strains that were actually tested).
# * For example, if only part of a library is tested, the complete library file must be modified to remove irrelevant strains.
# * You will be prompted to first select the 'MasterPlate' file. You will need to navigate away from the working directory to get to it.
# * It is fine for the 'MasterPlate_' file to be .xlsx (or .xls), and if you don't see it in the popup window, then change the file type from '.xls' to "all files" and then select it.
# * Once it is selected, a report of the number of master plates in the file will pop up; when the report appears, assuming it is correct, click on 'OK'.
# * You will then be prompted to select the 'DrugMedia' file from the relevant job folder. You will automatically return to the correct prior directory location.
# * Choose it and click 'OK'. You may see a warning about column headers being modified, but that's ok.
# * This will create an additional file in the 'MasterPlatesFiles' folder named 'MPDMmat.mat'
# * Click on the 'GenReports' dropdown and select 'Results_Generate.'
# * You will first see '!!ResultsElr_.txt' generated in the 'PrintResults' folder.
# * Refreshing will reveal an increasing file size until you see the '!!ResultsStd_.txt' being generated.
# * When finished, the '!!ResultsStd_.txt' will be about the same file size and it should be used in the following StudiesQHTCP analysis.
# * 'NoGrowth_.txt' and 'GrowthOnly_.txt' files will be generated in the 'PrintResults' folder.
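#
# A sketch of the expected Results layout when generation completes (names abbreviated):
#
#     Results<timestamp><description>/
#     └── PrintResults/
#         ├── !!ResultsStd_.txt    (use this file for the StudiesQHTCP analysis)
#         ├── !!ResultsElr_.txt
#         ├── NoGrowth_.txt
#         └── GrowthOnly_.txt
#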
#
#
#
#
# Issues:
# * We need full documentation for all of the current workflow. There are different documents that need to be integrated. This will need to be updated as we make improvements to the system.
# * MasterPlate_ file must have ydl227c in the orf column, or else Z_interaction.R will fail because it can't calculate shift values.
# * Make sure there are no special characters; e.g., (), “, ', ?, etc.; dash and underscore are ok as delimiters
# * Drug_Media_ file must have a letter character to be read as 'text'.
# * MasterPlate_ file and DrugMedia_ are .xlsx or .xls, but !!Results_ is .txt.
# * In Z_interactions.R, does it require a zero concentration/perturbation (should we use zero for the low conc, even if it's not zero), e.g., in order to do the shift correctly.
# * Need to enable all file types (not only .xls) as the default for GenerateResults (to select MP and DM files as .xlsx).
# * Explore differences between the ELR and STD files - 24_0414; John R modified Z script to format ELR file for Z_interactions.R analysis.
# * To keep time stamps when transferring with FileZilla, go to the transfer drop down and turn it on, see https://filezillapro.com/docs/v3/advanced/preserve-timestamps/
# * Could we change the 'MasterPlateFiles' folder label in EASY to 'MasterPlate_DrugMedia' (since there should be only one MP and there is also a DM file required)?
# * I was also thinking of adding a 'MasterPlateFilesOnly' folder to the QHTCP directory template where one could house different MPFiles (e.g., with and without damps, with and without Refs on all MPs, etc; other custom MPFiles, updated versions, etc)
# * Currently updated files are in '23_1011_NewUpdatedMasterPlate_Files' on Mac (yeast strains/23_0914…/)
# * Enable EASY to report cell array positions (plate_row_column) to facilitate analyzing plate artifacts. The MP File in Col 3 is called 'LibraryLocation' and is reported after 'Specifics' in the !!Results.
# * Can EASY/StudiesQ-HTCP be updated at any time by rerunning with updated MP file (new information for gene, desc, etc)- or maybe better to always start with a new template?
# * Need to be aware of file formatting to avoid dates (e.g., with gene names like MAY24, OCT1, etc, and with plate locations 1E1, 1E2, etc)- this has been less of a problem.
# * In StudiesQHTCP folders, remember to annotate Exp1, Exp2, in the StudyInfo.csv file.
# * Where are gene names called from for labeling REMc heatmaps, TSHeatmaps, Z-interaction graphs, etc? Is this file in the QHTCP 'code' folder, or is it in the results file (and thus ultimately the MP file)?
# * Is it ok for a MasterPlate_ file to have multiple sheets (e.g., readme tab- is only the first tab read in)?
# * What are the rules for pulling information from the MasterPlateFile to the !!Results_ (e.g., is it the column or the Header Name, etc that is searched? Particular cells in the DrugMedia file?).
# * Modifier, Conc are from the DM sheet, and refer to the agar media arrays. OrfRep is from the MasterPlate_ file. 'Specifics' (last column) is experiment specific and accommodates designs involving differences across the multi-well liquid arrays. 'StrainBkGrd' (now 'LibraryLocation') is in the 3rd column and reported after 'Specifics' at the last col of the '!!Results..' file.
# * Do we have / could we make an indicator- work in progress or idle/complete with MP/DM and after gen-report. Now, we can check for the MPDMmat.mat file, or we can look in PrintResults, but would be nice to know without looking there.
# * File>>Load Experiment wasn't working (no popup to redirect). Check this again.
easy() {
debug "Running: ${FUNCNAME[0]}"
cat <<-EOF
To analyze a new Q-HTCP experiment:
* Open the EASY Software.
* Open 'EASYconsole.m' with MATLAB
* Click the Run icon (play button)
* When prompted, click "Change Folder" (do not select "Add to Path").
* In the pop-up display, select from the 'File' dropdown: 'New Experiment'.
* From the pop-up, choose where to save the new file.
* Navigate to the relevant job in the ExpJobs folder, name the file accordingly, and click 'save'.
* The newly created .mat file in the newly created Results folder will automatically be loaded.
* The file name will then be automatically appended by the code with the current date information (e.g. 'A1.mat' will become 'Results2023-07-19A1')
* If the experiment has already been created, it can be reloaded by clicking 'Load Experiment' instead of 'New Experiment' and selecting the relevant results
* Next, in the pop-up display, click on the 'Run' dropdown menu and select 'Image CurveFit ComboAnalysis'.
* In the updated pop-up, choose/highlight all desired image folders for analysis (this is generally all of the folders, since only the ones that need analysis should be there) and then click on 'continue'.
* As the program is running, updates will periodically appear in the Command Window; there will be an initial pause at "Before call to NIscanIntens…..".
* When the curve fitting is finished, the EASY console will pop back up.
* Check to see the completed analysis results in the newly created 'PrintResults' Folder, inside of the 'Results' Folder.
* Other folders ('CFfigs', 'figs', 'Fotos') are created for later optional use and will be empty.
* NOTE: The image analysis is completed independent of labeling the data (strains, media type, etc.). Labeling happens next with the 'GenReports' function.
* Next, click on the 'GenReports' dropdown and select 'DrugMediaMP Generate .mat'
* NOTE: The 'MasterPlate' and 'DrugMedia' files have very specific formats and should be completed from a template.
* The Masterplate file must be exact (it must contain all and only the strains that were actually tested).
* For example, if only part of a library is tested, the complete library file must be modified to remove irrelevant strains.
* You will be prompted to first select the 'MasterPlate' file. You will need to navigate away from the working directory to get to it.
* It is fine for the 'MasterPlate_' file to be .xlsx (or .xls), and if you don't see it in the popup window, then change the file type from '.xls' to "all files" and then select it.
* Once it is selected, a report of the number of master plates in the file will pop up; when the report appears, assuming it is correct, click on 'OK'.
* You will then be prompted to select the 'DrugMedia' file from the relevant job folder. You will automatically return to the correct prior directory location.
* Choose it and click 'OK'. You may see a warning about column headers being modified, but that's ok.
* This will create an additional file in the 'MasterPlatesFiles' folder named 'MPDMmat.mat'
* Click on the 'GenReports' dropdown and select 'Results_Generate.'
* You will first see '!!ResultsElr_.txt' generated in the 'PrintResults' folder.
* Refreshing will reveal an increasing file size until you see the '!!ResultsStd_.txt' being generated.
* When finished, the '!!ResultsStd_.txt' will be about the same file size and it should be used in the following StudiesQHTCP analysis.
* 'NoGrowth_.txt' and 'GrowthOnly_.txt' files will be generated in the 'PrintResults' folder.
EOF
declare -gx EASY_DIR="$APPS_DIR/matlab/easy"
declare -gx EASY_RESULTS_DIR="$EASY_OUT_DIR/$PROJECT_PREFIX"
script="$EASY_DIR/EASYconsole.m"
# Prompt user for suffix
echo "Using EASY results directory: $EASY_RESULTS_DIR"
((YES)) || read -r -p "Enter a custom suffix and/or hit enter to use the default directory (no suffix): " EASY_SUFFIX
[[ -n $EASY_SUFFIX ]] && EASY_RESULTS_DIR+="_$EASY_SUFFIX"
# This dirname is separate from the project's so multiple EASY results can be generated
declare -gx EASY_PROJECT_NAME="${EASY_RESULTS_DIR##*/}"
debug "EASY results project name: $EASY_PROJECT_NAME"
# Backup and create EASY results dirs
[[ -d $EASY_RESULTS_DIR ]] && backup "$EASY_RESULTS_DIR"
[[ -d $EASY_RESULTS_DIR ]] || mkdir -p "$EASY_RESULTS_DIR"
# Make EASY dirs
dirs=('PrintResults' 'CFfigs' 'Fotos')
for d in "${dirs[@]}"; do
if [[ ! -d $EASY_RESULTS_DIR/$d ]]; then
debug "mkdir $EASY_RESULTS_DIR/$d"
mkdir "$EASY_RESULTS_DIR/$d"
fi
done
# Copy Templates
declare -gx DRUG_MEDIA_FILE="$EASY_RESULTS_DIR/DrugMedia_$PROJECT.xls"
declare -gx MASTER_PLATE_FILE="$EASY_RESULTS_DIR/MasterPlate_$PROJECT.xls"
rsync -a "$EASY_DIR"/{figs,PTmats} "$EASY_RESULTS_DIR"
# Ask the user to launch EASYconsole.m in MATLAB
# MATLAB doesn't support passing args to scripts so we have to use ENV VARS instead
# TODO will need to play with the -sd startup option to see what works (well)
# Skip this step altogether in auto mode since it requires graphical interaction
if ! ((YES)) && ask "Start EASY in MATLAB? This requires a GUI."; then
# Add EASY directory to the Matlab path
# If this does not work we can try changing the -sd argument and if that fails then pushing/popping
debug "Adding EASY directory to the Matlab path"
"$MATLAB" -nodisplay -nosplash -nodesktop -nojvm -batch "addpath('$EASY_DIR')"
# Launch matlab
# matlab -nosplash -sd "$PROJECT_SCANS_DIR" -r "run $script"
"$MATLAB" -nosplash -r "run $script"
fi
}
module ezview
# @description TODO WIP
ezview() {
debug "Running: ${FUNCNAME[0]}"
declare -gx EZVIEW_DIR="$APPS_DIR/matlab/ezview"
script="$EZVIEW_DIR/EZviewGui.m"
if ! ((YES)) && ask "Start EASY in MATLAB? This requires a GUI."; then
# Make EZview dirs
# Start EZview
"$MATLAB" -nosplash -r "run $script"
fi
}
module qhtcp
# @description System for Multi-QHTCP-Experiment Gene Interaction Profiling Analysis
#
# * Functional rewrite of REMcMaster3.sh, RemcMaster2.sh, REMcJar2.sh, ExpFrontend.m, mProcess.sh, mFunction.sh, mComponent.sh
# * Added a newline character to the end of StudyInfo.csv so it is a valid text file
#
# TODO
#
# * Suggest renaming StudiesQHTCP to something like qhtcp, qhtcp_output, or output
# * Store StudyInfo somewhere better
# * Move (hide) the study template somewhere else
# * StudiesArchive should be smarter:
# * Create a database with as much information as possible
# * Write a function that easily loads and parses the database into easy-to-use variables
# * Allow users to reference those variables to write their own modules
# * Should not be using initials
# * not unique enough and we don't have that data easily on hand
# * usernames are unique and make more sense
# * I don't know what all would have to be modified atm
#
# Rerunning this module uses rsync --update to only copy files that are newer in the template
# If you wish for the template to overwrite your changes, delete the file from your QHTCP project dir
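#
# A sketch of the update behavior (template path illustrative):
#
#     rsync -a --update qhtcp-template/ "$QHTCP_PROJECT_DIR"/
#
# Files you have edited more recently than the template copy are left untouched;
# delete a file from the project directory to have it restored from the template.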
#
# To create a new study (Experiment Specific Interaction Zscores generation)
#
# * StudyInfo.csv instructions:
# * In your files directory, open the /Code folder, edit the 'StudyInfo.csv' spreadsheet, and save it as a 'csv' file to give each experiment the labels you wish to be used for the plots and specific files.
# * Enter the desired Experiment names; order the names in the way you want them to appear in the REMc heatmaps, and make sure to run the front end programs (below) in the correct order (e.g., run the front end in the 'exp1' folder to call the !!Results file for the experiment you named exp1 in the StudyInfo.csv file)
# * The GTA and pairwise, TSHeatmaps, JoinInteractions and GTF Heatmap scripts use this table to label results and heatmaps in a meaningful way for the user and others. The BackgroundSD and ZscoreJoinSD fields will be filled automatically according to user specifications, at a later step in the QHTCP study process.
#
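# An illustrative StudyInfo.csv (header as written by init_project; values are
# examples only; BackgroundSD and ZscoreJoinSD are filled automatically at a later step):
#
#     "ExpNumb","ExpLabel","BackgroundSD","ZscoreJoinSD","AnalysisBy"
#     1,"exp1_rapamycin_titration",,,"jdoe"
#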
# * MATLAB ExpFrontend.m was made for recording into a spreadsheet ('StudiesDataArchive.txt') the date and files used (i.e., directory paths to the !!Results files used as input for Z-interaction script) for each multi-experiment study.
#
# * Open MATLAB and in the application navigate to each specific /Exp folder, call and execute ExpFrontend.m by clicking the play icon.
# * Use the "Open file" function from within Matlab.
# * Do not double-click on the file from the directory.
# * When prompted, navigate to the ExpJobs folder and the PrintResults folder within the correct job folder.
# * Repeat this for every Exp# folder depending on how many experiments are being performed.
# * Note: Before doing this, it's a good idea to compare the ref and non-ref CPP average and median values. If they are not approximately equal, it may be helpful to standardize the Ref values to the measures of central tendency of the Non-refs, because the Ref CPPs are used for the z-scores, which should be centered around zero (see the sketch after this list).
# * This script will copy the chosen !!ResultsStd file (located in /PrintResults in the relevant job folder in /scans; rename this !!Results file before running the front end, and note that we normally use the 'STD' file, not the 'ELR' file) to the Exp# directory, as can be seen in the "Current Folder" column in MATLAB. It also updates the 'StudiesDataArchive.txt' file that resides in the /StudiesQHTCP folder; 'StudiesDataArchive.txt' is a log of the file paths used for different studies, including timestamps.
#
# Do this to document the names, dates and paths of all the studies and experiment data used in each study. Note, one should only have a single '!!Results…' file for each /Exp_ to prevent ambiguity and confusion. If you decide to use a new or different '!!Results…' sheet from what was used in a previous “QHTCP Study”, remove the one not being used. NOTE: if you copy a '!!Results…' file in by hand, it will not be recorded in the 'StudiesDataArchive.txt' file and so will not be documented for future reference. If you use the ExpFrontend.m utility it will append the new source for the raw !!Results… to the 'StudiesDataArchive.txt' file.
# As stated above, it is advantageous to think about the comparisons one wishes to make and to order the experiments in a rational way as it relates to the presentation of plots. That is, decide which results sheets and which selected, user-modified 'interaction … .R' script are used in /Exp1, Exp2, Exp3, and Exp4, as explained in the following section.
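#
# A minimal sketch of one such Ref standardization (multiplicative, assuming the median is the chosen measure of central tendency; the actual adjustment is a user decision made in the interaction R script):
#
#     ref_cpp_adjusted = ref_cpp * ( median(non_ref_cpps) / median(ref_cpps) )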
# TODO MUST CLEAN UP QHTCP TEMPLATE DIRECTORY
#
#
# As stated earlier, the user can add folders to back up temporary results, study-related notes, or other related work.
# However, it is advised to set up and use separate STUDIES when evaluating differing data sets whether that is from experiment results files or from differing data selections in the first interaction … .R script stage.
# This reduces confusion at the time of the study and especially for those reviewing study analysis in the future.
#
# How-To Procedure: Execute a Multi-experiment Study:
#
# * Consider the goals of the study and design a strategy of experiments to include in the study.
# * Consider the quality of the experiment runs using EZview to see if there are systematic problems that are readily detectable.
# * In some cases, one may wish to design a 'pilot' study for discovery purposes.
# * There is no problem doing that, just take a template study, copy and rename it as XYZpilotStudy etc.
# * However, careful examination of the experimental results using EZview will likely save time in the long run.
# * One may be able to run the interaction Z scores relatively quickly (the main challenge there is the user creation of customized interaction… .R code; I have tried to simplify this by locating the user edits near the top).
#
#
qhtcp() {
debug "Running: ${FUNCNAME[0]}"
if [[ -d $QHTCP_PROJECT_DIR ]]; then
# Handle existing output directory
echo "A project already exists at $QHTCP_PROJECT_DIR"
echo "Would you like to (c)ontinue, (u)pdate it from the template and continue, or (b)ack it up and continue from scratch?"
for i in {1..3}; do # give the user three chances to get it right
((YES)) || read -r -p "Hit [Enter] or c to continue: " response
[[ -z $response ]] && break
case $response in
u)
echo "Updating project from template"
echo "Only files that are newer in the template will be overwritten"
if rsync --archive --update "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_PROJECT_DIR"; then
echo "Project updated with template"
fi
;;
b)
backup "$QHTCP_PROJECT_DIR" && rm -rf "$QHTCP_PROJECT_DIR"
if rsync --archive --update "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_PROJECT_DIR"; then
echo "New project created at $QHTCP_PROJECT_DIR"
fi
;;
c)
break
;;
*)
err "Invalid response, please try again"
continue
;;
esac
break
done
fi
# Sets STUDIES_NUMS and NUM_STUDIES (yes this makes sense)
get_studies "$STUDY_INFO_FILE"
# Construct the next auto-entry
# 1,ExpName1,NA,NA,UserInitials
next_study_num=$(( NUM_STUDIES + 1 ))
# If the next Exp dir already exists don't use it
while [[ -d $QHTCP_PROJECT_DIR/Exp$next_study_num ]]; do
(( next_study_num++ ))
done
# Use initials from project not whoami
# Best I can do is first two letters of username
# See TODO in markdown
initials="${PROJECT_USER:0:2}"
INITIALS=${initials^^}
next_study_entry="$next_study_num,$PROJECT_SUFFIX,NA,NA,$INITIALS"
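    # e.g. next_study_entry="5,MyExpSuffix,NA,NA,BR" (hypothetical values)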
debug "$next_study_entry"
# Print current studies
[[ -f $STUDY_INFO_FILE ]] &&
echo "Current studies from $STUDY_INFO_FILE: " &&
cat "$STUDY_INFO_FILE"
# Ask user to edit STUDY_INFO_FILE
if ! ((YES)) && ask "Would you like to edit $STUDY_INFO_FILE to add or modify studies?"; then
cat <<-EOF
Give each experiment labels to be used for the plots and specific files.
Enter the desired Experiment names and order them in the way you want them to appear in the REMc heatmaps
Auto-entry suggestion: $next_study_entry
EOF
if ask "Would you like to add (y) the auto-entry suggestion to $STUDY_INFO_FILE or edit STUDY_INFO_FILE in nano (n)?"; then
echo "$next_study_entry" >> "$STUDY_INFO_FILE"
else
debug "nano $STUDY_INFO_FILE"
nano "$STUDY_INFO_FILE"
fi
fi
# Initialize missing dirs
STUDIES_DIRS=()
for s in "${STUDIES_NUMS[@]}"; do
STUDY_DIR="$QHTCP_PROJECT_DIR/Exp$s"
STUDIES_DIRS+=("$STUDY_DIR")
if ! [[ -d $STUDY_DIR ]]; then
            if ! rsync --archive "$STUDY_TEMPLATE_DIR/" "$STUDY_DIR"; then # trailing slash: copy contents, not the dir itself
err "Could not copy $STUDY_TEMPLATE_DIR template to $STUDY_DIR"
continue
fi
fi
done
unset STUDY_DIR
# Replacing ExpFrontend.m
choose_easy_results_dir
# Create studies archive file if missing
    if ! [[ -f $STUDIES_ARCHIVE_FILE ]]; then
        # 8 columns to match the 8 fields written per row below
        header=(StudyDate StudyName StudyPath ExpNum ExpDate ExpPath ResultsDir ResultFile)
        printf "%s\t" "${header[@]}" > "$STUDIES_ARCHIVE_FILE"
        printf "\n" >> "$STUDIES_ARCHIVE_FILE"
    fi
# TODO Add them all to StudiesDataArchive?
# Probably better to always add and remove dupes later since each invocation "counts"?
for f in "${EASY_RESULTS_FILES[@]}"; do
for s in "${STUDIES_NUMS[@]}"; do
# Trying to match old ExpFrontend formatting
printf "%s\t" \
"${DATE//_/}" "$PROJECT" "$QHTCP_PROJECT_DIR" "Exp$s" \
"$PROJECT_DATE" "$PROJECT_SCANS_DIR" "$EASY_RESULTS_DIR" "${f##*/}" \
>> "$STUDIES_ARCHIVE_FILE"
done
done
# Run R interactions script on all studies
for s in "${STUDIES_NUMS[@]}"; do
STUDY_DIR="$QHTCP_PROJECT_DIR/Exp$s"
r_interactions \
"$STUDY_DIR" \
"$STUDY_INFO_FILE" \
"$STUDY_DIR/zscores/" \
"$APPS_DIR/r/SGD_features.tab" \
5
done
# Run remc as part of the QHTCP process
# pass all the study directories to it so the scripts have all the paths
remc "$STUDY_INFO_FILE" "${STUDIES_DIRS[@]}"
}
module remc
# @description remc module for QHTCP
#
# TODO
#
# * Which components can be parallelized?
# @arg $1 string studyInfo file
# @arg $2 string Study directories (one or more, passed as the remaining arguments)
remc() {
debug "Running: ${FUNCNAME[0]} $*"
# If any submodules fail the rest will not run, this is fundamental to module design
# Remove leading && to run regardless
# TODO can this be
    # args: output directory, sd value, studyInfo file, study directories
    r_join_interactions \
        "$QHTCP_PROJECT_DIR/out" \
        2 \
        "$1" \
        "${@:2}" \
&& java_extract \
"$QHTCP_PROJECT_DIR/out/" \
&& r_add_shift_values \
"$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv-finalTable.csv" \
"$QHTCP_PROJECT_DIR/Shift_only.csv" \
"$1" \
"$QHTCP_PROJECT_DIR/REMcWithShift.csv" \
&& r_create_heat_maps \
"$QHTCP_PROJECT_DIR/REMcWithShift.csv" \
"$QHTCP_PROJECT_DIR/out" \
&& r_heat_maps_homology \
"$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv-finalTable.csv" \
"$APPS_DIR/r/170503_DAmPs_Only.txt" \
"$APPS_DIR/r/Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv" \
"$QHTCP_PROJECT_DIR/out/homology"
}
module gtf
# @description GTF module for QHTCP
# @arg $1 string output directory
# @arg $2 string gene_association.sgd
# @arg $3 string gene_ontology_edit.obo
# @arg $4 string ORF_List_Without_DAmPs.txt
gtf() {
debug "Running: ${FUNCNAME[0]}"
gtf_out_dir="${1:-$QHTCP_PROJECT_DIR/out/gtf}"
gene_association_sgd="${2:-"$APPS_DIR/r/gene_association.sgd"}"
gene_ontology_obo="${3:-"$APPS_DIR/r/gene_ontology_edit.obo"}"
orf_list="${4:-"$APPS_DIR/r/ORF_List_Without_DAmPs.txt"}"
process_dir="$gtf_out_dir/process"
function_dir="$gtf_out_dir/function"
component_dir="$gtf_out_dir/component"
py_gtf_dcon \
"$process_dir" \
"$gtf_out_dir"
# Reproduce the function and components dirs from the process dir
for d in "$function_dir" "$component_dir"; do
debug "rsync -a $process_dir/ $d/"
rsync -a "$process_dir/" "$d/"
done
for d in "$process_dir" "$function_dir" "$component_dir"; do
out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename
shopt -s nullglob
txts=("$d"/*.txt) # glob all txt files from each dir
shopt -u nullglob
for txt in "${txts[@]}"; do
debug "pl_gtf_analyze -an $gene_association_sgd -as P -o $gene_ontology_obo -b $orf_list $txt"
pl_gtf_analyze \
'-an' "$gene_association_sgd" \
'-as' 'P' \
'-o' "$gene_ontology_obo" \
'-b' "$orf_list" \
"$txt"
debug "pl_terms2tsv $txt"
pl_gtf_terms2tsv "$txt"
done
debug "py_gtf_concat $gtf_out_dir $out_file"
py_gtf_concat "$gtf_out_dir" "$out_file"
done
r_compile_gtf "$gtf_out_dir"
}
module gta
# @description GTA module for QHTCP
#
# TODO
#
# *
# *
#
# @arg $1 string output directory
# @arg $2 string gene_association.sgd
# @arg $3 string gene_ontology_edit.obo
# @arg $4 string go_terms.tab
# @arg $5 string All_SGD_GOTerms_for_QHTCPtk.csv
# @arg $6 string zscores_interaction.csv
gta() {
debug "Running: ${FUNCNAME[0]}"
gta_out_dir="${1:-"$QHTCP_PROJECT_DIR/gta"}"
gene_association_sgd="${2:-"$APPS_DIR/r/gene_association.sgd"}"
gene_ontology_obo="${3:-"$APPS_DIR/r/gene_ontology_edit.obo"}"
sgd_terms_tfile="${4:-"$APPS_DIR/r/go_terms.tab"}"
all_sgd_terms_csv="${5:-"$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"}"
zscores_file="${6:-"$gta_out_dir/zscores/zscores_interaction.csv"}" # TODO This could be wrong, it could be in main results
    # Sets STUDIES_NUMS and NUM_STUDIES
get_studies "$STUDY_INFO_FILE"
[[ -d $gta_out_dir ]] || mkdir "$gta_out_dir"
    # Create unique pairwise combinations of study nums from dir names
    study_combos=()
    for ((i=0; i<${#STUDIES_NUMS[@]}; i++)); do
        for ((j=i+1; j<${#STUDIES_NUMS[@]}; j++)); do
            study_combos+=("${STUDIES_NUMS[i]},${STUDIES_NUMS[j]}")
        done
    done
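    # e.g. STUDIES_NUMS=(1 2 3) yields study_combos=(1,2 1,3 2,3)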
# The following are three types of studies
# Individual studies
for s in "${STUDIES_NUMS[@]}"; do
zscores_file="$QHTCP_PROJECT_DIR/Exp$s/$zscores_file"
if [[ -f $zscores_file ]]; then
mkdir "$gta_out_dir/Exp$s"
r_gta \
"Exp$s" \
"$zscores_file" \
"$sgd_terms_tfile" \
"$gene_association_sgd" \
"$gta_out_dir"
fi
done
# Combination studies (for pairwise comparisons)
for combo in "${study_combos[@]}"; do
# Split on comma and assign to array
IFS=',' read -ra studies <<< "$combo"
r_gta_pairwiselk "${studies[0]}" "${studies[1]}" "$STUDY_INFO_FILE" "$gta_out_dir"
done
# All studies
# If you have an unknown # of studies it must be passed last and any preceding arguments
# are required
r_gta_heatmaps \
"$STUDY_INFO_FILE" \
"$gene_ontology_obo" \
"$sgd_terms_tfile" \
"$all_sgd_terms_csv" \
"$zscores_file" \
"$QHTCP_PROJECT_DIR" \
"$QHTCP_PROJECT_DIR/TermSpecificHeatmaps" \
"${STUDIES_NUMS[@]}"
}
# @section Submodules
# @description
#
# Submodules are shell wrappers for workflow components in external languages
#
# Submodules:
#
# * Allow scripts to be called by the main workflow script using input and output arguments as a translation mechanism.
# * Only run by default if called by a module.
# * Can be called directly with its arguments as a comma-separated string
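#
# For example (hypothetical path), a submodule can be invoked directly with its arguments packed into a single comma-separated string:
#
#     script-run-workflow --submodule r_compile_gtf "/path/to/out/gtf"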
#
# @description Register a submodule by name and declare a global associative array for it
submodule() {
debug "Adding $1 submodule"
ALL_SUBMODULES+=("$1")
declare -gA "$1"
}
submodule r_gta
# @description GTAtemplate R script
#
# TODO
#
# * Is GTAtemplate.R actually a template?
# * Do we need to allow user customization?
#
# Files
#
# * [gene_association.sgd](https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd)
# * go_terms.tab
#
# Output
#
# *
#
# @arg $1 string Exp# name
# @arg $2 string ZScores_Interaction.csv file
# @arg $3 string go_terms.tab file
# @arg $4 string [gene_association.sgd](https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd)
# @arg $5 string output directory
#
r_gta() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/GTAtemplate.R"
[[ -d $5 ]] || mkdir -p "$5"
debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" \
"$1" \
"$2" \
"$3" \
"$4" \
"$5" \
"${@:6}"
}
submodule r_gta_pairwiselk
# @description PairwiseLK.R R script
#
# TODO
#
# * Should move directory creation from PairwiseLK.R to gta module
#
# Files
#
# *
# *
#
# Output
#
# *
#
# This submodule:
#
# * Will perform both L and K comparisons for the specified experiment folders.
# * The code uses the naming convention of PairwiseCompare_Exp#-Exp# to standardize and keep the structural naming simple (both K and L comparisons are run for each pair, and each experiment's GTA results are found in ../GTAresult/Exp_).
# * {FYI There are also individual scripts that just do the L or K pairwise studies in the ../Code folder.}
#
# @arg $1 string First Exp# name
# @arg $2 string Second Exp# name
# @arg $3 string StudyInfo.csv file
# @arg $4 string output directory
#
r_gta_pairwiselk() {
debug "Running: ${FUNCNAME[0]}"
script="$APPS_DIR/r/PairwiseLK.R"
[[ -d $4 ]] || mkdir -p "$4"
debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" "$@"
}
submodule r_gta_heatmaps
# @description TSHeatmaps5dev2.R R script
#
# TODO
#
# * Script could use rename
# * Script should be refactored to automatically allow more studies
# * Script should be refactored with more looping to reduce verbosity
#
# Files
#
# *
# *
#
# Output
#
# *
#
# This submodule:
#
# * The Term Specific Heatmaps are produced directly from the ../ExpStudy/Exp_/ZScores/ZScores_Interaction.csv file generated by the user modified interaction… .R script.
# * The heatmap labeling is per the names the user wrote into the StudyInfo.csv spreadsheet.
# * Verify that the All_SGD_GOTerms_for_QHTCPtk.csv found in ../Code is what you wish to use or if you wish to use a custom modified version.
# * If you wish to use a custom modified version, create it and modify the TSHeatmaps template script (TSHeatmaps5dev2.R) and save it as a TSH_study specific name.
#
# @arg $1 string StudyInfo.csv file
# @arg $2 string gene_ontology_edit.obo file
# @arg $3 string go_terms.tab file
# @arg $4 string All_SGD_GOTerms_for_QHTCPtk.csv
# @arg $5 string ZScores_interaction.csv
# @arg $6 string base directory
# @arg $7 string output directory
# @arg $8 string Study numbers (one or more, passed as the remaining arguments)
#
r_gta_heatmaps() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/TSHeatmaps5dev2.R"
[[ -d $7 ]] || mkdir -p "$7"
debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" "$@"
}
submodule r_interactions
# @description Run the R interactions analysis (Z_InteractionTemplate.R)
#
# TODO
#
# * Don't want to rename Z_InteractionTemplate.R because that will break logic, just edit in place instead
#
# NOTES
#
# *
#
# @arg $1 string The study (Exp#) directory
# @arg $2 string StudyInfo.csv file
# @arg $3 string The zscores output directory
# @arg $4 string SGD_features.tab file
# @arg $5 string Background SD threshold
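#
# Example invocation (mirroring the qhtcp module's per-study call):
#
#     r_interactions "$QHTCP_PROJECT_DIR/Exp1" "$STUDY_INFO_FILE" "$QHTCP_PROJECT_DIR/Exp1/zscores/" "$APPS_DIR/r/SGD_features.tab" 5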
r_interactions() {
debug "Running: ${FUNCNAME[0]}"
cat <<-EOF
	Edit the Z_InteractionTemplate.R script in each Exp dir beginning at the '++BEGIN USER DATA SELECTION++' marker.
	This is designed so that the data of interest for each experiment is appropriately selected from the !!Results…txt file.
	Be sure to enter the Background noise filter standard deviation, i.e., 3 or 5 per Sean.
	Enter a standard deviation value for removing culture data due to high background (e.g., contaminated cultures). Generally set this very high (e.g., '20') on the first run in order NOT to remove data. Review the QC data and inspect the raw image data to decide whether it is desirable to remove data, and then rerun the analysis.
	Enter a Background SD threshold for EXCLUDING culture data from further analysis:
	This Background value removes data where there is high pixel intensity in the background regions of a spot culture (i.e., suspected contamination). 5 is a minimum recommended value, because lower values result in more data being removed, which is often undesirable if contamination occurs late, after the carrying capacity of the yeast culture is reached.
	This is most often "trial and error": there is a 'Frequency_Delta_Background.pdf' report in the /Exp_/ZScores/QC/ folder to evaluate whether the chosen value was suitable (and if not, the analysis can simply be rerun with a better choice). In general, err on the high side, with a Background SD of 10 or 12. One can also use EZview to examine the raw images and the individual cultures potentially included/excluded as a consequence of the selected value. Background values are reported in the results sheet and so could also be analyzed there.
EOF
script="$APPS_DIR/r/interactions.R"
debug "$RSCRIPT $script" "$@"
"$RSCRIPT" "$script" \
"$1" \
"${@:2}" # optional arguments
}
submodule r_join_interactions
# @description joinInteractExps.R creates REMcRdy_lm_only.csv and Shift_only.csv
#
# Output
#
# * REMcRdy_lm_only.csv
# * Shift_only.csv
# * parameters.csv
#
# @arg $1 string The output directory
# @arg $2 string The sd value
# @arg $3 string The studyInfo file
r_join_interactions() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/joinInteractExps.R"
debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" \
"$1" \
"$2" \
"$3" \
"${@:4}" # optional arguments
local out_files=("$1/REMcRdy_lm_only.csv" "$1/Shift_only.csv" "$1/parameters.csv")
for f in "${out_files[@]}"; do
        [[ -f $f ]] || { echo "$f does not exist"; return 1; } # group, not subshell, so return exits the function
done
}
submodule java_extract
# @description Jingyu's REMc java utility
#
# Input
#
# * REMcRdy_lm_only.csv
#
# Output
#
# * REMcRdy_lm_only.csv-finalTable.csv
#
# NOTE
#
# * Closed-source w/ hardcoded output directory, so have to pushd/popd to run (not ideal)
#
# @arg $1 string The output directory
java_extract() {
debug "Running: ${FUNCNAME[0]}"
    classpath="$APPS_DIR/java/javaExtract.jar"
    out_file="$1/REMcRdy_lm_only.csv-finalTable.csv"
    # backup any existing REMcRdy_lm_only.csv-finalTable.csv before rerunning
    if [[ -f $out_file ]] && ! backup "$out_file"; then
        ask "Backup of $out_file failed, continue?" || return 1
    fi
java_cmd=(
"$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain
"$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv"
"$APPS_DIR/java/GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
"$APPS_DIR/java/ORF_List_Without_DAmPs.txt" 1 true true
)
debug "pushd && ${java_cmd[*]} && popd"
pushd "$1" && "${java_cmd[@]}" && popd || return 1
out_file="$1/REMcRdy_lm_only.csv-finalTable.csv"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
}
submodule r_add_shift_values
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
# @arg $1 string REMcRdy_lm_only.csv-finalTable.csv
# @arg $2 string Shift_only.csv
# @arg $3 string StudyInfo.csv file
# @arg $4 string The output file (REMcWithShift.csv)
r_add_shift_values() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/addShiftVals.R"
debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" \
"$1" \
"$2" \
"$3" \
"$4" \
"${@:5}" # optional arguments
rm -f "$QHTCP_PROJECT_DIR/REMcHeatmaps/"*.pdf
out_file="$QHTCP_PROJECT_DIR/REMcWithShift.csv"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
submodule r_create_heat_maps
# @description Execute createHeatMaps.R
# @arg $1 string The final shift table (REMcWithShift.csv)
# @arg $2 string The output directory
r_create_heat_maps() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/createHeatMaps.R"
debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" \
"$1" \
"$2" \
"${@:3}" # optional arguments
    out_file="$2/compiledREMcHeatmaps.pdf"
    pdfs=("$QHTCP_PROJECT_DIR/REMcHeatmaps/"*.pdf) # assumed heatmap location, matching the cleanup in r_add_shift_values
    debug "pdftk ${pdfs[*]} output $out_file"
    pdftk "${pdfs[@]}" output "$out_file"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
submodule r_heat_maps_homology
# @description Execute createHeatMapsHomology.R
# @arg $1 string REMcRdy_lm_only.csv-finalTable.csv
# @arg $2 string DAmPs list (170503_DAmPs_Only.txt)
# @arg $3 string The homology mapping file (Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv)
# @arg $4 string The output directory
r_heat_maps_homology() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/createHeatMapsHomology.R"
# Remove old output
debug "Removing old pdfs and csvs from $4"
rm "$4/"*.{pdf,csv}
"$RSCRIPT" "$script" \
"$1" \
"$2" \
"$3" \
"$4" \
"${@:5}" # optional arguments
pdfs=("$work_dir"/homology/*.pdf)
pdftk "${pdfs[@]}" output "$out_file"
out_file="$4/compiledREMcHomologyHeatmaps.pdf"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
}
submodule py_gtf_dcon
# @description Perform python dcon portion of GTF
#
# Output
#
# * 1-0-0-finaltable.csv
# @arg $1 string Directory to process
# @arg $2 string Output directory name
py_gtf_dcon() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/python/DconJG2.py"
debug "$PYTHON $script $1 $2/"
"$PYTHON" "$script" \
"$1" \
"$2/" \
"${@:3}" # optional arguments
out_file="$2/1-0-0-finaltable.csv"
    [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
submodule pl_gtf_analyze
# @description Perl analyze submodule
# This seems weird to me because we're just overwriting the same data for all set2 members
# https://metacpan.org/dist/GO-TermFinder/view/examples/analyze.pl
# Is there a reason you need a custom version and not the original from cpan?
# @arg $@ string Flag/value pairs forwarded to analyze_v2.pl (-an gene_association.sgd -as P|F|C -o gene_ontology_edit.obo -b ORF_List_Without_DAmPs.txt), followed by the gene-list txt file to analyze
pl_gtf_analyze() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/perl/analyze_v2.pl"
debug "$PERL $script $*"
"$PERL" "$script" \
"$1" \
"$2" \
"$3" \
"$4" \
"${@:5}" # optional arguments
}
submodule pl_gtf_terms2tsv
# @description Perl terms2tsv submodule
# Probably should be translated to shell/python
#
# @arg $1 string Basename of the terms file (reads $1.terms and writes $1.tsv) TODO naming
pl_gtf_terms2tsv() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/perl/terms2tsv.pl"
debug "$PERL $script $1.terms > $1.tsv"
"$PERL" "$script" "$1.terms" > "$1.tsv"
}
submodule py_gtf_concat
# @description Python concat submodule for GTF
# Concat the process ontology outputs from the /REMcReady_lm_only folder
# Probably should be translated to bash
# @arg $1 string output directory name to look for txt files
# @arg $2 string output file
py_gtf_concat() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/python/concatGTFResults.py"
debug "$PYTHON $script $1/ $2"
"$PYTHON" "$script" "$1/" "$2"
    [[ -f $2 ]] || { echo "$2 does not exist"; return 1; }
}
submodule r_compile_gtf
# @description Compile GTF in R
# @arg $1 string gtf output directory
r_compile_gtf() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/r/CompileGTF.R"
debug "$RSCRIPT $script $1"
"$RSCRIPT" "$script" "$1"
}
submodule get_studies
# @description Parse study names from StudyInfo.csv files
#
# TODO
#
# * This whole submodule should eventually be either
# * Removed
# * Expanded into a file that stores all project/study settings (database)
# * I had to add a newline to the end of StudyInfo.csv, which may break things?
#
# Example:
# ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy
# 1,ExpName1,NA,NA,UserInitials
# 2,ExpName2,NA,NA,UserInitials
# 3,ExpName3,NA,NA,UserInitials
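#
# Example usage:
#
#   get_studies "$STUDY_INFO_FILE" || err "No studies found in $STUDY_INFO_FILE"
#   debug "Found $NUM_STUDIES studies: ${STUDIES_NUMS[*]}"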
# @exitcode 0 If one or more studies found
# @exitcode 1 If no studies found
# @set STUDIES_NUMS array Contains Exp numbers
# @set NUM_STUDIES int Number of existing studies
# @arg $1 string File to read
get_studies() {
debug "Running: ${FUNCNAME[0]}"
declare -ga STUDIES_NUMS=()
    while IFS=',' read -r col1 _; do # split on comma, keep the first col
STUDIES_NUMS+=("$col1")
done < <(tail -n +2 "$1") # skip header
[[ ${#STUDIES_NUMS[@]} -gt 0 ]] &&
        NUM_STUDIES="${#STUDIES_NUMS[@]}"
}
# submodule choose_easy_results_dir #
# # @description Chooses an EASY scans directory if the information is undefined
# # TODO Standardize EASY output, it's hard to understand
# # TODO eventually we could run this on multiple results dirs simultaneously with some refactoring
# # @exitcode 0 if successfully choose an EASY results dir
# # @set EASY_RESULTS_DIR string The working EASY output directory
# choose_easy_results_dir() {
# debug "Running: ${FUNCNAME[0]}"
# # Always backup existing output
# # This would happen if you ran the same experiment twice in one day, for instance
# [[ -d $EASY_RESULTS_DIR ]] && backup "$EASY_RESULTS_DIR"
#     if [[ ! -d $EASY_RESULTS_DIR ]]; then
#         debug "mkdir $EASY_RESULTS_DIR"
#         if ! mkdir "$EASY_RESULTS_DIR"; then
#             err "Could not create $EASY_RESULTS_DIR"
#             return 1
#         fi
#     fi
# # echo "Hit enter to use the default EASY results directory: $default_easy_results_dir"
# # if ! (( YES )); then
# # read -r -p "Or enter a custom directory name, example: $PROJECT" dirname
# # [[ -z $dirname ]] && EASY_RESULTS_DIR="$default_easy_results_dir" && return 0
# # fi
# # ((YES)) && return 0
# # # Let's get a little fancy
# # shopt -s nullglob
# # declare -la easy_results_dirs=( "$easy_out_dir/"*/ )
# # shopt -u nullglob
# # # Sort the dirs
# # mapfile -t easy_results_dirs < <(printf '%s\n' "${easy_results_dirs[@]}" | sort)
# # last_index=$(( ${#easy_results_dirs} - 1 ))
# # ((YES)) && EASY_RESULTS_DIR="${easy_results_dirs[$last_index]}" && return 0
# # echo "Multiple EASY results dirs found in $PROJECT_SCANS_DIR"
# # echo "Here is a list: "
# # for i in "${!easy_results_dirs[@]}"; do
# # printf "%d. %s\n" "$((i+1))" "${easy_results_dirs[i]}"
# # done
# # printf "%s\n" "${easy_results_dirs[@]}"
# # last_index=$(( ${#easy_results_dirs} - 1 ))
# # read -r -p "Enter the item number to select EASY results directory, default ($last_index): " response
# # [[ -z $response ]] && response=$last_index
# # response=$(( response - 1 )) # bash arrays use zero indexing
# # EASY_RESULTS_DIR="${easy_results_dirs[$response]}"
# # EASY_RESULTS_FILES=("$EASY_RESULTS_DIR/"*"/PrintResults/!!"*)
# # [[ ${#easy_results_dirs[@]} -gt 0 ]]
# }
submodule documentation
# @description Generates shdoc markdown from this script
# @noargs
# @internal
documentation() {
debug "Running: ${FUNCNAME[0]}"
# Print markdown to stdout
((DEBUG)) && shdoc < "$SCRIPT"
# Create markdown file
shdoc < "$SCRIPT" > README.md
}
# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overridden by env
# @internal
main() {
debug "Running: ${FUNCNAME[0]} $*"
# Libraries
declare -g JAVA="${JAVA:-$(which java 2>/dev/null)}"
declare -g PYTHON="${PYTHON:-$(which python3 2>/dev/null)}"
declare -g PERL="${PERL:-$(which perl 2>/dev/null)}"
declare -g RSCRIPT="${RSCRIPT:-$(which Rscript 2>/dev/null)}"
declare -g MATLAB="${MATLAB:-$(which matlab 2>/dev/null)}"
# Global vars
SCRIPT_NAME="${BASH_SOURCE[0]##*/}"
SCRIPT=$(realpath -s "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(dirname "$SCRIPT")
APPS_DIR="$SCRIPT_DIR/apps"
TEMPLATES_DIR="$SCRIPT_DIR/templates"
PROJECT_USER="$(whoami)"
DATE="$(date +%Y%m%d)" # change in EASYconsole.m to match 'hardcode'
    # Find a scans directory
    local scans_hierarchy=(
        "$SCANS_DIR"
        "$SCRIPT_DIR/scans"
        "/mnt/data/scans"
        "$SCRIPT_DIR/templates/scans-demo"
        "$SCRIPT_DIR/scans" # fallback and create if others not found
    )
    if [[ -z $SCANS_DIR ]]; then
        for d in "${scans_hierarchy[@]}"; do
            if [[ -d $d ]]; then
                declare -gx SCANS_DIR="$d"
                break # use the first match in the hierarchy
            fi
        done
        # Fall back to a fresh scans dir if nothing in the hierarchy exists yet
        [[ -z $SCANS_DIR ]] && declare -gx SCANS_DIR="$SCRIPT_DIR/scans"
    fi
if ! [[ -d $SCANS_DIR ]]; then
# This is not something we do often, so ask
if ask "Create the scans directory: $SCANS_DIR?"; then
mkdir -p "$SCANS_DIR"
else
echo "No scans directory available, exiting"
exit 1;
fi
fi
# Make sure we are using the absolute path
SCANS_DIR=$(realpath -s "$SCANS_DIR")
# Find an output directory
    local out_hierarchy=("$(dirname "$SCANS_DIR")/out" "$SCRIPT_DIR/out" "/mnt/data/out")
    for d in "${out_hierarchy[@]}"; do
if [[ -d $d ]]; then
debug "Using output directory: $d"
declare -g OUT_DIR="$d"
break
fi
done
if [[ -z $OUT_DIR ]]; then
echo "No output directory found"
declare -gx OUT_DIR="$SCRIPT_DIR/out"
# This is not something we do often, so ask
if ask "Create $SCRIPT_DIR/out?"; then
debug "mkdir $SCRIPT_DIR/out"
mkdir "$SCRIPT_DIR/out"
else
err "No output directory, attempting to continue..."
fi
fi
# Make sure we are using the absolute path
OUT_DIR=$(realpath -s "$OUT_DIR")
# Set the automatic project directory prefix
PROJECT_PREFIX="${DATE}_${PROJECT_USER}" # reversed these so easier to sort and parse date
sanitize_pn() { [[ $1 =~ [0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_.+_.+ ]]; } # sanitizer regex for prefix
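    # e.g. sanitize_pn "20240723_bcr_MyProject" succeeds (DATE_USER_SUFFIX); sanitize_pn "MyProject" fails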
declare -ag PROJECTS=() # this array will hold all of the projects for this run
parse_input "$@" # parse arguments with getopt
interactive_header "$@"
# # Prompt user for the PROJECT if we still don't have one
# if [[ ${#PROJECTS[@]} -eq 0 ]]; then
# ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}")
# fi
for i in "${!PROJECTS[@]}"; do
if ! sanitize_pn "${PROJECTS[i]}"; then
echo "Project name ${PROJECTS[i]} is invalid"
echo "Enter a replacement"
ask_pn && unset "PROJECTS[i]" && PROJECTS+=("${ADD_PROJECTS[@]}")
fi
done
# Exclude modules from --exclude
for i in "${!MODULES[@]}"; do
[[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]] && unset "MODULES[i]"
done
# Sanitize MODULES
for i in "${!MODULES[@]}"; do
if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
echo "Module ${MODULES[$i]} not in the module list"
echo "Available modules:"
printf "%s\n" "${ALL_MODULES[@]}"
read -r -p "Enter replacement module name: " module
! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || (echo "RTFM"; return 1)
MODULES[i]="$module"
fi
unset module
done
# Sanitize SUBMODULES
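    # SUBMODULES is a flat array of (name, "arg1,arg2,...") pairs:
    # even indices hold submodule names, odd indices hold their argument strings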
for i in "${!SUBMODULES[@]}"; do
if ! [[ " ${ALL_SUBMODULES[*]} " =~ [[:space:]]${SUBMODULES[i]}[[:space:]] ]]; then
echo "Submodule ${SUBMODULES[$i]} not in the module list, removing"
unset "SUBMODULES[i]" "SUBMODULES[$((i+1))]"
fi
continue 2 # skip the arguments string
done
# Loop over projects
for PROJECT_NAME in "${PROJECTS[@]}"; do
declare -gx PROJECT_NAME
declare -gx PROJECT_SCANS_DIR="$SCANS_DIR/$PROJECT_NAME"
declare -gx PROJECT_DATE="${PROJECT_NAME%"${PROJECT_NAME#????????}"}" # e.g. 20240723
declare -gx PROJECT_SUFFIX="${PROJECT_NAME#????????_*_}"
declare -gx PROJECT_USER="${PROJECT_NAME#????????_}"; PROJECT_USER="${PROJECT_USER%%_*}"
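        # e.g. PROJECT_NAME=20240723_bcr_MyProject (hypothetical) gives
        # PROJECT_DATE=20240723, PROJECT_USER=bcr, PROJECT_SUFFIX=MyProject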
declare -gx STUDIES_ARCHIVE_FILE="$OUT_DIR/StudiesDataArchive.txt"
declare -gx QHTCP_PROJECT_DIR="$OUT_DIR/$PROJECT_NAME"
declare -gx QHTCP_TEMPLATE_DIR="$TEMPLATES_DIR/qhtcp"
declare -gx STUDY_TEMPLATE_DIR="$TEMPLATES_DIR/exp"
declare -gx STUDY_INFO_FILE="$QHTCP_PROJECT_DIR/StudyInfo.csv"
declare -gx EASY_OUT_DIR="$QHTCP_PROJECT_DIR/easy"
declare -gx R_LIBS_USER=${R_LIBS_USER:-"$HOME/R/$SCRIPT_NAME"}
if ((DEBUG)); then
declare -p SCANS_DIR OUT_DIR TEMPLATES_DIR APPS_DIR \
PROJECTS PROJECT_NAME \
PROJECT_SCANS_DIR PROJECT_DATE PROJECT_SUFFIX PROJECT_USER \
STUDIES_ARCHIVE_FILE QHTCP_PROJECT_DIR QHTCP_TEMPLATE_DIR \
STUDY_TEMPLATE_DIR STUDY_INFO_FILE
fi
debug "Active modules: ${MODULES[*]}"
debug "Active submodules and their args: ${SUBMODULES[*]}"
# Run selected modules
for m in "${MODULES[@]}"; do
ask "Run $m module?" && "$m"
done
# Run selected submodules
for i in "${!SUBMODULES[@]}"; do
IFS=',' read -ra cmds <<< "${SUBMODULES[$((i+1))]}" # load the command args
ask "Run ${SUBMODULES[i]} submodule with args ${cmds[*]}?" &&
"${SUBMODULES[i]}" "${cmds[@]}"
continue 2 # skip the command string
done
done
}
# (Safe) main loop: keep re-running main while it succeeds; exit after two consecutive failures
for ((i=0; i<=1; i++)); do
main "$@" && i=0 # on successful run, reset the counter
done
exit $?