#!/usr/bin/env bash
# Copyright 2024 Bryan C. Roessler
# This program contains a mixture of code/pseudocode and shouldn't be run until this message is removed
#
# Allow indirect functions
# shellcheck disable=SC2317
#
# @name Hartman Lab Self-Documenting Workflow
# @brief One script to rule them all (see: xkcd #927)
# @description A flexible yet opinionated analysis framework for the Hartman Lab
# There should be at least 4 subdirectories to organize Q-HTCP data and analysis. The parent directory is simply called 'Q-HTCP' and the 4 subdirectories are described below (Fig. 1):
# * **ExpJobs**
# * This directory contains raw image data and image analysis results for the entire collection of Q-HTCP experiments.
# * We recommend that each subdirectory within 'ExpJobs' represent a single Q-HTCP experiment and be named using the following convention (AB_yyyy_mmdd_PerturbationsOfInterest): experimenter initials ('AB_'), date ('yyyy_mmdd_'), and a brief description (e.g. 'drugs_medias').
# * Each subdirectory contains the Raw Image Folders for that experiment (a series of N folders with successive integer labels 1 to N, each folder containing the time series of images for a single cell array). It also contains a user-supplied subfolder, which must be named 'MasterPlateFiles' and must contain two Excel files, one named 'DrugMedia_experimentdescription' and the other named 'MasterPlate_experimentdescription'. The prefix up to and including the underscore ('DrugMedia_', 'MasterPlate_') is required; the rest is an optional description. Generally the 'DrugMedia_' file merits description.
# * If the standard MasterPlate_Template file is being used, there is no need to customize the name. On the other hand, if the template is modified, it is recommended to rename and describe it accordingly - a useful convention is to give the MP files the same name as the experiment (i.e., the parent ExpJobs subdirectory described above) after the underscore.
# * The 'MasterPlate_' file contains the associated cell array information (culture IDs for all of the cell arrays in the experiment) while the 'DrugMedia_' file contains information about the media that the cell array is printed to.
# * Together they encapsulate and define the experimental design.
# * The Q-HTCP image folders and the 'MasterPlateFiles' folder are the inputs for image analysis with EASY software.
# * As further described below, EASY will automatically generate a 'Results' directory (within the ExpJobs/'ExperimentJob' folder) with a name that consists of a system-generated timestamp and an optional short description provided by the user (Fig. 2). The 'Results' directory is created and entered using the "File >> New Experiment" dropdown in EASY. Multiple 'Results' directories may be created (and uniquely named) within an 'ExperimentJob' folder.
# * **EASY**
# * This directory contains the GUI-enabled MATLAB software to accomplish image analysis and growth curve fitting.
# * EASY analyzes Q-HTCP image data within an 'ExperimentJob' folder (described above; each cell array has its own folder containing its entire time series of images).
# * EASY analysis produces image quantification data and growth curve fitting results for each cell array; these results are subsequently assembled into a single file and labeled, using information contained in the 'MasterPlate_' and 'DrugMedia_' files in the 'MasterPlateFiles' subdirectory.
# * The final files (named '!!ResultsStd_.txt' or '!!ResultsELr_.txt') are produced in a subdirectory that EASY creates within the 'ExperimentJob' folder, named '/ResultsTimeStampDesc/PrintResults' (Fig. 2).
# * The /EASY directory is simply where the latest EASY version resides (additional versions in development or legacy versions may also be stored there).
# * The raw data inputs and result outputs for EASY are kept in the 'ExpJobs' directory.
# * EASY also outputs a '.mat' file that is stored in the 'matResults' folder and is named with the timestamp and user-provided name appended to the 'Results' folder name when 'New Experiment' is executed from the 'File' dropdown menu in the EASY console.
# * **EZview**
# * This directory contains the GUI-enabled MATLAB software to conveniently and efficiently mine the raw cell array image data for a Q-HTCP experiment.
# * It takes the Results .mat file (created by the EASY software) as an input and permits the user to navigate through the raw image data and growth curve results for the experiment.
# * The /EZview directory provides a place for storing the latest EZview version (as well as other EZview versions).
# * The /EZview provides a GUI for examining the EASY results as provided in the …/matResults/… .mat file.
# * **StudiesQHTCP**
# * This directory contains the GUI-enabled JAVA software composite (MATLAB, JAVA, R, Python, Perl, Shell) that takes growth curve results (created by EASY software) as an input and successively generates interaction Z-score results, which are used for graphing gene interactions, clustering, Gene Ontology analysis, and other ways of interpreting and visualizing the experimental quality and outcomes. {The /StudiesQHTCP folder contains the ordered command line scripts that call sets of other scripts to perform data selection and adaptation from the extracted text results spreadsheet found in the /ExpJobs/experiment name/Results…/PrintResults/ folder - in particular, the user-customized 'interactionCode4experiment.R' file. It also contains a multitude of R-generated plots based on the selected data and possible adaptation. All clustering and Gene Ontology analyses are derived from the 'ZScores_Interaction.csv' file found in the /ZScores subdirectory.}
# * **Master Plates**
# * This optional folder is a convenient place to store copies of the 'MasterPlate_' and 'DrugMedia_' file templates, along with previously used files that may have been modified and could be reused or further modified to enable future analyses.
# * These two file types are required in the 'MasterPlateFiles' folder, which catalogs experimental information specific to individual Jobs in the ExpJobs folder, as described further below.
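#
# For orientation, a sketch of the expected layout (illustrative only; the
# experiment folder name 'AB_2024_0101_drugs_medias' is hypothetical):
#
#   Q-HTCP/
#     ExpJobs/
#       AB_2024_0101_drugs_medias/
#         1/ ... N/                        (raw image folders, one per cell array)
#         MasterPlateFiles/
#           MasterPlate_drugs_medias.xlsx
#           DrugMedia_drugs_medias.xlsx
#         Results<timestamp><desc>/
#           PrintResults/                  (!!ResultsStd_.txt, !!ResultsELr_.txt)
#           matResults/                    (EASY .mat output)
#     EASY/
#     EZview/
#     StudiesQHTCP/
#     Master Plates/                       (optional template storage)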
#
# NOTES:
# * For the time being I have tried to balance the recognizability of your current workflow with better practices that allow this program to function.
#
# TODO:
# * Scripts should be made modular enough that they can be stored in the same dir
# * Don't cd in scripts
# * If you must, do it in a subshell at least!
# * Pass variables
# * Pass options
# * Pass arguments
# * Variable scoping is horrible right now
# * I wrote this sequentially and tried to keep track the best I could
# * Local vars have a higher likelihood of being lower case, global vars are UPPER
#
# @option -p<value> | --project=<value> Include one or more projects in the analysis
# @option -i<value> | --include=<value> Include one or more modules in the analysis (default: all modules)
# @option -x<value> | --exclude=<value> Exclude one or more modules from the analysis
# @option -m | --markdown Generate the shdoc markdown file for this program
# @option -y | --yes | --auto Assume yes answer to all questions (non-interactive mode)
# @option -d | --debug Turn on extra debugging output
# @option -h | --help Print help message and exit (overrides other options)

DEBUG=1 # Turn debugging ON by default during development
shopt -s extglob

# @section Libraries
# @description Change these variables to use different libraries
JAVA="${JAVA:-java}"
PYTHON="${PYTHON:-python3}"
PERL="${PERL:-perl}"
RSCRIPT="${RSCRIPT:-Rscript}"

# @section Help
# @description Print a helpful message
# @noargs
print_help() {
debug "Running: ${FUNCNAME[0]}"

install_dependencies --get-depends # Loads the dependency arrays

cat <<-EOF
USAGE:
script-run-workflow [[OPTION] [VALUE]]...

Some options (--project, --include, --exclude) can be passed multiple times or
by using comma-delimited strings (see EXAMPLES below)

OPTIONS:
--project, -p PROJECT
PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT_NAME
--include, -i MODULE
See MODULES section below for list of available modules
If no --include is specified, all modules are run
--exclude, -x MODULE
See MODULES section below for list of modules to exclude
--markdown, -m
Generate the shdoc markdown README.md file for this program
--yes, -y, --auto
Always answer yes to questions (non-interactive mode)
--debug, -d
Print extra debugging info
--help, -h
Print this help message and exit

MODULES:
${ALL_MODULES[*]}

SUBMODULES:
${ALL_SUBMODULES[*]}

DEPENDENCIES:
deb: ${depends_deb[*]}
rpm: ${depends_rpm[*]}
brew: ${depends_brew[*]}
perl: ${depends_perl[*]}
R: ${depends_r[*]}
BiocManager: ${depends_bioc[*]}

EXAMPLES:
script-run-workflow --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[0]} --include ${ALL_MODULES[1]}
script-run-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --include ${ALL_MODULES[1]} --include ${ALL_MODULES[2]} --yes
script-run-workflow --include=${ALL_MODULES[0]},${ALL_MODULES[1]}
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --include=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
EOF
}

# @section User Input
# @set PROJECTS array List of projects to work on
# @set INCLUDE_MODULES array List of modules to run
# @set EXCLUDE_MODULES array List of modules not to run
# @set DEBUG int Turn debugging on
# @set YES int Turn assume yes on
# @description Creates arrays and switches from user input
# parse_input() takes all of the arguments passed to the script
parse_input() {
debug "Running: ${FUNCNAME[0]}" "$@"

long_opts="project:,include:,exclude:,markdown,yes,auto,debug,help"
short_opts="+p:i:x:mydh"

if input=$(getopt -o "$short_opts" -l "$long_opts" -- "$@"); then
eval set -- "$input"
while true; do
case $1 in
--project|-p)
shift
if [[ $1 == *','* ]]; then # split comma-delimited values and append (don't clobber earlier entries)
IFS=',' read -ra csv <<< "$1"
PROJECTS+=("${csv[@]}")
else
PROJECTS+=("$1")
fi
;;
--include|-i)
shift
if [[ $1 == *','* ]]; then # split comma-delimited values and append
IFS=',' read -ra csv <<< "$1"
INCLUDE_MODULES+=("${csv[@]}")
else
INCLUDE_MODULES+=("$1")
fi
;;
--exclude|-x)
shift
if [[ $1 == *','* ]]; then # split comma-delimited values and append
IFS=',' read -ra csv <<< "$1"
EXCLUDE_MODULES+=("${csv[@]}")
else
EXCLUDE_MODULES+=("$1")
fi
;;
--markdown|-m)
documentation; exit 0 # TODO disable the exit after development
;;
--yes|-y|--auto)
declare -g YES=1
;;
--debug|-d)
declare -g DEBUG=1
;;
--help|-h)
print_help; exit 0
;;
--)
shift
break
;;
esac
shift
done
else
err "Incorrect options provided"; exit 1
fi
}
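
# Illustrative call (module names are real, project names hypothetical):
# parse_input --project=QHTCP_foo,QHTCP_bar --include remc --include gtf --yes
# leaves PROJECTS=(QHTCP_foo QHTCP_bar), INCLUDE_MODULES=(remc gtf), YES=1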

# @section Helper functions
# @arg $1 array A module to initialize (add to ALL_MODULES)
# @set ALL_MODULES array A list of all available modules
# @internal
module() {
debug "Adding $1 module"
ALL_MODULES+=("$1")
declare -gA "$1"
}

# @arg $1 array A submodule to initialize (add to ALL_SUBMODULES)
# @set ALL_SUBMODULES array A list of all available submodules
# @internal
submodule() {
debug "Adding $1 submodule"
ALL_SUBMODULES+=("$1")
declare -gA "$1"
}
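
# For example, 'module easy' (below) appends "easy" to ALL_MODULES and declares a
# global associative array named 'easy' alongside the easy() function, so
# module-level metadata can be stored in the array (see 'declare -p easy' in easy()).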
# This function will only work if users have an actual name registered on the server
# TODO for now just use username
# user_initials() {
# user_record="$(getent passwd "$(whoami)")"
# user_gecos_field="$(echo "$user_record" | cut -d ':' -f 5)"
# user_full_name="$(echo "$user_gecos_field" | cut -d ',' -f 1)"
# last="${user_full_name#* }"
# echo "${user_full_name:0:1}${last:0:1}"
# }
ask() {
declare response
(( YES )) && return 0
read -r -p "$* [y/N]: " response
[[ ${response,,} =~ ^(yes|y)$ ]]
}
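# e.g. ask "Continue?" || return 1
# Returns success immediately (without prompting) when YES=1 (--yes/--auto mode)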
err() { echo "Error: $*" >&2; }
ask_pn() {
unset PROJECT
declare -g PROJECT
example_pn="${PROJECT_PREFIX}_$(random_words 3)"
cat <<-EOF
Enter a new or existing project name
If entering a new project, use the suggested prefix: ${PROJECT_PREFIX}_
You may choose any combination of words/characters following the prefix, but be sensible.
Make it descriptive and avoid spaces and special characters.
Example: ${example_pn}
EOF
tries=3 # give the user up to 3 tries to enter a valid project name
for ((i=1; i<=tries; i++)); do
read -r -p "Enter a new or existing project name or enter for default ($example_pn): " PROJECT
if [[ -z $PROJECT ]]; then
PROJECT="$example_pn" && break
else
sanitize_pn "$PROJECT" && break
err "Invalid project name: $PROJECT"
PROJECT="" # clear the invalid name so we exit below if all tries fail
echo "Retrying ($i of $tries)"
fi
done
[[ -z $PROJECT ]] && exit 1 # TODO eventually switch to return, but treat as fatal for now
echo "Using project name: $PROJECT"
}
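
# Illustrative: with PROJECT_PREFIX=QHTCP (hypothetical value), pressing enter
# accepts a generated default such as QHTCP_placid_ozone_tundra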
debug() { (( DEBUG )) && echo "Debug: $*"; }
# Not super portable but nice to have
random_words() {
local num=${1:-2}
local -a arr
local i word
for ((i=0;i<num;i++)); do
word=$(shuf -n1 /usr/share/dict/words)
# Sanitize: strip separators and apostrophes, then lowercase
word="${word//-/}"
word="${word//_/}"
word="${word//\'/}"
word="${word,,}"
arr+=("$word")
done
printf "%s_" "${arr[@]}" | sed 's/_$//'
}
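# e.g. 'random_words 3' might print "apple_bread_cloud" (words drawn from /usr/share/dict/words)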

# @description Backup one or more files to an incremented .bk file
backup() {
debug "Running: ${FUNCNAME[0]}" "$@"
local f count
for f in "$@"; do
[[ -f $f ]] || continue
count=1
while [[ -f $f.bk.$count ]]; do
count=$((count+1))
done
debug "rsync -a $f $f.bk.$count"
rsync -a "$f" "$f.bk.$count"
done
}
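# e.g. 'backup StudyInfo.csv' copies it to StudyInfo.csv.bk.1 (then .bk.2 on the next call)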


# @section Modules
# @description A module contains a cohesive set of actions/experiments to run on a project
# Use a module when:
# * Building a new type of analysis from scratch
# * Generating project directories
# * Grouping multiple submodules (and modules) into a larger task
# * Dictating the ordering of multiple submodules
# * Modules should competently handle pushd and popd for their submodules if they do not reside in the SCANS/PROJECT_DIR
# * Apps and submodules should avoid changing directories
# * Pass input data from somewhere and output data somewhere

module install_dependencies
# @section Install dependencies
# @description Installs dependencies for the workflow
#
# Dependencies
# * R
# * Perl
# * Java
# * MATLAB
#
# For MacOS
# * export HOMEBREW_BREW_GIT_REMOTE=https://github.com/Homebrew/brew
# * /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
# * cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
# * brew install graphviz gd pdftk-java pandoc shdoc nano rsync
#
# For Linux
# * cpan File::Map ExtUtils::PkgConfig GD GO::TermFinder
# * apt-get install graphviz libgd-dev pdftk-java pandoc shdoc nano rsync
# or
# * dnf install graphviz pandoc pdftk-java gd-devel shdoc nano rsync
#
# For R
# * install.packages("BiocManager")
# * BiocManager::install("org.Sc.sgd.db")
# * install.packages(c('ontologyIndex', 'ggrepel', 'tidyverse', 'sos', 'openxlsx'), dep=TRUE)
# @noargs
install_dependencies() {
debug "Running: ${FUNCNAME[0]}" "$@"

# Dependency arrays
depends_rpm=(graphviz pandoc pdftk-java gd-devel shdoc nano rsync coreutils)
depends_deb=(graphviz pandoc pdftk-java libgd-dev shdoc nano rsync coreutils)
depends_brew=(graphviz pandoc gd pdftk-java shdoc nano rsync coreutils)
depends_perl=(File::Map ExtUtils::PkgConfig GD GO::TermFinder)
depends_r=(BiocManager ontologyIndex ggrepel tidyverse sos openxlsx ggplot2
plyr extrafont gridExtra gplots stringr plotly ggthemes pandoc rmarkdown
htmlwidgets)
depends_bioc=(org.Sc.sgd.db)

[[ $1 == "--get-depends" ]] && return 0 # if we just want to read the depends vars

# Install system-wide dependencies
echo "Installing system dependencies"
case "$(uname -s)" in
Linux*|CYGWIN*|MINGW*)
ask "Detected Linux platform, continue?" || return 1
echo "You may be prompted for your sudo password to install system packages"
if hash dnf &>/dev/null; then
sudo dnf install "${depends_rpm[@]}"
elif hash apt &>/dev/null; then
sudo apt install "${depends_deb[@]}"
fi
;;
Darwin*)
ask "Detected Mac platform, continue?" || return 1
export HOMEBREW_BREW_GIT_REMOTE="https://github.com/Homebrew/brew"
curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh | bash
brew install "${depends_brew[@]}"
;;
*)
echo "Your system could not be detected, please install dependencies manually"
;;
esac

# Install perl CPAN modules
echo "Installing perl CPAN modules"
debug "cpan" "${depends_perl[@]}"
cpan "${depends_perl[@]}"

# Install R packages
echo "Installing R packages"

# Build a double-quoted, comma-separated list for install.packages(),
# e.g. "BiocManager", "ontologyIndex", ..., "htmlwidgets"
depends_r_str=$(printf '"%s", ' "${depends_r[@]}")
depends_r_str="${depends_r_str%, }" # strip the trailing comma and space

debug "$RSCRIPT -e install.packages(c($depends_r_str), dep=TRUE, repos=\"https://cloud.r-project.org\")"
"$RSCRIPT" -e "install.packages(c($depends_r_str), dep=TRUE, repos=\"https://cloud.r-project.org\")"
"$RSCRIPT" -e "BiocManager::install(\"${depends_bioc[0]}\")"
}
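
# e.g. 'install_dependencies --get-depends' populates the depends_* arrays
# (used by print_help) without installing anything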


module init_project
# @section Initialize a new project in the scans directory
# @description This function creates and initializes project directories
# This module is responsible for the following tasks:
# * Initializing a project directory in the scans directory
# * Initializing a QHTCP project directory in the qhtcp directory
#
# TODO
# * Copy over source image directories from robot - are these also named by the ExpJobs name?
# * Suggest renaming ExpJobs to something like "scans" or "images"
# * MasterPlate_ file **should not** be an xlsx file, no portability
#
# NOTES
# * Copy over the images from the robot and then DO NOT TOUCH that directory except to copy from it
# * Write-protect (read-only) if we need to
# * Copy data from scans/images directory to the project working dir and then begin analysis
# * You may think...but doesn't that 2x data?
# * No, btrfs subvolume uses reflinks, only data that is altered will be duplicated
# * Most of the data are static images that are not written to, so the data is deduplicated
init_project() {
debug "Running: ${FUNCNAME[0]}"

# We handle this in main() and pushd to it
# But do it one more time in case this is run as a module

ask "(Re)-Initialize a project at $PROJECT_SCANS_DIR?" || return 1

# No subshell here: 'return' must fire in this function's scope
[[ -d $PROJECT_SCANS_DIR ]] || mkdir -p "$PROJECT_SCANS_DIR" || return 1

# Write skeleton files in csv
# If we have to convert to xlsx later, so be it
cat <<-EOF > "$DRUG_MEDIA_FILE"
EOF

cat <<-EOF > "$MASTER_PLATE_FILE"
EOF

# TODO here we'll copy scans from robot but for now let's pause and wait for transfer
echo "In the future we will copy scans from robot here"
read -r -p "Hit <Enter> to continue: "
}


module easy
# @section EASY
# @description Start an EASY analysis
# TODO Don't create output in the scans folder, put it in an output directory
# TODO The !!Results output files need standardized naming
# TODO Don't perform directory operations in EASY
# * The scans/images and 'MasterPlateFiles' folders are the inputs for image analysis with EASY software.
# * EASY will automatically generate a 'Results' directory (within the ExpJobs/'ExperimentJob' folder) with a timestamp and an optional short description provided by the user (Fig. 2).
# * The 'Results' directory is created and entered using the "File >> New Experiment" dropdown in EASY.
# * Multiple 'Results' directories may be created (and uniquely named) within an 'ExperimentJob' folder.
#
# Template:
# templates/easy
# * [datatipp.m](templates/easy/datatipp.m)
# * [DgenResults.m](templates/easy/DgenResults.m)
# * [DMPexcel2mat.m](templates/easy/DMPexcel2mat.m)
# * [EASYconsole.asv](templates/easy/EASYconsole.asv)
# * [EASYconsole.fig](templates/easy/EASYconsole.fig)
# * [EASYconsole.m](templates/easy/EASYconsole.m)
# * [figs](templates/easy/figs)
# * [NPTdirect.fig](templates/easy/figs/NPTdirect.fig)
# * [searchNPTIm.fig](templates/easy/figs/searchNPTIm.fig)
# * [NCdisplayGui.m](templates/easy/NCdisplayGui.m)
# * [NCfitImCFparforFailGbl2.m](templates/easy/NCfitImCFparforFailGbl2.m)
# * [NCscurImCF_3parfor.m](templates/easy/NCscurImCF_3parfor.m)
# * [NCsingleDisplay.m](templates/easy/NCsingleDisplay.m)
# * [NIcircle.m](templates/easy/NIcircle.m)
# * [NImParamRadiusGui.m](templates/easy/NImParamRadiusGui.m)
# * [NIscanIntensBGpar4GblFnc.m](templates/easy/NIscanIntensBGpar4GblFnc.m)
# * [p4loop8c.m](templates/easy/p4loop8c.m)
# * [par4GblFnc8c.m](templates/easy/par4GblFnc8c.m)
# * [par4Gbl_Main8c.m](templates/easy/par4Gbl_Main8c.m)
# * [PTmats](templates/easy/PTmats)
# * [Nbdg.mat](templates/easy/PTmats/Nbdg.mat)
# * [NCFparms.mat](templates/easy/PTmats/NCFparms.mat)
# * [NImParameters.mat](templates/easy/PTmats/NImParameters.mat)
# * [NPTdirectParameters.mat](templates/easy/PTmats/NPTdirectParameters.mat)
# * [NPTmapDirect.mat](templates/easy/PTmats/NPTmapDirect.mat)
# * [NPTmapSearch.mat](templates/easy/PTmats/NPTmapSearch.mat)
# * [NPTsearchParameters.mat](templates/easy/PTmats/NPTsearchParameters.mat)
#
# To analyze a new Q-HTCP experiment:
# * Open the EASY software.
# * Open 'EASYconsole.m' with MATLAB
# * Click the Run icon (play button)
# * When prompted, click "Change Folder" (do not select "Add to Path").
# * In the pop-up display, select from the 'File' dropdown: 'New Experiment'.
# * From the pop-up, choose where to save the new file.
# * Navigate to the relevant job in the ExpJobs folder, name the file accordingly, and click 'save'.
# * The newly created .mat file in the newly created Results folder will automatically be loaded.
# * The file name will then be automatically appended by the code with the current date information (e.g. 'A1.mat' will become 'Results2023-07-19A1')
# * If the experiment has already been created, it can be reloaded by clicking 'Load Experiment' instead of 'New Experiment' and selecting the relevant results
# * In the pop-up display, click on the 'Run' dropdown menu and select 'Image CurveFit ComboAnalysis'.
# * In the updated pop-up, choose/highlight all desired image folders for analysis (this is generally all of the folders, since only the ones that need analysis should be there) and then click on 'continue'.
# * As the program is running, updates will periodically appear in the Command Window; there will be an initial pause at "Before call to NIscanIntens…..".
# * When the curve fitting is finished, the EASY console will pop back up.
# * Check to see the completed analysis results in the newly created 'PrintResults' folder, inside of the 'Results' folder.
# * Other folders ('CFfigs', 'figs', 'Fotos') are created for later optional use and will be empty.
# * **NOTE:** The image analysis is completed independent of labeling the data (strains, media type, etc.). Labeling happens next with the 'GenReports' function.
# * Click on the 'GenReports' dropdown and select 'DrugMediaMP Generate .mat'
# * **NOTE:** The 'MasterPlate' and 'DrugMedia' files have very specific formats and should be completed from a template.
# * The MasterPlate file must be exact (it must contain all and only the strains that were actually tested).
# * For example, if only part of a library is tested, the complete library file must be modified to remove irrelevant strains.
# * You will be prompted to first select the 'MasterPlate' file. You will need to navigate away from the working directory to get to it.
# * It is fine for the 'MasterPlate_' file to be .xlsx (or .xls), and if you don't see it in the popup window, then change the file type from '.xls' to "all files" and then select it.
# * Once it is selected, a report of the number of master plates in the file will pop up; when the report appears, assuming it is correct, click on 'OK'.
# * You will then be prompted to select the 'DrugMedia' file from the relevant job folder. You will automatically return to the correct prior directory location.
# * Choose it and click 'OK'. You may see a warning about column headers being modified, but that's ok.
# * This will create an additional file in the 'MasterPlateFiles' folder named 'MPDMmat.mat'
# * Click on the 'GenReports' dropdown and select 'Results_Generate.'
# * You will first see '!!ResultsELr_.txt' generated in the 'PrintResults' folder.
# * Refreshing will reveal an increasing file size until you see the '!!ResultsStd_.txt' being generated.
# * When finished, the '!!ResultsStd_.txt' will be about the same file size, and it should be used in the following StudiesQHTCP analysis.
# * 'NoGrowth_.txt' and 'GrowthOnly_.txt' files will be generated in the 'PrintResults' folder.
#
# Issues:
# * We need full documentation for all of the current workflow. There are different documents that need to be integrated. This will need to be updated as we make improvements to the system.
# * MasterPlate_ file must have ydl227c in the orf column, or else Z_interaction.R will fail, because it can't calculate shift values.
# * Make sure there are no special characters, e.g., (), ", ', ?, etc.; dash and underscore are ok as delimiters
# * DrugMedia_ file must have a letter character to be read as 'text'.
# * MasterPlate_ file and DrugMedia_ are .xlsx or .xls, but !!Results_ is .txt.
# * In Z_interactions.R, does it require a zero concentration/perturbation (should we use zero for the low conc, even if it's not zero), e.g., in order to do the shift correctly?
# * Need to enable all file types (not only .xls) as the default for GenerateResults (to select MP and DM files as .xlsx).
# * Explore differences between the ELR and STD files - 24_0414; John R modified Z script to format ELR file for Z_interactions.R analysis.
# * To keep time stamps when transferring with FileZilla, go to the transfer drop down and turn it on, see https://filezillapro.com/docs/v3/advanced/preserve-timestamps/
# * Could we change the 'MasterPlateFiles' folder label in EASY to 'MasterPlate_DrugMedia' (since there should be only one MP and there is also a DM file required)?
# * I was also thinking of adding a 'MasterPlateFilesOnly' folder to the QHTCP directory template where one could house different MPFiles (e.g., with and without damps, with and without Refs on all MPs, etc; other custom MPFiles, updated versions, etc)
# * Currently updated files are in '23_1011_NewUpdatedMasterPlate_Files' on Mac (yeast strains/23_0914…/)
# * For EASY to report cell array positions (plate_row_column) to facilitate analyzing plate artifacts: the MP file in Col 3 is called 'LibraryLocation' and is reported after 'Specifics' in the !!Results.
# * Can EASY/StudiesQ-HTCP be updated at any time by rerunning with an updated MP file (new information for gene, desc, etc) - or maybe better to always start with a new template?
# * Need to be aware of file formatting to avoid dates (e.g., with gene names like MAY24, OCT1, etc, and with plate locations 1E1, 1E2, etc) - this has been less of a problem.
# * In StudiesQHTCP folders, remember to annotate Exp1, Exp2, in the StudyInfo.csv file.
# * Where are gene names called from for labeling REMc heatmaps, TSHeatmaps, Z-interaction graphs, etc? Is this file in the QHTCP 'code' folder, or is it in the results file (and thus ultimately the MP file)?
# * Is it ok for a MasterPlate_ file to have multiple sheets (e.g., a readme tab - is only the first tab read in)?
# * What are the rules for pulling information from the MasterPlateFile to the !!Results_ (e.g., is it the column or the Header Name that is searched? Particular cells in the DrugMedia file?).
# * Modifier and Conc are from the DM sheet, and refer to the agar media arrays. OrfRep is from the MasterPlate_ file. 'Specifics' (last column) is experiment specific and accommodates designs involving differences across the multi-well liquid arrays. 'StrainBkGrd' (now 'LibraryLocation') is in the 3rd column and reported after 'Specifics' at the last col of the '!!Results…' file.
# * Do we have / could we make an indicator - work in progress or idle/complete with MP/DM and after gen-report? Now, we can check for the MPDMmat.mat file, or we can look in PrintResults, but it would be nice to know without looking there.
# * File >> Load Experiment wasn't working (no popup to redirect). Check this again.

easy() {
debug "Running: ${FUNCNAME[0]}"
cat <<-EOF
To analyze a new Q-HTCP experiment:
* Open the EASY software.
* Open 'EASYConsole.m' with MATLAB
* Click the Run icon (play button)
* When prompted, click "Change Folder" (do not select "Add to Path").
* In the pop-up display, select from the 'File' dropdown: 'New Experiment'.
* From the pop-up, choose where to save the new file.
* Navigate to the relevant job in the ExpJobs folder, name the file accordingly, and click 'save'.
* The newly created .mat file in the newly created Results folder will automatically be loaded.
* The file name will then be automatically appended by the code with the current date information (e.g. 'A1.mat' will become 'Results2023-07-19A1')
* If the experiment has already been created, it can be reloaded by clicking 'Load Experiment' instead of 'New Experiment' and selecting the relevant results
* Next, in the pop-up display, click on the 'Run' dropdown menu and select 'Image CurveFit ComboAnalysis'.
* In the updated pop-up, choose/highlight all desired image folders for analysis (this is generally all of the folders, since only the ones that need analysis should be there) and then click on 'continue'.
* As the program is running, updates will periodically appear in the Command Window; there will be an initial pause at "Before call to NIscanIntens…..".
* When the curve fitting is finished, the EASY console will pop back up.
* Check to see the completed analysis results in the newly created 'PrintResults' folder, inside of the 'Results' folder.
* Other folders ('CFfigs', 'figs', 'Fotos') are created for later optional use and will be empty.
* NOTE: The image analysis is completed independent of labeling the data (strains, media type, etc.). Labeling happens next with the 'GenReports' function.
* Next, click on the 'GenReports' dropdown and select 'DrugMediaMP Generate .mat'
* NOTE: The 'MasterPlate' and 'DrugMedia' files have very specific formats and should be completed from a template.
* The MasterPlate file must be exact (it must contain all and only the strains that were actually tested).
* For example, if only part of a library is tested, the complete library file must be modified to remove irrelevant strains.
* You will be prompted to first select the 'MasterPlate' file. You will need to navigate away from the working directory to get to it.
* It is fine for the 'MasterPlate_' file to be .xlsx (or .xls), and if you don't see it in the popup window, then change the file type from '.xls' to "all files" and then select it.
* Once it is selected, a report of the number of master plates in the file will pop up; when the report appears, assuming it is correct, click on 'OK'.
* You will then be prompted to select the 'DrugMedia' file from the relevant job folder. You will automatically return to the correct prior directory location.
* Choose it and click 'OK'. You may see a warning about column headers being modified, but that's ok.
* This will create an additional file in the 'MasterPlateFiles' folder named 'MPDMmat.mat'
* Click on the 'GenReports' dropdown and select 'Results_Generate.'
* You will first see '!!ResultsELr_.txt' generated in the 'PrintResults' folder.
* Refreshing will reveal an increasing file size until you see the '!!ResultsStd_.txt' being generated.
* When finished, the '!!ResultsStd_.txt' will be about the same file size, and it should be used in the following StudiesQHTCP analysis.

'NoGrowth_.txt' and 'GrowthOnly_.txt' files will be generated in the 'PrintResults' folder.
EOF

EASY_DIR="$APPS_DIR/matlab/easy"
script="$EASY_DIR/EASYConsole.m"

# Add EASY directory to the Matlab path
# If this does not work we can try changing the -sd argument and if that fails then pushing/popping
debug "Adding EASY directory to the Matlab path"
hash matlab &>/dev/null &&
matlab -nodisplay -nosplash -nodesktop -nojvm -batch "addpath('$EASY_DIR')"

# Ask the user to launch EASYConsole.m in MATLAB
# MATLAB doesn't support passing args to scripts so we have to use ENV VARS instead
# These are probably already set in our scope but be explicit just in case so we don't have to export?
# TODO will need to play with the -sd startup option to see what works (well)
# Skip this step altogether in auto mode since it requires graphical interaction
if ! ((YES)) && ask "Start EASY in MATLAB? This requires a GUI."; then
export PROJECT_SCANS_DIR PROJECT_DATE PROJECT_USER PROJECT
export EASY_DIR EASY_NAME EASY_SUFFIX EASY_RESULTS_DIR
export MASTER_PLATE_FILE DRUG_MEDIA_FILE

choose_easy_results_dir

DRUG_MEDIA_FILE="$EASY_RESULTS_DIR/DrugMedia_$PROJECT.xls"
MASTER_PLATE_FILE="$EASY_RESULTS_DIR/MasterPlate_$PROJECT.xls"

if ((DEBUG)); then
declare -p easy # the assoc array declared by 'module easy'
fi

# Make EASY dirs
debug "mkdir -p $EASY_RESULTS_DIR"
mkdir -p "$EASY_RESULTS_DIR"
dirs=('PrintResults' 'CFfigs' 'Fotos')
for d in "${dirs[@]}"; do
debug "mkdir -p $EASY_RESULTS_DIR/$d"
mkdir -p "$EASY_RESULTS_DIR/$d"
done

# Copy templates
rsync -a "$EASY_DIR"/{figs,PTmats} "$EASY_RESULTS_DIR"

# Launch matlab
# matlab -nosplash -sd "$PROJECT_SCANS_DIR" -r "run $script"
matlab -nosplash -r "run $script"
fi

# Use the function return code to see if we succeeded
choose_easy_results "$PROJECT_SCANS_DIR" || { err "No EASY results found in $PROJECT_SCANS_DIR"; return 1; }
}


module ezview
# @section EZview
# @description TODO WIP
ezview() {
debug "Running: ${FUNCNAME[0]}"
export PROJECT_SCANS_DIR PROJECT_DATE PROJECT_USER EZVIEW_DIR
export EASY_RESULTS_DIR

EZVIEW_DIR="$APPS_DIR/matlab/ezview"
script="$EZVIEW_DIR/EZviewGui.m"

# Try to load the EASY output dir from ENV or the easy module
# Then scan for EASY results
[[ -z $EASY_RESULTS_DIR ]] && choose_easy_results_dir

# Launch matlab
matlab -nosplash -r "run $script"
}


module qhtcp
# @section QHTCP
# @description System for Multi-QHTCP-Experiment Gene Interaction Profiling Analysis
# * Functional rewrite of REMcMaster3.sh, RemcMaster2.sh, REMcJar2.sh, ExpFrontend.m, mProcess.sh, mFunction.sh, mComponent.sh
# * Added a newline character to the end of StudyInfo.csv so it is a valid text file
# TODO Suggest renaming StudiesQHTCP to something like qhtcp, qhtcp_output, or output
# TODO Store StudyInfo somewhere better
# TODO Move (hide) the study template somewhere else
# TODO StudiesArchive should be smarter:
# * Create a database with as much information as possible
# * Write a function that easily loads and parses the database into easy-to-use variables
# * Allow users to reference those variables to write their own modules
# TODO Should not be using initials
# * not unique enough and we don't have that data easily on hand
# * usernames are unique and make more sense
# * I don't know what all would have to be modified atm
#
# Rerunning this module uses rsync --update to only copy files that are newer in the template
# If you wish for the template to overwrite your changes, delete the file from your QHTCP project dir
#
# To create a new study (Experiment Specific Interaction Zscores generation)
#
# * StudyInfo.csv instructions (see the example after this list):
# * In your files directory, open the /Code folder, edit the 'StudyInfo.csv' spreadsheet, and save it as a 'csv' file to give each experiment the labels you wish to be used for the plots and specific files.
# * Enter the desired Experiment names - order the names in the way you want them to appear in the REMc heatmaps, and make sure to run the front end programs (below) in the correct order (e.g., run front end in 'exp1' folder to call the !!Results file for the experiment you named as exp1 in the StudyInfo.csv file)
# * The GTA and pairwise, TSHeatmaps, JoinInteractions, and GTF Heatmap scripts use this table to label results and heatmaps in a meaningful way for the user and others. The BackgroundSD and ZscoreJoinSD fields will be filled automatically according to user specifications, at a later step in the QHTCP study process.
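#
# An illustrative StudyInfo.csv (hypothetical experiment names; the columns are
# study number, experiment name, BackgroundSD, ZscoreJoinSD, and user initials,
# matching the auto-entry built in qhtcp() below):
#
#   1,ExpName1,NA,NA,AB
#   2,ExpName2,NA,NA,AB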
#
# * MATLAB ExpFrontend.m was made for recording into a spreadsheet ('StudiesDataArchive.txt') the date and files used (i.e., directory paths to the !!Results files used as input for the Z-interaction script) for each multi-experiment study.
# Give each experiment the labels you wish to be used for the plots and specific files.
# Enter the desired Experiment names and order them in the way you want them to appear in the REMc heatmaps;
# Run the front end MATLAB programs in the correct order (e.g., run front end in 'exp1' folder to call the !!Results file for the experiment you named as exp1 in the StudyInfo.csv file)
# The GTA and pairwise, TSHeatmaps, JoinInteractions, and GTF Heatmap scripts use this table to label results and heatmaps in a meaningful way for the user and others.
# The BackgroundSD and ZscoreJoinSD fields will be filled automatically according to user specifications, at a later step in the QHTCP study process.
#
# * Open MATLAB and in the application navigate to each specific /Exp folder, call and execute ExpFrontend.m by clicking the play icon.
# * Use the "Open file" function from within Matlab.
# * Do not double-click on the file from the directory.
# * When prompted, navigate to the ExpJobs folder and the PrintResults folder within the correct job folder.
# * Repeat this for every Exp# folder depending on how many experiments are being performed.
# * Note: Before doing this, it's a good idea to compare the ref and non-ref CPP average and median values. If they are not approximately equal, then it may be helpful to standardize Ref values to the measures of central tendency of the Non-refs, because the Ref CPPs are used for the z-scores, which should be centered around zero.
# * This script will copy the !!ResultsStd file (located in /PrintResults in the relevant job folder in /ExpJobs; rename this !!Results file before running the front end - we normally use the 'STD', not the 'ELR', file) to the Exp# directory, as can be seen in the "Current Folder" column in MATLAB, and it updates the 'StudiesDataArchive.txt' file that resides in the /StudiesQHTCP folder. 'StudiesDataArchive.txt' is a log of file paths used for different studies, including timestamps.
#
# Do this to document the names, dates, and paths of all the studies and experiment data used in each study. Note, one should only have a single '!!Results…' file for each /Exp_ to prevent ambiguity and confusion. If you decide to use a new or different '!!Results…' sheet from what was used in a previous "QHTCP Study", remove the one not being used. NOTE: if you copy a '!!Results…' file in by hand, it will not be recorded in the 'StudiesDataArchive.txt' file and so will not be documented for future reference. If you use the ExpFrontend.m utility it will append the new source for the raw !!Results… to the 'StudiesDataArchive.txt' file.
# As stated above, it is advantageous to think about the comparisons one wishes to make so as to order the experiments in a rational way as it relates to the presentation of plots. That is, which results sheets and selected 'interaction … .R' user-modified script are used in /Exp1, Exp2, Exp3, and Exp4, as explained in the following section.
# TODO MUST CLEAN UP QHTCP TEMPLATE DIRECTORY
#
# As stated earlier, the user can add folders to back up temporary results, study-related notes, or other related work.
# However, it is advised to set up and use separate STUDIES when evaluating differing data sets, whether that is from experiment results files or from differing data selections in the first interaction … .R script stage.
# This reduces confusion at the time of the study and especially for those reviewing study analysis in the future.
# How-To Procedure: Execute a Multi-experiment Study
# * Consider the goals of the study and design a strategy of experiments to include in the study.
# * Consider the quality of the experiment runs using EZview to see if there are systematic problems that are readily detectable.
# * In some cases, one may wish to design a 'pilot' study for discovery purposes.
# * There is no problem doing that, just take a template study, copy and rename it as XYZpilotStudy etc.
# * However, careful examination of the experimental results using EZview will likely save time in the long run.
# * One may be able to relatively quickly run the interaction Z scores (the main challenge there is the user creation of customized interaction… .R code.
# * I have tried to simplify this by locating the user edits near the top).
#
qhtcp() {
debug "Running: ${FUNCNAME[0]}"

if [[ -d $QHTCP_PROJECT_DIR ]]; then
echo "A project already exists at $QHTCP_PROJECT_DIR"
ask "Safely update $QHTCP_PROJECT_DIR from the $QHTCP_TEMPLATE_DIR template?" || return 1
if ! ((YES)) && ask "Back up $QHTCP_PROJECT_DIR to $QHTCP_PROJECT_DIR.bk first and start fresh?"; then
mv "$QHTCP_PROJECT_DIR" "$QHTCP_PROJECT_DIR.bk" || { err "Backup unsuccessful, exiting"; exit 1; }
fi
fi

# Copy template to QHTCP project directory
if rsync --archive --update "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_PROJECT_DIR"; then
echo "New project created at $QHTCP_PROJECT_DIR"
fi

# Sets STUDIES_NUMS and NUM_STUDIES (yes this makes sense)
get_studies "$STUDY_INFO_FILE"

# Construct the next auto-entry
# 1,ExpName1,NA,NA,UserInitials
next_study_num=$(( NUM_STUDIES + 1 ))

# If the next Exp dir already exists don't use it
while [[ -d $QHTCP_PROJECT_DIR/Exp$next_study_num ]]; do
(( next_study_num++ ))
done

# Use initials from project not whoami
# Best I can do is first two letters of username
# See TODO in markdown
initials="${PROJECT_USER:0:2}"
INITIALS=${initials^^}
next_study_entry="$next_study_num,$PROJECT_SUFFIX,NA,NA,$INITIALS"
debug "$next_study_entry"

# Print current studies
[[ -f $STUDY_INFO_FILE ]] &&
echo "Current studies from $STUDY_INFO_FILE: " &&
cat "$STUDY_INFO_FILE"

# Ask user to edit STUDY_INFO_FILE
if ! ((YES)) && ask "Would you like to edit $STUDY_INFO_FILE to add or modify studies?"; then
cat <<-EOF
Give each experiment the labels you wish to be used for the plots and specific files.
Enter the desired Experiment names and order them in the way you want them to appear in the REMc heatmaps

Auto-entry suggestion: $next_study_entry
EOF
if ask "Would you like to add (y) the auto-entry suggestion to $STUDY_INFO_FILE or edit $STUDY_INFO_FILE in nano (n)?"; then
echo "$next_study_entry" >> "$STUDY_INFO_FILE"
else
debug "nano $STUDY_INFO_FILE"
nano "$STUDY_INFO_FILE"
fi
fi

# Initialize missing dirs
STUDIES_DIRS=()
for s in "${STUDIES_NUMS[@]}"; do
STUDY_DIR="$QHTCP_PROJECT_DIR/Exp$s"
STUDIES_DIRS+=("$STUDY_DIR")
if ! [[ -d $STUDY_DIR ]]; then
if ! rsync --archive "$STUDY_TEMPLATE_DIR" "$STUDY_DIR"; then
err "Could not copy $STUDY_TEMPLATE_DIR template to $STUDY_DIR"
continue
fi
fi
done
unset STUDY_DIR

# Replacing ExpFrontend.m
choose_easy_results_dir

# Create the tab-separated studies archive file if missing
if ! [[ -f $STUDIES_ARCHIVE_FILE ]]; then
header=(StudyDate StudyName StudyPath ExpNum ExpDate ExpPath ResultsDir ResultFile)
printf "%s\t" "${header[@]}" > "$STUDIES_ARCHIVE_FILE"
printf "\n" >> "$STUDIES_ARCHIVE_FILE"
fi
# TODO Add them all to StudiesDataArchive?
# Probably better to always add and remove dupes later since each invocation "counts"?
for f in "${EASY_RESULTS_FILES[@]}"; do
for s in "${STUDIES_NUMS[@]}"; do
# Trying to match old ExpFrontend formatting
printf "%s\t" \
"${DATE//_/}" "$PROJECT" "$QHTCP_PROJECT_DIR" "Exp$s" \
"$PROJECT_DATE" "$PROJECT_SCANS_DIR" "$EASY_RESULTS_DIR" "${f##*/}" \
>> "$STUDIES_ARCHIVE_FILE"
printf "\n" >> "$STUDIES_ARCHIVE_FILE"
done
done

# Run R interactions script on all studies
for s in "${STUDIES_NUMS[@]}"; do
STUDY_DIR="$QHTCP_PROJECT_DIR/Exp$s"
r_interactions \
"$STUDY_DIR" \
"$STUDY_INFO_FILE" \
"$STUDY_DIR/zscores/" \
"$APPS_DIR/r/SGD_features.tab" \
5
done

# Run remc as part of the QHTCP process
# pass all the study directories to it so the scripts have all the paths
remc "$STUDY_INFO_FILE" "${STUDIES_DIRS[@]}"
}


module remc
# @section REMc
# @description REMc module for QHTCP
# TODO which components of remc can be parallelized?
# The submodules in remc really like to be run from the REMc dir
# so we pop in and out for now
# NOTE the remc modules could use some love
# * Don't cd within scripts, it's confusing
# * Use arguments to pass configuration variables
# * This allows us to abstract the program away in script-run-workflow and treat it like a module
# @arg $1 string studyInfo file
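# @arg $@ string Study directories (forwarded to r_join_interactions)
#
# Rough data flow through the chain below (file names per the arguments used):
# ZScores (per study) -> r_join_interactions -> out/ -> java_extract (REMc)
# -> REMcRdy_lm_only.csv-finalTable.csv + Shift_only.csv -> r_add_shift_values
# -> REMcWithShift.csv -> r_create_heat_maps and r_heat_maps_homology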
remc() {
debug "Running: ${FUNCNAME[0]}" "$@"

# If any submodules fail the rest will not run, this is fundamental to module design
# Remove leading && to run regardless
# TODO can this be
# r_join_interactions args: output directory, sd value, studyInfo file, study dirs
r_join_interactions \
"$QHTCP_PROJECT_DIR/out" \
2 \
"$1" \
"${@:2}" \
&& java_extract \
"$QHTCP_PROJECT_DIR/out/" \
&& r_add_shift_values \
"$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv-finalTable.csv" \
"$QHTCP_PROJECT_DIR/Shift_only.csv" \
"$1" \
"$QHTCP_PROJECT_DIR/REMcWithShift.csv" \
&& r_create_heat_maps \
"$QHTCP_PROJECT_DIR/REMcWithShift.csv" \
"$QHTCP_PROJECT_DIR/out" \
&& r_heat_maps_homology \
"$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv-finalTable.csv" \
"$APPS_DIR/r/170503_DAmPs_Only.txt" \
"$APPS_DIR/r/Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv" \
"$QHTCP_PROJECT_DIR/out/homology"
}


module gtf
# @section GTF
# @description GTF module for QHTCP
gtf() {
debug "Running: ${FUNCNAME[0]}"
process_dir="$QHTCP_PROJECT_DIR/out/gtf/process"
function_dir="$QHTCP_PROJECT_DIR/out/gtf/function"
component_dir="$QHTCP_PROJECT_DIR/out/gtf/component"
out_dir="$QHTCP_PROJECT_DIR/out/gtf"

py_gtf_dcon \
"$process_dir" \
"$out_dir"

# Seed the function and component directories from the process directory
for d in "$function_dir" "$component_dir"; do
debug "rsync -a $process_dir/ $d/"
rsync -a "$process_dir/" "$d/"
done
# Perform operations on each directory in parallel
for d in "$process_dir" "$function_dir" "$component_dir"; do
out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename
( pl_gtf "$d" "$out_dir" && py_gtf_concat "$d" "$out_dir" "$out_file" ) & # concat follows pl_gtf, so background the pair
done
wait # block until all three directories are finished

r_compile_gtf "$out_dir"
}


module gta
# @section GTA
# @description GTA module for QHTCP
# NOTES
# * Heavily modified GTAtemplate.R
# TODO
# *
# *
# @set GTA_OUT_DIR string The GTA output results dir
# @set all_sgd_terms_csv string The All_SGD_GOTerms_for_QHTCPtk.csv file
# @set sgd_terms_tfile string The go_terms.tab file
# @set sgd_features_file string The gene_association.sgd file
# @set gene_ontology_file string The gene_ontology_edit.obo file
# @set zscores_file string The ZScores_interaction.csv file
gta() {
debug "Running: ${FUNCNAME[0]}"

GTA_OUT_DIR="$QHTCP_PROJECT_DIR/gta"
all_sgd_terms_csv="$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"
sgd_terms_tfile="$APPS_DIR/r/go_terms.tab"
sgd_features_file="$APPS_DIR/r/gene_association.sgd"
gene_ontology_file="$APPS_DIR/r/gene_ontology_edit.obo"
zscores_file="zscores/zscores_interaction.csv"

# Sets STUDIES_NUMS and NUM_STUDIES
get_studies "$STUDY_INFO_FILE"

[[ -d $GTA_OUT_DIR ]] || mkdir "$GTA_OUT_DIR"

# Loop over the array and print the unique pairs (debug aid)
for ((i=0; i<${#STUDIES_NUMS[@]}; i++)); do
for ((j=i+1; j<${#STUDIES_NUMS[@]}; j++)); do
pair=("${STUDIES_NUMS[i]}" "${STUDIES_NUMS[j]}")
debug "${pair[@]}"
done
done

# Create ordered pairwise combinations of study nums from dir names
study_combos=()
for ((i=0; i<${#STUDIES_NUMS[@]}; i++)); do
# Loop through the array again
for ((j=0; j<${#STUDIES_NUMS[@]}; j++)); do
# If the indices are not the same
if (( i != j )); then
# Record the combination
study_combos+=("${STUDIES_NUMS[$i]},${STUDIES_NUMS[$j]}")
fi
done
done
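
# e.g. with STUDIES_NUMS=(1 2 3), study_combos becomes (1,2 1,3 2,1 2,3 3,1 3,2) -
# both orders of each pair, presumably so PairwiseLK can compare in each direction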

# The following are three types of studies

# Individual studies
for s in "${STUDIES_NUMS[@]}"; do
study_zscores_file="$QHTCP_PROJECT_DIR/Exp$s/$zscores_file" # don't clobber the relative path used below
if [[ -f $study_zscores_file ]]; then
mkdir -p "$GTA_OUT_DIR/Exp$s"
r_gta "Exp$s" "$study_zscores_file" "$sgd_terms_tfile" "$sgd_features_file" "$GTA_OUT_DIR"
fi
done

# Combination studies (for pairwise comparisons)
for combo in "${study_combos[@]}"; do
# Split on comma and assign to array
IFS=',' read -ra studies <<< "$combo"
r_gta_pairwiselk "${studies[0]}" "${studies[1]}" "$STUDY_INFO_FILE" "$GTA_OUT_DIR"
done

# All studies
# If you have an unknown # of studies it must be passed last and any preceding arguments
# are required
r_gta_heatmaps \
"$STUDY_INFO_FILE" \
"$gene_ontology_file" \
"$sgd_terms_tfile" \
"$all_sgd_terms_csv" \
"$zscores_file" \
"$QHTCP_PROJECT_DIR" \
"$QHTCP_PROJECT_DIR/TermSpecificHeatmaps" \
"${STUDIES_NUMS[@]}"
}


# @section Submodules
# @description Submodules provide functionality to modules and should be reusable
# A submodule only runs by default if called by a module
# Use a submodule for:
# * Calling external scripts
# * Performing repetitive tasks
# * Generalizing code
# * Functions you do not want to run by default (anything that should run by default belongs in a module)
# * Submodules should not call cd or pushd (let the calling module dictate the directory)


submodule r_gta
# @description GTAtemplate R script
# TODO:
# * Is GTAtemplate.R actually a template?
#
# Files:
# * gene_association.sgd: https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
# * go_terms.tab
#
# Output:
# *
#
# This submodule:
# *
# *
#
# @arg $1 string Exp# name
# @arg $2 string ZScores_Interaction.csv file
# @arg $3 string go_terms.tab file
# @arg $4 string gene_association.sgd
# @arg $5 string output directory
#
r_gta() {
debug "Running: ${FUNCNAME[0]}" "$@"
cat <<-EOF

EOF

script="$APPS_DIR/r/GTAtemplate.R"

[[ -d $5 ]] || mkdir -p "$5"

debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" "$@"
}
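
# Mirrors the call made from the gta module, e.g.:
# r_gta "Exp1" "$QHTCP_PROJECT_DIR/Exp1/zscores/zscores_interaction.csv" \
#   "$sgd_terms_tfile" "$sgd_features_file" "$GTA_OUT_DIR"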


submodule r_gta_pairwiselk
# @description PairwiseLK.R R script
# TODO:
# * Should move directory creation from PairwiseLK.R to gta module
#
# Files:
# *
# *
#
# Output:
# *
#
# This submodule:
# * Will perform both L and K comparisons for the specified experiment folders.
# * The code uses the naming convention of PairwiseCompare_Exp'#'-Exp'#' to standardize and keep simple the structural naming (where '#' is the number of the experiment whose GTA results are found in ../GTAresult/Exp_; 'L' and 'K' denote the two comparison types).
# * {FYI There are also individual scripts that just do the 'L' or 'K' pairwise studies in the ../Code folder.}
#
# @arg $1 string First Exp# name
# @arg $2 string Second Exp# name
# @arg $3 string StudyInfo.csv file
# @arg $4 string output directory
#
r_gta_pairwiselk() {
debug "Running: ${FUNCNAME[0]}" "$@"
cat <<-EOF

EOF

script="$APPS_DIR/r/PairwiseLK.R"

[[ -d $4 ]] || mkdir -p "$4"

debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" "$@"
}


submodule r_gta_heatmaps
# @description TSHeatmaps5dev2.R R script
# TODO:
# * Script could use a rename
# * Script should be refactored to automatically allow more studies
# * Script should be refactored with more looping to reduce verbosity
#
# Files:
# *
# *
#
# Output:
# *
#
# This submodule:
# * The Term Specific Heatmaps are produced directly from the ../ExpStudy/Exp_/ZScores/ZScores_Interaction.csv file generated by the user-modified interaction… .R script.
# * The heatmap labeling is per the names the user wrote into the StudyInfo.csv spreadsheet.
# * Verify that the All_SGD_GOTerms_for_QHTCPtk.csv found in ../Code is what you wish to use or if you wish to use a custom modified version.
# * If you wish to use a custom modified version, create it and modify the TSHeatmaps template script (TSHeatmaps5dev2.R) and save it as a 'TSH_study specific name'.
#
# @arg $1 string StudyInfo.csv file
# @arg $2 string gene_ontology_edit.obo file
# @arg $3 string go_terms.tab file
# @arg $4 string All_SGD_GOTerms_for_QHTCPtk.csv
# @arg $5 string ZScores_interaction.csv
# @arg $6 string base directory
# @arg $7 string output directory
# @arg $8.. string Study numbers (variadic, passed last)
#
r_gta_heatmaps() {
debug "Running: ${FUNCNAME[0]}" "$@"
cat <<-EOF

EOF

script="$APPS_DIR/r/TSHeatmaps5dev2.R"

[[ -d $7 ]] || mkdir -p "$7" # $7 is the output directory

debug "$RSCRIPT $script $*"
"$RSCRIPT" "$script" "$@"
}



# submodule mat_exp_frontend
# # @description Run the ExpFrontend.m program
# # This submodule:
# # * Pushes into the Study template directory (ExpTemplate)
# # * Prompts the user to run ExpFrontend.m
# # * Pops out
# # NOTES:
# # * ExpFrontend.m should be or is being rewritten
# mat_exp_frontend() {
# debug "Running: ${FUNCNAME[0]}"
# cat <<-EOF
# ExpFrontend.m was made for recording into a spreadsheet
# ('StudiesDataArchive.txt') the date and files used (i.e., directory paths to the
# !!Results files used as input for Z-interaction script) for each multi-experiment study.

# Run the front end MATLAB programs in the correct order (e.g., run front end in "exp1"
# folder to call the !!Results file for the experiment you named as exp1 in the StudyInfo.csv file)
# The GTA and pairwise, TSHeatmaps, JoinInteractions and GTF Heatmap scripts use this table
# to label results and heatmaps in a meaningful way for the user and others.
# The BackgroundSD and ZscoreJoinSD fields will be filled automatically according to user
# specifications, at a later step in the QHTCP study process.

# Open MATLAB and in the application navigate to each specific /Exp folder,
# call and execute ExpFrontend.m by clicking the play icon.
# Use the "Open file" function from within Matlab.
# Do not double-click on the file from the directory.
# When prompted, navigate to the ExpJobs folder and the PrintResults folder within the correct job folder.
# Repeat this for every Exp# folder depending on how many experiments are being performed.
# The Exp# folder must correspond to the StudyInfo.csv created above.
# EOF

# script="ExpFrontend.m"
# if ! ((YES)) &&
# ask "Start MATLAB to run $script? This requires a GUI."; then
# matlab -nosplash -r "$script"
# fi
# }
submodule r_interactions
# @description Run the R interactions analysis (Z_InteractionTemplate.R)
# TODO
# * Don't rename Z_InteractionTemplate.R because that will break logic; edit it in place instead
# NOTES
# @arg $1 string The current working directory
r_interactions() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	cat <<-EOF
	Edit the Z_InteractionTemplate.R script in each Exp dir beginning at the '++BEGIN USER DATA SELECTION++'
	marker, so that the data of interest for each experiment is appropriately selected from the !!Results…txt file.

	Be sure to enter the background noise filter standard deviation (e.g., 3 or 5, per Sean).
	Enter a standard deviation value for removing culture data due to high background
	(e.g., contaminated cultures). Set this very high (e.g., '20') on the first run in order
	NOT to remove data. Review the QC data and inspect the raw image data to decide whether
	removing data is desirable, and then rerun the analysis.

	Enter a Background SD threshold for EXCLUDING culture data from further analysis:
	This background value removes data where there is high pixel intensity in the background
	regions of a spot culture (i.e., suspected contamination). 5 is a minimum recommended value,
	because lower values result in more data being removed, and oftentimes this is undesirable
	if contamination occurs late, after the carrying capacity of the yeast culture is reached.
	This is most often "trial and error": the 'Frequency_Delta_Background.pdf' report in the
	/Exp_/ZScores/QC/ folder helps evaluate whether the chosen value was suitable (and if not,
	the analysis can simply be rerun with a better choice). In general, err on the high side,
	with a Background SD of 10 or 12. One can also use EZview to examine the raw images and the
	individual cultures potentially included/excluded as a consequence of the selected value.
	Background values are reported in the results sheet and so could also be analyzed there.
	EOF

	script="$APPS_DIR/r/interactions.R"

	debug "$RSCRIPT $script $*"
	"$RSCRIPT" "$script" "$@"
}
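
# Illustrative invocation of r_interactions (a sketch; the Exp directory
# layout is an assumption based on the notes above):
#   r_interactions "$QHTCP_PROJECT_DIR/Exp1"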
submodule r_join_interactions
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
# @arg $1 string The output directory
# @arg $2 string The sd value
# @arg $3 string The StudyInfo file
r_join_interactions() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	script="$APPS_DIR/r/joinInteractExps.R"
	debug "$RSCRIPT $script $*"
	"$RSCRIPT" "$script" "$@"
	local out_files=("$1/REMcRdy_lm_only.csv" "$1/Shift_only.csv" "$1/parameters.csv")
	for f in "${out_files[@]}"; do
		# Use a brace group, not a subshell: 'return' inside (...) only exits the subshell
		[[ -f $f ]] || { echo "$f does not exist"; return 1; }
	done
}
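
# Illustrative invocation of r_join_interactions (a sketch; the sd value '2'
# is a hypothetical example, not a recommended default):
#   r_join_interactions "$QHTCP_PROJECT_DIR" 2 "$STUDY_INFO_FILE"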
submodule java_extract
# @description Jingyu's REMc java utility, using input file REMcRdy_lm_only.csv
# and producing output REMcRdy_lm_only.csv-finalTable.csv
# I'm not sure if the output dir is configurable, so we can copy data around or push/pop
# @arg $1 string The output directory
java_extract() {
	debug "Running: ${FUNCNAME[0]}"
	classpath="$APPS_DIR/java/javaExtract.jar"
	out_file="$1/REMcRdy_lm_only.csv-finalTable.csv"

	# Backup REMcRdy_lm_only.csv-finalTable.csv
	if ! backup "$out_file"; then
		ask "Backup of $out_file failed, continue?" || return 1
	fi

	java_cmd=(
		"$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain
		"$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv"
		"$APPS_DIR/java/GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"
		"$APPS_DIR/java/ORF_List_Without_DAmPs.txt" 1 true true
	)

	debug "pushd && ${java_cmd[*]} && popd"
	pushd "$1" && "${java_cmd[@]}" && popd || return 1

	# Use a brace group, not a subshell, so the failure propagates to the caller
	[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
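
# Illustrative invocation of java_extract (a sketch; assumes REMcRdy_lm_only.csv
# was already produced in $QHTCP_PROJECT_DIR by r_join_interactions):
#   java_extract "$QHTCP_PROJECT_DIR"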
submodule r_add_shift_values
# @description Add shift values back to REMcRdy_lm_only.csv-finalTable.csv
# and output "REMcWithShift.csv" for use with the REMc heat maps
# @arg $1 string The REMc final table (REMcRdy_lm_only.csv-finalTable.csv)
# @arg $2 string Shift_only.csv
# @arg $3 string REMcWithShift.csv
# @arg $4 string The sd value
r_add_shift_values() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	script="$APPS_DIR/r/addShiftVals.R"
	out_file="$QHTCP_PROJECT_DIR/REMcWithShift.csv"
	debug "$RSCRIPT $script $*"
	"$RSCRIPT" "$script" "$@"
	rm -f "REMcHeatmaps/"*.pdf
	[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
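
# Illustrative invocation of r_add_shift_values (a sketch; argument order per
# the @arg docs, and the sd value '2' is a hypothetical example):
#   r_add_shift_values \
#     "$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv-finalTable.csv" \
#     "$QHTCP_PROJECT_DIR/Shift_only.csv" \
#     "$QHTCP_PROJECT_DIR/REMcWithShift.csv" 2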
submodule r_create_heat_maps
# @description Execute createHeatMaps.R
# @arg $1 string The final shift table (REMcWithShift.csv)
# @arg $2 string The output directory
r_create_heat_maps() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	script="$APPS_DIR/r/createHeatMaps.R"
	out_file="$QHTCP_PROJECT_DIR/compiledREMcHeatmaps.pdf"
	debug "$RSCRIPT $script $*"
	"$RSCRIPT" "$script" "$@"
	shopt -s nullglob
	pdfs=(REMcHeatmaps/*.pdf) # empty array instead of a literal glob when nothing matches
	shopt -u nullglob
	debug "pdftk ${pdfs[*]} output $out_file"
	pdftk "${pdfs[@]}" output "$out_file"
	[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
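
# Illustrative invocation of r_create_heat_maps (a sketch; note the function
# globs REMcHeatmaps/*.pdf relative to the current working directory):
#   r_create_heat_maps "$QHTCP_PROJECT_DIR/REMcWithShift.csv" "$QHTCP_PROJECT_DIR"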
submodule r_heat_maps_homology
# @description Execute createHeatMapsHomology.R
# @arg $1 string The final shift table (REMcRdy_lm_only.csv-finalTable.csv)
# @arg $2 string The (Shift_only.csv)
# @arg $3 string The (Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv)
# @arg $4 string The output directory
r_heat_maps_homology() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	script="$APPS_DIR/r/createHeatMapsHomology.R"
	out_file="$4/compiledREMcHomologyHeatmaps.pdf"

	# Clean old output (-f so an already-clean directory doesn't abort the run)
	rm -f "$4/"*.{pdf,csv}

	"$RSCRIPT" "$script" \
		REMcWithShift.csv \
		Homology \
		"$APPS_DIR/r/170503_DAmPs_Only.txt" \
		Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv

	# TODO: the original globbed "$work_dir"/homology/*.pdf, but $work_dir is
	# never set in this function; assuming the PDFs land under "$4"
	pdfs=("$4"/*.pdf)
	pdftk "${pdfs[@]}" output "$out_file"

	[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
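
# Illustrative invocation of r_heat_maps_homology (a sketch; the mapping file
# location and output directory are hypothetical examples):
#   r_heat_maps_homology \
#     "$QHTCP_PROJECT_DIR/REMcRdy_lm_only.csv-finalTable.csv" \
#     "$QHTCP_PROJECT_DIR/Shift_only.csv" \
#     "$APPS_DIR/r/Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv" \
#     "$QHTCP_PROJECT_DIR/homology"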
submodule py_gtf_dcon
# @description Perform python dcon portion of GTF
# @arg $1 string Directory to process
# @arg $2 string Output directory name
py_gtf_dcon() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	script="$APPS_DIR/python/DconJG2.py"
	debug "$PYTHON $script $1 $2/"
	"$PYTHON" "$script" "$1" "$2/" # was "$SCRIPT" (this workflow script itself), not "$script"
	out_file="$2/1-0-0-finaltable.csv"
	[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
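
# Illustrative invocation of py_gtf_dcon (a sketch; the gtf output directory
# name is a hypothetical example):
#   py_gtf_dcon "$QHTCP_PROJECT_DIR" "$QHTCP_PROJECT_DIR/gtf"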
submodule pl_gtf
# @description Perl modules for GTF
# @arg $1 string Working directory
# @arg $2 string Output directory name to look for txt files
pl_gtf() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	set1="$APPS_DIR/perl/ORF_List_Without_DAmPs.txt"
	shopt -s nullglob
	set2=("$2"/*.txt) # glob them all
	shopt -u nullglob
	for s2 in "${set2[@]}"; do
		debug "pl_gtf_analyze $set1 $s2"
		pl_gtf_analyze "$set1" "$s2"
		debug "pl_gtf_terms2tsv $s2"
		pl_gtf_terms2tsv "$s2"
	done
}
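
# Illustrative invocation of pl_gtf (a sketch; directory names are hypothetical
# examples matching the py_gtf_dcon sketch above):
#   pl_gtf "$QHTCP_PROJECT_DIR" "$QHTCP_PROJECT_DIR/gtf"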
submodule pl_gtf_analyze
# @description Perl analyze submodule
# This seems weird to me because we're just overwriting the same data for all set2 members
# https://metacpan.org/dist/GO-TermFinder/view/examples/analyze.pl
# Is there a reason you need a custom version and not the original from cpan?
# @arg $1 string Set 1 TODO naming
# @arg $2 string Set 2 TODO naming
pl_gtf_analyze() {
	debug "Running: ${FUNCNAME[0]}"
	script="$APPS_DIR/perl/analyze_v2.pl"
	an="$APPS_DIR/perl/gene_association.sgd"
	obo="$APPS_DIR/perl/gene_ontology_edit.obo"
	debug "$PERL $script -an $an -as P -o $obo -b $1 $2"
	"$PERL" "$script" -an "$an" -as P -o "$obo" -b "$1" "$2"
}
submodule pl_gtf_terms2tsv
# @description Perl terms2tsv submodule
# Probably should be translated to shell/python
#
# @arg $1 string Terms file TODO naming
pl_gtf_terms2tsv() {
	debug "Running: ${FUNCNAME[0]}" "$@"
	script="$APPS_DIR/perl/terms2tsv.pl"
	debug "$PERL $script $1.terms > $1.tsv"
	"$PERL" "$script" "$1.terms" > "$1.tsv"
}
submodule py_gtf_concat
# @description Python concat submodule for GTF
# Concat the process ontology outputs from the /REMcReady_lm_only folder
# Probably should be translated to bash
# @arg $1 string working directory
# @arg $2 string output directory name to look for txt files
# @arg $3 string output file
py_gtf_concat() {
	debug "Running: ${FUNCNAME[0]}"
	script="$APPS_DIR/python/concatGTFResults.py"
	debug "$PYTHON $script $2/ $3"
	"$PYTHON" "$script" "$2/" "$3"
	[[ -f $3 ]] || { echo "$3 does not exist"; return 1; }
}
submodule r_compile_gtf
# @description Compile GTF in R
# @arg $1 string gtf output directory
r_compile_gtf() {
	debug "Running: ${FUNCNAME[0]}"
	script="$APPS_DIR/r/CompileGTF.R"
	debug "$RSCRIPT $script $1"
	"$RSCRIPT" "$script" "$1"
}
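
# Illustrative chaining of the GTF submodules above (a sketch; the gtf/
# directory layout and concatenated output file name are assumptions):
#   gtf_dir="$QHTCP_PROJECT_DIR/gtf"
#   py_gtf_dcon "$QHTCP_PROJECT_DIR" "$gtf_dir" &&
#     pl_gtf "$QHTCP_PROJECT_DIR" "$gtf_dir" &&
#     py_gtf_concat "$QHTCP_PROJECT_DIR" "$gtf_dir" "$gtf_dir/concatGTFResults.csv" &&
#     r_compile_gtf "$gtf_dir"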
submodule get_studies
# @description Parse study names from StudyInfo.csv files
# TODO: This whole submodule should eventually be either
# * Removed
# * Expanded into a file that stores all project/study settings (database)
# I had to add a newline to the end of StudyInfo.csv, may break things?
# Example:
# ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy
# 1,ExpName1,NA,NA,UserInitials
# 2,ExpName2,NA,NA,UserInitials
# 3,ExpName3,NA,NA,UserInitials
# @exitcode 0 If one or more studies found
# @exitcode 1 If no studies found
# @set STUDIES_NUMS array Contains Exp numbers
# @set NUM_STUDIES int Number of existing studies
# @arg $1 string File to read
get_studies() {
	debug "Running: ${FUNCNAME[0]}"
	declare -ga STUDIES_NUMS=()
	while IFS=',' read -r col1 _; do # split on comma, keep the first column (Exp number)
		STUDIES_NUMS+=("$col1")
	done < <(tail -n +2 "$1") # skip header

	[[ ${#STUDIES_NUMS[@]} -gt 0 ]] &&
		NUM_STUDIES="${#STUDIES_NUMS[@]}" # was ${#STUDIES_NUMS{@}}, a syntax error
}
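
# Illustrative usage of get_studies (a sketch, using the CSV layout from the
# example above):
#   get_studies "$STUDY_INFO_FILE" &&
#     echo "Found $NUM_STUDIES studies: ${STUDIES_NUMS[*]}"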
submodule choose_easy_results_dir
# @description Chooses an EASY scans directory if the information is undefined
# TODO: Standardize EASY output, it's hard to understand
# TODO: Eventually we could run this on multiple results dirs simultaneously with some refactoring
# @exitcode 0 If an EASY results dir was successfully chosen
# @set EASY_RESULTS_DIR string The working EASY output directory
choose_easy_results_dir() {
	debug "Running: ${FUNCNAME[0]}"

	declare -g EASY_RESULTS_DIR="$QHTCP_PROJECT_DIR/easy"

	# Always backup existing output
	# This would happen if you ran the same experiment twice in one day, for instance
	if [[ -d $EASY_RESULTS_DIR ]]; then
		backup_dir="$EASY_RESULTS_DIR"
		count=1
		while [[ -d $backup_dir ]]; do
			backup_dir="$backup_dir.$((count++))"
		done
		echo "Backing up existing output from $EASY_RESULTS_DIR to $backup_dir"
		debug "rsync -a $EASY_RESULTS_DIR $backup_dir"
		rsync -a "$EASY_RESULTS_DIR" "$backup_dir" || return 1
	fi

	# The original errored when the directory already existed; only a failed
	# mkdir should be an error
	if [[ ! -d $EASY_RESULTS_DIR ]]; then
		debug "mkdir $EASY_RESULTS_DIR"
		mkdir "$EASY_RESULTS_DIR" || { err "Could not create $EASY_RESULTS_DIR"; return 1; }
	fi

	# echo "Hit enter to use the default EASY results directory: $default_easy_results_dir"
	# if ! (( YES )); then
	# 	read -r -p "Or enter a custom directory name, example: $PROJECT" dirname
	# 	[[ -z $dirname ]] && EASY_RESULTS_DIR="$default_easy_results_dir" && return 0
	# fi
	# ((YES)) && return 0
	# # Let's get a little fancy
	# shopt -s nullglob
	# declare -la easy_results_dirs=( "$easy_out_dir/"*/ )
	# shopt -u nullglob
	# # Sort the dirs
	# mapfile -t easy_results_dirs < <(printf '%s\n' "${easy_results_dirs[@]}" | sort)
	# last_index=$(( ${#easy_results_dirs[@]} - 1 ))
	# ((YES)) && EASY_RESULTS_DIR="${easy_results_dirs[$last_index]}" && return 0
	# echo "Multiple EASY results dirs found in $PROJECT_SCANS_DIR"
	# echo "Here is a list: "
	# for i in "${!easy_results_dirs[@]}"; do
	# 	printf "%d. %s\n" "$((i+1))" "${easy_results_dirs[i]}"
	# done
	# printf "%s\n" "${easy_results_dirs[@]}"
	# last_index=$(( ${#easy_results_dirs[@]} - 1 ))
	# read -r -p "Enter the item number to select EASY results directory, default ($last_index): " response
	# [[ -z $response ]] && response=$last_index
	# response=$(( response - 1 )) # bash arrays use zero indexing
	# EASY_RESULTS_DIR="${easy_results_dirs[$response]}"
	# EASY_RESULTS_FILES=("$EASY_RESULTS_DIR/"*"/PrintResults/!!"*)
	# [[ ${#easy_results_dirs[@]} -gt 0 ]]
}
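
# Illustrative usage of choose_easy_results_dir (a sketch):
#   choose_easy_results_dir && echo "EASY results dir: $EASY_RESULTS_DIR"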
submodule documentation
# @section Documentation
# @description Generates markdown documentation from this script using shdoc
#
# TODO
# * We can include images in the markdown file but not natively with shdoc
# * Need to add a post processor
# * Or use a 'veryuniqueword' and some fancy sed
# @noargs
documentation() {
	debug "Running: ${FUNCNAME[0]}"
	# Print markdown to stdout
	((DEBUG)) && shdoc < "$SCRIPT"
	# Create markdown file
	shdoc < "$SCRIPT" > README.md
}
# @description The main loop of script-run-workflow
# May eventually need to add git ops
# Passes on arguments
# Most variables in main() are user configurable or can be overridden by env
# @internal
main() {
	debug "Running: ${FUNCNAME[0]}" "$@"

	# Where are we
	SCRIPT=$(realpath -s "${BASH_SOURCE[0]}")
	SCRIPT_DIR=$(dirname "$SCRIPT")

	# Global directory variables
	declare -g TEMPLATES_DIR="$SCRIPT_DIR/templates"
	APPS_DIR="$SCRIPT_DIR/apps"
	DATE="$(date +%Y%m%d)" # change in EASYConsole.m to match
	# scans_hierarchy=("./scans" "/mnt/data/scans" "/mnt/data/ExpJobs" "./scans")
	local scans_hierarchy=("$SCANS_DIR" "./scans" "/mnt/data/scans") # TODO change back for production, avoid actual scan dirs during testing
	# Find a scans directory (first match wins)
	[[ -z $SCANS_DIR ]] && for d in "${scans_hierarchy[@]}"; do
		if [[ -d $d ]]; then
			declare -g SCANS_DIR="$d"
			break
		fi
	done
	if ! [[ -d $SCANS_DIR ]]; then
		# This is not something we do often, so ask
		if ask "Create the scans directory: $SCANS_DIR?"; then
			mkdir -p "$SCANS_DIR"
		else
			echo "No scans directory available, exiting"
			exit 1
		fi
	fi
	echo "Using scans directory: $SCANS_DIR"
	echo "Set the SCANS_DIR environment variable to override"
	echo "Example: SCANS_DIR=/path/to/scans ./qhtcp-workflow"

	local out_hierarchy=("$(dirname "$SCANS_DIR")/out" "$SCRIPT_DIR/out" "/mnt/data/out")
	for d in "${out_hierarchy[@]}"; do
		if [[ -d $d ]]; then
			debug "Using output directory: $d"
			declare -g OUT_DIR="$d"
			break
		fi
	done

	if [[ -z $OUT_DIR ]]; then
		echo "No output directory found"
		declare -g OUT_DIR="$SCRIPT_DIR/out"
		if ask "Create $SCRIPT_DIR/out?"; then
			debug "mkdir $SCRIPT_DIR/out"
			mkdir "$SCRIPT_DIR/out"
		else
			err "No output directory, but attempting to continue..."
		fi
	fi

	# Set the automatic project directory prefix
	PROJECT_USER="$(whoami)"
	PROJECT_PREFIX="${DATE}_${PROJECT_USER}" # reversed these so it's easier to sort and parse the date
	sanitize_pn() { [[ $1 =~ ^[0-9]{8}_.+_.+ ]]; } # project names must start with yyyymmdd_user_description
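
	# Illustrative checks for sanitize_pn (a sketch; project names follow the
	# ${DATE}_${PROJECT_USER}_description prefix convention above):
	#   sanitize_pn "20240723_bcr_drugs_medias" # returns 0 (valid)
	#   sanitize_pn "drugs_medias_20240723"     # returns 1 (date must come first)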

	declare -a PROJECTS=() # this array will hold all of the projects for this run

	parse_input "$@" # parse arguments with getopt

	# Prompt user for the PROJECT if we still don't have one
	if [[ ${#PROJECTS[@]} -eq 0 ]]; then
		ask_pn && PROJECTS+=("$PROJECT")
	fi

	for i in "${!PROJECTS[@]}"; do
		if ! sanitize_pn "${PROJECTS[i]}"; then
			echo "Project name ${PROJECTS[i]} is invalid"
			echo "Enter a replacement"
			ask_pn && PROJECTS[i]="$PROJECT"
		fi
	done

	# If we don't catch with getopt or env, run all
	if [[ ${#INCLUDE_MODULES[@]} -eq 0 ]]; then
		MODULES=("${ALL_MODULES[@]}")
	else
		MODULES=("${INCLUDE_MODULES[@]}")
	fi

	# Exclude modules from --exclude
	arr=()
	for m in "${MODULES[@]}"; do
		[[ " ${EXCLUDE_MODULES[*]} " =~ [[:space:]]${m}[[:space:]] ]] || arr+=("$m")
	done
	MODULES=("${arr[@]}")
	unset arr

	# Sanitize MODULES
	for i in "${!MODULES[@]}"; do
		if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
			echo "Module ${MODULES[i]} not in the module list" # was "$m", a stale variable from the loop above
			echo "Available modules: ${ALL_MODULES[*]}"
			read -r -p "Enter replacement module name: " MODULE
			# The original test was inverted and used a subshell, so a valid
			# replacement triggered the error and the 'return' never reached main
			[[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULE}[[:space:]] ]] || { echo "RTFM"; return 1; }
			MODULES[i]="$MODULE"
		fi
	done

	# Loop over projects
	for PROJECT in "${PROJECTS[@]}"; do
		declare -g PROJECT_SCANS_DIR="$SCANS_DIR/$PROJECT"
		declare -g PROJECT_DATE="${PROJECT:0:8}" # e.g. 20240723
		declare -g PROJECT_SUFFIX="${PROJECT#????????_*_}"
		declare -g PROJECT_USER="${PROJECT#????????_}"; PROJECT_USER="${PROJECT_USER%%_*}"
		declare -g STUDIES_ARCHIVE_FILE="$OUT_DIR/StudiesDataArchive.txt"
		declare -g QHTCP_PROJECT_DIR="$OUT_DIR/$PROJECT"
		declare -g QHTCP_TEMPLATE_DIR="$TEMPLATES_DIR/qhtcp"
		declare -g STUDY_TEMPLATE_DIR="$TEMPLATES_DIR/exp"
		declare -g STUDY_INFO_FILE="$QHTCP_PROJECT_DIR/StudyInfo.csv"

		if ((DEBUG)); then
			declare -p SCANS_DIR OUT_DIR TEMPLATES_DIR APPS_DIR \
				PROJECTS PROJECT \
				PROJECT_SCANS_DIR PROJECT_DATE PROJECT_SUFFIX \
				PROJECT_USER STUDIES_ARCHIVE_FILE QHTCP_PROJECT_DIR QHTCP_TEMPLATE_DIR \
				STUDY_TEMPLATE_DIR STUDY_INFO_FILE
		fi

		# Run selected modules
		for m in "${MODULES[@]}"; do
			ask "Run $m module?" && "$m"
		done
	done
}

main "$@"

exit $?