Some more refactoring and cleanup

This commit is contained in:
2024-08-02 18:48:09 -04:00
parent 5ca6ef8f01
commit b5aaf9ffb4
5 changed files with 45 additions and 2770 deletions

View File

@@ -74,7 +74,7 @@ print_help() {
script-run-workflow [[OPTION] [VALUE]]...
Some options (--project, --include, --exclude) can be passed multiple times or
by using comma deliminated strings (see EXAMPLES below)
by using comma-separated strings (see EXAMPLES below)
OPTIONS:
--project, -p PROJECT
@@ -117,6 +117,7 @@ print_help() {
script-run-workflow --module=${ALL_MODULES[0]},${ALL_MODULES[1]}
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT,${PROJECT_PREFIX}_MY_OTHER_PROJECT --module=${ALL_MODULES[1]},${ALL_MODULES[2]} --yes --debug
script-run-workflow --project=${PROJECT_PREFIX}_MY_PROJECT --submodule ${ALL_SUBMODULES[2]} \"/path/to/genefile.txt,/path/to/output/dir\" --submodule ${ALL_SUBMODULES[3]} \"/path/to/sgofile\"
EOF
}
@@ -326,7 +327,7 @@ print_header() {
# Let user choose project(s)
if [[ -z ${PROJECTS[*]} ]]; then
num=$((${#projects[@]}))
echo "Enter comma delimited project #'s (from list) to analyze"
echo "Enter a comma-separated list of project numbers to analyze"
read -r -p "Or hit Enter to add a new project" response
[[ -z $response ]] && ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}")
((YES)) || read -r -p "Hit enter for default ($num): " response
@@ -346,7 +347,7 @@ print_header() {
done
if [[ -z ${MODULES[*]} && -z ${EXCLUDE_MODULES[*]} ]]; then
echo "Enter module #'s to run (by #, comma delimited)"
echo "Enter a comma-separated list of modules to run"
((YES)) || read -r -p "Hit Enter for all (default) or '0' for none: " response
if [[ -n $response && $response -ne 0 ]]; then
IFS=',' read -ra arr <<< "$response"
@@ -359,7 +360,8 @@ print_header() {
if [[ -z ${MODULES[*]} && -z ${EXCLUDE_MODULES[*]} && -z ${SUBMODULES[*]} ]]; then
while :; do
echo "Enter a submodule followed by its arguments as a case delimited string in quotes"
echo "Enter a submodule followed by its arguments as a comma-separated string"
echo "Quote your string if there are any whitespaces"
echo "Example: ${ALL_SUBMODULES[0]} \"arg1,arg2,arg3...\""
((YES)) || read -r -p "Or hit Enter to continue: " response
[[ -z $response ]] && break
@@ -520,6 +522,7 @@ init_project() {
if ask "You can edit this file in the qhtcp module"; then
cat <<-EOF > "$STUDY_INFO_FILE"
"ExpNumb","ExpLabel","BackgroundSD","ZscoreJoinSD","AnalysisBy"
EOF
fi
fi
@@ -943,15 +946,10 @@ qhtcp() {
module remc
# @section GTF
# @description GTF module for QHTCP
# TODO which components of remc can be parallelized?
# The submodules in remc really like to be run from the REMc dir
# so we pop in and out for now
# NOTE the remc modules could use some love
# * Don't cd within scripts, it's confusing
# * Use arguments to pass configuration variables
# * This allows us to abstract the program away in script-run-workflow and treat it like a module
# @section remc
# @description remc module for QHTCP
# TODO
# * Which components can be parallelized?
# @arg $1 string studyInfo file
remc() {
debug "Running: ${FUNCNAME[0]} $*"
@@ -992,14 +990,14 @@ module gtf
gtf() {
debug "Running: ${FUNCNAME[0]}"
gtf_out_dir="${1:-$QHTCP_PROJECT_DIR/out/gtf}"
gene_association_sgd="${2:-"$APPS_DIR/r/gene_association.sgd"}"
gene_ontology_obo="${3:-"$APPS_DIR/r/gene_ontology_edit.obo"}"
orf_list="${4:-"$APPS_DIR/r/ORF_List_Without_DAmPs.txt"}"
process_dir="$gtf_out_dir/process"
function_dir="$gtf_out_dir/function"
component_dir="$gtf_out_dir/component"
gene_association_sgd="${2:-"$APPS_DIR/perl/gene_association.sgd"}"
gene_ontology_obo="${3:-"$APPS_DIR/perl/gene_ontology_edit.obo"}"
orf_list="${4:-"$APPS_DIR/perl/ORF_List_Without_DAmPs.txt"}"
py_gtf_dcon \
"$process_dir" \
"$gtf_out_dir"
@@ -1040,26 +1038,26 @@ module gta
# TODO
# *
# *
# @set GTA_OUT_DIR string The GTA output results dir
# @set all_sgd_terms_csv string The all_SGD_GOTerms_for_QHTCPtk.csv file
# @set sgd_terms_tfile string The go_terms.tab file
# @set sgd_features_file string The gene_association.sgd file
# @set gene_ontology_file string The gene_ontology_edit.obo file
# @set zscores_file string The ZScores_interaction.csv file
# @arg $1 string output directory
# @arg $2 string gene_association.sgd
# @arg $3 string gene_ontology_edit.obo
# @arg $4 string go_terms.tab
# @arg $5 string All_SGD_GOTerms_for_QHTCPtk.csv
# @arg $6 string zscores_interaction.csv
gta() {
debug "Running: ${FUNCNAME[0]}"
GTA_OUT_DIR="$QHTCP_PROJECT_DIR/gta"
all_sgd_terms_csv="$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"
sgd_terms_tfile="$APPS_DIR/r/go_terms.tab"
sgd_features_file="$APPS_DIR/r/gene_association.sgd"
gene_ontology_file="$APPS_DIR/r/gene_ontology_edit.obo"
zscores_file="zscores/zscores_interaction.csv"
gta_out_dir="${1:-"$QHTCP_PROJECT_DIR/gta"}"
gene_association_sgd="${2:-"$APPS_DIR/r/gene_association.sgd"}"
gene_ontology_obo="${3:-"$APPS_DIR/r/gene_ontology_edit.obo"}"
sgd_terms_tfile="${4:-"$APPS_DIR/r/go_terms.tab"}"
all_sgd_terms_csv="${5:-"$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"}"
zscores_file="${6:-"$gta_out_dir/zscores/zscores_interaction.csv"}" # TODO This could be wrong, it could be in main results
# Sets STUDIES_NUM and NUM_STUDIES
get_studies "$STUDY_INFO_FILE"
[[ -d $GTA_OUT_DIR ]] || mkdir "$GTA_OUT_DIR"
[[ -d $gta_out_dir ]] || mkdir "$gta_out_dir"
# Loop over the array and create pairwise arrays
for ((i=0; i<${#STUDIES_NUMS[@]}; i++)); do
@@ -1088,13 +1086,13 @@ gta() {
for s in "${STUDIES_NUMS[@]}"; do
zscores_file="$QHTCP_PROJECT_DIR/Exp$s/$zscores_file"
if [[ -f $zscores_file ]]; then
mkdir "$GTA_OUT_DIR/Exp$s"
mkdir "$gta_out_dir/Exp$s"
r_gta \
"Exp$s" \
"$zscores_file" \
"$sgd_terms_tfile" \
"$sgd_features_file" \
"$GTA_OUT_DIR"
"$gene_association_sgd" \
"$gta_out_dir"
fi
done
@@ -1102,7 +1100,7 @@ gta() {
for combo in "${study_combos[@]}"; do
# Split on comma and assign to array
IFS=',' read -ra studies <<< "$combo"
r_gta_pairwiselk "${studies[0]}" "${studies[1]}" "$STUDY_INFO_FILE" "$GTA_OUT_DIR"
r_gta_pairwiselk "${studies[0]}" "${studies[1]}" "$STUDY_INFO_FILE" "$gta_out_dir"
done
# All studies
@@ -1110,7 +1108,7 @@ gta() {
# are required
r_gta_heatmaps \
"$STUDY_INFO_FILE" \
"$gene_ontology_file" \
"$gene_ontology_obo" \
"$sgd_terms_tfile" \
"$all_sgd_terms_csv" \
"$zscores_file" \
@@ -1121,14 +1119,12 @@ gta() {
# @section Submodules
# @description Submodules provide functionality to modules and should be reusable
# A submodule only runs by default if called by a module
# Use a submodule for:
# * Calling external scripts
# * Performing repetitive tasks
# * Generalizing code
# * Functions you do not want to perform by default (submodules should be called modules)
# * Should not call cd or pushd (let module dictate)
# @description Submodules are shell wrappers for workflow components in external languages.
# Submodules:
# * Allow scripts to be called by the main workflow script using input
#   and output arguments as a translation mechanism.
# * Only run by default if called by a module.
# * Can be called directly with their arguments as a comma-separated string.
submodule r_gta
@@ -1320,6 +1316,10 @@ r_interactions() {
submodule r_join_interactions
# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv
# Output files:
# * REMcRdy_lm_only.csv
# * Shift_only.csv
# * parameters.csv
# @arg $1 string The output directory
# @arg $2 string The sd value
# @arg $3 string The studyInfo file
@@ -1552,19 +1552,15 @@ get_studies() {
# submodule choose_easy_results_dir #
# # @description Chooses an EASY scans directory if the information is undefined
# # TODO: Standardize EASY output, it's hard to understand
# # TODO Standardize EASY output, it's hard to understand
# # TODO eventually we could run this on multiple results dirs simultaneously with some refactoring
# # @exitcode 0 if an EASY results dir was successfully chosen
# # @set EASY_RESULTS_DIR string The working EASY output directory
# choose_easy_results_dir() {
# debug "Running: ${FUNCNAME[0]}"
# # Always backup existing output
# # This would happen if you ran the same experiment twice in one day, for instance
# [[ -d $EASY_RESULTS_DIR ]] && backup "$EASY_RESULTS_DIR"
# if [[ ! -d $EASY_RESULTS_DIR ]]; then
# debug "mkdir $EASY_RESULTS_DIR"
# mkdir "$EASY_RESULTS_DIR"
@@ -1572,7 +1568,6 @@ get_studies() {
# err "Could not create $EASY_RESULTS_DIR"
# return 0
# fi
# # echo "Hit enter to use the default EASY results directory: $default_easy_results_dir"
# # if ! (( YES )); then
# # read -r -p "Or enter a custom directory name, example: $PROJECT" dirname