Add gta module

2024-07-24 00:06:56 -04:00
parent b049d58e79
commit 7b8ce3e7fd
3 changed files with 259 additions and 174 deletions
--- a/workflow/script-run-workflow
+++ b/workflow/script-run-workflow
@@ -111,45 +111,45 @@ parse_input() {
    while true; do
      case $1 in
        --project|-p)
-          shift
+            shift
-          if [[ $1 == *','* ]] ; then # check for commas
+            if [[ $1 == *','* ]] ; then # check for commas
-            IFS=',' read -ra PROJECTS <<< "$1"
+              IFS=',' read -ra PROJECTS <<< "$1"
-          else
+            else
-            PROJECTS+=("$1")
+              PROJECTS+=("$1")
-          fi
+            fi
-          ;;
+            ;;
        --include|-i)
-          shift
+            shift
-          if [[ $1 == *','* ]] ; then # check for commas
+            if [[ $1 == *','* ]] ; then # check for commas
-            IFS=',' read -ra INCLUDE_MODULES <<< "$1"
+              IFS=',' read -ra INCLUDE_MODULES <<< "$1"
-          else
+            else
-            INCLUDE_MODULES+=("$1")
+              INCLUDE_MODULES+=("$1")
-          fi
+            fi
-          ;;
+            ;;
        --exclude|-x)
-          shift
+            shift
-          if [[ $1 == *','* ]] ; then # check for commas
+            if [[ $1 == *','* ]] ; then # check for commas
-            IFS=',' read -ra EXCLUDE_MODULES <<< "$1"
+              IFS=',' read -ra EXCLUDE_MODULES <<< "$1"
-          else
+            else
-            EXCLUDE_MODULES+=("$1")
+              EXCLUDE_MODULES+=("$1")
-          fi
+            fi
-          ;;
+            ;;
        --markdown|-m)
-          documentation
+            documentation
-          ;;
+            ;;
        --yes|-y|--auto)
-          declare -g YES=1
+            declare -g YES=1
-          ;;
+            ;;
        --debug|-d)
-          declare -g DEBUG=1
+            declare -g DEBUG=1
-          ;;
+            ;;
        --help|-h)
-          print_help; exit 0
+            print_help; exit 0
-          ;;
+            ;;
        --)
-          shift
+            shift
-          break
+            break
-          ;;
+            ;;
      esac
      shift
    done
@@ -320,15 +320,15 @@ init_project() {
  # We handle this in main() and pushd to it
  # But do it one more time in case this is run as a module
-  [[ -d $SCAN_DIR ]] || 
+  [[ -d $SCANS_DIR ]] || 
-    ( mkdir -p "$SCAN_DIR" && pushd "$SCAN_DIR" || return 1 )
+    ( mkdir -p "$SCANS_DIR" && pushd "$SCANS_DIR" || return 1 )
-  ask "Initialize a project at $SCAN_DIR?" || return 1
+  ask "Initialize a project at $SCANS_DIR?" || return 1
-  [[ -d $SCAN_DIR/MasterPlateFiles ]] || mkdir -p "$SCAN_DIR/MasterPlateFiles"
+  [[ -d $SCANS_DIR/MasterPlateFiles ]] || mkdir -p "$SCANS_DIR/MasterPlateFiles"
-  DRUG_MEDIA_FILE="$SCAN_DIR/MasterPlateFiles/DrugMedia_$PROJECT.xls"
+  DRUG_MEDIA_FILE="$SCANS_DIR/MasterPlateFiles/DrugMedia_$PROJECT.xls"
-  MASTER_PLATE_FILE="$SCAN_DIR/MasterPlateFiles/MasterPlate_$PROJECT.xls"
+  MASTER_PLATE_FILE="$SCANS_DIR/MasterPlateFiles/MasterPlate_$PROJECT.xls"
  for f in $DRUG_MEDIA_FILE $MASTER_PLATE_FILE; do
    touch "$f"
@@ -482,7 +482,7 @@ easy() {
  popd || return 1
  # Use the function return code see if we succeeded
-  get_easy_results "$SCAN_DIR" || return 1
+  get_easy_results "$SCANS_DIR" || return 1
 }
@@ -499,12 +499,20 @@ ezview() {
 module qhtcp
 # @section QHTCP
 # @description System for Multi-QHTCP-Experiment Gene Interaction Profiling Analysis
-# * Functional rewrite of REMcMaster3.sh
+# * Functional rewrite of REMcMaster3.sh, RemcMaster2.sh, REMcJar2.sh, ExpFrontend.m, mProcess.sh, mFunction.sh, mComponent.sh
 # * Added a newline character to the end of StudyInfo.csv so it is a valid text file
 # TODO Suggest renaming StudiesQHTCP to something like qhtcp qhtcp_output or output
 # TODO Store StudyInfo somewhere better
 # TODO Move (hide) the study template somewhere else
-# TODO StudiesArchive should be smarter
+# TODO StudiesArchive should be smarter:
 #   * Create a database with as much information as possible
 #   * Write a function that easily loads and parses the database into easy-to-use variables
 #   * Allow users to reference those variables to write their own modules
 # TODO Should not be using initials
 #    * not unique enough and we don't have that data easily on hand
 #    * usernames are unique and make more sense
 #    * I don't know what all would have to be modified atm
 #
 # Rerunning this module uses rsync --update to only copy files that are newer in the template
 # If you wish for the template to overwrite your changes, delete the file from your QHTCP project dir
 #
@@ -848,45 +856,70 @@ module qhtcp
 qhtcp() {
  debug "Running: ${FUNCNAME[0]}"
-  OUTPUT_DIR="/mnt/data/StudiesQHTCP"
+  QHTCP_BASE_DIR="/mnt/data/StudiesQHTCP"
-  STUDIES_ARCHIVE="$OUTPUT_DIR/StudiesDataArchive.txt"
+  QHTCP_PROJECT_DIR="$QHTCP_BASE_DIR/$PROJECT"
-  QHTCP_DIR="$OUTPUT_DIR/$PROJECT"
+  CODE_DIR="$QHTCP_PROJECT_DIR/Code"
-  STUDY_INFO="$QHTCP_DIR/Code/StudyInfo.csv"
+  STUDIES_ARCHIVE_FILE="$QHTCP_BASE_DIR/StudiesDataArchive.txt"
  STUDY_INFO_FILE="$CODE_DIR/StudyInfo.csv"
-  if [[ -d $QHTCP_DIR ]]; then
+  if [[ -d $QHTCP_PROJECT_DIR ]]; then
-    echo "A project already exists at $QHTCP_DIR"
+    echo "A project already exists at $QHTCP_PROJECT_DIR"
-    ask "Safely update $QHTCP_DIR from the $QHTCP_TEMPLATE_DIR template?"
+    ask "Safely update $QHTCP_PROJECT_DIR from the $QHTCP_TEMPLATE_DIR template?"
-    if ! ((YES)) && ask "Back up $QHTCP_DIR to $QHTCP_DIR.bk first and start fresh?"; then
+    if ! ((YES)) && ask "Back up $QHTCP_PROJECT_DIR to $QHTCP_PROJECT_DIR.bk first and start fresh?"; then
-      mv "$QHTCP_DIR" "$QHTCP_DIR.bk" || (echo "Backup unsuccessful, exiting"; exit 1)
+      # Use a brace group rather than a subshell: `exit` inside ( ) only ends the subshell,
+      # so the script would carry on after a failed backup despite printing "exiting"
+      mv "$QHTCP_PROJECT_DIR" "$QHTCP_PROJECT_DIR.bk" || { echo "Backup unsuccessful, exiting"; exit 1; }
    fi
  fi
  # Copy template to QHTCP project directory
-  if rsync --archive --update "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_DIR"; then
+  if rsync --archive --update "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_PROJECT_DIR"; then
-    echo "New project created at $QHTCP_DIR"
+    echo "New project created at $QHTCP_PROJECT_DIR"
  fi
  # Sets STUDIES_NUMS and NUM_STUDIES (yes this makes sense)
  get_studies "$STUDY_INFO_FILE"
  # Construct the next auto-entry
  # 1,ExpName1,NA,NA,UserInitials
  next_study_num=$(( NUM_STUDIES + 1 ))
  # If the next Exp dir already exists don't use it
  while [[ -d $QHTCP_PROJECT_DIR/Exp$next_study_num ]]; do
    (( next_study_num=next_study_num+1 ))
  done
  # Now this is trickier than it first appears
  # Use initials from project not whoami
  # Best I can do is first two letters of username
  # See TODO in markdown
  initials="${PROJECT_USER:0:2}"
  INITIALS=${initials^^}
  next_study_entry="$next_study_num,$PROJECT_SUFFIX,NA,NA,$INITIALS"
  debug "$next_study_entry"
  # Print current studies
-  [[ -f $STUDY_INFO ]] && 
+  [[ -f $STUDY_INFO_FILE ]] && 
-  echo "Current studies from $STUDY_INFO: " &&
+  echo "Current studies from $STUDY_INFO_FILE: " &&
-  cat "$STUDY_INFO"
+  cat "$STUDY_INFO_FILE"
-  # Ask user to edit STUDY_INFO
+  # Ask user to edit STUDY_INFO_FILE
-  if ! ((YES)) && ask "Would you like to edit $STUDY_INFO to add or modify studies?"; then
+  if ! ((YES)) && ask "Would you like to edit $STUDY_INFO_FILE to add or modify studies?"; then
    cat <<-EOF 
 			Give each experiment the labels you wish to be used for the plots and specific files. 
 			Enter the desired Experiment names and order them in the way you want them to appear in the REMc heatmaps
 			Auto-entry suggestion: $next_study_entry
 		EOF
-    read -r -p "Press enter to continue"
+    if ask "Would you like to add (y) the auto-entry suggestion to $STUDY_INFO_FILE or edit STUDY_INFO_FILE in nano (n)?"; then
-    nano "$STUDY_INFO"
+      echo "$next_study_entry" >> "$STUDY_INFO_FILE"
    else
      debug "nano $STUDY_INFO_FILE"
      nano "$STUDY_INFO_FILE"
    fi
  fi
  # Re-read the (possibly edited) study info file; sets STUDIES_NUMS and NUM_STUDIES
  # STUDY_INFO was renamed to STUDY_INFO_FILE, so the old name would expand to an empty path
  get_studies "$STUDY_INFO_FILE"
  # Initialize missing dirs
-  for s in "${STUDIES_NUM[@]}"; do
+  for s in "${STUDIES_NUMS[@]}"; do
-    STUDY_DIR="$QHTCP_DIR/Exp$s"
+    STUDY_DIR="$QHTCP_PROJECT_DIR/Exp$s"
    if ! [[ -d $STUDY_DIR ]]; then
      if ! rsync --archive "$STUDY_TEMPLATE_DIR" "$STUDY_DIR"; then
        err "Could not copy $STUDY_TEMPLATE_DIR template to $STUDY_DIR"
@@ -897,7 +930,7 @@ qhtcp() {
  unset STUDY_DIR
  # Replacing ExpFrontend.m
-  get_easy_results "$SCAN_DIR" || (err "No EASY results found in $SCAN_DIR"; return 1)
+  # Brace group instead of a subshell: `return` inside ( ) only leaves the subshell,
+  # so this function would keep running without any EASY results
+  get_easy_results "$SCANS_DIR" || { err "No EASY results found in $SCANS_DIR"; return 1; }
  # Set the right results directory
  # TODO eventually we could run this on multiple results dirs simultaneously with some refactoring
@@ -905,7 +938,7 @@ qhtcp() {
    # In automatic mode just choose the first OUT DIR in the list
    EASY_RESULT_DIR="${EASY_RESULTS_DIRS[0]}" # TODO right now just choose
  else
-    echo "Multiple EASY results dirs found in $SCAN_DIR"
+    echo "Multiple EASY results dirs found in $SCANS_DIR"
    echo "Here is a list: "
    for (( i=0; i<${#EASY_RESULTS_DIRS[@]}; i++ )); do
      printf "%d. %s\n" "$((i+1))" "${EASY_RESULTS_DIRS[i]}"
@@ -919,37 +952,38 @@ qhtcp() {
  # TODO probably broken
  EASY_RESULTS_FILES=("$EASY_RESULTS_DIR/"*"/PrintResults/!!"*) 
  # Create studies archive file if missing
-  if ! [[ -d $STUDIES_ARCHIVE ]]; then
+  # Test for a regular file (-f), not a directory (-d): with -d the check is never true for the
+  # archive file, so the header write below would truncate the existing archive on every run
+  if ! [[ -f $STUDIES_ARCHIVE_FILE ]]; then
    header=(StudyDate tStudyName StudyPath ExpNum ExpDate ExpPath ResultFile)
-    printf "%s\t" "${header[@]}" > "$STUDIES_ARCHIVE"
+    printf "%s\t" "${header[@]}" > "$STUDIES_ARCHIVE_FILE"
  fi
  # TODO Add them all to StudiesDataArchive?
  # Probably better to always add and remove dupes later since each invocation "counts"?
  for f in "${EASY_RESULTS_FILES[@]}"; do
-    for s in "${STUDIES_NUM[@]}"; do
+    for s in "${STUDIES_NUMS[@]}"; do
-      # Trying to match old ExpFrontend
+      # Trying to match old ExpFrontend formatting
      printf "%s\t" \
-        "${DATE//_/}" "$PROJECT" "$QHTCP_DIR" "Exp$s" \
+        "${DATE//_/}" "$PROJECT" "$QHTCP_PROJECT_DIR" "Exp$s" \
-        "$PROJECT_DATE" "$SCAN_DIR" "$EASY_RESULT_DIR" "${f##*/}" \
+        "$PROJECT_DATE" "$SCANS_DIR" "$EASY_RESULT_DIR" "${f##*/}" \
-        >> "$STUDIES_ARCHIVE"
+        >> "$STUDIES_ARCHIVE_FILE"
    done
  done
  # Run R interactions script on all studies
-  for s in "${STUDIES_NUM[@]}"; do
+  for s in "${STUDIES_NUMS[@]}"; do
-    STUDY_DIR="$QHTCP_DIR/Exp$s"
+    STUDY_DIR="$QHTCP_PROJECT_DIR/Exp$s"
    pushd "$STUDY_DIR" || return 1
    r_interactions \
      "$STUDY_DIR"\
-      "$STUDY_INFO"\
+      "$STUDY_INFO_FILE"\
      "/ZScores/" \
      "../Code/SGD_features.tab" \
      5
    popd || return 1
  done
-  # Global modules
+  # Run remc as part of the QHTCP process
  remc
  remc
 }
@@ -967,7 +1001,7 @@ remc() {
  debug "Running: ${FUNCNAME[0]}"
  # Enter REMc directory to run the scripts there
-  pushd "$QHTCP_DIR/REMc" || return 1
+  pushd "$QHTCP_PROJECT_DIR/REMc" || return 1
  # Run modules
  # If any modules fail the rest will not run, this is fundamental to module design
@@ -1009,15 +1043,29 @@ gtf() {
 module gta
 # @section GTA
 # @description GTA module for QHTCP
 # TODO
 #   * Heavily modified GTAtemplate.R
 #   * 
 gta() {
  debug "Running: ${FUNCNAME[0]}"
  # GO reference files and output location
  # NOTE: CODE_DIR and QHTCP_PROJECT_DIR are assigned in qhtcp(), so that module must have run first
  sgd_terms_tfile="$CODE_DIR/go_terms.tab"
  sgd_features_file="$CODE_DIR/gene_association.sgd"
  gta_out_dir="$QHTCP_PROJECT_DIR/GTAresults"
  # get_studies reads the file passed as $1, so the study info file has to be given explicitly
  # Falls back to the same path qhtcp() uses when STUDY_INFO_FILE is not set
  local study_info_file="${STUDY_INFO_FILE:-$CODE_DIR/StudyInfo.csv}"
  # Sets STUDIES_NUMS and NUM_STUDIES
  get_studies "$study_info_file" || { err "No studies found in $study_info_file"; return 1; }
  # -p: do not fail when the directory is left over from a previous run
  mkdir -p "$gta_out_dir"
-
+  # Loop over studies
-
+  for s in "${STUDIES_NUMS[@]}"; do
-
+    zscores_file="$QHTCP_PROJECT_DIR/Exp$s/ZScores/ZScores_Interaction.csv"
    # Only studies that already have interaction Z-scores can be processed
    if [[ -f $zscores_file ]]; then
      mkdir -p "$gta_out_dir/Exp$s"
      r_gta "Exp$s" "$zscores_file" "$sgd_terms_tfile" "$sgd_features_file" "$gta_out_dir"
    fi
  done
 }
@@ -1031,6 +1079,43 @@ gta() {
 #   * Functions you do not want to perform by default (submodules should be called modules)
 #   * Should not call cd or pushd (let module dictate)
 submodule r_gta
 # @description GTAtemplate R script
 # TODO:
 #   * Is GTAtemplate.R actually a template?
 #
 # Files:
 #   * gene_association.sgd: https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
 #   * go_terms.tab
 #
 # Output:
 #   *
 #
 # This submodule:
 #   * 
 #   *
 #
 # @arg $1 string Exp# name
 # @arg $2 string ZScores_Interaction.csv file
 # @arg $3 string go_terms.tab file
 # @arg $4 string gene_association.sgd
 # @arg $5 string output directory
 #
 r_gta() {
  debug "Running: ${FUNCNAME[0]}" "$@"
  # The here-doc below is empty, so this prints nothing
  # (presumably a placeholder for help text, as in r_interactions -- confirm)
  cat <<-EOF
 	EOF
  # CODE_DIR is assigned in qhtcp(); `script` is not declared local, so it is a global assignment
  script="$CODE_DIR/GTAtemplate.R"
  debug "$RSCRIPT $script $*"
  # Forward every positional argument to the R script unchanged
  "$RSCRIPT" "$script" "$@"
 }
 submodule mat_exp_frontend
 # @description Run the ExpFrontend.m program
 # This submodule:
@@ -1071,55 +1156,30 @@ mat_exp_frontend() {
 submodule r_interactions
-# @description Run the R interactions analysis
+# @description Run the R interactions analysis (Z_InteractionTemplate.R)
-# TODO don't want to rename Z_InteractionTemplate.R because that will break logic, just edit in place instead
+# TODO 
-# Is this script interactive
+#   * don't want to rename Z_InteractionTemplate.R because that will break logic, just edit in place instead
 # NOTE
 #   * I modified the input logic of Z_InteractionTemplate script so that it can be treated like a native module
 #   * Note how little code is required to call it in r_interactions()
 # @arg $1 string The current working directory
 r_interactions() {
  debug "Running: ${FUNCNAME[0]}"
  # Print the manual instructions for the interaction analysis to stdout
  cat <<-EOF
-		In each /Exp# folder, rename the Z_InteractionTemplate.R script according to the experiment focus
+		Edit the Z_InteractionTemplate.R script in each Exp dir beginning at the '++BEGIN USER DATA SELECTION++'
 		Example:  Interaction, Experimenter Initials, Experiment Focus --> 'int_RM_2PE.R'
 		5. Open the renamed interaction script, and edit each one beginning at the '++BEGIN USER DATA SELECTION++'
 		This is designed so that the data of interest for each experiment is appropriately selected from the !!Results…txt file
 		The user can edit, step through, and test the R script without running through the whole routine by observing the resultant data table created in RStudio.
 		The Z_InteractionTemplate.R script has a collection of code lines that have been used for prior analyses (generally to select data from various !!Results…txt files), which may be commented out (if not relevant), reused as needed, and/or modified for a new study. These include lines associated with the removal of 'dAmps', specific concentrations, and items described in the 'Specifics' and 'Media', i.e., information specific to a particular experiment design. There are also code lines to replace gene names 'OCT1/YKL134C' /'MAY24/YPR153W' and that get converted to date format in excel, by using only the ORF name and to remove data rows with 'Blank' listed; these lines of code convenient to reuse. Hopefully, these code lines can be used, commented out, or adapted to aid the user in modifying this section to the specific data requirements of the study. As a new user data filter code is developed for each 'Study' (and vetted), those lines can be added to the InteractionTemplate230119.R code in the /StudyTemplate folders to aid in future studies.
 		6. Open a terminal, navigate to each /Exp# folder, and execute the (customized) 'Z_InteractionTemplate_…' script by using the command line below:
-		Rscript RenamedInteractionTemplate.R \!\!Results… .txt
+		Be sure to enter Background noise filter standard deviation i.e., 3 or 5 per Sean
-
+		Enter Standard deviation value for removing data for cultures due to high background (e.g., contaminated cultures). Generally set this very high (e.g., '20') on the first run in order NOT to remove data, e.g. '20'. Review QC data and inspect raw image data to decide if it is desirable to remove data, and then rerun analysis. 
 		**need to change wording to choose SD of Delta_Background to exclude Data from analysis.
 		[1] "Be sure to enter Background noise filter standard deviation i.e., 3 or 5 per Sean"
 		Enter a Standard Deviation value to noise filter >>
 		[1] Enter Standard deviation value for removing data for cultures due to high background (e.g., contaminated cultures). Generally set this very high (e.g., '20') on the first run in order NOT to remove data, e.g. '20'. Review QC data and inspect raw image data to decide if it is desirable to remove data, and then rerun analysis. 
 		Enter a Background SD threshold for EXCLUDING culture data from further analysis:
-
+		This Background value removes data where there is high pixel intensity in the background regions of a spot culture (i.e., suspected contamination). 5 is a minimum recommended value, because lower values result in more data being removed, and often times this is undesirable if contamination occurs late after the carrying capacity of the yeast culture is reached.
-
+		This is most often "trial and error", meaning there is a 'Frequency_Delta_Background.pdf' report in the /Exp_/ZScores/QC/ folder to evaluate whether the chosen value was suitable (and if not the analysis can simply be rerun with a more optimal choice). In general, err on the high side, with BSD of 10 or 12…. One can also use EZview to examine the raw images and individual cultures potentially included/excluded as a consequence of the selected value. Background values are reported in the results sheet and so could also be analyzed there.. 
 		The script will request for the user to input a 'Background Standard Deviation Value'.  This Background value removes data where there is high pixel intensity in the background regions of a spot culture (i.e., suspected contamination). 5 is a minimum recommended value, because lower values result in more data being removed, and often times this is undesirable if contamination occurs late after the carrying capacity of the yeast culture is reached. This is most often “trial and error”, meaning there is a 'Frequency_Delta_Background.pdf' report in the /Exp_/ZScores/QC/ folder to evaluate whether the chosen value was suitable (and if not the analysis can simply be rerun with a more optimal choice). In general, err on the high side, with BSD of 10 or 12…. One can also use EZview to examine the raw images and individual cultures potentially included/excluded as a consequence of the selected value. Background values are reported in the results sheet and so could also be analyzed there.. 
 			(For new terminal users, directory navigation tips are described below)
 		To navigate to the directory one can use the directory GUI (in X2Go, use the GUI to navigate to desired operating directory and then from the 'File' menu, choose “Open in Terminal')
 		Alternatively, navigate there through the terminal window: 'pwd' “prints the current working directory”, 'ls' “lists” the subfolders in the current directory. 'cd'' followed by the name of the 'subdirectory' will move down into it. “cd .. “ changes to the parent directory
 		The tab key can be used to autofill unique characters after typing the initial letters of a folder or file you wish to call. 
 		The template structure above assists the user with organization and management of Q-HTCP files and provides a uniform directory structure to streamline reference across different users and experiments.
 		Since we are systematically comparing perturbations, most Q-HTCP studies will consist of either 2 or 4 experiment subfolders. 
 		The Zscores files are used for subsequent analyses, including REMc, GTA and Term Specific Heatmaps. These further analyses are described below and can be completed in any order and/or concurrently from separate terminals.
 		**Annotate Files produced and comment out code that produces files that are obsolete or clutter.
 	EOF
  # Relative path: the script is looked up in the current directory, so the caller must already
  # be inside the Exp dir (qhtcp() does pushd "$STUDY_DIR" before calling this submodule)
  script="Z_InteractionTemplate.R"
-  debug "$RSCRIPT $script"
+  debug "$RSCRIPT $script" "$@"
-  "$RSCRIPT" "$script" 
+  "$RSCRIPT" "$script" "$@"
  # All positional arguments are forwarded to the R script unchanged
  # Arguments as listed for the R script:
  #   1. Path to input file
  #   2. /output/ directory
  #   3. Path to StudyInfo.csv
  #   4. Standard deviation value
  # NOTE(review): qhtcp() passes five arguments (study dir, StudyInfo.csv, "/ZScores/",
  # SGD_features.tab, 5) -- confirm this list against the R script's argument parsing
 }
@@ -1316,6 +1376,7 @@ r_compile_gtf() {
  "$RSCRIPT" "$script"
 }
 submodule get_studies
 # @description Parse study names from StudyInfo.csv files
 # TODO: This whole submodule should eventually be either
@@ -1326,14 +1387,21 @@ submodule get_studies
 #   ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy
 #   1,ExpName1,NA,NA,UserInitials
 #   2,ExpName2,NA,NA,UserInitials
-#   3,ExpName3,NA,NA,UserInitials 
+#   3,ExpName3,NA,NA,UserInitials
 # @exitcode 0 If one or more studies found
 # @exitcode 1 If no studies found
 # @set STUDIES_NUMS array Contains Exp numbers
 # @set NUM_STUDIES int Number of existing studies
 # @arg $1 string File to read
 get_studies() {
  debug "Running: ${FUNCNAME[0]}"
-  declare -ga STUDIES_NUM=()
+  declare -ga STUDIES_NUMS=()
  # Split each row on commas and keep only the first column (ExpNumb)
  # `|| [[ -n $col1 ]]` keeps a final row that has no trailing newline
  while IFS=',' read -r col1 _ || [[ -n $col1 ]]; do
    # Skip blank rows so they do not turn into an empty study number
    [[ -n $col1 ]] || continue
-    STUDIES_NUM+=("$col1")
+    STUDIES_NUMS+=("$col1")
  done < <(tail -n +2 "$1") # skip header
  # Always refresh the count (0 when nothing was read) so a value from an earlier call cannot linger
  # The subscript must be [@]; the previous {@} form is a bad substitution error at runtime
  NUM_STUDIES="${#STUDIES_NUMS[@]}"
  # Exit code 0 if one or more studies were found, 1 otherwise
  (( NUM_STUDIES > 0 ))
 }
@@ -1342,8 +1410,8 @@ submodule get_easy_results # lol
 # TODO: Standardize EASY output, it's hard to understand
 # @exitcode 0 if at least one results directory exists
 # @exitcode 1 if no results directories exist
-# @set EASY_RESULTS_DIRS array Globbed results files from SCAN_DIR/
+# @set EASY_RESULTS_DIRS array Globbed results files from SCANS_DIR/
-# @set EASY_RESULTS_FILES array Globbed results files from SCAN_DIR/
+# @set EASY_RESULTS_FILES array Globbed results files from SCANS_DIR/
 # @arg $1 string Project scans (ExpJobs)
 get_easy_results() {
  debug "Running: ${FUNCNAME[0]}"
@@ -1374,6 +1442,10 @@ get_easy_results() {
 submodule documentation
 # @section Documentation
 # @description Generates markdown documentation from this script using shdoc
 # TODO
 #   * We can include images in the markdown file but not natively with shdoc
 #   * Need to add a post processor
 #     * Or use a 'veryuniqueword' and some fancy sed
 # @noargs
 documentation() {
  debug "Running: ${FUNCNAME[0]}"
@@ -1399,12 +1471,13 @@ main() {
  QHTCP_TEMPLATE_DIR="$SCRIPT_DIR/templates/qhtcp"
  STUDY_TEMPLATE_DIR="$QHTCP_TEMPLATE_DIR/ExpTemplate"
  EASY_TEMPLATE_DIR="$SCRIPT_DIR/templates/easy"
  IMAGES="${IMAGES:-"/mnt/data/ExpJobs"}" 
  DATE="$(date +%y_%m%d)"
  # Set the automatic project directory prefix
  PROJECT_PREFIX="${DATE}_$(whoami)_" # reversed these so easier to sort and parse date
-  san() { [[ $1 =~ .+_[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+ ]]; } # sanitizer regex for prefix
+  san() { [[ $1 =~ [0-9][0-9]_[0-9][0-9]_[0-9][0-9]_.+_.+ ]]; } # sanitizer regex for prefix
  declare -ag PROJECTS=() # this array will hold all of the projects for this run
@@ -1430,8 +1503,6 @@ main() {
    fi
  done
  SCANS_DIR="${SCANS_DIR:-"/mnt/data/ExpJobs"}" 
  # If we don't catch with getopt or env, run all
  if [[ ${#INCLUDE_MODULES[@]} -eq 0 ]]; then
    MODULES=("${ALL_MODULES[@]}")
@@ -1460,8 +1531,11 @@ main() {
  # Loop over projects
  for PROJECT in "${PROJECTS[@]}"; do 
-    SCAN_DIR="$SCANS_DIR/$PROJECT"
+    SCANS_DIR="$IMAGES/$PROJECT"
-    PROJECT_DATE=${PROJECT%"${PROJECT#??_????}"} # e.g. 24_0723
+    PROJECT_DATE="${PROJECT%"${PROJECT#??_????}"}" # e.g. 24_0723
    PROJECT_SUFFIX="${PROJECT#??_????_*_}"
    PROJECT_USER="${PROJECT#??_????_}"
    PROJECT_USER="${PROJECT_USER%%_*}"
    # Run selected modules
    for m in "${MODULES[@]}"; do
      ask "Run $m" && "$m"
--- a/workflow/templates/qhtcp/Code/GTAtemplate.R
+++ b/workflow/templates/qhtcp/Code/GTAtemplate.R
@@ -1,59 +1,70 @@
-#GTA (GoTermAveraging) Starting (Working Directory is /Code) All paths relative to /Code
+#!/usr/bin/env R 
-#Your output  may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
+# GTA (GoTermAveraging)
-#Loops thru the number of experiments involved in study. JWR
+# Your output  may not be reproducible as org.Sc.sgd.db is uploaded from Bioconductor R library and changes
-Wstudy= getwd()
+#
-if (file.exists('../Exp1/ZScores/ZScores_Interaction.csv')){
+# Updated 240724 Bryan C Roessler to improve file operations and portability
-  inputFile <- '../Exp1/ZScores/ZScores_Interaction.csv'
+# NOTE: The script now has 2 additional OPTIONAL arguments: 
-  expName= "Exp1"
+#   1. Path to SGD terms file (go_terms.tab)
-  dir.create("../GTAresults/Exp1")
+#   2. Path to SGD features file (gene_association.sgd)
 }
 if (file.exists('../Exp2/ZScores/ZScores_Interaction.csv')){
  inputFile[2] <- '../Exp2/ZScores/ZScores_Interaction.csv'
  expName[2]= "Exp2"
  dir.create("../GTAresults/Exp2")
 }
 if (file.exists('../Exp3/ZScores/ZScores_Interaction.csv')){
  inputFile[3] <- '../Exp3/ZScores/ZScores_Interaction.csv'
  expName[3]= "Exp3"
  dir.create("../GTAresults/Exp3")
 }
 if (file.exists('../Exp4/ZScores/ZScores_Interaction.csv')){
  inputFile[4] <- '../Exp4/ZScores/ZScores_Interaction.csv'
  expName[4]= "Exp4"
  dir.create("../GTAresults/Exp4")
 }
 outputPathGTA= "../GTAresults"
        #dir.create(outPathGTA)
 library("stringr")
 library("org.Sc.sgd.db")
 library("plyr")
 #build in command args to apply this code to a given !!results sheet
-SGD_Terms_file <- "../Code/go_terms.tab"  #ArgsScore[2]
+# Parse arguments
-        #https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
+args <- commandArgs(TRUE)
 SGD_features_file <- "../Code/gene_association.sgd"  #ArgsScore[3]
-#R and Rstudio have issues: The for loop(189) seemed to fail to evaluate the paste (or paste0) to build the inputFile inside the for loop and bail as the second loop began. This crude fix below, seems to have alleviated the failure to loop problem at least for now. Also for some annoying reason, the underscores between word are sometimes not shown when they exist. No ryme of reason!!!
+exp_name <- args[1]
 # Positional arguments; every one after the first is optional and falls back to a default:
 #   1. experiment name  2. ZScores_Interaction.csv  3. go_terms.tab
 #   4. gene_association.sgd  5. output directory
 # args[n] exists exactly when length(args) >= n (the previous "> n" tests were off by one,
 # so the last supplied argument was always ignored)
 if (length(args) >= 2) {
  zscores_file <- args[2]
 } else {
  zscores_file <- "ZScores/ZScores_Interaction.csv"
 }
 if (length(args) >= 3) {
  sgd_terms_file <- args[3]
 } else {
  sgd_terms_file <- "../Code/go_terms.tab"
 }
 if (length(args) >= 4) {
  sgd_features_file <- args[4]
 } else {
  sgd_features_file <- "../Code/gene_association.sgd" # https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd
 }
 if (length(args) >= 5) {
  output_dir <- args[5]
 } else {
  output_dir <- "../GTAresults"
 }
 # # Set SGDgeneList file path
 # if (length(args) > 4) {
 #   SGDgeneList <- args[4]
 # } else {
 #   SGDgeneList <- "../Code/SGD_features.tab"
 #Begin for loop for experiments in this study-----------------ZScores_Interaction.csv
-for(m in 1:length(inputFile)){
+for(m in 1:length(zscores_file)){
-  #inputFile <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="")  #ArgsScore[1]
+  #zscores_file <- paste(Wstudy,"/",expName[m],'/ZScores/ZScores_Interaction.csv',sep="")  #ArgsScore[1]
-  X <- read.csv(file = inputFile[m],stringsAsFactors=FALSE,header = TRUE)
+  X <- read.csv(file = zscores_file[m],stringsAsFactors=FALSE,header = TRUE)
  if(colnames(X)[1] == "OrfRep"){
    colnames(X)[1] <- "ORF"
  }
  #Terms is the GO term list
-  Terms <- read.delim(file = SGD_Terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
+  Terms <- read.delim(file = sgd_terms_file,header=FALSE,quote = "",col.names = c("GO_ID","GO_Term","GO_Aspect","GO_Term_Definition"))
  #all ORFs associated with GO term
  GO2ALLORFs <- as.list(org.Sc.sgdGO2ALLORFS)
  #Gene_Association is the gene association to GO term file
-  Gene_Association <- read.delim(SGD_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
+  Gene_Association <- read.delim(sgd_features_file,skip=8,header=FALSE,quote="",col.names = c("Database","Database_Object_ID","Database_Object_Symbol","NOT","GO_ID","Database_Reference","Evidence","With_or_From","Aspect","Database_Object_Name","Database_Object_Synonym","Database_Object_Type","taxon","Date","Assigned_By","OtherInfo","Empty"))
  #Get the ORF names associated with each gene/GO term
  Gene_Association$ORF <- str_split_fixed(as.character(Gene_Association$Database_Object_Synonym),"\\|",2)[,1]
  #Get the numeric GO ID for matching
@@ -140,7 +151,7 @@ for(m in 1:length(inputFile)){
  X2 <- X2[,order(names(X2))]
  X2 <- X2[!is.na(X2$Z_lm_L_Avg),]
  #create output file
-  write.csv(X2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
+  # Use exp_name (args[1]) like the other write.csv calls in this loop, not the old expName vector
+  write.csv(X2,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_All.csv",sep=""),row.names=FALSE)
  #remove NAs
  X3 <- X2[!is.na(X2$Z_lm_L_Avg),]
  #identify redundant GO terms
@@ -167,21 +178,21 @@ for(m in 1:length(inputFile)){
  }
  Y1 <- unique(Y)
-  write.csv(Y1,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
+  write.csv(Y1,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_All_NonRedundantTerms.csv",sep=""),row.names = FALSE)
  Y2 <- Y1[Y1$Z_lm_L_Avg >= 2 | Y1$Z_lm_L_Avg <= -2,]
  Y2 <- Y2[!is.na(Y2$Z_lm_L_Avg),]
-  write.csv(Y2,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
+  write.csv(Y2,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L.csv",sep=""),row.names = FALSE)
  Y3 <- Y2[Y2$NumGenes_Avg > 2,]
-  write.csv(Y3,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
+  write.csv(Y3,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_L_Above2Genes.csv",sep=""),row.names = FALSE)
  Y4 <- Y1[Y1$Z_lm_K_Avg >= 2 | Y1$Z_lm_K_Avg <= -2,]
  Y4 <- Y4[!is.na(Y4$Z_lm_K_Avg),]
-  write.csv(Y4,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
+  write.csv(Y4,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K.csv",sep=""),row.names = FALSE)
  Y5 <- Y4[Y4$NumGenes_Avg > 2,]
-  write.csv(Y5,file=paste(outputPathGTA,"/",expName[m],"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
+  write.csv(Y5,file=paste(output_dir,"/",exp_name,"/Average_GOTerms_NonRedundantTerms_Above2SD_K_Above2Genes.csv",sep=""),row.names = FALSE)
  #End of 'for loop'
 }
--- a/workflow/templates/qhtcp/ExpTemplate/Z_InteractionTemplate.R
+++ b/workflow/templates/qhtcp/ExpTemplate/Z_InteractionTemplate.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env R
 # Based on InteractionTemplate.R which is based on Sean Santos's Interaction_V5 script
 #
-# Updated 240723 Bryan C Roessler to improve file operations and portability
+# Updated 240724 Bryan C Roessler to improve file operations and portability
 # NOTE: The script now has 4 additional OPTIONAL arguments: 
 #   1. Path to input file
 #   2. /output/ directory