From bcc8dba3e4e4d4fff2829dafe7c5d0d0096187dd Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Thu, 22 Aug 2024 04:20:40 -0400 Subject: [PATCH] Add new configuration file --- .../apps/shell/submodules/parse_study_info | 121 ++ workflow/apps/python/join_interactions.py | 125 ++ workflow/apps/r/PairwiseLK.R | 282 ++-- .../apps/r/calculate_interaction_zscores.R | 335 +++++ workflow/apps/r/calculate_pairwise_lk.R | 209 +++ workflow/apps/r/interactions.R | 18 +- workflow/apps/r/joinInteractExps.R | 557 +++++--- workflow/qhtcp-workflow | 1193 ++++++++--------- 8 files changed, 1824 insertions(+), 1016 deletions(-) create mode 100644 workflow/.old/apps/shell/submodules/parse_study_info create mode 100644 workflow/apps/python/join_interactions.py create mode 100644 workflow/apps/r/calculate_interaction_zscores.R create mode 100644 workflow/apps/r/calculate_pairwise_lk.R diff --git a/workflow/.old/apps/shell/submodules/parse_study_info b/workflow/.old/apps/shell/submodules/parse_study_info new file mode 100644 index 00000000..c11c2d4d --- /dev/null +++ b/workflow/.old/apps/shell/submodules/parse_study_info @@ -0,0 +1,121 @@ +#!/usr/bin/env/bash +# @description Creates, modifies, and parses the study info file +# +# TODO +# +# * Needs refactoring +# * Ended up combining a few functions into one +# +# +# @exitcode 0 If one or more studies found +# @exitcode 1 If no studies found +# @set STUDIES array contains array of "exp# sd ExpDir" +parse_study_info() { + debug "Running: ${FUNCNAME[0]}" + + # Only run this once per project + # in case we run multiple modules + (( SET_STUDIES )) && return 0 + declare -g SET_STUDIES=1 + + # Use initials from project or whoami? + # Best I can do is first two letters of username + # See TODO in markdown + initials="${USER:0:2}" + INITIALS=${initials^^} + + empty_study=1 + # Find an Exp directory that does not exist + while [[ -d $STUDY_RESULTS_DIR/exp$empty_study ]]; do + (( empty_study++ )) + done + + next_study_entry="$empty_study,$PROJECT_NAME,NA,NA,$INITIALS" + + echo "${underline}Study Info File${nounderline}" + + if [[ -f $STUDY_INFO_FILE ]]; then + # Get latest entry + while IFS=',' read -r col1 _; do # split on comma, get Exp # from 1st column + studies_nums+=("$col1") + done < <(tail -n +2 "$STUDY_INFO_FILE") + largest=${studies_nums[0]} + for i in "${studies_nums[@]}"; do + if ((i > largest)); then + largest=$i + fi + done + empty_study=$((largest+1)) + next_study_entry="$((empty_study)),$PROJECT_NAME,NA,NA,$INITIALS" + else # create a default study info file + echo "ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy" > "$STUDY_INFO_FILE" + echo "$next_study_entry" >> "$STUDY_INFO_FILE" + next_study_entry="$((empty_study+1)),$PROJECT_NAME,NA,NA,$INITIALS" + fi + + # Print current studies + cat <<-EOF + * Give each experiment labels to be used for the plots and specific files. + * Enter the desired experiment names in the order they should appear in the REMc heatmaps + + Current study info file contents: + + ${underline}$STUDY_INFO_FILE${nounderline} + $(cat "$STUDY_INFO_FILE") + + EOF + + # Allow user to add/edit the study info file + if ! ((YES)); then + for ((i=1; i<2; i++)); do + cat <<-EOF + Next entry suggestion: "$next_study_entry" + + Would you like to: + * (a)dd the suggested entry + * (e)dit the study info file manually + * (c)ontinue (default) + EOF + read -r -p "(c): " response + echo "" + [[ -z $response ]] && break + case $response in + a) + echo "Adding auto-entry suggestion to $STUDY_INFO_FILE" + echo "$next_study_entry" >> "$STUDY_INFO_FILE" + next_study_entry="$((empty_study+1)),$PROJECT_NAME,NA,NA,$INITIALS" + i=0 + ;; + e) + debug "$EDITOR $STUDY_INFO_FILE" + "$EDITOR" "$STUDY_INFO_FILE" + ;; + c) + break + ;; + *) + err "Invalid response, please try again" + i=0 + ;; + esac + break + done + fi + + # Read study info file + declare -ga STUDIES + while IFS=',' read -r num _ sd _; do + STUDIES+=("$num $sd $STUDY_RESULTS_DIR/exp$num") + done < <(tail -n +2 "$STUDY_INFO_FILE") # skip header + + # Initialize missing Exp dirs + for study in "${STUDIES[@]}"; do + read -r _ _ dir <<< "$study" + [[ -d $dir ]] || execute mkdir "$dir" + done + + ((DEBUG)) && declare -p STUDIES + + # Return true if at least one study was found + [[ ${#STUDIES[@]} -gt 0 ]] +} \ No newline at end of file diff --git a/workflow/apps/python/join_interactions.py b/workflow/apps/python/join_interactions.py new file mode 100644 index 00000000..ecd40d3f --- /dev/null +++ b/workflow/apps/python/join_interactions.py @@ -0,0 +1,125 @@ +import pandas as pd +import os +import sys +import numpy as np + +# Function to parse and set arguments +def parse_arguments(): + if len(sys.argv) == 1: # Interactive mode + args = [ + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD", + 2, + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2" + ] + else: + args = sys.argv[1:] + + return { + "out_dir": os.path.abspath(args[0]), + "sd": float(args[1]), + "study_info": os.path.abspath(args[2]), + "input_dirs": args[3:] + } + +args = parse_arguments() + +# Create an array for the zscores files +def get_zscores_files(dirs): + files = [os.path.join(study, "zscores", "zscores_interaction.csv") + for study in dirs if os.path.exists(os.path.join(study, "zscores", "zscores_interaction.csv"))] + return files + +zscores_files = get_zscores_files(args['input_dirs']) +print(f"The SD value is: {args['sd']}") + +# Ensure there are enough files to compare +if len(zscores_files) < 2: + sys.exit("Not enough experiments to compare, exiting script") + +# Function to join zscores files +def join_zscores_files(files): + joined_data = pd.read_csv(files[0]) + for file in files[1:]: + temp_data = pd.read_csv(file) + joined_data = pd.merge(joined_data, temp_data, on="OrfRep", how="outer") + return joined_data + +# Load and join zscores files +joined_data = join_zscores_files(zscores_files) + +# Order and select columns +def order_and_select_columns(data): + ordered_data = data[sorted(data.columns)] + selected_headers = ordered_data.filter(regex="OrfRep|Gene|z_lm_k|z_shift_k|z_lm_l|z_shift_l") + return selected_headers + +selected_headers = order_and_select_columns(joined_data) + +# Remove redundant columns like "Gene.1" +def clean_headers(data, suffixes): + suffixes_to_remove = [f"Gene.{i}" for i in range(1, suffixes+1)] + return data.drop(columns=suffixes_to_remove, errors='ignore') + +headSel = clean_headers(selected_headers, len(zscores_files) - 1) +headSel2 = clean_headers(joined_data.filter(regex="OrfRep|Gene"), len(zscores_files) - 1) + +# Fill NA values in Shift and Z_lm columns +def fill_na_in_columns(data): + for column in data.columns: + if "Shift" in column: + data[column].fillna(0.001, inplace=True) + elif "Z_lm_" in column: + data[column].fillna(0.0001, inplace=True) + return data + +headSel = fill_na_in_columns(headSel) + +# Filter based on standard deviation +def filter_by_sd(data, sd): + if sd == 0: + return data + z_lm_cols = data.filter(regex="z_lm_") + filter_vector = z_lm_cols.abs().ge(sd).any(axis=1) + return data[filter_vector] + +REMcRdy = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_lm_"), args['sd']) +shiftOnly = filter_by_sd(headSel.filter(regex="OrfRep|Gene|z_shift"), args['sd']) + +# Reorder columns to interleave Z_lm and Shift data +def reorder_columns(data1, data2): + combined_data = data1.copy() + for i in range(2, data1.shape[1]): + combined_data.insert(2 * i - 1, data2.columns[i], data2.iloc[:, i]) + return combined_data + +combI = reorder_columns(headSel2, shiftOnly) + +# Write output files +REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"), index=False, quotechar=False) +shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"), index=False, quotechar=False) + +# Relabel headers using experiment names from StudyInfo.csv +def relabel_headers(headers, labels): + new_labels = headers.copy() + for i, header in enumerate(headers): + suffix = header.split('.')[-1] + if suffix.isdigit() and int(suffix) in range(1, 4): + exp_name = labels.iloc[int(suffix) - 1, 1] + new_labels[i] = header.replace(f".{suffix}", f"_{exp_name}") + return new_labels + +LabelStd = pd.read_csv(args['study_info']) + +shiftOnly.columns = relabel_headers(shiftOnly.columns, LabelStd) +REMcRdy.columns = relabel_headers(REMcRdy.columns, LabelStd) + +# Save relabeled files +REMcRdy.to_csv(os.path.join(args['out_dir'], "REMcRdy_lm_only.csv"), index=False, quotechar=False) +shiftOnly.to_csv(os.path.join(args['out_dir'], "Shift_only.csv"), index=False, quotechar=False) + +# Save updated parameters +LabelStd.iloc[:, 3] = args['sd'] +LabelStd.to_csv(os.path.join(args['out_dir'], "parameters.csv"), index=False) +LabelStd.to_csv(args['study_info'], index=False) diff --git a/workflow/apps/r/PairwiseLK.R b/workflow/apps/r/PairwiseLK.R index 3026a3c3..0c6318aa 100644 --- a/workflow/apps/r/PairwiseLK.R +++ b/workflow/apps/r/PairwiseLK.R @@ -1,12 +1,4 @@ -#!/usr/bin/env Rscript # This R script performs GTA L and K Pairwise Compares for user specified pairs of Experiments -# -# Updated 240724 Bryan C Roessler to improve file operations and portability -# NOTE: The two required arguments are the same and now there are two optional arguments -# 1. Exp1 -# 2. Exp2 -# 3. StudyInfo.csv file -# 4. Output Directory library("ggplot2") library("plotly") @@ -16,31 +8,14 @@ library("grid") library("ggthemes") args <- commandArgs(TRUE) -exp_name <- args[1] -exp_name2 <- args[2] +exp1_name <- args[1] +exp1_file <- args[2] +exp2_name <- args[3] +exp2_file <- args[4] +output_dir <- args[5] -if (length(args) >= 3) { - study_info_file <- args[3] -} else { - study_info_file <- "StudyInfo.csv" -} - -if (length(args) >= 4) { - output_dir <- args[4] -} else { - output_dir <- "gta" -} - -expNumber1 <- as.numeric(sub("^.*?(\\d+)$", "\\1", exp_name)) -expNumber2 <- as.numeric(sub("^.*?(\\d+)$", "\\1", exp_name2)) -Labels <- read.csv(file = study_info_file, stringsAsFactors = FALSE) -Name1 <- Labels[expNumber1, 2] -Name2 <- Labels[expNumber2, 2] -go_terms_file <- "Average_GOTerms_All.csv" -input_file1 <- file.path(output_dir, exp_name, go_terms_file) -input_file2 <- file.path(output_dir, exp_name2, go_terms_file) -pairDirL <- file.path(output_dir, paste0("PairwiseCompareL_", exp_name, "-", exp_name2)) -pairDirK <- file.path(output_dir, paste0("PairwiseCompareK_", exp_name, "-", exp_name2)) +pairDirL <- file.path(output_dir, paste0("PairwiseCompareL_", exp1_name, "-", exp2_name)) +pairDirK <- file.path(output_dir, paste0("PairwiseCompareK_", exp1_name, "-", exp2_name)) # Pairwise L # outputPlotly <- "../GTAresults/PairwiseCompareL/" #"/GTAresults/PairwiseCompareL/" @@ -75,21 +50,6 @@ theme_Publication <- function(base_size = 14, base_family = "sans") { ) } -scale_fill_Publication <- function(...) { - library(scales) - discrete_scale("fill", "Publication", manual_pal( - values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ... - ) -} - -scale_colour_Publication <- function(...) { - discrete_scale("colour", "Publication", manual_pal( - values = c("#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ... - ) -} - theme_Publication_legend_right <- function(base_size = 14, base_family = "sans") { (theme_foundation(base_size = base_size, base_family = base_family) + theme( @@ -131,8 +91,8 @@ scale_colour_Publication <- function(...) { "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), ...) } -X1 <- read.csv(file = input_file1, stringsAsFactors = FALSE, header = TRUE) -X2 <- read.csv(file = input_file2, stringsAsFactors = FALSE, header = TRUE) +X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE) +X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE) X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2")) gg <- ggplot(data = X, aes( @@ -146,7 +106,7 @@ gg <- ggplot(data = X, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect( aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", @@ -156,12 +116,12 @@ gg <- ggplot(data = X, aes( fill = NA ) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, " vs. ", Name2) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOntology.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")), width = 12, height = 8 ) @@ -169,7 +129,7 @@ pdf( gg dev.off() pgg <- ggplotly(gg) -fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_byOntology.html")) +fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html")) htmlwidgets::saveWidget(pgg, fname) # ID aggravators and alleviators, regardless of whether they meet 2SD threshold @@ -184,19 +144,19 @@ X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_L_Avg_X2 <= -2 & X$Z_lm X$Overlap_Avg <- NA try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <- - paste(Name1, "Specific_Deletion_Enhancers", sep = "_")) + paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <- - paste(Name1, "Specific_Deletion_Suppresors", sep = "_")) + paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Specific_Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Specific_Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Enhancers") try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Suppressors") try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Deletion_Enhancers", Name1, "Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Deletion_Suppressors", Name1, "Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")) gg <- ggplot(data = X, aes( x = Z_lm_L_Avg_X1, @@ -209,7 +169,7 @@ gg <- ggplot(data = X, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect( aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", @@ -219,12 +179,12 @@ gg <- ggplot(data = X, aes( fill = NA ) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, " vs. ", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")), width = 12, height = 8 ) @@ -232,7 +192,7 @@ pdf( gg dev.off() pgg <- ggplotly(gg) -fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_byOverlap.html")) +fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html")) htmlwidgets::saveWidget(pgg, fname) x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ] @@ -249,15 +209,15 @@ gg <- ggplot(data = x_rem2_gene, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_above2genes.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")), width = 12, height = 8 ) @@ -266,7 +226,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_byOverlap_above2genes.html")) +fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html")) htmlwidgets::saveWidget(pgg, fname) #4 @@ -282,15 +242,15 @@ gg <- ggplot(data = X_overlap_nothresold, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_Above2SD_ByOverlap.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")), width = 12, height = 8 ) @@ -299,7 +259,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_Above2SD_ByOverlap.html")) +fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html")) htmlwidgets::saveWidget(pgg, fname) # Only output GTA terms where average score is still above 2 after subtracting the SD @@ -320,16 +280,16 @@ X2_Specific_Aggravators2_X1_Alleviatiors2 <- Z1[which(Z1$L_Subtract_SD_X2 >= 2 & X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & Z1$L_Subtract_SD_X1 >= 2), ] X$Overlap <- NA -try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(Name1, "Specific_Deletion_Enhancers", sep = "_")) -try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(Name1, "Specific_Deletion_Suppresors", sep = "_")) -try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(Name2, "Specific_Deletion_Enhancers", sep = "_")) -try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(Name2, "Specific_Deletion_Suppressors", sep = "_")) +try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_")) +try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp1_name, "Specific_Deletion_Suppresors", sep = "_")) +try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_")) +try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <- paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Enhancers") try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Suppressors") try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <- - paste(Name2, "Deletion_Enhancers", Name1, "Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <- - paste(Name2, "Deletion_Suppressors", Name1, "Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")) #5 X_abovethreshold <- X[!(is.na(X$Overlap)), ] @@ -345,15 +305,15 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, " vs. ", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")), width = 12, height = 8 ) @@ -362,7 +322,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold.html")) +fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html")) htmlwidgets::saveWidget(pgg, fname) #6 @@ -377,16 +337,16 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3, size = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_names.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")), width = 20, height = 20 ) @@ -395,7 +355,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_names.html")) +fname <- file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html")) htmlwidgets::saveWidget(pgg, fname) X_abovethreshold$X1_Rank <- NA @@ -415,16 +375,16 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3, size = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")), width = 15, height = 15 ) @@ -435,7 +395,7 @@ pgg <- ggplotly(gg) #pgg fname <- - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX1.html")) + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html")) htmlwidgets::saveWidget(pgg, fname) #8 @@ -450,16 +410,16 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_L_SD_X1, SD_2 = Z_lm_L_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3, size = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")), + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")), width = 15, height = 15 ) @@ -469,18 +429,18 @@ dev.off() pgg <- ggplotly(gg) #pgg fname <- - file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX2.html")) + file.path(pairDirL, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html")) htmlwidgets::saveWidget(pgg, fname) write.csv( x = X, - file.path(pairDirL, paste0("All_GTA_Avg_Scores_", Name1, "_vs_", Name2, ".csv")), + file.path(pairDirL, paste0("All_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")), row.names = FALSE ) write.csv( x = X_abovethreshold, - file = file.path(pairDirL, paste0("AboveThreshold_GTA_Avg_Scores_", Name1, "_vs_", Name2, ".csv")), + file = file.path(pairDirL, paste0("AboveThreshold_GTA_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")), row.names = FALSE ) @@ -582,8 +542,8 @@ scale_colour_Publication <- function(...) { ) } -X1 <- read.csv(file = input_file1, stringsAsFactors = FALSE, header = TRUE) -X2 <- read.csv(file = input_file2, stringsAsFactors = FALSE, header = TRUE) +X1 <- read.csv(file = exp1_file, stringsAsFactors = FALSE, header = TRUE) +X2 <- read.csv(file = exp2_file, stringsAsFactors = FALSE, header = TRUE) #1 X <- merge(X1, X2, by = "Term_Avg", all = TRUE, suffixes = c("_X1", "_X2")) @@ -599,15 +559,15 @@ gg <- ggplot(data = X, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOntology.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOntology.pdf")), width = 12, height = 8 ) @@ -616,7 +576,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_byOntology.html")) +fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOntology.html")) htmlwidgets::saveWidget(pgg, fname) #2 @@ -632,23 +592,23 @@ X2_Specific_Alleviators_X1_Aggravators <- X[which(X$Z_lm_K_Avg_X2 <= -2 & X$Z_lm X$Overlap_Avg <- NA try(X[X$Term_Avg %in% X1_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <- - paste(Name1, "Specific_Deletion_Suppressors", sep = "_")) + paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% X1_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <- - paste(Name1, "Specific_Deletion_Enhancers", sep = "_")) + paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Aggravators$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Specific_Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Specific_Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% Overlap_Aggravators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Suppressors") try(X[X$Term_Avg %in% Overlap_Alleviators$Term_Avg, ]$Overlap_Avg <- "Overlapping_Deletion_Enhancers") try(X[X$Term_Avg %in% X2_Specific_Aggravators_X1_Alleviatiors$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Deletion_Suppressors", Name1, "Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators_X1_Aggravators$Term_Avg, ]$Overlap_Avg <- - paste(Name2, "Deletion_Enhancers", Name1, "Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_")) -plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_byOverlap.html")) +plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html")) gg <- ggplot(data = X, aes( x = Z_lm_K_Avg_X1, y = Z_lm_K_Avg_X2, @@ -660,15 +620,15 @@ gg <- ggplot(data = X, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap.pdf")), width = 12, height = 8 ) @@ -678,12 +638,12 @@ dev.off() pgg <- ggplotly(gg) #2 -fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_byOverlap.html")) +fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap.html")) htmlwidgets::saveWidget(pgg, fname) #3 x_rem2_gene <- X[X$NumGenes_Avg_X1 >= 2 & X$NumGenes_Avg_X2 >= 2, ] -plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_byOverlap_above2genes.html")) +plotly_path <- file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html")) gg <- ggplot(data = x_rem2_gene, aes( x = Z_lm_K_Avg_X1, y = Z_lm_K_Avg_X2, @@ -695,15 +655,15 @@ gg <- ggplot(data = x_rem2_gene, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_above2genes.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_above2genes.pdf")), width = 12, height = 8 ) @@ -712,7 +672,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_byOverlap_above2genes.html")) +fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_byOverlap_above2genes.html")) htmlwidgets::saveWidget(pgg, fname) #4 @@ -728,15 +688,15 @@ gg <- ggplot(data = X_overlap_nothresold, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_Above2SD_ByOverlap.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.pdf")), width = 12, height = 8 ) @@ -746,7 +706,7 @@ dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_Above2SD_ByOverlap.html")) +fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_Above2SD_ByOverlap.html")) htmlwidgets::saveWidget(pgg, fname) #5 @@ -769,21 +729,21 @@ X2_Specific_Alleviators2_X1_Aggravators2 <- Z2[which(Z2$L_Subtract_SD_X2 <= -2 & X$Overlap <- NA try(X[X$Term_Avg %in% X1_Specific_Aggravators2$Term_Avg, ]$Overlap <- - paste(Name1, "Specific_Deletion_Suppressors", sep = "_")) + paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% X1_Specific_Alleviators2$Term_Avg, ]$Overlap <- - paste(Name1, "Specific_Deletion_Enhancers", sep = "_")) + paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Aggravators2$Term_Avg, ]$Overlap <- - paste(Name2, "Specific_Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators2$Term_Avg, ]$Overlap <- - paste(Name2, "Specific_Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% Overlap_Aggravators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Suppressors") try(X[X$Term_Avg %in% Overlap_Alleviators2$Term_Avg, ]$Overlap <- "Overlapping_Deletion_Enhancers") try(X[X$Term_Avg %in% X2_Specific_Aggravators2_X1_Alleviatiors2$Term_Avg, ]$Overlap <- - paste(Name2, "Deletion_Suppressors", Name1, "Deletion_Enhancers", sep = "_")) + paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_")) try(X[X$Term_Avg %in% X2_Specific_Alleviators2_X1_Aggravators2$Term_Avg, ]$Overlap <- - paste(Name2, "Deletion_Enhancers", Name1, "Deletion_Suppressors", sep = "_")) + paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_")) X_abovethreshold <- X[!(is.na(X$Overlap)), ] gg <- ggplot(data = X_abovethreshold, aes( @@ -797,15 +757,15 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.pdf")), width = 12, height = 8 ) @@ -814,7 +774,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold.html")) +fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold.html")) htmlwidgets::saveWidget(pgg, fname) #6 @@ -829,16 +789,16 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_text(aes(label = Term_Avg), nudge_y = 0.25, size = 2) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3, size = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, " vs. ", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, " vs. ", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_names.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.pdf")), width = 20, height = 20 ) @@ -846,7 +806,7 @@ gg dev.off() pgg <- ggplotly(gg) #pgg -fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_names.html")) +fname <- file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_names.html")) htmlwidgets::saveWidget(pgg, fname) #7 @@ -866,16 +826,16 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_text(aes(label = X1_Rank), nudge_y = 0.25, size = 4) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3, size = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.pdf")), width = 15, height = 15 ) @@ -885,7 +845,7 @@ dev.off() pgg <- ggplotly(gg) #pgg fname <- - file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX1.html")) + file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX1.html")) htmlwidgets::saveWidget(pgg, fname) #8 @@ -900,16 +860,16 @@ gg <- ggplot(data = X_abovethreshold, aes( SD_1 = Z_lm_K_SD_X1, SD_2 = Z_lm_K_SD_X2 )) + - xlab(paste("GO Term Avg lm Z for", Name1)) + + xlab(paste("GO Term Avg lm Z for", exp1_name)) + geom_text(aes(label = X2_Rank), nudge_y = 0.25, size = 4) + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + geom_point(shape = 3, size = 3) + - ylab(paste("GO Term Avg lm Z for", Name2)) + - ggtitle(paste("Comparing Average GO Term Z lm for", Name1, "vs.", Name2)) + + ylab(paste("GO Term Avg lm Z for", exp2_name)) + + ggtitle(paste("Comparing Average GO Term Z lm for", exp1_name, "vs.", exp2_name)) + theme_Publication_legend_right() pdf( - file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")), + file.path(pairDirK, paste0("Scatter_lm_GTF_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.pdf")), width = 15, height = 15 ) @@ -919,17 +879,17 @@ dev.off() pgg <- ggplotly(gg) #pgg fname <- - file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", Name1, "_vs_", Name2, "_All_ByOverlap_AboveThreshold_numberedX2.html")) + file.path(pairDirK, paste0("Scatter_lm_GTA_Averages_", exp1_name, "_vs_", exp2_name, "_All_ByOverlap_AboveThreshold_numberedX2.html")) htmlwidgets::saveWidget(pgg, fname) write.csv( x = X, - file = file.path(pairDirK, paste0("All_GTF_Avg_Scores_", Name1, "_vs_", Name2, ".csv")), + file = file.path(pairDirK, paste0("All_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")), row.names = FALSE ) write.csv( x = X_abovethreshold, - file = file.path(pairDirK, paste0("AboveThreshold_GTF_Avg_Scores_", Name1, "_vs_", Name2, ".csv")), + file = file.path(pairDirK, paste0("AboveThreshold_GTF_Avg_Scores_", exp1_name, "_vs_", exp2_name, ".csv")), row.names = FALSE ) diff --git a/workflow/apps/r/calculate_interaction_zscores.R b/workflow/apps/r/calculate_interaction_zscores.R new file mode 100644 index 00000000..39d12f61 --- /dev/null +++ b/workflow/apps/r/calculate_interaction_zscores.R @@ -0,0 +1,335 @@ +suppressMessages({ + if (!require("ggplot2")) stop("Package ggplot2 is required but not installed.") + if (!require("plotly")) stop("Package plotly is required but not installed.") + if (!require("htmlwidgets")) stop("Package htmlwidgets is required but not installed.") + if (!require("dplyr")) stop("Package dplyr is required but not installed.") + if (!require("ggthemes")) stop("Package ggthemes is required but not installed.") + if (!require("plyr")) stop("Package plyr is required but not installed.") +}) + +# Constants for configuration +PLOT_WIDTH <- 14 +PLOT_HEIGHT <- 9 +BASE_SIZE <- 14 + +options(warn = 2, max.print = 100) + +parse_arguments <- function() { + args <- if (interactive()) { + c( + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", + 3, + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", + "Experiment 1: Doxo versus HLD", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", + "Experiment 2: HLD versus Doxo", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp3", + "Experiment 3: HLD versus WT", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp4", + "Experiment 4: Doxo versus WT" + ) + } else { + commandArgs(trailingOnly = TRUE) + } + + paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE) + names <- args[seq(6, length(args), by = 2)] + experiments <- setNames(paths, names) + + list( + out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), + sd = as.numeric(args[2]), + sgd_gene_list = normalizePath(file.path(args[3]), mustWork = FALSE), + easy_results_file = normalizePath(file.path(args[4]), mustWork = FALSE), + experiments = experiments + ) +} + +args <- parse_arguments() + +dir.create(args$out_dir, showWarnings = FALSE) + +# Define themes and scales +theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") { + theme_foundation(base_size = base_size, base_family = base_family) + + theme( + plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), + text = element_text(), + panel.background = element_rect(colour = NA), + plot.background = element_rect(colour = NA), + panel.border = element_rect(colour = NA), + axis.title = element_text(face = "bold", size = rel(1)), + axis.title.y = element_text(angle = 90, vjust = 2), + axis.title.x = element_text(vjust = -0.2), + axis.line = element_line(colour = "black"), + panel.grid.major = element_line(colour = "#f0f0f0"), + panel.grid.minor = element_blank(), + legend.key = element_rect(colour = NA), + legend.position = "bottom", + legend.direction = "horizontal", + plot.margin = unit(c(10, 5, 5, 5), "mm"), + strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), + strip.text = element_text(face = "bold") + ) +} + +theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") { + theme_publication(base_size, base_family) + + theme( + legend.position = "right", + legend.direction = "vertical", + legend.key.size = unit(0.5, "cm"), + legend.spacing = unit(0, "cm"), + legend.title = element_text(face = "italic") + ) +} + +scale_fill_publication <- function(...) { + discrete_scale("fill", "Publication", manual_pal(values = c( + "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", + "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" + )), ...) +} + +scale_colour_publication <- function(...) { + discrete_scale("colour", "Publication", manual_pal(values = c( + "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", + "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" + )), ...) +} + +# Load SGD gene list +sgd_genes <- function(sgd_gene_list) { + read.delim(file = sgd_gene_list, quote = "", header = FALSE, + colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% + dplyr::rename(ORF = V4, GeneName = V5) +} + +genes <- sgd_genes(args$sgd_gene_list) + +load_and_preprocess_data <- function(easy_results_file, genes) { + df <- tryCatch({ + read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) + }, error = function(e) { + stop("Error reading file: ", easy_results_file, "\n", e$message) + }) %>% + filter(!.[[1]] %in% c("", "Scan")) # Fixed syntax + + numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd") + df[numeric_columns[numeric_columns %in% colnames(df)]] <- + lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric) + + df <- df %>% + mutate( + L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}, + AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}, + conc_num = if ("Conc" %in% colnames(.)) as.numeric(gsub("[^0-9\\.]", "", Conc)) else NA, + delta_bg = if (all(c("X1stBackgrd", "LstBackgrd") %in% colnames(.))) + LstBackgrd - X1stBackgrd else {warning("Missing columns for delta_bg calculation"); NA}, + GeneName = vapply(ORF, function(orf) { + gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName) + ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name) + }, character(1)) # Ensures a character vector is returned + ) + + if (nrow(df) == 0) warning("Dataframe is empty after filtering") + + return(df) +} + +create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") { + plot_func <- if (plot_type == "scatter") geom_point else geom_boxplot + filtered_df <- filter(df, is.finite(.data[[var]])) + + p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) + + plot_func(shape = 3, size = 0.2, position = "jitter") + + stat_summary(fun = mean, geom = "point", size = 0.6) + + stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") + + ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) + + theme_publication() + + pdf_path <- file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")) + pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) + print(p) + dev.off() + + html_path <- sub(".pdf$", ".html", pdf_path) + pgg <- suppressWarnings(ggplotly(p, tooltip = c("L", "K", "ORF", "Gene", "delta_bg", "GeneName")) %>% + layout(legend = list(orientation = "h"))) + saveWidget(pgg, html_path, selfcontained = TRUE) +} + + +publish_summary_stats <- function(df, variables, out_dir) { + stats_list <- lapply(variables, function(var) { + df %>% + dplyr::group_by(OrfRep, conc_num) %>% + dplyr::summarize( + N = dplyr::n(), # Ensure that the correct version of n() is used + mean_val = mean(.data[[var]], na.rm = TRUE), + sd_val = sd(.data[[var]], na.rm = TRUE), + se_val = sd_val / sqrt(N) + ) + }) + summary_stats_df <- dplyr::bind_rows(stats_list, .id = "variable") + write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) +} + + +publish_interaction_scores <- function(df, out_dir) { + interaction_scores <- df %>% + group_by(OrfRep) %>% + summarize( + mean_L = mean(L, na.rm = TRUE), + mean_K = mean(K, na.rm = TRUE), + mean_r = mean(r, na.rm = TRUE), + mean_AUC = mean(AUC, na.rm = TRUE), + delta_bg_mean = mean(delta_bg, na.rm = TRUE), + delta_bg_sd = sd(delta_bg, na.rm = TRUE) + ) %>% + mutate( + l_rank = rank(mean_L), + k_rank = rank(mean_K), + r_rank = rank(mean_r), + auc_rank = rank(mean_AUC) + ) + + write.csv(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE) + write.csv(arrange(interaction_scores, l_rank, k_rank), + file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE) +} + +publish_zscores <- function(df, out_dir) { + zscores <- df %>% + mutate( + zscore_L = scale(L, center = TRUE, scale = TRUE), + zscore_K = scale(K, center = TRUE, scale = TRUE), + zscore_r = scale(r, center = TRUE, scale = TRUE), + zscore_AUC = scale(AUC, center = TRUE, scale = TRUE) + ) + + write.csv(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE) +} + +generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) { + variables <- c("L", "K", "r", "AUC", "delta_bg") + lapply(variables, create_and_publish_plot, df = df, plot_type = "scatter", out_dir_qc = out_dir_qc) + delta_bg_above_tolerance <- filter(df, delta_bg >= delta_bg_tolerance) + lapply(variables, create_and_publish_plot, df = delta_bg_above_tolerance, + plot_type = "scatter", out_dir_qc = out_dir_qc, suffix = "_above_tolerance") +} + +process_exp_dir <- function(exp_dir, exp_name, genes, easy_results_file) { + out_dir <- file.path(exp_dir, "zscores") + out_dir_qc <- file.path(exp_dir, "qc") + dir.create(out_dir, showWarnings = FALSE) + dir.create(out_dir_qc, showWarnings = FALSE) + df <- load_and_preprocess_data(easy_results_file, genes) + delta_bg_tolerance <- mean(df$delta_bg, na.rm = TRUE) + 3 * sd(df$delta_bg, na.rm = TRUE) + + generate_and_publish_qc(df, delta_bg_tolerance, out_dir_qc) + + variables <- c("L", "K", "r", "AUC", "delta_bg") + publish_summary_stats(df, variables, out_dir) + publish_interaction_scores(df, out_dir) + publish_zscores(df, out_dir) + + list( + zscores_file = file.path(out_dir, "zscores_interaction.csv"), + exp_name = exp_name + ) +} + +processed_experiments <- lapply(names(args$experiments), function(exp_name) { + process_exp_dir(args$experiments[[exp_name]], exp_name, genes, args$easy_results_file) +}) + +zscores_files <- sapply(processed_experiments, `[[`, "zscores_file") +exp_names <- sapply(processed_experiments, `[[`, "exp_name") + +combine_zscores <- function(zscores_files) { + if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script") + + joined_data <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE) + for (file in zscores_files[-1]) { + temp_data <- read.csv(file = file, stringsAsFactors = FALSE) + joined_data <- plyr::join(joined_data, temp_data, by = "OrfRep", type = "full") + } + joined_data +} + +process_combined_zscores <- function(joined_data, sd, out_dir, exp_names) { + ordered_data <- joined_data %>% + select(contains("OrfRep"), matches("Gene"), + contains("z_lm_k"), contains("z_shift_k"), + contains("z_lm_l"), contains("z_shift_l")) %>% + arrange(contains("OrfRep")) + + clean_headers <- function(data, suffixes) { + suffixes_to_remove <- paste0("Gene.", seq_len(suffixes)) + select(data, -all_of(suffixes_to_remove)) + } + + headSel <- clean_headers(ordered_data, length(zscores_files) - 1) + headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1) + + fill_na_in_columns <- function(data) { + for (header in colnames(data)) { + if (grepl("Shift", header)) { + data[[header]][is.na(data[[header]])] <- 0.001 + } else if (grepl("Z_lm_", header)) { + data[[header]][is.na(data[[header]])] <- 0.0001 + } + } + data + } + + headSel <- fill_na_in_columns(headSel) + + filter_by_sd <- function(data, sd) { + if (sd == 0) return(data) + + z_lm_cols <- select(data, contains("z_lm_")) + filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0 + data[filter_vector, ] + } + + REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), sd) + shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), sd) + + reorder_columns <- function(data1, data2) { + combined_data <- data1 + for (i in 3:ncol(data1)) { + combined_data <- cbind(combined_data, data2[i], data1[i]) + } + combined_data + } + + combI <- reorder_columns(headSel2, shiftOnly) + + write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) + write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) + + relabel_headers <- function(headers, exp_names) { + new_labels <- headers + for (i in seq_along(headers)) { + suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i]) + if (suffix %in% seq_along(exp_names)) { + exp_name <- exp_names[as.numeric(suffix)] + new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i]) + } + } + new_labels + } + + colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), exp_names) + colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), exp_names) + + write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) + write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) +} + +joined_data <- combine_zscores(zscores_files) +process_combined_zscores(joined_data, args$sd, args$out_dir, exp_names) diff --git a/workflow/apps/r/calculate_pairwise_lk.R b/workflow/apps/r/calculate_pairwise_lk.R new file mode 100644 index 00000000..ee7876fa --- /dev/null +++ b/workflow/apps/r/calculate_pairwise_lk.R @@ -0,0 +1,209 @@ +suppressMessages({ + library("ggplot2") + library("plotly") + library("htmlwidgets") + library("extrafont") + library("grid") + library("ggthemes") +}) + +# Constants for configuration +PLOT_WIDTH <- 12 +PLOT_HEIGHT <- 8 +BASE_SIZE <- 14 + +options(warn = 2, max.print = 100) + +# Parse arguments +parse_arguments <- function() { + if (interactive()) { + args <- c( + "Exp1", + "/path/to/exp1_file.csv", + "Exp2", + "/path/to/exp2_file.csv", + "/path/to/output_dir" + ) + } else { + args <- commandArgs(trailingOnly = TRUE) + } + list( + exp1_name = args[1], + exp1_file = normalizePath(args[2], mustWork = TRUE), + exp2_name = args[3], + exp2_file = normalizePath(args[4], mustWork = TRUE), + output_dir = normalizePath(args[5], mustWork = FALSE) + ) +} + +args <- parse_arguments() + +# Create output directories if they don't exist +pairDirL <- file.path(args$output_dir, paste0("PairwiseCompareL_", args$exp1_name, "-", args$exp2_name)) +pairDirK <- file.path(args$output_dir, paste0("PairwiseCompareK_", args$exp1_name, "-", args$exp2_name)) +dir.create(pairDirL, showWarnings = FALSE, recursive = TRUE) +dir.create(pairDirK, showWarnings = FALSE, recursive = TRUE) + +# Define themes and scales +theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") { + theme_foundation(base_size = base_size, base_family = base_family) + + theme( + plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), + panel.background = element_rect(colour = NA), + plot.background = element_rect(colour = NA), + panel.border = element_rect(colour = NA), + axis.title = element_text(face = "bold", size = rel(1)), + axis.line = element_line(colour = "black"), + panel.grid.major = element_line(colour = "#f0f0f0"), + panel.grid.minor = element_blank(), + legend.position = "bottom", + legend.direction = "horizontal", + legend.key.size = unit(0.2, "cm"), + plot.margin = unit(c(10, 5, 5, 5), "mm"), + strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), + strip.text = element_text(face = "bold") + ) +} + +theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") { + theme_publication(base_size, base_family) + + theme( + legend.position = "right", + legend.direction = "vertical", + legend.key.size = unit(0.5, "cm") + ) +} + +scale_fill_publication <- function(...) { + discrete_scale("fill", "Publication", manual_pal(values = c( + "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", + "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" + )), ...) +} + +scale_colour_publication <- function(...) { + discrete_scale("colour", "Publication", manual_pal(values = c( + "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", + "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" + )), ...) +} + +# Read and merge data +load_and_merge_data <- function(file1, file2) { + df1 <- read.csv(file = file1, stringsAsFactors = FALSE, header = TRUE) + df2 <- read.csv(file = file2, stringsAsFactors = FALSE, header = TRUE) + merge(df1, df2, by = "Term_Avg", all = TRUE, suffixes = c("_df1", "_df2")) +} + +# Generate a plot and save to PDF and HTML +generate_plot <- function(data, x_var, y_var, color_var, title, file_path, theme_function) { + gg <- ggplot(data = data, aes_string( + x = x_var, + y = y_var, + color = color_var + )) + + xlab(paste("GO Term Avg lm Z for", args$exp1_name)) + + geom_rect(aes(xmin = -2, xmax = 2, ymin = -2, ymax = 2), color = "grey20", size = 0.25, alpha = 0.1, inherit.aes = FALSE, fill = NA) + + geom_point(shape = 3) + + ylab(paste("GO Term Avg lm Z for", args$exp2_name)) + + ggtitle(title) + + theme_function() + + pdf(file.path(file_path, ".pdf"), width = PLOT_WIDTH, height = PLOT_HEIGHT) + print(gg) + dev.off() + + pgg <- ggplotly(gg) + htmlwidgets::saveWidget(pgg, file.path(file_path, ".html")) +} + +# Identify and annotate specific interactions +annotate_interactions <- function(df, exp1_name, exp2_name, suffix) { + df$Overlap_Avg <- NA + interactions <- list( + "df1_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] < 2), + "df1_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] > -2), + "df2_Specific_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] < 2), + "df2_Specific_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] > -2), + "Overlap_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2), + "Overlap_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2), + "df2_Aggravators_df1_Alleviators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] >= 2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] <= -2), + "df2_Alleviators_df1_Aggravators" = which(df[[paste0("Z_lm_", suffix, "_Avg_df2")]] <= -2 & df[[paste0("Z_lm_", suffix, "_Avg_df1")]] >= 2) + ) + annotation <- list( + df1_Specific_Aggravators = paste(exp1_name, "Specific_Deletion_Enhancers", sep = "_"), + df1_Specific_Alleviators = paste(exp1_name, "Specific_Deletion_Suppressors", sep = "_"), + df2_Specific_Aggravators = paste(exp2_name, "Specific_Deletion_Enhancers", sep = "_"), + df2_Specific_Alleviators = paste(exp2_name, "Specific_Deletion_Suppressors", sep = "_"), + Overlap_Aggravators = "Overlapping_Deletion_Enhancers", + Overlap_Alleviators = "Overlapping_Deletion_Suppressors", + df2_Aggravators_df1_Alleviators = paste(exp2_name, "Deletion_Enhancers", exp1_name, "Deletion_Suppressors", sep = "_"), + df2_Alleviators_df1_Aggravators = paste(exp2_name, "Deletion_Suppressors", exp1_name, "Deletion_Enhancers", sep = "_") + ) + for (key in names(interactions)) { + try(df$Overlap_Avg[interactions[[key]]] <- annotation[[key]]) + } + df +} + +# Rank and filter data +rank_and_filter_data <- function(df, suffix) { + z1 <- df + z1[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df1")]] - z1[[paste0("Z_lm_", suffix, "_SD_df1")]] + z1[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z1[[paste0("Z_lm_", suffix, "_Avg_df2")]] - z1[[paste0("Z_lm_", suffix, "_SD_df2")]] + + z2 <- df + z2[[paste0("L_Subtract_SD_", suffix, "_df1")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df1")]] + z2[[paste0("Z_lm_", suffix, "_SD_df1")]] + z2[[paste0("L_Subtract_SD_", suffix, "_df2")]] <- z2[[paste0("Z_lm_", suffix, "_Avg_df2")]] + z2[[paste0("Z_lm_", suffix, "_SD_df2")]] + + df_above_threshold <- df[!is.na(df$Overlap_Avg), ] + df_above_threshold$df1_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df1")]], ties.method = "random") + df_above_threshold$df2_Rank <- rank(-df_above_threshold[[paste0("Z_lm_", suffix, "_Avg_df2")]], ties.method = "random") + + list(z1 = z1, z2 = z2, df_above_threshold = df_above_threshold) +} + +# Main execution function for a pairwise comparison +run_pairwise_comparison <- function(suffix, dir) { + df <- load_and_merge_data(args$exp1_file, args$exp2_file) + + # Generate initial ontology-based plot + generate_plot(df, + paste0("Z_lm_", suffix, "_Avg_df1"), paste0("Z_lm_", suffix, "_Avg_df2"), "Ontology_Avg_df1", + paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name), + file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOntology")), + theme_publication_legend_right) + + # Annotate interactions and generate overlap-based plot + df <- annotate_interactions(df, args$exp1_name, args$exp2_name, suffix) + ranks <- rank_and_filter_data(df, suffix) + + generate_plot(df, + paste0("Z_lm_", suffix, "_Avg_df1"), + paste0("Z_lm_", suffix, "_Avg_df2"), + "Overlap_Avg", + paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name), + file.path(dir, paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap")), + theme_publication_legend_right) + + generate_plot(ranks$df_above_threshold, + paste0("Z_lm_", suffix, "_Avg_df1"), + paste0("Z_lm_", suffix, "_Avg_df2"), + "Overlap_Avg", + paste("Comparing Average GO Term Z lm for", args$exp1_name, "vs.", args$exp2_name, "Above Threshold"), + file.path(dir, + paste0("Scatter_lm_GTA_Averages_", args$exp1_name, "_vs_", args$exp2_name, "_All_ByOverlap_AboveThreshold")), + theme_publication_legend_right) + + # Save CSV files + write.csv(df, file.path(dir, + paste0("All_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")), + row.names = FALSE) + write.csv(ranks$df_above_threshold, + file.path(dir, paste0("AboveThreshold_GTA_Avg_Scores_", args$exp1_name, "_vs_", args$exp2_name, ".csv")), + row.names = FALSE) +} + +# Execute Pairwise L and Pairwise K comparisons +run_pairwise_comparison("L", pairDirL) +run_pairwise_comparison("K", pairDirK) diff --git a/workflow/apps/r/interactions.R b/workflow/apps/r/interactions.R index 3d396583..1c04f797 100644 --- a/workflow/apps/r/interactions.R +++ b/workflow/apps/r/interactions.R @@ -14,7 +14,7 @@ BASE_SIZE <- 14 options(warn = 2, max.print = 100) -# Function to parse arguments +# Parse arguments parse_arguments <- function() { if (interactive()) { args <- c( @@ -92,7 +92,7 @@ scale_colour_publication <- function(...) { )), ...) } -# Function to load and preprocess data +# Load and preprocess data load_and_preprocess_data <- function(input_file) { df <- tryCatch({ read.delim(input_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) @@ -124,7 +124,7 @@ df <- df %>% mutate(OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep)) %>% filter(!Gene %in% c("BLANK", "Blank", "blank"), Drug != "BMH21") -# Function to create plot +# Create plot create_plot <- function(df, var, plot_type) { filtered_df <- df %>% filter(is.finite(.data[[var]])) p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) + @@ -143,7 +143,7 @@ create_plot <- function(df, var, plot_type) { return(p) } -# Function to publish plot to PDF and HTML (Plotly) +# Publish plot to PDF and HTML (Plotly) publish_plot <- function(plot, plot_path) { # if (file.exists(plot_path)) { # file.rename(plot_path, paste0(plot_path, BACKUP_SUFFIX)) @@ -153,13 +153,13 @@ publish_plot <- function(plot, plot_path) { print(plot) dev.off() - pgg <- suppressWarnings(ggplotly(plot, + pgg <- suppressWarnings(ggplotly(plot, tooltip = c("L", "K", "ORF", "Gene", "delta_bg")) %>% layout(legend = list(orientation = "h"))) saveWidget(pgg, sub(".pdf$", ".html", plot_path), selfcontained = TRUE) } -# Function to publish multiple plots +# Publish multiple plots publish_multiple_plots <- function(df, variables, plot_type, out_dir_qc, suffix = "") { lapply(variables, function(var) { plot <- create_plot(df, var, plot_type) @@ -167,7 +167,7 @@ publish_multiple_plots <- function(df, variables, plot_type, out_dir_qc, suffix }) } -# Function to calculate and publish summary statistics +# Calculate and publish summary statistics publish_summary_stats <- function(df, variables, out_dir) { stats_list <- lapply(variables, function(var) { df %>% @@ -183,7 +183,7 @@ publish_summary_stats <- function(df, variables, out_dir) { write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) } -# Function to calculate and publish interaction scores +# Calculate and publish interaction scores publish_interaction_scores <- function(df, out_dir) { interaction_scores <- df %>% group_by(OrfRep) %>% @@ -207,7 +207,7 @@ publish_interaction_scores <- function(df, out_dir) { arrange(l_rank, k_rank), file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE) } -# Function to publish z-scores +# Publish z-scores publish_zscores <- function(df, out_dir) { zscores <- df %>% mutate( diff --git a/workflow/apps/r/joinInteractExps.R b/workflow/apps/r/joinInteractExps.R index a2557ad3..3fd986d4 100644 --- a/workflow/apps/r/joinInteractExps.R +++ b/workflow/apps/r/joinInteractExps.R @@ -1,238 +1,375 @@ -#!/usr/bin/env Rscript -# JoinInteractExps.R - -library("plyr") -library("sos") library("dplyr") -args <- commandArgs(TRUE) - -# Set output dir -if (length(args) >= 1) { - out_dir <- file.path(args[1]) -} else { - out_dir <- "./" # for legacy workflow -} - -# Set sd value -if (length(args) >= 2) { - sd <- as.numeric(args[2]) -} else { - sd <- 2 # default value -} - -sprintf("SD value is: %d", sd) - -# Set study_info file -if (length(args) >= 3) { - study_info <- file.path(args[3]) -} else { - study_info <- "../Code/StudyInfo.csv" # for legacy workflow -} - -studies <- args[4:length(args)] -print(studies) -input_files <- c() -for (i in seq_along(studies)) { - study <- studies[i] - zs_file <- file.path(study, "zscores", "zscores_interaction.csv") - if (file.exists(zs_file)) { - input_files[i] <- zs_file +# Function to parse and set arguments +parse_arguments <- function() { + if (interactive()) { + args <- c( + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD", + 2, + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1", + "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2" + ) + } else { + args <- commandArgs(trailingOnly = TRUE) } + + list( + out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), + sd = as.numeric(args[2]), + study_info = normalizePath(file.path(args[3]), mustWork = FALSE), + input_dirs = args[4:length(args)] + ) } -rm(zs_file, study) -# for (var in ls()) { -# print(paste(var, ":", get(var))) +args <- parse_arguments() + +# Create an array for the zscores files +get_zscores_files <- function(dirs) { + files <- sapply(dirs, function(study) { + file_path <- file.path(study, "zscores", "zscores_interaction.csv") + if (file.exists(file_path)) file_path else NULL + }, simplify = TRUE, USE.NAMES = FALSE) + + # Filter out NULL entries + files[!sapply(files, is.null)] +} + +zscores_files <- get_zscores_files(args$input_dirs) +sprintf("The SD value is: %d", args$sd) + +# Ensure there are enough files to compare +if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script") + +# Function to join zscores files +join_zscores_files <- function(files) { + joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE) + for (file in files[-1]) { + temp_data <- read.csv(file = file, stringsAsFactors = FALSE) + joined_data <- join(joined_data, temp_data, by = "OrfRep") + } + joined_data +} + +# Load and join zscores files +joined_data <- join_zscores_files(zscores_files) + +# Order and select columns +order_and_select_columns <- function(data) { + ordered_data <- data[, order(colnames(data))] + selected_headers <- select(ordered_data, + contains("OrfRep"), matches("Gene"), + contains("z_lm_k"), contains("z_shift_k"), + contains("z_lm_l"), contains("z_shift_l")) + selected_headers +} + +selected_headers <- order_and_select_columns(joined_data) + +# Remove redundant columns like "Gene.1" +clean_headers <- function(data, suffixes) { + suffixes_to_remove <- paste0("Gene.", seq_len(suffixes)) + select(data, -all_of(suffixes_to_remove)) +} + +headSel <- clean_headers(selected_headers, length(zscores_files) - 1) +headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1) + +# Fill NA values in Shift and Z_lm columns +fill_na_in_columns <- function(data) { + for (header in colnames(data)) { + if (grepl("Shift", header)) { + data[[header]][is.na(data[[header]])] <- 0.001 + } else if (grepl("Z_lm_", header)) { + data[[header]][is.na(data[[header]])] <- 0.0001 + } + } + data +} + +headSel <- fill_na_in_columns(headSel) + +# Filter based on standard deviation +filter_by_sd <- function(data, sd) { + if (sd == 0) return(data) + + z_lm_cols <- select(data, contains("z_lm_")) + filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0 + data[filter_vector, ] +} + +REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd) +shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd) + +# Reorder columns to interleave Z_lm and Shift data +reorder_columns <- function(data1, data2) { + combined_data <- data1 + for (i in 3:ncol(data1)) { + combined_data <- cbind(combined_data, data2[i], data1[i]) + } + combined_data +} + +combI <- reorder_columns(headSel2, shiftOnly) + +# Write output files +write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) +write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) + +# Relabel headers using experiment names from StudyInfo.csv +relabel_headers <- function(headers, labels) { + new_labels <- headers + for (i in seq_along(headers)) { + suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i]) + if (suffix %in% 1:3) { + exp_name <- labels[as.numeric(suffix), 2] + new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i]) + } + } + new_labels +} + +LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE) +colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd) +colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd) + +# Save relabeled files +write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) +write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) + +# Save updated parameters +LabelStd[, 4] <- args$sd +write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE) +write.csv(LabelStd, file = args$study_info, row.names = FALSE) + + + + + + + +# library("plyr") +# library("sos") +# library("dplyr") + +# # Function to parse and set arguments +# parse_arguments <- function() { +# if (interactive()) { +# args <- c( +# "./", # Default out_dir +# "2", # Default SD value +# "../Code/StudyInfo.csv" # Default study_info path +# ) +# } else { +# args <- commandArgs(trailingOnly = TRUE) +# } + +# list( +# out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), +# sd = as.numeric(args[2]), +# study_info = normalizePath(file.path(args[3]), mustWork = FALSE), +# input_dirs = args[4:length(args)] +# ) # } -# print(input_files) -# print(length(input_files)) +# args <- parse_arguments() -# TODO this is better handled in a loop in case you want to compare more experiments? -# The input is already designed for this -# Read in the files for your experiment and -# Join the two files at a time as a function of how many inputFile -# list the larger file first ? in this example X2 has the larger number of genes -# If X1 has a larger number of genes, switch the order of X1 and X2 -if (length(input_files) < 2) { - print("Note enough Exps to compare, skipping join") - stop("Exiting script") -} +# # Create an array for the zscores files +# zscores_files <- sapply(args$input_dirs, function(study) { +# file_path <- file.path(study, "zscores", "zscores_interaction.csv") +# if (file.exists(file_path)) file_path else NULL +# }, simplify = TRUE, USE.NAMES = FALSE) -if (length(input_files) >= 2) { - X1 <- read.csv(file = input_files[1], stringsAsFactors = FALSE) - X2 <- read.csv(file = input_files[2], stringsAsFactors = FALSE) - X <- join(X1, X2, by = "OrfRep") - OBH <- X[, order(colnames(X))] # OrderByHeader - headers <- select(OBH, contains("OrfRep"), matches("Gene"), - contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l")) - headSel <- select(headers, -"Gene.1") # remove "Gene.1 column - headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums - headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums -} +# # Filter out NULL entries +# zscores_files <- zscores_files[!sapply(zscores_files, is.null)] -if (length(input_files) >= 3) { - X3 <- read.csv(file = input_files[3], stringsAsFactors = FALSE) - X <- join(X, X3, by = "OrfRep") - headSel <- select(headers, -"Gene.1", -"Gene.2") - headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) - headSel2 <- select(headSel2, -"Gene.1", -"Gene.2") -} +# sprintf("The SD value is: %d", args$sd) +# # print(args$input_dirs) -if (length(input_files) >= 4) { - X4 <- read.csv(file = input_files[4], stringsAsFactors = FALSE) - X <- join(X, X4, by = "OrfRep") - headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3") - headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) - headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3") -} +# # TODO this is better handled in a loop in case you want to compare more experiments? +# # The input is already designed for this +# # Read in the files for your experiment and +# # Join the two files at a time as a function of how many inputFile +# # list the larger file first ? in this example X2 has the larger number of genes +# # If X1 has a larger number of genes, switch the order of X1 and X2 +# if (length(zscores_files) < 2) { +# print("Note enough Exps to compare, skipping join") +# stop("Exiting script") +# } -# print(headers) -# headSel$contains("Z_Shift") %>% replace_na(0.001) -headers <- colnames(headSel) -# print(headers) -i <- 0 -for (i in 1:length(headers)) { - if (grepl("Shift", headers[i])) { - headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001 - } - if (grepl("Z_lm_", headers[i])) { - headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001 - } -} +# if (length(zscores_files) >= 2) { +# X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE) +# X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE) +# X <- join(X1, X2, by = "OrfRep") +# OBH <- X[, order(colnames(X))] # OrderByHeader +# headers <- select(OBH, contains("OrfRep"), matches("Gene"), +# contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l")) +# headSel <- select(headers, -"Gene.1") # remove "Gene.1 column +# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums +# headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums +# } -# 2SD option code to exclude Z_lm values less than 2 standard Deviations -REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")) -shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")) +# if (length(zscores_files) >= 3) { +# X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE) +# X <- join(X, X3, by = "OrfRep") +# headSel <- select(headers, -"Gene.1", -"Gene.2") +# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) +# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2") +# } -# Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt -Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",") +# if (length(zscores_files) >= 4) { +# X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE) +# X <- join(X, X4, by = "OrfRep") +# headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3") +# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) +# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3") +# } -# Using Text search grepl to relabel headers -REMcRdyHdr <- colnames(REMcRdy) -REMcRdyLabels <- "asdf" -shftHdr <- colnames(shiftOnly) -shiftLabels <- "asdf" -shiftLabels[1:2] <- shftHdr[1:2] -REMcRdyLabels[1:2] <- REMcRdyHdr[1:2] +# # print(headers) +# # headSel$contains("Z_Shift") %>% replace_na(0.001) +# headers <- colnames(headSel) +# # print(headers) +# i <- 0 +# for (i in 1:length(headers)) { +# if (grepl("Shift", headers[i])) { +# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001 +# } +# if (grepl("Z_lm_", headers[i])) { +# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001 +# } +# } + +# # 2SD option code to exclude Z_lm values less than 2 standard Deviations +# REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")) +# shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")) + +# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt +# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",") + +# # Using Text search grepl to relabel headers +# REMcRdyHdr <- colnames(REMcRdy) +# REMcRdyLabels <- "asdf" +# shftHdr <- colnames(shiftOnly) +# shiftLabels <- "asdf" +# shiftLabels[1:2] <- shftHdr[1:2] +# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2] -for (i in 3:(length(shftHdr))) { - if (i == 3) { - shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3]) - REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3]) - } - if (i == 5) { - shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5]) - REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5]) - } - if (i == 7) { - shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7]) - REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7]) - } - if (grepl(".1", shftHdr[i], fixed = true)) { - shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i]) - REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i]) - } - if (grepl(".2", shftHdr[i], fixed = true)) { - shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i]) - REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i]) - } - if (grepl(".3", shftHdr[i], fixed = true)) { - shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i]) - REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i]) - } -} +# for (i in 3:(length(shftHdr))) { +# if (i == 3) { +# shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3]) +# REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3]) +# } +# if (i == 5) { +# shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5]) +# REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5]) +# } +# if (i == 7) { +# shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7]) +# REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7]) +# } +# if (grepl(".1", shftHdr[i], fixed = true)) { +# shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i]) +# REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i]) +# } +# if (grepl(".2", shftHdr[i], fixed = true)) { +# shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i]) +# REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i]) +# } +# if (grepl(".3", shftHdr[i], fixed = true)) { +# shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i]) +# REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i]) +# } +# } -for (i in 3:(length(REMcRdyLabels))) { - j <- as.integer(i) - REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j]) - shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j]) -} +# for (i in 3:(length(REMcRdyLabels))) { +# j <- as.integer(i) +# REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j]) +# shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j]) +# } -colnames(shiftOnly) <- shiftLabels -colnames(REMcRdy) <- REMcRdyLabels +# colnames(shiftOnly) <- shiftLabels +# colnames(REMcRdy) <- REMcRdyLabels -combI <- headSel2 # starting Template orf, Genename columns +# combI <- headSel2 # starting Template orf, Genename columns -# headersRemc<-colnames(REMcRdy) -# Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps. -for (i in 3:length(colnames(REMcRdy))) { - combI <- cbind.data.frame(combI, shiftOnly[i]) - combI <- cbind.data.frame(combI, REMcRdy[i]) -} +# # headersRemc<-colnames(REMcRdy) +# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps. +# for (i in 3:length(colnames(REMcRdy))) { +# combI <- cbind.data.frame(combI, shiftOnly[i]) +# combI <- cbind.data.frame(combI, REMcRdy[i]) +# } -Vec1 <- NA -Vec2 <- NA -Vec3 <- NA -Vec4 <- NA -Vec5 <- NA -Vec6 <- NA -Vec7 <- NA -Vec8 <- NA +# Vec1 <- NA +# Vec2 <- NA +# Vec3 <- NA +# Vec4 <- NA +# Vec5 <- NA +# Vec6 <- NA +# Vec7 <- NA +# Vec8 <- NA -if (length(REMcRdy) == 6) { - Vec1 <- abs(REMcRdy[, 3]) >= sd - Vec2 <- abs(REMcRdy[, 4]) >= sd - Vec3 <- abs(REMcRdy[, 5]) >= sd - Vec4 <- abs(REMcRdy[, 6]) >= sd - bolVec <- Vec1 | Vec2 | Vec3 | Vec4 - REMcRdyGT2 <- REMcRdy[bolVec, 1:2] - REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6] - shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] - shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6] -} +# if (length(REMcRdy) == 6) { +# Vec1 <- abs(REMcRdy[, 3]) >= sd +# Vec2 <- abs(REMcRdy[, 4]) >= sd +# Vec3 <- abs(REMcRdy[, 5]) >= sd +# Vec4 <- abs(REMcRdy[, 6]) >= sd +# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 +# REMcRdyGT2 <- REMcRdy[bolVec, 1:2] +# REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6] +# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] +# shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6] +# } -if (length(REMcRdy) == 8) { - Vec1 <- abs(REMcRdy[, 3]) >= sd - Vec2 <- abs(REMcRdy[, 4]) >= sd - Vec3 <- abs(REMcRdy[, 5]) >= sd - Vec4 <- abs(REMcRdy[, 6]) >= sd - Vec5 <- abs(REMcRdy[, 7]) >= sd - Vec6 <- abs(REMcRdy[, 8]) >= sd - bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 - REMcRdyGT2 <- REMcRdy[bolVec, 1:2] - REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8] - shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] - shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8] -} +# if (length(REMcRdy) == 8) { +# Vec1 <- abs(REMcRdy[, 3]) >= sd +# Vec2 <- abs(REMcRdy[, 4]) >= sd +# Vec3 <- abs(REMcRdy[, 5]) >= sd +# Vec4 <- abs(REMcRdy[, 6]) >= sd +# Vec5 <- abs(REMcRdy[, 7]) >= sd +# Vec6 <- abs(REMcRdy[, 8]) >= sd +# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 +# REMcRdyGT2 <- REMcRdy[bolVec, 1:2] +# REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8] +# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] +# shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8] +# } -if (length(REMcRdy) == 10) { - Vec1 <- abs(REMcRdy[, 3]) >= sd - Vec2 <- abs(REMcRdy[, 4]) >= sd - Vec3 <- abs(REMcRdy[, 5]) >= sd - Vec4 <- abs(REMcRdy[, 6]) >= sd - Vec5 <- abs(REMcRdy[, 7]) >= sd - Vec6 <- abs(REMcRdy[, 8]) >= sd - Vec7 <- abs(REMcRdy[, 9]) >= sd - Vec8 <- abs(REMcRdy[, 10]) >= sd - bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8 - REMcRdyGT2 <- REMcRdy[bolVec, 1:2] - REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10] - shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] - shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10] -} +# if (length(REMcRdy) == 10) { +# Vec1 <- abs(REMcRdy[, 3]) >= sd +# Vec2 <- abs(REMcRdy[, 4]) >= sd +# Vec3 <- abs(REMcRdy[, 5]) >= sd +# Vec4 <- abs(REMcRdy[, 6]) >= sd +# Vec5 <- abs(REMcRdy[, 7]) >= sd +# Vec6 <- abs(REMcRdy[, 8]) >= sd +# Vec7 <- abs(REMcRdy[, 9]) >= sd +# Vec8 <- abs(REMcRdy[, 10]) >= sd +# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8 +# REMcRdyGT2 <- REMcRdy[bolVec, 1:2] +# REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10] +# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] +# shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10] +# } -if (sd != 0) { - REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)] - shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)] -} +# if (sd != 0) { +# REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)] +# shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)] +# } -if (sd == 0) { - REMcRdy <- REMcRdy # [,2:length(REMcRdy)] - shiftOnly <- shiftOnly # [,2:length(shiftOnly)] -} +# if (sd == 0) { +# REMcRdy <- REMcRdy # [,2:length(REMcRdy)] +# shiftOnly <- shiftOnly # [,2:length(shiftOnly)] +# } -# R places hidden "" around the header names. The following -# is intended to remove those quote so that the "" do not blow up the Java REMc. -# Use ,quote=F in the write.csv statement to fix R output file. -# write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE) -write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) -write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) -# LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",") +# # R places hidden "" around the header names. The following +# # is intended to remove those quote so that the "" do not blow up the Java REMc. +# # Use ,quote=F in the write.csv statement to fix R output file. +# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE) +# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) +# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) +# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",") -LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE) -# print(sd) -LabelStd[, 4] <- as.numeric(sd) -write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE) -write.csv(LabelStd, file = study_info, row.names = FALSE) +# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE) +# # print(sd) +# LabelStd[, 4] <- as.numeric(sd) +# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE) +# write.csv(LabelStd, file = study_info, row.names = FALSE) diff --git a/workflow/qhtcp-workflow b/workflow/qhtcp-workflow index c20b3353..71d820b6 100755 --- a/workflow/qhtcp-workflow +++ b/workflow/qhtcp-workflow @@ -14,6 +14,7 @@ # Insert a general description of Q-HTCP and the Q-HTCP process here. shopt -s extglob # Turn on extended globbing +shopt -s nullglob # Allow null globs DEBUG=${DEBUG:-1} # Turn debugging ON by default during development # @description Use `--help` to print the help message. @@ -27,12 +28,12 @@ print_help() { USAGE: qhtcp-workflow [[OPTION] [VALUE]]... - Some options (--project, --include, --exclude) can be passed multiple times or - by using comma-separated strings (see EXAMPLES below) + --project, --module, --wrapper, and --nomodule can be passed multiple + times or as comma-separated strings (see EXAMPLES below) OPTIONS: --project, -p PROJECT[,PROJECT...] - PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT_NAME + PROJECT should follow the pattern ${PROJECT_PREFIX}_PROJECT --module, -m MODULE[,MODULE...] See MODULES section below for list of available modules If no --include is specified, all modules are run @@ -56,14 +57,6 @@ print_help() { WRAPPERS: ${ALL_WRAPPERS[*]} - DEPENDENCIES: - deb: ${depends_deb[@]} - rpm: ${depends_rpm[@]} - brew: ${depends_brew[@]} - perl: ${depends_perl[@]} - R: ${depends_r[@]} - BiocManager: ${depends_bioc[@]} - EXAMPLES: qhtcp-workflow --module ${ALL_MODULES[0]} --module ${ALL_MODULES[1]} qhtcp-workflow --project ${PROJECT_PREFIX}_MY_PROJECT --module ${ALL_MODULES[0]} --module ${ALL_MODULES[1]} @@ -75,6 +68,15 @@ print_help() { EOF } + # DEPENDENCIES: + # deb: ${depends_deb[@]} + # rpm: ${depends_rpm[@]} + # brew: ${depends_brew[@]} + # perl: ${depends_perl[@]} + # R: ${depends_r[@]} + # BiocManager: ${depends_bioc[@]} + + # @section Notes # @description # @@ -243,35 +245,57 @@ ask() { [[ ${response,,} =~ ^(yes|y)$ ]] } err() { echo "Error: $*" >&2; } -ask_pn() { - declare -ag ADD_PROJECTS - example_pn="${PROJECT_PREFIX}_$(random_three_words)" + +# @description Ask the user to input a name +# @arg $1 string formatted name +# @arg $2 string variable name +# @exitcode 0 If set +# @exitcode 1 If not +# @internal +ask_name() { + debug "Running: ${FUNCNAME[0]} $*" + declare -a to_add + declare example_pn + declare -g -a "$2" + declare -n ref="$2" + example_pn="${STUDY_PREFIX}_$(random_three_words)" cat <<-EOF - Enter a new or existing project name - If entering a new project, use the suggested prefix: ${PROJECT_PREFIX}_ - You may choose any combination of words/characters following the prefix, but be sensible. - Make it descriptive and avoid spaces and special characters. + ${underline}Enter a new or existing $1${nounderline} + * Suggested prefix: ${STUDY_PREFIX}_ + * You may choose any combination of words/characters following the prefix, but be sensible. + * Make it descriptive and avoid spaces and special characters. + * Example: $example_pn EOF - trys=3 # give the user up to 3 tries to enter a valid project name + trys=3 # give the user up to 3 tries to enter a valid name for ((i=1; i<=trys; i++)); do - read -r -p "Enter a new or existing project name or hit Enter for default ($example_pn): " response + read -r -e -p "Enter a new or existing $1: " -i "${STUDY_PREFIX}_" response if [[ -z $response ]]; then - ADD_PROJECTS+=("$example_pn") + to_add+=("$example_pn") break else if sanitize_pn "$response"; then - ADD_PROJECTS+=("$response") - echo "$response successfully added as a project" - i=0 # resetting trys counter in case user wants to add more than 3 projects + to_add+=("$response") + echo "$response successfully added as a $1" + i=0 # resetting trys counter in case user wants to add more than 3 names else err "Invalid project name: $response" echo "Retrying ($i of $trys)" fi fi done - [[ ${#ADD_PROJECTS[@]} -gt 0 ]] + # shellcheck disable=SC2034 + ref=("${to_add[@]}") } + +# @description Sanitizer regex for prefix +sanitize_pn() { + [[ $1 =~ ^[0-9]{8}_.+_.+$ ]] +} + +# @description handle debug output globally debug() { (( DEBUG )) && echo "Debug: $*"; } + +# @description Create a random three word string # Not super portable but nice to have random_three_words() { local -a arr @@ -327,6 +351,69 @@ random_three_words() { printf "%s_" "${arr[@]}" | sed 's/_$//' } +# # @description Function to set a global array variable +# # @arg $1 The name of the global array +# # @arg $2 The values of the array +# # @internal +# set_global_array() { +# debug "Running: ${FUNCNAME[0]} $*" +# declare -n array="$1" +# shift +# # Declare the global array using the nameref +# declare -g -a "${!arr}" +# # Populate the array +# arr=("$@") +# } + + +# @description Print an array in columns +# @arg $1 array to print +print_in_columns() { + debug "Running: ${FUNCNAME[0]} $*" + declare -n array="$1" + declare num=${#array[@]} + + if [[ $num -gt 8 ]]; then + # Calculate the number of elements in each column + local num_columns=$(( (num + 1) / 2 )) + + # Determine the maximum width of the first column + local max_width=0 + for ((i=0; i for none + * for none (break) * A comma-separated list of wrappers and their arguments - * Both arguments are required * Quote the argument string if it contains whitespace - * Example: ${ALL_WRAPPERS[0]},\"arg1,arg2,arg3...\",${ALL_WRAPPERS[1]},\"arg1,arg2,arg3...\" + * Example: \""${ALL_WRAPPERS[0]},arg1,arg2,arg3...\" EOF ((YES)) || read -r -p "(none): " response echo "" [[ -z $response ]] && break IFS=',' read -ra arr <<< "$response" - if [[ $((${#arr[@]} % 2)) -eq 0 ]]; then # check if array is even - WRAPPERS+=("${arr[@]}") - else - err "The second argument is required (may be an empty string, \"\")" - fi + WRAPPERS+=("${arr[@]}") unset response arr i done fi @@ -589,7 +552,7 @@ interactive_header() { if [[ ${#PROJECTS[@]} -eq 0 ]]; then num=${#projects[@]} if [[ $num -eq 0 ]]; then - ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}") + ask_name "project" "ADD_PROJECTS" && PROJECTS+=("${ADD_PROJECTS[@]}") else for ((c=1; c<2; c++)); do # safe loop that only runs once cat <<-EOF @@ -599,11 +562,11 @@ interactive_header() { * 0 to add a new project EOF ((YES)) || read -r -p "($num): " response - echo "" + echo if [[ $response == 0 ]]; then - ask_pn && PROJECTS+=("${ADD_PROJECTS[@]}") + ask_name "project" "ADD_PROJECTS" && PROJECTS+=("${ADD_PROJECTS[@]}") else - [[ -z $response ]] && response="$num" + response="${response:-$num}" IFS=',' read -ra arr <<< "$response" for i in "${arr[@]}"; do if [[ -n ${projects[$((i-1))]} ]]; then @@ -623,7 +586,7 @@ interactive_header() { for i in "${!PROJECTS[@]}"; do if ! sanitize_pn "${PROJECTS[i]}"; then echo "Project name ${PROJECTS[i]} is invalid" - ask_pn && unset "PROJECTS[i]" && PROJECTS+=("${ADD_PROJECTS[@]}") + ask_name "project" "ADD_PROJECTS" && unset "PROJECTS[i]" && PROJECTS+=("${ADD_PROJECTS[@]}") fi done } @@ -731,7 +694,7 @@ install_dependencies() { echo "Installing R packages" # Make R library directory if it doesn't exist - [[ -d "$R_LIBS_USER" ]] || mkdir -p "$R_LIBS_USER" + [[ -d "$R_LIBS_USER" ]] || execute mkdir -p "$R_LIBS_USER" depends_r_str="" depends_r_to_string() { @@ -803,59 +766,14 @@ init_project() { debug "Running: ${FUNCNAME[0]}" # Create a project scans dir - [[ -d $PROJECT_SCANS_DIR ]] || (mkdir -p "$PROJECT_SCANS_DIR" || return 1) + [[ -d $PROJECT_SCANS_DIR ]] || (execute mkdir -p "$PROJECT_SCANS_DIR" || return 1) # TODO eventually copy scans from robot but for now let's pause and wait for transfer echo "In the future we will copy scans from robot here" # read -r -p "Hit to continue: " - # Create the QHTCP out directory - if [[ -d $QHTCP_RESULTS_DIR ]]; then - # Handle existing output directory - cat <<-EOF - A project already exists at $QHTCP_RESULTS_DIR - Would you like to - * (u)pdate it from the template and continue - * (b)ack it up and start from scratch - * (c)ontinue (default) - EOF - - for i in {1..3}; do # give the user three chances to get it right - ((YES)) || read -r -p "(c): " response - echo "" - [[ -z $response ]] && break - case $response in - u) - echo "Updating project from template" - echo "Only files that are newer in the template will be overwritten" - if execute rsync --archive --update "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_RESULTS_DIR"; then - echo "Project updated with template" - fi - ;; - b) - if backup "$QHTCP_RESULTS_DIR"; then - mkdir "$QHTCP_RESULTS_DIR" - execute rsync --archive "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_RESULTS_DIR" - echo "Created new project at $QHTCP_RESULTS_DIR" - fi - ;; - c) - break - ;; - *) - err "Invalid response, please try again" - continue - ;; - esac - break - done - else - echo "Creating project results dir at $QHTCP_RESULTS_DIR" - execute mkdir "$QHTCP_RESULTS_DIR" - execute rsync --archive "$QHTCP_TEMPLATE_DIR"/ "$QHTCP_RESULTS_DIR" - fi - - study_info + # Create the project results out directory + [[ -d $PROJECT_RESULTS_DIR ]] || execute mkdir -p "$PROJECT_RESULTS_DIR" # Write skeleton files in csv # If we have to convert to xlsx later, so be it @@ -876,14 +794,15 @@ module easy # @description # Run the EASY matlab program # -# INPUT FILES +# INPUT +# # * MasterPlate_.xls # * DrugMedia_.xls # -# OUTPUT FILES -# * !!ResultsStd_.txt -# * !!ResultsELr_.txt +# OUTPUT # +# * out/PROJECT/easy/results_std.txt +# * out/PROJECT/easy/results_elr.txt # # TODO # @@ -1222,19 +1141,19 @@ easy() { # Backup and create EASY results dirs [[ -d $EASY_RESULTS_DIR ]] && backup "$EASY_RESULTS_DIR" - [[ -d $EASY_RESULTS_DIR ]] || execute mkdir "$EASY_RESULTS_DIR" + [[ -d $EASY_RESULTS_DIR ]] || execute mkdir -p "$EASY_RESULTS_DIR" # Make EASY dirs dirs=('PrintResults' 'CFfigs' 'Fotos') for d in "${dirs[@]}"; do if [[ ! -d $EASY_RESULTS_DIR/$d ]]; then - execute mkdir "$EASY_RESULTS_DIR/$d" + execute mkdir -p "$EASY_RESULTS_DIR/$d" fi done # Copy Templates - declare -gx DRUG_MEDIA_FILE="$SCANS_DIR/DrugMedia_$PROJECT_NAME.xls" - declare -gx MASTER_PLATE_FILE="$EASY_RESULTS_DIR/MasterPlate_$PROJECT_NAME.xls" + declare -gx DRUG_MEDIA_FILE="$SCANS_DIR/DrugMedia_$PROJECT.xls" + declare -gx MASTER_PLATE_FILE="$EASY_RESULTS_DIR/MasterPlate_$PROJECT.xls" execute rsync -a "$EASY_DIR"/{figs,PTmats} "$EASY_RESULTS_DIR" # Ask the user to launch EASYconsole.m in MATLAB @@ -1335,13 +1254,8 @@ module qhtcp qhtcp() { debug "Running: ${FUNCNAME[0]}" - [[ -d $QHTCP_RESULTS_DIR ]] || - err "$QHTCP_RESULTS_DIR does not exist, have you run the init_project module?" - - # Sets STUDIES - study_info - - choose_easy_results "$EASY_OUT_DIR" + [[ -d $PROJECT_RESULTS_DIR ]] || + err "$PROJECT_RESULTS_DIR does not exist, have you run the init_project module?" # # Create studies archive file if missing # if ! [[ -d $STUDIES_ARCHIVE_FILE ]]; then @@ -1355,18 +1269,14 @@ qhtcp() { # read -r num sd dir <<< "$study" # # Trying to match old ExpFrontend formatting # printf "%s\t" \ - # "${DATE//_/}" "$PROJECT_NAME" "$QHTCP_RESULTS_DIR" "Exp$num" \ + # "${DATE//_/}" "$PROJECT" "$PROJECT_RESULTS_DIR" "Exp$num" \ # "$PROJECT_DATE" "$PROJECT_SCANS_DIR" "$EASY_RESULTS_DIR" "${f##*/}" \ # >> "$STUDIES_ARCHIVE_FILE" # done # done # Run R interactions script on all studies - for study in "${STUDIES[@]}"; do - read -r num sd _ <<< "$study" - r_interactions "$num" "$sd" & - done - wait -n \ + calculate_interaction_zscores \ && remc \ && gtf \ && gta @@ -1379,20 +1289,16 @@ module remc # TODO # # * Which components can be parallelized? -# * Move Exp directory discovery from scripts to this module # # # @arg $1 string study info file remc() { debug "Running: ${FUNCNAME[0]}" - # Sets STUDIES - study_info - # If any wrappers fail the rest will not run, this is fundamental to module design # Remove leading && to run regardless - r_join_interactions \ - && java_extract \ + #r_join_interactions \ + java_extract \ && r_add_shift_values \ && r_create_heat_maps \ && r_heat_maps_homology @@ -1424,9 +1330,7 @@ gtf() { for d in "$process_dir" "$function_dir" "$component_dir"; do out_file="${d##*/}Results.txt" # Use the dirname to create each Results filename - shopt -s nullglob txts=("$d"/*.txt) # glob all txt files from each dir - shopt -u nullglob for txt in "${txts[@]}"; do pl_gtf_analyze "$txt" pl_gtf_terms2tsv "$txt" @@ -1460,34 +1364,27 @@ gta() { all_sgd_terms_csv="${5:-"$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"}" # TODO This could be wrong, it could be in main results - - # Sets STUDIES - study_info [[ -d $GTA_OUT_DIR ]] && backup "$GTA_OUT_DIR" - execute mkdir "$GTA_OUT_DIR" + execute mkdir -p "$GTA_OUT_DIR" # Loop over the array and create pairwise arrays - for ((i=0; i<${#STUDIES[@]}; i++)); do - for ((j=i+1; j<${#STUDIES[@]}; j++)); do - read -r num1 _ _ <<< "${STUDIES[i]}" - read -r num2 _ _ <<< "${STUDIES[j]}" - pair=("$num1" "$num2") + for ((i=0; i<${#EXP_NUMS[@]}; i++)); do + for ((j=i+1; j<${#EXP_NUMS[@]}; j++)); do + pair=("${EXP_NUMS[i]}" "${EXP_NUMS[j]}") echo "${pair[@]}" done done # Create unique parwise combinations of study nums from dir names - study_combos=() - for ((i=0; i<${#STUDIES[@]}; i++)); do + exp_combos=() + for ((i=0; i<${#EXP_NUMS[@]}; i++)); do # Loop through the array again - for ((j=0; j<${#STUDIES[@]}; j++)); do + for ((j=0; j<${#EXP_NUMS[@]}; j++)); do # If the indices are not the same if [ "$i" != "$j" ]; then # Print the unique combination - read -r num1 _ _ <<< "${STUDIES[i]}" - read -r num2 _ _ <<< "${STUDIES[j]}" - study_combos+=("$num1,$num2") + exp_combos+=("${EXP_NUMS[i]},${EXP_NUMS[j]}") fi done done @@ -1495,28 +1392,21 @@ gta() { # The following are three types of studies # Individual studies - for study in "${STUDIES[@]}"; do - read -r num _ dir <<< "$study" - zscores_file="$dir/zscores/zscores_interaction.csv" - if [[ -f $zscores_file ]]; then - mkdir "$GTA_OUT_DIR/Exp$num" - r_gta "Exp$num" "$zscores_file" - fi + for exp_num in "${EXP_NUMS[@]}"; do + execute mkdir -p "$GTA_OUT_DIR/exp$exp_num" + r_gta "exp$exp_num" "$STUDY_RESULTS_DIR/exp$exp_num/zscores/zscores_interaction.csv" done # Combination studies (for pairwise comparisons) - for combo in "${study_combos[@]}"; do + for combo in "${exp_combos[@]}"; do # Split on comma and assign to array - IFS=',' read -ra studies <<< "$combo" - r_gta_pairwiselk "${studies[0]}" "${studies[1]}" - done - - # All studies - # All preceding arguments are required so we can pass multiple studies - declare -a nums - for study in "${STUDIES[@]}"; do - read -r num _ _ <<< "$study" - nums+=("$num") + IFS=',' read -ra exps <<< "$combo" + r_gta_pairwiselk \ + "${EXPERIMENTS[exp${exps[0]},name]}" \ + "$GTA_OUT_DIR/exp${exps[0]}/Average_GOTerms_All.csv" \ + "${EXPERIMENTS[exp${exps[1]},name]}" \ + "$GTA_OUT_DIR/exp${exps[1]}/Average_GOTerms_All.csv" \ + "$GTA_OUT_DIR" done r_gta_heatmaps \ @@ -1524,9 +1414,9 @@ gta() { "$gene_ontology_obo" \ "$sgd_terms_tfile" \ "$all_sgd_terms_csv" \ - "$QHTCP_RESULTS_DIR" \ - "$QHTCP_RESULTS_DIR/TermSpecificHeatmaps" \ - "${nums[@]}" + "$STUDY_RESULTS_DIR" \ + "$STUDY_RESULTS_DIR/TermSpecificHeatmaps" \ + "${EXP_NUMS[@]}" } @@ -1547,6 +1437,74 @@ wrapper() { } +wrapper calculate_interaction_zscores +# @description Run the R interactions analysis (deprecates Z_InteractionTemplate.R) +# shellcheck disable=SC2120 +# +# SCRIPT: apps/r/calculate_interaction_zscores.R +# +# TODO +# +# * Needs simplification w/ StudyInfo, should we just use a studies dir and name the dir with study name? +# +# INPUT +# +# * easy/results_std.txt +# +# OUTPUT +# +# * zscores/zscores_interaction.csv +# * etc. +# +# NOTES +# +# * +# +# @arg $1 integer output directory +# @arg $2 integer delta SD background value (default: 3) +# @arg $3 string study info file +# @arg $4 string SGD_features.tab +# @arg $5 string easy/results_std.txt +# @arg $6 array pairs of experiment paths and names +calculate_interaction_zscores() { + debug "Running: ${FUNCNAME[0]} $*" + cat <<-EOF + * Be sure to enter Background noise filter standard deviation i.e., 3 or 5 per Sean + * Enter Standard deviation value for removing data for cultures due to high background (e.g., contaminated cultures). + * Generally set this very high (e.g., '20') on the first run in order NOT to remove data, e.g. '20'. Review QC data and inspect raw image data to decide if it is desirable to remove data, and then rerun analysis. + * Enter a Background SD threshold for EXCLUDING culture data from further analysis: + * This Background value removes data where there is high pixel intensity in the background regions of a spot culture (i.e., suspected contamination). + * 5 is a minimum recommended value, because lower values result in more data being removed, and often times this is undesirable if contamination occurs late after the carrying capacity of the yeast culture is reached. + * This is most often "trial and error", meaning there is a 'Frequency_Delta_Background.pdf' report in the /Exp_/ZScores/QC/ folder to evaluate whether the chosen value was suitable (and if not the analysis can simply be rerun with a more optimal choice). + * In general, err on the high side, with BSD of 10 or 12…. One can also use EZview to examine the raw images and individual cultures potentially included/excluded as a consequence of the selected value. + * Background values are reported in the results sheet and so could also be analyzed there. + EOF + + declare script="$APPS_DIR/r/calculate_interaction_zscores.R" + declare combined_out_path="${1:-"$STUDY_RESULTS_DIR/zscores"}" + declare -a out_paths=("${EXP_PATHS[@]}/zscores" "$combined_out_path") + + # TODO we'll need to change this behaviour after testing + # we can test for existence and then choose to skip or rerun later + # possibly handle in the module? + for out_path in "${out_paths[@]}"; do + backup "$out_path" + execute mkdir -p "$out_path" "$out_path/qc" + done + + execute "$RSCRIPT" "$script" \ + "${1:-"$out_path"}" \ + "${2:-3}" \ + "${3:-"$APPS_DIR/r/SGD_features.tab"}" \ + "${4:-"$EASY_RESULTS_DIR/results_std.txt"}" \ + "${@:5}" \ + "${EXP_PATHS_AND_NAMES[@]}" + + [[ -f "$out_path/zscores_interaction.csv" ]] || (echo "$out_path/zscores_interaction.csv does not exist"; return 1) + ((DEBUG)) && declare -p && exit # when the going gets rough +} + + wrapper r_gta # @description GTAtemplate R script # @@ -1565,12 +1523,11 @@ wrapper r_gta # * Average_GOTerms_All.csv # # -# @arg $1 string Exp# name (required) +# @arg $1 string exp# name (required) # @arg $2 string zscores_interaction.csv (required) # @arg $3 string go_terms.tab file # @arg $4 string [gene_association.sgd](https://downloads.yeastgenome.org/curation/chromosomal_feature/gene_association.sgd) # @arg $5 string output directory -# r_gta() { debug "Running: ${FUNCNAME[0]} $*" cat <<-EOF @@ -1606,20 +1563,21 @@ wrapper r_gta_pairwiselk # * Average_GOTerms_All.csv # * # -# Output +# OUTPUT # # * # # This wrapper: # -# * Will perform both L and K comparisons for the specified experiment folders. -# * The code uses the naming convention of PairwiseCompare_Exp’#’-Exp’#’ to standardize and keep simple the structural naming (where ‘X’ is either K or L and ‘Y’ is the number of the experiment GTA results to be found in ../GTAresult/Exp_). -# * {FYI There are also individual scripts that just do the ‘L’ or ‘K’ pairwise studies in the ../Code folder.} +# * Will perform both L and K comparisons for the specified experiment folders. +# * The code uses the naming convention of PairwiseCompare_Exp’#’-Exp’#’ to standardize and keep simple the structural naming (where ‘X’ is either K or L and ‘Y’ is the number of the experiment GTA results to be found in ../GTAresult/Exp_). +# * {FYI There are also individual scripts that just do the ‘L’ or ‘K’ pairwise studies in the ../Code folder.} # -# @arg $1 string First Exp# name (required) -# @arg $2 string Second Exp# name (required) -# @arg $3 string study info file -# @arg $4 string output directory +# @arg $1 string First exp# name (required) +# @arg $2 string First exp# go terms file (required) +# @arg $3 string Second exp# name (required) +# @arg $4 string Second exp# go terms file (required) +# @arg $5 string output directory # r_gta_pairwiselk() { debug "Running: ${FUNCNAME[0]} $*" @@ -1627,16 +1585,15 @@ r_gta_pairwiselk() { EOF - script="$APPS_DIR/r/PairwiseLK.R" - - [[ -d $4 ]] || mkdir -p "$4" + script="$APPS_DIR/r/calculate_pairwise_lk.R" execute "$RSCRIPT" "$script" \ "$1" \ "$2" \ - "${3:-$STUDY_INFO_FILE}" \ - "${4:-"$GTA_OUT_DIR"}" \ - "${@:5}" # future arguments + "$3" \ + "$4" \ + "${5:-"$GTA_OUT_DIR"}" \ + "${@:6}" # future arguments } @@ -1680,145 +1637,63 @@ r_gta_heatmaps() { EOF script="$APPS_DIR/r/TSHeatmaps5dev2.R" - [[ -d $7 ]] || mkdir -p "$7" + [[ -d $7 ]] || execute mkdir -p "$7" execute "$RSCRIPT" "$script" \ "${1:-$STUDY_INFO_FILE}" \ "${2:-"$APPS_DIR/r/gene_ontology_edit.obo"}" \ "${3:-"$APPS_DIR/r/go_terms.tab"}" \ "${4:-"$APPS_DIR/r/All_SGD_GOTerms_for_QHTCPtk.csv"}" \ - "${5:-"$QHTCP_RESULTS_DIR"}" \ - "${6:-"$QHTCP_RESULTS_DIR/TermSpecificHeatmaps"}" \ + "${5:-"$STUDY_RESULTS_DIR"}" \ + "${6:-"$STUDY_RESULTS_DIR/TermSpecificHeatmaps"}" \ "${@:7}" # studies } -wrapper r_interactions -# @description Run the R interactions analysis (deprecates Z_InteractionTemplate.R) -# -# SCRIPT: interactions.R -# -# TODO -# -# * Parallelization (need to consult with Sean) -# * Needs more loops to reduce verbosity, but don't want to limit flexibility -# * Replace 1:length() with seq_along() -# * Re-enable disabled linter checks -# * Reduce cyclomatic complexity of some of the for loops -# * There needs to be one point of truth for the SD factor -# * Replace most paste() functions with sprintf() -# -# INPUT -# -# * easy/results_std.txt -# -# OUTPUT -# -# * zscores/zscores_interaction.csv -# * etc. -# -# NOTES -# -# * -# -# @arg $1 integer Exp number (required) -# @arg $2 integer delta SD background value (default: 3) -# @arg $3 string study info file -# @arg $4 string SGD_features.tab -# @arg $5 string easy/results_std.txt -# @arg $6 string output directory -r_interactions() { - debug "Running: ${FUNCNAME[0]} $*" - cat <<-EOF - * Be sure to enter Background noise filter standard deviation i.e., 3 or 5 per Sean - * Enter Standard deviation value for removing data for cultures due to high background (e.g., contaminated cultures). - * Generally set this very high (e.g., '20') on the first run in order NOT to remove data, e.g. '20'. Review QC data and inspect raw image data to decide if it is desirable to remove data, and then rerun analysis. - * Enter a Background SD threshold for EXCLUDING culture data from further analysis: - * This Background value removes data where there is high pixel intensity in the background regions of a spot culture (i.e., suspected contamination). - * 5 is a minimum recommended value, because lower values result in more data being removed, and often times this is undesirable if contamination occurs late after the carrying capacity of the yeast culture is reached. - * This is most often "trial and error", meaning there is a 'Frequency_Delta_Background.pdf' report in the /Exp_/ZScores/QC/ folder to evaluate whether the chosen value was suitable (and if not the analysis can simply be rerun with a more optimal choice). - * In general, err on the high side, with BSD of 10 or 12…. One can also use EZview to examine the raw images and individual cultures potentially included/excluded as a consequence of the selected value. - * Background values are reported in the results sheet and so could also be analyzed there. - EOF +# wrapper r_join_interactions +# # shellcheck disable=SC2120 +# # @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv +# # +# # TODO +# # +# # * Needs more loops to reduce verbosity +# # +# # INPUT +# # +# # * study info file +# # * /out/PROJECT/STUDY/exp#/zscores/zscores_interaction.csv +# # +# # OUTPUT +# # +# # * REMcRdy_lm_only.csv +# # * Shift_only.csv +# # * parameters.csv +# # +# # @arg $1 string output directory (required) +# # @arg $2 string sd value (default: 2) (required) +# # @arg $3 string study info file (required) +# # @arg $4 array studies (required) +# r_join_interactions() { +# debug "Running: ${FUNCNAME[0]} $*" +# declare script="$APPS_DIR/r/joinInteractExps.R" +# declare -a out_files=( +# "${1:-$STUDY_RESULTS_DIR}/REMcRdy_lm_only.csv" +# "${1:-$STUDY_RESULTS_DIR}/Shift_only.csv" +# "${1:-$STUDY_RESULTS_DIR}/parameters.csv" +# ) - declare script="$APPS_DIR/r/interactions.R" - declare out_dir="${6:-"$QHTCP_RESULTS_DIR/Exp$1/zscores"}" - - backup "$out_dir" - execute mkdir "$out_dir" - execute mkdir "$out_dir/qc" - - execute "$RSCRIPT" "$script" \ - "$1" \ - "${2:-3}" \ - "${3:-"$STUDY_INFO_FILE"}" \ - "${4:-"$APPS_DIR/r/SGD_features.tab"}" \ - "${5:-"$EASY_RESULTS_DIR/results_std.txt"}" \ - "$out_dir" \ - "${@:7}" # future arguments - - [[ -f "$out_dir/zscores_interaction.csv" ]] || (echo "$out_dir/zscores_interaction.csv does not exist"; return 1) -} - - -wrapper r_join_interactions -# shellcheck disable=SC2120 -# @description JoinInteractExps3dev.R creates REMcRdy_lm_only.csv and Shift_only.csv -# -# TODO -# -# * Needs more loops to reduce verbosity -# -# INPUT -# -# * study info file -# * Exp#/zscores/zscores_interaction.csv -# -# OUTPUT -# -# * REMcRdy_lm_only.csv -# * Shift_only.csv -# * parameters.csv -# -# @arg $1 string output directory (required) -# @arg $2 string sd value (default: 2) (required) -# @arg $3 string study info file (required) -# @arg $4 array studies (required) -r_join_interactions() { - debug "Running: ${FUNCNAME[0]} $*" - declare script="$APPS_DIR/r/joinInteractExps.R" - declare -a dirs - declare -a out_files=( - "${1:-$QHTCP_RESULTS_DIR}/REMcRdy_lm_only.csv" - "${1:-$QHTCP_RESULTS_DIR}/Shift_only.csv" - "${1:-$QHTCP_RESULTS_DIR}/parameters.csv" - ) - - # ((DEBUG)) && declare -p # for when the going gets tough - - backup "${out_files[@]}" - - # If user provides study dirs, use those - if [[ $# -gt 3 ]]; then - dirs=("${@:4}") - else - study_info - for study in "${STUDIES[@]}"; do - read -r _ _ dir <<< "$study" - dirs+=("$dir") - done - fi +# # ((DEBUG)) && declare -p # when the going gets tough - execute "$RSCRIPT" "$script" \ - "${1:-$QHTCP_RESULTS_DIR}" \ - "${2:-2}" \ - "${3:-$STUDY_INFO_FILE}" \ - "${dirs[@]}" +# execute "$RSCRIPT" "$script" \ +# "${1:-$STUDY_RESULTS_DIR}" \ +# "${2:-2}" \ +# "${3:-$STUDY_INFO_FILE}" \ +# "${@:4:-${EXP_PATHS[@]}}" - for f in "${out_files[@]}"; do - [[ -f $f ]] || (echo "$f does not exist"; return 1) - done -} +# for f in "${out_files[@]}"; do +# [[ -f $f ]] || (echo "$f does not exist"; return 1) +# done +# } wrapper java_extract @@ -1847,14 +1722,14 @@ java_extract() { debug "Running: ${FUNCNAME[0]}" classpath="$APPS_DIR/java/weka-clustering/weka-clustering.jar" - output_file="${1:-$QHTCP_RESULTS_DIR}/REMcRdy_lm_only.csv-finalTable.csv" + output_file="${1:-$STUDY_RESULTS_DIR}/REMcRdy_lm_only.csv-finalTable.csv" [[ -f $output_file ]] && backup "$output_file" java_cmd=( "$JAVA" -Xms512m -Xmx2048m -Dfile.encoding=UTF-8 -classpath "$classpath" ExecMain - "${2:-"$QHTCP_RESULTS_DIR/REMcRdy_lm_only.csv"}" + "${2:-"$STUDY_RESULTS_DIR/REMcRdy_lm_only.csv"}" "${3:-"$APPS_DIR/java/GeneByGOAttributeMatrix_nofiltering-2009Dec07.tab"}" "${4:-"$APPS_DIR/java/ORF_List_Without_DAmPs.txt"}" 1 @@ -1862,7 +1737,7 @@ java_extract() { ) debug "pushd && ${java_cmd[*]} && popd" - pushd "${1:-$QHTCP_RESULTS_DIR}" && "${java_cmd[@]}" && popd || return 1 + pushd "${1:-$STUDY_RESULTS_DIR}" && "${java_cmd[@]}" && popd || return 1 [[ -f $output_file ]] } @@ -1881,14 +1756,14 @@ r_add_shift_values() { script="$APPS_DIR/r/addShiftVals.R" execute "$RSCRIPT" "$script" \ - "${1:-"$QHTCP_RESULTS_DIR/REMcRdy_lm_only.csv-finalTable.csv"}" \ - "${2:-"$QHTCP_RESULTS_DIR/Shift_only.csv"}" \ + "${1:-"$STUDY_RESULTS_DIR/REMcRdy_lm_only.csv-finalTable.csv"}" \ + "${2:-"$STUDY_RESULTS_DIR/Shift_only.csv"}" \ "${3:-$STUDY_INFO_FILE}" \ - "${4:-"$QHTCP_RESULTS_DIR/REMcWithShift.csv"}" \ + "${4:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}" \ "${@:5}" # future arguments - rm -f "$QHTCP_RESULTS_DIR/REMcHeatmaps/"*.pdf - out_file="${4:-"$QHTCP_RESULTS_DIR/REMcWithShift.csv"}" + rm -f "$STUDY_RESULTS_DIR/REMcHeatmaps/"*.pdf # TODO why? + out_file="${4:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}" [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) } @@ -1918,8 +1793,8 @@ r_create_heat_maps() { script="$APPS_DIR/r/createHeatMaps.R" execute "$RSCRIPT" "$script" \ - "${1:-"$QHTCP_RESULTS_DIR/REMcWithShift.csv"}" \ - "${2:-"$QHTCP_RESULTS_DIR"}" \ + "${1:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}" \ + "${2:-"$STUDY_RESULTS_DIR"}" \ "${@:3}" # future arguments pdfs=(REMcHeatmaps/*.pdf) @@ -1941,19 +1816,19 @@ r_heat_maps_homology() { debug "Running: ${FUNCNAME[0]} $*" script="$APPS_DIR/r/createHeatMapsHomology.R" - out_file="${1:-"$QHTCP_RESULTS_DIR/homology"}/compiledREMcHomologyHeatmaps.pdf" + out_file="${1:-"$STUDY_RESULTS_DIR/homology"}/compiledREMcHomologyHeatmaps.pdf" - debug "Removing old pdf and csv files from ${1:-"$QHTCP_RESULTS_DIR/homology"}" - rm "${1:-"$QHTCP_RESULTS_DIR/homology"}/"*.{pdf,csv} + debug "Removing old pdf and csv files from ${1:-"$STUDY_RESULTS_DIR/homology"}" + rm "${1:-"$STUDY_RESULTS_DIR/homology"}/"*.{pdf,csv} execute "$RSCRIPT" "$script" \ - "${1:-"$QHTCP_RESULTS_DIR/homology"}" \ - "${2:-"$QHTCP_RESULTS_DIR/REMcRdy_lm_only.csv-finalTable.csv"}" \ + "${1:-"$STUDY_RESULTS_DIR/homology"}" \ + "${2:-"$STUDY_RESULTS_DIR/REMcRdy_lm_only.csv-finalTable.csv"}" \ "${3:-"$APPS_DIR/r/170503_DAmPs_Only.txt"}" \ "${4:-"$APPS_DIR/r/Yeast_Human_Homology_Mapping_biomaRt_18_0920.csv"}" \ "${@:5}" # future arguments - pdfs=("${1:-"$QHTCP_RESULTS_DIR/homology"}"/*.pdf) + pdfs=("${1:-"$STUDY_RESULTS_DIR/homology"}"/*.pdf) execute pdftk "${pdfs[@]}" output "$out_file" [[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) } @@ -2054,184 +1929,72 @@ r_compile_gtf() { } -# @description Creates, modifies, and parses the study info file -# -# TODO -# -# * Needs refactoring -# * Ended up combining a few functions into one -# -# -# @exitcode 0 If one or more studies found -# @exitcode 1 If no studies found -# @set STUDIES array contains array of "Exp# sd ExpDir" -study_info() { - debug "Running: ${FUNCNAME[0]}" - - # Only run this once per project - # in case we run multiple modules - (( SET_STUDIES )) && return 0 - declare -g SET_STUDIES=1 - - # Use initials from project or whoami? - # Best I can do is first two letters of username - # See TODO in markdown - initials="${USER:0:2}" - INITIALS=${initials^^} - - empty_study=1 - # Find an Exp directory that does not exist - while [[ -d $QHTCP_RESULTS_DIR/Exp$empty_study ]]; do - (( empty_study++ )) - done - - next_study_entry="$empty_study,$PROJECT_SUFFIX,NA,NA,$INITIALS" - - echo "${underline}Study Info File${nounderline}" - - if [[ -f $STUDY_INFO_FILE ]]; then - # Get latest entry - while IFS=',' read -r col1 _; do # split on comma, get Exp # from 1st column - studies_nums+=("$col1") - done < <(tail -n +2 "$STUDY_INFO_FILE") - largest=${studies_nums[0]} - for i in "${studies_nums[@]}"; do - if ((i > largest)); then - largest=$i - fi - done - empty_study=$((largest+1)) - next_study_entry="$((empty_study)),$PROJECT_SUFFIX,NA,NA,$INITIALS" - else # create a default study info file - echo "ExpNumb,ExpLabel,BackgroundSD,ZscoreJoinSD,AnalysisBy" > "$STUDY_INFO_FILE" - echo "$next_study_entry" >> "$STUDY_INFO_FILE" - next_study_entry="$((empty_study+1)),$PROJECT_SUFFIX,NA,NA,$INITIALS" - fi - - # Print current studies - cat <<-EOF - * Give each experiment labels to be used for the plots and specific files. - * Enter the desired experiment names in the order they should appear in the REMc heatmaps - - Current study info file contents: - - ${underline}$STUDY_INFO_FILE${nounderline} - $(cat "$STUDY_INFO_FILE") - - EOF - - # Allow user to add/edit the study info file - if ! ((YES)); then - for ((i=1; i<2; i++)); do - cat <<-EOF - Next entry suggestion: "$next_study_entry" - - Would you like to: - * (a)dd the suggested entry - * (e)dit the study info file manually - * (c)ontinue (default) - EOF - read -r -p "(c): " response - echo "" - [[ -z $response ]] && break - case $response in - a) - echo "Adding auto-entry suggestion to $STUDY_INFO_FILE" - echo "$next_study_entry" >> "$STUDY_INFO_FILE" - next_study_entry="$((empty_study+1)),$PROJECT_SUFFIX,NA,NA,$INITIALS" - i=0 - ;; - e) - debug "${EDITOR:-nano} $STUDY_INFO_FILE" - ${EDITOR:-nano} "$STUDY_INFO_FILE" - ;; - c) - break - ;; - *) - err "Invalid response, please try again" - i=0 - ;; - esac - break - done - fi - - # Read study info file - declare -ga STUDIES - while IFS=',' read -r num _ sd _; do - STUDIES+=("$num $sd $QHTCP_RESULTS_DIR/Exp$num") - done < <(tail -n +2 "$STUDY_INFO_FILE") # skip header - - # Initialize missing Exp dirs - for study in "${STUDIES[@]}"; do - read -r _ _ dir <<< "$study" - [[ -d $dir ]] || mkdir "$dir" - done - - # # We don't need a template anymore? - # # if ! rsync --archive "$STUDY_TEMPLATE_DIR" "$STUDY_DIR"; then - # # err "Could not copy $STUDY_TEMPLATE_DIR template to $STUDY_DIR" - # # continue - # # fi - # done - - ((DEBUG)) && declare -p STUDIES - - # Return true if at least one study was found - [[ ${#STUDIES[@]} -gt 0 ]] -} - - -# @description Chooses an EASY scans directory if the information is undefined -# TODO Standardize EASY output, it's hard to understand -# TODO eventually we could run this on multiple results dirs simultaneously with some refactoring -# @exitcode 0 if successfully choose anEASY results dir +# @description Selects a results directory +# @exitcode 0 if successfully chose a results dir # @set EASY_RESULTS_DIR string The working EASY output directory -# @arg $1 string directory containing EASY results dirs -choose_easy_results() { +# @arg $1 string directory containing results dirs matching the prefix regex +# @arg $2 string variable name to declare +handle_results_dir() { debug "Running: ${FUNCNAME[0]} $*" + declare results_dir + declare -a results_dirs=("$1"/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_*_*/) # glob for results directories + results_dirs=("${results_dirs[@]%/}") # remove trailing slashes - shopt -s nullglob - declare -a easy_results_dirs=( "$1/"*/ ) - shopt -u nullglob - - # Strip trailing slash - easy_results_dirs=("${easy_results_dirs[@]%/}") - - num=${#easy_results_dirs[@]} - - if [[ $num -eq 0 ]]; then - echo "No easy results dirs found in $EASY_OUT_DIR" - echo "Run easy module first to generate results files" - return 1 - elif [[ $num -eq 1 ]]; then - EASY_RESULTS_DIR="${easy_results_dirs[0]}" - return 0 - fi - - # Sort the dirs - mapfile -t easy_results_dirs < <(printf '%s\n' "${easy_results_dirs[@]}" | sort) - - # Use latest in auto mode - if ((YES)); then - EASY_RESULTS_DIR="${easy_results_dirs[$((num-1))]}" - return 0 - fi - - for ((i=0; i for the latest easy results ($num) - EOF - read -r -p "($num): " response - [[ -z $response ]] && response="$num" - EASY_RESULTS_DIR="${easy_results_dirs[$((response-1))]}" - # EASY_RESULTS_FILES=("$EASY_RESULTS_DIR/"*"/PrintResults/!!"*) - [[ -d $EASY_RESULTS_DIR ]] + # Sort filtered directories by date prefix in case the glob was mixed + mapfile -t results_dirs < <(printf '%s\n' "${filtered_dirs[@]}" | sort) + + num=${#results_dirs[@]} + + ((DEBUG)) && print_in_columns "results_dirs" + + # Choose a results dir based on number of results dirs + if [[ $num -eq 1 ]] && ((YES)); then + results_dir="${results_dirs[0]}" + elif ((YES)); then + results_dir="${results_dirs[-1]}" + else + ask_name "$2" ADD_RESULTS + results_dirs=("${ADD_RESULTS[@]}") + if [[ $num -eq 1 ]]; then + results_dir="${results_dirs[0]}" + else + # Print results dirs + [[ ${#results_dirs[@]} -gt 0 ]] || (err "No ${2}s found"; return 1) + print_in_columns "results_dirs" + # Results selection prompt + cat <<-EOF + ${underline}Select $2 by number${nounderline} + * for the latest $2 ($num) + * 0 to create a new $2 + EOF + read -r -p "($num): " response + echo + response="${response:-$num}" + if [[ $response -eq 0 ]]; then + ask_name "$2" ADD_RESULTS + results_dirs=("${ADD_RESULTS[@]}") + results_dir="${results_dirs[-1]}" + else + results_dir="${results_dirs[$((response-1))]}" + fi + fi + fi + + # Set the fallback + [[ -z $results_dir ]] && results_dir="$1/${STUDY_PREFIX}_${PROJECT_NAME}" + + # Create directory and set global variable + [[ -d $results_dir ]] || execute mkdir -p "$results_dir" + declare -gx "$2"="$results_dir" } @@ -2248,6 +2011,158 @@ generate_markdown() { } +# @description Parses a simple "bashy" config file +# @arg $1 study config file +# @internal +handle_config() { + debug "Running: ${FUNCNAME[0]} $*" + + # Determine the configuration file path + declare config_file="${1:-${STUDY_CONFIG_FILE:-"$STUDY_RESULTS_DIR/study_config"}}" + declare config_array_name="EXPERIMENTS" + + # Ensure the function runs only once per project (in case of multiple modules) + (( PARSED_CONFIG )) && return 0 + declare -g PARSED_CONFIG=1 + + # Default experiment settings for a study + declare -gA default_study_config + # shellcheck disable=SC2034 + add_experiment() { + echo "Adding default Exp $1 to the study config file" + default_study_config["exp$1,name"]="${2:-"Experiment $1 for ${STUDY_RESULTS_DIR##*/} study"}" + default_study_config["exp$1,path"]="${3:-"$STUDY_RESULTS_DIR/exp$1"}" + default_study_config["exp$1,sd"]="${4:-3}" + default_study_config["exp$1,sd_L"]="${5:-0}" + default_study_config["exp$1,sd_K"]="${6:-0}" + default_study_config["exp$1,sd_pair_L"]="${7:-0}" + default_study_config["exp$1,sd_pair_K"]="${8:-0}" + default_study_config["exp$1,gene_exclude"]="${9:-"YDL227C"}" + default_study_config["exp$1,gene_include"]="${10:-0}" + } + + # Check if the config file exists + if ! [[ -f $config_file ]]; then + declare -a exp_dirs + [[ -n $1 ]] && dirname="${1%/*}" + exp_dirs=("${dirname:-$STUDY_RESULTS_DIR}/exp"[0-9]*/) + exp_dirs=("${exp_dirs[@]%/}") + + if [[ ${#exp_dirs[@]} -gt 0 ]]; then + for exp_dir in "${exp_dirs[@]}"; do + num="${exp_dir##*/}" + num="${num#exp}" + add_experiment "$num" + done + else + echo "No config file found at $config_file" + echo "No experiment directories found in $STUDY_RESULTS_DIR" + if ask "Would you like to create a study config file at $config_file?"; then + ((YES)) || read -r -p "Number of experiments to create (4): " response + response="${response:-4}" + for ((i = 1; i <= response; i++)); do + add_experiment "$i" + done + fi + fi + write_config "$config_file" "default_study_config" "$config_array_name" && + echo "Configuration file generated at $config_file" + fi + + # Source the config file for the config array + # shellcheck disable=SC1090 + . "$config_file" + + # Declare a nameref to refer to the actual array using the variable name + declare -n config_array_ref="$config_array_name" + + [[ -z ${!config_array_ref[*]} ]] && (err "No $config_array_name array found in $config_file" && return 1) + + for key in "${!config_array_ref[@]}"; do + IFS=',' read -r main_key sub_key <<< "$key" + lines+=("${config_array_name}[$main_key,$sub_key]=\"${config_array_ref[$key]}\"") + done + + mapfile -t lines < <(printf "%s\n" "${lines[@]}" | sort) + + echo "${underline}$config_file${nounderline}" + printf "%s\n" "${lines[@]}" + unset lines + + # Prompt user to edit study config file if not in auto mode + if ! ((YES)) && ask "Would you like to edit the study config file?"; then + "$EDITOR" "$config_file" + # shellcheck disable=SC1090 + . "$config_file" + fi + + # Create some helpful arrays + declare -ga EXP_PATHS + # Loop over the keys in the associative array + for key in "${!config_array_ref[@]}"; do + if [[ $key == *,path ]]; then + EXP_PATHS+=("${config_array_ref[$key]}") + fi + done + + declare -ga EXP_NUMS=("${EXP_PATHS[@]##*[!0-9]}") + + declare -ga EXP_PATHS_AND_NAMES + for key in "${!config_array_ref[@]}"; do + if [[ $key == *,path ]]; then + main_key="${key%,*}" + name_key="${main_key},name" + if [[ -n ${config_array_ref[$name_key]} ]]; then + EXP_PATHS_AND_NAMES+=("${config_array_ref[$key]}" "${config_array_ref[$name_key]}") + fi + fi + done + + # declare -ga EXP_PATHS_AND_NAMES + # for key in "${!config_array_ref[@]}"; do + # if [[ $key == *,path ]]; then + # EXP_PATHS_AND_NAMES+=("${config_array_ref[$key]}") + # fi + # if [[ $key == *,name ]]; then + # EXP_PATHS_AND_NAMES+=("${config_array_ref[$key]}") + # fi + # done + return 0 +} + + +# @description Write an associative array to a config file +# @arg $1 file to write +# @arg $2 name of the associative array +# @arg $3 name of the config array +write_config() { + debug "Running: ${FUNCNAME[0]} $*" + declare file="$1" + declare -n array="$2" + declare array_name="${3:-"EXPERIMENTS"}" + + declare -a lines=() + # Iterate over the associative array to populate the lines array + for key in "${!array[@]}"; do + IFS=',' read -r main_key sub_key <<< "$key" + lines+=("${array_name}[$main_key,$sub_key]=\"${array[$key]}\"") + done + + mapfile -t lines < <(printf "%s\n" "${lines[@]}" | sort) # re-sort the lines alphabetically + + # Write the contents to the file using a HEREDOC + cat <<- EOF > "$file" + #!/usr/bin/env bash + + # Declare the main associative array + declare -Ax ${array_name} + + $(printf "%s\n" "${lines[@]}") + + EOF +} + + # @description The main loop of qhtcp-workflow # # @internal @@ -2260,13 +2175,14 @@ main() { declare -g PERL="${PERL:-$(which perl 2>/dev/null || echo perl)}" declare -g RSCRIPT="${RSCRIPT:-$(which Rscript 2>/dev/null || echo Rscript)}" declare -g MATLAB="${MATLAB:-$(which matlab 2>/dev/null || echo matlab)}" + declare -g EDITOR="${EDITOR:-"nano"}" # Global vars SCRIPT_NAME="${BASH_SOURCE[0]##*/}" SCRIPT=$(realpath -s "${BASH_SOURCE[0]}") SCRIPT_DIR=$(dirname "$SCRIPT") APPS_DIR="$SCRIPT_DIR/apps" - TEMPLATES_DIR="$SCRIPT_DIR/templates" + # TEMPLATES_DIR="$SCRIPT_DIR/templates" USER="${USER:-$(whoami)}" DATE="$(date +%Y%m%d)" # change in EASYconsole.m to match 'hardcode' @@ -2292,7 +2208,7 @@ main() { if ! [[ -d $SCANS_DIR ]]; then # This is not something we do often, so ask if ask "Create the scans directory: $SCANS_DIR?"; then - mkdir -p "$SCANS_DIR" + execute mkdir -p "$SCANS_DIR" else echo "No scans directory available, exiting" exit 1; @@ -2303,7 +2219,7 @@ main() { SCANS_DIR=$(realpath -s "$SCANS_DIR") # Find an output directory - local out_heirarchy=("$(dirname "$SCANS_DIR")/out" "$SCRIPT_DIR/out" "/mnt/data/out") + local out_heirarchy=("${SCANS_DIR%/*}/out" "$SCRIPT_DIR/out" "/mnt/data/out") for d in "${out_heirarchy[@]}"; do if [[ -d $d ]]; then debug "Using output directory: $d" @@ -2317,8 +2233,7 @@ main() { declare -gx OUT_DIR="$SCRIPT_DIR/out" # This is not something we do often, so ask if ask "Create $SCRIPT_DIR/out?"; then - debug "mkdir $SCRIPT_DIR/out" - mkdir "$SCRIPT_DIR/out" + execute mkdir -p "$SCRIPT_DIR/out" else err "No output directory, attempting to continue..." fi @@ -2327,15 +2242,11 @@ main() { # Make sure we are using the absolute path OUT_DIR=$(realpath -s "$OUT_DIR") - # Set the automatic project directory prefix - PROJECT_PREFIX="${DATE}_${USER}" # reversed these so easier to sort and parse date - sanitize_pn() { [[ $1 =~ [0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_.+_.+ ]]; } # sanitizer regex for prefix - declare -ag PROJECTS=() # this array will hold all of the projects for this run parse_input "$@" # parse arguments with getopt - ((DEBUG)) && declare -p + # ((DEBUG)) && declare -p # when the going gets rough interactive_header "$@" @@ -2343,7 +2254,7 @@ main() { if ! sanitize_pn "${PROJECTS[$i]}"; then echo "Project name ${PROJECTS[$i]} is invalid" echo "Enter a replacement" - ask_pn && unset "PROJECTS[i]" && PROJECTS+=("${ADD_PROJECTS[@]}") + ask_name "project" "ADD_PROJECTS" && unset "PROJECTS[i]" && PROJECTS+=("${ADD_PROJECTS[@]}") fi done @@ -2377,23 +2288,33 @@ main() { [[ ${MODULES[*]} == "install_dependencies" ]] && install_dependencies # Loop over projects - for PROJECT_NAME in "${PROJECTS[@]}"; do - declare -gx PROJECT_NAME - declare -gx PROJECT_SCANS_DIR="$SCANS_DIR/$PROJECT_NAME" - declare -gx PROJECT_DATE="${PROJECT_NAME%"${PROJECT_NAME#????????}"}" # e.g. 20240723 - declare -gx PROJECT_SUFFIX="${PROJECT_NAME#????????_*_}" - declare -gx PROJECT_USER="${PROJECT_NAME#????????_}"; PROJECT_USER="${PROJECT_USER%%_*}" + for PROJECT in "${PROJECTS[@]}"; do + declare -gx PROJECT + declare -gx PROJECT_SCANS_DIR="$SCANS_DIR/$PROJECT" + declare -gx PROJECT_DATE="${PROJECT:0:8}" # extract the first 8 characters (e.g., "20241215") + PROJECT_NO_DATE="${PROJECT:9}" # extract the part after the date and underscore + declare -gx PROJECT_USER="${PROJECT_NO_DATE%%_*}" # strip suffix to get user (e.g., "username") + declare -gx PROJECT_PREFIX="${PROJECT_DATE}_${PROJECT_USER}" + declare -gx PROJECT_NAME="${PROJECT_NO_DATE#*_}" # Remove the username and following underscore (e.g., "nameof_project") declare -gx STUDIES_ARCHIVE_FILE="$OUT_DIR/StudiesDataArchive.txt" - declare -gx QHTCP_RESULTS_DIR="$OUT_DIR/$PROJECT_NAME" - declare -gx QHTCP_TEMPLATE_DIR="$TEMPLATES_DIR/qhtcp" - declare -gx STUDY_INFO_FILE="$QHTCP_RESULTS_DIR/StudyInfo.csv" - declare -gx EASY_OUT_DIR="$QHTCP_RESULTS_DIR/easy" - declare -gx EASY_RESULTS_DIR="$EASY_OUT_DIR/$PROJECT_PREFIX" - declare -gx GTA_OUT_DIR="$QHTCP_RESULTS_DIR/gta" - declare -gx GTF_OUT_DIR="$QHTCP_RESULTS_DIR/gtf" - - # ((DEBUG)) && declare -p # for when the going gets rough - debug "Project: $PROJECT_NAME" + declare -gx PROJECT_RESULTS_DIR="$OUT_DIR/$PROJECT" + declare -gx EASY_OUT_DIR="$PROJECT_RESULTS_DIR/easy" + + # Set the automatic project directory prefix + declare -gx STUDY_PREFIX="${DATE}_${USER}" # reversed these so easier to sort and parse by date + + # Use more advanced function to set the correct results directories + # This area could be streamlined and improved with better logic + [[ -z $EASY_RESULTS_DIR ]] && handle_results_dir "$EASY_OUT_DIR" "EASY_RESULTS_DIR" + [[ -z $STUDY_RESULTS_DIR ]] && handle_results_dir "$PROJECT_RESULTS_DIR" "STUDY_RESULTS_DIR" + # [[ -z $STUDY_RESULTS_DIR ]] && declare -gx STUDY_RESULTS_DIR="$PROJECT_RESULTS_DIR/${STUDY_PREFIX}_${PROJECT_NAME}" + + declare -gx GTA_OUT_DIR="$STUDY_RESULTS_DIR/gta" + declare -gx GTF_OUT_DIR="$STUDY_RESULTS_DIR/gtf" + declare -gx STUDY_CONFIG_FILE="$STUDY_RESULTS_DIR/study_config" + handle_config "$STUDY_CONFIG_FILE" + + debug "Project: $PROJECT" debug "Active modules: ${MODULES[*]}" debug "Active wrappers and their args: ${WRAPPERS[*]}" @@ -2417,7 +2338,7 @@ main() { [[ ${#WRAPPERS[@]} -gt 0 ]] && echo "Successfully ran wrapper(s): ${WRAPPERS[*]}" [[ ${#PROJECTS[@]} -gt 0 ]] && echo "On project(s): ${PROJECTS[*]}" - unset MODULES WRAPPERS EXCLUDE_MODULES STUDIES SET_STUDIES YES + unset MODULES WRAPPERS EXCLUDE_MODULES STUDIES PARSED_CONFIG YES } # During development it's better to just exit