376 lines
13 KiB
R
376 lines
13 KiB
R
library("dplyr")
|
|
|
|
# Function to parse and set arguments
|
|
# Resolve the script's positional arguments into a named list.
#
# Args:
#   args: optional character vector of positional arguments in the order
#     out_dir, sd, study_info, followed by the experiment directories.
#     When NULL (the default) the values come from the hard-coded development
#     paths in an interactive session, or from the command line otherwise.
#
# Returns:
#   A list with
#     out_dir    - normalized output directory path
#     sd         - numeric threshold (0 disables the SD filter further down)
#     study_info - normalized path of the StudyInfo CSV
#     input_dirs - character vector of experiment directories (may be empty)
#
# Raises:
#   An error when fewer than three arguments are supplied or when the second
#   argument is not numeric.
parse_arguments <- function(args = NULL) {
  if (is.null(args)) {
    if (interactive()) {
      args <- c(
        "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
        "2",
        "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
        "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
        "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
      )
    } else {
      args <- commandArgs(trailingOnly = TRUE)
    }
  }
  args <- as.character(args)

  if (length(args) < 3) {
    stop(
      "Expected arguments: <out_dir> <sd> <study_info> <exp_dir> [<exp_dir> ...]",
      call. = FALSE
    )
  }

  sd <- suppressWarnings(as.numeric(args[2]))
  if (is.na(sd)) {
    stop("The sd argument must be numeric, got: ", args[2], call. = FALSE)
  }

  list(
    out_dir = normalizePath(args[1], mustWork = FALSE),
    sd = sd,
    study_info = normalizePath(args[3], mustWork = FALSE),
    # Negative indexing instead of 4:length(args): the latter counts backwards
    # (4, 3) when only three arguments are present.
    input_dirs = args[-seq_len(3)]
  )
}
|
|
|
|
# Parsed once at load time; the later top-level steps read out_dir, sd,
# study_info and input_dirs from this list.
args <- parse_arguments()
|
|
|
|
# Create an array for the zscores files
|
|
# Locate the interaction z-score file of every experiment directory.
#
# Args:
#   dirs: character vector of experiment directories.
#
# Returns:
#   An unnamed character vector with the path
#   <dir>/zscores/zscores_interaction.csv for each directory where that file
#   exists, in the order of `dirs`. Always a character vector (possibly empty),
#   so the result can be passed straight to read.csv().
get_zscores_files <- function(dirs) {
  candidates <- file.path(dirs, "zscores", "zscores_interaction.csv")
  # file.exists() is vectorized, so no per-directory loop (and no NULL
  # placeholders that would turn the result into a list) is needed.
  candidates[file.exists(candidates)]
}
|
|
|
|
zscores_files <- get_zscores_files(args$input_dirs)

# %g instead of %d: sprintf() rejects %d for a non-integer double, so a
# threshold such as 1.5 would abort the script here.
print(sprintf("The SD value is: %g", args$sd))

# Ensure there are enough files to compare
if (length(zscores_files) < 2) {
  stop("Not enough experiments to compare, exiting script")
}
|
|
|
|
# Function to join zscores files
|
|
# Left-join every z-score file onto the first one by the "OrfRep" column.
#
# Args:
#   files: character vector (or list) of CSV paths; the first file defines
#     the rows and their order.
#
# Returns:
#   A data.frame with the "OrfRep" column first, then the remaining columns of
#   file 1 under their original names, then the non-key columns of file 2
#   suffixed ".1", of file 3 suffixed ".2", and so on (e.g. "Gene.1"), which
#   is the naming the header clean-up later in this script relies on.
#
# The join is done with base merge() because dplyr, the only package this
# script loads, does not export a function called join().
join_zscores_files <- function(files) {
  key <- "OrfRep"
  order_col <- ".row_order"

  joined <- read.csv(file = files[[1]], stringsAsFactors = FALSE)

  for (i in seq_along(files)[-1]) {
    current <- read.csv(file = files[[i]], stringsAsFactors = FALSE)

    # merge() does not promise any row order, so remember the current order
    # and restore it after the join.
    joined[[order_col]] <- seq_len(nrow(joined))
    joined <- merge(
      joined, current,
      by = key, all.x = TRUE, sort = FALSE,
      suffixes = c("", paste0(".", i - 1))
    )
    joined <- joined[order(joined[[order_col]]), , drop = FALSE]
    joined[[order_col]] <- NULL
    rownames(joined) <- NULL
  }

  joined
}
|
|
|
|
# Load and join zscores files
|
|
# joined_data is the combined table of all experiments keyed by OrfRep; the
# column selection and the "Gene.N" clean-up below both start from it.
joined_data <- join_zscores_files(zscores_files)
|
|
|
|
# Order and select columns
|
|
# Sort the columns alphabetically, then keep only the identifier and score
# columns.
#
# Args:
#   data: the joined z-score data.frame.
#
# Returns:
#   A data.frame whose columns appear in selector order: those containing
#   "OrfRep", those matching "Gene", then those containing "z_lm_k",
#   "z_shift_k", "z_lm_l" and "z_shift_l". Within each selector the
#   alphabetical column order is kept.
order_and_select_columns <- function(data) {
  alphabetical <- data[, order(colnames(data))]

  select(
    alphabetical,
    contains("OrfRep"),
    matches("Gene"),
    contains("z_lm_k"),
    contains("z_shift_k"),
    contains("z_lm_l"),
    contains("z_shift_l")
  )
}
|
|
|
|
# selected_headers keeps the OrfRep/Gene identifier columns plus the z_lm and
# z_shift columns for k and l from the joined table.
selected_headers <- order_and_select_columns(joined_data)
|
|
|
|
# Remove redundant columns like "Gene.1"
|
|
# Drop the duplicate gene-name columns ("Gene.1", "Gene.2", ...) that the
# join adds for every experiment after the first.
#
# Args:
#   data: data.frame containing the "Gene.N" columns.
#   suffixes: how many "Gene.N" columns to drop (N runs from 1 to this
#     count). Despite its name this is a count, not a vector of suffixes.
#
# Returns:
#   `data` without those columns. all_of() makes the call fail if one of the
#   expected columns is absent.
clean_headers <- function(data, suffixes) {
  redundant_gene_columns <- paste("Gene", seq_len(suffixes), sep = ".")
  select(data, -all_of(redundant_gene_columns))
}
|
|
|
|
# One redundant "Gene.N" column exists per experiment after the first, hence
# length(zscores_files) - 1.
# headSel: identifier + score columns. headSel2: identifier columns only.
headSel <- clean_headers(
  selected_headers,
  suffixes = length(zscores_files) - 1
)
headSel2 <- clean_headers(
  select(joined_data, contains("OrfRep"), matches("Gene")),
  suffixes = length(zscores_files) - 1
)
|
|
|
|
# Fill NA values in Shift and Z_lm columns
|
|
# Replace missing scores with small non-zero placeholders.
#
# Args:
#   data: data.frame whose column names identify the score type.
#
# Returns:
#   `data` with NA replaced by 0.001 in every column whose name contains
#   "Shift", and by 0.0001 in every other column whose name contains "Z_lm_".
#   Matching is case-sensitive; all remaining columns are left untouched.
fill_na_in_columns <- function(data) {
  column_names <- colnames(data)

  shift_columns <- column_names[grepl("Shift", column_names)]
  # "Shift" takes precedence when a name matches both patterns.
  lm_columns <- setdiff(
    column_names[grepl("Z_lm_", column_names)],
    shift_columns
  )

  for (name in shift_columns) {
    missing <- is.na(data[[name]])
    data[[name]][missing] <- 0.001
  }

  for (name in lm_columns) {
    missing <- is.na(data[[name]])
    data[[name]][missing] <- 0.0001
  }

  data
}
|
|
|
|
# Fill missing scores before the SD filter below compares them against the
# threshold. The fill matches column names case-sensitively on "Shift" and
# "Z_lm_".
headSel <- fill_na_in_columns(headSel)
|
|
|
|
# Filter based on standard deviation
|
|
# Keep the rows in which at least one z_lm score reaches the threshold.
#
# Args:
#   data: data.frame with one or more columns whose name contains "z_lm_"
#     (matched case-insensitively).
#   sd: numeric threshold; 0 disables filtering.
#
# Returns:
#   `data` unchanged when sd is 0. Otherwise the rows where abs(score) >= sd
#   holds for at least one z_lm column. Missing scores count as "below the
#   threshold" instead of turning the row into an all-NA row. When `data` has
#   no z_lm columns a warning is raised and zero rows are returned.
filter_by_sd <- function(data, sd) {
  if (sd == 0) return(data)

  is_z_lm <- grepl("z_lm_", tolower(names(data)), fixed = TRUE)
  if (!any(is_z_lm)) {
    warning(
      "filter_by_sd: no 'z_lm_' columns to test, so no row passes the filter",
      call. = FALSE
    )
  }

  scores <- as.matrix(data[, is_z_lm, drop = FALSE])
  # na.rm = TRUE: an NA comparison must not make the whole row mask NA,
  # because indexing rows with NA produces rows filled with NA.
  keep <- rowSums(abs(scores) >= sd, na.rm = TRUE) > 0

  data[keep, , drop = FALSE]
}
|
|
|
|
# Apply the SD filter once, while the z_lm columns are still present, and
# derive both outputs from the filtered rows. filter_by_sd() tests only the
# z_lm columns, so filtering a shift-only selection would leave it nothing to
# test and the shift table would lose every row.
sd_filtered <- filter_by_sd(headSel, args$sd)

REMcRdy <- select(
  sd_filtered,
  contains("OrfRep"), matches("Gene"), contains("z_lm_")
)
shiftOnly <- select(
  sd_filtered,
  contains("OrfRep"), matches("Gene"), contains("z_shift")
)
|
|
|
|
# Reorder columns to interleave Z_lm and Shift data
|
|
# Append the value columns of two tables pairwise to the first table.
#
# Args:
#   data1: data.frame whose first two columns are treated as identifiers.
#   data2: data.frame with the same number of rows and at least as many
#     columns as data1.
#
# Returns:
#   data1 followed by, for each column index i >= 3 of data1, column i of
#   data2 and then column i of data1. When data1 has two or fewer columns
#   there is nothing to pair and data1 is returned unchanged.
#
# NOTE(review): the result starts with all of data1, so its value columns
# appear twice (once up front, once interleaved) - confirm this is intended.
reorder_columns <- function(data1, data2) {
  # seq_len()[-(1:2)] instead of 3:ncol(data1): the latter counts backwards
  # (3, 2) for a two-column frame and then indexes a column that is absent.
  value_cols <- seq_len(ncol(data1))[-(1:2)]
  if (length(value_cols) == 0) return(data1)

  pairs <- lapply(value_cols, function(i) cbind(data2[i], data1[i]))
  do.call(cbind, c(list(data1), pairs))
}
|
|
|
|
# NOTE(review): headSel2 is built from only the OrfRep/Gene selectors, so it is
# expected to have no columns at index 3 or beyond for reorder_columns() to
# interleave - confirm the intended first argument. No later active statement
# in this file reads combI; its only write-out is in the commented-out legacy
# code.
combI <- reorder_columns(headSel2, shiftOnly)
|
|
|
|
# Write output files
|
|
# First write of the two result tables, with the column names as produced by
# the join. Both files are written again further down once the headers have
# been relabeled. quote = FALSE keeps the headers and values unquoted.
write.csv(
  x = REMcRdy,
  file = file.path(args$out_dir, "REMcRdy_lm_only.csv"),
  quote = FALSE,
  row.names = FALSE
)
write.csv(
  x = shiftOnly,
  file = file.path(args$out_dir, "Shift_only.csv"),
  quote = FALSE,
  row.names = FALSE
)
|
|
|
|
# Relabel headers using experiment names from StudyInfo.csv
|
|
# Replace the numeric join suffix of a column name with an experiment name.
#
# Args:
#   headers: character vector of column names. Names ending in ".1", ".2", ...
#     are the columns contributed by the 2nd, 3rd, ... input file.
#   labels: data.frame with one row per experiment, in input order, whose
#     second column holds the experiment name.
#
# Returns:
#   `headers` with every trailing ".k" replaced by "_<name>", where <name> is
#   labels[k + 1, 2] (suffix ".1" belongs to the second experiment, as in the
#   legacy version of this script). Names without a numeric suffix, and
#   suffixes with no matching row in `labels`, are returned unchanged.
#
# NOTE(review): columns of the first experiment carry no suffix and are
# therefore left unlabeled - confirm whether they should get labels[1, 2].
relabel_headers <- function(headers, labels) {
  new_labels <- headers
  suffix_pattern <- "\\.([1-9][0-9]*)$"

  has_suffix <- grepl(suffix_pattern, headers)
  suffix_num <- rep(NA_integer_, length(headers))
  suffix_num[has_suffix] <- as.integer(
    sub(paste0("^.*", suffix_pattern), "\\1", headers[has_suffix])
  )

  label_row <- suffix_num + 1L
  idx <- which(has_suffix & label_row <= nrow(labels))

  if (length(idx) > 0) {
    # Strip only the trailing ".k"; an unanchored, unescaped pattern would
    # also replace matching text inside the column name.
    stems <- sub(suffix_pattern, "", headers[idx])
    new_labels[idx] <- paste0(stems, "_", labels[label_row[idx], 2])
  }

  new_labels
}
|
|
|
|
# Read the study description and swap the numeric join suffixes in both
# result tables for experiment names.
LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)

colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)

# Save relabeled files; these replace the files written before relabeling.
write.csv(
  x = REMcRdy,
  file = file.path(args$out_dir, "REMcRdy_lm_only.csv"),
  quote = FALSE,
  row.names = FALSE
)
write.csv(
  x = shiftOnly,
  file = file.path(args$out_dir, "Shift_only.csv"),
  quote = FALSE,
  row.names = FALSE
)
|
|
|
|
# Save updated parameters
|
|
# Record the SD threshold used for this run in the fourth column of the study
# table, then save the table twice: as parameters.csv in the output directory
# and back over the StudyInfo file it was read from.
LabelStd[, 4] <- args$sd

write.csv(
  x = LabelStd,
  file = file.path(args$out_dir, "parameters.csv"),
  row.names = FALSE
)
write.csv(
  x = LabelStd,
  file = args$study_info,
  row.names = FALSE
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# library("plyr")
|
|
# library("sos")
|
|
# library("dplyr")
|
|
|
|
# # Function to parse and set arguments
|
|
# parse_arguments <- function() {
|
|
# if (interactive()) {
|
|
# args <- c(
|
|
# "./", # Default out_dir
|
|
# "2", # Default SD value
|
|
# "../Code/StudyInfo.csv" # Default study_info path
|
|
# )
|
|
# } else {
|
|
# args <- commandArgs(trailingOnly = TRUE)
|
|
# }
|
|
|
|
# list(
|
|
# out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
|
|
# sd = as.numeric(args[2]),
|
|
# study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
|
|
# input_dirs = args[4:length(args)]
|
|
# )
|
|
# }
|
|
|
|
# args <- parse_arguments()
|
|
|
|
# # Create an array for the zscores files
|
|
# zscores_files <- sapply(args$input_dirs, function(study) {
|
|
# file_path <- file.path(study, "zscores", "zscores_interaction.csv")
|
|
# if (file.exists(file_path)) file_path else NULL
|
|
# }, simplify = TRUE, USE.NAMES = FALSE)
|
|
|
|
# # Filter out NULL entries
|
|
# zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
|
|
|
|
# sprintf("The SD value is: %d", args$sd)
|
|
# # print(args$input_dirs)
|
|
|
|
# # TODO this is better handled in a loop in case you want to compare more experiments?
|
|
# # The input is already designed for this
|
|
# # Read in the files for your experiment and
|
|
# # Join the two files at a time as a function of how many inputFile
|
|
# # list the larger file first ? in this example X2 has the larger number of genes
|
|
# # If X1 has a larger number of genes, switch the order of X1 and X2
|
|
# if (length(zscores_files) < 2) {
|
|
# print("Note enough Exps to compare, skipping join")
|
|
# stop("Exiting script")
|
|
# }
|
|
|
|
# if (length(zscores_files) >= 2) {
|
|
# X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
|
|
# X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
|
|
# X <- join(X1, X2, by = "OrfRep")
|
|
# OBH <- X[, order(colnames(X))] # OrderByHeader
|
|
# headers <- select(OBH, contains("OrfRep"), matches("Gene"),
|
|
# contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
|
|
# headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
|
|
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
|
|
# headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
|
|
# }
|
|
|
|
# if (length(zscores_files) >= 3) {
|
|
# X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
|
|
# X <- join(X, X3, by = "OrfRep")
|
|
# headSel <- select(headers, -"Gene.1", -"Gene.2")
|
|
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
|
|
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
|
|
# }
|
|
|
|
# if (length(zscores_files) >= 4) {
|
|
# X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
|
|
# X <- join(X, X4, by = "OrfRep")
|
|
# headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
|
|
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
|
|
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
|
|
# }
|
|
|
|
# # print(headers)
|
|
# # headSel$contains("Z_Shift") %>% replace_na(0.001)
|
|
# headers <- colnames(headSel)
|
|
# # print(headers)
|
|
# i <- 0
|
|
# for (i in 1:length(headers)) {
|
|
# if (grepl("Shift", headers[i])) {
|
|
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
|
|
# }
|
|
# if (grepl("Z_lm_", headers[i])) {
|
|
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
|
|
# }
|
|
# }
|
|
|
|
# # 2SD option code to exclude Z_lm values less than 2 standard Deviations
|
|
# REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
|
|
# shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
|
|
|
|
# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
|
|
# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
|
|
|
|
# # Using Text search grepl to relabel headers
|
|
# REMcRdyHdr <- colnames(REMcRdy)
|
|
# REMcRdyLabels <- "asdf"
|
|
# shftHdr <- colnames(shiftOnly)
|
|
# shiftLabels <- "asdf"
|
|
# shiftLabels[1:2] <- shftHdr[1:2]
|
|
# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
|
|
|
|
# for (i in 3:(length(shftHdr))) {
|
|
# if (i == 3) {
|
|
# shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
|
|
# REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
|
|
# }
|
|
# if (i == 5) {
|
|
# shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
|
|
# REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
|
|
# }
|
|
# if (i == 7) {
|
|
# shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
|
|
# REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
|
|
# }
|
|
# if (grepl(".1", shftHdr[i], fixed = true)) {
|
|
# shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
|
|
# REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
|
|
# }
|
|
# if (grepl(".2", shftHdr[i], fixed = true)) {
|
|
# shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
|
|
# REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
|
|
# }
|
|
# if (grepl(".3", shftHdr[i], fixed = true)) {
|
|
# shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
|
|
# REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
|
|
# }
|
|
# }
|
|
|
|
# for (i in 3:(length(REMcRdyLabels))) {
|
|
# j <- as.integer(i)
|
|
# REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
|
|
# shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
|
|
# }
|
|
|
|
# colnames(shiftOnly) <- shiftLabels
|
|
# colnames(REMcRdy) <- REMcRdyLabels
|
|
|
|
# combI <- headSel2 # starting Template orf, Genename columns
|
|
|
|
# # headersRemc<-colnames(REMcRdy)
|
|
# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
|
|
# for (i in 3:length(colnames(REMcRdy))) {
|
|
# combI <- cbind.data.frame(combI, shiftOnly[i])
|
|
# combI <- cbind.data.frame(combI, REMcRdy[i])
|
|
# }
|
|
|
|
# Vec1 <- NA
|
|
# Vec2 <- NA
|
|
# Vec3 <- NA
|
|
# Vec4 <- NA
|
|
# Vec5 <- NA
|
|
# Vec6 <- NA
|
|
# Vec7 <- NA
|
|
# Vec8 <- NA
|
|
|
|
# if (length(REMcRdy) == 6) {
|
|
# Vec1 <- abs(REMcRdy[, 3]) >= sd
|
|
# Vec2 <- abs(REMcRdy[, 4]) >= sd
|
|
# Vec3 <- abs(REMcRdy[, 5]) >= sd
|
|
# Vec4 <- abs(REMcRdy[, 6]) >= sd
|
|
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4
|
|
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
|
|
# REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
|
|
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
|
|
# shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
|
|
# }
|
|
|
|
# if (length(REMcRdy) == 8) {
|
|
# Vec1 <- abs(REMcRdy[, 3]) >= sd
|
|
# Vec2 <- abs(REMcRdy[, 4]) >= sd
|
|
# Vec3 <- abs(REMcRdy[, 5]) >= sd
|
|
# Vec4 <- abs(REMcRdy[, 6]) >= sd
|
|
# Vec5 <- abs(REMcRdy[, 7]) >= sd
|
|
# Vec6 <- abs(REMcRdy[, 8]) >= sd
|
|
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
|
|
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
|
|
# REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
|
|
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
|
|
# shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
|
|
# }
|
|
|
|
# if (length(REMcRdy) == 10) {
|
|
# Vec1 <- abs(REMcRdy[, 3]) >= sd
|
|
# Vec2 <- abs(REMcRdy[, 4]) >= sd
|
|
# Vec3 <- abs(REMcRdy[, 5]) >= sd
|
|
# Vec4 <- abs(REMcRdy[, 6]) >= sd
|
|
# Vec5 <- abs(REMcRdy[, 7]) >= sd
|
|
# Vec6 <- abs(REMcRdy[, 8]) >= sd
|
|
# Vec7 <- abs(REMcRdy[, 9]) >= sd
|
|
# Vec8 <- abs(REMcRdy[, 10]) >= sd
|
|
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
|
|
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
|
|
# REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
|
|
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
|
|
# shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
|
|
# }
|
|
|
|
# if (sd != 0) {
|
|
# REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
|
|
# shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
|
|
# }
|
|
|
|
# if (sd == 0) {
|
|
# REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
|
|
# shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
|
|
# }
|
|
|
|
# # R places hidden "" around the header names. The following
|
|
# # is intended to remove those quote so that the "" do not blow up the Java REMc.
|
|
# # Use ,quote=F in the write.csv statement to fix R output file.
|
|
# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
|
|
# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
|
|
# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
|
|
# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
|
|
|
|
# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
|
|
# # print(sd)
|
|
# LabelStd[, 4] <- as.numeric(sd)
|
|
# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
|
|
# write.csv(LabelStd, file = study_info, row.names = FALSE)
|