Files
hartman-server/qhtcp-workflow/apps/r/joinInteractExps.R

376 lines
13 KiB
R

library("dplyr")
# Function to parse and set arguments
parse_arguments <- function() {
if (interactive()) {
args <- c(
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
2,
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
"/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
)
} else {
args <- commandArgs(trailingOnly = TRUE)
}
list(
out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
sd = as.numeric(args[2]),
study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
input_dirs = args[4:length(args)]
)
}
args <- parse_arguments()
# Create an array for the zscores files
get_zscores_files <- function(dirs) {
files <- sapply(dirs, function(exp) {
file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
if (file.exists(file_path)) file_path else NULL
}, simplify = TRUE, USE.NAMES = FALSE)
# Filter out NULL entries
files[!sapply(files, is.null)]
}
zscores_files <- get_zscores_files(args$input_dirs)
sprintf("The SD value is: %d", args$sd)
# Ensure there are enough files to compare
if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
# Function to join zscores files
join_zscores_files <- function(files) {
joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
for (file in files[-1]) {
temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
joined_data <- join(joined_data, temp_data, by = "OrfRep")
}
joined_data
}
# Load and join zscores files
joined_data <- join_zscores_files(zscores_files)
# Order and select columns
order_and_select_columns <- function(data) {
ordered_data <- data[, order(colnames(data))]
selected_headers <- select(ordered_data,
contains("OrfRep"), matches("Gene"),
contains("z_lm_k"), contains("z_shift_k"),
contains("z_lm_l"), contains("z_shift_l"))
selected_headers
}
selected_headers <- order_and_select_columns(joined_data)
# Remove redundant columns like "Gene.1"
clean_headers <- function(data, suffixes) {
suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
select(data, -all_of(suffixes_to_remove))
}
headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
# Fill NA values in Shift and Z_lm columns
fill_na_in_columns <- function(data) {
for (header in colnames(data)) {
if (grepl("Shift", header)) {
data[[header]][is.na(data[[header]])] <- 0.001
} else if (grepl("Z_lm_", header)) {
data[[header]][is.na(data[[header]])] <- 0.0001
}
}
data
}
headSel <- fill_na_in_columns(headSel)
# Filter based on standard deviation
filter_by_sd <- function(data, sd) {
if (sd == 0) return(data)
z_lm_cols <- select(data, contains("z_lm_"))
filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
data[filter_vector, ]
}
REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)
# Reorder columns to interleave Z_lm and Shift data
reorder_columns <- function(data1, data2) {
combined_data <- data1
for (i in 3:ncol(data1)) {
combined_data <- cbind(combined_data, data2[i], data1[i])
}
combined_data
}
combI <- reorder_columns(headSel2, shiftOnly)
# Write output files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# Relabel headers using experiment names from StudyInfo.csv
relabel_headers <- function(headers, labels) {
new_labels <- headers
for (i in seq_along(headers)) {
suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
if (suffix %in% 1:3) {
exp_name <- labels[as.numeric(suffix), 2]
new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
}
}
new_labels
}
LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)
# Save relabeled files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# Save updated parameters
LabelStd[, 4] <- args$sd
write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
write.csv(LabelStd, file = args$study_info, row.names = FALSE)
# library("plyr")
# library("sos")
# library("dplyr")
# # Function to parse and set arguments
# parse_arguments <- function() {
# if (interactive()) {
# args <- c(
# "./", # Default out_dir
# "2", # Default SD value
# "../Code/StudyInfo.csv" # Default study_info path
# )
# } else {
# args <- commandArgs(trailingOnly = TRUE)
# }
# list(
# out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
# sd = as.numeric(args[2]),
# study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
# input_dirs = args[4:length(args)]
# )
# }
# args <- parse_arguments()
# # Create an array for the zscores files
# zscores_files <- sapply(args$input_dirs, function(study) {
# file_path <- file.path(study, "zscores", "zscores_interaction.csv")
# if (file.exists(file_path)) file_path else NULL
# }, simplify = TRUE, USE.NAMES = FALSE)
# # Filter out NULL entries
# zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
# sprintf("The SD value is: %d", args$sd)
# # print(args$input_dirs)
# # TODO this is better handled in a loop in case you want to compare more experiments?
# # The input is already designed for this
# # Read in the files for your experiment and
# # Join the two files at a time as a function of how many inputFile
# # list the larger file first ? in this example X2 has the larger number of genes
# # If X1 has a larger number of genes, switch the order of X1 and X2
# if (length(zscores_files) < 2) {
# print("Note enough Exps to compare, skipping join")
# stop("Exiting script")
# }
# if (length(zscores_files) >= 2) {
# X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
# X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
# X <- join(X1, X2, by = "OrfRep")
# OBH <- X[, order(colnames(X))] # OrderByHeader
# headers <- select(OBH, contains("OrfRep"), matches("Gene"),
# contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
# headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
# headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
# }
# if (length(zscores_files) >= 3) {
# X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
# X <- join(X, X3, by = "OrfRep")
# headSel <- select(headers, -"Gene.1", -"Gene.2")
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
# }
# if (length(zscores_files) >= 4) {
# X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
# X <- join(X, X4, by = "OrfRep")
# headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
# headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
# headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
# }
# # print(headers)
# # headSel$contains("Z_Shift") %>% replace_na(0.001)
# headers <- colnames(headSel)
# # print(headers)
# i <- 0
# for (i in 1:length(headers)) {
# if (grepl("Shift", headers[i])) {
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
# }
# if (grepl("Z_lm_", headers[i])) {
# headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
# }
# }
# # 2SD option code to exclude Z_lm values less than 2 standard Deviations
# REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
# shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
# # Using Text search grepl to relabel headers
# REMcRdyHdr <- colnames(REMcRdy)
# REMcRdyLabels <- "asdf"
# shftHdr <- colnames(shiftOnly)
# shiftLabels <- "asdf"
# shiftLabels[1:2] <- shftHdr[1:2]
# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
# for (i in 3:(length(shftHdr))) {
# if (i == 3) {
# shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
# REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
# }
# if (i == 5) {
# shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
# REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
# }
# if (i == 7) {
# shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
# REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
# }
# if (grepl(".1", shftHdr[i], fixed = true)) {
# shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
# }
# if (grepl(".2", shftHdr[i], fixed = true)) {
# shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
# }
# if (grepl(".3", shftHdr[i], fixed = true)) {
# shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
# REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
# }
# }
# for (i in 3:(length(REMcRdyLabels))) {
# j <- as.integer(i)
# REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
# shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
# }
# colnames(shiftOnly) <- shiftLabels
# colnames(REMcRdy) <- REMcRdyLabels
# combI <- headSel2 # starting Template orf, Genename columns
# # headersRemc<-colnames(REMcRdy)
# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
# for (i in 3:length(colnames(REMcRdy))) {
# combI <- cbind.data.frame(combI, shiftOnly[i])
# combI <- cbind.data.frame(combI, REMcRdy[i])
# }
# Vec1 <- NA
# Vec2 <- NA
# Vec3 <- NA
# Vec4 <- NA
# Vec5 <- NA
# Vec6 <- NA
# Vec7 <- NA
# Vec8 <- NA
# if (length(REMcRdy) == 6) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
# }
# if (length(REMcRdy) == 8) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# Vec5 <- abs(REMcRdy[, 7]) >= sd
# Vec6 <- abs(REMcRdy[, 8]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
# }
# if (length(REMcRdy) == 10) {
# Vec1 <- abs(REMcRdy[, 3]) >= sd
# Vec2 <- abs(REMcRdy[, 4]) >= sd
# Vec3 <- abs(REMcRdy[, 5]) >= sd
# Vec4 <- abs(REMcRdy[, 6]) >= sd
# Vec5 <- abs(REMcRdy[, 7]) >= sd
# Vec6 <- abs(REMcRdy[, 8]) >= sd
# Vec7 <- abs(REMcRdy[, 9]) >= sd
# Vec8 <- abs(REMcRdy[, 10]) >= sd
# bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
# REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
# REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
# shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
# shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
# }
# if (sd != 0) {
# REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
# shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
# }
# if (sd == 0) {
# REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
# shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
# }
# # R places hidden "" around the header names. The following
# # is intended to remove those quote so that the "" do not blow up the Java REMc.
# # Use ,quote=F in the write.csv statement to fix R output file.
# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
# # print(sd)
# LabelStd[, 4] <- as.numeric(sd)
# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
# write.csv(LabelStd, file = study_info, row.names = FALSE)