123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
- library("dplyr")
- # Function to parse and set arguments
- parse_arguments <- function() {
- if (interactive()) {
- args <- c(
- "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
- 2,
- "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
- "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
- "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
- )
- } else {
- args <- commandArgs(trailingOnly = TRUE)
- }
-
- list(
- out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
- sd = as.numeric(args[2]),
- study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
- input_dirs = args[4:length(args)]
- )
- }
- args <- parse_arguments()
- # Create an array for the zscores files
- get_zscores_files <- function(dirs) {
- files <- sapply(dirs, function(exp) {
- file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
- if (file.exists(file_path)) file_path else NULL
- }, simplify = TRUE, USE.NAMES = FALSE)
-
- # Filter out NULL entries
- files[!sapply(files, is.null)]
- }
- zscores_files <- get_zscores_files(args$input_dirs)
- sprintf("The SD value is: %d", args$sd)
- # Ensure there are enough files to compare
- if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
- # Function to join zscores files
- join_zscores_files <- function(files) {
- joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
- for (file in files[-1]) {
- temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
- joined_data <- join(joined_data, temp_data, by = "OrfRep")
- }
- joined_data
- }
- # Load and join zscores files
- joined_data <- join_zscores_files(zscores_files)
- # Order and select columns
- order_and_select_columns <- function(data) {
- ordered_data <- data[, order(colnames(data))]
- selected_headers <- select(ordered_data,
- contains("OrfRep"), matches("Gene"),
- contains("z_lm_k"), contains("z_shift_k"),
- contains("z_lm_l"), contains("z_shift_l"))
- selected_headers
- }
- selected_headers <- order_and_select_columns(joined_data)
- # Remove redundant columns like "Gene.1"
- clean_headers <- function(data, suffixes) {
- suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
- select(data, -all_of(suffixes_to_remove))
- }
- headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
- headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
- # Fill NA values in Shift and Z_lm columns
- fill_na_in_columns <- function(data) {
- for (header in colnames(data)) {
- if (grepl("Shift", header)) {
- data[[header]][is.na(data[[header]])] <- 0.001
- } else if (grepl("Z_lm_", header)) {
- data[[header]][is.na(data[[header]])] <- 0.0001
- }
- }
- data
- }
- headSel <- fill_na_in_columns(headSel)
- # Filter based on standard deviation
- filter_by_sd <- function(data, sd) {
- if (sd == 0) return(data)
-
- z_lm_cols <- select(data, contains("z_lm_"))
- filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
- data[filter_vector, ]
- }
- REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
- shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)
- # Reorder columns to interleave Z_lm and Shift data
- reorder_columns <- function(data1, data2) {
- combined_data <- data1
- for (i in 3:ncol(data1)) {
- combined_data <- cbind(combined_data, data2[i], data1[i])
- }
- combined_data
- }
- combI <- reorder_columns(headSel2, shiftOnly)
- # Write output files
- write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
- write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
- # Relabel headers using experiment names from StudyInfo.csv
- relabel_headers <- function(headers, labels) {
- new_labels <- headers
- for (i in seq_along(headers)) {
- suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
- if (suffix %in% 1:3) {
- exp_name <- labels[as.numeric(suffix), 2]
- new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
- }
- }
- new_labels
- }
- LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
- colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
- colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)
- # Save relabeled files
- write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
- write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
- # Save updated parameters
- LabelStd[, 4] <- args$sd
- write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
- write.csv(LabelStd, file = args$study_info, row.names = FALSE)
- # library("plyr")
- # library("sos")
- # library("dplyr")
- # # Function to parse and set arguments
- # parse_arguments <- function() {
- # if (interactive()) {
- # args <- c(
- # "./", # Default out_dir
- # "2", # Default SD value
- # "../Code/StudyInfo.csv" # Default study_info path
- # )
- # } else {
- # args <- commandArgs(trailingOnly = TRUE)
- # }
-
- # list(
- # out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
- # sd = as.numeric(args[2]),
- # study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
- # input_dirs = args[4:length(args)]
- # )
- # }
- # args <- parse_arguments()
- # # Create an array for the zscores files
- # zscores_files <- sapply(args$input_dirs, function(study) {
- # file_path <- file.path(study, "zscores", "zscores_interaction.csv")
- # if (file.exists(file_path)) file_path else NULL
- # }, simplify = TRUE, USE.NAMES = FALSE)
- # # Filter out NULL entries
- # zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
- # sprintf("The SD value is: %d", args$sd)
- # # print(args$input_dirs)
- # # TODO this is better handled in a loop in case you want to compare more experiments?
- # # The input is already designed for this
- # # Read in the files for your experiment and
- # # Join the two files at a time as a function of how many inputFile
- # # list the larger file first ? in this example X2 has the larger number of genes
- # # If X1 has a larger number of genes, switch the order of X1 and X2
- # if (length(zscores_files) < 2) {
- # print("Note enough Exps to compare, skipping join")
- # stop("Exiting script")
- # }
- # if (length(zscores_files) >= 2) {
- # X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
- # X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
- # X <- join(X1, X2, by = "OrfRep")
- # OBH <- X[, order(colnames(X))] # OrderByHeader
- # headers <- select(OBH, contains("OrfRep"), matches("Gene"),
- # contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
- # headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
- # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
- # headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
- # }
- # if (length(zscores_files) >= 3) {
- # X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
- # X <- join(X, X3, by = "OrfRep")
- # headSel <- select(headers, -"Gene.1", -"Gene.2")
- # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
- # headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
- # }
- # if (length(zscores_files) >= 4) {
- # X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
- # X <- join(X, X4, by = "OrfRep")
- # headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
- # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
- # headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
- # }
- # # print(headers)
- # # headSel$contains("Z_Shift") %>% replace_na(0.001)
- # headers <- colnames(headSel)
- # # print(headers)
- # i <- 0
- # for (i in 1:length(headers)) {
- # if (grepl("Shift", headers[i])) {
- # headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
- # }
- # if (grepl("Z_lm_", headers[i])) {
- # headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
- # }
- # }
- # # 2SD option code to exclude Z_lm values less than 2 standard Deviations
- # REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
- # shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
- # # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
- # Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
- # # Using Text search grepl to relabel headers
- # REMcRdyHdr <- colnames(REMcRdy)
- # REMcRdyLabels <- "asdf"
- # shftHdr <- colnames(shiftOnly)
- # shiftLabels <- "asdf"
- # shiftLabels[1:2] <- shftHdr[1:2]
- # REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
-
- # for (i in 3:(length(shftHdr))) {
- # if (i == 3) {
- # shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
- # REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
- # }
- # if (i == 5) {
- # shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
- # REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
- # }
- # if (i == 7) {
- # shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
- # REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
- # }
- # if (grepl(".1", shftHdr[i], fixed = true)) {
- # shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
- # REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
- # }
- # if (grepl(".2", shftHdr[i], fixed = true)) {
- # shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
- # REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
- # }
- # if (grepl(".3", shftHdr[i], fixed = true)) {
- # shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
- # REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
- # }
- # }
- # for (i in 3:(length(REMcRdyLabels))) {
- # j <- as.integer(i)
- # REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
- # shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
- # }
- # colnames(shiftOnly) <- shiftLabels
- # colnames(REMcRdy) <- REMcRdyLabels
- # combI <- headSel2 # starting Template orf, Genename columns
- # # headersRemc<-colnames(REMcRdy)
- # # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
- # for (i in 3:length(colnames(REMcRdy))) {
- # combI <- cbind.data.frame(combI, shiftOnly[i])
- # combI <- cbind.data.frame(combI, REMcRdy[i])
- # }
- # Vec1 <- NA
- # Vec2 <- NA
- # Vec3 <- NA
- # Vec4 <- NA
- # Vec5 <- NA
- # Vec6 <- NA
- # Vec7 <- NA
- # Vec8 <- NA
- # if (length(REMcRdy) == 6) {
- # Vec1 <- abs(REMcRdy[, 3]) >= sd
- # Vec2 <- abs(REMcRdy[, 4]) >= sd
- # Vec3 <- abs(REMcRdy[, 5]) >= sd
- # Vec4 <- abs(REMcRdy[, 6]) >= sd
- # bolVec <- Vec1 | Vec2 | Vec3 | Vec4
- # REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
- # REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
- # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
- # shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
- # }
- # if (length(REMcRdy) == 8) {
- # Vec1 <- abs(REMcRdy[, 3]) >= sd
- # Vec2 <- abs(REMcRdy[, 4]) >= sd
- # Vec3 <- abs(REMcRdy[, 5]) >= sd
- # Vec4 <- abs(REMcRdy[, 6]) >= sd
- # Vec5 <- abs(REMcRdy[, 7]) >= sd
- # Vec6 <- abs(REMcRdy[, 8]) >= sd
- # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
- # REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
- # REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
- # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
- # shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
- # }
- # if (length(REMcRdy) == 10) {
- # Vec1 <- abs(REMcRdy[, 3]) >= sd
- # Vec2 <- abs(REMcRdy[, 4]) >= sd
- # Vec3 <- abs(REMcRdy[, 5]) >= sd
- # Vec4 <- abs(REMcRdy[, 6]) >= sd
- # Vec5 <- abs(REMcRdy[, 7]) >= sd
- # Vec6 <- abs(REMcRdy[, 8]) >= sd
- # Vec7 <- abs(REMcRdy[, 9]) >= sd
- # Vec8 <- abs(REMcRdy[, 10]) >= sd
- # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
- # REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
- # REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
- # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
- # shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
- # }
- # if (sd != 0) {
- # REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
- # shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
- # }
- # if (sd == 0) {
- # REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
- # shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
- # }
- # # R places hidden "" around the header names. The following
- # # is intended to remove those quote so that the "" do not blow up the Java REMc.
- # # Use ,quote=F in the write.csv statement to fix R output file.
- # # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
- # write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
- # write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
- # # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
- # LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
- # # print(sd)
- # LabelStd[, 4] <- as.numeric(sd)
- # write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
- # write.csv(LabelStd, file = study_info, row.names = FALSE)
|