library("dplyr") # Function to parse and set arguments parse_arguments <- function() { if (interactive()) { args <- c( "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD", 2, "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv", "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1", "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2" ) } else { args <- commandArgs(trailingOnly = TRUE) } list( out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), sd = as.numeric(args[2]), study_info = normalizePath(file.path(args[3]), mustWork = FALSE), input_dirs = args[4:length(args)] ) } args <- parse_arguments() # Create an array for the zscores files get_zscores_files <- function(dirs) { files <- sapply(dirs, function(exp) { file_path <- file.path(exp, "zscores", "zscores_interaction.csv") if (file.exists(file_path)) file_path else NULL }, simplify = TRUE, USE.NAMES = FALSE) # Filter out NULL entries files[!sapply(files, is.null)] } zscores_files <- get_zscores_files(args$input_dirs) sprintf("The SD value is: %d", args$sd) # Ensure there are enough files to compare if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script") # Function to join zscores files join_zscores_files <- function(files) { joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE) for (file in files[-1]) { temp_data <- read.csv(file = file, stringsAsFactors = FALSE) joined_data <- join(joined_data, temp_data, by = "OrfRep") } joined_data } # Load and join zscores files joined_data <- join_zscores_files(zscores_files) # Order and select columns order_and_select_columns <- function(data) { ordered_data <- data[, order(colnames(data))] selected_headers <- select(ordered_data, contains("OrfRep"), matches("Gene"), contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l")) selected_headers } selected_headers <- order_and_select_columns(joined_data) # Remove redundant columns like "Gene.1" clean_headers <- function(data, suffixes) { suffixes_to_remove <- paste0("Gene.", seq_len(suffixes)) select(data, -all_of(suffixes_to_remove)) } headSel <- clean_headers(selected_headers, length(zscores_files) - 1) headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1) # Fill NA values in Shift and Z_lm columns fill_na_in_columns <- function(data) { for (header in colnames(data)) { if (grepl("Shift", header)) { data[[header]][is.na(data[[header]])] <- 0.001 } else if (grepl("Z_lm_", header)) { data[[header]][is.na(data[[header]])] <- 0.0001 } } data } headSel <- fill_na_in_columns(headSel) # Filter based on standard deviation filter_by_sd <- function(data, sd) { if (sd == 0) return(data) z_lm_cols <- select(data, contains("z_lm_")) filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0 data[filter_vector, ] } REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd) shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd) # Reorder columns to interleave Z_lm and Shift data reorder_columns <- function(data1, data2) { combined_data <- data1 for (i in 3:ncol(data1)) { combined_data <- cbind(combined_data, data2[i], data1[i]) } combined_data } combI <- reorder_columns(headSel2, shiftOnly) # Write output files write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) # Relabel headers using experiment names from StudyInfo.csv relabel_headers <- function(headers, labels) { new_labels <- headers for (i in seq_along(headers)) { suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i]) if (suffix %in% 1:3) { exp_name <- labels[as.numeric(suffix), 2] new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i]) } } new_labels } LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE) colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd) colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd) # Save relabeled files write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) # Save updated parameters LabelStd[, 4] <- args$sd write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE) write.csv(LabelStd, file = args$study_info, row.names = FALSE) # library("plyr") # library("sos") # library("dplyr") # # Function to parse and set arguments # parse_arguments <- function() { # if (interactive()) { # args <- c( # "./", # Default out_dir # "2", # Default SD value # "../Code/StudyInfo.csv" # Default study_info path # ) # } else { # args <- commandArgs(trailingOnly = TRUE) # } # list( # out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), # sd = as.numeric(args[2]), # study_info = normalizePath(file.path(args[3]), mustWork = FALSE), # input_dirs = args[4:length(args)] # ) # } # args <- parse_arguments() # # Create an array for the zscores files # zscores_files <- sapply(args$input_dirs, function(study) { # file_path <- file.path(study, "zscores", "zscores_interaction.csv") # if (file.exists(file_path)) file_path else NULL # }, simplify = TRUE, USE.NAMES = FALSE) # # Filter out NULL entries # zscores_files <- zscores_files[!sapply(zscores_files, is.null)] # sprintf("The SD value is: %d", args$sd) # # print(args$input_dirs) # # TODO this is better handled in a loop in case you want to compare more experiments? # # The input is already designed for this # # Read in the files for your experiment and # # Join the two files at a time as a function of how many inputFile # # list the larger file first ? in this example X2 has the larger number of genes # # If X1 has a larger number of genes, switch the order of X1 and X2 # if (length(zscores_files) < 2) { # print("Note enough Exps to compare, skipping join") # stop("Exiting script") # } # if (length(zscores_files) >= 2) { # X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE) # X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE) # X <- join(X1, X2, by = "OrfRep") # OBH <- X[, order(colnames(X))] # OrderByHeader # headers <- select(OBH, contains("OrfRep"), matches("Gene"), # contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l")) # headSel <- select(headers, -"Gene.1") # remove "Gene.1 column # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums # headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums # } # if (length(zscores_files) >= 3) { # X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE) # X <- join(X, X3, by = "OrfRep") # headSel <- select(headers, -"Gene.1", -"Gene.2") # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # headSel2 <- select(headSel2, -"Gene.1", -"Gene.2") # } # if (length(zscores_files) >= 4) { # X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE) # X <- join(X, X4, by = "OrfRep") # headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3") # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3") # } # # print(headers) # # headSel$contains("Z_Shift") %>% replace_na(0.001) # headers <- colnames(headSel) # # print(headers) # i <- 0 # for (i in 1:length(headers)) { # if (grepl("Shift", headers[i])) { # headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001 # } # if (grepl("Z_lm_", headers[i])) { # headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001 # } # } # # 2SD option code to exclude Z_lm values less than 2 standard Deviations # REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")) # shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")) # # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt # Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",") # # Using Text search grepl to relabel headers # REMcRdyHdr <- colnames(REMcRdy) # REMcRdyLabels <- "asdf" # shftHdr <- colnames(shiftOnly) # shiftLabels <- "asdf" # shiftLabels[1:2] <- shftHdr[1:2] # REMcRdyLabels[1:2] <- REMcRdyHdr[1:2] # for (i in 3:(length(shftHdr))) { # if (i == 3) { # shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3]) # REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3]) # } # if (i == 5) { # shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5]) # REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5]) # } # if (i == 7) { # shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7]) # REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7]) # } # if (grepl(".1", shftHdr[i], fixed = true)) { # shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i]) # REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i]) # } # if (grepl(".2", shftHdr[i], fixed = true)) { # shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i]) # REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i]) # } # if (grepl(".3", shftHdr[i], fixed = true)) { # shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i]) # REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i]) # } # } # for (i in 3:(length(REMcRdyLabels))) { # j <- as.integer(i) # REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j]) # shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j]) # } # colnames(shiftOnly) <- shiftLabels # colnames(REMcRdy) <- REMcRdyLabels # combI <- headSel2 # starting Template orf, Genename columns # # headersRemc<-colnames(REMcRdy) # # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps. # for (i in 3:length(colnames(REMcRdy))) { # combI <- cbind.data.frame(combI, shiftOnly[i]) # combI <- cbind.data.frame(combI, REMcRdy[i]) # } # Vec1 <- NA # Vec2 <- NA # Vec3 <- NA # Vec4 <- NA # Vec5 <- NA # Vec6 <- NA # Vec7 <- NA # Vec8 <- NA # if (length(REMcRdy) == 6) { # Vec1 <- abs(REMcRdy[, 3]) >= sd # Vec2 <- abs(REMcRdy[, 4]) >= sd # Vec3 <- abs(REMcRdy[, 5]) >= sd # Vec4 <- abs(REMcRdy[, 6]) >= sd # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 # REMcRdyGT2 <- REMcRdy[bolVec, 1:2] # REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6] # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] # shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6] # } # if (length(REMcRdy) == 8) { # Vec1 <- abs(REMcRdy[, 3]) >= sd # Vec2 <- abs(REMcRdy[, 4]) >= sd # Vec3 <- abs(REMcRdy[, 5]) >= sd # Vec4 <- abs(REMcRdy[, 6]) >= sd # Vec5 <- abs(REMcRdy[, 7]) >= sd # Vec6 <- abs(REMcRdy[, 8]) >= sd # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 # REMcRdyGT2 <- REMcRdy[bolVec, 1:2] # REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8] # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] # shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8] # } # if (length(REMcRdy) == 10) { # Vec1 <- abs(REMcRdy[, 3]) >= sd # Vec2 <- abs(REMcRdy[, 4]) >= sd # Vec3 <- abs(REMcRdy[, 5]) >= sd # Vec4 <- abs(REMcRdy[, 6]) >= sd # Vec5 <- abs(REMcRdy[, 7]) >= sd # Vec6 <- abs(REMcRdy[, 8]) >= sd # Vec7 <- abs(REMcRdy[, 9]) >= sd # Vec8 <- abs(REMcRdy[, 10]) >= sd # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8 # REMcRdyGT2 <- REMcRdy[bolVec, 1:2] # REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10] # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] # shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10] # } # if (sd != 0) { # REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)] # shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)] # } # if (sd == 0) { # REMcRdy <- REMcRdy # [,2:length(REMcRdy)] # shiftOnly <- shiftOnly # [,2:length(shiftOnly)] # } # # R places hidden "" around the header names. The following # # is intended to remove those quote so that the "" do not blow up the Java REMc. # # Use ,quote=F in the write.csv statement to fix R output file. # # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE) # write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) # write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) # # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",") # LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE) # # print(sd) # LabelStd[, 4] <- as.numeric(sd) # write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE) # write.csv(LabelStd, file = study_info, row.names = FALSE)