#!/usr/bin/env Rscript # JoinInteractExps.R library("plyr") library("sos") library("dplyr") args <- commandArgs(TRUE) # Set output dir if (length(args) >= 1) { out_dir <- file.path(args[1]) } else { out_dir <- "./" # for legacy workflow } # Set sd value if (length(args) >= 2) { sd <- as.numeric(args[2]) } else { sd <- 2 # default value } sprintf("SD value is: %d", sd) # Set study_info file if (length(args) >= 3) { study_info <- file.path(args[3]) } else { study_info <- "../Code/StudyInfo.csv" # for legacy workflow } studies <- args[4:length(args)] print(studies) input_files <- c() for (i in seq_along(studies)) { study <- studies[i] zs_file <- file.path(study, "zscores", "zscores_interaction.csv") if (file.exists(zs_file)) { input_files[i] <- zs_file } } rm(zs_file, study) # for (var in ls()) { # print(paste(var, ":", get(var))) # } # print(input_files) # print(length(input_files)) # TODO this is better handled in a loop in case you want to compare more experiments? # The input is already designed for this # Read in the files for your experiment and # Join the two files at a time as a function of how many inputFile # list the larger file first ? in this example X2 has the larger number of genes # If X1 has a larger number of genes, switch the order of X1 and X2 if (length(input_files) < 2) { print("Note enough Exps to compare, skipping join") stop("Exiting script") } if (length(input_files) >= 2) { X1 <- read.csv(file = input_files[1], stringsAsFactors = FALSE) X2 <- read.csv(file = input_files[2], stringsAsFactors = FALSE) X <- join(X1, X2, by = "OrfRep") OBH <- X[, order(colnames(X))] # OrderByHeader headers <- select(OBH, contains("OrfRep"), matches("Gene"), contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l")) headSel <- select(headers, -"Gene.1") # remove "Gene.1 column headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums } if (length(input_files) >= 3) { X3 <- read.csv(file = input_files[3], stringsAsFactors = FALSE) X <- join(X, X3, by = "OrfRep") headSel <- select(headers, -"Gene.1", -"Gene.2") headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) headSel2 <- select(headSel2, -"Gene.1", -"Gene.2") } if (length(input_files) >= 4) { X4 <- read.csv(file = input_files[4], stringsAsFactors = FALSE) X <- join(X, X4, by = "OrfRep") headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3") headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3") } # print(headers) # headSel$contains("Z_Shift") %>% replace_na(0.001) headers <- colnames(headSel) # print(headers) i <- 0 for (i in 1:length(headers)) { if (grepl("Shift", headers[i])) { headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001 } if (grepl("Z_lm_", headers[i])) { headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001 } } # 2SD option code to exclude Z_lm values less than 2 standard Deviations REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")) shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")) # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",") # Using Text search grepl to relabel headers REMcRdyHdr <- colnames(REMcRdy) REMcRdyLabels <- "asdf" shftHdr <- colnames(shiftOnly) shiftLabels <- "asdf" shiftLabels[1:2] <- shftHdr[1:2] REMcRdyLabels[1:2] <- REMcRdyHdr[1:2] for (i in 3:(length(shftHdr))) { if (i == 3) { shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3]) REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3]) } if (i == 5) { shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5]) REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5]) } if (i == 7) { shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7]) REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7]) } if (grepl(".1", shftHdr[i], fixed = true)) { shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i]) REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i]) } if (grepl(".2", shftHdr[i], fixed = true)) { shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i]) REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i]) } if (grepl(".3", shftHdr[i], fixed = true)) { shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i]) REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i]) } } for (i in 3:(length(REMcRdyLabels))) { j <- as.integer(i) REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j]) shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j]) } colnames(shiftOnly) <- shiftLabels colnames(REMcRdy) <- REMcRdyLabels combI <- headSel2 # starting Template orf, Genename columns # headersRemc<-colnames(REMcRdy) # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps. for (i in 3:length(colnames(REMcRdy))) { combI <- cbind.data.frame(combI, shiftOnly[i]) combI <- cbind.data.frame(combI, REMcRdy[i]) } Vec1 <- NA Vec2 <- NA Vec3 <- NA Vec4 <- NA Vec5 <- NA Vec6 <- NA Vec7 <- NA Vec8 <- NA if (length(REMcRdy) == 6) { Vec1 <- abs(REMcRdy[, 3]) >= sd Vec2 <- abs(REMcRdy[, 4]) >= sd Vec3 <- abs(REMcRdy[, 5]) >= sd Vec4 <- abs(REMcRdy[, 6]) >= sd bolVec <- Vec1 | Vec2 | Vec3 | Vec4 REMcRdyGT2 <- REMcRdy[bolVec, 1:2] REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6] shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6] } if (length(REMcRdy) == 8) { Vec1 <- abs(REMcRdy[, 3]) >= sd Vec2 <- abs(REMcRdy[, 4]) >= sd Vec3 <- abs(REMcRdy[, 5]) >= sd Vec4 <- abs(REMcRdy[, 6]) >= sd Vec5 <- abs(REMcRdy[, 7]) >= sd Vec6 <- abs(REMcRdy[, 8]) >= sd bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 REMcRdyGT2 <- REMcRdy[bolVec, 1:2] REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8] shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8] } if (length(REMcRdy) == 10) { Vec1 <- abs(REMcRdy[, 3]) >= sd Vec2 <- abs(REMcRdy[, 4]) >= sd Vec3 <- abs(REMcRdy[, 5]) >= sd Vec4 <- abs(REMcRdy[, 6]) >= sd Vec5 <- abs(REMcRdy[, 7]) >= sd Vec6 <- abs(REMcRdy[, 8]) >= sd Vec7 <- abs(REMcRdy[, 9]) >= sd Vec8 <- abs(REMcRdy[, 10]) >= sd bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8 REMcRdyGT2 <- REMcRdy[bolVec, 1:2] REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10] shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10] } if (sd != 0) { REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)] shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)] } if (sd == 0) { REMcRdy <- REMcRdy # [,2:length(REMcRdy)] shiftOnly <- shiftOnly # [,2:length(shiftOnly)] } # R places hidden "" around the header names. The following # is intended to remove those quote so that the "" do not blow up the Java REMc. # Use ,quote=F in the write.csv statement to fix R output file. # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE) write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",") LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE) # print(sd) LabelStd[, 4] <- as.numeric(sd) write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE) write.csv(LabelStd, file = study_info, row.names = FALSE)