#!/usr/bin/env Rscript # JoinInteractExps.R library(plyr) library(sos) library(dplyr) args <- commandArgs(TRUE) # Set output dir if (length(args) >= 1) { outDir <- file.path(args[1]) } else { outDir <- "./" # for legacy workflow } # Set sd value if (length(args) >= 2) { sd <- args[2] } else { sd <- 2 # default value } print(paste("SD=", sd)) # Set studyInfo file if (length(args) >= 3) { studyInfo <- file.path(args[3]) } else { studyInfo <- "../Code/StudyInfo.csv" # for legacy workflow } studies <- args[3:length(args)] inputFiles <- c() for (study in 1:length(studies)) { zsFile <- file.path(study, "zscores", "zscores_interaction.csv") if (file.exists(zsFile)) { inputFiles[study] <- zsFile } } print(length(inputFiles)) # TODO this is better handled in a loop in case you want to compare more experiments? # The input is already designed for this # Read in the files for your experiment and # Join the two files at a time as a function of how many inputFile # list the larger file first ? in this example X2 has the larger number of genes # If X1 has a larger number of genes, switch the order of X1 and X2 if (length(inputFiles) == 2) { X1 <- read.csv(file = inputFiles[1], stringsAsFactors = FALSE) X2 <- read.csv(file = inputFiles[2], stringsAsFactors = FALSE) X <- join(X1, X2, by = "OrfRep") OBH <- X[, order(colnames(X))] # OrderByHeader headSel <- select(OBH, contains("OrfRep"), matches("Gene"), contains("Z_lm_K"), contains("Z_Shift_K"), contains("Z_lm_L"), contains("Z_Shift_L")) headSel <- select(headSel, -"Gene.1") #remove "Gene.1 column headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) #Frame for interleaving Z_lm with Shift colums headSel2 <- select(headSel2, -"Gene.1") #remove "Gene.1 column #Frame for interleaving Z_lm with Shift colums } else if (length(inputFiles) == 3) { X1 <- read.csv(file = inputFiles[1], stringsAsFactors = FALSE) #exp1File,stringsAsFactors = FALSE) X2 <- read.csv(file = inputFiles[2], stringsAsFactors = FALSE) #exp2File,stringsAsFactors = FALSE) X3 <- read.csv(file = inputFiles[3], stringsAsFactors = FALSE) #exp3File,stringsAsFactors = FALSE) X <- join(X1, X2, by = "OrfRep") X <- join(X, X3, by = "OrfRep") OBH <- X[, order(colnames(X))] #OrderByHeader headSel <- select(OBH, contains("OrfRep"), matches("Gene"), contains("Z_lm_K"), contains("Z_Shift_K"), contains("Z_lm_L"), contains("Z_Shift_L")) headSel <- select(headSel, -"Gene.1", -"Gene.2") headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) headSel2 <- select(headSel2, -"Gene.1", -"Gene.2") } else if (length(inputFiles) == 4) { X1 <- read.csv(file = inputFiles[1], stringsAsFactors = FALSE) #exp1File,stringsAsFactors = FALSE) X2 <- read.csv(file = inputFiles[2], stringsAsFactors = FALSE) #exp2File,stringsAsFactors = FALSE) X3 <- read.csv(file = inputFiles[3], stringsAsFactors = FALSE) #exp3File,stringsAsFactors = FALSE) X4 <- read.csv(file = inputFiles[4], stringsAsFactors = FALSE) #exp4File,stringsAsFactors = FALSE) X <- join(X1, X2, by = "OrfRep") X <- join(X, X3, by = "OrfRep") X <- join(X, X4, by = "OrfRep") OBH <- X[, order(colnames(X))] #OrderByHeader headSel <- select(OBH, contains("OrfRep"), matches("Gene"), contains("Z_lm_K"), contains("Z_Shift_K"), contains("Z_lm_L"), contains("Z_Shift_L")) headSel <- select(headSel, -"Gene.1", -"Gene.2", -"Gene.3") headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3") } # headSel$contains("Z_Shift") %>% replace_na(0.001) headers <- colnames(headSel) i <- 0 for (i in 1:length(headers)) { if (grepl("Shift", headers[i])) { headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001 } if (grepl("Z_lm_", headers[i])) { headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001 } } # 2SD option code to exclude Z_lm values less than 2 standard Deviations REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("Z_lm_")) shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("Z_Shift")) # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt Labels <- read.csv(file = "../Code/StudyInfo.csv", stringsAsFactors = FALSE, sep = ",") # Using Text search grepl to relabel headers REMcRdyHdr <- colnames(REMcRdy) REMcRdyLabels <- "asdf" shftHdr <- colnames(shiftOnly) shiftLabels <- "asdf" shiftLabels[1:2] <- shftHdr[1:2] REMcRdyLabels[1:2] <- REMcRdyHdr[1:2] for (i in 3:(length(shftHdr))) { if (i == 3) { shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3]) REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3]) } if (i == 5) { shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5]) REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5]) } if (i == 7) { shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7]) REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7]) } if (grepl(".1", shftHdr[i], fixed = true)) { shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i]) REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i]) } if (grepl(".2", shftHdr[i], fixed = true)) { shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i]) REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i]) } if (grepl(".3", shftHdr[i], fixed = true)) { shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i]) REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i]) } } for (i in 3:(length(REMcRdyLabels))) { j <- as.integer(i) REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j]) shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j]) } colnames(shiftOnly) <- shiftLabels colnames(REMcRdy) <- REMcRdyLabels combI <- headSel2 # starting Template orf, Genename columns # headersRemc<-colnames(REMcRdy) # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps. for (i in 3:length(colnames(REMcRdy))) { combI <- cbind.data.frame(combI, shiftOnly[i]) combI <- cbind.data.frame(combI, REMcRdy[i]) } Vec1 <- NA Vec2 <- NA Vec3 <- NA Vec4 <- NA Vec5 <- NA Vec6 <- NA Vec7 <- NA Vec8 <- NA if (length(REMcRdy) == 6) { Vec1 <- abs(REMcRdy[, 3]) >= std Vec2 <- abs(REMcRdy[, 4]) >= std Vec3 <- abs(REMcRdy[, 5]) >= std Vec4 <- abs(REMcRdy[, 6]) >= std bolVec <- Vec1 | Vec2 | Vec3 | Vec4 REMcRdyGT2 <- REMcRdy[bolVec, 1:2] REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6] shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6] } if (length(REMcRdy) == 8) { Vec1 <- abs(REMcRdy[, 3]) >= std Vec2 <- abs(REMcRdy[, 4]) >= std Vec3 <- abs(REMcRdy[, 5]) >= std Vec4 <- abs(REMcRdy[, 6]) >= std Vec5 <- abs(REMcRdy[, 7]) >= std Vec6 <- abs(REMcRdy[, 8]) >= std bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 REMcRdyGT2 <- REMcRdy[bolVec, 1:2] REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8] shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8] } if (length(REMcRdy) == 10) { Vec1 <- abs(REMcRdy[, 3]) >= std Vec2 <- abs(REMcRdy[, 4]) >= std Vec3 <- abs(REMcRdy[, 5]) >= std Vec4 <- abs(REMcRdy[, 6]) >= std Vec5 <- abs(REMcRdy[, 7]) >= std Vec6 <- abs(REMcRdy[, 8]) >= std Vec7 <- abs(REMcRdy[, 9]) >= std Vec8 <- abs(REMcRdy[, 10]) >= std bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8 REMcRdyGT2 <- REMcRdy[bolVec, 1:2] REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10] shiftOnlyGT2 <- shiftOnly[bolVec, 1:2] shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10] } if (std != 0) { REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)] shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)] } if (std == 0) { REMcRdy <- REMcRdy # [,2:length(REMcRdy)] shiftOnly <- shiftOnly # [,2:length(shiftOnly)] } # R places hidden "" around the header names. The following # is intended to remove those quote so that the "" do not blow up the Java REMc. # Use ,quote=F in the write.csv statement to fix R output file. # write.csv(combI,file.path(outDir,"CombinedKLzscores.csv"), row.names = FALSE) write.csv(REMcRdy, file.path(outDir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE) write.csv(shiftOnly, file.path(outDir, "Shift_only.csv"), row.names = FALSE, quote = FALSE) #LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",") LabelStd <- read.csv(file = studyInfo, stringsAsFactors = FALSE) print(std) LabelStd[, 4] <- as.numeric(std) write.csv(LabelStd, file = file.path(outDir, "parameters.csv"), row.names = FALSE) write.csv(LabelStd, file = studyInfo, row.names = FALSE)