joinInteractExps.R 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. library("dplyr")
  2. # Function to parse and set arguments
  3. parse_arguments <- function() {
  4. if (interactive()) {
  5. args <- c(
  6. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
  7. 2,
  8. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
  9. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
  10. "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
  11. )
  12. } else {
  13. args <- commandArgs(trailingOnly = TRUE)
  14. }
  15. list(
  16. out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
  17. sd = as.numeric(args[2]),
  18. study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
  19. input_dirs = args[4:length(args)]
  20. )
  21. }
  22. args <- parse_arguments()
  23. # Create an array for the zscores files
  24. get_zscores_files <- function(dirs) {
  25. files <- sapply(dirs, function(exp) {
  26. file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
  27. if (file.exists(file_path)) file_path else NULL
  28. }, simplify = TRUE, USE.NAMES = FALSE)
  29. # Filter out NULL entries
  30. files[!sapply(files, is.null)]
  31. }
  32. zscores_files <- get_zscores_files(args$input_dirs)
  33. sprintf("The SD value is: %d", args$sd)
  34. # Ensure there are enough files to compare
  35. if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")
  36. # Function to join zscores files
  37. join_zscores_files <- function(files) {
  38. joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
  39. for (file in files[-1]) {
  40. temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
  41. joined_data <- join(joined_data, temp_data, by = "OrfRep")
  42. }
  43. joined_data
  44. }
  45. # Load and join zscores files
  46. joined_data <- join_zscores_files(zscores_files)
  47. # Order and select columns
  48. order_and_select_columns <- function(data) {
  49. ordered_data <- data[, order(colnames(data))]
  50. selected_headers <- select(ordered_data,
  51. contains("OrfRep"), matches("Gene"),
  52. contains("z_lm_k"), contains("z_shift_k"),
  53. contains("z_lm_l"), contains("z_shift_l"))
  54. selected_headers
  55. }
  56. selected_headers <- order_and_select_columns(joined_data)
  57. # Remove redundant columns like "Gene.1"
  58. clean_headers <- function(data, suffixes) {
  59. suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
  60. select(data, -all_of(suffixes_to_remove))
  61. }
  62. headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
  63. headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)
  64. # Fill NA values in Shift and Z_lm columns
  65. fill_na_in_columns <- function(data) {
  66. for (header in colnames(data)) {
  67. if (grepl("Shift", header)) {
  68. data[[header]][is.na(data[[header]])] <- 0.001
  69. } else if (grepl("Z_lm_", header)) {
  70. data[[header]][is.na(data[[header]])] <- 0.0001
  71. }
  72. }
  73. data
  74. }
  75. headSel <- fill_na_in_columns(headSel)
  76. # Filter based on standard deviation
  77. filter_by_sd <- function(data, sd) {
  78. if (sd == 0) return(data)
  79. z_lm_cols <- select(data, contains("z_lm_"))
  80. filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
  81. data[filter_vector, ]
  82. }
  83. REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
  84. shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)
  85. # Reorder columns to interleave Z_lm and Shift data
  86. reorder_columns <- function(data1, data2) {
  87. combined_data <- data1
  88. for (i in 3:ncol(data1)) {
  89. combined_data <- cbind(combined_data, data2[i], data1[i])
  90. }
  91. combined_data
  92. }
  93. combI <- reorder_columns(headSel2, shiftOnly)
  94. # Write output files
  95. write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
  96. write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
  97. # Relabel headers using experiment names from StudyInfo.csv
  98. relabel_headers <- function(headers, labels) {
  99. new_labels <- headers
  100. for (i in seq_along(headers)) {
  101. suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
  102. if (suffix %in% 1:3) {
  103. exp_name <- labels[as.numeric(suffix), 2]
  104. new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
  105. }
  106. }
  107. new_labels
  108. }
  109. LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
  110. colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
  111. colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)
  112. # Save relabeled files
  113. write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
  114. write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
  115. # Save updated parameters
  116. LabelStd[, 4] <- args$sd
  117. write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
  118. write.csv(LabelStd, file = args$study_info, row.names = FALSE)
  119. # library("plyr")
  120. # library("sos")
  121. # library("dplyr")
  122. # # Function to parse and set arguments
  123. # parse_arguments <- function() {
  124. # if (interactive()) {
  125. # args <- c(
  126. # "./", # Default out_dir
  127. # "2", # Default SD value
  128. # "../Code/StudyInfo.csv" # Default study_info path
  129. # )
  130. # } else {
  131. # args <- commandArgs(trailingOnly = TRUE)
  132. # }
  133. # list(
  134. # out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
  135. # sd = as.numeric(args[2]),
  136. # study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
  137. # input_dirs = args[4:length(args)]
  138. # )
  139. # }
  140. # args <- parse_arguments()
  141. # # Create an array for the zscores files
  142. # zscores_files <- sapply(args$input_dirs, function(study) {
  143. # file_path <- file.path(study, "zscores", "zscores_interaction.csv")
  144. # if (file.exists(file_path)) file_path else NULL
  145. # }, simplify = TRUE, USE.NAMES = FALSE)
  146. # # Filter out NULL entries
  147. # zscores_files <- zscores_files[!sapply(zscores_files, is.null)]
  148. # sprintf("The SD value is: %d", args$sd)
  149. # # print(args$input_dirs)
  150. # # TODO this is better handled in a loop in case you want to compare more experiments?
  151. # # The input is already designed for this
  152. # # Read in the files for your experiment and
  153. # # Join the two files at a time as a function of how many inputFile
  154. # # list the larger file first ? in this example X2 has the larger number of genes
  155. # # If X1 has a larger number of genes, switch the order of X1 and X2
  156. # if (length(zscores_files) < 2) {
  157. # print("Note enough Exps to compare, skipping join")
  158. # stop("Exiting script")
  159. # }
  160. # if (length(zscores_files) >= 2) {
  161. # X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
  162. # X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
  163. # X <- join(X1, X2, by = "OrfRep")
  164. # OBH <- X[, order(colnames(X))] # OrderByHeader
  165. # headers <- select(OBH, contains("OrfRep"), matches("Gene"),
  166. # contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
  167. # headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
  168. # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
  169. # headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column # frame for interleaving Z_lm with Shift colums
  170. # }
  171. # if (length(zscores_files) >= 3) {
  172. # X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
  173. # X <- join(X, X3, by = "OrfRep")
  174. # headSel <- select(headers, -"Gene.1", -"Gene.2")
  175. # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
  176. # headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
  177. # }
  178. # if (length(zscores_files) >= 4) {
  179. # X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
  180. # X <- join(X, X4, by = "OrfRep")
  181. # headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
  182. # headSel2 <- select(OBH, contains("OrfRep"), matches("Gene"))
  183. # headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
  184. # }
  185. # # print(headers)
  186. # # headSel$contains("Z_Shift") %>% replace_na(0.001)
  187. # headers <- colnames(headSel)
  188. # # print(headers)
  189. # i <- 0
  190. # for (i in 1:length(headers)) {
  191. # if (grepl("Shift", headers[i])) {
  192. # headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
  193. # }
  194. # if (grepl("Z_lm_", headers[i])) {
  195. # headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
  196. # }
  197. # }
  198. # # 2SD option code to exclude Z_lm values less than 2 standard Deviations
  199. # REMcRdy <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_"))
  200. # shiftOnly <- select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift"))
  201. # # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
  202. # Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")
  203. # # Using Text search grepl to relabel headers
  204. # REMcRdyHdr <- colnames(REMcRdy)
  205. # REMcRdyLabels <- "asdf"
  206. # shftHdr <- colnames(shiftOnly)
  207. # shiftLabels <- "asdf"
  208. # shiftLabels[1:2] <- shftHdr[1:2]
  209. # REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]
  210. # for (i in 3:(length(shftHdr))) {
  211. # if (i == 3) {
  212. # shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
  213. # REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
  214. # }
  215. # if (i == 5) {
  216. # shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
  217. # REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
  218. # }
  219. # if (i == 7) {
  220. # shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
  221. # REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
  222. # }
  223. # if (grepl(".1", shftHdr[i], fixed = true)) {
  224. # shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
  225. # REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
  226. # }
  227. # if (grepl(".2", shftHdr[i], fixed = true)) {
  228. # shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
  229. # REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
  230. # }
  231. # if (grepl(".3", shftHdr[i], fixed = true)) {
  232. # shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
  233. # REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
  234. # }
  235. # }
  236. # for (i in 3:(length(REMcRdyLabels))) {
  237. # j <- as.integer(i)
  238. # REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
  239. # shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
  240. # }
  241. # colnames(shiftOnly) <- shiftLabels
  242. # colnames(REMcRdy) <- REMcRdyLabels
  243. # combI <- headSel2 # starting Template orf, Genename columns
  244. # # headersRemc<-colnames(REMcRdy)
  245. # # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
  246. # for (i in 3:length(colnames(REMcRdy))) {
  247. # combI <- cbind.data.frame(combI, shiftOnly[i])
  248. # combI <- cbind.data.frame(combI, REMcRdy[i])
  249. # }
  250. # Vec1 <- NA
  251. # Vec2 <- NA
  252. # Vec3 <- NA
  253. # Vec4 <- NA
  254. # Vec5 <- NA
  255. # Vec6 <- NA
  256. # Vec7 <- NA
  257. # Vec8 <- NA
  258. # if (length(REMcRdy) == 6) {
  259. # Vec1 <- abs(REMcRdy[, 3]) >= sd
  260. # Vec2 <- abs(REMcRdy[, 4]) >= sd
  261. # Vec3 <- abs(REMcRdy[, 5]) >= sd
  262. # Vec4 <- abs(REMcRdy[, 6]) >= sd
  263. # bolVec <- Vec1 | Vec2 | Vec3 | Vec4
  264. # REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
  265. # REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
  266. # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
  267. # shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
  268. # }
  269. # if (length(REMcRdy) == 8) {
  270. # Vec1 <- abs(REMcRdy[, 3]) >= sd
  271. # Vec2 <- abs(REMcRdy[, 4]) >= sd
  272. # Vec3 <- abs(REMcRdy[, 5]) >= sd
  273. # Vec4 <- abs(REMcRdy[, 6]) >= sd
  274. # Vec5 <- abs(REMcRdy[, 7]) >= sd
  275. # Vec6 <- abs(REMcRdy[, 8]) >= sd
  276. # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
  277. # REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
  278. # REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
  279. # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
  280. # shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
  281. # }
  282. # if (length(REMcRdy) == 10) {
  283. # Vec1 <- abs(REMcRdy[, 3]) >= sd
  284. # Vec2 <- abs(REMcRdy[, 4]) >= sd
  285. # Vec3 <- abs(REMcRdy[, 5]) >= sd
  286. # Vec4 <- abs(REMcRdy[, 6]) >= sd
  287. # Vec5 <- abs(REMcRdy[, 7]) >= sd
  288. # Vec6 <- abs(REMcRdy[, 8]) >= sd
  289. # Vec7 <- abs(REMcRdy[, 9]) >= sd
  290. # Vec8 <- abs(REMcRdy[, 10]) >= sd
  291. # bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
  292. # REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
  293. # REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
  294. # shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
  295. # shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
  296. # }
  297. # if (sd != 0) {
  298. # REMcRdy <- REMcRdyGT2 # [,2:length(REMcRdyGT2)]
  299. # shiftOnly <- shiftOnlyGT2 # [,2:length(shiftOnlyGT2)]
  300. # }
  301. # if (sd == 0) {
  302. # REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
  303. # shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
  304. # }
  305. # # R places hidden "" around the header names. The following
  306. # # is intended to remove those quote so that the "" do not blow up the Java REMc.
  307. # # Use ,quote=F in the write.csv statement to fix R output file.
  308. # # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
  309. # write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
  310. # write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
  311. # # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")
  312. # LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
  313. # # print(sd)
  314. # LabelStd[, 4] <- as.numeric(sd)
  315. # write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
  316. # write.csv(LabelStd, file = study_info, row.names = FALSE)