createHeatMaps.R 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. #!/usr/bin/env Rscript
  2. # This script will make heatmaps for the REMc analysis
  3. # need to give the input "finalTable.csv" file after running REMc generated by eclipse
  4. library(RColorBrewer)
  5. library(gplots)
  6. args <- commandArgs(TRUE)
  7. # Set output dir
  8. if (length(args) >= 1) {
  9. input_finalTable <- file.path(args[1])
  10. } else {
  11. input_finalTable <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
  12. }
  13. if (length(args) >= 2) {
  14. outDir <- file.path(args[2])
  15. } else {
  16. outDir <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow
  17. }
  18. hmapfile <- data.frame(read.csv(file = input_finalTable, header = TRUE, sep = ",", stringsAsFactors = FALSE))
  19. # set NAs to NA
  20. hmapfile[hmapfile == -100] <- NA
  21. hmapfile[hmapfile == 100] <- NA
  22. hmapfile[hmapfile == 0.001] <- NA
  23. hmapfile[hmapfile == -0.001] <- NA
  24. # select the number of rows based on the number of genes
  25. num_total_genes <- length(hmapfile[, 1])
  26. # Break out the cluster names so each part of the cluster origin can be accessed
  27. # line below removed because it adds to many genes to clusters when going past 1-0-10
  28. # since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
  29. # hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin)
  30. hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
  31. hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
  32. #use tail(x,n) for accessing the outward most cluster
  33. clust_rounds <- 0
  34. for (i in 1:num_total_genes) {
  35. if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
  36. clust_rounds <- length(hmapfile$cluster.origin[[i]])
  37. }
  38. }
  39. unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
  40. unique_clusts <- unique_clusts[unique_clusts != " "]
  41. # Select only the unique cluster names
  42. unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
  43. num_unique_clusts <- length(unique_clusts)
  44. # Base the color key on a statistical analysis of the L and K data
  45. # need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
  46. # scale() will calculate the mean and standard deviation of the entire vector,
  47. # then "scale" each element by those values by subtracting the mean and dividing by the sd.
  48. # hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
  49. # change so that the L data is multiplied to be on the same scale as the K data
  50. KEY_MIN <- 0
  51. KEY_MAX <- 0
  52. K_MIN <- 0
  53. L_MAX <- 0
  54. KcolumnValues <- vector()
  55. LcolumnValues <- vector()
  56. for (i in 4:(length(hmapfile[1, ]) - 2)){
  57. if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
  58. KcolumnValues <- append(KcolumnValues, i)
  59. }
  60. if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
  61. LcolumnValues <- append(LcolumnValues, i)
  62. }
  63. }
  64. # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4]
  65. # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2]
  66. # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4]
  67. # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2]
  68. # Z scores are
  69. L_MAX <- 12
  70. K_MIN <- -12
  71. # L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
  72. # hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
  73. # if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){
  74. # print("FOUND SHIFT VALUES")
  75. # hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
  76. # }
  77. # KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
  78. # KEY_MIN <- as.numeric(K_MIN)
  79. KEY_MAX <- as.numeric(L_MAX)
  80. KEY_MIN <- as.numeric(K_MIN)
  81. print(KEY_MIN)
  82. print(L_MAX)
  83. # print(L_Multiplier)
  84. colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6), KEY_MIN * (2 / 6),
  85. KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6), KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
  86. # print(colormapbreaks)
  87. # Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
  88. # However since also using this to amend the first part.
  89. # Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
  90. # then could just use these values and create spaces only between interaction values
  91. # possibly could get rid of redundant shift values if we don't want to view these
  92. # could we pool all the shift data/average it?
  93. if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
  94. even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
  95. # ev_repeat = rep("white",length(even_columns))
  96. # ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
  97. # middle_col <- (length(hmapfile[1,]) - 5)/2
  98. # ev_repeat[(middle_col/2)] <- "black"
  99. # print(ev_repeat)
  100. }
  101. if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
  102. even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
  103. print("NO SHIFT VALS FOUND")
  104. }
  105. # FOR THIS SCRIPT ONLY (rap tem hu script)
  106. # even_columns <- c(2,5,7,10,12,15,17)
  107. # m <- 0
  108. colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 2)])
  109. # print(colnames_edit)
  110. for (i in 1:length(colnames_edit)) {
  111. if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
  112. colnames_edit[i] <- ""
  113. colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
  114. try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
  115. # INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
  116. # print(length(unlist(INT_store)))
  117. # if(length(unlist(INT_store)) == 4){
  118. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ")
  119. # }
  120. # if(length(unlist(INT_store)) == 3){
  121. #
  122. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ")
  123. # }
  124. # if(length(unlist(INT_store)) == 5){
  125. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ")
  126. # }
  127. # if(length(unlist(INT_store)) == 6){
  128. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ")
  129. # }
  130. }
  131. }
  132. print(colnames_edit)
  133. # break()
  134. # colnames_edit[5] <- "TEM HLEG K"
  135. # colnames_edit[10] <- "TEM HL K"
  136. # colnames_edit[15] <- "TEM HLEG L"
  137. # colnames_edit[20] <- "TEM HL L"
  138. # Create the heatmaps
  139. for (i in 1:num_unique_clusts) {
  140. cluster <- unique_clusts[i]
  141. cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
  142. cluster_length <- length(cluster_data[, 1])
  143. if (cluster_length != 1) {
  144. X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 2)])
  145. if (cluster_length >= 2001) {
  146. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  147. pdf(file = mypath, height = 20, width = 15)
  148. heatmap.2(
  149. x = X0,
  150. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  151. dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
  152. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  153. # zlim=c(-132, 132),
  154. xlab = "Type of Media", ylab = "Gene Name",
  155. # cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
  156. keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
  157. na.color = "red", col = brewer.pal(11, "PuOr"),
  158. main = cluster,
  159. # ColSideColors=ev_repeat,
  160. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  161. )
  162. # abline(v=0.5467,col="black")
  163. dev.off()
  164. }
  165. if (cluster_length >= 201 && cluster_length <= 2000) {
  166. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  167. pdf(file = mypath, height = 15, width = 12)
  168. heatmap.2(
  169. x = X0,
  170. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  171. dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
  172. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  173. # zlim=c(-132,132),
  174. xlab = "Type of Media", ylab = "Gene Name",
  175. cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
  176. keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
  177. na.color = "red", col = brewer.pal(11, "PuOr"),
  178. main = cluster,
  179. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  180. )
  181. # abline(v=0.5316,col="black")
  182. dev.off()
  183. }
  184. if (cluster_length >= 150 && cluster_length <= 200) {
  185. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  186. pdf(file = mypath, height = 12, width = 12)
  187. heatmap.2(
  188. x = X0,
  189. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  190. dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
  191. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  192. # zlim=c(-132,132),
  193. xlab = "Type of Media", ylab = "Gene Name",
  194. cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
  195. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  196. na.color = "red", col = brewer.pal(11, "PuOr"),
  197. main = cluster,
  198. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  199. )
  200. dev.off()
  201. }
  202. if (cluster_length >= 101 && cluster_length <= 149) {
  203. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  204. pdf(file = mypath, mypath, height = 12, width = 12)
  205. heatmap.2(
  206. x = X0,
  207. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  208. dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
  209. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  210. #zlim=c(-132,132),
  211. xlab = "Type of Media", ylab = "Gene Name",
  212. cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
  213. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  214. na.color = "red", col = brewer.pal(11, "PuOr"),
  215. main = cluster,
  216. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  217. )
  218. dev.off()
  219. }
  220. if (cluster_length >= 60 && cluster_length <= 100) {
  221. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  222. pdf(file = mypath, height = 12, width = 12)
  223. heatmap.2(
  224. x = X0,
  225. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  226. dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
  227. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  228. # zlim=c(-132,132),
  229. xlab = "Type of Media", ylab = "Gene Name",
  230. cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
  231. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  232. na.color = "red", col = brewer.pal(11, "PuOr"),
  233. main = cluster,
  234. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  235. )
  236. dev.off()
  237. }
  238. if (cluster_length <= 59 && cluster_length >= 30) {
  239. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  240. pdf(file = mypath, height = 9, width = 12)
  241. heatmap.2(
  242. x = X0,
  243. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  244. dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
  245. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  246. # zlim=c(-132,132),
  247. xlab = "Type of Media", ylab = "Gene Name",
  248. cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
  249. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  250. na.color = "red", col = brewer.pal(11, "PuOr"),
  251. main = cluster,
  252. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  253. )
  254. dev.off()
  255. }
  256. if (cluster_length <= 29) {
  257. mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  258. pdf(file = mypath, height = 7, width = 12)
  259. heatmap.2(
  260. x = X0,
  261. Rowv = TRUE, Colv = NA,
  262. distfun = dist, hclustfun = hclust,
  263. dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
  264. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  265. # zlim=c(-132,132),
  266. xlab = "Type of Media", ylab = "Gene Name",
  267. cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
  268. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  269. na.color = "red", col = brewer.pal(11, "PuOr"),
  270. main = cluster,
  271. labRow = as.character(cluster_data$Gene), labCol = colnames_edit
  272. )
  273. dev.off()
  274. }
  275. }
  276. # print(paste("FINISHED", "CLUSTER",cluster,sep=" "))
  277. }