createHeatMapsHomology.R 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. #!/usr/bin/env Rscript
  2. library("RColorBrewer")
  3. library("gplots")
  4. library("tidyverse")
  5. args <- commandArgs(TRUE)
  6. # Define the output path for the heatmaps - create this folder first - in linux terminal in the working folder use > mkdir filename_heatmaps
  7. output_path <- file.path(Args[1])
  8. # Need to give the input "finalTable.csv" file after running REMc generated by eclipse
  9. final_table <- file.path(args[2])
  10. # Give the damp_list.txt as the third argument - will color the gene names differently
  11. damps <- file.path(Args[3])
  12. damp_list <- read.delim(file = damps, header = FALSE, stringsAsFactors = FALSE)
  13. # Give the yeast human homology mapping as the fourth argument - will add the genes to the finalTable and use info for heatmaps
  14. map_file <- file.path(Args[4])
  15. mapping <- read.csv(file = map_file, stringsAsFactors = FALSE)
  16. # Read in finalTablewithShift
  17. hmapfile <- data.frame(read.csv(file = final_table, header = TRUE, sep = ",", stringsAsFactors = FALSE))
  18. # Map the finalTable to the human homolog file
  19. hmapfile_map <- hmapfile
  20. # Match using OrfRep after dropping the _1 _2 _3 _4
  21. # But need to also account for some older files have ORF as column name rather than OrfRep in finalTable file
  22. if (colnames(hmapfile_map)[2] == "OrfRep") {
  23. try(hmapfile_map$ORFMatch <- hmapfile_map$OrfRep)
  24. }
  25. if (colnames(hmapfile_map)[2] == "ORF") {
  26. try(hmapfile_map$ORFMatch <- hmapfile_map$ORF)
  27. }
  28. hmapfile_map$ORFMatch <- gsub("_1", "", x = hmapfile_map$ORFMatch)
  29. hmapfile_map$ORFMatch <- gsub("_2", "", x = hmapfile_map$ORFMatch)
  30. hmapfile_map$ORFMatch <- gsub("_3", "", x = hmapfile_map$ORFMatch)
  31. hmapfile_map$ORFMatch <- gsub("_4", "", x = hmapfile_map$ORFMatch)
  32. # Join the hmapfile using
  33. hmapfile_w_homolog <- full_join(hmapfile_map, mapping, by = c("ORFMatch" = "ensembl_gene_id"))
  34. # Remove matches that are not from the finalTable
  35. hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$likelihood) == FASLE, ]
  36. # Write csv with all info from mapping file
  37. write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologAll.csv")), row.names = FALSE)
  38. # Remove the non matches and output another mapping file - this is also one used to make heatmaps
  39. hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$external_gene_name_Human) == FALSE, ]
  40. write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologMatchesOnly.csv"), row.names = FALSE))
  41. # Add human gene name to the Gene column
  42. hmapfile_w_homolog$Gene <- paste(hmapfile_w_homolog$Gene, hmapfile_w_homolog$external_gene_name_Human, sep = "/")
  43. # Only keep the finalTable file columns and the homology info
  44. hmap_len <- dim(hmapfile)[2]
  45. hmapfile_w_homolog_remake <-
  46. cbind(hmapfile_w_homolog[, 1:hmap_len], hsapiens_homolog_orthology_type = hmapfile_w_homolog$hsapiens_homolog_orthology_type)
  47. hmapfile <- hmapfile_w_homolog_remake
  48. # Set NAs to NA
  49. hmapfile[hmapfile == -100] <- NA
  50. hmapfile[hmapfile == 100] <- NA
  51. hmapfile[hmapfile == 0.001] <- NA
  52. hmapfile[hmapfile == -0.001] <- NA
  53. # Select the number of rows based on the number of genes
  54. num_total_genes <- length(hmapfile[, 1])
  55. # Break out the cluster names so each part of the cluster origin can be accessed
  56. # Line below removed because it adds to many genes to clusters when going past 1-0-10
  57. # since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
  58. # hmapfile$cluster.origin = gsub(" ","",x = hmapfile$cluster.origin)
  59. hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
  60. hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
  61. # use tail(x,n) for accessing the outward most cluster
  62. clust_rounds <- 0
  63. for (i in 1:num_total_genes) {
  64. if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
  65. clust_rounds <- length(hmapfile$cluster.origin[[i]])
  66. }
  67. }
  68. unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
  69. unique_clusts <- unique_clusts[unique_clusts != " "]
  70. #select only the unique cluster names
  71. unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
  72. num_unique_clusts <- length(unique_clusts)
  73. # Base the color key on a statistical analysis of the L and K data
  74. # need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
  75. # scale() will calculate the mean and standard deviation of the entire vector
  76. # then "scale" each element by those values by subtracting the mean and dividing by the sd
  77. # hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
  78. # Change so that the L data is multiplied to be on the same scale as the K data
  79. KEY_MIN <- 0
  80. KEY_MAX <- 0
  81. K_MIN <- 0
  82. L_MAX <- 0
  83. KcolumnValues <- vector()
  84. LcolumnValues <- vector()
  85. for (i in 4:(length(hmapfile[1, ]) - 3)){
  86. if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
  87. KcolumnValues <- append(KcolumnValues, i)
  88. }
  89. if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
  90. LcolumnValues <- append(LcolumnValues, i)
  91. }
  92. }
  93. # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[4]
  94. # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[2]
  95. # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm = TRUE)[4]
  96. # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm = TRUE)[2]
  97. # Z scores are
  98. L_MAX <- 12
  99. K_MIN <- -12
  100. # L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
  101. # hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
  102. # if(grepl("SHIFT",colnames(hmapfile)[4],fixed = TRUE) == TRUE){
  103. # print("FOUND SHIFT VALUES")
  104. # hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
  105. # }
  106. #KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
  107. #KEY_MIN <- as.numeric(K_MIN)
  108. KEY_MAX <- as.numeric(L_MAX)
  109. KEY_MIN <- as.numeric(K_MIN)
  110. print(KEY_MIN)
  111. print(L_MAX)
  112. #print(L_Multiplier)
  113. colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6),
  114. KEY_MIN * (2 / 6), KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6),
  115. KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
  116. # print(colormapbreaks)
  117. # Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
  118. # However since also using this to amend the first part.
  119. # Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
  120. # then could just use these values and create spaces only between interaction values
  121. # possibly could get rid of redundant shift values if we don't want to view these
  122. # could we pool all the shift data/average it?
  123. if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
  124. even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
  125. # ev_repeat = rep("white",length(even_columns))
  126. # ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
  127. # middle_col <- (length(hmapfile[1,]) - 5)/2
  128. # ev_repeat[(middle_col/2)] <- "black"
  129. # print(ev_repeat)
  130. }
  131. if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
  132. even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
  133. print("NO SHIFT VALS FOUND")
  134. }
  135. # for this script only (rap tem hu script)
  136. # even_columns <- c(2,5,7,10,12,15,17)
  137. # m <- 0
  138. colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 3)])
  139. colnames(damp_list)[1] <- "ORF"
  140. hmapfile$damps <- "YKO"
  141. colnames(hmapfile)[2] <- "ORF"
  142. try(hmapfile[hmapfile$ORF %in% damp_list$ORF, ]$damps <- "YKD")
  143. # X <- X[order(X$damps,decreasing = TRUE),]
  144. hmapfile$color2 <- NA
  145. try(hmapfile[hmapfile$damps == "YKO", ]$color2 <- "black")
  146. try(hmapfile[hmapfile$damps == "YKD", ]$color2 <- "red")
  147. hmapfile$color <- NA
  148. try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_many2many", ]$color <- "#F8766D")
  149. try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2many", ]$color <- "#00BA38")
  150. try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2one", ]$color <- "#619CFF")
  151. # print(colnames_edit)
  152. for (i in 1:length(colnames_edit)) {
  153. if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
  154. colnames_edit[i] <- ""
  155. colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
  156. try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
  157. # INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
  158. # print(length(unlist(INT_store)))
  159. # if(length(unlist(INT_store)) == 4){
  160. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep = " ")
  161. # }
  162. # if(length(unlist(INT_store)) == 3){
  163. #
  164. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep = " ")
  165. # }
  166. # if(length(unlist(INT_store)) == 5){
  167. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep = " ")
  168. # }
  169. # if(length(unlist(INT_store)) == 6){
  170. # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep = " ")
  171. # }
  172. }
  173. }
  174. print(colnames_edit)
  175. # break()
  176. # colnames_edit[5] <- "TEM HLEG K"
  177. # colnames_edit[10] <- "TEM HL K"
  178. # colnames_edit[15] <- "TEM HLEG L"
  179. # colnames_edit[20] <- "TEM HL L"
  180. # Create the heatmaps
  181. for (i in 1:num_unique_clusts) {
  182. cluster <- unique_clusts[i]
  183. cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
  184. cluster_length <- length(cluster_data[, 1])
  185. if (cluster_length != 1) {
  186. X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 6)])
  187. if (cluster_length >= 2001) {
  188. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  189. pdf(file = mypath, height = 20, width = 15)
  190. heatmap.2(
  191. x = X0,
  192. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  193. dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
  194. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  195. # zlim = c(-132,132),
  196. xlab = "Type of Media", ylab = "Gene Name",
  197. # cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
  198. keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
  199. na.color = "red", col = brewer.pal(11, "PuOr"),
  200. main = cluster,
  201. # ColSideColors = ev_repeat,
  202. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  203. )
  204. # abline(v = 0.5467,col = "black")
  205. dev.off()
  206. }
  207. if (cluster_length >= 201 && cluster_length <= 2000) {
  208. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  209. pdf(file = mypath, height = 15, width = 12)
  210. heatmap.2(
  211. x = X0,
  212. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  213. dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
  214. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  215. # zlim = c(-132,132),
  216. xlab = "Type of Media", ylab = "Gene Name",
  217. cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
  218. keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
  219. na.color = "red", col = brewer.pal(11, "PuOr"),
  220. main = cluster,
  221. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  222. )
  223. # abline(v = 0.5316,col = "black")
  224. dev.off()
  225. }
  226. if (cluster_length >= 150 && cluster_length <= 200) {
  227. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  228. pdf(file = mypath, height = 12, width = 12)
  229. heatmap.2(
  230. x = X0,
  231. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  232. dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
  233. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  234. # zlim = c(-132,132),
  235. xlab = "Type of Media", ylab = "Gene Name",
  236. cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
  237. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  238. na.color = "red", col = brewer.pal(11, "PuOr"),
  239. main = cluster,
  240. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  241. )
  242. dev.off()
  243. }
  244. if (cluster_length >= 101 && cluster_length <= 149) {
  245. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  246. pdf(file = mypath, height = 12, width = 12)
  247. heatmap.2(
  248. x = X0,
  249. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  250. dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
  251. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  252. # zlim = c(-132,132),
  253. xlab = "Type of Media", ylab = "Gene Name",
  254. cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
  255. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  256. na.color = "red", col = brewer.pal(11, "PuOr"),
  257. main = cluster,
  258. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  259. )
  260. dev.off()
  261. }
  262. if (cluster_length >= 60 && cluster_length <= 100) {
  263. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  264. pdf(file = mypath, height = 12, width = 12)
  265. heatmap.2(
  266. x = X0,
  267. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  268. dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
  269. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  270. # zlim = c(-132,132),
  271. xlab = "Type of Media", ylab = "Gene Name",
  272. cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
  273. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  274. na.color = "red", col = brewer.pal(11, "PuOr"),
  275. main = cluster,
  276. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  277. )
  278. dev.off()
  279. }
  280. if (cluster_length <= 59 && cluster_length >= 30) {
  281. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  282. pdf(file = mypath, height = 9, width = 12)
  283. heatmap.2(
  284. x = X0,
  285. Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
  286. dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
  287. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  288. # zlim = c(-132,132),
  289. xlab = "Type of Media", ylab = "Gene Name",
  290. cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
  291. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  292. na.color = "red", col = brewer.pal(11, "PuOr"),
  293. main = cluster,
  294. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  295. )
  296. dev.off()
  297. }
  298. if (cluster_length <= 29) {
  299. mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
  300. pdf(file = mypath, height = 7, width = 12)
  301. heatmap.2(
  302. x = X0,
  303. Rowv = TRUE, Colv = NA,
  304. distfun = dist, hclustfun = hclust,
  305. dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
  306. breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
  307. # zlim = c(-132,132),
  308. xlab = "Type of Media", ylab = "Gene Name",
  309. cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
  310. keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
  311. na.color = "red", col = brewer.pal(11, "PuOr"),
  312. main = cluster,
  313. labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
  314. )
  315. dev.off()
  316. }
  317. }
  318. # print(paste("FINISHED", "CLUSTER",cluster,sep = " "))
  319. }