123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364 |
- #!/usr/bin/env Rscript
- library("RColorBrewer")
- library("gplots")
- library("tidyverse")
- args <- commandArgs(TRUE)
- # Define the output path for the heatmaps - create this folder first - in linux terminal in the working folder use > mkdir filename_heatmaps
- output_path <- file.path(Args[1])
- # Need to give the input "finalTable.csv" file after running REMc generated by eclipse
- final_table <- file.path(args[2])
- # Give the damp_list.txt as the third argument - will color the gene names differently
- damps <- file.path(Args[3])
- damp_list <- read.delim(file = damps, header = FALSE, stringsAsFactors = FALSE)
- # Give the yeast human homology mapping as the fourth argument - will add the genes to the finalTable and use info for heatmaps
- map_file <- file.path(Args[4])
- mapping <- read.csv(file = map_file, stringsAsFactors = FALSE)
- # Read in finalTablewithShift
- hmapfile <- data.frame(read.csv(file = final_table, header = TRUE, sep = ",", stringsAsFactors = FALSE))
- # Map the finalTable to the human homolog file
- hmapfile_map <- hmapfile
- # Match using OrfRep after dropping the _1 _2 _3 _4
- # But need to also account for some older files have ORF as column name rather than OrfRep in finalTable file
- if (colnames(hmapfile_map)[2] == "OrfRep") {
- try(hmapfile_map$ORFMatch <- hmapfile_map$OrfRep)
- }
- if (colnames(hmapfile_map)[2] == "ORF") {
- try(hmapfile_map$ORFMatch <- hmapfile_map$ORF)
- }
- hmapfile_map$ORFMatch <- gsub("_1", "", x = hmapfile_map$ORFMatch)
- hmapfile_map$ORFMatch <- gsub("_2", "", x = hmapfile_map$ORFMatch)
- hmapfile_map$ORFMatch <- gsub("_3", "", x = hmapfile_map$ORFMatch)
- hmapfile_map$ORFMatch <- gsub("_4", "", x = hmapfile_map$ORFMatch)
- # Join the hmapfile using
- hmapfile_w_homolog <- full_join(hmapfile_map, mapping, by = c("ORFMatch" = "ensembl_gene_id"))
- # Remove matches that are not from the finalTable
- hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$likelihood) == FASLE, ]
- # Write csv with all info from mapping file
- write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologAll.csv")), row.names = FALSE)
- # Remove the non matches and output another mapping file - this is also one used to make heatmaps
- hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$external_gene_name_Human) == FALSE, ]
- write.csv(hmapfile_w_homolog, file.path(output_path, paste0(final_table, "_WithHomologMatchesOnly.csv"), row.names = FALSE))
- # Add human gene name to the Gene column
- hmapfile_w_homolog$Gene <- paste(hmapfile_w_homolog$Gene, hmapfile_w_homolog$external_gene_name_Human, sep = "/")
- # Only keep the finalTable file columns and the homology info
- hmap_len <- dim(hmapfile)[2]
- hmapfile_w_homolog_remake <-
- cbind(hmapfile_w_homolog[, 1:hmap_len], hsapiens_homolog_orthology_type = hmapfile_w_homolog$hsapiens_homolog_orthology_type)
- hmapfile <- hmapfile_w_homolog_remake
- # Set NAs to NA
- hmapfile[hmapfile == -100] <- NA
- hmapfile[hmapfile == 100] <- NA
- hmapfile[hmapfile == 0.001] <- NA
- hmapfile[hmapfile == -0.001] <- NA
- # Select the number of rows based on the number of genes
- num_total_genes <- length(hmapfile[, 1])
- # Break out the cluster names so each part of the cluster origin can be accessed
- # Line below removed because it adds to many genes to clusters when going past 1-0-10
- # since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
- # hmapfile$cluster.origin = gsub(" ","",x = hmapfile$cluster.origin)
- hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin)
- hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";")
- # use tail(x,n) for accessing the outward most cluster
- clust_rounds <- 0
- for (i in 1:num_total_genes) {
- if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) {
- clust_rounds <- length(hmapfile$cluster.origin[[i]])
- }
- }
- unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
- unique_clusts <- unique_clusts[unique_clusts != " "]
- #select only the unique cluster names
- unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE)
- num_unique_clusts <- length(unique_clusts)
- # Base the color key on a statistical analysis of the L and K data
- # need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
- # scale() will calculate the mean and standard deviation of the entire vector
- # then "scale" each element by those values by subtracting the mean and dividing by the sd
- # hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
- # Change so that the L data is multiplied to be on the same scale as the K data
- KEY_MIN <- 0
- KEY_MAX <- 0
- K_MIN <- 0
- L_MAX <- 0
- KcolumnValues <- vector()
- LcolumnValues <- vector()
- for (i in 4:(length(hmapfile[1, ]) - 3)){
- if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
- KcolumnValues <- append(KcolumnValues, i)
- }
- if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) {
- LcolumnValues <- append(LcolumnValues, i)
- }
- }
- # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[4]
- # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm = TRUE)[2]
- # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm = TRUE)[4]
- # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm = TRUE)[2]
- # Z scores are
- L_MAX <- 12
- K_MIN <- -12
- # L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
- # hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
- # if(grepl("SHIFT",colnames(hmapfile)[4],fixed = TRUE) == TRUE){
- # print("FOUND SHIFT VALUES")
- # hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
- # }
- #KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
- #KEY_MIN <- as.numeric(K_MIN)
- KEY_MAX <- as.numeric(L_MAX)
- KEY_MIN <- as.numeric(K_MIN)
- print(KEY_MIN)
- print(L_MAX)
- #print(L_Multiplier)
- colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6),
- KEY_MIN * (2 / 6), KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6),
- KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX)
- # print(colormapbreaks)
- # Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
- # However since also using this to amend the first part.
- # Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
- # then could just use these values and create spaces only between interaction values
- # possibly could get rid of redundant shift values if we don't want to view these
- # could we pool all the shift data/average it?
- if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) {
- even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2)
- # ev_repeat = rep("white",length(even_columns))
- # ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
- # middle_col <- (length(hmapfile[1,]) - 5)/2
- # ev_repeat[(middle_col/2)] <- "black"
- # print(ev_repeat)
- }
- if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) {
- even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1)
- print("NO SHIFT VALS FOUND")
- }
- # for this script only (rap tem hu script)
- # even_columns <- c(2,5,7,10,12,15,17)
- # m <- 0
- colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 3)])
- colnames(damp_list)[1] <- "ORF"
- hmapfile$damps <- "YKO"
- colnames(hmapfile)[2] <- "ORF"
- try(hmapfile[hmapfile$ORF %in% damp_list$ORF, ]$damps <- "YKD")
- # X <- X[order(X$damps,decreasing = TRUE),]
- hmapfile$color2 <- NA
- try(hmapfile[hmapfile$damps == "YKO", ]$color2 <- "black")
- try(hmapfile[hmapfile$damps == "YKD", ]$color2 <- "red")
- hmapfile$color <- NA
- try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_many2many", ]$color <- "#F8766D")
- try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2many", ]$color <- "#00BA38")
- try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2one", ]$color <- "#619CFF")
- # print(colnames_edit)
- for (i in 1:length(colnames_edit)) {
- if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) {
- colnames_edit[i] <- ""
- colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1])
- try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1]))
-
- # INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
- # print(length(unlist(INT_store)))
- # if(length(unlist(INT_store)) == 4){
- # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep = " ")
- # }
- # if(length(unlist(INT_store)) == 3){
- #
- # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep = " ")
- # }
- # if(length(unlist(INT_store)) == 5){
- # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep = " ")
- # }
- # if(length(unlist(INT_store)) == 6){
- # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep = " ")
- # }
- }
- }
- print(colnames_edit)
- # break()
- # colnames_edit[5] <- "TEM HLEG K"
- # colnames_edit[10] <- "TEM HL K"
- # colnames_edit[15] <- "TEM HLEG L"
- # colnames_edit[20] <- "TEM HL L"
- # Create the heatmaps
- for (i in 1:num_unique_clusts) {
- cluster <- unique_clusts[i]
- cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin))
- cluster_length <- length(cluster_data[, 1])
- if (cluster_length != 1) {
- X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 6)])
- if (cluster_length >= 2001) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 20, width = 15)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- # cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE,
- keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- # ColSideColors = ev_repeat,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- # abline(v = 0.5467,col = "black")
- dev.off()
- }
- if (cluster_length >= 201 && cluster_length <= 2000) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 15, width = 12)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE,
- keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- # abline(v = 0.5316,col = "black")
- dev.off()
- }
- if (cluster_length >= 150 && cluster_length <= 200) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 12, width = 12)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE,
- keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- dev.off()
- }
- if (cluster_length >= 101 && cluster_length <= 149) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 12, width = 12)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
- keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- dev.off()
- }
- if (cluster_length >= 60 && cluster_length <= 100) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 12, width = 12)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE,
- keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- dev.off()
- }
- if (cluster_length <= 59 && cluster_length >= 30) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 9, width = 12)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
- keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- dev.off()
- }
- if (cluster_length <= 29) {
- mypath <- file.path(output_path, paste0("cluster_", gsub(" ", "", cluster), ".pdf"))
- pdf(file = mypath, height = 7, width = 12)
- heatmap.2(
- x = X0,
- Rowv = TRUE, Colv = NA,
- distfun = dist, hclustfun = hclust,
- dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
- breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1,
- # zlim = c(-132,132),
- xlab = "Type of Media", ylab = "Gene Name",
- cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE,
- keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8),
- na.color = "red", col = brewer.pal(11, "PuOr"),
- main = cluster,
- labRow = as.character(cluster_data$Gene), labCol = colnames_edit, colRow = cluster_data$color2, RowSideColors = cluster_data$color
- )
- dev.off()
- }
- }
- # print(paste("FINISHED", "CLUSTER",cluster,sep = " "))
- }
|