#!/usr/bin/env Rscript # This script will make heatmaps for the REMc analysis # need to give the input "finalTable.csv" file after running REMc generated by eclipse library(RColorBrewer) library(gplots) args <- commandArgs(TRUE) # Set output dir if (length(args) >= 1) { input_finalTable <- file.path(args[1]) } else { input_finalTable <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow } if (length(args) >= 2) { outDir <- file.path(args[2]) } else { outDir <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow } hmapfile <- data.frame(read.csv(file = input_finalTable, header = TRUE, sep = ",", stringsAsFactors = FALSE)) # set NAs to NA hmapfile[hmapfile == -100] <- NA hmapfile[hmapfile == 100] <- NA hmapfile[hmapfile == 0.001] <- NA hmapfile[hmapfile == -0.001] <- NA # select the number of rows based on the number of genes num_total_genes <- length(hmapfile[, 1]) # Break out the cluster names so each part of the cluster origin can be accessed # line below removed because it adds to many genes to clusters when going past 1-0-10 # since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl. # hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin) hmapfile$cluster.origin <- gsub(";", " ;", x = hmapfile$cluster.origin) hmapfile$cluster.origin <- strsplit(hmapfile$cluster.origin, ";") #use tail(x,n) for accessing the outward most cluster clust_rounds <- 0 for (i in 1:num_total_genes) { if (length(hmapfile$cluster.origin[[i]]) > clust_rounds) { clust_rounds <- length(hmapfile$cluster.origin[[i]]) } } unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes]) unique_clusts <- unique_clusts[unique_clusts != " "] # Select only the unique cluster names unique_clusts <- sort(unique(unlist(unique_clusts, use.names = FALSE)), decreasing = FALSE) num_unique_clusts <- length(unique_clusts) # Base the color key on a statistical analysis of the L and K data # need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors) # scale() will calculate the mean and standard deviation of the entire vector, # then "scale" each element by those values by subtracting the mean and dividing by the sd. # hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)]) # change so that the L data is multiplied to be on the same scale as the K data KEY_MIN <- 0 KEY_MAX <- 0 K_MIN <- 0 L_MAX <- 0 KcolumnValues <- vector() LcolumnValues <- vector() for (i in 4:(length(hmapfile[1, ]) - 2)){ if (grepl("_Z_lm_K", colnames(hmapfile)[i], fixed = TRUE) == TRUE) { KcolumnValues <- append(KcolumnValues, i) } if (grepl("_Z_lm_L", colnames(hmapfile)[i], fixed = TRUE) == TRUE) { LcolumnValues <- append(LcolumnValues, i) } } # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4] # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2] # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4] # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2] # Z scores are L_MAX <- 12 K_MIN <- -12 # L_Multiplier <- as.numeric(abs(K_MIN/L_MAX)) # hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier # if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){ # print("FOUND SHIFT VALUES") # hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier # } # KEY_MAX <- as.numeric(L_MAX * L_Multiplier) # KEY_MIN <- as.numeric(K_MIN) KEY_MAX <- as.numeric(L_MAX) KEY_MIN <- as.numeric(K_MIN) print(KEY_MIN) print(L_MAX) # print(L_Multiplier) colormapbreaks <- c(KEY_MIN, KEY_MIN * (5 / 6), KEY_MIN * (4 / 6), KEY_MIN * (3 / 6), KEY_MIN * (2 / 6), KEY_MIN * (1 / 6), KEY_MAX * (1 / 6), KEY_MAX * (2 / 6), KEY_MAX * (3 / 6), KEY_MAX * (4 / 6), KEY_MAX * (5 / 6), KEY_MAX) # print(colormapbreaks) # Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?) # However since also using this to amend the first part. # Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers # then could just use these values and create spaces only between interaction values # possibly could get rid of redundant shift values if we don't want to view these # could we pool all the shift data/average it? if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == TRUE) { even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 2) # ev_repeat = rep("white",length(even_columns)) # ev_repeat = rep("red",(length(hmapfile[1,]) - 5)) # middle_col <- (length(hmapfile[1,]) - 5)/2 # ev_repeat[(middle_col/2)] <- "black" # print(ev_repeat) } if (grepl("Shift", colnames(hmapfile)[4], fixed = TRUE) == FALSE) { even_columns <- seq(from = 2, to = (length(hmapfile[1, ]) - 7), by = 1) print("NO SHIFT VALS FOUND") } # FOR THIS SCRIPT ONLY (rap tem hu script) # even_columns <- c(2,5,7,10,12,15,17) # m <- 0 colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1, ]) - 2)]) # print(colnames_edit) for (i in 1:length(colnames_edit)) { if (grepl("Shift", colnames_edit[i], fixed = TRUE) == TRUE) { colnames_edit[i] <- "" colnames_edit[i + 1] <- gsub(pattern = "_Z_lm_", replacement = " ", x = colnames_edit[i + 1]) try(colnames_edit[i + 1] <- gsub(pattern = "_", replacement = " ", x = colnames_edit[i + 1])) # INT_store <- strsplit(colnames_edit[i+1], "Z_lm") # print(length(unlist(INT_store))) # if(length(unlist(INT_store)) == 4){ # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ") # } # if(length(unlist(INT_store)) == 3){ # # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ") # } # if(length(unlist(INT_store)) == 5){ # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ") # } # if(length(unlist(INT_store)) == 6){ # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ") # } } } print(colnames_edit) # break() # colnames_edit[5] <- "TEM HLEG K" # colnames_edit[10] <- "TEM HL K" # colnames_edit[15] <- "TEM HLEG L" # colnames_edit[20] <- "TEM HL L" # Create the heatmaps for (i in 1:num_unique_clusts) { cluster <- unique_clusts[i] cluster_data <- subset(hmapfile, grepl(cluster, cluster.origin)) cluster_length <- length(cluster_data[, 1]) if (cluster_length != 1) { X0 <- as.matrix(cluster_data[, 4:(length(hmapfile[1, ]) - 2)]) if (cluster_length >= 2001) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, height = 20, width = 15) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, # zlim=c(-132, 132), xlab = "Type of Media", ylab = "Gene Name", # cellnote = round(X0,digits = 0), notecex = 0.1, key = TRUE, keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, # ColSideColors=ev_repeat, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) # abline(v=0.5467,col="black") dev.off() } if (cluster_length >= 201 && cluster_length <= 2000) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, height = 15, width = 12) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, # zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0, digits = 0), notecex = 0.1, key = TRUE, keysize = 0.7, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) # abline(v=0.5316,col="black") dev.off() } if (cluster_length >= 150 && cluster_length <= 200) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, height = 12, width = 12) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, # zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0, digits = 0), notecex = 0.2, key = TRUE, keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) dev.off() } if (cluster_length >= 101 && cluster_length <= 149) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, mypath, height = 12, width = 12) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE, keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) dev.off() } if (cluster_length >= 60 && cluster_length <= 100) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, height = 12, width = 12) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, # zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0, digits = 0), notecex = 0.3, key = TRUE, keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) dev.off() } if (cluster_length <= 59 && cluster_length >= 30) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, height = 9, width = 12) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, # zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE, keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) dev.off() } if (cluster_length <= 29) { mypath <- file.path(outDir, paste0("cluster_", gsub(" ", "", cluster), ".pdf")) pdf(file = mypath, height = 7, width = 12) heatmap.2( x = X0, Rowv = TRUE, Colv = NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none", breaks = colormapbreaks, symbreaks = FALSE, colsep = even_columns, sepcolor = "white", offsetCol = 0.1, # zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0, digits = 0), notecex = 0.4, key = TRUE, keysize = 1, trace = "none", density.info = c("none"), margins = c(10, 8), na.color = "red", col = brewer.pal(11, "PuOr"), main = cluster, labRow = as.character(cluster_data$Gene), labCol = colnames_edit ) dev.off() } } # print(paste("FINISHED", "CLUSTER",cluster,sep=" ")) }