#!/usr/bin/env Rscript # This script will make heatmaps for the REMc analysis # need to give the input "finalTable.csv" file after running REMc generated by eclipse library(RColorBrewer) library(gplots) args <- commandArgs(TRUE) # Set output dir if (length(args) > 1) { input_finalTable <- args[1] } else { input_finalTable <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow } if (length(args) > 2) { outDir <- args[2] } else { outDir <- "/REMcHeatmaps/REMcWithShift.csv" # for legacy workflow } hmapfile <- data.frame(read.csv(file=input_finalTable,header=TRUE,sep=",",stringsAsFactors = FALSE)) # set NAs to NA hmapfile[hmapfile == -100] <- NA hmapfile[hmapfile == 100] <- NA hmapfile[hmapfile == 0.001] <- NA hmapfile[hmapfile == -0.001] <- NA # select the number of rows based on the number of genes num_total_genes <- length(hmapfile[,1]) # Break out the cluster names so each part of the cluster origin can be accessed # line below removed because it adds to many genes to clusters when going past 1-0-10 since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl. # hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin) hmapfile$cluster.origin = gsub(";"," ;",x=hmapfile$cluster.origin) hmapfile$cluster.origin = strsplit(hmapfile$cluster.origin,';') #use tail(x,n) for accessing the outward most cluster clust_rounds <- 0 for(i in 1:num_total_genes){ if(length(hmapfile$cluster.origin[[i]]) > clust_rounds){ clust_rounds <- length(hmapfile$cluster.origin[[i]]) } } unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes]) unique_clusts <- unique_clusts[unique_clusts != " "] # Select only the unique cluster names unique_clusts <- sort(unique(unlist(unique_clusts,use.names= FALSE)),decreasing=FALSE) num_unique_clusts <- length(unique_clusts) # Base the color key on a statistical analysis of the L and K data # need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors) # scale() will calculate the mean and standard deviation of the entire vector, then "scale" each element by those values by subtracting the mean and dividing by the sd. # hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)]) # change so that the L data is multiplied to be on the same scale as the K data KEY_MIN <- 0 KEY_MAX <- 0 K_MIN <- 0 L_MAX <- 0 KcolumnValues <- vector() LcolumnValues <- vector() for(i in 4:(length(hmapfile[1,]) - 2)){ if(grepl("_Z_lm_K",colnames(hmapfile)[i],fixed=TRUE) == TRUE){ KcolumnValues <- append(KcolumnValues,i) } if(grepl("_Z_lm_L",colnames(hmapfile)[i],fixed=TRUE) == TRUE){ LcolumnValues <- append(LcolumnValues,i) } } # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4] # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2] # L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4] # K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2] # Z scores are L_MAX <- 12 K_MIN <- -12 # L_Multiplier <- as.numeric(abs(K_MIN/L_MAX)) # hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier # if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){ # print("FOUND SHIFT VALUES") # hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier # } # KEY_MAX <- as.numeric(L_MAX * L_Multiplier) # KEY_MIN <- as.numeric(K_MIN) KEY_MAX <- as.numeric(L_MAX) KEY_MIN <- as.numeric(K_MIN) print(KEY_MIN) print(L_MAX) # print(L_Multiplier) colormapbreaks <- c(KEY_MIN,KEY_MIN*(5/6),KEY_MIN*(4/6),KEY_MIN*(3/6),KEY_MIN*(2/6),KEY_MIN*(1/6),KEY_MAX*(1/6),KEY_MAX*(2/6),KEY_MAX*(3/6),KEY_MAX*(4/6),KEY_MAX*(5/6),KEY_MAX) # print(colormapbreaks) # Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?) # However since also using this to amend the first part. Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers # then could just use these values and create spaces only between interaction values - possibly could get rid of redundant shift values if we don't want to view these # could we pool all the shift data/average it? if(grepl("Shift",colnames(hmapfile)[4],fixed=TRUE) == TRUE){ even_columns <- seq(from= 2, to= (length(hmapfile[1,]) - 7),by=2) #ev_repeat = rep("white",length(even_columns)) #ev_repeat = rep("red",(length(hmapfile[1,]) - 5)) #middle_col <- (length(hmapfile[1,]) - 5)/2 #ev_repeat[(middle_col/2)] <- "black" #print(ev_repeat) } if(grepl("Shift",colnames(hmapfile)[4],fixed=TRUE) == FALSE){ even_columns <- seq(from= 2, to= (length(hmapfile[1,]) - 7),by=1) print("NO SHIFT VALS FOUND") } #FOR THIS SCRIPT ONLY (rap tem hu script) #even_columns <- c(2,5,7,10,12,15,17) #m <- 0 colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1,]) - 2)]) #print(colnames_edit) for(i in 1:length(colnames_edit)){ if(grepl("Shift",colnames_edit[i],fixed=TRUE) == TRUE){ colnames_edit[i] <- "" colnames_edit[i+1] <- gsub(pattern = "_Z_lm_",replacement = " ",x = colnames_edit[i+1]) try(colnames_edit[i+1] <- gsub(pattern = "_",replacement = " ",x = colnames_edit[i+1])) # INT_store <- strsplit(colnames_edit[i+1], "Z_lm") # print(length(unlist(INT_store))) # if(length(unlist(INT_store)) == 4){ # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ") # } # if(length(unlist(INT_store)) == 3){ # # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ") # } # if(length(unlist(INT_store)) == 5){ # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ") # } # if(length(unlist(INT_store)) == 6){ # colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ") # } } } print(colnames_edit) #break() #colnames_edit[5] <- "TEM HLEG K" #colnames_edit[10] <- "TEM HL K" #colnames_edit[15] <- "TEM HLEG L" #colnames_edit[20] <- "TEM HL L" # Create the heatmaps for(i in 1:num_unique_clusts){ cluster <- unique_clusts[i] cluster_data <- subset(hmapfile,grepl(cluster,cluster.origin)) cluster_length <- length(cluster_data[,1]) if(cluster_length != 1){ X0 <- as.matrix(cluster_data[,4:(length(hmapfile[1,]) - 2)]) if(cluster_length >= 2001){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,height=20,width=15) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none", breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor= "white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", #cellnote = round(X0,digits=0), notecex = 0.1, key=TRUE, keysize=0.7, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, #ColSideColors=ev_repeat, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) #abline(v=0.5467,col="black") dev.off() } if(cluster_length >= 201 && cluster_length <= 2000){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,height=15,width=12) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none", breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0,digits=0), notecex = 0.1, key=TRUE, keysize=0.7, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) #abline(v=0.5316,col="black") dev.off() } if(cluster_length >= 150 && cluster_length <= 200){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,height=12,width=12) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none", breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0,digits=0), notecex = 0.2, key=TRUE, keysize=1, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) dev.off() } if(cluster_length >= 101 && cluster_length <= 149){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,mypath,height=12,width=12) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none", breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0,digits=0), notecex = 0.3, key=TRUE, keysize=1, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) dev.off() } if(cluster_length >= 60 && cluster_length <= 100){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,height=12,width=12) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none", breaks=colormapbreaks,symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0,digits=0), notecex = 0.3, key=TRUE, keysize=1, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) dev.off() } if(cluster_length <= 59 && cluster_length >= 30){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,height=9,width=12) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none", breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0,digits=0), notecex = 0.4, key=TRUE, keysize=1, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) dev.off() } if(cluster_length <= 29){ mypath = file.path(outDir,paste("cluster_",gsub(" ","",cluster), ".pdf",sep="")) pdf(file=mypath,height=7,width=12) heatmap.2(x=X0, Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust, dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none", breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1, #zlim=c(-132,132), xlab = "Type of Media", ylab = "Gene Name", cellnote = round(X0,digits=0), notecex = 0.4, key=TRUE, keysize=1, trace="none", density.info=c("none"), margins=c(10, 8), na.color="red", col=brewer.pal(11,"PuOr"), main=cluster, labRow=as.character(cluster_data$Gene), labCol=colnames_edit) dev.off() } } #print(paste("FINISHED", "CLUSTER",cluster,sep=" ")) }