Commit earlier refactoring

2024-07-29 11:44:45 -04:00
parent 29cbce0754
commit 527068e683
294 changed files with 5524008 additions and 0 deletions
--- a/workflow/apps/r/createHeatMapsHomology.R
+++ b/workflow/apps/r/createHeatMapsHomology.R
@@ -0,0 +1,351 @@
+#!/usr/bin/env Rscript
+# This script will make homology heatmaps for the REMc analysis
+# This script didn't have any hard set inputs so I didn't bother
+
+library(RColorBrewer)
+library(gplots)
+library(tidyverse)
+
+args <- commandArgs(TRUE)
+# Need to give the input "finalTable.csv" file after running REMc generated by eclipse
+inputFinalTable <- args[1]
+
+# Give the DAmP_list.txt as the third argument - will color the gene names differently
+DAmPs <- Args[2]
+DAmP_list <- read.delim(file=DAmPs,header=F,stringsAsFactors = F)
+
+# Give the yeast human homology mapping as the fourth argument - will add the genes to the finalTable and use info for heatmaps
+mapFile <- Args[3]
+mapping <- read.csv(file=mapFile,stringsAsFactors = F)
+
+# Define the output path for the heatmaps - create this folder first - in linux terminal in the working folder use > mkdir filename_heatmaps
+outputPath <- Args[4]
+
+# Read in finalTablewithShift
+hmapfile <- data.frame(read.csv(file=inputFinalTable,header=TRUE,sep=",",stringsAsFactors = FALSE))
+
+# Map the finalTable to the human homolog file
+hmapfile_map <- hmapfile
+# Match using OrfRep after dropping the _1 _2 _3 _4 
+# But need to also account for some older files have ORF as column name rather than OrfRep in finalTable file
+if(colnames(hmapfile_map)[2] == "OrfRep"){
+  try(hmapfile_map$ORFMatch <- hmapfile_map$OrfRep)
+}
+if(colnames(hmapfile_map)[2] == "ORF"){
+  try(hmapfile_map$ORFMatch <- hmapfile_map$ORF)
+}
+hmapfile_map$ORFMatch <- gsub("_1","",x=hmapfile_map$ORFMatch)
+hmapfile_map$ORFMatch <- gsub("_2","",x=hmapfile_map$ORFMatch)
+hmapfile_map$ORFMatch <- gsub("_3","",x=hmapfile_map$ORFMatch)
+hmapfile_map$ORFMatch <- gsub("_4","",x=hmapfile_map$ORFMatch)
+
+# Join the hmapfile using 
+hmapfile_w_homolog <- full_join(hmapfile_map,mapping,by=c("ORFMatch"="ensembl_gene_id"))
+
+# Remove matches that are not from the finalTable
+hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$likelihood) == F,]
+
+# Write csv with all info from mapping file
+write.csv(hmapfile_w_homolog,file=paste(outputPath,"/",inputFinalTable,"_WithHomologAll.csv",sep=""),row.names = F)
+
+# Remove the non matches and output another mapping file - this is also one used to make heatmaps
+hmapfile_w_homolog <- hmapfile_w_homolog[is.na(hmapfile_w_homolog$external_gene_name_Human) == F,]
+write.csv(hmapfile_w_homolog,file=paste(outputPath,"/",inputFinalTable,"_WithHomologMatchesOnly.csv",sep=""),row.names = F)
+
+# Add human gene name to the Gene column
+hmapfile_w_homolog$Gene <- paste(hmapfile_w_homolog$Gene,hmapfile_w_homolog$external_gene_name_Human,sep="/")
+
+# Only keep the finalTable file columns and the homology info
+hmap_len <- dim(hmapfile)[2]
+
+hmapfile_w_homolog_remake <- cbind(hmapfile_w_homolog[,1:hmap_len], hsapiens_homolog_orthology_type=hmapfile_w_homolog$hsapiens_homolog_orthology_type)
+hmapfile <- hmapfile_w_homolog_remake
+
+# Set NAs to NA
+hmapfile[hmapfile == -100] <- NA
+hmapfile[hmapfile == 100] <- NA
+hmapfile[hmapfile == 0.001] <- NA
+hmapfile[hmapfile == -0.001] <- NA
+
+# Select the number of rows based on the number of genes
+num_total_genes <- length(hmapfile[,1])
+
+# break out the cluster names so each part of the cluster origin can be accessed
+# line below removed because it adds to many genes to clusters when going past 1-0-10 since it cannot differentiate between 1-0-1 and 1-0-10 when using grepl.
+# hmapfile$cluster.origin = gsub(" ","",x=hmapfile$cluster.origin)
+
+hmapfile$cluster.origin = gsub(";"," ;",x=hmapfile$cluster.origin)
+hmapfile$cluster.origin = strsplit(hmapfile$cluster.origin,';')
+# use tail(x,n) for accessing the outward most cluster
+
+clust_rounds <- 0
+for(i in 1:num_total_genes){
+  if(length(hmapfile$cluster.origin[[i]]) > clust_rounds){
+    clust_rounds <- length(hmapfile$cluster.origin[[i]])
+  }
+}
+
+unique_clusts <- unique(hmapfile$cluster.origin[1:num_total_genes])
+unique_clusts <- unique_clusts[unique_clusts != " "]
+
+#select only the unique cluster names
+unique_clusts <- sort(unique(unlist(unique_clusts,use.names= FALSE)),decreasing=FALSE)
+num_unique_clusts <- length(unique_clusts)
+
+# Base the color key on a statistical analysis of the L and K data
+# need to create "breaks" to set the color key, need to have 12 different breaks (for 11 colors)
+# scale() will calculate the mean and standard deviation of the entire vector, then "scale" each element by those values by subtracting the mean and dividing by the sd.
+
+# hmapfile[,4:(length(hmapfile[1,]) - 2)] <- scale(hmapfile[,4:(length(hmapfile[1,]) - 2)])
+
+# Change so that the L data is multiplied to be on the same scale as the K data
+
+KEY_MIN <- 0
+KEY_MAX <- 0
+K_MIN <- 0
+L_MAX <- 0
+KcolumnValues <- vector()
+LcolumnValues <- vector()
+
+for(i in 4:(length(hmapfile[1,]) - 3)){
+  if(grepl("_Z_lm_K",colnames(hmapfile)[i],fixed=TRUE) == TRUE){
+    KcolumnValues <- append(KcolumnValues,i)
+  }
+  if(grepl("_Z_lm_L",colnames(hmapfile)[i],fixed=TRUE) == TRUE){
+    LcolumnValues <- append(LcolumnValues,i)
+  }
+}
+
+# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[4]
+# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.01,.5,.99,1),na.rm=TRUE)[2]
+# L_MAX <- quantile(hmapfile[,LcolumnValues],c(0,.01,.5,.975,1),na.rm=TRUE)[4]
+# K_MIN <- quantile(hmapfile[,KcolumnValues],c(0,.025,.5,.99,1),na.rm=TRUE)[2]
+
+# Z scores are
+L_MAX <- 12
+K_MIN <- -12
+
+# L_Multiplier <- as.numeric(abs(K_MIN/L_MAX))
+# hmapfile[,LcolumnValues] <- hmapfile[,LcolumnValues] * L_Multiplier
+
+# if(grepl("SHIFT",colnames(hmapfile)[4],fixed=TRUE) == TRUE){
+#  print("FOUND SHIFT VALUES")
+#  hmapfile[,(LcolumnValues - 1)] <- hmapfile[,(LcolumnValues-1)] * L_Multiplier
+# }
+
+#KEY_MAX <- as.numeric(L_MAX * L_Multiplier)
+#KEY_MIN <- as.numeric(K_MIN)
+KEY_MAX <- as.numeric(L_MAX)
+KEY_MIN <- as.numeric(K_MIN)
+
+print(KEY_MIN)
+print(L_MAX)
+#print(L_Multiplier)
+
+colormapbreaks <- c(KEY_MIN,KEY_MIN*(5/6),KEY_MIN*(4/6),KEY_MIN*(3/6),KEY_MIN*(2/6),KEY_MIN*(1/6),KEY_MAX*(1/6),KEY_MAX*(2/6),KEY_MAX*(3/6),KEY_MAX*(4/6),KEY_MAX*(5/6),KEY_MAX)
+#print(colormapbreaks)
+
+# Probably should give a way to detect shift in case that is is not in the first row... (maybe just grepl for the whole column name?)
+# However since also using this to amend the first part. Could possibly identify all the ones that contain the word shift and then create an object containing just those numbers
+# then could just use these values and create spaces only between interaction values - possibly could get rid of redundant shift values if we don't want to view these
+# could we pool all the shift data/average it?
+if(grepl("Shift",colnames(hmapfile)[4],fixed=TRUE) == TRUE){
+  even_columns <- seq(from= 2, to= (length(hmapfile[1,]) - 7),by=2)
+  # ev_repeat = rep("white",length(even_columns))
+  # ev_repeat = rep("red",(length(hmapfile[1,]) - 5))
+  # middle_col <- (length(hmapfile[1,]) - 5)/2
+  # ev_repeat[(middle_col/2)] <- "black"
+  # print(ev_repeat)
+}
+
+if(grepl("Shift",colnames(hmapfile)[4],fixed=TRUE) == FALSE){
+  even_columns <- seq(from= 2, to= (length(hmapfile[1,]) - 7),by=1)
+  print("NO SHIFT VALS FOUND")
+}
+
+# for this script only (rap tem hu script)
+# even_columns <- c(2,5,7,10,12,15,17)
+
+# m <- 0
+colnames_edit <- as.character(colnames(hmapfile)[4:(length(hmapfile[1,]) - 3)])
+
+colnames(DAmP_list)[1] <- "ORF"
+hmapfile$DAmPs <- "YKO"
+colnames(hmapfile)[2] <- "ORF"
+try(hmapfile[hmapfile$ORF %in% DAmP_list$ORF,]$DAmPs <- "YKD")
+# X <- X[order(X$DAmPs,decreasing = TRUE),]
+hmapfile$color2 <- NA
+try(hmapfile[hmapfile$DAmPs == "YKO",]$color2 <- "black")
+try(hmapfile[hmapfile$DAmPs == "YKD",]$color2 <- "red")
+
+hmapfile$color <- NA
+try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_many2many",]$color <- "#F8766D")
+try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2many",]$color <- "#00BA38")
+try(hmapfile[hmapfile$hsapiens_homolog_orthology_type == "ortholog_one2one",]$color <- "#619CFF")
+
+# print(colnames_edit)
+for(i in 1:length(colnames_edit)){
+  if(grepl("Shift",colnames_edit[i],fixed=TRUE) == TRUE){
+    colnames_edit[i] <- ""
+    colnames_edit[i+1] <- gsub(pattern = "_Z_lm_",replacement = " ",x = colnames_edit[i+1])
+    try(colnames_edit[i+1] <- gsub(pattern = "_",replacement = " ",x = colnames_edit[i+1]))
+    
+    # INT_store <- strsplit(colnames_edit[i+1], "Z_lm")
+    # print(length(unlist(INT_store)))
+    # if(length(unlist(INT_store)) == 4){
+    #   colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],sep=" ")
+    # }
+    # if(length(unlist(INT_store)) == 3){
+    # 
+    #   colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],sep=" ")
+    # }
+    # if(length(unlist(INT_store)) == 5){
+    #   colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[3],unlist(INT_store)[4],sep=" ")
+    # }
+    # if(length(unlist(INT_store)) == 6){
+    #   colnames_edit[i+1] <- paste(unlist(INT_store)[1],unlist(INT_store)[2],unlist(INT_store)[6],sep=" ")
+    # }
+
+  }
+}
+
+print(colnames_edit)
+# break()
+# colnames_edit[5] <- "TEM HLEG K"
+# colnames_edit[10] <- "TEM HL K"
+# colnames_edit[15] <- "TEM HLEG L"
+# colnames_edit[20] <- "TEM HL L"
+
+# Create the heatmaps
+for(i in 1:num_unique_clusts){
+  cluster <- unique_clusts[i]
+  cluster_data <- subset(hmapfile,grepl(cluster,cluster.origin))
+  cluster_length <- length(cluster_data[,1])
+  if(cluster_length != 1){
+    X0 <- as.matrix(cluster_data[,4:(length(hmapfile[1,]) - 6)])
+    if(cluster_length >= 2001){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,height=20,width=15)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
+        breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor= "white", offsetCol = 0.1,
+        # zlim=c(-132,132), 
+        xlab = "Type of Media", ylab = "Gene Name",
+        # cellnote = round(X0,digits=0), notecex = 0.1, key=TRUE,
+        keysize=0.7, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster, 
+        # ColSideColors=ev_repeat,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      # abline(v=0.5467,col="black")
+      dev.off()
+    }
+    if(cluster_length >= 201 && cluster_length <= 2000){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,height=15,width=12)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
+        breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1,
+        # zlim=c(-132,132), 
+        xlab = "Type of Media", ylab = "Gene Name",
+        cellnote = round(X0,digits=0), notecex = 0.1, key=TRUE,
+        keysize=0.7, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      # abline(v=0.5316,col="black")
+      dev.off()
+    }
+    if(cluster_length >= 150 && cluster_length <= 200){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,height=12,width=12)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.1, scale = "none",
+        breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1,
+        # zlim=c(-132,132), 
+        xlab = "Type of Media", ylab = "Gene Name",
+        cellnote = round(X0,digits=0), notecex = 0.2, key=TRUE,
+        keysize=1, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      dev.off()
+    }
+    if(cluster_length >= 101 && cluster_length <= 149){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,mypath,height=12,width=12)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.2, scale = "none",
+        breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1,
+        # zlim=c(-132,132), 
+        xlab = "Type of Media", ylab = "Gene Name",
+        cellnote = round(X0,digits=0), notecex = 0.3, key=TRUE,
+        keysize=1, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      dev.off()
+    }
+    if(cluster_length >= 60 && cluster_length <= 100){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,height=12,width=12)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.4, scale = "none",
+        breaks=colormapbreaks,symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1,
+        #zlim=c(-132,132),
+        xlab = "Type of Media", ylab = "Gene Name",
+        cellnote = round(X0,digits=0), notecex = 0.3, key=TRUE,
+        keysize=1, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      dev.off()
+    }
+    if(cluster_length <= 59 && cluster_length >= 30){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,height=9,width=12)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.6, scale = "none",
+        breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1,
+        #zlim=c(-132,132), 
+        xlab = "Type of Media", ylab = "Gene Name",
+        cellnote = round(X0,digits=0), notecex = 0.4, key=TRUE,
+        keysize=1, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      dev.off()
+    }
+    if(cluster_length <= 29){
+      mypath = file.path(outputPath,paste("cluster_",gsub(" ","",cluster), ".pdf",sep=""))
+      pdf(file=mypath,height=7,width=12)
+      heatmap.2(
+        x=X0,
+        Rowv=TRUE, Colv=NA, 
+        distfun = dist, hclustfun = hclust,
+        dendrogram = "row", cexCol = 0.8, cexRow = 0.9, scale = "none",
+        breaks=colormapbreaks, symbreaks=FALSE, colsep = even_columns, sepcolor="white", offsetCol = 0.1,
+        #zlim=c(-132,132), 
+        xlab = "Type of Media", ylab = "Gene Name",
+        cellnote = round(X0,digits=0), notecex = 0.4, key=TRUE,
+        keysize=1, trace="none", density.info=c("none"), margins=c(10, 8),
+        na.color="red", col=brewer.pal(11,"PuOr"),
+        main=cluster,
+        labRow=as.character(cluster_data$Gene), labCol=colnames_edit, colRow=cluster_data$color2,RowSideColors=cluster_data$color)
+      dev.off()
+    }
+  }
+  # print(paste("FINISHED", "CLUSTER",cluster,sep=" "))
+}