hartman-server/qhtcp-workflow/apps/r/joinInteractExps.R

library("dplyr")

# Function to parse and set arguments
parse_arguments <- function() {
  if (interactive()) {
    args <- c(
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD",
      2,
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1",
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp2"
    )
  } else {
    args <- commandArgs(trailingOnly = TRUE)
  }

  list(
    out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
    sd = as.numeric(args[2]),
    study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
    input_dirs = args[4:length(args)]
  )
}

args <- parse_arguments()

# Create an array for the zscores files
get_zscores_files <- function(dirs) {
  files <- sapply(dirs, function(exp) {
    file_path <- file.path(exp, "zscores", "zscores_interaction.csv")
    if (file.exists(file_path)) file_path else NULL
  }, simplify = TRUE, USE.NAMES = FALSE)

  # Filter out NULL entries
  files[!sapply(files, is.null)]
}

zscores_files <- get_zscores_files(args$input_dirs)
sprintf("The SD value is: %d", args$sd)

# Ensure there are enough files to compare
if (length(zscores_files) < 2) stop("Not enough experiments to compare, exiting script")

# Function to join zscores files
join_zscores_files <- function(files) {
  joined_data <- read.csv(file = files[1], stringsAsFactors = FALSE)
  for (file in files[-1]) {
    temp_data <- read.csv(file = file, stringsAsFactors = FALSE)
    joined_data <- join(joined_data, temp_data, by = "OrfRep")
  }
  joined_data
}

# Load and join zscores files
joined_data <- join_zscores_files(zscores_files)

# Order and select columns
order_and_select_columns <- function(data) {
  ordered_data <- data[, order(colnames(data))]
  selected_headers <- select(ordered_data,
    contains("OrfRep"), matches("Gene"),
    contains("z_lm_k"), contains("z_shift_k"),
    contains("z_lm_l"), contains("z_shift_l"))
  selected_headers
}

selected_headers <- order_and_select_columns(joined_data)

# Remove redundant columns like "Gene.1"
clean_headers <- function(data, suffixes) {
  suffixes_to_remove <- paste0("Gene.", seq_len(suffixes))
  select(data, -all_of(suffixes_to_remove))
}

headSel <- clean_headers(selected_headers, length(zscores_files) - 1)
headSel2 <- clean_headers(select(joined_data, contains("OrfRep"), matches("Gene")), length(zscores_files) - 1)

# Fill NA values in Shift and Z_lm columns
fill_na_in_columns <- function(data) {
  for (header in colnames(data)) {
    if (grepl("Shift", header)) {
      data[[header]][is.na(data[[header]])] <- 0.001
    } else if (grepl("Z_lm_", header)) {
      data[[header]][is.na(data[[header]])] <- 0.0001
    }
  }
  data
}

headSel <- fill_na_in_columns(headSel)

# Filter based on standard deviation
filter_by_sd <- function(data, sd) {
  if (sd == 0) return(data)

  z_lm_cols <- select(data, contains("z_lm_"))
  filter_vector <- rowSums(abs(z_lm_cols) >= sd) > 0
  data[filter_vector, ]
}

REMcRdy <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_lm_")), args$sd)
shiftOnly <- filter_by_sd(select(headSel, contains("OrfRep"), matches("Gene"), contains("z_shift")), args$sd)

# Reorder columns to interleave Z_lm and Shift data
reorder_columns <- function(data1, data2) {
  combined_data <- data1
  for (i in 3:ncol(data1)) {
    combined_data <- cbind(combined_data, data2[i], data1[i])
  }
  combined_data
}

combI <- reorder_columns(headSel2, shiftOnly)

# Write output files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)

# Relabel headers using experiment names from StudyInfo.csv
relabel_headers <- function(headers, labels) {
  new_labels <- headers
  for (i in seq_along(headers)) {
    suffix <- sub("^.*\\.(\\d+)$", "\\1", headers[i])
    if (suffix %in% 1:3) {
      exp_name <- labels[as.numeric(suffix), 2]
      new_labels[i] <- gsub(paste0(".", suffix), paste0("_", exp_name), headers[i])
    }
  }
  new_labels
}

LabelStd <- read.csv(file = args$study_info, stringsAsFactors = FALSE)
colnames(shiftOnly) <- relabel_headers(colnames(shiftOnly), LabelStd)
colnames(REMcRdy) <- relabel_headers(colnames(REMcRdy), LabelStd)

# Save relabeled files
write.csv(REMcRdy, file.path(args$out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
write.csv(shiftOnly, file.path(args$out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)

# Save updated parameters
LabelStd[, 4] <- args$sd
write.csv(LabelStd, file.path(args$out_dir, "parameters.csv"), row.names = FALSE)
write.csv(LabelStd, file = args$study_info, row.names = FALSE)


# library("plyr")
# library("sos")
# library("dplyr")

# # Function to parse and set arguments
# parse_arguments <- function() {
#   if (interactive()) {
#     args <- c(
#       "./",                          # Default out_dir
#       "2",                           # Default SD value
#       "../Code/StudyInfo.csv"        # Default study_info path
#     )
#   } else {
#     args <- commandArgs(trailingOnly = TRUE)
#   }

#   list(
#     out_dir = normalizePath(file.path(args[1]), mustWork = FALSE),
#     sd = as.numeric(args[2]),
#     study_info = normalizePath(file.path(args[3]), mustWork = FALSE),
#     input_dirs = args[4:length(args)]
#   )
# }

# args <- parse_arguments()

# # Create an array for the zscores files
# zscores_files <- sapply(args$input_dirs, function(study) {
#   file_path <- file.path(study, "zscores", "zscores_interaction.csv")
#     if (file.exists(file_path)) file_path else NULL
#     }, simplify = TRUE, USE.NAMES = FALSE)

# # Filter out NULL entries
# zscores_files <- zscores_files[!sapply(zscores_files, is.null)]

# sprintf("The SD value is: %d", args$sd)
# # print(args$input_dirs)

# # TODO this is better handled in a loop in case you want to compare more experiments?
# # The input is already  designed for this
# # Read in the files for your experiment and
# # Join the two files at a time as a function of how many inputFile
# # list the larger file first ? in this example X2 has the larger number of genes
# # If X1 has a larger number of genes, switch the order of X1 and X2
# if (length(zscores_files) < 2) {
#   print("Note enough Exps to compare, skipping join")
#   stop("Exiting script")
# }

# if (length(zscores_files) >= 2) {
#   X1 <- read.csv(file = zscores_files[1], stringsAsFactors = FALSE)
#   X2 <- read.csv(file = zscores_files[2], stringsAsFactors = FALSE)
#   X <- join(X1, X2, by = "OrfRep")
#   OBH <- X[, order(colnames(X))]  # OrderByHeader
#   headers <- select(OBH, contains("OrfRep"), matches("Gene"),
#     contains("z_lm_k"), contains("z_shift_k"), contains("z_lm_l"), contains("z_shift_l"))
#   headSel <- select(headers, -"Gene.1") # remove "Gene.1 column
#   headSel2 <- select(OBH,  contains("OrfRep"), matches("Gene")) # frame for interleaving Z_lm with Shift colums
#   headSel2 <- select(headSel2, -"Gene.1") # remove "Gene.1 column  # frame for interleaving Z_lm with Shift colums
# }

# if (length(zscores_files) >= 3) {
#   X3 <- read.csv(file = zscores_files[3], stringsAsFactors = FALSE)
#   X <- join(X, X3, by = "OrfRep")
#   headSel <- select(headers, -"Gene.1", -"Gene.2")
#   headSel2 <- select(OBH,  contains("OrfRep"), matches("Gene"))
#   headSel2 <- select(headSel2, -"Gene.1", -"Gene.2")
# }

# if (length(zscores_files) >= 4) {
#   X4 <- read.csv(file = zscores_files[4], stringsAsFactors = FALSE)
#   X <- join(X, X4, by = "OrfRep")
#   headSel <- select(headers, -"Gene.1", -"Gene.2", -"Gene.3")
#   headSel2 <- select(OBH,  contains("OrfRep"), matches("Gene"))
#   headSel2 <- select(headSel2, -"Gene.1", -"Gene.2", -"Gene.3")
# }

# # print(headers)
# # headSel$contains("Z_Shift") %>% replace_na(0.001)
# headers <- colnames(headSel)
# # print(headers)
# i <- 0
# for (i in 1:length(headers)) {
#   if (grepl("Shift", headers[i])) {
#     headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.001
#   }
#   if (grepl("Z_lm_", headers[i])) {
#     headSel[headers[i]][is.na(headSel[headers[i]])] <- 0.0001
#   }
# }

# # 2SD option code to exclude Z_lm values less than 2 standard Deviations
# REMcRdy <- select(headSel,  contains("OrfRep"), matches("Gene"), contains("z_lm_"))
# shiftOnly <- select(headSel,  contains("OrfRep"), matches("Gene"), contains("z_shift"))

# # Code to replace the numeric (.1 .2 .3) headers with experiment names from StudyInfo.txt
# Labels <- read.csv(file = study_info, stringsAsFactors = FALSE, sep = ",")

# # Using Text search grepl to relabel headers
# REMcRdyHdr <- colnames(REMcRdy)
# REMcRdyLabels <- "asdf"
# shftHdr <- colnames(shiftOnly)
# shiftLabels <- "asdf"
# shiftLabels[1:2] <- shftHdr[1:2]
# REMcRdyLabels[1:2] <- REMcRdyHdr[1:2]

# for (i in 3:(length(shftHdr))) {
#   if (i == 3) {
#     shiftLabels[3] <- paste0(Labels[1, 2], ".", shftHdr[3])
#     REMcRdyLabels[3] <- paste0(Labels[1, 2], ".", REMcRdyHdr[3])
#   }
#   if (i == 5) {
#     shiftLabels[5] <- paste0(Labels[1, 2], ".", shftHdr[5])
#     REMcRdyLabels[5] <- paste0(Labels[1, 2], ".", REMcRdyHdr[5])
#   }
#   if (i == 7) {
#     shiftLabels[7] <- paste0(Labels[1, 2], ".", shftHdr[7])
#     REMcRdyLabels[7] <- paste0(Labels[1, 2], ".", REMcRdyHdr[7])
#   }
#   if (grepl(".1", shftHdr[i], fixed = true)) {
#     shiftLabels[i] <- paste0(Labels[2, 2], ".", shftHdr[i])
#     REMcRdyLabels[i] <- paste0(Labels[2, 2], ".", REMcRdyHdr[i])
#   }
#   if (grepl(".2", shftHdr[i], fixed = true)) {
#     shiftLabels[i] < -paste0(Labels[3, 2], ".", shftHdr[i])
#     REMcRdyLabels[i] <- paste0(Labels[3, 2], ".", REMcRdyHdr[i])
#   }
#   if (grepl(".3", shftHdr[i], fixed = true)) {
#     shiftLabels[i] <- paste0(Labels[4, 2], ".", shftHdr[i])
#     REMcRdyLabels[i] <- paste0(Labels[4, 2], ".", REMcRdyHdr[i])
#   }
# }

# for (i in 3:(length(REMcRdyLabels))) {
#   j <- as.integer(i)
#   REMcRdyLabels[j] <- gsub("[.]", "_", REMcRdyLabels[j])
#   shiftLabels[j] <- gsub("[.]", "_", shiftLabels[j])
# }

# colnames(shiftOnly) <- shiftLabels
# colnames(REMcRdy) <- REMcRdyLabels

# combI <- headSel2 # starting Template orf, Genename columns

# # headersRemc<-colnames(REMcRdy)
# # Reorder columns to produce an interleaved set of Z_lm and Shift data for all the cpps.
# for (i in 3:length(colnames(REMcRdy))) {
#   combI <- cbind.data.frame(combI, shiftOnly[i])
#   combI <- cbind.data.frame(combI, REMcRdy[i])
# }

# Vec1 <- NA
# Vec2 <- NA
# Vec3 <- NA
# Vec4 <- NA
# Vec5 <- NA
# Vec6 <- NA
# Vec7 <- NA
# Vec8 <- NA

# if (length(REMcRdy) == 6) {
#   Vec1 <- abs(REMcRdy[, 3]) >= sd
#   Vec2 <- abs(REMcRdy[, 4]) >= sd
#   Vec3 <- abs(REMcRdy[, 5]) >= sd
#   Vec4 <- abs(REMcRdy[, 6]) >= sd
#   bolVec <- Vec1 | Vec2 | Vec3 | Vec4
#   REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
#   REMcRdyGT2[, 3:6] <- REMcRdy[bolVec, 3:6]
#   shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
#   shiftOnlyGT2[, 3:6] <- shiftOnly[bolVec, 3:6]
# }

# if (length(REMcRdy) == 8) {
#   Vec1 <- abs(REMcRdy[, 3]) >= sd
#   Vec2 <- abs(REMcRdy[, 4]) >= sd
#   Vec3 <- abs(REMcRdy[, 5]) >= sd
#   Vec4 <- abs(REMcRdy[, 6]) >= sd
#   Vec5 <- abs(REMcRdy[, 7]) >= sd
#   Vec6 <- abs(REMcRdy[, 8]) >= sd
#   bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6
#   REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
#   REMcRdyGT2[, 3:8] <- REMcRdy[bolVec, 3:8]
#   shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
#   shiftOnlyGT2[, 3:8] <- shiftOnly[bolVec, 3:8]
# }

# if (length(REMcRdy) == 10) {
#   Vec1 <- abs(REMcRdy[, 3]) >= sd
#   Vec2 <- abs(REMcRdy[, 4]) >= sd
#   Vec3 <- abs(REMcRdy[, 5]) >= sd
#   Vec4 <- abs(REMcRdy[, 6]) >= sd
#   Vec5 <- abs(REMcRdy[, 7]) >= sd
#   Vec6 <- abs(REMcRdy[, 8]) >= sd
#   Vec7 <- abs(REMcRdy[, 9]) >= sd
#   Vec8 <- abs(REMcRdy[, 10]) >= sd
#   bolVec <- Vec1 | Vec2 | Vec3 | Vec4 | Vec5 | Vec6 | Vec7 | Vec8
#   REMcRdyGT2 <- REMcRdy[bolVec, 1:2]
#   REMcRdyGT2[, 3:10] <- REMcRdy[bolVec, 3:10]
#   shiftOnlyGT2 <- shiftOnly[bolVec, 1:2]
#   shiftOnlyGT2[, 3:10] <- shiftOnly[bolVec, 3:10]
# }

# if (sd != 0) {
#   REMcRdy <- REMcRdyGT2  # [,2:length(REMcRdyGT2)]
#   shiftOnly <- shiftOnlyGT2  # [,2:length(shiftOnlyGT2)]
# }

# if (sd == 0) {
#   REMcRdy <- REMcRdy # [,2:length(REMcRdy)]
#   shiftOnly <- shiftOnly # [,2:length(shiftOnly)]
# }

# # R places hidden "" around the header names. The following
# # is intended to remove those quote so that the "" do not blow up the Java REMc.
# # Use ,quote=F in the write.csv statement to fix R output file.
# # write.csv(combI,file.path(out_dir,"CombinedKLzscores.csv"), row.names = FALSE)
# write.csv(REMcRdy, file.path(out_dir, "REMcRdy_lm_only.csv"), row.names = FALSE, quote = FALSE)
# write.csv(shiftOnly, file.path(out_dir, "Shift_only.csv"), row.names = FALSE, quote = FALSE)
# # LabelStd <- read.table(file="./parameters.csv",stringsAsFactors = FALSE,sep = ",")

# LabelStd <- read.csv(file = study_info, stringsAsFactors = FALSE)
# # print(sd)
# LabelStd[, 4] <- as.numeric(sd)
# write.csv(LabelStd, file = file.path(out_dir, "parameters.csv"), row.names = FALSE)
# write.csv(LabelStd, file = study_info, row.names = FALSE)