hartman-server/workflow/.old/apps/r/interactions.R

suppressMessages({
  library("ggplot2")
  library("plotly")
  library("htmlwidgets")
  library("dplyr")
  library("ggthemes")
})

# Constants for configuration
PLOT_WIDTH <- 14
PLOT_HEIGHT <- 9
BASE_SIZE <- 14
# BACKUP_SUFFIX <- paste0(".backup_", Sys.Date())

options(warn = 2, max.print = 100)

# Parse arguments
parse_arguments <- function() {
  if (interactive()) {
    args <- c(
      "1",
      "3",
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/StudyInfo.csv",
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab",
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2/results_std.txt",
      "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/Exp1/zscores"
    )
  } else {
    args <- commandArgs(trailingOnly = TRUE)
  }
  list(
    exp_number = as.numeric(args[1]),
    delta_bg_factor = as.numeric(args[2]),
    study_info_file = normalizePath(file.path(args[3]), mustWork = FALSE),
    sgd_gene_list = normalizePath(file.path(args[4]), mustWork = FALSE),
    input_file = normalizePath(file.path(args[5]), mustWork = FALSE),
    out_dir = normalizePath(file.path(args[6]), mustWork = FALSE),
    out_dir_qc = normalizePath(file.path(args[6], "qc"), mustWork = FALSE)
  )
}

args <- parse_arguments()
dir.create(args$out_dir, showWarnings = FALSE)
dir.create(args$out_dir_qc, showWarnings = FALSE)

# Define themes and scales
theme_publication <- function(base_size = BASE_SIZE, base_family = "sans") {
  theme_foundation(base_size = base_size, base_family = base_family) +
    theme(
      plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5),
      text = element_text(),
      panel.background = element_rect(colour = NA),
      plot.background = element_rect(colour = NA),
      panel.border = element_rect(colour = NA),
      axis.title = element_text(face = "bold", size = rel(1)),
      axis.title.y = element_text(angle = 90, vjust = 2),
      axis.title.x = element_text(vjust = -0.2),
      axis.line = element_line(colour = "black"),
      panel.grid.major = element_line(colour = "#f0f0f0"),
      panel.grid.minor = element_blank(),
      legend.key = element_rect(colour = NA),
      legend.position = "bottom",
      legend.direction = "horizontal",
      plot.margin = unit(c(10, 5, 5, 5), "mm"),
      strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
      strip.text = element_text(face = "bold")
    )
}

theme_publication_legend_right <- function(base_size = BASE_SIZE, base_family = "sans") {
  theme_publication(base_size, base_family) +
    theme(
      legend.position = "right",
      legend.direction = "vertical",
      legend.key.size = unit(0.5, "cm"),
      legend.spacing = unit(0, "cm"),
      legend.title = element_text(face = "italic")
    )
}

scale_fill_publication <- function(...) {
  discrete_scale("fill", "Publication", manual_pal(values = c(
    "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
    "#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
  )), ...)
}

scale_colour_publication <- function(...) {
  discrete_scale("colour", "Publication", manual_pal(values = c(
    "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506",
    "#a6cee3", "#fb9a99", "#984ea3", "#ffff33"
  )), ...)
}


# Load and preprocess data
load_and_preprocess_data <- function(input_file, genes) {
  df <- tryCatch({
    read.delim(input_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
  }, error = function(e) {
    stop("Error reading file: ", input_file, "\n", e$message)
  })

  # Filter out rows where the first column is empty or "Scan"
  df <- df[!df[[1]] %in% c("", "Scan"), ]

  # Convert specific columns to numeric
  numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
  df[numeric_columns[numeric_columns %in% colnames(df)]] <- lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)

  # Handle missing critical columns with warnings
  df$L <- if ("l" %in% colnames(df)) df$l else {warning("Missing column: l"); NA}
  df$AUC <- if ("AUC96" %in% colnames(df)) df$AUC96 else {warning("Missing column: AUC96"); NA}
  df$conc_num <- if ("Conc" %in% colnames(df)) as.numeric(gsub("[^0-9\\.]", "", df$Conc)) else NA
  df$delta_bg <- if (all(c("X1stBackgrd", "LstBackgrd") %in% colnames(df))) df$LstBackgrd - df$X1stBackgrd else {warning("Missing columns for delta_bg calculation"); NA}

  # Replace ORF IDs with Gene Names using the SGD gene list
  df$GeneName <- vapply(df$ORF, function(orf) {
    gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
    ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
  }, character(1))

  if (nrow(df) == 0) warning("Dataframe is empty after filtering")

  return(df)
}


df <- load_and_preprocess_data(args$input_file)

delta_bg_tolerance <- mean(df$delta_bg, na.rm = TRUE) + args$delta_bg_factor * sd(df$delta_bg, na.rm = TRUE)
message("Exp", args$exp_number, ": The delta_bg_factor is: ", args$delta_bg_factor)
message("Exp", args$exp_number, ": The delta_bg_tolerance is ", delta_bg_tolerance)

df <- df %>%
  mutate(OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep)) %>%
    filter(!Gene %in% c("BLANK", "Blank", "blank"), Drug != "BMH21")

# Create plot
create_plot <- function(df, var, plot_type) {
  filtered_df <- df %>% filter(is.finite(.data[[var]]))
  p <- ggplot(filtered_df, aes(Scan, .data[[var]], color = as.factor(conc_num))) +
    ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) +
    theme_publication()

  if (plot_type == "scatter") {
    p <- p +
      geom_point(shape = 3, size = 0.2, position = "jitter") +
      stat_summary(fun = mean, geom = "point", size = 0.6) +
      stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar")
  } else {
    p <- p + geom_boxplot()
  }

  return(p)
}

# Publish plot to PDF and HTML (Plotly)
publish_plot <- function(plot, plot_path) {
  # if (file.exists(plot_path)) {
  #   file.rename(plot_path, paste0(plot_path, BACKUP_SUFFIX))
  # }

  pdf(plot_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
  print(plot)
  dev.off()

  pgg <- suppressWarnings(ggplotly(plot,
    tooltip = c("L", "K", "ORF", "Gene", "delta_bg")) %>%
      layout(legend = list(orientation = "h")))
  saveWidget(pgg, sub(".pdf$", ".html", plot_path), selfcontained = TRUE)
}

# Publish multiple plots
publish_multiple_plots <- function(df, variables, plot_type, out_dir_qc, suffix = "") {
  lapply(variables, function(var) {
    plot <- create_plot(df, var, plot_type)
    publish_plot(plot, file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")))
  })
}

# Calculate and publish summary statistics
publish_summary_stats <- function(df, variables, out_dir) {
  stats_list <- lapply(variables, function(var) {
    df %>%
      group_by(OrfRep, conc_num) %>%
      summarize(
        N = n(),
        mean_val = mean(.data[[var]], na.rm = TRUE),
        sd_val = sd(.data[[var]], na.rm = TRUE),
        se_val = sd_val / sqrt(N)
      )
  })
  summary_stats_df <- bind_rows(stats_list, .id = "variable")
  write.csv(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
}

# Calculate and publish interaction scores
publish_interaction_scores <- function(df, out_dir) {
  interaction_scores <- df %>%
    group_by(OrfRep) %>%
    summarize(
      mean_L = mean(L, na.rm = TRUE),
      mean_K = mean(K, na.rm = TRUE),
      mean_r = mean(r, na.rm = TRUE),
      mean_AUC = mean(AUC, na.rm = TRUE),
      delta_bg_mean = mean(delta_bg, na.rm = TRUE),
      delta_bg_sd = sd(delta_bg, na.rm = TRUE)
    ) %>%
    mutate(
      l_rank = rank(mean_L),
      k_rank = rank(mean_K),
      r_rank = rank(mean_r),
      auc_rank = rank(mean_AUC)
    )

  write.csv(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
  write.csv(interaction_scores %>%
    arrange(l_rank, k_rank), file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
}

# Publish z-scores
publish_zscores <- function(df, out_dir) {
  zscores <- df %>%
    mutate(
      zscore_L = scale(L, center = TRUE, scale = TRUE),
      zscore_K = scale(K, center = TRUE, scale = TRUE),
      zscore_r = scale(r, center = TRUE, scale = TRUE),
      zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
    )

  write.csv(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
}

# QC plot generation and publication
generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
  publish_multiple_plots(df, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc)

  delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
  publish_multiple_plots(delta_bg_above_tolerance, c("L", "K", "r", "AUC", "delta_bg"), "scatter", out_dir_qc, "_above_tolerance")
}

# Run QC plot generation and publication
generate_and_publish_qc(df, delta_bg_tolerance, args$out_dir_qc)

# Run summary statistics, interaction score, and z-score calculations and publications
variables <- c("L", "K", "r", "AUC", "delta_bg")
publish_summary_stats(df, variables, args$out_dir)
publish_interaction_scores(df, args$out_dir)
publish_zscores(df, args$out_dir)