Before old plotting removal

2024-09-04 02:03:23 -04:00
parent 111909914c
commit d52903f8fa
2 changed files with 327 additions and 282 deletions
--- a/workflow/apps/r/calculate_interaction_zscores5.R
+++ b/workflow/apps/r/calculate_interaction_zscores5.R
@@ -163,102 +163,6 @@ update_gene_names <- function(df, sgd_gene_list) {
  return(df)
 }

-generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
-  # Start the ggplot with just the x_var and color_var
-  plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) +
-    ggtitle(title) +
-    theme_publication()
-  
-  # If y_var is not NULL, add it to the plot aesthetics
-  if (!is.null(y_var)) {
-    plot <- plot + aes(y = !!sym(y_var))
-  }
-  
-  # Handle different plot types
-  plot <- switch(plot_type,
-    "scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
-      stat_summary(fun.data = mean_sdl, geom = "errorbar") +
-      stat_summary(fun = mean, geom = "point", size = 0.6),
-    "box" = plot + geom_boxplot(),
-    "density" = plot + geom_density(),
-    "bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity"))
-  )
-  
-  # Add optional labels if provided
-  if (!is.null(x_label)) plot <- plot + xlab(x_label)
-  if (!is.null(y_label)) plot <- plot + ylab(y_label)
-  
-  return(plot)
-}
-
-
-
-
-generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
-  plots <- list()
-  
-  if (nrow(df) == 0) {
-    message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
-    return()
-  }
-  
-  message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
-  
-  for (var in variables) {
-    scatter_plot <-
-      generate_plot(df, x_var = "scan", y_var = var, plot_type = "scatter",
-        title = paste(prefix, "Scatter Plot for", var))
-    boxplot <-
-      generate_plot(df, x_var = "scan", y_var = var, plot_type = "box",
-        title = paste(prefix, "Box Plot for", var))
-    
-    plots[[paste0(var, "_scatter")]] <- scatter_plot
-    plots[[paste0(var, "_box")]] <- boxplot
-  }
-  
-  if (include_qc) {
-    plots[["Raw_L_vs_K"]] <-
-      generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter",
-        title = "Raw L vs K before QC")
-    plots[["Delta_bg_Density"]] <-
-      generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num",
-        title = "Density plot for Delta Background by Conc All Data")
-    plots[["Delta_bg_Bar"]] <-
-      generate_plot(df, x_var = "delta_bg", plot_type = "bar",
-        title = "Bar plot for Delta Background by Conc All Data")
-  }
-  
-  save_plots(prefix, plots, output_dir)
-}
-
-# Ensure all plots are saved and printed to PDF
-save_plots <- function(file_name, plot_list, output_dir) {
-  # Save to PDF
-  pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
-  lapply(plot_list, function(plot) {
-    print(plot)
-  })
-  dev.off()
-  
-  # Save to HTML with horizontal legend orientation
-  lapply(names(plot_list), function(plot_name) {
-    message("Generating plot: ", plot_name)
-    pgg <- tryCatch({
-      suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
-        layout(legend = list(orientation = "h")))
-    }, error = function(e) {
-      message("Error in plot: ", plot_name, "\n", e)
-      return(NULL)
-    })
-    if (!is.null(pgg)) {
-      saveWidget(pgg,
-        file = file.path(output_dir,
-        paste0(file_name, "_", plot_name, ".html")),
-        selfcontained = TRUE)
-    }
-  })
-}
-
 # Process strains (deletion and reference)
 process_strains <- function(df) {
  df_strains <- data.frame()  # Initialize an empty dataframe to store results
@@ -295,7 +199,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
      max = ~max(.x, na.rm = TRUE),
      min = ~min(.x, na.rm = TRUE),
      sd = ~sd(.x, na.rm = TRUE),
-      se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1)
+      se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) # TODO why - 1?
    ), .names = "{.col}_{.fn}"))
  
  return(summary_stats)
@@ -305,48 +209,70 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_

  # Pull the background means
  print("Calculating background means")
-  l_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean)
-  k_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean)
+  L_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean)
+  K_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean)
  r_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_mean)
-  auc_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean)
+  AUC_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean)
+
+  L_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_sd)
+  K_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_sd)
+  r_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_sd)
+  AUC_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_sd)
  
-  # Calculate all necessary statistics and shifts in one step
+  # Calculate summary statistics and shifts
  print("Calculating interaction scores part 1")
  interaction_scores_all <- df %>%
    group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
    summarise(
      N = n(),
-      across(all_of(variables), list(mean = mean, sd = sd), na.rm = TRUE),
-      NG = sum(NG, na.rm = TRUE),
-      SM = sum(SM, na.rm = TRUE),
-      raw_shift_l = mean(L, na.rm = TRUE) - l_mean_bg,
-      raw_shift_k = mean(K, na.rm = TRUE) - k_mean_bg,
-      raw_shift_r = mean(r, na.rm = TRUE) - r_mean_bg,
-      raw_shift_auc = mean(AUC, na.rm = TRUE) - auc_mean_bg,
-      z_shift_l = raw_shift_l / L_sd[1],
-      z_shift_k = raw_shift_k / K_sd[1],
-      z_shift_r = raw_shift_r / r_sd[1],
-      z_shift_auc = raw_shift_auc / AUC_sd[1],
-      exp_l = l_mean_bg + raw_shift_l,
-      exp_k = k_mean_bg + raw_shift_k,
-      exp_r = r_mean_bg + raw_shift_r,
-      exp_auc = auc_mean_bg + raw_shift_auc,
-      delta_l = mean(L, na.rm = TRUE) - exp_l,
-      delta_k = mean(K, na.rm = TRUE) - exp_k,
-      delta_r = mean(r, na.rm = TRUE) - exp_r,
-      delta_auc = mean(AUC, na.rm = TRUE) - exp_auc,
-      zscore_l = delta_l / L_sd,
-      zscore_k = delta_k / K_sd,
-      zscore_r = delta_r / r_sd,
-      zscore_auc = delta_auc / AUC_sd,
-      sum_z_score_l = sum(zscore_l, na.rm = TRUE),
-      avg_zscore_l = sum_z_score_l / (length(variables) - sum(NG, na.rm = TRUE)),
-      sum_z_score_k = sum(zscore_k, na.rm = TRUE),
-      avg_zscore_k = sum_z_score_k / (length(variables) - sum(NG, na.rm = TRUE)),
-      sum_z_score_r = sum(zscore_r, na.rm = TRUE),
-      avg_zscore_r = sum_z_score_r / (length(variables) - sum(NG, na.rm = TRUE)),
-      sum_z_score_auc = sum(zscore_auc, na.rm = TRUE),
-      avg_zscore_auc = sum_z_score_auc / (length(variables) - sum(NG, na.rm = TRUE))
+      L_mean = mean(L),
+      L_median = median(L),
+      L_sd = sd(L),
+      L_se = L_sd / sqrt(N),
+      K_mean = mean(K),
+      K_median = median(K),
+      K_sd = sd(K),
+      K_se = K_sd / sqrt(N),
+      r_mean = mean(r),
+      r_median = median(r),
+      r_sd = sd(r),
+      r_se = r_sd / sqrt(N),
+      AUC_mean = mean(AUC),
+      AUC_median = median(AUC),
+      AUC_sd = sd(AUC),
+      AUC_se = AUC_sd / sqrt(N),
+      NG = sum(NG),
+      DB = sum(DB),
+      SM = sum(SM),
+      Raw_Shift_L = L_mean - L_mean_bg[[1]],
+      Raw_Shift_K = K_mean - K_mean_bg[[1]],
+      Raw_Shift_r = r_mean - r_mean_bg[[1]],
+      Raw_Shift_AUC = AUC_mean - AUC_mean_bg[[1]],
+      Z_Shift_L = Raw_Shift_L / L_sd[[0]],
+      Z_Shift_K = Raw_Shift_K / K_sd[[0]],
+      Z_Shift_r = Raw_Shift_r / r_sd[[0]],
+      Z_Shift_AUC = Raw_Shift_AUC / AUC_sd[[0]],
+      WT_l = df$L_mean,
+      WT_K = df$K_mean,
+      WT_r = df$r_mean,
+      WT_AUC = df$AUC_mean,
+      WT_sd_l = L_sd_bg[[1]],
+      WT_sd_K = K_sd_bg[[1]],
+      WT_sd_r = r_sd_bg[[1]],
+      WT_sd_AUC = AUC_sd_bg[[1]],
+      Exp_L = L_mean_bg[[1]] + Raw_Shift_L,
+      Exp_K = K_mean_bg[[1]] + Raw_Shift_K,
+      Exp_r = r_mean_bg[[1]] + Raw_Shift_r,
+      Exp_AUC = AUC_mean_bg[[1]] + Raw_Shift_AUC,
+      Delta_L = ifelse(NG == 1, mean(L) - WT_l, Delta_L),
+      Delta_L = ifelse(SM == 1, mean(L) - WT_l, Delta_L),
+      Delta_K = ifelse(NG == 1, mean(K) - WT_K, Delta_K),
+      Delta_r = ifelse(NG == 1, mean(r) - WT_r, Delta_r),
+      Delta_AUC = ifelse(NG == 1, mean(AUC) - WT_AUC, Delta_AUC),
+      Zscore_L = Delta_L / WT_sd_l,
+      Zscore_K = Delta_K / WT_sd_K,
+      Zscore_r = Delta_r / WT_sd_r,
+      Zscore_AUC = Delta_AUC / WT_sd_AUC,
    ) %>%
    ungroup()
  
@@ -355,10 +281,10 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
  interaction_scores <- interaction_scores_all %>%
    group_by(across(all_of(group_vars))) %>%
    summarise(
-      lm_l = list(lm(delta_l ~ conc_num_factor)),
-      lm_k = list(lm(delta_k ~ conc_num_factor)),
-      lm_r = list(lm(delta_r ~ conc_num_factor)),
-      lm_auc = list(lm(delta_auc ~ conc_num_factor)),
+      lm_l = list(lm(Delta_L ~ conc_num_factor)),
+      lm_k = list(lm(Delta_K ~ conc_num_factor)),
+      lm_r = list(lm(Delta_r ~ conc_num_factor)),
+      lm_auc = list(lm(Delta_AUC ~ conc_num_factor)),
      lm_Score_L = max_conc * coef(lm_l[[1]])[2] + coef(lm_l[[1]])[1],
      lm_Score_K = max_conc * coef(lm_k[[1]])[2] + coef(lm_k[[1]])[1],
      lm_Score_r = max_conc * coef(lm_r[[1]])[2] + coef(lm_r[[1]])[1],
@@ -367,66 +293,192 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
      R_Squared_K = summary(lm_k[[1]])$r.squared,
      R_Squared_r = summary(lm_r[[1]])$r.squared,
      R_Squared_AUC = summary(lm_auc[[1]])$r.squared,
-      Sum_Z_Score_L = sum(zscore_l, na.rm = TRUE),
-      Avg_Zscore_L = avg_zscore_l[1],
-      Sum_Z_Score_K = sum(zscore_k, na.rm = TRUE),
-      Avg_Zscore_K = avg_zscore_k[1],
-      Sum_Z_Score_r = sum(zscore_r, na.rm = TRUE),
-      Avg_Zscore_r = avg_zscore_r[1],
-      Sum_Z_Score_AUC = sum(zscore_auc, na.rm = TRUE),
-      Avg_Zscore_AUC = avg_zscore_auc[1],
-      NG = sum(NG, na.rm = TRUE),
-      DB = sum(DB, na.rm = TRUE),
-      SM = sum(SM, na.rm = TRUE)
+      Sum_Z_Score_L = sum(Zscore_L),
+      Avg_Zscore_L = Sum_Z_Score_L / (length(variables) - sum(NG)),
+      Sum_Z_Score_K = sum(Zscore_K),
+      Avg_Zscore_K = Sum_Z_Score_K / (length(variables) - sum(NG)),
+      Sum_Z_Score_r = sum(Zscore_r),
+      Avg_Zscore_r = Sum_Z_Score_r / (length(variables) - sum(NG)),
+      Sum_Z_Score_AUC = sum(Zscore_AUC),
+      Avg_Zscore_AUC = Sum_Z_Score_AUC / (length(variables) - sum(NG)),
+      NG = sum(NG),
+      DB = sum(DB),
+      SM = sum(SM)
    ) %>%
    ungroup()
  
-  # Ensure interaction_scores has only the specified columns
-  interaction_scores <- interaction_scores %>%
-    select(Gene, raw_shift_l, z_shift_l, lm_Score_L, Z_lm_L = lm_Score_L, R_Squared_L, sum_z_score_l, avg_zscore_l,
-           raw_shift_k, z_shift_k, lm_Score_K, Z_lm_K = lm_Score_K, R_Squared_K, sum_z_score_k, avg_zscore_k,
-           raw_shift_r, z_shift_r, lm_Score_r, Z_lm_r = lm_Score_r, R_Squared_r, sum_z_score_r, avg_zscore_r,
-           raw_shift_auc, z_shift_auc, lm_Score_AUC, Z_lm_AUC = lm_Score_AUC, R_Squared_AUC, sum_z_score_auc, avg_zscore_auc,
-           NG, DB, SM)
-  
  return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
 }

-generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") {
-  variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
-  WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500)
+
+# generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") {
+#   variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
+#   WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500)
  
-  plot_list <- lapply(seq_along(variables), function(i) {
-    var <- variables[i]
-    WT_sd <- WT_sds[[i]]
+#   plot_list <- lapply(seq_along(variables), function(i) {
+#     var <- variables[i]
+#     WT_sd <- WT_sds[[i]]
    
-    ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) +
-      geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
-      coord_cartesian(ylim = c(-WT_sd, WT_sd)) +
-      geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) +
-      ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) +
-      annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) +
-      annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) +
-      annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) +
-      annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) +
-      annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) +
-      scale_x_continuous(
-        name = unique(df_calculations$Drug[1]),
-        breaks = unique(df_calculations$conc_num_factor),
-        labels = unique(as.character(df_calculations$conc_num))) +
-      theme_publication()
-  })
+#     ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) +
+#       geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
+#       coord_cartesian(ylim = c(-WT_sd, WT_sd)) +
+#       geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) +
+#       ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) +
+#       annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) +
+#       annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) +
+#       annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) +
+#       annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) +
+#       annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) +
+#       scale_x_continuous(
+#         name = unique(df_calculations$Drug[1]),
+#         breaks = unique(df_calculations$conc_num_factor),
+#         labels = unique(as.character(df_calculations$conc_num))) +
+#       theme_publication()
+#   })
  
-  save_plots(file_prefix, plot_list, output_dir)
+#   save_plots(file_prefix, plot_list, output_dir)
+# }
+
+# generate_summary_plots <- function(df, output_dir) {
+#   variables <- c("L", "K", "r", "AUC")
+#   plot_list <- lapply(variables, function(var) {
+#     generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var))
+#   })
+  
+#   save_plots("Summary_Plots", plot_list, output_dir)
+# }
+
+
+# # Generate ranked plots for a specific metric
+# generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) {
+#   ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) +
+#     ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) +
+#     xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) +
+#     annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) +
+#     annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) +
+#     geom_hline(yintercept = c(-sd_threshold, sd_threshold)) +
+#     geom_point(size = 0.1, shape = 3) +
+#     theme_publication()
+# }
+
+# # Generate and save all ranked plots
+# generate_and_save_ranked_plots <- function(df, output_dir, prefix) {
+#   rank_metrics <- list(
+#     list("L_Rank", "Avg_Zscore_L", "L"),
+#     list("K_Rank", "Avg_Zscore_K", "K"),
+#     list("r_Rank", "Avg_Zscore_r", "r"),
+#     list("AUC_Rank", "Avg_Zscore_AUC", "AUC"),
+#     list("L_Rank_lm", "Z_lm_L", "L"),
+#     list("K_Rank_lm", "Z_lm_K", "K"),
+#     list("r_Rank_lm", "Z_lm_r", "r"),
+#     list("AUC_Rank_lm", "Z_lm_AUC", "AUC")
+#   )
+  
+#   pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE)
+  
+#   for (sd_threshold in c(1, 2, 3)) {
+#     for (metric in rank_metrics) {
+#       plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]])
+#       print(plot)
+#     }
+#   }
+  
+#   dev.off()
+# }
+
+
+
+
+generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num",
+  title, x_label = NULL, y_label = NULL, ylim_vals = NULL) {
+  plot <- ggplot(df, aes_string(x = x_var, color = color_var))
+  
+  if (!is.null(y_var)) {
+    plot <- plot + aes_string(y = y_var)
+  }
+  
+  # Set up the plot based on the requested plot type
+  plot <- switch(plot_type,
+    "scatter" = plot + geom_point() + geom_smooth(method = "lm", se = FALSE),
+    "box" = plot + geom_boxplot(),
+    "density" = plot + geom_density(),
+    "bar" = plot + geom_bar(stat = "identity"),
+    plot  # Default: return the plot as is
+  )
+  
+  if (!is.null(ylim_vals)) {
+    plot <- plot + coord_cartesian(ylim = ylim_vals)
+  }
+  
+  # Add titles and labels if available
+  plot <- plot + ggtitle(title) + theme_publication()
+  if (!is.null(x_label)) plot <- plot + xlab(x_label)
+  if (!is.null(y_label)) plot <- plot + ylab(y_label)
+  
+  return(plot)
 }

-generate_summary_plots <- function(df, output_dir) {
-  variables <- c("L", "K", "r", "AUC")
-  plot_list <- lapply(variables, function(var) {
-    generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var))
-  })
+
+generate_and_save_plots <- function(df, output_dir, file_prefix, variables, plot_type = "scatter", include_qc = FALSE, ylim_vals = NULL) {
+  plots <- list()
  
-  save_plots("Summary_Plots", plot_list, output_dir)
+  if (nrow(df) == 0) {
+    message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
+    return()
+  }
+  
+  message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
+  
+  # Create plots for the given variables
+  for (var in variables) {
+    plot <- generate_plot(
+      df = df,
+      x_var = "scan",
+      y_var = var,
+      plot_type = plot_type,
+      title = paste(file_prefix, "Plot for", var),
+      ylim_vals = ylim_vals
+    )
+    plots[[paste0(var, "_", plot_type)]] <- plot
+  }
+  
+  # Include additional QC plots if requested
+  if (include_qc) {
+    plots[["Raw_L_vs_K"]] <- generate_plot(df, "L", "K", "scatter", "Raw L vs K before QC")
+    plots[["Delta_bg_Density"]] <- generate_plot(df, "delta_bg", NULL, "density", "Density plot for Delta Background")
+    plots[["Delta_bg_Bar"]] <- generate_plot(df, "delta_bg", NULL, "bar", "Bar plot for Delta Background")
+  }
+  
+  save_plots(file_prefix, plots, output_dir)
+}
+
+
+# Ensure all plots are saved and printed to PDF
+save_plots <- function(file_name, plot_list, output_dir) {
+  # Save to PDF
+  pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
+  lapply(plot_list, function(plot) {
+    print(plot)
+  })
+  dev.off()
+  
+  # Save to HTML with horizontal legend orientation
+  lapply(names(plot_list), function(plot_name) {
+    message("Generating plot: ", plot_name)
+    pgg <- tryCatch({
+      suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
+        layout(legend = list(orientation = "h")))
+    }, error = function(e) {
+      message("Error in plot: ", plot_name, "\n", e)
+      return(NULL)
+    })
+    if (!is.null(pgg)) {
+      saveWidget(pgg,
+        file = file.path(output_dir,
+        paste0(file_name, "_", plot_name, ".html")),
+        selfcontained = TRUE)
+    }
+  })
 }


@@ -472,42 +524,6 @@ adjust_missing_and_rank <- function(df) {
  return(df)
 }

-# Generate ranked plots for a specific metric
-generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) {
-  ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) +
-    ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) +
-    xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) +
-    annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) +
-    annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) +
-    geom_hline(yintercept = c(-sd_threshold, sd_threshold)) +
-    geom_point(size = 0.1, shape = 3) +
-    theme_publication()
-}
-
-# Generate and save all ranked plots
-generate_and_save_ranked_plots <- function(df, output_dir, prefix) {
-  rank_metrics <- list(
-    list("L_Rank", "Avg_Zscore_L", "L"),
-    list("K_Rank", "Avg_Zscore_K", "K"),
-    list("r_Rank", "Avg_Zscore_r", "r"),
-    list("AUC_Rank", "Avg_Zscore_AUC", "AUC"),
-    list("L_Rank_lm", "Z_lm_L", "L"),
-    list("K_Rank_lm", "Z_lm_K", "K"),
-    list("r_Rank_lm", "Z_lm_r", "r"),
-    list("AUC_Rank_lm", "Z_lm_AUC", "AUC")
-  )
-  
-  pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE)
-  
-  for (sd_threshold in c(1, 2, 3)) {
-    for (metric in rank_metrics) {
-      plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]])
-      print(plot)
-    }
-  }
-  
-  dev.off()
-}

 # Function to create and save all ranked plots
 create_ranked_plots <- function(df, output_dir) {
@@ -520,8 +536,45 @@ create_ranked_plots <- function(df, output_dir) {
  generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM")
 }

+
+generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "",
+  x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") {
+  plot_list <- list()
+  
+  # Iterate over each y variable and generate the corresponding plot
+  for (var in y_vars) {
+    plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(var), color = as.factor(!!sym(color_var)))) +
+      ggtitle(paste(title_prefix, var)) +
+      theme_publication()
+    
+    if (plot_type == "scatter") {
+      plot <- plot +
+        geom_point() +
+        geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
+      
+      if (!is.null(ylim_vals)) {
+        plot <- plot + coord_cartesian(ylim = ylim_vals)
+      }
+    } else if (plot_type == "box") {
+      plot <- plot + geom_boxplot()
+    } else if (plot_type == "density") {
+      plot <- plot + geom_density()
+    } else if (plot_type == "bar") {
+      plot <- plot + geom_bar(stat = "identity")
+    }
+    
+    if (!is.null(x_label)) plot <- plot + xlab(x_label)
+    if (!is.null(y_label)) plot <- plot + ylab(y_label)
+    
+    plot_list[[var]] <- plot
+  }
+  
+  # Save plots to PDF and HTML
+  save_plots(file_prefix, plot_list, output_dir)
+}
+
+
 main <- function() {
-  # Applying to all experiments
  lapply(names(args$experiments), function(exp_name) {
    exp <- args$experiments[[exp_name]]
    exp_path <- exp$path
@@ -537,34 +590,28 @@ main <- function() {
    
    max_conc <- max(df$conc_num_factor)
    
-    # QC
-    # Filter the df above sd tolerance for raw L vs K plot
+    # QC steps and filtering
    df_above_tolerance <- df %>% filter(DB == 1)
-    
-    # Set vars above the delta background tolerance to NA
    df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x)))
-
-    # Remove rows where L is 0
    df_no_zeros <- df_na %>% filter(L > 0)
-
-    # Flag and remove non-finite data, printing affected rows
+    
    df_na_filtered <- df_na %>%
-      filter(if_any(c(L), ~ !is.finite(.))) %>% # Add L, r, AUC, K as needed for debugging
+      filter(if_any(c(L), ~ !is.finite(.))) %>%
      {
        if (nrow(.) > 0) {
          message("Removing non-finite rows:\n")
          print(head(., n = 10))
        }
-        df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging
+        df_na %>% filter(if_all(c(L), is.finite))
      }
-    
-    # Generate QC PDFs and HTMLs
-    # message("Generating QC plots")
-    # variables <- c("L", "K", "r", "AUC", "delta_bg")
-    # generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
-    # generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
-    # generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
-    # generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
+
+    # Generate and save QC plots using the new generalized function
+    message("Generating QC plots")
+    variables <- c("L", "K", "r", "AUC", "delta_bg")
+    generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
+    generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
+    generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
+    generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)

    rm(df, df_above_tolerance, df_no_zeros)
    
@@ -577,38 +624,36 @@ main <- function() {

    # Originally this filtered L NA's

-    # Filter data within 2SD
+    # Filter data within and outside 2SD
    within_2sd_k <- stats_joined %>%
      filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd))
-
-    # Filter data outside 2SD
    outside_2sd_k <- stats_joined %>%
      filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd))

-    message("Calculating summary statistics for L within 2SD of K")
+    # Summary statistics for within and outside 2SD of K
    l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
+    l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
+
+    # Write CSV files
+    write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
+    write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
+
+    message("Calculating summary statistics for L within 2SD of K")
    cols_to_remove <- names(l_within_2sd_k)
    cols_to_keep <- c("conc_num", "conc_num_factor")
    within_2sd_k_clean <- within_2sd_k %>%
      select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
    l_within_2sd_k_joined <- within_2sd_k_clean %>%
      left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
-    write.csv(l_within_2sd_k,
-      file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
-      row.names = FALSE)
-    
+
    message("Calculating summary statistics for for L outside 2SD of K")
-    l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
    cols_to_remove <- names(l_outside_2sd_k)
    cols_to_keep <- c("conc_num", "conc_num_factor")
    outside_2sd_k_clean <- outside_2sd_k %>%
      select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
    l_outside_2sd_k_joined <- outside_2sd_k_clean %>%
      left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor"))
-    write.csv(l_outside_2sd_k,
-      file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
-      row.names = FALSE)
-    
+
    # Process background strains
    bg_strains <- c("YDL227C")
    lapply(bg_strains, function(strain) {
@@ -646,18 +691,14 @@ main <- function() {
        filter(OrfRep != strain) %>%
        mutate(SM = 0)
      
-      message("Processing reference strain")
      reference_strain <- process_strains(l_within_2sd_k_joined)
-      message("Processing deletion strains")
      deletion_strains <- process_strains(l_within_2sd_k_joined)
      
      # TODO we may need to add "num" to grouping vars
      
      # Calculate interactions
      variables <- c("L", "K", "r", "AUC")
-      message("Calculating reference interaction scores")
      reference_results <- calculate_interaction_scores(stats_joined, reference_strain, max_conc, variables)
-      message("Calculating deletion interaction scores")
      deletion_results <- calculate_interaction_scores(stats_joined, deletion_strains, max_conc, variables)
      
      zscores_calculations_reference <- reference_results$zscores_calculations
@@ -666,11 +707,11 @@ main <- function() {
      zscores_interactions <- deletion_results$zscores_interactions
      
      # TODO: I don't know if we need this?
-      zscores_interactions <- zscores_interactions %>%
-        arrange(desc(Z_lm_L)) %>%
-        arrange(desc(NG))
+      # zscores_interactions <- zscores_interactions %>%
+      #   arrange(desc(Z_lm_L)) %>%
+      #   arrange(desc(NG))
      
-      message("Writing zscores csv files")
+      # Writing Z-Scores to file
      write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
      write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
      write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
@@ -678,10 +719,11 @@ main <- function() {
      
      # Define conditions for enhancers and suppressors
      # TODO Add to study config file?
-      enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= 2
-      suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -2
-      enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= 2
-      suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -2
+      threshold <- 2
+      enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
+      suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
+      enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
+      suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
      
      # Subset data
      enhancers_L <- zscores_interactions[enhancer_condition_L, ]
@@ -707,10 +749,11 @@ main <- function() {
        file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv"), row.names = FALSE)
      
      # Handle linear model based enhancers and suppressors
-      enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= 2, ]
-      suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -2, ]
-      enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= 2, ]
-      suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -2, ]
+      lm_threshold <- 2
+      enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
+      suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
+      enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
+      suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
      
      # Save linear model based enhancers and suppressors
      message("Writing linear model enhancer/suppressor csv files")
@@ -724,9 +767,11 @@ main <- function() {
        file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE)
      
      # Generate plots for interaction scores
-      generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir)
-      generate_rf_plots(zscores_calculations, zscores_interactions, out_dir)
-      
+      # generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir)
+      # generate_rf_plots(zscores_calculations, zscores_interactions, out_dir)
+      generate_and_save_plots(zscores_calculations_reference, out_dir, "Reference_Calculations", variables)
+      generate_and_save_plots(zscores_calculations, out_dir, "Deletion_Calculations", variables)
+
      # Apply filtering to remove NA values before further analysis
      zscores_interactions_filtered <- zscores_interactions %>%
        filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L))