Output interaction df in calculate_interaction_zscores.R

2024-08-24 02:30:39 -04:00
parent 3cb6e89360
commit 17521b817f
1 changed files with 313 additions and 52 deletions
--- a/workflow/apps/r/calculate_interaction_zscores.R
+++ b/workflow/apps/r/calculate_interaction_zscores.R
@@ -123,7 +123,6 @@ load_and_preprocess_data <- function(easy_results_file, genes) {
    mutate(L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}) %>%
    mutate(AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}) %>%
    filter(!.[[1]] %in% c("", "Scan")) %>%            # Filter out empty or Scan rows
-    # {if ("Conc" %in% colnames(.)) filter(., Conc != "0ug/mL") else .} %>%
    filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>%  # Remove blank genes
    filter(Drug != "BMH21") %>%                       # Filter out specific drugs if necessary
    filter(!is.na(ORF) & ORF != "") %>%               # Ensure ORF is not NA or empty
@@ -142,8 +141,6 @@ load_and_preprocess_data <- function(easy_results_file, genes) {
  return(df)
 }

-
-
 create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
  if (!("Scan" %in% colnames(df))) {
    warning("'Scan' column is not present in the data. Skipping this plot.")
@@ -186,52 +183,103 @@ publish_summary_stats <- function(df, variables, out_dir) {
  fwrite(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
 }

-publish_interaction_scores <- function(df, out_dir) {
-  interaction_scores <- df %>%
-    dplyr::group_by(OrfRep) %>%
-    dplyr::summarize(
-      mean_L = mean(L, na.rm = TRUE),
-      mean_K = mean(K, na.rm = TRUE),
-      mean_r = mean(r, na.rm = TRUE),
-      mean_AUC = mean(AUC, na.rm = TRUE),
-      delta_bg_mean = mean(delta_bg, na.rm = TRUE),
-      delta_bg_sd = sd(delta_bg, na.rm = TRUE)
-    ) %>%
-    dplyr::mutate(
-      l_rank = rank(mean_L),
-      k_rank = rank(mean_K),
-      r_rank = rank(mean_r),
-      auc_rank = rank(mean_AUC)
+# Compute Interaction Scores
+compute_interaction_scores <- function(df) {
+  # Calculate raw shifts
+  raw_shifts <- df %>%
+    group_by(OrfRep) %>%
+    reframe(
+      Raw_Shift_L = mean(L, na.rm = TRUE),
+      Raw_Shift_K = mean(K, na.rm = TRUE),
+      Raw_Shift_r = mean(r, na.rm = TRUE),
+      Raw_Shift_AUC = mean(AUC, na.rm = TRUE),
+      NG = n()
    )
  
-  fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
-  fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
-    file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
+  # Calculate Z-scores
+  z_scores <- df %>%
+    group_by(OrfRep) %>%
+    reframe(
+      Z_Shift_L = mean(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Z_Shift_K = mean(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Z_Shift_r = mean(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Z_Shift_AUC = mean(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE)
+    )
+  
+  # Linear Model Scores and R-Squared
+  lm_scores <- df %>%
+    group_by(OrfRep) %>%
+    reframe(
+      lm_Score_L = coef(lm(L ~ delta_bg, data = .))[2],
+      R_Squared_L = summary(lm(L ~ delta_bg, data = .))$r.squared,
+      lm_Score_K = coef(lm(K ~ delta_bg, data = .))[2],
+      R_Squared_K = summary(lm(K ~ delta_bg, data = .))$r.squared,
+      lm_Score_r = coef(lm(r ~ delta_bg, data = .))[2],
+      R_Squared_r = summary(lm(r ~ delta_bg, data = .))$r.squared,
+      lm_Score_AUC = coef(lm(AUC ~ delta_bg, data = .))[2],
+      R_Squared_AUC = summary(lm(AUC ~ delta_bg, data = .))$r.squared
+    )
+  
+  # Calculate Sum and Average Z-scores
+  sum_avg_z_scores <- df %>%
+    group_by(OrfRep) %>%
+    reframe(
+      Sum_Z_Score_L = sum(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Avg_Zscore_L = mean(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Sum_Z_Score_K = sum(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Avg_Zscore_K = mean(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Sum_Z_Score_r = sum(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Avg_Zscore_r = mean(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Sum_Z_Score_AUC = sum(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE),
+      Avg_Zscore_AUC = mean(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE)
+    )
+  
+  # Combine all calculations
+  interaction_scores <- raw_shifts %>%
+    left_join(z_scores, by = "OrfRep") %>%
+    left_join(lm_scores, by = "OrfRep") %>%
+    left_join(sum_avg_z_scores, by = "OrfRep") %>%
+    mutate(
+      l_rank = rank(-Raw_Shift_L),
+      k_rank = rank(-Raw_Shift_K),
+      r_rank = rank(-Raw_Shift_r),
+      auc_rank = rank(-Raw_Shift_AUC)
+    )
+  
+  return(interaction_scores)
+}
+
+
+publish_interaction_scores <- function(df, out_dir) {
+  interaction_scores <- compute_interaction_scores(df)
  
  # Additional enhancer and suppressor calculations and outputs
-  deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ]
+  deletion_enhancers_L <- interaction_scores[interaction_scores$Raw_Shift_L >= 2, ]
  deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ]

-  deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ]
+  deletion_enhancers_K <- interaction_scores[interaction_scores$Raw_Shift_K <= -2, ]
  deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ]

-  deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ]
+  deletion_suppressors_L <- interaction_scores[interaction_scores$Raw_Shift_L <= -2, ]
  deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ]

-  deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ]
+  deletion_suppressors_K <- interaction_scores[interaction_scores$Raw_Shift_K >= 2, ]
  deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ]

  deletion_enhancers_and_suppressors_L <- interaction_scores[
-    interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ]
+    interaction_scores$Raw_Shift_L >= 2 | interaction_scores$Raw_Shift_L <= -2, ]
  deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[
    !is.na(deletion_enhancers_and_suppressors_L$OrfRep), ]

  deletion_enhancers_and_suppressors_K <- interaction_scores[
-    interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ]
+    interaction_scores$Raw_Shift_K >= 2 | interaction_scores$Raw_Shift_K <= -2, ]
  deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[
    !is.na(deletion_enhancers_and_suppressors_K$OrfRep), ]

  # Write CSV files with updated filenames
+  fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
+  fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank), 
+        file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
  fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE)
  fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE)
  fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE)
@@ -239,23 +287,22 @@ publish_interaction_scores <- function(df, out_dir) {
  fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE)
  fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE)

-  return(interaction_scores)  # Return the updated data frame with rank columns
+  return(interaction_scores)  # Return the updated data frame with all calculated columns
 }

-
-
 publish_zscores <- function(df, out_dir) {
  zscores <- df %>%
    dplyr::mutate(
-      zscore_L = scale(L, center = TRUE, scale = TRUE),
-      zscore_K = scale(K, center = TRUE, scale = TRUE),
-      zscore_r = scale(r, center = TRUE, scale = TRUE),
-      zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
+      zscore_L = scale(Raw_Shift_L, center = TRUE, scale = TRUE),
+      zscore_K = scale(Raw_Shift_K, center = TRUE, scale = TRUE),
+      zscore_r = scale(Raw_Shift_r, center = TRUE, scale = TRUE),
+      zscore_AUC = scale(Raw_Shift_AUC, center = TRUE, scale = TRUE)
    )

  fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
 }

+
 generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
  variables <- c("L", "K", "r", "AUC", "delta_bg")
  
@@ -287,9 +334,6 @@ generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
  })
 }

-
-
-# Create rank plots
 create_rank_plots <- function(interaction_scores, out_dir) {
  rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
  
@@ -319,12 +363,12 @@ create_correlation_plot <- function(interaction_scores, out_dir) {

  # Generate correlation plots for each pair of variables
  pairs <- list(
-    c("mean_L", "mean_K"),
-    c("mean_L", "mean_r"),
-    c("mean_L", "mean_AUC"),
-    c("mean_K", "mean_r"),
-    c("mean_K", "mean_AUC"),
-    c("mean_r", "mean_AUC")
+    c("Raw_Shift_L", "Raw_Shift_K"),
+    c("Raw_Shift_L", "Raw_Shift_r"),
+    c("Raw_Shift_L", "Raw_Shift_AUC"),
+    c("Raw_Shift_K", "Raw_Shift_r"),
+    c("Raw_Shift_K", "Raw_Shift_AUC"),
+    c("Raw_Shift_r", "Raw_Shift_AUC")
  )

  lapply(pairs, function(vars) {
@@ -347,7 +391,6 @@ create_correlation_plot <- function(interaction_scores, out_dir) {
  })
 }

-
 process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
  out_dir <- file.path(exp_dir, "zscores")
  out_dir_qc <- file.path(out_dir, "qc")
@@ -367,14 +410,14 @@ process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
  variables <- c("L", "K", "r", "AUC", "delta_bg")
  publish_summary_stats(data, variables, out_dir)
  interaction_scores <- publish_interaction_scores(data, out_dir)
-  publish_zscores(data, out_dir)
+  publish_zscores(interaction_scores, out_dir)  # Now writing interaction_scores, not original data

  # Generate rank plots and correlation plots
  create_rank_plots(interaction_scores, out_dir)
  create_correlation_plot(interaction_scores, out_dir)

  output_file <- file.path(out_dir, "zscores_interaction.csv")
-  fwrite(data, output_file, row.names = FALSE)
+  fwrite(interaction_scores, output_file, row.names = FALSE)  # Write interaction_scores here

  return(output_file)
 }
@@ -390,6 +433,224 @@ if (length(processed_files) > 1) {
    merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE)
  }, processed_files)

-  combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
-  fwrite(combined_data, combined_output_file, row.names = FALSE)
+combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
+fwrite(combined_data, combined_output_file, row.names = FALSE)
 }
+
+
+
+
+
+
+
+
+
+
+
+# publish_interaction_scores <- function(df, out_dir) {
+#   interaction_scores <- df %>%
+#     dplyr::group_by(OrfRep) %>%
+#     dplyr::summarize(
+#       mean_L = mean(L, na.rm = TRUE),
+#       mean_K = mean(K, na.rm = TRUE),
+#       mean_r = mean(r, na.rm = TRUE),
+#       mean_AUC = mean(AUC, na.rm = TRUE),
+#       delta_bg_mean = mean(delta_bg, na.rm = TRUE),
+#       delta_bg_sd = sd(delta_bg, na.rm = TRUE)
+#     ) %>%
+#     dplyr::mutate(
+#       l_rank = rank(mean_L),
+#       k_rank = rank(mean_K),
+#       r_rank = rank(mean_r),
+#       auc_rank = rank(mean_AUC)
+#     )
+
+#   fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE)
+#   fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank),
+#     file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE)
+
+#   # Additional enhancer and suppressor calculations and outputs
+#   deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ]
+#   deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ]
+
+#   deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ]
+#   deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ]
+
+#   deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ]
+#   deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ]
+
+#   deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ]
+#   deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ]
+
+#   deletion_enhancers_and_suppressors_L <- interaction_scores[
+#     interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ]
+#   deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[
+#     !is.na(deletion_enhancers_and_suppressors_L$OrfRep), ]
+
+#   deletion_enhancers_and_suppressors_K <- interaction_scores[
+#     interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ]
+#   deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[
+#     !is.na(deletion_enhancers_and_suppressors_K$OrfRep), ]
+
+#   # Write CSV files with updated filenames
+#   fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE)
+#   fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE)
+#   fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE)
+#   fwrite(deletion_suppressors_K, file.path(out_dir, "deletion_suppressors_k.csv"), row.names = FALSE)
+#   fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE)
+#   fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE)
+
+#   return(interaction_scores)  # Return the updated data frame with rank columns
+# }
+
+
+
+# publish_zscores <- function(df, out_dir) {
+#   zscores <- df %>%
+#     dplyr::mutate(
+#       zscore_L = scale(L, center = TRUE, scale = TRUE),
+#       zscore_K = scale(K, center = TRUE, scale = TRUE),
+#       zscore_r = scale(r, center = TRUE, scale = TRUE),
+#       zscore_AUC = scale(AUC, center = TRUE, scale = TRUE)
+#     )
+
+#   fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE)
+# }
+
+# generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
+#   variables <- c("L", "K", "r", "AUC", "delta_bg")
+  
+#   # Pre-QC plots
+#   lapply(variables, function(var) {
+#     if (var %in% colnames(df)) {
+#       create_and_publish_plot(df, var, "scatter", out_dir_qc)
+#     }
+#   })
+
+#   # Filter data based on delta background tolerance for Post-QC analysis
+#   df_post_qc <- df %>%
+#     mutate(across(all_of(variables), 
+#                   ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .)))
+
+#   # Post-QC plots
+#   lapply(variables, function(var) {
+#     if (var %in% colnames(df_post_qc)) {
+#       create_and_publish_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc")
+#     }
+#   })
+
+#   # For plots specifically for data above the tolerance threshold
+#   delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ]
+#   lapply(variables, function(var) {
+#     if (var %in% colnames(delta_bg_above_tolerance)) {
+#       create_and_publish_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance")
+#     }
+#   })
+# }
+
+
+
+# # Create rank plots
+# create_rank_plots <- function(interaction_scores, out_dir) {
+#   rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
+  
+#   lapply(rank_vars, function(rank_var) {
+#     p <- ggplot(interaction_scores, aes(x = !!sym(rank_var))) +
+#       geom_bar() +
+#       ggtitle(paste("Rank Distribution for", rank_var)) +
+#       theme_publication()
+    
+#     pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf"))
+#     pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
+#     print(p)
+#     dev.off()
+
+#     # Generate HTML output
+#     html_path <- sub(".pdf$", ".html", pdf_path)
+#     pgg <- suppressWarnings(ggplotly(p) %>%
+#       layout(legend = list(orientation = "h")))
+#     saveWidget(pgg, html_path, selfcontained = TRUE)
+#   })
+# }
+
+# create_correlation_plot <- function(interaction_scores, out_dir) {
+#   # Check for non-finite values and remove them from the dataset
+#   interaction_scores <- interaction_scores %>%
+#     filter_all(all_vars(is.finite(.)))
+
+#   # Generate correlation plots for each pair of variables
+#   pairs <- list(
+#     c("mean_L", "mean_K"),
+#     c("mean_L", "mean_r"),
+#     c("mean_L", "mean_AUC"),
+#     c("mean_K", "mean_r"),
+#     c("mean_K", "mean_AUC"),
+#     c("mean_r", "mean_AUC")
+#   )
+
+#   lapply(pairs, function(vars) {
+#     p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) +
+#       geom_point() +
+#       geom_smooth(method = "lm", se = FALSE) +
+#       ggtitle(paste("Correlation between", vars[1], "and", vars[2])) +
+#       theme_publication()
+    
+#     pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf"))
+#     pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT)
+#     print(p)
+#     dev.off()
+
+#     # Generate HTML output
+#     html_path <- sub(".pdf$", ".html", pdf_path)
+#     pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>%
+#       layout(legend = list(orientation = "h")))
+#     saveWidget(pgg, html_path, selfcontained = TRUE)
+#   })
+# }
+
+
+# process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
+#   out_dir <- file.path(exp_dir, "zscores")
+#   out_dir_qc <- file.path(out_dir, "qc")
+#   dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
+#   dir.create(out_dir_qc, showWarnings = FALSE)
+
+#   # Load and preprocess the data
+#   data <- load_and_preprocess_data(args$easy_results_file, genes)
+
+#   # Calculate delta background tolerance
+#   delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE)
+
+#   # Generate and publish QC plots (both pre-QC and post-QC)
+#   generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc)
+
+#   # Process and publish summary stats, interaction scores, and z-scores
+#   variables <- c("L", "K", "r", "AUC", "delta_bg")
+#   publish_summary_stats(data, variables, out_dir)
+#   interaction_scores <- publish_interaction_scores(data, out_dir)
+#   publish_zscores(data, out_dir)
+
+#   # Generate rank plots and correlation plots
+#   create_rank_plots(interaction_scores, out_dir)
+#   create_correlation_plot(interaction_scores, out_dir)
+
+#   output_file <- file.path(out_dir, "zscores_interaction.csv")
+#   fwrite(data, output_file, row.names = FALSE)
+
+#   return(output_file)
+# }
+
+# # Process all experiments
+# processed_files <- lapply(names(args$experiments), function(exp_name) {
+#   process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir)
+# })
+
+# # Combine results from all experiments if multiple experiments exist
+# if (length(processed_files) > 1) {
+#   combined_data <- Reduce(function(x, y) {
+#     merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE)
+#   }, processed_files)
+
+#   combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv")
+#   fwrite(combined_data, combined_output_file, row.names = FALSE)
+# }