From 17521b817f440308223e2f52d930be02b98ea065 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Sat, 24 Aug 2024 02:30:39 -0400 Subject: [PATCH] Output interaction df in calculate_interaction_zscores.R --- .../apps/r/calculate_interaction_zscores.R | 365 +++++++++++++++--- 1 file changed, 313 insertions(+), 52 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores.R b/workflow/apps/r/calculate_interaction_zscores.R index c739e34a..0a9378b8 100644 --- a/workflow/apps/r/calculate_interaction_zscores.R +++ b/workflow/apps/r/calculate_interaction_zscores.R @@ -100,7 +100,7 @@ scale_colour_publication <- function(...) { # Load SGD gene list sgd_genes <- function(sgd_gene_list) { read.delim(file = sgd_gene_list, quote = "", header = FALSE, - colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% + colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% dplyr::rename(ORF = V4, GeneName = V5) } @@ -123,7 +123,6 @@ load_and_preprocess_data <- function(easy_results_file, genes) { mutate(L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}) %>% mutate(AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}) %>% filter(!.[[1]] %in% c("", "Scan")) %>% # Filter out empty or Scan rows - # {if ("Conc" %in% colnames(.)) filter(., Conc != "0ug/mL") else .} %>% filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>% # Remove blank genes filter(Drug != "BMH21") %>% # Filter out specific drugs if necessary filter(!is.na(ORF) & ORF != "") %>% # Ensure ORF is not NA or empty @@ -142,8 +141,6 @@ load_and_preprocess_data <- function(easy_results_file, genes) { return(df) } - - create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") { if (!("Scan" %in% colnames(df))) { warning("'Scan' column is not present in the data. Skipping this plot.") @@ -186,52 +183,103 @@ publish_summary_stats <- function(df, variables, out_dir) { fwrite(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) } -publish_interaction_scores <- function(df, out_dir) { - interaction_scores <- df %>% - dplyr::group_by(OrfRep) %>% - dplyr::summarize( - mean_L = mean(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - delta_bg_mean = mean(delta_bg, na.rm = TRUE), - delta_bg_sd = sd(delta_bg, na.rm = TRUE) - ) %>% - dplyr::mutate( - l_rank = rank(mean_L), - k_rank = rank(mean_K), - r_rank = rank(mean_r), - auc_rank = rank(mean_AUC) +# Compute Interaction Scores +compute_interaction_scores <- function(df) { + # Calculate raw shifts + raw_shifts <- df %>% + group_by(OrfRep) %>% + reframe( + Raw_Shift_L = mean(L, na.rm = TRUE), + Raw_Shift_K = mean(K, na.rm = TRUE), + Raw_Shift_r = mean(r, na.rm = TRUE), + Raw_Shift_AUC = mean(AUC, na.rm = TRUE), + NG = n() ) + + # Calculate Z-scores + z_scores <- df %>% + group_by(OrfRep) %>% + reframe( + Z_Shift_L = mean(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Z_Shift_K = mean(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Z_Shift_r = mean(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Z_Shift_AUC = mean(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE) + ) + + # Linear Model Scores and R-Squared + lm_scores <- df %>% + group_by(OrfRep) %>% + reframe( + lm_Score_L = coef(lm(L ~ delta_bg, data = .))[2], + R_Squared_L = summary(lm(L ~ delta_bg, data = .))$r.squared, + lm_Score_K = coef(lm(K ~ delta_bg, data = .))[2], + R_Squared_K = summary(lm(K ~ delta_bg, data = .))$r.squared, + lm_Score_r = coef(lm(r ~ delta_bg, data = .))[2], + R_Squared_r = summary(lm(r ~ delta_bg, data = .))$r.squared, + lm_Score_AUC = coef(lm(AUC ~ delta_bg, data = .))[2], + R_Squared_AUC = summary(lm(AUC ~ delta_bg, data = .))$r.squared + ) + + # Calculate Sum and Average Z-scores + sum_avg_z_scores <- df %>% + group_by(OrfRep) %>% + reframe( + Sum_Z_Score_L = sum(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Avg_Zscore_L = mean(scale(L, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Sum_Z_Score_K = sum(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Avg_Zscore_K = mean(scale(K, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Sum_Z_Score_r = sum(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Avg_Zscore_r = mean(scale(r, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Sum_Z_Score_AUC = sum(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), + Avg_Zscore_AUC = mean(scale(AUC, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE) + ) + + # Combine all calculations + interaction_scores <- raw_shifts %>% + left_join(z_scores, by = "OrfRep") %>% + left_join(lm_scores, by = "OrfRep") %>% + left_join(sum_avg_z_scores, by = "OrfRep") %>% + mutate( + l_rank = rank(-Raw_Shift_L), + k_rank = rank(-Raw_Shift_K), + r_rank = rank(-Raw_Shift_r), + auc_rank = rank(-Raw_Shift_AUC) + ) + + return(interaction_scores) +} - fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE) - fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank), - file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE) +publish_interaction_scores <- function(df, out_dir) { + interaction_scores <- compute_interaction_scores(df) + # Additional enhancer and suppressor calculations and outputs - deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ] + deletion_enhancers_L <- interaction_scores[interaction_scores$Raw_Shift_L >= 2, ] deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ] - deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ] + deletion_enhancers_K <- interaction_scores[interaction_scores$Raw_Shift_K <= -2, ] deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ] - deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ] + deletion_suppressors_L <- interaction_scores[interaction_scores$Raw_Shift_L <= -2, ] deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ] - deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ] + deletion_suppressors_K <- interaction_scores[interaction_scores$Raw_Shift_K >= 2, ] deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ] deletion_enhancers_and_suppressors_L <- interaction_scores[ - interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ] + interaction_scores$Raw_Shift_L >= 2 | interaction_scores$Raw_Shift_L <= -2, ] deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[ !is.na(deletion_enhancers_and_suppressors_L$OrfRep), ] deletion_enhancers_and_suppressors_K <- interaction_scores[ - interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ] + interaction_scores$Raw_Shift_K >= 2 | interaction_scores$Raw_Shift_K <= -2, ] deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[ !is.na(deletion_enhancers_and_suppressors_K$OrfRep), ] # Write CSV files with updated filenames + fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE) + fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank), + file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE) fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE) fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE) fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE) @@ -239,23 +287,22 @@ publish_interaction_scores <- function(df, out_dir) { fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE) fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE) - return(interaction_scores) # Return the updated data frame with rank columns + return(interaction_scores) # Return the updated data frame with all calculated columns } - - publish_zscores <- function(df, out_dir) { zscores <- df %>% dplyr::mutate( - zscore_L = scale(L, center = TRUE, scale = TRUE), - zscore_K = scale(K, center = TRUE, scale = TRUE), - zscore_r = scale(r, center = TRUE, scale = TRUE), - zscore_AUC = scale(AUC, center = TRUE, scale = TRUE) + zscore_L = scale(Raw_Shift_L, center = TRUE, scale = TRUE), + zscore_K = scale(Raw_Shift_K, center = TRUE, scale = TRUE), + zscore_r = scale(Raw_Shift_r, center = TRUE, scale = TRUE), + zscore_AUC = scale(Raw_Shift_AUC, center = TRUE, scale = TRUE) ) fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE) } + generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) { variables <- c("L", "K", "r", "AUC", "delta_bg") @@ -268,7 +315,7 @@ generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) { # Filter data based on delta background tolerance for Post-QC analysis df_post_qc <- df %>% - mutate(across(all_of(variables), + mutate(across(all_of(variables), ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .))) # Post-QC plots @@ -287,9 +334,6 @@ generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) { }) } - - -# Create rank plots create_rank_plots <- function(interaction_scores, out_dir) { rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank") @@ -319,12 +363,12 @@ create_correlation_plot <- function(interaction_scores, out_dir) { # Generate correlation plots for each pair of variables pairs <- list( - c("mean_L", "mean_K"), - c("mean_L", "mean_r"), - c("mean_L", "mean_AUC"), - c("mean_K", "mean_r"), - c("mean_K", "mean_AUC"), - c("mean_r", "mean_AUC") + c("Raw_Shift_L", "Raw_Shift_K"), + c("Raw_Shift_L", "Raw_Shift_r"), + c("Raw_Shift_L", "Raw_Shift_AUC"), + c("Raw_Shift_K", "Raw_Shift_r"), + c("Raw_Shift_K", "Raw_Shift_AUC"), + c("Raw_Shift_r", "Raw_Shift_AUC") ) lapply(pairs, function(vars) { @@ -347,7 +391,6 @@ create_correlation_plot <- function(interaction_scores, out_dir) { }) } - process_experiment <- function(exp_name, exp_dir, genes, output_dir) { out_dir <- file.path(exp_dir, "zscores") out_dir_qc <- file.path(out_dir, "qc") @@ -367,14 +410,14 @@ process_experiment <- function(exp_name, exp_dir, genes, output_dir) { variables <- c("L", "K", "r", "AUC", "delta_bg") publish_summary_stats(data, variables, out_dir) interaction_scores <- publish_interaction_scores(data, out_dir) - publish_zscores(data, out_dir) + publish_zscores(interaction_scores, out_dir) # Now writing interaction_scores, not original data # Generate rank plots and correlation plots create_rank_plots(interaction_scores, out_dir) create_correlation_plot(interaction_scores, out_dir) output_file <- file.path(out_dir, "zscores_interaction.csv") - fwrite(data, output_file, row.names = FALSE) + fwrite(interaction_scores, output_file, row.names = FALSE) # Write interaction_scores here return(output_file) } @@ -390,6 +433,224 @@ if (length(processed_files) > 1) { merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE) }, processed_files) - combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv") - fwrite(combined_data, combined_output_file, row.names = FALSE) -} \ No newline at end of file +combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv") +fwrite(combined_data, combined_output_file, row.names = FALSE) +} + + + + + + + + + + + +# publish_interaction_scores <- function(df, out_dir) { +# interaction_scores <- df %>% +# dplyr::group_by(OrfRep) %>% +# dplyr::summarize( +# mean_L = mean(L, na.rm = TRUE), +# mean_K = mean(K, na.rm = TRUE), +# mean_r = mean(r, na.rm = TRUE), +# mean_AUC = mean(AUC, na.rm = TRUE), +# delta_bg_mean = mean(delta_bg, na.rm = TRUE), +# delta_bg_sd = sd(delta_bg, na.rm = TRUE) +# ) %>% +# dplyr::mutate( +# l_rank = rank(mean_L), +# k_rank = rank(mean_K), +# r_rank = rank(mean_r), +# auc_rank = rank(mean_AUC) +# ) + +# fwrite(interaction_scores, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE) +# fwrite(dplyr::arrange(interaction_scores, l_rank, k_rank), +# file.path(out_dir, "rf_zscores_interaction_ranked.csv"), row.names = FALSE) + +# # Additional enhancer and suppressor calculations and outputs +# deletion_enhancers_L <- interaction_scores[interaction_scores$mean_L >= 2, ] +# deletion_enhancers_L <- deletion_enhancers_L[!is.na(deletion_enhancers_L$OrfRep), ] + +# deletion_enhancers_K <- interaction_scores[interaction_scores$mean_K <= -2, ] +# deletion_enhancers_K <- deletion_enhancers_K[!is.na(deletion_enhancers_K$OrfRep), ] + +# deletion_suppressors_L <- interaction_scores[interaction_scores$mean_L <= -2, ] +# deletion_suppressors_L <- deletion_suppressors_L[!is.na(deletion_suppressors_L$OrfRep), ] + +# deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ] +# deletion_suppressors_K <- deletion_suppressors_K[!is.na(deletion_suppressors_K$OrfRep), ] + +# deletion_enhancers_and_suppressors_L <- interaction_scores[ +# interaction_scores$mean_L >= 2 | interaction_scores$mean_L <= -2, ] +# deletion_enhancers_and_suppressors_L <- deletion_enhancers_and_suppressors_L[ +# !is.na(deletion_enhancers_and_suppressors_L$OrfRep), ] + +# deletion_enhancers_and_suppressors_K <- interaction_scores[ +# interaction_scores$mean_K >= 2 | interaction_scores$mean_K <= -2, ] +# deletion_enhancers_and_suppressors_K <- deletion_enhancers_and_suppressors_K[ +# !is.na(deletion_enhancers_and_suppressors_K$OrfRep), ] + +# # Write CSV files with updated filenames +# fwrite(deletion_enhancers_L, file.path(out_dir, "deletion_enhancers_l.csv"), row.names = FALSE) +# fwrite(deletion_enhancers_K, file.path(out_dir, "deletion_enhancers_k.csv"), row.names = FALSE) +# fwrite(deletion_suppressors_L, file.path(out_dir, "deletion_suppressors_l.csv"), row.names = FALSE) +# fwrite(deletion_suppressors_K, file.path(out_dir, "deletion_suppressors_k.csv"), row.names = FALSE) +# fwrite(deletion_enhancers_and_suppressors_L, file.path(out_dir, "deletion_enhancers_and_suppressors_l.csv"), row.names = FALSE) +# fwrite(deletion_enhancers_and_suppressors_K, file.path(out_dir, "deletion_enhancers_and_suppressors_k.csv"), row.names = FALSE) + +# return(interaction_scores) # Return the updated data frame with rank columns +# } + + + +# publish_zscores <- function(df, out_dir) { +# zscores <- df %>% +# dplyr::mutate( +# zscore_L = scale(L, center = TRUE, scale = TRUE), +# zscore_K = scale(K, center = TRUE, scale = TRUE), +# zscore_r = scale(r, center = TRUE, scale = TRUE), +# zscore_AUC = scale(AUC, center = TRUE, scale = TRUE) +# ) + +# fwrite(zscores, file.path(out_dir, "zscores_interaction.csv"), row.names = FALSE) +# } + +# generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) { +# variables <- c("L", "K", "r", "AUC", "delta_bg") + +# # Pre-QC plots +# lapply(variables, function(var) { +# if (var %in% colnames(df)) { +# create_and_publish_plot(df, var, "scatter", out_dir_qc) +# } +# }) + +# # Filter data based on delta background tolerance for Post-QC analysis +# df_post_qc <- df %>% +# mutate(across(all_of(variables), +# ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .))) + +# # Post-QC plots +# lapply(variables, function(var) { +# if (var %in% colnames(df_post_qc)) { +# create_and_publish_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc") +# } +# }) + +# # For plots specifically for data above the tolerance threshold +# delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ] +# lapply(variables, function(var) { +# if (var %in% colnames(delta_bg_above_tolerance)) { +# create_and_publish_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance") +# } +# }) +# } + + + +# # Create rank plots +# create_rank_plots <- function(interaction_scores, out_dir) { +# rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank") + +# lapply(rank_vars, function(rank_var) { +# p <- ggplot(interaction_scores, aes(x = !!sym(rank_var))) + +# geom_bar() + +# ggtitle(paste("Rank Distribution for", rank_var)) + +# theme_publication() + +# pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf")) +# pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) +# print(p) +# dev.off() + +# # Generate HTML output +# html_path <- sub(".pdf$", ".html", pdf_path) +# pgg <- suppressWarnings(ggplotly(p) %>% +# layout(legend = list(orientation = "h"))) +# saveWidget(pgg, html_path, selfcontained = TRUE) +# }) +# } + +# create_correlation_plot <- function(interaction_scores, out_dir) { +# # Check for non-finite values and remove them from the dataset +# interaction_scores <- interaction_scores %>% +# filter_all(all_vars(is.finite(.))) + +# # Generate correlation plots for each pair of variables +# pairs <- list( +# c("mean_L", "mean_K"), +# c("mean_L", "mean_r"), +# c("mean_L", "mean_AUC"), +# c("mean_K", "mean_r"), +# c("mean_K", "mean_AUC"), +# c("mean_r", "mean_AUC") +# ) + +# lapply(pairs, function(vars) { +# p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) + +# geom_point() + +# geom_smooth(method = "lm", se = FALSE) + +# ggtitle(paste("Correlation between", vars[1], "and", vars[2])) + +# theme_publication() + +# pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf")) +# pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) +# print(p) +# dev.off() + +# # Generate HTML output +# html_path <- sub(".pdf$", ".html", pdf_path) +# pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>% +# layout(legend = list(orientation = "h"))) +# saveWidget(pgg, html_path, selfcontained = TRUE) +# }) +# } + + +# process_experiment <- function(exp_name, exp_dir, genes, output_dir) { +# out_dir <- file.path(exp_dir, "zscores") +# out_dir_qc <- file.path(out_dir, "qc") +# dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) +# dir.create(out_dir_qc, showWarnings = FALSE) + +# # Load and preprocess the data +# data <- load_and_preprocess_data(args$easy_results_file, genes) + +# # Calculate delta background tolerance +# delta_bg_tolerance <- mean(data$delta_bg, na.rm = TRUE) + 3 * sd(data$delta_bg, na.rm = TRUE) + +# # Generate and publish QC plots (both pre-QC and post-QC) +# generate_and_publish_qc(data, delta_bg_tolerance, out_dir_qc) + +# # Process and publish summary stats, interaction scores, and z-scores +# variables <- c("L", "K", "r", "AUC", "delta_bg") +# publish_summary_stats(data, variables, out_dir) +# interaction_scores <- publish_interaction_scores(data, out_dir) +# publish_zscores(data, out_dir) + +# # Generate rank plots and correlation plots +# create_rank_plots(interaction_scores, out_dir) +# create_correlation_plot(interaction_scores, out_dir) + +# output_file <- file.path(out_dir, "zscores_interaction.csv") +# fwrite(data, output_file, row.names = FALSE) + +# return(output_file) +# } + +# # Process all experiments +# processed_files <- lapply(names(args$experiments), function(exp_name) { +# process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir) +# }) + +# # Combine results from all experiments if multiple experiments exist +# if (length(processed_files) > 1) { +# combined_data <- Reduce(function(x, y) { +# merge(fread(x), fread(y), by = "OrfRep", all = TRUE, allow.cartesian = TRUE) +# }, processed_files) + +# combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv") +# fwrite(combined_data, combined_output_file, row.names = FALSE) +# } \ No newline at end of file