diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 2bbd07f9..e772c7e7 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -5,52 +5,75 @@ suppressMessages({ library(dplyr) library(ggthemes) library(data.table) + library(unix) }) options(warn = 2, max.print = 1000) +options(width = 10000) + +# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes) +soft_limit <- 30 * 1024 * 1024 * 1024 +hard_limit <- 30 * 1024 * 1024 * 1024 +rlimit_as(soft_limit, hard_limit) # Constants for configuration -PLOT_WIDTH <- 14 -PLOT_HEIGHT <- 9 -BASE_SIZE <- 14 +plot_width <- 14 +plot_height <- 9 +base_size <- 14 parse_arguments <- function() { args <- if (interactive()) { c( "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", - 3, "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt", "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", "Experiment 1: Doxo versus HLD", + 3, "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", - "Experiment 2: HLD versus Doxo" + "Experiment 2: HLD versus Doxo", + 3 ) } else { commandArgs(trailingOnly = TRUE) } - paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE) - names <- args[seq(6, length(args), by = 2)] - experiments <- setNames(paths, names) + # Extract paths, names, and standard deviations + paths <- args[seq(4, length(args), by = 3)] + names <- args[seq(5, length(args), by = 3)] + sds <- as.numeric(args[seq(6, length(args), by = 3)]) + + # Normalize paths + normalized_paths <- normalizePath(paths, mustWork = FALSE) + + # Create named list of experiments + experiments <- list() + for (i in seq_along(paths)) { + experiments[[names[i]]] <- list( + path = normalized_paths[i], + sd = sds[i] + ) + } list( - out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), - sd = as.numeric(args[2]), - sgd_gene_list = normalizePath(file.path(args[3]), mustWork = FALSE), - easy_results_file = normalizePath(file.path(args[4]), mustWork = FALSE), + out_dir = normalizePath(args[1], mustWork = FALSE), + sgd_gene_list = normalizePath(args[2], mustWork = FALSE), + easy_results_file = normalizePath(args[3], mustWork = FALSE), experiments = experiments ) } args <- parse_arguments() -dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) -dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) +# Should we keep output in exp dirs or combine in the study output dir? +# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) +# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) # Define themes and scales -theme_publication <- function(base_size = BASE_SIZE, base_family = "sans", legend_position = "bottom") { - theme_foundation(base_size = base_size, base_family = base_family) + +theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") { + theme_foundation <- ggplot2::theme_grey(base_size = base_size, base_family = base_family) + + theme_foundation %+replace% theme( plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), text = element_text(), @@ -86,502 +109,837 @@ scale_colour_publication <- function(...) { )), ...) } -# Load SGD gene list -sgd_genes <- function(sgd_gene_list) { - read.delim(file = sgd_gene_list, quote = "", header = FALSE, - colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% - dplyr::rename(ORF = V4, GeneName = V5) -} - -genes <- sgd_genes(args$sgd_gene_list) - -# Define the adjust_values function -adjust_values <- function(x, delta_bg, tolerance) { - valid_x <- x[x != 0 & !is.na(x)] - max_x <- ifelse(length(valid_x) == 0, NA_real_, max(valid_x, na.rm = TRUE)) +# Load the initial dataframe from the easy_results_file +load_and_process_data <- function(easy_results_file, sd = 3) { + df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - x_adjusted <- ifelse(x == 0 & !is.na(x), max_x, x) - - # Adjust only if delta_bg is valid and above tolerance - x_adjusted <- ifelse(!is.na(delta_bg) & delta_bg >= tolerance, NA_real_, x_adjusted) - - return(x_adjusted) -} - - - -load_and_preprocess_data <- function(easy_results_file, genes) { - df <- - read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - - # Apply further transformations df <- df %>% - filter( - !is.na(ORF) & ORF != "", - !ORF %in% c("Blank"), - !is.na(Gene) & !Gene %in% c("BLANK", "Blank", "blank"), - Drug != "BMH21", - !is.na(Scan), - !is.na(l), !is.na(K), - !is.na(r), !is.na(AUC96) - ) %>% + filter(!(.[[1]] %in% c("", "Scan"))) %>% + filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% + # Rename columns + rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>% mutate( - Col = suppressWarnings(as.numeric(Col)), - Row = suppressWarnings(as.numeric(Row)), - L = suppressWarnings(as.numeric(l)), - K = suppressWarnings(as.numeric(K)), - r = suppressWarnings(as.numeric(r)), - Scan = suppressWarnings(as.numeric(Scan)), - AUC = suppressWarnings(as.numeric(AUC96)), - LstBackgrd = suppressWarnings(as.numeric(LstBackgrd)), - X1stBackgrd = suppressWarnings(as.numeric(X1stBackgrd)), - delta_bg = ifelse(is.na(LstBackgrd) | is.na(X1stBackgrd), NA_real_, LstBackgrd - X1stBackgrd), - delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + 3 * sd(delta_bg, na.rm = TRUE), - OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep), - OrfRepUnique = paste(OrfRep, Gene, Num., sep = "_"), - conc_num = suppressWarnings(as.numeric(gsub("[^0-9\\.]", "", Conc))), - DB = as.integer(delta_bg >= delta_bg_tolerance), + across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric), + delta_bg = last_bg - first_bg, + delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)), + NG = if_else(L == 0 & !is.na(L), 1, 0), + DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0), SM = 0, - NG = as.integer(l == 0 & !is.na(l)), - L_adjusted = adjust_values(L, delta_bg, delta_bg_tolerance), - r_adjusted = adjust_values(r, delta_bg, delta_bg_tolerance), - AUC_adjusted = adjust_values(AUC, delta_bg, delta_bg_tolerance), - K_adjusted = adjust_values(K, delta_bg, delta_bg_tolerance), - max_L_adjusted = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)), - SM = ifelse(!is.na(max_L_adjusted) & L_adjusted == max_L_adjusted, 1, SM) + OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded? + conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), + conc_num_factor = as.numeric(as.factor(conc_num)) - 1 + ) + + return(df) +} + +# Update Gene names using the SGD gene list +update_gene_names <- function(df, sgd_gene_list) { + # Load SGD gene list + genes <- read.delim(file = sgd_gene_list, + quote = "", header = FALSE, + colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) + + # Create a named vector for mapping ORF to GeneName + gene_map <- setNames(genes$V5, genes$V4) + # Vectorized match to find the GeneName from gene_map + mapped_genes <- gene_map[df$ORF] + # Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found) + updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes) + # Ensure Gene is not left blank or incorrectly updated to "OCT1" + df <- df %>% + mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes)) + + return(df) +} + +# Calculate summary statistics for all variables +calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { + df <- df %>% + mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .))) + + summary_stats <- df %>% + group_by(across(all_of(group_vars))) %>% + summarise( + N = sum(!is.na(L)), + across(all_of(variables), list( + mean = ~mean(., na.rm = TRUE), + median = ~median(., na.rm = TRUE), + max = ~max(., na.rm = TRUE), + min = ~min(., na.rm = TRUE), + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) + ), .names = "{.fn}_{.col}") ) - # Efficiently handle the assignment of Gene names - gene_map <- setNames(genes$GeneName, genes$ORF) - df$Gene <- ifelse(df$OrfRep != "YDL227C", gene_map[df$ORF], df$Gene) - df$Gene <- ifelse(df$Gene == "" | df$Gene == "OCT1", df$OrfRep, df$Gene) + df_cleaned <- df %>% + select(-any_of(names(summary_stats))) + + df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars) + + return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) +} + + +calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { + + # Calculate total concentration variables + total_conc_num <- length(unique(df$conc_num)) + num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 + + # Pull the background means and standard deviations from zero concentration + bg_means <- list( + L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(), + K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(), + r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(), + AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() + ) + + bg_sd <- list( + L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(), + K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(), + r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(), + AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() + ) + + interaction_scores <- df %>% + mutate( + WT_L = df$mean_L, + WT_K = df$mean_K, + WT_r = df$mean_r, + WT_AUC = df$mean_AUC, + WT_sd_L = df$sd_L, + WT_sd_K = df$sd_K, + WT_sd_r = df$sd_r, + WT_sd_AUC = df$sd_AUC + ) %>% + group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% + mutate( + N = sum(!is.na(L)), + NG = sum(NG, na.rm = TRUE), + DB = sum(DB, na.rm = TRUE), + SM = sum(SM, na.rm = TRUE), + across(all_of(variables), list( + mean = ~mean(., na.rm = TRUE), + median = ~median(., na.rm = TRUE), + max = ~max(., na.rm = TRUE), + min = ~min(., na.rm = TRUE), + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) + ), .names = "{.fn}_{.col}") + ) %>% + ungroup() + + interaction_scores <- interaction_scores %>% + group_by(across(all_of(group_vars))) %>% + mutate( + Raw_Shift_L = mean_L[[1]] - bg_means$L, + Raw_Shift_K = mean_K[[1]] - bg_means$K, + Raw_Shift_r = mean_r[[1]] - bg_means$r, + Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC, + Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]], + Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]], + Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]], + Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]] + ) + + interaction_scores <- interaction_scores %>% + mutate( + Exp_L = WT_L + Raw_Shift_L, + Delta_L = mean_L - Exp_L, + Exp_K = WT_K + Raw_Shift_K, + Delta_K = mean_K - Exp_K, + Exp_r = WT_r + Raw_Shift_r, + Delta_r = mean_r - Exp_r, + Exp_AUC = WT_AUC + Raw_Shift_AUC, + Delta_AUC = mean_AUC - Exp_AUC + ) + + interaction_scores <- interaction_scores %>% + mutate( + Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), + Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), + Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), + Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), + Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) + ) + + # Calculate linear models and interaction scores + interaction_scores <- interaction_scores %>% + mutate( + lm_L = lm(Delta_L ~ conc_num_factor), + lm_K = lm(Delta_K ~ conc_num_factor), + lm_r = lm(Delta_r ~ conc_num_factor), + lm_AUC = lm(Delta_AUC ~ conc_num_factor), + Zscore_L = Delta_L / WT_sd_L, + Zscore_K = Delta_K / WT_sd_K, + Zscore_r = Delta_r / WT_sd_r, + Zscore_AUC = Delta_AUC / WT_sd_AUC + ) + + interaction_scores <- interaction_scores %>% + mutate( + Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), + Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), + Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), + Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) + ) + + interaction_scores_all <- interaction_scores %>% + mutate( + Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, + Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, + Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), + Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), + lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1], + lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1], + lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1], + lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], + r_squared_L = summary(lm_L)$r.squared, + r_squared_K = summary(lm_K)$r.squared, + r_squared_r = summary(lm_r)$r.squared, + r_squared_AUC = summary(lm_AUC)$r.squared + ) + + # Calculate Z_lm for each variable + interaction_scores_all <- interaction_scores_all %>% + mutate( + Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), + Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), + Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), + Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) + ) + + # Arrange results by Z_lm_L and NG + interaction_scores_all <- interaction_scores_all %>% + arrange(desc(lm_Score_L)) %>% + arrange(desc(NG)) %>% + ungroup() + + return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) +} + +generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) { + + `%||%` <- function(a, b) if (!is.null(a)) a else b + + # Generalized plot generation + plots <- lapply(plot_configs, function(config) { + + df <- config$df + plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var)))) + + # Rank plots with SD annotations + if (config$plot_type == "rank") { + plot <- plot + geom_point(size = 0.1, shape = 3) + + # Adding SD bands + if (!is.null(config$sd_band)) { + for (i in seq_len(config$sd_band)) { + plot <- plot + + annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) + + annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) + + geom_hline(yintercept = c(-i, i), color = "gray") + } + } + + # Optionally add enhancer/suppressor count annotations + if (!is.null(config$enhancer_label)) { + plot <- plot + annotate("text", x = config$enhancer_label$x, + y = config$enhancer_label$y, label = config$enhancer_label$label) + + annotate("text", x = config$suppressor_label$x, + y = config$suppressor_label$y, label = config$suppressor_label$label) + } + } + + # Correlation plots + if (config$plot_type == "correlation") { + plot <- plot + geom_point(shape = 3, color = "gray70") + + geom_smooth(method = "lm", color = "tomato3") + + annotate("text", x = 0, y = 0, label = config$correlation_text) + } + + # General scatter/boxplot/density handling + if (!is.null(config$y_var)) { + plot <- plot + aes(y = !!sym(config$y_var)) + + y_mean_col <- paste0("mean_", config$y_var) + y_sd_col <- paste0("sd_", config$y_var) + + if (config$y_var == "delta_bg" && config$plot_type == "scatter") { + plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") + + geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) + } else if (config$error_bar %||% FALSE) { + plot <- plot + + geom_point(shape = 3, size = 0.2) + + geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) + } + } + + # Plot type selection + plot <- switch(config$plot_type, + "box" = plot + geom_boxplot(), + "density" = plot + geom_density(), + "bar" = plot + geom_bar(stat = "identity"), + plot + geom_point() + geom_smooth(method = "lm", se = FALSE)) + + # Adding y-limits if provided + if (!is.null(config$ylim_vals)) { + plot <- plot + coord_cartesian(ylim = config$ylim_vals) + } + + # Setting legend position, titles, and labels + legend_position <- config$legend_position %||% "bottom" + plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position) + + if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label) + if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label) + + # Adding text annotations if provided + if (!is.null(config$annotations)) { + for (annotation in config$annotations) { + plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label) + } + } + + return(plot) + }) + + # If grid_layout is provided, arrange plots in a grid and save in a single PDF + if (!is.null(grid_layout)) { + pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) + + # Loop through plots in chunks defined by ncol and nrow + for (start_idx in seq(1, length(plots), by = grid_layout$ncol * grid_layout$nrow)) { + end_idx <- min(start_idx + grid_layout$ncol * grid_layout$nrow - 1, length(plots)) + plot_subset <- plots[start_idx:end_idx] + + # Arrange plots in a grid + do.call(grid.arrange, c(plot_subset, ncol = grid_layout$ncol, nrow = grid_layout$nrow)) + } + + dev.off() + + # Save as HTML (optional) + plotly_plots <- lapply(plots, function(plot) { + suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h"))) + }) + combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow, margin = 0.05) + saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, "_grid.html")), selfcontained = TRUE) + + } else { + # Save individual plots to PDF + pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) + lapply(plots, print) + dev.off() + + # Convert plots to plotly and save as HTML + plotly_plots <- lapply(plots, function(plot) { + suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h"))) + }) + combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05) + saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE) + } +} + +generate_interaction_plot_configs <- function(df, variables) { + plot_configs <- list() + + for (variable in variables) { + # Define the y-limits based on the variable being plotted + ylim_vals <- switch(variable, + "L" = c(-65, 65), + "K" = c(-65, 65), + "r" = c(-0.65, 0.65), + "AUC" = c(-6500, 6500) + ) + + # Dynamically generate the column names for standard deviation and delta + wt_sd_col <- paste0("WT_sd_", variable) + delta_var <- paste0("Delta_", variable) + z_shift <- paste0("Z_Shift_", variable) + z_lm <- paste0("Z_lm_", variable) + + # Set annotations for ZShift, Z lm Score, NG, DB, SM + annotations <- list( + list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45, + ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))), + list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25, + ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))), + list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25, + ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)), + list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35, + ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)), + list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45, + ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM)) + ) + + # Add scatter plot configuration for this variable + plot_configs[[length(plot_configs) + 1]] <- list( + df = df, + x_var = "conc_num_factor", + y_var = delta_var, + plot_type = "scatter", + title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]), + ylim_vals = ylim_vals, + annotations = annotations, + error_bar = list( + ymin = 0 - (2 * df[[wt_sd_col]][1]), + ymax = 0 + (2 * df[[wt_sd_col]][1]) + ), + x_breaks = unique(df$conc_num_factor), + x_labels = unique(as.character(df$conc_num)), + x_label = unique(df$Drug[1]) + ) + + # Add box plot configuration for this variable + plot_configs[[length(plot_configs) + 1]] <- list( + df = df, + x_var = "conc_num_factor", + y_var = variable, + plot_type = "box", + title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]), + ylim_vals = ylim_vals, + annotations = annotations, + error_bar = FALSE, # Boxplots typically don't need error bars + x_breaks = unique(df$conc_num_factor), + x_labels = unique(as.character(df$conc_num)), + x_label = unique(df$Drug[1]) + ) + } + + return(plot_configs) +} + +generate_rank_plot_configs <- function(df, rank_var, zscore_var, label_prefix) { + configs <- list() + + for (sd_band in c(1, 2, 3)) { + # Annotated version + configs[[length(configs) + 1]] <- list( + df = df, + x_var = rank_var, + y_var = zscore_var, + plot_type = "rank", + title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"), + sd_band = sd_band, + enhancer_label = list( + x = nrow(df) / 2, y = 10, + label = paste("Deletion Enhancers =", nrow(df[df[[zscore_var]] >= sd_band, ])) + ), + suppressor_label = list( + x = nrow(df) / 2, y = -10, + label = paste("Deletion Suppressors =", nrow(df[df[[zscore_var]] <= -sd_band, ])) + ) + ) + + # Non-annotated version + configs[[length(configs) + 1]] <- list( + df = df, + x_var = rank_var, + y_var = zscore_var, + plot_type = "rank", + title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"), + sd_band = sd_band + ) + } + + return(configs) +} + +generate_correlation_plot_configs <- function(df, lm_list, lm_summaries) { + lapply(seq_along(lm_list), function(i) { + r_squared <- round(lm_summaries[[i]]$r.squared, 3) + list( + x_var = names(lm_list)[i][1], + y_var = names(lm_list)[i][2], + plot_type = "scatter", + title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]), + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared))) + ) + }) +} + +# Adjust missing values and calculate ranks +adjust_missing_and_rank <- function(df, variables) { + + # Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables + df <- df %>% + mutate(across(all_of(variables), list( + Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))), + Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))), + Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))), + Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column()))) + ), .names = "{fn}_{col}")) return(df) } - - - -generate_plot <- function(df, var, plot_type = "scatter", out_dir_qc, suffix = "") { - - # Filter out non-finite values before plotting - df <- df %>% - filter(is.finite(!!sym(var))) - - plot_func <- switch(plot_type, - scatter = geom_point(shape = 3, size = 0.2, position = "jitter"), - boxplot = geom_boxplot()) - - p <- ggplot(df, aes(Scan, !!sym(var), color = as.factor(conc_num))) + - plot_func + - stat_summary(fun = mean, geom = "point", size = 0.6) + - stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") + - ggtitle(paste("Plate analysis by Drug Conc for", var, "before quality control")) + - theme_publication() - - pdf_path <- file.path(out_dir_qc, paste0("plate_analysis_", var, suffix, ".pdf")) - pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) - print(p) - dev.off() - - html_path <- sub(".pdf$", ".html", pdf_path) - pgg <- suppressWarnings(ggplotly(p) %>% - layout(legend = list(orientation = "h"))) - saveWidget(pgg, html_path, selfcontained = TRUE) -} - -generate_and_publish_qc <- function(df, out_dir_qc) { - variables <- c("L", "K", "r", "AUC", "delta_bg") - - # Access delta_bg_tolerance from the dataframe - delta_bg_tolerance <- df$delta_bg_tolerance[1] - - for (var in variables) { - if (var %in% colnames(df)) { - generate_plot(df, var, "scatter", out_dir_qc) - } - } - - df_post_qc <- df %>% - mutate(across(all_of(variables), ~ ifelse(delta_bg >= delta_bg_tolerance, NA, .))) - - for (var in variables) { - if (var %in% colnames(df_post_qc)) { - generate_plot(df_post_qc, var, "scatter", out_dir_qc, suffix = "_after_qc") - } - } - - delta_bg_above_tolerance <- df[df$delta_bg >= delta_bg_tolerance, ] - for (var in variables) { - if (var %in% colnames(delta_bg_above_tolerance)) { - generate_plot(delta_bg_above_tolerance, var, "scatter", out_dir_qc, suffix = "_above_tolerance") - } - } - - df_finite <- df %>% filter(is.finite(delta_bg)) - delta_bg_frequency_plot <- ggplot(df_finite, aes(delta_bg, color = as.factor(conc_num))) + - geom_density() + - ggtitle("Density plot for Delta Background by Conc All Data") + - theme_publication(legend_position = "right") - - delta_bg_bar_plot <- ggplot(df_finite, aes(delta_bg, color = as.factor(conc_num))) + - geom_bar() + - ggtitle("Bar plot for Delta Background by Conc All Data") + - theme_publication(legend_position = "right") - - pdf(file = file.path(out_dir_qc, "frequency_delta_background.pdf"), width = 12, height = 8) - print(delta_bg_frequency_plot) - print(delta_bg_bar_plot) - dev.off() - - saveWidget(ggplotly(delta_bg_frequency_plot), - file = file.path(out_dir_qc, "frequency_delta_background_density.html"), selfcontained = TRUE) - saveWidget(ggplotly(delta_bg_bar_plot), - file = file.path(out_dir_qc, "frequency_delta_background_bar.html"), selfcontained = TRUE) - - df_no_zero <- df %>% filter(L > 0 & is.finite(L)) - - scatter_plots <- list() - box_plots <- list() - for (var in variables) { - if (var %in% colnames(df_no_zero)) { - scatter_plots[[var]] <- ggplot(df_no_zero, aes(Scan, .data[[var]], color = as.factor(conc_num))) + - geom_point(shape = 3, size = 0.2) + - stat_summary(fun = mean, geom = "point", size = 0.6) + - stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar") + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - - box_plots[[var]] <- ggplot(df_no_zero, aes(as.factor(Scan), .data[[var]], color = as.factor(conc_num))) + - geom_boxplot() + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - } - } - - pdf(file = file.path(out_dir_qc, "plate_analysis_no_zeros.pdf"), width = 14, height = 9) - lapply(scatter_plots, print) - dev.off() - - pdf(file = file.path(out_dir_qc, "plate_analysis_no_zeros_boxplots.pdf"), width = 18, height = 9) - lapply(box_plots, print) - dev.off() - - for (var in names(scatter_plots)) { - html_path <- file.path(out_dir_qc, paste0("plate_analysis_no_zeros_", var, ".html")) - - pgg <- suppressWarnings(ggplotly(scatter_plots[[var]]) %>% - layout(legend = list(orientation = "h"))) - saveWidget(pgg, html_path, selfcontained = TRUE) - } - - for (var in names(box_plots)) { - html_path <- file.path(out_dir_qc, paste0("plate_analysis_no_zeros_boxplots_", var, ".html")) - - pgg <- suppressWarnings(ggplotly(box_plots[[var]]) %>% - layout(legend = list(orientation = "h"))) - saveWidget(pgg, html_path, selfcontained = TRUE) - } -} - -compute_experiment_summary_stats <- function(df, out_dir, background_strain = NULL) { - if (!is.null(background_strain)) { - df <- df %>% filter(OrfRep == background_strain) - } - - summary_stats <- df %>% - group_by(OrfRep, conc_num) %>% - summarize( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N) - ) - - if (is.null(background_strain)) { - fwrite(summary_stats, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) - } else { - fwrite(summary_stats, file.path(out_dir, "summary_stats_background_strains.csv"), row.names = FALSE) - } -} - - -filter_and_calculate_sd_k_stats <- function(df, out_dir_qc) { - df_cleaned <- df %>% - filter(is.finite(L_adjusted) & is.finite(K_adjusted)) - - df_within_2SD_K <- df_cleaned %>% - filter(abs(K_adjusted - mean(K_adjusted, na.rm = TRUE)) <= 2 * sd(K_adjusted, na.rm = TRUE)) %>% - group_by(conc_num) %>% - summarize( - N = n(), - mean_L = mean(L_adjusted, na.rm = TRUE), - max_L = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)), - sd_L = sd(L_adjusted, na.rm = TRUE) - ) - fwrite(df_within_2SD_K, file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv")) - - df_outside_2SD_K <- df_cleaned %>% - filter(abs(K_adjusted - mean(K_adjusted, na.rm = TRUE)) > 2 * sd(K_adjusted, na.rm = TRUE)) %>% - group_by(conc_num) %>% - summarize( - N = n(), - mean_L = mean(L_adjusted, na.rm = TRUE), - max_L = ifelse(all(is.na(L_adjusted)), NA_real_, max(L_adjusted, na.rm = TRUE)), - sd_L = sd(L_adjusted, na.rm = TRUE) - ) - fwrite(df_outside_2SD_K, file.path(out_dir_qc, "max_observed_L_vals_for_spots_outside_2SD_K.csv")) - - p_raw_before <- ggplot(df_cleaned, - aes(x = L_adjusted, y = K_adjusted, color = as.factor(conc_num))) + - geom_point() + - ggtitle("Raw L vs K before QC") + - theme_publication(legend_position = "right") - - pdf(file.path(out_dir_qc, "raw_L_vs_K_before_QC.pdf")) - print(p_raw_before) - dev.off() - - saveWidget(ggplotly(p_raw_before), file.path(out_dir_qc, "raw_L_vs_K_before_QC.html")) - - p_above_threshold <- ggplot(df_cleaned %>% filter(delta_bg > delta_bg_tolerance), - aes(x = L_adjusted, y = K_adjusted, color = as.factor(conc_num))) + - geom_point() + - ggtitle("Raw L vs K for strains above delta background threshold") + - theme_publication(legend_position = "right") - - pdf(file.path(out_dir_qc, "raw_L_vs_K_for_strains_above_delta_bg_threshold.pdf")) - print(p_above_threshold) - dev.off() - - saveWidget(ggplotly(p_above_threshold), file.path(out_dir_qc, "raw_L_vs_K_for_strains_above_delta_bg_threshold.html")) -} - -compute_rf_interaction_scores <- function(df_rf, output_dir) { - lm_sd_L <- sd(df_rf$lm_Score_L, na.rm = TRUE) - lm_sd_K <- sd(df_rf$lm_Score_K, na.rm = TRUE) - lm_sd_r <- sd(df_rf$lm_Score_r, na.rm = TRUE) - lm_sd_AUC <- sd(df_rf$lm_Score_AUC, na.rm = TRUE) - - lm_mean_L <- mean(df_rf$lm_Score_L, na.rm = TRUE) - lm_mean_K <- mean(df_rf$lm_Score_K, na.rm = TRUE) - lm_mean_r <- mean(df_rf$lm_Score_r, na.rm = TRUE) - lm_mean_AUC <- mean(df_rf$lm_Score_AUC, na.rm = TRUE) - - df_rf <- df_rf %>% - mutate( - Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L, - Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K, - Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, - Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC - ) - - df_rf <- df_rf %>% - arrange(desc(Z_lm_L), desc(NG)) - - fwrite(df_rf, file.path(output_dir, "rf_zscores_interaction.csv"), row.names = FALSE) - - return(df_rf) -} - -create_rank_plots <- function(interaction_scores_ranks, out_dir) { - rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank") - - lapply(rank_vars, function(rank_var) { - p <- ggplot(interaction_scores_ranks, aes(x = !!sym(rank_var))) + - geom_bar() + - ggtitle(paste("Rank Distribution for", rank_var)) + - theme_publication() +main <- function() { + lapply(names(args$experiments), function(exp_name) { + exp <- args$experiments[[exp_name]] + exp_path <- exp$path + exp_sd <- exp$sd + out_dir <- file.path(exp_path, "zscores") + out_dir_qc <- file.path(exp_path, "zscores", "qc") + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) - pdf_path <- file.path(out_dir, paste0("rank_distribution_", rank_var, ".pdf")) - pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) - print(p) - dev.off() + # Load and process data + df <- load_and_process_data(args$easy_results_file, sd = exp_sd) + df <- update_gene_names(df, args$sgd_gene_list) - html_path <- sub(".pdf$", ".html", pdf_path) - pgg <- suppressWarnings(ggplotly(p) %>% - layout(legend = list(orientation = "h"))) - saveWidget(pgg, html_path, selfcontained = TRUE) + max_conc <- max(df$conc_num_factor) + + # QC steps and filtering + df_above_tolerance <- df %>% filter(DB == 1) + + # Calculate the half-medians for `L` and `K` for rows above tolerance + L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 + K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 + + # Get the number of rows that are above tolerance + rows_to_remove <- nrow(df_above_tolerance) + + # Set L, r, K, and AUC to NA for rows that are above tolerance + df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .))) + + # Calculate summary statistics for all strains, including both background and the deletions + message("Calculating summary statistics for all strains") + variables <- c("L", "K", "r", "AUC") + ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + summary_stats <- ss$summary_stats + df_na_stats <- ss$df_with_stats + write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) + + print("Summary stats:") + print(head(summary_stats), width = 200) + + # Remove rows with 0 values in L + df_no_zeros <- df_na %>% filter(L > 0) + + # Additional filtering for non-finite values + df_na_filtered <- df_na %>% + filter(if_any(c(L), ~ !is.finite(.))) %>% + { + if (nrow(.) > 0) { + message("Removing non-finite rows:\n") + print(head(., n = 10)) + } + df_na %>% filter(if_all(c(L), is.finite)) + } + + # Filter data within and outside 2SD + message("Filtering by 2SD of K") + df_na_within_2sd_k <- df_na_stats %>% + filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) + df_na_outside_2sd_k <- df_na_stats %>% + filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) + + # Summary statistics for within and outside 2SD of K + message("Calculating summary statistics for L within 2SD of K") + # TODO We're omitting the original z_max calculation, not sure if needed? + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + l_within_2sd_k_stats <- ss$summary_stats + df_na_l_within_2sd_k_stats <- ss$df_with_stats + message("Calculating summary statistics for L outside 2SD of K") + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + l_outside_2sd_k_stats <- ss$summary_stats + df_na_l_outside_2sd_k_stats <- ss$df_with_stats + # Write CSV files + write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) + write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) + + # Plots + + # Print quality control graphs before removing data due to contamination and + # adjusting missing data to max theoretical values + l_vs_k_plots <- list( + list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", + title = "Raw L vs K before QC", + color_var = "conc_num", + legend_position = "right" + ) + ) + + above_threshold_plots <- list( + list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter", + title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"), + color_var = "conc_num", + annotations = list( + list( + x = L_half_median, + y = K_half_median, + label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance)) + ) + ), + error_bar = FALSE, + legend_position = "right" + ) + ) + + frequency_delta_bg_plots <- list( + list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density", + title = "Density plot for Delta Background by Conc All Data", + color_var = "conc_num", + x_label = "Delta Background", + y_label = "Density", + error_bar = FALSE, + legend_position = "right" + ), + list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar", + title = "Bar plot for Delta Background by Conc All Data", + color_var = "conc_num", + x_label = "Delta Background", + y_label = "Count", + error_bar = FALSE, + legend_position = "right" + ) + ) + + plate_analysis_plots <- list() + for (plot_type in c("scatter", "box")) { + variables <- c("L", "K", "r", "AUC", "delta_bg") + for (var in variables) { + for (stage in c("before", "after")) { + if (stage == "before") { + df_plot <- df + } else { + df_plot <- df_na # TODO use df_na_filtered if necessary + } + + # Set error_bar = TRUE only for scatter plots + error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) + + # Create the plot configuration + plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type, + title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), + error_bar = error_bar, color_var = "conc_num") + + plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) + } + } + } + + plate_analysis_no_zero_plots <- list() + for (plot_type in c("scatter", "box")) { + variables <- c("L", "K", "r", "AUC", "delta_bg") + for (var in variables) { + + # Set error_bar = TRUE only for scatter plots + error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) + + # Create the plot configuration + plot_config <- list( + df = df_no_zeros, + x_var = "scan", + y_var = var, + plot_type = plot_type, + title = paste("Plate analysis by Drug Conc for", var, "after quality control"), + error_bar = error_bar, + color_var = "conc_num" + ) + plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) + } + } + + l_outside_2sd_k_plots <- list( + list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter", + title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", + color_var = "conc_num", + legend_position = "right" + ) + ) + + delta_bg_outside_2sd_k_plots <- list( + list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter", + title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", + color_var = "conc_num", + legend_position = "right" + ) + ) + + # Generate and save plots for each QC step + message("Generating QC plots") + generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) + generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) + generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) + generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) + generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) + + # Clean up + rm(df, df_above_tolerance, df_no_zeros) + + # TODO: Originally this filtered L NA's + # Let's try to avoid for now since stats have already been calculated + + # Process background strains + bg_strains <- c("YDL227C") + lapply(bg_strains, function(strain) { + + message("Processing background strain: ", strain) + + # Handle missing data by setting zero values to NA + # and then removing any rows with NA in L col + df_bg <- df_na %>% + filter(OrfRep == strain) %>% + mutate( + L = if_else(L == 0, NA, L), + K = if_else(K == 0, NA, K), + r = if_else(r == 0, NA, r), + AUC = if_else(AUC == 0, NA, AUC) + ) %>% + filter(!is.na(L)) + + # Recalculate summary statistics for the background strain + message("Calculating summary statistics for background strain") + ss <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + summary_stats_bg <- ss$summary_stats + df_bg_stats <- ss$df_with_stats + write.csv(summary_stats_bg, + file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), + row.names = FALSE) + + # Filter reference and deletion strains + # Formerly X2_RF (reference strains) + df_reference <- df_na_stats %>% + filter(OrfRep == strain) %>% + mutate(SM = 0) + + # Formerly X2 (deletion strains) + df_deletion <- df_na_stats %>% + filter(OrfRep != strain) %>% + mutate(SM = 0) + + # Set the missing values to the highest theoretical value at each drug conc for L + # Leave other values as 0 for the max/min + reference_strain <- df_reference %>% + group_by(conc_num) %>% + mutate( + max_l_theoretical = max(max_L, na.rm = TRUE), + L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), + SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), + L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% + ungroup() + + # Ditto for deletion strains + deletion_strains <- df_deletion %>% + group_by(conc_num) %>% + mutate( + max_l_theoretical = max(max_L, na.rm = TRUE), + L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), + SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), + L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% + ungroup() + + # Calculate interactions + variables <- c("L", "K", "r", "AUC") + message("Calculating interaction scores") + print("Reference strain:") + print(head(reference_strain)) + reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables) + print("Deletion strains:") + print(head(deletion_strains)) + deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables) + + zscores_calculations_reference <- reference_results$zscores_calculations + zscores_interactions_reference <- reference_results$zscores_interactions + zscores_calculations <- deletion_results$zscores_calculations + zscores_interactions <- deletion_results$zscores_interactions + + # Writing Z-Scores to file + write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE) + write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE) + write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) + write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) + + # Create interaction plots + reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables) + deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables) + generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) + generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) + + # Define conditions for enhancers and suppressors + # TODO Add to study config file? + threshold <- 2 + enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold + suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold + enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold + suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold + + # Subset data + enhancers_L <- zscores_interactions[enhancer_condition_L, ] + suppressors_L <- zscores_interactions[suppressor_condition_L, ] + enhancers_K <- zscores_interactions[enhancer_condition_K, ] + suppressors_K <- zscores_interactions[suppressor_condition_K, ] + + # Save enhancers and suppressors + message("Writing enhancer/suppressor csv files") + write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L.csv"), row.names = FALSE) + write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L.csv"), row.names = FALSE) + write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K.csv"), row.names = FALSE) + write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K.csv"), row.names = FALSE) + + # Combine conditions for enhancers and suppressors + enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ] + enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ] + + # Save combined enhancers and suppressors + write.csv(enhancers_and_suppressors_L, + file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_L.csv"), row.names = FALSE) + write.csv(enhancers_and_suppressors_K, + file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_K.csv"), row.names = FALSE) + + # Handle linear model based enhancers and suppressors + lm_threshold <- 2 + enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ] + suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ] + enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ] + suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ] + + # Save linear model based enhancers and suppressors + message("Writing linear model enhancer/suppressor csv files") + write.csv(enhancers_lm_L, + file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L_lm.csv"), row.names = FALSE) + write.csv(suppressors_lm_L, + file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L_lm.csv"), row.names = FALSE) + write.csv(enhancers_lm_K, + file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K_lm.csv"), row.names = FALSE) + write.csv(suppressors_lm_K, + file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE) + + zscores_interactions_adjusted <- adjust_missing_and_rank(zscores_interactions) + + # Generate all rank plot configurations for L and K + rank_plot_configs <- c( + generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_L", "Avg_Zscore_L", "L"), + generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_K", "Avg_Zscore_K", "K") + ) + + # Generate and save rank plots + generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots", + plot_configs = rank_plot_config, grid_layout = list(ncol = 3, nrow = 2)) + + # # Correlation plots + # lm_list <- list( + # lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered), + # lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered), + # lm(Z_lm_AUC ~ Z_lm_L, data = zscores_interactions_filtered), + # lm(Z_lm_r ~ Z_lm_K, data = zscores_interactions_filtered), + # lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered), + # lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered) + # ) + + lm_summaries <- lapply(lm_list, summary) + correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries) + generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs) + }) }) } - -create_correlation_plot <- function(interaction_scores, out_dir) { - interaction_scores <- interaction_scores %>% - filter_all(all_vars(is.finite(.))) - - pairs <- list( - c("Raw_Shift_L", "Raw_Shift_K"), - c("Raw_Shift_L", "Raw_Shift_r"), - c("Raw_Shift_L", "Raw_Shift_AUC"), - c("Raw_Shift_K", "Raw_Shift_r"), - c("Raw_Shift_K", "Raw_Shift_AUC"), - c("Raw_Shift_r", "Raw_Shift_AUC") - ) - - lapply(pairs, function(vars) { - p <- ggplot(interaction_scores, aes(x = !!sym(vars[1]), y = !!sym(vars[2]))) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - ggtitle(paste("Correlation between", vars[1], "and", vars[2])) + - theme_publication() - - pdf_path <- file.path(out_dir, paste0("correlation_", vars[1], "_", vars[2], ".pdf")) - pdf(pdf_path, width = PLOT_WIDTH, height = PLOT_HEIGHT) - print(p) - dev.off() - - html_path <- sub(".pdf$", ".html", pdf_path) - pgg <- suppressWarnings(ggplotly(p, tooltip = c(vars[1], vars[2])) %>% - layout(legend = list(orientation = "h"))) - saveWidget(pgg, html_path, selfcontained = TRUE) - }) -} - -process_experiment <- function(exp_name, exp_dir, genes, output_dir) { - out_dir <- file.path(exp_dir, "zscores") - out_dir_qc <- file.path(out_dir, "qc") - dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) - dir.create(out_dir_qc, showWarnings = FALSE) - - data <- load_and_preprocess_data(args$easy_results_file, genes) - - # Generate and publish QC plots - generate_and_publish_qc(data, out_dir_qc) - - # Calculate and save summary statistics - compute_experiment_summary_stats(data, out_dir) - compute_experiment_summary_stats(data, out_dir, background_strain = "YDL227C") - - InteractionScores_RF <- data %>% - select(OrfRepUnique) %>% - distinct() %>% - mutate( - Gene = NA, - Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA, - R_Squared_L = NA, Sum_Z_Score_L = NA, Avg_Zscore_L = NA, - Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA, - R_Squared_K = NA, Sum_Z_Score_K = NA, Avg_Zscore_K = NA, - Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA, - R_Squared_r = NA, Sum_Z_Score_r = NA, Avg_Zscore_r = NA, - Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA, - R_Squared_AUC = NA, Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA, - NG = NA, SM = NA - ) - - for (i in seq_along(unique(InteractionScores_RF$OrfRepUnique))) { - Gene_Sel <- unique(InteractionScores_RF$OrfRepUnique)[i] - X_Gene_Sel <- data[data$OrfRepUnique == Gene_Sel,] - - X_stats_interaction <- X_Gene_Sel %>% - group_by(OrfRepUnique, Gene) %>% - summarize( - Raw_Shift_L = mean(L_adjusted, na.rm = TRUE), - Raw_Shift_K = mean(K_adjusted, na.rm = TRUE), - Raw_Shift_r = mean(r_adjusted, na.rm = TRUE), - Raw_Shift_AUC = mean(AUC_adjusted, na.rm = TRUE), - Z_Shift_L = mean(scale(L_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), - Z_Shift_K = mean(scale(K_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), - Z_Shift_r = mean(scale(r_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), - Z_Shift_AUC = mean(scale(AUC_adjusted, center = TRUE, scale = TRUE)[, 1], na.rm = TRUE), - lm_Score_L = coef(lm(L_adjusted ~ delta_bg, data = X_Gene_Sel))[2], - R_Squared_L = summary(lm(L_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared, - lm_Score_K = coef(lm(K_adjusted ~ delta_bg, data = X_Gene_Sel))[2], - R_Squared_K = summary(lm(K_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared, - lm_Score_r = coef(lm(r_adjusted ~ delta_bg, data = X_Gene_Sel))[2], - R_Squared_r = summary(lm(r_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared, - lm_Score_AUC = coef(lm(AUC_adjusted ~ delta_bg, data = X_Gene_Sel))[2], - R_Squared_AUC = summary(lm(AUC_adjusted ~ delta_bg, data = X_Gene_Sel))$r.squared, - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - InteractionScores_RF <- InteractionScores_RF %>% - mutate( - Gene = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Gene[1], Gene), - Raw_Shift_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_L[1], Raw_Shift_L), - Z_Shift_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_L[1], Z_Shift_L), - lm_Score_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_L[1], lm_Score_L), - R_Squared_L = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_L[1], R_Squared_L), - Sum_Z_Score_L = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_L, na.rm = TRUE), Sum_Z_Score_L), - Avg_Zscore_L = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_L, na.rm = TRUE), Avg_Zscore_L), - Raw_Shift_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_K[1], Raw_Shift_K), - Z_Shift_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_K[1], Z_Shift_K), - lm_Score_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_K[1], lm_Score_K), - R_Squared_K = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_K[1], R_Squared_K), - Sum_Z_Score_K = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_K, na.rm = TRUE), Sum_Z_Score_K), - Avg_Zscore_K = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_K, na.rm = TRUE), Avg_Zscore_K), - Raw_Shift_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_r[1], Raw_Shift_r), - Z_Shift_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_r[1], Z_Shift_r), - lm_Score_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_r[1], lm_Score_r), - R_Squared_r = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_r[1], R_Squared_r), - Sum_Z_Score_r = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_r, na.rm = TRUE), Sum_Z_Score_r), - Avg_Zscore_r = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_r, na.rm = TRUE), Avg_Zscore_r), - Raw_Shift_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Raw_Shift_AUC[1], Raw_Shift_AUC), - Z_Shift_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$Z_Shift_AUC[1], Z_Shift_AUC), - lm_Score_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$lm_Score_AUC[1], lm_Score_AUC), - R_Squared_AUC = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$R_Squared_AUC[1], R_Squared_AUC), - Sum_Z_Score_AUC = ifelse(OrfRepUnique == Gene_Sel, sum(X_stats_interaction$Z_Shift_AUC, na.rm = TRUE), Sum_Z_Score_AUC), - Avg_Zscore_AUC = ifelse(OrfRepUnique == Gene_Sel, mean(X_stats_interaction$Z_Shift_AUC, na.rm = TRUE), Avg_Zscore_AUC), - NG = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$NG[1], NG), - DB = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$DB[1], DB), - SM = ifelse(OrfRepUnique == Gene_Sel, X_stats_interaction$SM[1], SM) - ) - } - - fwrite(InteractionScores_RF, file.path(out_dir, "rf_zscores_interaction.csv"), row.names = FALSE) - - generate_and_publish_qc(data, out_dir_qc) - - filter_and_calculate_sd_k_stats(data, out_dir_qc) - - variables <- c("L", "K", "r", "AUC", "delta_bg") - compute_experiment_summary_stats(data, variables, out_dir) - compute_experiment_summary_stats(data, variables, out_dir, background_strain = "YDL227C") - - compute_rf_interaction_scores(InteractionScores_RF, out_dir) - - publish_scores(InteractionScores_RF, out_dir) - - create_rank_plots(InteractionScores_RF, out_dir) - create_correlation_plot(InteractionScores_RF, out_dir) - - output_file <- file.path(out_dir, "zscores_interaction.csv") - fwrite(InteractionScores_RF, output_file, row.names = FALSE) - - return(output_file) -} - -processed_files <- lapply(names(args$experiments), function(exp_name) { - process_experiment(exp_name, args$experiments[[exp_name]], genes, args$out_dir) -}) - -if (length(processed_files) > 1) { - combined_data <- Reduce(function(x, y) { - merge(fread(x), fread(y), by = "OrfRepUnique", all = TRUE) - }, processed_files) - - combined_output_file <- file.path(args$out_dir, "zscores", "zscores_interaction_combined.csv") - fwrite(combined_data, combined_output_file, row.names = FALSE) -} +main() diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores2.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores2.R deleted file mode 100644 index 9f5c82a1..00000000 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores2.R +++ /dev/null @@ -1,1774 +0,0 @@ -suppressMessages({ - library(ggplot2) - library(plotly) - library(htmlwidgets) - library(dplyr) - library(ggthemes) - library(data.table) -}) - -options(warn = 2, max.print = 1000) - -# Constants for configuration -PLOT_WIDTH <- 14 -PLOT_HEIGHT <- 9 -BASE_SIZE <- 14 - -parse_arguments <- function() { - args <- if (interactive()) { - c( - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", - 3, - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", - "Experiment 1: Doxo versus HLD", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", - "Experiment 2: HLD versus Doxo" - ) - } else { - commandArgs(trailingOnly = TRUE) - } - - paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE) - names <- args[seq(6, length(args), by = 2)] - experiments <- setNames(paths, names) - - list( - out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), - sd = as.numeric(args[2]), - sgd_gene_list = normalizePath(file.path(args[3]), mustWork = FALSE), - easy_results_file = normalizePath(file.path(args[4]), mustWork = FALSE), - experiments = experiments - ) -} - -args <- parse_arguments() - -dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) -dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) - -# Define themes and scales -theme_publication <- function(base_size = BASE_SIZE, base_family = "sans", legend_position = "bottom") { - theme_foundation(base_size = base_size, base_family = base_family) + - theme( - plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), - text = element_text(), - panel.background = element_rect(colour = NA), - plot.background = element_rect(colour = NA), - panel.border = element_rect(colour = NA), - axis.title = element_text(face = "bold", size = rel(1)), - axis.title.y = element_text(angle = 90, vjust = 2), - axis.title.x = element_text(vjust = -0.2), - axis.line = element_line(colour = "black"), - panel.grid.major = element_line(colour = "#f0f0f0"), - panel.grid.minor = element_blank(), - legend.key = element_rect(colour = NA), - legend.position = legend_position, - legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"), - plot.margin = unit(c(10, 5, 5, 5), "mm"), - strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), - strip.text = element_text(face = "bold") - ) -} - -scale_fill_publication <- function(...) { - discrete_scale("fill", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -scale_colour_publication <- function(...) { - discrete_scale("colour", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -# Load SGD gene list -sgd_genes <- function(sgd_gene_list) { - read.delim(file = sgd_gene_list, quote = "", header = FALSE, - colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% - dplyr::rename(ORF = V4, GeneName = V5) -} - -genes <- sgd_genes(args$sgd_gene_list) - -# Load the initial dataframe from the easy_results_file -load_and_preprocess_data <- function(easy_results_file, std) { - df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - - # Clean and convert columns to numeric where appropriate - df <- df %>% - filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% - mutate( - Col = as.numeric(Col), - Row = as.numeric(Row), - L = as.numeric(l), - K = as.numeric(K), - r = as.numeric(r), - Scan = as.numeric(Scan), - AUC = as.numeric(AUC96), - LstBackgrd = as.numeric(LstBackgrd), - X1stBackgrd = as.numeric(X1stBackgrd), - OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep), - Conc_Num = as.numeric(str_extract(Conc, "\\-*\\d+\\.*\\d*")), - Conc_Num_Factor = as.numeric(as.factor(Conc_Num)) - 1 - ) - - # Define max concentration factor - max_conc <- max(df$Conc_Num_Factor, na.rm = TRUE) - - return(list(df = df, max_conc = max_conc)) -} - -# Function to update Gene names using the SGD gene list -update_gene_names <- function(df, sgd_gene_list) { - genes <- read.delim(file = sgd_gene_list, quote = "", header = FALSE, colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) - gene_map <- setNames(genes$V5, genes$V4) - - df <- df %>% - rowwise() %>% - mutate(Gene = ifelse(OrfRep != "YDL227C", gene_map[[ORF]], Gene)) %>% - ungroup() %>% - mutate(Gene = ifelse(Gene == "" | Gene == "OCT1", OrfRep, Gene)) - - return(df) -} - -# Apply the functions -result <- load_and_preprocess_data(args$easy_results_file, args$sd) -df <- update_gene_names(result$df, args$sgd_gene_list) -max_conc <- result$max_conc - - -# Function to perform initial calculations and QC before filtering -perform_initial_calculations <- function(df, output_dir_qc, delta_bg_sd_factor) { - - # Add Delta_Backgrd column - df <- df %>% - mutate(Delta_Backgrd = LstBackgrd - X1stBackgrd) - - # Calculate Delta_Background_Tolerance - Delta_Background_Tolerance <- mean(df$Delta_Backgrd, na.rm = TRUE) + - (delta_bg_sd_factor * sd(df$Delta_Backgrd, na.rm = TRUE)) - print(paste("Delta_Background_Tolerance is", Delta_Background_Tolerance)) - - # Save plots to PDF and HTML - plot_and_save <- function(plot_obj, filename, output_dir_qc) { - pdf_path <- file.path(output_dir_qc, paste0(filename, ".pdf")) - html_path <- file.path(output_dir_qc, paste0(filename, ".html")) - pdf(pdf_path, width = 12, height = 8) - print(plot_obj) - dev.off() - pgg <- ggplotly(plot_obj) - saveWidget(pgg, html_path, selfcontained = TRUE) - } - - # Generate and save plate analysis plots - generate_plate_analysis_plots <- function(df, variable, output_dir_qc, delta_bg = FALSE) { - if (!delta_bg) { - plot_name <- paste("Plate analysis by Drug Conc for", variable, "before quality control") - } else { - plot_name <- "Plate analysis by Drug Conc for Delta_Backgrd before quality control" - variable <- "Delta_Backgrd" - } - - p1 <- ggplot(df, aes(Scan, .data[[variable]], color = as.factor(Conc_Num))) + - geom_point(shape = 3, size = 0.2, position = "jitter") + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) + - ggtitle(plot_name) + - theme_publication() - - p2 <- ggplot(df, aes(as.factor(Scan), .data[[variable]], color = as.factor(Conc_Num))) + - geom_boxplot() + - ggtitle(plot_name) + - theme_publication() - - plot_and_save(p1, paste0("Plate_Analysis_", variable), output_dir_qc) - plot_and_save(p2, paste0("Plate_Analysis_", variable, "_Box"), output_dir_qc) - } - - generate_plate_analysis_plots(df, "L", output_dir_qc) - generate_plate_analysis_plots(df, "K", output_dir_qc) - generate_plate_analysis_plots(df, "r", output_dir_qc) - generate_plate_analysis_plots(df, "AUC", output_dir_qc) - generate_plate_analysis_plots(df, "Delta_Backgrd", output_dir_qc, delta_bg = TRUE) - - # Raw L vs K plot before QC - Raw_l_vs_K_beforeQC <- ggplot(df, aes(l, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle("Raw L vs K before QC") + - theme_publication_legend_right() - - plot_and_save(Raw_l_vs_K_beforeQC, "Raw_L_vs_K_beforeQC", output_dir_qc) - - # Subset and plot data above Delta_Background_Tolerance - df_above_tolerance <- df %>% filter(Delta_Backgrd >= Delta_Background_Tolerance) - df_above_tolerance_K_halfmedian <- median(df_above_tolerance$K, na.rm = TRUE) / 2 - df_above_tolerance_L_halfmedian <- median(df_above_tolerance$l, na.rm = TRUE) / 2 - df_above_tolerance_toRemove <- nrow(df_above_tolerance) - - X_Delta_Backgrd_above_Tolerance_L_vs_K <- ggplot(df_above_tolerance, aes(l, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle(paste("Raw L vs K for strains above delta background threshold of", Delta_Background_Tolerance, "or above")) + - annotate("text", x = df_above_tolerance_L_halfmedian, y = df_above_tolerance_K_halfmedian, - label = paste("Strains above delta background tolerance = ", df_above_tolerance_toRemove)) + - theme_publication_legend_right() - - plot_and_save(X_Delta_Backgrd_above_Tolerance_L_vs_K, "Raw_L_vs_K_for_strains_above_deltabackgrd_threshold", output_dir_qc) - - # Frequency and Bar plots for Delta Background - DeltaBackground_Frequency_Plot <- ggplot(df, aes(Delta_Backgrd, color = as.factor(Conc_Num))) + - geom_density() + - ggtitle("Density plot for Delta Background by Conc All Data") + - theme_publication_legend_right() - - DeltaBackground_Bar_Plot <- ggplot(df, aes(Delta_Backgrd, color = as.factor(Conc_Num))) + - geom_bar() + - ggtitle("Bar plot for Delta Background by Conc All Data") + - theme_publication_legend_right() - - plot_and_save(DeltaBackground_Frequency_Plot, "Frequency_Delta_Background", output_dir_qc) - plot_and_save(DeltaBackground_Bar_Plot, "Bar_Delta_Background", output_dir_qc) - - return(df) -} - -# Example usage with df and other parameters from the previous steps -df <- perform_initial_calculations(df, args$out_dir, args$sd) - - -# Identify missing data (No Growth) -df <- df %>% - mutate( - NG = ifelse(L == 0 & !is.na(l), 1, 0), - DB = ifelse(Delta_Backgrd >= Delta_Background_Tolerance, 1, 0) - ) - -# Replace values for l, r, AUC, and K where Delta_Backgrd exceeds tolerance -df <- df %>% - mutate( - L = ifelse(DB == 1, NA, L), - r = ifelse(DB == 1, NA, r), - AUC = ifelse(DB == 1, NA, AUC), - K = ifelse(DB == 1, NA, K) - ) - -# Function to generate and save plate analysis plots after QC -generate_qc_plots <- function(df, var, output_dir_qc) { - plot_scatter <- ggplot(df, aes(Scan, !!sym(var), color = as.factor(Conc_Num))) + - geom_point(shape = 3, size = 0.2) + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - - plot_box <- ggplot(df, aes(as.factor(Scan), !!sym(var), color = as.factor(Conc_Num))) + - geom_boxplot() + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - - plot_and_save(plot_scatter, paste0("Plate_Analysis_", var, "_afterQC"), output_dir_qc) - plot_and_save(plot_box, paste0("Plate_Analysis_", var, "_Box_afterQC"), output_dir_qc) -} - -# Generate and save QC plots for l, K, r, AUC, and Delta_Backgrd after QC -variables <- c("L", "K", "r", "AUC", "Delta_Backgrd") -lapply(variables, generate_qc_plots, df = df, output_dir_qc = args$out_dir) - -# Remove rows where l is zero for additional QC plots -df_noZero <- df %>% filter(L > 0) - -lapply(variables, generate_qc_plots, df = df_noZero, output_dir_qc = args$out_dir) - -# Print plate analysis data before and after QC to PDFs -pdf(file = file.path(args$out_dir, "Plate_Analysis.pdf"), width = 14, height = 9) -print(Plate_Analysis_L) -print(Plate_Analysis_L_afterQC) -print(Plate_Analysis_K) -print(Plate_Analysis_K_afterQC) -print(Plate_Analysis_r) -print(Plate_Analysis_r_afterQC) -print(Plate_Analysis_AUC) -print(Plate_Analysis_AUC_afterQC) -print(Plate_Analysis_Delta_Backgrd) -print(Plate_Analysis_Delta_Backgrd_afterQC) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_Boxplots.pdf"), width = 18, height = 9) -print(Plate_Analysis_L_Box) -print(Plate_Analysis_L_Box_afterQC) -print(Plate_Analysis_K_Box) -print(Plate_Analysis_K_Box_afterQC) -print(Plate_Analysis_r_Box) -print(Plate_Analysis_r_Box_afterQC) -print(Plate_Analysis_AUC_Box) -print(Plate_Analysis_AUC_Box_afterQC) -print(Plate_Analysis_Delta_Backgrd_Box) -print(Plate_Analysis_Delta_Backgrd_Box_afterQC) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_noZeros.pdf"), width = 14, height = 9) -print(Plate_Analysis_L_afterQC_Z) -print(Plate_Analysis_K_afterQC_Z) -print(Plate_Analysis_r_afterQC_Z) -print(Plate_Analysis_AUC_afterQC_Z) -print(Plate_Analysis_Delta_Backgrd_afterQC_Z) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_noZeros_Boxplots.pdf"), width = 18, height = 9) -print(Plate_Analysis_L_Box_afterQC_Z) -print(Plate_Analysis_K_Box_afterQC_Z) -print(Plate_Analysis_r_Box_afterQC_Z) -print(Plate_Analysis_AUC_Box_afterQC_Z) -print(Plate_Analysis_Delta_Backgrd_Box_afterQC_Z) -dev.off() - -# Clean up -rm(df_noZero) - -# Calculate and save summary statistics for all strains -summary_stats <- df %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1) - ) - -write.csv(summary_stats, file = file.path(args$out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - -# Background Strains -Background_Strains <- c("YDL227C") - -# Iterate over each background strain -for (s in Background_Strains) { - # Subset the dataframe for the current background strain - df_background <- df %>% filter(OrfRep == s) - - # Handle missing data by setting zero values to NA - if (sum(df_background$l == 0, na.rm = TRUE) > 0) { - df_background <- df_background %>% - mutate( - L = ifelse(L == 0, NA, L), - K = ifelse(K == 0, NA, K), - r = ifelse(r == 0, NA, r), - AUC = ifelse(AUC == 0, NA, AUC) - ) - } - - # Remove rows where l is NA after the adjustment - df_background <- df_background %>% filter(!is.na(L)) - - # Calculate summary statistics for L, K, r, and AUC - df_stats_by_L <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1) - ) - - max_sd_L <- max(df_stats_by_L$sd_L, na.rm = TRUE) - - df_stats_by_K <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1) - ) - - max_sd_K <- max(df_stats_by_K$sd_K, na.rm = TRUE) - - df_stats_by_r <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1) - ) - - max_sd_r <- max(df_stats_by_r$sd_r, na.rm = TRUE) - - df_stats_by_AUC <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1) - ) - - max_sd_AUC <- max(df_stats_by_AUC$sd_AUC, na.rm = TRUE) - - # Combine all summary statistics into one dataframe - df_stats_by <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1) - ) - - # Save the summary statistics for the background strain - write.csv(df_stats_by, file = file.path(args$out_dir, "SummaryStats_BackgroundStrains.csv"), row.names = FALSE) - - # Calculate max theoretical L values within 2SD of the reference strain - df_within_2SD_K <- data.frame() - df_outside_2SD_K <- data.frame() - - for (q in unique(df$Conc_Num_Factor)) { - if (q == 0) { - df_within_2SD_K <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_stats_TEMP_K <- df_stats_by_K %>% filter(Conc_Num_Factor == q) - df_within_2SD_K <- df_within_2SD_K %>% - filter(K >= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) & - K <= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - - df_outside_2SD_K <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_outside_2SD_K <- df_outside_2SD_K %>% - filter(K <= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) | - K >= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - } else { - df_within_2SD_K_temp <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_stats_TEMP_K <- df_stats_by_K %>% filter(Conc_Num_Factor == q) - df_within_2SD_K_temp <- df_within_2SD_K_temp %>% - filter(K >= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) & - K <= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - - df_within_2SD_K <- bind_rows(df_within_2SD_K, df_within_2SD_K_temp) - - df_outside_2SD_K_temp <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_outside_2SD_K_temp <- df_outside_2SD_K_temp %>% - filter(K <= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) | - K >= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - - df_outside_2SD_K <- bind_rows(df_outside_2SD_K, df_outside_2SD_K_temp) - } - } - - # Calculate summary statistics for L within and outside 2SD of K - df_stats_by_L_within_2SD_K <- df_within_2SD_K %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - z_max = (max_L - mean_L) / sd_L - ) - - max_sd_within_2SD_K <- max(df_stats_by_L_within_2SD_K$sd_L, na.rm = TRUE) - - write.csv(df_stats_by_L_within_2SD_K, file = file.path(args$out_dir, "Max_Observed_L_Vals_for_spots_within_2SD_K.csv"), row.names = FALSE) - - df_stats_by_L_outside_2SD_K <- df_outside_2SD_K %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1) - ) - - max_sd_outside_2SD_K <- max(df_stats_by_L_outside_2SD_K$sd_L, na.rm = TRUE) - - # Plot and save L vs K for strains outside 2SD of K - plot_L_vs_K_outside_2SD_K <- ggplot(df_outside_2SD_K, aes(L, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle("Raw L vs K for strains falling outside 2SD of the K mean at each conc") + - theme_publication_legend_right() - - pdf(file = file.path(args$out_dir, "Raw_L_vs_K_for_strains_2SD_outside_mean_K.pdf"), width = 10, height = 8) - print(plot_L_vs_K_outside_2SD_K) - dev.off() - - saveWidget(ggplotly(plot_L_vs_K_outside_2SD_K), file = file.path(args$out_dir, "RawL_vs_K_for_strains_outside_2SD_K.html"), selfcontained = TRUE) - - # Plot and save DeltaBackground vs K for strains outside 2SD of K - plot_DeltaBackground_vs_K_outside_2SD_K <- ggplot(df_outside_2SD_K, aes(Delta_Backgrd, K, color = as.factor(Conc_Num))) + - geom_point(aes(L = l, ORF = ORF, Gene = Gene), shape = 3, position = "jitter") + - ggtitle("DeltaBackground vs K for strains falling outside 2SD of the K mean at each conc") + - theme_publication_legend_right() - - pdf(file = file.path(args$out_dir, "DeltaBackground_vs_K_for_strains_2SD_outside_mean_K.pdf"), width = 10, height = 8) - print(plot_DeltaBackground_vs_K_outside_2SD_K) - dev.off() - - saveWidget(ggplotly(plot_DeltaBackground_vs_K_outside_2SD_K), file = file.path(args$out_dir, "DeltaBackground_vs_K_for_strains_outside_2SD_K.html"), selfcontained = TRUE) - - # Get the background strain mean values at the no-drug concentration (Conc_Num_Factor = 0) - background_mean_L <- df_stats_by_L %>% filter(Conc_Num_Factor == 0) %>% pull(mean_L) - background_mean_K <- df_stats_by_K %>% filter(Conc_Num_Factor == 0) %>% pull(mean_K) - background_mean_r <- df_stats_by_r %>% filter(Conc_Num_Factor == 0) %>% pull(mean_r) - background_mean_AUC <- df_stats_by_AUC %>% filter(Conc_Num_Factor == 0) %>% pull(mean_AUC) - - # Initialize empty plots (placeholder for future plotting) - p_l <- ggplot() - p_K <- ggplot() - p_r <- ggplot() - p_AUC <- ggplot() - p_rf_l <- ggplot() - p_rf_K <- ggplot() - p_rf_r <- ggplot() - p_rf_AUC <- ggplot() - - # Get only the deletion strains (excluding the background strain) - df_deletion_strains <- df %>% filter(OrfRep != s) - - # Initialize SM (Set to Max) column - df_deletion_strains <- df_deletion_strains %>% mutate(SM = 0) - - # Set missing values to the highest theoretical value at each drug concentration for L - df_deletion_strains_new <- data.frame() - - for (i in seq_along(unique(df_deletion_strains$Conc_Num))) { - concentration <- unique(df_deletion_strains$Conc_Num)[i] - df_temp <- df_deletion_strains %>% filter(Conc_Num == concentration) - - if (concentration == 0) { - df_deletion_strains_new <- df_temp - message(paste("Check loop order, conc =", concentration)) - } else { - max_L_theoretical <- df_stats_by_L_within_2SD_K %>% filter(Conc_Num_Factor == concentration) %>% pull(max_L) - - df_temp <- df_temp %>% - mutate( - L = ifelse(L == 0 & !is.na(L), max_L_theoretical, L), - SM = ifelse(L >= max_L_theoretical & !is.na(L), 1, SM), - L = ifelse(L >= max_L_theoretical & !is.na(L), max_L_theoretical, L) - ) - - df_deletion_strains_new <- bind_rows(df_deletion_strains_new, df_temp) - message(paste("Check loop order, conc =", concentration)) - } - } - - df_deletion_strains <- df_deletion_strains_new - - # Get only the reference strains (background strain) - df_reference_strains <- df %>% filter(OrfRep == s) - - # Initialize SM (Set to Max) column - df_reference_strains <- df_reference_strains %>% mutate(SM = 0) - - # Set missing values to the highest theoretical value at each drug concentration for L - df_reference_strains_new <- data.frame() - - for (i in seq_along(unique(df_reference_strains$Conc_Num))) { - concentration <- unique(df_reference_strains$Conc_Num)[i] - df_rf_temp <- df_reference_strains %>% filter(Conc_Num == concentration) - - if (concentration == 0) { - df_reference_strains_new <- df_rf_temp - message(paste("Check loop order, conc =", concentration)) - } else { - max_L_theoretical <- df_stats_by_L_within_2SD_K %>% filter(Conc_Num_Factor == concentration) %>% pull(max_L) - - df_rf_temp <- df_rf_temp %>% - mutate( - L = ifelse(L == 0 & !is.na(L), max_L_theoretical, L), - SM = ifelse(L >= max_L_theoretical & !is.na(L), 1, SM), - L = ifelse(L >= max_L_theoretical & !is.na(L), max_L_theoretical, L) - ) - - df_reference_strains_new <- bind_rows(df_reference_strains_new, df_rf_temp) - message(paste("Check loop order, if error, refs have no L values outside theoretical max L, for REFs, conc =", concentration)) - } - } - - df_reference_strains <- df_reference_strains_new - - df_RF <- df_RF_new - - # Get the RF Z score values - - # Change the OrfRep column to include the RF strain, Gene name, and Num so each RF gets its own score - df_RF <- df_RF %>% - mutate(OrfRep = paste(OrfRep, Gene, Num, sep = "_")) - - num_genes_RF <- length(unique(df_RF$OrfRep)) - print(num_genes_RF) - - # Create the output dataframe containing columns for each RF strain - interaction_scores_RF <- df_RF %>% - distinct(OrfRep) %>% - mutate( - Gene = NA, - Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA, R_Squared_L = NA, Sum_Z_Score_L = NA, Avg_Zscore_L = NA, - Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA, R_Squared_K = NA, Sum_Z_Score_K = NA, Avg_Zscore_K = NA, - Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA, R_Squared_r = NA, Sum_Z_Score_r = NA, Avg_Zscore_r = NA, - Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA, R_Squared_AUC = NA, Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA, - NG = NA, SM = NA - ) - - for (i in seq_len(num_genes_RF)) { - # Get each deletion strain ORF - gene_sel <- unique(df_RF$OrfRep)[i] - - # Extract only the current deletion strain and its data - df_gene_sel <- df_RF %>% filter(OrfRep == gene_sel) - - # Calculate summary statistics for the selected gene - df_stats_interaction <- df_gene_sel %>% - group_by(OrfRep, Gene, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Get shift values - if (is.na(df_stats_interaction$mean_L[1]) || df_stats_interaction$mean_L[1] == 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = 0, Z_Shift_L = 0, - Raw_Shift_K = 0, Z_Shift_K = 0, - Raw_Shift_r = 0, Z_Shift_r = 0, - Raw_Shift_AUC = 0, Z_Shift_AUC = 0 - ) - } else { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = mean_L[1] - background_mean_L, - Z_Shift_L = Raw_Shift_L / df_stats_by_L$sd[1], - Raw_Shift_K = mean_K[1] - background_mean_K, - Z_Shift_K = Raw_Shift_K / df_stats_by_K$sd[1], - Raw_Shift_r = mean_r[1] - background_mean_r, - Z_Shift_r = Raw_Shift_r / df_stats_by_r$sd[1], - Raw_Shift_AUC = mean_AUC[1] - background_mean_AUC, - Z_Shift_AUC = Raw_Shift_AUC / df_stats_by_AUC$sd[1] - ) - } - - # Add wild-type (WT) values and standard deviations - df_stats_interaction <- df_stats_interaction %>% - mutate( - WT_l = df_stats_by_L$mean, WT_sd_l = df_stats_by_L$sd, - WT_K = df_stats_by_K$mean, WT_sd_K = df_stats_by_K$sd, - WT_r = df_stats_by_r$mean, WT_sd_r = df_stats_by_r$sd, - WT_AUC = df_stats_by_AUC$mean, WT_sd_AUC = df_stats_by_AUC$sd - ) - - # Calculate scores if there is growth at no drug concentration - if (df_stats_interaction$mean_L[1] != 0 && !is.na(df_stats_interaction$mean_L[1])) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Exp_L = WT_l + Raw_Shift_L, - Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, - Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, - Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, - Delta_AUC = mean_AUC - Exp_AUC, - Zscore_L = Delta_L / WT_sd_l, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - # Handle no growth (NG) and set to max (SM) values - if (sum(df_stats_interaction$NG, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Delta_L = ifelse(NG == 1, mean_L - WT_l, Delta_L), - Delta_K = ifelse(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = ifelse(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = ifelse(NG == 1, mean_AUC - WT_AUC, Delta_AUC) - ) - } - - if (sum(df_stats_interaction$SM, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(Delta_L = ifelse(SM == 1, mean_L - WT_l, Delta_L)) - } - - # Calculate linear models - gene_lm_L <- lm(Delta_L ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_K <- lm(Delta_K ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_r <- lm(Delta_r ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_AUC <- lm(Delta_AUC ~ Conc_Num_Factor, data = df_stats_interaction) - - # Calculate interaction scores and R-squared values - gene_interaction_L <- max_conc * coef(gene_lm_L)[2] + coef(gene_lm_L)[1] - r_squared_l <- summary(gene_lm_L)$r.squared - gene_interaction_K <- max_conc * coef(gene_lm_K)[2] + coef(gene_lm_K)[1] - r_squared_K <- summary(gene_lm_K)$r.squared - gene_interaction_r <- max_conc * coef(gene_lm_r)[2] + coef(gene_lm_r)[1] - r_squared_r <- summary(gene_lm_r)$r.squared - gene_interaction_AUC <- max_conc * coef(gene_lm_AUC)[2] + coef(gene_lm_AUC)[1] - r_squared_AUC <- summary(gene_lm_AUC)$r.squared - - # Total non-removed concentrations - num_non_removed_conc <- Total_Conc_Nums - sum(df_stats_interaction$DB, na.rm = TRUE) - 1 - - # Report the scores - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_L[1]), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, df_stats_interaction$Z_Shift_L[1]), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, gene_interaction_L), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, r_squared_l), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE)), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_K[1]), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, df_stats_interaction$Z_Shift_K[1]), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, gene_interaction_K), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, r_squared_K), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE)), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_r[1]), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, df_stats_interaction$Z_Shift_r[1]), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, gene_interaction_r), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, r_squared_r), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE)), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_AUC[1]), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Z_Shift_AUC[1]), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, gene_interaction_AUC), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, r_squared_AUC), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE)), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE) / num_non_removed_conc) - ) - } else { - # Handle case where mean_L is 0 or NA - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, NA), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, NA), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, NA), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, NA), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, NA), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, NA), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, NA), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, NA), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, NA), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, NA), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, NA), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, NA), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, NA), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, NA), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, NA), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, NA), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, NA), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, NA), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, NA), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, NA), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, NA), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, NA), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, NA), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, NA) - ) - } - - # Append the interaction statistics for all RFs - if (i == 1) { - df_stats_interaction_all_RF <- df_stats_interaction - } else { - df_stats_interaction_all_RF <- bind_rows(df_stats_interaction_all_RF, df_stats_interaction) - } - - # Add NG, DB, and SM values to the InteractionScores_RF dataframe - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - NG = replace(NG, OrfRep == gene_sel, sum(df_stats_interaction$NG, na.rm = TRUE)), - DB = replace(DB, OrfRep == gene_sel, sum(df_stats_interaction$DB, na.rm = TRUE)), - SM = replace(SM, OrfRep == gene_sel, sum(df_stats_interaction$SM, na.rm = TRUE)) - ) - } - - print("Pass RF Calculation loop") - - # Calculate summary statistics for the linear models - lm_sd_L <- sd(interaction_scores_RF$lm_Score_L, na.rm = TRUE) - lm_sd_K <- sd(interaction_scores_RF$lm_Score_K, na.rm = TRUE) - lm_sd_r <- sd(interaction_scores_RF$lm_Score_r, na.rm = TRUE) - lm_sd_AUC <- sd(interaction_scores_RF$lm_Score_AUC, na.rm = TRUE) - - lm_mean_L <- mean(interaction_scores_RF$lm_Score_L, na.rm = TRUE) - lm_mean_K <- mean(interaction_scores_RF$lm_Score_K, na.rm = TRUE) - lm_mean_r <- mean(interaction_scores_RF$lm_Score_r, na.rm = TRUE) - lm_mean_AUC <- mean(interaction_scores_RF$lm_Score_AUC, na.rm = TRUE) - - print(paste("Mean RF linear regression score L:", lm_mean_L)) - - # Calculate Z scores for the linear models - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L, - Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K, - Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, - Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC - ) - - # Sort the dataframe by Z_lm_L and NG (No Growth) - interaction_scores_RF <- interaction_scores_RF %>% - arrange(desc(Z_lm_L), desc(NG)) - - write.csv(interaction_scores_RF, file = file.path(output_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) - - # Generate ggplot objects for each RF strain - for (i in seq_len(num_genes_RF)) { - gene_sel <- unique(interaction_scores_RF$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all_RF %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores_RF %>% filter(OrfRep == gene_sel) - - p_rf_l[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_L)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_l), ymax = 0 + (2 * WT_sd_l)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_L, 2))) + - annotate("text", x = 1, y = 25, label = paste("lm Zscore =", round(df_int_scores$Z_lm_L, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-60, -50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50, 60)) + - theme_publication() - - p_rf_K[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_K)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_K), ymax = 0 + (2 * WT_sd_K)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_K, 2))) + - annotate("text", x = 1, y = 25, label = paste("lm Zscore =", round(df_int_scores$Z_lm_K, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-60, -50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50, 60)) + - theme_publication() - - p_rf_r[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_r)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-0.65, 0.65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_r), ymax = 0 + (2 * WT_sd_r)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 0.45, label = paste("ZShift =", round(df_int_scores$Z_Shift_r, 2))) + - annotate("text", x = 1, y = 0.25, label = paste("lm Zscore =", round(df_int_scores$Z_lm_r, 2))) + - annotate("text", x = 1, y = -0.25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -0.35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -0.45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6)) + - theme_publication() - - p_rf_AUC[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_AUC)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-6500, 6500)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_AUC), ymax = 0 + (2 * WT_sd_AUC)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 4500, label = paste("ZShift =", round(df_int_scores$Z_Shift_AUC, 2))) + - annotate("text", x = 1, y = 2500, label = paste("lm Zscore =", round(df_int_scores$Z_lm_AUC, 2))) + - annotate("text", x = 1, y = -2500, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -3500, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -4500, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-6000, -5000, -4000, -3000, -2000, -1000, 0, 1000, 2000, 3000, 4000, 5000, 6000)) + - theme_publication() - - # Append the final interaction statistics for all RFs - if (i == 1) { - df_stats_interaction_all_RF_final <- df_z_calculations - } else { - df_stats_interaction_all_RF_final <- bind_rows(df_stats_interaction_all_RF_final, df_z_calculations) - } - } - - print("Pass RF ggplot loop") - - # Save the final interaction statistics - write.csv(df_stats_interaction_all_RF_final, file = file.path(output_dir, "RF_ZScore_Calculations.csv"), row.names = FALSE) - - ###### Part 5 - Get Z-scores for Gene Deletion Strains - - # Get the total number of genes for the loop - num_genes <- length(unique(df_deletion$OrfRep)) - print(num_genes) - - # Create the output dataframe containing columns for each deletion strain - interaction_scores_deletion <- unique(df_deletion["OrfRep"]) - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = NA, - Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA, R_Squared_L = NA, - Sum_Z_Score_L = NA, Avg_Zscore_L = NA, - Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA, R_Squared_K = NA, - Sum_Z_Score_K = NA, Avg_Zscore_K = NA, - Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA, R_Squared_r = NA, - Sum_Z_Score_r = NA, Avg_Zscore_r = NA, - Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA, R_Squared_AUC = NA, - Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA, - NG = NA, DB = NA, SM = NA - ) - - for (i in seq_len(num_genes)) { - gene_sel <- unique(df_deletion$OrfRep)[i] - df_gene_sel <- df_deletion %>% filter(OrfRep == gene_sel) - - df_stats_interaction <- df_gene_sel %>% - group_by(OrfRep, Gene, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = length(L), mean_L = mean(L, na.rm = TRUE), median_L = median(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), median_K = median(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), median_r = median(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), median_AUC = median(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), se_AUC = sd_AUC / sqrt(N - 1), - NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE) - ) %>% ungroup() - - if (is.na(df_stats_interaction$mean_L[1]) || df_stats_interaction$mean_L[1] == 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = 0, Raw_Shift_K = 0, Raw_Shift_r = 0, Raw_Shift_AUC = 0, - Z_Shift_L = 0, Z_Shift_K = 0, Z_Shift_r = 0, Z_Shift_AUC = 0 - ) - } else { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = mean_L[1] - background_L, Raw_Shift_K = mean_K[1] - background_K, - Raw_Shift_r = mean_r[1] - background_r, Raw_Shift_AUC = mean_AUC[1] - background_AUC, - Z_Shift_L = Raw_Shift_L[1] / df_stats_BY_L$sd[1], - Z_Shift_K = Raw_Shift_K[1] / df_stats_BY_K$sd[1], - Z_Shift_r = Raw_Shift_r[1] / df_stats_BY_r$sd[1], - Z_Shift_AUC = Raw_Shift_AUC[1] / df_stats_BY_AUC$sd[1] - ) - } - - df_stats_interaction <- df_stats_interaction %>% - mutate( - WT_l = df_stats_BY_L$mean, WT_K = df_stats_BY_K$mean, - WT_r = df_stats_BY_r$mean, WT_AUC = df_stats_BY_AUC$mean, - WT_sd_l = df_stats_BY_L$sd, WT_sd_K = df_stats_BY_K$sd, - WT_sd_r = df_stats_BY_r$sd, WT_sd_AUC = df_stats_BY_AUC$sd - ) - - if (df_stats_interaction$mean_L[1] != 0 && !is.na(df_stats_interaction$mean_L[1])) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Exp_L = WT_l + Raw_Shift_L, Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, Delta_AUC = mean_AUC - Exp_AUC - ) - - if (sum(df_stats_interaction$NG, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Delta_L = if_else(NG == 1, mean_L - WT_l, Delta_L), - Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC) - ) - } - - if (sum(df_stats_interaction$SM, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(Delta_L = if_else(SM == 1, mean_L - WT_l, Delta_L)) - } - - df_stats_interaction <- df_stats_interaction %>% - mutate( - Zscore_L = Delta_L / WT_sd_l, Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - gene_lm_L <- lm(Delta_L ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_K <- lm(Delta_K ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_r <- lm(Delta_r ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_AUC <- lm(Delta_AUC ~ Conc_Num_Factor, data = df_stats_interaction) - - gene_interaction_L <- max_conc * coef(gene_lm_L)[2] + coef(gene_lm_L)[1] - gene_interaction_K <- max_conc * coef(gene_lm_K)[2] + coef(gene_lm_K)[1] - gene_interaction_r <- max_conc * coef(gene_lm_r)[2] + coef(gene_lm_r)[1] - gene_interaction_AUC <- max_conc * coef(gene_lm_AUC)[2] + coef(gene_lm_AUC)[1] - - r_squared_l <- summary(gene_lm_L)$r.squared - r_squared_K <- summary(gene_lm_K)$r.squared - r_squared_r <- summary(gene_lm_r)$r.squared - r_squared_AUC <- summary(gene_lm_AUC)$r.squared - - num_non_removed_conc <- total_conc_nums - sum(df_stats_interaction$DB, na.rm = TRUE) - 1 - - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = replace(Gene, OrfRep == gene_sel, df_gene_sel$Gene[1]), - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_L[1]), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, df_stats_interaction$Z_Shift_L[1]), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, gene_interaction_L), - Z_lm_L = replace(Z_lm_L, OrfRep == gene_sel, (gene_interaction_L - lm_mean_L) / lm_sd_L), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, r_squared_l), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE)), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_K[1]), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, df_stats_interaction$Z_Shift_K[1]), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, gene_interaction_K), - Z_lm_K = replace(Z_lm_K, OrfRep == gene_sel, (gene_interaction_K - lm_mean_K) / lm_sd_K), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, r_squared_K), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE)), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_r[1]), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, df_stats_interaction$Z_Shift_r[1]), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, gene_interaction_r), - Z_lm_r = replace(Z_lm_r, OrfRep == gene_sel, (gene_interaction_r - lm_mean_r) / lm_sd_r), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, r_squared_r), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE)), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE) / (total_conc_nums - 1)), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_AUC[1]), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Z_Shift_AUC[1]), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, gene_interaction_AUC), - Z_lm_AUC = replace(Z_lm_AUC, OrfRep == gene_sel, (gene_interaction_AUC - lm_mean_AUC) / lm_sd_AUC), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, r_squared_AUC), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE)), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE) / (total_conc_nums - 1)) - ) - } else { - # Similar logic for when mean_L is 0 or NA, setting relevant variables to NA or appropriate values - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = replace(Gene, OrfRep == gene_sel, df_gene_sel$Gene[1]), - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, NA), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, NA), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, NA), - Z_lm_L = replace(Z_lm_L, OrfRep == gene_sel, NA), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, NA), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, NA), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, NA), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, NA), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, NA), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, NA), - Z_lm_K = replace(Z_lm_K, OrfRep == gene_sel, NA), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, NA), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, NA), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, NA), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, NA), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, NA), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, NA), - Z_lm_r = replace(Z_lm_r, OrfRep == gene_sel, NA), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, NA), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, NA), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, NA), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, NA), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, NA), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, NA), - Z_lm_AUC = replace(Z_lm_AUC, OrfRep == gene_sel, NA), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, NA), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, NA), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, NA) - ) - } - - if (i == 1) { - df_stats_interaction_all <- df_stats_interaction - } else { - df_stats_interaction_all <- bind_rows(df_stats_interaction_all, df_stats_interaction) - } - - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - NG = replace(NG, OrfRep == gene_sel, sum(df_stats_interaction$NG, na.rm = TRUE)), - DB = replace(DB, OrfRep == gene_sel, sum(df_stats_interaction$DB, na.rm = TRUE)), - SM = replace(SM, OrfRep == gene_sel, sum(df_stats_interaction$SM, na.rm = TRUE)) - ) - } - - print("Pass Int Calculation loop") - - # Order the interaction scores by Z_lm_L and NG - interaction_scores_deletion <- interaction_scores_deletion %>% - arrange(desc(Z_lm_L)) %>% - arrange(desc(NG)) - - # Save the interaction scores and filtered sets for enhancers and suppressors - output_files <- list( - "ZScores_Interaction.csv" = interaction_scores_deletion, - "ZScores_Interaction_DeletionEnhancers_L.csv" = filter(interaction_scores_deletion, Avg_Zscore_L >= 2), - "ZScores_Interaction_DeletionEnhancers_K.csv" = filter(interaction_scores_deletion, Avg_Zscore_K <= -2), - "ZScores_Interaction_DeletionSuppressors_L.csv" = filter(interaction_scores_deletion, Avg_Zscore_L <= -2), - "ZScores_Interaction_DeletionSuppressors_K.csv" = filter(interaction_scores_deletion, Avg_Zscore_K >= 2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L.csv" = filter(interaction_scores_deletion, Avg_Zscore_L >= 2 | Avg_Zscore_L <= -2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv" = filter(interaction_scores_deletion, Avg_Zscore_K >= 2 | Avg_Zscore_K <= -2), - "ZScores_Interaction_Suppressors_and_lm_Enhancers_L.csv" = filter(interaction_scores_deletion, Z_lm_L >= 2 & Avg_Zscore_L <= -2), - "ZScores_Interaction_Enhancers_and_lm_Suppressors_L.csv" = filter(interaction_scores_deletion, Z_lm_L <= -2 & Avg_Zscore_L >= 2), - "ZScores_Interaction_Suppressors_and_lm_Enhancers_K.csv" = filter(interaction_scores_deletion, Z_lm_K <= -2 & Avg_Zscore_K >= 2), - "ZScores_Interaction_Enhancers_and_lm_Suppressors_K.csv" = filter(interaction_scores_deletion, Z_lm_K >= 2 & Avg_Zscore_K <= -2) - ) - - for (file_name in names(output_files)) { - write.csv(output_files[[file_name]], file = file.path(output_dir, file_name), row.names = FALSE) - } - - # Further filtering for linear regression enhancers and suppressors - output_files_lm <- list( - "ZScores_Interaction_DeletionEnhancers_L_lm.csv" = filter(interaction_scores_deletion, Z_lm_L >= 2), - "ZScores_Interaction_DeletionEnhancers_K_lm.csv" = filter(interaction_scores_deletion, Z_lm_K <= -2), - "ZScores_Interaction_DeletionSuppressors_L_lm.csv" = filter(interaction_scores_deletion, Z_lm_L <= -2), - "ZScores_Interaction_DeletionSuppressors_K_lm.csv" = filter(interaction_scores_deletion, Z_lm_K >= 2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L_lm.csv" = filter(interaction_scores_deletion, Z_lm_L >= 2 | Z_lm_L <= -2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K_lm.csv" = filter(interaction_scores_deletion, Z_lm_K >= 2 | Z_lm_K <= -2) - ) - - for (file_name in names(output_files_lm)) { - write.csv(output_files_lm[[file_name]], file = file.path(output_dir, file_name), row.names = FALSE) - } - - - # Loop through each gene to generate plots - for (i in 1:num_genes) { - gene_sel <- unique(interaction_scores_deletion$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores_deletion %>% filter(OrfRep == gene_sel) - - p_l[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_L)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_l), ymax = 0 + (2 * WT_sd_l)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_L, 2))) + - annotate("text", x = 1, y = 25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_L, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-60, 60, 10)) + - theme_Publication() - - p_K[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_K)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_K), ymax = 0 + (2 * WT_sd_K)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_K, 2))) + - annotate("text", x = 1, y = 25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_K, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-60, 60, 10)) + - theme_Publication() - - p_r[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_r)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-0.65, 0.65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_r), ymax = 0 + (2 * WT_sd_r)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 0.45, label = paste("ZShift =", round(df_int_scores$Z_Shift_r, 2))) + - annotate("text", x = 1, y = 0.25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_r, 2))) + - annotate("text", x = 1, y = -0.25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -0.35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -0.45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-0.6, 0.6, 0.2)) + - theme_Publication() - - p_AUC[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_AUC)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-6500, 6500)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_AUC), ymax = 0 + (2 * WT_sd_AUC)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 4500, label = paste("ZShift =", round(df_int_scores$Z_Shift_AUC, 2))) + - annotate("text", x = 1, y = 2500, label = paste("Z lm Score =", round(df_int_scores$Z_lm_AUC, 2))) + - annotate("text", x = 1, y = -2500, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -3500, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -4500, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-6000, 6000, 1000)) + - theme_Publication() - - if (i == 1) { - df_stats_interaction_all_final <- df_z_calculations - } else { - df_stats_interaction_all_final <- bind_rows(df_stats_interaction_all_final, df_z_calculations) - } - } - - print("Pass Int ggplot loop") - write.csv(df_stats_interaction_all_final, file = file.path(output_dir, "ZScore_Calculations.csv"), row.names = FALSE) - - # Generate a blank plot for alignment purposes - blank_plot <- ggplot(df2_rf) + geom_blank() - - # Create PDF for interaction plots - pdf(file.path(output_dir, "InteractionPlots.pdf"), width = 16, height = 16, onefile = TRUE) - - # Summarize stats for X2_RF - df_stats_rf <- df2_rf %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Create L statistics scatter plot - plot_l_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, L)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 160)) + - annotate("text", x = -0.25, y = 10, label = "NG") + - annotate("text", x = -0.25, y = 5, label = "DB") + - annotate("text", x = -0.25, y = 0, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 0, label = df_stats_rf$SM) + - theme_Publication() - - # Create K statistics scatter plot - plot_k_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, K)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(-20, 160)) + - annotate("text", x = -0.25, y = -5, label = "NG") + - annotate("text", x = -0.25, y = -12.5, label = "DB") + - annotate("text", x = -0.25, y = -20, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -5, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -12.5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -20, label = df_stats_rf$SM) + - theme_Publication() - - # Create r statistics scatter plot - plot_r_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, r)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - annotate("text", x = -0.25, y = .9, label = "NG") + - annotate("text", x = -0.25, y = .8, label = "DB") + - annotate("text", x = -0.25, y = .7, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .9, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .8, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .7, label = df_stats_rf$SM) + - theme_Publication() - - # Create AUC statistics scatter plot - plot_auc_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, AUC)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - annotate("text", x = -0.25, y = 11000, label = "NG") + - annotate("text", x = -0.25, y = 10000, label = "DB") + - annotate("text", x = -0.25, y = 9000, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 11000, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10000, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 9000, label = df_stats_rf$SM) + - theme_Publication() - - # Arrange and plot scatter plots - grid.arrange(plot_l_stats, plot_k_stats, plot_r_stats, plot_auc_stats, ncol = 2, nrow = 2) - - # Create box plots for each statistic - plot_l_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), L)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 160)) + - theme_Publication() - - plot_k_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), K)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 130)) + - theme_Publication() - - plot_r_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), r)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - theme_Publication() - - plot_auc_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), AUC)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - theme_Publication() - - # Arrange and plot box plots - grid.arrange(plot_l_stats_box, plot_k_stats_box, plot_r_stats_box, plot_auc_stats_box, ncol = 2, nrow = 2) - - # Loop to arrange and print combined plots - plot_indices <- seq(1, (num_genes - 1), by = 3) - for (m in seq_along(plot_indices)) { - grid.arrange( - p_l[[plot_indices[m]]], p_K[[plot_indices[m]]], p_r[[plot_indices[m]]], p_AUC[[plot_indices[m]]], - p_l[[plot_indices[m] + 1]], p_K[[plot_indices[m] + 1]], p_r[[plot_indices[m] + 1]], p_AUC[[plot_indices[m] + 1]], - p_l[[plot_indices[m] + 2]], p_K[[plot_indices[m] + 2]], p_r[[plot_indices[m] + 2]], p_AUC[[plot_indices[m] + 2]], - ncol = 4, nrow = 3 - ) - } - - # Handle leftover plots if num_genes is not a multiple of 3 - remaining_plots <- num_genes - max(plot_indices + 2) - if (remaining_plots > 0) { - plot_grid_list <- lapply(seq_len(remaining_plots), function(i) { - list(p_l[[plot_indices[length(plot_indices)] + i]], p_K[[plot_indices[length(plot_indices)] + i]], p_r[[plot_indices[length(plot_indices)] + i]], p_AUC[[plot_indices[length(plot_indices)] + i]]) - }) - do.call(grid.arrange, c(plot_grid_list, list(ncol = 4, nrow = 3))) - } - - dev.off() - - # Additional PDF output for RF interaction plots - # Generate PDF for RF interaction plots - pdf(file.path(output_dir, "RF_InteractionPlots.pdf"), width = 16, height = 16, onefile = TRUE) - - # Summarize stats for RF data - df_stats_rf <- df2_rf %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Create L statistics scatter plot for RF data - plot_rf_l_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, L)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 130)) + - annotate("text", x = -0.25, y = 10, label = "NG") + - annotate("text", x = -0.25, y = 5, label = "DB") + - annotate("text", x = -0.25, y = 0, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 0, label = df_stats_rf$SM) + - theme_Publication() - - # Create K statistics scatter plot for RF data - plot_rf_k_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, K)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(-20, 160)) + - annotate("text", x = -0.25, y = -5, label = "NG") + - annotate("text", x = -0.25, y = -12.5, label = "DB") + - annotate("text", x = -0.25, y = -20, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -5, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -12.5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -20, label = df_stats_rf$SM) + - theme_Publication() - - # Create r statistics scatter plot for RF data - plot_rf_r_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, r)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - annotate("text", x = -0.25, y = .9, label = "NG") + - annotate("text", x = -0.25, y = .8, label = "DB") + - annotate("text", x = -0.25, y = .7, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .9, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .8, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .7, label = df_stats_rf$SM) + - theme_Publication() - - # Create AUC statistics scatter plot for RF data - plot_rf_auc_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, AUC)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - annotate("text", x = -0.25, y = 11000, label = "NG") + - annotate("text", x = -0.25, y = 10000, label = "DB") + - annotate("text", x = -0.25, y = 9000, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 11000, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10000, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 9000, label = df_stats_rf$SM) + - theme_Publication() - - # Arrange and plot RF scatter plots - grid.arrange(plot_rf_l_stats, plot_rf_k_stats, plot_rf_r_stats, plot_rf_auc_stats, ncol = 2, nrow = 2) - - # Create box plots for each RF statistic - plot_rf_l_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), L)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 130)) + - theme_Publication() - - plot_rf_k_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), K)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 160)) + - theme_Publication() - - plot_rf_r_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), r)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - theme_Publication() - - plot_rf_auc_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), AUC)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - theme_Publication() - - # Arrange and plot RF box plots - grid.arrange(plot_rf_l_stats_box, plot_rf_k_stats_box, plot_rf_r_stats_box, plot_rf_auc_stats_box, ncol = 2, nrow = 2) - - # Loop to arrange and print combined RF plots - plot_indices_rf <- seq(1, (num_genes_RF - 1), by = 3) - for (m in seq_along(plot_indices_rf)) { - grid.arrange( - p_rf_l[[plot_indices_rf[m]]], p_rf_K[[plot_indices_rf[m]]], p_rf_r[[plot_indices_rf[m]]], p_rf_AUC[[plot_indices_rf[m]]], - p_rf_l[[plot_indices_rf[m] + 1]], p_rf_K[[plot_indices_rf[m] + 1]], p_rf_r[[plot_indices_rf[m] + 1]], p_rf_AUC[[plot_indices_rf[m] + 1]], - p_rf_l[[plot_indices_rf[m] + 2]], p_rf_K[[plot_indices_rf[m] + 2]], p_rf_r[[plot_indices_rf[m] + 2]], p_rf_AUC[[plot_indices_rf[m] + 2]], - ncol = 4, nrow = 3 - ) - } - - # Handle leftover RF plots if num_genes_RF is not a multiple of 3 - remaining_rf_plots <- num_genes_RF - max(plot_indices_rf + 2) - if (remaining_rf_plots > 0) { - plot_grid_rf_list <- lapply(seq_len(remaining_rf_plots), function(i) { - list(p_rf_l[[plot_indices_rf[length(plot_indices_rf)] + i]], p_rf_K[[plot_indices_rf[length(plot_indices_rf)] + i]], p_rf_r[[plot_indices_rf[length(plot_indices_rf)] + i]], p_rf_AUC[[plot_indices_rf[length(plot_indices_rf)] + i]]) - }) - do.call(grid.arrange, c(plot_grid_rf_list, list(ncol = 4, nrow = 3))) - } - - dev.off() - -} - - -# Calculate linear models and R-squared values for all CPPs in results 1 vs results 2 -lm_list <- list( - lm(Z_lm_K ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_r ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_r ~ Z_lm_K, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_K, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_r, data = df_na_rm) -) - -lm_summaries <- lapply(lm_list, summary) - -# Create PDF for correlation plots of CPPs -pdf(file.path(output_dir, "Correlation_CPPs.pdf"), width = 10, height = 7, onefile = TRUE) - -# Generate correlation plots for each combination -plot_list <- list( - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_K)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction L vs. Interaction K") + - xlab("z-score L") + ylab("z-score K") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[1]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction L vs. Interaction r") + - xlab("z-score L") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[2]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction L vs. Interaction AUC") + - xlab("z-score L") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[3]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction K vs. Interaction r") + - xlab("z-score K") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[4]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction K vs. Interaction AUC") + - xlab("z-score K") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[5]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_r, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction r vs. Interaction AUC") + - xlab("z-score r") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[6]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)) -) - -# Print all correlation plots to the PDF -lapply(plot_list, print) - -# Create additional plots with InteractionScores_RF highlighted in cyan -interaction_scores_rf_filtered <- interaction_scores_rf[!is.na(interaction_scores_rf$Z_lm_L), ] - -highlighted_plot_list <- list( - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_K)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_L, Z_lm_K), color = "cyan") + - ggtitle("Interaction L vs. Interaction K") + - xlab("z-score L") + ylab("z-score K") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[1]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_L, Z_lm_r), color = "cyan") + - ggtitle("Interaction L vs. Interaction r") + - xlab("z-score L") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[2]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_L, Z_lm_AUC), color = "cyan") + - ggtitle("Interaction L vs. Interaction AUC") + - xlab("z-score L") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[3]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_K, Z_lm_r), color = "cyan") + - ggtitle("Interaction K vs. Interaction r") + - xlab("z-score K") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[4]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_K, Z_lm_AUC), color = "cyan") + - ggtitle("Interaction K vs. Interaction AUC") + - xlab("z-score K") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[5]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_r, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_r, Z_lm_AUC), color = "cyan") + - ggtitle("Interaction r vs. Interaction AUC") + - xlab("z-score r") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[6]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)) -) - -# Print all highlighted plots to the PDF -lapply(highlighted_plot_list, print) - -dev.off() diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores3.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores3.R deleted file mode 100644 index ac117ca5..00000000 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores3.R +++ /dev/null @@ -1,1219 +0,0 @@ -suppressMessages({ - library(ggplot2) - library(plotly) - library(htmlwidgets) - library(dplyr) - library(ggthemes) - library(data.table) -}) - -options(warn = 2, max.print = 1000) - -# Constants for configuration -PLOT_WIDTH <- 14 -PLOT_HEIGHT <- 9 -BASE_SIZE <- 14 - -parse_arguments <- function() { - args <- if (interactive()) { - c( - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", - 3, - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", - "Experiment 1: Doxo versus HLD", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", - "Experiment 2: HLD versus Doxo" - ) - } else { - commandArgs(trailingOnly = TRUE) - } - - paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE) - names <- args[seq(6, length(args), by = 2)] - experiments <- setNames(paths, names) - - list( - out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), - sd = as.numeric(args[2]), - sgd_gene_list = normalizePath(file.path(args[3]), mustWork = FALSE), - easy_results_file = normalizePath(file.path(args[4]), mustWork = FALSE), - experiments = experiments - ) -} - -args <- parse_arguments() - -dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) -dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) - -# Define themes and scales -theme_publication <- function(base_size = BASE_SIZE, base_family = "sans", legend_position = "bottom") { - theme_foundation(base_size = base_size, base_family = base_family) + - theme( - plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), - text = element_text(), - panel.background = element_rect(colour = NA), - plot.background = element_rect(colour = NA), - panel.border = element_rect(colour = NA), - axis.title = element_text(face = "bold", size = rel(1)), - axis.title.y = element_text(angle = 90, vjust = 2), - axis.title.x = element_text(vjust = -0.2), - axis.line = element_line(colour = "black"), - panel.grid.major = element_line(colour = "#f0f0f0"), - panel.grid.minor = element_blank(), - legend.key = element_rect(colour = NA), - legend.position = legend_position, - legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"), - plot.margin = unit(c(10, 5, 5, 5), "mm"), - strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), - strip.text = element_text(face = "bold") - ) -} - -scale_fill_publication <- function(...) { - discrete_scale("fill", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -scale_colour_publication <- function(...) { - discrete_scale("colour", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -# Load SGD gene list -sgd_genes <- function(sgd_gene_list) { - read.delim(file = sgd_gene_list, quote = "", header = FALSE, - colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% - dplyr::rename(ORF = V4, GeneName = V5) -} - -genes <- sgd_genes(args$sgd_gene_list) - -# Load the initial dataframe from the easy_results_file -load_and_preprocess_data <- function(easy_results_file, std) { - df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - - # Clean and convert columns to numeric where appropriate - df <- df %>% - filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% - mutate( - Col = as.numeric(Col), - Row = as.numeric(Row), - L = as.numeric(l), - K = as.numeric(K), - r = as.numeric(r), - Scan = as.numeric(Scan), - AUC = as.numeric(AUC96), - LstBackgrd = as.numeric(LstBackgrd), - X1stBackgrd = as.numeric(X1stBackgrd), - OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep), - Conc_Num = as.numeric(gsub("[^0-9\\.]", "", Conc)), - Conc_Num_Factor = as.numeric(as.factor(Conc_Num)) - 1 - ) - - # Define max concentration factor - max_conc <- max(df$Conc_Num_Factor, na.rm = TRUE) - - return(list(df = df, max_conc = max_conc)) -} - -# Function to update Gene names using the SGD gene list -update_gene_names <- function(df, sgd_gene_list) { - genes <- read.delim(file = sgd_gene_list, quote = "", header = FALSE, colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) - gene_map <- setNames(genes$V5, genes$V4) - - df <- df %>% - rowwise() %>% - mutate(Gene = ifelse(OrfRep != "YDL227C", gene_map[[ORF]], Gene)) %>% - ungroup() %>% - mutate(Gene = ifelse(Gene == "" | Gene == "OCT1", OrfRep, Gene)) - - return(df) -} - -# Apply the functions -result <- load_and_preprocess_data(args$easy_results_file, args$sd) -df <- update_gene_names(result$df, args$sgd_gene_list) -max_conc <- result$max_conc - -# Function to perform initial calculations and QC before filtering -perform_initial_calculations <- function(df, output_dir_qc, delta_bg_sd_factor) { - - # Add Delta_Backgrd column and calculate Delta_Background_Tolerance - df <- df %>% mutate(Delta_Backgrd = LstBackgrd - X1stBackgrd) - Delta_Background_Tolerance <- mean(df$Delta_Backgrd, na.rm = TRUE) + - (delta_bg_sd_factor * sd(df$Delta_Backgrd, na.rm = TRUE)) - print(paste("Delta_Background_Tolerance is", Delta_Background_Tolerance)) - - # Save plots to PDF and HTML - plot_and_save <- function(plot_obj, filename) { - pdf_path <- file.path(output_dir_qc, paste0(filename, ".pdf")) - html_path <- file.path(output_dir_qc, paste0(filename, ".html")) - pdf(pdf_path, width = 12, height = 8) - print(plot_obj) - dev.off() - pgg <- ggplotly(plot_obj) - saveWidget(pgg, html_path, selfcontained = TRUE) - } - - # Generate and save plate analysis plots - generate_plate_analysis_plots <- function(variable, delta_bg = FALSE) { - plot_name <- if (delta_bg) { - "Plate analysis by Drug Conc for Delta_Backgrd before quality control" - } else { - paste("Plate analysis by Drug Conc for", variable, "before quality control") - } - var <- if (delta_bg) "Delta_Backgrd" else variable - - p1 <- ggplot(df, aes(Scan, .data[[var]], color = as.factor(Conc_Num))) + - geom_point(shape = 3, size = 0.2, position = "jitter") + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) + - ggtitle(plot_name) + - theme_publication() - - p2 <- ggplot(df, aes(as.factor(Scan), .data[[var]], color = as.factor(Conc_Num))) + - geom_boxplot() + - ggtitle(plot_name) + - theme_publication() - - plot_and_save(p1, paste0("Plate_Analysis_", var)) - plot_and_save(p2, paste0("Plate_Analysis_", var, "_Box")) - } - - variables <- c("L", "K", "r", "AUC") - lapply(variables, generate_plate_analysis_plots) - generate_plate_analysis_plots("Delta_Backgrd", delta_bg = TRUE) - - # Raw L vs K plot before QC - Raw_l_vs_K_beforeQC <- ggplot(df, aes(l, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle("Raw L vs K before QC") + - theme_publication_legend_right() - - plot_and_save(Raw_l_vs_K_beforeQC, "Raw_L_vs_K_beforeQC") - - # Subset and plot data above Delta_Background_Tolerance - df_above_tolerance <- df %>% filter(Delta_Backgrd >= Delta_Background_Tolerance) - df_above_tolerance_K_halfmedian <- median(df_above_tolerance$K, na.rm = TRUE) / 2 - df_above_tolerance_L_halfmedian <- median(df_above_tolerance$l, na.rm = TRUE) / 2 - df_above_tolerance_toRemove <- nrow(df_above_tolerance) - - df_Delta_Backgrd_above_Tolerance_L_vs_K <- ggplot(df_above_tolerance, aes(l, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle(paste("Raw L vs K for strains above delta background threshold of", Delta_Background_Tolerance, "or above")) + - annotate("text", x = df_above_tolerance_L_halfmedian, y = df_above_tolerance_K_halfmedian, - label = paste("Strains above delta background tolerance = ", df_above_tolerance_toRemove)) + - theme_publication_legend_right() - - plot_and_save(df_Delta_Backgrd_above_Tolerance_L_vs_K, "Raw_L_vs_K_for_strains_above_deltabackgrd_threshold") - - # Frequency and Bar plots for Delta Background - plot_types <- list( - "Density" = geom_density(), - "Bar" = geom_bar() - ) - - for (type in names(plot_types)) { - plot <- ggplot(df, aes(Delta_Backgrd, color = as.factor(Conc_Num))) + - plot_types[[type]] + - ggtitle(paste(type, "plot for Delta Background by Conc All Data")) + - theme_publication_legend_right() - - plot_and_save(plot, paste(type, "Delta_Background", sep = "_")) - } - - return(df) -} - -# Calling the function -df <- perform_initial_calculations(df, args$out_dir, args$sd) - -# Identify missing data (No Growth) and replace values where Delta_Backgrd exceeds tolerance -df <- df %>% - mutate( - NG = ifelse(L == 0 & !is.na(l), 1, 0), - DB = ifelse(Delta_Backgrd >= Delta_Background_Tolerance, 1, 0), - L = ifelse(DB == 1, NA, L), - r = ifelse(DB == 1, NA, r), - AUC = ifelse(DB == 1, NA, AUC), - K = ifelse(DB == 1, NA, K) - ) - -# Function to create and save plots -create_plot <- function(df, var, plot_type) { - if (plot_type == "scatter") { - plot <- ggplot(df, aes(Scan, !!sym(var), color = as.factor(Conc_Num))) + - geom_point(shape = 3, size = 0.2) + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - } else if (plot_type == "box") { - plot <- ggplot(df, aes(as.factor(Scan), !!sym(var), color = as.factor(Conc_Num))) + - geom_boxplot() + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - } - plot -} - -# Function to generate and save both scatter and box plots -generate_and_save_qc_plots <- function(df, variables, output_dir_qc) { - lapply(variables, function(var) { - scatter_plot <- create_plot(df, var, "scatter") - box_plot <- create_plot(df, var, "box") - - plot_and_save(scatter_plot, paste0("Plate_Analysis_", var, "_afterQC"), output_dir_qc) - plot_and_save(box_plot, paste0("Plate_Analysis_", var, "_Box_afterQC"), output_dir_qc) - }) -} - -# Define the variables to generate QC plots for -variables <- c("L", "K", "r", "AUC", "Delta_Backgrd") - -# Generate and save QC plots for all data -generate_and_save_qc_plots(df, variables, args$out_dir) - -# Generate and save QC plots for data without zero values for L -df_noZero <- df %>% filter(L > 0) -generate_and_save_qc_plots(df_noZero, variables, args$out_dir) - -# Print plate analysis data before and after QC to PDFs -pdf(file = file.path(args$out_dir, "Plate_Analysis.pdf"), width = 14, height = 9) -print(Plate_Analysis_L) -print(Plate_Analysis_L_afterQC) -print(Plate_Analysis_K) -print(Plate_Analysis_K_afterQC) -print(Plate_Analysis_r) -print(Plate_Analysis_r_afterQC) -print(Plate_Analysis_AUC) -print(Plate_Analysis_AUC_afterQC) -print(Plate_Analysis_Delta_Backgrd) -print(Plate_Analysis_Delta_Backgrd_afterQC) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_Boxplots.pdf"), width = 18, height = 9) -print(Plate_Analysis_L_Box) -print(Plate_Analysis_L_Box_afterQC) -print(Plate_Analysis_K_Box) -print(Plate_Analysis_K_Box_afterQC) -print(Plate_Analysis_r_Box) -print(Plate_Analysis_r_Box_afterQC) -print(Plate_Analysis_AUC_Box) -print(Plate_Analysis_AUC_Box_afterQC) -print(Plate_Analysis_Delta_Backgrd_Box) -print(Plate_Analysis_Delta_Backgrd_Box_afterQC) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_noZeros.pdf"), width = 14, height = 9) -print(Plate_Analysis_L_afterQC_Z) -print(Plate_Analysis_K_afterQC_Z) -print(Plate_Analysis_r_afterQC_Z) -print(Plate_Analysis_AUC_afterQC_Z) -print(Plate_Analysis_Delta_Backgrd_afterQC_Z) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_noZeros_Boxplots.pdf"), width = 18, height = 9) -print(Plate_Analysis_L_Box_afterQC_Z) -print(Plate_Analysis_K_Box_afterQC_Z) -print(Plate_Analysis_r_Box_afterQC_Z) -print(Plate_Analysis_AUC_Box_afterQC_Z) -print(Plate_Analysis_Delta_Backgrd_Box_afterQC_Z) -dev.off() - -# Clean up -rm(df_noZero) - -# Calculate and save summary statistics for all strains for a given variable -calculate_summary_stats <- function(df, variable) { - df %>% - summarise( - N = n(), - mean_val = mean(.data[[variable]], na.rm = TRUE), - median_val = median(.data[[variable]], na.rm = TRUE), - max_val = max(.data[[variable]], na.rm = TRUE), - min_val = min(.data[[variable]], na.rm = TRUE), - sd_val = sd(.data[[variable]], na.rm = TRUE), - se_val = sd_val / sqrt(N - 1) - ) -} - -# Function to calculate and save summary statistics for all variables -generate_summary_stats <- function(df, variables, output_dir) { - summary_stats <- df %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise(across(all_of(variables), ~ list(calculate_summary_stats(cur_data(), .)), .names = "{.col}_stats")) %>% - unnest_wider(c(L_stats, K_stats, r_stats, AUC_stats)) - - write.csv(summary_stats, file = file.path(output_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) -} - -# Define the variables to calculate statistics for -variables <- c("L", "K", "r", "AUC") -generate_summary_stats(df, variables, args$out_dir) - -# Background Strains -background_strains <- c("YDL227C") - -# Function to calculate and return summary statistics for a background strain -calculate_background_stats <- function(df, strain, variables) { - df_background <- df %>% - filter(OrfRep == strain) %>% - mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .))) %>% - filter(!is.na(L)) - - # Calculate summary statistics for each variable and combine them - df_stats_combined <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - across(all_of(variables), list( - mean = ~ mean(.x, na.rm = TRUE), - median = ~ median(.x, na.rm = TRUE), - max = ~ max(.x, na.rm = TRUE), - min = ~ min(.x, na.rm = TRUE), - sd = ~ sd(.x, na.rm = TRUE), - se = ~ sd(.x, na.rm = TRUE) / sqrt(n() - 1) - ), .names = "{.col}_{.fn}") - ) - - return(df_stats_combined) -} - -# Initialize an empty dataframe to collect results for all background strains -df_all_background_stats <- tibble() - -# Calculate and accumulate statistics for each background strain -for (strain in background_strains) { - df_stats_combined <- calculate_background_stats(df, strain, variables) - df_all_background_stats <- bind_rows(df_all_background_stats, df_stats_combined) -} - -# Save the combined summary statistics for all background strains -write.csv(df_all_background_stats, file = file.path(args$out_dir, "SummaryStats_BackgroundStrains.csv"), row.names = FALSE) - -# Function to filter data within and outside 2SD of a given variable -filter_and_save_within_2SD <- function(df, df_stats_by_var, variable, output_dir) { - df_within_2SD <- filter_within_2SD(df, df_stats_by_var, variable) - df_outside_2SD <- anti_join(df, df_within_2SD, by = c("OrfRep", "Conc_Num", "Conc_Num_Factor")) - - df_stats_by_L_within_2SD <- calculate_summary_stats(df_within_2SD, "L") - df_stats_by_L_outside_2SD <- calculate_summary_stats(df_outside_2SD, "L") - - write.csv(df_stats_by_L_within_2SD, file = file.path(output_dir, paste0("Max_Observed_L_Vals_for_spots_within_2SD_", variable, ".csv")), row.names = FALSE) - - return(list(within_2SD = df_within_2SD, outside_2SD = df_outside_2SD)) -} - -calculate_interaction_scores <- function(df_stats_interaction, background_means, df_stats_by_var) { - if (is.na(df_stats_interaction$mean_L[1]) || df_stats_interaction$mean_L[1] == 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(across(c(Raw_Shift_L, Z_Shift_L, Raw_Shift_K, Z_Shift_K, - Raw_Shift_r, Z_Shift_r, Raw_Shift_AUC, Z_Shift_AUC), ~ 0)) - } else { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = mean_L - background_means$L, - Z_Shift_L = Raw_Shift_L / df_stats_by_var$sd[1], - Raw_Shift_K = mean_K - background_means$K, - Z_Shift_K = Raw_Shift_K / df_stats_by_var$sd[1], - Raw_Shift_r = mean_r - background_means$r, - Z_Shift_r = Raw_Shift_r / df_stats_by_var$sd[1], - Raw_Shift_AUC = mean_AUC - background_means$AUC, - Z_Shift_AUC = Raw_Shift_AUC / df_stats_by_var$sd[1] - ) - } - - df_stats_interaction <- df_stats_interaction %>% - mutate( - WT_l = df_stats_by_var$mean, WT_sd_l = df_stats_by_var$sd, - WT_K = df_stats_by_var$mean, WT_sd_K = df_stats_by_var$sd, - WT_r = df_stats_by_var$mean, WT_sd_r = df_stats_by_var$sd, - WT_AUC = df_stats_by_var$mean, WT_sd_AUC = df_stats_by_var$sd - ) - - df_stats_interaction -} - -update_interaction_scores <- function(df_stats_interaction, background_means, interaction_scores_RF, gene_sel, num_non_removed_conc, max_conc) { - df_stats_interaction <- calculate_interaction_scores(df_stats_interaction, background_means, df_stats_by_var) - - if (df_stats_interaction$mean_L[1] != 0 && !is.na(df_stats_interaction$mean_L[1])) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Exp_L = WT_l + Raw_Shift_L, - Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, - Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, - Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, - Delta_AUC = mean_AUC - Exp_AUC, - Zscore_L = Delta_L / WT_sd_l, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - # Handle cases for NG and SM - df_stats_interaction <- handle_special_cases(df_stats_interaction) - - # Linear models - lm_scores <- calculate_lm_scores(df_stats_interaction, max_conc) - - # Update the interaction scores dataframe - interaction_scores_RF <- update_scores_dataframe(interaction_scores_RF, df_stats_interaction, lm_scores, gene_sel, num_non_removed_conc) - } else { - interaction_scores_RF <- handle_na_scores(interaction_scores_RF, gene_sel) - } - - interaction_scores_RF -} - -handle_special_cases <- function(df_stats_interaction) { - if (sum(df_stats_interaction$NG, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(across(c(Delta_L, Delta_K, Delta_r, Delta_AUC), ~ ifelse(NG == 1, mean_L - WT_l, .))) - } - - if (sum(df_stats_interaction$SM, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(Delta_L = ifelse(SM == 1, mean_L - WT_l, Delta_L)) - } - - df_stats_interaction -} - -calculate_lm_scores <- function(df_stats_interaction, max_conc) { - gene_lm_L <- lm(Delta_L ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_K <- lm(Delta_K ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_r <- lm(Delta_r ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_AUC <- lm(Delta_AUC ~ Conc_Num_Factor, data = df_stats_interaction) - - list( - lm_L = max_conc * coef(gene_lm_L)[2] + coef(gene_lm_L)[1], - r_squared_L = summary(gene_lm_L)$r.squared, - lm_K = max_conc * coef(gene_lm_K)[2] + coef(gene_lm_K)[1], - r_squared_K = summary(gene_lm_K)$r.squared, - lm_r = max_conc * coef(gene_lm_r)[2] + coef(gene_lm_r)[1], - r_squared_r = summary(gene_lm_r)$r.squared, - lm_AUC = max_conc * coef(gene_lm_AUC)[2] + coef(gene_lm_AUC)[1], - r_squared_AUC = summary(gene_lm_AUC)$r.squared - ) -} - -update_scores_dataframe <- function(interaction_scores_RF, df_stats_interaction, lm_scores, gene_sel, num_non_removed_conc) { - interaction_scores_RF %>% - mutate( - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_L[1]), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, df_stats_interaction$Z_Shift_L[1]), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, lm_scores$lm_L), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, lm_scores$r_squared_L), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE)), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE) / num_non_removed_conc), - # Repeat similar for K, r, AUC - NG = replace(NG, OrfRep == gene_sel, sum(df_stats_interaction$NG, na.rm = TRUE)), - DB = replace(DB, OrfRep == gene_sel, sum(df_stats_interaction$DB, na.rm = TRUE)), - SM = replace(SM, OrfRep == gene_sel, sum(df_stats_interaction$SM, na.rm = TRUE)) - ) -} - -handle_na_scores <- function(interaction_scores_RF, gene_sel) { - interaction_scores_RF %>% - mutate(across(starts_with("Raw_Shift"), ~ replace(., OrfRep == gene_sel, NA)), - across(starts_with("Z_Shift"), ~ replace(., OrfRep == gene_sel, NA)), - across(starts_with("lm_Score"), ~ replace(., OrfRep == gene_sel, NA)), - across(starts_with("R_Squared"), ~ replace(., OrfRep == gene_sel, NA)), - across(starts_with("Sum_Z_Score"), ~ replace(., OrfRep == gene_sel, NA)), - across(starts_with("Avg_Zscore"), ~ replace(., OrfRep == gene_sel, NA))) -} - -calculate_z_scores <- function(interaction_scores_RF) { - lm_sd <- sapply(interaction_scores_RF %>% select(starts_with("lm_Score")), sd, na.rm = TRUE) - lm_mean <- sapply(interaction_scores_RF %>% select(starts_with("lm_Score")), mean, na.rm = TRUE) - - interaction_scores_RF %>% - mutate( - Z_lm_L = (lm_Score_L - lm_mean["lm_Score_L"]) / lm_sd["lm_Score_L"], - Z_lm_K = (lm_Score_K - lm_mean["lm_Score_K"]) / lm_sd["lm_Score_K"], - Z_lm_r = (lm_Score_r - lm_mean["lm_Score_r"]) / lm_sd["lm_Score_r"], - Z_lm_AUC = (lm_Score_AUC - lm_mean["lm_Score_AUC"]) / lm_sd["lm_Score_AUC"] - ) %>% - arrange(desc(Z_lm_L), desc(NG)) -} - -plot_interaction_scores <- function(df_stats_interaction_all_RF, interaction_scores_RF, max_conc) { - plots <- list(L = list(), K = list(), r = list(), AUC = list()) - - for (i in seq_along(unique(interaction_scores_RF$OrfRep))) { - gene_sel <- unique(interaction_scores_RF$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all_RF %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores_RF %>% filter(OrfRep == gene_sel) - - plots$L[[i]] <- create_plot(df_z_calculations, df_int_scores, "Delta_L", "Z_Shift_L", "Z_lm_L", "WT_sd_l", "Conc_Num_Factor", "L", max_conc) - plots$K[[i]] <- create_plot(df_z_calculations, df_int_scores, "Delta_K", "Z_Shift_K", "Z_lm_K", "WT_sd_K", "Conc_Num_Factor", "K", max_conc) - plots$r[[i]] <- create_plot(df_z_calculations, df_int_scores, "Delta_r", "Z_Shift_r", "Z_lm_r", "WT_sd_r", "Conc_Num_Factor", "r", max_conc) - plots$AUC[[i]] <- create_plot(df_z_calculations, df_int_scores, "Delta_AUC", "Z_Shift_AUC", "Z_lm_AUC", "WT_sd_AUC", "Conc_Num_Factor", "AUC", max_conc) - } - - plots -} - -# For interaction plots -create_plot <- function(df_z_calculations, df_int_scores, delta_var, zshift_var, zlm_var, wt_sd_var, conc_var, label, max_conc) { - ggplot(df_z_calculations, aes_string(conc_var, delta_var)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * !!sym(wt_sd_var)), ymax = 0 + (2 * !!sym(wt_sd_var))), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste(zshift_var, "=", round(df_int_scores[[zshift_var]], 2))) + - annotate("text", x = 1, y = 25, label = paste(zlm_var, "=", round(df_int_scores[[zlm_var]], 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations[[conc_var]]), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-65, 65, by = 10)) + - theme_publication() -} - -# For RF plots -generate_plot <- function(df_z_calculations, df_int_scores, delta_var, zshift_var, zlm_var, wt_sd_var, label, ylim_range) { - ggplot(df_z_calculations, aes(Conc_Num_Factor, !!sym(delta_var))) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = ylim_range) + - geom_errorbar(aes(ymin = 0 - (2 * !!sym(wt_sd_var)), ymax = 0 + (2 * !!sym(wt_sd_var))), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = ylim_range[2] * 0.7, label = paste("ZShift =", round(df_int_scores[[zshift_var]], 2))) + - annotate("text", x = 1, y = ylim_range[2] * 0.5, label = paste("lm Zscore =", round(df_int_scores[[zlm_var]], 2))) + - annotate("text", x = 1, y = ylim_range[2] * -0.3, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = ylim_range[2] * -0.4, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = ylim_range[2] * -0.5, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(ylim_range[1], ylim_range[2], by = (ylim_range[2] - ylim_range[1]) / 10)) + - theme_publication() -} - -generate_all_plots <- function(df_z_calculations, df_int_scores) { - plots <- list() - plots$l <- generate_plot( - df_z_calculations, df_int_scores, - "Delta_L", "Z_Shift_L", "Z_lm_L", "WT_sd_l", - "L", c(-65, 65)) - plots$K <- generate_plot( - df_z_calculations, df_int_scores, - "Delta_K", "Z_Shift_K", "Z_lm_K", "WT_sd_K", - "K", c(-65, 65)) - plots$r <- generate_plot( - df_z_calculations, df_int_scores, - "Delta_r", "Z_Shift_r", "Z_lm_r", "WT_sd_r", - "r", c(-0.65, 0.65)) - plots$AUC <- generate_plot( - df_z_calculations, df_int_scores, - "Delta_AUC", "Z_Shift_AUC", "Z_lm_AUC", "WT_sd_AUC", - "AUC", c(-6500, 6500)) - plots -} - - -calculate_interaction_stats <- function(df_gene_sel, background_stats, df_stats_BY, max_conc, total_conc_nums) { - df_stats_interaction <- df_gene_sel %>% - group_by(OrfRep, Gene, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = length(L), - mean_L = mean(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) %>% - mutate( - Raw_Shift_L = ifelse(is.na(mean_L) || mean_L == 0, 0, mean_L - background_stats$L), - Raw_Shift_K = ifelse(is.na(mean_K) || mean_K == 0, 0, mean_K - background_stats$K), - Raw_Shift_r = ifelse(is.na(mean_r) || mean_r == 0, 0, mean_r - background_stats$r), - Raw_Shift_AUC = ifelse(is.na(mean_AUC) || mean_AUC == 0, 0, mean_AUC - background_stats$AUC), - Z_Shift_L = Raw_Shift_L / df_stats_BY$L$sd, - Z_Shift_K = Raw_Shift_K / df_stats_BY$K$sd, - Z_Shift_r = Raw_Shift_r / df_stats_BY$r$sd, - Z_Shift_AUC = Raw_Shift_AUC / df_stats_BY$AUC$sd, - WT_l = df_stats_BY$L$mean, - WT_K = df_stats_BY$K$mean, - WT_r = df_stats_BY$r$mean, - WT_AUC = df_stats_BY$AUC$mean, - WT_sd_l = df_stats_BY$L$sd, - WT_sd_K = df_stats_BY$K$sd, - WT_sd_r = df_stats_BY$r$sd, - WT_sd_AUC = df_stats_BY$AUC$sd - ) %>% - mutate( - Exp_L = WT_l + Raw_Shift_L, - Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, - Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, - Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, - Delta_AUC = mean_AUC - Exp_AUC, - Zscore_L = Delta_L / WT_sd_l, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - if (sum(df_stats_interaction$NG, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Delta_L = if_else(NG == 1, mean_L - WT_l, Delta_L), - Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC) - ) - } - - if (sum(df_stats_interaction$SM, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(Delta_L = if_else(SM == 1, mean_L - WT_l, Delta_L)) - } - - lm_L <- lm(Delta_L ~ Conc_Num_Factor, data = df_stats_interaction) - lm_K <- lm(Delta_K ~ Conc_Num_Factor, data = df_stats_interaction) - lm_r <- lm(Delta_r ~ Conc_Num_Factor, data = df_stats_interaction) - lm_AUC <- lm(Delta_AUC ~ Conc_Num_Factor, data = df_stats_interaction) - - df_stats_interaction <- df_stats_interaction %>% - mutate( - gene_interaction_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1], - gene_interaction_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1], - gene_interaction_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1], - gene_interaction_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], - r_squared_l = summary(lm_L)$r.squared, - r_squared_K = summary(lm_K)$r.squared, - r_squared_r = summary(lm_r)$r.squared, - r_squared_AUC = summary(lm_AUC)$r.squared - ) - - return(df_stats_interaction) -} - - -update_interaction_scores_deletion <- function(interaction_scores_deletion, df_stats_interaction, gene_sel, lm_means, lm_sds, total_conc_nums) { - num_non_removed_conc <- total_conc_nums - sum(df_stats_interaction$DB, na.rm = TRUE) - 1 - - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = replace(Gene, OrfRep == gene_sel, df_stats_interaction$Gene[1]), - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_L[1]), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, df_stats_interaction$Z_Shift_L[1]), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, df_stats_interaction$gene_interaction_L[1]), - Z_lm_L = replace(Z_lm_L, OrfRep == gene_sel, (df_stats_interaction$gene_interaction_L[1] - lm_means$L) / lm_sds$L), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, df_stats_interaction$r_squared_l[1]), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE)), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_K[1]), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, df_stats_interaction$Z_Shift_K[1]), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, df_stats_interaction$gene_interaction_K[1]), - Z_lm_K = replace(Z_lm_K, OrfRep == gene_sel, (df_stats_interaction$gene_interaction_K[1] - lm_means$K) / lm_sds$K), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, df_stats_interaction$r_squared_K[1]), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE)), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_r[1]), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, df_stats_interaction$Z_Shift_r[1]), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, df_stats_interaction$gene_interaction_r[1]), - Z_lm_r = replace(Z_lm_r, OrfRep == gene_sel, (df_stats_interaction$gene_interaction_r[1] - lm_means$r) / lm_sds$r), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, df_stats_interaction$r_squared_r[1]), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE)), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE) / (total_conc_nums - 1)), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_AUC[1]), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Z_Shift_AUC[1]), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, df_stats_interaction$gene_interaction_AUC[1]), - Z_lm_AUC = replace(Z_lm_AUC, OrfRep == gene_sel, (df_stats_interaction$gene_interaction_AUC[1] - lm_means$AUC) / lm_sds$AUC), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, df_stats_interaction$r_squared_AUC[1]), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE)), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE) / (total_conc_nums - 1)), - NG = replace(NG, OrfRep == gene_sel, sum(df_stats_interaction$NG, na.rm = TRUE)), - DB = replace(DB, OrfRep == gene_sel, sum(df_stats_interaction$DB, na.rm = TRUE)), - SM = replace(SM, OrfRep == gene_sel, sum(df_stats_interaction$SM, na.rm = TRUE)) - ) - - return(interaction_scores_deletion) -} - - -order_and_save_interaction_scores <- function(interaction_scores, output_dir) { - interaction_scores <- interaction_scores %>% - arrange(desc(Z_lm_L), desc(NG)) - - output_files <- list( - "ZScores_Interaction.csv" = interaction_scores, - "ZScores_Interaction_DeletionEnhancers_L.csv" = filter(interaction_scores, Avg_Zscore_L >= 2), - "ZScores_Interaction_DeletionEnhancers_K.csv" = filter(interaction_scores, Avg_Zscore_K <= -2), - "ZScores_Interaction_DeletionSuppressors_L.csv" = filter(interaction_scores, Avg_Zscore_L <= -2), - "ZScores_Interaction_DeletionSuppressors_K.csv" = filter(interaction_scores, Avg_Zscore_K >= 2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L.csv" = filter(interaction_scores, Avg_Zscore_L >= 2 | Avg_Zscore_L <= -2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv" = filter(interaction_scores, Avg_Zscore_K >= 2 | Avg_Zscore_K <= -2), - "ZScores_Interaction_Suppressors_and_lm_Enhancers_L.csv" = filter(interaction_scores, Z_lm_L >= 2 & Avg_Zscore_L <= -2), - "ZScores_Interaction_Enhancers_and_lm_Suppressors_L.csv" = filter(interaction_scores, Z_lm_L <= -2 & Avg_Zscore_L >= 2), - "ZScores_Interaction_Suppressors_and_lm_Enhancers_K.csv" = filter(interaction_scores, Z_lm_K <= -2 & Avg_Zscore_K >= 2), - "ZScores_Interaction_Enhancers_and_lm_Suppressors_K.csv" = filter(interaction_scores, Z_lm_K >= 2 & Avg_Zscore_K <= -2) - ) - - for (file_name in names(output_files)) { - write.csv(output_files[[file_name]], file = file.path(output_dir, file_name), row.names = FALSE) - } -} - - -filter_and_save_lm_interaction_scores <- function(interaction_scores, output_dir) { - output_files_lm <- list( - "ZScores_Interaction_DeletionEnhancers_L_lm.csv" = filter(interaction_scores, Z_lm_L >= 2), - "ZScores_Interaction_DeletionEnhancers_K_lm.csv" = filter(interaction_scores, Z_lm_K <= -2), - "ZScores_Interaction_DeletionSuppressors_L_lm.csv" = filter(interaction_scores, Z_lm_L <= -2), - "ZScores_Interaction_DeletionSuppressors_K_lm.csv" = filter(interaction_scores, Z_lm_K >= 2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L_lm.csv" = filter(interaction_scores, Z_lm_L >= 2 | Z_lm_L <= -2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K_lm.csv" = filter(interaction_scores, Z_lm_K >= 2 | Z_lm_K <= -2) - ) - - for (file_name in names(output_files_lm)) { - write.csv(output_files_lm[[file_name]], file = file.path(output_dir, file_name), row.names = FALSE) - } -} - - -generate_and_save_plots <- function(df_stats_interaction_all, interaction_scores, output_dir) { - plot_list <- list(p_l = list(), p_K = list(), p_r = list(), p_AUC = list()) - df_stats_interaction_all_final <- tibble() - - for (i in seq_len(length(unique(interaction_scores$OrfRep)))) { - gene_sel <- unique(interaction_scores$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores %>% filter(OrfRep == gene_sel) - - plot_list$p_l[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_L)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_l), ymax = 0 + (2 * WT_sd_l)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_L, 2))) + - annotate("text", x = 1, y = 25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_L, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-60, 60, 10)) + - theme_Publication() - - plot_list$p_K[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_K)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_K), ymax = 0 + (2 * WT_sd_K)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_K, 2))) + - annotate("text", x = 1, y = 25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_K, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-60, 60, 10)) + - theme_Publication() - - plot_list$p_r[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_r)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-0.65, 0.65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_r), ymax = 0 + (2 * WT_sd_r)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 0.45, label = paste("ZShift =", round(df_int_scores$Z_Shift_r, 2))) + - annotate("text", x = 1, y = 0.25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_r, 2))) + - annotate("text", x = 1, y = -0.25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -0.35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -0.45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-0.6, 0.6, 0.2)) + - theme_Publication() - - plot_list$p_AUC[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_AUC)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-6500, 6500)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_AUC), ymax = 0 + (2 * WT_sd_AUC)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 4500, label = paste("ZShift =", round(df_int_scores$Z_Shift_AUC, 2))) + - annotate("text", x = 1, y = 2500, label = paste("Z lm Score =", round(df_int_scores$Z_lm_AUC, 2))) + - annotate("text", x = 1, y = -2500, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -3500, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -4500, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-6000, 6000, 1000)) + - theme_Publication() - - df_stats_interaction_all_final <- if (i == 1) { - df_z_calculations - } else { - bind_rows(df_stats_interaction_all_final, df_z_calculations) - } - } - - print("Pass Int ggplot loop") - write.csv(df_stats_interaction_all_final, file = file.path(output_dir, "ZScore_Calculations.csv"), row.names = FALSE) -} - -# Function to create scatter plots for different statistics -create_scatter_plot <- function(df, stat, ylim_range, title_suffix) { - ggplot(df, aes(Conc_Num_Factor, .data[[stat]])) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df$Conc_Num_Factor), labels = as.character(unique(df$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for", title_suffix, "with SD")) + - coord_cartesian(ylim = ylim_range) + - annotate("text", x = -0.25, y = ylim_range[2] * 0.1, label = "NG") + - annotate("text", x = -0.25, y = ylim_range[2] * 0.05, label = "DB") + - annotate("text", x = -0.25, y = ylim_range[2] * 0, label = "SM") + - annotate("text", x = unique(df$Conc_Num_Factor), y = ylim_range[2] * 0.1, label = df_stats_rf$NG) + - annotate("text", x = unique(df$Conc_Num_Factor), y = ylim_range[2] * 0.05, label = df_stats_rf$DB) + - annotate("text", x = unique(df$Conc_Num_Factor), y = ylim_range[2] * 0, label = df_stats_rf$SM) + - theme_Publication() -} - -# Function to create box plots for different statistics -create_box_plot <- function(df, stat, ylim_range, title_suffix) { - ggplot(df, aes(as.factor(Conc_Num_Factor), .data[[stat]])) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df$Conc_Num_Factor), labels = as.character(unique(df$Conc_Num))) + - ggtitle(paste(s, "Boxplot RF for", title_suffix, "with SD")) + - coord_cartesian(ylim = ylim_range) + - theme_Publication() -} - - - -# Generate and save interaction plots for RF data -generate_rf_interaction_plots <- function(df_rf, output_dir, drug_name, plot_list_rf, num_genes_rf) { - # Summarize stats for RF data - df_stats_rf <- df_rf %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - mean_L = mean(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - create_rf_scatter_plot <- function(df_rf, df_stats_rf, y_var, y_lim, y_labels, title_suffix) { - ggplot(df_rf, aes(Conc_Num_Factor, .data[[y_var]])) + - geom_point(position = "jitter", size = 1) + - stat_summary(fun = mean, fun.min = ~ mean(.) - sd(.), fun.max = ~ mean(.) + sd(.), geom = "errorbar", color = "red") + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = drug_name, breaks = unique(df_rf$Conc_Num_Factor), labels = as.character(unique(df_rf$Conc_Num))) + - ggtitle(paste(s, title_suffix, sep = " ")) + - coord_cartesian(ylim = y_lim) + - annotate("text", x = -0.25, y = y_labels[1], label = "NG") + - annotate("text", x = -0.25, y = y_labels[2], label = "DB") + - annotate("text", x = -0.25, y = y_labels[3], label = "SM") + - annotate("text", x = unique(df_rf$Conc_Num_Factor), y = y_labels[1], label = df_stats_rf$NG) + - annotate("text", x = unique(df_rf$Conc_Num_Factor), y = y_labels[2], label = df_stats_rf$DB) + - annotate("text", x = unique(df_rf$Conc_Num_Factor), y = y_labels[3], label = df_stats_rf$SM) + - theme_Publication() - } - - # Create and save RF scatter plots - plot_rf_l_stats <- create_rf_scatter_plot(df_rf, df_stats_rf, "L", c(0, 130), c(10, 5, 0), "Scatter RF for L with SD") - plot_rf_k_stats <- create_rf_scatter_plot(df_rf, df_stats_rf, "K", c(-20, 160), c(-5, -12.5, -20), "Scatter RF for K with SD") - plot_rf_r_stats <- create_rf_scatter_plot(df_rf, df_stats_rf, "r", c(0, 1), c(0.9, 0.8, 0.7), "Scatter RF for r with SD") - plot_rf_auc_stats <- create_rf_scatter_plot(df_rf, df_stats_rf, "AUC", c(0, 12500), c(11000, 10000, 9000), "Scatter RF for AUC with SD") - - # Create and save RF box plots - create_rf_box_plot <- function(df_rf, y_var, y_lim, title_suffix) { - ggplot(df_rf, aes(as.factor(Conc_Num_Factor), .data[[y_var]])) + - geom_boxplot() + - scale_x_discrete(name = drug_name, breaks = unique(df_rf$Conc_Num_Factor), labels = as.character(unique(df_rf$Conc_Num))) + - ggtitle(paste(s, title_suffix, sep = " ")) + - coord_cartesian(ylim = y_lim) + - theme_Publication() - } - - plot_rf_l_stats_box <- create_rf_box_plot(df_rf, "L", c(0, 130), "Scatter RF for L with SD") - plot_rf_k_stats_box <- create_rf_box_plot(df_rf, "K", c(0, 160), "Scatter RF for K with SD") - plot_rf_r_stats_box <- create_rf_box_plot(df_rf, "r", c(0, 1), "Scatter RF for r with SD") - plot_rf_auc_stats_box <- create_rf_box_plot(df_rf, "AUC", c(0, 12500), "Scatter RF for AUC with SD") - - # Save RF scatter and box plots to PDF - pdf(file.path(output_dir, "RF_InteractionPlots.pdf"), width = 16, height = 16, onefile = TRUE) - grid.arrange(plot_rf_l_stats, plot_rf_k_stats, plot_rf_r_stats, plot_rf_auc_stats, ncol = 2, nrow = 2) - grid.arrange(plot_rf_l_stats_box, plot_rf_k_stats_box, plot_rf_r_stats_box, plot_rf_auc_stats_box, ncol = 2, nrow = 2) - - # Loop to arrange and print combined RF plots - plot_indices_rf <- seq(1, (num_genes_rf - 1), by = 3) - for (m in seq_along(plot_indices_rf)) { - grid.arrange( - plot_list_rf$l[[plot_indices_rf[m]]], plot_list_rf$K[[plot_indices_rf[m]]], plot_list_rf$r[[plot_indices_rf[m]]], plot_list_rf$AUC[[plot_indices_rf[m]]], - plot_list_rf$l[[plot_indices_rf[m] + 1]], plot_list_rf$K[[plot_indices_rf[m] + 1]], plot_list_rf$r[[plot_indices_rf[m] + 1]], plot_list_rf$AUC[[plot_indices_rf[m] + 1]], - plot_list_rf$l[[plot_indices_rf[m] + 2]], plot_list_rf$K[[plot_indices_rf[m] + 2]], plot_list_rf$r[[plot_indices_rf[m] + 2]], plot_list_rf$AUC[[plot_indices_rf[m] + 2]], - ncol = 4, nrow = 3 - ) - } - - # Handle leftover RF plots if num_genes_rf is not a multiple of 3 - remaining_rf_plots <- num_genes_rf - max(plot_indices_rf + 2) - if (remaining_rf_plots > 0) { - plot_grid_rf_list <- lapply(seq_len(remaining_rf_plots), function(i) { - list(plot_list_rf$l[[plot_indices_rf[length(plot_indices_rf)] + i]], plot_list_rf$K[[plot_indices_rf[length(plot_indices_rf)] + i]], plot_list_rf$r[[plot_indices_rf[length(plot_indices_rf)] + i]], plot_list_rf$AUC[[plot_indices_rf[length(plot_indices_rf)] + i]]) - }) - do.call(grid.arrange, c(plot_grid_rf_list, list(ncol = 4, nrow = 3))) - } - dev.off() -} - - - -# Apply the filtering and save for a specific strain -for (s in background_strains) { - background_means <- list( - L = df_stats_by_L %>% filter(Conc_Num_Factor == 0) %>% pull(mean_L), - K = df_stats_by_K %>% filter(Conc_Num_Factor == 0) %>% pull(mean_K), - r = df_stats_by_r %>% filter(Conc_Num_Factor == 0) %>% pull(mean_r), - AUC = df_stats_by_AUC %>% filter(Conc_Num_Factor == 0) %>% pull(mean_AUC) - ) - - # Filter and calculate summary statistics - df_within_2SD_K <- filter_within_2SD(df, df_stats_by_K, "K") - df_outside_2SD_K <- anti_join(df, df_within_2SD_K, by = c("OrfRep", "Conc_Num", "Conc_Num_Factor")) - - df_stats_by_L_within_2SD_K <- calculate_summary_stats(df_within_2SD_K, "L") - df_stats_by_L_outside_2SD_K <- calculate_summary_stats(df_outside_2SD_K, "L") - - write.csv(df_stats_by_L_within_2SD_K, file = file.path(args$out_dir, "Max_Observed_L_Vals_for_spots_within_2SD_K.csv"), row.names = FALSE) - - plot_and_save(df_outside_2SD_K, "L", "K", "Raw L vs K for strains falling outside 2SD of the K mean at each conc", - "Raw_L_vs_K_for_strains_2SD_outside_mean_K", args$out_dir) - plot_and_save(df_outside_2SD_K, "Delta_Backgrd", "K", "DeltaBackground vs K for strains falling outside 2SD of the K mean at each conc", - "DeltaBackground_vs_K_for_strains_2SD_outside_mean_K", args$out_dir) - - # Update interaction scores - interaction_scores_RF <- update_interaction_scores(filter_results$stats_within_2SD, background_means, interaction_scores_RF, s, num_non_removed_conc, max_conc) - - # Calculate Z-scores - interaction_scores_RF <- calculate_z_scores(interaction_scores_RF) - - # Save the final interaction statistics - write.csv(interaction_scores_RF, file = file.path(args$out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) - - # Generate ggplot objects for each RF strain - df_stats_interaction_all_RF_final <- tibble() - for (i in seq_len(num_genes_RF)) { - gene_sel <- unique(interaction_scores_RF$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all_RF %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores_RF %>% filter(OrfRep == gene_sel) - - # Generate all required plots - plots <- generate_all_plots(df_z_calculations, df_int_scores) - - p_rf_l[[i]] <- plots$l - p_rf_K[[i]] <- plots$K - p_rf_r[[i]] <- plots$r - p_rf_AUC[[i]] <- plots$AUC - - # Append the final interaction statistics for all RFs - df_stats_interaction_all_RF_final <- bind_rows(df_stats_interaction_all_RF_final, df_z_calculations) - # Original Logic - # if (i == 1) { - # df_stats_interaction_all_RF_final <- df_z_calculations - # } else { - # df_stats_interaction_all_RF_final <- bind_rows(df_stats_interaction_all_RF_final, df_z_calculations) - # } - } - - print("Pass RF ggplot loop") - - # Get Z-scores for Gene Deletion Strains - - # Save the final interaction statistics - write.csv(df_stats_interaction_all_RF_final, file = file.path(args$out_dir, "RF_ZScore_Calculations.csv"), row.names = FALSE) - - # Order and save interaction scores - order_and_save_interaction_scores(interaction_scores_deletion, output_dir) - - # Filter and save linear regression enhancers and suppressors - filter_and_save_lm_interaction_scores(interaction_scores_deletion, output_dir) - - # Generate and save ggplot objects - generate_and_save_plots(df_stats_interaction_all, interaction_scores_deletion, output_dir) - - # Summarize stats for df2_rf - df_stats_rf <- df2_rf %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Define ylim ranges for each plot type - ylim_ranges <- list( - L = c(0, 160), - K = c(-20, 160), - r = c(0, 1), - AUC = c(0, 12500) - ) - - # Create PDF for interaction plots - pdf(file.path(output_dir, "InteractionPlots.pdf"), width = 16, height = 16, onefile = TRUE) - - # Create scatter and box plots for each statistic - plot_l_stats <- create_scatter_plot(df2_rf, "L", ylim_ranges$L, "L") - plot_k_stats <- create_scatter_plot(df2_rf, "K", ylim_ranges$K, "K") - plot_r_stats <- create_scatter_plot(df2_rf, "r", ylim_ranges$r, "r") - plot_auc_stats <- create_scatter_plot(df2_rf, "AUC", ylim_ranges$AUC, "AUC") - - plot_l_stats_box <- create_box_plot(df2_rf, "L", ylim_ranges$L, "L") - plot_k_stats_box <- create_box_plot(df2_rf, "K", ylim_ranges$K, "K") - plot_r_stats_box <- create_box_plot(df2_rf, "r", ylim_ranges$r, "r") - plot_auc_stats_box <- create_box_plot(df2_rf, "AUC", ylim_ranges$AUC, "AUC") - - # Arrange and plot scatter and box plots - grid.arrange(plot_l_stats, plot_k_stats, plot_r_stats, plot_auc_stats, ncol = 2, nrow = 2) - grid.arrange(plot_l_stats_box, plot_k_stats_box, plot_r_stats_box, plot_auc_stats_box, ncol = 2, nrow = 2) - - # Loop to arrange and print combined plots - plot_indices <- seq(1, (num_genes - 1), by = 3) - for (m in seq_along(plot_indices)) { - grid.arrange( - p_l[[plot_indices[m]]], p_K[[plot_indices[m]]], p_r[[plot_indices[m]]], p_AUC[[plot_indices[m]]], - p_l[[plot_indices[m] + 1]], p_K[[plot_indices[m] + 1]], p_r[[plot_indices[m] + 1]], p_AUC[[plot_indices[m] + 1]], - p_l[[plot_indices[m] + 2]], p_K[[plot_indices[m] + 2]], p_r[[plot_indices[m] + 2]], p_AUC[[plot_indices[m] + 2]], - ncol = 4, nrow = 3 - ) - } - - # Handle leftover plots if num_genes is not a multiple of 3 - remaining_plots <- num_genes - max(plot_indices + 2) - if (remaining_plots > 0) { - plot_grid_list <- lapply(seq_len(remaining_plots), function(i) { - list(p_l[[plot_indices[length(plot_indices)] + i]], p_K[[plot_indices[length(plot_indices)] + i]], p_r[[plot_indices[length(plot_indices)] + i]], p_AUC[[plot_indices[length(plot_indices)] + i]]) - }) - do.call(grid.arrange, c(plot_grid_list, list(ncol = 4, nrow = 3))) - } - - dev.off() - - generate_rf_interaction_plots(df2_rf, output_dir, unique(df$Drug[1]), plot_list_rf, num_genes_rf) -} - -# Function to calculate linear models and summaries -calculate_lm_summaries <- function(df_na_rm) { - lm_list <- list( - lm(Z_lm_K ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_r ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_r ~ Z_lm_K, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_K, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_r, data = df_na_rm) - ) - - lapply(lm_list, summary) -} - -# Function to generate correlation plots, with optional highlighting -generate_plots <- function(df_na_rm, lm_summaries, interaction_scores_rf_filtered = NULL) { - plot_data <- list( - list(x_var = "Z_lm_L", y_var = "Z_lm_K", title = "Interaction L vs. Interaction K", summary_index = 1), - list(x_var = "Z_lm_L", y_var = "Z_lm_r", title = "Interaction L vs. Interaction r", summary_index = 2), - list(x_var = "Z_lm_L", y_var = "Z_lm_AUC", title = "Interaction L vs. Interaction AUC", summary_index = 3), - list(x_var = "Z_lm_K", y_var = "Z_lm_r", title = "Interaction K vs. Interaction r", summary_index = 4), - list(x_var = "Z_lm_K", y_var = "Z_lm_AUC", title = "Interaction K vs. Interaction AUC", summary_index = 5), - list(x_var = "Z_lm_r", y_var = "Z_lm_AUC", title = "Interaction r vs. Interaction AUC", summary_index = 6) - ) - - plot_list <- lapply(plot_data, function(pd) { - plot <- ggplot(df_na_rm, aes_string(x = pd$x_var, y = pd$y_var)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle(pd$title) + - xlab(paste("z-score", gsub("Z_lm_", "", pd$x_var))) + - ylab(paste("z-score", gsub("Z_lm_", "", pd$y_var))) + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[pd$summary_index]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme( - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - axis.text.x = element_text(size = 16), - axis.title.x = element_text(size = 18), - axis.text.y = element_text(size = 16), - axis.title.y = element_text(size = 18) - ) - - if (!is.null(interaction_scores_rf_filtered)) { - plot <- plot + geom_point(data = interaction_scores_rf_filtered, aes_string(x = pd$x_var, y = pd$y_var), color = "cyan") - } - - plot - }) - - plot_list -} - -# Function to generate and save plots to PDF -generate_and_save_plots <- function(output_dir, df_na_rm, interaction_scores_rf) { - lm_summaries <- calculate_lm_summaries(df_na_rm) - - # Create PDF for correlation plots - pdf(file.path(output_dir, "Correlation_CPPs.pdf"), width = 10, height = 7, onefile = TRUE) - - # Generate and print regular plots - plot_list <- generate_plots(df_na_rm, lm_summaries) - lapply(plot_list, print) - - # Generate and print highlighted plots - interaction_scores_rf_filtered <- interaction_scores_rf[!is.na(interaction_scores_rf$Z_lm_L), ] - highlighted_plot_list <- generate_plots(df_na_rm, lm_summaries, interaction_scores_rf_filtered) - lapply(highlighted_plot_list, print) - - dev.off() -} - -# Call the function to generate plots and save to PDF -generate_and_save_plots(output_dir, df_na_rm, interaction_scores_rf) - diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores4.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores4.R deleted file mode 100644 index 3fa82380..00000000 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores4.R +++ /dev/null @@ -1,1797 +0,0 @@ -suppressMessages({ - library(ggplot2) - library(plotly) - library(htmlwidgets) - library(dplyr) - library(ggthemes) - library(data.table) -}) - -options(warn = 2, max.print = 1000) - -# Constants for configuration -PLOT_WIDTH <- 14 -PLOT_HEIGHT <- 9 -BASE_SIZE <- 14 - -parse_arguments <- function() { - args <- if (interactive()) { - c( - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", - 3, - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", - "Experiment 1: Doxo versus HLD", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", - "Experiment 2: HLD versus Doxo" - ) - } else { - commandArgs(trailingOnly = TRUE) - } - - paths <- normalizePath(file.path(args[seq(5, length(args), by = 2)]), mustWork = FALSE) - names <- args[seq(6, length(args), by = 2)] - experiments <- setNames(paths, names) - - list( - out_dir = normalizePath(file.path(args[1]), mustWork = FALSE), - sd = as.numeric(args[2]), - sgd_gene_list = normalizePath(file.path(args[3]), mustWork = FALSE), - easy_results_file = normalizePath(file.path(args[4]), mustWork = FALSE), - experiments = experiments - ) -} - -args <- parse_arguments() - -dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) -dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) - -# Define themes and scales -theme_publication <- function(base_size = BASE_SIZE, base_family = "sans", legend_position = "bottom") { - theme_foundation(base_size = base_size, base_family = base_family) + - theme( - plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), - text = element_text(), - panel.background = element_rect(colour = NA), - plot.background = element_rect(colour = NA), - panel.border = element_rect(colour = NA), - axis.title = element_text(face = "bold", size = rel(1)), - axis.title.y = element_text(angle = 90, vjust = 2), - axis.title.x = element_text(vjust = -0.2), - axis.line = element_line(colour = "black"), - panel.grid.major = element_line(colour = "#f0f0f0"), - panel.grid.minor = element_blank(), - legend.key = element_rect(colour = NA), - legend.position = legend_position, - legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"), - plot.margin = unit(c(10, 5, 5, 5), "mm"), - strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), - strip.text = element_text(face = "bold") - ) -} - -scale_fill_publication <- function(...) { - discrete_scale("fill", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -scale_colour_publication <- function(...) { - discrete_scale("colour", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -# Load SGD gene list -sgd_genes <- function(sgd_gene_list) { - read.delim(file = sgd_gene_list, quote = "", header = FALSE, - colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) %>% - dplyr::rename(ORF = V4, GeneName = V5) -} - -genes <- sgd_genes(args$sgd_gene_list) - -# Load the initial dataframe from the easy_results_file -load_and_preprocess_data <- function(easy_results_file, std) { - df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - - # Clean and convert columns to numeric where appropriate - df <- df %>% - filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% - mutate( - Col = as.numeric(Col), - Row = as.numeric(Row), - L = as.numeric(l), - K = as.numeric(K), - r = as.numeric(r), - Scan = as.numeric(Scan), - AUC = as.numeric(AUC96), - LstBackgrd = as.numeric(LstBackgrd), - X1stBackgrd = as.numeric(X1stBackgrd), - OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep), - Conc_Num = as.numeric(gsub("[^0-9\\.]", "", Conc)), - Conc_Num_Factor = as.numeric(as.factor(Conc_Num)) - 1 - ) - - # Define max concentration factor - max_conc <- max(df$Conc_Num_Factor, na.rm = TRUE) - - return(list(df = df, max_conc = max_conc)) -} - -# Function to update Gene names using the SGD gene list -update_gene_names <- function(df, sgd_gene_list) { - genes <- read.delim(file = sgd_gene_list, quote = "", header = FALSE, colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) - gene_map <- setNames(genes$V5, genes$V4) - - df <- df %>% - rowwise() %>% - mutate(Gene = ifelse(OrfRep != "YDL227C", gene_map[[ORF]], Gene)) %>% - ungroup() %>% - mutate(Gene = ifelse(Gene == "" | Gene == "OCT1", OrfRep, Gene)) - - return(df) -} - -# Apply the functions -result <- load_and_preprocess_data(args$easy_results_file, args$sd) -df <- update_gene_names(result$df, args$sgd_gene_list) -max_conc <- result$max_conc - -# Function to perform initial calculations and QC before filtering -perform_initial_calculations <- function(df, output_dir_qc, delta_bg_sd_factor) { - - # Add Delta_Backgrd column - df <- df %>% - mutate(Delta_Backgrd = LstBackgrd - X1stBackgrd) - - # Calculate Delta_Background_Tolerance - Delta_Background_Tolerance <- mean(df$Delta_Backgrd, na.rm = TRUE) + - (delta_bg_sd_factor * sd(df$Delta_Backgrd, na.rm = TRUE)) - print(paste("Delta_Background_Tolerance is", Delta_Background_Tolerance)) - - # Save plots to PDF and HTML - plot_and_save <- function(plot_obj, filename, output_dir_qc) { - pdf_path <- file.path(output_dir_qc, paste0(filename, ".pdf")) - html_path <- file.path(output_dir_qc, paste0(filename, ".html")) - pdf(pdf_path, width = 12, height = 8) - print(plot_obj) - dev.off() - pgg <- ggplotly(plot_obj) - saveWidget(pgg, html_path, selfcontained = TRUE) - } - - # Generate and save plate analysis plots - generate_plate_analysis_plots <- function(df, variable, output_dir_qc, delta_bg = FALSE) { - if (!delta_bg) { - plot_name <- paste("Plate analysis by Drug Conc for", variable, "before quality control") - } else { - plot_name <- "Plate analysis by Drug Conc for Delta_Backgrd before quality control" - variable <- "Delta_Backgrd" - } - - p1 <- ggplot(df, aes(Scan, .data[[variable]], color = as.factor(Conc_Num))) + - geom_point(shape = 3, size = 0.2, position = "jitter") + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) + - ggtitle(plot_name) + - theme_publication() - - p2 <- ggplot(df, aes(as.factor(Scan), .data[[variable]], color = as.factor(Conc_Num))) + - geom_boxplot() + - ggtitle(plot_name) + - theme_publication() - - plot_and_save(p1, paste0("Plate_Analysis_", variable), output_dir_qc) - plot_and_save(p2, paste0("Plate_Analysis_", variable, "_Box"), output_dir_qc) - } - - variables <- c("L", "K", "r", "AUC") - lapply(variables, generate_plate_analysis_plots) - generate_plate_analysis_plots("Delta_Backgrd", delta_bg = TRUE) - - # Raw L vs K plot before QC - Raw_l_vs_K_beforeQC <- ggplot(df, aes(l, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle("Raw L vs K before QC") + - theme_publication_legend_right() - - plot_and_save(Raw_l_vs_K_beforeQC, "Raw_L_vs_K_beforeQC", output_dir_qc) - - # Subset and plot data above Delta_Background_Tolerance - df_above_tolerance <- df %>% filter(Delta_Backgrd >= Delta_Background_Tolerance) - df_above_tolerance_K_halfmedian <- median(df_above_tolerance$K, na.rm = TRUE) / 2 - df_above_tolerance_L_halfmedian <- median(df_above_tolerance$L, na.rm = TRUE) / 2 - df_above_tolerance_toRemove <- nrow(df_above_tolerance) - - df_Delta_Backgrd_above_Tolerance_L_vs_K <- ggplot(df_above_tolerance, aes(l, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle(paste("Raw L vs K for strains above delta background threshold of", Delta_Background_Tolerance, "or above")) + - annotate("text", x = df_above_tolerance_L_halfmedian, y = df_above_tolerance_K_halfmedian, - label = paste("Strains above delta background tolerance = ", df_above_tolerance_toRemove)) + - theme_publication_legend_right() - - plot_and_save(df_Delta_Backgrd_above_Tolerance_L_vs_K, "Raw_L_vs_K_for_strains_above_deltabackgrd_threshold", output_dir_qc) - - # Frequency and Bar plots for Delta Background - DeltaBackground_Frequency_Plot <- ggplot(df, aes(Delta_Backgrd, color = as.factor(Conc_Num))) + - geom_density() + - ggtitle("Density plot for Delta Background by Conc All Data") + - theme_publication_legend_right() - - DeltaBackground_Bar_Plot <- ggplot(df, aes(Delta_Backgrd, color = as.factor(Conc_Num))) + - geom_bar() + - ggtitle("Bar plot for Delta Background by Conc All Data") + - theme_publication_legend_right() - - plot_and_save(DeltaBackground_Frequency_Plot, "Frequency_Delta_Background", output_dir_qc) - plot_and_save(DeltaBackground_Bar_Plot, "Bar_Delta_Background", output_dir_qc) - - return(df) -} - -# Example usage with df and other parameters from the previous steps -df <- perform_initial_calculations(df, args$out_dir, args$sd) - -# Identify missing data (No Growth) and replace values where Delta_Backgrd exceeds tolerance -df <- df %>% - mutate( - NG = ifelse(L == 0 & !is.na(l), 1, 0), - DB = ifelse(Delta_Backgrd >= Delta_Background_Tolerance, 1, 0), - L = ifelse(DB == 1, NA, L), - r = ifelse(DB == 1, NA, r), - AUC = ifelse(DB == 1, NA, AUC), - K = ifelse(DB == 1, NA, K) - ) - -# Function to create and save plots -create_qc_plot <- function(df, var, plot_type) { - if (plot_type == "scatter") { - plot <- ggplot(df, aes(Scan, !!sym(var), color = as.factor(Conc_Num))) + - geom_point(shape = 3, size = 0.2) + - stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - } else if (plot_type == "box") { - plot <- ggplot(df, aes(as.factor(Scan), !!sym(var), color = as.factor(Conc_Num))) + - geom_boxplot() + - ggtitle(paste("Plate analysis by Drug Conc for", var, "after quality control")) + - theme_publication() - } - plot -} - -# Function to generate and save both scatter and box plots -generate_and_save_qc_plots <- function(df, variables, output_dir_qc) { - lapply(variables, function(var) { - scatter_plot <- create_qc_plot(df, var, "scatter") - box_plot <- create_qc_plot(df, var, "box") - - plot_and_save(scatter_plot, paste0("Plate_Analysis_", var, "_afterQC"), output_dir_qc) - plot_and_save(box_plot, paste0("Plate_Analysis_", var, "_Box_afterQC"), output_dir_qc) - }) -} - -# Generate and save QC plots for l, K, r, AUC, and Delta_Backgrd after QC -variables <- c("L", "K", "r", "AUC", "Delta_Backgrd") - -# Generate and save QC plots for all data -generate_and_save_qc_plots(df, variables, args$out_dir) - -# Generate and save QC plots for data without zero values for L -df_noZero <- df %>% filter(L > 0) -generate_and_save_qc_plots(df_noZero, variables, args$out_dir) - -# Print plate analysis data before and after QC to PDFs -pdf(file = file.path(args$out_dir, "Plate_Analysis.pdf"), width = 14, height = 9) -print(Plate_Analysis_L) -print(Plate_Analysis_L_afterQC) -print(Plate_Analysis_K) -print(Plate_Analysis_K_afterQC) -print(Plate_Analysis_r) -print(Plate_Analysis_r_afterQC) -print(Plate_Analysis_AUC) -print(Plate_Analysis_AUC_afterQC) -print(Plate_Analysis_Delta_Backgrd) -print(Plate_Analysis_Delta_Backgrd_afterQC) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_Boxplots.pdf"), width = 18, height = 9) -print(Plate_Analysis_L_Box) -print(Plate_Analysis_L_Box_afterQC) -print(Plate_Analysis_K_Box) -print(Plate_Analysis_K_Box_afterQC) -print(Plate_Analysis_r_Box) -print(Plate_Analysis_r_Box_afterQC) -print(Plate_Analysis_AUC_Box) -print(Plate_Analysis_AUC_Box_afterQC) -print(Plate_Analysis_Delta_Backgrd_Box) -print(Plate_Analysis_Delta_Backgrd_Box_afterQC) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_noZeros.pdf"), width = 14, height = 9) -print(Plate_Analysis_L_afterQC_Z) -print(Plate_Analysis_K_afterQC_Z) -print(Plate_Analysis_r_afterQC_Z) -print(Plate_Analysis_AUC_afterQC_Z) -print(Plate_Analysis_Delta_Backgrd_afterQC_Z) -dev.off() - -pdf(file = file.path(args$out_dir, "Plate_Analysis_noZeros_Boxplots.pdf"), width = 18, height = 9) -print(Plate_Analysis_L_Box_afterQC_Z) -print(Plate_Analysis_K_Box_afterQC_Z) -print(Plate_Analysis_r_Box_afterQC_Z) -print(Plate_Analysis_AUC_Box_afterQC_Z) -print(Plate_Analysis_Delta_Backgrd_Box_afterQC_Z) -dev.off() - -# Clean up -rm(df_noZero) - -# Calculate and save summary statistics for all strains for a given variable -calculate_summary_stats <- function(df, variable, group_by = NULL) { - if (!is.null(group_by)) { - df %>% - group_by(group_by) %>% - summarise( - N = n(), - mean_val = mean(.data[[variable]], na.rm = TRUE), - median_val = median(.data[[variable]], na.rm = TRUE), - max_val = max(.data[[variable]], na.rm = TRUE), - min_val = min(.data[[variable]], na.rm = TRUE), - sd_val = sd(.data[[variable]], na.rm = TRUE), - se_val = sd_val / sqrt(N - 1) - ) - } else { - df %>% - summarise( - N = n(), - mean_val = mean(.data[[variable]], na.rm = TRUE), - median_val = median(.data[[variable]], na.rm = TRUE), - max_val = max(.data[[variable]], na.rm = TRUE), - min_val = min(.data[[variable]], na.rm = TRUE), - sd_val = sd(.data[[variable]], na.rm = TRUE), - se_val = sd_val / sqrt(N - 1) - ) - } -} - -# Function to calculate and save summary statistics for all variables -save_summary_stats <- function(df, variables, output_dir) { - summary_stats <- df %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise(across(all_of(variables), ~ list(calculate_summary_stats(cur_data(), .)), .names = "{.col}_stats")) %>% - unnest_wider(c(L_stats, K_stats, r_stats, AUC_stats)) - - write.csv(summary_stats, file = file.path(output_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) -} - -# Define the variables to calculate statistics for -variables <- c("L", "K", "r", "AUC") - -save_summary_stats(df, variables, args$out_dir) - -# Background Strains -background_strains <- c("YDL227C") - -# Function to calculate and save summary statistics for a background strain -save_background_summary_stats <- function(df, strain, variables, output_dir, group_by = NULL) { - df_background <- df %>% - filter(OrfRep == strain) %>% - mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .))) %>% - filter(!is.na(L)) - - # Calculate summary statistics for each variable - df_stats_combined <- df_background %>% - group_by(group_by) %>% - summarise( - across(all_of(variables), list( - mean = ~ mean(.x, na.rm = TRUE), - median = ~ median(.x, na.rm = TRUE), - max = ~ max(.x, na.rm = TRUE), - min = ~ min(.x, na.rm = TRUE), - sd = ~ sd(.x, na.rm = TRUE), - se = ~ sd(.x, na.rm = TRUE) / sqrt(n() - 1) - ), .names = "{.col}_{.fn}") - ) - - # Save the summary statistics for the background strain - write.csv(df_stats_combined, file = file.path(output_dir, "SummaryStats_BackgroundStrains.csv"), row.names = FALSE) -} - -for (strain in background_strains) { - save_background_summary_stats(df, strain, variables, args$out_dir, group_by = c("OrfRep", "Conc_Num", "Conc_Num_Factor")) -} - -# Iterate over each background strain -for (s in background_strains) { - - # Calculate summary statistics for L, K, r, and AUC - df_stats_by_L <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1) - ) - - max_sd_L <- max(df_stats_by_L$sd_L, na.rm = TRUE) - - df_stats_by_K <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1) - ) - - max_sd_K <- max(df_stats_by_K$sd_K, na.rm = TRUE) - - df_stats_by_r <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1) - ) - - max_sd_r <- max(df_stats_by_r$sd_r, na.rm = TRUE) - - df_stats_by_AUC <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1) - ) - - max_sd_AUC <- max(df_stats_by_AUC$sd_AUC, na.rm = TRUE) - - # Combine all summary statistics into one dataframe - df_stats_by <- df_background %>% - group_by(OrfRep, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1) - ) - - # Save the summary statistics for the background strain - write.csv(df_stats_by, file = file.path(args$out_dir, "SummaryStats_BackgroundStrains.csv"), row.names = FALSE) - - # Calculate max theoretical L values within 2SD of the reference strain - df_within_2SD_K <- data.frame() - df_outside_2SD_K <- data.frame() - - for (q in unique(df$Conc_Num_Factor)) { - if (q == 0) { - df_within_2SD_K <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_stats_TEMP_K <- df_stats_by_K %>% filter(Conc_Num_Factor == q) - df_within_2SD_K <- df_within_2SD_K %>% - filter(K >= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) & - K <= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - - df_outside_2SD_K <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_outside_2SD_K <- df_outside_2SD_K %>% - filter(K <= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) | - K >= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - } else { - df_within_2SD_K_temp <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_stats_TEMP_K <- df_stats_by_K %>% filter(Conc_Num_Factor == q) - df_within_2SD_K_temp <- df_within_2SD_K_temp %>% - filter(K >= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) & - K <= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - - df_within_2SD_K <- bind_rows(df_within_2SD_K, df_within_2SD_K_temp) - - df_outside_2SD_K_temp <- df %>% filter(Conc_Num_Factor == q & !is.na(L)) - df_outside_2SD_K_temp <- df_outside_2SD_K_temp %>% - filter(K <= (df_stats_TEMP_K$mean_K - (2 * df_stats_TEMP_K$sd_K)) | - K >= (df_stats_TEMP_K$mean_K + (2 * df_stats_TEMP_K$sd_K))) - - df_outside_2SD_K <- bind_rows(df_outside_2SD_K, df_outside_2SD_K_temp) - } - } - - # Calculate summary statistics for L within and outside 2SD of K - df_stats_by_L_within_2SD_K <- df_within_2SD_K %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - z_max = (max_L - mean_L) / sd_L - ) - - max_sd_within_2SD_K <- max(df_stats_by_L_within_2SD_K$sd_L, na.rm = TRUE) - - write.csv(df_stats_by_L_within_2SD_K, file = file.path(args$out_dir, "Max_Observed_L_Vals_for_spots_within_2SD_K.csv"), row.names = FALSE) - - df_stats_by_L_outside_2SD_K <- df_outside_2SD_K %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1) - ) - - max_sd_outside_2SD_K <- max(df_stats_by_L_outside_2SD_K$sd_L, na.rm = TRUE) - - # Plot and save L vs K for strains outside 2SD of K - plot_L_vs_K_outside_2SD_K <- ggplot(df_outside_2SD_K, aes(L, K, color = as.factor(Conc_Num))) + - geom_point(aes(ORF = ORF, Gene = Gene, Delta_Backgrd = Delta_Backgrd), shape = 3) + - ggtitle("Raw L vs K for strains falling outside 2SD of the K mean at each conc") + - theme_publication_legend_right() - - pdf(file = file.path(args$out_dir, "Raw_L_vs_K_for_strains_2SD_outside_mean_K.pdf"), width = 10, height = 8) - print(plot_L_vs_K_outside_2SD_K) - dev.off() - - saveWidget(ggplotly(plot_L_vs_K_outside_2SD_K), file = file.path(args$out_dir, "RawL_vs_K_for_strains_outside_2SD_K.html"), selfcontained = TRUE) - - # Plot and save DeltaBackground vs K for strains outside 2SD of K - plot_DeltaBackground_vs_K_outside_2SD_K <- ggplot(df_outside_2SD_K, aes(Delta_Backgrd, K, color = as.factor(Conc_Num))) + - geom_point(aes(L = l, ORF = ORF, Gene = Gene), shape = 3, position = "jitter") + - ggtitle("DeltaBackground vs K for strains falling outside 2SD of the K mean at each conc") + - theme_publication_legend_right() - - pdf(file = file.path(args$out_dir, "DeltaBackground_vs_K_for_strains_2SD_outside_mean_K.pdf"), width = 10, height = 8) - print(plot_DeltaBackground_vs_K_outside_2SD_K) - dev.off() - - saveWidget(ggplotly(plot_DeltaBackground_vs_K_outside_2SD_K), file = file.path(args$out_dir, "DeltaBackground_vs_K_for_strains_outside_2SD_K.html"), selfcontained = TRUE) - - # Get the background strain mean values at the no-drug concentration (Conc_Num_Factor = 0) - background_mean_L <- df_stats_by_L %>% filter(Conc_Num_Factor == 0) %>% pull(mean_L) - background_mean_K <- df_stats_by_K %>% filter(Conc_Num_Factor == 0) %>% pull(mean_K) - background_mean_r <- df_stats_by_r %>% filter(Conc_Num_Factor == 0) %>% pull(mean_r) - background_mean_AUC <- df_stats_by_AUC %>% filter(Conc_Num_Factor == 0) %>% pull(mean_AUC) - - # Initialize empty plots (placeholder for future plotting) - p_l <- ggplot() - p_K <- ggplot() - p_r <- ggplot() - p_AUC <- ggplot() - p_rf_l <- ggplot() - p_rf_K <- ggplot() - p_rf_r <- ggplot() - p_rf_AUC <- ggplot() - - # Get only the deletion strains (excluding the background strain) - df_deletion_strains <- df %>% filter(OrfRep != s) - - # Initialize SM (Set to Max) column - df_deletion_strains <- df_deletion_strains %>% mutate(SM = 0) - - # Set missing values to the highest theoretical value at each drug concentration for L - df_deletion_strains_new <- data.frame() - - for (i in seq_along(unique(df_deletion_strains$Conc_Num))) { - concentration <- unique(df_deletion_strains$Conc_Num)[i] - df_temp <- df_deletion_strains %>% filter(Conc_Num == concentration) - - if (concentration == 0) { - df_deletion_strains_new <- df_temp - message(paste("Check loop order, conc =", concentration)) - } else { - max_L_theoretical <- df_stats_by_L_within_2SD_K %>% filter(Conc_Num_Factor == concentration) %>% pull(max_L) - - df_temp <- df_temp %>% - mutate( - L = ifelse(L == 0 & !is.na(L), max_L_theoretical, L), - SM = ifelse(L >= max_L_theoretical & !is.na(L), 1, SM), - L = ifelse(L >= max_L_theoretical & !is.na(L), max_L_theoretical, L) - ) - - df_deletion_strains_new <- bind_rows(df_deletion_strains_new, df_temp) - message(paste("Check loop order, conc =", concentration)) - } - } - - df_deletion_strains <- df_deletion_strains_new - - # Get only the reference strains (background strain) - df_reference_strains <- df %>% filter(OrfRep == s) - - # Initialize SM (Set to Max) column - df_reference_strains <- df_reference_strains %>% mutate(SM = 0) - - # Set missing values to the highest theoretical value at each drug concentration for L - df_reference_strains_new <- data.frame() - - for (i in seq_along(unique(df_reference_strains$Conc_Num))) { - concentration <- unique(df_reference_strains$Conc_Num)[i] - df_rf_temp <- df_reference_strains %>% filter(Conc_Num == concentration) - - if (concentration == 0) { - df_reference_strains_new <- df_rf_temp - message(paste("Check loop order, conc =", concentration)) - } else { - max_L_theoretical <- df_stats_by_L_within_2SD_K %>% filter(Conc_Num_Factor == concentration) %>% pull(max_L) - - df_rf_temp <- df_rf_temp %>% - mutate( - L = ifelse(L == 0 & !is.na(L), max_L_theoretical, L), - SM = ifelse(L >= max_L_theoretical & !is.na(L), 1, SM), - L = ifelse(L >= max_L_theoretical & !is.na(L), max_L_theoretical, L) - ) - - df_reference_strains_new <- bind_rows(df_reference_strains_new, df_rf_temp) - message(paste("Check loop order, if error, refs have no L values outside theoretical max L, for REFs, conc =", concentration)) - } - } - - df_reference_strains <- df_reference_strains_new - - df_RF <- df_RF_new - - # Get the RF Z score values - - # Change the OrfRep column to include the RF strain, Gene name, and Num so each RF gets its own score - df_RF <- df_RF %>% - mutate(OrfRep = paste(OrfRep, Gene, Num, sep = "_")) - - num_genes_RF <- length(unique(df_RF$OrfRep)) - print(num_genes_RF) - - # Create the output dataframe containing columns for each RF strain - interaction_scores_RF <- df_RF %>% - distinct(OrfRep) %>% - mutate( - Gene = NA, - Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA, R_Squared_L = NA, Sum_Z_Score_L = NA, Avg_Zscore_L = NA, - Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA, R_Squared_K = NA, Sum_Z_Score_K = NA, Avg_Zscore_K = NA, - Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA, R_Squared_r = NA, Sum_Z_Score_r = NA, Avg_Zscore_r = NA, - Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA, R_Squared_AUC = NA, Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA, - NG = NA, SM = NA - ) - - for (i in seq_len(num_genes_RF)) { - # Get each deletion strain ORF - gene_sel <- unique(df_RF$OrfRep)[i] - - # Extract only the current deletion strain and its data - df_gene_sel <- df_RF %>% filter(OrfRep == gene_sel) - - # Calculate summary statistics for the selected gene - df_stats_interaction <- df_gene_sel %>% - group_by(OrfRep, Gene, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = n(), - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - se_AUC = sd_AUC / sqrt(N - 1), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Get shift values - if (is.na(df_stats_interaction$mean_L[1]) || df_stats_interaction$mean_L[1] == 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = 0, Z_Shift_L = 0, - Raw_Shift_K = 0, Z_Shift_K = 0, - Raw_Shift_r = 0, Z_Shift_r = 0, - Raw_Shift_AUC = 0, Z_Shift_AUC = 0 - ) - } else { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = mean_L[1] - background_mean_L, - Z_Shift_L = Raw_Shift_L / df_stats_by_L$sd[1], - Raw_Shift_K = mean_K[1] - background_mean_K, - Z_Shift_K = Raw_Shift_K / df_stats_by_K$sd[1], - Raw_Shift_r = mean_r[1] - background_mean_r, - Z_Shift_r = Raw_Shift_r / df_stats_by_r$sd[1], - Raw_Shift_AUC = mean_AUC[1] - background_mean_AUC, - Z_Shift_AUC = Raw_Shift_AUC / df_stats_by_AUC$sd[1] - ) - } - - # Add wild-type (WT) values and standard deviations - df_stats_interaction <- df_stats_interaction %>% - mutate( - WT_l = df_stats_by_L$mean, WT_sd_l = df_stats_by_L$sd, - WT_K = df_stats_by_K$mean, WT_sd_K = df_stats_by_K$sd, - WT_r = df_stats_by_r$mean, WT_sd_r = df_stats_by_r$sd, - WT_AUC = df_stats_by_AUC$mean, WT_sd_AUC = df_stats_by_AUC$sd - ) - - # Calculate scores if there is growth at no drug concentration - if (df_stats_interaction$mean_L[1] != 0 && !is.na(df_stats_interaction$mean_L[1])) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Exp_L = WT_l + Raw_Shift_L, - Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, - Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, - Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, - Delta_AUC = mean_AUC - Exp_AUC, - Zscore_L = Delta_L / WT_sd_l, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - # Handle no growth (NG) and set to max (SM) values - if (sum(df_stats_interaction$NG, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Delta_L = ifelse(NG == 1, mean_L - WT_l, Delta_L), - Delta_K = ifelse(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = ifelse(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = ifelse(NG == 1, mean_AUC - WT_AUC, Delta_AUC) - ) - } - - if (sum(df_stats_interaction$SM, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(Delta_L = ifelse(SM == 1, mean_L - WT_l, Delta_L)) - } - - # Calculate linear models - gene_lm_L <- lm(Delta_L ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_K <- lm(Delta_K ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_r <- lm(Delta_r ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_AUC <- lm(Delta_AUC ~ Conc_Num_Factor, data = df_stats_interaction) - - # Calculate interaction scores and R-squared values - gene_interaction_L <- max_conc * coef(gene_lm_L)[2] + coef(gene_lm_L)[1] - r_squared_l <- summary(gene_lm_L)$r.squared - gene_interaction_K <- max_conc * coef(gene_lm_K)[2] + coef(gene_lm_K)[1] - r_squared_K <- summary(gene_lm_K)$r.squared - gene_interaction_r <- max_conc * coef(gene_lm_r)[2] + coef(gene_lm_r)[1] - r_squared_r <- summary(gene_lm_r)$r.squared - gene_interaction_AUC <- max_conc * coef(gene_lm_AUC)[2] + coef(gene_lm_AUC)[1] - r_squared_AUC <- summary(gene_lm_AUC)$r.squared - - # Total non-removed concentrations - num_non_removed_conc <- Total_Conc_Nums - sum(df_stats_interaction$DB, na.rm = TRUE) - 1 - - # Report the scores - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_L[1]), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, df_stats_interaction$Z_Shift_L[1]), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, gene_interaction_L), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, r_squared_l), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE)), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_K[1]), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, df_stats_interaction$Z_Shift_K[1]), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, gene_interaction_K), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, r_squared_K), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE)), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_r[1]), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, df_stats_interaction$Z_Shift_r[1]), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, gene_interaction_r), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, r_squared_r), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE)), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_AUC[1]), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Z_Shift_AUC[1]), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, gene_interaction_AUC), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, r_squared_AUC), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE)), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE) / num_non_removed_conc) - ) - } else { - # Handle case where mean_L is 0 or NA - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, NA), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, NA), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, NA), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, NA), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, NA), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, NA), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, NA), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, NA), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, NA), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, NA), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, NA), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, NA), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, NA), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, NA), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, NA), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, NA), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, NA), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, NA), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, NA), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, NA), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, NA), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, NA), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, NA), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, NA) - ) - } - - # Append the interaction statistics for all RFs - if (i == 1) { - df_stats_interaction_all_RF <- df_stats_interaction - } else { - df_stats_interaction_all_RF <- bind_rows(df_stats_interaction_all_RF, df_stats_interaction) - } - - # Add NG, DB, and SM values to the InteractionScores_RF dataframe - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - NG = replace(NG, OrfRep == gene_sel, sum(df_stats_interaction$NG, na.rm = TRUE)), - DB = replace(DB, OrfRep == gene_sel, sum(df_stats_interaction$DB, na.rm = TRUE)), - SM = replace(SM, OrfRep == gene_sel, sum(df_stats_interaction$SM, na.rm = TRUE)) - ) - } - - print("Pass RF Calculation loop") - - # Calculate summary statistics for the linear models - lm_sd_L <- sd(interaction_scores_RF$lm_Score_L, na.rm = TRUE) - lm_sd_K <- sd(interaction_scores_RF$lm_Score_K, na.rm = TRUE) - lm_sd_r <- sd(interaction_scores_RF$lm_Score_r, na.rm = TRUE) - lm_sd_AUC <- sd(interaction_scores_RF$lm_Score_AUC, na.rm = TRUE) - - lm_mean_L <- mean(interaction_scores_RF$lm_Score_L, na.rm = TRUE) - lm_mean_K <- mean(interaction_scores_RF$lm_Score_K, na.rm = TRUE) - lm_mean_r <- mean(interaction_scores_RF$lm_Score_r, na.rm = TRUE) - lm_mean_AUC <- mean(interaction_scores_RF$lm_Score_AUC, na.rm = TRUE) - - print(paste("Mean RF linear regression score L:", lm_mean_L)) - - # Calculate Z scores for the linear models - interaction_scores_RF <- interaction_scores_RF %>% - mutate( - Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L, - Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K, - Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, - Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC - ) - - # Sort the dataframe by Z_lm_L and NG (No Growth) - interaction_scores_RF <- interaction_scores_RF %>% - arrange(desc(Z_lm_L), desc(NG)) - - write.csv(interaction_scores_RF, file = file.path(output_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) - - # Generate ggplot objects for each RF strain - for (i in seq_len(num_genes_RF)) { - gene_sel <- unique(interaction_scores_RF$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all_RF %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores_RF %>% filter(OrfRep == gene_sel) - - p_rf_l[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_L)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_l), ymax = 0 + (2 * WT_sd_l)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_L, 2))) + - annotate("text", x = 1, y = 25, label = paste("lm Zscore =", round(df_int_scores$Z_lm_L, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-60, -50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50, 60)) + - theme_publication() - - p_rf_K[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_K)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_K), ymax = 0 + (2 * WT_sd_K)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_K, 2))) + - annotate("text", x = 1, y = 25, label = paste("lm Zscore =", round(df_int_scores$Z_lm_K, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-60, -50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50, 60)) + - theme_publication() - - p_rf_r[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_r)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-0.65, 0.65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_r), ymax = 0 + (2 * WT_sd_r)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 0.45, label = paste("ZShift =", round(df_int_scores$Z_Shift_r, 2))) + - annotate("text", x = 1, y = 0.25, label = paste("lm Zscore =", round(df_int_scores$Z_lm_r, 2))) + - annotate("text", x = 1, y = -0.25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -0.35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -0.45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6)) + - theme_publication() - - p_rf_AUC[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_AUC)) + - geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-6500, 6500)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_AUC), ymax = 0 + (2 * WT_sd_AUC)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 4500, label = paste("ZShift =", round(df_int_scores$Z_Shift_AUC, 2))) + - annotate("text", x = 1, y = 2500, label = paste("lm Zscore =", round(df_int_scores$Z_lm_AUC, 2))) + - annotate("text", x = 1, y = -2500, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -3500, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -4500, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = c(-6000, -5000, -4000, -3000, -2000, -1000, 0, 1000, 2000, 3000, 4000, 5000, 6000)) + - theme_publication() - - # Append the final interaction statistics for all RFs - if (i == 1) { - df_stats_interaction_all_RF_final <- df_z_calculations - } else { - df_stats_interaction_all_RF_final <- bind_rows(df_stats_interaction_all_RF_final, df_z_calculations) - } - } - - print("Pass RF ggplot loop") - - # Save the final interaction statistics - write.csv(df_stats_interaction_all_RF_final, file = file.path(output_dir, "RF_ZScore_Calculations.csv"), row.names = FALSE) - - ###### Part 5 - Get Z-scores for Gene Deletion Strains - - # Get the total number of genes for the loop - num_genes <- length(unique(df_deletion$OrfRep)) - print(num_genes) - - # Create the output dataframe containing columns for each deletion strain - interaction_scores_deletion <- unique(df_deletion["OrfRep"]) - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = NA, - Raw_Shift_L = NA, Z_Shift_L = NA, lm_Score_L = NA, Z_lm_L = NA, R_Squared_L = NA, - Sum_Z_Score_L = NA, Avg_Zscore_L = NA, - Raw_Shift_K = NA, Z_Shift_K = NA, lm_Score_K = NA, Z_lm_K = NA, R_Squared_K = NA, - Sum_Z_Score_K = NA, Avg_Zscore_K = NA, - Raw_Shift_r = NA, Z_Shift_r = NA, lm_Score_r = NA, Z_lm_r = NA, R_Squared_r = NA, - Sum_Z_Score_r = NA, Avg_Zscore_r = NA, - Raw_Shift_AUC = NA, Z_Shift_AUC = NA, lm_Score_AUC = NA, Z_lm_AUC = NA, R_Squared_AUC = NA, - Sum_Z_Score_AUC = NA, Avg_Zscore_AUC = NA, - NG = NA, DB = NA, SM = NA - ) - - for (i in seq_len(num_genes)) { - gene_sel <- unique(df_deletion$OrfRep)[i] - df_gene_sel <- df_deletion %>% filter(OrfRep == gene_sel) - - df_stats_interaction <- df_gene_sel %>% - group_by(OrfRep, Gene, Conc_Num, Conc_Num_Factor) %>% - summarise( - N = length(L), mean_L = mean(L, na.rm = TRUE), median_L = median(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), se_L = sd_L / sqrt(N - 1), - mean_K = mean(K, na.rm = TRUE), median_K = median(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), se_K = sd_K / sqrt(N - 1), - mean_r = mean(r, na.rm = TRUE), median_r = median(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), se_r = sd_r / sqrt(N - 1), - mean_AUC = mean(AUC, na.rm = TRUE), median_AUC = median(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), se_AUC = sd_AUC / sqrt(N - 1), - NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE) - ) %>% ungroup() - - if (is.na(df_stats_interaction$mean_L[1]) || df_stats_interaction$mean_L[1] == 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = 0, Raw_Shift_K = 0, Raw_Shift_r = 0, Raw_Shift_AUC = 0, - Z_Shift_L = 0, Z_Shift_K = 0, Z_Shift_r = 0, Z_Shift_AUC = 0 - ) - } else { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Raw_Shift_L = mean_L[1] - background_L, Raw_Shift_K = mean_K[1] - background_K, - Raw_Shift_r = mean_r[1] - background_r, Raw_Shift_AUC = mean_AUC[1] - background_AUC, - Z_Shift_L = Raw_Shift_L[1] / df_stats_BY_L$sd[1], - Z_Shift_K = Raw_Shift_K[1] / df_stats_BY_K$sd[1], - Z_Shift_r = Raw_Shift_r[1] / df_stats_BY_r$sd[1], - Z_Shift_AUC = Raw_Shift_AUC[1] / df_stats_BY_AUC$sd[1] - ) - } - - df_stats_interaction <- df_stats_interaction %>% - mutate( - WT_l = df_stats_BY_L$mean, WT_K = df_stats_BY_K$mean, - WT_r = df_stats_BY_r$mean, WT_AUC = df_stats_BY_AUC$mean, - WT_sd_l = df_stats_BY_L$sd, WT_sd_K = df_stats_BY_K$sd, - WT_sd_r = df_stats_BY_r$sd, WT_sd_AUC = df_stats_BY_AUC$sd - ) - - if (df_stats_interaction$mean_L[1] != 0 && !is.na(df_stats_interaction$mean_L[1])) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Exp_L = WT_l + Raw_Shift_L, Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, Delta_AUC = mean_AUC - Exp_AUC - ) - - if (sum(df_stats_interaction$NG, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate( - Delta_L = if_else(NG == 1, mean_L - WT_l, Delta_L), - Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC) - ) - } - - if (sum(df_stats_interaction$SM, na.rm = TRUE) > 0) { - df_stats_interaction <- df_stats_interaction %>% - mutate(Delta_L = if_else(SM == 1, mean_L - WT_l, Delta_L)) - } - - df_stats_interaction <- df_stats_interaction %>% - mutate( - Zscore_L = Delta_L / WT_sd_l, Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - gene_lm_L <- lm(Delta_L ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_K <- lm(Delta_K ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_r <- lm(Delta_r ~ Conc_Num_Factor, data = df_stats_interaction) - gene_lm_AUC <- lm(Delta_AUC ~ Conc_Num_Factor, data = df_stats_interaction) - - gene_interaction_L <- max_conc * coef(gene_lm_L)[2] + coef(gene_lm_L)[1] - gene_interaction_K <- max_conc * coef(gene_lm_K)[2] + coef(gene_lm_K)[1] - gene_interaction_r <- max_conc * coef(gene_lm_r)[2] + coef(gene_lm_r)[1] - gene_interaction_AUC <- max_conc * coef(gene_lm_AUC)[2] + coef(gene_lm_AUC)[1] - - r_squared_l <- summary(gene_lm_L)$r.squared - r_squared_K <- summary(gene_lm_K)$r.squared - r_squared_r <- summary(gene_lm_r)$r.squared - r_squared_AUC <- summary(gene_lm_AUC)$r.squared - - num_non_removed_conc <- total_conc_nums - sum(df_stats_interaction$DB, na.rm = TRUE) - 1 - - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = replace(Gene, OrfRep == gene_sel, df_gene_sel$Gene[1]), - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_L[1]), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, df_stats_interaction$Z_Shift_L[1]), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, gene_interaction_L), - Z_lm_L = replace(Z_lm_L, OrfRep == gene_sel, (gene_interaction_L - lm_mean_L) / lm_sd_L), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, r_squared_l), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE)), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_L, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_K[1]), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, df_stats_interaction$Z_Shift_K[1]), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, gene_interaction_K), - Z_lm_K = replace(Z_lm_K, OrfRep == gene_sel, (gene_interaction_K - lm_mean_K) / lm_sd_K), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, r_squared_K), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE)), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_K, na.rm = TRUE) / num_non_removed_conc), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_r[1]), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, df_stats_interaction$Z_Shift_r[1]), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, gene_interaction_r), - Z_lm_r = replace(Z_lm_r, OrfRep == gene_sel, (gene_interaction_r - lm_mean_r) / lm_sd_r), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, r_squared_r), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE)), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_r, na.rm = TRUE) / (total_conc_nums - 1)), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Raw_Shift_AUC[1]), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, df_stats_interaction$Z_Shift_AUC[1]), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, gene_interaction_AUC), - Z_lm_AUC = replace(Z_lm_AUC, OrfRep == gene_sel, (gene_interaction_AUC - lm_mean_AUC) / lm_sd_AUC), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, r_squared_AUC), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE)), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, sum(df_stats_interaction$Zscore_AUC, na.rm = TRUE) / (total_conc_nums - 1)) - ) - } else { - # Similar logic for when mean_L is 0 or NA, setting relevant variables to NA or appropriate values - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - Gene = replace(Gene, OrfRep == gene_sel, df_gene_sel$Gene[1]), - Raw_Shift_L = replace(Raw_Shift_L, OrfRep == gene_sel, NA), - Z_Shift_L = replace(Z_Shift_L, OrfRep == gene_sel, NA), - lm_Score_L = replace(lm_Score_L, OrfRep == gene_sel, NA), - Z_lm_L = replace(Z_lm_L, OrfRep == gene_sel, NA), - R_Squared_L = replace(R_Squared_L, OrfRep == gene_sel, NA), - Sum_Z_Score_L = replace(Sum_Z_Score_L, OrfRep == gene_sel, NA), - Avg_Zscore_L = replace(Avg_Zscore_L, OrfRep == gene_sel, NA), - Raw_Shift_K = replace(Raw_Shift_K, OrfRep == gene_sel, NA), - Z_Shift_K = replace(Z_Shift_K, OrfRep == gene_sel, NA), - lm_Score_K = replace(lm_Score_K, OrfRep == gene_sel, NA), - Z_lm_K = replace(Z_lm_K, OrfRep == gene_sel, NA), - R_Squared_K = replace(R_Squared_K, OrfRep == gene_sel, NA), - Sum_Z_Score_K = replace(Sum_Z_Score_K, OrfRep == gene_sel, NA), - Avg_Zscore_K = replace(Avg_Zscore_K, OrfRep == gene_sel, NA), - Raw_Shift_r = replace(Raw_Shift_r, OrfRep == gene_sel, NA), - Z_Shift_r = replace(Z_Shift_r, OrfRep == gene_sel, NA), - lm_Score_r = replace(lm_Score_r, OrfRep == gene_sel, NA), - Z_lm_r = replace(Z_lm_r, OrfRep == gene_sel, NA), - R_Squared_r = replace(R_Squared_r, OrfRep == gene_sel, NA), - Sum_Z_Score_r = replace(Sum_Z_Score_r, OrfRep == gene_sel, NA), - Avg_Zscore_r = replace(Avg_Zscore_r, OrfRep == gene_sel, NA), - Raw_Shift_AUC = replace(Raw_Shift_AUC, OrfRep == gene_sel, NA), - Z_Shift_AUC = replace(Z_Shift_AUC, OrfRep == gene_sel, NA), - lm_Score_AUC = replace(lm_Score_AUC, OrfRep == gene_sel, NA), - Z_lm_AUC = replace(Z_lm_AUC, OrfRep == gene_sel, NA), - R_Squared_AUC = replace(R_Squared_AUC, OrfRep == gene_sel, NA), - Sum_Z_Score_AUC = replace(Sum_Z_Score_AUC, OrfRep == gene_sel, NA), - Avg_Zscore_AUC = replace(Avg_Zscore_AUC, OrfRep == gene_sel, NA) - ) - } - - if (i == 1) { - df_stats_interaction_all <- df_stats_interaction - } else { - df_stats_interaction_all <- bind_rows(df_stats_interaction_all, df_stats_interaction) - } - - interaction_scores_deletion <- interaction_scores_deletion %>% - mutate( - NG = replace(NG, OrfRep == gene_sel, sum(df_stats_interaction$NG, na.rm = TRUE)), - DB = replace(DB, OrfRep == gene_sel, sum(df_stats_interaction$DB, na.rm = TRUE)), - SM = replace(SM, OrfRep == gene_sel, sum(df_stats_interaction$SM, na.rm = TRUE)) - ) - } - - print("Pass Int Calculation loop") - - # Order the interaction scores by Z_lm_L and NG - interaction_scores_deletion <- interaction_scores_deletion %>% - arrange(desc(Z_lm_L)) %>% - arrange(desc(NG)) - - # Save the interaction scores and filtered sets for enhancers and suppressors - output_files <- list( - "ZScores_Interaction.csv" = interaction_scores_deletion, - "ZScores_Interaction_DeletionEnhancers_L.csv" = filter(interaction_scores_deletion, Avg_Zscore_L >= 2), - "ZScores_Interaction_DeletionEnhancers_K.csv" = filter(interaction_scores_deletion, Avg_Zscore_K <= -2), - "ZScores_Interaction_DeletionSuppressors_L.csv" = filter(interaction_scores_deletion, Avg_Zscore_L <= -2), - "ZScores_Interaction_DeletionSuppressors_K.csv" = filter(interaction_scores_deletion, Avg_Zscore_K >= 2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L.csv" = filter(interaction_scores_deletion, Avg_Zscore_L >= 2 | Avg_Zscore_L <= -2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv" = filter(interaction_scores_deletion, Avg_Zscore_K >= 2 | Avg_Zscore_K <= -2), - "ZScores_Interaction_Suppressors_and_lm_Enhancers_L.csv" = filter(interaction_scores_deletion, Z_lm_L >= 2 & Avg_Zscore_L <= -2), - "ZScores_Interaction_Enhancers_and_lm_Suppressors_L.csv" = filter(interaction_scores_deletion, Z_lm_L <= -2 & Avg_Zscore_L >= 2), - "ZScores_Interaction_Suppressors_and_lm_Enhancers_K.csv" = filter(interaction_scores_deletion, Z_lm_K <= -2 & Avg_Zscore_K >= 2), - "ZScores_Interaction_Enhancers_and_lm_Suppressors_K.csv" = filter(interaction_scores_deletion, Z_lm_K >= 2 & Avg_Zscore_K <= -2) - ) - - for (file_name in names(output_files)) { - write.csv(output_files[[file_name]], file = file.path(output_dir, file_name), row.names = FALSE) - } - - # Further filtering for linear regression enhancers and suppressors - output_files_lm <- list( - "ZScores_Interaction_DeletionEnhancers_L_lm.csv" = filter(interaction_scores_deletion, Z_lm_L >= 2), - "ZScores_Interaction_DeletionEnhancers_K_lm.csv" = filter(interaction_scores_deletion, Z_lm_K <= -2), - "ZScores_Interaction_DeletionSuppressors_L_lm.csv" = filter(interaction_scores_deletion, Z_lm_L <= -2), - "ZScores_Interaction_DeletionSuppressors_K_lm.csv" = filter(interaction_scores_deletion, Z_lm_K >= 2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_L_lm.csv" = filter(interaction_scores_deletion, Z_lm_L >= 2 | Z_lm_L <= -2), - "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K_lm.csv" = filter(interaction_scores_deletion, Z_lm_K >= 2 | Z_lm_K <= -2) - ) - - for (file_name in names(output_files_lm)) { - write.csv(output_files_lm[[file_name]], file = file.path(output_dir, file_name), row.names = FALSE) - } - - - # Loop through each gene to generate plots - for (i in 1:num_genes) { - gene_sel <- unique(interaction_scores_deletion$OrfRep)[i] - df_z_calculations <- df_stats_interaction_all %>% filter(OrfRep == gene_sel) - df_int_scores <- interaction_scores_deletion %>% filter(OrfRep == gene_sel) - - p_l[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_L)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_l), ymax = 0 + (2 * WT_sd_l)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_L, 2))) + - annotate("text", x = 1, y = 25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_L, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-60, 60, 10)) + - theme_Publication() - - p_K[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_K)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-65, 65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_K), ymax = 0 + (2 * WT_sd_K)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 45, label = paste("ZShift =", round(df_int_scores$Z_Shift_K, 2))) + - annotate("text", x = 1, y = 25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_K, 2))) + - annotate("text", x = 1, y = -25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-60, 60, 10)) + - theme_Publication() - - p_r[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_r)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-0.65, 0.65)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_r), ymax = 0 + (2 * WT_sd_r)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 0.45, label = paste("ZShift =", round(df_int_scores$Z_Shift_r, 2))) + - annotate("text", x = 1, y = 0.25, label = paste("Z lm Score =", round(df_int_scores$Z_lm_r, 2))) + - annotate("text", x = 1, y = -0.25, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -0.35, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -0.45, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-0.6, 0.6, 0.2)) + - theme_Publication() - - p_AUC[[i]] <- ggplot(df_z_calculations, aes(Conc_Num_Factor, Delta_AUC)) + - geom_point() + - geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + - coord_cartesian(ylim = c(-6500, 6500)) + - geom_errorbar(aes(ymin = 0 - (2 * WT_sd_AUC), ymax = 0 + (2 * WT_sd_AUC)), alpha = 0.3) + - ggtitle(paste(df_z_calculations$OrfRep[1], df_z_calculations$Gene[1], sep = " ")) + - annotate("text", x = 1, y = 4500, label = paste("ZShift =", round(df_int_scores$Z_Shift_AUC, 2))) + - annotate("text", x = 1, y = 2500, label = paste("Z lm Score =", round(df_int_scores$Z_lm_AUC, 2))) + - annotate("text", x = 1, y = -2500, label = paste("NG =", df_int_scores$NG)) + - annotate("text", x = 1, y = -3500, label = paste("DB =", df_int_scores$DB)) + - annotate("text", x = 1, y = -4500, label = paste("SM =", df_int_scores$SM)) + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df_z_calculations$Conc_Num_Factor), labels = unique(as.character(df_z_calculations$Conc_Num))) + - scale_y_continuous(breaks = seq(-6000, 6000, 1000)) + - theme_Publication() - - if (i == 1) { - df_stats_interaction_all_final <- df_z_calculations - } else { - df_stats_interaction_all_final <- bind_rows(df_stats_interaction_all_final, df_z_calculations) - } - } - - print("Pass Int ggplot loop") - write.csv(df_stats_interaction_all_final, file = file.path(output_dir, "ZScore_Calculations.csv"), row.names = FALSE) - - # Generate a blank plot for alignment purposes - blank_plot <- ggplot(df2_rf) + geom_blank() - - # Create PDF for interaction plots - pdf(file.path(output_dir, "InteractionPlots.pdf"), width = 16, height = 16, onefile = TRUE) - - # Summarize stats for X2_RF - df_stats_rf <- df2_rf %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Create L statistics scatter plot - plot_l_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, L)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 160)) + - annotate("text", x = -0.25, y = 10, label = "NG") + - annotate("text", x = -0.25, y = 5, label = "DB") + - annotate("text", x = -0.25, y = 0, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 0, label = df_stats_rf$SM) + - theme_Publication() - - # Create K statistics scatter plot - plot_k_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, K)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(-20, 160)) + - annotate("text", x = -0.25, y = -5, label = "NG") + - annotate("text", x = -0.25, y = -12.5, label = "DB") + - annotate("text", x = -0.25, y = -20, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -5, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -12.5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -20, label = df_stats_rf$SM) + - theme_Publication() - - # Create r statistics scatter plot - plot_r_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, r)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - annotate("text", x = -0.25, y = .9, label = "NG") + - annotate("text", x = -0.25, y = .8, label = "DB") + - annotate("text", x = -0.25, y = .7, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .9, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .8, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .7, label = df_stats_rf$SM) + - theme_Publication() - - # Create AUC statistics scatter plot - plot_auc_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, AUC)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - annotate("text", x = -0.25, y = 11000, label = "NG") + - annotate("text", x = -0.25, y = 10000, label = "DB") + - annotate("text", x = -0.25, y = 9000, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 11000, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10000, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 9000, label = df_stats_rf$SM) + - theme_Publication() - - # Arrange and plot scatter plots - grid.arrange(plot_l_stats, plot_k_stats, plot_r_stats, plot_auc_stats, ncol = 2, nrow = 2) - - # Create box plots for each statistic - plot_l_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), L)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 160)) + - theme_Publication() - - plot_k_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), K)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 130)) + - theme_Publication() - - plot_r_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), r)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - theme_Publication() - - plot_auc_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), AUC)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - theme_Publication() - - # Arrange and plot box plots - grid.arrange(plot_l_stats_box, plot_k_stats_box, plot_r_stats_box, plot_auc_stats_box, ncol = 2, nrow = 2) - - # Loop to arrange and print combined plots - plot_indices <- seq(1, (num_genes - 1), by = 3) - for (m in seq_along(plot_indices)) { - grid.arrange( - p_l[[plot_indices[m]]], p_K[[plot_indices[m]]], p_r[[plot_indices[m]]], p_AUC[[plot_indices[m]]], - p_l[[plot_indices[m] + 1]], p_K[[plot_indices[m] + 1]], p_r[[plot_indices[m] + 1]], p_AUC[[plot_indices[m] + 1]], - p_l[[plot_indices[m] + 2]], p_K[[plot_indices[m] + 2]], p_r[[plot_indices[m] + 2]], p_AUC[[plot_indices[m] + 2]], - ncol = 4, nrow = 3 - ) - } - - # Handle leftover plots if num_genes is not a multiple of 3 - remaining_plots <- num_genes - max(plot_indices + 2) - if (remaining_plots > 0) { - plot_grid_list <- lapply(seq_len(remaining_plots), function(i) { - list(p_l[[plot_indices[length(plot_indices)] + i]], p_K[[plot_indices[length(plot_indices)] + i]], p_r[[plot_indices[length(plot_indices)] + i]], p_AUC[[plot_indices[length(plot_indices)] + i]]) - }) - do.call(grid.arrange, c(plot_grid_list, list(ncol = 4, nrow = 3))) - } - - dev.off() - - # Additional PDF output for RF interaction plots - # Generate PDF for RF interaction plots - pdf(file.path(output_dir, "RF_InteractionPlots.pdf"), width = 16, height = 16, onefile = TRUE) - - # Summarize stats for RF data - df_stats_rf <- df2_rf %>% - group_by(Conc_Num, Conc_Num_Factor) %>% - summarise( - mean_L = mean(L, na.rm = TRUE), - median_L = median(L, na.rm = TRUE), - max_L = max(L, na.rm = TRUE), - min_L = min(L, na.rm = TRUE), - sd_L = sd(L, na.rm = TRUE), - mean_K = mean(K, na.rm = TRUE), - median_K = median(K, na.rm = TRUE), - max_K = max(K, na.rm = TRUE), - min_K = min(K, na.rm = TRUE), - sd_K = sd(K, na.rm = TRUE), - mean_r = mean(r, na.rm = TRUE), - median_r = median(r, na.rm = TRUE), - max_r = max(r, na.rm = TRUE), - min_r = min(r, na.rm = TRUE), - sd_r = sd(r, na.rm = TRUE), - mean_AUC = mean(AUC, na.rm = TRUE), - median_AUC = median(AUC, na.rm = TRUE), - max_AUC = max(AUC, na.rm = TRUE), - min_AUC = min(AUC, na.rm = TRUE), - sd_AUC = sd(AUC, na.rm = TRUE), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE) - ) - - # Create L statistics scatter plot for RF data - plot_rf_l_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, L)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 130)) + - annotate("text", x = -0.25, y = 10, label = "NG") + - annotate("text", x = -0.25, y = 5, label = "DB") + - annotate("text", x = -0.25, y = 0, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 0, label = df_stats_rf$SM) + - theme_Publication() - - # Create K statistics scatter plot for RF data - plot_rf_k_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, K)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(-20, 160)) + - annotate("text", x = -0.25, y = -5, label = "NG") + - annotate("text", x = -0.25, y = -12.5, label = "DB") + - annotate("text", x = -0.25, y = -20, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -5, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -12.5, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = -20, label = df_stats_rf$SM) + - theme_Publication() - - # Create r statistics scatter plot for RF data - plot_rf_r_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, r)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - annotate("text", x = -0.25, y = .9, label = "NG") + - annotate("text", x = -0.25, y = .8, label = "DB") + - annotate("text", x = -0.25, y = .7, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .9, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .8, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = .7, label = df_stats_rf$SM) + - theme_Publication() - - # Create AUC statistics scatter plot for RF data - plot_rf_auc_stats <- ggplot(df2_rf, aes(Conc_Num_Factor, AUC)) + - geom_point(position = "jitter", size = 1) + - stat_summary( - fun = mean, - fun.min = ~ mean(.) - sd(.), - fun.max = ~ mean(.) + sd(.), - geom = "errorbar", color = "red" - ) + - stat_summary(fun = mean, geom = "point", color = "red") + - scale_x_continuous(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - annotate("text", x = -0.25, y = 11000, label = "NG") + - annotate("text", x = -0.25, y = 10000, label = "DB") + - annotate("text", x = -0.25, y = 9000, label = "SM") + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 11000, label = df_stats_rf$NG) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 10000, label = df_stats_rf$DB) + - annotate("text", x = unique(df2_rf$Conc_Num_Factor), y = 9000, label = df_stats_rf$SM) + - theme_Publication() - - # Arrange and plot RF scatter plots - grid.arrange(plot_rf_l_stats, plot_rf_k_stats, plot_rf_r_stats, plot_rf_auc_stats, ncol = 2, nrow = 2) - - # Create box plots for each RF statistic - plot_rf_l_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), L)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for L with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 130)) + - theme_Publication() - - plot_rf_k_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), K)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for K with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 160)) + - theme_Publication() - - plot_rf_r_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), r)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for r with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 1)) + - theme_Publication() - - plot_rf_auc_stats_box <- ggplot(df2_rf, aes(as.factor(Conc_Num_Factor), AUC)) + - geom_boxplot() + - scale_x_discrete(name = unique(df$Drug[1]), breaks = unique(df2_rf$Conc_Num_Factor), labels = as.character(unique(df2_rf$Conc_Num))) + - ggtitle(paste(s, "Scatter RF for AUC with SD", sep = " ")) + - coord_cartesian(ylim = c(0, 12500)) + - theme_Publication() - - # Arrange and plot RF box plots - grid.arrange(plot_rf_l_stats_box, plot_rf_k_stats_box, plot_rf_r_stats_box, plot_rf_auc_stats_box, ncol = 2, nrow = 2) - - # Loop to arrange and print combined RF plots - plot_indices_rf <- seq(1, (num_genes_RF - 1), by = 3) - for (m in seq_along(plot_indices_rf)) { - grid.arrange( - p_rf_l[[plot_indices_rf[m]]], p_rf_K[[plot_indices_rf[m]]], p_rf_r[[plot_indices_rf[m]]], p_rf_AUC[[plot_indices_rf[m]]], - p_rf_l[[plot_indices_rf[m] + 1]], p_rf_K[[plot_indices_rf[m] + 1]], p_rf_r[[plot_indices_rf[m] + 1]], p_rf_AUC[[plot_indices_rf[m] + 1]], - p_rf_l[[plot_indices_rf[m] + 2]], p_rf_K[[plot_indices_rf[m] + 2]], p_rf_r[[plot_indices_rf[m] + 2]], p_rf_AUC[[plot_indices_rf[m] + 2]], - ncol = 4, nrow = 3 - ) - } - - # Handle leftover RF plots if num_genes_RF is not a multiple of 3 - remaining_rf_plots <- num_genes_RF - max(plot_indices_rf + 2) - if (remaining_rf_plots > 0) { - plot_grid_rf_list <- lapply(seq_len(remaining_rf_plots), function(i) { - list(p_rf_l[[plot_indices_rf[length(plot_indices_rf)] + i]], p_rf_K[[plot_indices_rf[length(plot_indices_rf)] + i]], p_rf_r[[plot_indices_rf[length(plot_indices_rf)] + i]], p_rf_AUC[[plot_indices_rf[length(plot_indices_rf)] + i]]) - }) - do.call(grid.arrange, c(plot_grid_rf_list, list(ncol = 4, nrow = 3))) - } - dev.off() -} - -# Calculate linear models and R-squared values for all CPPs in results 1 vs results 2 -lm_list <- list( - lm(Z_lm_K ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_r ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_L, data = df_na_rm), - lm(Z_lm_r ~ Z_lm_K, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_K, data = df_na_rm), - lm(Z_lm_AUC ~ Z_lm_r, data = df_na_rm) -) - -lm_summaries <- lapply(lm_list, summary) - -# Create PDF for correlation plots of CPPs -pdf(file.path(output_dir, "Correlation_CPPs.pdf"), width = 10, height = 7, onefile = TRUE) - -# Generate correlation plots for each combination -plot_list <- list( - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_K)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction L vs. Interaction K") + - xlab("z-score L") + ylab("z-score K") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[1]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction L vs. Interaction r") + - xlab("z-score L") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[2]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction L vs. Interaction AUC") + - xlab("z-score L") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[3]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction K vs. Interaction r") + - xlab("z-score K") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[4]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction K vs. Interaction AUC") + - xlab("z-score K") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[5]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_r, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle("Interaction r vs. Interaction AUC") + - xlab("z-score r") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[6]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)) -) - -# Print all correlation plots to the PDF -lapply(plot_list, print) - -# Create additional plots with InteractionScores_RF highlighted in cyan -interaction_scores_rf_filtered <- interaction_scores_rf[!is.na(interaction_scores_rf$Z_lm_L), ] - -highlighted_plot_list <- list( - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_K)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_L, Z_lm_K), color = "cyan") + - ggtitle("Interaction L vs. Interaction K") + - xlab("z-score L") + ylab("z-score K") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[1]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_L, Z_lm_r), color = "cyan") + - ggtitle("Interaction L vs. Interaction r") + - xlab("z-score L") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[2]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_L, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_L, Z_lm_AUC), color = "cyan") + - ggtitle("Interaction L vs. Interaction AUC") + - xlab("z-score L") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[3]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_r)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_K, Z_lm_r), color = "cyan") + - ggtitle("Interaction K vs. Interaction r") + - xlab("z-score K") + ylab("z-score r") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[4]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_K, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_K, Z_lm_AUC), color = "cyan") + - ggtitle("Interaction K vs. Interaction AUC") + - xlab("z-score K") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[5]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)), - - ggplot(df_na_rm, aes(Z_lm_r, Z_lm_AUC)) + - geom_point(shape = 3, color = "gray70") + - geom_point(data = interaction_scores_rf_filtered, aes(Z_lm_r, Z_lm_AUC), color = "cyan") + - ggtitle("Interaction r vs. Interaction AUC") + - xlab("z-score r") + ylab("z-score AUC") + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[6]]$r.squared, 3))) + - theme_Publication_legend_right() + - theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(size = 16), axis.title.x = element_text(size = 18), axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 18)) -) - -# Print all highlighted plots to the PDF -lapply(highlighted_plot_list, print) - -dev.off() diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores5.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores5.R deleted file mode 100644 index e772c7e7..00000000 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores5.R +++ /dev/null @@ -1,945 +0,0 @@ -suppressMessages({ - library(ggplot2) - library(plotly) - library(htmlwidgets) - library(dplyr) - library(ggthemes) - library(data.table) - library(unix) -}) - -options(warn = 2, max.print = 1000) -options(width = 10000) - -# Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes) -soft_limit <- 30 * 1024 * 1024 * 1024 -hard_limit <- 30 * 1024 * 1024 * 1024 -rlimit_as(soft_limit, hard_limit) - -# Constants for configuration -plot_width <- 14 -plot_height <- 9 -base_size <- 14 - -parse_arguments <- function() { - args <- if (interactive()) { - c( - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240116_jhartman2_DoxoHLD", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/apps/r/SGD_features.tab", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/easy/20240116_jhartman2_DoxoHLD/results_std.txt", - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp1", - "Experiment 1: Doxo versus HLD", - 3, - "/home/bryan/documents/develop/scripts/hartmanlab/workflow/out/20240116_jhartman2_DoxoHLD/20240822_jhartman2_DoxoHLD/exp2", - "Experiment 2: HLD versus Doxo", - 3 - ) - } else { - commandArgs(trailingOnly = TRUE) - } - - # Extract paths, names, and standard deviations - paths <- args[seq(4, length(args), by = 3)] - names <- args[seq(5, length(args), by = 3)] - sds <- as.numeric(args[seq(6, length(args), by = 3)]) - - # Normalize paths - normalized_paths <- normalizePath(paths, mustWork = FALSE) - - # Create named list of experiments - experiments <- list() - for (i in seq_along(paths)) { - experiments[[names[i]]] <- list( - path = normalized_paths[i], - sd = sds[i] - ) - } - - list( - out_dir = normalizePath(args[1], mustWork = FALSE), - sgd_gene_list = normalizePath(args[2], mustWork = FALSE), - easy_results_file = normalizePath(args[3], mustWork = FALSE), - experiments = experiments - ) -} - -args <- parse_arguments() - -# Should we keep output in exp dirs or combine in the study output dir? -# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) -# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) - -# Define themes and scales -theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") { - theme_foundation <- ggplot2::theme_grey(base_size = base_size, base_family = base_family) - - theme_foundation %+replace% - theme( - plot.title = element_text(face = "bold", size = rel(1.2), hjust = 0.5), - text = element_text(), - panel.background = element_rect(colour = NA), - plot.background = element_rect(colour = NA), - panel.border = element_rect(colour = NA), - axis.title = element_text(face = "bold", size = rel(1)), - axis.title.y = element_text(angle = 90, vjust = 2), - axis.title.x = element_text(vjust = -0.2), - axis.line = element_line(colour = "black"), - panel.grid.major = element_line(colour = "#f0f0f0"), - panel.grid.minor = element_blank(), - legend.key = element_rect(colour = NA), - legend.position = legend_position, - legend.direction = ifelse(legend_position == "right", "vertical", "horizontal"), - plot.margin = unit(c(10, 5, 5, 5), "mm"), - strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), - strip.text = element_text(face = "bold") - ) -} - -scale_fill_publication <- function(...) { - discrete_scale("fill", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -scale_colour_publication <- function(...) { - discrete_scale("colour", "Publication", manual_pal(values = c( - "#386cb0", "#fdb462", "#7fc97f", "#ef3b2c", "#662506", - "#a6cee3", "#fb9a99", "#984ea3", "#ffff33" - )), ...) -} - -# Load the initial dataframe from the easy_results_file -load_and_process_data <- function(easy_results_file, sd = 3) { - df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - - df <- df %>% - filter(!(.[[1]] %in% c("", "Scan"))) %>% - filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% - # Rename columns - rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>% - mutate( - across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric), - delta_bg = last_bg - first_bg, - delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)), - NG = if_else(L == 0 & !is.na(L), 1, 0), - DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0), - SM = 0, - OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded? - conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), - conc_num_factor = as.numeric(as.factor(conc_num)) - 1 - ) - - return(df) -} - -# Update Gene names using the SGD gene list -update_gene_names <- function(df, sgd_gene_list) { - # Load SGD gene list - genes <- read.delim(file = sgd_gene_list, - quote = "", header = FALSE, - colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) - - # Create a named vector for mapping ORF to GeneName - gene_map <- setNames(genes$V5, genes$V4) - # Vectorized match to find the GeneName from gene_map - mapped_genes <- gene_map[df$ORF] - # Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found) - updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes) - # Ensure Gene is not left blank or incorrectly updated to "OCT1" - df <- df %>% - mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes)) - - return(df) -} - -# Calculate summary statistics for all variables -calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { - df <- df %>% - mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .))) - - summary_stats <- df %>% - group_by(across(all_of(group_vars))) %>% - summarise( - N = sum(!is.na(L)), - across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) - ), .names = "{.fn}_{.col}") - ) - - df_cleaned <- df %>% - select(-any_of(names(summary_stats))) - - df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars) - - return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) -} - - -calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { - - # Calculate total concentration variables - total_conc_num <- length(unique(df$conc_num)) - num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 - - # Pull the background means and standard deviations from zero concentration - bg_means <- list( - L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(), - K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(), - r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(), - AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() - ) - - bg_sd <- list( - L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(), - K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(), - r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(), - AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() - ) - - interaction_scores <- df %>% - mutate( - WT_L = df$mean_L, - WT_K = df$mean_K, - WT_r = df$mean_r, - WT_AUC = df$mean_AUC, - WT_sd_L = df$sd_L, - WT_sd_K = df$sd_K, - WT_sd_r = df$sd_r, - WT_sd_AUC = df$sd_AUC - ) %>% - group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% - mutate( - N = sum(!is.na(L)), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE), - across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) - ), .names = "{.fn}_{.col}") - ) %>% - ungroup() - - interaction_scores <- interaction_scores %>% - group_by(across(all_of(group_vars))) %>% - mutate( - Raw_Shift_L = mean_L[[1]] - bg_means$L, - Raw_Shift_K = mean_K[[1]] - bg_means$K, - Raw_Shift_r = mean_r[[1]] - bg_means$r, - Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC, - Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]], - Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]], - Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]], - Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]] - ) - - interaction_scores <- interaction_scores %>% - mutate( - Exp_L = WT_L + Raw_Shift_L, - Delta_L = mean_L - Exp_L, - Exp_K = WT_K + Raw_Shift_K, - Delta_K = mean_K - Exp_K, - Exp_r = WT_r + Raw_Shift_r, - Delta_r = mean_r - Exp_r, - Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_AUC = mean_AUC - Exp_AUC - ) - - interaction_scores <- interaction_scores %>% - mutate( - Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), - Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), - Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) - ) - - # Calculate linear models and interaction scores - interaction_scores <- interaction_scores %>% - mutate( - lm_L = lm(Delta_L ~ conc_num_factor), - lm_K = lm(Delta_K ~ conc_num_factor), - lm_r = lm(Delta_r ~ conc_num_factor), - lm_AUC = lm(Delta_AUC ~ conc_num_factor), - Zscore_L = Delta_L / WT_sd_L, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC - ) - - interaction_scores <- interaction_scores %>% - mutate( - Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), - Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), - Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), - Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) - ) - - interaction_scores_all <- interaction_scores %>% - mutate( - Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, - Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, - Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), - Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), - lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1], - lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1], - lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1], - lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], - r_squared_L = summary(lm_L)$r.squared, - r_squared_K = summary(lm_K)$r.squared, - r_squared_r = summary(lm_r)$r.squared, - r_squared_AUC = summary(lm_AUC)$r.squared - ) - - # Calculate Z_lm for each variable - interaction_scores_all <- interaction_scores_all %>% - mutate( - Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE), - Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE), - Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE), - Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE) - ) - - # Arrange results by Z_lm_L and NG - interaction_scores_all <- interaction_scores_all %>% - arrange(desc(lm_Score_L)) %>% - arrange(desc(NG)) %>% - ungroup() - - return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) -} - -generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) { - - `%||%` <- function(a, b) if (!is.null(a)) a else b - - # Generalized plot generation - plots <- lapply(plot_configs, function(config) { - - df <- config$df - plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var)))) - - # Rank plots with SD annotations - if (config$plot_type == "rank") { - plot <- plot + geom_point(size = 0.1, shape = 3) - - # Adding SD bands - if (!is.null(config$sd_band)) { - for (i in seq_len(config$sd_band)) { - plot <- plot + - annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) + - annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) + - geom_hline(yintercept = c(-i, i), color = "gray") - } - } - - # Optionally add enhancer/suppressor count annotations - if (!is.null(config$enhancer_label)) { - plot <- plot + annotate("text", x = config$enhancer_label$x, - y = config$enhancer_label$y, label = config$enhancer_label$label) + - annotate("text", x = config$suppressor_label$x, - y = config$suppressor_label$y, label = config$suppressor_label$label) - } - } - - # Correlation plots - if (config$plot_type == "correlation") { - plot <- plot + geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - annotate("text", x = 0, y = 0, label = config$correlation_text) - } - - # General scatter/boxplot/density handling - if (!is.null(config$y_var)) { - plot <- plot + aes(y = !!sym(config$y_var)) - - y_mean_col <- paste0("mean_", config$y_var) - y_sd_col <- paste0("sd_", config$y_var) - - if (config$y_var == "delta_bg" && config$plot_type == "scatter") { - plot <- plot + geom_point(shape = 3, size = 0.2, position = "jitter") + - geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), - ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + - geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) - } else if (config$error_bar %||% FALSE) { - plot <- plot + - geom_point(shape = 3, size = 0.2) + - geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), - ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + - geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) - } - } - - # Plot type selection - plot <- switch(config$plot_type, - "box" = plot + geom_boxplot(), - "density" = plot + geom_density(), - "bar" = plot + geom_bar(stat = "identity"), - plot + geom_point() + geom_smooth(method = "lm", se = FALSE)) - - # Adding y-limits if provided - if (!is.null(config$ylim_vals)) { - plot <- plot + coord_cartesian(ylim = config$ylim_vals) - } - - # Setting legend position, titles, and labels - legend_position <- config$legend_position %||% "bottom" - plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position) - - if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label) - if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label) - - # Adding text annotations if provided - if (!is.null(config$annotations)) { - for (annotation in config$annotations) { - plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label) - } - } - - return(plot) - }) - - # If grid_layout is provided, arrange plots in a grid and save in a single PDF - if (!is.null(grid_layout)) { - pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) - - # Loop through plots in chunks defined by ncol and nrow - for (start_idx in seq(1, length(plots), by = grid_layout$ncol * grid_layout$nrow)) { - end_idx <- min(start_idx + grid_layout$ncol * grid_layout$nrow - 1, length(plots)) - plot_subset <- plots[start_idx:end_idx] - - # Arrange plots in a grid - do.call(grid.arrange, c(plot_subset, ncol = grid_layout$ncol, nrow = grid_layout$nrow)) - } - - dev.off() - - # Save as HTML (optional) - plotly_plots <- lapply(plots, function(plot) { - suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h"))) - }) - combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow, margin = 0.05) - saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, "_grid.html")), selfcontained = TRUE) - - } else { - # Save individual plots to PDF - pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) - lapply(plots, print) - dev.off() - - # Convert plots to plotly and save as HTML - plotly_plots <- lapply(plots, function(plot) { - suppressWarnings(ggplotly(plot) %>% layout(legend = list(orientation = "h"))) - }) - combined_plot <- subplot(plotly_plots, nrows = length(plotly_plots), margin = 0.05) - saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE) - } -} - -generate_interaction_plot_configs <- function(df, variables) { - plot_configs <- list() - - for (variable in variables) { - # Define the y-limits based on the variable being plotted - ylim_vals <- switch(variable, - "L" = c(-65, 65), - "K" = c(-65, 65), - "r" = c(-0.65, 0.65), - "AUC" = c(-6500, 6500) - ) - - # Dynamically generate the column names for standard deviation and delta - wt_sd_col <- paste0("WT_sd_", variable) - delta_var <- paste0("Delta_", variable) - z_shift <- paste0("Z_Shift_", variable) - z_lm <- paste0("Z_lm_", variable) - - # Set annotations for ZShift, Z lm Score, NG, DB, SM - annotations <- list( - list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45, - ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))), - list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25, - ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))), - list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25, - ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)), - list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35, - ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)), - list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45, - ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM)) - ) - - # Add scatter plot configuration for this variable - plot_configs[[length(plot_configs) + 1]] <- list( - df = df, - x_var = "conc_num_factor", - y_var = delta_var, - plot_type = "scatter", - title = sprintf("%s %s", df$OrfRep[1], df$Gene[1]), - ylim_vals = ylim_vals, - annotations = annotations, - error_bar = list( - ymin = 0 - (2 * df[[wt_sd_col]][1]), - ymax = 0 + (2 * df[[wt_sd_col]][1]) - ), - x_breaks = unique(df$conc_num_factor), - x_labels = unique(as.character(df$conc_num)), - x_label = unique(df$Drug[1]) - ) - - # Add box plot configuration for this variable - plot_configs[[length(plot_configs) + 1]] <- list( - df = df, - x_var = "conc_num_factor", - y_var = variable, - plot_type = "box", - title = sprintf("%s %s (Boxplot)", df$OrfRep[1], df$Gene[1]), - ylim_vals = ylim_vals, - annotations = annotations, - error_bar = FALSE, # Boxplots typically don't need error bars - x_breaks = unique(df$conc_num_factor), - x_labels = unique(as.character(df$conc_num)), - x_label = unique(df$Drug[1]) - ) - } - - return(plot_configs) -} - -generate_rank_plot_configs <- function(df, rank_var, zscore_var, label_prefix) { - configs <- list() - - for (sd_band in c(1, 2, 3)) { - # Annotated version - configs[[length(configs) + 1]] <- list( - df = df, - x_var = rank_var, - y_var = zscore_var, - plot_type = "rank", - title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"), - sd_band = sd_band, - enhancer_label = list( - x = nrow(df) / 2, y = 10, - label = paste("Deletion Enhancers =", nrow(df[df[[zscore_var]] >= sd_band, ])) - ), - suppressor_label = list( - x = nrow(df) / 2, y = -10, - label = paste("Deletion Suppressors =", nrow(df[df[[zscore_var]] <= -sd_band, ])) - ) - ) - - # Non-annotated version - configs[[length(configs) + 1]] <- list( - df = df, - x_var = rank_var, - y_var = zscore_var, - plot_type = "rank", - title = paste("Average Z score vs. Rank for", label_prefix, "above", sd_band, "SD"), - sd_band = sd_band - ) - } - - return(configs) -} - -generate_correlation_plot_configs <- function(df, lm_list, lm_summaries) { - lapply(seq_along(lm_list), function(i) { - r_squared <- round(lm_summaries[[i]]$r.squared, 3) - list( - x_var = names(lm_list)[i][1], - y_var = names(lm_list)[i][2], - plot_type = "scatter", - title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]), - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared))) - ) - }) -} - -# Adjust missing values and calculate ranks -adjust_missing_and_rank <- function(df, variables) { - - # Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables - df <- df %>% - mutate(across(all_of(variables), list( - Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))), - Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))), - Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))), - Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column()))) - ), .names = "{fn}_{col}")) - - return(df) -} - -main <- function() { - lapply(names(args$experiments), function(exp_name) { - exp <- args$experiments[[exp_name]] - exp_path <- exp$path - exp_sd <- exp$sd - out_dir <- file.path(exp_path, "zscores") - out_dir_qc <- file.path(exp_path, "zscores", "qc") - dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) - dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) - - # Load and process data - df <- load_and_process_data(args$easy_results_file, sd = exp_sd) - df <- update_gene_names(df, args$sgd_gene_list) - - max_conc <- max(df$conc_num_factor) - - # QC steps and filtering - df_above_tolerance <- df %>% filter(DB == 1) - - # Calculate the half-medians for `L` and `K` for rows above tolerance - L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 - K_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 - - # Get the number of rows that are above tolerance - rows_to_remove <- nrow(df_above_tolerance) - - # Set L, r, K, and AUC to NA for rows that are above tolerance - df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .))) - - # Calculate summary statistics for all strains, including both background and the deletions - message("Calculating summary statistics for all strains") - variables <- c("L", "K", "r", "AUC") - ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) - summary_stats <- ss$summary_stats - df_na_stats <- ss$df_with_stats - write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - - print("Summary stats:") - print(head(summary_stats), width = 200) - - # Remove rows with 0 values in L - df_no_zeros <- df_na %>% filter(L > 0) - - # Additional filtering for non-finite values - df_na_filtered <- df_na %>% - filter(if_any(c(L), ~ !is.finite(.))) %>% - { - if (nrow(.) > 0) { - message("Removing non-finite rows:\n") - print(head(., n = 10)) - } - df_na %>% filter(if_all(c(L), is.finite)) - } - - # Filter data within and outside 2SD - message("Filtering by 2SD of K") - df_na_within_2sd_k <- df_na_stats %>% - filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) - df_na_outside_2sd_k <- df_na_stats %>% - filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) - - # Summary statistics for within and outside 2SD of K - message("Calculating summary statistics for L within 2SD of K") - # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_within_2sd_k_stats <- ss$summary_stats - df_na_l_within_2sd_k_stats <- ss$df_with_stats - message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_outside_2sd_k_stats <- ss$summary_stats - df_na_l_outside_2sd_k_stats <- ss$df_with_stats - # Write CSV files - write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) - write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) - - # Plots - - # Print quality control graphs before removing data due to contamination and - # adjusting missing data to max theoretical values - l_vs_k_plots <- list( - list(df = df, x_var = "L", y_var = "K", plot_type = "scatter", - title = "Raw L vs K before QC", - color_var = "conc_num", - legend_position = "right" - ) - ) - - above_threshold_plots <- list( - list(df = df_above_tolerance, x_var = "L", y_var = "K", plot_type = "scatter", - title = paste("Raw L vs K for strains above delta background threshold of", df_above_tolerance$delta_bg_tolerance[[1]], "or above"), - color_var = "conc_num", - annotations = list( - list( - x = L_half_median, - y = K_half_median, - label = paste("Strains above delta background tolerance =", nrow(df_above_tolerance)) - ) - ), - error_bar = FALSE, - legend_position = "right" - ) - ) - - frequency_delta_bg_plots <- list( - list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "density", - title = "Density plot for Delta Background by Conc All Data", - color_var = "conc_num", - x_label = "Delta Background", - y_label = "Density", - error_bar = FALSE, - legend_position = "right" - ), - list(df = df, x_var = "delta_bg", y_var = NULL, plot_type = "bar", - title = "Bar plot for Delta Background by Conc All Data", - color_var = "conc_num", - x_label = "Delta Background", - y_label = "Count", - error_bar = FALSE, - legend_position = "right" - ) - ) - - plate_analysis_plots <- list() - for (plot_type in c("scatter", "box")) { - variables <- c("L", "K", "r", "AUC", "delta_bg") - for (var in variables) { - for (stage in c("before", "after")) { - if (stage == "before") { - df_plot <- df - } else { - df_plot <- df_na # TODO use df_na_filtered if necessary - } - - # Set error_bar = TRUE only for scatter plots - error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) - - # Create the plot configuration - plot_config <- list(df = df_plot, x_var = "scan", y_var = var, plot_type = plot_type, - title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), - error_bar = error_bar, color_var = "conc_num") - - plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) - } - } - } - - plate_analysis_no_zero_plots <- list() - for (plot_type in c("scatter", "box")) { - variables <- c("L", "K", "r", "AUC", "delta_bg") - for (var in variables) { - - # Set error_bar = TRUE only for scatter plots - error_bar <- ifelse(plot_type == "scatter", TRUE, FALSE) - - # Create the plot configuration - plot_config <- list( - df = df_no_zeros, - x_var = "scan", - y_var = var, - plot_type = plot_type, - title = paste("Plate analysis by Drug Conc for", var, "after quality control"), - error_bar = error_bar, - color_var = "conc_num" - ) - plate_analysis_plots <- append(plate_analysis_plots, list(plot_config)) - } - } - - l_outside_2sd_k_plots <- list( - list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter", - title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", - color_var = "conc_num", - legend_position = "right" - ) - ) - - delta_bg_outside_2sd_k_plots <- list( - list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter", - title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", - color_var = "conc_num", - legend_position = "right" - ) - ) - - # Generate and save plots for each QC step - message("Generating QC plots") - generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) - generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) - generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) - generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) - generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) - generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) - generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) - - # Clean up - rm(df, df_above_tolerance, df_no_zeros) - - # TODO: Originally this filtered L NA's - # Let's try to avoid for now since stats have already been calculated - - # Process background strains - bg_strains <- c("YDL227C") - lapply(bg_strains, function(strain) { - - message("Processing background strain: ", strain) - - # Handle missing data by setting zero values to NA - # and then removing any rows with NA in L col - df_bg <- df_na %>% - filter(OrfRep == strain) %>% - mutate( - L = if_else(L == 0, NA, L), - K = if_else(K == 0, NA, K), - r = if_else(r == 0, NA, r), - AUC = if_else(AUC == 0, NA, AUC) - ) %>% - filter(!is.na(L)) - - # Recalculate summary statistics for the background strain - message("Calculating summary statistics for background strain") - ss <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) - summary_stats_bg <- ss$summary_stats - df_bg_stats <- ss$df_with_stats - write.csv(summary_stats_bg, - file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), - row.names = FALSE) - - # Filter reference and deletion strains - # Formerly X2_RF (reference strains) - df_reference <- df_na_stats %>% - filter(OrfRep == strain) %>% - mutate(SM = 0) - - # Formerly X2 (deletion strains) - df_deletion <- df_na_stats %>% - filter(OrfRep != strain) %>% - mutate(SM = 0) - - # Set the missing values to the highest theoretical value at each drug conc for L - # Leave other values as 0 for the max/min - reference_strain <- df_reference %>% - group_by(conc_num) %>% - mutate( - max_l_theoretical = max(max_L, na.rm = TRUE), - L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), - SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), - L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% - ungroup() - - # Ditto for deletion strains - deletion_strains <- df_deletion %>% - group_by(conc_num) %>% - mutate( - max_l_theoretical = max(max_L, na.rm = TRUE), - L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), - SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), - L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% - ungroup() - - # Calculate interactions - variables <- c("L", "K", "r", "AUC") - message("Calculating interaction scores") - print("Reference strain:") - print(head(reference_strain)) - reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables) - print("Deletion strains:") - print(head(deletion_strains)) - deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables) - - zscores_calculations_reference <- reference_results$zscores_calculations - zscores_interactions_reference <- reference_results$zscores_interactions - zscores_calculations <- deletion_results$zscores_calculations - zscores_interactions <- deletion_results$zscores_interactions - - # Writing Z-Scores to file - write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE) - write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE) - write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) - write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) - - # Create interaction plots - reference_plot_configs <- generate_interaction_plot_configs(df_reference, variables) - deletion_plot_configs <- generate_interaction_plot_configs(df_deletion, variables) - generate_and_save_plots(out_dir, "RF_interactionPlots", reference_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) - generate_and_save_plots(out_dir, "InteractionPlots", deletion_plot_configs, grid_layout = list(ncol = 4, nrow = 3)) - - # Define conditions for enhancers and suppressors - # TODO Add to study config file? - threshold <- 2 - enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold - suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold - enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold - suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold - - # Subset data - enhancers_L <- zscores_interactions[enhancer_condition_L, ] - suppressors_L <- zscores_interactions[suppressor_condition_L, ] - enhancers_K <- zscores_interactions[enhancer_condition_K, ] - suppressors_K <- zscores_interactions[suppressor_condition_K, ] - - # Save enhancers and suppressors - message("Writing enhancer/suppressor csv files") - write.csv(enhancers_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L.csv"), row.names = FALSE) - write.csv(suppressors_L, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L.csv"), row.names = FALSE) - write.csv(enhancers_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K.csv"), row.names = FALSE) - write.csv(suppressors_K, file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K.csv"), row.names = FALSE) - - # Combine conditions for enhancers and suppressors - enhancers_and_suppressors_L <- zscores_interactions[enhancer_condition_L | suppressor_condition_L, ] - enhancers_and_suppressors_K <- zscores_interactions[enhancer_condition_K | suppressor_condition_K, ] - - # Save combined enhancers and suppressors - write.csv(enhancers_and_suppressors_L, - file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_L.csv"), row.names = FALSE) - write.csv(enhancers_and_suppressors_K, - file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_and_Suppressors_K.csv"), row.names = FALSE) - - # Handle linear model based enhancers and suppressors - lm_threshold <- 2 - enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ] - suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ] - enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ] - suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ] - - # Save linear model based enhancers and suppressors - message("Writing linear model enhancer/suppressor csv files") - write.csv(enhancers_lm_L, - file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_L_lm.csv"), row.names = FALSE) - write.csv(suppressors_lm_L, - file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_L_lm.csv"), row.names = FALSE) - write.csv(enhancers_lm_K, - file = file.path(out_dir, "ZScores_Interaction_Deletion_Enhancers_K_lm.csv"), row.names = FALSE) - write.csv(suppressors_lm_K, - file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE) - - zscores_interactions_adjusted <- adjust_missing_and_rank(zscores_interactions) - - # Generate all rank plot configurations for L and K - rank_plot_configs <- c( - generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_L", "Avg_Zscore_L", "L"), - generate_rank_plot_configs(zscores_interactions_adjusted, "Rank_K", "Avg_Zscore_K", "K") - ) - - # Generate and save rank plots - generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots", - plot_configs = rank_plot_config, grid_layout = list(ncol = 3, nrow = 2)) - - # # Correlation plots - # lm_list <- list( - # lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered), - # lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered), - # lm(Z_lm_AUC ~ Z_lm_L, data = zscores_interactions_filtered), - # lm(Z_lm_r ~ Z_lm_K, data = zscores_interactions_filtered), - # lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered), - # lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered) - # ) - - lm_summaries <- lapply(lm_list, summary) - correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries) - generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs) - }) - }) -} -main() diff --git a/qhtcp-workflow/qhtcp-workflow b/qhtcp-workflow/qhtcp-workflow index e516d682..f753f921 100755 --- a/qhtcp-workflow/qhtcp-workflow +++ b/qhtcp-workflow/qhtcp-workflow @@ -1498,7 +1498,7 @@ calculate_interaction_zscores() { * Background values are reported in the results sheet and so could also be analyzed there. EOF - declare script="$APPS_DIR/r/calculate_interaction_zscores5.R" + declare script="$APPS_DIR/r/calculate_interaction_zscores.R" declare -a out_paths=("${1:-"$STUDY_RESULTS_DIR/zscores"}") for path in "${EXP_PATHS[@]}"; do out_paths+=("${path}/zscores")