From 2f42357aa1f70bfe9ea7d5b75e9bf6b76c2417e2 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 30 Sep 2024 01:26:36 -0400 Subject: [PATCH] Simplify plot config format --- .../apps/r/calculate_interaction_zscores.R | 710 ++++++++---------- 1 file changed, 301 insertions(+), 409 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 2cde11cf..83050df7 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -112,7 +112,7 @@ theme_publication <- function(base_size = 14, base_family = "sans", legend_posit legend.spacing = unit(0, "cm"), legend.title = element_text(face = "italic", size = rel(1.3)), legend.text = element_text(size = rel(1.2)), - plot.margin = unit(c(10, 5, 5, 5), "mm"), + plot.margin = unit(c(10, 5, 5, 5), "mm") # strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"), # strip.text = element_text(face = "bold") ) @@ -150,9 +150,8 @@ load_and_filter_data <- function(easy_results_file, sd = 3) { SM = 0, OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded? conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), - conc_num_factor_new = factor(conc_num), - conc_num_factor_zeroed = factor(as.numeric(conc_num_factor_new) - 1), - conc_num_factor = as.numeric(conc_num_factor_zeroed) # for legacy purposes + conc_num_factor = as.numeric(as.factor(conc_num)) - 1, # for legacy purposes + conc_num_factor_factor = factor(conc_num) ) return(df) @@ -179,25 +178,25 @@ update_gene_names <- function(df, sgd_gene_list) { } calculate_summary_stats <- function(df, variables, group_vars) { - + summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise( N = n(), across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(N - 1) # Bessel's correction + mean = ~ mean(.x, na.rm = TRUE), + median = ~ median(.x, na.rm = TRUE), + max = ~ ifelse(all(is.na(.x)), NA, max(.x, na.rm = TRUE)), + min = ~ ifelse(all(is.na(.x)), NA, min(.x, na.rm = TRUE)), + sd = ~ sd(.x, na.rm = TRUE), + se = ~ sd(.x, na.rm = TRUE) / sqrt(n() - 1) ), .names = "{.fn}_{.col}" ), .groups = "drop" ) - + # Create a cleaned version of df that doesn't overlap with summary_stats cleaned_df <- df %>% select(-any_of(setdiff(intersect(names(df), names(summary_stats)), group_vars))) @@ -335,7 +334,8 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, # Declare column order for output calculations <- calculations %>% select( - "OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "N", + "OrfRep", "Gene", "num", "N", + "conc_num", "conc_num_factor", "conc_num_factor_factor", "mean_L", "mean_K", "mean_r", "mean_AUC", "median_L", "median_K", "median_r", "median_AUC", "sd_L", "sd_K", "sd_r", "sd_AUC", @@ -352,7 +352,8 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, interactions <- interactions %>% select( - "OrfRep", "Gene", "conc_num", "conc_num_factor", "num", "NG", "DB", "SM", + "OrfRep", "Gene", "num", "NG", "DB", "SM", + "conc_num", "conc_num_factor", "conc_num_factor_factor", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", "Sum_Z_Score_L", "Sum_Z_Score_K", "Sum_Z_Score_r", "Sum_Z_Score_AUC", @@ -366,10 +367,11 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, cleaned_df <- df %>% select(-any_of( setdiff(intersect(names(df), names(calculations)), - c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")))) + c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor")))) # Join the original dataframe with calculations - df_with_calculations <- left_join(cleaned_df, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + df_with_calculations <- left_join(cleaned_df, calculations, + by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor")) # Remove overlapping columns between df_with_calculations and interactions df_with_calculations_clean <- df_with_calculations %>% @@ -378,7 +380,8 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")))) # Join with interactions to create the full dataset - full_data <- left_join(df_with_calculations_clean, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + full_data <- left_join(df_with_calculations_clean, interactions, + by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor")) return(list( calculations = calculations, @@ -390,131 +393,67 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, generate_and_save_plots <- function(out_dir, filename, plot_configs) { message("Generating ", filename, ".pdf and ", filename, ".html") - for (config_group in plot_configs) { - plot_list <- config_group$plots - grid_nrow <- config_group$grid_layout$nrow - grid_ncol <- config_group$grid_layout$ncol + grid_nrow <- ifelse(is.null(plot_configs$grid_layout$nrow), 1, plot_configs$grid_layout$nrow) + grid_ncol <- ifelse(is.null(plot_configs$grid_layout$ncol), length(plot_configs$plots), plot_configs$grid_layout$ncol) - # Set defaults if nrow or ncol are not provided - if (is.null(grid_nrow) || is.null(grid_ncol)) { - num_plots <- length(plot_list) - grid_nrow <- ifelse(is.null(grid_nrow), 1, grid_nrow) - grid_ncol <- ifelse(is.null(grid_ncol), num_plots, grid_ncol) + static_plots <- list() + plotly_plots <- list() + + for (i in seq_along(plot_configs$plots)) { + config <- plot_configs$plots[[i]] + df <- config$df + + # Define aes_mapping and handle color_var defaulting + aes_mapping <- if (config$plot_type == "bar") { + aes(x = .data[[config$x_var]], fill = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black"), + color = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black")) + } else if (config$plot_type == "density") { + aes(x = .data[[config$x_var]], color = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black")) + } else { + aes(x = .data[[config$x_var]], y = .data[[config$y_var]], color = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black")) } - # Prepare lists to collect static and interactive plots - static_plots <- list() - plotly_plots <- list() + # Apply theme_publication with legend_position + plot <- ggplot(df, aes_mapping) + theme_publication(legend_position = config$legend_position) - # Generate each individual plot based on the configuration - for (i in seq_along(plot_list)) { - config <- plot_list[[i]] - df <- config$df - - # Create the base plot - aes_mapping <- if (config$plot_type == "bar") { - if (!is.null(config$color_var)) { - aes(x = .data[[config$x_var]], fill = as.factor(.data[[config$color_var]]), color = as.factor(.data[[config$color_var]])) - } else { - aes(x = .data[[config$x_var]]) - } - } else if (config$plot_type == "density") { - if (!is.null(config$color_var)) { - aes(x = .data[[config$x_var]], color = as.factor(.data[[config$color_var]])) - } else { - aes(x = .data[[config$x_var]]) - } - } else { - if (!is.null(config$color_var)) { - aes(x = .data[[config$x_var]], y = .data[[config$y_var]], color = as.factor(.data[[config$color_var]])) - } else { - aes(x = .data[[config$x_var]], y = .data[[config$y_var]]) - } - } - - plot <- ggplot(df, aes_mapping) - - # Apply theme_publication with legend_position from config - legend_position <- if (!is.null(config$legend_position)) config$legend_position else "bottom" - plot <- plot + theme_publication(legend_position = legend_position) - - # Use appropriate helper function based on plot type - plot <- switch(config$plot_type, - "scatter" = generate_scatter_plot(plot, config), - "box" = generate_box_plot(plot, config), - "density" = plot + geom_density(), - "bar" = plot + geom_bar(), - plot # default case if no type matches - ) - - # Add title and labels - if (!is.null(config$title)) { - plot <- plot + ggtitle(config$title) - } - if (!is.null(config$x_label)) { - plot <- plot + xlab(config$x_label) - } - if (!is.null(config$y_label)) { - plot <- plot + ylab(config$y_label) - } - - # Add cartesian coordinates if specified - if (!is.null(config$coord_cartesian)) { - plot <- plot + coord_cartesian(ylim = config$coord_cartesian) - } - - # Apply scale_color_discrete(guide = FALSE) when color_var is NULL - if (is.null(config$color_var)) { - plot <- plot + scale_color_discrete(guide = "none") - } - - # Add interactive tooltips for plotly - tooltip_vars <- c() - if (config$plot_type == "scatter") { - if (!is.null(config$delta_bg_point) && config$delta_bg_point) { - tooltip_vars <- c(tooltip_vars, "OrfRep", "Gene", "delta_bg") - } else if (!is.null(config$gene_point) && config$gene_point) { - tooltip_vars <- c(tooltip_vars, "OrfRep", "Gene") - } else if (!is.null(config$y_var) && !is.null(config$x_var)) { - tooltip_vars <- c(config$x_var, config$y_var) - } - } - - # Convert to plotly object and suppress warnings here - plotly_plot <- suppressWarnings({ - if (length(tooltip_vars) > 0) { - plotly::ggplotly(plot, tooltip = tooltip_vars) - } else { - plotly::ggplotly(plot, tooltip = "none") - } - }) - - # Adjust legend position if specified - if (!is.null(config$legend_position) && config$legend_position == "bottom") { - plotly_plot <- plotly_plot %>% layout(legend = list(orientation = "h")) - } - - # Add plots to lists - static_plots[[i]] <- plot - plotly_plots[[i]] <- plotly_plot - } - - # Save static PDF plot(s) for the current grid - pdf(file.path(out_dir, paste0(filename, ".pdf")), width = 16, height = 9) - grid.arrange(grobs = static_plots, ncol = grid_ncol, nrow = grid_nrow) - dev.off() - - combined_plot <- plotly::subplot( - plotly_plots, - nrows = grid_nrow, - ncols = grid_ncol, - margin = 0.05 + # Apply appropriate plot function + plot <- switch(config$plot_type, + "scatter" = generate_scatter_plot(plot, config), + "box" = generate_box_plot(plot, config), + "density" = plot + geom_density(), + "bar" = plot + geom_bar(), + plot # default case ) - # Save combined HTML plot(s) - html_file <- file.path(out_dir, paste0(filename, ".html")) - saveWidget(combined_plot, file = html_file, selfcontained = TRUE) + # Add titles and labels + if (!is.null(config$title)) plot <- plot + ggtitle(config$title) + if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label) + if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label) + if (!is.null(config$coord_cartesian)) plot <- plot + coord_cartesian(ylim = config$coord_cartesian) + + # Add interactive tooltips for plotly + tooltip_vars <- ifelse(!is.null(config$tooltip_vars), config$tooltip_vars, "none") + plotly_plot <- suppressWarnings(plotly::ggplotly(plot, tooltip = tooltip_vars)) + + # Adjust legend position in plotly + if (!is.null(config$legend_position) && config$legend_position == "bottom") { + plotly_plot <- plotly_plot %>% layout(legend = list(orientation = "h")) + } + + # Add static and interactive plots to lists + static_plots[[i]] <- plot + plotly_plots[[i]] <- plotly_plot } + + # Save static PDF plots + pdf(file.path(out_dir, paste0(filename, ".pdf")), width = 16, height = 9) + grid.arrange(grobs = static_plots, ncol = grid_ncol, nrow = grid_nrow) + dev.off() + + # Save combined interactive HTML plots + combined_plot <- plotly::subplot(plotly_plots, nrows = grid_nrow, ncols = grid_ncol, margin = 0.05) + html_file <- file.path(out_dir, paste0(filename, ".html")) + saveWidget(combined_plot, file = html_file, selfcontained = TRUE) } generate_scatter_plot <- function(plot, config) { @@ -671,51 +610,39 @@ generate_box_plot <- function(plot, config) { return(plot) } -generate_plate_analysis_plot_configs <- function(variables, stages = c("before", "after"), - df_before = NULL, df_after = NULL, plot_type = "scatter") { +generate_plate_analysis_plot_configs <- function(variables, df_before = NULL, df_after = NULL, + plot_type = "scatter", stages = c("before", "after")) { plots <- list() + for (var in variables) { for (stage in stages) { df_plot <- if (stage == "before") df_before else df_after # Check for non-finite values in the y-variable - df_plot_filtered <- df_plot %>% - filter(is.finite(!!sym(var))) - - # Count removed rows - removed_rows <- nrow(df_plot) - nrow(df_plot_filtered) - if (removed_rows > 0) { - message(sprintf("Removed %d non-finite values for variable %s during stage %s", removed_rows, var, stage)) - } - + df_plot_filtered <- df_plot %>% filter(is.finite(!!sym(var))) + # Adjust settings based on plot_type - if (plot_type == "scatter") { - error_bar <- TRUE - position <- "jitter" - } else if (plot_type == "box") { - error_bar <- FALSE - position <- NULL - } - config <- list( - df = df_plot, + df = df_plot_filtered, x_var = "scan", y_var = var, plot_type = plot_type, title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"), - error_bar = error_bar, - color_var = "conc_num_factor_zeroed", - position = position, - size = 0.2 + color_var = "conc_num_factor_factor", + position = if (plot_type == "scatter") "jitter" else NULL, + size = 0.2, + error_bar = (plot_type == "scatter") ) + + # Add config to plots list plots <- append(plots, list(config)) } } - return(plots) + + return(list(grid_layout = list(ncol = 1, nrow = length(plots)), plots = plots)) } generate_interaction_plot_configs <- function(df, limits_map = NULL, plot_type = "reference") { - # Define limits if not provided if (is.null(limits_map)) { limits_map <- list( L = c(0, 130), @@ -729,94 +656,74 @@ generate_interaction_plot_configs <- function(df, limits_map = NULL, plot_type = ) } - # Define grouping variables and filter data based on plot type - if (plot_type == "reference") { - group_vars <- c("OrfRep", "Gene", "num") - df_filtered <- df %>% - mutate( - OrfRepCombined = paste(OrfRep, Gene, num, sep = "_") - ) - } else if (plot_type == "deletion") { - group_vars <- c("OrfRep", "Gene") - df_filtered <- df %>% - mutate( - OrfRepCombined = paste(OrfRep, Gene, sep = "_") # Compare by OrfRep and Gene for deletion - ) - } + group_vars <- if (plot_type == "reference") c("OrfRep", "Gene", "num") else c("OrfRep", "Gene") + df_filtered <- df %>% + mutate(OrfRepCombined = if (plot_type == "reference") paste(OrfRep, Gene, num, sep = "_") else paste(OrfRep, Gene, sep = "_")) - # Create a list to store all configs - configs <- list() + plots <- list() - # Generate the first 8 scatter/box plots for L, K, r, AUC (shared between reference and deletion) - overall_vars <- c("L", "K", "r", "AUC") - for (var in overall_vars) { + # Generate plots for overall variables (L, K, r, AUC) + for (var in c("L", "K", "r", "AUC")) { y_limits <- limits_map[[var]] config <- list( df = df_filtered, plot_type = "scatter", - x_var = "conc_num_factor_new", + x_var = "conc_num_factor_factor", y_var = var, x_label = unique(df_filtered$Drug)[1], title = sprintf("Scatter RF for %s with SD", var), coord_cartesian = y_limits, error_bar = TRUE, - x_breaks = unique(df_filtered$conc_num_factor_new), + x_breaks = unique(df_filtered$conc_num_factor_factor), x_labels = as.character(unique(df_filtered$conc_num)), grid_layout = list(ncol = 2, nrow = 2), - position = "jitter" + position = "jitter", + smooth = TRUE ) - configs <- append(configs, list(config)) + plots <- append(plots, list(config)) } - # Generate Delta comparison plots (4x3 grid for deletion and reference) + # Generate Delta comparison plots unique_groups <- df_filtered %>% select(all_of(group_vars)) %>% distinct() for (i in seq_len(nrow(unique_groups))) { group <- unique_groups[i, ] - group_data <- df_filtered %>% filter(across(all_of(group_vars), ~ . == group[[cur_column()]])) + group_data <- df_filtered %>% semi_join(group, by = group_vars) OrfRep <- as.character(group$OrfRep) Gene <- if ("Gene" %in% names(group)) as.character(group$Gene) else "" num <- if ("num" %in% names(group)) as.character(group$num) else "" - OrfRepCombined <- paste(OrfRep, Gene, num, sep = "_") - # Generate plots for Delta variables - delta_vars <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC") - for (var in delta_vars) { + for (var in c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")) { y_limits <- limits_map[[var]] - upper_y <- y_limits[2] - lower_y <- y_limits[1] - y_span <- upper_y - lower_y - - # Get WT_sd_var for error bar calculations + y_span <- y_limits[2] - y_limits[1] + + # Error bars WT_sd_var <- paste0("WT_sd_", sub("Delta_", "", var)) WT_sd_value <- group_data[[WT_sd_var]][1] error_bar_ymin <- 0 - (2 * WT_sd_value) error_bar_ymax <- 0 + (2 * WT_sd_value) - - # Set annotations (Z_Shifts, lm Z-Scores, NG, DB, SM values) - Z_Shift_var <- paste0("Z_Shift_", sub("Delta_", "", var)) - Z_lm_var <- paste0("Z_lm_", sub("Delta_", "", var)) - Z_Shift_value <- round(group_data[[Z_Shift_var]][1], 2) - Z_lm_value <- round(group_data[[Z_lm_var]][1], 2) + + # Annotations + Z_Shift_value <- round(group_data[[paste0("Z_Shift_", sub("Delta_", "", var))]][1], 2) + Z_lm_value <- round(group_data[[paste0("Z_lm_", sub("Delta_", "", var))]][1], 2) NG_value <- group_data$NG[1] DB_value <- group_data$DB[1] SM_value <- group_data$SM[1] annotations <- list( - list(x = 1, y = upper_y - 0.2 * y_span, label = paste("ZShift =", Z_Shift_value)), - list(x = 1, y = upper_y - 0.3 * y_span, label = paste("lm ZScore =", Z_lm_value)), - list(x = 1, y = lower_y + 0.2 * y_span, label = paste("NG =", NG_value)), - list(x = 1, y = lower_y + 0.1 * y_span, label = paste("DB =", DB_value)), - list(x = 1, y = lower_y, label = paste("SM =", SM_value)) + list(x = 1, y = y_limits[2] - 0.2 * y_span, label = paste("ZShift =", Z_Shift_value)), + list(x = 1, y = y_limits[2] - 0.3 * y_span, label = paste("lm ZScore =", Z_lm_value)), + list(x = 1, y = y_limits[1] + 0.2 * y_span, label = paste("NG =", NG_value)), + list(x = 1, y = y_limits[1] + 0.1 * y_span, label = paste("DB =", DB_value)), + list(x = 1, y = y_limits[1], label = paste("SM =", SM_value)) ) - # Create configuration for each Delta plot config <- list( df = group_data, plot_type = "scatter", - x_var = "conc_num", + x_var = "conc_num_factor_factor", y_var = var, x_label = unique(group_data$Drug)[1], title = paste(OrfRep, Gene, sep = " "), @@ -827,32 +734,30 @@ generate_interaction_plot_configs <- function(df, limits_map = NULL, plot_type = ymin = error_bar_ymin, ymax = error_bar_ymax ), - lm_smooth = TRUE, - x_breaks = unique(group_data$conc_num_factor_new), + smooth = TRUE, + x_breaks = unique(group_data$conc_num_factor_factor), x_labels = as.character(unique(group_data$conc_num)), ylim_vals = y_limits, - grid_layout = list(ncol = 4, nrow = 3) # Adjust grid layout for gene-gene comparisons + grid_layout = list(ncol = 4, nrow = 3) ) - configs <- append(configs, list(config)) + plots <- append(plots, list(config)) } } - return(configs) + return(list(grid_layout = list(ncol = 2, nrow = 2), plots = plots)) } generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FALSE, overlap_color = FALSE) { - sd_bands <- c(1, 2, 3) - avg_zscore_cols <- paste0("Avg_Zscore_", variables) z_lm_cols <- paste0("Z_lm_", variables) rank_avg_zscore_cols <- paste0("Rank_", variables) rank_z_lm_cols <- paste0("Rank_lm_", variables) - + configs <- list() + # Adjust values if necessary if (adjust) { - message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns for ranks") df <- df %>% mutate( across(all_of(avg_zscore_cols), ~ifelse(is.na(.), 0.001, .)), @@ -860,34 +765,22 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA ) } - message("Calculating ranks for Avg_Zscore and Z_lm columns") - rank_col_mapping <- setNames(rank_avg_zscore_cols, avg_zscore_cols) + # Calculate rank columns for Avg_Zscore and Z_lm columns df_ranked <- df %>% - mutate(across(all_of(avg_zscore_cols), ~rank(., na.last = "keep"), .names = "{rank_col_mapping[.col]}")) + mutate(across(all_of(avg_zscore_cols), ~rank(., na.last = "keep"), .names = paste0("Rank_", avg_zscore_cols))) %>% + mutate(across(all_of(z_lm_cols), ~rank(., na.last = "keep"), .names = paste0("Rank_lm_", z_lm_cols))) - rank_lm_col_mapping <- setNames(rank_z_lm_cols, z_lm_cols) - df_ranked <- df_ranked %>% - mutate(across(all_of(z_lm_cols), ~rank(., na.last = "keep"), .names = "{rank_lm_col_mapping[.col]}")) - - # SD-based plots for L and K + # Generate plots for SD-based L and K variables for (variable in c("L", "K")) { - - if (is_lm) { - rank_var <- paste0("Rank_lm_", variable) - zscore_var <- paste0("Z_lm_", variable) - y_label <- paste("Int Z score", variable) - } else { - rank_var <- paste0("Rank_", variable) - zscore_var <- paste0("Avg_Zscore_", variable) - y_label <- paste("Avg Z score", variable) - } - + rank_var <- if (is_lm) paste0("Rank_lm_", variable) else paste0("Rank_", variable) + zscore_var <- if (is_lm) paste0("Z_lm_", variable) else paste0("Avg_Zscore_", variable) + y_label <- if (is_lm) paste("Int Z score", variable) else paste("Avg Z score", variable) + for (sd_band in sd_bands) { - num_enhancers <- sum(df_ranked[[zscore_var]] >= sd_band, na.rm = TRUE) num_suppressors <- sum(df_ranked[[zscore_var]] <= -sd_band, na.rm = TRUE) - - # Annotated plot configuration + + # Plot with annotations configs[[length(configs) + 1]] <- list( df = df_ranked, x_var = rank_var, @@ -903,27 +796,22 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA list( x = median(df_ranked[[rank_var]], na.rm = TRUE), y = max(df_ranked[[zscore_var]], na.rm = TRUE) * 0.9, - label = paste("Deletion Enhancers =", num_enhancers), - hjust = 0.5, - vjust = 1 + label = paste("Deletion Enhancers =", num_enhancers) ), list( x = median(df_ranked[[rank_var]], na.rm = TRUE), y = min(df_ranked[[zscore_var]], na.rm = TRUE) * 0.9, - label = paste("Deletion Suppressors =", num_suppressors), - hjust = 0.5, - vjust = 0 + label = paste("Deletion Suppressors =", num_suppressors) ) ), shape = 3, size = 0.1, y_label = y_label, x_label = "Rank", - legend_position = "none", - grid_layout = list(ncol = 3, nrow = 2) + legend_position = "none" ) - - # Non-Annotated Plot Configuration + + # Plot without annotations configs[[length(configs) + 1]] <- list( df = df_ranked, x_var = rank_var, @@ -940,42 +828,26 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA size = 0.1, y_label = y_label, x_label = "Rank", - legend_position = "none", - grid_layout = list(ncol = 3, nrow = 2) + legend_position = "none" ) } } - - # Avg ZScore and Rank Avg ZScore Plots for variables + + # Generate Avg ZScore and Rank Avg ZScore plots for each variable for (variable in variables) { for (plot_type in c("Avg Zscore vs lm", "Rank Avg Zscore vs lm")) { - title <- paste(plot_type, variable) # Define specific variables based on plot type - if (plot_type == "Avg Zscore vs lm") { - x_var <- paste0("Avg_Zscore_", variable) - y_var <- paste0("Z_lm_", variable) - rectangles <- list( - list(xmin = -2, xmax = 2, ymin = -2, ymax = 2, - fill = NA, color = "grey20", alpha = 0.1 - ) - ) - } else if (plot_type == "Rank Avg Zscore vs lm") { - x_var <- paste0("Rank_", variable) - y_var <- paste0("Rank_lm_", variable) - rectangles <- NULL - } + x_var <- if (plot_type == "Avg Zscore vs lm") paste0("Avg_Zscore_", variable) else paste0("Rank_", variable) + y_var <- if (plot_type == "Avg Zscore vs lm") paste0("Z_lm_", variable) else paste0("Rank_lm_", variable) # Fit the linear model lm_model <- lm(as.formula(paste(y_var, "~", x_var)), data = df_ranked) - - # Extract intercept, slope, and R-squared from the model coefficients intercept <- coef(lm_model)[1] slope <- coef(lm_model)[2] r_squared <- summary(lm_model)$r.squared - # Annotations: include R-squared in the plot annotations <- list( list( x = mean(range(df_ranked[[x_var]], na.rm = TRUE)), @@ -987,6 +859,12 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA ) ) + rectangles <- if (plot_type == "Avg Zscore vs lm") { + list(list(xmin = -2, xmax = 2, ymin = -2, ymax = 2, fill = NA, color = "grey20", alpha = 0.1)) + } else { + NULL + } + configs[[length(configs) + 1]] <- list( df = df_ranked, x_var = x_var, @@ -1007,11 +885,11 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA ) } } - return(configs) + + return(list(grid_layout = list(ncol = 3, nrow = 2), plots = configs)) } generate_correlation_plot_configs <- function(df, highlight_cyan = FALSE) { - # Define relationships for plotting relationships <- list( list(x = "Z_lm_L", y = "Z_lm_K", label = "Interaction L vs. Interaction K"), list(x = "Z_lm_L", y = "Z_lm_r", label = "Interaction L vs. Interaction r"), @@ -1021,56 +899,36 @@ generate_correlation_plot_configs <- function(df, highlight_cyan = FALSE) { list(x = "Z_lm_r", y = "Z_lm_AUC", label = "Interaction r vs. Interaction AUC") ) - configs <- list() + plots <- list() for (rel in relationships) { - # Fit linear model lm_model <- lm(as.formula(paste(rel$y, "~", rel$x)), data = df) - lm_summary <- summary(lm_model) - intercept <- coef(lm_model)[1] - slope <- coef(lm_model)[2] - r_squared <- lm_summary$r.squared + r_squared <- summary(lm_model)$r.squared - # Construct plot configuration config <- list( df = df, x_var = rel$x, y_var = rel$y, plot_type = "scatter", title = rel$label, - x_label = paste("z-score", gsub("Z_lm_", "", rel$x)), - y_label = paste("z-score", gsub("Z_lm_", "", rel$y)), annotations = list( - list( - x = mean(range(df[[rel$x]], na.rm = TRUE)), - y = mean(range(df[[rel$y]], na.rm = TRUE)), - label = paste("R-squared =", round(r_squared, 3)), - hjust = 0.5, - vjust = 1, - size = 5, - color = "black" - ) + list(x = mean(df[[rel$x]], na.rm = TRUE), + y = mean(df[[rel$y]], na.rm = TRUE), + label = paste("R-squared =", round(r_squared, 3))) ), smooth = TRUE, smooth_color = "tomato3", - lm_line = list(intercept = intercept, slope = slope), - legend_position = "right", + lm_line = list(intercept = coef(lm_model)[1], slope = coef(lm_model)[2]), shape = 3, size = 0.5, color_var = "Overlap", - rectangles = list( - list( - xmin = -2, xmax = 2, ymin = -2, ymax = 2, - fill = NA, color = "grey20", alpha = 0.1 - ) - ), - cyan_points = highlight_cyan, # Toggle cyan point highlighting + cyan_points = highlight_cyan ) - configs[[length(configs) + 1]] <- config + plots <- append(plots, list(config)) } - return(configs) + return(list(grid_layout = list(ncol = 3, nrow = 2), plots = plots)) } main <- function() { @@ -1085,8 +943,6 @@ main <- function() { summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores - print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor", - "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB") message("Loading and filtering data for experiment: ", exp_name) df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>% @@ -1099,7 +955,7 @@ main <- function() { df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero # Save some constants - max_conc <- max(df$conc_num_factor_zeroed_num) + max_conc <- max(df$conc_num_factor) l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 @@ -1107,13 +963,13 @@ main <- function() { df_stats <- calculate_summary_stats( df = df, variables = summary_vars, - group_vars = c("conc_num", "conc_num_factor"))$df_with_stats + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats message("Calculating summary statistics after quality control") ss <- calculate_summary_stats( df = df_na, variables = summary_vars, - group_vars = c("conc_num", "conc_num_factor")) + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) @@ -1150,7 +1006,7 @@ main <- function() { df_no_zeros_stats <- calculate_summary_stats( df = df_no_zeros, variables = summary_vars, - group_vars = c("conc_num", "conc_num_factor") + group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor") )$df_with_stats message("Filtering by 2SD of K") @@ -1161,13 +1017,13 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))$summary_stats + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats write.csv(ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), @@ -1175,64 +1031,75 @@ main <- function() { # Each plots list corresponds to a file l_vs_k_plot_configs <- list( - list( - df = df, - x_var = "L", - y_var = "K", - plot_type = "scatter", - delta_bg_point = TRUE, - title = "Raw L vs K before quality control", - color_var = "conc_num_factor_zeroed", - error_bar = FALSE, - legend_position = "right" + grid_layout = list(ncol = 1, nrow = 1), + plots = list( + list( + df = df, + x_var = "L", + y_var = "K", + plot_type = "scatter", + tooltip_vars = c("OrfRep", "Gene", "delta_bg"), + title = "Raw L vs K before quality control", + color_var = "conc_num_factor_factor", + error_bar = FALSE, + legend_position = "right" + ) ) ) frequency_delta_bg_plot_configs <- list( - list( - df = df_stats, - x_var = "delta_bg", - y_var = NULL, - plot_type = "density", - title = "Density plot for Delta Background by [Drug] (All Data)", - color_var = "conc_num_factor_zeroed", - x_label = "Delta Background", - y_label = "Density", - error_bar = FALSE, - legend_position = "right"), - list( - df = df_stats, - x_var = "delta_bg", - y_var = NULL, - plot_type = "bar", - title = "Bar plot for Delta Background by [Drug] (All Data)", - color_var = "conc_num_factor_zeroed", - x_label = "Delta Background", - y_label = "Count", - error_bar = FALSE, - legend_position = "right") + grid_layout = list(ncol = 1, nrow = 1), + plots = list( + list( + df = df_stats, + x_var = "delta_bg", + y_var = NULL, + plot_type = "density", + title = "Density plot for Delta Background by [Drug] (All Data)", + color_var = "conc_num_factor_factor", + x_label = "Delta Background", + y_label = "Density", + error_bar = FALSE, + legend_position = "right" + ), + list( + df = df_stats, + x_var = "delta_bg", + y_var = NULL, + plot_type = "bar", + title = "Bar plot for Delta Background by [Drug] (All Data)", + color_var = "conc_num_factor_factor", + x_label = "Delta Background", + y_label = "Count", + error_bar = FALSE, + legend_position = "right" + ) + ) ) above_threshold_plot_configs <- list( - list( - df = df_above_tolerance, - x_var = "L", - y_var = "K", - plot_type = "scatter", - delta_bg_point = TRUE, - title = paste("Raw L vs K for strains above Delta Background threshold of", - round(df_above_tolerance$delta_bg_tolerance[[1]], 3), "or above"), - color_var = "conc_num_factor_zeroed", - position = "jitter", - annotations = list( - list( - x = l_half_median, - y = k_half_median, - label = paste("# strains above Delta Background tolerance =", nrow(df_above_tolerance)) - ) - ), - error_bar = FALSE, - legend_position = "right" + grid_layout = list(ncol = 1, nrow = 1), + plots = list( + list( + df = df_above_tolerance, + x_var = "L", + y_var = "K", + plot_type = "scatter", + tooltip_vars = c("OrfRep", "Gene", "delta_bg"), + title = paste("Raw L vs K for strains above Delta Background threshold of", + round(df_above_tolerance$delta_bg_tolerance[[1]], 3), "or above"), + color_var = "conc_num_factor_factor", + position = "jitter", + annotations = list( + list( + x = l_half_median, + y = k_half_median, + label = paste("# strains above Delta Background tolerance =", nrow(df_above_tolerance)) + ) + ), + error_bar = FALSE, + legend_position = "right" + ) ) ) @@ -1263,59 +1130,84 @@ main <- function() { ) l_outside_2sd_k_plot_configs <- list( - list( - df = df_na_l_outside_2sd_k_stats, - x_var = "L", - y_var = "K", - plot_type = "scatter", - delta_bg_point = TRUE, - title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", - color_var = "conc_num_factor_zeroed", - position = "jitter", - legend_position = "right" + grid_layout = list(ncol = 1, nrow = 1), # Ensures it's compatible with generate_and_save_plots + plots = list( + list( + df = df_na_l_outside_2sd_k_stats, + x_var = "L", + y_var = "K", + plot_type = "scatter", + title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc", + color_var = "conc_num_factor_factor", + position = "jitter", # Apply jitter for better visibility + tooltip_vars = c("OrfRep", "Gene", "delta_bg"), + annotations = list( + list( + x = mean(df_na_l_outside_2sd_k_stats$L, na.rm = TRUE), + y = mean(df_na_l_outside_2sd_k_stats$K, na.rm = TRUE), + label = paste("Total strains:", nrow(df_na_l_outside_2sd_k_stats)) + ) + ), + error_bar = FALSE, # No error bars for this plot + legend_position = "right" + ) ) ) delta_bg_outside_2sd_k_plot_configs <- list( - list( - df = df_na_l_outside_2sd_k_stats, - x_var = "delta_bg", - y_var = "K", - plot_type = "scatter", - gene_point = TRUE, - title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", - color_var = "conc_num_factor_zeroed", - position = "jitter", - legend_position = "right" + grid_layout = list(ncol = 1, nrow = 1), # Ensures it's compatible with generate_and_save_plots + plots = list( + list( + df = df_na_l_outside_2sd_k_stats, + x_var = "delta_bg", + y_var = "K", + plot_type = "scatter", + title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc", + color_var = "conc_num_factor_factor", + position = "jitter", # Apply jitter for better visibility + tooltip_vars = c("OrfRep", "Gene", "delta_bg"), + annotations = list( + list( + x = mean(df_na_l_outside_2sd_k_stats$delta_bg, na.rm = TRUE), + y = mean(df_na_l_outside_2sd_k_stats$K, na.rm = TRUE), + label = paste("Total strains:", nrow(df_na_l_outside_2sd_k_stats)) + ) + ), + error_bar = FALSE, # No error bars for this plot + legend_position = "right" + ) ) ) message("Generating quality control plots in parallel") - # future::plan(future::multicore, workers = parallel::detectCores()) - future::plan(future::multisession, workers = 3) # generate 3 plots in parallel - plot_configs <- list( - list(out_dir = out_dir_qc, filename = "L_vs_K_before_quality_control", - plot_configs = l_vs_k_plot_configs), - list(out_dir = out_dir_qc, filename = "frequency_delta_background", - plot_configs = frequency_delta_bg_plot_configs), - list(out_dir = out_dir_qc, filename = "L_vs_K_above_threshold", - plot_configs = above_threshold_plot_configs), - list(out_dir = out_dir_qc, filename = "plate_analysis", - plot_configs = plate_analysis_plot_configs), - list(out_dir = out_dir_qc, filename = "plate_analysis_boxplots", - plot_configs = plate_analysis_boxplot_configs), - list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros", - plot_configs = plate_analysis_no_zeros_plot_configs), - list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros_boxplots", - plot_configs = plate_analysis_no_zeros_boxplot_configs), - list(out_dir = out_dir_qc, filename = "L_vs_K_for_strains_2SD_outside_mean_K", - plot_configs = l_outside_2sd_k_plot_configs), - list(out_dir = out_dir_qc, filename = "delta_background_vs_K_for_strains_2sd_outside_mean_K", - plot_configs = delta_bg_outside_2sd_k_plot_configs) - ) + generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plot_configs) + quit() + # # future::plan(future::multicore, workers = parallel::detectCores()) + # future::plan(future::multisession, workers = 3) # generate 3 plots in parallel - # Generating quality control plots in parallel + # plot_configs <- list( + # list(out_dir = out_dir_qc, filename = "L_vs_K_before_quality_control", + # plot_configs = l_vs_k_plot_configs), + # list(out_dir = out_dir_qc, filename = "frequency_delta_background", + # plot_configs = frequency_delta_bg_plot_configs), + # list(out_dir = out_dir_qc, filename = "L_vs_K_above_threshold", + # plot_configs = above_threshold_plot_configs), + # list(out_dir = out_dir_qc, filename = "plate_analysis", + # plot_configs = plate_analysis_plot_configs), + # list(out_dir = out_dir_qc, filename = "plate_analysis_boxplots", + # plot_configs = plate_analysis_boxplot_configs), + # list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros", + # plot_configs = plate_analysis_no_zeros_plot_configs), + # list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros_boxplots", + # plot_configs = plate_analysis_no_zeros_boxplot_configs), + # list(out_dir = out_dir_qc, filename = "L_vs_K_for_strains_2SD_outside_mean_K", + # plot_configs = l_outside_2sd_k_plot_configs), + # list(out_dir = out_dir_qc, filename = "delta_background_vs_K_for_strains_2sd_outside_mean_K", + # plot_configs = delta_bg_outside_2sd_k_plot_configs) + # ) + + # # Generating quality control plots in parallel # furrr::future_map(plot_configs, function(config) { # generate_and_save_plots(config$out_dir, config$filename, config$plot_configs) # }, .options = furrr_options(seed = TRUE)) @@ -1340,7 +1232,7 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor")) summary_stats_bg <- ss_bg$summary_stats write.csv(summary_stats_bg, file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")), @@ -1351,7 +1243,7 @@ main <- function() { df_reference <- df_na_stats %>% # formerly X2_RF filter(OrfRep == strain) %>% filter(!is.na(L)) %>% - group_by(conc_num, conc_num_factor) %>% + group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1364,7 +1256,7 @@ main <- function() { filter(OrfRep != strain) %>% filter(!is.na(L)) %>% mutate(SM = 0) %>% - group_by(conc_num, conc_num_factor) %>% + group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1376,7 +1268,7 @@ main <- function() { df_reference_stats <- calculate_summary_stats( df = df_reference, variables = interaction_vars, - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor") )$df_with_stats reference_results <- calculate_interaction_scores(df_reference_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene", "num")) zscore_calculations_reference <- reference_results$calculations @@ -1387,9 +1279,9 @@ main <- function() { df_deletion_stats <- calculate_summary_stats( df = df_deletion, variables = interaction_vars, - group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") + group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor", "conc_num_factor_factor") )$df_with_stats - deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep")) + deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene")) zscore_calculations <- deletion_results$calculations zscore_interactions <- deletion_results$interactions zscore_interactions_joined <- deletion_results$full_data