diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index a9d449e8..8db71d49 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -37,28 +37,30 @@ parse_arguments <- function() { } else { commandArgs(trailingOnly = TRUE) } - - # Extract paths, names, and standard deviations - paths <- args[seq(4, length(args), by = 3)] - names <- args[seq(5, length(args), by = 3)] - sds <- as.numeric(args[seq(6, length(args), by = 3)]) - - # Normalize paths - normalized_paths <- normalizePath(paths, mustWork = FALSE) - - # Create named list of experiments + + out_dir <- normalizePath(args[1], mustWork = FALSE) + sgd_gene_list <- normalizePath(args[2], mustWork = FALSE) + easy_results_file <- normalizePath(args[3], mustWork = FALSE) + + # The remaining arguments should be in groups of 3 + exp_args <- args[-(1:3)] + if (length(exp_args) %% 3 != 0) { + stop("Experiment arguments should be in groups of 3: path, name, sd.") + } + experiments <- list() - for (i in seq_along(paths)) { - experiments[[names[i]]] <- list( - path = normalized_paths[i], - sd = sds[i] + for (i in seq(1, length(exp_args), by = 3)) { + exp_name <- exp_args[i + 1] + experiments[[exp_name]] <- list( + path = normalizePath(exp_args[i], mustWork = FALSE), + sd = as.numeric(exp_args[i + 2]) ) } - + list( - out_dir = normalizePath(args[1], mustWork = FALSE), - sgd_gene_list = normalizePath(args[2], mustWork = FALSE), - easy_results_file = normalizePath(args[3], mustWork = FALSE), + out_dir = out_dir, + sgd_gene_list = sgd_gene_list, + easy_results_file = easy_results_file, experiments = experiments ) } @@ -81,9 +83,11 @@ theme_publication <- function(base_size = 14, base_family = "sans", legend_posit plot.background = element_rect(colour = NA), panel.border = element_rect(colour = NA), axis.title = element_text(face = "bold", size = rel(1)), - axis.title.y = element_text(angle = 90, vjust = 2), - axis.title.x = element_text(vjust = -0.2), + axis.title.y = element_text(angle = 90, vjust = 2, size = 18), + axis.title.x = element_text(vjust = -0.2, size = 18), axis.line = element_line(colour = "black"), + axis.text.x = element_text(size = 16), + axis.text.y = element_text(size = 16), panel.grid.major = element_line(colour = "#f0f0f0"), panel.grid.minor = element_blank(), legend.key = element_rect(colour = NA), @@ -111,27 +115,42 @@ scale_colour_publication <- function(...) { # Load the initial dataframe from the easy_results_file load_and_process_data <- function(easy_results_file, sd = 3) { - df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - + df <- read.delim( + easy_results_file, + skip = 2, + stringsAsFactors = FALSE, + row.names = 1, + strip.white = TRUE + ) + + # Filter and rename columns + df <- df %>% + filter(!is.na(ORF) & ORF != "") %>% + filter(!Gene %in% c("BLANK", "Blank", "blank")) %>% + filter(Drug != "BMH21") %>% + rename( + L = l, + num = Num., + AUC = AUC96, + scan = Scan, + last_bg = LstBackgrd, + first_bg = X1stBackgrd + ) %>% + mutate(across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric)) + + # Calculate delta background and tolerance df <- df %>% - filter(!(.[[1]] %in% c("", "Scan"))) %>% - filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% - # Rename columns - rename(L = l, num = Num., AUC = AUC96, scan = Scan, last_bg = LstBackgrd, first_bg = X1stBackgrd) %>% mutate( - across(c(Col, Row, num, L, K, r, scan, AUC, last_bg, first_bg), as.numeric), delta_bg = last_bg - first_bg, delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)), NG = if_else(L == 0 & !is.na(L), 1, 0), DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0), SM = 0, - OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded? + OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), conc_num_factor = as.factor(conc_num) - # conc_num_factor = factor(conc_num, levels = sort(unique(conc_num))) - # conc_num_numeric = as.numeric(conc_num_factor) - 1 ) - + return(df) } @@ -439,34 +458,43 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la dev.off() # Combine and save interactive HTML plots - combined_plot <- subplot(plotly_plots, nrows = grid_layout$nrow %||% length(plotly_plots), margin = 0.05) + combined_plot <- subplot(plotly_plots, + nrows = ifelse(is.null(grid_layout$nrow), length(plotly_plots), grid_layout$nrow), + margin = 0.05) saveWidget(combined_plot, file = file.path(output_dir, paste0(file_name, ".html")), selfcontained = TRUE) } generate_scatter_plot <- function(plot, config) { - # 1. Determine Shape, Size, and Position for geom_point + # Build the aes mapping with color if specified + if (!is.null(config$color_var)) { + plot <- plot + aes(color = .data[[config$color_var]]) + } + + # Determine Shape, Size, and Position for geom_point shape <- if (!is.null(config$shape)) config$shape else 3 - size <- if (!is.null(config$size)) { - config$size - } else { - if (!is.null(config$delta_bg_point) && config$delta_bg_point) 0.2 - else if (!is.null(config$gene_point) && config$gene_point) 0.2 - else 0.1 + size <- if (!is.null(config$size)) config$size else 0.1 + position <- if (!is.null(config$position) && config$position == "jitter") "jitter" else "identity" + + # Add geom_point with determined parameters + plot <- plot + geom_point( + aes(color = .data[[config$color_var]]), + shape = shape, + size = size, + position = position + ) + + if (!is.null(config$cyan_points)) { + plot <- plot + geom_point( + data = subset(config$df, is_cyan_point == TRUE), + aes(x = .data[[config$x_var]], y = .data[[config$y_var]]), + color = "cyan", + shape = 3, + size = 0.5 + ) } - position <- if (!is.null(config$delta_bg_point) && config$delta_bg_point) { - "identity" - } else if (!is.null(config$gene_point) && config$gene_point) { - "jitter" - } else { - if (!is.null(config$position) && config$position == "jitter") "jitter" else "identity" - } - - # 2. Add geom_point with determined parameters - plot <- plot + geom_point(shape = shape, size = size, position = position) - - # 3. Add Smooth Line if specified + # Add Smooth Line if specified if (!is.null(config$add_smooth) && config$add_smooth) { if (!is.null(config$lm_line)) { plot <- plot + @@ -485,7 +513,7 @@ generate_scatter_plot <- function(plot, config) { } } - # 4. Add SD Bands if specified + # Add SD Bands if specified if (!is.null(config$sd_band_values)) { for (sd_band in config$sd_band_values) { plot <- plot + @@ -509,8 +537,24 @@ generate_scatter_plot <- function(plot, config) { ) } } + + # Add Rectangles if specified + if (!is.null(config$rectangles)) { + for (rect in config$rectangles) { + plot <- plot + annotate( + "rect", + xmin = rect$xmin, + xmax = rect$xmax, + ymin = rect$ymin, + ymax = rect$ymax, + fill = ifelse(is.null(rect$fill), NA, rect$fill), + color = ifelse(is.null(rect$color), "black", rect$color), + alpha = ifelse(is.null(rect$alpha), 0.1, rect$alpha) + ) + } + } - # 5. Add Error Bars if specified + # Add Error Bars if specified if (!is.null(config$error_bar) && config$error_bar && !is.null(config$y_var)) { y_mean_col <- paste0("mean_", config$y_var) y_sd_col <- paste0("sd_", config$y_var) @@ -525,7 +569,7 @@ generate_scatter_plot <- function(plot, config) { ) } - # 6. Customize X-axis if specified + # Customize X-axis if specified if (!is.null(config$x_breaks) && !is.null(config$x_labels) && !is.null(config$x_label)) { plot <- plot + scale_x_discrete( @@ -535,17 +579,17 @@ generate_scatter_plot <- function(plot, config) { ) } - # 7. Apply coord_cartesian if specified + # Apply coord_cartesian if specified if (!is.null(config$coord_cartesian)) { plot <- plot + coord_cartesian(ylim = config$coord_cartesian) } - # 8. Set Y-axis limits if specified + # Set Y-axis limits if specified if (!is.null(config$ylim_vals)) { plot <- plot + scale_y_continuous(limits = config$ylim_vals) } - # 9. Add Annotations if specified + # Add Annotations if specified if (!is.null(config$annotations)) { for (annotation in config$annotations) { plot <- plot + @@ -559,12 +603,12 @@ generate_scatter_plot <- function(plot, config) { } } - # 10. Add Title if specified + # Add Title if specified if (!is.null(config$title)) { plot <- plot + ggtitle(config$title) } - # 11. Adjust Legend Position if specified + # Adjust Legend Position if specified if (!is.null(config$legend_position)) { plot <- plot + theme(legend.position = config$legend_position) } @@ -692,121 +736,131 @@ generate_interaction_plot_configs <- function(df, variables) { return(configs) } -generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, adjust = FALSE) { - - # Define SD bands - sd_bands <- c(1, 2, 3) - - # Initialize list to store plot configurations - configs <- list() - - # SD-based plots for L and K - for (variable in c("L", "K")) { - for (sd_band in sd_bands) { - - # Determine columns based on whether it's lm or not - if (is_lm) { - rank_var <- paste0(variable, "_Rank_lm") - zscore_var <- paste0("Z_lm_", variable) - y_label <- paste("Int Z score", variable) - } else { - rank_var <- paste0(variable, "_Rank") - zscore_var <- paste0("Avg_Zscore_", variable) - y_label <- paste("Avg Z score", variable) - } - - # Annotated Plot Configuration - configs[[length(configs) + 1]] <- list( - df = df_filtered, - x_var = rank_var, - y_var = zscore_var, - plot_type = "scatter", - title = paste(y_label, "vs. Rank for", variable, "above", sd_band, "SD"), - sd_band = sd_band, - annotations = list( - list( - x = median(df_filtered[[rank_var]], na.rm = TRUE), - y = 10, - label = paste("Deletion Enhancers =", sum(df_filtered[[zscore_var]] >= sd_band, na.rm = TRUE)) - ), - list( - x = median(df_filtered[[rank_var]], na.rm = TRUE), - y = -10, - label = paste("Deletion Suppressors =", sum(df_filtered[[zscore_var]] <= -sd_band, na.rm = TRUE)) - ) - ), - sd_band_values = sd_band, - shape = 3, - size = 0.1 - ) - - # Non-Annotated Plot Configuration - configs[[length(configs) + 1]] <- list( - df = df_filtered, - x_var = rank_var, - y_var = zscore_var, - plot_type = "scatter", - title = paste(y_label, "vs. Rank for", variable, "above", sd_band, "SD No Annotations"), - sd_band = sd_band, - annotations = NULL, - sd_band_values = sd_band, - shape = 3, - size = 0.1 - ) +generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { + # Define SD bands + sd_bands <- c(1, 2, 3) + + # Initialize list to store plot configurations + configs <- list() + + # SD-based plots for L and K + for (variable in c("L", "K")) { + for (sd_band in sd_bands) { + # Determine columns based on whether it's lm or not + if (is_lm) { + rank_var <- paste0(variable, "_Rank_lm") + zscore_var <- paste0("Z_lm_", variable) + y_label <- paste("Int Z score", variable) + } else { + rank_var <- paste0(variable, "_Rank") + zscore_var <- paste0("Avg_Zscore_", variable) + y_label <- paste("Avg Z score", variable) + } + + # Calculate counts for annotations + num_enhancers <- sum(df_filtered[[zscore_var]] >= sd_band, na.rm = TRUE) + num_suppressors <- sum(df_filtered[[zscore_var]] <= -sd_band, na.rm = TRUE) + + # Annotated Plot Configuration + configs[[length(configs) + 1]] <- list( + df = df_filtered, + x_var = rank_var, + y_var = zscore_var, + plot_type = "scatter", + title = paste(y_label, "vs. Rank for", variable, "above", sd_band, "SD"), + sd_band = sd_band, + annotations = list( + list( + x = median(df_filtered[[rank_var]], na.rm = TRUE), + y = 10, + label = paste("Deletion Enhancers =", num_enhancers) + ), + list( + x = median(df_filtered[[rank_var]], na.rm = TRUE), + y = -10, + label = paste("Deletion Suppressors =", num_suppressors) + ) + ), + sd_band_values = sd_band, + shape = 3, + size = 0.1, + y_label = y_label, + x_label = "Rank", + legend_position = "none" + ) + + # Non-Annotated Plot Configuration + configs[[length(configs) + 1]] <- list( + df = df_filtered, + x_var = rank_var, + y_var = zscore_var, + plot_type = "scatter", + title = paste(y_label, "vs. Rank for", variable, "above", sd_band, "SD No Annotations"), + sd_band = sd_band, + annotations = NULL, + sd_band_values = sd_band, + shape = 3, + size = 0.1, + y_label = y_label, + x_label = "Rank", + legend_position = "none" + ) + } } - } - - # Avg ZScore and Rank Avg ZScore Plots for r, L, K, and AUC - for (variable in variables) { - for (plot_type in c("Avg_Zscore_vs_lm", "Rank_Avg_Zscore_vs_lm")) { - - # Define x and y variables based on plot type - if (plot_type == "Avg_Zscore_vs_lm") { - x_var <- paste0("Avg_Zscore_", variable) - y_var <- paste0("Z_lm_", variable) - title_suffix <- paste("Avg Zscore vs lm", variable) - } else if (plot_type == "Rank_Avg_Zscore_vs_lm") { - x_var <- paste0(variable, "_Rank") - y_var <- paste0(variable, "_Rank_lm") - title_suffix <- paste("Rank Avg Zscore vs lm", variable) - } - - # Determine y-axis label - if (plot_type == "Avg_Zscore_vs_lm") { - y_label <- paste("Z lm", variable) - } else { - y_label <- paste("Rank lm", variable) - } - - # Determine correlation text (R-squared) - lm_fit <- lm(df_filtered[[y_var]] ~ df_filtered[[x_var]], data = df_filtered) - r_squared <- summary(lm_fit)$r.squared - - # Plot Configuration - configs[[length(configs) + 1]] <- list( - df = df_filtered, - x_var = x_var, - y_var = y_var, - plot_type = "scatter", - title = title_suffix, - annotations = list( - list( - x = 0, - y = 0, - label = paste("R-squared =", round(r_squared, 2)) - ) - ), - sd_band_values = NULL, # Not applicable - shape = 3, - size = 0.1, - add_smooth = TRUE, - lm_line = list(intercept = coef(lm_fit)[1], slope = coef(lm_fit)[2]), - legend_position = "right" - ) + + # Avg ZScore and Rank Avg ZScore Plots for r, L, K, and AUC + for (variable in variables) { + for (plot_type in c("Avg_Zscore_vs_lm", "Rank_Avg_Zscore_vs_lm")) { + # Define x and y variables based on plot type + if (plot_type == "Avg_Zscore_vs_lm") { + x_var <- paste0("Avg_Zscore_", variable) + y_var <- paste0("Z_lm_", variable) + title_suffix <- paste("Avg Zscore vs lm", variable) + } else { + x_var <- paste0(variable, "_Rank") + y_var <- paste0(variable, "_Rank_lm") + title_suffix <- paste("Rank Avg Zscore vs lm", variable) + } + + # Fit linear model + lm_fit <- lm(df_filtered[[y_var]] ~ df_filtered[[x_var]], data = df_filtered) + + # Check for perfect fit + if (summary(lm_fit)$sigma == 0) { + next # Skip this iteration if the fit is perfect + } + + # Calculate R-squared + r_squared <- summary(lm_fit)$r.squared + + # Plot Configuration + configs[[length(configs) + 1]] <- list( + df = df_filtered, + x_var = x_var, + y_var = y_var, + plot_type = "scatter", + title = title_suffix, + annotations = list( + list( + x = 0, + y = 0, + label = paste("R-squared =", round(r_squared, 2)) + ) + ), + sd_band_values = NULL, # Not applicable + shape = 3, + size = 0.1, + add_smooth = TRUE, + lm_line = list(intercept = coef(lm_fit)[1], slope = coef(lm_fit)[2]), + legend_position = "right", + color_var = "Overlap", + x_label = x_var, + y_label = y_var + ) + } } - } - - return(configs) + + return(configs) } generate_correlation_plot_configs <- function(df) { @@ -831,7 +885,7 @@ generate_correlation_plot_configs <- function(df) { config <- list( df = df, x_var = rel$x, - y_var = rel$y, + y_var = rel.y, plot_type = "scatter", title = rel$label, x_label = paste("z-score", gsub("Z_lm_", "", rel$x)), @@ -839,9 +893,19 @@ generate_correlation_plot_configs <- function(df) { annotations = list( list(x = 0, y = 0, label = paste("R-squared =", round(lm_summary$r.squared, 3))) ), - add_smooth = TRUE, # This flags that a geom_smooth layer should be added - lm_line = list(intercept = coef(lm_model)[1], slope = coef(lm_model)[2]), # For direct geom_abline if needed - legend_position = "right" + add_smooth = TRUE, # Add regression line + lm_line = list(intercept = coef(lm_model)[1], slope = coef(lm_model)[2]), + legend_position = "right", + shape = 3, + size = 0.5, + color_var = "Overlap", + rectangles = list( + list( + xmin = -2, xmax = 2, ymin = -2, ymax = 2, + fill = NA, color = "grey20", alpha = 0.1 + ) + ), + cyan_points = TRUE ) configs[[length(configs) + 1]] <- config @@ -850,6 +914,7 @@ generate_correlation_plot_configs <- function(df) { return(configs) } + filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE, rank = FALSE, limits_map = NULL, verbose = TRUE) { @@ -1188,19 +1253,16 @@ main <- function() { ) ) - # message("Generating quality control plots") - # generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) - # generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) - # generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) - # generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) - # generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots) - # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) - # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots) - # generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) - # generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) - - # TODO: Originally this filtered L NA's - # Let's try to avoid for now since stats have already been calculated + message("Generating quality control plots") + generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) + generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) + generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots) + generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) + generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots) + generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) + generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) # Process background strains bg_strains <- c("YDL227C") @@ -1349,8 +1411,7 @@ main <- function() { rank_plot_configs <- generate_rank_plot_configs( df = zscores_interactions_joined_filtered, variables = interaction_vars, - is_lm = FALSE, - adjust = TRUE + is_lm = FALSE ) generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots", plot_configs = rank_plot_configs, grid_layout = list(ncol = 3, nrow = 2)) @@ -1359,8 +1420,7 @@ main <- function() { rank_lm_plot_configs <- generate_rank_plot_configs( df = zscores_interactions_joined_filtered, variables = interaction_vars, - is_lm = TRUE, - adjust = TRUE + is_lm = TRUE ) generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm", plot_configs = rank_lm_plot_configs, grid_layout = list(ncol = 3, nrow = 2)) @@ -1370,8 +1430,6 @@ main <- function() { zscores_interactions_filtered <- zscores_interactions_joined %>% group_by(across(all_of(c("OrfRep", "Gene", "num")))) %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% - ungroup() %>% - rowwise() %>% mutate( lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA, lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA, @@ -1387,8 +1445,7 @@ main <- function() { Z_lm_L <= -2 & Avg_Zscore_L >= 2 ~ "Deletion Suppressor lm, Deletion Enhancer Avg Z score", TRUE ~ "No Effect" ) - ) %>% - ungroup() + ) # Re-rank zscores_interactions_filtered <- filter_data( @@ -1401,8 +1458,7 @@ main <- function() { rank_plot_filtered_configs <- generate_rank_plot_configs( df = zscores_interactions_filtered, variables = interaction_vars, - is_lm = FALSE, - adjust = FALSE + is_lm = FALSE ) message("Generating filtered ranked plots") @@ -1416,8 +1472,7 @@ main <- function() { rank_plot_lm_filtered_configs <- generate_rank_plot_configs( df = zscores_interactions_filtered, variables = interaction_vars, - is_lm = TRUE, - adjust = FALSE + is_lm = TRUE ) generate_and_save_plots( output_dir = out_dir,