diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index a25102ec..98086485 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -9,6 +9,7 @@ suppressMessages({ }) options(warn = 2, max.print = 1000) +options(width = 10000) # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes) soft_limit <- 30 * 1024 * 1024 * 1024 @@ -156,7 +157,7 @@ update_gene_names <- function(df, sgd_gene_list) { return(df) } -# Process strains (deletion and reference) +# Process either deletion and or reference strain(s) process_strains <- function(df) { df_strains <- data.frame() # Initialize an empty dataframe to store results @@ -187,6 +188,9 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c df <- df %>% mutate(across(all_of(variables), ~ifelse(. == 0, NA, .))) + print("Head of df for summary stats calculations:") + print(head(df)) + # Calculate summary statistics, including a single N based on L summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% @@ -195,13 +199,16 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c across(all_of(variables), list( mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE), - max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), - sd = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE), 0), # If N == 1, sd is set to 0 - se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), 0) # If N == 1, se is set to 0 + max = ~max(., na.rm = TRUE), + min = ~min(., na.rm = TRUE), + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) # TODO unsure why this is - 1 ), .names = "{.fn}_{.col}") ) + print("Summary stats:") + print(head(summary_stats)) + # Remove existing stats columns from df if they already exist stat_columns <- setdiff(names(summary_stats), group_vars) df_cleaned <- df %>% select(-any_of(stat_columns)) @@ -215,10 +222,10 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c # Calculate interaction scores calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { - if (nrow(df) == 0) { - message("Dataframe is empty after filtering") - return(NULL) # Or handle the empty dataframe case as needed - } + # if (nrow(df) == 0) { + # message("Dataframe is empty after filtering") + # return(NULL) # Or handle the empty dataframe case as needed + # } # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) @@ -304,6 +311,9 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c ) %>% ungroup() + print("Interaction scores:") + print(head(interaction_scores)) + # Calculate linear models and interaction scores per gene print("Calculating interaction scores part 2") interaction_scores_all <- interaction_scores %>% @@ -329,6 +339,107 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c } +generate_plot <- function(df, x_var, y_var = NULL, plot_type = "scatter", color_var = "conc_num", + title = "", x_label = NULL, y_label = NULL, ylim_vals = NULL) { + + # Initialize plot with dynamic x and color variables + plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) + + if (!is.null(y_var)) { + plot <- plot + aes(y = !!sym(y_var)) + + # Add scatter points with summary stats if `y_var` is present + y_mean_col <- paste0("mean_", y_var) + y_sd_col <- paste0("sd_", y_var) + + plot <- plot + geom_point(shape = 3, size = 0.2) + + geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) + } + + # Add the required plot type (scatter, box, density, bar) + plot <- switch(plot_type, + "box" = plot + geom_boxplot(), + "density" = plot + geom_density(), + "bar" = plot + geom_bar(stat = "identity"), + plot + geom_point() + geom_smooth(method = "lm", se = FALSE) # Default is scatter + ) + + # Apply y-axis limits if specified + if (!is.null(ylim_vals)) { + plot <- plot + coord_cartesian(ylim = ylim_vals) + } + + # Add title and axis labels + plot <- plot + ggtitle(title) + theme_publication() + if (!is.null(x_label)) plot <- plot + xlab(x_label) + if (!is.null(y_label)) plot <- plot + ylab(y_label) + + return(plot) +} + + +generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) { + plots <- list() + + if (nrow(df) == 0) { + message("The dataframe is empty, skipping plots") + return() + } + + message("Generating plots for dataframe") + + for (config in plot_configs) { + # Generate the plot using the configurations + plot <- generate_plot( + df = df, + x_var = config$x_var, + y_var = config$y_var, + plot_type = config$plot_type, + title = config$title, + ylim_vals = config$ylim_vals + ) + + # Add custom annotations if provided + if (!is.null(config$annotations)) { + for (annotation in config$annotations) { + plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label) + } + } + + # Store the plot with variable name and plot type + plots[[paste0(config$y_var, "_", config$plot_type)]] <- plot + } + + # Save plots to PDF and HTML + save_plots(file_prefix, plots, output_dir) +} + + + +# Ensure all plots are saved and printed to PDF +save_plots <- function(file_name, plot_list, output_dir) { + # Save all plots to a single PDF + pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) + lapply(plot_list, print) + dev.off() + + # Save each plot as an interactive HTML file + lapply(names(plot_list), function(plot_name) { + pgg <- tryCatch({ + suppressWarnings(ggplotly(plot_list[[plot_name]]) %>% + layout(legend = list(orientation = "h"))) + }, error = function(e) { + message("Error generating plot: ", plot_name) + return(NULL) + }) + + if (!is.null(pgg)) { + saveWidget(pgg, file = file.path(output_dir, paste0(file_name, "_", plot_name, ".html")), selfcontained = TRUE) + } + }) +} interaction_plot_configs <- function(df, variable) { @@ -361,102 +472,6 @@ interaction_plot_configs <- function(df, variable) { ) } -generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", - title, x_label = NULL, y_label = NULL, ylim_vals = NULL) { - - # Use tidy evaluation with aes() and !!sym() for dynamic column names - plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) - - if (!is.null(y_var)) { - plot <- plot + aes(y = !!sym(y_var)) - } - - # Set up the plot based on the requested plot type - plot <- switch(plot_type, - "scatter" = plot + geom_point() + geom_smooth(method = "lm", se = FALSE), - "box" = plot + geom_boxplot(), - "density" = plot + geom_density(), - "bar" = plot + geom_bar(), - plot # Default: return the plot as is - ) - - if (!is.null(ylim_vals)) { - plot <- plot + coord_cartesian(ylim = ylim_vals) - } - - # Add titles and labels if available - plot <- plot + ggtitle(title) + theme_publication() - if (!is.null(x_label)) plot <- plot + xlab(x_label) - if (!is.null(y_label)) plot <- plot + ylab(y_label) - - return(plot) -} - -generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) { - plots <- list() - - if (nrow(df) == 0) { - message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots") - return() - } - - message("Generating plots for \"", deparse(substitute(df)), "\" dataframe") - - for (config in plot_configs) { - plot <- generate_plot( - df = df, - x_var = config$x_var, - y_var = config$y_var, - plot_type = config$plot_type, - title = config$title, - ylim_vals = config$ylim_vals - ) - - # If custom annotations for interaction plots are required - if (!is.null(config$annotations)) { - for (annotation in config$annotations) { - plot <- plot + - annotate( - "text", x = annotation$x, y = annotation$y, label = annotation$label - ) - } - } - - # Store the plot with the variable name and plot type - plots[[paste0(config$y_var, "_", config$plot_type)]] <- plot - } - - save_plots(file_prefix, plots, output_dir) -} - -# Ensure all plots are saved and printed to PDF -save_plots <- function(file_name, plot_list, output_dir) { - # Save to PDF - pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) - lapply(plot_list, function(plot) { - print(plot) - }) - dev.off() - - # Save to HTML with horizontal legend orientation - lapply(names(plot_list), function(plot_name) { - message("Generating plot: ", plot_name) - pgg <- tryCatch({ - suppressWarnings(ggplotly(plot_list[[plot_name]]) %>% - layout(legend = list(orientation = "h"))) - }, error = function(e) { - message("Error in plot: ", plot_name, "\n", e) - return(NULL) - }) - if (!is.null(pgg)) { - saveWidget(pgg, - file = file.path(output_dir, - paste0(file_name, "_", plot_name, ".html")), - selfcontained = TRUE) - } - }) -} - generate_interaction_plots <- function(df, output_file) { message("Generating interaction plots") @@ -503,21 +518,27 @@ generate_interaction_plots <- function(df, output_file) { generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) { lm_summaries <- lapply(lm_list, summary) - plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC", - "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC") - plot_list <- lapply(seq_along(lm_list), function(i) { - ggplot(df_na_rm, aes_string(x = names(lm_list)[i][1], y = names(lm_list)[i][2])) + - geom_point(shape = 3, color = "gray70") + - geom_smooth(method = "lm", color = "tomato3") + - ggtitle(plot_titles[i]) + - annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[i]]$r.squared, 3))) + - theme_Publication_legend_right() + # Define plot titles and annotation based on R-squared values from the lm_list + plot_configs <- lapply(seq_along(lm_list), function(i) { + r_squared <- round(lm_summaries[[i]]$r.squared, 3) + plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC", + "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC") + + list( + x_var = names(lm_list)[i][1], + y_var = names(lm_list)[i][2], + plot_type = "scatter", + title = plot_titles[i], + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared))) + ) }) - save_plots("Correlation_CPPs", plot_list, output_dir) + # Generate and save the plots using the new system + generate_and_save_plots(df_na_rm, output_dir, "Correlation_CPPs", plot_configs) } + # Adjust missing values and calculate ranks adjust_missing_and_rank <- function(df, variables) { @@ -570,39 +591,38 @@ generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", } # Function to generate rank plots for the provided dataframe -create_ranked_plots <- function(df, output_dir) { +# create_ranked_plots <- function(df, output_dir) { - # List of variables for which we need to generate rank plots - variables <- c("L", "K", "r", "AUC") +# # List of variables for which we need to generate rank plots +# variables <- c("L", "K", "r", "AUC") - # Adjust missing values and calculate ranks - df_adjusted <- adjust_missing_and_rank(df, variables) +# # Adjust missing values and calculate ranks +# df_adjusted <- adjust_missing_and_rank(df, variables) - # Generate rank plots for Avg_Zscore and Z_lm - for (var in variables) { - plot_rank_avg <- generate_plot( - df = df_adjusted, - x_var = paste0(var, "_Rank"), - y_var = paste0("Avg_Zscore_", var), - plot_type = "scatter", - title = paste("Average Z score vs Rank for", var) - ) +# # Generate rank plots +# for (var in variables) { +# plot_rank_avg <- generate_plot( +# df = df_adjusted, +# x_var = paste0(var, "_Rank"), +# y_var = paste0("Avg_Zscore_", var), +# plot_type = "scatter", +# title = paste("Average Z score vs Rank for", var) +# ) - plot_rank_lm <- generate_plot( - df = df_adjusted, - x_var = paste0(var, "_Rank_lm"), - y_var = paste0("Z_lm_", var), - plot_type = "scatter", - title = paste("Interaction Z score vs Rank for", var) - ) +# plot_rank_lm <- generate_plot( +# df = df_adjusted, +# x_var = paste0(var, "_Rank_lm"), +# y_var = paste0("Z_lm_", var), +# plot_type = "scatter", +# title = paste("Interaction Z score vs Rank for", var) +# ) - # Save the plots for Avg_Zscore and Z_lm - save_plots(paste0("RankPlots_", var), list( - plot_rank_avg = plot_rank_avg, - plot_rank_lm = plot_rank_lm - ), output_dir) - } -} +# save_plots(paste0("RankPlots_", var), list( +# plot_rank_avg = plot_rank_avg, +# plot_rank_lm = plot_rank_lm +# ), output_dir) +# } +# } main <- function() { lapply(names(args$experiments), function(exp_name) { @@ -622,8 +642,6 @@ main <- function() { # QC steps and filtering df_above_tolerance <- df %>% filter(DB == 1) - df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .))) - df_no_zeros <- df_na %>% filter(L > 0) # Calculate the half-medians for `L` and `K` for rows above tolerance L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2 @@ -632,7 +650,24 @@ main <- function() { # Get the number of rows that are above tolerance rows_to_remove <- nrow(df_above_tolerance) - # Additional filtering for non-finite values in df_na + # Set L, r, K, and AUC to NA for rows that are above tolerance + df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .))) + + # Calculate summary statistics for all strains, including both background and the deletions + message("Calculating summary statistics for all strains") + variables <- c("L", "K", "r", "AUC") + ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + summary_stats <- ss$summary_stats + df_na_stats <- ss$df_with_stats + write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) + + print("Summary stats:") + print(head(summary_stats), width = 200) + + # Remove rows with 0 values in L + df_no_zeros <- df_na %>% filter(L > 0) + + # Additional filtering for non-finite values df_na_filtered <- df_na %>% filter(if_any(c(L), ~ !is.finite(.))) %>% { @@ -647,7 +682,7 @@ main <- function() { qc_plot_configs <- list( list(x_var = "L", y_var = "K", plot_type = "scatter", title = "Raw L vs K before QC", ylim_vals = NULL), list(x_var = "delta_bg", y_var = NULL, plot_type = "density", title = "Density plot for Delta Background", ylim_vals = NULL), - list(x_var = "delta_bg", y_var = NULL, plot_type = "bar", title = "Bar plot for Delta Background", ylim_vals = NULL) + list(x_var = "as.factor(delta_bg)", y_var = NULL, plot_type = "bar", title = "Bar plot for Delta Background", ylim_vals = NULL) ) above_tolerance_plot_configs <- list( @@ -659,7 +694,16 @@ main <- function() { ) no_zeros_plot_configs <- list( - list(x_var = "L", y_var = "K", plot_type = "scatter", title = "L vs K for Non-Zero L Values", ylim_vals = NULL) + list(x_var = "scan", y_var = "L", plot_type = "scatter", title = "L vs Scan for Non-Zero L Values"), + list(x_var = "scan", y_var = "K", plot_type = "scatter", title = "K vs Scan for Non-Zero L Values"), + list(x_var = "scan", y_var = "r", plot_type = "scatter", title = "r vs Scan for Non-Zero L Values"), + list(x_var = "scan", y_var = "AUC", plot_type = "scatter", title = "AUC vs Scan for Non-Zero L Values"), + list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter", title = "Delta background vs Scan for Non-Zero L Values"), + list(x_var = "as.factor(scan)", y_var = "L", plot_type = "box", title = "L vs Scan for Non-Zero L Values"), + list(x_var = "as.factor(scan)", y_var = "K", plot_type = "box", title = "K vs Scan for Non-Zero L Values"), + list(x_var = "as.factor(scan)", y_var = "r", plot_type = "box", title = "r vs Scan for Non-Zero L Values"), + list(x_var = "as.factor(scan)", y_var = "AUC", plot_type = "box", title = "AUC vs Scan for Non-Zero L Values"), + list(x_var = "as.factor(scan)", y_var = "delta_bg", plot_type = "box", title = "Delta background vs Scan for Non-Zero L Values") ) # Generate and save plots for each QC step @@ -672,17 +716,6 @@ main <- function() { # Clean up rm(df, df_above_tolerance, df_no_zeros) - # Calculate summary statistics - message("Calculating summary statistics for all strains") - variables <- c("L", "K", "r", "AUC") - ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) - summary_stats <- ss$summary_stats - df_na_stats <- ss$df_with_stats - write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - - print("Summary stats:") - print(head(summary_stats), width = 200) - # TODO: Originally this filtered L NA's # Let's try to avoid for now since stats have already been calculated @@ -695,6 +728,7 @@ main <- function() { # Summary statistics for within and outside 2SD of K message("Calculating summary statistics for L within 2SD of K") + # TODO We're omitting the original z_max calculation, not sure if needed? ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) l_within_2sd_k_stats <- ss$summary_stats df_na_l_within_2sd_k_stats <- ss$df_with_stats @@ -844,8 +878,7 @@ main <- function() { zscores_interactions_filtered <- zscores_interactions %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) - # Generate summary and correlation plots - generate_summary_plots(df, out_dir) + lm_list <- list( lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered), lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered), @@ -854,12 +887,39 @@ main <- function() { lm(Z_lm_AUC ~ Z_lm_K, data = zscores_interactions_filtered), lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered) ) - + # Generate cpp correlation plots - generate_cpp_correlation_plots(zscores_interactions_filtered, lm_list, out_dir) + correlation_plot_config <- list( + list(x_var = "Z_lm_L", y_var = "Z_lm_K", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_K", + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), + list(x_var = "Z_lm_L", y_var = "Z_lm_r", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_r", + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), + list(x_var = "Z_lm_L", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_AUC", + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), + list(x_var = "Z_lm_K", y_var = "Z_lm_r", plot_type = "scatter", title = "Correlation between Z_lm_K and Z_lm_r", + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), + list(x_var = "Z_lm_K", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_K and Z_lm_AUC", + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), + list(x_var = "Z_lm_r", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_r and Z_lm_AUC", + annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ) + ) + generate_and_save_plots(zscores_interactions_filtered, output_dir, "CorrelationPlots", correlation_plot_config) + + # Generate cpp correlation plots + #generate_cpp_correlation_plots(zscores_interactions_filtered, lm_list, out_dir) # Generate ranked plots - create_ranked_plots(zscores_interactions_filtered, out_dir) + rank_plot_config <- list( + list(x_var = "L_Rank", y_var = "Avg_Zscore_L", plot_type = "scatter", title = "Rank vs Avg Z score for L"), + list(x_var = "K_Rank", y_var = "Avg_Zscore_K", plot_type = "scatter", title = "Rank vs Avg Z score for K"), + list(x_var = "r_Rank", y_var = "Avg_Zscore_r", plot_type = "scatter", title = "Rank vs Avg Z score for r"), + list(x_var = "AUC_Rank", y_var = "Avg_Zscore_AUC", plot_type = "scatter", title = "Rank vs Avg Z score for AUC") + ) + # Generate and save rank plots using the existing plotting framework + generate_and_save_plots(zscores_interactions_filtered, output_dir, "RankPlots", rank_plot_config) + + + }) }) }