From 8f2f8d28f0cbf42d488b6b4e56f6e992c680110a Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Sat, 7 Sep 2024 21:11:32 -0400 Subject: [PATCH] Post plotting refactor --- .../apps/r/calculate_interaction_zscores5.R | 323 ++++++------------ 1 file changed, 100 insertions(+), 223 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 98086485..2760f67e 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -159,74 +159,48 @@ update_gene_names <- function(df, sgd_gene_list) { # Process either deletion and or reference strain(s) process_strains <- function(df) { - df_strains <- data.frame() # Initialize an empty dataframe to store results + message("Processing strains") - for (concentration in unique(df$conc_num)) { - message("Processing concentration: ", concentration) - df_temp <- df %>% filter(conc_num == concentration) - - if (concentration > 0) { - max_l_theoretical <- df_temp %>% pull(max_L) - - df_temp <- df_temp %>% - mutate( - L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L), # Replace zero values with max theoretical - SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM), # Set SM flag - L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L) # Cap L values - ) - } - # Append the results of this concentration to df_strains - df_strains <- bind_rows(df_strains, df_temp) - } - - return(df_strains) + df %>% + group_by(conc_num) %>% + mutate( + max_l_theoretical = max(max_L, na.rm = TRUE), + L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), + SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM), + L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% + ungroup() } # Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { - # Replace 0 values with NA df <- df %>% - mutate(across(all_of(variables), ~ifelse(. == 0, NA, .))) + mutate(across(all_of(variables), ~ ifelse(. == 0, NA, .))) - print("Head of df for summary stats calculations:") - print(head(df)) - - # Calculate summary statistics, including a single N based on L summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% - reframe( - N = sum(!is.na(L)), # Single N based on L + summarise( + N = sum(!is.na(L)), across(all_of(variables), list( mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE), max = ~max(., na.rm = TRUE), min = ~min(., na.rm = TRUE), sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) # TODO unsure why this is - 1 + se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) ), .names = "{.fn}_{.col}") ) - print("Summary stats:") - print(head(summary_stats)) + df_cleaned <- df %>% + select(-any_of(names(summary_stats))) - # Remove existing stats columns from df if they already exist - stat_columns <- setdiff(names(summary_stats), group_vars) - df_cleaned <- df %>% select(-any_of(stat_columns)) - - # Join the summary stats back to the cleaned original dataframe df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars) - + return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) } # Calculate interaction scores calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { - # if (nrow(df) == 0) { - # message("Dataframe is empty after filtering") - # return(NULL) # Or handle the empty dataframe case as needed - # } - # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1 @@ -318,10 +292,11 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c print("Calculating interaction scores part 2") interaction_scores_all <- interaction_scores %>% group_by(across(all_of(group_vars))) %>% - mutate(lm_score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1], - lm_score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1], - lm_score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1], - lm_score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]) %>% + mutate( + lm_score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1], + lm_score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1], + lm_score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1], + lm_score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]) %>% mutate( Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs, Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs, @@ -338,11 +313,9 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } - generate_plot <- function(df, x_var, y_var = NULL, plot_type = "scatter", color_var = "conc_num", - title = "", x_label = NULL, y_label = NULL, ylim_vals = NULL) { - - # Initialize plot with dynamic x and color variables + title = "", x_label = NULL, y_label = NULL, ylim_vals = NULL, annotations = NULL) { + plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) if (!is.null(y_var)) { @@ -353,12 +326,14 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type = "scatter", color_ y_sd_col <- paste0("sd_", y_var) plot <- plot + geom_point(shape = 3, size = 0.2) + - geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), - ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) + + geom_errorbar(aes( + ymin = !!sym(y_mean_col) - !!sym(y_sd_col), + ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), + width = 0.1) + geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) } - # Add the required plot type (scatter, box, density, bar) + # Add the specified plot type plot <- switch(plot_type, "box" = plot + geom_boxplot(), "density" = plot + geom_density(), @@ -366,20 +341,27 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type = "scatter", color_ plot + geom_point() + geom_smooth(method = "lm", se = FALSE) # Default is scatter ) - # Apply y-axis limits if specified if (!is.null(ylim_vals)) { plot <- plot + coord_cartesian(ylim = ylim_vals) } - # Add title and axis labels plot <- plot + ggtitle(title) + theme_publication() + if (!is.null(x_label)) plot <- plot + xlab(x_label) if (!is.null(y_label)) plot <- plot + ylab(y_label) + if (!is.null(annotations)) { + for (annotation in annotations) { + plot <- plot + annotate("text", + x = annotation$x, + y = annotation$y, + label = annotation$label) + } + } + return(plot) } - generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) { plots <- list() @@ -391,24 +373,16 @@ generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) { message("Generating plots for dataframe") for (config in plot_configs) { - # Generate the plot using the configurations plot <- generate_plot( df = df, x_var = config$x_var, y_var = config$y_var, plot_type = config$plot_type, title = config$title, - ylim_vals = config$ylim_vals + ylim_vals = config$ylim_vals, + annotations = config$annotations ) - - # Add custom annotations if provided - if (!is.null(config$annotations)) { - for (annotation in config$annotations) { - plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label) - } - } - # Store the plot with variable name and plot type plots[[paste0(config$y_var, "_", config$plot_type)]] <- plot } @@ -416,16 +390,12 @@ generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) { save_plots(file_prefix, plots, output_dir) } - - -# Ensure all plots are saved and printed to PDF +# Standardized saving of plots save_plots <- function(file_name, plot_list, output_dir) { - # Save all plots to a single PDF pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9) lapply(plot_list, print) dev.off() - # Save each plot as an interactive HTML file lapply(names(plot_list), function(plot_name) { pgg <- tryCatch({ suppressWarnings(ggplotly(plot_list[[plot_name]]) %>% @@ -436,109 +406,79 @@ save_plots <- function(file_name, plot_list, output_dir) { }) if (!is.null(pgg)) { - saveWidget(pgg, file = file.path(output_dir, paste0(file_name, "_", plot_name, ".html")), selfcontained = TRUE) + saveWidget(pgg, + file = file.path(output_dir, paste0(file_name, "_", plot_name, ".html")), + selfcontained = TRUE) } }) } +interaction_plot_configs <- function(df, variables) { + plot_configs <- list() -interaction_plot_configs <- function(df, variable) { - ylim_vals <- switch(variable, - "L" = c(-65, 65), - "K" = c(-65, 65), - "r" = c(-0.65, 0.65), - "AUC" = c(-6500, 6500), - NULL - ) - - wt_sd <- paste0("WT_sd_", variable) - delta_var <- paste0("Delta_", variable) - z_shift <- paste0("Z_Shift_", variable) - z_lm <- paste0("Z_lm_", variable) - - list( - x_var = "conc_num_factor", - y_var = delta_var, - plot_type = "scatter", - title = paste("Scatter plot for", variable), - ylim_vals = ylim_vals, - annotations = list( - list(x = 1, y = max(ylim_vals) * 0.7, label = paste("ZShift =", round(df[[z_shift]], 2))), - list(x = 1, y = max(ylim_vals) * 0.5, label = paste("Z lm Score =", round(df[[z_lm]], 2))), - list(x = 1, y = min(ylim_vals) * 0.7, label = paste("NG =", df$NG)), - list(x = 1, y = min(ylim_vals) * 0.85, label = paste("DB =", df$DB)), - list(x = 1, y = min(ylim_vals) * 1.1, label = paste("SM =", df$SM)) + for (variable in variables) { + # Define the y-limits based on the variable being plotted + ylim_vals <- switch(variable, + "L" = c(-65, 65), + "K" = c(-65, 65), + "r" = c(-0.65, 0.65), + "AUC" = c(-6500, 6500) ) - ) -} -generate_interaction_plots <- function(df, output_file) { - message("Generating interaction plots") + # Dynamically generate the column names for standard deviation and delta + wt_sd_col <- paste0("WT_sd_", variable) + delta_var <- paste0("Delta_", variable) + z_shift <- paste0("Z_Shift_", variable) + z_lm <- paste0("Z_lm_", variable) - # Variables to be plotted - variables <- c("L", "K", "r", "AUC") - ylims <- list( - L = c(0, 160), - K = c(-20, 160), - r = c(0, 1), - AUC = c(0, 12500) - ) - - plot_list <- list() - - # Generate plots for each variable using the existing plotting function - for (var in variables) { - plot <- generate_plot( - df = df, + # Set annotations for ZShift, Z lm Score, NG, DB, SM + annotations <- list( + list(x = 1, y = ifelse(variable == "L", 45, ifelse(variable == "K", 45, + ifelse(variable == "r", 0.45, 4500))), label = paste("ZShift =", round(df[[z_shift]], 2))), + list(x = 1, y = ifelse(variable == "L", 25, ifelse(variable == "K", 25, + ifelse(variable == "r", 0.25, 2500))), label = paste("lm ZScore =", round(df[[z_lm]], 2))), + list(x = 1, y = ifelse(variable == "L", -25, ifelse(variable == "K", -25, + ifelse(variable == "r", -0.25, -2500))), label = paste("NG =", df$NG)), + list(x = 1, y = ifelse(variable == "L", -35, ifelse(variable == "K", -35, + ifelse(variable == "r", -0.35, -3500))), label = paste("DB =", df$DB)), + list(x = 1, y = ifelse(variable == "L", -45, ifelse(variable == "K", -45, + ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM)) + ) + + # Define the configuration for each variable (plot type, limits, labels) + plot_configs[[variable]] <- list( x_var = "conc_num_factor", - y_var = var, + y_var = delta_var, plot_type = "scatter", - title = paste("Scatter RF for", var, "with SD"), - ylim_vals = ylims[[var]] - ) + - annotate("text", x = -0.25, y = ifelse(var == "L", 10, ifelse(var == "K", -5, 0.9)), label = "NG") + - annotate("text", x = -0.25, y = ifelse(var == "L", 5, ifelse(var == "K", -12.5, 0.8)), label = "DB") + - annotate("text", x = -0.25, y = ifelse(var == "L", 0, ifelse(var == "K", -20, 0.7)), label = "SM") + - annotate("text", x = unique(df$conc_num_factor), y = ifelse(var == "L", 10, ifelse(var == "K", -5, 0.9)), label = df$NG) + - annotate("text", x = unique(df$conc_num_factor), y = ifelse(var == "L", 5, ifelse(var == "K", -12.5, 0.8)), label = df$DB) + - annotate("text", x = unique(df$conc_num_factor), y = ifelse(var == "L", 0, ifelse(var == "K", -20, 0.7)), label = df$SM) - - plot_list[[var]] <- plot + title = paste(df$OrfRep[1], df$Gene[1], sep = " "), + ylim_vals = ylim_vals, + annotations = annotations, + error_bar = list( + ymin = 0 - (2 * df[[wt_sd_col]]), + ymax = 0 + (2 * df[[wt_sd_col]]) + ), + x_breaks = unique(df$conc_num_factor), + x_labels = unique(as.character(df$conc_num)), + x_label = unique(df$Drug[1]) + ) } - # Save plots in a PDF - pdf(output_file, width = 16, height = 16) - grid.arrange( - plot_list$L, plot_list$K, - plot_list$r, plot_list$AUC, - ncol = 2, nrow = 2 - ) - dev.off() + return(plot_configs) } -generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) { - lm_summaries <- lapply(lm_list, summary) - - # Define plot titles and annotation based on R-squared values from the lm_list - plot_configs <- lapply(seq_along(lm_list), function(i) { +correlation_plot_configs <- function(df, lm_list, lm_summaries) { + lapply(seq_along(lm_list), function(i) { r_squared <- round(lm_summaries[[i]]$r.squared, 3) - plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC", - "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC") - list( x_var = names(lm_list)[i][1], y_var = names(lm_list)[i][2], plot_type = "scatter", - title = plot_titles[i], + title = paste("Correlation between", names(lm_list)[i][1], "and", names(lm_list)[i][2]), annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared))) ) }) - - # Generate and save the plots using the new system - generate_and_save_plots(df_na_rm, output_dir, "Correlation_CPPs", plot_configs) } - # Adjust missing values and calculate ranks adjust_missing_and_rank <- function(df, variables) { @@ -553,7 +493,6 @@ adjust_missing_and_rank <- function(df, variables) { return(df) } - generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "", x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") { plot_list <- list() @@ -590,40 +529,6 @@ generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", save_plots(file_prefix, plot_list, output_dir) } -# Function to generate rank plots for the provided dataframe -# create_ranked_plots <- function(df, output_dir) { - -# # List of variables for which we need to generate rank plots -# variables <- c("L", "K", "r", "AUC") - -# # Adjust missing values and calculate ranks -# df_adjusted <- adjust_missing_and_rank(df, variables) - -# # Generate rank plots -# for (var in variables) { -# plot_rank_avg <- generate_plot( -# df = df_adjusted, -# x_var = paste0(var, "_Rank"), -# y_var = paste0("Avg_Zscore_", var), -# plot_type = "scatter", -# title = paste("Average Z score vs Rank for", var) -# ) - -# plot_rank_lm <- generate_plot( -# df = df_adjusted, -# x_var = paste0(var, "_Rank_lm"), -# y_var = paste0("Z_lm_", var), -# plot_type = "scatter", -# title = paste("Interaction Z score vs Rank for", var) -# ) - -# save_plots(paste0("RankPlots_", var), list( -# plot_rank_avg = plot_rank_avg, -# plot_rank_lm = plot_rank_lm -# ), output_dir) -# } -# } - main <- function() { lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] @@ -807,8 +712,10 @@ main <- function() { write.csv(zscores_interactions, file = file.path(out_dir, "ZScores_Interaction.csv"), row.names = FALSE) # Create interaction plots - generate_interaction_plots(df_reference, output_file = file.path(output_dir, "RF_InteractionPlots.pdf")) - generate_interaction_plots(df_deletion, output_file = file.path(output_dir, "InteractionPlots.pdf")) + deletion_plot_configs <- interaction_plot_configs(df_reference, variables) + generate_and_save_plots(zscores_calculations, out_dir, "RF_InteractionPlots", deletion_plot_configs) + deletion_plot_configs <- interaction_plot_configs(df_deletion, variables) + generate_and_save_plots(zscores_calculations, out_dir, "InteractionPlots", deletion_plot_configs) # Define conditions for enhancers and suppressors # TODO Add to study config file? @@ -861,24 +768,13 @@ main <- function() { # Interaction plots for reference strain variables <- c("L", "K", "r", "AUC") - reference_plot_configs <- lapply(variables, function(var) { - interaction_plot_configs(zscores_calculations_reference, var) - }) - - generate_and_save_plots(zscores_calculations_reference, out_dir, "RF_InteractionPlots", reference_plot_configs) - - # Interaction plots for deletion strains - deletion_plot_configs <- lapply(variables, function(var) { - interaction_plot_configs(zscores_calculations, var) - }) + reference_plot_configs <- interaction_plot_configs(df_reference, variables) + generate_and_save_plots(zscores_calculation_reference, out_dir, "RF_InteractionPlots", reference_plot_configs) + deletion_plot_configs <- interaction_plot_configs(df_deletion, variables) generate_and_save_plots(zscores_calculations, out_dir, "InteractionPlots", deletion_plot_configs) - # Apply filtering to remove NA values before further analysis - zscores_interactions_filtered <- zscores_interactions %>% - filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) - - + # Correlation plots lm_list <- list( lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered), lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered), @@ -888,25 +784,9 @@ main <- function() { lm(Z_lm_AUC ~ Z_lm_r, data = zscores_interactions_filtered) ) - # Generate cpp correlation plots - correlation_plot_config <- list( - list(x_var = "Z_lm_L", y_var = "Z_lm_K", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_K", - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), - list(x_var = "Z_lm_L", y_var = "Z_lm_r", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_r", - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), - list(x_var = "Z_lm_L", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_AUC", - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), - list(x_var = "Z_lm_K", y_var = "Z_lm_r", plot_type = "scatter", title = "Correlation between Z_lm_K and Z_lm_r", - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), - list(x_var = "Z_lm_K", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_K and Z_lm_AUC", - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ), - list(x_var = "Z_lm_r", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_r and Z_lm_AUC", - annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ) - ) + lm_summaries <- lapply(lm_list, summary) + correlation_plot_config <- correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries) generate_and_save_plots(zscores_interactions_filtered, output_dir, "CorrelationPlots", correlation_plot_config) - - # Generate cpp correlation plots - #generate_cpp_correlation_plots(zscores_interactions_filtered, lm_list, out_dir) # Generate ranked plots rank_plot_config <- list( @@ -917,9 +797,6 @@ main <- function() { ) # Generate and save rank plots using the existing plotting framework generate_and_save_plots(zscores_interactions_filtered, output_dir, "RankPlots", rank_plot_config) - - - }) }) }