From 756075a7e5af5503fe4f06962f6f562182aa128e Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Mon, 2 Sep 2024 01:53:04 -0400 Subject: [PATCH] Remove existing columns before join --- .../apps/r/calculate_interaction_zscores5.R | 116 +++++++++++------- 1 file changed, 69 insertions(+), 47 deletions(-) diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index dca58c42..0f17552b 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -58,8 +58,9 @@ parse_arguments <- function() { args <- parse_arguments() -dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) -dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) +# Should we keep output in exp dirs or combine in the study output dir? +# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE) +# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE) # Define themes and scales theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") { @@ -156,33 +157,25 @@ update_gene_names <- function(df, sgd_gene_list) { } generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) { - # Base ggplot object with conditional y mapping + # Start the ggplot with just the x_var and color_var + plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) + + ggtitle(title) + + theme_publication() + + # If y_var is not NULL, add it to the plot aesthetics if (!is.null(y_var)) { - plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(y_var), color = as.factor(!!sym(color_var)))) - } else { - plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) + plot <- plot + aes(y = !!sym(y_var)) } # Handle different plot types - if (plot_type == "scatter") { - plot <- plot + - geom_point(shape = 3, size = 0.2, position = "jitter") + + plot <- switch(plot_type, + "scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") + stat_summary(fun.data = mean_sdl, geom = "errorbar") + - stat_summary(fun = mean, geom = "point", size = 0.6) - } else if (plot_type == "box") { - plot <- plot + geom_boxplot() - } else if (plot_type == "density") { - plot <- plot + geom_density() - } else if (plot_type == "bar") { - if (!is.null(y_var)) { - plot <- plot + geom_bar(stat = "identity") # Use y aesthetic if provided - } else { - plot <- plot + geom_bar() # Default to counting occurrences - } - } - - # Add titles and labels - plot <- plot + ggtitle(title) + theme_publication() + stat_summary(fun = mean, geom = "point", size = 0.6), + "box" = plot + geom_boxplot(), + "density" = plot + geom_density(), + "bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity")) + ) # Add optional labels if provided if (!is.null(x_label)) plot <- plot + xlab(x_label) @@ -192,6 +185,8 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_ } + + generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) { plots <- list() @@ -277,19 +272,24 @@ save_plots <- function(file_name, plot_list, output_dir) { # Process strains (deletion and reference) process_strains <- function(df, l_within_2sd_k, strain) { df_strains <- data.frame() # Initialize an empty dataframe to store results + + print(names(l_within_2sd_k)) for (concentration in unique(df$conc_num)) { df_temp <- df %>% filter(conc_num == concentration) + + if (concentration > 0) { max_l_theoretical <- l_within_2sd_k %>% filter(conc_num_factor == concentration) %>% pull(L_max) + df_temp <- df_temp %>% mutate( - L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L), - SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM), - L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L) + L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L), # Replace zero values with max theoretical + SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM), # Set SM flag + L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L) # Cap L values ) } df_strains <- bind_rows(df_strains, df_temp) # Append the results of this concentration @@ -298,6 +298,7 @@ process_strains <- function(df, l_within_2sd_k, strain) { return(df_strains) } + calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { # Pull the background means @@ -536,13 +537,9 @@ main <- function() { df_above_tolerance <- df %>% filter(DB == 1) # Set vars above the delta background tolerance to NA - df_na <- df %>% - mutate( - L = ifelse(DB == 1, NA, L), - r = ifelse(DB == 1, NA, r), - AUC = ifelse(DB == 1, NA, AUC), - K = ifelse(DB == 1, NA, K) - ) + df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x))) + + # Remove rows where L is 0 df_no_zeros <- df_na %>% filter(L > 0) # Flag and remove non-finite data, printing affected rows @@ -551,18 +548,18 @@ main <- function() { { if (nrow(.) > 0) { message("Removing non-finite rows:\n") - #print(.) + print(head(., n = 10)) } df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging } # Generate QC PDFs and HTMLs - message("Generating QC plots") - variables <- c("L", "K", "r", "AUC", "delta_bg") - generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) - generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) - generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) - generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) + # message("Generating QC plots") + # variables <- c("L", "K", "r", "AUC", "delta_bg") + # generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) + # generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) + # generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) + # generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) rm(df, df_above_tolerance, df_no_zeros) @@ -578,19 +575,44 @@ main <- function() { # Filter data within 2SD within_2sd_k <- stats_joined %>% filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd)) - + # Filter data outside 2SD outside_2sd_k <- stats_joined %>% filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd)) # Calculate summary statistics for L within and outside 2SD of K message("Calculating summary statistics for L within 2SD of K") + l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + + cols_to_remove <- names(l_within_2sd_k) + cols_to_keep <- c("conc_num", "conc_num_factor") + + within_2sd_k_clean <- within_2sd_k %>% + select(-all_of(setdiff(cols_to_remove, cols_to_keep))) + + l_within_2sd_k_joined <- within_2sd_k_clean %>% + left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor")) + + + + + #l_within_2sd_k_joined <- merge(within_2sd_k, l_within_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE) + print("within_2sd_k") + print(head(within_2sd_k)) + print("l_within_2sd_k") + print(head(l_within_2sd_k)) + print("l_within_2sd_k_joined") + print(head(l_within_2sd_k_joined)) + + quit() + write.csv(l_within_2sd_k, - file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), + file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) message("Calculating summary statistics for for L outside 2SD of K") l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + l_outside_2sd_k_joined <- merge(outside_2sd_k, l_outside_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE) write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) @@ -615,11 +637,11 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) + stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) write.csv(stats_bg, file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) + stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor")) # Filter reference and deletion strains # Formerly X2_RF (reference strain) @@ -633,9 +655,9 @@ main <- function() { mutate(SM = 0) message("Processing reference strain") - reference_strain <- process_strains(df_reference, l_within_2sd_k, strain) + reference_strain <- process_strains(df_reference, l_within_2sd_k_joined, strain) message("Processing deletion strains") - deletion_strains <- process_strains(df_deletion, l_within_2sd_k, strain) + deletion_strains <- process_strains(df_deletion, l_within_2sd_k_joined, strain) # TODO we may need to add "num" to grouping vars