From b23c6dafefd7f6d14a29656cb121959bf82eaa8d Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Thu, 3 Oct 2024 20:58:28 -0400 Subject: [PATCH] Fix interaction groupings --- .../apps/r/calculate_interaction_zscores.R | 390 ++++++++++-------- 1 file changed, 219 insertions(+), 171 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 5c460139..d63d0f25 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -163,23 +163,19 @@ load_and_filter_data <- function(easy_results_file, sd = 3) { return(df) } -# Update Gene names using the SGD gene list update_gene_names <- function(df, sgd_gene_list) { - # Load SGD gene list - genes <- read.delim(file = sgd_gene_list, - quote = "", header = FALSE, + genes <- read.delim(file = sgd_gene_list, quote = "", header = FALSE, colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))) - # Create a named vector for mapping ORF to GeneName - gene_map <- setNames(genes$V5, genes$V4) - # Vectorized match to find the GeneName from gene_map - mapped_genes <- gene_map[df$ORF] - # Replace NAs in mapped_genes with original Gene names (preserves existing Gene names if ORF is not found) - updated_genes <- ifelse(is.na(mapped_genes) | df$OrfRep == "YDL227C", df$Gene, mapped_genes) - # Ensure Gene is not left blank or incorrectly updated to "OCT1" + gene_map <- setNames(genes$V5, genes$V4) # ORF to GeneName mapping + df <- df %>% - mutate(Gene = ifelse(updated_genes == "" | updated_genes == "OCT1", OrfRep, updated_genes)) - + mutate( + mapped_genes = gene_map[ORF], + Gene = if_else(is.na(mapped_genes) | OrfRep == "YDL227C", Gene, mapped_genes), + Gene = if_else(Gene == "" | Gene == "OCT1", OrfRep, Gene) # Handle invalid names + ) + return(df) } @@ -203,61 +199,82 @@ calculate_summary_stats <- function(df, variables, group_vars) { ) # Create a cleaned version of df that doesn't overlap with summary_stats - cleaned_df <- df %>% + df_cleaned <- df %>% select(-any_of(setdiff(intersect(names(df), names(summary_stats)), group_vars))) - df_joined <- left_join(cleaned_df, summary_stats, by = group_vars) + df_joined <- left_join(df_cleaned, summary_stats, by = group_vars) return(list(summary_stats = summary_stats, df_with_stats = df_joined)) } -calculate_interaction_scores <- function(df, df_bg, group_vars) { +calculate_interaction_scores <- function(df, df_bg, group_vars, overlap_threshold = 2) { max_conc <- max(as.numeric(df$conc_num_factor), na.rm = TRUE) - - # Include background statistics per concentration - bg_stats <- df_bg %>% - group_by(conc_num, conc_num_factor) %>% + total_conc_num <- length(unique(df$conc_num)) + + # Calculate WT statistics from df_bg + wt_stats <- df_bg %>% + filter(conc_num == 0) %>% # use the zero drug concentration background summarise( - WT_L = first(mean_L), - WT_K = first(mean_K), - WT_r = first(mean_r), - WT_AUC = first(mean_AUC), - WT_sd_L = first(sd_L), - WT_sd_K = first(sd_K), - WT_sd_r = first(sd_r), - WT_sd_AUC = first(sd_AUC), + WT_L = mean(mean_L, na.rm = TRUE), + WT_sd_L = mean(sd_L, na.rm = TRUE), + WT_K = mean(mean_K, na.rm = TRUE), + WT_sd_K = mean(sd_K, na.rm = TRUE), + WT_r = mean(mean_r, na.rm = TRUE), + WT_sd_r = mean(sd_r, na.rm = TRUE), + WT_AUC = mean(mean_AUC, na.rm = TRUE), + WT_sd_AUC = mean(sd_AUC, na.rm = TRUE) + ) + + # Add WT statistics to df + df <- df %>% + mutate( + WT_L = wt_stats$WT_L, + WT_sd_L = wt_stats$WT_sd_L, + WT_K = wt_stats$WT_K, + WT_sd_K = wt_stats$WT_sd_K, + WT_r = wt_stats$WT_r, + WT_sd_r = wt_stats$WT_sd_r, + WT_AUC = wt_stats$WT_AUC, + WT_sd_AUC = wt_stats$WT_sd_AUC + ) + + # Compute mean values at zero concentration + mean_L_zero_df <- df %>% + filter(conc_num == 0) %>% + group_by(across(all_of(group_vars))) %>% + summarise( + mean_L_zero = mean(mean_L, na.rm = TRUE), + mean_K_zero = mean(mean_K, na.rm = TRUE), + mean_r_zero = mean(mean_r, na.rm = TRUE), + mean_AUC_zero = mean(mean_AUC, na.rm = TRUE), .groups = "drop" ) - - # Calculate total number of concentrations - total_conc_num <- length(unique(df$conc_num)) - - # Join background statistics to df - calculations <- df %>% - left_join(bg_stats, by = c("conc_num", "conc_num_factor")) - - # Perform calculations - calculations <- calculations %>% - group_by(across(all_of(group_vars))) %>% + + # Join mean_L_zero_df to df + df <- df %>% + left_join(mean_L_zero_df, by = group_vars) + + # Calculate Raw Shifts and Z Shifts + df <- df %>% mutate( - N = n(), - NG = sum(NG, na.rm = TRUE), - DB = sum(DB, na.rm = TRUE), - SM = sum(SM, na.rm = TRUE), - num_non_removed_concs = n_distinct(conc_num[DB != 1]), - - # Raw shifts - Raw_Shift_L = mean_L - WT_L, - Raw_Shift_K = mean_K - WT_K, - Raw_Shift_r = mean_r - WT_r, - Raw_Shift_AUC = mean_AUC - WT_AUC, - - # Z shifts + Raw_Shift_L = mean_L_zero - WT_L, + Raw_Shift_K = mean_K_zero - WT_K, + Raw_Shift_r = mean_r_zero - WT_r, + Raw_Shift_AUC = mean_AUC_zero - WT_AUC, Z_Shift_L = Raw_Shift_L / WT_sd_L, Z_Shift_K = Raw_Shift_K / WT_sd_K, Z_Shift_r = Raw_Shift_r / WT_sd_r, - Z_Shift_AUC = Raw_Shift_AUC / WT_sd_AUC, + Z_Shift_AUC = Raw_Shift_AUC / WT_sd_AUC + ) + + calculations <- df %>% + group_by(across(all_of(group_vars))) %>% + mutate( + NG_sum = sum(NG, na.rm = TRUE), + DB_sum = sum(DB, na.rm = TRUE), + SM_sum = sum(SM, na.rm = TRUE), + num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, # Expected values Exp_L = WT_L + Raw_Shift_L, @@ -270,39 +287,33 @@ calculate_interaction_scores <- function(df, df_bg, group_vars) { Delta_K = mean_K - Exp_K, Delta_r = mean_r - Exp_r, Delta_AUC = mean_AUC - Exp_AUC, - - # Adjust Deltas for NG and SM + + # Adjust deltas for NG and SM Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L), - # Z-scores + # Calculate Z-scores Zscore_L = Delta_L / WT_sd_L, Zscore_K = Delta_K / WT_sd_K, Zscore_r = Delta_r / WT_sd_r, Zscore_AUC = Delta_AUC / WT_sd_AUC ) %>% - ungroup() - - # Fit linear models within each group - calculations <- calculations %>% - group_by(across(all_of(group_vars))) %>% group_modify(~ { - data <- .x - # Fit linear models - lm_L <- lm(Delta_L ~ conc_num_factor, data = data) - lm_K <- lm(Delta_K ~ conc_num_factor, data = data) - lm_r <- lm(Delta_r ~ conc_num_factor, data = data) - lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = data) - data <- data %>% + # Perform linear models + lm_L <- lm(Delta_L ~ conc_num_factor, data = .x) + lm_K <- lm(Delta_K ~ conc_num_factor, data = .x) + lm_r <- lm(Delta_r ~ conc_num_factor, data = .x) + lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = .x) + + .x %>% mutate( lm_intercept_L = coef(lm_L)[1], lm_slope_L = coef(lm_L)[2], R_Squared_L = summary(lm_L)$r.squared, lm_Score_L = max_conc * lm_slope_L + lm_intercept_L, - # Repeat for K, r, and AUC lm_intercept_K = coef(lm_K)[1], lm_slope_K = coef(lm_K)[2], R_Squared_K = summary(lm_K)$r.squared, @@ -316,11 +327,10 @@ calculate_interaction_scores <- function(df, df_bg, group_vars) { R_Squared_AUC = summary(lm_AUC)$r.squared, lm_Score_AUC = max_conc * lm_slope_AUC + lm_intercept_AUC ) - return(data) }) %>% ungroup() - - # Compute lm means and sds across all data without grouping + + # Summary statistics for lm scores lm_means_sds <- calculations %>% summarise( lm_mean_L = mean(lm_Score_L, na.rm = TRUE), @@ -330,26 +340,39 @@ calculate_interaction_scores <- function(df, df_bg, group_vars) { lm_mean_r = mean(lm_Score_r, na.rm = TRUE), lm_sd_r = sd(lm_Score_r, na.rm = TRUE), lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE), - lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE) + lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE), + .groups = "drop" ) - - # Apply global lm means and sds to calculate Z_lm_* + calculations <- calculations %>% mutate( - Z_lm_L = (lm_Score_L - lm_means_sds$lm_mean_L) / lm_means_sds$lm_sd_L, - Z_lm_K = (lm_Score_K - lm_means_sds$lm_mean_K) / lm_means_sds$lm_sd_K, - Z_lm_r = (lm_Score_r - lm_means_sds$lm_mean_r) / lm_means_sds$lm_sd_r, - Z_lm_AUC = (lm_Score_AUC - lm_means_sds$lm_mean_AUC) / lm_means_sds$lm_sd_AUC + lm_mean_L = lm_means_sds$lm_mean_L, + lm_sd_L = lm_means_sds$lm_sd_L, + lm_mean_K = lm_means_sds$lm_mean_K, + lm_sd_K = lm_means_sds$lm_sd_K, + lm_mean_r = lm_means_sds$lm_mean_r, + lm_sd_r = lm_means_sds$lm_sd_r, + lm_mean_AUC = lm_means_sds$lm_mean_AUC, + lm_sd_AUC = lm_means_sds$lm_sd_AUC ) - # Build interactions data frame + # Continue with gene Z-scores and interactions + calculations <- calculations %>% + mutate( + Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L, + Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K, + Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, + Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC + ) + + # Build summary stats (interactions) interactions <- calculations %>% group_by(across(all_of(group_vars))) %>% summarise( - Avg_Zscore_L = mean(Zscore_L, na.rm = TRUE), - Avg_Zscore_K = mean(Zscore_K, na.rm = TRUE), - Avg_Zscore_r = mean(Zscore_r, na.rm = TRUE), - Avg_Zscore_AUC = mean(Zscore_AUC, na.rm = TRUE), + Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs), + Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), + Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1), + Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1), # Interaction Z-scores Z_lm_L = first(Z_lm_L), @@ -368,28 +391,68 @@ calculate_interaction_scores <- function(df, df_bg, group_vars) { Z_Shift_K = first(Z_Shift_K), Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), - - # NG, DB, SM values - NG = first(NG), - DB = first(DB), - SM = first(SM), - + # R Squared values R_Squared_L = first(R_Squared_L), R_Squared_K = first(R_Squared_K), R_Squared_r = first(R_Squared_r), R_Squared_AUC = first(R_Squared_AUC), - # Include Drug - Drug = first(Drug), - + # NG, DB, SM values + NG = first(NG), + DB = first(DB), + SM = first(SM), + .groups = "drop" ) - - # Return the dataframes without creating full_data + + # Creating the final calculations and interactions dataframes with only required columns for csv output + calculations_df <- calculations %>% + select( + all_of(group_vars), + conc_num, conc_num_factor, conc_num_factor_factor, + N, NG, DB, SM, + mean_L, median_L, sd_L, se_L, + mean_K, median_K, sd_K, se_K, + mean_r, median_r, sd_r, se_r, + mean_AUC, median_AUC, sd_AUC, se_AUC, + Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC, + Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC, + WT_L, WT_K, WT_r, WT_AUC, + WT_sd_L, WT_sd_K, WT_sd_r, WT_sd_AUC, + Exp_L, Exp_K, Exp_r, Exp_AUC, + Delta_L, Delta_K, Delta_r, Delta_AUC, + Zscore_L, Zscore_K, Zscore_r, Zscore_AUC + ) + + interactions_df <- interactions %>% + select( + all_of(group_vars), + NG, DB, SM, + Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC, + Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC, + Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC, + Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC + ) + + calculations_no_overlap <- calculations %>% + # DB, NG, SM are same as in interactions, the rest may be different and need to be checked + select(-any_of(c( + "DB", "NG", "SM", + "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", + "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", + "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC" + ))) + + # Use left_join to avoid dimension mismatch issues + full_data <- calculations_no_overlap %>% + left_join(interactions, by = group_vars) + + # Return full_data and the two required dataframes (calculations and interactions) return(list( - calculations = calculations, - interactions = interactions + calculations = calculations_df, + interactions = interactions_df, + full_data = full_data )) } @@ -475,14 +538,17 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) { ), color = error_bar_color) } } else { - # Original code for calculating from mean and sd + # Ensure the mean and sd columns exist y_mean_col <- paste0("mean_", config$y_var) y_sd_col <- paste0("sd_", config$y_var) - plot <- plot + geom_errorbar(aes( - x = .data[[config$x_var]], - ymin = .data[[y_mean_col]] - .data[[y_sd_col]], - ymax = .data[[y_mean_col]] + .data[[y_sd_col]] - ), color = error_bar_color) + + if (y_mean_col %in% colnames(df) && y_sd_col %in% colnames(df)) { + plot <- plot + geom_errorbar(aes( + x = .data[[config$x_var]], + ymin = .data[[y_mean_col]] - .data[[y_sd_col]], + ymax = .data[[y_mean_col]] + .data[[y_sd_col]] + ), color = error_bar_color) + } } } @@ -714,7 +780,7 @@ generate_plate_analysis_plot_configs <- function(variables, df_before = NULL, df return(list(plots = plot_configs)) } -generate_interaction_plot_configs <- function(df, df_calculations, df_interactions, type) { +generate_interaction_plot_configs <- function(df, type) { # Define the y-limits for the plots limits_map <- list( @@ -762,13 +828,14 @@ generate_interaction_plot_configs <- function(df, df_calculations, df_interactio ) plot_config$position <- "jitter" + # Annotation labels annotations <- list( - list(x = 0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="), # Slightly above y-min - list(x = 0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="), - list(x = 0.25, y = y_limits[1], label = "SM =") + list(x = 0, y = y_limits[1] + 0.1 * y_span, label = "NG ="), + list(x = 0, y = y_limits[1] + 0.05 * y_span, label = "DB ="), + list(x = 0, y = y_limits[1], label = "SM =") ) - - # Loop over unique x values and add NG, DB, SM values at calculated y positions + + # Annotation values for (x_val in unique(df$conc_num_factor_factor)) { current_df <- df %>% filter(.data[[plot_config$x_var]] == x_val) annotations <- append(annotations, list( @@ -794,7 +861,6 @@ generate_interaction_plot_configs <- function(df, df_calculations, df_interactio } } - # Delta interaction plots (use df_calculations and df_interactions) if (type == "reference") { group_vars <- c("OrfRep", "Gene", "num") } else if (type == "deletion") { @@ -1195,14 +1261,14 @@ main <- function() { ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num"))$summary_stats write.csv(ss, - file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), + file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, - file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), + file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2SD_K.csv"), row.names = FALSE) plate_analysis_plot_configs <- generate_plate_analysis_plot_configs( @@ -1300,11 +1366,10 @@ main <- function() { plot_configs = plate_analysis_no_zeros_boxplot_configs), list(out_dir = out_dir_qc, filename = "L_vs_K_for_strains_2SD_outside_mean_K", plot_configs = l_outside_2sd_k_plot_configs), - list(out_dir = out_dir_qc, filename = "delta_background_vs_K_for_strains_2sd_outside_mean_K", + list(out_dir = out_dir_qc, filename = "delta_background_vs_K_for_strains_2SD_outside_mean_K", plot_configs = delta_bg_outside_2sd_k_plot_configs) ) - # Generating quality control plots in parallel # furrr::future_map(plot_configs, function(config) { # generate_and_save_plots(config$out_dir, config$filename, config$plot_configs) # }, .options = furrr_options(seed = TRUE)) @@ -1325,9 +1390,9 @@ main <- function() { ) %>% filter(!is.na(L)) - message("Calculating summary statistics for background strain") + message("Calculating background strain summary statistics") ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("OrfRep", "conc_num")) + group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor")) summary_stats_bg <- ss_bg$summary_stats df_bg_stats <- ss_bg$df_with_stats write.csv( @@ -1339,7 +1404,7 @@ main <- function() { df_reference <- df_na_stats %>% # formerly X2_RF filter(OrfRep == strain) %>% filter(!is.na(L)) %>% - group_by(conc_num) %>% + group_by(OrfRep, Drug, conc_num) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1347,21 +1412,25 @@ main <- function() { L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() - message("Calculating reference strain interaction scores") + message("Calculating reference strain summary statistics") df_reference_stats <- calculate_summary_stats( df = df_reference, variables = c("L", "K", "r", "AUC"), - group_vars = c("OrfRep", "Gene", "num", "conc_num") + group_vars = c("OrfRep", "Gene", "Drug", "num", "conc_num", "conc_num_factor_factor") )$df_with_stats - reference_results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "num")) - df_calculations_reference <- reference_results$calculations - df_interactions_reference <- reference_results$interactions + message("Calculating reference strain interaction scores") + results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "Drug", "num")) + df_calculations_reference <- results$calculations + df_interactions_reference <- results$interactions + df_interactions_reference_joined <- results$full_data + write.csv(df_calculations_reference, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) + write.csv(df_interactions_reference, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) message("Setting missing deletion values to the highest theoretical value at each drug conc for L") df_deletion <- df_na_stats %>% # formerly X2 filter(OrfRep != strain) %>% filter(!is.na(L)) %>% - group_by(conc_num) %>% + group_by(OrfRep, Gene, conc_num) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1369,81 +1438,60 @@ main <- function() { L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% ungroup() - message("Calculating deletion strain(s) interactions scores") + message("Calculating deletion strain(s) summary statistics") df_deletion_stats <- calculate_summary_stats( df = df_deletion, variables = c("L", "K", "r", "AUC"), - group_vars = c("OrfRep", "Gene", "conc_num") + group_vars = c("OrfRep", "Gene", "Drug", "conc_num", "conc_num_factor_factor") )$df_with_stats - deletion_results <- calculate_interaction_scores(df_deletion_stats, df_bg_stats, group_vars = c("OrfRep", "Gene")) - df_calculations <- deletion_results$calculations - df_interactions <- deletion_results$interactions - - # Writing Z-Scores to file - write.csv(df_calculations_reference, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) - write.csv(df_interactions_reference, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) + message("Calculating deletion strain(s) interactions scores") + results <- calculate_interaction_scores(df_deletion_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "Drug")) + df_calculations <- results$calculations + df_interactions <- results$interactions + df_interactions_joined <- results$full_data write.csv(df_calculations, file = file.path(out_dir, "zscore_calculations.csv"), row.names = FALSE) write.csv(df_interactions, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE) # Create interaction plots message("Generating reference interaction plots") - reference_plot_configs <- generate_interaction_plot_configs( - df_reference_stats, df_calculations_reference, df_interactions_reference, "reference") + reference_plot_configs <- generate_interaction_plot_configs(df_interactions_reference_joined, df_bg_stats, "reference") generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs) message("Generating deletion interaction plots") - deletion_plot_configs <- generate_interaction_plot_configs( - df_deletion_stats, df_calculations, df_interactions, "deletion") + deletion_plot_configs <- generate_interaction_plot_configs(df_interactions_joined, df_bg_stats, "deletion") generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs) - # Define conditions for enhancers and suppressors - # TODO Add to study config? - threshold <- 2 - enhancer_condition_L <- df_interactions$Avg_Zscore_L >= threshold - suppressor_condition_L <- df_interactions$Avg_Zscore_L <= -threshold - enhancer_condition_K <- df_interactions$Avg_Zscore_K >= threshold - suppressor_condition_K <- df_interactions$Avg_Zscore_K <= -threshold - - # Subset data + message("Writing enhancer/suppressor csv files") + interaction_threshold <- 2 # TODO add to study config? + enhancer_condition_L <- df_interactions$Avg_Zscore_L >= interaction_threshold + suppressor_condition_L <- df_interactions$Avg_Zscore_L <= -interaction_threshold + enhancer_condition_K <- df_interactions$Avg_Zscore_K >= interaction_threshold + suppressor_condition_K <- df_interactions$Avg_Zscore_K <= -interaction_threshold enhancers_L <- df_interactions[enhancer_condition_L, ] suppressors_L <- df_interactions[suppressor_condition_L, ] enhancers_K <- df_interactions[enhancer_condition_K, ] suppressors_K <- df_interactions[suppressor_condition_K, ] - - # Save enhancers and suppressors - message("Writing enhancer/suppressor csv files") + enhancers_and_suppressors_L <- df_interactions[enhancer_condition_L | suppressor_condition_L, ] + enhancers_and_suppressors_K <- df_interactions[enhancer_condition_K | suppressor_condition_K, ] write.csv(enhancers_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_L.csv"), row.names = FALSE) write.csv(suppressors_L, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_L.csv"), row.names = FALSE) write.csv(enhancers_K, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_K.csv"), row.names = FALSE) write.csv(suppressors_K, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_K.csv"), row.names = FALSE) - - # Combine conditions for enhancers and suppressors - enhancers_and_suppressors_L <- df_interactions[enhancer_condition_L | suppressor_condition_L, ] - enhancers_and_suppressors_K <- df_interactions[enhancer_condition_K | suppressor_condition_K, ] - - # Save combined enhancers and suppressors write.csv(enhancers_and_suppressors_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_and_suppressors_L.csv"), row.names = FALSE) write.csv(enhancers_and_suppressors_K, file = file.path(out_dir, "zscore_interaction_deletion_enhancers_and_suppressors_K.csv"), row.names = FALSE) - # Handle linear model based enhancers and suppressors - lm_threshold <- 2 # TODO add to study config? - enhancers_lm_L <- df_interactions[df_interactions$Z_lm_L >= lm_threshold, ] - suppressors_lm_L <- df_interactions[df_interactions$Z_lm_L <= -lm_threshold, ] - enhancers_lm_K <- df_interactions[df_interactions$Z_lm_K >= lm_threshold, ] - suppressors_lm_K <- df_interactions[df_interactions$Z_lm_K <= -lm_threshold, ] - - # Save linear model based enhancers and suppressors message("Writing linear model enhancer/suppressor csv files") - write.csv(enhancers_lm_L, - file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_L.csv"), row.names = FALSE) - write.csv(suppressors_lm_L, - file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_L.csv"), row.names = FALSE) - write.csv(enhancers_lm_K, - file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_K.csv"), row.names = FALSE) - write.csv(suppressors_lm_K, - file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_K.csv"), row.names = FALSE) + lm_interaction_threshold <- 2 # TODO add to study config? + enhancers_lm_L <- df_interactions[df_interactions$Z_lm_L >= lm_interaction_threshold, ] + suppressors_lm_L <- df_interactions[df_interactions$Z_lm_L <= -lm_interaction_threshold, ] + enhancers_lm_K <- df_interactions[df_interactions$Z_lm_K >= lm_interaction_threshold, ] + suppressors_lm_K <- df_interactions[df_interactions$Z_lm_K <= -lm_interaction_threshold, ] + write.csv(enhancers_lm_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_L.csv"), row.names = FALSE) + write.csv(suppressors_lm_L, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_L.csv"), row.names = FALSE) + write.csv(enhancers_lm_K, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_K.csv"), row.names = FALSE) + write.csv(suppressors_lm_K, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_K.csv"), row.names = FALSE) message("Generating rank plots") rank_plot_configs <- generate_rank_plot_configs( @@ -1463,7 +1511,7 @@ main <- function() { generate_and_save_plots(out_dir = out_dir, filename = "rank_plots_lm", plot_configs = rank_lm_plot_configs) - overlap_threshold <- 2 + overlap_threshold <- 2 # TODO add to study config? df_interactions_filtered <- df_interactions %>% filter(!is.na(Z_lm_L) & !is.na(Avg_Zscore_L)) %>% mutate(