|
@@ -155,7 +155,7 @@ update_gene_names <- function(df, sgd_gene_list) {
|
|
|
return(df)
|
|
|
}
|
|
|
|
|
|
-calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) {
|
|
|
+calculate_summary_stats <- function(df, variables, group_vars) {
|
|
|
|
|
|
summary_stats <- df %>%
|
|
|
group_by(across(all_of(group_vars))) %>%
|
|
@@ -168,7 +168,9 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con
|
|
|
min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
|
|
|
sd = ~sd(., na.rm = TRUE),
|
|
|
se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1))
|
|
|
- ), .names = "{.fn}_{.col}")
|
|
|
+ ),
|
|
|
+ .names = "{.fn}_{.col}"),
|
|
|
+ .groups = "drop"
|
|
|
)
|
|
|
|
|
|
# Create a cleaned version of df that doesn't overlap with summary_stats
|
|
@@ -181,7 +183,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con
|
|
|
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
|
|
|
}
|
|
|
|
|
|
-calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
|
|
|
+calculate_interaction_scores <- function(df, max_conc, variables, group_vars) {
|
|
|
|
|
|
# Calculate total concentration variables
|
|
|
total_conc_num <- length(unique(df$conc_num))
|
|
@@ -201,8 +203,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
|
|
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
|
|
|
)
|
|
|
|
|
|
- stats <- calculate_summary_stats(df, variables,
|
|
|
- group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))$summary_stats
|
|
|
+ stats <- calculate_summary_stats(df,
|
|
|
+ variables = variables,
|
|
|
+ group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"
|
|
|
+ ))$summary_stats
|
|
|
|
|
|
stats <- df %>%
|
|
|
group_by(OrfRep, Gene, num) %>%
|
|
@@ -302,7 +306,8 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
|
|
lm_slope_AUC = coef(lm_AUC)[2],
|
|
|
NG = sum(NG, na.rm = TRUE),
|
|
|
DB = sum(DB, na.rm = TRUE),
|
|
|
- SM = sum(SM, na.rm = TRUE)
|
|
|
+ SM = sum(SM, na.rm = TRUE),
|
|
|
+ .groups = "keep"
|
|
|
)
|
|
|
|
|
|
num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1
|
|
@@ -402,7 +407,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
|
|
|
}
|
|
|
|
|
|
# Add interactive tooltips for plotly plots
|
|
|
- tooltip_vars <- c("x", "y") # default tooltip variables
|
|
|
+ tooltip_vars <- c("x", "y")
|
|
|
if (!is.null(config$tooltip_vars)) {
|
|
|
tooltip_vars <- config$tooltip_vars
|
|
|
} else {
|
|
@@ -687,14 +692,11 @@ generate_interaction_plot_configs <- function(df, variables) {
|
|
|
return(configs)
|
|
|
}
|
|
|
|
|
|
-generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALSE) {
|
|
|
+generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, adjust = FALSE) {
|
|
|
|
|
|
# Define SD bands
|
|
|
sd_bands <- c(1, 2, 3)
|
|
|
|
|
|
- # Define variables for Avg ZScore and Rank Avg ZScore plots
|
|
|
- variables <- c("r", "L", "K", "AUC")
|
|
|
-
|
|
|
# Initialize list to store plot configurations
|
|
|
configs <- list()
|
|
|
|
|
@@ -963,8 +965,7 @@ main <- function() {
|
|
|
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
|
|
|
- group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
|
|
|
- orf_group_vars <- c("OrfRep", "Gene", "num")
|
|
|
+ interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores
|
|
|
print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
|
|
|
"delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
|
|
|
|
|
@@ -984,20 +985,29 @@ main <- function() {
|
|
|
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
|
|
|
|
message("Calculating summary statistics before quality control")
|
|
|
- ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars)
|
|
|
+ ss <- calculate_summary_stats(
|
|
|
+ df = df,
|
|
|
+ variables = summary_vars,
|
|
|
+ group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
|
df_stats <- ss$df_with_stats
|
|
|
message("Filtering non-finite data")
|
|
|
df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE)
|
|
|
|
|
|
message("Calculating summary statistics after quality control")
|
|
|
- ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars)
|
|
|
+ ss <- calculate_summary_stats(
|
|
|
+ df = df_na,
|
|
|
+ variables = summary_vars,
|
|
|
+ group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
|
df_na_ss <- ss$summary_stats
|
|
|
df_na_stats <- ss$df_with_stats
|
|
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
|
df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE)
|
|
|
|
|
|
message("Calculating summary statistics after quality control excluding zero values")
|
|
|
- ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars)
|
|
|
+ ss <- calculate_summary_stats(
|
|
|
+ df = df_no_zeros,
|
|
|
+ variables = summary_vars,
|
|
|
+ group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
|
df_no_zeros_stats <- ss$df_with_stats
|
|
|
df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE)
|
|
|
|
|
@@ -1011,12 +1021,16 @@ main <- function() {
|
|
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
|
|
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
# df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
|
|
- write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE)
|
|
|
+ write.csv(ss$summary_stats,
|
|
|
+ file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
|
|
+ row.names = FALSE)
|
|
|
|
|
|
message("Calculating summary statistics for L outside 2SD of K")
|
|
|
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
|
|
- write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE)
|
|
|
+ write.csv(ss$summary_stats,
|
|
|
+ file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
|
|
|
+ row.names = FALSE)
|
|
|
|
|
|
# Each plots list corresponds to a file
|
|
|
l_vs_k_plots <- list(
|
|
@@ -1183,16 +1197,16 @@ main <- function() {
|
|
|
)
|
|
|
)
|
|
|
|
|
|
- message("Generating quality control plots")
|
|
|
- generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
|
|
|
- generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
|
|
- generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
|
|
- generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
|
|
- generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
|
|
|
- generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
|
|
|
- generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
|
|
|
- generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
|
|
|
- generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
|
|
|
+ # message("Generating quality control plots")
|
|
|
+ # generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
|
|
|
+ # generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
|
|
|
|
|
|
# TODO: Originally this filtered L NA's
|
|
|
# Let's try to avoid for now since stats have already been calculated
|
|
@@ -1217,7 +1231,7 @@ main <- function() {
|
|
|
|
|
|
# Recalculate summary statistics for the background strain
|
|
|
message("Calculating summary statistics for background strain")
|
|
|
- ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars)
|
|
|
+ ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
|
|
summary_stats_bg <- ss_bg$summary_stats
|
|
|
# df_bg_stats <- ss_bg$df_with_stats
|
|
|
write.csv(summary_stats_bg,
|
|
@@ -1256,18 +1270,15 @@ main <- function() {
|
|
|
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
|
|
ungroup()
|
|
|
|
|
|
- message("Calculating interaction scores")
|
|
|
- interaction_vars <- c("L", "K", "r", "AUC")
|
|
|
-
|
|
|
- message("Calculating reference strain(s)")
|
|
|
- reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = orf_group_vars)
|
|
|
+ message("Calculating reference strain interaction scores")
|
|
|
+ reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
|
|
|
zscores_calculations_reference <- reference_results$calculations
|
|
|
zscores_interactions_reference <- reference_results$interactions
|
|
|
zscores_calculations_reference_joined <- reference_results$calculations_joined
|
|
|
zscores_interactions_reference_joined <- reference_results$interactions_joined
|
|
|
|
|
|
- message("Calculating deletion strain(s)")
|
|
|
- deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = orf_group_vars)
|
|
|
+ message("Calculating deletion strain(s) interactions scores")
|
|
|
+ deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
|
|
|
zscores_calculations <- deletion_results$calculations
|
|
|
zscores_interactions <- deletion_results$interactions
|
|
|
zscores_calculations_joined <- deletion_results$calculations_joined
|
|
@@ -1346,6 +1357,7 @@ main <- function() {
|
|
|
rank = TRUE)
|
|
|
rank_plot_configs <- generate_rank_plot_configs(
|
|
|
df = zscores_interactions_joined_filtered,
|
|
|
+ variables = interaction_vars,
|
|
|
is_lm = FALSE,
|
|
|
adjust = TRUE
|
|
|
)
|
|
@@ -1355,6 +1367,7 @@ main <- function() {
|
|
|
message("Generating ranked linear model plots")
|
|
|
rank_lm_plot_configs <- generate_rank_plot_configs(
|
|
|
df = zscores_interactions_joined_filtered,
|
|
|
+ variables = interaction_vars,
|
|
|
is_lm = TRUE,
|
|
|
adjust = TRUE
|
|
|
)
|
|
@@ -1364,7 +1377,7 @@ main <- function() {
|
|
|
message("Filtering and reranking plots")
|
|
|
# Formerly X_NArm
|
|
|
zscores_interactions_filtered <- zscores_interactions_joined %>%
|
|
|
- group_by(across(all_of(orf_group_vars))) %>%
|
|
|
+ group_by(across(all_of(c("OrfRep", "Gene", "num")))) %>%
|
|
|
filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
|
|
|
ungroup() %>%
|
|
|
rowwise() %>%
|
|
@@ -1396,6 +1409,7 @@ main <- function() {
|
|
|
|
|
|
rank_plot_filtered_configs <- generate_rank_plot_configs(
|
|
|
df = zscores_interactions_filtered,
|
|
|
+ variables = interaction_vars,
|
|
|
is_lm = FALSE,
|
|
|
adjust = FALSE
|
|
|
)
|
|
@@ -1410,6 +1424,7 @@ main <- function() {
|
|
|
message("Generating filtered ranked linear model plots")
|
|
|
rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
|
|
|
df = zscores_interactions_filtered,
|
|
|
+ variables = interaction_vars,
|
|
|
is_lm = TRUE,
|
|
|
adjust = FALSE
|
|
|
)
|