Tidy some code up and make more specific

This commit is contained in:
2024-09-16 21:57:34 -04:00
parent 3bc0b532b7
commit cdeaf6e6ba

View File

@@ -155,7 +155,7 @@ update_gene_names <- function(df, sgd_gene_list) {
return(df) return(df)
} }
calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) { calculate_summary_stats <- function(df, variables, group_vars) {
summary_stats <- df %>% summary_stats <- df %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
@@ -168,7 +168,9 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con
min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
sd = ~sd(., na.rm = TRUE), sd = ~sd(., na.rm = TRUE),
se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)) se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1))
), .names = "{.fn}_{.col}") ),
.names = "{.fn}_{.col}"),
.groups = "drop"
) )
# Create a cleaned version of df that doesn't overlap with summary_stats # Create a cleaned version of df that doesn't overlap with summary_stats
@@ -181,7 +183,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("OrfRep", "con
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
} }
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { calculate_interaction_scores <- function(df, max_conc, variables, group_vars) {
# Calculate total concentration variables # Calculate total concentration variables
total_conc_num <- length(unique(df$conc_num)) total_conc_num <- length(unique(df$conc_num))
@@ -201,8 +203,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
) )
stats <- calculate_summary_stats(df, variables, stats <- calculate_summary_stats(df,
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))$summary_stats variables = variables,
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"
))$summary_stats
stats <- df %>% stats <- df %>%
group_by(OrfRep, Gene, num) %>% group_by(OrfRep, Gene, num) %>%
@@ -302,7 +306,8 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
lm_slope_AUC = coef(lm_AUC)[2], lm_slope_AUC = coef(lm_AUC)[2],
NG = sum(NG, na.rm = TRUE), NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE), DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE) SM = sum(SM, na.rm = TRUE),
.groups = "keep"
) )
num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1 num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1
@@ -402,7 +407,7 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
} }
# Add interactive tooltips for plotly plots # Add interactive tooltips for plotly plots
tooltip_vars <- c("x", "y") # default tooltip variables tooltip_vars <- c("x", "y")
if (!is.null(config$tooltip_vars)) { if (!is.null(config$tooltip_vars)) {
tooltip_vars <- config$tooltip_vars tooltip_vars <- config$tooltip_vars
} else { } else {
@@ -687,14 +692,11 @@ generate_interaction_plot_configs <- function(df, variables) {
return(configs) return(configs)
} }
generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALSE) { generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE, adjust = FALSE) {
# Define SD bands # Define SD bands
sd_bands <- c(1, 2, 3) sd_bands <- c(1, 2, 3)
# Define variables for Avg ZScore and Rank Avg ZScore plots
variables <- c("r", "L", "K", "AUC")
# Initialize list to store plot configurations # Initialize list to store plot configurations
configs <- list() configs <- list()
@@ -963,8 +965,7 @@ main <- function() {
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores
orf_group_vars <- c("OrfRep", "Gene", "num")
print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor", print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
"delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB") "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
@@ -984,20 +985,29 @@ main <- function() {
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2 k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
message("Calculating summary statistics before quality control") message("Calculating summary statistics before quality control")
ss <- calculate_summary_stats(df, summary_vars, group_vars = group_vars) ss <- calculate_summary_stats(
df = df,
variables = summary_vars,
group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
df_stats <- ss$df_with_stats df_stats <- ss$df_with_stats
message("Filtering non-finite data") message("Filtering non-finite data")
df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE) df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE)
message("Calculating summary statistics after quality control") message("Calculating summary statistics after quality control")
ss <- calculate_summary_stats(df_na, summary_vars, group_vars = group_vars) ss <- calculate_summary_stats(
df = df_na,
variables = summary_vars,
group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
df_na_ss <- ss$summary_stats df_na_ss <- ss$summary_stats
df_na_stats <- ss$df_with_stats df_na_stats <- ss$df_with_stats
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE) df_na_filtered_stats <- filter_data(df_na_stats, c("L"), nf = TRUE)
message("Calculating summary statistics after quality control excluding zero values") message("Calculating summary statistics after quality control excluding zero values")
ss <- calculate_summary_stats(df_no_zeros, summary_vars, group_vars = group_vars) ss <- calculate_summary_stats(
df = df_no_zeros,
variables = summary_vars,
group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
df_no_zeros_stats <- ss$df_with_stats df_no_zeros_stats <- ss$df_with_stats
df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE) df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE)
@@ -1011,12 +1021,16 @@ main <- function() {
# TODO We're omitting the original z_max calculation, not sure if needed? # TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
# df_na_l_within_2sd_k_stats <- ss$df_with_stats # df_na_l_within_2sd_k_stats <- ss$df_with_stats
write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) write.csv(ss$summary_stats,
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
row.names = FALSE)
message("Calculating summary statistics for L outside 2SD of K") message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
df_na_l_outside_2sd_k_stats <- ss$df_with_stats df_na_l_outside_2sd_k_stats <- ss$df_with_stats
write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), row.names = FALSE) write.csv(ss$summary_stats,
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
row.names = FALSE)
# Each plots list corresponds to a file # Each plots list corresponds to a file
l_vs_k_plots <- list( l_vs_k_plots <- list(
@@ -1183,16 +1197,16 @@ main <- function() {
) )
) )
message("Generating quality control plots") # message("Generating quality control plots")
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots) # generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots) # generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots) # generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots) # generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots) # generate_and_save_plots(out_dir_qc, "plate_analysis_boxplots", plate_analysis_boxplots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots) # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots) # generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros_boxplots", plate_analysis_no_zeros_boxplots)
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots) # generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots) # generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
# TODO: Originally this filtered L NA's # TODO: Originally this filtered L NA's
# Let's try to avoid for now since stats have already been calculated # Let's try to avoid for now since stats have already been calculated
@@ -1217,7 +1231,7 @@ main <- function() {
# Recalculate summary statistics for the background strain # Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain") message("Calculating summary statistics for background strain")
ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = group_vars) ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
summary_stats_bg <- ss_bg$summary_stats summary_stats_bg <- ss_bg$summary_stats
# df_bg_stats <- ss_bg$df_with_stats # df_bg_stats <- ss_bg$df_with_stats
write.csv(summary_stats_bg, write.csv(summary_stats_bg,
@@ -1256,18 +1270,15 @@ main <- function() {
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>% L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
ungroup() ungroup()
message("Calculating interaction scores") message("Calculating reference strain interaction scores")
interaction_vars <- c("L", "K", "r", "AUC") reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
message("Calculating reference strain(s)")
reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = orf_group_vars)
zscores_calculations_reference <- reference_results$calculations zscores_calculations_reference <- reference_results$calculations
zscores_interactions_reference <- reference_results$interactions zscores_interactions_reference <- reference_results$interactions
zscores_calculations_reference_joined <- reference_results$calculations_joined zscores_calculations_reference_joined <- reference_results$calculations_joined
zscores_interactions_reference_joined <- reference_results$interactions_joined zscores_interactions_reference_joined <- reference_results$interactions_joined
message("Calculating deletion strain(s)") message("Calculating deletion strain(s) interactions scores")
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = orf_group_vars) deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
zscores_calculations <- deletion_results$calculations zscores_calculations <- deletion_results$calculations
zscores_interactions <- deletion_results$interactions zscores_interactions <- deletion_results$interactions
zscores_calculations_joined <- deletion_results$calculations_joined zscores_calculations_joined <- deletion_results$calculations_joined
@@ -1346,6 +1357,7 @@ main <- function() {
rank = TRUE) rank = TRUE)
rank_plot_configs <- generate_rank_plot_configs( rank_plot_configs <- generate_rank_plot_configs(
df = zscores_interactions_joined_filtered, df = zscores_interactions_joined_filtered,
variables = interaction_vars,
is_lm = FALSE, is_lm = FALSE,
adjust = TRUE adjust = TRUE
) )
@@ -1355,6 +1367,7 @@ main <- function() {
message("Generating ranked linear model plots") message("Generating ranked linear model plots")
rank_lm_plot_configs <- generate_rank_plot_configs( rank_lm_plot_configs <- generate_rank_plot_configs(
df = zscores_interactions_joined_filtered, df = zscores_interactions_joined_filtered,
variables = interaction_vars,
is_lm = TRUE, is_lm = TRUE,
adjust = TRUE adjust = TRUE
) )
@@ -1364,7 +1377,7 @@ main <- function() {
message("Filtering and reranking plots") message("Filtering and reranking plots")
# Formerly X_NArm # Formerly X_NArm
zscores_interactions_filtered <- zscores_interactions_joined %>% zscores_interactions_filtered <- zscores_interactions_joined %>%
group_by(across(all_of(orf_group_vars))) %>% group_by(across(all_of(c("OrfRep", "Gene", "num")))) %>%
filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
ungroup() %>% ungroup() %>%
rowwise() %>% rowwise() %>%
@@ -1396,6 +1409,7 @@ main <- function() {
rank_plot_filtered_configs <- generate_rank_plot_configs( rank_plot_filtered_configs <- generate_rank_plot_configs(
df = zscores_interactions_filtered, df = zscores_interactions_filtered,
variables = interaction_vars,
is_lm = FALSE, is_lm = FALSE,
adjust = FALSE adjust = FALSE
) )
@@ -1410,6 +1424,7 @@ main <- function() {
message("Generating filtered ranked linear model plots") message("Generating filtered ranked linear model plots")
rank_plot_lm_filtered_configs <- generate_rank_plot_configs( rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
df = zscores_interactions_filtered, df = zscores_interactions_filtered,
variables = interaction_vars,
is_lm = TRUE, is_lm = TRUE,
adjust = FALSE adjust = FALSE
) )