Move max_conc into the dataframe to simplify args

This commit is contained in:
2024-10-02 19:11:30 -04:00
parent 4265663e17
commit f435fa3f11

View File

@@ -160,6 +160,11 @@ load_and_filter_data <- function(easy_results_file, sd = 3) {
conc_num_factor_factor = as.factor(conc_num) conc_num_factor_factor = as.factor(conc_num)
) )
# Set the max concentration across the whole dataframe
max_conc <- max(df$conc_num_factor, na.rm = TRUE)
df <- df %>%
mutate(max_conc = max_conc)
return(df) return(df)
} }
@@ -211,7 +216,7 @@ calculate_summary_stats <- function(df, variables, group_vars) {
return(list(summary_stats = summary_stats, df_with_stats = df_joined)) return(list(summary_stats = summary_stats, df_with_stats = df_joined))
} }
calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, overlap_threshold = 2) { calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_threshold = 2) {
# Calculate total concentration variables # Calculate total concentration variables
total_conc_num <- length(unique(df$conc_num)) total_conc_num <- length(unique(df$conc_num))
@@ -294,16 +299,12 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, group_vars, ove
lm_means_sds <- calculations %>% lm_means_sds <- calculations %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( summarise(
mean_mean_L = mean(mean_L, na.rm = TRUE),
mean_lm_L = mean(lm_Score_L, na.rm = TRUE), mean_lm_L = mean(lm_Score_L, na.rm = TRUE),
sd_lm_L = sd(lm_Score_L, na.rm = TRUE), sd_lm_L = sd(lm_Score_L, na.rm = TRUE),
mean_mean_K = mean(mean_K, na.rm = TRUE),
mean_lm_K = mean(lm_Score_K, na.rm = TRUE), mean_lm_K = mean(lm_Score_K, na.rm = TRUE),
sd_lm_K = sd(lm_Score_K, na.rm = TRUE), sd_lm_K = sd(lm_Score_K, na.rm = TRUE),
mean_mean_r = mean(mean_r, na.rm = TRUE),
mean_lm_r = mean(lm_Score_r, na.rm = TRUE), mean_lm_r = mean(lm_Score_r, na.rm = TRUE),
sd_lm_r = sd(lm_Score_r, na.rm = TRUE), sd_lm_r = sd(lm_Score_r, na.rm = TRUE),
mean_mean_AUC = mean(mean_AUC, na.rm = TRUE),
mean_lm_AUC = mean(lm_Score_AUC, na.rm = TRUE), mean_lm_AUC = mean(lm_Score_AUC, na.rm = TRUE),
sd_lm_AUC = sd(lm_Score_AUC, na.rm = TRUE) sd_lm_AUC = sd(lm_Score_AUC, na.rm = TRUE)
) )
@@ -469,7 +470,12 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) {
"red" "red"
} }
y_mean_col <- paste0("mean_mean_", config$y_var) y_mean_prefix <- if (!is.null(config$error_bar_params$y_mean_prefix)) {
config$error_bar_params$y_mean_prefix
} else {
"mean_"
}
y_mean_col <- paste0(y_mean_prefix, config$y_var)
# Dynamically set y_sd_col based on the provided prefix in error_bar_params # Dynamically set y_sd_col based on the provided prefix in error_bar_params
y_sd_prefix <- if (!is.null(config$error_bar_params$y_sd_prefix)) { y_sd_prefix <- if (!is.null(config$error_bar_params$y_sd_prefix)) {
@@ -482,7 +488,7 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) {
if (!is.null(config$error_bar_params$center_point)) { if (!is.null(config$error_bar_params$center_point)) {
plot <- plot + geom_point(aes( plot <- plot + geom_point(aes(
x = .data[[config$x_var]], x = .data[[config$x_var]],
y = .data[[y_mean_col]]), y = first(.data[[y_mean_col]])),
color = error_bar_color, color = error_bar_color,
shape = 16) shape = 16)
} }
@@ -495,14 +501,12 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs) {
color = error_bar_color) color = error_bar_color)
} else { } else {
plot <- plot + geom_errorbar(aes( plot <- plot + geom_errorbar(aes(
ymin = .data[[y_mean_col]] - .data[[y_sd_col]], ymin = first(.data[[y_mean_col]]) - first(.data[[y_sd_col]]),
ymax = .data[[y_mean_col]] + .data[[y_sd_col]]), ymax = first(.data[[y_mean_col]]) + first(.data[[y_sd_col]])),
color = error_bar_color) color = error_bar_color)
} }
} }
# Convert ggplot to plotly for interactive version # Convert ggplot to plotly for interactive version
plotly_plot <- suppressWarnings(plotly::ggplotly(plot)) plotly_plot <- suppressWarnings(plotly::ggplotly(plot))
@@ -546,7 +550,7 @@ generate_scatter_plot <- function(plot, config) {
size <- if (!is.null(config$size)) config$size else 1.5 size <- if (!is.null(config$size)) config$size else 1.5
position <- position <-
if (!is.null(config$position) && config$position == "jitter") { if (!is.null(config$position) && config$position == "jitter") {
position_jitter(width = 0.3, height = 0.1) position_jitter(width = 0.4, height = 0.1)
} else { } else {
"identity" "identity"
} }
@@ -752,7 +756,7 @@ generate_interaction_plot_configs <- function(df, type) {
for (var in names(limits_map)) { for (var in names(limits_map)) {
y_limits <- limits_map[[var]] y_limits <- limits_map[[var]]
y_span <- y_limits[2] - y_limits[1] y_span <- y_limits[2] - y_limits[1]
# Common plot configuration # Common plot configuration
plot_config <- list( plot_config <- list(
@@ -773,15 +777,16 @@ generate_interaction_plot_configs <- function(df, type) {
plot_config$error_bar <- TRUE plot_config$error_bar <- TRUE
plot_config$error_bar_params <- list( plot_config$error_bar_params <- list(
y_sd_prefix = "WT_sd_", y_sd_prefix = "WT_sd_",
y_mean_prefix = "mean_",
color = "red", color = "red",
center_point = TRUE center_point = TRUE
) )
plot_config$position <- "jitter" plot_config$position <- "jitter"
annotations <- list( annotations <- list(
list(x = -0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="), # Slightly above y-min list(x = 0.25, y = y_limits[1] + 0.1 * y_span, label = "NG ="), # Slightly above y-min
list(x = -0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="), list(x = 0.25, y = y_limits[1] + 0.05 * y_span, label = "DB ="),
list(x = -0.25, y = y_limits[1], label = "SM =") list(x = 0.25, y = y_limits[1], label = "SM =")
) )
# Loop over unique x values and add NG, DB, SM values at calculated y positions # Loop over unique x values and add NG, DB, SM values at calculated y positions
@@ -1067,70 +1072,35 @@ main <- function() {
out_dir_qc <- file.path(exp_path, "zscores", "qc") out_dir_qc <- file.path(exp_path, "zscores", "qc")
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE) dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores
message("Loading and filtering data for experiment: ", exp_name) message("Loading and filtering data for experiment: ", exp_name)
df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>% df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>%
update_gene_names(args$sgd_gene_list) %>% update_gene_names(args$sgd_gene_list) %>%
as_tibble() as_tibble()
# Filter rows above delta background tolerance
df_above_tolerance <- df %>% filter(DB == 1)
df_na <- df %>% mutate(across(all_of(summary_vars), ~ ifelse(DB == 1, NA, .))) # formerly X
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
# Save some constants
max_conc <- max(df$conc_num_factor)
message("Calculating summary statistics before quality control") message("Calculating summary statistics before quality control")
df_stats <- calculate_summary_stats( df_stats <- calculate_summary_stats(
df = df, df = df,
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats
message("Calculating summary statistics after quality control") message("Calculating summary statistics after quality control")
df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X
ss <- calculate_summary_stats( ss <- calculate_summary_stats(
df = df_na, df = df_na,
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
df_na_ss <- ss$summary_stats df_na_ss <- ss$summary_stats
df_na_stats <- ss$df_with_stats df_na_stats <- ss$df_with_stats # formerly X_stats_ALL
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
# For plotting (ggplot warns on NAs) # For plotting (ggplot warns on NAs)
df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(summary_vars), is.finite)) df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite))
df_na_stats <- df_na_stats %>%
mutate(
WT_L = mean_L,
WT_K = mean_K,
WT_r = mean_r,
WT_AUC = mean_AUC,
WT_sd_L = sd_L,
WT_sd_K = sd_K,
WT_sd_r = sd_r,
WT_sd_AUC = sd_AUC
)
# Pull the background means and standard deviations from zero concentration for interactions
bg_stats <- df_na_stats %>%
filter(conc_num == 0) %>%
summarise(
mean_L = first(mean_L),
mean_K = first(mean_K),
mean_r = first(mean_r),
mean_AUC = first(mean_AUC),
sd_L = first(sd_L),
sd_K = first(sd_K),
sd_r = first(sd_r),
sd_AUC = first(sd_AUC)
)
message("Calculating summary statistics after quality control excluding zero values") message("Calculating summary statistics after quality control excluding zero values")
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
df_no_zeros_stats <- calculate_summary_stats( df_no_zeros_stats <- calculate_summary_stats(
df = df_no_zeros, df = df_no_zeros,
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor") group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")
)$df_with_stats )$df_with_stats
@@ -1142,7 +1112,8 @@ main <- function() {
message("Calculating summary statistics for L within 2SD of K") message("Calculating summary statistics for L within 2SD of K")
# TODO We're omitting the original z_max calculation, not sure if needed? # TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats ss <- calculate_summary_stats(df_na_within_2sd_k, "L",
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats
write.csv(ss, write.csv(ss,
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
row.names = FALSE) row.names = FALSE)
@@ -1153,7 +1124,7 @@ main <- function() {
write.csv(ss$summary_stats, write.csv(ss$summary_stats,
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
row.names = FALSE) row.names = FALSE)
# Each list of plots corresponds to a file # Each list of plots corresponds to a file
l_vs_k_plot_configs <- list( l_vs_k_plot_configs <- list(
plots = list( plots = list(
@@ -1200,6 +1171,7 @@ main <- function() {
) )
) )
df_above_tolerance <- df %>% filter(DB == 1)
above_threshold_plot_configs <- list( above_threshold_plot_configs <- list(
plots = list( plots = list(
list( list(
@@ -1226,26 +1198,26 @@ main <- function() {
) )
plate_analysis_plot_configs <- generate_plate_analysis_plot_configs( plate_analysis_plot_configs <- generate_plate_analysis_plot_configs(
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
df_before = df_stats, df_before = df_stats,
df_after = df_na_stats_filtered df_after = df_na_stats_filtered
) )
plate_analysis_boxplot_configs <- generate_plate_analysis_plot_configs( plate_analysis_boxplot_configs <- generate_plate_analysis_plot_configs(
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
df_before = df_stats, df_before = df_stats,
df_after = df_na_stats_filtered, df_after = df_na_stats_filtered,
plot_type = "box" plot_type = "box"
) )
plate_analysis_no_zeros_plot_configs <- generate_plate_analysis_plot_configs( plate_analysis_no_zeros_plot_configs <- generate_plate_analysis_plot_configs(
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
stages = c("after"), # Only after QC stages = c("after"), # Only after QC
df_after = df_no_zeros_stats df_after = df_no_zeros_stats
) )
plate_analysis_no_zeros_boxplot_configs <- generate_plate_analysis_plot_configs( plate_analysis_no_zeros_boxplot_configs <- generate_plate_analysis_plot_configs(
variables = summary_vars, variables = c("L", "K", "r", "AUC", "delta_bg"),
stages = c("after"), # Only after QC stages = c("after"), # Only after QC
df_after = df_no_zeros_stats, df_after = df_no_zeros_stats,
plot_type = "box" plot_type = "box"
@@ -1329,7 +1301,6 @@ main <- function() {
# generate_and_save_plots(config$out_dir, config$filename, config$plot_configs) # generate_and_save_plots(config$out_dir, config$filename, config$plot_configs)
# }, .options = furrr_options(seed = TRUE)) # }, .options = furrr_options(seed = TRUE))
# Process background strains
bg_strains <- c("YDL227C") bg_strains <- c("YDL227C")
lapply(bg_strains, function(strain) { lapply(bg_strains, function(strain) {
@@ -1349,15 +1320,17 @@ main <- function() {
# Recalculate summary statistics for the background strain # Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain") message("Calculating summary statistics for background strain")
ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor")) ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
summary_stats_bg <- ss_bg$summary_stats summary_stats_bg <- ss_bg$summary_stats
ss_bg_stats <- ss_bg$df_with_stats
write.csv(summary_stats_bg, write.csv(summary_stats_bg,
file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")), file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")),
row.names = FALSE) row.names = FALSE)
# Set the missing values to the highest theoretical value at each drug conc for L # Set the missing values to the highest theoretical value at each drug conc for L
# Leave other values as 0 for the max/min # Leave other values as 0 for the max/min
df_reference <- df_na_stats %>% # formerly X2_RF df_reference <- df_bg_stats %>% # formerly X2_RF
filter(OrfRep == strain) %>% filter(OrfRep == strain) %>%
filter(!is.na(L)) %>% filter(!is.na(L)) %>%
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
@@ -1384,10 +1357,10 @@ main <- function() {
message("Calculating reference strain interaction scores") message("Calculating reference strain interaction scores")
df_reference_stats <- calculate_summary_stats( df_reference_stats <- calculate_summary_stats(
df = df_reference, df = df_reference,
variables = interaction_vars, variables = c("L", "K", "r", "AUC"),
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
)$df_with_stats )$df_with_stats
reference_results <- calculate_interaction_scores(df_reference_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene", "num")) reference_results <- calculate_interaction_scores(df_reference_stats, bg_stats, group_vars = c("OrfRep", "Gene", "num"))
zscore_calculations_reference <- reference_results$calculations zscore_calculations_reference <- reference_results$calculations
zscore_interactions_reference <- reference_results$interactions zscore_interactions_reference <- reference_results$interactions
zscore_interactions_reference_joined <- reference_results$full_data zscore_interactions_reference_joined <- reference_results$full_data
@@ -1395,10 +1368,10 @@ main <- function() {
message("Calculating deletion strain(s) interactions scores") message("Calculating deletion strain(s) interactions scores")
df_deletion_stats <- calculate_summary_stats( df_deletion_stats <- calculate_summary_stats(
df = df_deletion, df = df_deletion,
variables = interaction_vars, variables = c("L", "K", "r", "AUC"),
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
)$df_with_stats )$df_with_stats
deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene")) deletion_results <- calculate_interaction_scores(df_deletion_stats, bg_stats, group_vars = c("OrfRep", "Gene"))
zscore_calculations <- deletion_results$calculations zscore_calculations <- deletion_results$calculations
zscore_interactions <- deletion_results$interactions zscore_interactions <- deletion_results$interactions
zscore_interactions_joined <- deletion_results$full_data zscore_interactions_joined <- deletion_results$full_data