Left join summary stats df with input df

This commit is contained in:
2024-09-01 18:45:40 -04:00
parent f4c9985e40
commit f1cc7b8404

View File

@@ -216,27 +216,24 @@ generate_and_save_plots <- function(df, output_dir, prefix, variables, include_q
if (include_qc) {
plots[["Raw_L_vs_K"]] <-
generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC")
title = "Raw L vs K before QC")
plots[["Delta_bg_Density"]] <-
generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num",
title = "Density plot for Delta Background by Conc All Data")
title = "Density plot for Delta Background by Conc All Data")
plots[["Delta_bg_Bar"]] <-
generate_plot(df, x_var = "delta_bg", plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data")
title = "Bar plot for Delta Background by Conc All Data")
}
save_plots(prefix, plots, output_dir)
}
# Calculate summary statistics for all variables together
# Calculate summary statistics for all variables
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
# Calculate summary statistics with the grouping columns
summary_stats <- df %>%
group_by(across(all_of(group_vars))) %>%
summarise(across(all_of(variables), list(
N = ~{
message("Calculating summary statistics for ", cur_column())
n()
},
mean = ~mean(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE),
max = ~max(.x, na.rm = TRUE),
@@ -299,7 +296,7 @@ save_plots <- function(file_name, plot_list, output_dir) {
# Calculate background strain mean values
calculate_background_means <- function(df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc) {
calculate_bg_means <- function(df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc) {
list(
L = df_stats_by_l %>% filter(conc_num_factor == 0) %>% pull(mean_L),
K = df_stats_by_k %>% filter(conc_num_factor == 0) %>% pull(mean_K),
@@ -574,12 +571,20 @@ main <- function() {
)
df_no_zeros <- df_na %>% filter(L > 0)
# Flag and remove non-finite data, printing affected rows
df_na_filtered <- df_na %>%
filter(if_any(c(L, r, AUC, K), ~ !is.finite(.))) %>%
{
if (nrow(.) > 0) message("Removing non-finite rows:\n", print(.))
df_na %>% filter(if_all(c(L, r, AUC, K), is.finite))
}
# Generate QC PDFs and HTMLs
message("Generating QC plots")
variables <- c("L", "K", "r", "AUC", "delta_bg")
generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
generate_and_save_plots(df_na, out_dir_qc, "After_QC", variables)
generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
rm(df, df_above_tolerance, df_no_zeros)
@@ -587,14 +592,14 @@ main <- function() {
# Calculate summary statistics
message("Calculating summary statistics for all strains")
variables <- c("L", "K", "r", "AUC")
summary_stats <- calculate_summary_stats(df_na, variables, group_vars = c("conc_num", "conc_num_factor"))
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
stats <- calculate_summary_stats(df_na, variables, group_vars = c("conc_num", "conc_num_factor"))
write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
stats_joined <- left_join(df_na, stats, by = c("conc_num", "conc_num_factor"))
# Calculate the pre-background stats once
stats_by_l <- summary_stats %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_k <- summary_stats %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_r <- summary_stats %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_auc <- summary_stats %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_l <- stats_joined %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_k <- stats_joined %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_r <- stats_joined %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_auc <- stats_joined %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor")
# Process background strains
background_strains <- c("YDL227C")
@@ -604,7 +609,7 @@ main <- function() {
# Handle missing data by setting zero values to NA
# and then removing any rows with NA in L col
df_background <- df_na %>%
df_bg <- df_na %>%
filter(OrfRep == strain) %>%
mutate(
L = if_else(L == 0, NA, L),
@@ -616,17 +621,18 @@ main <- function() {
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
stats_background <- calculate_summary_stats(df_background, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
stats_by_l_background <- stats_background %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_k_background <- stats_background %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_r_background <- stats_background %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_auc_background <- stats_background %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor")
write.csv(stats_background,
stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
stats_by_l_bg <- stats_bg %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_k_bg <- stats_bg %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_r_bg <- stats_bg %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor")
stats_by_auc_bg <- stats_bg %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor")
write.csv(stats_bg,
file = file.path(output_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
row.names = FALSE)
stats_bg_joined <- left_join(df_bg, stats_bg, by = c("conc_num", "conc_num_factor"))
# Filter L values within and outside 2SD of K
results_2sd <- calculate_l_2sd_of_k(df_background, stats_by_k_background)
results_2sd <- calculate_l_2sd_of_k(df_bg, stats_by_k_bg)
within_2sd_k <- results_2sd$within_2sd_k
outside_2sd_k <- results_2sd$outside_2sd_k
@@ -646,8 +652,7 @@ main <- function() {
generate_and_save_plots(outside_2sd_k, out_dir, "Raw_L_vs_K_for_strains_outside_2sd_k")
message("Calculating background means")
background_means <- calculate_background_means(stats_by_l_background,
stats_by_k_background, stats_by_r_background, stats_by_auc_background)
background_means <- calculate_bg_means(stats_by_l_bg, stats_by_k_bg, stats_by_r_bg, stats_by_auc_bg)
# Filter reference and deletion strains
# Formerly X2_RF (reference strain)