diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index bae8a882..4b2424f2 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -216,27 +216,24 @@ generate_and_save_plots <- function(df, output_dir, prefix, variables, include_q if (include_qc) { plots[["Raw_L_vs_K"]] <- generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter", - title = "Raw L vs K before QC") + title = "Raw L vs K before QC") plots[["Delta_bg_Density"]] <- generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num", - title = "Density plot for Delta Background by Conc All Data") + title = "Density plot for Delta Background by Conc All Data") plots[["Delta_bg_Bar"]] <- generate_plot(df, x_var = "delta_bg", plot_type = "bar", - title = "Bar plot for Delta Background by Conc All Data") + title = "Bar plot for Delta Background by Conc All Data") } save_plots(prefix, plots, output_dir) } -# Calculate summary statistics for all variables together +# Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { + # Calculate summary statistics with the grouping columns summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise(across(all_of(variables), list( - N = ~{ - message("Calculating summary statistics for ", cur_column()) - n() - }, mean = ~mean(.x, na.rm = TRUE), median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE), @@ -299,7 +296,7 @@ save_plots <- function(file_name, plot_list, output_dir) { # Calculate background strain mean values -calculate_background_means <- function(df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc) { +calculate_bg_means <- function(df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc) { list( L = df_stats_by_l %>% filter(conc_num_factor == 0) %>% pull(mean_L), K = df_stats_by_k %>% filter(conc_num_factor == 0) %>% pull(mean_K), @@ -573,13 +570,21 @@ main <- function() { K = ifelse(DB == 1, NA, K) ) df_no_zeros <- df_na %>% filter(L > 0) + + # Flag and remove non-finite data, printing affected rows + df_na_filtered <- df_na %>% + filter(if_any(c(L, r, AUC, K), ~ !is.finite(.))) %>% + { + if (nrow(.) > 0) message("Removing non-finite rows:\n", print(.)) + df_na %>% filter(if_all(c(L, r, AUC, K), is.finite)) + } # Generate QC PDFs and HTMLs message("Generating QC plots") variables <- c("L", "K", "r", "AUC", "delta_bg") generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) - generate_and_save_plots(df_na, out_dir_qc, "After_QC", variables) + generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) rm(df, df_above_tolerance, df_no_zeros) @@ -587,14 +592,14 @@ main <- function() { # Calculate summary statistics message("Calculating summary statistics for all strains") variables <- c("L", "K", "r", "AUC") - summary_stats <- calculate_summary_stats(df_na, variables, group_vars = c("conc_num", "conc_num_factor")) - write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - - # Calculate the pre-background stats once - stats_by_l <- summary_stats %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_k <- summary_stats %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_r <- summary_stats %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_auc <- summary_stats %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") + stats <- calculate_summary_stats(df_na, variables, group_vars = c("conc_num", "conc_num_factor")) + write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) + stats_joined <- left_join(df_na, stats, by = c("conc_num", "conc_num_factor")) + + stats_by_l <- stats_joined %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") + stats_by_k <- stats_joined %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") + stats_by_r <- stats_joined %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") + stats_by_auc <- stats_joined %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") # Process background strains background_strains <- c("YDL227C") @@ -604,7 +609,7 @@ main <- function() { # Handle missing data by setting zero values to NA # and then removing any rows with NA in L col - df_background <- df_na %>% + df_bg <- df_na %>% filter(OrfRep == strain) %>% mutate( L = if_else(L == 0, NA, L), @@ -616,17 +621,18 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - stats_background <- calculate_summary_stats(df_background, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) - stats_by_l_background <- stats_background %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_k_background <- stats_background %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_r_background <- stats_background %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") - stats_by_auc_background <- stats_background %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") - write.csv(stats_background, + stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) + stats_by_l_bg <- stats_bg %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") + stats_by_k_bg <- stats_bg %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") + stats_by_r_bg <- stats_bg %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") + stats_by_auc_bg <- stats_bg %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") + write.csv(stats_bg, file = file.path(output_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - + stats_bg_joined <- left_join(df_bg, stats_bg, by = c("conc_num", "conc_num_factor")) + # Filter L values within and outside 2SD of K - results_2sd <- calculate_l_2sd_of_k(df_background, stats_by_k_background) + results_2sd <- calculate_l_2sd_of_k(df_bg, stats_by_k_bg) within_2sd_k <- results_2sd$within_2sd_k outside_2sd_k <- results_2sd$outside_2sd_k @@ -646,8 +652,7 @@ main <- function() { generate_and_save_plots(outside_2sd_k, out_dir, "Raw_L_vs_K_for_strains_outside_2sd_k") message("Calculating background means") - background_means <- calculate_background_means(stats_by_l_background, - stats_by_k_background, stats_by_r_background, stats_by_auc_background) + background_means <- calculate_bg_means(stats_by_l_bg, stats_by_k_bg, stats_by_r_bg, stats_by_auc_bg) # Filter reference and deletion strains # Formerly X2_RF (reference strain)