diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index f603b89e..e0cf5831 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -245,27 +245,6 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c return(summary_stats) } -# Calculate L values within and outside 2SD of K -calculate_l_2sd_of_k <- function(df, df_stats_by_k) { - # Join the statistics to the main dataframe - df_joined <- df %>% - filter(!is.na(L)) %>% - left_join(df_stats_by_k, by = "conc_num_factor") - - # Filter data within 2SD and outside 2SD - df_within_2sd_k <- df_joined %>% - filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) - - df_outside_2sd_k <- df_joined %>% - filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) - - # Select relevant columns to avoid duplicated columns from the join - df_within_2sd_k <- df_within_2sd_k %>% select(names(df)) - df_outside_2sd_k <- df_outside_2sd_k %>% select(names(df)) - - list(within_2sd_k = df_within_2sd_k, outside_2sd_k = df_outside_2sd_k) -} - # Ensure all plots are saved and printed to PDF save_plots <- function(file_name, plot_list, output_dir) { # Save to PDF @@ -294,7 +273,6 @@ save_plots <- function(file_name, plot_list, output_dir) { }) } - # Calculate background strain mean values calculate_bg_means <- function(df_stats_by_l, df_stats_by_k, df_stats_by_r, df_stats_by_auc) { list( @@ -573,11 +551,14 @@ main <- function() { # Flag and remove non-finite data, printing affected rows df_na_filtered <- df_na %>% - filter(if_any(c(L, r, AUC, K), ~ !is.finite(.))) %>% - { - if (nrow(.) > 0) message("Removing non-finite rows:\n", print(.)) - df_na %>% filter(if_all(c(L, r, AUC, K), is.finite)) - } + filter(if_any(c(L), ~ !is.finite(.))) %>% # Add L, r, AUC, K as needed for debugging + { + if (nrow(.) > 0) { + message("Removing non-finite rows:\n") + #print(.) + } + df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging + } # # Generate QC PDFs and HTMLs # message("Generating QC plots") @@ -596,10 +577,33 @@ main <- function() { write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) stats_joined <- left_join(df_na, stats, by = c("conc_num", "conc_num_factor")) + # Create separate dataframes for each variable (we'll use later for plotting) stats_by_l <- stats_joined %>% select(starts_with("L_"), "OrfRep", "conc_num", "conc_num_factor") stats_by_k <- stats_joined %>% select(starts_with("K_"), "OrfRep", "conc_num", "conc_num_factor") stats_by_r <- stats_joined %>% select(starts_with("r_"), "OrfRep", "conc_num", "conc_num_factor") stats_by_auc <- stats_joined %>% select(starts_with("AUC_"), "OrfRep", "conc_num", "conc_num_factor") + + stats_by_k_joined <- left_join(df_na, stats_by_k, by = c("conc_num_factor")) # TODO may need to add OrfRep, conc_num, etc + + # Filter data within 2SD + within_2sd_k <- stats_by_k_joined %>% + filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) + + # Filter data outside 2SD + outside_2sd_k <- stats_by_k_joined %>% + filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) + + # Calculate summary statistics for L within and outside 2SD of K + message("Calculating summary statistics for L within 2SD of K") + l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + write.csv(l_within_2sd_k, + file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), + row.names = FALSE) + message("Calculating summary statistics for for L outside 2SD of K") + l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + write.csv(l_outside_2sd_k, + file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), + row.names = FALSE) # Process background strains background_strains <- c("YDL227C") @@ -630,26 +634,6 @@ main <- function() { file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "Gene", "conc_num", "conc_num_factor")) - - # Filter L values within and outside 2SD of K - results_2sd <- calculate_l_2sd_of_k(df_bg, stats_by_k_bg) - within_2sd_k <- results_2sd$within_2sd_k - outside_2sd_k <- results_2sd$outside_2sd_k - - # Calculate summary statistics for L within and outside 2SD of K - message("Calculating summary statistics for for L within 2SD of K") - l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - write.csv(l_within_2sd_k, - file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), - row.names = FALSE) - message("Calculating summary statistics for for L outside 2SD of K") - l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - write.csv(l_outside_2sd_k, - file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), - row.names = FALSE) - - message("Generating Raw_L_vs_K_for_strains_outside_2sd_k plots") - generate_and_save_plots(outside_2sd_k, out_dir, "Raw_L_vs_K_for_strains_outside_2sd_k") message("Calculating background means") background_means <- calculate_bg_means(stats_by_l_bg, stats_by_k_bg, stats_by_r_bg, stats_by_auc_bg)