From 268d7decb5a8fca5a2d8140c25298a429646d49b Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Tue, 17 Sep 2024 18:54:44 -0400 Subject: [PATCH] Tidy up groupings --- .../apps/r/calculate_interaction_zscores.R | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index ec7763ce..8dd976c3 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -160,22 +160,33 @@ update_gene_names <- function(df, sgd_gene_list) { calculate_summary_stats <- function(df, variables, group_vars) { + summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise( - N = sum(!is.na(L)), + N = n(), + sd_check = sd(L, na.rm = TRUE), # Test sd on a specific variable, e.g., L + se_check = sd(L, na.rm = TRUE) / sqrt(N) # Test se on a specific variable, e.g., L + ) + + summary_stats <- df %>% + group_by(across(all_of(group_vars))) %>% + summarise( + N = n(), across(all_of(variables), list( mean = ~mean(., na.rm = TRUE), median = ~median(., na.rm = TRUE), - max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), - min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), + max = ~max(., na.rm = TRUE), + min = ~min(., na.rm = TRUE), sd = ~sd(., na.rm = TRUE), - se = ~ifelse(all(is.na(.)), NA, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)) - ), - .names = "{.fn}_{.col}"), + se = ~sd(., na.rm = TRUE) / sqrt(N) # Corrected SE calculation + ), .names = "{.fn}_{.col}"), .groups = "drop" ) + # sd = ~sd(., na.rm = TRUE) + # se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) + # Create a cleaned version of df that doesn't overlap with summary_stats cols_to_keep <- setdiff(names(df), names(summary_stats)[-which(names(summary_stats) %in% group_vars)]) df_cleaned <- df %>% @@ -193,22 +204,22 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) { # Pull the background means and standard deviations from zero concentration bg_means <- list( - L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(), - K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(), - r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(), - AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() + L = df %>% filter(conc_num == 0) %>% pull(mean_L) %>% first(), + K = df %>% filter(conc_num == 0) %>% pull(mean_K) %>% first(), + r = df %>% filter(conc_num == 0) %>% pull(mean_r) %>% first(), + AUC = df %>% filter(conc_num == 0) %>% pull(mean_AUC) %>% first() ) bg_sd <- list( - L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(), - K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(), - r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(), - AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() + L = df %>% filter(conc_num == 0) %>% pull(sd_L) %>% first(), + K = df %>% filter(conc_num == 0) %>% pull(sd_K) %>% first(), + r = df %>% filter(conc_num == 0) %>% pull(sd_r) %>% first(), + AUC = df %>% filter(conc_num == 0) %>% pull(sd_AUC) %>% first() ) stats <- calculate_summary_stats(df, variables = variables, - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor" + group_vars = c("OrfRep", "Gene", "num", "conc_num_factor" ))$summary_stats stats <- df %>% @@ -346,10 +357,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) { "NG", "SM", "DB") calculations_joined <- df %>% select(-any_of(setdiff(names(calculations), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")))) - calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num_factor")) interactions_joined <- df %>% select(-any_of(setdiff(names(interactions), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")))) - interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num_factor")) return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined, calculations_joined = calculations_joined)) @@ -819,7 +830,7 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { ) ) } else { - x_var <- paste0("Rank", variable) + x_var <- paste0("Rank_", variable) y_var <- paste0("Rank_lm_", variable) title_suffix <- paste("Rank Avg Zscore vs lm", variable) rectangles <- NULL @@ -927,11 +938,10 @@ generate_correlation_plot_configs <- function(df) { filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE, rank = FALSE, limits_map = NULL, verbose = TRUE) { - # Precompute Column Names for Efficiency avg_zscore_cols <- paste0("Avg_Zscore_", variables) z_lm_cols <- paste0("Z_lm_", variables) - # Adjust NAs if 'adjust' is TRUE + # Adjust NAs to .001 for linear model if (adjust) { if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns.") df <- df %>% @@ -941,25 +951,23 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL ) } - # Filter Non-Finite Values if 'nf' is TRUE + # Filter non-finite values if (nf) { if (verbose) message("Filtering non-finite values for variables: ", paste(variables, collapse = ", ")) - # Identify non-finite rows for logging non_finite_df <- df %>% filter(if_any(all_of(variables), ~ !is.finite(.))) if (verbose && nrow(non_finite_df) > 0) { - message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":") + message("Filtering non-finite rows for variables ", paste(variables, collapse = ", "), ":") print(non_finite_df) } - # Keep only rows where all specified variables are finite df <- df %>% filter(if_all(all_of(variables), ~ is.finite(.))) } - # Filter Missing Values if 'missing' is TRUE + # Filter missing values if (missing) { if (verbose) message("Filtering missing values for variables: ", paste(variables, collapse = ", ")) @@ -977,7 +985,7 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL filter(if_all(all_of(variables), ~ !is.na(.))) } - # Apply Limits from 'limits_map' if Provided + # Apply Limits from 'limits_map' if provided if (!is.null(limits_map)) { for (variable in names(limits_map)) { if (variable %in% variables) { @@ -985,7 +993,6 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL if (verbose) message("Applying limits for variable ", variable, ": [", ylim_vals[1], ", ", ylim_vals[2], "].") - # Identify out-of-range data for logging out_of_range_df <- df %>% filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2]) @@ -994,7 +1001,6 @@ filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FAL print(out_of_range_df) } - # Keep only rows within the specified limits df <- df %>% filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2]) } @@ -1053,7 +1059,7 @@ main <- function() { ss <- calculate_summary_stats( df = df, variables = summary_vars, - group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + group_vars = c("OrfRep", "conc_num_factor")) df_stats <- ss$df_with_stats message("Filtering non-finite data") df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE) @@ -1062,7 +1068,7 @@ main <- function() { ss <- calculate_summary_stats( df = df_na, variables = summary_vars, - group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + group_vars = c("OrfRep", "conc_num_factor")) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) @@ -1072,7 +1078,7 @@ main <- function() { ss <- calculate_summary_stats( df = df_no_zeros, variables = summary_vars, - group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + group_vars = c("OrfRep", "conc_num_factor")) df_no_zeros_stats <- ss$df_with_stats df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE) @@ -1084,14 +1090,14 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num_factor")) # df_na_l_within_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num_factor")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), @@ -1266,7 +1272,7 @@ main <- function() { # Set the missing values to the highest theoretical value at each drug conc for L # Leave other values as 0 for the max/min reference_strain <- df_reference %>% - group_by(conc_num) %>% + group_by(conc_num_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1276,7 +1282,7 @@ main <- function() { # Ditto for deletion strains deletion_strains <- df_deletion %>% - group_by(conc_num) %>% + group_by(conc_num_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),