diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index e452a894..4b35d257 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -133,7 +133,6 @@ load_and_process_data <- function(easy_results_file, sd = 3) { return(df) } - # Update Gene names using the SGD gene list update_gene_names <- function(df, sgd_gene_list) { # Load SGD gene list @@ -184,21 +183,20 @@ process_strains <- function(df) { # Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { - # Generate summary statistics summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% reframe(across(all_of(variables), list( - N = ~sum(!is.na(.)), # Count of non-NA values - mean = ~mean(., na.rm = TRUE), # Mean ignoring NAs - median = ~median(., na.rm = TRUE), # Median ignoring NAs - max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), # Return NA if all values are NA - min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), # Return NA if all values are NA - sd = ~sd(., na.rm = TRUE), # Standard deviation ignoring NAs - se = ~ifelse(N > 1, sd(., na.rm = TRUE) / sqrt(N - 1), NA) # Standard Error using precomputed N + mean = ~mean(., na.rm = TRUE), + median = ~median(., na.rm = TRUE), + max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), + min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), + sd = ~sd(., na.rm = TRUE), + se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA) # TODO: not in original stats but better to do here than in calculate_interactions? 
# z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA, # (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score - ), .names = "{.fn}_{.col}")) + ), .names = "{.fn}_{.col}"), + N = sum(!is.na(L))) # per-group count of non-NA L values, computed inside reframe() while L is still in scope # Join the summary stats back to the original dataframe df_with_stats <- left_join(df, summary_stats, by = group_vars) @@ -207,7 +205,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) } - +# Calculate interaction scores calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { # Calculate total concentration variables @@ -690,6 +688,9 @@ main <- function() { write.csv(summary_stats_bg, file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) + + print("Background summary stats:") + print(head(summary_stats_bg)) # Filter reference and deletion strains # Formerly X2_RF (reference strain)