Use a single N value

2024-09-05 20:18:36 -04:00
parent 6d678e661b
commit e82b6c5e59
1 changed file with 12 additions and 11 deletions
--- a/workflow/apps/r/calculate_interaction_zscores5.R
+++ b/workflow/apps/r/calculate_interaction_zscores5.R
@@ -133,7 +133,6 @@ load_and_process_data <- function(easy_results_file, sd = 3) {
  return(df)
 }
 # Update Gene names using the SGD gene list
 update_gene_names <- function(df, sgd_gene_list) {
  # Load SGD gene list
@@ -184,21 +183,20 @@ process_strains <- function(df) {
 # Calculate summary statistics for all variables
 calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
  # Generate summary statistics
  summary_stats <- df %>%
    group_by(across(all_of(group_vars))) %>%
    reframe(across(all_of(variables), list(
-      N = ~sum(!is.na(.)),  # Count of non-NA values
+      mean = ~mean(., na.rm = TRUE),
-      mean = ~mean(., na.rm = TRUE),  # Mean ignoring NAs
+      median = ~median(., na.rm = TRUE),
-      median = ~median(., na.rm = TRUE),  # Median ignoring NAs
+      max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
-      max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),  # Return NA if all values are NA
+      min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
-      min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),  # Return NA if all values are NA
+      sd = ~sd(., na.rm = TRUE),
-      sd = ~sd(., na.rm = TRUE),  # Standard deviation ignoring NAs
+      se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA)
      se = ~ifelse(N > 1, sd(., na.rm = TRUE) / sqrt(N - 1), NA)  # Standard Error using precomputed N
      # TODO: not in original stats but better to do here than in calculate_interactions?
      # z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA,
      #   (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE))  # Z-score
-    ), .names = "{.fn}_{.col}"))
+    ), .names = "{.fn}_{.col}")) %>%
    mutate(N = ~sum(!is.na(L))) # count of non-NA L values
  # Join the summary stats back to the original dataframe
  df_with_stats <- left_join(df, summary_stats, by = group_vars)
@@ -207,7 +205,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
  return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
 }
-
+# Calculate interaction scores
 calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
  # Calculate total concentration variables
@@ -691,6 +689,9 @@ main <- function() {
        file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
        row.names = FALSE)
      print("Background summary stats:")
      print(head(summary_stats_bg))
      # Filter reference and deletion strains
      # Formerly X2_RF (reference strain)
      df_reference <- df_bg_stats %>%