Przeglądaj źródła

Interactions refactor to improve lm object handling

Bryan Roessler 8 miesięcy temu
rodzic
commit
fa7544cb0e
1 zmienionych plików z 56 dodań i 65 usunięć
  1. 56 65
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 56 - 65
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -204,64 +204,59 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
     AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
   )
 
+  # Main statistics and shifts
   stats <- df %>%
     mutate(
-      WT_L = df$mean_L,
-      WT_K = df$mean_K,
-      WT_r = df$mean_r,
-      WT_AUC = df$mean_AUC,
-      WT_sd_L = df$sd_L,
-      WT_sd_K = df$sd_K,
-      WT_sd_r = df$sd_r,
-      WT_sd_AUC = df$sd_AUC
+      WT_L = mean_L,
+      WT_K = mean_K,
+      WT_r = mean_r,
+      WT_AUC = mean_AUC,
+      WT_sd_L = sd_L,
+      WT_sd_K = sd_K,
+      WT_sd_r = sd_r,
+      WT_sd_AUC = sd_AUC
     ) %>%
     group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
-      mutate(
-        N = sum(!is.na(L)),
-        NG = sum(NG, na.rm = TRUE),
-        DB = sum(DB, na.rm = TRUE),
-        SM = sum(SM, na.rm = TRUE),
-        across(all_of(variables), list(
-          mean = ~mean(., na.rm = TRUE),
-          median = ~median(., na.rm = TRUE),
-          max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
-          min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
-          sd = ~sd(., na.rm = TRUE),
-          se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA)
-        ), .names = "{.fn}_{.col}")
-      ) %>%
+    mutate(
+      N = sum(!is.na(L)),
+      NG = sum(NG, na.rm = TRUE),
+      DB = sum(DB, na.rm = TRUE),
+      SM = sum(SM, na.rm = TRUE),
+      across(all_of(variables), list(
+        mean = ~mean(., na.rm = TRUE),
+        median = ~median(., na.rm = TRUE),
+        max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
+        min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
+        sd = ~sd(., na.rm = TRUE),
+        se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA)
+      ), .names = "{.fn}_{.col}")
+    ) %>%
     ungroup()
 
   stats <- stats %>%
     group_by(across(all_of(group_vars))) %>%
-      mutate(
-        Raw_Shift_L = mean_L[[1]] - bg_means$L,
-        Raw_Shift_K = mean_K[[1]] - bg_means$K,
-        Raw_Shift_r = mean_r[[1]] - bg_means$r,
-        Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC,
-        Z_Shift_L = Raw_Shift_L[[1]] / bg_sd$L,
-        Z_Shift_K = Raw_Shift_K[[1]] / bg_sd$K,
-        Z_Shift_r = Raw_Shift_r[[1]] / bg_sd$r,
-        Z_Shift_AUC = Raw_Shift_AUC[[1]] / bg_sd$AUC
-      )
+    mutate(
+      Raw_Shift_L = mean_L - bg_means$L,
+      Raw_Shift_K = mean_K - bg_means$K,
+      Raw_Shift_r = mean_r - bg_means$r,
+      Raw_Shift_AUC = mean_AUC - bg_means$AUC,
+      Z_Shift_L = Raw_Shift_L / bg_sd$L,
+      Z_Shift_K = Raw_Shift_K / bg_sd$K,
+      Z_Shift_r = Raw_Shift_r / bg_sd$r,
+      Z_Shift_AUC = Raw_Shift_AUC / bg_sd$AUC
+    )
 
   stats <- stats %>%
     mutate(
       Exp_L = WT_L + Raw_Shift_L,
       Exp_K = WT_K + Raw_Shift_K,
       Exp_r = WT_r + Raw_Shift_r,
-      Exp_AUC = WT_AUC + Raw_Shift_AUC
-    )
-
-  stats <- stats %>%
-    mutate(
+      Exp_AUC = WT_AUC + Raw_Shift_AUC,
       Delta_L = mean_L - Exp_L,
       Delta_K = mean_K - Exp_K,
       Delta_r = mean_r - Exp_r,
       Delta_AUC = mean_AUC - Exp_AUC
-    )
-
-  stats <- stats %>%
+    ) %>%
     mutate(
       Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
       Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
@@ -270,39 +265,34 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
       Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
     )
 
-  stats <- stats %>%
-    mutate(
-      Zscore_L = Delta_L / WT_sd_L,
-      Zscore_K = Delta_K / WT_sd_K,
-      Zscore_r = Delta_r / WT_sd_r,
-      Zscore_AUC = Delta_AUC / WT_sd_AUC
-    )
-
+  # Create linear models with proper error handling for insufficient data
   lms <- stats %>%
     summarise(
-      lm_L = list(lm(Delta_L ~ conc_num_factor)),
-      lm_K = list(lm(Delta_K ~ conc_num_factor)),
-      lm_r = list(lm(Delta_r ~ conc_num_factor)),
-      lm_AUC = list(lm(Delta_AUC ~ conc_num_factor))
+      lm_L = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_L)) > 1) list(lm(Delta_L ~ conc_num_factor)) else NULL,
+      lm_K = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_K)) > 1) list(lm(Delta_K ~ conc_num_factor)) else NULL,
+      lm_r = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_r)) > 1) list(lm(Delta_r ~ conc_num_factor)) else NULL,
+      lm_AUC = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_AUC)) > 1) list(lm(Delta_AUC ~ conc_num_factor)) else NULL
     )
 
+  # Join models and calculate interaction scores
   stats <- stats %>%
     left_join(lms, by = group_vars) %>%
     mutate(
-      lm_Score_L = sapply(lm_L, function(model) coef(model)[2] * max_conc + coef(model)[1]),
-      lm_Score_K = sapply(lm_K, function(model) coef(model)[2] * max_conc + coef(model)[1]),
-      lm_Score_r = sapply(lm_r, function(model) coef(model)[2] * max_conc + coef(model)[1]),
-      lm_Score_AUC = sapply(lm_AUC, function(model) coef(model)[2] * max_conc + coef(model)[1]),
-      R_Squared_L = sapply(lm_L, function(model) summary(model)$r.squared),
-      R_Squared_K = sapply(lm_K, function(model) summary(model)$r.squared),
-      R_Squared_r = sapply(lm_r, function(model) summary(model)$r.squared),
-      R_Squared_AUC = sapply(lm_AUC, function(model) summary(model)$r.squared),
+      lm_Score_L = sapply(lm_L, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      lm_Score_K = sapply(lm_K, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      lm_Score_r = sapply(lm_r, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      lm_Score_AUC = sapply(lm_AUC, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      R_Squared_L = sapply(lm_L, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
+      R_Squared_K = sapply(lm_K, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
+      R_Squared_r = sapply(lm_r, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
+      R_Squared_AUC = sapply(lm_AUC, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
       Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
       Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
       Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
       Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE)
     )
 
+  # Calculate Z-scores
   stats <- stats %>%
     mutate(
       Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
@@ -315,9 +305,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
       Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
     )
 
-  # Declare column order for output
+  # Final output preparation
   calculations <- stats %>%
-    select("OrfRep", "Gene", "num", "conc_num", "conc_num_factor",
+    select(
+      "OrfRep", "Gene", "num", "conc_num", "conc_num_factor",
       "mean_L", "mean_K", "mean_r", "mean_AUC",
       "median_L", "median_K", "median_r", "median_AUC",
       "sd_L", "sd_K", "sd_r", "sd_AUC",
@@ -332,10 +323,9 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
       "NG", "SM", "DB") %>%
     ungroup()
 
-  # Also arrange results by Z_lm_L and NG
   interactions <- stats %>%
-    select("OrfRep", "Gene", "num",
-      "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_AUC", "Raw_Shift_r",
+    select(
+      "OrfRep", "Gene", "num", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_AUC", "Raw_Shift_r",
       "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC",
       "lm_Score_L", "lm_Score_K", "lm_Score_AUC", "lm_Score_r",
       "R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC",
@@ -350,6 +340,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
   return(list(calculations = calculations, interactions = interactions))
 }
 
+
 generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
 
   message("Generating html and pdf plots for: ", file_name, ".pdf|html")