Procházet zdrojové kódy

Move more lm calcs out of dplyr

Bryan Roessler před 7 měsíci
rodič
revize
54b32f47c7
1 změnil soubory, kde provedl 31 přidání a 20 odebrání
  1. 31 20
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 31 - 20
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -277,23 +277,23 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
   # Create linear models with error handling for missing/insufficient data
   lms <- stats %>%
     summarise(
-      lm_L = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_L)) > 1) list(lm(Delta_L ~ conc_num_factor)) else NULL,
-      lm_K = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_K)) > 1) list(lm(Delta_K ~ conc_num_factor)) else NULL,
-      lm_r = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_r)) > 1) list(lm(Delta_r ~ conc_num_factor)) else NULL,
-      lm_AUC = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_AUC)) > 1) list(lm(Delta_AUC ~ conc_num_factor)) else NULL
+      lm_L = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_L)) > 1) list(lm(Delta_L ~ conc_num_factor)) else NA,
+      lm_K = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_K)) > 1) list(lm(Delta_K ~ conc_num_factor)) else NA,
+      lm_r = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_r)) > 1) list(lm(Delta_r ~ conc_num_factor)) else NA,
+      lm_AUC = if (n_distinct(conc_num_factor) > 1 && sum(!is.na(Delta_AUC)) > 1) list(lm(Delta_AUC ~ conc_num_factor)) else NA
     )
 
   stats <- stats %>%
     left_join(lms, by = group_vars) %>%
     mutate(
-      lm_Score_L = sapply(lm_L, function(model) if (!is.na(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
-      lm_Score_K = sapply(lm_K, function(model) if (!is.na(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
-      lm_Score_r = sapply(lm_r, function(model) if (!is.na(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
-      lm_Score_AUC = sapply(lm_AUC, function(model) if (!is.na(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
-      R_Squared_L = sapply(lm_L, function(model) if (!is.na(model)) summary(model)$r.squared else NA),
-      R_Squared_K = sapply(lm_K, function(model) if (!is.na(model)) summary(model)$r.squared else NA),
-      R_Squared_r = sapply(lm_r, function(model) if (!is.na(model)) summary(model)$r.squared else NA),
-      R_Squared_AUC = sapply(lm_AUC, function(model) if (!is.na(model)) summary(model)$r.squared else NA),
+      lm_Score_L = lapply(lm_L, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      lm_Score_K = lapply(lm_K, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      lm_Score_r = lapply(lm_r, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      lm_Score_AUC = lapply(lm_AUC, function(model) if (!is.null(model)) coef(model)[2] * max_conc + coef(model)[1] else NA),
+      R_Squared_L = lapply(lm_L, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
+      R_Squared_K = lapply(lm_K, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
+      R_Squared_r = lapply(lm_r, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
+      R_Squared_AUC = lapply(lm_AUC, function(model) if (!is.null(model)) summary(model)$r.squared else NA),
       Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
       Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
       Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
@@ -302,6 +302,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
 
   stats <- stats %>%
     mutate(
+      lm_Score_L = unlist(lm_Score_L),
+      lm_Score_K = unlist(lm_Score_K),
+      lm_Score_r = unlist(lm_Score_r),
+      lm_Score_AUC = unlist(lm_Score_AUC),
       Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
       Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
       Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
@@ -693,6 +697,7 @@ main <- function() {
 
     summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
     group_vars <- c("OrfRep", "conc_num", "conc_num_factor") # default fields to group by
+    orf_group_vars <- c("OrfRep", "Gene", "num")
     print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
       "delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
     
@@ -1024,10 +1029,10 @@ main <- function() {
       message("Calculating interaction scores")
       # print("Reference strain:")
       # print(head(reference_strain))
-      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars)
+      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = orf_group_vars)
       # print("Deletion strains:")
       # print(head(deletion_strains))
-      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars)
+      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = orf_group_vars)
       
       zscores_calculations_reference <- reference_results$calculations
       zscores_interactions_reference <- reference_results$interactions
@@ -1114,11 +1119,20 @@ main <- function() {
       
       # Formerly X_NArm
       zscores_interactions_filtered <- zscores_interactions %>%
-        group_by(across(all_of(group_vars))) %>%
+        group_by(across(all_of(orf_group_vars))) %>%
           filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L))
       
       # Final filtered correlation calculations and plots
+      lm_results <- zscores_interactions_filtered %>%
+        summarise(
+          lm_R_squared_L = if (n() > 1) summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared else NA,
+          lm_R_squared_K = if (n() > 1) summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared else NA,
+          lm_R_squared_r = if (n() > 1) summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared else NA,
+          lm_R_squared_AUC = if (n() > 1) summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared else NA
+        )
+
       zscores_interactions_filtered <- zscores_interactions_filtered %>%
+        left_join(lm_results, by = orf_group_vars) %>%
         mutate(
           Overlap = case_when(
             Z_lm_L >= 2 & Avg_Zscore_L >= 2 ~ "Deletion Enhancer Both",
@@ -1128,14 +1142,11 @@ main <- function() {
             Z_lm_L >= 2 & Avg_Zscore_L <= -2 ~ "Deletion Enhancer lm, Deletion Suppressor Avg Z score",
             Z_lm_L <= -2 & Avg_Zscore_L >= 2 ~ "Deletion Suppressor lm, Deletion Enhancer Avg Z score",
             TRUE ~ "No Effect"
-          ),
-          lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared,
-          lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared,
-          lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared,
-          lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared
+          )
         ) %>%
         ungroup()
 
+
       rank_plot_configs <- c(
         generate_rank_plot_configs(zscores_interactions_filtered, "Rank_L", "Avg_Zscore_L", "L"),
         generate_rank_plot_configs(zscores_interactions_filtered, "Rank_K", "Avg_Zscore_K", "K")