Przeglądaj źródła

Switch to map2() to parse lm coefficients

Bryan Roessler 6 miesięcy temu
rodzic
commit
d978407b06
1 zmienionych plików z 245 dodań i 110 usunięć
  1. 245 110
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 245 - 110
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -217,22 +217,6 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
     group_vars <- c("OrfRep", "Gene", "Drug")
   }
 
-  # This is a helper function to perform a linear regression and extract coefficients
-  perform_lm <- function(data, response, predictor, max_conc) {
-    valid_data <- data %>% filter(!is.na(data[[response]]))
-    if (nrow(valid_data) > 1) {
-      model <- lm(data[[response]] ~ data[[predictor]], data = valid_data)
-      list(
-        intercept = coef(model)[1],
-        slope = coef(model)[2],
-        r_squared = summary(model)$r.squared,
-        score = max_conc * coef(model)[2] + coef(model)[1]
-      )
-    } else {
-      list(intercept = NA, slope = NA, r_squared = NA, score = NA)
-    }
-  }
-
   # Calculate WT statistics from df_bg
   wt_stats <- df_bg %>%
     group_by(across(all_of(bg_group_vars))) %>%
@@ -312,37 +296,82 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
       Zscore_r = Delta_r / WT_sd_r,
       Zscore_AUC = Delta_AUC / WT_sd_AUC
     ) %>%
-    ungroup() %>%  # Ungroup before group_modify
+    ungroup()
+    
+  calculations <- calculations %>%
     group_by(across(all_of(group_vars))) %>%
-    group_modify(~ {
-      lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
-      lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
-      lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
-      lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
-
-      .x %>%
-        mutate(
-          lm_intercept_L = lm_L$intercept,
-          lm_slope_L = lm_L$slope,
-          R_Squared_L = lm_L$r_squared,
-          lm_Score_L = lm_L$score,
-
-          lm_intercept_K = lm_K$intercept,
-          lm_slope_K = lm_K$slope,
-          R_Squared_K = lm_K$r_squared,
-          lm_Score_K = lm_K$score,
-
-          lm_intercept_r = lm_r$intercept,
-          lm_slope_r = lm_r$slope,
-          R_Squared_r = lm_r$r_squared,
-          lm_Score_r = lm_r$score,
-
-          lm_intercept_AUC = lm_AUC$intercept,
-          lm_slope_AUC = lm_AUC$slope,
-          R_Squared_AUC = lm_AUC$r_squared,
-          lm_Score_AUC = lm_AUC$score
-        )
-    }) %>%
+    mutate(
+      lm_L = map2(.x = Delta_L, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+      } else {
+        lm(.x ~ .y) %>% {
+          list(
+            intercept = coef(.)[1],
+            slope = coef(.)[2],
+            r_squared = summary(.)$r.squared,
+            score = max_conc * coef(.)[2] + coef(.)[1]
+          )
+        }
+      }),
+      lm_K = map2(.x = Delta_K, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+      } else {
+        lm(.x ~ .y) %>% {
+          list(
+            intercept = coef(.)[1],
+            slope = coef(.)[2],
+            r_squared = summary(.)$r.squared,
+            score = max_conc * coef(.)[2] + coef(.)[1]
+          )
+        }
+      }),
+      lm_r = map2(.x = Delta_r, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+      } else {
+        lm(.x ~ .y) %>% {
+          list(
+            intercept = coef(.)[1],
+            slope = coef(.)[2],
+            r_squared = summary(.)$r.squared,
+            score = max_conc * coef(.)[2] + coef(.)[1]
+          )
+        }
+      }),
+      lm_AUC = map2(.x = Delta_AUC, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+      } else {
+        lm(.x ~ .y) %>% {
+          list(
+            intercept = coef(.)[1],
+            slope = coef(.)[2],
+            r_squared = summary(.)$r.squared,
+            score = max_conc * coef(.)[2] + coef(.)[1]
+          )
+        }
+      }),
+
+      # Extract coefficients and statistics for each model
+      lm_intercept_L = map_dbl(lm_L, "intercept"),
+      lm_slope_L = map_dbl(lm_L, "slope"),
+      R_Squared_L = map_dbl(lm_L, "r_squared"),
+      lm_Score_L = map_dbl(lm_L, "score"),
+
+      lm_intercept_K = map_dbl(lm_K, "intercept"),
+      lm_slope_K = map_dbl(lm_K, "slope"),
+      R_Squared_K = map_dbl(lm_K, "r_squared"),
+      lm_Score_K = map_dbl(lm_K, "score"),
+
+      lm_intercept_r = map_dbl(lm_r, "intercept"),
+      lm_slope_r = map_dbl(lm_r, "slope"),
+      R_Squared_r = map_dbl(lm_r, "r_squared"),
+      lm_Score_r = map_dbl(lm_r, "score"),
+
+      lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
+      lm_slope_AUC = map_dbl(lm_AUC, "slope"),
+      R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
+      lm_Score_AUC = map_dbl(lm_AUC, "score")
+) %>%
+    select(-lm_L, -lm_K, -lm_r, -lm_AUC) %>%
     ungroup()
 
   # For interaction plot error bars
@@ -480,77 +509,183 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
           Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
           TRUE ~ "No Effect"
         ),
-        
-        # For rank plots
-        lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc),
-        lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc),
-        lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc),
-        lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc),
+        lm_L = map2(.x = Z_lm_L, .y = Avg_Zscore_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        lm_K = map2(.x = Z_lm_K, .y = Avg_Zscore_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        lm_r = map2(.x = Z_lm_r, .y = Avg_Zscore_r, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        lm_AUC = map2(.x = Z_lm_AUC, .y = Avg_Zscore_AUC, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
 
         # For correlation plots
-        Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc),
-        Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc),
-        Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc),
-        Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc),
-        Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc),
-        Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc),
+        Z_lm_K_L = map2(.x = Z_lm_K, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        Z_lm_r_L = map2(.x = Z_lm_r, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        Z_lm_R_AUC_L = map2(.x = Z_lm_AUC, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        Z_lm_R_r_K = map2(.x = Z_lm_r, .y = Z_lm_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        Z_lm_R_AUC_K = map2(.x = Z_lm_AUC, .y = Z_lm_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
+        Z_lm_R_AUC_r = map2(.x = Z_lm_AUC, .y = Z_lm_r, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
+        } else {
+          lm(.y ~ .x) %>% {
+            list(
+              intercept = coef(.)[1],
+              slope = coef(.)[2],
+              r_squared = summary(.)$r.squared,
+              score = max_conc * coef(.)[2] + coef(.)[1]
+            )
+          }
+        }),
 
         # Extract coefficients and statistics for each model
-        lm_rank_intercept_L = lm_L$intercept,
-        lm_rank_slope_L = lm_L$slope,
-        R_Squared_L = lm_L$r_squared,
-        lm_Score_L = lm_L$score,
-
-        lm_intercept_K = lm_K$intercept,
-        lm_slope_K = lm_K$slope,
-        R_Squared_K = lm_K$r_squared,
-        lm_Score_K = lm_K$score,
-
-        lm_intercept_r = lm_r$intercept,
-        lm_slope_r = lm_r$slope,
-        R_Squared_r = lm_r$r_squared,
-        lm_Score_r = lm_r$score,
-
-        lm_intercept_AUC = lm_AUC$intercept,
-        lm_slope_AUC = lm_AUC$slope,
-        R_Squared_AUC = lm_AUC$r_squared,
-        lm_Score_AUC = lm_AUC$score,
-
-        Z_lm_intercept_K_L = Z_lm_K_L$intercept,
-        Z_lm_slope_K_L = Z_lm_K_L$slope,
-        Z_lm_R_squared_K_L = Z_lm_K_L$r_squared,
-        Z_lm_Score_K_L = Z_lm_K_L$score,
-
-        Z_lm_intercept_r_L = Z_lm_r_L$intercept,
-        Z_lm_slope_r_L = Z_lm_r_L$slope,
-        Z_lm_R_squared_r_L = Z_lm_r_L$r_squared,
-        Z_lm_Score_r_L = Z_lm_r_L$score,
-
-        Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept,
-        Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope,
-        Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared,
-        Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score,
-
-        Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept,
-        Z_lm_slope_R_r_K = Z_lm_R_r_K$slope,
-        Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared,
-        Z_lm_Score_R_r_K = Z_lm_R_r_K$score,
-
-        Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept,
-        Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope,
-        Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared,
-        Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score,
-
-        Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept,
-        Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope,
-        Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared,
-        Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score
+        lm_rank_intercept_L = map_dbl(lm_L, "intercept"),
+        lm_rank_slope_L = map_dbl(lm_L, "slope"),
+        R_Squared_L = map_dbl(lm_L, "r_squared"),
+        lm_Score_L = map_dbl(lm_L, "score"),
+
+        lm_intercept_K = map_dbl(lm_K, "intercept"),
+        lm_slope_K = map_dbl(lm_K, "slope"),
+        R_Squared_K = map_dbl(lm_K, "r_squared"),
+        lm_Score_K = map_dbl(lm_K, "score"),
+
+        lm_intercept_r = map_dbl(lm_r, "intercept"),
+        lm_slope_r = map_dbl(lm_r, "slope"),
+        R_Squared_r = map_dbl(lm_r, "r_squared"),
+        lm_Score_r = map_dbl(lm_r, "score"),
+
+        lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
+        lm_slope_AUC = map_dbl(lm_AUC, "slope"),
+        R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
+        lm_Score_AUC = map_dbl(lm_AUC, "score"),
+
+        Z_lm_intercept_K_L = map_dbl(Z_lm_K_L, "intercept"),
+        Z_lm_slope_K_L = map_dbl(Z_lm_K_L, "slope"),
+        Z_lm_R_squared_K_L = map_dbl(Z_lm_K_L, "r_squared"),
+        Z_lm_Score_K_L = map_dbl(Z_lm_K_L, "score"),
+
+        Z_lm_intercept_r_L = map_dbl(Z_lm_r_L, "intercept"),
+        Z_lm_slope_r_L = map_dbl(Z_lm_r_L, "slope"),
+        Z_lm_R_squared_r_L = map_dbl(Z_lm_r_L, "r_squared"),
+        Z_lm_Score_r_L = map_dbl(Z_lm_r_L, "score"),
+
+        Z_lm_intercept_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "intercept"),
+        Z_lm_slope_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "slope"),
+        Z_lm_R_squared_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "r_squared"),
+        Z_lm_Score_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "score"),
+
+        Z_lm_intercept_R_r_K = map_dbl(Z_lm_R_r_K, "intercept"),
+        Z_lm_slope_R_r_K = map_dbl(Z_lm_R_r_K, "slope"),
+        Z_lm_R_squared_R_r_K = map_dbl(Z_lm_R_r_K, "r_squared"),
+        Z_lm_Score_R_r_K = map_dbl(Z_lm_R_r_K, "score"),
+
+        Z_lm_intercept_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "intercept"),
+        Z_lm_slope_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "slope"),
+        Z_lm_R_squared_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "r_squared"),
+        Z_lm_Score_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "score"),
+
+        Z_lm_intercept_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "intercept"),
+        Z_lm_slope_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "slope"),
+        Z_lm_R_squared_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "r_squared"),
+        Z_lm_Score_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "score")
       ) %>%
-      # Drop the lm objects and keep only the lm results
       select(
         -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L,
-        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r
-      )
+        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r)
   } # end deletion-specific block
 
   # Create the final calculations and interactions dataframes with only required columns for csv output