Switch to map2() to parse lm coefficients

2024-10-07 06:07:45 -04:00
parent 52381426b2
commit d978407b06
1 changed files with 232 additions and 97 deletions
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -217,22 +217,6 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
    group_vars <- c("OrfRep", "Gene", "Drug")
  }
  # This is a helper function to perform a linear regression and extract coefficients
  perform_lm <- function(data, response, predictor, max_conc) {
    valid_data <- data %>% filter(!is.na(data[[response]]))
    if (nrow(valid_data) > 1) {
      model <- lm(data[[response]] ~ data[[predictor]], data = valid_data)
      list(
        intercept = coef(model)[1],
        slope = coef(model)[2],
        r_squared = summary(model)$r.squared,
        score = max_conc * coef(model)[2] + coef(model)[1]
      )
    } else {
      list(intercept = NA, slope = NA, r_squared = NA, score = NA)
    }
  }
  # Calculate WT statistics from df_bg
  wt_stats <- df_bg %>%
    group_by(across(all_of(bg_group_vars))) %>%
@@ -312,37 +296,82 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
      Zscore_r = Delta_r / WT_sd_r,
      Zscore_AUC = Delta_AUC / WT_sd_AUC
    ) %>%
-    ungroup() %>%  # Ungroup before group_modify
+    ungroup()
  calculations <- calculations %>%
    group_by(across(all_of(group_vars))) %>%
-    group_modify(~ {
+    mutate(
-      lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
+      lm_L = map2(.x = Delta_L, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
-      lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
+        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
-      lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
+      } else {
-      lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
+        lm(.x ~ .y) %>% {
          list(
            intercept = coef(.)[1],
            slope = coef(.)[2],
            r_squared = summary(.)$r.squared,
            score = max_conc * coef(.)[2] + coef(.)[1]
          )
        }
      }),
      lm_K = map2(.x = Delta_K, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
      } else {
        lm(.x ~ .y) %>% {
          list(
            intercept = coef(.)[1],
            slope = coef(.)[2],
            r_squared = summary(.)$r.squared,
            score = max_conc * coef(.)[2] + coef(.)[1]
          )
        }
      }),
      lm_r = map2(.x = Delta_r, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
      } else {
        lm(.x ~ .y) %>% {
          list(
            intercept = coef(.)[1],
            slope = coef(.)[2],
            r_squared = summary(.)$r.squared,
            score = max_conc * coef(.)[2] + coef(.)[1]
          )
        }
      }),
      lm_AUC = map2(.x = Delta_AUC, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
        list(intercept = NA, slope = NA, r_squared = NA, score = NA)
      } else {
        lm(.x ~ .y) %>% {
          list(
            intercept = coef(.)[1],
            slope = coef(.)[2],
            r_squared = summary(.)$r.squared,
            score = max_conc * coef(.)[2] + coef(.)[1]
          )
        }
      }),
-      .x %>%
+      # Extract coefficients and statistics for each model
-        mutate(
+      lm_intercept_L = map_dbl(lm_L, "intercept"),
-          lm_intercept_L = lm_L$intercept,
+      lm_slope_L = map_dbl(lm_L, "slope"),
-          lm_slope_L = lm_L$slope,
+      R_Squared_L = map_dbl(lm_L, "r_squared"),
-          R_Squared_L = lm_L$r_squared,
+      lm_Score_L = map_dbl(lm_L, "score"),
          lm_Score_L = lm_L$score,
-          lm_intercept_K = lm_K$intercept,
+      lm_intercept_K = map_dbl(lm_K, "intercept"),
-          lm_slope_K = lm_K$slope,
+      lm_slope_K = map_dbl(lm_K, "slope"),
-          R_Squared_K = lm_K$r_squared,
+      R_Squared_K = map_dbl(lm_K, "r_squared"),
-          lm_Score_K = lm_K$score,
+      lm_Score_K = map_dbl(lm_K, "score"),
-          lm_intercept_r = lm_r$intercept,
+      lm_intercept_r = map_dbl(lm_r, "intercept"),
-          lm_slope_r = lm_r$slope,
+      lm_slope_r = map_dbl(lm_r, "slope"),
-          R_Squared_r = lm_r$r_squared,
+      R_Squared_r = map_dbl(lm_r, "r_squared"),
-          lm_Score_r = lm_r$score,
+      lm_Score_r = map_dbl(lm_r, "score"),
-          lm_intercept_AUC = lm_AUC$intercept,
+      lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
-          lm_slope_AUC = lm_AUC$slope,
+      lm_slope_AUC = map_dbl(lm_AUC, "slope"),
-          R_Squared_AUC = lm_AUC$r_squared,
+      R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
-          lm_Score_AUC = lm_AUC$score
+      lm_Score_AUC = map_dbl(lm_AUC, "score")
-        )
+) %>%
-    }) %>%
+    select(-lm_L, -lm_K, -lm_r, -lm_AUC) %>%
    ungroup()
  # For interaction plot error bars
@@ -480,77 +509,183 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
          Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
          TRUE ~ "No Effect"
        ),
-        
+        lm_L = map2(.x = Z_lm_L, .y = Avg_Zscore_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
-        # For rank plots
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
-        lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc),
+        } else {
-        lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc),
+          lm(.y ~ .x) %>% {
-        lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc),
+            list(
-        lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc),
+              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        lm_K = map2(.x = Z_lm_K, .y = Avg_Zscore_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        lm_r = map2(.x = Z_lm_r, .y = Avg_Zscore_r, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        lm_AUC = map2(.x = Z_lm_AUC, .y = Avg_Zscore_AUC, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        # For correlation plots
-        Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc),
+        Z_lm_K_L = map2(.x = Z_lm_K, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
-        Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc),
+          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
-        Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc),
+        } else {
-        Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc),
+          lm(.y ~ .x) %>% {
-        Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc),
+            list(
-        Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc),
+              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        Z_lm_r_L = map2(.x = Z_lm_r, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        Z_lm_R_AUC_L = map2(.x = Z_lm_AUC, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        Z_lm_R_r_K = map2(.x = Z_lm_r, .y = Z_lm_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        Z_lm_R_AUC_K = map2(.x = Z_lm_AUC, .y = Z_lm_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        Z_lm_R_AUC_r = map2(.x = Z_lm_AUC, .y = Z_lm_r, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) {
          list(intercept = NA, slope = NA, r_squared = NA, score = NA)
        } else {
          lm(.y ~ .x) %>% {
            list(
              intercept = coef(.)[1],
              slope = coef(.)[2],
              r_squared = summary(.)$r.squared,
              score = max_conc * coef(.)[2] + coef(.)[1]
            )
          }
        }),
        # Extract coefficients and statistics for each model
-        lm_rank_intercept_L = lm_L$intercept,
+        lm_rank_intercept_L = map_dbl(lm_L, "intercept"),
-        lm_rank_slope_L = lm_L$slope,
+        lm_rank_slope_L = map_dbl(lm_L, "slope"),
-        R_Squared_L = lm_L$r_squared,
+        R_Squared_L = map_dbl(lm_L, "r_squared"),
-        lm_Score_L = lm_L$score,
+        lm_Score_L = map_dbl(lm_L, "score"),
-        lm_intercept_K = lm_K$intercept,
+        lm_intercept_K = map_dbl(lm_K, "intercept"),
-        lm_slope_K = lm_K$slope,
+        lm_slope_K = map_dbl(lm_K, "slope"),
-        R_Squared_K = lm_K$r_squared,
+        R_Squared_K = map_dbl(lm_K, "r_squared"),
-        lm_Score_K = lm_K$score,
+        lm_Score_K = map_dbl(lm_K, "score"),
-        lm_intercept_r = lm_r$intercept,
+        lm_intercept_r = map_dbl(lm_r, "intercept"),
-        lm_slope_r = lm_r$slope,
+        lm_slope_r = map_dbl(lm_r, "slope"),
-        R_Squared_r = lm_r$r_squared,
+        R_Squared_r = map_dbl(lm_r, "r_squared"),
-        lm_Score_r = lm_r$score,
+        lm_Score_r = map_dbl(lm_r, "score"),
-        lm_intercept_AUC = lm_AUC$intercept,
+        lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
-        lm_slope_AUC = lm_AUC$slope,
+        lm_slope_AUC = map_dbl(lm_AUC, "slope"),
-        R_Squared_AUC = lm_AUC$r_squared,
+        R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
-        lm_Score_AUC = lm_AUC$score,
+        lm_Score_AUC = map_dbl(lm_AUC, "score"),
-        Z_lm_intercept_K_L = Z_lm_K_L$intercept,
+        Z_lm_intercept_K_L = map_dbl(Z_lm_K_L, "intercept"),
-        Z_lm_slope_K_L = Z_lm_K_L$slope,
+        Z_lm_slope_K_L = map_dbl(Z_lm_K_L, "slope"),
-        Z_lm_R_squared_K_L = Z_lm_K_L$r_squared,
+        Z_lm_R_squared_K_L = map_dbl(Z_lm_K_L, "r_squared"),
-        Z_lm_Score_K_L = Z_lm_K_L$score,
+        Z_lm_Score_K_L = map_dbl(Z_lm_K_L, "score"),
-        Z_lm_intercept_r_L = Z_lm_r_L$intercept,
+        Z_lm_intercept_r_L = map_dbl(Z_lm_r_L, "intercept"),
-        Z_lm_slope_r_L = Z_lm_r_L$slope,
+        Z_lm_slope_r_L = map_dbl(Z_lm_r_L, "slope"),
-        Z_lm_R_squared_r_L = Z_lm_r_L$r_squared,
+        Z_lm_R_squared_r_L = map_dbl(Z_lm_r_L, "r_squared"),
-        Z_lm_Score_r_L = Z_lm_r_L$score,
+        Z_lm_Score_r_L = map_dbl(Z_lm_r_L, "score"),
-        Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept,
+        Z_lm_intercept_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "intercept"),
-        Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope,
+        Z_lm_slope_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "slope"),
-        Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared,
+        Z_lm_R_squared_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "r_squared"),
-        Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score,
+        Z_lm_Score_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "score"),
-        Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept,
+        Z_lm_intercept_R_r_K = map_dbl(Z_lm_R_r_K, "intercept"),
-        Z_lm_slope_R_r_K = Z_lm_R_r_K$slope,
+        Z_lm_slope_R_r_K = map_dbl(Z_lm_R_r_K, "slope"),
-        Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared,
+        Z_lm_R_squared_R_r_K = map_dbl(Z_lm_R_r_K, "r_squared"),
-        Z_lm_Score_R_r_K = Z_lm_R_r_K$score,
+        Z_lm_Score_R_r_K = map_dbl(Z_lm_R_r_K, "score"),
-        Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept,
+        Z_lm_intercept_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "intercept"),
-        Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope,
+        Z_lm_slope_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "slope"),
-        Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared,
+        Z_lm_R_squared_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "r_squared"),
-        Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score,
+        Z_lm_Score_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "score"),
-        Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept,
+        Z_lm_intercept_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "intercept"),
-        Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope,
+        Z_lm_slope_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "slope"),
-        Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared,
+        Z_lm_R_squared_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "r_squared"),
-        Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score
+        Z_lm_Score_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "score")
      ) %>%
      # Drop the lm objects and keep only the lm results
      select(
        -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L,
-        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r
+        -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r)
      )
  } # end deletion-specific block
  # Create the final calculations and interactions dataframes with only required columns for csv output