Browse Source

Correctly group linear modeling

Bryan Roessler 7 tháng trước cách đây
mục cha
commit
fdcd97d7f1
1 tập tin đã thay đổi với 30 bổ sung21 xóa
  1. 30 21
      qhtcp-workflow/apps/r/calculate_interaction_zscores.R

+ 30 - 21
qhtcp-workflow/apps/r/calculate_interaction_zscores.R

@@ -736,7 +736,7 @@ generate_interaction_plot_configs <- function(df, variables) {
 }
 
 generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) {
-
+  
   sd_bands <- c(1, 2, 3)
   
   configs <- list()
@@ -824,23 +824,32 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) {
         title_suffix <- paste("Rank Avg Zscore vs lm", variable)
         rectangles <- NULL
       }
-
-      # Check if there is sufficient variation
-      if (length(unique(df_filtered[[x_var]])) < 2 || length(unique(df_filtered[[y_var]])) < 2) {
-        print(df_filtered %>% select(all_of(c("scan", "Plate", "Row", "Col", "num", "conc_num", x_var, y_var))))
-        message("Not enough variation in ", x_var, " or ", y_var, " to fit linear model. Skipping.")
-        next
-      }
       
-      # Fit linear model
-      lm_fit <- lm(df_filtered[[y_var]] ~ df_filtered[[x_var]], data = df_filtered)
-      
-      # # Check for perfect fit
-      # if (summary(lm_fit)$sigma == 0) {
-      #   next  # Skip this iteration if the fit is perfect
-      # }
+      # Fit linear model grouped by OrfRep, Gene, num
+      lm_results <- df_filtered %>%
+        group_by(OrfRep, Gene, num) %>%
+        do({
+          model <- try(lm(as.formula(paste(y_var, "~", x_var)), data = .), silent = TRUE)
+          if (inherits(model, "try-error")) {
+            # Return NA if model fails
+            data.frame(intercept = NA, slope = NA, r_squared = NA)
+          } else {
+            summary_model <- summary(model)
+            data.frame(
+              intercept = coef(model)[1],
+              slope = coef(model)[2],
+              r_squared = summary_model$r.squared
+            )
+          }
+        }) %>%
+        ungroup()
       
-      r_squared <- summary(lm_fit)$r.squared
+      aggregated_lm <- lm_results %>%
+        summarize(
+          intercept = mean(intercept, na.rm = TRUE),
+          slope = mean(slope, na.rm = TRUE),
+          r_squared = mean(r_squared, na.rm = TRUE)
+        )
       
       configs[[length(configs) + 1]] <- list(
         df = df_filtered,
@@ -852,14 +861,14 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) {
           list(
             x = 0,
             y = 0,
-            label = paste("R-squared =", round(r_squared, 2))
+            label = paste("R-squared =", round(aggregated_lm$r_squared, 2))
           )
         ),
-        sd_band_values = NULL,  # Not applicable
+        sd_band_values = NULL, # Not applicable
         shape = 3,
         size = 0.1,
         add_smooth = TRUE,
-        lm_line = list(intercept = coef(lm_fit)[1], slope = coef(lm_fit)[2]),
+        lm_line = list(intercept = aggregated_lm$intercept, slope = aggregated_lm$slope),
         legend_position = "right",
         color_var = "Overlap",
         x_label = x_var,
@@ -1273,14 +1282,14 @@ main <- function() {
         ungroup()
 
       message("Calculating reference strain interaction scores")
-      reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
+      reference_results <- calculate_interaction_scores(reference_strain, max_conc, group_vars = c("OrfRep", "Gene", "num"))
       zscores_calculations_reference <- reference_results$calculations
       zscores_interactions_reference <- reference_results$interactions
       zscores_calculations_reference_joined <- reference_results$calculations_joined
       zscores_interactions_reference_joined <- reference_results$interactions_joined
 
       message("Calculating deletion strain(s) interactions scores")
-      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num"))
+      deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, group_vars = c("OrfRep", "Gene", "num"))
       zscores_calculations <- deletion_results$calculations
       zscores_interactions <- deletion_results$interactions
       zscores_calculations_joined <- deletion_results$calculations_joined