From fdcd97d7f1a49ca07563941d62457a15af5914db Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Tue, 17 Sep 2024 23:25:58 -0400 Subject: [PATCH] Correctly group linear modeling --- .../apps/r/calculate_interaction_zscores.R | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 3039e5e2..86218d91 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -736,7 +736,7 @@ generate_interaction_plot_configs <- function(df, variables) { } generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { - + sd_bands <- c(1, 2, 3) configs <- list() @@ -824,23 +824,32 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { title_suffix <- paste("Rank Avg Zscore vs lm", variable) rectangles <- NULL } - - # Check if there is sufficient variation - if (length(unique(df_filtered[[x_var]])) < 2 || length(unique(df_filtered[[y_var]])) < 2) { - print(df_filtered %>% select(all_of(c("scan", "Plate", "Row", "Col", "num", "conc_num", x_var, y_var)))) - message("Not enough variation in ", x_var, " or ", y_var, " to fit linear model. Skipping.") - next - } - # Fit linear model - lm_fit <- lm(df_filtered[[y_var]] ~ df_filtered[[x_var]], data = df_filtered) + # Fit linear model grouped by OrfRep, Gene, num + lm_results <- df_filtered %>% + group_by(OrfRep, Gene, num) %>% + do({ + model <- try(lm(as.formula(paste(y_var, "~", x_var)), data = .), silent = TRUE) + if (inherits(model, "try-error")) { + # Return NA if model fails + data.frame(intercept = NA, slope = NA, r_squared = NA) + } else { + summary_model <- summary(model) + data.frame( + intercept = coef(model)[1], + slope = coef(model)[2], + r_squared = summary_model$r.squared + ) + } + }) %>% + ungroup() - # # Check for perfect fit - # if (summary(lm_fit)$sigma == 0) { - # next # Skip this iteration if the fit is perfect - # } - - r_squared <- summary(lm_fit)$r.squared + aggregated_lm <- lm_results %>% + summarize( + intercept = mean(intercept, na.rm = TRUE), + slope = mean(slope, na.rm = TRUE), + r_squared = mean(r_squared, na.rm = TRUE) + ) configs[[length(configs) + 1]] <- list( df = df_filtered, @@ -852,14 +861,14 @@ generate_rank_plot_configs <- function(df_filtered, variables, is_lm = FALSE) { list( x = 0, y = 0, - label = paste("R-squared =", round(r_squared, 2)) + label = paste("R-squared =", round(aggregated_lm$r_squared, 2)) ) ), - sd_band_values = NULL, # Not applicable + sd_band_values = NULL, # Not applicable shape = 3, size = 0.1, add_smooth = TRUE, - lm_line = list(intercept = coef(lm_fit)[1], slope = coef(lm_fit)[2]), + lm_line = list(intercept = aggregated_lm$intercept, slope = aggregated_lm$slope), legend_position = "right", color_var = "Overlap", x_label = x_var, @@ -1273,14 +1282,14 @@ main <- function() { ungroup() message("Calculating reference strain interaction scores") - reference_results <- calculate_interaction_scores(reference_strain, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num")) + reference_results <- calculate_interaction_scores(reference_strain, max_conc, group_vars = c("OrfRep", "Gene", "num")) zscores_calculations_reference <- reference_results$calculations zscores_interactions_reference <- reference_results$interactions zscores_calculations_reference_joined <- reference_results$calculations_joined zscores_interactions_reference_joined <- reference_results$interactions_joined message("Calculating deletion strain(s) interactions scores") - deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, interaction_vars, group_vars = c("OrfRep", "Gene", "num")) + deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, group_vars = c("OrfRep", "Gene", "num")) zscores_calculations <- deletion_results$calculations zscores_interactions <- deletion_results$interactions zscores_calculations_joined <- deletion_results$calculations_joined