diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 6e878831..9318a042 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -217,22 +217,6 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) group_vars <- c("OrfRep", "Gene", "Drug") } - # This is a helper function to perform a linear regression and extract coefficients - perform_lm <- function(data, response, predictor, max_conc) { - valid_data <- data %>% filter(!is.na(data[[response]])) - if (nrow(valid_data) > 1) { - model <- lm(data[[response]] ~ data[[predictor]], data = valid_data) - list( - intercept = coef(model)[1], - slope = coef(model)[2], - r_squared = summary(model)$r.squared, - score = max_conc * coef(model)[2] + coef(model)[1] - ) - } else { - list(intercept = NA, slope = NA, r_squared = NA, score = NA) - } - } - # Calculate WT statistics from df_bg wt_stats <- df_bg %>% group_by(across(all_of(bg_group_vars))) %>% @@ -312,37 +296,82 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) Zscore_r = Delta_r / WT_sd_r, Zscore_AUC = Delta_AUC / WT_sd_AUC ) %>% - ungroup() %>% # Ungroup before group_modify + ungroup() + + calculations <- calculations %>% group_by(across(all_of(group_vars))) %>% - group_modify(~ { - lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc) - lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc) - lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc) - lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc) + mutate( + lm_L = map2(.x = Delta_L, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.x ~ .y) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + lm_K = map2(.x = Delta_K, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.x ~ .y) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + lm_r = map2(.x = Delta_r, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.x ~ .y) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + lm_AUC = map2(.x = Delta_AUC, .y = conc_num_factor, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.x ~ .y) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), - .x %>% - mutate( - lm_intercept_L = lm_L$intercept, - lm_slope_L = lm_L$slope, - R_Squared_L = lm_L$r_squared, - lm_Score_L = lm_L$score, + # Extract coefficients and statistics for each model + lm_intercept_L = map_dbl(lm_L, "intercept"), + lm_slope_L = map_dbl(lm_L, "slope"), + R_Squared_L = map_dbl(lm_L, "r_squared"), + lm_Score_L = map_dbl(lm_L, "score"), - lm_intercept_K = lm_K$intercept, - lm_slope_K = lm_K$slope, - R_Squared_K = lm_K$r_squared, - lm_Score_K = lm_K$score, + lm_intercept_K = map_dbl(lm_K, "intercept"), + lm_slope_K = map_dbl(lm_K, "slope"), + R_Squared_K = map_dbl(lm_K, "r_squared"), + lm_Score_K = map_dbl(lm_K, "score"), - lm_intercept_r = lm_r$intercept, - lm_slope_r = lm_r$slope, - R_Squared_r = lm_r$r_squared, - lm_Score_r = lm_r$score, + lm_intercept_r = map_dbl(lm_r, "intercept"), + lm_slope_r = map_dbl(lm_r, "slope"), + R_Squared_r = map_dbl(lm_r, "r_squared"), + lm_Score_r = map_dbl(lm_r, "score"), - lm_intercept_AUC = lm_AUC$intercept, - lm_slope_AUC = lm_AUC$slope, - R_Squared_AUC = lm_AUC$r_squared, - lm_Score_AUC = lm_AUC$score - ) - }) %>% + lm_intercept_AUC = map_dbl(lm_AUC, "intercept"), + lm_slope_AUC = map_dbl(lm_AUC, "slope"), + R_Squared_AUC = map_dbl(lm_AUC, "r_squared"), + lm_Score_AUC = map_dbl(lm_AUC, "score") +) %>% + select(-lm_L, -lm_K, -lm_r, -lm_AUC) %>% ungroup() # For interaction plot error bars @@ -480,77 +509,183 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", TRUE ~ "No Effect" ), - - # For rank plots - lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc), - lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc), - lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc), - lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc), + lm_L = map2(.x = Z_lm_L, .y = Avg_Zscore_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + lm_K = map2(.x = Z_lm_K, .y = Avg_Zscore_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + lm_r = map2(.x = Z_lm_r, .y = Avg_Zscore_r, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + lm_AUC = map2(.x = Z_lm_AUC, .y = Avg_Zscore_AUC, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), # For correlation plots - Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc), - Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc), - Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc), - Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc), - Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc), - Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc), + Z_lm_K_L = map2(.x = Z_lm_K, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + Z_lm_r_L = map2(.x = Z_lm_r, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + Z_lm_R_AUC_L = map2(.x = Z_lm_AUC, .y = Z_lm_L, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + Z_lm_R_r_K = map2(.x = Z_lm_r, .y = Z_lm_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + Z_lm_R_AUC_K = map2(.x = Z_lm_AUC, .y = Z_lm_K, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), + Z_lm_R_AUC_r = map2(.x = Z_lm_AUC, .y = Z_lm_r, ~ if (all(is.na(.x)) || all(is.na(.y)) || length(.x[!is.na(.x)]) == 0) { + list(intercept = NA, slope = NA, r_squared = NA, score = NA) + } else { + lm(.y ~ .x) %>% { + list( + intercept = coef(.)[1], + slope = coef(.)[2], + r_squared = summary(.)$r.squared, + score = max_conc * coef(.)[2] + coef(.)[1] + ) + } + }), # Extract coefficients and statistics for each model - lm_rank_intercept_L = lm_L$intercept, - lm_rank_slope_L = lm_L$slope, - R_Squared_L = lm_L$r_squared, - lm_Score_L = lm_L$score, + lm_rank_intercept_L = map_dbl(lm_L, "intercept"), + lm_rank_slope_L = map_dbl(lm_L, "slope"), + R_Squared_L = map_dbl(lm_L, "r_squared"), + lm_Score_L = map_dbl(lm_L, "score"), - lm_intercept_K = lm_K$intercept, - lm_slope_K = lm_K$slope, - R_Squared_K = lm_K$r_squared, - lm_Score_K = lm_K$score, + lm_intercept_K = map_dbl(lm_K, "intercept"), + lm_slope_K = map_dbl(lm_K, "slope"), + R_Squared_K = map_dbl(lm_K, "r_squared"), + lm_Score_K = map_dbl(lm_K, "score"), - lm_intercept_r = lm_r$intercept, - lm_slope_r = lm_r$slope, - R_Squared_r = lm_r$r_squared, - lm_Score_r = lm_r$score, + lm_intercept_r = map_dbl(lm_r, "intercept"), + lm_slope_r = map_dbl(lm_r, "slope"), + R_Squared_r = map_dbl(lm_r, "r_squared"), + lm_Score_r = map_dbl(lm_r, "score"), - lm_intercept_AUC = lm_AUC$intercept, - lm_slope_AUC = lm_AUC$slope, - R_Squared_AUC = lm_AUC$r_squared, - lm_Score_AUC = lm_AUC$score, + lm_intercept_AUC = map_dbl(lm_AUC, "intercept"), + lm_slope_AUC = map_dbl(lm_AUC, "slope"), + R_Squared_AUC = map_dbl(lm_AUC, "r_squared"), + lm_Score_AUC = map_dbl(lm_AUC, "score"), - Z_lm_intercept_K_L = Z_lm_K_L$intercept, - Z_lm_slope_K_L = Z_lm_K_L$slope, - Z_lm_R_squared_K_L = Z_lm_K_L$r_squared, - Z_lm_Score_K_L = Z_lm_K_L$score, + Z_lm_intercept_K_L = map_dbl(Z_lm_K_L, "intercept"), + Z_lm_slope_K_L = map_dbl(Z_lm_K_L, "slope"), + Z_lm_R_squared_K_L = map_dbl(Z_lm_K_L, "r_squared"), + Z_lm_Score_K_L = map_dbl(Z_lm_K_L, "score"), - Z_lm_intercept_r_L = Z_lm_r_L$intercept, - Z_lm_slope_r_L = Z_lm_r_L$slope, - Z_lm_R_squared_r_L = Z_lm_r_L$r_squared, - Z_lm_Score_r_L = Z_lm_r_L$score, + Z_lm_intercept_r_L = map_dbl(Z_lm_r_L, "intercept"), + Z_lm_slope_r_L = map_dbl(Z_lm_r_L, "slope"), + Z_lm_R_squared_r_L = map_dbl(Z_lm_r_L, "r_squared"), + Z_lm_Score_r_L = map_dbl(Z_lm_r_L, "score"), - Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept, - Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope, - Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared, - Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score, + Z_lm_intercept_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "intercept"), + Z_lm_slope_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "slope"), + Z_lm_R_squared_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "r_squared"), + Z_lm_Score_R_AUC_L = map_dbl(Z_lm_R_AUC_L, "score"), - Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept, - Z_lm_slope_R_r_K = Z_lm_R_r_K$slope, - Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared, - Z_lm_Score_R_r_K = Z_lm_R_r_K$score, + Z_lm_intercept_R_r_K = map_dbl(Z_lm_R_r_K, "intercept"), + Z_lm_slope_R_r_K = map_dbl(Z_lm_R_r_K, "slope"), + Z_lm_R_squared_R_r_K = map_dbl(Z_lm_R_r_K, "r_squared"), + Z_lm_Score_R_r_K = map_dbl(Z_lm_R_r_K, "score"), - Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept, - Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope, - Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared, - Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score, + Z_lm_intercept_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "intercept"), + Z_lm_slope_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "slope"), + Z_lm_R_squared_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "r_squared"), + Z_lm_Score_R_AUC_K = map_dbl(Z_lm_R_AUC_K, "score"), - Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept, - Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope, - Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared, - Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score + Z_lm_intercept_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "intercept"), + Z_lm_slope_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "slope"), + Z_lm_R_squared_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "r_squared"), + Z_lm_Score_R_AUC_r = map_dbl(Z_lm_R_AUC_r, "score") ) %>% - # Drop the lm objects and keep only the lm results select( -lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L, - -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r - ) + -Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r) } # end deletion-specific block # Create the final calculations and interactions dataframes with only required columns for csv output