Add correlation lms to calculate_interaction_scores()

This commit is contained in:
2024-10-07 01:20:36 -04:00
parent 642c74fa62
commit 2073517192

View File

@@ -217,6 +217,22 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
group_vars <- c("OrfRep", "Gene", "Drug") group_vars <- c("OrfRep", "Gene", "Drug")
} }
# This is a helper function to perform a linear regression and extract coefficients
perform_lm <- function(data, response, predictor, max_conc) {
valid_data <- data %>% filter(!is.na(.data[[response]]))
if (nrow(valid_data) > 1) {
model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
list(
intercept = coef(model)[1],
slope = coef(model)[2],
r_squared = summary(model)$r.squared,
score = max_conc * coef(model)[2] + coef(model)[1]
)
} else {
list(intercept = NA, slope = NA, r_squared = NA, score = NA)
}
}
# Calculate WT statistics from df_bg # Calculate WT statistics from df_bg
wt_stats <- df_bg %>% wt_stats <- df_bg %>%
group_by(across(all_of(bg_group_vars))) %>% group_by(across(all_of(bg_group_vars))) %>%
@@ -300,41 +316,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
ungroup() %>% # Ungroup before group_modify ungroup() %>% # Ungroup before group_modify
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
group_modify(~ { group_modify(~ {
lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
# Filter each column for valid data or else linear modeling will fail
valid_data_L <- .x %>% filter(!is.na(Delta_L))
valid_data_K <- .x %>% filter(!is.na(Delta_K))
valid_data_r <- .x %>% filter(!is.na(Delta_r))
valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC))
# Perform linear modeling
lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL
lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL
lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL
lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL
# Extract coefficients for calculations and plotting
.x %>% .x %>%
mutate( mutate(
lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA, lm_intercept_L = lm_L$intercept,
lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA, lm_slope_L = lm_L$slope,
R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA, R_Squared_L = lm_L$r_squared,
lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA, lm_Score_L = lm_L$score,
lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA, lm_intercept_K = lm_K$intercept,
lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA, lm_slope_K = lm_K$slope,
R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA, R_Squared_K = lm_K$r_squared,
lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA, lm_Score_K = lm_K$score,
lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA, lm_intercept_r = lm_r$intercept,
lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA, lm_slope_r = lm_r$slope,
R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA, R_Squared_r = lm_r$r_squared,
lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA, lm_Score_r = lm_r$score,
lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA, lm_intercept_AUC = lm_AUC$intercept,
lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA, lm_slope_AUC = lm_AUC$slope,
R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA, R_Squared_AUC = lm_AUC$r_squared,
lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA lm_Score_AUC = lm_AUC$score
) )
}) %>% }) %>%
ungroup() ungroup()
@@ -404,55 +411,116 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_Shift_r = first(Z_Shift_r), Z_Shift_r = first(Z_Shift_r),
Z_Shift_AUC = first(Z_Shift_AUC), Z_Shift_AUC = first(Z_Shift_AUC),
# R Squared values
R_Squared_L = first(R_Squared_L),
R_Squared_K = first(R_Squared_K),
R_Squared_r = first(R_Squared_r),
R_Squared_AUC = first(R_Squared_AUC),
# lm intercepts
lm_intercept_L = first(lm_intercept_L),
lm_intercept_K = first(lm_intercept_K),
lm_intercept_r = first(lm_intercept_r),
lm_intercept_AUC = first(lm_intercept_AUC),
# lm slopes
lm_slope_L = first(lm_slope_L),
lm_slope_K = first(lm_slope_K),
lm_slope_r = first(lm_slope_r),
lm_slope_AUC = first(lm_slope_AUC),
# NG, DB, SM values # NG, DB, SM values
NG = first(NG), NG = first(NG),
DB = first(DB), DB = first(DB),
SM = first(SM), SM = first(SM),
.groups = "drop" .groups = "drop"
) ) %>%
arrange(desc(Z_lm_L), desc(NG))
# Add overlap threshold categories based on Z-lm and Avg-Z scores # Deletion data ranking and linear modeling
interactions <- interactions %>% if (type == "deletion") {
filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% interactions <- interactions %>%
mutate( mutate(
Overlap = case_when( Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both", Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both", Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only", Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only", Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only", Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only", Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore", Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC)
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", ) %>%
TRUE ~ "No Effect" mutate(
), Rank_L = rank(Avg_Zscore_L_adjusted),
Rank_K = rank(Avg_Zscore_K_adjusted),
Rank_r = rank(Avg_Zscore_r_adjusted),
Rank_AUC = rank(Avg_Zscore_AUC_adjusted),
Rank_lm_L = rank(Z_lm_L_adjusted),
Rank_lm_K = rank(Z_lm_K_adjusted),
Rank_lm_r = rank(Z_lm_r_adjusted),
Rank_lm_AUC = rank(Z_lm_AUC_adjusted)
) %>%
mutate(
lm_R_squared_rank_L = summary(lm(Rank_lm_L ~ Rank_L, data = .))$r.squared,
lm_R_squared_rank_K = summary(lm(Rank_lm_K ~ Rank_K, data = .))$r.squared,
lm_R_squared_rank_r = summary(lm(Rank_lm_r ~ Rank_r, data = .))$r.squared,
lm_R_squared_rank_AUC = summary(lm(Rank_lm_AUC ~ Rank_AUC, data = .))$r.squared
)
# For correlation plots # Add overlap threshold categories based on Z-lm and Avg-Z scores
lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared, interactions <- interactions %>%
lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared, filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared, mutate(
lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared Overlap = case_when(
) Z_lm_L >= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Both",
Z_lm_L <= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Both",
Z_lm_L >= overlap_threshold & Avg_Zscore_L <= overlap_threshold ~ "Deletion Enhancer lm only",
Z_lm_L <= overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Enhancer Avg Zscore only",
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= -overlap_threshold ~ "Deletion Suppressor lm only",
Z_lm_L >= -overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Suppressor Avg Zscore only",
Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
TRUE ~ "No Effect"
)) %>%
group_modify(~ {
# Creating the final calculations and interactions dataframes with only required columns for csv output # For rank plots
lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
# For correlation plots
Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
.x %>%
mutate(
# For rank plots
lm_R_squared_L = lm_L$r.squared,
lm_R_squared_K = lm_K$r.squared,
lm_R_squared_r = lm_r$r.squared,
lm_R_squared_AUC = lm_AUC$r.squared,
# For correlation plots
Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
Z_lm_intercept_K_L = Z_lm_K_L$intercept,
Z_lm_intercept_r_L = Z_lm_r_L$intercept,
Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
Z_lm_slope_K_L = Z_lm_K_L$slope,
Z_lm_slope_r_L = Z_lm_r_L$slope,
Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
Z_lm_slope_r_K = Z_lm_R_r_K$slope,
Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
Z_lm_Score_K_L = Z_lm_K_L$score,
Z_lm_Score_r_L = Z_lm_r_L$score,
Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
Z_lm_Score_r_K = Z_lm_R_r_K$score,
Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
)
})
} # end deletion-specific block
# Create the final calculations and interactions dataframes with only required columns for csv output
df_calculations <- calculations %>% df_calculations <- calculations %>%
select( select(
all_of(group_vars), all_of(group_vars),
@@ -484,7 +552,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
) )
# Join calculations and interactions to avoid dimension mismatch # Avoid column collision on left join for overlapping variables
calculations_no_overlap <- calculations %>% calculations_no_overlap <- calculations %>%
select(-any_of(c("DB", "NG", "SM", select(-any_of(c("DB", "NG", "SM",
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
@@ -1091,22 +1159,12 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
)) ))
} }
generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, filter_na = FALSE, overlap_color = FALSE) { generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, overlap_color = FALSE) {
sd_bands <- c(1, 2, 3) sd_bands <- c(1, 2, 3)
plot_configs <- list() plot_configs <- list()
variables <- c("L", "K") variables <- c("L", "K")
# Adjust (if necessary) and rank columns
for (variable in variables) {
if (adjust) {
df[[paste0("Avg_Zscore_", variable)]] <- ifelse(is.na(df[[paste0("Avg_Zscore_", variable)]]), 0.001, df[[paste0("Avg_Zscore_", variable)]])
df[[paste0("Z_lm_", variable)]] <- ifelse(is.na(df[[paste0("Z_lm_", variable)]]), 0.001, df[[paste0("Z_lm_", variable)]])
}
df[[paste0("Rank_", variable)]] <- rank(df[[paste0("Avg_Zscore_", variable)]], na.last = "keep")
df[[paste0("Rank_lm_", variable)]] <- rank(df[[paste0("Z_lm_", variable)]], na.last = "keep")
}
# Helper function to create a rank plot configuration # Helper function to create a rank plot configuration
create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) { create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) {
num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE) num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE)
@@ -1578,9 +1636,9 @@ main <- function() {
write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
message("Generating reference interaction plots") # message("Generating reference interaction plots")
reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference") # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16) # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
message("Setting missing deletion values to the highest theoretical value at each drug conc for L") message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
df_deletion <- df_na_stats %>% # formerly X2 df_deletion <- df_na_stats %>% # formerly X2
@@ -1648,7 +1706,6 @@ main <- function() {
# rank_plot_configs <- generate_rank_plot_configs( # rank_plot_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = FALSE, # is_lm = FALSE,
# adjust = TRUE
# ) # )
# generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs, # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs,
# page_width = 18, page_height = 12) # page_width = 18, page_height = 12)
@@ -1657,7 +1714,6 @@ main <- function() {
# rank_lm_plot_configs <- generate_rank_plot_configs( # rank_lm_plot_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = TRUE, # is_lm = TRUE,
# adjust = TRUE
# ) # )
# generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs, # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs,
# page_width = 18, page_height = 12) # page_width = 18, page_height = 12)
@@ -1666,7 +1722,6 @@ main <- function() {
# rank_plot_filtered_configs <- generate_rank_plot_configs( # rank_plot_filtered_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = FALSE, # is_lm = FALSE,
# adjust = FALSE,
# filter_na = TRUE, # filter_na = TRUE,
# overlap_color = TRUE # overlap_color = TRUE
# ) # )
@@ -1677,7 +1732,6 @@ main <- function() {
# rank_plot_lm_filtered_configs <- generate_rank_plot_configs( # rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = TRUE, # is_lm = TRUE,
# adjust = FALSE,
# filter_na = TRUE, # filter_na = TRUE,
# overlap_color = TRUE # overlap_color = TRUE
# ) # )