Add correlation lms to calculate_interaction_scores()

This commit is contained in:
2024-10-07 01:20:36 -04:00
parent 642c74fa62
commit 2073517192

View File

@@ -217,6 +217,22 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
group_vars <- c("OrfRep", "Gene", "Drug") group_vars <- c("OrfRep", "Gene", "Drug")
} }
# This is a helper function to perform a linear regression and extract coefficients
perform_lm <- function(data, response, predictor, max_conc) {
valid_data <- data %>% filter(!is.na(.data[[response]]))
if (nrow(valid_data) > 1) {
model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data)
list(
intercept = coef(model)[1],
slope = coef(model)[2],
r_squared = summary(model)$r.squared,
score = max_conc * coef(model)[2] + coef(model)[1]
)
} else {
list(intercept = NA, slope = NA, r_squared = NA, score = NA)
}
}
# Calculate WT statistics from df_bg # Calculate WT statistics from df_bg
wt_stats <- df_bg %>% wt_stats <- df_bg %>%
group_by(across(all_of(bg_group_vars))) %>% group_by(across(all_of(bg_group_vars))) %>%
@@ -300,41 +316,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
ungroup() %>% # Ungroup before group_modify ungroup() %>% # Ungroup before group_modify
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
group_modify(~ { group_modify(~ {
lm_L <- perform_lm(.x, "Delta_L", "conc_num_factor", max_conc)
lm_K <- perform_lm(.x, "Delta_K", "conc_num_factor", max_conc)
lm_r <- perform_lm(.x, "Delta_r", "conc_num_factor", max_conc)
lm_AUC <- perform_lm(.x, "Delta_AUC", "conc_num_factor", max_conc)
# Filter each column for valid data or else linear modeling will fail
valid_data_L <- .x %>% filter(!is.na(Delta_L))
valid_data_K <- .x %>% filter(!is.na(Delta_K))
valid_data_r <- .x %>% filter(!is.na(Delta_r))
valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC))
# Perform linear modeling
lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL
lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL
lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL
lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL
# Extract coefficients for calculations and plotting
.x %>% .x %>%
mutate( mutate(
lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA, lm_intercept_L = lm_L$intercept,
lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA, lm_slope_L = lm_L$slope,
R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA, R_Squared_L = lm_L$r_squared,
lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA, lm_Score_L = lm_L$score,
lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA, lm_intercept_K = lm_K$intercept,
lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA, lm_slope_K = lm_K$slope,
R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA, R_Squared_K = lm_K$r_squared,
lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA, lm_Score_K = lm_K$score,
lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA, lm_intercept_r = lm_r$intercept,
lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA, lm_slope_r = lm_r$slope,
R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA, R_Squared_r = lm_r$r_squared,
lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA, lm_Score_r = lm_r$score,
lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA, lm_intercept_AUC = lm_AUC$intercept,
lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA, lm_slope_AUC = lm_AUC$slope,
R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA, R_Squared_AUC = lm_AUC$r_squared,
lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA lm_Score_AUC = lm_AUC$score
) )
}) %>% }) %>%
ungroup() ungroup()
@@ -404,29 +411,42 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_Shift_r = first(Z_Shift_r), Z_Shift_r = first(Z_Shift_r),
Z_Shift_AUC = first(Z_Shift_AUC), Z_Shift_AUC = first(Z_Shift_AUC),
# R Squared values
R_Squared_L = first(R_Squared_L),
R_Squared_K = first(R_Squared_K),
R_Squared_r = first(R_Squared_r),
R_Squared_AUC = first(R_Squared_AUC),
# lm intercepts
lm_intercept_L = first(lm_intercept_L),
lm_intercept_K = first(lm_intercept_K),
lm_intercept_r = first(lm_intercept_r),
lm_intercept_AUC = first(lm_intercept_AUC),
# lm slopes
lm_slope_L = first(lm_slope_L),
lm_slope_K = first(lm_slope_K),
lm_slope_r = first(lm_slope_r),
lm_slope_AUC = first(lm_slope_AUC),
# NG, DB, SM values # NG, DB, SM values
NG = first(NG), NG = first(NG),
DB = first(DB), DB = first(DB),
SM = first(SM), SM = first(SM),
.groups = "drop" .groups = "drop"
) %>%
arrange(desc(Z_lm_L), desc(NG))
# Deletion data ranking and linear modeling
if (type == "deletion") {
interactions <- interactions %>%
mutate(
Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC)
) %>%
mutate(
Rank_L = rank(Avg_Zscore_L_adjusted),
Rank_K = rank(Avg_Zscore_K_adjusted),
Rank_r = rank(Avg_Zscore_r_adjusted),
Rank_AUC = rank(Avg_Zscore_AUC_adjusted),
Rank_lm_L = rank(Z_lm_L_adjusted),
Rank_lm_K = rank(Z_lm_K_adjusted),
Rank_lm_r = rank(Z_lm_r_adjusted),
Rank_lm_AUC = rank(Z_lm_AUC_adjusted)
) %>%
mutate(
lm_R_squared_rank_L = summary(lm(Rank_lm_L ~ Rank_L, data = .))$r.squared,
lm_R_squared_rank_K = summary(lm(Rank_lm_K ~ Rank_K, data = .))$r.squared,
lm_R_squared_rank_r = summary(lm(Rank_lm_r ~ Rank_r, data = .))$r.squared,
lm_R_squared_rank_AUC = summary(lm(Rank_lm_AUC ~ Rank_AUC, data = .))$r.squared
) )
# Add overlap threshold categories based on Z-lm and Avg-Z scores # Add overlap threshold categories based on Z-lm and Avg-Z scores
@@ -443,16 +463,64 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore", Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
TRUE ~ "No Effect" TRUE ~ "No Effect"
), )) %>%
group_modify(~ {
# For rank plots
lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc)
lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc)
lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc)
lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc)
# For correlation plots # For correlation plots
lm_R_squared_L = summary(lm(Z_lm_L ~ Avg_Zscore_L))$r.squared, Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc)
lm_R_squared_K = summary(lm(Z_lm_K ~ Avg_Zscore_K))$r.squared, Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc)
lm_R_squared_r = summary(lm(Z_lm_r ~ Avg_Zscore_r))$r.squared, Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc)
lm_R_squared_AUC = summary(lm(Z_lm_AUC ~ Avg_Zscore_AUC))$r.squared Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc)
) Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc)
Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc)
# Creating the final calculations and interactions dataframes with only required columns for csv output .x %>%
mutate(
# For rank plots
lm_R_squared_L = lm_L$r.squared,
lm_R_squared_K = lm_K$r.squared,
lm_R_squared_r = lm_r$r.squared,
lm_R_squared_AUC = lm_AUC$r.squared,
# For correlation plots
Z_lm_R_squared_K_L = Z_lm_K_L$r.squared,
Z_lm_R_squared_r_L = Z_lm_r_L$r.squared,
Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared,
Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared,
Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared,
Z_lm_intercept_K_L = Z_lm_K_L$intercept,
Z_lm_intercept_r_L = Z_lm_r_L$intercept,
Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
Z_lm_slope_K_L = Z_lm_K_L$slope,
Z_lm_slope_r_L = Z_lm_r_L$slope,
Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
Z_lm_slope_r_K = Z_lm_R_r_K$slope,
Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
Z_lm_Score_K_L = Z_lm_K_L$score,
Z_lm_Score_r_L = Z_lm_r_L$score,
Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
Z_lm_Score_r_K = Z_lm_R_r_K$score,
Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score,
Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score
)
})
} # end deletion-specific block
# Create the final calculations and interactions dataframes with only required columns for csv output
df_calculations <- calculations %>% df_calculations <- calculations %>%
select( select(
all_of(group_vars), all_of(group_vars),
@@ -484,7 +552,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap
) )
# Join calculations and interactions to avoid dimension mismatch # Avoid column collision on left join for overlapping variables
calculations_no_overlap <- calculations %>% calculations_no_overlap <- calculations %>%
select(-any_of(c("DB", "NG", "SM", select(-any_of(c("DB", "NG", "SM",
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
@@ -1091,22 +1159,12 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
)) ))
} }
generate_rank_plot_configs <- function(df, is_lm = FALSE, adjust = FALSE, filter_na = FALSE, overlap_color = FALSE) { generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, overlap_color = FALSE) {
sd_bands <- c(1, 2, 3) sd_bands <- c(1, 2, 3)
plot_configs <- list() plot_configs <- list()
variables <- c("L", "K") variables <- c("L", "K")
# Adjust (if necessary) and rank columns
for (variable in variables) {
if (adjust) {
df[[paste0("Avg_Zscore_", variable)]] <- ifelse(is.na(df[[paste0("Avg_Zscore_", variable)]]), 0.001, df[[paste0("Avg_Zscore_", variable)]])
df[[paste0("Z_lm_", variable)]] <- ifelse(is.na(df[[paste0("Z_lm_", variable)]]), 0.001, df[[paste0("Z_lm_", variable)]])
}
df[[paste0("Rank_", variable)]] <- rank(df[[paste0("Avg_Zscore_", variable)]], na.last = "keep")
df[[paste0("Rank_lm_", variable)]] <- rank(df[[paste0("Z_lm_", variable)]], na.last = "keep")
}
# Helper function to create a rank plot configuration # Helper function to create a rank plot configuration
create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) { create_plot_config <- function(variable, rank_var, zscore_var, y_label, sd_band, filter_na, with_annotations = TRUE) {
num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE) num_enhancers <- sum(df[[zscore_var]] >= sd_band, na.rm = TRUE)
@@ -1578,9 +1636,9 @@ main <- function() {
write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
message("Generating reference interaction plots") # message("Generating reference interaction plots")
reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference") # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16) # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
message("Setting missing deletion values to the highest theoretical value at each drug conc for L") message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
df_deletion <- df_na_stats %>% # formerly X2 df_deletion <- df_na_stats %>% # formerly X2
@@ -1648,7 +1706,6 @@ main <- function() {
# rank_plot_configs <- generate_rank_plot_configs( # rank_plot_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = FALSE, # is_lm = FALSE,
# adjust = TRUE
# ) # )
# generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs, # generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs,
# page_width = 18, page_height = 12) # page_width = 18, page_height = 12)
@@ -1657,7 +1714,6 @@ main <- function() {
# rank_lm_plot_configs <- generate_rank_plot_configs( # rank_lm_plot_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = TRUE, # is_lm = TRUE,
# adjust = TRUE
# ) # )
# generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs, # generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs,
# page_width = 18, page_height = 12) # page_width = 18, page_height = 12)
@@ -1666,7 +1722,6 @@ main <- function() {
# rank_plot_filtered_configs <- generate_rank_plot_configs( # rank_plot_filtered_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = FALSE, # is_lm = FALSE,
# adjust = FALSE,
# filter_na = TRUE, # filter_na = TRUE,
# overlap_color = TRUE # overlap_color = TRUE
# ) # )
@@ -1677,7 +1732,6 @@ main <- function() {
# rank_plot_lm_filtered_configs <- generate_rank_plot_configs( # rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
# df_interactions, # df_interactions,
# is_lm = TRUE, # is_lm = TRUE,
# adjust = FALSE,
# filter_na = TRUE, # filter_na = TRUE,
# overlap_color = TRUE # overlap_color = TRUE
# ) # )