Add correlation and rank coefficients to calculations

This commit is contained in:
2024-10-07 04:47:30 -04:00
parent 2073517192
commit 52381426b2

View File

@@ -219,9 +219,9 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
# This is a helper function to perform a linear regression and extract coefficients # This is a helper function to perform a linear regression and extract coefficients
perform_lm <- function(data, response, predictor, max_conc) { perform_lm <- function(data, response, predictor, max_conc) {
valid_data <- data %>% filter(!is.na(.data[[response]])) valid_data <- data %>% filter(!is.na(data[[response]]))
if (nrow(valid_data) > 1) { if (nrow(valid_data) > 1) {
model <- lm(.data[[response]] ~ .data[[predictor]], data = valid_data) model <- lm(data[[response]] ~ data[[predictor]], data = valid_data)
list( list(
intercept = coef(model)[1], intercept = coef(model)[1],
slope = coef(model)[2], slope = coef(model)[2],
@@ -286,7 +286,6 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
NG_sum = sum(NG, na.rm = TRUE), NG_sum = sum(NG, na.rm = TRUE),
DB_sum = sum(DB, na.rm = TRUE), DB_sum = sum(DB, na.rm = TRUE),
SM_sum = sum(SM, na.rm = TRUE), SM_sum = sum(SM, na.rm = TRUE),
num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
# Expected values # Expected values
Exp_L = WT_L + Raw_Shift_L, Exp_L = WT_L + Raw_Shift_L,
@@ -374,10 +373,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
lm_mean_r = mean(lm_Score_r, na.rm = TRUE), lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
lm_sd_r = sd(lm_Score_r, na.rm = TRUE), lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE), lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE) lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
) %>%
# Calculate Z-lm scores
mutate(
Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L, Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K, Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
@@ -388,10 +384,24 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
interactions <- calculations %>% interactions <- calculations %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( summarise(
Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs),
Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1),
Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1), Sum_Z_Score_L = sum(Zscore_L, na.rm = TRUE),
Sum_Z_Score_K = sum(Zscore_K, na.rm = TRUE),
Sum_Z_Score_r = sum(Zscore_r, na.rm = TRUE),
Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE),
Avg_Zscore_L = Sum_Z_Score_L / first(num_non_removed_concs),
Avg_Zscore_K = Sum_Z_Score_K / first(num_non_removed_concs),
Avg_Zscore_r = Sum_Z_Score_r / first(num_non_removed_concs),
Avg_Zscore_AUC = Sum_Z_Score_AUC / first(num_non_removed_concs),
# R_Squared
R_Squared_L = first(R_Squared_L),
R_Squared_K = first(R_Squared_K),
R_Squared_r = first(R_Squared_r),
R_Squared_AUC = first(R_Squared_AUC),
# Interaction Z-scores # Interaction Z-scores
Z_lm_L = first(Z_lm_L), Z_lm_L = first(Z_lm_L),
@@ -411,13 +421,19 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_Shift_r = first(Z_Shift_r), Z_Shift_r = first(Z_Shift_r),
Z_Shift_AUC = first(Z_Shift_AUC), Z_Shift_AUC = first(Z_Shift_AUC),
# Gene-Gene Interaction
lm_Score_L = first(lm_Score_L),
lm_Score_K = first(lm_Score_K),
lm_Score_r = first(lm_Score_r),
lm_Score_AUC = first(lm_Score_AUC),
# NG, DB, SM values # NG, DB, SM values
NG = first(NG), NG_sum_int = sum(NG),
DB = first(DB), DB_sum_int = sum(DB),
SM = first(SM), SM_sum_int = sum(SM),
.groups = "drop" .groups = "drop"
) %>% ) %>%
arrange(desc(Z_lm_L), desc(NG)) arrange(desc(Z_lm_L), desc(NG_sum_int))
# Deletion data ranking and linear modeling # Deletion data ranking and linear modeling
if (type == "deletion") { if (type == "deletion") {
@@ -463,61 +479,78 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore", Z_lm_L >= overlap_threshold & Avg_Zscore_L <= -overlap_threshold ~ "Deletion Enhancer lm, Deletion Suppressor Avg Zscore",
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
TRUE ~ "No Effect" TRUE ~ "No Effect"
)) %>% ),
group_modify(~ {
# For rank plots # For rank plots
lm_L <- perform_lm(.x, "Z_lm_L", "Avg_Zscore_L", max_conc) lm_L = perform_lm(., "Z_lm_L", "Avg_Zscore_L", max_conc),
lm_K <- perform_lm(.x, "Z_lm_K", "Avg_Zscore_K", max_conc) lm_K = perform_lm(., "Z_lm_K", "Avg_Zscore_K", max_conc),
lm_r <- perform_lm(.x, "Z_lm_r", "Avg_Zscore_r", max_conc) lm_r = perform_lm(., "Z_lm_r", "Avg_Zscore_r", max_conc),
lm_AUC <- perform_lm(.x, "Z_lm_AUC", "Avg_Zscore_AUC", max_conc) lm_AUC = perform_lm(., "Z_lm_AUC", "Avg_Zscore_AUC", max_conc),
# For correlation plots # For correlation plots
Z_lm_K_L <- perform_lm(.x, "Z_lm_K", "Z_lm_L", max_conc) Z_lm_K_L = perform_lm(., "Z_lm_K", "Z_lm_L", max_conc),
Z_lm_r_L <- perform_lm(.x, "Z_lm_r", "Z_lm_L", max_conc) Z_lm_r_L = perform_lm(., "Z_lm_r", "Z_lm_L", max_conc),
Z_lm_R_AUC_L <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_L", max_conc) Z_lm_R_AUC_L = perform_lm(., "Z_lm_AUC", "Z_lm_L", max_conc),
Z_lm_R_r_K <- perform_lm(.x, "Z_lm_R_r", "Z_lm_K", max_conc) Z_lm_R_r_K = perform_lm(., "Z_lm_r", "Z_lm_K", max_conc),
Z_lm_R_AUC_K <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_K", max_conc) Z_lm_R_AUC_K = perform_lm(., "Z_lm_AUC", "Z_lm_K", max_conc),
Z_lm_R_AUC_r <- perform_lm(.x, "Z_lm_R_AUC", "Z_lm_r", max_conc) Z_lm_R_AUC_r = perform_lm(., "Z_lm_AUC", "Z_lm_r", max_conc),
.x %>% # Extract coefficients and statistics for each model
mutate( lm_rank_intercept_L = lm_L$intercept,
# For rank plots lm_rank_slope_L = lm_L$slope,
lm_R_squared_L = lm_L$r.squared, R_Squared_L = lm_L$r_squared,
lm_R_squared_K = lm_K$r.squared, lm_Score_L = lm_L$score,
lm_R_squared_r = lm_r$r.squared,
lm_R_squared_AUC = lm_AUC$r.squared,
# For correlation plots lm_intercept_K = lm_K$intercept,
Z_lm_R_squared_K_L = Z_lm_K_L$r.squared, lm_slope_K = lm_K$slope,
Z_lm_R_squared_r_L = Z_lm_r_L$r.squared, R_Squared_K = lm_K$r_squared,
Z_lm_R_squared_AUC_L = Z_lm_R_AUC_L$r.squared, lm_Score_K = lm_K$score,
Z_lm_R_squared_r_K = Z_lm_R_r_K$r.squared,
Z_lm_R_squared_AUC_K = Z_lm_R_AUC_K$r.squared, lm_intercept_r = lm_r$intercept,
Z_lm_R_squared_AUC_r = Z_lm_R_AUC_r$r.squared, lm_slope_r = lm_r$slope,
R_Squared_r = lm_r$r_squared,
lm_Score_r = lm_r$score,
lm_intercept_AUC = lm_AUC$intercept,
lm_slope_AUC = lm_AUC$slope,
R_Squared_AUC = lm_AUC$r_squared,
lm_Score_AUC = lm_AUC$score,
Z_lm_intercept_K_L = Z_lm_K_L$intercept, Z_lm_intercept_K_L = Z_lm_K_L$intercept,
Z_lm_intercept_r_L = Z_lm_r_L$intercept,
Z_lm_intercept_AUC_L = Z_lm_R_AUC_L$intercept,
Z_lm_intercept_r_K = Z_lm_R_r_K$intercept,
Z_lm_intercept_AUC_K = Z_lm_R_AUC_K$intercept,
Z_lm_intercept_AUC_r = Z_lm_R_AUC_r$intercept,
Z_lm_slope_K_L = Z_lm_K_L$slope, Z_lm_slope_K_L = Z_lm_K_L$slope,
Z_lm_slope_r_L = Z_lm_r_L$slope, Z_lm_R_squared_K_L = Z_lm_K_L$r_squared,
Z_lm_slope_AUC_L = Z_lm_R_AUC_L$slope,
Z_lm_slope_r_K = Z_lm_R_r_K$slope,
Z_lm_slope_AUC_K = Z_lm_R_AUC_K$slope,
Z_lm_slope_AUC_r = Z_lm_R_AUC_r$slope,
Z_lm_Score_K_L = Z_lm_K_L$score, Z_lm_Score_K_L = Z_lm_K_L$score,
Z_lm_intercept_r_L = Z_lm_r_L$intercept,
Z_lm_slope_r_L = Z_lm_r_L$slope,
Z_lm_R_squared_r_L = Z_lm_r_L$r_squared,
Z_lm_Score_r_L = Z_lm_r_L$score, Z_lm_Score_r_L = Z_lm_r_L$score,
Z_lm_Score_AUC_L = Z_lm_R_AUC_L$score,
Z_lm_Score_r_K = Z_lm_R_r_K$score, Z_lm_intercept_R_AUC_L = Z_lm_R_AUC_L$intercept,
Z_lm_Score_AUC_K = Z_lm_R_AUC_K$score, Z_lm_slope_R_AUC_L = Z_lm_R_AUC_L$slope,
Z_lm_Score_AUC_r = Z_lm_R_AUC_r$score Z_lm_R_squared_R_AUC_L = Z_lm_R_AUC_L$r_squared,
Z_lm_Score_R_AUC_L = Z_lm_R_AUC_L$score,
Z_lm_intercept_R_r_K = Z_lm_R_r_K$intercept,
Z_lm_slope_R_r_K = Z_lm_R_r_K$slope,
Z_lm_R_squared_R_r_K = Z_lm_R_r_K$r_squared,
Z_lm_Score_R_r_K = Z_lm_R_r_K$score,
Z_lm_intercept_R_AUC_K = Z_lm_R_AUC_K$intercept,
Z_lm_slope_R_AUC_K = Z_lm_R_AUC_K$slope,
Z_lm_R_squared_R_AUC_K = Z_lm_R_AUC_K$r_squared,
Z_lm_Score_R_AUC_K = Z_lm_R_AUC_K$score,
Z_lm_intercept_R_AUC_r = Z_lm_R_AUC_r$intercept,
Z_lm_slope_R_AUC_r = Z_lm_R_AUC_r$slope,
Z_lm_R_squared_R_AUC_r = Z_lm_R_AUC_r$r_squared,
Z_lm_Score_R_AUC_r = Z_lm_R_AUC_r$score
) %>%
# Drop the lm objects and keep only the lm results
select(
-lm_L, -lm_K, -lm_r, -lm_AUC, -Z_lm_K_L, -Z_lm_r_L, -Z_lm_R_AUC_L,
-Z_lm_R_r_K, -Z_lm_R_AUC_K, -Z_lm_R_AUC_r
) )
})
} # end deletion-specific block } # end deletion-specific block
# Create the final calculations and interactions dataframes with only required columns for csv output # Create the final calculations and interactions dataframes with only required columns for csv output
@@ -536,35 +569,38 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Exp_L, Exp_K, Exp_r, Exp_AUC, Exp_L, Exp_K, Exp_r, Exp_AUC,
Delta_L, Delta_K, Delta_r, Delta_AUC, Delta_L, Delta_K, Delta_r, Delta_AUC,
mean_Delta_L, mean_Delta_K, mean_Delta_r, mean_Delta_AUC, mean_Delta_L, mean_Delta_K, mean_Delta_r, mean_Delta_AUC,
Zscore_L, Zscore_K, Zscore_r, Zscore_AUC Zscore_L, Zscore_K, Zscore_r, Zscore_AUC,
) NG_sum, DB_sum, SM_sum
) %>%
rename(NG = NG_sum, DB = DB_sum, SM = SM_sum)
df_interactions <- interactions %>% df_interactions <- interactions %>%
select( select(
all_of(group_vars), all_of(group_vars),
NG, DB, SM,
Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC, Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC,
Sum_Z_Score_L, Sum_Z_Score_K, Sum_Z_Score_r, Sum_Z_Score_AUC,
Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC, Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC,
Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC, Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC,
Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC, Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC,
lm_R_squared_L, lm_R_squared_K, lm_R_squared_r, lm_R_squared_AUC, lm_Score_L, lm_Score_K, lm_Score_r, lm_Score_AUC,
lm_intercept_L, lm_intercept_K, lm_intercept_r, lm_intercept_AUC, R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC,
lm_slope_L, lm_slope_K, lm_slope_r, lm_slope_AUC, Overlap NG_sum_int, DB_sum_int, SM_sum_int
) ) %>%
rename(NG = NG_sum_int, DB = DB_sum_int, SM = SM_sum_int)
# Avoid column collision on left join for overlapping variables # Avoid column collision on left join for overlapping variables
calculations_no_overlap <- calculations %>% calculations_no_overlap <- calculations %>%
select(-any_of(c("DB", "NG", "SM", select(-any_of(c("DB", "NG", "SM",
# Don't need these anywhere so easier to remove
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
"R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC",
# we need these for the interactions but the original code has the same names in both datasets
"Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC",
"Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC", "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC"
"lm_R_squared_L", "lm_R_squared_K", "lm_R_squared_r", "lm_R_squared_AUC",
"lm_intercept_L", "lm_intercept_K", "lm_intercept_r", "lm_intercept_AUC",
"lm_slope_L", "lm_slope_K", "lm_slope_r", "lm_slope_AUC"
))) )))
full_data <- calculations_no_overlap %>% full_data <- calculations_no_overlap %>%
left_join(df_interactions, by = group_vars) left_join(interactions, by = group_vars)
# Return final dataframes # Return final dataframes
return(list( return(list(
@@ -1239,7 +1275,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
generate_correlation_plot_configs <- function(df, df_reference) { generate_correlation_plot_configs <- function(df, df_reference) {
# Define relationships for different-variable correlations # Define relationships for different-variable correlations
relationships <- list( relationships <- list(
list(x = "L", y = "K"), list(x = "L", y = "K"), # x-var is predictor, y-var is reponse
list(x = "L", y = "r"), list(x = "L", y = "r"),
list(x = "L", y = "AUC"), list(x = "L", y = "AUC"),
list(x = "K", y = "r"), list(x = "K", y = "r"),
@@ -1261,17 +1297,17 @@ generate_correlation_plot_configs <- function(df, df_reference) {
for (highlight_cyan in highlight_cyan_options) { for (highlight_cyan in highlight_cyan_options) {
for (rel in relationships) { for (rel in relationships) {
# Extract relevant variable names for Z_lm values # Extract relevant variable names for Z_lm values
x_var <- paste0("Z_lm_", rel$x) x_var <- paste0("Z_lm_", rel$x) # predictor
y_var <- paste0("Z_lm_", rel$y) y_var <- paste0("Z_lm_", rel$y) # response
# Find the max and min of both dataframes for printing linear regression line # Find the max and min of both dataframes for printing linear regression line
xmin <- min(c(min(df[[x_var]], na.rm = TRUE), min(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE) xmin <- min(c(min(df[[x_var]], na.rm = TRUE), min(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
xmax <- max(c(max(df[[x_var]], na.rm = TRUE), max(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE) xmax <- max(c(max(df[[x_var]], na.rm = TRUE), max(df_reference[[x_var]], na.rm = TRUE)), na.rm = TRUE)
# Extract the R-squared, intercept, and slope from the df (first value) # Extract the R-squared, intercept, and slope from the df (first value)
intercept <- df[[paste0("lm_intercept_", rel$x)]][1] intercept <- df[[paste0("Z_lm_intercept_", rel$y, "_", rel$x)]][1]
slope <- df[[paste0("lm_slope_", rel$x)]][1] slope <- df[[paste0("Z_lm_slope_", rel$y, "_", rel$x)]][1]
r_squared <- df[[paste0("lm_R_squared_", rel$x)]][1] r_squared <- df[[paste0("Z_lm_R_squared_", rel$y, "_", rel$x)]][1]
# Generate the label for the plot # Generate the label for the plot
plot_label <- paste("Interaction", rel$x, "vs.", rel$y) plot_label <- paste("Interaction", rel$x, "vs.", rel$y)