Calculate ranks step-wise

This commit is contained in:
2024-10-08 11:59:24 -04:00
parent d456bcdadd
commit bff7e7cfb4

View File

@@ -222,7 +222,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
perform_lm <- function(response, predictor, max_conc) {
if (all(is.na(response)) || all(is.na(predictor)) ||
length(response[!is.na(response)]) == 0 || length(predictor[!is.na(predictor)]) == 0) {
length(response[!is.na(response)]) < 2 || length(predictor[!is.na(predictor)]) < 2) {
return(list(intercept = NA, slope = NA, r_squared = NA, score = NA))
} else {
fit <- lm(response ~ predictor)
@@ -337,52 +337,54 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(total_conc_num - 1),
Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / first(total_conc_num - 1),
# Perform gene-gene linear modeling
# Perform per-gene linear modeling
lm_L = list(perform_lm(Delta_L, conc_num_factor, max_conc)),
lm_K = list(perform_lm(Delta_K, conc_num_factor, max_conc)),
lm_r = list(perform_lm(Delta_r, conc_num_factor, max_conc)),
lm_AUC = list(perform_lm(Delta_AUC, conc_num_factor, max_conc)),
# Extract coefficients and statistics for each model
lm_intercept_L = lm_L[[1]]$intercept,
lm_slope_L = lm_L[[1]]$slope,
R_Squared_L = lm_L[[1]]$r_squared,
lm_Score_L = lm_L[[1]]$score,
lm_intercept_K = lm_K[[1]]$intercept,
lm_slope_K = lm_K[[1]]$slope,
R_Squared_K = lm_K[[1]]$r_squared,
lm_Score_K = lm_K[[1]]$score,
lm_intercept_r = lm_r[[1]]$intercept,
lm_slope_r = lm_r[[1]]$slope,
R_Squared_r = lm_r[[1]]$r_squared,
lm_Score_r = lm_r[[1]]$score,
lm_intercept_AUC = lm_AUC[[1]]$intercept,
lm_slope_AUC = lm_AUC[[1]]$slope,
R_Squared_AUC = lm_AUC[[1]]$r_squared,
lm_Score_AUC = lm_AUC[[1]]$score,
.groups = "drop"
) %>%
select(-c(lm_L, lm_K, lm_r, lm_AUC)) # drop linear models since we have coefficients
# Summary statistics for lm scores
interactions <- interactions %>%
group_by(across(all_of(group_vars))) %>%
mutate(
lm_mean_L = mean(lm_Score_L, na.rm = TRUE),
lm_sd_L = sd(lm_Score_L, na.rm = TRUE),
lm_mean_K = mean(lm_Score_K, na.rm = TRUE),
lm_sd_K = sd(lm_Score_K, na.rm = TRUE),
lm_mean_r = mean(lm_Score_r, na.rm = TRUE),
lm_sd_r = sd(lm_Score_r, na.rm = TRUE),
lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE),
lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE),
Z_lm_L = (lm_Score_L - lm_mean_L) / lm_sd_L,
Z_lm_K = (lm_Score_K - lm_mean_K) / lm_sd_K,
Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC
lm_intercept_L = map_dbl(lm_L, "intercept"),
lm_slope_L = map_dbl(lm_L, "slope"),
R_Squared_L = map_dbl(lm_L, "r_squared"),
lm_Score_L = map_dbl(lm_L, "score"),
lm_intercept_K = map_dbl(lm_K, "intercept"),
lm_slope_K = map_dbl(lm_K, "slope"),
R_Squared_K = map_dbl(lm_K, "r_squared"),
lm_Score_K = map_dbl(lm_K, "score"),
lm_intercept_r = map_dbl(lm_r, "intercept"),
lm_slope_r = map_dbl(lm_r, "slope"),
R_Squared_r = map_dbl(lm_r, "r_squared"),
lm_Score_r = map_dbl(lm_r, "score"),
lm_intercept_AUC = map_dbl(lm_AUC, "intercept"),
lm_slope_AUC = map_dbl(lm_AUC, "slope"),
R_Squared_AUC = map_dbl(lm_AUC, "r_squared"),
lm_Score_AUC = map_dbl(lm_AUC, "score")
) %>%
select(-c(lm_L, lm_K, lm_r, lm_AUC)) # drop linear models since we now have coefficients
lm_mean_L <- mean(interactions$lm_Score_L, na.rm = TRUE)
lm_sd_L <- sd(interactions$lm_Score_L, na.rm = TRUE)
lm_mean_K <- mean(interactions$lm_Score_K, na.rm = TRUE)
lm_sd_K <- sd(interactions$lm_Score_K, na.rm = TRUE)
lm_mean_r <- mean(interactions$lm_Score_r, na.rm = TRUE)
lm_sd_r <- sd(interactions$lm_Score_r, na.rm = TRUE)
lm_mean_AUC <- mean(interactions$lm_Score_AUC, na.rm = TRUE)
lm_sd_AUC <- sd(interactions$lm_Score_AUC, na.rm = TRUE)
interactions <- interactions %>%
mutate(
Z_lm_L = if (!is.na(lm_sd_L) && lm_sd_L != 0) (lm_Score_L - lm_mean_L) / lm_sd_L else NA_real_,
Z_lm_K = if (!is.na(lm_sd_K) && lm_sd_K != 0) (lm_Score_K - lm_mean_K) / lm_sd_K else NA_real_,
Z_lm_r = if (!is.na(lm_sd_r) && lm_sd_r != 0) (lm_Score_r - lm_mean_r) / lm_sd_r else NA_real_,
Z_lm_AUC = if (!is.na(lm_sd_AUC) && lm_sd_AUC != 0) (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC else NA_real_
) %>%
arrange(desc(Z_lm_L), desc(NG))
# Deletion data ranking and linear modeling
# Initialize this variable so we can return it as NULL for reference
@@ -408,11 +410,12 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Rank_lm_L = rank(Z_lm_adjusted_L),
Rank_lm_K = rank(Z_lm_adjusted_K),
Rank_lm_r = rank(Z_lm_adjusted_r),
Rank_lm_AUC = rank(Z_lm_adjusted_AUC),
Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
Rank_lm_AUC = rank(Z_lm_adjusted_AUC)
# Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
# Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
# Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
# Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
)
# We are filtering invalid Z_lm scores so this must be in its own df
@@ -443,10 +446,10 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
# Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
# Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
# Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
# Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
Correlation_lm_L = list(perform_lm(Z_lm_L, Avg_Zscore_L, max_conc)),
Correlation_lm_K = list(perform_lm(Z_lm_K, Avg_Zscore_K, max_conc)),
Correlation_lm_r = list(perform_lm(Z_lm_r, Avg_Zscore_r, max_conc)),
@@ -497,7 +500,11 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Correlation_lm_slope_AUC_r = Correlation_lm_AUC_r[[1]]$slope,
Correlation_lm_R_Squared_AUC_r = Correlation_lm_AUC_r[[1]]$r_squared,
Correlation_lm_Score_AUC_r = Correlation_lm_AUC_r[[1]]$score
)
) %>%
select(-c(Rank_lm_lm_L, Rank_lm_lm_K, Rank_lm_lm_r, Rank_lm_lm_AUC,
Correlation_lm_L, Correlation_lm_K, Correlation_lm_r, Correlation_lm_AUC,
Correlation_lm_K_L, Correlation_lm_r_L, Correlation_lm_AUC_L,
Correlation_lm_r_K, Correlation_lm_AUC_K, Correlation_lm_AUC_r))
}
# Create the final calculations and interactions dataframes with specific columns for csv output
@@ -625,6 +632,23 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
)
}
# Add annotations if specified
if (!is.null(config$annotations)) {
for (annotation in config$annotations) {
plot <- plot +
annotate(
"text",
x = ifelse(is.null(annotation$x), 0, annotation$x),
y = ifelse(is.null(annotation$y), Inf, annotation$y),
label = annotation$label,
hjust = ifelse(is.null(annotation$hjust), 0.5, annotation$hjust),
vjust = ifelse(is.null(annotation$vjust), 1, annotation$vjust),
size = ifelse(is.null(annotation$size), 3, annotation$size),
color = ifelse(is.null(annotation$color), "black", annotation$color)
)
}
}
if (!is.null(config$grid_lines)) {
plot <- plot + theme(
panel.grid.major = if (config$grid_lines$major) element_line() else element_blank(),
@@ -783,6 +807,14 @@ generate_scatter_plot <- function(plot, config) {
# Extract necessary values
intercept <- config$lm_line$intercept # required
slope <- config$lm_line$slope # required
message("----- Generating lm_line -----")
message("Intercept: ", intercept)
message("Slope: ", slope)
message("Class: ", class(config$df[[config$x_var]]))
message("------------------------------")
xmin <- ifelse(!is.null(config$lm_line$xmin), config$lm_line$xmin, min(as.numeric(config$df[[config$x_var]])))
xmax <- ifelse(!is.null(config$lm_line$xmax), config$lm_line$xmax, max(as.numeric(config$df[[config$x_var]])))
color <- ifelse(!is.null(config$lm_line$color), config$lm_line$color, "blue")
@@ -1078,7 +1110,7 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
lm_slope_col <- paste0("lm_slope_", var)
lm_intercept <- first(group_data_filtered[[lm_intercept_col]])
lm_slope <- first(group_data_filtered[[lm_slope_col]])
plot_config <- list(
df = group_data_filtered,
plot_type = "scatter",
@@ -1183,7 +1215,7 @@ generate_rank_plot_configs <- function(df, is_lm = FALSE, filter_na = FALSE, ove
# Generate plots for each variable
for (variable in variables) {
rank_var <- if (is_lm) paste0("Rank_lm_", variable) else paste0("Rank_", variable)
zscore_var <- if (is_lm) paste0("Z_lm_", variable) else paste0("Avg_Zscore_", variable)
zscore_var <- if (is_lm) paste0("Z_lm_adjusted_", variable) else paste0("Avg_Zscore_adjusted_", variable)
y_label <- if (is_lm) paste("Int Z score", variable) else paste("Avg Z score", variable)
# Loop through SD bands
@@ -1614,9 +1646,9 @@ main <- function() {
write.csv(calculations_reference_csv, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
write.csv(interactions_reference_csv, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
message("Generating reference interaction plots")
reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
# message("Generating reference interaction plots")
# reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
# generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
deletion_adjusted <- stats_na %>% # formerly X2
@@ -1647,9 +1679,9 @@ main <- function() {
write.csv(calculations_csv, file = file.path(out_dir, "zscore_calculations.csv"), row.names = FALSE)
write.csv(interactions_csv, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE)
message("Generating deletion interaction plots")
deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
# message("Generating deletion interaction plots")
# deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
# generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
message("Writing enhancer/suppressor csv files")
interaction_threshold <- 2 # TODO add to study config?