Cleanup interaction scores

This commit is contained in:
2024-09-10 02:16:59 -04:00
parent 4e15fb80fb
commit a5c38ec47c

View File

@@ -180,20 +180,21 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
}
# Calculate interaction scores
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
# Calculate total concentration variables
total_conc_num <- length(unique(df$conc_num))
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
# Pull the background means and standard deviations from 0 concentration
# Pull the background means and standard deviations from zero concentration
bg_means <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(),
AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
)
bg_sd <- list(
L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(),
K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(),
@@ -201,56 +202,58 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
)
# Calculate NG, DB, SM, N
interaction_scores <- df %>%
mutate(
WT_L = df$mean_L,
WT_K = df$mean_K,
WT_r = df$mean_r,
WT_AUC = df$mean_AUC,
WT_sd_L = df$sd_L,
WT_sd_K = df$sd_K,
WT_sd_r = df$sd_r,
WT_sd_AUC = df$sd_AUC
) %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
mutate(
N = sum(!is.na(L)),
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE),
N = sum(!is.na(L))
)
across(all_of(variables), list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
), .names = "{.fn}_{.col}")
) %>%
ungroup()
# Calculate Raw_Shift and Z_Shift for each variable
interaction_scores <- interaction_scores %>%
mutate(
Raw_Shift_L = mean_L - bg_means$L,
Raw_Shift_K = mean_K - bg_means$K,
Raw_Shift_r = mean_r - bg_means$r,
Raw_Shift_AUC = mean_AUC - bg_means$AUC,
Z_Shift_L = Raw_Shift_L / bg_sd$L,
Z_Shift_K = Raw_Shift_K / bg_sd$K,
Z_Shift_r = Raw_Shift_r / bg_sd$r,
Z_Shift_AUC = Raw_Shift_AUC / bg_sd$AUC
Raw_Shift_L = mean_L[[1]] - bg_means$L,
Raw_Shift_K = mean_K[[1]] - bg_means$K,
Raw_Shift_r = mean_r[[1]] - bg_means$r,
Raw_Shift_AUC = mean_AUC[[1]] - bg_means$AUC,
Z_Shift_L = Raw_Shift_L[[1]] / df$sd_L[[1]],
Z_Shift_K = Raw_Shift_K[[1]] / df$sd_K[[1]],
Z_Shift_r = Raw_Shift_r[[1]] / df$sd_r[[1]],
Z_Shift_AUC = Raw_Shift_AUC[[1]] / df$sd_AUC[[1]]
)
# Calculate WT and WT_sd for each variable
interaction_scores <- interaction_scores %>%
mutate(
WT_L = mean(mean_L, na.rm = TRUE),
WT_K = mean(mean_K, na.rm = TRUE),
WT_r = mean(mean_r, na.rm = TRUE),
WT_AUC = mean(mean_AUC, na.rm = TRUE),
WT_sd_L = sd(mean_L, na.rm = TRUE),
WT_sd_K = sd(mean_K, na.rm = TRUE),
WT_sd_r = sd(mean_r, na.rm = TRUE),
WT_sd_AUC = sd(mean_AUC, na.rm = TRUE)
)
# Calculate Exp and Delta for each variable
interaction_scores <- interaction_scores %>%
mutate(
Exp_L = WT_L + Raw_Shift_L,
Delta_L = WT_L - Exp_L,
Delta_L = mean_L - Exp_L,
Exp_K = WT_K + Raw_Shift_K,
Delta_K = WT_K - Exp_K,
Delta_K = mean_K - Exp_K,
Exp_r = WT_r + Raw_Shift_r,
Delta_r = WT_r - Exp_r,
Delta_r = mean_r - Exp_r,
Exp_AUC = WT_AUC + Raw_Shift_AUC,
Delta_AUC = WT_AUC - Exp_AUC
Delta_AUC = mean_AUC - Exp_AUC
)
# Adjust Delta for NG and SM conditions
interaction_scores <- interaction_scores %>%
mutate(
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
@@ -258,17 +261,38 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
) %>%
ungroup()
)
# Calculate linear models and lm_Score for each variable
# Calculate Z-scores for each variable
interaction_scores <- interaction_scores %>%
mutate(
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC
)
# Calculate linear models
interaction_scores <- interaction_scores %>%
mutate(
lm_L = lm(Delta_L ~ conc_num_factor),
lm_K = lm(Delta_K ~ conc_num_factor),
lm_r = lm(Delta_r ~ conc_num_factor),
lm_AUC = Dlm(Delta_AUC ~ conc_num_factor)
)
# Calculate linear models and interaction scores
interaction_scores_all <- interaction_scores %>%
group_by(across(all_of(group_vars))) %>%
mutate(
lm_Score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1],
lm_Score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1],
lm_Score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1],
lm_Score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]
lm_Score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
lm_Score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
lm_Score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
lm_Score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
r_squared_L = summary(lm_L)$r.squared,
r_squared_K = summary(lm_K)$r.squared,
r_squared_r = summary(lm_r)$r.squared,
r_squared_AUC = summary(lm_AUC)$r.squared
)
# Calculate Z_lm for each variable
@@ -277,16 +301,11 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE),
Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE),
Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
)
# Calculate Average Z-scores for each variable
interaction_scores_all <- interaction_scores_all %>%
mutate(
Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs,
Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs,
Avg_Zscore_r = sum(Z_Shift_r, na.rm = TRUE) / (total_conc_num - 1),
Avg_Zscore_AUC = sum(Z_Shift_AUC, na.rm = TRUE) / (total_conc_num - 1)
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE),
Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / num_non_removed_concs,
Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / num_non_removed_concs,
Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / (total_conc_num - 1),
Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / (total_conc_num - 1)
)
# Arrange results by Z_lm_L and NG
@@ -298,8 +317,6 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
}
# Main function
generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_layout = NULL) {
`%||%` <- function(a, b) if (!is.null(a)) a else b
@@ -308,9 +325,39 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
plots <- lapply(plot_configs, function(config) {
df <- config$df
plot <- ggplot(df, aes(x = if (config$plot_type == "box") as.factor(!!sym(config$x_var)) else !!sym(config$x_var),
color = as.factor(!!sym(config$color_var))))
plot <- ggplot(df, aes(x = !!sym(config$x_var), y = !!sym(config$y_var), color = as.factor(!!sym(config$color_var))))
# Rank plots with SD annotations
if (config$plot_type == "rank") {
plot <- plot + geom_point(size = 0.1, shape = 3)
# Adding SD bands
if (!is.null(config$sd_band)) {
for (i in seq_len(config$sd_band)) {
plot <- plot +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = i, ymax = Inf, fill = "#542788", alpha = 0.3) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -i, ymax = -Inf, fill = "orange", alpha = 0.3) +
geom_hline(yintercept = c(-i, i), color = "gray")
}
}
# Optionally add enhancer/suppressor count annotations
if (!is.null(config$enhancer_label)) {
plot <- plot + annotate("text", x = config$enhancer_label$x,
y = config$enhancer_label$y, label = config$enhancer_label$label) +
annotate("text", x = config$suppressor_label$x,
y = config$suppressor_label$y, label = config$suppressor_label$label)
}
}
# Correlation plots
if (config$plot_type == "correlation") {
plot <- plot + geom_point(shape = 3, color = "gray70") +
geom_smooth(method = "lm", color = "tomato3") +
annotate("text", x = 0, y = 0, label = config$correlation_text)
}
# General scatter/boxplot/density handling
if (!is.null(config$y_var)) {
plot <- plot + aes(y = !!sym(config$y_var))
@@ -331,22 +378,26 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
}
}
# Plot type selection
plot <- switch(config$plot_type,
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = "identity"),
plot + geom_point() + geom_smooth(method = "lm", se = FALSE))
# Adding y-limits if provided
if (!is.null(config$ylim_vals)) {
plot <- plot + coord_cartesian(ylim = config$ylim_vals)
}
# Setting legend position, titles, and labels
legend_position <- config$legend_position %||% "bottom"
plot <- plot + ggtitle(config$title) + theme_publication(legend_position = legend_position)
plot <- plot + ggtitle(config$title) + theme_Publication(legend_position = legend_position)
if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label)
if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
# Adding text annotations if provided
if (!is.null(config$annotations)) {
for (annotation in config$annotations) {
plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
@@ -393,7 +444,6 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs, grid_la
}
}
interaction_plot_configs <- function(df, variables) {
plot_configs <- list()
@@ -463,7 +513,6 @@ interaction_plot_configs <- function(df, variables) {
return(plot_configs)
}
correlation_plot_configs <- function(df, lm_list, lm_summaries) {
lapply(seq_along(lm_list), function(i) {
r_squared <- round(lm_summaries[[i]]$r.squared, 3)
@@ -480,17 +529,19 @@ correlation_plot_configs <- function(df, lm_list, lm_summaries) {
# Adjust missing values and calculate ranks
adjust_missing_and_rank <- function(df, variables) {
# Adjust missing values in Avg_Zscore and Z_lm columns, and apply rank to the specified variables
df <- df %>%
mutate(across(all_of(variables), list(
Avg_Zscore = if_else(is.na(.Avg_Zscore), 0.001, .Avg_Zscore),
Z_lm = if_else(is.na(.Z_lm), 0.001, .Z_lm),
Rank = rank(.Avg_Zscore),
Rank_lm = rank(.Z_lm),
), "{.fn}_{.col}"))
Avg_Zscore = ~ if_else(is.na(get(paste0("Avg_Zscore_", cur_column()))), 0.001, get(paste0("Avg_Zscore_", cur_column()))),
Z_lm = ~ if_else(is.na(get(paste0("Z_lm_", cur_column()))), 0.001, get(paste0("Z_lm_", cur_column()))),
Rank = ~ rank(get(paste0("Avg_Zscore_", cur_column()))),
Rank_lm = ~ rank(get(paste0("Z_lm_", cur_column())))
), .names = "{fn}_{col}"))
return(df)
}
main <- function() {
lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]]
@@ -566,9 +617,7 @@ main <- function() {
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
# Plot configurations
# Each list corresponds to a group of plots in the same file
# Plots
# Print quality control graphs before removing data due to contamination and
# adjusting missing data to max theoretical values
@@ -842,8 +891,8 @@ main <- function() {
)
lm_summaries <- lapply(lm_list, summary)
correlation_plot_config <- correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries)
generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_config)
correlation_plot_configs <- correlation_plot_configs(zscores_interactions_filtered, lm_list, lm_summaries)
generate_and_save_plots(zscores_interactions_filtered, output_dir, correlation_plot_configs)
# Generate ranked plots
rank_plot_config <- list(