Before old plotting removal

This commit is contained in:
2024-09-04 02:03:23 -04:00
parent 111909914c
commit d52903f8fa
2 changed files with 327 additions and 282 deletions

View File

@@ -163,102 +163,6 @@ update_gene_names <- function(df, sgd_gene_list) {
return(df)
}
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
# Start the ggplot with just the x_var and color_var
plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) +
ggtitle(title) +
theme_publication()
# If y_var is not NULL, add it to the plot aesthetics
if (!is.null(y_var)) {
plot <- plot + aes(y = !!sym(y_var))
}
# Handle different plot types
plot <- switch(plot_type,
"scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
stat_summary(fun.data = mean_sdl, geom = "errorbar") +
stat_summary(fun = mean, geom = "point", size = 0.6),
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity"))
)
# Add optional labels if provided
if (!is.null(x_label)) plot <- plot + xlab(x_label)
if (!is.null(y_label)) plot <- plot + ylab(y_label)
return(plot)
}
generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
plots <- list()
if (nrow(df) == 0) {
message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
return()
}
message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
for (var in variables) {
scatter_plot <-
generate_plot(df, x_var = "scan", y_var = var, plot_type = "scatter",
title = paste(prefix, "Scatter Plot for", var))
boxplot <-
generate_plot(df, x_var = "scan", y_var = var, plot_type = "box",
title = paste(prefix, "Box Plot for", var))
plots[[paste0(var, "_scatter")]] <- scatter_plot
plots[[paste0(var, "_box")]] <- boxplot
}
if (include_qc) {
plots[["Raw_L_vs_K"]] <-
generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC")
plots[["Delta_bg_Density"]] <-
generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num",
title = "Density plot for Delta Background by Conc All Data")
plots[["Delta_bg_Bar"]] <-
generate_plot(df, x_var = "delta_bg", plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data")
}
save_plots(prefix, plots, output_dir)
}
# Ensure all plots are saved and printed to PDF
save_plots <- function(file_name, plot_list, output_dir) {
# Save to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plot_list, function(plot) {
print(plot)
})
dev.off()
# Save to HTML with horizontal legend orientation
lapply(names(plot_list), function(plot_name) {
message("Generating plot: ", plot_name)
pgg <- tryCatch({
suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
layout(legend = list(orientation = "h")))
}, error = function(e) {
message("Error in plot: ", plot_name, "\n", e)
return(NULL)
})
if (!is.null(pgg)) {
saveWidget(pgg,
file = file.path(output_dir,
paste0(file_name, "_", plot_name, ".html")),
selfcontained = TRUE)
}
})
}
# Process strains (deletion and reference)
process_strains <- function(df) {
df_strains <- data.frame() # Initialize an empty dataframe to store results
@@ -295,7 +199,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
max = ~max(.x, na.rm = TRUE),
min = ~min(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1)
se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) # TODO why - 1?
), .names = "{.col}_{.fn}"))
return(summary_stats)
@@ -305,48 +209,70 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
# Pull the background means
print("Calculating background means")
l_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean)
k_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean)
L_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean)
K_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean)
r_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_mean)
auc_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean)
AUC_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean)
# Calculate all necessary statistics and shifts in one step
L_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_sd)
K_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_sd)
r_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_sd)
AUC_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_sd)
# Calculate summary statistics and shifts
print("Calculating interaction scores part 1")
interaction_scores_all <- df %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
summarise(
N = n(),
across(all_of(variables), list(mean = mean, sd = sd), na.rm = TRUE),
NG = sum(NG, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE),
raw_shift_l = mean(L, na.rm = TRUE) - l_mean_bg,
raw_shift_k = mean(K, na.rm = TRUE) - k_mean_bg,
raw_shift_r = mean(r, na.rm = TRUE) - r_mean_bg,
raw_shift_auc = mean(AUC, na.rm = TRUE) - auc_mean_bg,
z_shift_l = raw_shift_l / L_sd[1],
z_shift_k = raw_shift_k / K_sd[1],
z_shift_r = raw_shift_r / r_sd[1],
z_shift_auc = raw_shift_auc / AUC_sd[1],
exp_l = l_mean_bg + raw_shift_l,
exp_k = k_mean_bg + raw_shift_k,
exp_r = r_mean_bg + raw_shift_r,
exp_auc = auc_mean_bg + raw_shift_auc,
delta_l = mean(L, na.rm = TRUE) - exp_l,
delta_k = mean(K, na.rm = TRUE) - exp_k,
delta_r = mean(r, na.rm = TRUE) - exp_r,
delta_auc = mean(AUC, na.rm = TRUE) - exp_auc,
zscore_l = delta_l / L_sd,
zscore_k = delta_k / K_sd,
zscore_r = delta_r / r_sd,
zscore_auc = delta_auc / AUC_sd,
sum_z_score_l = sum(zscore_l, na.rm = TRUE),
avg_zscore_l = sum_z_score_l / (length(variables) - sum(NG, na.rm = TRUE)),
sum_z_score_k = sum(zscore_k, na.rm = TRUE),
avg_zscore_k = sum_z_score_k / (length(variables) - sum(NG, na.rm = TRUE)),
sum_z_score_r = sum(zscore_r, na.rm = TRUE),
avg_zscore_r = sum_z_score_r / (length(variables) - sum(NG, na.rm = TRUE)),
sum_z_score_auc = sum(zscore_auc, na.rm = TRUE),
avg_zscore_auc = sum_z_score_auc / (length(variables) - sum(NG, na.rm = TRUE))
L_mean = mean(L),
L_median = median(L),
L_sd = sd(L),
L_se = L_sd / sqrt(N),
K_mean = mean(K),
K_median = median(K),
K_sd = sd(K),
K_se = K_sd / sqrt(N),
r_mean = mean(r),
r_median = median(r),
r_sd = sd(r),
r_se = r_sd / sqrt(N),
AUC_mean = mean(AUC),
AUC_median = median(AUC),
AUC_sd = sd(AUC),
AUC_se = AUC_sd / sqrt(N),
NG = sum(NG),
DB = sum(DB),
SM = sum(SM),
Raw_Shift_L = L_mean - L_mean_bg[[1]],
Raw_Shift_K = K_mean - K_mean_bg[[1]],
Raw_Shift_r = r_mean - r_mean_bg[[1]],
Raw_Shift_AUC = AUC_mean - AUC_mean_bg[[1]],
Z_Shift_L = Raw_Shift_L / L_sd[[0]],
Z_Shift_K = Raw_Shift_K / K_sd[[0]],
Z_Shift_r = Raw_Shift_r / r_sd[[0]],
Z_Shift_AUC = Raw_Shift_AUC / AUC_sd[[0]],
WT_l = df$L_mean,
WT_K = df$K_mean,
WT_r = df$r_mean,
WT_AUC = df$AUC_mean,
WT_sd_l = L_sd_bg[[1]],
WT_sd_K = K_sd_bg[[1]],
WT_sd_r = r_sd_bg[[1]],
WT_sd_AUC = AUC_sd_bg[[1]],
Exp_L = L_mean_bg[[1]] + Raw_Shift_L,
Exp_K = K_mean_bg[[1]] + Raw_Shift_K,
Exp_r = r_mean_bg[[1]] + Raw_Shift_r,
Exp_AUC = AUC_mean_bg[[1]] + Raw_Shift_AUC,
Delta_L = ifelse(NG == 1, mean(L) - WT_l, Delta_L),
Delta_L = ifelse(SM == 1, mean(L) - WT_l, Delta_L),
Delta_K = ifelse(NG == 1, mean(K) - WT_K, Delta_K),
Delta_r = ifelse(NG == 1, mean(r) - WT_r, Delta_r),
Delta_AUC = ifelse(NG == 1, mean(AUC) - WT_AUC, Delta_AUC),
Zscore_L = Delta_L / WT_sd_l,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC,
) %>%
ungroup()
@@ -355,10 +281,10 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
interaction_scores <- interaction_scores_all %>%
group_by(across(all_of(group_vars))) %>%
summarise(
lm_l = list(lm(delta_l ~ conc_num_factor)),
lm_k = list(lm(delta_k ~ conc_num_factor)),
lm_r = list(lm(delta_r ~ conc_num_factor)),
lm_auc = list(lm(delta_auc ~ conc_num_factor)),
lm_l = list(lm(Delta_L ~ conc_num_factor)),
lm_k = list(lm(Delta_K ~ conc_num_factor)),
lm_r = list(lm(Delta_r ~ conc_num_factor)),
lm_auc = list(lm(Delta_AUC ~ conc_num_factor)),
lm_Score_L = max_conc * coef(lm_l[[1]])[2] + coef(lm_l[[1]])[1],
lm_Score_K = max_conc * coef(lm_k[[1]])[2] + coef(lm_k[[1]])[1],
lm_Score_r = max_conc * coef(lm_r[[1]])[2] + coef(lm_r[[1]])[1],
@@ -367,66 +293,192 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
R_Squared_K = summary(lm_k[[1]])$r.squared,
R_Squared_r = summary(lm_r[[1]])$r.squared,
R_Squared_AUC = summary(lm_auc[[1]])$r.squared,
Sum_Z_Score_L = sum(zscore_l, na.rm = TRUE),
Avg_Zscore_L = avg_zscore_l[1],
Sum_Z_Score_K = sum(zscore_k, na.rm = TRUE),
Avg_Zscore_K = avg_zscore_k[1],
Sum_Z_Score_r = sum(zscore_r, na.rm = TRUE),
Avg_Zscore_r = avg_zscore_r[1],
Sum_Z_Score_AUC = sum(zscore_auc, na.rm = TRUE),
Avg_Zscore_AUC = avg_zscore_auc[1],
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE)
Sum_Z_Score_L = sum(Zscore_L),
Avg_Zscore_L = Sum_Z_Score_L / (length(variables) - sum(NG)),
Sum_Z_Score_K = sum(Zscore_K),
Avg_Zscore_K = Sum_Z_Score_K / (length(variables) - sum(NG)),
Sum_Z_Score_r = sum(Zscore_r),
Avg_Zscore_r = Sum_Z_Score_r / (length(variables) - sum(NG)),
Sum_Z_Score_AUC = sum(Zscore_AUC),
Avg_Zscore_AUC = Sum_Z_Score_AUC / (length(variables) - sum(NG)),
NG = sum(NG),
DB = sum(DB),
SM = sum(SM)
) %>%
ungroup()
# Ensure interaction_scores has only the specified columns
interaction_scores <- interaction_scores %>%
select(Gene, raw_shift_l, z_shift_l, lm_Score_L, Z_lm_L = lm_Score_L, R_Squared_L, sum_z_score_l, avg_zscore_l,
raw_shift_k, z_shift_k, lm_Score_K, Z_lm_K = lm_Score_K, R_Squared_K, sum_z_score_k, avg_zscore_k,
raw_shift_r, z_shift_r, lm_Score_r, Z_lm_r = lm_Score_r, R_Squared_r, sum_z_score_r, avg_zscore_r,
raw_shift_auc, z_shift_auc, lm_Score_AUC, Z_lm_AUC = lm_Score_AUC, R_Squared_AUC, sum_z_score_auc, avg_zscore_auc,
NG, DB, SM)
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
}
generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") {
variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500)
plot_list <- lapply(seq_along(variables), function(i) {
var <- variables[i]
WT_sd <- WT_sds[[i]]
# generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") {
# variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
# WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500)
ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) +
geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
coord_cartesian(ylim = c(-WT_sd, WT_sd)) +
geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) +
ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) +
annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) +
annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) +
annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) +
annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) +
annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) +
scale_x_continuous(
name = unique(df_calculations$Drug[1]),
breaks = unique(df_calculations$conc_num_factor),
labels = unique(as.character(df_calculations$conc_num))) +
theme_publication()
})
# plot_list <- lapply(seq_along(variables), function(i) {
# var <- variables[i]
# WT_sd <- WT_sds[[i]]
save_plots(file_prefix, plot_list, output_dir)
# ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) +
# geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
# coord_cartesian(ylim = c(-WT_sd, WT_sd)) +
# geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) +
# ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) +
# annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) +
# annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) +
# annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) +
# annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) +
# annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) +
# scale_x_continuous(
# name = unique(df_calculations$Drug[1]),
# breaks = unique(df_calculations$conc_num_factor),
# labels = unique(as.character(df_calculations$conc_num))) +
# theme_publication()
# })
# save_plots(file_prefix, plot_list, output_dir)
# }
# generate_summary_plots <- function(df, output_dir) {
# variables <- c("L", "K", "r", "AUC")
# plot_list <- lapply(variables, function(var) {
# generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var))
# })
# save_plots("Summary_Plots", plot_list, output_dir)
# }
# # Generate ranked plots for a specific metric
# generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) {
# ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) +
# ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) +
# xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) +
# annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) +
# annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) +
# geom_hline(yintercept = c(-sd_threshold, sd_threshold)) +
# geom_point(size = 0.1, shape = 3) +
# theme_publication()
# }
# # Generate and save all ranked plots
# generate_and_save_ranked_plots <- function(df, output_dir, prefix) {
# rank_metrics <- list(
# list("L_Rank", "Avg_Zscore_L", "L"),
# list("K_Rank", "Avg_Zscore_K", "K"),
# list("r_Rank", "Avg_Zscore_r", "r"),
# list("AUC_Rank", "Avg_Zscore_AUC", "AUC"),
# list("L_Rank_lm", "Z_lm_L", "L"),
# list("K_Rank_lm", "Z_lm_K", "K"),
# list("r_Rank_lm", "Z_lm_r", "r"),
# list("AUC_Rank_lm", "Z_lm_AUC", "AUC")
# )
# pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE)
# for (sd_threshold in c(1, 2, 3)) {
# for (metric in rank_metrics) {
# plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]])
# print(plot)
# }
# }
# dev.off()
# }
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num",
title, x_label = NULL, y_label = NULL, ylim_vals = NULL) {
plot <- ggplot(df, aes_string(x = x_var, color = color_var))
if (!is.null(y_var)) {
plot <- plot + aes_string(y = y_var)
}
generate_summary_plots <- function(df, output_dir) {
variables <- c("L", "K", "r", "AUC")
plot_list <- lapply(variables, function(var) {
generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var))
})
# Set up the plot based on the requested plot type
plot <- switch(plot_type,
"scatter" = plot + geom_point() + geom_smooth(method = "lm", se = FALSE),
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = "identity"),
plot # Default: return the plot as is
)
save_plots("Summary_Plots", plot_list, output_dir)
if (!is.null(ylim_vals)) {
plot <- plot + coord_cartesian(ylim = ylim_vals)
}
# Add titles and labels if available
plot <- plot + ggtitle(title) + theme_publication()
if (!is.null(x_label)) plot <- plot + xlab(x_label)
if (!is.null(y_label)) plot <- plot + ylab(y_label)
return(plot)
}
generate_and_save_plots <- function(df, output_dir, file_prefix, variables, plot_type = "scatter", include_qc = FALSE, ylim_vals = NULL) {
plots <- list()
if (nrow(df) == 0) {
message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
return()
}
message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
# Create plots for the given variables
for (var in variables) {
plot <- generate_plot(
df = df,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste(file_prefix, "Plot for", var),
ylim_vals = ylim_vals
)
plots[[paste0(var, "_", plot_type)]] <- plot
}
# Include additional QC plots if requested
if (include_qc) {
plots[["Raw_L_vs_K"]] <- generate_plot(df, "L", "K", "scatter", "Raw L vs K before QC")
plots[["Delta_bg_Density"]] <- generate_plot(df, "delta_bg", NULL, "density", "Density plot for Delta Background")
plots[["Delta_bg_Bar"]] <- generate_plot(df, "delta_bg", NULL, "bar", "Bar plot for Delta Background")
}
save_plots(file_prefix, plots, output_dir)
}
# Ensure all plots are saved and printed to PDF
save_plots <- function(file_name, plot_list, output_dir) {
# Save to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plot_list, function(plot) {
print(plot)
})
dev.off()
# Save to HTML with horizontal legend orientation
lapply(names(plot_list), function(plot_name) {
message("Generating plot: ", plot_name)
pgg <- tryCatch({
suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
layout(legend = list(orientation = "h")))
}, error = function(e) {
message("Error in plot: ", plot_name, "\n", e)
return(NULL)
})
if (!is.null(pgg)) {
saveWidget(pgg,
file = file.path(output_dir,
paste0(file_name, "_", plot_name, ".html")),
selfcontained = TRUE)
}
})
}
@@ -472,42 +524,6 @@ adjust_missing_and_rank <- function(df) {
return(df)
}
# Generate ranked plots for a specific metric
generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) {
ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) +
ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) +
xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) +
geom_hline(yintercept = c(-sd_threshold, sd_threshold)) +
geom_point(size = 0.1, shape = 3) +
theme_publication()
}
# Generate and save all ranked plots
generate_and_save_ranked_plots <- function(df, output_dir, prefix) {
rank_metrics <- list(
list("L_Rank", "Avg_Zscore_L", "L"),
list("K_Rank", "Avg_Zscore_K", "K"),
list("r_Rank", "Avg_Zscore_r", "r"),
list("AUC_Rank", "Avg_Zscore_AUC", "AUC"),
list("L_Rank_lm", "Z_lm_L", "L"),
list("K_Rank_lm", "Z_lm_K", "K"),
list("r_Rank_lm", "Z_lm_r", "r"),
list("AUC_Rank_lm", "Z_lm_AUC", "AUC")
)
pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE)
for (sd_threshold in c(1, 2, 3)) {
for (metric in rank_metrics) {
plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]])
print(plot)
}
}
dev.off()
}
# Function to create and save all ranked plots
create_ranked_plots <- function(df, output_dir) {
@@ -520,8 +536,45 @@ create_ranked_plots <- function(df, output_dir) {
generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM")
}
generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "",
x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") {
plot_list <- list()
# Iterate over each y variable and generate the corresponding plot
for (var in y_vars) {
plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(var), color = as.factor(!!sym(color_var)))) +
ggtitle(paste(title_prefix, var)) +
theme_publication()
if (plot_type == "scatter") {
plot <- plot +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
if (!is.null(ylim_vals)) {
plot <- plot + coord_cartesian(ylim = ylim_vals)
}
} else if (plot_type == "box") {
plot <- plot + geom_boxplot()
} else if (plot_type == "density") {
plot <- plot + geom_density()
} else if (plot_type == "bar") {
plot <- plot + geom_bar(stat = "identity")
}
if (!is.null(x_label)) plot <- plot + xlab(x_label)
if (!is.null(y_label)) plot <- plot + ylab(y_label)
plot_list[[var]] <- plot
}
# Save plots to PDF and HTML
save_plots(file_prefix, plot_list, output_dir)
}
main <- function() {
# Applying to all experiments
lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]]
exp_path <- exp$path
@@ -537,34 +590,28 @@ main <- function() {
max_conc <- max(df$conc_num_factor)
# QC
# Filter the df above sd tolerance for raw L vs K plot
# QC steps and filtering
df_above_tolerance <- df %>% filter(DB == 1)
# Set vars above the delta background tolerance to NA
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x)))
# Remove rows where L is 0
df_no_zeros <- df_na %>% filter(L > 0)
# Flag and remove non-finite data, printing affected rows
df_na_filtered <- df_na %>%
filter(if_any(c(L), ~ !is.finite(.))) %>% # Add L, r, AUC, K as needed for debugging
filter(if_any(c(L), ~ !is.finite(.))) %>%
{
if (nrow(.) > 0) {
message("Removing non-finite rows:\n")
print(head(., n = 10))
}
df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging
df_na %>% filter(if_all(c(L), is.finite))
}
# Generate QC PDFs and HTMLs
# message("Generating QC plots")
# variables <- c("L", "K", "r", "AUC", "delta_bg")
# generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
# generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
# generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
# generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
# Generate and save QC plots using the new generalized function
message("Generating QC plots")
variables <- c("L", "K", "r", "AUC", "delta_bg")
generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
rm(df, df_above_tolerance, df_no_zeros)
@@ -577,37 +624,35 @@ main <- function() {
# Originally this filtered L NA's
# Filter data within 2SD
# Filter data within and outside 2SD
within_2sd_k <- stats_joined %>%
filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd))
# Filter data outside 2SD
outside_2sd_k <- stats_joined %>%
filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd))
message("Calculating summary statistics for L within 2SD of K")
# Summary statistics for within and outside 2SD of K
l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
# Write CSV files
write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
message("Calculating summary statistics for L within 2SD of K")
cols_to_remove <- names(l_within_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor")
within_2sd_k_clean <- within_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_within_2sd_k_joined <- within_2sd_k_clean %>%
left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
write.csv(l_within_2sd_k,
file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
row.names = FALSE)
message("Calculating summary statistics for for L outside 2SD of K")
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
cols_to_remove <- names(l_outside_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor")
outside_2sd_k_clean <- outside_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_outside_2sd_k_joined <- outside_2sd_k_clean %>%
left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor"))
write.csv(l_outside_2sd_k,
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
row.names = FALSE)
# Process background strains
bg_strains <- c("YDL227C")
@@ -646,18 +691,14 @@ main <- function() {
filter(OrfRep != strain) %>%
mutate(SM = 0)
message("Processing reference strain")
reference_strain <- process_strains(l_within_2sd_k_joined)
message("Processing deletion strains")
deletion_strains <- process_strains(l_within_2sd_k_joined)
# TODO we may need to add "num" to grouping vars
# Calculate interactions
variables <- c("L", "K", "r", "AUC")
message("Calculating reference interaction scores")
reference_results <- calculate_interaction_scores(stats_joined, reference_strain, max_conc, variables)
message("Calculating deletion interaction scores")
deletion_results <- calculate_interaction_scores(stats_joined, deletion_strains, max_conc, variables)
zscores_calculations_reference <- reference_results$zscores_calculations
@@ -666,11 +707,11 @@ main <- function() {
zscores_interactions <- deletion_results$zscores_interactions
# TODO: I don't know if we need this?
zscores_interactions <- zscores_interactions %>%
arrange(desc(Z_lm_L)) %>%
arrange(desc(NG))
# zscores_interactions <- zscores_interactions %>%
# arrange(desc(Z_lm_L)) %>%
# arrange(desc(NG))
message("Writing zscores csv files")
# Writing Z-Scores to file
write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
@@ -678,10 +719,11 @@ main <- function() {
# Define conditions for enhancers and suppressors
# TODO Add to study config file?
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= 2
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -2
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= 2
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -2
threshold <- 2
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
# Subset data
enhancers_L <- zscores_interactions[enhancer_condition_L, ]
@@ -707,10 +749,11 @@ main <- function() {
file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv"), row.names = FALSE)
# Handle linear model based enhancers and suppressors
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= 2, ]
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -2, ]
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= 2, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -2, ]
lm_threshold <- 2
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
# Save linear model based enhancers and suppressors
message("Writing linear model enhancer/suppressor csv files")
@@ -724,8 +767,10 @@ main <- function() {
file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE)
# Generate plots for interaction scores
generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir)
generate_rf_plots(zscores_calculations, zscores_interactions, out_dir)
# generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir)
# generate_rf_plots(zscores_calculations, zscores_interactions, out_dir)
generate_and_save_plots(zscores_calculations_reference, out_dir, "Reference_Calculations", variables)
generate_and_save_plots(zscores_calculations, out_dir, "Deletion_Calculations", variables)
# Apply filtering to remove NA values before further analysis
zscores_interactions_filtered <- zscores_interactions %>%

View File

@@ -785,7 +785,7 @@ init_project() {
debug "Running: ${FUNCNAME[0]}"
# Create a project scans dir
[[ -d $PROJECT_SCANS_DIR ]] || (execute mkdir -p "$PROJECT_SCANS_DIR" || return 1)
[[ -d $PROJECT_SCANS_DIR ]] || { execute mkdir -p "$PROJECT_SCANS_DIR" || return 1; }
# TODO eventually copy scans from robot but for now let's pause and wait for transfer
echo "In the future we will copy scans from robot here"
@@ -1561,7 +1561,7 @@ join_interaction_zscores() {
"${EXP_PATHS_AND_NAMES[@]}"
for f in "${out_files[@]}"; do
[[ -f $f ]] || (echo "$f does not exist"; return 1)
[[ -f $f ]] || { echo "$f does not exist"; return 1; }
done
}
@@ -1606,7 +1606,7 @@ r_gta() {
"${5:-"$GTA_OUT_DIR"}" \
"${@:6}" # future arguments
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
@@ -1779,7 +1779,7 @@ r_add_shift_values() {
rm -f "$STUDY_RESULTS_DIR/REMcHeatmaps/"*.pdf # TODO why?
out_file="${4:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
@@ -1815,7 +1815,7 @@ r_create_heat_maps() {
pdfs=(REMcHeatmaps/*.pdf)
execute pdftk "${pdfs[@]}" output "$out_file"
out_file="$2/compiledREMcHeatmaps.pdf"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
@@ -1845,7 +1845,7 @@ r_heat_maps_homology() {
pdfs=("${1:-"$STUDY_RESULTS_DIR/homology"}"/*.pdf)
execute pdftk "${pdfs[@]}" output "$out_file"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
@@ -1869,7 +1869,7 @@ py_gtf_dcon() {
"$2/" \
"${@:3}" # future arguments
out_file="$2/1-0-0-finaltable.csv"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1)
[[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
}
@@ -1930,7 +1930,7 @@ py_gtf_concat() {
debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/python/concatGTFResults.py"
execute "$PYTHON" "$script" "$1/" "$2"
[[ -f $2 ]] || (echo "$2 does not exist"; return 1)
[[ -f $2 ]] || { echo "$2 does not exist"; return 1; }
}
@@ -1983,7 +1983,7 @@ handle_results_dir() {
results_dir="${results_dirs[0]}"
else
# Print results dirs
[[ ${#results_dirs[@]} -gt 0 ]] || (err "No ${2}s found"; return 1)
[[ ${#results_dirs[@]} -gt 0 ]] || { err "No ${2}s found"; return 1; }
print_in_columns "results_dirs"
# Results selection prompt
cat <<-EOF
@@ -2091,7 +2091,7 @@ handle_config() {
# Declare a nameref to refer to the actual array using the variable name
declare -n config_array_ref="$config_array_name"
[[ -z ${!config_array_ref[*]} ]] && (err "No $config_array_name array found in $config_file" && return 1)
[[ -z ${!config_array_ref[*]} ]] && { err "No $config_array_name array found in $config_file" && return 1; }
for key in "${!config_array_ref[@]}"; do
IFS=',' read -r main_key sub_key <<< "$key"
@@ -2298,7 +2298,7 @@ main() {
if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
echo "Module ${MODULES[$i]} is not available, removing"
read -r -p "Enter replacement module name: " module
! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || (echo "RTFM"; return 1)
! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || { echo "RTFM"; return 1; }
MODULES[i]="$module"
fi
unset module