Before old plotting removal

This commit is contained in:
2024-09-04 02:03:23 -04:00
parent 111909914c
commit d52903f8fa
2 changed files with 327 additions and 282 deletions

View File

@@ -163,102 +163,6 @@ update_gene_names <- function(df, sgd_gene_list) {
return(df) return(df)
} }
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
# Start the ggplot with just the x_var and color_var
plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) +
ggtitle(title) +
theme_publication()
# If y_var is not NULL, add it to the plot aesthetics
if (!is.null(y_var)) {
plot <- plot + aes(y = !!sym(y_var))
}
# Handle different plot types
plot <- switch(plot_type,
"scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
stat_summary(fun.data = mean_sdl, geom = "errorbar") +
stat_summary(fun = mean, geom = "point", size = 0.6),
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity"))
)
# Add optional labels if provided
if (!is.null(x_label)) plot <- plot + xlab(x_label)
if (!is.null(y_label)) plot <- plot + ylab(y_label)
return(plot)
}
generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
plots <- list()
if (nrow(df) == 0) {
message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
return()
}
message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
for (var in variables) {
scatter_plot <-
generate_plot(df, x_var = "scan", y_var = var, plot_type = "scatter",
title = paste(prefix, "Scatter Plot for", var))
boxplot <-
generate_plot(df, x_var = "scan", y_var = var, plot_type = "box",
title = paste(prefix, "Box Plot for", var))
plots[[paste0(var, "_scatter")]] <- scatter_plot
plots[[paste0(var, "_box")]] <- boxplot
}
if (include_qc) {
plots[["Raw_L_vs_K"]] <-
generate_plot(df, x_var = "L", y_var = "K", plot_type = "scatter",
title = "Raw L vs K before QC")
plots[["Delta_bg_Density"]] <-
generate_plot(df, x_var = "delta_bg", plot_type = "density", color_var = "conc_num",
title = "Density plot for Delta Background by Conc All Data")
plots[["Delta_bg_Bar"]] <-
generate_plot(df, x_var = "delta_bg", plot_type = "bar",
title = "Bar plot for Delta Background by Conc All Data")
}
save_plots(prefix, plots, output_dir)
}
# Ensure all plots are saved and printed to PDF
save_plots <- function(file_name, plot_list, output_dir) {
# Save to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plot_list, function(plot) {
print(plot)
})
dev.off()
# Save to HTML with horizontal legend orientation
lapply(names(plot_list), function(plot_name) {
message("Generating plot: ", plot_name)
pgg <- tryCatch({
suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
layout(legend = list(orientation = "h")))
}, error = function(e) {
message("Error in plot: ", plot_name, "\n", e)
return(NULL)
})
if (!is.null(pgg)) {
saveWidget(pgg,
file = file.path(output_dir,
paste0(file_name, "_", plot_name, ".html")),
selfcontained = TRUE)
}
})
}
# Process strains (deletion and reference) # Process strains (deletion and reference)
process_strains <- function(df) { process_strains <- function(df) {
df_strains <- data.frame() # Initialize an empty dataframe to store results df_strains <- data.frame() # Initialize an empty dataframe to store results
@@ -295,7 +199,7 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
max = ~max(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE),
min = ~min(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE), sd = ~sd(.x, na.rm = TRUE),
se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) se = ~sd(.x, na.rm = TRUE) / sqrt(n() - 1) # TODO why - 1?
), .names = "{.col}_{.fn}")) ), .names = "{.col}_{.fn}"))
return(summary_stats) return(summary_stats)
@@ -305,48 +209,70 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
# Pull the background means # Pull the background means
print("Calculating background means") print("Calculating background means")
l_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean) L_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_mean)
k_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean) K_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_mean)
r_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_mean) r_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_mean)
auc_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean) AUC_mean_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_mean)
# Calculate all necessary statistics and shifts in one step L_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(L_sd)
K_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(K_sd)
r_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(r_sd)
AUC_sd_bg <- df_ref %>% filter(conc_num_factor == 0) %>% pull(AUC_sd)
# Calculate summary statistics and shifts
print("Calculating interaction scores part 1") print("Calculating interaction scores part 1")
interaction_scores_all <- df %>% interaction_scores_all <- df %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
summarise( summarise(
N = n(), N = n(),
across(all_of(variables), list(mean = mean, sd = sd), na.rm = TRUE), L_mean = mean(L),
NG = sum(NG, na.rm = TRUE), L_median = median(L),
SM = sum(SM, na.rm = TRUE), L_sd = sd(L),
raw_shift_l = mean(L, na.rm = TRUE) - l_mean_bg, L_se = L_sd / sqrt(N),
raw_shift_k = mean(K, na.rm = TRUE) - k_mean_bg, K_mean = mean(K),
raw_shift_r = mean(r, na.rm = TRUE) - r_mean_bg, K_median = median(K),
raw_shift_auc = mean(AUC, na.rm = TRUE) - auc_mean_bg, K_sd = sd(K),
z_shift_l = raw_shift_l / L_sd[1], K_se = K_sd / sqrt(N),
z_shift_k = raw_shift_k / K_sd[1], r_mean = mean(r),
z_shift_r = raw_shift_r / r_sd[1], r_median = median(r),
z_shift_auc = raw_shift_auc / AUC_sd[1], r_sd = sd(r),
exp_l = l_mean_bg + raw_shift_l, r_se = r_sd / sqrt(N),
exp_k = k_mean_bg + raw_shift_k, AUC_mean = mean(AUC),
exp_r = r_mean_bg + raw_shift_r, AUC_median = median(AUC),
exp_auc = auc_mean_bg + raw_shift_auc, AUC_sd = sd(AUC),
delta_l = mean(L, na.rm = TRUE) - exp_l, AUC_se = AUC_sd / sqrt(N),
delta_k = mean(K, na.rm = TRUE) - exp_k, NG = sum(NG),
delta_r = mean(r, na.rm = TRUE) - exp_r, DB = sum(DB),
delta_auc = mean(AUC, na.rm = TRUE) - exp_auc, SM = sum(SM),
zscore_l = delta_l / L_sd, Raw_Shift_L = L_mean - L_mean_bg[[1]],
zscore_k = delta_k / K_sd, Raw_Shift_K = K_mean - K_mean_bg[[1]],
zscore_r = delta_r / r_sd, Raw_Shift_r = r_mean - r_mean_bg[[1]],
zscore_auc = delta_auc / AUC_sd, Raw_Shift_AUC = AUC_mean - AUC_mean_bg[[1]],
sum_z_score_l = sum(zscore_l, na.rm = TRUE), Z_Shift_L = Raw_Shift_L / L_sd[[0]],
avg_zscore_l = sum_z_score_l / (length(variables) - sum(NG, na.rm = TRUE)), Z_Shift_K = Raw_Shift_K / K_sd[[0]],
sum_z_score_k = sum(zscore_k, na.rm = TRUE), Z_Shift_r = Raw_Shift_r / r_sd[[0]],
avg_zscore_k = sum_z_score_k / (length(variables) - sum(NG, na.rm = TRUE)), Z_Shift_AUC = Raw_Shift_AUC / AUC_sd[[0]],
sum_z_score_r = sum(zscore_r, na.rm = TRUE), WT_l = df$L_mean,
avg_zscore_r = sum_z_score_r / (length(variables) - sum(NG, na.rm = TRUE)), WT_K = df$K_mean,
sum_z_score_auc = sum(zscore_auc, na.rm = TRUE), WT_r = df$r_mean,
avg_zscore_auc = sum_z_score_auc / (length(variables) - sum(NG, na.rm = TRUE)) WT_AUC = df$AUC_mean,
WT_sd_l = L_sd_bg[[1]],
WT_sd_K = K_sd_bg[[1]],
WT_sd_r = r_sd_bg[[1]],
WT_sd_AUC = AUC_sd_bg[[1]],
Exp_L = L_mean_bg[[1]] + Raw_Shift_L,
Exp_K = K_mean_bg[[1]] + Raw_Shift_K,
Exp_r = r_mean_bg[[1]] + Raw_Shift_r,
Exp_AUC = AUC_mean_bg[[1]] + Raw_Shift_AUC,
Delta_L = ifelse(NG == 1, mean(L) - WT_l, Delta_L),
Delta_L = ifelse(SM == 1, mean(L) - WT_l, Delta_L),
Delta_K = ifelse(NG == 1, mean(K) - WT_K, Delta_K),
Delta_r = ifelse(NG == 1, mean(r) - WT_r, Delta_r),
Delta_AUC = ifelse(NG == 1, mean(AUC) - WT_AUC, Delta_AUC),
Zscore_L = Delta_L / WT_sd_l,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC,
) %>% ) %>%
ungroup() ungroup()
@@ -355,10 +281,10 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
interaction_scores <- interaction_scores_all %>% interaction_scores <- interaction_scores_all %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( summarise(
lm_l = list(lm(delta_l ~ conc_num_factor)), lm_l = list(lm(Delta_L ~ conc_num_factor)),
lm_k = list(lm(delta_k ~ conc_num_factor)), lm_k = list(lm(Delta_K ~ conc_num_factor)),
lm_r = list(lm(delta_r ~ conc_num_factor)), lm_r = list(lm(Delta_r ~ conc_num_factor)),
lm_auc = list(lm(delta_auc ~ conc_num_factor)), lm_auc = list(lm(Delta_AUC ~ conc_num_factor)),
lm_Score_L = max_conc * coef(lm_l[[1]])[2] + coef(lm_l[[1]])[1], lm_Score_L = max_conc * coef(lm_l[[1]])[2] + coef(lm_l[[1]])[1],
lm_Score_K = max_conc * coef(lm_k[[1]])[2] + coef(lm_k[[1]])[1], lm_Score_K = max_conc * coef(lm_k[[1]])[2] + coef(lm_k[[1]])[1],
lm_Score_r = max_conc * coef(lm_r[[1]])[2] + coef(lm_r[[1]])[1], lm_Score_r = max_conc * coef(lm_r[[1]])[2] + coef(lm_r[[1]])[1],
@@ -367,66 +293,192 @@ calculate_interaction_scores <- function(df_ref, df, max_conc, variables, group_
R_Squared_K = summary(lm_k[[1]])$r.squared, R_Squared_K = summary(lm_k[[1]])$r.squared,
R_Squared_r = summary(lm_r[[1]])$r.squared, R_Squared_r = summary(lm_r[[1]])$r.squared,
R_Squared_AUC = summary(lm_auc[[1]])$r.squared, R_Squared_AUC = summary(lm_auc[[1]])$r.squared,
Sum_Z_Score_L = sum(zscore_l, na.rm = TRUE), Sum_Z_Score_L = sum(Zscore_L),
Avg_Zscore_L = avg_zscore_l[1], Avg_Zscore_L = Sum_Z_Score_L / (length(variables) - sum(NG)),
Sum_Z_Score_K = sum(zscore_k, na.rm = TRUE), Sum_Z_Score_K = sum(Zscore_K),
Avg_Zscore_K = avg_zscore_k[1], Avg_Zscore_K = Sum_Z_Score_K / (length(variables) - sum(NG)),
Sum_Z_Score_r = sum(zscore_r, na.rm = TRUE), Sum_Z_Score_r = sum(Zscore_r),
Avg_Zscore_r = avg_zscore_r[1], Avg_Zscore_r = Sum_Z_Score_r / (length(variables) - sum(NG)),
Sum_Z_Score_AUC = sum(zscore_auc, na.rm = TRUE), Sum_Z_Score_AUC = sum(Zscore_AUC),
Avg_Zscore_AUC = avg_zscore_auc[1], Avg_Zscore_AUC = Sum_Z_Score_AUC / (length(variables) - sum(NG)),
NG = sum(NG, na.rm = TRUE), NG = sum(NG),
DB = sum(DB, na.rm = TRUE), DB = sum(DB),
SM = sum(SM, na.rm = TRUE) SM = sum(SM)
) %>% ) %>%
ungroup() ungroup()
# Ensure interaction_scores has only the specified columns
interaction_scores <- interaction_scores %>%
select(Gene, raw_shift_l, z_shift_l, lm_Score_L, Z_lm_L = lm_Score_L, R_Squared_L, sum_z_score_l, avg_zscore_l,
raw_shift_k, z_shift_k, lm_Score_K, Z_lm_K = lm_Score_K, R_Squared_K, sum_z_score_k, avg_zscore_k,
raw_shift_r, z_shift_r, lm_Score_r, Z_lm_r = lm_Score_r, R_Squared_r, sum_z_score_r, avg_zscore_r,
raw_shift_auc, z_shift_auc, lm_Score_AUC, Z_lm_AUC = lm_Score_AUC, R_Squared_AUC, sum_z_score_auc, avg_zscore_auc,
NG, DB, SM)
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
} }
generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") {
variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500)
plot_list <- lapply(seq_along(variables), function(i) { # generate_rf_plots <- function(df_calculations, df_interactions, output_dir, file_prefix = "RF") {
var <- variables[i] # variables <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
WT_sd <- WT_sds[[i]] # WT_sds <- list(WT_sd_l = 2, WT_sd_K = 2, WT_sd_r = 0.65, WT_sd_AUC = 6500)
ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) + # plot_list <- lapply(seq_along(variables), function(i) {
geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) + # var <- variables[i]
coord_cartesian(ylim = c(-WT_sd, WT_sd)) + # WT_sd <- WT_sds[[i]]
geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) +
ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) +
annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) +
annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) +
annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) +
annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) +
annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) +
scale_x_continuous(
name = unique(df_calculations$Drug[1]),
breaks = unique(df_calculations$conc_num_factor),
labels = unique(as.character(df_calculations$conc_num))) +
theme_publication()
})
save_plots(file_prefix, plot_list, output_dir) # ggplot(df_calculations, aes(conc_num_factor, !!sym(var))) +
# geom_point() + geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
# coord_cartesian(ylim = c(-WT_sd, WT_sd)) +
# geom_errorbar(aes(ymin = 0 - (2 * WT_sd), ymax = 0 + (2 * WT_sd)), alpha = 0.3) +
# ggtitle(paste(df_calculations$OrfRep[1], df_calculations$Gene[1], sep = " ")) +
# annotate("text", x = 1, y = 0.9 * WT_sd, label = paste("ZShift =", round(df_interactions[[paste0("Z_Shift_", var)]], 2))) +
# annotate("text", x = 1, y = 0.7 * WT_sd, label = paste("lm Zscore =", round(df_interactions[[paste0("Z_lm_", var)]], 2))) +
# annotate("text", x = 1, y = -0.7 * WT_sd, label = paste("NG =", df_interactions$NG)) +
# annotate("text", x = 1, y = -0.9 * WT_sd, label = paste("DB =", df_interactions$DB)) +
# annotate("text", x = 1, y = -1.1 * WT_sd, label = paste("SM =", df_interactions$SM)) +
# scale_x_continuous(
# name = unique(df_calculations$Drug[1]),
# breaks = unique(df_calculations$conc_num_factor),
# labels = unique(as.character(df_calculations$conc_num))) +
# theme_publication()
# })
# save_plots(file_prefix, plot_list, output_dir)
# }
# generate_summary_plots <- function(df, output_dir) {
# variables <- c("L", "K", "r", "AUC")
# plot_list <- lapply(variables, function(var) {
# generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var))
# })
# save_plots("Summary_Plots", plot_list, output_dir)
# }
# # Generate ranked plots for a specific metric
# generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) {
# ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) +
# ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) +
# xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) +
# annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) +
# annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) +
# geom_hline(yintercept = c(-sd_threshold, sd_threshold)) +
# geom_point(size = 0.1, shape = 3) +
# theme_publication()
# }
# # Generate and save all ranked plots
# generate_and_save_ranked_plots <- function(df, output_dir, prefix) {
# rank_metrics <- list(
# list("L_Rank", "Avg_Zscore_L", "L"),
# list("K_Rank", "Avg_Zscore_K", "K"),
# list("r_Rank", "Avg_Zscore_r", "r"),
# list("AUC_Rank", "Avg_Zscore_AUC", "AUC"),
# list("L_Rank_lm", "Z_lm_L", "L"),
# list("K_Rank_lm", "Z_lm_K", "K"),
# list("r_Rank_lm", "Z_lm_r", "r"),
# list("AUC_Rank_lm", "Z_lm_AUC", "AUC")
# )
# pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE)
# for (sd_threshold in c(1, 2, 3)) {
# for (metric in rank_metrics) {
# plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]])
# print(plot)
# }
# }
# dev.off()
# }
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num",
title, x_label = NULL, y_label = NULL, ylim_vals = NULL) {
plot <- ggplot(df, aes_string(x = x_var, color = color_var))
if (!is.null(y_var)) {
plot <- plot + aes_string(y = y_var)
}
# Set up the plot based on the requested plot type
plot <- switch(plot_type,
"scatter" = plot + geom_point() + geom_smooth(method = "lm", se = FALSE),
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = "identity"),
plot # Default: return the plot as is
)
if (!is.null(ylim_vals)) {
plot <- plot + coord_cartesian(ylim = ylim_vals)
}
# Add titles and labels if available
plot <- plot + ggtitle(title) + theme_publication()
if (!is.null(x_label)) plot <- plot + xlab(x_label)
if (!is.null(y_label)) plot <- plot + ylab(y_label)
return(plot)
} }
generate_summary_plots <- function(df, output_dir) {
variables <- c("L", "K", "r", "AUC")
plot_list <- lapply(variables, function(var) {
generate_plot(df, x_var = "conc_num_factor", y_var = var, plot_type = "scatter", title = paste("Summary Plot for", var))
})
save_plots("Summary_Plots", plot_list, output_dir) generate_and_save_plots <- function(df, output_dir, file_prefix, variables, plot_type = "scatter", include_qc = FALSE, ylim_vals = NULL) {
plots <- list()
if (nrow(df) == 0) {
message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
return()
}
message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
# Create plots for the given variables
for (var in variables) {
plot <- generate_plot(
df = df,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste(file_prefix, "Plot for", var),
ylim_vals = ylim_vals
)
plots[[paste0(var, "_", plot_type)]] <- plot
}
# Include additional QC plots if requested
if (include_qc) {
plots[["Raw_L_vs_K"]] <- generate_plot(df, "L", "K", "scatter", "Raw L vs K before QC")
plots[["Delta_bg_Density"]] <- generate_plot(df, "delta_bg", NULL, "density", "Density plot for Delta Background")
plots[["Delta_bg_Bar"]] <- generate_plot(df, "delta_bg", NULL, "bar", "Bar plot for Delta Background")
}
save_plots(file_prefix, plots, output_dir)
}
# Ensure all plots are saved and printed to PDF
save_plots <- function(file_name, plot_list, output_dir) {
# Save to PDF
pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
lapply(plot_list, function(plot) {
print(plot)
})
dev.off()
# Save to HTML with horizontal legend orientation
lapply(names(plot_list), function(plot_name) {
message("Generating plot: ", plot_name)
pgg <- tryCatch({
suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
layout(legend = list(orientation = "h")))
}, error = function(e) {
message("Error in plot: ", plot_name, "\n", e)
return(NULL)
})
if (!is.null(pgg)) {
saveWidget(pgg,
file = file.path(output_dir,
paste0(file_name, "_", plot_name, ".html")),
selfcontained = TRUE)
}
})
} }
@@ -472,42 +524,6 @@ adjust_missing_and_rank <- function(df) {
return(df) return(df)
} }
# Generate ranked plots for a specific metric
generate_ranked_plot <- function(df, rank_var, zscore_var, sd_threshold, title_prefix) {
ggplot(df, aes(x = {{rank_var}}, y = {{zscore_var}})) +
ggtitle(paste(title_prefix, "above", sd_threshold, "SD")) +
xlab("Rank") + ylab(paste("Avg Z score", title_prefix)) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = sd_threshold, ymax = Inf, fill = "#542788", alpha = 0.3) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = -sd_threshold, ymax = -Inf, fill = "orange", alpha = 0.3) +
geom_hline(yintercept = c(-sd_threshold, sd_threshold)) +
geom_point(size = 0.1, shape = 3) +
theme_publication()
}
# Generate and save all ranked plots
generate_and_save_ranked_plots <- function(df, output_dir, prefix) {
rank_metrics <- list(
list("L_Rank", "Avg_Zscore_L", "L"),
list("K_Rank", "Avg_Zscore_K", "K"),
list("r_Rank", "Avg_Zscore_r", "r"),
list("AUC_Rank", "Avg_Zscore_AUC", "AUC"),
list("L_Rank_lm", "Z_lm_L", "L"),
list("K_Rank_lm", "Z_lm_K", "K"),
list("r_Rank_lm", "Z_lm_r", "r"),
list("AUC_Rank_lm", "Z_lm_AUC", "AUC")
)
pdf(file.path(output_dir, paste0(prefix, ".pdf")), width = 18, height = 12, onefile = TRUE)
for (sd_threshold in c(1, 2, 3)) {
for (metric in rank_metrics) {
plot <- generate_ranked_plot(df, sym(metric[[1]]), sym(metric[[2]]), sd_threshold, metric[[3]])
print(plot)
}
}
dev.off()
}
# Function to create and save all ranked plots # Function to create and save all ranked plots
create_ranked_plots <- function(df, output_dir) { create_ranked_plots <- function(df, output_dir) {
@@ -520,8 +536,45 @@ create_ranked_plots <- function(df, output_dir) {
generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM") generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM")
} }
generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "",
x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") {
plot_list <- list()
# Iterate over each y variable and generate the corresponding plot
for (var in y_vars) {
plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(var), color = as.factor(!!sym(color_var)))) +
ggtitle(paste(title_prefix, var)) +
theme_publication()
if (plot_type == "scatter") {
plot <- plot +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
if (!is.null(ylim_vals)) {
plot <- plot + coord_cartesian(ylim = ylim_vals)
}
} else if (plot_type == "box") {
plot <- plot + geom_boxplot()
} else if (plot_type == "density") {
plot <- plot + geom_density()
} else if (plot_type == "bar") {
plot <- plot + geom_bar(stat = "identity")
}
if (!is.null(x_label)) plot <- plot + xlab(x_label)
if (!is.null(y_label)) plot <- plot + ylab(y_label)
plot_list[[var]] <- plot
}
# Save plots to PDF and HTML
save_plots(file_prefix, plot_list, output_dir)
}
main <- function() { main <- function() {
# Applying to all experiments
lapply(names(args$experiments), function(exp_name) { lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]] exp <- args$experiments[[exp_name]]
exp_path <- exp$path exp_path <- exp$path
@@ -537,34 +590,28 @@ main <- function() {
max_conc <- max(df$conc_num_factor) max_conc <- max(df$conc_num_factor)
# QC # QC steps and filtering
# Filter the df above sd tolerance for raw L vs K plot
df_above_tolerance <- df %>% filter(DB == 1) df_above_tolerance <- df %>% filter(DB == 1)
# Set vars above the delta background tolerance to NA
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x))) df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x)))
# Remove rows where L is 0
df_no_zeros <- df_na %>% filter(L > 0) df_no_zeros <- df_na %>% filter(L > 0)
# Flag and remove non-finite data, printing affected rows
df_na_filtered <- df_na %>% df_na_filtered <- df_na %>%
filter(if_any(c(L), ~ !is.finite(.))) %>% # Add L, r, AUC, K as needed for debugging filter(if_any(c(L), ~ !is.finite(.))) %>%
{ {
if (nrow(.) > 0) { if (nrow(.) > 0) {
message("Removing non-finite rows:\n") message("Removing non-finite rows:\n")
print(head(., n = 10)) print(head(., n = 10))
} }
df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging df_na %>% filter(if_all(c(L), is.finite))
} }
# Generate QC PDFs and HTMLs # Generate and save QC plots using the new generalized function
# message("Generating QC plots") message("Generating QC plots")
# variables <- c("L", "K", "r", "AUC", "delta_bg") variables <- c("L", "K", "r", "AUC", "delta_bg")
# generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE) generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
# generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE) generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
# generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables) generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
# generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables) generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
rm(df, df_above_tolerance, df_no_zeros) rm(df, df_above_tolerance, df_no_zeros)
@@ -577,37 +624,35 @@ main <- function() {
# Originally this filtered L NA's # Originally this filtered L NA's
# Filter data within 2SD # Filter data within and outside 2SD
within_2sd_k <- stats_joined %>% within_2sd_k <- stats_joined %>%
filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd)) filter(K >= (K_mean - 2 * K_sd) & K <= (K_mean + 2 * K_sd))
# Filter data outside 2SD
outside_2sd_k <- stats_joined %>% outside_2sd_k <- stats_joined %>%
filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd)) filter(K < (K_mean - 2 * K_sd) | K > (K_mean + 2 * K_sd))
message("Calculating summary statistics for L within 2SD of K") # Summary statistics for within and outside 2SD of K
l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
# Write CSV files
write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
message("Calculating summary statistics for L within 2SD of K")
cols_to_remove <- names(l_within_2sd_k) cols_to_remove <- names(l_within_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor") cols_to_keep <- c("conc_num", "conc_num_factor")
within_2sd_k_clean <- within_2sd_k %>% within_2sd_k_clean <- within_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep))) select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_within_2sd_k_joined <- within_2sd_k_clean %>% l_within_2sd_k_joined <- within_2sd_k_clean %>%
left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor")) left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
write.csv(l_within_2sd_k,
file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
row.names = FALSE)
message("Calculating summary statistics for for L outside 2SD of K") message("Calculating summary statistics for for L outside 2SD of K")
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
cols_to_remove <- names(l_outside_2sd_k) cols_to_remove <- names(l_outside_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor") cols_to_keep <- c("conc_num", "conc_num_factor")
outside_2sd_k_clean <- outside_2sd_k %>% outside_2sd_k_clean <- outside_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep))) select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_outside_2sd_k_joined <- outside_2sd_k_clean %>% l_outside_2sd_k_joined <- outside_2sd_k_clean %>%
left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor")) left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor"))
write.csv(l_outside_2sd_k,
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
row.names = FALSE)
# Process background strains # Process background strains
bg_strains <- c("YDL227C") bg_strains <- c("YDL227C")
@@ -646,18 +691,14 @@ main <- function() {
filter(OrfRep != strain) %>% filter(OrfRep != strain) %>%
mutate(SM = 0) mutate(SM = 0)
message("Processing reference strain")
reference_strain <- process_strains(l_within_2sd_k_joined) reference_strain <- process_strains(l_within_2sd_k_joined)
message("Processing deletion strains")
deletion_strains <- process_strains(l_within_2sd_k_joined) deletion_strains <- process_strains(l_within_2sd_k_joined)
# TODO we may need to add "num" to grouping vars # TODO we may need to add "num" to grouping vars
# Calculate interactions # Calculate interactions
variables <- c("L", "K", "r", "AUC") variables <- c("L", "K", "r", "AUC")
message("Calculating reference interaction scores")
reference_results <- calculate_interaction_scores(stats_joined, reference_strain, max_conc, variables) reference_results <- calculate_interaction_scores(stats_joined, reference_strain, max_conc, variables)
message("Calculating deletion interaction scores")
deletion_results <- calculate_interaction_scores(stats_joined, deletion_strains, max_conc, variables) deletion_results <- calculate_interaction_scores(stats_joined, deletion_strains, max_conc, variables)
zscores_calculations_reference <- reference_results$zscores_calculations zscores_calculations_reference <- reference_results$zscores_calculations
@@ -666,11 +707,11 @@ main <- function() {
zscores_interactions <- deletion_results$zscores_interactions zscores_interactions <- deletion_results$zscores_interactions
# TODO: I don't know if we need this? # TODO: I don't know if we need this?
zscores_interactions <- zscores_interactions %>% # zscores_interactions <- zscores_interactions %>%
arrange(desc(Z_lm_L)) %>% # arrange(desc(Z_lm_L)) %>%
arrange(desc(NG)) # arrange(desc(NG))
message("Writing zscores csv files") # Writing Z-Scores to file
write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_calculations_reference, file = file.path(out_dir, "RF_ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE) write.csv(zscores_calculations, file = file.path(out_dir, "ZScores_Calculations.csv"), row.names = FALSE)
write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE) write.csv(zscores_interactions_reference, file = file.path(out_dir, "RF_ZScores_Interaction.csv"), row.names = FALSE)
@@ -678,10 +719,11 @@ main <- function() {
# Define conditions for enhancers and suppressors # Define conditions for enhancers and suppressors
# TODO Add to study config file? # TODO Add to study config file?
enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= 2 threshold <- 2
suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -2 enhancer_condition_L <- zscores_interactions$Avg_Zscore_L >= threshold
enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= 2 suppressor_condition_L <- zscores_interactions$Avg_Zscore_L <= -threshold
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -2 enhancer_condition_K <- zscores_interactions$Avg_Zscore_K >= threshold
suppressor_condition_K <- zscores_interactions$Avg_Zscore_K <= -threshold
# Subset data # Subset data
enhancers_L <- zscores_interactions[enhancer_condition_L, ] enhancers_L <- zscores_interactions[enhancer_condition_L, ]
@@ -707,10 +749,11 @@ main <- function() {
file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv"), row.names = FALSE) file = file.path(out_dir, "ZScores_Interaction_DeletionEnhancers_and_Suppressors_K.csv"), row.names = FALSE)
# Handle linear model based enhancers and suppressors # Handle linear model based enhancers and suppressors
enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= 2, ] lm_threshold <- 2
suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -2, ] enhancers_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L >= lm_threshold, ]
enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= 2, ] suppressors_lm_L <- zscores_interactions[zscores_interactions$Z_lm_L <= -lm_threshold, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -2, ] enhancers_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K >= lm_threshold, ]
suppressors_lm_K <- zscores_interactions[zscores_interactions$Z_lm_K <= -lm_threshold, ]
# Save linear model based enhancers and suppressors # Save linear model based enhancers and suppressors
message("Writing linear model enhancer/suppressor csv files") message("Writing linear model enhancer/suppressor csv files")
@@ -724,8 +767,10 @@ main <- function() {
file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE) file = file.path(out_dir, "ZScores_Interaction_DeletionSuppressors_K_lm.csv"), row.names = FALSE)
# Generate plots for interaction scores # Generate plots for interaction scores
generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir) # generate_rf_plots(zscores_calculations_reference, zscores_interactions_reference, out_dir)
generate_rf_plots(zscores_calculations, zscores_interactions, out_dir) # generate_rf_plots(zscores_calculations, zscores_interactions, out_dir)
generate_and_save_plots(zscores_calculations_reference, out_dir, "Reference_Calculations", variables)
generate_and_save_plots(zscores_calculations, out_dir, "Deletion_Calculations", variables)
# Apply filtering to remove NA values before further analysis # Apply filtering to remove NA values before further analysis
zscores_interactions_filtered <- zscores_interactions %>% zscores_interactions_filtered <- zscores_interactions %>%

View File

@@ -785,7 +785,7 @@ init_project() {
debug "Running: ${FUNCNAME[0]}" debug "Running: ${FUNCNAME[0]}"
# Create a project scans dir # Create a project scans dir
[[ -d $PROJECT_SCANS_DIR ]] || (execute mkdir -p "$PROJECT_SCANS_DIR" || return 1) [[ -d $PROJECT_SCANS_DIR ]] || { execute mkdir -p "$PROJECT_SCANS_DIR" || return 1; }
# TODO eventually copy scans from robot but for now let's pause and wait for transfer # TODO eventually copy scans from robot but for now let's pause and wait for transfer
echo "In the future we will copy scans from robot here" echo "In the future we will copy scans from robot here"
@@ -1561,7 +1561,7 @@ join_interaction_zscores() {
"${EXP_PATHS_AND_NAMES[@]}" "${EXP_PATHS_AND_NAMES[@]}"
for f in "${out_files[@]}"; do for f in "${out_files[@]}"; do
[[ -f $f ]] || (echo "$f does not exist"; return 1) [[ -f $f ]] || { echo "$f does not exist"; return 1; }
done done
} }
@@ -1606,7 +1606,7 @@ r_gta() {
"${5:-"$GTA_OUT_DIR"}" \ "${5:-"$GTA_OUT_DIR"}" \
"${@:6}" # future arguments "${@:6}" # future arguments
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
} }
@@ -1779,7 +1779,7 @@ r_add_shift_values() {
rm -f "$STUDY_RESULTS_DIR/REMcHeatmaps/"*.pdf # TODO why? rm -f "$STUDY_RESULTS_DIR/REMcHeatmaps/"*.pdf # TODO why?
out_file="${4:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}" out_file="${4:-"$STUDY_RESULTS_DIR/REMcWithShift.csv"}"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
} }
@@ -1815,7 +1815,7 @@ r_create_heat_maps() {
pdfs=(REMcHeatmaps/*.pdf) pdfs=(REMcHeatmaps/*.pdf)
execute pdftk "${pdfs[@]}" output "$out_file" execute pdftk "${pdfs[@]}" output "$out_file"
out_file="$2/compiledREMcHeatmaps.pdf" out_file="$2/compiledREMcHeatmaps.pdf"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
} }
@@ -1845,7 +1845,7 @@ r_heat_maps_homology() {
pdfs=("${1:-"$STUDY_RESULTS_DIR/homology"}"/*.pdf) pdfs=("${1:-"$STUDY_RESULTS_DIR/homology"}"/*.pdf)
execute pdftk "${pdfs[@]}" output "$out_file" execute pdftk "${pdfs[@]}" output "$out_file"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
} }
@@ -1869,7 +1869,7 @@ py_gtf_dcon() {
"$2/" \ "$2/" \
"${@:3}" # future arguments "${@:3}" # future arguments
out_file="$2/1-0-0-finaltable.csv" out_file="$2/1-0-0-finaltable.csv"
[[ -f $out_file ]] || (echo "$out_file does not exist"; return 1) [[ -f $out_file ]] || { echo "$out_file does not exist"; return 1; }
} }
@@ -1930,7 +1930,7 @@ py_gtf_concat() {
debug "Running: ${FUNCNAME[0]} $*" debug "Running: ${FUNCNAME[0]} $*"
script="$APPS_DIR/python/concatGTFResults.py" script="$APPS_DIR/python/concatGTFResults.py"
execute "$PYTHON" "$script" "$1/" "$2" execute "$PYTHON" "$script" "$1/" "$2"
[[ -f $2 ]] || (echo "$2 does not exist"; return 1) [[ -f $2 ]] || { echo "$2 does not exist"; return 1; }
} }
@@ -1983,7 +1983,7 @@ handle_results_dir() {
results_dir="${results_dirs[0]}" results_dir="${results_dirs[0]}"
else else
# Print results dirs # Print results dirs
[[ ${#results_dirs[@]} -gt 0 ]] || (err "No ${2}s found"; return 1) [[ ${#results_dirs[@]} -gt 0 ]] || { err "No ${2}s found"; return 1; }
print_in_columns "results_dirs" print_in_columns "results_dirs"
# Results selection prompt # Results selection prompt
cat <<-EOF cat <<-EOF
@@ -2091,7 +2091,7 @@ handle_config() {
# Declare a nameref to refer to the actual array using the variable name # Declare a nameref to refer to the actual array using the variable name
declare -n config_array_ref="$config_array_name" declare -n config_array_ref="$config_array_name"
[[ -z ${!config_array_ref[*]} ]] && (err "No $config_array_name array found in $config_file" && return 1) [[ -z ${!config_array_ref[*]} ]] && { err "No $config_array_name array found in $config_file" && return 1; }
for key in "${!config_array_ref[@]}"; do for key in "${!config_array_ref[@]}"; do
IFS=',' read -r main_key sub_key <<< "$key" IFS=',' read -r main_key sub_key <<< "$key"
@@ -2298,7 +2298,7 @@ main() {
if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then if ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${MODULES[i]}[[:space:]] ]]; then
echo "Module ${MODULES[$i]} is not available, removing" echo "Module ${MODULES[$i]} is not available, removing"
read -r -p "Enter replacement module name: " module read -r -p "Enter replacement module name: " module
! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || (echo "RTFM"; return 1) ! [[ " ${ALL_MODULES[*]} " =~ [[:space:]]${module}[[:space:]] ]] || { echo "RTFM"; return 1; }
MODULES[i]="$module" MODULES[i]="$module"
fi fi
unset module unset module