Simplify plot config format

This commit is contained in:
2024-09-30 01:26:36 -04:00
parent 5d25b3c3ef
commit 2f42357aa1

View File

@@ -112,7 +112,7 @@ theme_publication <- function(base_size = 14, base_family = "sans", legend_posit
legend.spacing = unit(0, "cm"),
legend.title = element_text(face = "italic", size = rel(1.3)),
legend.text = element_text(size = rel(1.2)),
plot.margin = unit(c(10, 5, 5, 5), "mm"),
plot.margin = unit(c(10, 5, 5, 5), "mm")
# strip.background = element_rect(colour = "#f0f0f0", fill = "#f0f0f0"),
# strip.text = element_text(face = "bold")
)
@@ -150,9 +150,8 @@ load_and_filter_data <- function(easy_results_file, sd = 3) {
SM = 0,
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
conc_num_factor_new = factor(conc_num),
conc_num_factor_zeroed = factor(as.numeric(conc_num_factor_new) - 1),
conc_num_factor = as.numeric(conc_num_factor_zeroed) # for legacy purposes
conc_num_factor = as.numeric(as.factor(conc_num)) - 1, # for legacy purposes
conc_num_factor_factor = factor(conc_num)
)
return(df)
@@ -186,12 +185,12 @@ calculate_summary_stats <- function(df, variables, group_vars) {
N = n(),
across(all_of(variables),
list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(N - 1) # Bessel's correction
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
max = ~ ifelse(all(is.na(.x)), NA, max(.x, na.rm = TRUE)),
min = ~ ifelse(all(is.na(.x)), NA, min(.x, na.rm = TRUE)),
sd = ~ sd(.x, na.rm = TRUE),
se = ~ sd(.x, na.rm = TRUE) / sqrt(n() - 1)
),
.names = "{.fn}_{.col}"
),
@@ -335,7 +334,8 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats,
# Declare column order for output
calculations <- calculations %>%
select(
"OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "N",
"OrfRep", "Gene", "num", "N",
"conc_num", "conc_num_factor", "conc_num_factor_factor",
"mean_L", "mean_K", "mean_r", "mean_AUC",
"median_L", "median_K", "median_r", "median_AUC",
"sd_L", "sd_K", "sd_r", "sd_AUC",
@@ -352,7 +352,8 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats,
interactions <- interactions %>%
select(
"OrfRep", "Gene", "conc_num", "conc_num_factor", "num", "NG", "DB", "SM",
"OrfRep", "Gene", "num", "NG", "DB", "SM",
"conc_num", "conc_num_factor", "conc_num_factor_factor",
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
"Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC",
"Sum_Z_Score_L", "Sum_Z_Score_K", "Sum_Z_Score_r", "Sum_Z_Score_AUC",
@@ -366,10 +367,11 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats,
cleaned_df <- df %>%
select(-any_of(
setdiff(intersect(names(df), names(calculations)),
c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))))
c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor"))))
# Join the original dataframe with calculations
df_with_calculations <- left_join(cleaned_df, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
df_with_calculations <- left_join(cleaned_df, calculations,
by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
# Remove overlapping columns between df_with_calculations and interactions
df_with_calculations_clean <- df_with_calculations %>%
@@ -378,7 +380,8 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats,
c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))))
# Join with interactions to create the full dataset
full_data <- left_join(df_with_calculations_clean, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
full_data <- left_join(df_with_calculations_clean, interactions,
by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
return(list(
calculations = calculations,
@@ -390,131 +393,67 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats,
generate_and_save_plots <- function(out_dir, filename, plot_configs) {
message("Generating ", filename, ".pdf and ", filename, ".html")
for (config_group in plot_configs) {
plot_list <- config_group$plots
grid_nrow <- config_group$grid_layout$nrow
grid_ncol <- config_group$grid_layout$ncol
grid_nrow <- ifelse(is.null(plot_configs$grid_layout$nrow), 1, plot_configs$grid_layout$nrow)
grid_ncol <- ifelse(is.null(plot_configs$grid_layout$ncol), length(plot_configs$plots), plot_configs$grid_layout$ncol)
# Set defaults if nrow or ncol are not provided
if (is.null(grid_nrow) || is.null(grid_ncol)) {
num_plots <- length(plot_list)
grid_nrow <- ifelse(is.null(grid_nrow), 1, grid_nrow)
grid_ncol <- ifelse(is.null(grid_ncol), num_plots, grid_ncol)
}
# Prepare lists to collect static and interactive plots
static_plots <- list()
plotly_plots <- list()
# Generate each individual plot based on the configuration
for (i in seq_along(plot_list)) {
config <- plot_list[[i]]
for (i in seq_along(plot_configs$plots)) {
config <- plot_configs$plots[[i]]
df <- config$df
# Create the base plot
# Define aes_mapping and handle color_var defaulting
aes_mapping <- if (config$plot_type == "bar") {
if (!is.null(config$color_var)) {
aes(x = .data[[config$x_var]], fill = as.factor(.data[[config$color_var]]), color = as.factor(.data[[config$color_var]]))
} else {
aes(x = .data[[config$x_var]])
}
aes(x = .data[[config$x_var]], fill = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black"),
color = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black"))
} else if (config$plot_type == "density") {
if (!is.null(config$color_var)) {
aes(x = .data[[config$x_var]], color = as.factor(.data[[config$color_var]]))
aes(x = .data[[config$x_var]], color = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black"))
} else {
aes(x = .data[[config$x_var]])
}
} else {
if (!is.null(config$color_var)) {
aes(x = .data[[config$x_var]], y = .data[[config$y_var]], color = as.factor(.data[[config$color_var]]))
} else {
aes(x = .data[[config$x_var]], y = .data[[config$y_var]])
}
aes(x = .data[[config$x_var]], y = .data[[config$y_var]], color = ifelse(!is.null(config$color_var), .data[[config$color_var]], "black"))
}
plot <- ggplot(df, aes_mapping)
# Apply theme_publication with legend_position
plot <- ggplot(df, aes_mapping) + theme_publication(legend_position = config$legend_position)
# Apply theme_publication with legend_position from config
legend_position <- if (!is.null(config$legend_position)) config$legend_position else "bottom"
plot <- plot + theme_publication(legend_position = legend_position)
# Use appropriate helper function based on plot type
# Apply appropriate plot function
plot <- switch(config$plot_type,
"scatter" = generate_scatter_plot(plot, config),
"box" = generate_box_plot(plot, config),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(),
plot # default case if no type matches
plot # default case
)
# Add title and labels
if (!is.null(config$title)) {
plot <- plot + ggtitle(config$title)
}
if (!is.null(config$x_label)) {
plot <- plot + xlab(config$x_label)
}
if (!is.null(config$y_label)) {
plot <- plot + ylab(config$y_label)
}
# Add cartesian coordinates if specified
if (!is.null(config$coord_cartesian)) {
plot <- plot + coord_cartesian(ylim = config$coord_cartesian)
}
# Apply scale_color_discrete(guide = FALSE) when color_var is NULL
if (is.null(config$color_var)) {
plot <- plot + scale_color_discrete(guide = "none")
}
# Add titles and labels
if (!is.null(config$title)) plot <- plot + ggtitle(config$title)
if (!is.null(config$x_label)) plot <- plot + xlab(config$x_label)
if (!is.null(config$y_label)) plot <- plot + ylab(config$y_label)
if (!is.null(config$coord_cartesian)) plot <- plot + coord_cartesian(ylim = config$coord_cartesian)
# Add interactive tooltips for plotly
tooltip_vars <- c()
if (config$plot_type == "scatter") {
if (!is.null(config$delta_bg_point) && config$delta_bg_point) {
tooltip_vars <- c(tooltip_vars, "OrfRep", "Gene", "delta_bg")
} else if (!is.null(config$gene_point) && config$gene_point) {
tooltip_vars <- c(tooltip_vars, "OrfRep", "Gene")
} else if (!is.null(config$y_var) && !is.null(config$x_var)) {
tooltip_vars <- c(config$x_var, config$y_var)
}
}
tooltip_vars <- ifelse(!is.null(config$tooltip_vars), config$tooltip_vars, "none")
plotly_plot <- suppressWarnings(plotly::ggplotly(plot, tooltip = tooltip_vars))
# Convert to plotly object and suppress warnings here
plotly_plot <- suppressWarnings({
if (length(tooltip_vars) > 0) {
plotly::ggplotly(plot, tooltip = tooltip_vars)
} else {
plotly::ggplotly(plot, tooltip = "none")
}
})
# Adjust legend position if specified
# Adjust legend position in plotly
if (!is.null(config$legend_position) && config$legend_position == "bottom") {
plotly_plot <- plotly_plot %>% layout(legend = list(orientation = "h"))
}
# Add plots to lists
# Add static and interactive plots to lists
static_plots[[i]] <- plot
plotly_plots[[i]] <- plotly_plot
}
# Save static PDF plot(s) for the current grid
# Save static PDF plots
pdf(file.path(out_dir, paste0(filename, ".pdf")), width = 16, height = 9)
grid.arrange(grobs = static_plots, ncol = grid_ncol, nrow = grid_nrow)
dev.off()
combined_plot <- plotly::subplot(
plotly_plots,
nrows = grid_nrow,
ncols = grid_ncol,
margin = 0.05
)
# Save combined HTML plot(s)
# Save combined interactive HTML plots
combined_plot <- plotly::subplot(plotly_plots, nrows = grid_nrow, ncols = grid_ncol, margin = 0.05)
html_file <- file.path(out_dir, paste0(filename, ".html"))
saveWidget(combined_plot, file = html_file, selfcontained = TRUE)
}
}
generate_scatter_plot <- function(plot, config) {
@@ -671,51 +610,39 @@ generate_box_plot <- function(plot, config) {
return(plot)
}
generate_plate_analysis_plot_configs <- function(variables, stages = c("before", "after"),
df_before = NULL, df_after = NULL, plot_type = "scatter") {
generate_plate_analysis_plot_configs <- function(variables, df_before = NULL, df_after = NULL,
plot_type = "scatter", stages = c("before", "after")) {
plots <- list()
for (var in variables) {
for (stage in stages) {
df_plot <- if (stage == "before") df_before else df_after
# Check for non-finite values in the y-variable
df_plot_filtered <- df_plot %>%
filter(is.finite(!!sym(var)))
# Count removed rows
removed_rows <- nrow(df_plot) - nrow(df_plot_filtered)
if (removed_rows > 0) {
message(sprintf("Removed %d non-finite values for variable %s during stage %s", removed_rows, var, stage))
}
df_plot_filtered <- df_plot %>% filter(is.finite(!!sym(var)))
# Adjust settings based on plot_type
if (plot_type == "scatter") {
error_bar <- TRUE
position <- "jitter"
} else if (plot_type == "box") {
error_bar <- FALSE
position <- NULL
}
config <- list(
df = df_plot,
df = df_plot_filtered,
x_var = "scan",
y_var = var,
plot_type = plot_type,
title = paste("Plate analysis by Drug Conc for", var, stage, "quality control"),
error_bar = error_bar,
color_var = "conc_num_factor_zeroed",
position = position,
size = 0.2
color_var = "conc_num_factor_factor",
position = if (plot_type == "scatter") "jitter" else NULL,
size = 0.2,
error_bar = (plot_type == "scatter")
)
# Add config to plots list
plots <- append(plots, list(config))
}
}
return(plots)
return(list(grid_layout = list(ncol = 1, nrow = length(plots)), plots = plots))
}
generate_interaction_plot_configs <- function(df, limits_map = NULL, plot_type = "reference") {
# Define limits if not provided
if (is.null(limits_map)) {
limits_map <- list(
L = c(0, 130),
@@ -729,94 +656,74 @@ generate_interaction_plot_configs <- function(df, limits_map = NULL, plot_type =
)
}
# Define grouping variables and filter data based on plot type
if (plot_type == "reference") {
group_vars <- c("OrfRep", "Gene", "num")
group_vars <- if (plot_type == "reference") c("OrfRep", "Gene", "num") else c("OrfRep", "Gene")
df_filtered <- df %>%
mutate(
OrfRepCombined = paste(OrfRep, Gene, num, sep = "_")
)
} else if (plot_type == "deletion") {
group_vars <- c("OrfRep", "Gene")
df_filtered <- df %>%
mutate(
OrfRepCombined = paste(OrfRep, Gene, sep = "_") # Compare by OrfRep and Gene for deletion
)
}
mutate(OrfRepCombined = if (plot_type == "reference") paste(OrfRep, Gene, num, sep = "_") else paste(OrfRep, Gene, sep = "_"))
# Create a list to store all configs
configs <- list()
plots <- list()
# Generate the first 8 scatter/box plots for L, K, r, AUC (shared between reference and deletion)
overall_vars <- c("L", "K", "r", "AUC")
for (var in overall_vars) {
# Generate plots for overall variables (L, K, r, AUC)
for (var in c("L", "K", "r", "AUC")) {
y_limits <- limits_map[[var]]
config <- list(
df = df_filtered,
plot_type = "scatter",
x_var = "conc_num_factor_new",
x_var = "conc_num_factor_factor",
y_var = var,
x_label = unique(df_filtered$Drug)[1],
title = sprintf("Scatter RF for %s with SD", var),
coord_cartesian = y_limits,
error_bar = TRUE,
x_breaks = unique(df_filtered$conc_num_factor_new),
x_breaks = unique(df_filtered$conc_num_factor_factor),
x_labels = as.character(unique(df_filtered$conc_num)),
grid_layout = list(ncol = 2, nrow = 2),
position = "jitter"
position = "jitter",
smooth = TRUE
)
configs <- append(configs, list(config))
plots <- append(plots, list(config))
}
# Generate Delta comparison plots (4x3 grid for deletion and reference)
# Generate Delta comparison plots
unique_groups <- df_filtered %>% select(all_of(group_vars)) %>% distinct()
for (i in seq_len(nrow(unique_groups))) {
group <- unique_groups[i, ]
group_data <- df_filtered %>% filter(across(all_of(group_vars), ~ . == group[[cur_column()]]))
group_data <- df_filtered %>% semi_join(group, by = group_vars)
OrfRep <- as.character(group$OrfRep)
Gene <- if ("Gene" %in% names(group)) as.character(group$Gene) else ""
num <- if ("num" %in% names(group)) as.character(group$num) else ""
OrfRepCombined <- paste(OrfRep, Gene, num, sep = "_")
# Generate plots for Delta variables
delta_vars <- c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")
for (var in delta_vars) {
for (var in c("Delta_L", "Delta_K", "Delta_r", "Delta_AUC")) {
y_limits <- limits_map[[var]]
upper_y <- y_limits[2]
lower_y <- y_limits[1]
y_span <- upper_y - lower_y
y_span <- y_limits[2] - y_limits[1]
# Get WT_sd_var for error bar calculations
# Error bars
WT_sd_var <- paste0("WT_sd_", sub("Delta_", "", var))
WT_sd_value <- group_data[[WT_sd_var]][1]
error_bar_ymin <- 0 - (2 * WT_sd_value)
error_bar_ymax <- 0 + (2 * WT_sd_value)
# Set annotations (Z_Shifts, lm Z-Scores, NG, DB, SM values)
Z_Shift_var <- paste0("Z_Shift_", sub("Delta_", "", var))
Z_lm_var <- paste0("Z_lm_", sub("Delta_", "", var))
Z_Shift_value <- round(group_data[[Z_Shift_var]][1], 2)
Z_lm_value <- round(group_data[[Z_lm_var]][1], 2)
# Annotations
Z_Shift_value <- round(group_data[[paste0("Z_Shift_", sub("Delta_", "", var))]][1], 2)
Z_lm_value <- round(group_data[[paste0("Z_lm_", sub("Delta_", "", var))]][1], 2)
NG_value <- group_data$NG[1]
DB_value <- group_data$DB[1]
SM_value <- group_data$SM[1]
annotations <- list(
list(x = 1, y = upper_y - 0.2 * y_span, label = paste("ZShift =", Z_Shift_value)),
list(x = 1, y = upper_y - 0.3 * y_span, label = paste("lm ZScore =", Z_lm_value)),
list(x = 1, y = lower_y + 0.2 * y_span, label = paste("NG =", NG_value)),
list(x = 1, y = lower_y + 0.1 * y_span, label = paste("DB =", DB_value)),
list(x = 1, y = lower_y, label = paste("SM =", SM_value))
list(x = 1, y = y_limits[2] - 0.2 * y_span, label = paste("ZShift =", Z_Shift_value)),
list(x = 1, y = y_limits[2] - 0.3 * y_span, label = paste("lm ZScore =", Z_lm_value)),
list(x = 1, y = y_limits[1] + 0.2 * y_span, label = paste("NG =", NG_value)),
list(x = 1, y = y_limits[1] + 0.1 * y_span, label = paste("DB =", DB_value)),
list(x = 1, y = y_limits[1], label = paste("SM =", SM_value))
)
# Create configuration for each Delta plot
config <- list(
df = group_data,
plot_type = "scatter",
x_var = "conc_num",
x_var = "conc_num_factor_factor",
y_var = var,
x_label = unique(group_data$Drug)[1],
title = paste(OrfRep, Gene, sep = " "),
@@ -827,23 +734,21 @@ generate_interaction_plot_configs <- function(df, limits_map = NULL, plot_type =
ymin = error_bar_ymin,
ymax = error_bar_ymax
),
lm_smooth = TRUE,
x_breaks = unique(group_data$conc_num_factor_new),
smooth = TRUE,
x_breaks = unique(group_data$conc_num_factor_factor),
x_labels = as.character(unique(group_data$conc_num)),
ylim_vals = y_limits,
grid_layout = list(ncol = 4, nrow = 3) # Adjust grid layout for gene-gene comparisons
grid_layout = list(ncol = 4, nrow = 3)
)
configs <- append(configs, list(config))
plots <- append(plots, list(config))
}
}
return(configs)
return(list(grid_layout = list(ncol = 2, nrow = 2), plots = plots))
}
generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FALSE, overlap_color = FALSE) {
sd_bands <- c(1, 2, 3)
avg_zscore_cols <- paste0("Avg_Zscore_", variables)
z_lm_cols <- paste0("Z_lm_", variables)
rank_avg_zscore_cols <- paste0("Rank_", variables)
@@ -851,8 +756,8 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA
configs <- list()
# Adjust values if necessary
if (adjust) {
message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns for ranks")
df <- df %>%
mutate(
across(all_of(avg_zscore_cols), ~ifelse(is.na(.), 0.001, .)),
@@ -860,34 +765,22 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA
)
}
message("Calculating ranks for Avg_Zscore and Z_lm columns")
rank_col_mapping <- setNames(rank_avg_zscore_cols, avg_zscore_cols)
# Calculate rank columns for Avg_Zscore and Z_lm columns
df_ranked <- df %>%
mutate(across(all_of(avg_zscore_cols), ~rank(., na.last = "keep"), .names = "{rank_col_mapping[.col]}"))
mutate(across(all_of(avg_zscore_cols), ~rank(., na.last = "keep"), .names = paste0("Rank_", avg_zscore_cols))) %>%
mutate(across(all_of(z_lm_cols), ~rank(., na.last = "keep"), .names = paste0("Rank_lm_", z_lm_cols)))
rank_lm_col_mapping <- setNames(rank_z_lm_cols, z_lm_cols)
df_ranked <- df_ranked %>%
mutate(across(all_of(z_lm_cols), ~rank(., na.last = "keep"), .names = "{rank_lm_col_mapping[.col]}"))
# SD-based plots for L and K
# Generate plots for SD-based L and K variables
for (variable in c("L", "K")) {
if (is_lm) {
rank_var <- paste0("Rank_lm_", variable)
zscore_var <- paste0("Z_lm_", variable)
y_label <- paste("Int Z score", variable)
} else {
rank_var <- paste0("Rank_", variable)
zscore_var <- paste0("Avg_Zscore_", variable)
y_label <- paste("Avg Z score", variable)
}
rank_var <- if (is_lm) paste0("Rank_lm_", variable) else paste0("Rank_", variable)
zscore_var <- if (is_lm) paste0("Z_lm_", variable) else paste0("Avg_Zscore_", variable)
y_label <- if (is_lm) paste("Int Z score", variable) else paste("Avg Z score", variable)
for (sd_band in sd_bands) {
num_enhancers <- sum(df_ranked[[zscore_var]] >= sd_band, na.rm = TRUE)
num_suppressors <- sum(df_ranked[[zscore_var]] <= -sd_band, na.rm = TRUE)
# Annotated plot configuration
# Plot with annotations
configs[[length(configs) + 1]] <- list(
df = df_ranked,
x_var = rank_var,
@@ -903,27 +796,22 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA
list(
x = median(df_ranked[[rank_var]], na.rm = TRUE),
y = max(df_ranked[[zscore_var]], na.rm = TRUE) * 0.9,
label = paste("Deletion Enhancers =", num_enhancers),
hjust = 0.5,
vjust = 1
label = paste("Deletion Enhancers =", num_enhancers)
),
list(
x = median(df_ranked[[rank_var]], na.rm = TRUE),
y = min(df_ranked[[zscore_var]], na.rm = TRUE) * 0.9,
label = paste("Deletion Suppressors =", num_suppressors),
hjust = 0.5,
vjust = 0
label = paste("Deletion Suppressors =", num_suppressors)
)
),
shape = 3,
size = 0.1,
y_label = y_label,
x_label = "Rank",
legend_position = "none",
grid_layout = list(ncol = 3, nrow = 2)
legend_position = "none"
)
# Non-Annotated Plot Configuration
# Plot without annotations
configs[[length(configs) + 1]] <- list(
df = df_ranked,
x_var = rank_var,
@@ -940,42 +828,26 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA
size = 0.1,
y_label = y_label,
x_label = "Rank",
legend_position = "none",
grid_layout = list(ncol = 3, nrow = 2)
legend_position = "none"
)
}
}
# Avg ZScore and Rank Avg ZScore Plots for variables
# Generate Avg ZScore and Rank Avg ZScore plots for each variable
for (variable in variables) {
for (plot_type in c("Avg Zscore vs lm", "Rank Avg Zscore vs lm")) {
title <- paste(plot_type, variable)
# Define specific variables based on plot type
if (plot_type == "Avg Zscore vs lm") {
x_var <- paste0("Avg_Zscore_", variable)
y_var <- paste0("Z_lm_", variable)
rectangles <- list(
list(xmin = -2, xmax = 2, ymin = -2, ymax = 2,
fill = NA, color = "grey20", alpha = 0.1
)
)
} else if (plot_type == "Rank Avg Zscore vs lm") {
x_var <- paste0("Rank_", variable)
y_var <- paste0("Rank_lm_", variable)
rectangles <- NULL
}
x_var <- if (plot_type == "Avg Zscore vs lm") paste0("Avg_Zscore_", variable) else paste0("Rank_", variable)
y_var <- if (plot_type == "Avg Zscore vs lm") paste0("Z_lm_", variable) else paste0("Rank_lm_", variable)
# Fit the linear model
lm_model <- lm(as.formula(paste(y_var, "~", x_var)), data = df_ranked)
# Extract intercept, slope, and R-squared from the model coefficients
intercept <- coef(lm_model)[1]
slope <- coef(lm_model)[2]
r_squared <- summary(lm_model)$r.squared
# Annotations: include R-squared in the plot
annotations <- list(
list(
x = mean(range(df_ranked[[x_var]], na.rm = TRUE)),
@@ -987,6 +859,12 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA
)
)
rectangles <- if (plot_type == "Avg Zscore vs lm") {
list(list(xmin = -2, xmax = 2, ymin = -2, ymax = 2, fill = NA, color = "grey20", alpha = 0.1))
} else {
NULL
}
configs[[length(configs) + 1]] <- list(
df = df_ranked,
x_var = x_var,
@@ -1007,11 +885,11 @@ generate_rank_plot_configs <- function(df, variables, is_lm = FALSE, adjust = FA
)
}
}
return(configs)
return(list(grid_layout = list(ncol = 3, nrow = 2), plots = configs))
}
generate_correlation_plot_configs <- function(df, highlight_cyan = FALSE) {
# Define relationships for plotting
relationships <- list(
list(x = "Z_lm_L", y = "Z_lm_K", label = "Interaction L vs. Interaction K"),
list(x = "Z_lm_L", y = "Z_lm_r", label = "Interaction L vs. Interaction r"),
@@ -1021,56 +899,36 @@ generate_correlation_plot_configs <- function(df, highlight_cyan = FALSE) {
list(x = "Z_lm_r", y = "Z_lm_AUC", label = "Interaction r vs. Interaction AUC")
)
configs <- list()
plots <- list()
for (rel in relationships) {
# Fit linear model
lm_model <- lm(as.formula(paste(rel$y, "~", rel$x)), data = df)
lm_summary <- summary(lm_model)
intercept <- coef(lm_model)[1]
slope <- coef(lm_model)[2]
r_squared <- lm_summary$r.squared
r_squared <- summary(lm_model)$r.squared
# Construct plot configuration
config <- list(
df = df,
x_var = rel$x,
y_var = rel$y,
plot_type = "scatter",
title = rel$label,
x_label = paste("z-score", gsub("Z_lm_", "", rel$x)),
y_label = paste("z-score", gsub("Z_lm_", "", rel$y)),
annotations = list(
list(
x = mean(range(df[[rel$x]], na.rm = TRUE)),
y = mean(range(df[[rel$y]], na.rm = TRUE)),
label = paste("R-squared =", round(r_squared, 3)),
hjust = 0.5,
vjust = 1,
size = 5,
color = "black"
)
list(x = mean(df[[rel$x]], na.rm = TRUE),
y = mean(df[[rel$y]], na.rm = TRUE),
label = paste("R-squared =", round(r_squared, 3)))
),
smooth = TRUE,
smooth_color = "tomato3",
lm_line = list(intercept = intercept, slope = slope),
legend_position = "right",
lm_line = list(intercept = coef(lm_model)[1], slope = coef(lm_model)[2]),
shape = 3,
size = 0.5,
color_var = "Overlap",
rectangles = list(
list(
xmin = -2, xmax = 2, ymin = -2, ymax = 2,
fill = NA, color = "grey20", alpha = 0.1
)
),
cyan_points = highlight_cyan, # Toggle cyan point highlighting
cyan_points = highlight_cyan
)
configs[[length(configs) + 1]] <- config
plots <- append(plots, list(config))
}
return(configs)
return(list(grid_layout = list(ncol = 3, nrow = 2), plots = plots))
}
main <- function() {
@@ -1085,8 +943,6 @@ main <- function() {
summary_vars <- c("L", "K", "r", "AUC", "delta_bg") # fields to filter and calculate summary stats across
interaction_vars <- c("L", "K", "r", "AUC") # fields to calculate interaction z-scores
print_vars <- c("OrfRep", "Plate", "scan", "Col", "Row", "num", "OrfRep", "conc_num", "conc_num_factor",
"delta_bg_tolerance", "delta_bg", "Gene", "L", "K", "r", "AUC", "NG", "DB")
message("Loading and filtering data for experiment: ", exp_name)
df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>%
@@ -1099,7 +955,7 @@ main <- function() {
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
# Save some constants
max_conc <- max(df$conc_num_factor_zeroed_num)
max_conc <- max(df$conc_num_factor)
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
@@ -1107,13 +963,13 @@ main <- function() {
df_stats <- calculate_summary_stats(
df = df,
variables = summary_vars,
group_vars = c("conc_num", "conc_num_factor"))$df_with_stats
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats
message("Calculating summary statistics after quality control")
ss <- calculate_summary_stats(
df = df_na,
variables = summary_vars,
group_vars = c("conc_num", "conc_num_factor"))
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
df_na_ss <- ss$summary_stats
df_na_stats <- ss$df_with_stats
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
@@ -1150,7 +1006,7 @@ main <- function() {
df_no_zeros_stats <- calculate_summary_stats(
df = df_no_zeros,
variables = summary_vars,
group_vars = c("conc_num", "conc_num_factor")
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")
)$df_with_stats
message("Filtering by 2SD of K")
@@ -1161,13 +1017,13 @@ main <- function() {
message("Calculating summary statistics for L within 2SD of K")
# TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))$summary_stats
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats
write.csv(ss,
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
row.names = FALSE)
message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
write.csv(ss$summary_stats,
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
@@ -1175,54 +1031,64 @@ main <- function() {
# Each plots list corresponds to a file
l_vs_k_plot_configs <- list(
grid_layout = list(ncol = 1, nrow = 1),
plots = list(
list(
df = df,
x_var = "L",
y_var = "K",
plot_type = "scatter",
delta_bg_point = TRUE,
tooltip_vars = c("OrfRep", "Gene", "delta_bg"),
title = "Raw L vs K before quality control",
color_var = "conc_num_factor_zeroed",
color_var = "conc_num_factor_factor",
error_bar = FALSE,
legend_position = "right"
)
)
)
frequency_delta_bg_plot_configs <- list(
grid_layout = list(ncol = 1, nrow = 1),
plots = list(
list(
df = df_stats,
x_var = "delta_bg",
y_var = NULL,
plot_type = "density",
title = "Density plot for Delta Background by [Drug] (All Data)",
color_var = "conc_num_factor_zeroed",
color_var = "conc_num_factor_factor",
x_label = "Delta Background",
y_label = "Density",
error_bar = FALSE,
legend_position = "right"),
legend_position = "right"
),
list(
df = df_stats,
x_var = "delta_bg",
y_var = NULL,
plot_type = "bar",
title = "Bar plot for Delta Background by [Drug] (All Data)",
color_var = "conc_num_factor_zeroed",
color_var = "conc_num_factor_factor",
x_label = "Delta Background",
y_label = "Count",
error_bar = FALSE,
legend_position = "right")
legend_position = "right"
)
)
)
above_threshold_plot_configs <- list(
grid_layout = list(ncol = 1, nrow = 1),
plots = list(
list(
df = df_above_tolerance,
x_var = "L",
y_var = "K",
plot_type = "scatter",
delta_bg_point = TRUE,
tooltip_vars = c("OrfRep", "Gene", "delta_bg"),
title = paste("Raw L vs K for strains above Delta Background threshold of",
round(df_above_tolerance$delta_bg_tolerance[[1]], 3), "or above"),
color_var = "conc_num_factor_zeroed",
color_var = "conc_num_factor_factor",
position = "jitter",
annotations = list(
list(
@@ -1235,6 +1101,7 @@ main <- function() {
legend_position = "right"
)
)
)
plate_analysis_plot_configs <- generate_plate_analysis_plot_configs(
variables = summary_vars,
@@ -1263,59 +1130,84 @@ main <- function() {
)
l_outside_2sd_k_plot_configs <- list(
grid_layout = list(ncol = 1, nrow = 1), # Ensures it's compatible with generate_and_save_plots
plots = list(
list(
df = df_na_l_outside_2sd_k_stats,
x_var = "L",
y_var = "K",
plot_type = "scatter",
delta_bg_point = TRUE,
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num_factor_zeroed",
position = "jitter",
color_var = "conc_num_factor_factor",
position = "jitter", # Apply jitter for better visibility
tooltip_vars = c("OrfRep", "Gene", "delta_bg"),
annotations = list(
list(
x = mean(df_na_l_outside_2sd_k_stats$L, na.rm = TRUE),
y = mean(df_na_l_outside_2sd_k_stats$K, na.rm = TRUE),
label = paste("Total strains:", nrow(df_na_l_outside_2sd_k_stats))
)
),
error_bar = FALSE, # No error bars for this plot
legend_position = "right"
)
)
)
delta_bg_outside_2sd_k_plot_configs <- list(
grid_layout = list(ncol = 1, nrow = 1), # Ensures it's compatible with generate_and_save_plots
plots = list(
list(
df = df_na_l_outside_2sd_k_stats,
x_var = "delta_bg",
y_var = "K",
plot_type = "scatter",
gene_point = TRUE,
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
color_var = "conc_num_factor_zeroed",
position = "jitter",
color_var = "conc_num_factor_factor",
position = "jitter", # Apply jitter for better visibility
tooltip_vars = c("OrfRep", "Gene", "delta_bg"),
annotations = list(
list(
x = mean(df_na_l_outside_2sd_k_stats$delta_bg, na.rm = TRUE),
y = mean(df_na_l_outside_2sd_k_stats$K, na.rm = TRUE),
label = paste("Total strains:", nrow(df_na_l_outside_2sd_k_stats))
)
),
error_bar = FALSE, # No error bars for this plot
legend_position = "right"
)
)
)
message("Generating quality control plots in parallel")
# future::plan(future::multicore, workers = parallel::detectCores())
future::plan(future::multisession, workers = 3) # generate 3 plots in parallel
plot_configs <- list(
list(out_dir = out_dir_qc, filename = "L_vs_K_before_quality_control",
plot_configs = l_vs_k_plot_configs),
list(out_dir = out_dir_qc, filename = "frequency_delta_background",
plot_configs = frequency_delta_bg_plot_configs),
list(out_dir = out_dir_qc, filename = "L_vs_K_above_threshold",
plot_configs = above_threshold_plot_configs),
list(out_dir = out_dir_qc, filename = "plate_analysis",
plot_configs = plate_analysis_plot_configs),
list(out_dir = out_dir_qc, filename = "plate_analysis_boxplots",
plot_configs = plate_analysis_boxplot_configs),
list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros",
plot_configs = plate_analysis_no_zeros_plot_configs),
list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros_boxplots",
plot_configs = plate_analysis_no_zeros_boxplot_configs),
list(out_dir = out_dir_qc, filename = "L_vs_K_for_strains_2SD_outside_mean_K",
plot_configs = l_outside_2sd_k_plot_configs),
list(out_dir = out_dir_qc, filename = "delta_background_vs_K_for_strains_2sd_outside_mean_K",
plot_configs = delta_bg_outside_2sd_k_plot_configs)
)
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plot_configs)
quit()
# # future::plan(future::multicore, workers = parallel::detectCores())
# future::plan(future::multisession, workers = 3) # generate 3 plots in parallel
# Generating quality control plots in parallel
# plot_configs <- list(
# list(out_dir = out_dir_qc, filename = "L_vs_K_before_quality_control",
# plot_configs = l_vs_k_plot_configs),
# list(out_dir = out_dir_qc, filename = "frequency_delta_background",
# plot_configs = frequency_delta_bg_plot_configs),
# list(out_dir = out_dir_qc, filename = "L_vs_K_above_threshold",
# plot_configs = above_threshold_plot_configs),
# list(out_dir = out_dir_qc, filename = "plate_analysis",
# plot_configs = plate_analysis_plot_configs),
# list(out_dir = out_dir_qc, filename = "plate_analysis_boxplots",
# plot_configs = plate_analysis_boxplot_configs),
# list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros",
# plot_configs = plate_analysis_no_zeros_plot_configs),
# list(out_dir = out_dir_qc, filename = "plate_analysis_no_zeros_boxplots",
# plot_configs = plate_analysis_no_zeros_boxplot_configs),
# list(out_dir = out_dir_qc, filename = "L_vs_K_for_strains_2SD_outside_mean_K",
# plot_configs = l_outside_2sd_k_plot_configs),
# list(out_dir = out_dir_qc, filename = "delta_background_vs_K_for_strains_2sd_outside_mean_K",
# plot_configs = delta_bg_outside_2sd_k_plot_configs)
# )
# # Generating quality control plots in parallel
# furrr::future_map(plot_configs, function(config) {
# generate_and_save_plots(config$out_dir, config$filename, config$plot_configs)
# }, .options = furrr_options(seed = TRUE))
@@ -1340,7 +1232,7 @@ main <- function() {
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
ss_bg <- calculate_summary_stats(df_bg, summary_vars, group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
summary_stats_bg <- ss_bg$summary_stats
write.csv(summary_stats_bg,
file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")),
@@ -1351,7 +1243,7 @@ main <- function() {
df_reference <- df_na_stats %>% # formerly X2_RF
filter(OrfRep == strain) %>%
filter(!is.na(L)) %>%
group_by(conc_num, conc_num_factor) %>%
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
@@ -1364,7 +1256,7 @@ main <- function() {
filter(OrfRep != strain) %>%
filter(!is.na(L)) %>%
mutate(SM = 0) %>%
group_by(conc_num, conc_num_factor) %>%
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
mutate(
max_l_theoretical = max(max_L, na.rm = TRUE),
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
@@ -1376,7 +1268,7 @@ main <- function() {
df_reference_stats <- calculate_summary_stats(
df = df_reference,
variables = interaction_vars,
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor", "conc_num_factor_factor")
)$df_with_stats
reference_results <- calculate_interaction_scores(df_reference_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene", "num"))
zscore_calculations_reference <- reference_results$calculations
@@ -1387,9 +1279,9 @@ main <- function() {
df_deletion_stats <- calculate_summary_stats(
df = df_deletion,
variables = interaction_vars,
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor", "conc_num_factor_factor")
)$df_with_stats
deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep"))
deletion_results <- calculate_interaction_scores(df_deletion_stats, max_conc, bg_stats, group_vars = c("OrfRep", "Gene"))
zscore_calculations <- deletion_results$calculations
zscore_interactions <- deletion_results$interactions
zscore_interactions_joined <- deletion_results$full_data