Remove existing columns before join

This commit is contained in:
2024-09-02 01:53:04 -04:00
parent 66062438b5
commit 756075a7e5

View File

@@ -58,8 +58,9 @@ parse_arguments <- function() {
args <- parse_arguments()
dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
# Should we keep output in exp dirs or combine in the study output dir?
# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
# Define themes and scales
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
@@ -156,33 +157,25 @@ update_gene_names <- function(df, sgd_gene_list) {
}
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
# Base ggplot object with conditional y mapping
# Start the ggplot with just the x_var and color_var
plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) +
ggtitle(title) +
theme_publication()
# If y_var is not NULL, add it to the plot aesthetics
if (!is.null(y_var)) {
plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(y_var), color = as.factor(!!sym(color_var))))
} else {
plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var))))
plot <- plot + aes(y = !!sym(y_var))
}
# Handle different plot types
if (plot_type == "scatter") {
plot <- plot +
geom_point(shape = 3, size = 0.2, position = "jitter") +
plot <- switch(plot_type,
"scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
stat_summary(fun.data = mean_sdl, geom = "errorbar") +
stat_summary(fun = mean, geom = "point", size = 0.6)
} else if (plot_type == "box") {
plot <- plot + geom_boxplot()
} else if (plot_type == "density") {
plot <- plot + geom_density()
} else if (plot_type == "bar") {
if (!is.null(y_var)) {
plot <- plot + geom_bar(stat = "identity") # Use y aesthetic if provided
} else {
plot <- plot + geom_bar() # Default to counting occurrences
}
}
# Add titles and labels
plot <- plot + ggtitle(title) + theme_publication()
stat_summary(fun = mean, geom = "point", size = 0.6),
"box" = plot + geom_boxplot(),
"density" = plot + geom_density(),
"bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity"))
)
# Add optional labels if provided
if (!is.null(x_label)) plot <- plot + xlab(x_label)
@@ -192,6 +185,8 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_
}
generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
plots <- list()
@@ -278,18 +273,23 @@ save_plots <- function(file_name, plot_list, output_dir) {
process_strains <- function(df, l_within_2sd_k, strain) {
df_strains <- data.frame() # Initialize an empty dataframe to store results
print(names(l_within_2sd_k))
for (concentration in unique(df$conc_num)) {
df_temp <- df %>% filter(conc_num == concentration)
if (concentration > 0) {
max_l_theoretical <- l_within_2sd_k %>%
filter(conc_num_factor == concentration) %>%
pull(L_max)
df_temp <- df_temp %>%
mutate(
L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L),
SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM),
L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L)
L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L), # Replace zero values with max theoretical
SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM), # Set SM flag
L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L) # Cap L values
)
}
df_strains <- bind_rows(df_strains, df_temp) # Append the results of this concentration
@@ -298,6 +298,7 @@ process_strains <- function(df, l_within_2sd_k, strain) {
return(df_strains)
}
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
# Pull the background means
@@ -536,13 +537,9 @@ main <- function() {
df_above_tolerance <- df %>% filter(DB == 1)
# Set vars above the delta background tolerance to NA
df_na <- df %>%
mutate(
L = ifelse(DB == 1, NA, L),
r = ifelse(DB == 1, NA, r),
AUC = ifelse(DB == 1, NA, AUC),
K = ifelse(DB == 1, NA, K)
)
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x)))
# Remove rows where L is 0
df_no_zeros <- df_na %>% filter(L > 0)
# Flag and remove non-finite data, printing affected rows
@@ -551,18 +548,18 @@ main <- function() {
{
if (nrow(.) > 0) {
message("Removing non-finite rows:\n")
#print(.)
print(head(., n = 10))
}
df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging
}
# Generate QC PDFs and HTMLs
message("Generating QC plots")
variables <- c("L", "K", "r", "AUC", "delta_bg")
generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
# message("Generating QC plots")
# variables <- c("L", "K", "r", "AUC", "delta_bg")
# generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
# generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
# generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
# generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
rm(df, df_above_tolerance, df_no_zeros)
@@ -585,12 +582,37 @@ main <- function() {
# Calculate summary statistics for L within and outside 2SD of K
message("Calculating summary statistics for L within 2SD of K")
l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
cols_to_remove <- names(l_within_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor")
within_2sd_k_clean <- within_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_within_2sd_k_joined <- within_2sd_k_clean %>%
left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
#l_within_2sd_k_joined <- merge(within_2sd_k, l_within_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE)
print("within_2sd_k")
print(head(within_2sd_k))
print("l_within_2sd_k")
print(head(l_within_2sd_k))
print("l_within_2sd_k_joined")
print(head(l_within_2sd_k_joined))
quit()
write.csv(l_within_2sd_k,
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
row.names = FALSE)
message("Calculating summary statistics for for L outside 2SD of K")
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k_joined <- merge(outside_2sd_k, l_outside_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE)
write.csv(l_outside_2sd_k,
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
row.names = FALSE)
@@ -615,11 +637,11 @@ main <- function() {
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
write.csv(stats_bg,
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
row.names = FALSE)
stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor"))
# Filter reference and deletion strains
# Formerly X2_RF (reference strain)
@@ -633,9 +655,9 @@ main <- function() {
mutate(SM = 0)
message("Processing reference strain")
reference_strain <- process_strains(df_reference, l_within_2sd_k, strain)
reference_strain <- process_strains(df_reference, l_within_2sd_k_joined, strain)
message("Processing deletion strains")
deletion_strains <- process_strains(df_deletion, l_within_2sd_k, strain)
deletion_strains <- process_strains(df_deletion, l_within_2sd_k_joined, strain)
# TODO we may need to add "num" to grouping vars