Remove existing columns before join
This commit is contained in:
@@ -58,8 +58,9 @@ parse_arguments <- function() {
|
|||||||
|
|
||||||
args <- parse_arguments()
|
args <- parse_arguments()
|
||||||
|
|
||||||
dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
# Should we keep output in exp dirs or combine in the study output dir?
|
||||||
dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
|
||||||
|
# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
|
||||||
|
|
||||||
# Define themes and scales
|
# Define themes and scales
|
||||||
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
|
theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
|
||||||
@@ -156,33 +157,25 @@ update_gene_names <- function(df, sgd_gene_list) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
|
generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
|
||||||
# Base ggplot object with conditional y mapping
|
# Start the ggplot with just the x_var and color_var
|
||||||
|
plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) +
|
||||||
|
ggtitle(title) +
|
||||||
|
theme_publication()
|
||||||
|
|
||||||
|
# If y_var is not NULL, add it to the plot aesthetics
|
||||||
if (!is.null(y_var)) {
|
if (!is.null(y_var)) {
|
||||||
plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(y_var), color = as.factor(!!sym(color_var))))
|
plot <- plot + aes(y = !!sym(y_var))
|
||||||
} else {
|
|
||||||
plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var))))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle different plot types
|
# Handle different plot types
|
||||||
if (plot_type == "scatter") {
|
plot <- switch(plot_type,
|
||||||
plot <- plot +
|
"scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
|
||||||
geom_point(shape = 3, size = 0.2, position = "jitter") +
|
|
||||||
stat_summary(fun.data = mean_sdl, geom = "errorbar") +
|
stat_summary(fun.data = mean_sdl, geom = "errorbar") +
|
||||||
stat_summary(fun = mean, geom = "point", size = 0.6)
|
stat_summary(fun = mean, geom = "point", size = 0.6),
|
||||||
} else if (plot_type == "box") {
|
"box" = plot + geom_boxplot(),
|
||||||
plot <- plot + geom_boxplot()
|
"density" = plot + geom_density(),
|
||||||
} else if (plot_type == "density") {
|
"bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity"))
|
||||||
plot <- plot + geom_density()
|
)
|
||||||
} else if (plot_type == "bar") {
|
|
||||||
if (!is.null(y_var)) {
|
|
||||||
plot <- plot + geom_bar(stat = "identity") # Use y aesthetic if provided
|
|
||||||
} else {
|
|
||||||
plot <- plot + geom_bar() # Default to counting occurrences
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add titles and labels
|
|
||||||
plot <- plot + ggtitle(title) + theme_publication()
|
|
||||||
|
|
||||||
# Add optional labels if provided
|
# Add optional labels if provided
|
||||||
if (!is.null(x_label)) plot <- plot + xlab(x_label)
|
if (!is.null(x_label)) plot <- plot + xlab(x_label)
|
||||||
@@ -192,6 +185,8 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
|
generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
|
||||||
plots <- list()
|
plots <- list()
|
||||||
|
|
||||||
@@ -278,18 +273,23 @@ save_plots <- function(file_name, plot_list, output_dir) {
|
|||||||
process_strains <- function(df, l_within_2sd_k, strain) {
|
process_strains <- function(df, l_within_2sd_k, strain) {
|
||||||
df_strains <- data.frame() # Initialize an empty dataframe to store results
|
df_strains <- data.frame() # Initialize an empty dataframe to store results
|
||||||
|
|
||||||
|
print(names(l_within_2sd_k))
|
||||||
|
|
||||||
for (concentration in unique(df$conc_num)) {
|
for (concentration in unique(df$conc_num)) {
|
||||||
df_temp <- df %>% filter(conc_num == concentration)
|
df_temp <- df %>% filter(conc_num == concentration)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (concentration > 0) {
|
if (concentration > 0) {
|
||||||
max_l_theoretical <- l_within_2sd_k %>%
|
max_l_theoretical <- l_within_2sd_k %>%
|
||||||
filter(conc_num_factor == concentration) %>%
|
filter(conc_num_factor == concentration) %>%
|
||||||
pull(L_max)
|
pull(L_max)
|
||||||
|
|
||||||
df_temp <- df_temp %>%
|
df_temp <- df_temp %>%
|
||||||
mutate(
|
mutate(
|
||||||
L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L),
|
L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L), # Replace zero values with max theoretical
|
||||||
SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM),
|
SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM), # Set SM flag
|
||||||
L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L)
|
L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L) # Cap L values
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
df_strains <- bind_rows(df_strains, df_temp) # Append the results of this concentration
|
df_strains <- bind_rows(df_strains, df_temp) # Append the results of this concentration
|
||||||
@@ -298,6 +298,7 @@ process_strains <- function(df, l_within_2sd_k, strain) {
|
|||||||
return(df_strains)
|
return(df_strains)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
|
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
|
||||||
|
|
||||||
# Pull the background means
|
# Pull the background means
|
||||||
@@ -536,13 +537,9 @@ main <- function() {
|
|||||||
df_above_tolerance <- df %>% filter(DB == 1)
|
df_above_tolerance <- df %>% filter(DB == 1)
|
||||||
|
|
||||||
# Set vars above the delta background tolerance to NA
|
# Set vars above the delta background tolerance to NA
|
||||||
df_na <- df %>%
|
df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x)))
|
||||||
mutate(
|
|
||||||
L = ifelse(DB == 1, NA, L),
|
# Remove rows where L is 0
|
||||||
r = ifelse(DB == 1, NA, r),
|
|
||||||
AUC = ifelse(DB == 1, NA, AUC),
|
|
||||||
K = ifelse(DB == 1, NA, K)
|
|
||||||
)
|
|
||||||
df_no_zeros <- df_na %>% filter(L > 0)
|
df_no_zeros <- df_na %>% filter(L > 0)
|
||||||
|
|
||||||
# Flag and remove non-finite data, printing affected rows
|
# Flag and remove non-finite data, printing affected rows
|
||||||
@@ -551,18 +548,18 @@ main <- function() {
|
|||||||
{
|
{
|
||||||
if (nrow(.) > 0) {
|
if (nrow(.) > 0) {
|
||||||
message("Removing non-finite rows:\n")
|
message("Removing non-finite rows:\n")
|
||||||
#print(.)
|
print(head(., n = 10))
|
||||||
}
|
}
|
||||||
df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging
|
df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging
|
||||||
}
|
}
|
||||||
|
|
||||||
# Generate QC PDFs and HTMLs
|
# Generate QC PDFs and HTMLs
|
||||||
message("Generating QC plots")
|
# message("Generating QC plots")
|
||||||
variables <- c("L", "K", "r", "AUC", "delta_bg")
|
# variables <- c("L", "K", "r", "AUC", "delta_bg")
|
||||||
generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
|
# generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
|
||||||
generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
|
# generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
|
||||||
generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
|
# generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
|
||||||
generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
|
# generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
|
||||||
|
|
||||||
rm(df, df_above_tolerance, df_no_zeros)
|
rm(df, df_above_tolerance, df_no_zeros)
|
||||||
|
|
||||||
@@ -585,12 +582,37 @@ main <- function() {
|
|||||||
|
|
||||||
# Calculate summary statistics for L within and outside 2SD of K
|
# Calculate summary statistics for L within and outside 2SD of K
|
||||||
message("Calculating summary statistics for L within 2SD of K")
|
message("Calculating summary statistics for L within 2SD of K")
|
||||||
|
|
||||||
l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||||
|
|
||||||
|
cols_to_remove <- names(l_within_2sd_k)
|
||||||
|
cols_to_keep <- c("conc_num", "conc_num_factor")
|
||||||
|
|
||||||
|
within_2sd_k_clean <- within_2sd_k %>%
|
||||||
|
select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
|
||||||
|
|
||||||
|
l_within_2sd_k_joined <- within_2sd_k_clean %>%
|
||||||
|
left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#l_within_2sd_k_joined <- merge(within_2sd_k, l_within_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE)
|
||||||
|
print("within_2sd_k")
|
||||||
|
print(head(within_2sd_k))
|
||||||
|
print("l_within_2sd_k")
|
||||||
|
print(head(l_within_2sd_k))
|
||||||
|
print("l_within_2sd_k_joined")
|
||||||
|
print(head(l_within_2sd_k_joined))
|
||||||
|
|
||||||
|
quit()
|
||||||
|
|
||||||
write.csv(l_within_2sd_k,
|
write.csv(l_within_2sd_k,
|
||||||
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
|
file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
message("Calculating summary statistics for for L outside 2SD of K")
|
message("Calculating summary statistics for for L outside 2SD of K")
|
||||||
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||||
|
l_outside_2sd_k_joined <- merge(outside_2sd_k, l_outside_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE)
|
||||||
write.csv(l_outside_2sd_k,
|
write.csv(l_outside_2sd_k,
|
||||||
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
|
file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
@@ -615,11 +637,11 @@ main <- function() {
|
|||||||
|
|
||||||
# Recalculate summary statistics for the background strain
|
# Recalculate summary statistics for the background strain
|
||||||
message("Calculating summary statistics for background strain")
|
message("Calculating summary statistics for background strain")
|
||||||
stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
|
stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
|
||||||
write.csv(stats_bg,
|
write.csv(stats_bg,
|
||||||
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
|
stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor"))
|
||||||
|
|
||||||
# Filter reference and deletion strains
|
# Filter reference and deletion strains
|
||||||
# Formerly X2_RF (reference strain)
|
# Formerly X2_RF (reference strain)
|
||||||
@@ -633,9 +655,9 @@ main <- function() {
|
|||||||
mutate(SM = 0)
|
mutate(SM = 0)
|
||||||
|
|
||||||
message("Processing reference strain")
|
message("Processing reference strain")
|
||||||
reference_strain <- process_strains(df_reference, l_within_2sd_k, strain)
|
reference_strain <- process_strains(df_reference, l_within_2sd_k_joined, strain)
|
||||||
message("Processing deletion strains")
|
message("Processing deletion strains")
|
||||||
deletion_strains <- process_strains(df_deletion, l_within_2sd_k, strain)
|
deletion_strains <- process_strains(df_deletion, l_within_2sd_k_joined, strain)
|
||||||
|
|
||||||
# TODO we may need to add "num" to grouping vars
|
# TODO we may need to add "num" to grouping vars
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user