Left join background data to calculations df
This commit is contained in:
@@ -216,7 +216,16 @@ calculate_summary_stats <- function(df, variables, group_vars) {
|
|||||||
return(list(summary_stats = summary_stats, df_with_stats = df_joined))
|
return(list(summary_stats = summary_stats, df_with_stats = df_joined))
|
||||||
}
|
}
|
||||||
|
|
||||||
calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_threshold = 2) {
|
calculate_interaction_scores <- function(df, bg_df, group_vars, overlap_threshold = 2) {
|
||||||
|
|
||||||
|
bg_df_selected <- bg_df %>%
|
||||||
|
select(OrfRep, conc_num, conc_num_factor, conc_num_factor_factor,
|
||||||
|
mean_L, mean_K, mean_r, mean_AUC, sd_L, sd_K, sd_r, sd_AUC
|
||||||
|
)
|
||||||
|
|
||||||
|
df <- df %>%
|
||||||
|
left_join(bg_df_selected, by = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"),
|
||||||
|
suffix = c("", "_bg"))
|
||||||
|
|
||||||
# Calculate total concentration variables
|
# Calculate total concentration variables
|
||||||
total_conc_num <- length(unique(df$conc_num))
|
total_conc_num <- length(unique(df$conc_num))
|
||||||
@@ -230,15 +239,25 @@ calculate_interaction_scores <- function(df, bg_stats, group_vars, overlap_thres
|
|||||||
SM = sum(SM, na.rm = TRUE),
|
SM = sum(SM, na.rm = TRUE),
|
||||||
num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
|
num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
|
||||||
|
|
||||||
|
# Assign WT values from the background data
|
||||||
|
WT_L = mean_L_bg,
|
||||||
|
WT_K = mean_K_bg,
|
||||||
|
WT_r = mean_r_bg,
|
||||||
|
WT_AUC = mean_AUC_bg,
|
||||||
|
WT_sd_L = sd_L_bg,
|
||||||
|
WT_sd_K = sd_K_bg,
|
||||||
|
WT_sd_r = sd_r_bg,
|
||||||
|
WT_sd_AUC = sd_AUC_bg,
|
||||||
|
|
||||||
# Calculate raw data
|
# Calculate raw data
|
||||||
Raw_Shift_L = first(mean_L) - bg_stats$mean_L,
|
Raw_Shift_L = first(mean_L) - first(mean_L_bg),
|
||||||
Raw_Shift_K = first(mean_K) - bg_stats$mean_K,
|
Raw_Shift_K = first(mean_K) - first(mean_K_bg),
|
||||||
Raw_Shift_r = first(mean_r) - bg_stats$mean_r,
|
Raw_Shift_r = first(mean_r) - first(mean_r_bg),
|
||||||
Raw_Shift_AUC = first(mean_AUC) - bg_stats$mean_AUC,
|
Raw_Shift_AUC = first(mean_AUC) - first(mean_AUC_bg),
|
||||||
Z_Shift_L = Raw_Shift_L / bg_stats$sd_L,
|
Z_Shift_L = Raw_Shift_L / first(sd_L_bg),
|
||||||
Z_Shift_K = Raw_Shift_K / bg_stats$sd_K,
|
Z_Shift_K = Raw_Shift_K / first(sd_K_bg),
|
||||||
Z_Shift_r = Raw_Shift_r / bg_stats$sd_r,
|
Z_Shift_r = Raw_Shift_r / first(sd_r_bg),
|
||||||
Z_Shift_AUC = Raw_Shift_AUC / bg_stats$sd_AUC,
|
Z_Shift_AUC = Raw_Shift_AUC / first(sd_AUC_bg),
|
||||||
|
|
||||||
# Expected values
|
# Expected values
|
||||||
Exp_L = WT_L + Raw_Shift_L,
|
Exp_L = WT_L + Raw_Shift_L,
|
||||||
@@ -1073,59 +1092,12 @@ main <- function() {
|
|||||||
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
dir.create(out_dir_qc, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
|
||||||
|
# Each list of plots corresponds to a separate file
|
||||||
message("Loading and filtering data for experiment: ", exp_name)
|
message("Loading and filtering data for experiment: ", exp_name)
|
||||||
df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>%
|
df <- load_and_filter_data(args$easy_results_file, sd = exp_sd) %>%
|
||||||
update_gene_names(args$sgd_gene_list) %>%
|
update_gene_names(args$sgd_gene_list) %>%
|
||||||
as_tibble()
|
as_tibble()
|
||||||
|
|
||||||
message("Calculating summary statistics before quality control")
|
|
||||||
df_stats <- calculate_summary_stats(
|
|
||||||
df = df,
|
|
||||||
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
|
||||||
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats
|
|
||||||
|
|
||||||
message("Calculating summary statistics after quality control")
|
|
||||||
df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X
|
|
||||||
ss <- calculate_summary_stats(
|
|
||||||
df = df_na,
|
|
||||||
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
|
||||||
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
|
|
||||||
df_na_ss <- ss$summary_stats
|
|
||||||
df_na_stats <- ss$df_with_stats # formerly X_stats_ALL
|
|
||||||
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
||||||
# For plotting (ggplot warns on NAs)
|
|
||||||
df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite))
|
|
||||||
|
|
||||||
message("Calculating summary statistics after quality control excluding zero values")
|
|
||||||
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
|
|
||||||
df_no_zeros_stats <- calculate_summary_stats(
|
|
||||||
df = df_no_zeros,
|
|
||||||
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
|
||||||
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")
|
|
||||||
)$df_with_stats
|
|
||||||
|
|
||||||
message("Filtering by 2SD of K")
|
|
||||||
df_na_within_2sd_k <- df_na_stats %>%
|
|
||||||
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
|
||||||
df_na_outside_2sd_k <- df_na_stats %>%
|
|
||||||
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
|
||||||
|
|
||||||
message("Calculating summary statistics for L within 2SD of K")
|
|
||||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
|
||||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L",
|
|
||||||
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats
|
|
||||||
write.csv(ss,
|
|
||||||
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
|
||||||
row.names = FALSE)
|
|
||||||
|
|
||||||
message("Calculating summary statistics for L outside 2SD of K")
|
|
||||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
|
|
||||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
|
||||||
write.csv(ss$summary_stats,
|
|
||||||
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
|
|
||||||
row.names = FALSE)
|
|
||||||
|
|
||||||
# Each list of plots corresponds to a file
|
|
||||||
l_vs_k_plot_configs <- list(
|
l_vs_k_plot_configs <- list(
|
||||||
plots = list(
|
plots = list(
|
||||||
list(
|
list(
|
||||||
@@ -1142,6 +1114,12 @@ main <- function() {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
message("Calculating summary statistics before quality control")
|
||||||
|
df_stats <- calculate_summary_stats(
|
||||||
|
df = df,
|
||||||
|
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
||||||
|
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats
|
||||||
|
|
||||||
frequency_delta_bg_plot_configs <- list(
|
frequency_delta_bg_plot_configs <- list(
|
||||||
plots = list(
|
plots = list(
|
||||||
list(
|
list(
|
||||||
@@ -1171,7 +1149,9 @@ main <- function() {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
message("Filtering rows above delta background tolerance for plotting")
|
||||||
df_above_tolerance <- df %>% filter(DB == 1)
|
df_above_tolerance <- df %>% filter(DB == 1)
|
||||||
|
|
||||||
above_threshold_plot_configs <- list(
|
above_threshold_plot_configs <- list(
|
||||||
plots = list(
|
plots = list(
|
||||||
list(
|
list(
|
||||||
@@ -1197,6 +1177,49 @@ main <- function() {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
message("Setting rows above delta background tolerance to NA")
|
||||||
|
df_na <- df %>% mutate(across(all_of(c("L", "K", "r", "AUC", "delta_bg")), ~ ifelse(DB == 1, NA, .))) # formerly X
|
||||||
|
|
||||||
|
message("Calculating summary statistics across all strains")
|
||||||
|
ss <- calculate_summary_stats(
|
||||||
|
df = df_na,
|
||||||
|
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
||||||
|
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
|
||||||
|
df_na_ss <- ss$summary_stats
|
||||||
|
df_na_stats <- ss$df_with_stats # formerly X_stats_ALL
|
||||||
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||||
|
# This can help bypass missing values ggplot warnings during testing
|
||||||
|
df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite))
|
||||||
|
|
||||||
|
message("Calculating summary statistics excluding zero values")
|
||||||
|
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
|
||||||
|
df_no_zeros_stats <- calculate_summary_stats(
|
||||||
|
df = df_no_zeros,
|
||||||
|
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
||||||
|
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")
|
||||||
|
)$df_with_stats
|
||||||
|
|
||||||
|
message("Filtering by 2SD of K")
|
||||||
|
df_na_within_2sd_k <- df_na_stats %>%
|
||||||
|
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
||||||
|
df_na_outside_2sd_k <- df_na_stats %>%
|
||||||
|
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
||||||
|
|
||||||
|
message("Calculating summary statistics for L within 2SD of K")
|
||||||
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||||
|
ss <- calculate_summary_stats(df_na_within_2sd_k, "L",
|
||||||
|
group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats
|
||||||
|
write.csv(ss,
|
||||||
|
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
||||||
|
row.names = FALSE)
|
||||||
|
|
||||||
|
message("Calculating summary statistics for L outside 2SD of K")
|
||||||
|
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))
|
||||||
|
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||||
|
write.csv(ss$summary_stats,
|
||||||
|
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
|
||||||
|
row.names = FALSE)
|
||||||
|
|
||||||
plate_analysis_plot_configs <- generate_plate_analysis_plot_configs(
|
plate_analysis_plot_configs <- generate_plate_analysis_plot_configs(
|
||||||
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
variables = c("L", "K", "r", "AUC", "delta_bg"),
|
||||||
df_before = df_stats,
|
df_before = df_stats,
|
||||||
@@ -1303,7 +1326,6 @@ main <- function() {
|
|||||||
|
|
||||||
bg_strains <- c("YDL227C")
|
bg_strains <- c("YDL227C")
|
||||||
lapply(bg_strains, function(strain) {
|
lapply(bg_strains, function(strain) {
|
||||||
|
|
||||||
message("Processing background strain: ", strain)
|
message("Processing background strain: ", strain)
|
||||||
|
|
||||||
# Handle missing data by setting zero values to NA
|
# Handle missing data by setting zero values to NA
|
||||||
@@ -1318,19 +1340,18 @@ main <- function() {
|
|||||||
) %>%
|
) %>%
|
||||||
filter(!is.na(L))
|
filter(!is.na(L))
|
||||||
|
|
||||||
# Recalculate summary statistics for the background strain
|
|
||||||
message("Calculating summary statistics for background strain")
|
message("Calculating summary statistics for background strain")
|
||||||
ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"),
|
ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"),
|
||||||
group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
|
group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"))
|
||||||
summary_stats_bg <- ss_bg$summary_stats
|
summary_stats_bg <- ss_bg$summary_stats
|
||||||
ss_bg_stats <- ss_bg$df_with_stats
|
df_bg_stats <- ss_bg$df_with_stats
|
||||||
write.csv(summary_stats_bg,
|
write.csv(
|
||||||
|
summary_stats_bg,
|
||||||
file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")),
|
file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")),
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
# Set the missing values to the highest theoretical value at each drug conc for L
|
message("Setting missing reference values to the highest theoretical value at each drug conc for L")
|
||||||
# Leave other values as 0 for the max/min
|
df_reference <- df_na_stats %>% # formerly X2_RF
|
||||||
df_reference <- df_bg_stats %>% # formerly X2_RF
|
|
||||||
filter(OrfRep == strain) %>%
|
filter(OrfRep == strain) %>%
|
||||||
filter(!is.na(L)) %>%
|
filter(!is.na(L)) %>%
|
||||||
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
|
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
|
||||||
@@ -1341,11 +1362,21 @@ main <- function() {
|
|||||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||||
ungroup()
|
ungroup()
|
||||||
|
|
||||||
# Ditto for deletion strains
|
message("Calculating reference strain interaction scores")
|
||||||
|
df_reference_stats <- calculate_summary_stats(
|
||||||
|
df = df_reference,
|
||||||
|
variables = c("L", "K", "r", "AUC"),
|
||||||
|
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
|
||||||
|
)$df_with_stats
|
||||||
|
reference_results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "num"))
|
||||||
|
zscore_calculations_reference <- reference_results$calculations
|
||||||
|
zscore_interactions_reference <- reference_results$interactions
|
||||||
|
zscore_interactions_reference_joined <- reference_results$full_data
|
||||||
|
|
||||||
|
message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
|
||||||
df_deletion <- df_na_stats %>% # formerly X2
|
df_deletion <- df_na_stats %>% # formerly X2
|
||||||
filter(OrfRep != strain) %>%
|
filter(OrfRep != strain) %>%
|
||||||
filter(!is.na(L)) %>%
|
filter(!is.na(L)) %>%
|
||||||
mutate(SM = 0) %>%
|
|
||||||
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
|
group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>%
|
||||||
mutate(
|
mutate(
|
||||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||||
@@ -1354,24 +1385,13 @@ main <- function() {
|
|||||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||||
ungroup()
|
ungroup()
|
||||||
|
|
||||||
message("Calculating reference strain interaction scores")
|
|
||||||
df_reference_stats <- calculate_summary_stats(
|
|
||||||
df = df_reference,
|
|
||||||
variables = c("L", "K", "r", "AUC"),
|
|
||||||
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
|
|
||||||
)$df_with_stats
|
|
||||||
reference_results <- calculate_interaction_scores(df_reference_stats, bg_stats, group_vars = c("OrfRep", "Gene", "num"))
|
|
||||||
zscore_calculations_reference <- reference_results$calculations
|
|
||||||
zscore_interactions_reference <- reference_results$interactions
|
|
||||||
zscore_interactions_reference_joined <- reference_results$full_data
|
|
||||||
|
|
||||||
message("Calculating deletion strain(s) interactions scores")
|
message("Calculating deletion strain(s) interactions scores")
|
||||||
df_deletion_stats <- calculate_summary_stats(
|
df_deletion_stats <- calculate_summary_stats(
|
||||||
df = df_deletion,
|
df = df_deletion,
|
||||||
variables = c("L", "K", "r", "AUC"),
|
variables = c("L", "K", "r", "AUC"),
|
||||||
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
|
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
|
||||||
)$df_with_stats
|
)$df_with_stats
|
||||||
deletion_results <- calculate_interaction_scores(df_deletion_stats, bg_stats, group_vars = c("OrfRep", "Gene"))
|
deletion_results <- calculate_interaction_scores(df_deletion_stats, df_bg_stats, group_vars = c("OrfRep", "Gene"))
|
||||||
zscore_calculations <- deletion_results$calculations
|
zscore_calculations <- deletion_results$calculations
|
||||||
zscore_interactions <- deletion_results$interactions
|
zscore_interactions <- deletion_results$interactions
|
||||||
zscore_interactions_joined <- deletion_results$full_data
|
zscore_interactions_joined <- deletion_results$full_data
|
||||||
|
|||||||
Reference in New Issue
Block a user