|
@@ -134,10 +134,9 @@ load_and_process_data <- function(easy_results_file, sd = 3) {
|
|
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
|
|
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
|
|
SM = 0,
|
|
SM = 0,
|
|
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
|
|
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
|
|
- conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc))
|
|
|
|
- ) %>%
|
|
|
|
- mutate(
|
|
|
|
- conc_num_factor = as.factor(match(conc_num, sort(unique(conc_num))) - 1)
|
|
|
|
|
|
+ conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
|
|
|
|
+ conc_num_factor = factor(as.numeric(factor(conc_num)) - 1),
|
|
|
|
+ conc_num_factor_num = as.numeric(conc_num_factor)
|
|
)
|
|
)
|
|
|
|
|
|
return(df)
|
|
return(df)
|
|
@@ -169,8 +168,7 @@ calculate_summary_stats <- function(df, variables, group_vars) {
|
|
group_by(across(all_of(group_vars))) %>%
|
|
group_by(across(all_of(group_vars))) %>%
|
|
summarise(
|
|
summarise(
|
|
N = n(),
|
|
N = n(),
|
|
- across(
|
|
|
|
- all_of(variables),
|
|
|
|
|
|
+ across(all_of(variables),
|
|
list(
|
|
list(
|
|
mean = ~mean(., na.rm = TRUE),
|
|
mean = ~mean(., na.rm = TRUE),
|
|
median = ~median(., na.rm = TRUE),
|
|
median = ~median(., na.rm = TRUE),
|
|
@@ -208,10 +206,10 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, variables = c("
|
|
num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
|
|
num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1,
|
|
|
|
|
|
# Calculate raw data
|
|
# Calculate raw data
|
|
- Raw_Shift_L = first(mean_L) - bg_stats$L,
|
|
|
|
- Raw_Shift_K = first(mean_K) - bg_stats$K,
|
|
|
|
- Raw_Shift_r = first(mean_r) - bg_stats$r,
|
|
|
|
- Raw_Shift_AUC = first(mean_AUC) - bg_stats$AUC,
|
|
|
|
|
|
+ Raw_Shift_L = first(mean_L) - bg_stats$mean_L,
|
|
|
|
+ Raw_Shift_K = first(mean_K) - bg_stats$mean_K,
|
|
|
|
+ Raw_Shift_r = first(mean_r) - bg_stats$mean_r,
|
|
|
|
+ Raw_Shift_AUC = first(mean_AUC) - bg_stats$mean_AUC,
|
|
Z_Shift_L = first(Raw_Shift_L) / bg_stats$sd_L,
|
|
Z_Shift_L = first(Raw_Shift_L) / bg_stats$sd_L,
|
|
Z_Shift_K = first(Raw_Shift_K) / bg_stats$sd_K,
|
|
Z_Shift_K = first(Raw_Shift_K) / bg_stats$sd_K,
|
|
Z_Shift_r = first(Raw_Shift_r) / bg_stats$sd_r,
|
|
Z_Shift_r = first(Raw_Shift_r) / bg_stats$sd_r,
|
|
@@ -237,10 +235,10 @@ calculate_interaction_scores <- function(df, max_conc, bg_stats, variables = c("
|
|
Zscore_AUC = Delta_AUC / WT_sd_AUC,
|
|
Zscore_AUC = Delta_AUC / WT_sd_AUC,
|
|
|
|
|
|
# Fit linear models and store in list-columns
|
|
# Fit linear models and store in list-columns
|
|
- gene_lm_L = list(lm(Delta_L ~ conc_num_factor, data = pick(everything()))),
|
|
|
|
- gene_lm_K = list(lm(Delta_K ~ conc_num_factor, data = pick(everything()))),
|
|
|
|
- gene_lm_r = list(lm(Delta_r ~ conc_num_factor, data = pick(everything()))),
|
|
|
|
- gene_lm_AUC = list(lm(Delta_AUC ~ conc_num_factor, data = pick(everything()))),
|
|
|
|
|
|
+ gene_lm_L = list(lm(Delta_L ~ conc_num_factor_num, data = pick(everything()))),
|
|
|
|
+ gene_lm_K = list(lm(Delta_K ~ conc_num_factor_num, data = pick(everything()))),
|
|
|
|
+ gene_lm_r = list(lm(Delta_r ~ conc_num_factor_num, data = pick(everything()))),
|
|
|
|
+ gene_lm_AUC = list(lm(Delta_AUC ~ conc_num_factor_num, data = pick(everything()))),
|
|
|
|
|
|
# Extract coefficients using purrr::map_dbl
|
|
# Extract coefficients using purrr::map_dbl
|
|
lm_intercept_L = map_dbl(gene_lm_L, ~ coef(.x)[1]),
|
|
lm_intercept_L = map_dbl(gene_lm_L, ~ coef(.x)[1]),
|
|
@@ -1040,7 +1038,7 @@ main <- function() {
|
|
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
|
|
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
|
|
|
|
|
|
# Save some constants
|
|
# Save some constants
|
|
- max_conc <- max(as.numeric(df$conc_num_factor))
|
|
|
|
|
|
+ max_conc <- max(df$conc_num_factor_num)
|
|
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
|
l_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
|
|
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
k_half_median <- (median(df_above_tolerance$K, na.rm = TRUE)) / 2
|
|
|
|
|
|
@@ -1050,7 +1048,6 @@ main <- function() {
|
|
variables = summary_vars,
|
|
variables = summary_vars,
|
|
group_vars = c("conc_num", "conc_num_factor"))$df_with_stats
|
|
group_vars = c("conc_num", "conc_num_factor"))$df_with_stats
|
|
message("Filtering non-finite data")
|
|
message("Filtering non-finite data")
|
|
- # df_filtered_stats <- process_data(df_stats, c("L"), filter_nf = TRUE)
|
|
|
|
|
|
|
|
message("Calculating summary statistics after quality control")
|
|
message("Calculating summary statistics after quality control")
|
|
ss <- calculate_summary_stats(
|
|
ss <- calculate_summary_stats(
|
|
@@ -1060,9 +1057,7 @@ main <- function() {
|
|
df_na_ss <- ss$summary_stats
|
|
df_na_ss <- ss$summary_stats
|
|
df_na_stats <- ss$df_with_stats
|
|
df_na_stats <- ss$df_with_stats
|
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
|
- # df_na_filtered_stats <- process_data(df_na_stats, c("L"), filter_nf = TRUE)
|
|
|
|
|
|
|
|
- # Create background (WT) data columns
|
|
|
|
df_na_stats <- df_na_stats %>%
|
|
df_na_stats <- df_na_stats %>%
|
|
mutate(
|
|
mutate(
|
|
WT_L = mean_L,
|
|
WT_L = mean_L,
|
|
@@ -1079,10 +1074,10 @@ main <- function() {
|
|
bg_stats <- df_na_stats %>%
|
|
bg_stats <- df_na_stats %>%
|
|
filter(conc_num == 0) %>%
|
|
filter(conc_num == 0) %>%
|
|
summarise(
|
|
summarise(
|
|
- L = first(mean_L),
|
|
|
|
- K = first(mean_K),
|
|
|
|
- r = first(mean_r),
|
|
|
|
- AUC = first(mean_AUC),
|
|
|
|
|
|
+ mean_L = first(mean_L),
|
|
|
|
+ mean_K = first(mean_K),
|
|
|
|
+ mean_r = first(mean_r),
|
|
|
|
+ mean_AUC = first(mean_AUC),
|
|
sd_L = first(sd_L),
|
|
sd_L = first(sd_L),
|
|
sd_K = first(sd_K),
|
|
sd_K = first(sd_K),
|
|
sd_r = first(sd_r),
|
|
sd_r = first(sd_r),
|
|
@@ -1090,12 +1085,11 @@ main <- function() {
|
|
)
|
|
)
|
|
|
|
|
|
message("Calculating summary statistics after quality control excluding zero values")
|
|
message("Calculating summary statistics after quality control excluding zero values")
|
|
- ss <- calculate_summary_stats(
|
|
|
|
|
|
+ df_no_zeros_stats <- calculate_summary_stats(
|
|
df = df_no_zeros,
|
|
df = df_no_zeros,
|
|
variables = summary_vars,
|
|
variables = summary_vars,
|
|
- group_vars = c("conc_num", "conc_num_factor"))
|
|
|
|
- df_no_zeros_stats <- ss$df_with_stats
|
|
|
|
- # df_no_zeros_filtered_stats <- process_data(df_no_zeros_stats, c("L"), filter_nf = TRUE)
|
|
|
|
|
|
+ group_vars = c("conc_num", "conc_num_factor")
|
|
|
|
+ )$df_with_stats
|
|
|
|
|
|
message("Filtering by 2SD of K")
|
|
message("Filtering by 2SD of K")
|
|
df_na_within_2sd_k <- df_na_stats %>%
|
|
df_na_within_2sd_k <- df_na_stats %>%
|
|
@@ -1105,9 +1099,8 @@ main <- function() {
|
|
|
|
|
|
message("Calculating summary statistics for L within 2SD of K")
|
|
message("Calculating summary statistics for L within 2SD of K")
|
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
|
- ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
|
|
|
- # df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
|
|
|
- write.csv(ss$summary_stats,
|
|
|
|
|
|
+ ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))$summary_stats
|
|
|
|
+ write.csv(ss,
|
|
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
|
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
|
row.names = FALSE)
|
|
row.names = FALSE)
|
|
|
|
|
|
@@ -1293,28 +1286,24 @@ main <- function() {
|
|
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
|
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
|
row.names = FALSE)
|
|
row.names = FALSE)
|
|
|
|
|
|
- # Filter reference and deletion strains
|
|
|
|
- df_reference <- df_na_stats %>% # formerly X2_RF
|
|
|
|
- filter(OrfRep == strain) %>%
|
|
|
|
- mutate(SM = 0)
|
|
|
|
-
|
|
|
|
- df_deletion <- df_na_stats %>% # formerly X2
|
|
|
|
- filter(OrfRep != strain) %>%
|
|
|
|
- mutate(SM = 0)
|
|
|
|
-
|
|
|
|
# Set the missing values to the highest theoretical value at each drug conc for L
|
|
# Set the missing values to the highest theoretical value at each drug conc for L
|
|
# Leave other values as 0 for the max/min
|
|
# Leave other values as 0 for the max/min
|
|
- reference_strain <- df_reference %>%
|
|
|
|
|
|
+ df_reference <- df_na_stats %>% # formerly X2_RF
|
|
|
|
+ filter(OrfRep == strain) %>%
|
|
|
|
+ filter(!is.na(L)) %>%
|
|
group_by(conc_num, conc_num_factor) %>%
|
|
group_by(conc_num, conc_num_factor) %>%
|
|
mutate(
|
|
mutate(
|
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
|
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
|
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
|
- SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
|
|
|
|
|
|
+ SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, 0),
|
|
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
|
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
|
ungroup()
|
|
ungroup()
|
|
|
|
|
|
# Ditto for deletion strains
|
|
# Ditto for deletion strains
|
|
- deletion_strains <- df_deletion %>%
|
|
|
|
|
|
+ df_deletion <- df_na_stats %>% # formerly X2
|
|
|
|
+ filter(OrfRep != strain) %>%
|
|
|
|
+ filter(!is.na(L)) %>%
|
|
|
|
+ mutate(SM = 0) %>%
|
|
group_by(conc_num, conc_num_factor) %>%
|
|
group_by(conc_num, conc_num_factor) %>%
|
|
mutate(
|
|
mutate(
|
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
|
@@ -1325,7 +1314,7 @@ main <- function() {
|
|
|
|
|
|
message("Calculating reference strain interaction scores")
|
|
message("Calculating reference strain interaction scores")
|
|
df_reference_stats <- calculate_summary_stats(
|
|
df_reference_stats <- calculate_summary_stats(
|
|
- df = reference_strain,
|
|
|
|
|
|
+ df = df_reference,
|
|
variables = interaction_vars,
|
|
variables = interaction_vars,
|
|
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
|
|
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
|
|
)$df_with_stats
|
|
)$df_with_stats
|
|
@@ -1336,7 +1325,7 @@ main <- function() {
|
|
|
|
|
|
message("Calculating deletion strain(s) interactions scores")
|
|
message("Calculating deletion strain(s) interactions scores")
|
|
df_deletion_stats <- calculate_summary_stats(
|
|
df_deletion_stats <- calculate_summary_stats(
|
|
- df = deletion_strains,
|
|
|
|
|
|
+ df = df_deletion,
|
|
variables = interaction_vars,
|
|
variables = interaction_vars,
|
|
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
|
|
group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor")
|
|
)$df_with_stats
|
|
)$df_with_stats
|