Improve df grouping
This commit is contained in:
@@ -130,13 +130,14 @@ load_and_process_data <- function(easy_results_file, sd = 3) {
|
|||||||
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
|
DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0),
|
||||||
SM = 0,
|
SM = 0,
|
||||||
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
|
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded?
|
||||||
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
|
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc))
|
||||||
conc_num_factor = as.factor(conc_num)
|
) %>%
|
||||||
# conc_num_factor = factor(conc_num, levels = sort(unique(conc_num)))
|
mutate(
|
||||||
|
conc_num_factor = as.factor(match(conc_num, sort(unique(conc_num))) - 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
return(df)
|
return(df)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Update Gene names using the SGD gene list
|
# Update Gene names using the SGD gene list
|
||||||
update_gene_names <- function(df, sgd_gene_list) {
|
update_gene_names <- function(df, sgd_gene_list) {
|
||||||
@@ -160,15 +161,6 @@ update_gene_names <- function(df, sgd_gene_list) {
|
|||||||
|
|
||||||
calculate_summary_stats <- function(df, variables, group_vars) {
|
calculate_summary_stats <- function(df, variables, group_vars) {
|
||||||
|
|
||||||
|
|
||||||
summary_stats <- df %>%
|
|
||||||
group_by(across(all_of(group_vars))) %>%
|
|
||||||
summarise(
|
|
||||||
N = n(),
|
|
||||||
sd_check = sd(L, na.rm = TRUE), # Test sd on a specific variable, e.g., L
|
|
||||||
se_check = sd(L, na.rm = TRUE) / sqrt(N) # Test se on a specific variable, e.g., L
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_stats <- df %>%
|
summary_stats <- df %>%
|
||||||
group_by(across(all_of(group_vars))) %>%
|
group_by(across(all_of(group_vars))) %>%
|
||||||
summarise(
|
summarise(
|
||||||
@@ -184,9 +176,6 @@ calculate_summary_stats <- function(df, variables, group_vars) {
|
|||||||
.groups = "drop"
|
.groups = "drop"
|
||||||
)
|
)
|
||||||
|
|
||||||
# sd = ~sd(., na.rm = TRUE)
|
|
||||||
# se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1)
|
|
||||||
|
|
||||||
# Create a cleaned version of df that doesn't overlap with summary_stats
|
# Create a cleaned version of df that doesn't overlap with summary_stats
|
||||||
cols_to_keep <- setdiff(names(df), names(summary_stats)[-which(names(summary_stats) %in% group_vars)])
|
cols_to_keep <- setdiff(names(df), names(summary_stats)[-which(names(summary_stats) %in% group_vars)])
|
||||||
df_cleaned <- df %>%
|
df_cleaned <- df %>%
|
||||||
@@ -219,11 +208,11 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) {
|
|||||||
|
|
||||||
stats <- calculate_summary_stats(df,
|
stats <- calculate_summary_stats(df,
|
||||||
variables = variables,
|
variables = variables,
|
||||||
group_vars = c("OrfRep", "Gene", "num", "conc_num_factor"
|
group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
|
||||||
))$summary_stats
|
)$summary_stats
|
||||||
|
|
||||||
stats <- df %>%
|
stats <- df %>%
|
||||||
group_by(OrfRep, Gene, num) %>%
|
group_by(across(all_of(group_vars))) %>%
|
||||||
mutate(
|
mutate(
|
||||||
WT_L = mean_L,
|
WT_L = mean_L,
|
||||||
WT_K = mean_K,
|
WT_K = mean_K,
|
||||||
@@ -277,13 +266,13 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) {
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Calculate linear models
|
# Calculate linear models
|
||||||
lm_L <- lm(Delta_L ~ conc_num_factor, data = stats)
|
lm_L <- lm(Delta_L ~ conc_num, data = stats)
|
||||||
lm_K <- lm(Delta_K ~ conc_num_factor, data = stats)
|
lm_K <- lm(Delta_K ~ conc_num, data = stats)
|
||||||
lm_r <- lm(Delta_r ~ conc_num_factor, data = stats)
|
lm_r <- lm(Delta_r ~ conc_num, data = stats)
|
||||||
lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = stats)
|
lm_AUC <- lm(Delta_AUC ~ conc_num, data = stats)
|
||||||
|
|
||||||
interactions <- stats %>%
|
interactions <- stats %>%
|
||||||
group_by(OrfRep, Gene, num) %>%
|
group_by(across(all_of(group_vars))) %>%
|
||||||
summarise(
|
summarise(
|
||||||
OrfRep = first(OrfRep),
|
OrfRep = first(OrfRep),
|
||||||
Gene = first(Gene),
|
Gene = first(Gene),
|
||||||
@@ -357,10 +346,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) {
|
|||||||
"NG", "SM", "DB")
|
"NG", "SM", "DB")
|
||||||
|
|
||||||
calculations_joined <- df %>% select(-any_of(setdiff(names(calculations), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))))
|
calculations_joined <- df %>% select(-any_of(setdiff(names(calculations), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))))
|
||||||
calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num_factor"))
|
calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
|
||||||
|
|
||||||
interactions_joined <- df %>% select(-any_of(setdiff(names(interactions), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))))
|
interactions_joined <- df %>% select(-any_of(setdiff(names(interactions), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))))
|
||||||
interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num_factor"))
|
interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
|
||||||
|
|
||||||
return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined,
|
return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined,
|
||||||
calculations_joined = calculations_joined))
|
calculations_joined = calculations_joined))
|
||||||
@@ -1059,7 +1048,7 @@ main <- function() {
|
|||||||
ss <- calculate_summary_stats(
|
ss <- calculate_summary_stats(
|
||||||
df = df,
|
df = df,
|
||||||
variables = summary_vars,
|
variables = summary_vars,
|
||||||
group_vars = c("OrfRep", "conc_num_factor"))
|
group_vars = c("conc_num", "conc_num_factor"))
|
||||||
df_stats <- ss$df_with_stats
|
df_stats <- ss$df_with_stats
|
||||||
message("Filtering non-finite data")
|
message("Filtering non-finite data")
|
||||||
df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE)
|
df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE)
|
||||||
@@ -1068,7 +1057,7 @@ main <- function() {
|
|||||||
ss <- calculate_summary_stats(
|
ss <- calculate_summary_stats(
|
||||||
df = df_na,
|
df = df_na,
|
||||||
variables = summary_vars,
|
variables = summary_vars,
|
||||||
group_vars = c("OrfRep", "conc_num_factor"))
|
group_vars = c("conc_num", "conc_num_factor"))
|
||||||
df_na_ss <- ss$summary_stats
|
df_na_ss <- ss$summary_stats
|
||||||
df_na_stats <- ss$df_with_stats
|
df_na_stats <- ss$df_with_stats
|
||||||
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
|
||||||
@@ -1078,7 +1067,7 @@ main <- function() {
|
|||||||
ss <- calculate_summary_stats(
|
ss <- calculate_summary_stats(
|
||||||
df = df_no_zeros,
|
df = df_no_zeros,
|
||||||
variables = summary_vars,
|
variables = summary_vars,
|
||||||
group_vars = c("OrfRep", "conc_num_factor"))
|
group_vars = c("conc_num", "conc_num_factor"))
|
||||||
df_no_zeros_stats <- ss$df_with_stats
|
df_no_zeros_stats <- ss$df_with_stats
|
||||||
df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE)
|
df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE)
|
||||||
|
|
||||||
@@ -1090,14 +1079,14 @@ main <- function() {
|
|||||||
|
|
||||||
message("Calculating summary statistics for L within 2SD of K")
|
message("Calculating summary statistics for L within 2SD of K")
|
||||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num_factor"))
|
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||||
# df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
# df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||||
write.csv(ss$summary_stats,
|
write.csv(ss$summary_stats,
|
||||||
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"),
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
message("Calculating summary statistics for L outside 2SD of K")
|
message("Calculating summary statistics for L outside 2SD of K")
|
||||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num_factor"))
|
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||||
write.csv(ss$summary_stats,
|
write.csv(ss$summary_stats,
|
||||||
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
|
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"),
|
||||||
@@ -1272,7 +1261,7 @@ main <- function() {
|
|||||||
# Set the missing values to the highest theoretical value at each drug conc for L
|
# Set the missing values to the highest theoretical value at each drug conc for L
|
||||||
# Leave other values as 0 for the max/min
|
# Leave other values as 0 for the max/min
|
||||||
reference_strain <- df_reference %>%
|
reference_strain <- df_reference %>%
|
||||||
group_by(conc_num_factor) %>%
|
group_by(conc_num, conc_num_factor) %>%
|
||||||
mutate(
|
mutate(
|
||||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||||
@@ -1282,7 +1271,7 @@ main <- function() {
|
|||||||
|
|
||||||
# Ditto for deletion strains
|
# Ditto for deletion strains
|
||||||
deletion_strains <- df_deletion %>%
|
deletion_strains <- df_deletion %>%
|
||||||
group_by(conc_num_factor) %>%
|
group_by(conc_num, conc_num_factor) %>%
|
||||||
mutate(
|
mutate(
|
||||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||||
|
|||||||
Reference in New Issue
Block a user