Create and join dataframes in calculate_summary_stats

This commit is contained in:
2024-09-05 19:42:48 -04:00
parent 75ba5939a3
commit 3cfd324309

View File

@@ -112,21 +112,12 @@ scale_colour_publication <- function(...) {
load_and_process_data <- function(easy_results_file, sd = 3) {
df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
# Clean and convert columns to numeric where appropriate
df <- df %>%
filter(!(.[[1]] %in% c("", "Scan"))) %>%
filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>%
mutate(
Col = as.numeric(Col),
Row = as.numeric(Row),
num = as.numeric(Num.),
L = as.numeric(l),
K = as.numeric(K),
r = as.numeric(r),
scan = as.numeric(Scan),
AUC = as.numeric(AUC96),
last_bg = as.numeric(LstBackgrd),
first_bg = as.numeric(X1stBackgrd),
across(c(Col, Row, Num.), as.numeric),
across(c(L, K, r, scan, AUC96, LstBackgrd, X1stBackgrd), as.numeric),
delta_bg = last_bg - first_bg,
delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)),
NG = if_else(L == 0 & !is.na(L), 1, 0),
@@ -134,11 +125,13 @@ load_and_process_data <- function(easy_results_file, sd = 3) {
SM = 0,
OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep),
conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
conc_num_factor = as.numeric(as.factor(conc_num)) - 1)
conc_num_factor = as.numeric(as.factor(conc_num)) - 1
)
return(df)
}
# Update Gene names using the SGD gene list
update_gene_names <- function(df, sgd_gene_list) {
# Load SGD gene list
@@ -189,6 +182,7 @@ process_strains <- function(df) {
# Calculate summary statistics for all variables
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
# Generate summary statistics
summary_stats <- df %>%
group_by(across(all_of(group_vars))) %>%
reframe(across(all_of(variables), list(
@@ -198,13 +192,20 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), # Return NA if all values are NA
min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), # Return NA if all values are NA
sd = ~sd(., na.rm = TRUE), # Standard deviation ignoring NAs
se = ~ifelse(.N > 1, sd(., na.rm = TRUE) / sqrt(.N - 1), NA), # Standard Error using precomputed N
z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA, (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score
se = ~ifelse(N > 1, sd(., na.rm = TRUE) / sqrt(N - 1), NA) # Standard Error using precomputed N
# TODO: not in original stats but better to do here than in calculate_interactions?
# z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA,
# (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score
), .names = "{.fn}_{.col}"))
return(summary_stats)
# Join the summary stats back to the original dataframe
df_with_stats <- left_join(df, summary_stats, by = group_vars)
# Return both the summary stats and the updated dataframe
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
}
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
# Calculate total concentration variables
@@ -215,7 +216,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
print("Calculating background means")
bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first()
bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first()
bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first()
bg_r <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first()
bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first()
@@ -223,57 +224,43 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
bg_sd_r <- df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first()
bg_sd_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
# Calculate summary statistics and shifts
# Calculate interaction scores
print("Calculating interaction scores part 1")
interaction_scores <- df %>%
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
summarise(
NG = sum(NG),
DB = sum(DB),
SM = sum(SM)
mutate(
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE)
) %>%
summarise(across(all_of(variables), list(
mutate(across(all_of(variables), list(
N = ~sum(!is.na(.)), # Count of non-NA values
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
se = ~sd(., na.rm = TRUE) / sqrt(N - 1)
se = ~sd(., na.rm = TRUE) / sqrt(N - 1), # Standard error calculation
Raw_Shift = ~mean(., na.rm = TRUE) - case_when(
cur_column() == "L" ~ bg_L,
cur_column() == "K" ~ bg_K,
cur_column() == "r" ~ bg_r,
cur_column() == "AUC" ~ bg_AUC
),
Z_Shift = ~ Raw_Shift / case_when(
cur_column() == "L" ~ bg_sd_L,
cur_column() == "K" ~ bg_sd_K,
cur_column() == "r" ~ bg_sd_r,
cur_column() == "AUC" ~ bg_sd_AUC
),
WT = ~mean(., na.rm = TRUE),
WT_sd = ~sd(., na.rm = TRUE),
Exp = ~ WT + Raw_Shift,
Delta = ~ WT - Exp,
Delta = if_else(NG == 1, mean - WT, Delta)
), .names = "{.fn}_{.col}")) %>%
summarise(
Raw_Shift_L = mean_L[[1]] - bg_L,
Raw_Shift_K = mean_K[[1]] - bg_K,
Raw_Shift_r = mean_r[[1]] - bg_r,
Raw_Shift_AUC = mean_AUC[[1]] - bg_AUC,
Z_Shift_L = Raw_Shift_L[[1]] / bg_sd_L,
Z_Shift_K = Raw_Shift_K[[1]] / bg_sd_K,
Z_Shift_r = Raw_Shift_r[[1]] / bg_sd_r,
Z_Shift_AUC = Raw_Shift_AUC[[1]] / bg_sd_AUC,
WT_L = mean_L,
WT_K = mean_K,
WT_r = mean_r,
WT_AUC = mean_AUC,
WT_sd_L = sd_L,
WT_sd_K = sd_K,
WT_sd_r = sd_r,
WT_sd_AUC = sd_AUC,
Exp_L = WT_L + Raw_Shift_L,
Exp_K = WT_K + Raw_Shift_K,
Exp_r = WT_r + Raw_Shift_r,
Exp_AUC = WT_AUC + Raw_Shift_AUC,
Delta_L = mean_L - Exp_L,
Delta_K = mean_K - Exp_K,
Delta_r = mean_r - Exp_r,
Delta_AUC = mean_AUC - Exp_AUC,
Delta_L = ifelse(NG == 1, mean_L - WT_L, Delta_L), # disregard shift for no growth values in Z score calculation
Delta_L = ifelse(SM == 1, mean_L - WT_L, Delta_L), # disregard shift for set to max values in Z score calculation
Delta_K = ifelse(NG == 1, mean_K - WT_K, Delta_K),
Delta_r = ifelse(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = ifelse(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC,
mutate(
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set to max values in Z score calculation
) %>%
ungroup()
@@ -281,52 +268,29 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
print("Calculating interaction scores part 2")
interaction_scores_all <- interaction_scores %>%
group_by(across(all_of(group_vars))) %>%
summarise(
lm_L = lm(Delta_L ~ conc_num_factor),
lm_K = lm(Delta_K ~ conc_num_factor),
lm_r = lm(Delta_r ~ conc_num_factor),
lm_AUC = lm(Delta_AUC ~ conc_num_factor),
lm_score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
lm_score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
lm_score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
lm_score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
lm_sd_L = sd(lm_score_L),
lm_sd_K = sd(lm_score_K),
lm_sd_r = sd(lm_score_r),
lm_sd_AUC = sd(lm_score_AUC),
lm_mean_L = mean(lm_score_L),
lm_mean_K = mean(lm_score_K),
lm_mean_r = mean(lm_score_r),
lm_mean_AUC = mean(lm_score_AUC),
Z_lm_L = (lm_score_L - lm_mean_L) / lm_sd_L,
Z_lm_K = (lm_score_K - lm_mean_K) / lm_sd_K,
Z_lm_r = (lm_score_r - lm_mean_r) / lm_sd_r,
Z_lm_AUC = (lm_score_AUC - lm_mean_AUC) / lm_sd_AUC,
r_squared_L = summary(lm_L)$r.squared,
r_squared_K = summary(lm_K)$r.squared,
r_squared_r = summary(lm_r)$r.squared,
r_squared_AUC = summary(lm_AUC)$r.squared,
Sum_Zscore_L = sum(Zscore_L),
mutate(across(all_of(variables), list(
lm_score = ~ max_conc * lm(Delta ~ conc_num_factor)$coefficients[2] + lm(Delta ~ conc_num_factor)$coefficients[1],
lm_sd = ~ sd(lm_score),
lm_mean = ~ mean(lm_score),
Z_lm = ~ (lm_score - lm_mean) / lm_sd,
Sum_Zscore = ~ sum(Zscore, na.rm = TRUE)
), .names = "{.fn}_{.col}")) %>%
mutate(
Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
Sum_Zscore_K = sum(Zscore_K),
Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
Sum_Zscore_r = sum(Zscore_r),
Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
Sum_Zscore_AUC = sum(Zscore_AUC),
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1),
NG = sum(NG),
DB = sum(DB),
SM = sum(SM)
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1)
) %>%
ungroup()
interaction_scores_all <- interactions_scores_all %>%
interaction_scores_all <- interaction_scores_all %>%
arrange(desc(Z_lm_L)) %>%
arrange(desc(NG))
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
}
interaction_plot_configs <- function(df, variable) {
ylim_vals <- switch(variable,
"L" = c(-65, 65),
@@ -518,40 +482,19 @@ generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) {
}
# Adjust missing values and calculate ranks
adjust_missing_and_rank <- function(df) {
adjust_missing_and_rank <- function(df, variables) {
df <- df %>%
mutate(
Avg_Zscore_L = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
Avg_Zscore_K = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
Avg_Zscore_r = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
Avg_Zscore_AUC = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
Z_lm_L = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
Z_lm_K = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
Z_lm_r = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
Z_lm_AUC = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC),
L_Rank = rank(Avg_Zscore_L),
K_Rank = rank(Avg_Zscore_K),
r_Rank = rank(Avg_Zscore_r),
AUC_Rank = rank(Avg_Zscore_AUC),
L_Rank_lm = rank(Z_lm_L),
K_Rank_lm = rank(Z_lm_K),
r_Rank_lm = rank(Z_lm_r),
AUC_Rank_lm = rank(Z_lm_AUC)
)
mutate(across(all_of(variables), list(
Avg_Zscore = if_else(is.na(.Avg_Zscore), 0.001, .Avg_Zscore),
Z_lm = if_else(is.na(.Z_lm), 0.001, .Z_lm),
Rank = rank(.Avg_Zscore),
Rank_lm = rank(.Z_lm),
), "{.fn}_{.col}"))
return(df)
}
# Function to create and save all ranked plots
create_ranked_plots <- function(df, output_dir) {
df_adjusted <- adjust_missing_and_rank(df)
# Generate and save ranked plots
generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots")
generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm")
generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_naRM")
generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM")
}
generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "",
x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") {
@@ -590,13 +533,14 @@ generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num",
}
# Function to generate rank plots for the provided dataframe
generate_rank_plots <- function(df, output_dir) {
# Adjust missing values and calculate ranks
df_adjusted <- adjust_missing_and_rank(df)
create_ranked_plots <- function(df, output_dir) {
# List of variables for which we need to generate rank plots
variables <- c("L", "K", "r", "AUC")
# Adjust missing values and calculate ranks
df_adjusted <- adjust_missing_and_rank(df, variables)
# Generate rank plots for Avg_Zscore and Z_lm
for (var in variables) {
plot_rank_avg <- generate_plot(
@@ -623,13 +567,6 @@ generate_rank_plots <- function(df, output_dir) {
}
}
# Function to call rank plot generation in the main function
create_ranked_plots <- function(df, output_dir) {
message("Generating rank plots")
generate_rank_plots(df, output_dir)
}
main <- function() {
lapply(names(args$experiments), function(exp_name) {
exp <- args$experiments[[exp_name]]
@@ -658,11 +595,6 @@ main <- function() {
# Get the number of rows that are above tolerance
rows_to_remove <- nrow(df_above_tolerance)
# Logging or handling the calculated values, e.g.:
message("Half-median for L (above tolerance): ", L_half_median)
message("Half-median for K (above tolerance): ", K_half_median)
message("Number of rows above tolerance: ", rows_to_remove)
# Additional filtering for non-finite values in df_na
df_na_filtered <- df_na %>%
filter(if_any(c(L), ~ !is.finite(.))) %>%
@@ -700,52 +632,40 @@ main <- function() {
generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", after_qc_plot_configs)
generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", no_zeros_plot_configs)
# Clean up
rm(df, df_above_tolerance, df_no_zeros)
# Calculate summary statistics
message("Calculating summary statistics for all strains")
variables <- c("L", "K", "r", "AUC")
stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
stats_joined <- left_join(df_na, stats, by = c("OrfRep", "conc_num", "conc_num_factor"))
list(summary_stats, df_na_stats) <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
# stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
print("stats:")
print(head(stats))
print("Summary stats:")
print(head(summary_stats))
# Originally this filtered L NA's
# TODO: Originally this filtered L NA's
# Let's try to avoid for now since stats have already been calculated
# Filter data within and outside 2SD
within_2sd_k <- stats_joined %>%
message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na %>%
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
outside_2sd_k <- stats_joined %>%
df_na_outside_2sd_k <- df_na %>%
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
# Summary statistics for within and outside 2SD of K
l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
message("Calculating summary statistics for L within 2SD of K")
list(l_within_2sd_k_stats, df_na_l_within_2sd_k_stats) <-
calculate_summary_stats(df_na_l_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
message("Calculating summary statistics for L outside 2SD of K")
list(l_outside_2sd_k_stats, df_na_l_outside_2sd_k_stats) <-
calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
# Write CSV files
write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
message("Calculating summary statistics for L within 2SD of K")
cols_to_remove <- names(l_within_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor")
within_2sd_k_clean <- within_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_within_2sd_k_joined <- within_2sd_k_clean %>%
left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
message("Calculating summary statistics for for L outside 2SD of K")
cols_to_remove <- names(l_outside_2sd_k)
cols_to_keep <- c("conc_num", "conc_num_factor")
outside_2sd_k_clean <- outside_2sd_k %>%
select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
l_outside_2sd_k_joined <- outside_2sd_k_clean %>%
left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor"))
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
# Process background strains
bg_strains <- c("YDL227C")
@@ -767,20 +687,20 @@ main <- function() {
# Recalculate summary statistics for the background strain
message("Calculating summary statistics for background strain")
stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
write.csv(stats_bg,
list(summary_stats_bg, df_bg_stats) <-
calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
write.csv(summary_stats_bg,
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
row.names = FALSE)
# stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor"))
# Filter reference and deletion strains
# Formerly X2_RF (reference strain)
df_reference <- stats_joined %>%
df_reference <- df_bg_stats %>%
filter(OrfRep == strain) %>%
mutate(SM = 0)
# Formerly X2 (deletion strains)
df_deletion <- stats_joined %>%
df_deletion <- df_bg_stats %>%
filter(OrfRep != strain) %>%
mutate(SM = 0)