Add more calcualtions to calculate_interaction_scores()
This commit is contained in:
@@ -154,20 +154,6 @@ update_gene_names <- function(df, sgd_gene_list) {
|
||||
return(df)
|
||||
}
|
||||
|
||||
# Process either deletion and or reference strain(s)
|
||||
process_strains <- function(df) {
|
||||
message("Processing strains")
|
||||
|
||||
df %>%
|
||||
group_by(conc_num) %>%
|
||||
mutate(
|
||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
|
||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||
ungroup()
|
||||
}
|
||||
|
||||
# Calculate summary statistics for all variables
|
||||
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
|
||||
df <- df %>%
|
||||
@@ -202,9 +188,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
total_conc_num <- length(unique(df$conc_num))
|
||||
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
|
||||
|
||||
# Pull the background means and standard deviations
|
||||
print("Calculating background means")
|
||||
print(head(df))
|
||||
# Pull the background means and standard deviations from 0 concentration
|
||||
bg_means <- list(
|
||||
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
|
||||
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
|
||||
@@ -218,19 +202,17 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
|
||||
)
|
||||
|
||||
# First mutate block to calculate NG, DB, SM, and N
|
||||
print("Calculating interaction scores part 1")
|
||||
# Calculate NG, DB, SM, N
|
||||
interaction_scores <- df %>%
|
||||
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
|
||||
mutate(
|
||||
NG = sum(NG, na.rm = TRUE),
|
||||
DB = sum(DB, na.rm = TRUE),
|
||||
SM = sum(SM, na.rm = TRUE),
|
||||
N = sum(!is.na(L)) # Count of non-NA values in L
|
||||
N = sum(!is.na(L))
|
||||
)
|
||||
|
||||
# Calculate Raw_Shift and Z_Shift for each variable
|
||||
print("Calculating Raw_Shift and Z_Shift")
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
Raw_Shift_L = mean_L - bg_means$L,
|
||||
@@ -244,7 +226,6 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
)
|
||||
|
||||
# Calculate WT and WT_sd for each variable
|
||||
print("Calculating WT and WT_sd")
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
WT_L = mean(mean_L, na.rm = TRUE),
|
||||
@@ -258,7 +239,6 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
)
|
||||
|
||||
# Calculate Exp and Delta for each variable
|
||||
print("Calculating Exp and Delta")
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
Exp_L = WT_L + Raw_Shift_L,
|
||||
@@ -271,29 +251,38 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
Delta_AUC = WT_AUC - Exp_AUC
|
||||
)
|
||||
|
||||
# Final adjustment to Delta for NG and SM conditions
|
||||
# Adjust Delta for NG and SM conditions
|
||||
interaction_scores <- interaction_scores %>%
|
||||
mutate(
|
||||
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
|
||||
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
|
||||
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
|
||||
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
|
||||
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set-to-max values in Z score calculation
|
||||
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L)
|
||||
) %>%
|
||||
ungroup()
|
||||
|
||||
print("Interaction scores:")
|
||||
print(head(interaction_scores))
|
||||
|
||||
# Calculate linear models and interaction scores per gene
|
||||
print("Calculating interaction scores part 2")
|
||||
# Calculate linear models and lm_Score for each variable
|
||||
interaction_scores_all <- interaction_scores %>%
|
||||
group_by(across(all_of(group_vars))) %>%
|
||||
mutate(
|
||||
lm_score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1],
|
||||
lm_score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1],
|
||||
lm_score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1],
|
||||
lm_score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]) %>%
|
||||
lm_Score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1],
|
||||
lm_Score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1],
|
||||
lm_Score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1],
|
||||
lm_Score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]
|
||||
)
|
||||
|
||||
# Calculate Z_lm for each variable
|
||||
interaction_scores_all <- interaction_scores_all %>%
|
||||
mutate(
|
||||
Z_lm_L = (lm_Score_L - mean(lm_Score_L, na.rm = TRUE)) / sd(lm_Score_L, na.rm = TRUE),
|
||||
Z_lm_K = (lm_Score_K - mean(lm_Score_K, na.rm = TRUE)) / sd(lm_Score_K, na.rm = TRUE),
|
||||
Z_lm_r = (lm_Score_r - mean(lm_Score_r, na.rm = TRUE)) / sd(lm_Score_r, na.rm = TRUE),
|
||||
Z_lm_AUC = (lm_Score_AUC - mean(lm_Score_AUC, na.rm = TRUE)) / sd(lm_Score_AUC, na.rm = TRUE)
|
||||
)
|
||||
|
||||
# Calculate Average Z-scores for each variable
|
||||
interaction_scores_all <- interaction_scores_all %>%
|
||||
mutate(
|
||||
Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs,
|
||||
Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs,
|
||||
@@ -301,15 +290,16 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
||||
Avg_Zscore_AUC = sum(Z_Shift_AUC, na.rm = TRUE) / (total_conc_num - 1)
|
||||
)
|
||||
|
||||
# Arrange the results by Z_lm_L and NG
|
||||
# Arrange results by Z_lm_L and NG
|
||||
interaction_scores_all <- interaction_scores_all %>%
|
||||
arrange(desc(lm_score_L)) %>%
|
||||
arrange(desc(lm_Score_L)) %>%
|
||||
arrange(desc(NG)) %>%
|
||||
ungroup()
|
||||
|
||||
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
|
||||
}
|
||||
|
||||
|
||||
generate_and_save_plots <- function(output_dir, file_name, plot_configs) {
|
||||
|
||||
`%||%` <- function(a, b) if (!is.null(a)) a else b
|
||||
@@ -340,19 +330,14 @@ generate_and_save_plots <- function(output_dir, file_name, plot_configs) {
|
||||
width = 0.1) +
|
||||
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
|
||||
} else if (config$error_bar %||% FALSE) {
|
||||
# Directly use geom_point and geom_errorbar with pre-calculated values
|
||||
plot <- plot +
|
||||
geom_point(shape = 3, size = 0.2) +
|
||||
geom_errorbar(aes(
|
||||
ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
|
||||
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)),
|
||||
width = 0.1) +
|
||||
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6) +
|
||||
stat_summary(aes(
|
||||
ymin = !!sym(y_mean_col) - !!sym(y_sd_col),
|
||||
ymax = !!sym(y_mean_col) + !!sym(y_sd_col)),
|
||||
fun.data = "identity", geom = "errorbar", width = 0.1) +
|
||||
stat_summary(aes(y = !!sym(y_mean_col)),
|
||||
fun.data = "identity", geom = "point", size = 0.6)
|
||||
geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,20 +425,21 @@ interaction_plot_configs <- function(df, variables) {
|
||||
ifelse(variable == "r", -0.45, -4500))), label = paste("SM =", df$SM))
|
||||
)
|
||||
|
||||
# Define the configuration for each variable (plot type, limits, labels)
|
||||
plot_configs[[variable]] <- list(
|
||||
x_var = "conc_num_factor",
|
||||
# Append a new plot configuration for each variable
|
||||
plot_configs[[length(plot_configs) + 1]] <- list(
|
||||
df = df,
|
||||
x_var = "Conc_Num_Factor",
|
||||
y_var = delta_var,
|
||||
plot_type = "scatter",
|
||||
title = paste(df$OrfRep[1], df$Gene[1], sep = " "),
|
||||
ylim_vals = ylim_vals,
|
||||
annotations = annotations,
|
||||
error_bar = list(
|
||||
ymin = 0 - (2 * df[[wt_sd_col]]),
|
||||
ymax = 0 + (2 * df[[wt_sd_col]])
|
||||
ymin = 0 - (2 * df[[wt_sd_col]][1]),
|
||||
ymax = 0 + (2 * df[[wt_sd_col]][1])
|
||||
),
|
||||
x_breaks = unique(df$conc_num_factor),
|
||||
x_labels = unique(as.character(df$conc_num)),
|
||||
x_breaks = unique(df$Conc_Num_Factor),
|
||||
x_labels = unique(as.character(df$Conc_Num)),
|
||||
x_label = unique(df$Drug[1])
|
||||
)
|
||||
}
|
||||
@@ -461,6 +447,9 @@ interaction_plot_configs <- function(df, variables) {
|
||||
return(plot_configs)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
correlation_plot_configs <- function(df, lm_list, lm_summaries) {
|
||||
lapply(seq_along(lm_list), function(i) {
|
||||
r_squared <- round(lm_summaries[[i]]$r.squared, 3)
|
||||
@@ -542,9 +531,31 @@ main <- function() {
|
||||
df_na %>% filter(if_all(c(L), is.finite))
|
||||
}
|
||||
|
||||
# Filter data within and outside 2SD
|
||||
message("Filtering by 2SD of K")
|
||||
df_na_within_2sd_k <- df_na_stats %>%
|
||||
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
||||
df_na_outside_2sd_k <- df_na_stats %>%
|
||||
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
||||
|
||||
# Summary statistics for within and outside 2SD of K
|
||||
message("Calculating summary statistics for L within 2SD of K")
|
||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_within_2sd_k_stats <- ss$summary_stats
|
||||
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||
message("Calculating summary statistics for L outside 2SD of K")
|
||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_outside_2sd_k_stats <- ss$summary_stats
|
||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||
# Write CSV files
|
||||
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
|
||||
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
|
||||
|
||||
|
||||
# Plot configurations
|
||||
# Each list corresponds to a group of plots in the same file
|
||||
raw_l_vs_k_plots <- list(
|
||||
l_vs_k_plots <- list(
|
||||
list(df = df, x_var = "L", y_var = "K", plot_type = "scatter",
|
||||
title = "Raw L vs K before QC",
|
||||
color_var = "conc_num",
|
||||
@@ -637,59 +648,33 @@ main <- function() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
delta_bg_plot_configs <- list(
|
||||
list(x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
title = paste("Raw L vs K for strains above delta background threshold of", delta_bg_tolerance, "or above"),
|
||||
ylim_vals = NULL,
|
||||
annotate("text", x = L_half_median, y = K_half_median,
|
||||
label = paste("Strains above delta background tolerance = ", nrow(df_above_tolerance)))
|
||||
)
|
||||
|
||||
|
||||
|
||||
l_outside_2sd_k_plots <- list(
|
||||
list(df = X_outside_2SD_K, x_var = "l", y_var = "K", plot_type = "scatter",
|
||||
title = "Raw L vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right")
|
||||
)
|
||||
|
||||
before_qc_configs <- list(
|
||||
list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter",
|
||||
title = "Plate analysis by Drug Conc for Delta Background before QC",
|
||||
error_bar = TRUE, color_var = "conc_num"),
|
||||
list(x_var = "scan", y_var = "delta_bg", plot_type = "box",
|
||||
title = "Plate analysis by Drug Conc for Delta Background before QC",
|
||||
error_bar = FALSE, color_var = "conc_num")
|
||||
delta_bg_outside_2sd_k_plots <- list(
|
||||
list(df = X_outside_2SD_K, x_var = "delta_bg", y_var = "K", plot_type = "scatter",
|
||||
title = "Delta Background vs K for strains falling outside 2SD of the K mean at each Conc",
|
||||
color_var = "conc_num",
|
||||
legend_position = "right")
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
above_delta_bg_tolerance <- list(
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
)
|
||||
|
||||
|
||||
# Generate and save plots for each QC step
|
||||
message("Generating QC plots")
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_before_quality_control", l_vs_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
||||
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis_no_zeros", plate_analysis_no_zeros_plots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_for_strains_2SD_outside_mean_K", l_outside_2sd_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "delta_background_vs_K_for_strains_2sd_outside_mean_K", delta_bg_outside_2sd_k_plots)
|
||||
|
||||
|
||||
# Print quality control graphs before removing data due to contamination and
|
||||
# adjusting missing data to max theoretical values
|
||||
before_qc_plot_configs <- list(
|
||||
# Plate analysis is a quality check to identify plate effects containing anomalies
|
||||
|
||||
)
|
||||
|
||||
# list(x_var = "delta_bg", y_var = NULL, plot_type = "density",
|
||||
# title = "Density plot for Delta Background",
|
||||
@@ -744,21 +729,16 @@ main <- function() {
|
||||
)
|
||||
|
||||
|
||||
# Generate and save plots for each QC step
|
||||
message("Generating QC plots")
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_before_QC", raw_l_vs_k_plots)
|
||||
generate_and_save_plots(out_dir_qc, "L_vs_K_above_threshold", above_threshold_plots)
|
||||
generate_and_save_plots(out_dir_qc, "frequency_delta_background", frequency_delta_bg_plots)
|
||||
generate_and_save_plots(out_dir_qc, "plate_analysis", plate_analysis_plots)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
generate_and_save_plots(df, out_dir_qc, "raw_L_vs_K_before_QC.pdf", delta_bg_plots)
|
||||
generate_and_save_plots(df, out_dir_qc, "plate_analysis", before_qc_plot_configs)
|
||||
generate_and_save_plots(df_above_tolerance, out_dir_qc, above_tolerance_plot_configs)
|
||||
generate_and_save_plots(df_na_filtered, out_dir_qc, after_qc_plot_configs)
|
||||
generate_and_save_plots(df_no_zeros, out_dir_qc, "plate_analysis_no_zeros", no_zeros_plot_configs)
|
||||
# generate_and_save_plots(df, out_dir_qc, "raw_L_vs_K_before_QC.pdf", delta_bg_plots)
|
||||
# generate_and_save_plots(df, out_dir_qc, "plate_analysis", before_qc_plot_configs)
|
||||
# generate_and_save_plots(df_above_tolerance, out_dir_qc, above_tolerance_plot_configs)
|
||||
# generate_and_save_plots(df_na_filtered, out_dir_qc, after_qc_plot_configs)
|
||||
# generate_and_save_plots(df_no_zeros, out_dir_qc, "plate_analysis_no_zeros", no_zeros_plot_configs)
|
||||
|
||||
# Clean up
|
||||
rm(df, df_above_tolerance, df_no_zeros)
|
||||
@@ -766,26 +746,6 @@ main <- function() {
|
||||
# TODO: Originally this filtered L NA's
|
||||
# Let's try to avoid for now since stats have already been calculated
|
||||
|
||||
# Filter data within and outside 2SD
|
||||
message("Filtering by 2SD of K")
|
||||
df_na_within_2sd_k <- df_na_stats %>%
|
||||
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
|
||||
df_na_outside_2sd_k <- df_na_stats %>%
|
||||
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
|
||||
|
||||
# Summary statistics for within and outside 2SD of K
|
||||
message("Calculating summary statistics for L within 2SD of K")
|
||||
# TODO We're omitting the original z_max calculation, not sure if needed?
|
||||
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_within_2sd_k_stats <- ss$summary_stats
|
||||
df_na_l_within_2sd_k_stats <- ss$df_with_stats
|
||||
message("Calculating summary statistics for L outside 2SD of K")
|
||||
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
|
||||
l_outside_2sd_k_stats <- ss$summary_stats
|
||||
df_na_l_outside_2sd_k_stats <- ss$df_with_stats
|
||||
# Write CSV files
|
||||
write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
|
||||
write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
|
||||
|
||||
# Process background strains
|
||||
bg_strains <- c("YDL227C")
|
||||
@@ -818,7 +778,7 @@ main <- function() {
|
||||
#print(head(summary_stats_bg))
|
||||
|
||||
# Filter reference and deletion strains
|
||||
# Formerly X2_RF (reference strain)
|
||||
# Formerly X2_RF (reference strains)
|
||||
df_reference <- df_na_stats %>%
|
||||
filter(OrfRep == strain) %>%
|
||||
mutate(SM = 0)
|
||||
@@ -827,13 +787,30 @@ main <- function() {
|
||||
df_deletion <- df_na_stats %>%
|
||||
filter(OrfRep != strain) %>%
|
||||
mutate(SM = 0)
|
||||
|
||||
reference_strain <- process_strains(df_reference) # TODO double-check
|
||||
deletion_strains <- process_strains(df_deletion) # TODO double-check
|
||||
|
||||
|
||||
# Set the missing values to the highest theoretical value at each drug conc for L
|
||||
# Leave other values as 0 for the max/min
|
||||
reference_strain <- df_reference %>%
|
||||
group_by(conc_num) %>%
|
||||
mutate(
|
||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
|
||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||
ungroup()
|
||||
|
||||
# Ditto for deletion strains
|
||||
deletion_strains <- df_deletion %>%
|
||||
group_by(conc_num) %>%
|
||||
mutate(
|
||||
max_l_theoretical = max(max_L, na.rm = TRUE),
|
||||
L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),
|
||||
SM = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, 1, SM),
|
||||
L = ifelse(L >= max_l_theoretical & !is.na(L) & conc_num > 0, max_l_theoretical, L)) %>%
|
||||
ungroup()
|
||||
|
||||
# Calculate interactions
|
||||
variables <- c("L", "K", "r", "AUC")
|
||||
# We are recalculating some of the data here
|
||||
message("Calculating interaction scores")
|
||||
print("Reference strain:")
|
||||
print(head(reference_strain))
|
||||
|
||||
Reference in New Issue
Block a user