Refactor calculations to try and grouping issues

This commit is contained in:
2024-10-08 02:16:35 -04:00
parent aef0fba1da
commit d456bcdadd

View File

@@ -177,7 +177,7 @@ update_gene_names <- function(df, sgd_gene_list) {
} }
calculate_summary_stats <- function(df, variables, group_vars) { calculate_summary_stats <- function(df, variables, group_vars) {
summary_stats <- df %>% ss <- df %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( summarise(
N = n(), N = n(),
@@ -192,16 +192,19 @@ calculate_summary_stats <- function(df, variables, group_vars) {
), ),
.names = "{.fn}_{.col}" .names = "{.fn}_{.col}"
), ),
sum_NG = sum(NG, na.rm = TRUE),
sum_DB = sum(DB, na.rm = TRUE),
sum_SM = sum(SM, na.rm = TRUE),
.groups = "drop" .groups = "drop"
) )
# Create a cleaned version of df that doesn't overlap with summary_stats # Create a cleaned version of df that doesn't overlap with summary_stats
df_cleaned <- df %>% df_cleaned <- df %>%
select(-any_of(setdiff(intersect(names(df), names(summary_stats)), group_vars))) select(-any_of(setdiff(intersect(names(df), names(ss)), group_vars)))
df_joined <- left_join(df_cleaned, summary_stats, by = group_vars) ss_joined <- left_join(df_cleaned, ss, by = group_vars)
return(list(summary_stats = summary_stats, df_with_stats = df_joined)) return(list(stats_csv = ss, stats_joined = ss_joined))
} }
calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) { calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2) {
@@ -217,11 +220,12 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
group_vars <- c("OrfRep", "Gene", "Drug") group_vars <- c("OrfRep", "Gene", "Drug")
} }
perform_lm <- function(x, y, max_conc) { perform_lm <- function(response, predictor, max_conc) {
if (all(is.na(x)) || all(is.na(y)) || length(x[!is.na(x)]) == 0 || length(y[!is.na(y)]) == 0) { if (all(is.na(response)) || all(is.na(predictor)) ||
length(response[!is.na(response)]) == 0 || length(predictor[!is.na(predictor)]) == 0) {
return(list(intercept = NA, slope = NA, r_squared = NA, score = NA)) return(list(intercept = NA, slope = NA, r_squared = NA, score = NA))
} else { } else {
fit <- lm(y ~ x) fit <- lm(response ~ predictor)
return(list( return(list(
intercept = coef(fit)[1], intercept = coef(fit)[1],
slope = coef(fit)[2], slope = coef(fit)[2],
@@ -247,97 +251,68 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
) )
# Join WT statistics to df # Join WT statistics to df
df <- df %>% df <- df %>% left_join(wt_stats, by = bg_group_vars)
left_join(wt_stats, by = c(bg_group_vars))
# Compute mean values at zero concentration # Compute mean values at zero concentration
mean_zeroes <- df %>% df_mean_zeros <- df %>%
filter(conc_num == 0) %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( filter(conc_num == 0) %>%
summarize(
mean_L_zero = mean(mean_L, na.rm = TRUE), mean_L_zero = mean(mean_L, na.rm = TRUE),
mean_K_zero = mean(mean_K, na.rm = TRUE), mean_K_zero = mean(mean_K, na.rm = TRUE),
mean_r_zero = mean(mean_r, na.rm = TRUE), mean_r_zero = mean(mean_r, na.rm = TRUE),
mean_AUC_zero = mean(mean_AUC, na.rm = TRUE), mean_AUC_zero = mean(mean_AUC, na.rm = TRUE),
.groups = "drop" .groups = "drop" # Ungroup after summarizing
) )
df <- df %>% df <- df %>% left_join(df_mean_zeros, by = group_vars)
left_join(mean_zeroes, by = c(group_vars))
# Calculate Raw Shifts and Z Shifts for all rows
df <- df %>%
mutate(
Raw_Shift_L = mean_L_zero - WT_L,
Raw_Shift_K = mean_K_zero - WT_K,
Raw_Shift_r = mean_r_zero - WT_r,
Raw_Shift_AUC = mean_AUC_zero - WT_AUC,
Z_Shift_L = Raw_Shift_L / WT_sd_L,
Z_Shift_K = Raw_Shift_K / WT_sd_K,
Z_Shift_r = Raw_Shift_r / WT_sd_r,
Z_Shift_AUC = Raw_Shift_AUC / WT_sd_AUC
)
# Calculate per-row data
calculations <- df %>% calculations <- df %>%
group_by(across(all_of(c(group_vars, "conc_num", "conc_num_factor", "conc_num_factor_factor")))) %>%
mutate(
NG_sum = sum(NG, na.rm = TRUE),
DB_sum = sum(DB, na.rm = TRUE),
SM_sum = sum(SM, na.rm = TRUE),
# Expected values
Exp_L = WT_L + Raw_Shift_L,
Exp_K = WT_K + Raw_Shift_K,
Exp_r = WT_r + Raw_Shift_r,
Exp_AUC = WT_AUC + Raw_Shift_AUC,
# Deltas
Delta_L = mean_L - Exp_L,
Delta_K = mean_K - Exp_K,
Delta_r = mean_r - Exp_r,
Delta_AUC = mean_AUC - Exp_AUC,
# Adjust deltas for NG and SM
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L),
# Calculate Z-scores
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC
) %>%
ungroup()
# For interaction plot error bars
delta_means_sds <- calculations %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( mutate(
mean_Delta_L = mean(Delta_L, na.rm = TRUE), Raw_Shift_L = mean_L_zero - WT_L,
mean_Delta_K = mean(Delta_K, na.rm = TRUE), Raw_Shift_K = mean_K_zero - WT_K,
mean_Delta_r = mean(Delta_r, na.rm = TRUE), Raw_Shift_r = mean_r_zero - WT_r,
mean_Delta_AUC = mean(Delta_AUC, na.rm = TRUE), Raw_Shift_AUC = mean_AUC_zero - WT_AUC,
sd_Delta_L = sd(Delta_L, na.rm = TRUE), Z_Shift_L = Raw_Shift_L / WT_sd_L,
sd_Delta_K = sd(Delta_K, na.rm = TRUE), Z_Shift_K = Raw_Shift_K / WT_sd_K,
sd_Delta_r = sd(Delta_r, na.rm = TRUE), Z_Shift_r = Raw_Shift_r / WT_sd_r,
sd_Delta_AUC = sd(Delta_AUC, na.rm = TRUE), Z_Shift_AUC = Raw_Shift_AUC / WT_sd_AUC,
.groups = "drop" # Expected values
) Exp_L = WT_L + Raw_Shift_L,
Exp_K = WT_K + Raw_Shift_K,
calculations <- calculations %>% Exp_r = WT_r + Raw_Shift_r,
left_join(delta_means_sds, by = group_vars) Exp_AUC = WT_AUC + Raw_Shift_AUC,
# Deltas
Delta_L = mean_L - Exp_L,
Delta_K = mean_K - Exp_K,
Delta_r = mean_r - Exp_r,
Delta_AUC = mean_AUC - Exp_AUC,
# Adjust deltas for NG and SM
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L),
# Calculate Z-scores
Zscore_L = Delta_L / WT_sd_L,
Zscore_K = Delta_K / WT_sd_K,
Zscore_r = Delta_r / WT_sd_r,
Zscore_AUC = Delta_AUC / WT_sd_AUC
)
# Calculate group-specific interactions # Calculate group-specific interactions
interactions <- calculations %>% interactions <- calculations %>%
group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
summarise( summarise(
NG_sum_int = sum(NG), num_non_removed_concs = total_conc_num - sum(DB) - 1,
DB_sum_int = sum(DB),
SM_sum_int = sum(SM), # Count QA columns, this will clobber the originals in left join but
num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, # we're not using them after this point so it's cleaner this way
NG = sum(NG, na.rm = TRUE),
DB = sum(DB, na.rm = TRUE),
SM = sum(SM, na.rm = TRUE),
# Add background data # Add background data
Raw_Shift_L = first(Raw_Shift_L), Raw_Shift_L = first(Raw_Shift_L),
@@ -356,7 +331,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE), Sum_Z_Score_AUC = sum(Zscore_AUC, na.rm = TRUE),
# We sum twice but it saves on creating another block # We sum twice but it saves on creating another block
# TODO should we use mean() here, not sure # TODO should we use mean() here, original logic needs explanation
Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs), Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs),
Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs),
Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(total_conc_num - 1), Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(total_conc_num - 1),
@@ -392,7 +367,7 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
# Summary statistics for lm scores # Summary statistics for lm scores
interactions <- interactions %>% interactions <- interactions %>%
# group_by(across(all_of(group_vars))) %>% group_by(across(all_of(group_vars))) %>%
mutate( mutate(
lm_mean_L = mean(lm_Score_L, na.rm = TRUE), lm_mean_L = mean(lm_Score_L, na.rm = TRUE),
lm_sd_L = sd(lm_Score_L, na.rm = TRUE), lm_sd_L = sd(lm_Score_L, na.rm = TRUE),
@@ -407,46 +382,42 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r, Z_lm_r = (lm_Score_r - lm_mean_r) / lm_sd_r,
Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC Z_lm_AUC = (lm_Score_AUC - lm_mean_AUC) / lm_sd_AUC
) %>% ) %>%
arrange(desc(Z_lm_L), desc(NG_sum_int)) arrange(desc(Z_lm_L), desc(NG))
# Deletion data ranking and linear modeling # Deletion data ranking and linear modeling
# Initialize this variable so we can return it as NULL for reference
# We don't do the following calculations for the reference strain
interactions_narm <- NULL # formerly X_NArm
if (type == "deletion") { if (type == "deletion") {
interactions <- interactions %>% interactions <- interactions %>%
mutate( mutate(
Avg_Zscore_L_adjusted = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L), # Set NA values to a small value to include them in ranking
Avg_Zscore_K_adjusted = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K), Avg_Zscore_adjusted_L = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
Avg_Zscore_r_adjusted = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r), Avg_Zscore_adjusted_K = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
Avg_Zscore_AUC_adjusted = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC), Avg_Zscore_adjusted_r = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
Z_lm_L_adjusted = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L), Avg_Zscore_adjusted_AUC = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
Z_lm_K_adjusted = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K), Z_lm_adjusted_L = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
Z_lm_r_adjusted = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r), Z_lm_adjusted_K = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
Z_lm_AUC_adjusted = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC), Z_lm_adjusted_r = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
Rank_L = rank(Avg_Zscore_L_adjusted), Z_lm_adjusted_AUC = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC),
Rank_K = rank(Avg_Zscore_K_adjusted), Rank_L = rank(Avg_Zscore_adjusted_L),
Rank_r = rank(Avg_Zscore_r_adjusted), Rank_K = rank(Avg_Zscore_adjusted_K),
Rank_AUC = rank(Avg_Zscore_AUC_adjusted), Rank_r = rank(Avg_Zscore_adjusted_r),
Rank_lm_L = rank(Z_lm_L_adjusted), Rank_AUC = rank(Avg_Zscore_adjusted_AUC),
Rank_lm_K = rank(Z_lm_K_adjusted), Rank_lm_L = rank(Z_lm_adjusted_L),
Rank_lm_r = rank(Z_lm_r_adjusted), Rank_lm_K = rank(Z_lm_adjusted_K),
Rank_lm_AUC = rank(Z_lm_AUC_adjusted), Rank_lm_r = rank(Z_lm_adjusted_r),
Rank_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)), Rank_lm_AUC = rank(Z_lm_adjusted_AUC),
Rank_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)), Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
Rank_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)), Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
Rank_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)), Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
Correlation_lm_L = list(perform_lm(Z_lm_L, Avg_Zscore_L, max_conc)), Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
Correlation_lm_K = list(perform_lm(Z_lm_K, Avg_Zscore_K, max_conc)),
Correlation_lm_r = list(perform_lm(Z_lm_r, Avg_Zscore_r, max_conc)),
Correlation_lm_AUC = list(perform_lm(Z_lm_AUC, Avg_Zscore_AUC, max_conc)),
Correlation_lm_K_L = list(perform_lm(Z_lm_K, Z_lm_L, max_conc)),
Correlation_lm_r_L = list(perform_lm(Z_lm_r, Z_lm_L, max_conc)),
Correlation_lm_AUC_L = list(perform_lm(Z_lm_AUC, Z_lm_L, max_conc)),
Correlation_lm_r_K = list(perform_lm(Z_lm_r, Z_lm_K, max_conc)),
Correlation_lm_AUC_K = list(perform_lm(Z_lm_AUC, Z_lm_K, max_conc)),
Correlation_lm_AUC_r = list(perform_lm(Z_lm_AUC, Z_lm_r, max_conc))
) )
# Add overlap threshold categories based on Z-lm and Avg-Z scores # We are filtering invalid Z_lm scores so this must be in its own df
interactions <- interactions %>% # A left join will just reintroduce NAs by coercion
interactions_narm <- interactions %>%
filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>% filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
mutate( mutate(
Overlap = case_when( Overlap = case_when(
@@ -460,11 +431,32 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore", Z_lm_L <= -overlap_threshold & Avg_Zscore_L >= overlap_threshold ~ "Deletion Suppressor lm, Deletion Enhancer Avg Zscore",
TRUE ~ "No Effect" TRUE ~ "No Effect"
), ),
Rank_L = rank(Avg_Zscore_L),
Rank_lm_R_squared_L = Rank_lm_L[[1]]$r_squared, Rank_K = rank(Avg_Zscore_K),
Rank_lm_R_squared_K = Rank_lm_L[[1]]$r_squared, Rank_r = rank(Avg_Zscore_r),
Rank_lm_R_squared_r = Rank_lm_r[[1]]$r_squared, Rank_AUC = rank(Avg_Zscore_AUC),
Rank_lm_R_squared_AUC = Rank_lm_AUC[[1]]$r_squared, Rank_lm_L = rank(Z_lm_L),
Rank_lm_K = rank(Z_lm_K),
Rank_lm_r = rank(Z_lm_r),
Rank_lm_AUC = rank(Z_lm_AUC),
Rank_lm_lm_L = list(perform_lm(Rank_lm_L, Rank_L, max_conc)),
Rank_lm_lm_K = list(perform_lm(Rank_lm_K, Rank_K, max_conc)),
Rank_lm_lm_r = list(perform_lm(Rank_lm_r, Rank_r, max_conc)),
Rank_lm_lm_AUC = list(perform_lm(Rank_lm_AUC, Rank_AUC, max_conc)),
# Rank_lm_R_Squared_L = Rank_lm_lm_L[[1]]$r_squared,
# Rank_lm_R_Squared_K = Rank_lm_lm_K[[1]]$r_squared,
# Rank_lm_R_Squared_r = Rank_lm_lm_r[[1]]$r_squared,
# Rank_lm_R_Squared_AUC = Rank_lm_lm_AUC[[1]]$r_squared,
Correlation_lm_L = list(perform_lm(Z_lm_L, Avg_Zscore_L, max_conc)),
Correlation_lm_K = list(perform_lm(Z_lm_K, Avg_Zscore_K, max_conc)),
Correlation_lm_r = list(perform_lm(Z_lm_r, Avg_Zscore_r, max_conc)),
Correlation_lm_AUC = list(perform_lm(Z_lm_AUC, Avg_Zscore_AUC, max_conc)),
Correlation_lm_K_L = list(perform_lm(Z_lm_K, Z_lm_L, max_conc)),
Correlation_lm_r_L = list(perform_lm(Z_lm_r, Z_lm_L, max_conc)),
Correlation_lm_AUC_L = list(perform_lm(Z_lm_AUC, Z_lm_L, max_conc)),
Correlation_lm_r_K = list(perform_lm(Z_lm_r, Z_lm_K, max_conc)),
Correlation_lm_AUC_K = list(perform_lm(Z_lm_AUC, Z_lm_K, max_conc)),
Correlation_lm_AUC_r = list(perform_lm(Z_lm_AUC, Z_lm_r, max_conc)),
Correlation_lm_intercept_L = Correlation_lm_L[[1]]$intercept, Correlation_lm_intercept_L = Correlation_lm_L[[1]]$intercept,
Correlation_lm_slope_L = Correlation_lm_L[[1]]$slope, Correlation_lm_slope_L = Correlation_lm_L[[1]]$slope,
Correlation_lm_R_Squared_L = Correlation_lm_L[[1]]$r_squared, Correlation_lm_R_Squared_L = Correlation_lm_L[[1]]$r_squared,
@@ -483,97 +475,77 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
Correlation_lm_Score_AUC = Correlation_lm_AUC[[1]]$score, Correlation_lm_Score_AUC = Correlation_lm_AUC[[1]]$score,
Correlation_lm_intercept_K_L = Correlation_lm_K_L[[1]]$intercept, Correlation_lm_intercept_K_L = Correlation_lm_K_L[[1]]$intercept,
Correlation_lm_slope_K_L = Correlation_lm_K_L[[1]]$slope, Correlation_lm_slope_K_L = Correlation_lm_K_L[[1]]$slope,
Correlation_lm_R_squared_K_L = Correlation_lm_K_L[[1]]$r_squared, Correlation_lm_R_Squared_K_L = Correlation_lm_K_L[[1]]$r_squared,
Correlation_lm_Score_K_L = Correlation_lm_K_L[[1]]$score, Correlation_lm_Score_K_L = Correlation_lm_K_L[[1]]$score,
Correlation_lm_intercept_r_L = Correlation_lm_r_L[[1]]$intercept, Correlation_lm_intercept_r_L = Correlation_lm_r_L[[1]]$intercept,
Correlation_lm_slope_r_L = Correlation_lm_r_L[[1]]$slope, Correlation_lm_slope_r_L = Correlation_lm_r_L[[1]]$slope,
Correlation_lm_R_squared_r_L = Correlation_lm_r_L[[1]]$r_squared, Correlation_lm_R_Squared_r_L = Correlation_lm_r_L[[1]]$r_squared,
Correlation_lm_Score_r_L = Correlation_lm_r_L[[1]]$score, Correlation_lm_Score_r_L = Correlation_lm_r_L[[1]]$score,
Correlation_lm_intercept_AUC_L = Correlation_lm_AUC_L[[1]]$intercept, Correlation_lm_intercept_AUC_L = Correlation_lm_AUC_L[[1]]$intercept,
Correlation_lm_slope_AUC_L = Correlation_lm_AUC_L[[1]]$slope, Correlation_lm_slope_AUC_L = Correlation_lm_AUC_L[[1]]$slope,
Correlation_lm_R_squared_AUC_L = Correlation_lm_AUC_L[[1]]$r_squared, Correlation_lm_R_Squared_AUC_L = Correlation_lm_AUC_L[[1]]$r_squared,
Correlation_lm_Score_AUC_L = Correlation_lm_AUC_L[[1]]$score, Correlation_lm_Score_AUC_L = Correlation_lm_AUC_L[[1]]$score,
Correlation_lm_intercept_r_K = Correlation_lm_r_K[[1]]$intercept, Correlation_lm_intercept_r_K = Correlation_lm_r_K[[1]]$intercept,
Correlation_lm_slope_r_K = Correlation_lm_r_K[[1]]$slope, Correlation_lm_slope_r_K = Correlation_lm_r_K[[1]]$slope,
Correlation_lm_R_squared_r_K = Correlation_lm_r_K[[1]]$r_squared, Correlation_lm_R_Squared_r_K = Correlation_lm_r_K[[1]]$r_squared,
Correlation_lm_Score_r_K = Correlation_lm_r_K[[1]]$score, Correlation_lm_Score_r_K = Correlation_lm_r_K[[1]]$score,
Correlation_lm_intercept_AUC_K = Correlation_lm_AUC_K[[1]]$intercept, Correlation_lm_intercept_AUC_K = Correlation_lm_AUC_K[[1]]$intercept,
Correlation_lm_slope_AUC_K = Correlation_lm_AUC_K[[1]]$slope, Correlation_lm_slope_AUC_K = Correlation_lm_AUC_K[[1]]$slope,
Correlation_lm_R_squared_AUC_K = Correlation_lm_AUC_K[[1]]$r_squared, Correlation_lm_R_Squared_AUC_K = Correlation_lm_AUC_K[[1]]$r_squared,
Correlation_lm_Score_AUC_K = Correlation_lm_AUC_K[[1]]$score, Correlation_lm_Score_AUC_K = Correlation_lm_AUC_K[[1]]$score,
Correlation_lm_intercept_AUC_r = Correlation_lm_AUC_r[[1]]$intercept, Correlation_lm_intercept_AUC_r = Correlation_lm_AUC_r[[1]]$intercept,
Correlation_lm_slope_AUC_r = Correlation_lm_AUC_r[[1]]$slope, Correlation_lm_slope_AUC_r = Correlation_lm_AUC_r[[1]]$slope,
Correlation_lm_R_squared_AUC_r = Correlation_lm_AUC_r[[1]]$r_squared, Correlation_lm_R_Squared_AUC_r = Correlation_lm_AUC_r[[1]]$r_squared,
Correlation_lm_Score_AUC_r = Correlation_lm_AUC_r[[1]]$score Correlation_lm_Score_AUC_r = Correlation_lm_AUC_r[[1]]$score
) )
} }
# Create the final calculations and interactions dataframes with specific columns for csv output # Create the final calculations and interactions dataframes with specific columns for csv output
# Trying to mimic original output data # Trying to mimic original output data
df_calculations <- calculations %>% calculations_csv <- calculations %>%
select(all_of(c( select(
group_vars, # necessary for full_data left_join all_of(group_vars),
"conc_num", "conc_num_factor", "conc_num_factor_factor", "N", conc_num, conc_num_factor, conc_num_factor_factor,
"mean_L", "median_L", "sd_L", "se_L", N, NG, DB, SM,
"mean_K", "median_K", "sd_K", "se_K", mean_L, median_L, sd_L, se_L,
"mean_r", "median_r", "sd_r", "se_r", mean_K, median_K, sd_K, se_K,
"mean_AUC", "median_AUC", "sd_AUC", "se_AUC", mean_r, median_r, sd_r, se_r,
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", mean_AUC, median_AUC, sd_AUC, se_AUC,
"Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC,
"WT_L", "WT_K", "WT_r", "WT_AUC", Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC,
"WT_sd_L", "WT_sd_K", "WT_sd_r", "WT_sd_AUC", WT_L, WT_K, WT_r, WT_AUC,
"Exp_L", "Exp_K", "Exp_r", "Exp_AUC", WT_sd_L, WT_sd_K, WT_sd_r, WT_sd_AUC,
"Delta_L", "Delta_K", "Delta_r", "Delta_AUC", Exp_L, Exp_K, Exp_r, Exp_AUC,
"mean_Delta_L", "mean_Delta_K", "mean_Delta_r", "mean_Delta_AUC", Delta_L, Delta_K, Delta_r, Delta_AUC,
"Zscore_L", "Zscore_K", "Zscore_r", "Zscore_AUC", Zscore_L, Zscore_K, Zscore_r, Zscore_AUC
"NG_sum", "DB_sum", "SM_sum" )
))) %>%
rename(NG = NG_sum, DB = DB_sum, SM = SM_sum)
df_interactions <- interactions %>% interactions_csv <- interactions %>%
select(any_of(c( select(
group_vars, # necessary for full_data left_join all_of(group_vars),
"Avg_Zscore_L", "Avg_Zscore_K", "Avg_Zscore_r", "Avg_Zscore_AUC", NG, DB, SM,
"Sum_Z_Score_L", "Sum_Z_Score_K", "Sum_Z_Score_r", "Sum_Z_Score_AUC", Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC,
"Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC", Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC,
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC,
"Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC,
"lm_Score_L", "lm_Score_K", "lm_Score_r", "lm_Score_AUC", Sum_Z_Score_L, Sum_Z_Score_K, Sum_Z_Score_r, Sum_Z_Score_AUC,
"R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC", lm_Score_L, lm_Score_K, lm_Score_r, lm_Score_AUC,
"NG_sum_int", "DB_sum_int", "SM_sum_int", R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC
"Rank_lm_R_squared_L", "Rank_lm_R_squared_K", "Rank_lm_R_squared_r", "Rank_lm_R_squared_AUC", )
"Correlation_lm_intercept_L", "Correlation_lm_slope_L", "Correlation_lm_R_squared_L", "Correlation_lm_Score_L",
"Correlation_lm_intercept_K", "Correlation_lm_slope_K", "Correlation_lm_R_squared_K", "Correlation_lm_Score_K",
"Correlation_lm_intercept_r", "Correlation_lm_slope_r", "Correlation_lm_R_squared_r", "Correlation_lm_Score_r",
"Correlation_lm_intercept_AUC", "Correlation_lm_slope_AUC", "Correlation_lm_R_squared_AUC", "Correlation_lm_Score_AUC",
"Correlation_lm_intercept_K_L", "Correlation_lm_slope_K_L", "Correlation_lm_R_squared_K_L", "Correlation_lm_Score_K_L",
"Correlation_lm_intercept_r_L", "Correlation_lm_slope_r_L", "Correlation_lm_R_squared_r_L", "Correlation_lm_Score_r_L",
"Correlation_lm_intercept_AUC_L", "Correlation_lm_slope_AUC_L", "Correlation_lm_R_squared_AUC_L", "Correlation_lm_Score_AUC_L",
"Correlation_lm_intercept_r_K", "Correlation_lm_slope_r_K", "Correlation_lm_R_squared_r_K", "Correlation_lm_Score_r_K",
"Correlation_lm_intercept_AUC_K", "Correlation_lm_slope_AUC_K", "Correlation_lm_R_squared_AUC_K", "Correlation_lm_Score_AUC_K",
"Correlation_lm_intercept_AUC_r", "Correlation_lm_slope_AUC_r", "Correlation_lm_R_squared_AUC_r", "Correlation_lm_Score_AUC_r",
"Overlap"
))) %>%
rename(NG = NG_sum_int, DB = DB_sum_int, SM = SM_sum_int)
# Avoid column collision on left join for overlapping variables # Filter the calculations dataframe so we don't clobber on left join
calculations_no_overlap <- calculations %>% calculations_selected <- calculations %>%
select(-any_of(c("DB", "NG", "SM", select(all_of(c(group_vars, setdiff(names(calculations), names(interactions)))))
# Don't need these anywhere so easier to remove
"Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC",
"R_Squared_L", "R_Squared_K", "R_Squared_r", "R_Squared_AUC",
# we need these for the interactions but the original code has the same names in both datasets
"Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC"
)))
full_data <- calculations_no_overlap %>% interactions_joined <- left_join(calculations_selected, interactions, by = group_vars)
left_join(interactions, by = group_vars)
# Return final dataframes # Return final dataframes
return(list( return(list(
calculations = df_calculations, calculations_csv = calculations_csv,
interactions = df_interactions, interactions_csv = interactions_csv,
full_data = full_data interactions = interactions,
interactions_joined = interactions_joined,
interactions_narm = interactions_narm
)) ))
} }
@@ -1068,9 +1040,9 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
} }
for (var in names(delta_limits_map)) { for (var in names(delta_limits_map)) {
y_var_name <- paste0("Delta_", var)
y_limits <- delta_limits_map[[var]] y_limits <- delta_limits_map[[var]]
y_span <- y_limits[2] - y_limits[1] y_span <- y_limits[2] - y_limits[1]
y_var_name <- paste0("Delta_", var)
# Anti-filter to select out-of-bounds rows # Anti-filter to select out-of-bounds rows
out_of_bounds <- group_data %>% out_of_bounds <- group_data %>%
@@ -1095,19 +1067,17 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
next # skip plot if insufficient data is available next # skip plot if insufficient data is available
} }
WT_sd_value <- first(group_data_filtered[[paste0("WT_sd_", var)]], default = 0)
Z_Shift_value <- round(first(group_data_filtered[[paste0("Z_Shift_", var)]], default = 0), 2) Z_Shift_value <- round(first(group_data_filtered[[paste0("Z_Shift_", var)]], default = 0), 2)
Z_lm_value <- round(first(group_data_filtered[[paste0("Z_lm_", var)]], default = 0), 2) Z_lm_value <- round(first(group_data_filtered[[paste0("Z_lm_", var)]], default = 0), 2)
R_squared_value <- round(first(group_data_filtered[[paste0("R_Squared_", var)]], default = 0), 2)
NG_value <- first(group_data_filtered$NG, default = 0) NG <- first(group_data_filtered$NG)
DB_value <- first(group_data_filtered$DB, default = 0) DB <- first(group_data_filtered$DB)
SM_value <- first(group_data_filtered$SM, default = 0) SM <- first(group_data_filtered$SM)
lm_intercept_col <- paste0("lm_intercept_", var) lm_intercept_col <- paste0("lm_intercept_", var)
lm_slope_col <- paste0("lm_slope_", var) lm_slope_col <- paste0("lm_slope_", var)
lm_intercept <- first(group_data_filtered[[lm_intercept_col]], default = 0) lm_intercept <- first(group_data_filtered[[lm_intercept_col]])
lm_slope <- first(group_data_filtered[[lm_slope_col]], default = 0) lm_slope <- first(group_data_filtered[[lm_slope_col]])
plot_config <- list( plot_config <- list(
df = group_data_filtered, df = group_data_filtered,
@@ -1122,10 +1092,9 @@ generate_interaction_plot_configs <- function(df_summary, df_interactions, type)
annotations = list( annotations = list(
list(x = 1, y = y_limits[2] - 0.1 * y_span, label = paste(" ZShift =", round(Z_Shift_value, 2))), list(x = 1, y = y_limits[2] - 0.1 * y_span, label = paste(" ZShift =", round(Z_Shift_value, 2))),
list(x = 1, y = y_limits[2] - 0.2 * y_span, label = paste(" lm ZScore =", round(Z_lm_value, 2))), list(x = 1, y = y_limits[2] - 0.2 * y_span, label = paste(" lm ZScore =", round(Z_lm_value, 2))),
# list(x = 1, y = y_limits[2] - 0.3 * y_span, label = paste(" R-squared =", round(R_squared_value, 2))), list(x = 1, y = y_limits[1] + 0.1 * y_span, label = paste("NG =", NG)),
list(x = 1, y = y_limits[1] + 0.1 * y_span, label = paste("NG =", NG_value)), list(x = 1, y = y_limits[1] + 0.05 * y_span, label = paste("DB =", DB)),
list(x = 1, y = y_limits[1] + 0.05 * y_span, label = paste("DB =", DB_value)), list(x = 1, y = y_limits[1], label = paste("SM =", SM))
list(x = 1, y = y_limits[1], label = paste("SM =", SM_value))
), ),
error_bar = TRUE, error_bar = TRUE,
error_bar_params = list( error_bar_params = list(
@@ -1267,7 +1236,7 @@ generate_correlation_plot_configs <- function(df, df_reference) {
intercept <- df[[paste0("Correlation_lm_intercept_", rel$y, "_", rel$x)]][1] intercept <- df[[paste0("Correlation_lm_intercept_", rel$y, "_", rel$x)]][1]
slope <- df[[paste0("Correlation_lm_slope_", rel$y, "_", rel$x)]][1] slope <- df[[paste0("Correlation_lm_slope_", rel$y, "_", rel$x)]][1]
r_squared <- df[[paste0("Correlation_lm_R_squared_", rel$y, "_", rel$x)]][1] r_squared <- df[[paste0("Correlation_lm_R_Squared_", rel$y, "_", rel$x)]][1]
r_squared_rounded <- round(r_squared, 4) r_squared_rounded <- round(r_squared, 4)
r_squared_label <- paste("R-squared =", r_squared_rounded) r_squared_label <- paste("R-squared =", r_squared_rounded)
xmin <- min(c(min(df[[x_var]]), min(df_reference[[x_var]]))) xmin <- min(c(min(df[[x_var]]), min(df_reference[[x_var]])))
@@ -1355,7 +1324,7 @@ main <- function() {
df_stats <- calculate_summary_stats( # formerly X_stats_ALL df_stats <- calculate_summary_stats( # formerly X_stats_ALL
df = df, df = df,
variables = c("L", "K", "r", "AUC", "delta_bg"), variables = c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("conc_num", "conc_num_factor_factor"))$df_with_stats group_vars = c("conc_num", "conc_num_factor_factor"))$stats_joined
frequency_delta_bg_plot_configs <- list( frequency_delta_bg_plot_configs <- list(
plots = list( plots = list(
@@ -1422,11 +1391,10 @@ main <- function() {
df = df_na, df = df_na,
variables = c("L", "K", "r", "AUC", "delta_bg"), variables = c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("conc_num", "conc_num_factor_factor")) group_vars = c("conc_num", "conc_num_factor_factor"))
df_na_ss <- ss$summary_stats stats_na <- ss$stats_joined # formerly X_stats_ALL
df_na_stats <- ss$df_with_stats # formerly X_stats_ALL write.csv(ss$stats_csv, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
# This can help bypass missing values ggplot warnings during testing # This can help bypass missing values ggplot warnings during testing
df_na_stats_filtered <- df_na_stats %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite)) stats_na_filtered <- stats_na %>% filter(if_all(all_of(c("L", "K", "r", "AUC", "delta_bg")), is.finite))
message("Calculating summary statistics excluding zero values") message("Calculating summary statistics excluding zero values")
df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero df_no_zeros <- df_na %>% filter(L > 0) # formerly X_noZero
@@ -1434,40 +1402,40 @@ main <- function() {
df = df_no_zeros, df = df_no_zeros,
variables = c("L", "K", "r", "AUC", "delta_bg"), variables = c("L", "K", "r", "AUC", "delta_bg"),
group_vars = c("conc_num", "conc_num_factor_factor") group_vars = c("conc_num", "conc_num_factor_factor")
)$df_with_stats )$stats_joined
message("Filtering by 2SD of K") message("Filtering by 2SD of K")
df_na_within_2sd_k <- df_na_stats %>% df_na_within_2sd_k <- stats_na %>%
filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
df_na_outside_2sd_k <- df_na_stats %>% df_na_outside_2sd_k <- stats_na %>%
filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
message("Calculating summary statistics for L within 2SD of K") message("Calculating summary statistics for L within 2SD of K")
# TODO We're omitting the original z_max calculation, not sure if needed? # TODO We're omitting the original z_max calculation, not sure if needed?
ss <- calculate_summary_stats(df_na_within_2sd_k, "L", # formerly X_stats_BY_L_within_2SD_K ss <- calculate_summary_stats(df_na_within_2sd_k, "L", # formerly X_stats_BY_L_within_2SD_K
group_vars = c("conc_num", "conc_num_factor_factor"))$summary_stats group_vars = c("conc_num", "conc_num_factor_factor"))
write.csv(ss, write.csv(ss$stats_csv,
file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"), file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2SD_K.csv"),
row.names = FALSE) row.names = FALSE)
message("Calculating summary statistics for L outside 2SD of K") message("Calculating summary statistics for L outside 2SD of K")
ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", # formerly X_stats_BY_L_outside_2SD_K ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", # formerly X_stats_BY_L_outside_2SD_K
group_vars = c("conc_num", "conc_num_factor_factor")) group_vars = c("conc_num", "conc_num_factor_factor"))
df_na_l_outside_2sd_k_stats <- ss$df_with_stats df_na_l_outside_2sd_k_stats <- ss$stats_joined
write.csv(ss$summary_stats, write.csv(ss$stats_csv,
file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2SD_K.csv"), file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2SD_K.csv"),
row.names = FALSE) row.names = FALSE)
plate_analysis_plot_configs <- generate_plate_analysis_plot_configs( plate_analysis_plot_configs <- generate_plate_analysis_plot_configs(
variables = c("L", "K", "r", "AUC", "delta_bg"), variables = c("L", "K", "r", "AUC", "delta_bg"),
df_before = df_stats, df_before = df_stats,
df_after = df_na_stats_filtered df_after = stats_na_filtered
) )
plate_analysis_boxplot_configs <- generate_plate_analysis_plot_configs( plate_analysis_boxplot_configs <- generate_plate_analysis_plot_configs(
variables = c("L", "K", "r", "AUC", "delta_bg"), variables = c("L", "K", "r", "AUC", "delta_bg"),
df_before = df_stats, df_before = df_stats,
df_after = df_na_stats_filtered, df_after = stats_na_filtered,
plot_type = "box" plot_type = "box"
) )
@@ -1587,17 +1555,17 @@ main <- function() {
filter(!is.na(L)) filter(!is.na(L))
message("Calculating background summary statistics") message("Calculating background summary statistics")
ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), # formerly X_stats_BY ss <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), # formerly X_stats_BY
group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor")) group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor"))
summary_stats_bg <- ss_bg$summary_stats stats_background_csv <- ss$stats_csv
df_bg_stats <- ss_bg$df_with_stats stats_background <- ss$stats_joined
write.csv( write.csv(
summary_stats_bg, stats_background_csv,
file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")), file = file.path(out_dir, paste0("summary_stats_background_strain_", strain, ".csv")),
row.names = FALSE) row.names = FALSE)
message("Setting missing reference values to the highest theoretical value at each drug conc for L") message("Setting missing reference values to the highest theoretical value at each drug conc for L")
df_reference <- df_na_stats %>% # formerly X2_RF df_reference <- stats_na %>% # formerly X2_RF
filter(OrfRep == strain) %>% filter(OrfRep == strain) %>%
filter(!is.na(L)) %>% filter(!is.na(L)) %>%
group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>% group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>%
@@ -1609,14 +1577,14 @@ main <- function() {
ungroup() ungroup()
message("Calculating reference strain summary statistics") message("Calculating reference strain summary statistics")
df_reference_summary_stats <- calculate_summary_stats( # formerly X_stats_X2_RF stats_reference <- calculate_summary_stats( # formerly X_stats_X2_RF
df = df_reference, df = df_reference,
variables = c("L", "K", "r", "AUC"), variables = c("L", "K", "r", "AUC"),
group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor") group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor")
)$df_with_stats )$stats_joined
# Summarise statistics for error bars # Summarise statistics for error bars
df_reference_summary_stats <- df_reference_summary_stats %>% stats_reference <- stats_reference %>%
group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>% group_by(OrfRep, Drug, conc_num, conc_num_factor_factor) %>%
mutate( mutate(
mean_mean_L = first(mean_L), mean_mean_L = first(mean_L),
@@ -1631,26 +1599,27 @@ main <- function() {
) )
message("Calculating reference strain interaction summary statistics") # formerly X_stats_interaction message("Calculating reference strain interaction summary statistics") # formerly X_stats_interaction
df_reference_interaction_stats <- calculate_summary_stats( stats_interactions_reference <- calculate_summary_stats(
df = df_reference, df = df_reference,
variables = c("L", "K", "r", "AUC"), variables = c("L", "K", "r", "AUC"),
group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor") group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor")
)$df_with_stats )$stats_joined
message("Calculating reference strain interaction scores") message("Calculating reference strain interaction scores")
reference_results <- calculate_interaction_scores(df_reference_interaction_stats, df_bg_stats, "reference") reference_results <- calculate_interaction_scores(stats_interactions_reference, stats_background, "reference")
df_reference_calculations <- reference_results$calculations calculations_reference_csv <- reference_results$calculations_csv
df_reference_interactions_joined <- reference_results$full_data interactions_reference_csv <- reference_results$interactions_csv
df_reference_interactions <- reference_results$interactions interactions_reference <- reference_results$interactions
write.csv(df_reference_calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE) interactions_reference_joined <- reference_results$interactions_joined
write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE) write.csv(calculations_reference_csv, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
write.csv(interactions_reference_csv, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
# message("Generating reference interaction plots") message("Generating reference interaction plots")
# reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference") reference_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_reference_joined, "reference")
# generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16) generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
message("Setting missing deletion values to the highest theoretical value at each drug conc for L") message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
df_deletion <- df_na_stats %>% # formerly X2 deletion_adjusted <- stats_na %>% # formerly X2
filter(OrfRep != strain) %>% filter(OrfRep != strain) %>%
filter(!is.na(L)) %>% filter(!is.na(L)) %>%
group_by(OrfRep, Gene, conc_num, conc_num_factor_factor) %>% group_by(OrfRep, Gene, conc_num, conc_num_factor_factor) %>%
@@ -1662,96 +1631,94 @@ main <- function() {
ungroup() ungroup()
message("Calculating deletion strain(s) interaction summary statistics") message("Calculating deletion strain(s) interaction summary statistics")
df_deletion_stats <- calculate_summary_stats( deletion_stats <- calculate_summary_stats(
df = df_deletion, df = deletion_adjusted,
variables = c("L", "K", "r", "AUC"), variables = c("L", "K", "r", "AUC"),
group_vars = c("OrfRep", "Gene", "Drug", "conc_num", "conc_num_factor_factor") group_vars = c("OrfRep", "Gene", "Drug", "conc_num", "conc_num_factor_factor")
)$df_with_stats )$stats_joined
message("Calculating deletion strain(s) interactions scores") message("Calculating deletion strain(s) interactions scores")
deletion_results <- calculate_interaction_scores(df_deletion_stats, df_bg_stats, "deletion") deletion_results <- calculate_interaction_scores(deletion_stats, stats_background, "deletion")
df_calculations <- deletion_results$calculations calculations_csv <- deletion_results$calculations_csv
df_interactions <- deletion_results$interactions interactions_csv <- deletion_results$interactions_csv
df_interactions_joined <- deletion_results$full_data interactions <- deletion_results$interactions
write.csv(df_calculations, file = file.path(out_dir, "zscore_calculations.csv"), row.names = FALSE) interactions_joined <- deletion_results$interactions_joined
write.csv(df_interactions, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE) interactions_narm <- deletion_results$interactions_narm
write.csv(calculations_csv, file = file.path(out_dir, "zscore_calculations.csv"), row.names = FALSE)
write.csv(interactions_csv, file = file.path(out_dir, "zscore_interactions.csv"), row.names = FALSE)
# message("Generating deletion interaction plots") message("Generating deletion interaction plots")
# deletion_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_interactions_joined, "deletion") deletion_plot_configs <- generate_interaction_plot_configs(stats_reference, interactions_joined, "deletion")
# generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16) generate_and_save_plots(out_dir, "interaction_plots", deletion_plot_configs, page_width = 16, page_height = 16)
# message("Writing enhancer/suppressor csv files") message("Writing enhancer/suppressor csv files")
# interaction_threshold <- 2 # TODO add to study config? interaction_threshold <- 2 # TODO add to study config?
# enhancer_condition_L <- df_interactions$Avg_Zscore_L >= interaction_threshold enhancer_condition_L <- interactions$Avg_Zscore_L >= interaction_threshold
# suppressor_condition_L <- df_interactions$Avg_Zscore_L <= -interaction_threshold suppressor_condition_L <- interactions$Avg_Zscore_L <= -interaction_threshold
# enhancer_condition_K <- df_interactions$Avg_Zscore_K >= interaction_threshold enhancer_condition_K <- interactions$Avg_Zscore_K >= interaction_threshold
# suppressor_condition_K <- df_interactions$Avg_Zscore_K <= -interaction_threshold suppressor_condition_K <- interactions$Avg_Zscore_K <= -interaction_threshold
# enhancers_L <- df_interactions[enhancer_condition_L, ] enhancers_L <- interactions[enhancer_condition_L, ]
# suppressors_L <- df_interactions[suppressor_condition_L, ] suppressors_L <- interactions[suppressor_condition_L, ]
# enhancers_K <- df_interactions[enhancer_condition_K, ] enhancers_K <- interactions[enhancer_condition_K, ]
# suppressors_K <- df_interactions[suppressor_condition_K, ] suppressors_K <- interactions[suppressor_condition_K, ]
# enhancers_and_suppressors_L <- df_interactions[enhancer_condition_L | suppressor_condition_L, ] enhancers_and_suppressors_L <- interactions[enhancer_condition_L | suppressor_condition_L, ]
# enhancers_and_suppressors_K <- df_interactions[enhancer_condition_K | suppressor_condition_K, ] enhancers_and_suppressors_K <- interactions[enhancer_condition_K | suppressor_condition_K, ]
# write.csv(enhancers_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_L.csv"), row.names = FALSE) write.csv(enhancers_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_L.csv"), row.names = FALSE)
# write.csv(suppressors_L, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_L.csv"), row.names = FALSE) write.csv(suppressors_L, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_L.csv"), row.names = FALSE)
# write.csv(enhancers_K, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_K.csv"), row.names = FALSE) write.csv(enhancers_K, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_K.csv"), row.names = FALSE)
# write.csv(suppressors_K, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_K.csv"), row.names = FALSE) write.csv(suppressors_K, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_K.csv"), row.names = FALSE)
# write.csv(enhancers_and_suppressors_L, write.csv(enhancers_and_suppressors_L,
# file = file.path(out_dir, "zscore_interactions_deletion_enhancers_and_suppressors_L.csv"), row.names = FALSE) file = file.path(out_dir, "zscore_interactions_deletion_enhancers_and_suppressors_L.csv"), row.names = FALSE)
# write.csv(enhancers_and_suppressors_K, write.csv(enhancers_and_suppressors_K,
# file = file.path(out_dir, "zscore_interaction_deletion_enhancers_and_suppressors_K.csv"), row.names = FALSE) file = file.path(out_dir, "zscore_interaction_deletion_enhancers_and_suppressors_K.csv"), row.names = FALSE)
# message("Writing linear model enhancer/suppressor csv files") message("Writing linear model enhancer/suppressor csv files")
# lm_interaction_threshold <- 2 # TODO add to study config? lm_interaction_threshold <- 2 # TODO add to study config?
# enhancers_lm_L <- df_interactions[df_interactions$Z_lm_L >= lm_interaction_threshold, ] enhancers_lm_L <- interactions[interactions$Z_lm_L >= lm_interaction_threshold, ]
# suppressors_lm_L <- df_interactions[df_interactions$Z_lm_L <= -lm_interaction_threshold, ] suppressors_lm_L <- interactions[interactions$Z_lm_L <= -lm_interaction_threshold, ]
# enhancers_lm_K <- df_interactions[df_interactions$Z_lm_K >= lm_interaction_threshold, ] enhancers_lm_K <- interactions[interactions$Z_lm_K >= lm_interaction_threshold, ]
# suppressors_lm_K <- df_interactions[df_interactions$Z_lm_K <= -lm_interaction_threshold, ] suppressors_lm_K <- interactions[interactions$Z_lm_K <= -lm_interaction_threshold, ]
# write.csv(enhancers_lm_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_L.csv"), row.names = FALSE) write.csv(enhancers_lm_L, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_L.csv"), row.names = FALSE)
# write.csv(suppressors_lm_L, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_L.csv"), row.names = FALSE) write.csv(suppressors_lm_L, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_L.csv"), row.names = FALSE)
# write.csv(enhancers_lm_K, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_K.csv"), row.names = FALSE) write.csv(enhancers_lm_K, file = file.path(out_dir, "zscore_interactions_deletion_enhancers_lm_K.csv"), row.names = FALSE)
# write.csv(suppressors_lm_K, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_K.csv"), row.names = FALSE) write.csv(suppressors_lm_K, file = file.path(out_dir, "zscore_interactions_deletion_suppressors_lm_K.csv"), row.names = FALSE)
# message("Generating rank plots") message("Generating rank plots")
# rank_plot_configs <- generate_rank_plot_configs( rank_plot_configs <- generate_rank_plot_configs(
# df_interactions, interactions,
# is_lm = FALSE, is_lm = FALSE,
# ) )
# generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs, generate_and_save_plots(out_dir, "rank_plots", rank_plot_configs, page_width = 18, page_height = 12)
# page_width = 18, page_height = 12)
# message("Generating ranked linear model plots") message("Generating ranked linear model plots")
# rank_lm_plot_configs <- generate_rank_plot_configs( rank_lm_plot_configs <- generate_rank_plot_configs(
# df_interactions, interactions,
# is_lm = TRUE, is_lm = TRUE,
# ) )
# generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs, generate_and_save_plots(out_dir, "rank_plots_lm", rank_lm_plot_configs, page_width = 18, page_height = 12)
# page_width = 18, page_height = 12)
# message("Generating overlapped ranked plots") message("Generating overlapped ranked plots")
# rank_plot_filtered_configs <- generate_rank_plot_configs( rank_plot_filtered_configs <- generate_rank_plot_configs(
# df_interactions, interactions_narm,
# is_lm = FALSE, is_lm = FALSE,
# filter_na = TRUE, # filter_na = TRUE,
# overlap_color = TRUE overlap_color = TRUE
# ) )
# generate_and_save_plots(out_dir, "rank_plots_na_rm", rank_plot_filtered_configs, generate_and_save_plots(out_dir, "rank_plots_na_rm", rank_plot_filtered_configs, page_width = 18, page_height = 12)
# page_width = 18, page_height = 12)
# message("Generating overlapped ranked linear model plots") message("Generating overlapped ranked linear model plots")
# rank_plot_lm_filtered_configs <- generate_rank_plot_configs( rank_plot_lm_filtered_configs <- generate_rank_plot_configs(
# df_interactions, interactions_narm,
# is_lm = TRUE, is_lm = TRUE,
# filter_na = TRUE, # filter_na = TRUE,
# overlap_color = TRUE overlap_color = TRUE
# ) )
# generate_and_save_plots(out_dir, "rank_plots_lm_na_rm", rank_plot_lm_filtered_configs, generate_and_save_plots(out_dir, "rank_plots_lm_na_rm", rank_plot_lm_filtered_configs, page_width = 18, page_height = 12)
# page_width = 18, page_height = 12)
message("Generating correlation curve parameter pair plots") message("Generating correlation curve parameter pair plots")
correlation_plot_configs <- generate_correlation_plot_configs( correlation_plot_configs <- generate_correlation_plot_configs(
df_interactions, interactions_narm,
df_reference_interactions interactions_reference
) )
generate_and_save_plots(out_dir, "correlation_cpps", correlation_plot_configs, generate_and_save_plots(out_dir, "correlation_cpps", correlation_plot_configs,
page_width = 10, page_height = 7) page_width = 10, page_height = 7)