Pre sd/se fixes
This commit is contained in:
@@ -183,6 +183,10 @@ process_strains <- function(df) {
|
|||||||
|
|
||||||
# Calculate summary statistics for all variables
|
# Calculate summary statistics for all variables
|
||||||
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
|
calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
|
||||||
|
# Replace 0 values with NA
|
||||||
|
df <- df %>%
|
||||||
|
mutate(across(all_of(variables), ~ifelse(. == 0, NA, .)))
|
||||||
|
|
||||||
# Calculate summary statistics, including a single N based on L
|
# Calculate summary statistics, including a single N based on L
|
||||||
summary_stats <- df %>%
|
summary_stats <- df %>%
|
||||||
group_by(across(all_of(group_vars))) %>%
|
group_by(across(all_of(group_vars))) %>%
|
||||||
@@ -193,44 +197,50 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
|
|||||||
median = ~median(., na.rm = TRUE),
|
median = ~median(., na.rm = TRUE),
|
||||||
max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
|
max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
|
||||||
min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
|
min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
|
||||||
sd = ~sd(., na.rm = TRUE),
|
sd = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE), 0), # If N == 1, sd is set to 0
|
||||||
se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), NA)
|
se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), 0) # If N == 1, se is set to 0
|
||||||
), .names = "{.fn}_{.col}")
|
), .names = "{.fn}_{.col}")
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get the column names from the summary_stats dataframe (excluding the group_vars)
|
|
||||||
stat_columns <- setdiff(names(summary_stats), group_vars)
|
|
||||||
|
|
||||||
# Remove existing stats columns from df if they already exist
|
# Remove existing stats columns from df if they already exist
|
||||||
|
stat_columns <- setdiff(names(summary_stats), group_vars)
|
||||||
df_cleaned <- df %>% select(-any_of(stat_columns))
|
df_cleaned <- df %>% select(-any_of(stat_columns))
|
||||||
|
|
||||||
# Join the summary stats back to the cleaned original dataframe
|
# Join the summary stats back to the cleaned original dataframe
|
||||||
df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
|
df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
|
||||||
|
|
||||||
# Return both the summary stats and the updated dataframe
|
|
||||||
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
|
return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
|
||||||
}
|
}
|
||||||
|
|
||||||
# Calculate interaction scores
|
# Calculate interaction scores
|
||||||
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
|
calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
|
||||||
|
|
||||||
|
if (nrow(df) == 0) {
|
||||||
|
message("Dataframe is empty after filtering")
|
||||||
|
return(NULL) # Or handle the empty dataframe case as needed
|
||||||
|
}
|
||||||
|
|
||||||
# Calculate total concentration variables
|
# Calculate total concentration variables
|
||||||
total_conc_num <- length(unique(df$conc_num))
|
total_conc_num <- length(unique(df$conc_num))
|
||||||
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
|
num_non_removed_concs <- total_conc_num - sum(df$DB, na.rm = TRUE) - 1
|
||||||
|
|
||||||
# Pull the background means
|
# Pull the background means and standard deviations
|
||||||
print("Calculating background means")
|
print("Calculating background means")
|
||||||
bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first()
|
print(head(df))
|
||||||
bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first()
|
bg_means <- list(
|
||||||
bg_r <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first()
|
L = df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first(),
|
||||||
bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
|
K = df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first(),
|
||||||
|
r = df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first(),
|
||||||
|
AUC = df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
|
||||||
|
)
|
||||||
|
bg_sd <- list(
|
||||||
|
L = df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first(),
|
||||||
|
K = df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first(),
|
||||||
|
r = df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first(),
|
||||||
|
AUC = df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
|
||||||
|
)
|
||||||
|
|
||||||
bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first()
|
# First mutate block to calculate NG, DB, SM, and N
|
||||||
bg_sd_K <- df %>% filter(conc_num_factor == 0) %>% pull(sd_K) %>% first()
|
|
||||||
bg_sd_r <- df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first()
|
|
||||||
bg_sd_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
|
|
||||||
|
|
||||||
# First mutate block to calculate NG, DB, SM
|
|
||||||
print("Calculating interaction scores part 1")
|
print("Calculating interaction scores part 1")
|
||||||
interaction_scores <- df %>%
|
interaction_scores <- df %>%
|
||||||
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
|
group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
|
||||||
@@ -238,38 +248,59 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
|||||||
NG = sum(NG, na.rm = TRUE),
|
NG = sum(NG, na.rm = TRUE),
|
||||||
DB = sum(DB, na.rm = TRUE),
|
DB = sum(DB, na.rm = TRUE),
|
||||||
SM = sum(SM, na.rm = TRUE),
|
SM = sum(SM, na.rm = TRUE),
|
||||||
N = ~sum(!is.na(L)), # Count of non-NA values
|
N = sum(!is.na(L)) # Count of non-NA values in L
|
||||||
)
|
)
|
||||||
|
|
||||||
# Second mutate block to calculate variables and Delta using NG
|
# Calculate Raw_Shift and Z_Shift for each variable
|
||||||
|
print("Calculating Raw_Shift and Z_Shift")
|
||||||
interaction_scores <- interaction_scores %>%
|
interaction_scores <- interaction_scores %>%
|
||||||
mutate(across(all_of(variables), list(
|
|
||||||
mean = ~mean(., na.rm = TRUE),
|
|
||||||
median = ~median(., na.rm = TRUE),
|
|
||||||
max = ~max(., na.rm = TRUE),
|
|
||||||
min = ~min(., na.rm = TRUE),
|
|
||||||
sd = ~sd(., na.rm = TRUE),
|
|
||||||
se = ~sd(., na.rm = TRUE) / sqrt(N - 1), # Standard error calculation
|
|
||||||
Raw_Shift = ~mean(., na.rm = TRUE) - case_when(
|
|
||||||
cur_column() == "L" ~ bg_L,
|
|
||||||
cur_column() == "K" ~ bg_K,
|
|
||||||
cur_column() == "r" ~ bg_r,
|
|
||||||
cur_column() == "AUC" ~ bg_AUC
|
|
||||||
),
|
|
||||||
Z_Shift = ~ Raw_Shift / case_when(
|
|
||||||
cur_column() == "L" ~ bg_sd_L,
|
|
||||||
cur_column() == "K" ~ bg_sd_K,
|
|
||||||
cur_column() == "r" ~ bg_sd_r,
|
|
||||||
cur_column() == "AUC" ~ bg_sd_AUC
|
|
||||||
),
|
|
||||||
WT = ~mean(., na.rm = TRUE),
|
|
||||||
WT_sd = ~sd(., na.rm = TRUE),
|
|
||||||
Exp = ~ WT + Raw_Shift,
|
|
||||||
Delta = ~ WT - Exp,
|
|
||||||
Delta = if_else(NG == 1, mean - WT, Delta)
|
|
||||||
), .names = "{.fn}_{.col}")) %>%
|
|
||||||
mutate(
|
mutate(
|
||||||
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set to max values in Z score calculation
|
Raw_Shift_L = mean_L - bg_means$L,
|
||||||
|
Raw_Shift_K = mean_K - bg_means$K,
|
||||||
|
Raw_Shift_r = mean_r - bg_means$r,
|
||||||
|
Raw_Shift_AUC = mean_AUC - bg_means$AUC,
|
||||||
|
Z_Shift_L = Raw_Shift_L / bg_sd$L,
|
||||||
|
Z_Shift_K = Raw_Shift_K / bg_sd$K,
|
||||||
|
Z_Shift_r = Raw_Shift_r / bg_sd$r,
|
||||||
|
Z_Shift_AUC = Raw_Shift_AUC / bg_sd$AUC
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate WT and WT_sd for each variable
|
||||||
|
print("Calculating WT and WT_sd")
|
||||||
|
interaction_scores <- interaction_scores %>%
|
||||||
|
mutate(
|
||||||
|
WT_L = mean(mean_L, na.rm = TRUE),
|
||||||
|
WT_K = mean(mean_K, na.rm = TRUE),
|
||||||
|
WT_r = mean(mean_r, na.rm = TRUE),
|
||||||
|
WT_AUC = mean(mean_AUC, na.rm = TRUE),
|
||||||
|
WT_sd_L = sd(mean_L, na.rm = TRUE),
|
||||||
|
WT_sd_K = sd(mean_K, na.rm = TRUE),
|
||||||
|
WT_sd_r = sd(mean_r, na.rm = TRUE),
|
||||||
|
WT_sd_AUC = sd(mean_AUC, na.rm = TRUE)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate Exp and Delta for each variable
|
||||||
|
print("Calculating Exp and Delta")
|
||||||
|
interaction_scores <- interaction_scores %>%
|
||||||
|
mutate(
|
||||||
|
Exp_L = WT_L + Raw_Shift_L,
|
||||||
|
Delta_L = WT_L - Exp_L,
|
||||||
|
Exp_K = WT_K + Raw_Shift_K,
|
||||||
|
Delta_K = WT_K - Exp_K,
|
||||||
|
Exp_r = WT_r + Raw_Shift_r,
|
||||||
|
Delta_r = WT_r - Exp_r,
|
||||||
|
Exp_AUC = WT_AUC + Raw_Shift_AUC,
|
||||||
|
Delta_AUC = WT_AUC - Exp_AUC
|
||||||
|
)
|
||||||
|
|
||||||
|
# Final adjustment to Delta for NG and SM conditions
|
||||||
|
interaction_scores <- interaction_scores %>%
|
||||||
|
mutate(
|
||||||
|
Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L),
|
||||||
|
Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K),
|
||||||
|
Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r),
|
||||||
|
Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
|
||||||
|
Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set-to-max values in Z score calculation
|
||||||
) %>%
|
) %>%
|
||||||
ungroup()
|
ungroup()
|
||||||
|
|
||||||
@@ -277,29 +308,29 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
|
|||||||
print("Calculating interaction scores part 2")
|
print("Calculating interaction scores part 2")
|
||||||
interaction_scores_all <- interaction_scores %>%
|
interaction_scores_all <- interaction_scores %>%
|
||||||
group_by(across(all_of(group_vars))) %>%
|
group_by(across(all_of(group_vars))) %>%
|
||||||
mutate(across(all_of(variables), list(
|
mutate(lm_score_L = max_conc * coef(lm(Delta_L ~ conc_num_factor))[2] + coef(lm(Delta_L ~ conc_num_factor))[1],
|
||||||
lm_score = ~ max_conc * lm(Delta ~ conc_num_factor)$coefficients[2] + lm(Delta ~ conc_num_factor)$coefficients[1],
|
lm_score_K = max_conc * coef(lm(Delta_K ~ conc_num_factor))[2] + coef(lm(Delta_K ~ conc_num_factor))[1],
|
||||||
lm_sd = ~ sd(lm_score),
|
lm_score_r = max_conc * coef(lm(Delta_r ~ conc_num_factor))[2] + coef(lm(Delta_r ~ conc_num_factor))[1],
|
||||||
lm_mean = ~ mean(lm_score),
|
lm_score_AUC = max_conc * coef(lm(Delta_AUC ~ conc_num_factor))[2] + coef(lm(Delta_AUC ~ conc_num_factor))[1]) %>%
|
||||||
Z_lm = ~ (lm_score - lm_mean) / lm_sd,
|
|
||||||
Sum_Zscore = ~ sum(Zscore, na.rm = TRUE)
|
|
||||||
), .names = "{.fn}_{.col}")) %>%
|
|
||||||
mutate(
|
mutate(
|
||||||
Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
|
Avg_Zscore_L = sum(Z_Shift_L, na.rm = TRUE) / num_non_removed_concs,
|
||||||
Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
|
Avg_Zscore_K = sum(Z_Shift_K, na.rm = TRUE) / num_non_removed_concs,
|
||||||
Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
|
Avg_Zscore_r = sum(Z_Shift_r, na.rm = TRUE) / (total_conc_num - 1),
|
||||||
Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1)
|
Avg_Zscore_AUC = sum(Z_Shift_AUC, na.rm = TRUE) / (total_conc_num - 1)
|
||||||
) %>%
|
)
|
||||||
ungroup()
|
|
||||||
|
|
||||||
|
# Arrange the results by Z_lm_L and NG
|
||||||
interaction_scores_all <- interaction_scores_all %>%
|
interaction_scores_all <- interaction_scores_all %>%
|
||||||
arrange(desc(Z_lm_L)) %>%
|
arrange(desc(lm_score_L)) %>%
|
||||||
arrange(desc(NG))
|
arrange(desc(NG)) %>%
|
||||||
|
ungroup()
|
||||||
|
|
||||||
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
|
return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
interaction_plot_configs <- function(df, variable) {
|
interaction_plot_configs <- function(df, variable) {
|
||||||
ylim_vals <- switch(variable,
|
ylim_vals <- switch(variable,
|
||||||
"L" = c(-65, 65),
|
"L" = c(-65, 65),
|
||||||
@@ -650,7 +681,7 @@ main <- function() {
|
|||||||
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
|
write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
|
||||||
|
|
||||||
print("Summary stats:")
|
print("Summary stats:")
|
||||||
print(head(summary_stats))
|
print(head(summary_stats), width = 200)
|
||||||
|
|
||||||
# TODO: Originally this filtered L NA's
|
# TODO: Originally this filtered L NA's
|
||||||
# Let's try to avoid for now since stats have already been calculated
|
# Let's try to avoid for now since stats have already been calculated
|
||||||
@@ -702,17 +733,17 @@ main <- function() {
|
|||||||
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
print("Background summary stats:")
|
#print("Background summary stats:")
|
||||||
print(head(summary_stats_bg))
|
#print(head(summary_stats_bg))
|
||||||
|
|
||||||
# Filter reference and deletion strains
|
# Filter reference and deletion strains
|
||||||
# Formerly X2_RF (reference strain)
|
# Formerly X2_RF (reference strain)
|
||||||
df_reference <- df_bg_stats %>%
|
df_reference <- df_na_stats %>%
|
||||||
filter(OrfRep == strain) %>%
|
filter(OrfRep == strain) %>%
|
||||||
mutate(SM = 0)
|
mutate(SM = 0)
|
||||||
|
|
||||||
# Formerly X2 (deletion strains)
|
# Formerly X2 (deletion strains)
|
||||||
df_deletion <- df_bg_stats %>%
|
df_deletion <- df_na_stats %>%
|
||||||
filter(OrfRep != strain) %>%
|
filter(OrfRep != strain) %>%
|
||||||
mutate(SM = 0)
|
mutate(SM = 0)
|
||||||
|
|
||||||
@@ -722,7 +753,12 @@ main <- function() {
|
|||||||
# Calculate interactions
|
# Calculate interactions
|
||||||
variables <- c("L", "K", "r", "AUC")
|
variables <- c("L", "K", "r", "AUC")
|
||||||
# We are recalculating some of the data here
|
# We are recalculating some of the data here
|
||||||
|
message("Calculating interaction scores")
|
||||||
|
print("Reference strain:")
|
||||||
|
print(head(reference_strain))
|
||||||
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
|
reference_results <- calculate_interaction_scores(reference_strain, max_conc, variables)
|
||||||
|
print("Deletion strains:")
|
||||||
|
print(head(deletion_strains))
|
||||||
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
|
deletion_results <- calculate_interaction_scores(deletion_strains, max_conc, variables)
|
||||||
|
|
||||||
zscores_calculations_reference <- reference_results$zscores_calculations
|
zscores_calculations_reference <- reference_results$zscores_calculations
|
||||||
|
|||||||
Reference in New Issue
Block a user