Create and join dataframes in calculate_summary_stats

commit 3cfd324309
parent 75ba5939a3
Date: 2024-09-05 19:42:48 -04:00

@@ -112,21 +112,12 @@ scale_colour_publication <- function(...) {
 load_and_process_data <- function(easy_results_file, sd = 3) {
   df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
 
-  # Clean and convert columns to numeric where appropriate
   df <- df %>%
     filter(!(.[[1]] %in% c("", "Scan"))) %>%
     filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>%
     mutate(
-      Col = as.numeric(Col),
-      Row = as.numeric(Row),
-      num = as.numeric(Num.),
-      L = as.numeric(l),
-      K = as.numeric(K),
-      r = as.numeric(r),
-      scan = as.numeric(Scan),
-      AUC = as.numeric(AUC96),
-      last_bg = as.numeric(LstBackgrd),
-      first_bg = as.numeric(X1stBackgrd),
+      across(c(Col, Row, Num.), as.numeric),
+      across(c(L, K, r, scan, AUC96, LstBackgrd, X1stBackgrd), as.numeric),
       delta_bg = last_bg - first_bg,
       delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)),
       NG = if_else(L == 0 & !is.na(L), 1, 0),
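
Note on the hunk above: across() applies one function to each selected column and, by default, writes the results back under the original column names, so the two across() calls replace the ten explicit as.numeric() conversions. One difference worth double-checking is that the removed lines also renamed columns (for example AUC from AUC96 and last_bg from LstBackgrd), while across() as written keeps the raw names. A minimal standalone illustration of the pattern, using toy data rather than repository code:

    library(dplyr)
    toy <- data.frame(Col = c("1", "2"), Row = c("3", "4"), Num. = c("5", "6"))
    # Each selected column is converted in place; column names are unchanged
    toy <- toy %>% mutate(across(c(Col, Row, Num.), as.numeric))
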
@@ -134,11 +125,13 @@ load_and_process_data <- function(easy_results_file, sd = 3) {
       SM = 0,
       OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep),
       conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)),
-      conc_num_factor = as.numeric(as.factor(conc_num)) - 1)
+      conc_num_factor = as.numeric(as.factor(conc_num)) - 1
+    )
   return(df)
 }
 
 # Update Gene names using the SGD gene list
 update_gene_names <- function(df, sgd_gene_list) {
   # Load SGD gene list
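
The conc_num_factor line in the hunk above turns each distinct concentration into a 0-based index by way of a factor. A toy illustration (not repository code) of what that expression produces:

    conc_num <- c(0, 0, 0.5, 1, 0.5)
    as.numeric(as.factor(conc_num)) - 1  # 0 0 1 2 1
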
@@ -189,6 +182,7 @@ process_strains <- function(df) {
 
 # Calculate summary statistics for all variables
 calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) {
+  # Generate summary statistics
   summary_stats <- df %>%
     group_by(across(all_of(group_vars))) %>%
     reframe(across(all_of(variables), list(
@@ -198,13 +192,20 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
       max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), # Return NA if all values are NA
       min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), # Return NA if all values are NA
       sd = ~sd(., na.rm = TRUE), # Standard deviation ignoring NAs
-      se = ~ifelse(.N > 1, sd(., na.rm = TRUE) / sqrt(.N - 1), NA), # Standard Error using precomputed N
-      z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA, (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score
+      se = ~ifelse(N > 1, sd(., na.rm = TRUE) / sqrt(N - 1), NA) # Standard Error using precomputed N
+      # TODO: not in original stats but better to do here than in calculate_interactions?
+      # z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA,
+      #   (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score
     ), .names = "{.fn}_{.col}"))
-  return(summary_stats)
+
+  # Join the summary stats back to the original dataframe
+  df_with_stats <- left_join(df, summary_stats, by = group_vars)
+
+  # Return both the summary stats and the updated dataframe
+  return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
 }
 
 calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
   # Calculate total concentration variables
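
Because calculate_summary_stats() now returns a named list instead of a single dataframe, callers get both the per-group table and a joined copy of the input. A short usage sketch (the variable names here are illustrative, not taken from the repository):

    # Illustrative: unpack the named list returned by calculate_summary_stats()
    res <- calculate_summary_stats(df_na, c("L", "K", "r", "AUC"),
                                   group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
    summary_stats <- res$summary_stats    # one row per group
    df_with_stats <- res$df_with_stats    # input rows with the group stats joined back on
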
@@ -215,7 +216,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
print("Calculating background means") print("Calculating background means")
bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first() bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first()
bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first() bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first()
bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first() bg_r <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first()
bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first()
bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first() bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first()
@@ -223,110 +224,73 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
   bg_sd_r <- df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first()
   bg_sd_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first()
 
-  # Calculate summary statistics and shifts
+  # Calculate interaction scores
   print("Calculating interaction scores part 1")
   interaction_scores <- df %>%
     group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>%
-    summarise(
-      NG = sum(NG),
-      DB = sum(DB),
-      SM = sum(SM)
+    mutate(
+      NG = sum(NG, na.rm = TRUE),
+      DB = sum(DB, na.rm = TRUE),
+      SM = sum(SM, na.rm = TRUE)
     ) %>%
-    summarise(across(all_of(variables), list(
-      mean = ~mean(., na.rm = TRUE),
-      median = ~median(., na.rm = TRUE),
-      max = ~max(., na.rm = TRUE),
-      min = ~min(., na.rm = TRUE),
-      sd = ~sd(., na.rm = TRUE),
-      se = ~sd(., na.rm = TRUE) / sqrt(N - 1)
-    ), .names = "{.fn}_{.col}")) %>%
-    summarise(
-      Raw_Shift_L = mean_L[[1]] - bg_L,
-      Raw_Shift_K = mean_K[[1]] - bg_K,
-      Raw_Shift_r = mean_r[[1]] - bg_r,
-      Raw_Shift_AUC = mean_AUC[[1]] - bg_AUC,
-      Z_Shift_L = Raw_Shift_L[[1]] / bg_sd_L,
-      Z_Shift_K = Raw_Shift_K[[1]] / bg_sd_K,
-      Z_Shift_r = Raw_Shift_r[[1]] / bg_sd_r,
-      Z_Shift_AUC = Raw_Shift_AUC[[1]] / bg_sd_AUC,
-      WT_L = mean_L,
-      WT_K = mean_K,
-      WT_r = mean_r,
-      WT_AUC = mean_AUC,
-      WT_sd_L = sd_L,
-      WT_sd_K = sd_K,
-      WT_sd_r = sd_r,
-      WT_sd_AUC = sd_AUC,
-      Exp_L = WT_L + Raw_Shift_L,
-      Exp_K = WT_K + Raw_Shift_K,
-      Exp_r = WT_r + Raw_Shift_r,
-      Exp_AUC = WT_AUC + Raw_Shift_AUC,
-      Delta_L = mean_L - Exp_L,
-      Delta_K = mean_K - Exp_K,
-      Delta_r = mean_r - Exp_r,
-      Delta_AUC = mean_AUC - Exp_AUC,
-      Delta_L = ifelse(NG == 1, mean_L - WT_L, Delta_L), # disregard shift for no growth values in Z score calculation
-      Delta_L = ifelse(SM == 1, mean_L - WT_L, Delta_L), # disregard shift for set to max values in Z score calculation
-      Delta_K = ifelse(NG == 1, mean_K - WT_K, Delta_K),
-      Delta_r = ifelse(NG == 1, mean_r - WT_r, Delta_r),
-      Delta_AUC = ifelse(NG == 1, mean_AUC - WT_AUC, Delta_AUC),
-      Zscore_L = Delta_L / WT_sd_L,
-      Zscore_K = Delta_K / WT_sd_K,
-      Zscore_r = Delta_r / WT_sd_r,
-      Zscore_AUC = Delta_AUC / WT_sd_AUC,
-    ) %>%
-    ungroup()
+    mutate(across(all_of(variables), list(
+      N = ~sum(!is.na(.)), # Count of non-NA values
+      mean = ~mean(., na.rm = TRUE),
+      median = ~median(., na.rm = TRUE),
+      max = ~max(., na.rm = TRUE),
+      min = ~min(., na.rm = TRUE),
+      sd = ~sd(., na.rm = TRUE),
+      se = ~sd(., na.rm = TRUE) / sqrt(N - 1), # Standard error calculation
+      Raw_Shift = ~mean(., na.rm = TRUE) - case_when(
+        cur_column() == "L" ~ bg_L,
+        cur_column() == "K" ~ bg_K,
+        cur_column() == "r" ~ bg_r,
+        cur_column() == "AUC" ~ bg_AUC
+      ),
+      Z_Shift = ~ Raw_Shift / case_when(
+        cur_column() == "L" ~ bg_sd_L,
+        cur_column() == "K" ~ bg_sd_K,
+        cur_column() == "r" ~ bg_sd_r,
+        cur_column() == "AUC" ~ bg_sd_AUC
+      ),
+      WT = ~mean(., na.rm = TRUE),
+      WT_sd = ~sd(., na.rm = TRUE),
+      Exp = ~ WT + Raw_Shift,
+      Delta = ~ WT - Exp,
+      Delta = if_else(NG == 1, mean - WT, Delta)
+    ), .names = "{.fn}_{.col}")) %>%
+    mutate(
+      Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set to max values in Z score calculation
+    ) %>%
+    ungroup()
 
   # Calculate linear models and interaction scores per gene
   print("Calculating interaction scores part 2")
   interaction_scores_all <- interaction_scores %>%
     group_by(across(all_of(group_vars))) %>%
-    summarise(
-      lm_L = lm(Delta_L ~ conc_num_factor),
-      lm_K = lm(Delta_K ~ conc_num_factor),
-      lm_r = lm(Delta_r ~ conc_num_factor),
-      lm_AUC = lm(Delta_AUC ~ conc_num_factor),
-      lm_score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1],
-      lm_score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1],
-      lm_score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1],
-      lm_score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1],
-      lm_sd_L = sd(lm_score_L),
-      lm_sd_K = sd(lm_score_K),
-      lm_sd_r = sd(lm_score_r),
-      lm_sd_AUC = sd(lm_score_AUC),
-      lm_mean_L = mean(lm_score_L),
-      lm_mean_K = mean(lm_score_K),
-      lm_mean_r = mean(lm_score_r),
-      lm_mean_AUC = mean(lm_score_AUC),
-      Z_lm_L = (lm_score_L - lm_mean_L) / lm_sd_L,
-      Z_lm_K = (lm_score_K - lm_mean_K) / lm_sd_K,
-      Z_lm_r = (lm_score_r - lm_mean_r) / lm_sd_r,
-      Z_lm_AUC = (lm_score_AUC - lm_mean_AUC) / lm_sd_AUC,
-      r_squared_L = summary(lm_L)$r.squared,
-      r_squared_K = summary(lm_K)$r.squared,
-      r_squared_r = summary(lm_r)$r.squared,
-      r_squared_AUC = summary(lm_AUC)$r.squared,
-      Sum_Zscore_L = sum(Zscore_L),
+    mutate(across(all_of(variables), list(
+      lm_score = ~ max_conc * lm(Delta ~ conc_num_factor)$coefficients[2] + lm(Delta ~ conc_num_factor)$coefficients[1],
+      lm_sd = ~ sd(lm_score),
+      lm_mean = ~ mean(lm_score),
+      Z_lm = ~ (lm_score - lm_mean) / lm_sd,
+      Sum_Zscore = ~ sum(Zscore, na.rm = TRUE)
+    ), .names = "{.fn}_{.col}")) %>%
+    mutate(
       Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs,
-      Sum_Zscore_K = sum(Zscore_K),
       Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs,
-      Sum_Zscore_r = sum(Zscore_r),
       Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1),
-      Sum_Zscore_AUC = sum(Zscore_AUC),
-      Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1),
-      NG = sum(NG),
-      DB = sum(DB),
-      SM = sum(SM)
+      Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1)
     ) %>%
     ungroup()
 
-  interaction_scores_all <- interactions_scores_all %>%
+  interaction_scores_all <- interaction_scores_all %>%
     arrange(desc(Z_lm_L)) %>%
     arrange(desc(NG))
 
   return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores))
 }
 
 interaction_plot_configs <- function(df, variable) {
   ylim_vals <- switch(variable,
     "L" = c(-65, 65),
@@ -518,40 +482,19 @@ generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) {
 }
 
 # Adjust missing values and calculate ranks
-adjust_missing_and_rank <- function(df) {
+adjust_missing_and_rank <- function(df, variables) {
   df <- df %>%
-    mutate(
-      Avg_Zscore_L = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L),
-      Avg_Zscore_K = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K),
-      Avg_Zscore_r = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r),
-      Avg_Zscore_AUC = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC),
-      Z_lm_L = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L),
-      Z_lm_K = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K),
-      Z_lm_r = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r),
-      Z_lm_AUC = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC),
-      L_Rank = rank(Avg_Zscore_L),
-      K_Rank = rank(Avg_Zscore_K),
-      r_Rank = rank(Avg_Zscore_r),
-      AUC_Rank = rank(Avg_Zscore_AUC),
-      L_Rank_lm = rank(Z_lm_L),
-      K_Rank_lm = rank(Z_lm_K),
-      r_Rank_lm = rank(Z_lm_r),
-      AUC_Rank_lm = rank(Z_lm_AUC)
-    )
+    mutate(across(all_of(variables), list(
+      Avg_Zscore = if_else(is.na(.Avg_Zscore), 0.001, .Avg_Zscore),
+      Z_lm = if_else(is.na(.Z_lm), 0.001, .Z_lm),
+      Rank = rank(.Avg_Zscore),
+      Rank_lm = rank(.Z_lm),
+    ), "{.fn}_{.col}"))
   return(df)
 }
 
-# Function to create and save all ranked plots
-create_ranked_plots <- function(df, output_dir) {
-  df_adjusted <- adjust_missing_and_rank(df)
-  # Generate and save ranked plots
-  generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots")
-  generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm")
-  generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_naRM")
-  generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM")
-}
 
 generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "",
                            x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") {
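
For reference, one working way to express the per-variable NA fill and ranking that the adjust_missing_and_rank hunk above moves into across(); this is a sketch under the assumption that the data keeps the Avg_Zscore_<var> and Z_lm_<var> column naming used elsewhere in the script, not the committed implementation:

    library(dplyr)
    adjust_missing_and_rank_sketch <- function(df, variables = c("L", "K", "r", "AUC")) {
      zscore_cols <- paste0("Avg_Zscore_", variables)
      zlm_cols <- paste0("Z_lm_", variables)
      df %>%
        # Replace missing scores with a small placeholder value
        mutate(across(all_of(c(zscore_cols, zlm_cols)), ~ if_else(is.na(.), 0.001, .))) %>%
        # Rank each score column, writing e.g. Rank_Avg_Zscore_L and Rank_Z_lm_L
        mutate(across(all_of(zscore_cols), rank, .names = "Rank_{.col}"),
               across(all_of(zlm_cols), rank, .names = "Rank_{.col}"))
    }
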
@@ -590,13 +533,14 @@ generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num",
 }
 
 # Function to generate rank plots for the provided dataframe
-generate_rank_plots <- function(df, output_dir) {
-  # Adjust missing values and calculate ranks
-  df_adjusted <- adjust_missing_and_rank(df)
+create_ranked_plots <- function(df, output_dir) {
   # List of variables for which we need to generate rank plots
   variables <- c("L", "K", "r", "AUC")
+
+  # Adjust missing values and calculate ranks
+  df_adjusted <- adjust_missing_and_rank(df, variables)
 
   # Generate rank plots for Avg_Zscore and Z_lm
   for (var in variables) {
     plot_rank_avg <- generate_plot(
@@ -623,13 +567,6 @@ generate_rank_plots <- function(df, output_dir) {
   }
 }
 
-# Function to call rank plot generation in the main function
-create_ranked_plots <- function(df, output_dir) {
-  message("Generating rank plots")
-  generate_rank_plots(df, output_dir)
-}
-
 main <- function() {
   lapply(names(args$experiments), function(exp_name) {
     exp <- args$experiments[[exp_name]]
@@ -658,11 +595,6 @@ main <- function() {
     # Get the number of rows that are above tolerance
     rows_to_remove <- nrow(df_above_tolerance)
 
-    # Logging or handling the calculated values, e.g.:
-    message("Half-median for L (above tolerance): ", L_half_median)
-    message("Half-median for K (above tolerance): ", K_half_median)
-    message("Number of rows above tolerance: ", rows_to_remove)
-
     # Additional filtering for non-finite values in df_na
     df_na_filtered <- df_na %>%
       filter(if_any(c(L), ~ !is.finite(.))) %>%
@@ -700,52 +632,40 @@ main <- function() {
     generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", after_qc_plot_configs)
     generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", no_zeros_plot_configs)
 
     # Clean up
     rm(df, df_above_tolerance, df_no_zeros)
 
     # Calculate summary statistics
     message("Calculating summary statistics for all strains")
     variables <- c("L", "K", "r", "AUC")
-    stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
-    write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
-    stats_joined <- left_join(df_na, stats, by = c("OrfRep", "conc_num", "conc_num_factor"))
-    print("stats:")
-    print(head(stats))
-    # Originally this filtered L NA's
+    list(summary_stats, df_na_stats) <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
+    # stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
+    write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
+    print("Summary stats:")
+    print(head(summary_stats))
+    # TODO: Originally this filtered L NA's
+    # Let's try to avoid for now since stats have already been calculated
 
     # Filter data within and outside 2SD
-    within_2sd_k <- stats_joined %>%
+    message("Filtering by 2SD of K")
+    df_na_within_2sd_k <- df_na %>%
       filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K))
-    outside_2sd_k <- stats_joined %>%
+    df_na_outside_2sd_k <- df_na %>%
       filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K))
 
     # Summary statistics for within and outside 2SD of K
-    l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
-    l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
+    message("Calculating summary statistics for L within 2SD of K")
+    list(l_within_2sd_k_stats, df_na_l_within_2sd_k_stats) <-
+      calculate_summary_stats(df_na_l_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
+    message("Calculating summary statistics for L outside 2SD of K")
+    list(l_outside_2sd_k_stats, df_na_l_outside_2sd_k_stats) <-
+      calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
 
     # Write CSV files
-    write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
-    write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
-
-    message("Calculating summary statistics for L within 2SD of K")
-    cols_to_remove <- names(l_within_2sd_k)
-    cols_to_keep <- c("conc_num", "conc_num_factor")
-    within_2sd_k_clean <- within_2sd_k %>%
-      select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
-    l_within_2sd_k_joined <- within_2sd_k_clean %>%
-      left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
-
-    message("Calculating summary statistics for for L outside 2SD of K")
-    cols_to_remove <- names(l_outside_2sd_k)
-    cols_to_keep <- c("conc_num", "conc_num_factor")
-    outside_2sd_k_clean <- outside_2sd_k %>%
-      select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
-    l_outside_2sd_k_joined <- outside_2sd_k_clean %>%
-      left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor"))
+    write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE)
+    write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE)
 
     # Process background strains
     bg_strains <- c("YDL227C")
@@ -767,20 +687,20 @@ main <- function() {
     # Recalculate summary statistics for the background strain
     message("Calculating summary statistics for background strain")
-    stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
-    write.csv(stats_bg,
+    list(summary_stats_bg, df_bg_stats) <-
+      calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
+    write.csv(summary_stats_bg,
       file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
       row.names = FALSE)
+    # stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor"))
 
     # Filter reference and deletion strains
     # Formerly X2_RF (reference strain)
-    df_reference <- stats_joined %>%
+    df_reference <- df_bg_stats %>%
       filter(OrfRep == strain) %>%
       mutate(SM = 0)
 
     # Formerly X2 (deletion strains)
-    df_deletion <- stats_joined %>%
+    df_deletion <- df_bg_stats %>%
       filter(OrfRep != strain) %>%
       mutate(SM = 0)