diff --git a/workflow/apps/r/calculate_interaction_zscores5.R b/workflow/apps/r/calculate_interaction_zscores5.R index 0a91dab2..a69d8f67 100644 --- a/workflow/apps/r/calculate_interaction_zscores5.R +++ b/workflow/apps/r/calculate_interaction_zscores5.R @@ -112,21 +112,12 @@ scale_colour_publication <- function(...) { load_and_process_data <- function(easy_results_file, sd = 3) { df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - # Clean and convert columns to numeric where appropriate df <- df %>% filter(!(.[[1]] %in% c("", "Scan"))) %>% filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% mutate( - Col = as.numeric(Col), - Row = as.numeric(Row), - num = as.numeric(Num.), - L = as.numeric(l), - K = as.numeric(K), - r = as.numeric(r), - scan = as.numeric(Scan), - AUC = as.numeric(AUC96), - last_bg = as.numeric(LstBackgrd), - first_bg = as.numeric(X1stBackgrd), + across(c(Col, Row, Num.), as.numeric), + across(c(L, K, r, scan, AUC96, LstBackgrd, X1stBackgrd), as.numeric), delta_bg = last_bg - first_bg, delta_bg_tolerance = mean(delta_bg, na.rm = TRUE) + (sd * sd(delta_bg, na.rm = TRUE)), NG = if_else(L == 0 & !is.na(L), 1, 0), @@ -134,11 +125,13 @@ load_and_process_data <- function(easy_results_file, sd = 3) { SM = 0, OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), - conc_num_factor = as.numeric(as.factor(conc_num)) - 1) + conc_num_factor = as.numeric(as.factor(conc_num)) - 1 + ) return(df) } + # Update Gene names using the SGD gene list update_gene_names <- function(df, sgd_gene_list) { # Load SGD gene list @@ -189,6 +182,7 @@ process_strains <- function(df) { # Calculate summary statistics for all variables calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "conc_num_factor")) { + # Generate summary statistics summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% reframe(across(all_of(variables), list( @@ -198,13 +192,20 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)), # Return NA if all values are NA min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)), # Return NA if all values are NA sd = ~sd(., na.rm = TRUE), # Standard deviation ignoring NAs - se = ~ifelse(.N > 1, sd(., na.rm = TRUE) / sqrt(.N - 1), NA), # Standard Error using precomputed N - z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA, (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score + se = ~ifelse(N > 1, sd(., na.rm = TRUE) / sqrt(N - 1), NA) # Standard Error using precomputed N + # TODO: not in original stats but better to do here than in calculate_interactions? + # z_max = ~ifelse(sd(., na.rm = TRUE) == 0 | all(is.na(.)), NA, + # (max(., na.rm = TRUE) - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)) # Z-score ), .names = "{.fn}_{.col}")) - return(summary_stats) + # Join the summary stats back to the original dataframe + df_with_stats <- left_join(df, summary_stats, by = group_vars) + + # Return both the summary stats and the updated dataframe + return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) } + calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) { # Calculate total concentration variables @@ -215,7 +216,7 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c print("Calculating background means") bg_L <- df %>% filter(conc_num_factor == 0) %>% pull(mean_L) %>% first() bg_K <- df %>% filter(conc_num_factor == 0) %>% pull(mean_K) %>% first() - bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first() + bg_r <- df %>% filter(conc_num_factor == 0) %>% pull(mean_r) %>% first() bg_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(mean_AUC) %>% first() bg_sd_L <- df %>% filter(conc_num_factor == 0) %>% pull(sd_L) %>% first() @@ -223,110 +224,73 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c bg_sd_r <- df %>% filter(conc_num_factor == 0) %>% pull(sd_r) %>% first() bg_sd_AUC <- df %>% filter(conc_num_factor == 0) %>% pull(sd_AUC) %>% first() - # Calculate summary statistics and shifts + # Calculate interaction scores print("Calculating interaction scores part 1") interaction_scores <- df %>% group_by(across(all_of(group_vars)), conc_num, conc_num_factor) %>% - summarise( - NG = sum(NG), - DB = sum(DB), - SM = sum(SM) - ) %>% - summarise(across(all_of(variables), list( - mean = ~mean(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - se = ~sd(., na.rm = TRUE) / sqrt(N - 1) - ), .names = "{.fn}_{.col}")) %>% - summarise( - Raw_Shift_L = mean_L[[1]] - bg_L, - Raw_Shift_K = mean_K[[1]] - bg_K, - Raw_Shift_r = mean_r[[1]] - bg_r, - Raw_Shift_AUC = mean_AUC[[1]] - bg_AUC, - Z_Shift_L = Raw_Shift_L[[1]] / bg_sd_L, - Z_Shift_K = Raw_Shift_K[[1]] / bg_sd_K, - Z_Shift_r = Raw_Shift_r[[1]] / bg_sd_r, - Z_Shift_AUC = Raw_Shift_AUC[[1]] / bg_sd_AUC, - WT_L = mean_L, - WT_K = mean_K, - WT_r = mean_r, - WT_AUC = mean_AUC, - WT_sd_L = sd_L, - WT_sd_K = sd_K, - WT_sd_r = sd_r, - WT_sd_AUC = sd_AUC, - Exp_L = WT_L + Raw_Shift_L, - Exp_K = WT_K + Raw_Shift_K, - Exp_r = WT_r + Raw_Shift_r, - Exp_AUC = WT_AUC + Raw_Shift_AUC, - Delta_L = mean_L - Exp_L, - Delta_K = mean_K - Exp_K, - Delta_r = mean_r - Exp_r, - Delta_AUC = mean_AUC - Exp_AUC, - Delta_L = ifelse(NG == 1, mean_L - WT_L, Delta_L), # disregard shift for no growth values in Z score calculation - Delta_L = ifelse(SM == 1, mean_L - WT_L, Delta_L), # disregard shift for set to max values in Z score calculation - Delta_K = ifelse(NG == 1, mean_K - WT_K, Delta_K), - Delta_r = ifelse(NG == 1, mean_r - WT_r, Delta_r), - Delta_AUC = ifelse(NG == 1, mean_AUC - WT_AUC, Delta_AUC), - Zscore_L = Delta_L / WT_sd_L, - Zscore_K = Delta_K / WT_sd_K, - Zscore_r = Delta_r / WT_sd_r, - Zscore_AUC = Delta_AUC / WT_sd_AUC, - ) %>% - ungroup() + mutate( + NG = sum(NG, na.rm = TRUE), + DB = sum(DB, na.rm = TRUE), + SM = sum(SM, na.rm = TRUE) + ) %>% + mutate(across(all_of(variables), list( + N = ~sum(!is.na(.)), # Count of non-NA values + mean = ~mean(., na.rm = TRUE), + median = ~median(., na.rm = TRUE), + max = ~max(., na.rm = TRUE), + min = ~min(., na.rm = TRUE), + sd = ~sd(., na.rm = TRUE), + se = ~sd(., na.rm = TRUE) / sqrt(N - 1), # Standard error calculation + Raw_Shift = ~mean(., na.rm = TRUE) - case_when( + cur_column() == "L" ~ bg_L, + cur_column() == "K" ~ bg_K, + cur_column() == "r" ~ bg_r, + cur_column() == "AUC" ~ bg_AUC + ), + Z_Shift = ~ Raw_Shift / case_when( + cur_column() == "L" ~ bg_sd_L, + cur_column() == "K" ~ bg_sd_K, + cur_column() == "r" ~ bg_sd_r, + cur_column() == "AUC" ~ bg_sd_AUC + ), + WT = ~mean(., na.rm = TRUE), + WT_sd = ~sd(., na.rm = TRUE), + Exp = ~ WT + Raw_Shift, + Delta = ~ WT - Exp, + Delta = if_else(NG == 1, mean - WT, Delta) + ), .names = "{.fn}_{.col}")) %>% + mutate( + Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L) # disregard shift for set to max values in Z score calculation + ) %>% + ungroup() # Calculate linear models and interaction scores per gene print("Calculating interaction scores part 2") interaction_scores_all <- interaction_scores %>% group_by(across(all_of(group_vars))) %>% - summarise( - lm_L = lm(Delta_L ~ conc_num_factor), - lm_K = lm(Delta_K ~ conc_num_factor), - lm_r = lm(Delta_r ~ conc_num_factor), - lm_AUC = lm(Delta_AUC ~ conc_num_factor), - lm_score_L = max_conc * coef(lm_L)[2] + coef(lm_L)[1], - lm_score_K = max_conc * coef(lm_K)[2] + coef(lm_K)[1], - lm_score_r = max_conc * coef(lm_r)[2] + coef(lm_r)[1], - lm_score_AUC = max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], - lm_sd_L = sd(lm_score_L), - lm_sd_K = sd(lm_score_K), - lm_sd_r = sd(lm_score_r), - lm_sd_AUC = sd(lm_score_AUC), - lm_mean_L = mean(lm_score_L), - lm_mean_K = mean(lm_score_K), - lm_mean_r = mean(lm_score_r), - lm_mean_AUC = mean(lm_score_AUC), - Z_lm_L = (lm_score_L - lm_mean_L) / lm_sd_L, - Z_lm_K = (lm_score_K - lm_mean_K) / lm_sd_K, - Z_lm_r = (lm_score_r - lm_mean_r) / lm_sd_r, - Z_lm_AUC = (lm_score_AUC - lm_mean_AUC) / lm_sd_AUC, - r_squared_L = summary(lm_L)$r.squared, - r_squared_K = summary(lm_K)$r.squared, - r_squared_r = summary(lm_r)$r.squared, - r_squared_AUC = summary(lm_AUC)$r.squared, - Sum_Zscore_L = sum(Zscore_L), + mutate(across(all_of(variables), list( + lm_score = ~ max_conc * lm(Delta ~ conc_num_factor)$coefficients[2] + lm(Delta ~ conc_num_factor)$coefficients[1], + lm_sd = ~ sd(lm_score), + lm_mean = ~ mean(lm_score), + Z_lm = ~ (lm_score - lm_mean) / lm_sd, + Sum_Zscore = ~ sum(Zscore, na.rm = TRUE) + ), .names = "{.fn}_{.col}")) %>% + mutate( Avg_Zscore_L = Sum_Zscore_L / num_non_removed_concs, - Sum_Zscore_K = sum(Zscore_K), Avg_Zscore_K = Sum_Zscore_K / num_non_removed_concs, - Sum_Zscore_r = sum(Zscore_r), Avg_Zscore_r = Sum_Zscore_r / (total_conc_num - 1), - Sum_Zscore_AUC = sum(Zscore_AUC), - Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1), - NG = sum(NG), - DB = sum(DB), - SM = sum(SM) + Avg_Zscore_AUC = Sum_Zscore_AUC / (total_conc_num - 1) ) %>% ungroup() - interaction_scores_all <- interactions_scores_all %>% - arrange(desc(Z_lm_L)) %>% - arrange(desc(NG)) + interaction_scores_all <- interaction_scores_all %>% + arrange(desc(Z_lm_L)) %>% + arrange(desc(NG)) return(list(zscores_calculations = interaction_scores_all, zscores_interactions = interaction_scores)) } + interaction_plot_configs <- function(df, variable) { ylim_vals <- switch(variable, "L" = c(-65, 65), @@ -518,40 +482,19 @@ generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) { } # Adjust missing values and calculate ranks -adjust_missing_and_rank <- function(df) { +adjust_missing_and_rank <- function(df, variables) { + df <- df %>% - mutate( - Avg_Zscore_L = ifelse(is.na(Avg_Zscore_L), 0.001, Avg_Zscore_L), - Avg_Zscore_K = ifelse(is.na(Avg_Zscore_K), 0.001, Avg_Zscore_K), - Avg_Zscore_r = ifelse(is.na(Avg_Zscore_r), 0.001, Avg_Zscore_r), - Avg_Zscore_AUC = ifelse(is.na(Avg_Zscore_AUC), 0.001, Avg_Zscore_AUC), - Z_lm_L = ifelse(is.na(Z_lm_L), 0.001, Z_lm_L), - Z_lm_K = ifelse(is.na(Z_lm_K), 0.001, Z_lm_K), - Z_lm_r = ifelse(is.na(Z_lm_r), 0.001, Z_lm_r), - Z_lm_AUC = ifelse(is.na(Z_lm_AUC), 0.001, Z_lm_AUC), - L_Rank = rank(Avg_Zscore_L), - K_Rank = rank(Avg_Zscore_K), - r_Rank = rank(Avg_Zscore_r), - AUC_Rank = rank(Avg_Zscore_AUC), - L_Rank_lm = rank(Z_lm_L), - K_Rank_lm = rank(Z_lm_K), - r_Rank_lm = rank(Z_lm_r), - AUC_Rank_lm = rank(Z_lm_AUC) - ) + mutate(across(all_of(variables), list( + Avg_Zscore = if_else(is.na(.Avg_Zscore), 0.001, .Avg_Zscore), + Z_lm = if_else(is.na(.Z_lm), 0.001, .Z_lm), + Rank = rank(.Avg_Zscore), + Rank_lm = rank(.Z_lm), + ), "{.fn}_{.col}")) return(df) } -# Function to create and save all ranked plots -create_ranked_plots <- function(df, output_dir) { - df_adjusted <- adjust_missing_and_rank(df) - - # Generate and save ranked plots - generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots") - generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm") - generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_naRM") - generate_and_save_ranked_plots(df_adjusted, output_dir, "RankPlots_lm_naRM") -} generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", title_prefix = "", x_label = NULL, y_label = NULL, ylim_vals = NULL, output_dir, file_prefix = "plot") { @@ -590,12 +533,13 @@ generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num", } # Function to generate rank plots for the provided dataframe -generate_rank_plots <- function(df, output_dir) { - # Adjust missing values and calculate ranks - df_adjusted <- adjust_missing_and_rank(df) - +create_ranked_plots <- function(df, output_dir) { + # List of variables for which we need to generate rank plots variables <- c("L", "K", "r", "AUC") + + # Adjust missing values and calculate ranks + df_adjusted <- adjust_missing_and_rank(df, variables) # Generate rank plots for Avg_Zscore and Z_lm for (var in variables) { @@ -623,13 +567,6 @@ generate_rank_plots <- function(df, output_dir) { } } -# Function to call rank plot generation in the main function -create_ranked_plots <- function(df, output_dir) { - message("Generating rank plots") - generate_rank_plots(df, output_dir) -} - - main <- function() { lapply(names(args$experiments), function(exp_name) { exp <- args$experiments[[exp_name]] @@ -658,11 +595,6 @@ main <- function() { # Get the number of rows that are above tolerance rows_to_remove <- nrow(df_above_tolerance) - # Logging or handling the calculated values, e.g.: - message("Half-median for L (above tolerance): ", L_half_median) - message("Half-median for K (above tolerance): ", K_half_median) - message("Number of rows above tolerance: ", rows_to_remove) - # Additional filtering for non-finite values in df_na df_na_filtered <- df_na %>% filter(if_any(c(L), ~ !is.finite(.))) %>% @@ -700,52 +632,40 @@ main <- function() { generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", after_qc_plot_configs) generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", no_zeros_plot_configs) + # Clean up + rm(df, df_above_tolerance, df_no_zeros) - # Clean up - rm(df, df_above_tolerance, df_no_zeros) - - # Calculate summary statistics message("Calculating summary statistics for all strains") variables <- c("L", "K", "r", "AUC") - stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) - write.csv(stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - stats_joined <- left_join(df_na, stats, by = c("OrfRep", "conc_num", "conc_num_factor")) + list(summary_stats, df_na_stats) <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + # stats <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE) - print("stats:") - print(head(stats)) + print("Summary stats:") + print(head(summary_stats)) - # Originally this filtered L NA's + # TODO: Originally this filtered L NA's + # Let's try to avoid for now since stats have already been calculated # Filter data within and outside 2SD - within_2sd_k <- stats_joined %>% + message("Filtering by 2SD of K") + df_na_within_2sd_k <- df_na %>% filter(K >= (mean_K - 2 * sd_K) & K <= (mean_K + 2 * sd_K)) - outside_2sd_k <- stats_joined %>% + df_na_outside_2sd_k <- df_na %>% filter(K < (mean_K - 2 * sd_K) | K > (mean_K + 2 * sd_K)) # Summary statistics for within and outside 2SD of K - l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) - l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + message("Calculating summary statistics for L within 2SD of K") + list(l_within_2sd_k_stats, df_na_l_within_2sd_k_stats) <- + calculate_summary_stats(df_na_l_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) + message("Calculating summary statistics for L outside 2SD of K") + list(l_outside_2sd_k_stats, df_na_l_outside_2sd_k_stats) <- + calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) # Write CSV files - write.csv(l_within_2sd_k, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) - write.csv(l_outside_2sd_k, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) - - message("Calculating summary statistics for L within 2SD of K") - cols_to_remove <- names(l_within_2sd_k) - cols_to_keep <- c("conc_num", "conc_num_factor") - within_2sd_k_clean <- within_2sd_k %>% - select(-all_of(setdiff(cols_to_remove, cols_to_keep))) - l_within_2sd_k_joined <- within_2sd_k_clean %>% - left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor")) - - message("Calculating summary statistics for for L outside 2SD of K") - cols_to_remove <- names(l_outside_2sd_k) - cols_to_keep <- c("conc_num", "conc_num_factor") - outside_2sd_k_clean <- outside_2sd_k %>% - select(-all_of(setdiff(cols_to_remove, cols_to_keep))) - l_outside_2sd_k_joined <- outside_2sd_k_clean %>% - left_join(l_outside_2sd_k, by = c("conc_num", "conc_num_factor")) + write.csv(l_within_2sd_k_stats, file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"), row.names = FALSE) + write.csv(l_outside_2sd_k_stats, file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"), row.names = FALSE) # Process background strains bg_strains <- c("YDL227C") @@ -767,20 +687,20 @@ main <- function() { # Recalculate summary statistics for the background strain message("Calculating summary statistics for background strain") - stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) - write.csv(stats_bg, + list(summary_stats_bg, df_bg_stats) <- + calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor")) + write.csv(summary_stats_bg, file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")), row.names = FALSE) - # stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor")) # Filter reference and deletion strains # Formerly X2_RF (reference strain) - df_reference <- stats_joined %>% + df_reference <- df_bg_stats %>% filter(OrfRep == strain) %>% mutate(SM = 0) # Formerly X2 (deletion strains) - df_deletion <- stats_joined %>% + df_deletion <- df_bg_stats %>% filter(OrfRep != strain) %>% mutate(SM = 0)