diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index 8dd976c3..1d5b857b 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -116,7 +116,7 @@ scale_colour_publication <- function(...) { # Load the initial dataframe from the easy_results_file load_and_process_data <- function(easy_results_file, sd = 3) { df <- read.delim(easy_results_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE) - + df <- df %>% filter(!(.[[1]] %in% c("", "Scan"))) %>% filter(!is.na(ORF) & ORF != "" & !Gene %in% c("BLANK", "Blank", "blank") & Drug != "BMH21") %>% @@ -130,13 +130,14 @@ load_and_process_data <- function(easy_results_file, sd = 3) { DB = if_else(delta_bg >= delta_bg_tolerance, 1, 0), SM = 0, OrfRep = if_else(ORF == "YDL227C", "YDL227C", OrfRep), # should these be hardcoded? - conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)), - conc_num_factor = as.factor(conc_num) - # conc_num_factor = factor(conc_num, levels = sort(unique(conc_num))) + conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc)) + ) %>% + mutate( + conc_num_factor = as.factor(match(conc_num, sort(unique(conc_num))) - 1) ) - - return(df) -} + + return(df) + } # Update Gene names using the SGD gene list update_gene_names <- function(df, sgd_gene_list) { @@ -160,15 +161,6 @@ update_gene_names <- function(df, sgd_gene_list) { calculate_summary_stats <- function(df, variables, group_vars) { - - summary_stats <- df %>% - group_by(across(all_of(group_vars))) %>% - summarise( - N = n(), - sd_check = sd(L, na.rm = TRUE), # Test sd on a specific variable, e.g., L - se_check = sd(L, na.rm = TRUE) / sqrt(N) # Test se on a specific variable, e.g., L - ) - summary_stats <- df %>% group_by(across(all_of(group_vars))) %>% summarise( @@ -184,9 +176,6 @@ calculate_summary_stats <- function(df, variables, group_vars) { .groups = "drop" ) - # sd = ~sd(., na.rm = TRUE) - # se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) - # Create a cleaned version of df that doesn't overlap with summary_stats cols_to_keep <- setdiff(names(df), names(summary_stats)[-which(names(summary_stats) %in% group_vars)]) df_cleaned <- df %>% @@ -219,11 +208,11 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) { stats <- calculate_summary_stats(df, variables = variables, - group_vars = c("OrfRep", "Gene", "num", "conc_num_factor" - ))$summary_stats + group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") + )$summary_stats stats <- df %>% - group_by(OrfRep, Gene, num) %>% + group_by(across(all_of(group_vars))) %>% mutate( WT_L = mean_L, WT_K = mean_K, @@ -277,13 +266,13 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) { ) # Calculate linear models - lm_L <- lm(Delta_L ~ conc_num_factor, data = stats) - lm_K <- lm(Delta_K ~ conc_num_factor, data = stats) - lm_r <- lm(Delta_r ~ conc_num_factor, data = stats) - lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = stats) + lm_L <- lm(Delta_L ~ conc_num, data = stats) + lm_K <- lm(Delta_K ~ conc_num, data = stats) + lm_r <- lm(Delta_r ~ conc_num, data = stats) + lm_AUC <- lm(Delta_AUC ~ conc_num, data = stats) interactions <- stats %>% - group_by(OrfRep, Gene, num) %>% + group_by(across(all_of(group_vars))) %>% summarise( OrfRep = first(OrfRep), Gene = first(Gene), @@ -357,10 +346,10 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars) { "NG", "SM", "DB") calculations_joined <- df %>% select(-any_of(setdiff(names(calculations), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")))) - calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num_factor")) + calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) interactions_joined <- df %>% select(-any_of(setdiff(names(interactions), c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")))) - interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num_factor")) + interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined, calculations_joined = calculations_joined)) @@ -1059,7 +1048,7 @@ main <- function() { ss <- calculate_summary_stats( df = df, variables = summary_vars, - group_vars = c("OrfRep", "conc_num_factor")) + group_vars = c("conc_num", "conc_num_factor")) df_stats <- ss$df_with_stats message("Filtering non-finite data") df_filtered_stats <- filter_data(df_stats, c("L"), nf = TRUE) @@ -1068,7 +1057,7 @@ main <- function() { ss <- calculate_summary_stats( df = df_na, variables = summary_vars, - group_vars = c("OrfRep", "conc_num_factor")) + group_vars = c("conc_num", "conc_num_factor")) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) @@ -1078,7 +1067,7 @@ main <- function() { ss <- calculate_summary_stats( df = df_no_zeros, variables = summary_vars, - group_vars = c("OrfRep", "conc_num_factor")) + group_vars = c("conc_num", "conc_num_factor")) df_no_zeros_stats <- ss$df_with_stats df_no_zeros_filtered_stats <- filter_data(df_no_zeros_stats, c("L"), nf = TRUE) @@ -1090,14 +1079,14 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? - ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num_factor")) + ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) # df_na_l_within_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num_factor")) + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), @@ -1272,7 +1261,7 @@ main <- function() { # Set the missing values to the highest theoretical value at each drug conc for L # Leave other values as 0 for the max/min reference_strain <- df_reference %>% - group_by(conc_num_factor) %>% + group_by(conc_num, conc_num_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1282,7 +1271,7 @@ main <- function() { # Ditto for deletion strains deletion_strains <- df_deletion %>% - group_by(conc_num_factor) %>% + group_by(conc_num, conc_num_factor) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L),