diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index c5350880..bac3fe3d 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -161,9 +161,8 @@ load_and_filter_data <- function(easy_results_file, sd = 3) { ) # Set the max concentration across the whole dataframe - max_conc <- max(df$conc_num_factor, na.rm = TRUE) df <- df %>% - mutate(max_conc = max_conc) + mutate(max_conc = max(df$conc_num_factor, na.rm = TRUE)) return(df) } @@ -216,172 +215,183 @@ calculate_summary_stats <- function(df, variables, group_vars) { return(list(summary_stats = summary_stats, df_with_stats = df_joined)) } -calculate_interaction_scores <- function(df, bg_df, group_vars, overlap_threshold = 2) { +calculate_interaction_scores <- function(df, df_bg, group_vars, max_conc, overlap_threshold = 2) { - bg_df_selected <- bg_df %>% - select(OrfRep, conc_num, conc_num_factor, conc_num_factor_factor, - mean_L, mean_K, mean_r, mean_AUC, sd_L, sd_K, sd_r, sd_AUC + # Include background statistics per concentration + bg_stats <- df_bg %>% + group_by(conc_num, conc_num_factor) %>% + summarise( + WT_L = first(mean_L), + WT_K = first(mean_K), + WT_r = first(mean_r), + WT_AUC = first(mean_AUC), + WT_sd_L = first(sd_L), + WT_sd_K = first(sd_K), + WT_sd_r = first(sd_r), + WT_sd_AUC = first(sd_AUC), + .groups = "drop" ) - df <- df %>% - left_join(bg_df_selected, by = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor"), - suffix = c("", "_bg")) - - # Calculate total concentration variables + # Calculate total number of concentrations total_conc_num <- length(unique(df$conc_num)) - - # Initial calculations + + # Join background statistics to df calculations <- df %>% + left_join(bg_stats, by = c("conc_num", "conc_num_factor")) + + # Perform calculations + calculations <- calculations %>% group_by(across(all_of(group_vars))) %>% mutate( + N = n(), NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), - num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1, + num_non_removed_concs = n_distinct(conc_num[DB != 1]), + + # Raw shifts + Raw_Shift_L = mean_L - WT_L, + Raw_Shift_K = mean_K - WT_K, + Raw_Shift_r = mean_r - WT_r, + Raw_Shift_AUC = mean_AUC - WT_AUC, + + # Z shifts + Z_Shift_L = Raw_Shift_L / WT_sd_L, + Z_Shift_K = Raw_Shift_K / WT_sd_K, + Z_Shift_r = Raw_Shift_r / WT_sd_r, + Z_Shift_AUC = Raw_Shift_AUC / WT_sd_AUC, - # Assign WT values from the background data - WT_L = mean_L_bg, - WT_K = mean_K_bg, - WT_r = mean_r_bg, - WT_AUC = mean_AUC_bg, - WT_sd_L = sd_L_bg, - WT_sd_K = sd_K_bg, - WT_sd_r = sd_r_bg, - WT_sd_AUC = sd_AUC_bg, - - # Calculate raw data - Raw_Shift_L = first(mean_L) - first(mean_L_bg), - Raw_Shift_K = first(mean_K) - first(mean_K_bg), - Raw_Shift_r = first(mean_r) - first(mean_r_bg), - Raw_Shift_AUC = first(mean_AUC) - first(mean_AUC_bg), - Z_Shift_L = Raw_Shift_L / first(sd_L_bg), - Z_Shift_K = Raw_Shift_K / first(sd_K_bg), - Z_Shift_r = Raw_Shift_r / first(sd_r_bg), - Z_Shift_AUC = Raw_Shift_AUC / first(sd_AUC_bg), - # Expected values Exp_L = WT_L + Raw_Shift_L, Exp_K = WT_K + Raw_Shift_K, Exp_r = WT_r + Raw_Shift_r, Exp_AUC = WT_AUC + Raw_Shift_AUC, - + # Deltas Delta_L = mean_L - Exp_L, Delta_K = mean_K - Exp_K, Delta_r = mean_r - Exp_r, Delta_AUC = mean_AUC - Exp_AUC, + + # Adjust Deltas for NG and SM Delta_L = if_else(NG == 1, mean_L - WT_L, Delta_L), Delta_K = if_else(NG == 1, mean_K - WT_K, Delta_K), Delta_r = if_else(NG == 1, mean_r - WT_r, Delta_r), Delta_AUC = if_else(NG == 1, mean_AUC - WT_AUC, Delta_AUC), Delta_L = if_else(SM == 1, mean_L - WT_L, Delta_L), - - # Calculate Z-scores + + # Z-scores Zscore_L = Delta_L / WT_sd_L, Zscore_K = Delta_K / WT_sd_K, Zscore_r = Delta_r / WT_sd_r, Zscore_AUC = Delta_AUC / WT_sd_AUC ) %>% + ungroup() + + # Fit linear models within each group + calculations <- calculations %>% + group_by(across(all_of(group_vars))) %>% group_modify(~ { - # Perform linear models - lm_L <- lm(Delta_L ~ conc_num_factor, data = .x) - lm_K <- lm(Delta_K ~ conc_num_factor, data = .x) - lm_r <- lm(Delta_r ~ conc_num_factor, data = .x) - lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = .x) - - .x %>% + data <- .x + # Fit linear models + lm_L <- lm(Delta_L ~ conc_num_factor, data = data) + lm_K <- lm(Delta_K ~ conc_num_factor, data = data) + lm_r <- lm(Delta_r ~ conc_num_factor, data = data) + lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = data) + data <- data %>% mutate( lm_intercept_L = coef(lm_L)[1], lm_slope_L = coef(lm_L)[2], R_Squared_L = summary(lm_L)$r.squared, lm_Score_L = max_conc * lm_slope_L + lm_intercept_L, - + # Repeat for K, r, and AUC lm_intercept_K = coef(lm_K)[1], lm_slope_K = coef(lm_K)[2], R_Squared_K = summary(lm_K)$r.squared, lm_Score_K = max_conc * lm_slope_K + lm_intercept_K, - lm_intercept_r = coef(lm_r)[1], lm_slope_r = coef(lm_r)[2], R_Squared_r = summary(lm_r)$r.squared, lm_Score_r = max_conc * lm_slope_r + lm_intercept_r, - lm_intercept_AUC = coef(lm_AUC)[1], lm_slope_AUC = coef(lm_AUC)[2], R_Squared_AUC = summary(lm_AUC)$r.squared, lm_Score_AUC = max_conc * lm_slope_AUC + lm_intercept_AUC ) + return(data) }) %>% ungroup() - # Summary statistics for lm scores + # Compute lm means and sds across all data without grouping lm_means_sds <- calculations %>% - group_by(across(all_of(group_vars))) %>% summarise( - mean_lm_L = mean(lm_Score_L, na.rm = TRUE), - sd_lm_L = sd(lm_Score_L, na.rm = TRUE), - mean_lm_K = mean(lm_Score_K, na.rm = TRUE), - sd_lm_K = sd(lm_Score_K, na.rm = TRUE), - mean_lm_r = mean(lm_Score_r, na.rm = TRUE), - sd_lm_r = sd(lm_Score_r, na.rm = TRUE), - mean_lm_AUC = mean(lm_Score_AUC, na.rm = TRUE), - sd_lm_AUC = sd(lm_Score_AUC, na.rm = TRUE) - ) - - # Continue with gene Z-scores and interactions - calculations <- calculations %>% - left_join(lm_means_sds, by = group_vars) %>% - group_by(across(all_of(group_vars))) %>% - mutate( - Z_lm_L = (lm_Score_L - mean_lm_L) / sd_lm_L, - Z_lm_K = (lm_Score_K - mean_lm_K) / sd_lm_K, - Z_lm_r = (lm_Score_r - mean_lm_r) / sd_lm_r, - Z_lm_AUC = (lm_Score_AUC - mean_lm_AUC) / sd_lm_AUC + lm_mean_L = mean(lm_Score_L, na.rm = TRUE), + lm_sd_L = sd(lm_Score_L, na.rm = TRUE), + lm_mean_K = mean(lm_Score_K, na.rm = TRUE), + lm_sd_K = sd(lm_Score_K, na.rm = TRUE), + lm_mean_r = mean(lm_Score_r, na.rm = TRUE), + lm_sd_r = sd(lm_Score_r, na.rm = TRUE), + lm_mean_AUC = mean(lm_Score_AUC, na.rm = TRUE), + lm_sd_AUC = sd(lm_Score_AUC, na.rm = TRUE) ) - # Build summary stats (interactions) + # Apply global lm means and sds to calculate Z_lm_* + calculations <- calculations %>% + mutate( + Z_lm_L = (lm_Score_L - lm_means_sds$lm_mean_L) / lm_means_sds$lm_sd_L, + Z_lm_K = (lm_Score_K - lm_means_sds$lm_mean_K) / lm_means_sds$lm_sd_K, + Z_lm_r = (lm_Score_r - lm_means_sds$lm_mean_r) / lm_means_sds$lm_sd_r, + Z_lm_AUC = (lm_Score_AUC - lm_means_sds$lm_mean_AUC) / lm_means_sds$lm_sd_AUC + ) + + # Build interactions data frame interactions <- calculations %>% group_by(across(all_of(group_vars))) %>% summarise( - Avg_Zscore_L = sum(Zscore_L, na.rm = TRUE) / first(num_non_removed_concs), - Avg_Zscore_K = sum(Zscore_K, na.rm = TRUE) / first(num_non_removed_concs), - Avg_Zscore_r = sum(Zscore_r, na.rm = TRUE) / first(num_non_removed_concs), - Avg_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE) / first(num_non_removed_concs), - + Avg_Zscore_L = mean(Zscore_L, na.rm = TRUE), + Avg_Zscore_K = mean(Zscore_K, na.rm = TRUE), + Avg_Zscore_r = mean(Zscore_r, na.rm = TRUE), + Avg_Zscore_AUC = mean(Zscore_AUC, na.rm = TRUE), + # Interaction Z-scores Z_lm_L = first(Z_lm_L), Z_lm_K = first(Z_lm_K), Z_lm_r = first(Z_lm_r), Z_lm_AUC = first(Z_lm_AUC), - + # Raw Shifts Raw_Shift_L = first(Raw_Shift_L), Raw_Shift_K = first(Raw_Shift_K), Raw_Shift_r = first(Raw_Shift_r), Raw_Shift_AUC = first(Raw_Shift_AUC), - + # Z Shifts Z_Shift_L = first(Z_Shift_L), Z_Shift_K = first(Z_Shift_K), Z_Shift_r = first(Z_Shift_r), Z_Shift_AUC = first(Z_Shift_AUC), - + # NG, DB, SM values NG = first(NG), DB = first(DB), - SM = first(SM) + SM = first(SM), + + # R Squared values + R_Squared_L = first(R_Squared_L), + R_Squared_K = first(R_Squared_K), + R_Squared_r = first(R_Squared_r), + R_Squared_AUC = first(R_Squared_AUC), + + .groups = "drop" ) - # Creating the final calculations and interactions dataframes with only required columns for csv output + # Create the final calculations and interactions dataframes with required columns calculations_df <- calculations %>% select( all_of(group_vars), conc_num, conc_num_factor, conc_num_factor_factor, N, NG, DB, SM, - mean_L, median_L, sd_L, se_L, - mean_K, median_K, sd_K, se_K, - mean_r, median_r, sd_r, se_r, - mean_AUC, median_AUC, sd_AUC, se_AUC, + mean_L, mean_K, mean_r, mean_AUC, Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC, Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC, WT_L, WT_K, WT_r, WT_AUC, @@ -398,23 +408,15 @@ calculate_interaction_scores <- function(df, bg_df, group_vars, overlap_threshol Avg_Zscore_L, Avg_Zscore_K, Avg_Zscore_r, Avg_Zscore_AUC, Z_lm_L, Z_lm_K, Z_lm_r, Z_lm_AUC, Raw_Shift_L, Raw_Shift_K, Raw_Shift_r, Raw_Shift_AUC, - Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC + Z_Shift_L, Z_Shift_K, Z_Shift_r, Z_Shift_AUC, + R_Squared_L, R_Squared_K, R_Squared_r, R_Squared_AUC ) - calculations_no_overlap <- calculations %>% - # DB, NG, SM are same as in interactions, the rest may be different and need to be checked - select(-any_of(c( - "DB", "NG", "SM", - "Raw_Shift_L", "Raw_Shift_K", "Raw_Shift_r", "Raw_Shift_AUC", - "Z_Shift_L", "Z_Shift_K", "Z_Shift_r", "Z_Shift_AUC", - "Z_lm_L", "Z_lm_K", "Z_lm_r", "Z_lm_AUC" - ))) + # Create full_data by joining calculations_df and interactions_df + full_data <- calculations_df %>% + left_join(interactions_df, by = group_vars, suffix = c("", "_interaction")) - # Use left_join to avoid dimension mismatch issues - full_data <- calculations_no_overlap %>% - left_join(interactions, by = group_vars) - - # Return full_data and the two required dataframes (calculations and interactions) + # Return the dataframes return(list( calculations = calculations_df, interactions = interactions_df, @@ -1118,7 +1120,7 @@ main <- function() { df_stats <- calculate_summary_stats( df = df, variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$df_with_stats + group_vars = c("conc_num"))$df_with_stats frequency_delta_bg_plot_configs <- list( plots = list( @@ -1184,7 +1186,7 @@ main <- function() { ss <- calculate_summary_stats( df = df_na, variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) + group_vars = c("conc_num")) df_na_ss <- ss$summary_stats df_na_stats <- ss$df_with_stats # formerly X_stats_ALL write.csv(df_na_ss, file = file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) @@ -1196,7 +1198,7 @@ main <- function() { df_no_zeros_stats <- calculate_summary_stats( df = df_no_zeros, variables = c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor") + group_vars = c("conc_num") )$df_with_stats message("Filtering by 2SD of K") @@ -1208,13 +1210,13 @@ main <- function() { message("Calculating summary statistics for L within 2SD of K") # TODO We're omitting the original z_max calculation, not sure if needed? ss <- calculate_summary_stats(df_na_within_2sd_k, "L", - group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor"))$summary_stats + group_vars = c("conc_num"))$summary_stats write.csv(ss, file = file.path(out_dir_qc, "max_observed_L_vals_for_spots_within_2sd_K.csv"), row.names = FALSE) message("Calculating summary statistics for L outside 2SD of K") - ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor", "conc_num_factor_factor")) + ss <- calculate_summary_stats(df_na_outside_2sd_k, "L", group_vars = c("conc_num")) df_na_l_outside_2sd_k_stats <- ss$df_with_stats write.csv(ss$summary_stats, file = file.path(out_dir, "max_observed_L_vals_for_spots_outside_2sd_K.csv"), @@ -1342,7 +1344,7 @@ main <- function() { message("Calculating summary statistics for background strain") ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), - group_vars = c("OrfRep", "conc_num", "conc_num_factor", "conc_num_factor_factor")) + group_vars = c("OrfRep", "conc_num")) summary_stats_bg <- ss_bg$summary_stats df_bg_stats <- ss_bg$df_with_stats write.csv( @@ -1354,7 +1356,7 @@ main <- function() { df_reference <- df_na_stats %>% # formerly X2_RF filter(OrfRep == strain) %>% filter(!is.na(L)) %>% - group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% + group_by(conc_num) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1366,7 +1368,7 @@ main <- function() { df_reference_stats <- calculate_summary_stats( df = df_reference, variables = c("L", "K", "r", "AUC"), - group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") + group_vars = c("OrfRep", "Gene", "num", "conc_num") )$df_with_stats reference_results <- calculate_interaction_scores(df_reference_stats, df_bg_stats, group_vars = c("OrfRep", "Gene", "num")) zscore_calculations_reference <- reference_results$calculations @@ -1377,7 +1379,7 @@ main <- function() { df_deletion <- df_na_stats %>% # formerly X2 filter(OrfRep != strain) %>% filter(!is.na(L)) %>% - group_by(conc_num, conc_num_factor, conc_num_factor_factor) %>% + group_by(conc_num) %>% mutate( max_l_theoretical = max(max_L, na.rm = TRUE), L = ifelse(L == 0 & !is.na(L) & conc_num > 0, max_l_theoretical, L), @@ -1389,7 +1391,7 @@ main <- function() { df_deletion_stats <- calculate_summary_stats( df = df_deletion, variables = c("L", "K", "r", "AUC"), - group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor") + group_vars = c("OrfRep", "Gene", "conc_num") )$df_with_stats deletion_results <- calculate_interaction_scores(df_deletion_stats, df_bg_stats, group_vars = c("OrfRep", "Gene")) zscore_calculations <- deletion_results$calculations