From 62e07f53fd047b86a5e774444711854877c02a52 Mon Sep 17 00:00:00 2001 From: Bryan Roessler Date: Wed, 18 Sep 2024 20:08:24 -0400 Subject: [PATCH] More interaction df changes --- .../apps/r/calculate_interaction_zscores.R | 94 ++++++++----------- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R index ae3969a5..3da34a34 100644 --- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R +++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R @@ -181,19 +181,15 @@ calculate_summary_stats <- function(df, variables, group_vars) { ) # Create a cleaned version of df that doesn't overlap with summary_stats - cols_to_keep <- setdiff(names(df), names(summary_stats)[-which(names(summary_stats) %in% group_vars)]) - df_cleaned <- df %>% - select(all_of(cols_to_keep)) + cleaned_df <- df %>% + select(-any_of(setdiff(intersect(names(df), names(summary_stats)), group_vars))) + df_joined <- left_join(cleaned_df, summary_stats, by = group_vars) - df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars) - - return(list(summary_stats = summary_stats, df_with_stats = df_with_stats)) + return(list(summary_stats = summary_stats, df_with_stats = df_joined)) } -calculate_interaction_scores <- function(df, max_conc) { - - variables <- c("L", "K", "r", "AUC") - group_vars <- c("OrfRep", "Gene", "num") +calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", "r", "AUC"), + group_vars = c("OrfRep", "Gene", "num")) { # Calculate total concentration variables total_conc_num <- length(unique(df$conc_num)) @@ -214,11 +210,14 @@ calculate_interaction_scores <- function(df, max_conc) { ) # Calculate per spot - stats <- calculate_summary_stats(df, + # sd and se calculations are invalid when grouping at this level + interaction_ss <- calculate_summary_stats( + df = df, variables = variables, group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor") - )$summary_stats + )$df_with_stats + # Load original WT data stats <- df %>% group_by(across(all_of(group_vars))) %>% mutate( @@ -232,6 +231,11 @@ calculate_interaction_scores <- function(df, max_conc) { WT_sd_AUC = sd_AUC ) + cleaned_df <- stats %>% + select(-any_of(setdiff(intersect(names(stats), names(interaction_ss)), + c(group_vars, "sd_L", "sd_K", "sd_r", "sd_AUC", "se_L", "se_K", "se_r", "se_AUC")))) + stats <- left_join(cleaned_df, interaction_ss, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + stats <- stats %>% mutate( Raw_Shift_L = first(mean_L) - bg_means$L, @@ -273,11 +277,11 @@ calculate_interaction_scores <- function(df, max_conc) { Zscore_AUC = Delta_AUC / WT_sd_AUC ) - # Calculate linear models - lm_L <- lm(Delta_L ~ conc_num, data = stats) - lm_K <- lm(Delta_K ~ conc_num, data = stats) - lm_r <- lm(Delta_r ~ conc_num, data = stats) - lm_AUC <- lm(Delta_AUC ~ conc_num, data = stats) + # Fit linear models and calculate interaction scores + gene_lm_L <- lm(formula = Delta_L ~ conc_num_factor, data = df) + gene_lm_K <- lm(formula = Delta_K ~ conc_num_factor, data = df) + gene_lm_r <- lm(formula = Delta_r ~ conc_num_factor, data = df) + gene_lm_AUC <- lm(formula = Delta_AUC ~ conc_num_factor, data = df) interactions <- stats %>% group_by(across(all_of(group_vars))) %>% @@ -294,24 +298,15 @@ calculate_interaction_scores <- function(df, max_conc) { Z_Shift_L = first(Z_Shift_L), Z_Shift_K = first(Z_Shift_K), Z_Shift_r = first(Z_Shift_r), - Z_Shift_AUC = first(Z_Shift_AUC) - ) - - # Summarise the data to calculate summary statistics - summary_stats <- interactions %>% - summarise( - Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), - Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), - Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), - Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE), - lm_Score_L = max(conc_num) * coef(lm(Zscore_L ~ conc_num))[2] + coef(lm(Zscore_L ~ conc_num))[1], - lm_Score_K = max(conc_num) * coef(lm(Zscore_K ~ conc_num))[2] + coef(lm(Zscore_K ~ conc_num))[1], - lm_Score_r = max(conc_num) * coef(lm(Zscore_r ~ conc_num))[2] + coef(lm(Zscore_r ~ conc_num))[1], - lm_Score_AUC = max(conc_num) * coef(lm(Zscore_AUC ~ conc_num))[2] + coef(lm(Zscore_AUC ~ conc_num))[1], - R_Squared_L = summary(lm(Zscore_L ~ conc_num))$r.squared, - R_Squared_K = summary(lm(Zscore_K ~ conc_num))$r.squared, - R_Squared_r = summary(lm(Zscore_r ~ conc_num))$r.squared, - R_Squared_AUC = summary(lm(Zscore_AUC ~ conc_num))$r.squared, + Z_Shift_AUC = first(Z_Shift_AUC), + lm_Score_L = max_conc * (gene_lm_L$coefficients[2]) + gene_lm_L$coefficients[1], + lm_Score_K = max_conc * (gene_lm_K$coefficients[2]) + gene_lm_K$coefficients[1], + lm_Score_r = max_conc * (gene_lm_r$coefficients[2]) + gene_lm_r$coefficients[1], + lm_Score_AUC = max_conc * (gene_lm_AUC$coefficients[2]) + gene_lm_AUC$coefficients[1], + R_Squared_L = summary(gene_lm_L)$r.squared, + R_Squared_K = summary(gene_lm_K)$r.squared, + R_Squared_r = summary(gene_lm_r)$r.squared, + R_Squared_AUC = summary(gene_lm_AUC)$r.squared, lm_intercept_L = coef(lm(Zscore_L ~ conc_num))[1], lm_slope_L = coef(lm(Zscore_L ~ conc_num))[2], lm_intercept_K = coef(lm(Zscore_K ~ conc_num))[1], @@ -320,22 +315,15 @@ calculate_interaction_scores <- function(df, max_conc) { lm_slope_r = coef(lm(Zscore_r ~ conc_num))[2], lm_intercept_AUC = coef(lm(Zscore_AUC ~ conc_num))[1], lm_slope_AUC = coef(lm(Zscore_AUC ~ conc_num))[2], + Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE), + Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE), + Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE), + Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE), NG = sum(NG, na.rm = TRUE), DB = sum(DB, na.rm = TRUE), SM = sum(SM, na.rm = TRUE), - .groups = "keep" + num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1 ) - - # Join the summary data back to the original data - cleaned_interactions <- interactions %>% - select(-any_of(intersect(names(interactions), names(summary_stats)))) - interactions_joined <- left_join(cleaned_interactions, summary_stats, by = group_vars) - interactions_joined <- interactions_joined %>% distinct() - - # Remove duplicate rows if necessary - interactions <- interactions %>% distinct() - - num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1 interactions <- interactions %>% mutate( @@ -367,13 +355,13 @@ calculate_interaction_scores <- function(df, max_conc) { "Zscore_L", "Zscore_K", "Zscore_r", "Zscore_AUC", "NG", "SM", "DB") - calculations_joined <- df %>% - select(-any_of(intersect(names(df), names(calculations)))) - calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + cleaned_df <- df %>% + select(-any_of(setdiff(intersect(names(df), names(calculations)), group_vars))) + calculations_joined <- left_join(cleaned_df, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) - interactions_joined <- df %>% - select(-any_of(intersect(names(df), names(interactions)))) - interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) + cleaned_df <- df %>% + select(-any_of(setdiff(intersect(names(df), names(interactions)), group_vars))) + interactions_joined <- left_join(cleaned_df, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")) return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined, calculations_joined = calculations_joined))