7 月之前 · 62e07f53fd
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -181,19 +181,15 @@ calculate_summary_stats <- function(df, variables, group_vars) {
 
				     )
			
 
				 
			
 
				   # Create a cleaned version of df that doesn't overlap with summary_stats
			
 
				-  cols_to_keep <- setdiff(names(df), names(summary_stats)[-which(names(summary_stats) %in% group_vars)])
			
 
				-  df_cleaned <- df %>%
			
 
				-    select(all_of(cols_to_keep))
			
 
				+  cleaned_df <- df %>%
			
 
				+    select(-any_of(setdiff(intersect(names(df), names(summary_stats)), group_vars)))
			
 
				+  df_joined <- left_join(cleaned_df, summary_stats, by = group_vars)
			
 
				 
			
 
				-  df_with_stats <- left_join(df_cleaned, summary_stats, by = group_vars)
			
 
				-
			
 
				-  return(list(summary_stats = summary_stats, df_with_stats = df_with_stats))
			
 
				+  return(list(summary_stats = summary_stats, df_with_stats = df_joined))
			
 
				 }
			
 
				 
			
 
				-calculate_interaction_scores <- function(df, max_conc) {
			
 
				-
			
 
				-  variables <- c("L", "K", "r", "AUC")
			
 
				-  group_vars <- c("OrfRep", "Gene", "num")
			
 
				+calculate_interaction_scores <- function(df, max_conc, variables = c("L", "K", "r", "AUC"),
			
 
				+  group_vars = c("OrfRep", "Gene", "num")) {
			
 
				 
			
 
				   # Calculate total concentration variables
			
 
				   total_conc_num <- length(unique(df$conc_num))
			
@@ -214,11 +210,14 @@ calculate_interaction_scores <- function(df, max_conc) {
 
				   )
			
 
				 
			
 
				   # Calculate per spot
			
 
				-  stats <- calculate_summary_stats(df,
			
 
				+  # sd and se calculations are invalid when grouping at this level
			
 
				+  interaction_ss <- calculate_summary_stats(
			
 
				+    df = df,
			
 
				     variables = variables,
			
 
				     group_vars = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor")
			
 
				-    )$summary_stats
			
 
				+    )$df_with_stats
			
 
				 
			
 
				+  # Load original WT data
			
 
				   stats <- df %>%
			
 
				     group_by(across(all_of(group_vars))) %>%
			
 
				     mutate(
			
@@ -232,6 +231,11 @@ calculate_interaction_scores <- function(df, max_conc) {
 
				       WT_sd_AUC = sd_AUC
			
 
				     )
			
 
				 
			
 
				+  cleaned_df <- stats %>%
			
 
				+  select(-any_of(setdiff(intersect(names(stats), names(interaction_ss)),
			
 
				+    c(group_vars, "sd_L", "sd_K", "sd_r", "sd_AUC", "se_L", "se_K", "se_r", "se_AUC"))))
			
 
				+  stats <- left_join(cleaned_df, interaction_ss, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
			
 
				+
			
 
				   stats <- stats %>%
			
 
				     mutate(
			
 
				       Raw_Shift_L = first(mean_L) - bg_means$L,
			
@@ -273,11 +277,11 @@ calculate_interaction_scores <- function(df, max_conc) {
 
				       Zscore_AUC = Delta_AUC / WT_sd_AUC
			
 
				     )
			
 
				 
			
 
				-  # Calculate linear models
			
 
				-  lm_L <- lm(Delta_L ~ conc_num, data = stats)
			
 
				-  lm_K <- lm(Delta_K ~ conc_num, data = stats)
			
 
				-  lm_r <- lm(Delta_r ~ conc_num, data = stats)
			
 
				-  lm_AUC <- lm(Delta_AUC ~ conc_num, data = stats)
			
 
				+  # Fit linear models and calculate interaction scores
			
 
				+  gene_lm_L <- lm(formula = Delta_L ~ conc_num_factor, data = df)
			
 
				+  gene_lm_K <- lm(formula = Delta_K ~ conc_num_factor, data = df)
			
 
				+  gene_lm_r <- lm(formula = Delta_r ~ conc_num_factor, data = df)
			
 
				+  gene_lm_AUC <- lm(formula = Delta_AUC ~ conc_num_factor, data = df)
			
 
				 
			
 
				   interactions <- stats %>%
			
 
				     group_by(across(all_of(group_vars))) %>%
			
@@ -294,24 +298,15 @@ calculate_interaction_scores <- function(df, max_conc) {
 
				       Z_Shift_L = first(Z_Shift_L),
			
 
				       Z_Shift_K = first(Z_Shift_K),
			
 
				       Z_Shift_r = first(Z_Shift_r),
			
 
				-      Z_Shift_AUC = first(Z_Shift_AUC)
			
 
				-    )
			
 
				-
			
 
				-  # Summarise the data to calculate summary statistics
			
 
				-  summary_stats <- interactions %>%
			
 
				-    summarise(
			
 
				-      Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
			
 
				-      Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
			
 
				-      Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
			
 
				-      Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE),
			
 
				-      lm_Score_L = max(conc_num) * coef(lm(Zscore_L ~ conc_num))[2] + coef(lm(Zscore_L ~ conc_num))[1],
			
 
				-      lm_Score_K = max(conc_num) * coef(lm(Zscore_K ~ conc_num))[2] + coef(lm(Zscore_K ~ conc_num))[1],
			
 
				-      lm_Score_r = max(conc_num) * coef(lm(Zscore_r ~ conc_num))[2] + coef(lm(Zscore_r ~ conc_num))[1],
			
 
				-      lm_Score_AUC = max(conc_num) * coef(lm(Zscore_AUC ~ conc_num))[2] + coef(lm(Zscore_AUC ~ conc_num))[1],
			
 
				-      R_Squared_L = summary(lm(Zscore_L ~ conc_num))$r.squared,
			
 
				-      R_Squared_K = summary(lm(Zscore_K ~ conc_num))$r.squared,
			
 
				-      R_Squared_r = summary(lm(Zscore_r ~ conc_num))$r.squared,
			
 
				-      R_Squared_AUC = summary(lm(Zscore_AUC ~ conc_num))$r.squared,
			
 
				+      Z_Shift_AUC = first(Z_Shift_AUC),
			
 
				+      lm_Score_L = max_conc * (gene_lm_L$coefficients[2]) + gene_lm_L$coefficients[1],
			
 
				+      lm_Score_K = max_conc * (gene_lm_K$coefficients[2]) + gene_lm_K$coefficients[1],
			
 
				+      lm_Score_r = max_conc * (gene_lm_r$coefficients[2]) + gene_lm_r$coefficients[1],
			
 
				+      lm_Score_AUC = max_conc * (gene_lm_AUC$coefficients[2]) + gene_lm_AUC$coefficients[1],
			
 
				+      R_Squared_L = summary(gene_lm_L)$r.squared,
			
 
				+      R_Squared_K = summary(gene_lm_K)$r.squared,
			
 
				+      R_Squared_r = summary(gene_lm_r)$r.squared,
			
 
				+      R_Squared_AUC = summary(gene_lm_AUC)$r.squared,
			
 
				       lm_intercept_L = coef(lm(Zscore_L ~ conc_num))[1],
			
 
				       lm_slope_L = coef(lm(Zscore_L ~ conc_num))[2],
			
 
				       lm_intercept_K = coef(lm(Zscore_K ~ conc_num))[1],
			
@@ -320,22 +315,15 @@ calculate_interaction_scores <- function(df, max_conc) {
 
				       lm_slope_r = coef(lm(Zscore_r ~ conc_num))[2],
			
 
				       lm_intercept_AUC = coef(lm(Zscore_AUC ~ conc_num))[1],
			
 
				       lm_slope_AUC = coef(lm(Zscore_AUC ~ conc_num))[2],
			
 
				+      Sum_Zscore_L = sum(Zscore_L, na.rm = TRUE),
			
 
				+      Sum_Zscore_K = sum(Zscore_K, na.rm = TRUE),
			
 
				+      Sum_Zscore_r = sum(Zscore_r, na.rm = TRUE),
			
 
				+      Sum_Zscore_AUC = sum(Zscore_AUC, na.rm = TRUE),
			
 
				       NG = sum(NG, na.rm = TRUE),
			
 
				       DB = sum(DB, na.rm = TRUE),
			
 
				       SM = sum(SM, na.rm = TRUE),
			
 
				-      .groups = "keep"
			
 
				+      num_non_removed_concs = total_conc_num - sum(DB, na.rm = TRUE) - 1
			
 
				     )
			
 
				-
			
 
				-  # Join the summary data back to the original data
			
 
				-  cleaned_interactions <- interactions %>%
			
 
				-    select(-any_of(intersect(names(interactions), names(summary_stats))))
			
 
				-  interactions_joined <- left_join(cleaned_interactions, summary_stats, by = group_vars)
			
 
				-  interactions_joined <- interactions_joined %>% distinct()
			
 
				-
			
 
				-  # Remove duplicate rows if necessary
			
 
				-  interactions <- interactions %>% distinct()
			
 
				-
			
 
				-  num_non_removed_concs <- total_conc_num - sum(stats$DB, na.rm = TRUE) - 1
			
 
				   
			
 
				   interactions <- interactions %>%
			
 
				     mutate(
			
@@ -367,13 +355,13 @@ calculate_interaction_scores <- function(df, max_conc) {
 
				       "Zscore_L", "Zscore_K", "Zscore_r", "Zscore_AUC",
			
 
				       "NG", "SM", "DB")
			
 
				     
			
 
				-  calculations_joined <- df %>%
			
 
				-    select(-any_of(intersect(names(df), names(calculations))))
			
 
				-  calculations_joined <- left_join(calculations_joined, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
			
 
				+  cleaned_df <- df %>%
			
 
				+    select(-any_of(setdiff(intersect(names(df), names(calculations)), group_vars)))
			
 
				+  calculations_joined <- left_join(cleaned_df, calculations, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
			
 
				 
			
 
				-  interactions_joined <- df %>%
			
 
				-    select(-any_of(intersect(names(df), names(interactions))))
			
 
				-  interactions_joined <- left_join(interactions_joined, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
			
 
				+  cleaned_df <- df %>%
			
 
				+    select(-any_of(setdiff(intersect(names(df), names(interactions)), group_vars)))
			
 
				+  interactions_joined <- left_join(cleaned_df, interactions, by = c("OrfRep", "Gene", "num", "conc_num", "conc_num_factor"))
			
 
				 
			
 
				   return(list(calculations = calculations, interactions = interactions, interactions_joined = interactions_joined,
			
 
				     calculations_joined = calculations_joined))