Remove existing columns before join

2024-09-02 01:53:04 -04:00
parent 66062438b5
commit 756075a7e5
1 changed files with 69 additions and 47 deletions
--- a/workflow/apps/r/calculate_interaction_zscores5.R
+++ b/workflow/apps/r/calculate_interaction_zscores5.R
@@ -58,8 +58,9 @@ parse_arguments <- function() {

 args <- parse_arguments()

-dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
-dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)
+# Should we keep output in exp dirs or combine in the study output dir?
+# dir.create(file.path(args$out_dir, "zscores"), showWarnings = FALSE)
+# dir.create(file.path(args$out_dir, "zscores", "qc"), showWarnings = FALSE)

 # Define themes and scales
 theme_publication <- function(base_size = 14, base_family = "sans", legend_position = "bottom") {
@@ -156,33 +157,25 @@ update_gene_names <- function(df, sgd_gene_list) {
 }

 generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num", title, x_label = NULL, y_label = NULL) {
-  # Base ggplot object with conditional y mapping
+  # Start the ggplot with just the x_var and color_var
+  plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var)))) +
+    ggtitle(title) +
+    theme_publication()
+  
+  # If y_var is not NULL, add it to the plot aesthetics
  if (!is.null(y_var)) {
-    plot <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(y_var), color = as.factor(!!sym(color_var))))
-  } else {
-    plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var))))
+    plot <- plot + aes(y = !!sym(y_var))
  }
  
  # Handle different plot types
-  if (plot_type == "scatter") {
-    plot <- plot +
-      geom_point(shape = 3, size = 0.2, position = "jitter") +
+  plot <- switch(plot_type,
+    "scatter" = plot + geom_point(shape = 3, size = 0.2, position = "jitter") +
      stat_summary(fun.data = mean_sdl, geom = "errorbar") +
-      stat_summary(fun = mean, geom = "point", size = 0.6)
-  } else if (plot_type == "box") {
-    plot <- plot + geom_boxplot()
-  } else if (plot_type == "density") {
-    plot <- plot + geom_density()
-  } else if (plot_type == "bar") {
-    if (!is.null(y_var)) {
-      plot <- plot + geom_bar(stat = "identity")  # Use y aesthetic if provided
-    } else {
-      plot <- plot + geom_bar()  # Default to counting occurrences
-    }
-  }
-  
-  # Add titles and labels
-  plot <- plot + ggtitle(title) + theme_publication()
+      stat_summary(fun = mean, geom = "point", size = 0.6),
+    "box" = plot + geom_boxplot(),
+    "density" = plot + geom_density(),
+    "bar" = plot + geom_bar(stat = ifelse(is.null(y_var), "count", "identity"))
+  )
  
  # Add optional labels if provided
  if (!is.null(x_label)) plot <- plot + xlab(x_label)
@@ -192,6 +185,8 @@ generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_
 }


+
+
 generate_and_save_plots <- function(df, output_dir, prefix, variables, include_qc = FALSE) {
  plots <- list()
  
@@ -278,18 +273,23 @@ save_plots <- function(file_name, plot_list, output_dir) {
 process_strains <- function(df, l_within_2sd_k, strain) {
  df_strains <- data.frame()  # Initialize an empty dataframe to store results

+  print(names(l_within_2sd_k))
+  
  for (concentration in unique(df$conc_num)) {
    df_temp <- df %>% filter(conc_num == concentration)

+    
+
    if (concentration > 0) {
      max_l_theoretical <- l_within_2sd_k %>%
        filter(conc_num_factor == concentration) %>%
        pull(L_max)
+      
      df_temp <- df_temp %>%
        mutate(
-          L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L),
-          SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM),
-          L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L)
+          L = ifelse(L == 0 & !is.na(L), max_l_theoretical, L),  # Replace zero values with max theoretical
+          SM = ifelse(L >= max_l_theoretical & !is.na(L), 1, SM),  # Set SM flag
+          L = ifelse(L >= max_l_theoretical & !is.na(L), max_l_theoretical, L)  # Cap L values
        )
    }
    df_strains <- bind_rows(df_strains, df_temp)  # Append the results of this concentration
@@ -298,6 +298,7 @@ process_strains <- function(df, l_within_2sd_k, strain) {
  return(df_strains)
 }

+
 calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {

  # Pull the background means
@@ -536,13 +537,9 @@ main <- function() {
    df_above_tolerance <- df %>% filter(DB == 1)
    
    # Set vars above the delta background tolerance to NA
-    df_na <- df %>%
-      mutate(
-        L = ifelse(DB == 1, NA, L),
-        r = ifelse(DB == 1, NA, r),
-        AUC = ifelse(DB == 1, NA, AUC),
-        K = ifelse(DB == 1, NA, K)
-      )
+    df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .x)))
+
+    # Remove rows where L is 0
    df_no_zeros <- df_na %>% filter(L > 0)

    # Flag and remove non-finite data, printing affected rows
@@ -551,18 +548,18 @@ main <- function() {
      {
        if (nrow(.) > 0) {
          message("Removing non-finite rows:\n")
-          #print(.)
+          print(head(., n = 10))
        }
        df_na %>% filter(if_all(c(L), is.finite)) # Add L, r, AUC, K as needed for debugging
      }
    
    # Generate QC PDFs and HTMLs
-    message("Generating QC plots")
-    variables <- c("L", "K", "r", "AUC", "delta_bg")
-    generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
-    generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
-    generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
-    generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)
+    # message("Generating QC plots")
+    # variables <- c("L", "K", "r", "AUC", "delta_bg")
+    # generate_and_save_plots(df, out_dir_qc, "Before_QC", variables, include_qc = TRUE)
+    # generate_and_save_plots(df_above_tolerance, out_dir_qc, "Raw_L_vs_K_above_delta_bg_threshold", variables, include_qc = TRUE)
+    # generate_and_save_plots(df_na_filtered, out_dir_qc, "After_QC", variables)
+    # generate_and_save_plots(df_no_zeros, out_dir_qc, "No_Zeros", variables)

    rm(df, df_above_tolerance, df_no_zeros)
    
@@ -585,12 +582,37 @@ main <- function() {

    # Calculate summary statistics for L within and outside 2SD of K
    message("Calculating summary statistics for L within 2SD of K")
+
    l_within_2sd_k <- calculate_summary_stats(within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
+    
+    cols_to_remove <- names(l_within_2sd_k)
+    cols_to_keep <- c("conc_num", "conc_num_factor")
+
+    within_2sd_k_clean <- within_2sd_k %>%
+      select(-all_of(setdiff(cols_to_remove, cols_to_keep)))
+
+    l_within_2sd_k_joined <- within_2sd_k_clean %>%
+      left_join(l_within_2sd_k, by = c("conc_num", "conc_num_factor"))
+
+
+
+
+    #l_within_2sd_k_joined <- merge(within_2sd_k, l_within_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE)
+    print("within_2sd_k")
+    print(head(within_2sd_k))
+    print("l_within_2sd_k")
+    print(head(l_within_2sd_k))
+    print("l_within_2sd_k_joined")
+    print(head(l_within_2sd_k_joined))
+
+    quit()
+    
    write.csv(l_within_2sd_k,
-      file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
+      file = file.path(out_dir_qc, "Max_Observed_L_Vals_for_spots_within_2sd_k.csv"),
      row.names = FALSE)
    message("Calculating summary statistics for for L outside 2SD of K")
    l_outside_2sd_k <- calculate_summary_stats(outside_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
+    l_outside_2sd_k_joined <- merge(outside_2sd_k, l_outside_2sd_k, by = c("conc_num", "conc_num_factor"), all.x = TRUE)
    write.csv(l_outside_2sd_k,
      file = file.path(out_dir, "Max_Observed_L_Vals_for_spots_outside_2sd_k.csv"),
      row.names = FALSE)
@@ -615,11 +637,11 @@ main <- function() {
      
      # Recalculate summary statistics for the background strain
      message("Calculating summary statistics for background strain")
-      stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
+      stats_bg <- calculate_summary_stats(df_bg, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
      write.csv(stats_bg,
        file = file.path(out_dir, paste0("SummaryStats_BackgroundStrains_", strain, ".csv")),
        row.names = FALSE)
-      stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "Gene", "conc_num", "conc_num_factor"))
+      stats_bg_joined <- left_join(df_bg, stats_bg, by = c("OrfRep", "conc_num", "conc_num_factor"))
      
      # Filter reference and deletion strains
      # Formerly X2_RF (reference strain)
@@ -633,9 +655,9 @@ main <- function() {
        mutate(SM = 0)
      
      message("Processing reference strain")
-      reference_strain <- process_strains(df_reference, l_within_2sd_k, strain)
+      reference_strain <- process_strains(df_reference, l_within_2sd_k_joined, strain)
      message("Processing deletion strains")
-      deletion_strains <- process_strains(df_deletion, l_within_2sd_k, strain)
+      deletion_strains <- process_strains(df_deletion, l_within_2sd_k_joined, strain)
      
      # TODO we may need to add "num" to grouping vars