Move error bars to generate_scattter_plots()

2024-10-06 14:58:54 -04:00
parent bee9aea866
commit faa82e0af4
1 changed files with 124 additions and 130 deletions
--- a/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
+++ b/qhtcp-workflow/apps/r/calculate_interaction_zscores.R
@@ -300,52 +300,45 @@ calculate_interaction_scores <- function(df, df_bg, type, overlap_threshold = 2)
    ungroup() %>%  # Ungroup before group_modify
    group_by(across(all_of(group_vars))) %>%
    group_modify(~ {
-      # Check if there are enough unique conc_num_factor levels to perform lm
-      if (length(unique(.x$conc_num_factor)) > 1) {
+
+      # Filter each column for valid data or else linear modeling will fail
+      valid_data_L <- .x %>% filter(!is.na(Delta_L))
+      valid_data_K <- .x %>% filter(!is.na(Delta_K))
+      valid_data_r <- .x %>% filter(!is.na(Delta_r))
+      valid_data_AUC <- .x %>% filter(!is.na(Delta_AUC))
      
      # Perform linear modeling
-        lm_L <- lm(Delta_L ~ conc_num_factor, data = .x)
-        lm_K <- lm(Delta_K ~ conc_num_factor, data = .x)
-        lm_r <- lm(Delta_r ~ conc_num_factor, data = .x)
-        lm_AUC <- lm(Delta_AUC ~ conc_num_factor, data = .x)
+      lm_L <- if (nrow(valid_data_L) > 1) lm(Delta_L ~ conc_num_factor, data = valid_data_L) else NULL
+      lm_K <- if (nrow(valid_data_K) > 1) lm(Delta_K ~ conc_num_factor, data = valid_data_K) else NULL
+      lm_r <- if (nrow(valid_data_r) > 1) lm(Delta_r ~ conc_num_factor, data = valid_data_r) else NULL
+      lm_AUC <- if (nrow(valid_data_AUC) > 1) lm(Delta_AUC ~ conc_num_factor, data = valid_data_AUC) else NULL

-        # If the model fails, set model-related values to NA
+      # Extract coefficients for calculations and plotting
      .x %>%
        mutate(
-            lm_intercept_L = ifelse(!is.null(lm_L), coef(lm_L)[1], NA),
-            lm_slope_L = ifelse(!is.null(lm_L), coef(lm_L)[2], NA),
-            R_Squared_L = ifelse(!is.null(lm_L), summary(lm_L)$r.squared, NA),
-            lm_Score_L = ifelse(!is.null(lm_L), max_conc * coef(lm_L)[2] + coef(lm_L)[1], NA),
+          lm_intercept_L = if (!is.null(lm_L)) coef(lm_L)[1] else NA,
+          lm_slope_L = if (!is.null(lm_L)) coef(lm_L)[2] else NA,
+          R_Squared_L = if (!is.null(lm_L)) summary(lm_L)$r.squared else NA,
+          lm_Score_L = if (!is.null(lm_L)) max_conc * coef(lm_L)[2] + coef(lm_L)[1] else NA,

-            lm_intercept_K = ifelse(!is.null(lm_K), coef(lm_K)[1], NA),
-            lm_slope_K = ifelse(!is.null(lm_K), coef(lm_K)[2], NA),
-            R_Squared_K = ifelse(!is.null(lm_K), summary(lm_K)$r.squared, NA),
-            lm_Score_K = ifelse(!is.null(lm_K), max_conc * coef(lm_K)[2] + coef(lm_K)[1], NA),
+          lm_intercept_K = if (!is.null(lm_K)) coef(lm_K)[1] else NA,
+          lm_slope_K = if (!is.null(lm_K)) coef(lm_K)[2] else NA,
+          R_Squared_K = if (!is.null(lm_K)) summary(lm_K)$r.squared else NA,
+          lm_Score_K = if (!is.null(lm_K)) max_conc * coef(lm_K)[2] + coef(lm_K)[1] else NA,

-            lm_intercept_r = ifelse(!is.null(lm_r), coef(lm_r)[1], NA),
-            lm_slope_r = ifelse(!is.null(lm_r), coef(lm_r)[2], NA),
-            R_Squared_r = ifelse(!is.null(lm_r), summary(lm_r)$r.squared, NA),
-            lm_Score_r = ifelse(!is.null(lm_r), max_conc * coef(lm_r)[2] + coef(lm_r)[1], NA),
+          lm_intercept_r = if (!is.null(lm_r)) coef(lm_r)[1] else NA,
+          lm_slope_r = if (!is.null(lm_r)) coef(lm_r)[2] else NA,
+          R_Squared_r = if (!is.null(lm_r)) summary(lm_r)$r.squared else NA,
+          lm_Score_r = if (!is.null(lm_r)) max_conc * coef(lm_r)[2] + coef(lm_r)[1] else NA,

-            lm_intercept_AUC = ifelse(!is.null(lm_AUC), coef(lm_AUC)[1], NA),
-            lm_slope_AUC = ifelse(!is.null(lm_AUC), coef(lm_AUC)[2], NA),
-            R_Squared_AUC = ifelse(!is.null(lm_AUC), summary(lm_AUC)$r.squared, NA),
-            lm_Score_AUC = ifelse(!is.null(lm_AUC), max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1], NA)
+          lm_intercept_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[1] else NA,
+          lm_slope_AUC = if (!is.null(lm_AUC)) coef(lm_AUC)[2] else NA,
+          R_Squared_AUC = if (!is.null(lm_AUC)) summary(lm_AUC)$r.squared else NA,
+          lm_Score_AUC = if (!is.null(lm_AUC)) max_conc * coef(lm_AUC)[2] + coef(lm_AUC)[1] else NA
        )
-      } else {
-        # If not enough conc_num_factor levels, set lm-related values to NA
-        .x %>%
-          mutate(
-            lm_intercept_L = NA, lm_slope_L = NA, R_Squared_L = NA, lm_Score_L = NA,
-            lm_intercept_K = NA, lm_slope_K = NA, R_Squared_K = NA, lm_Score_K = NA,
-            lm_intercept_r = NA, lm_slope_r = NA, R_Squared_r = NA, lm_Score_r = NA,
-            lm_intercept_AUC = NA, lm_slope_AUC = NA, R_Squared_AUC = NA, lm_Score_AUC = NA
-          )
-      }
    }) %>%
    ungroup()

-
  # For interaction plot error bars
  delta_means_sds <- calculations %>%
    group_by(across(all_of(group_vars))) %>%
@@ -631,6 +624,93 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
        }
      }

+      # Convert ggplot to plotly for interactive version
+      plotly_plot <- suppressWarnings(plotly::ggplotly(plot))
+
+      # Store both static and interactive versions
+      static_plots[[i]] <- plot
+      plotly_plots[[i]] <- plotly_plot
+    }
+
+    # Print the plots in the current group to the PDF
+    if (!is.null(grid_layout)) {
+      # Set grid_ncol to 1 if not specified
+      if (is.null(grid_layout$ncol)) {
+        grid_layout$ncol <- 1
+      }
+
+      # If ncol is set but nrow is not, calculate nrow dynamically based on num_plots
+      if (!is.null(grid_layout$ncol) && is.null(grid_layout$nrow)) {
+        num_plots <- length(static_plots)
+        nrow <- ceiling(num_plots / grid_layout$ncol)
+        # message("No nrow provided, automatically using nrow = ", nrow)
+        grid_layout$nrow <- nrow
+      }
+      
+      total_spots <- grid_layout$nrow * grid_layout$ncol
+      num_plots <- length(static_plots)
+
+      if (num_plots < total_spots) {
+        message("Filling ", total_spots - num_plots, " empty spots with nullGrob()")
+        static_plots <- c(static_plots, replicate(total_spots - num_plots, nullGrob(), simplify = FALSE))
+      }
+
+      # Print a page of gridded plots
+      grid.arrange(
+        grobs = static_plots,
+        ncol = grid_layout$ncol,
+        nrow = grid_layout$nrow)
+        
+    } else {
+      # Print individual plots on separate pages if no grid layout
+      for (plot in static_plots) {
+        print(plot)
+      }
+    }
+  }
+
+  # Close the PDF device after all plots are done
+  dev.off()
+
+  # Save HTML file with interactive plots if needed
+  out_html_file <- file.path(out_dir, paste0(filename, ".html"))
+  message("Saving combined HTML file: ", out_html_file)
+  htmltools::save_html(
+    htmltools::tagList(plotly_plots),
+    file = out_html_file
+  )
+}
+
+generate_scatter_plot <- function(plot, config) {
+
+  # Define the points
+  shape <- if (!is.null(config$shape)) config$shape else 3
+  size <- if (!is.null(config$size)) config$size else 1.5
+  position <-
+    if (!is.null(config$position) && config$position == "jitter") {
+      position_jitter(width = 0.4, height = 0.1)
+    } else {
+      "identity"
+    }
+
+  plot <- plot + geom_point(
+    shape = shape,
+    size = size,
+    position = position
+  )
+
+  # Add a cyan point for the reference data for correlation plots
+  if (!is.null(config$cyan_points) && config$cyan_points) {
+    plot <- plot + geom_point(
+      data = config$df_reference,
+      mapping = aes(x = .data[[config$x_var]], y = .data[[config$y_var]]),
+      color = "cyan",
+      shape = 3,
+      size = 0.5,
+      inherit.aes = FALSE
+    )
+  }
+
  # Add error bars if specified
  if (!is.null(config$error_bar) && config$error_bar) {
    # Check if custom columns are provided for y_mean and y_sd, or use the defaults
@@ -703,92 +783,6 @@ generate_and_save_plots <- function(out_dir, filename, plot_configs, page_width
    }
  }
  
-      # Convert ggplot to plotly for interactive version
-      plotly_plot <- suppressWarnings(plotly::ggplotly(plot))
-
-      # Store both static and interactive versions
-      static_plots[[i]] <- plot
-      plotly_plots[[i]] <- plotly_plot
-    }
-
-    # Print the plots in the current group to the PDF
-    if (!is.null(grid_layout)) {
-      # Set grid_ncol to 1 if not specified
-      if (is.null(grid_layout$ncol)) {
-        grid_layout$ncol <- 1
-      }
-
-      # If ncol is set but nrow is not, calculate nrow dynamically based on num_plots
-      if (!is.null(grid_layout$ncol) && is.null(grid_layout$nrow)) {
-        num_plots <- length(static_plots)
-        nrow <- ceiling(num_plots / grid_layout$ncol)
-        # message("No nrow provided, automatically using nrow = ", nrow)
-        grid_layout$nrow <- nrow
-      }
-      
-      total_spots <- grid_layout$nrow * grid_layout$ncol
-      num_plots <- length(static_plots)
-
-      # if (num_plots < total_spots) {
-      #   message("Filling ", total_spots - num_plots, " empty spots with nullGrob()")
-      #   static_plots <- c(static_plots, replicate(total_spots - num_plots, nullGrob(), simplify = FALSE))
-      # }
-
-      grid.arrange(
-        grobs = static_plots,
-        ncol = grid_layout$ncol,
-        nrow = grid_layout$nrow
-      )
-    } else {
-      # Print individual plots on separate pages if no grid layout
-      for (plot in static_plots) {
-        print(plot)
-      }
-    }
-  }
-
-  # Close the PDF device after all plots are done
-  dev.off()
-
-  # Save HTML file with interactive plots if needed
-  out_html_file <- file.path(out_dir, paste0(filename, ".html"))
-  message("Saving combined HTML file: ", out_html_file)
-  htmltools::save_html(
-    htmltools::tagList(plotly_plots),
-    file = out_html_file
-  )
-}
-
-generate_scatter_plot <- function(plot, config) {
-
-  # Define the points
-  shape <- if (!is.null(config$shape)) config$shape else 3
-  size <- if (!is.null(config$size)) config$size else 1.5
-  position <-
-    if (!is.null(config$position) && config$position == "jitter") {
-      position_jitter(width = 0.4, height = 0.1)
-    } else {
-      "identity"
-    }
-
-  plot <- plot + geom_point(
-    shape = shape,
-    size = size,
-    position = position
-  )
-
-  # Add a cyan point for the reference data for correlation plots
-  if (!is.null(config$cyan_points) && config$cyan_points) {
-    plot <- plot + geom_point(
-      data = config$df_reference,
-      mapping = aes(x = .data[[config$x_var]], y = .data[[config$y_var]]),
-      color = "cyan",
-      shape = 3,
-      size = 0.5,
-      inherit.aes = FALSE
-    )
-  }
-  
  # Add linear regression line if specified
  if (!is.null(config$lm_line)) {
    # Extract necessary values
@@ -1570,7 +1564,7 @@ main <- function() {
        ) %>%
        filter(!is.na(L))
      
-      message("Calculating background strain summary statistics")
+      message("Calculating background summary statistics")
      ss_bg <- calculate_summary_stats(df_bg, c("L", "K", "r", "AUC", "delta_bg"), # formerly X_stats_BY
        group_vars = c("OrfRep", "Drug", "conc_num", "conc_num_factor_factor"))
      summary_stats_bg <- ss_bg$summary_stats
@@ -1621,16 +1615,16 @@ main <- function() {
        group_vars = c("OrfRep", "Gene", "num", "Drug", "conc_num", "conc_num_factor_factor")
        )$df_with_stats
      
-      message("Calculating reference strain interaction scores")
-      reference_results <- calculate_interaction_scores(df_reference_interaction_stats, df_bg_stats, "reference")
-      df_reference_interactions_joined <- reference_results$full_data
-      df_reference_interactions <- reference_results$interactions
-      write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
-      write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)
+      # message("Calculating reference strain interaction scores")
+      # reference_results <- calculate_interaction_scores(df_reference_interaction_stats, df_bg_stats, "reference")
+      # df_reference_interactions_joined <- reference_results$full_data
+      # df_reference_interactions <- reference_results$interactions
+      # write.csv(reference_results$calculations, file = file.path(out_dir, "zscore_calculations_reference.csv"), row.names = FALSE)
+      # write.csv(df_reference_interactions, file = file.path(out_dir, "zscore_interactions_reference.csv"), row.names = FALSE)

-      message("Generating reference interaction plots")
-      reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
-      generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)
+      # message("Generating reference interaction plots")
+      # reference_plot_configs <- generate_interaction_plot_configs(df_reference_summary_stats, df_reference_interactions_joined, "reference")
+      # generate_and_save_plots(out_dir, "interaction_plots_reference", reference_plot_configs, page_width = 16, page_height = 16)

      message("Setting missing deletion values to the highest theoretical value at each drug conc for L")
      df_deletion <- df_na_stats %>% # formerly X2