Before old rank plot removal

2024-09-07 19:31:52 -04:00
parent fcc6cdd2d4
commit 2407a752fc
1 changed files with 224 additions and 164 deletions
--- a/workflow/apps/r/calculate_interaction_zscores5.R
+++ b/workflow/apps/r/calculate_interaction_zscores5.R
@@ -9,6 +9,7 @@ suppressMessages({
 })
 options(warn = 2, max.print = 1000)
 options(width = 10000)
 # Set the memory limit to 30GB (30 * 1024 * 1024 * 1024 bytes)
 soft_limit <- 30 * 1024 * 1024 * 1024
@@ -156,7 +157,7 @@ update_gene_names <- function(df, sgd_gene_list) {
  return(df)
 }
-# Process strains (deletion and reference)
+# Process either deletion and or reference strain(s)
 process_strains <- function(df) {
  df_strains <- data.frame()  # Initialize an empty dataframe to store results
@@ -187,6 +188,9 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
  df <- df %>%
    mutate(across(all_of(variables), ~ifelse(. == 0, NA, .)))
  print("Head of df for summary stats calculations:")
  print(head(df))
  # Calculate summary statistics, including a single N based on L
  summary_stats <- df %>%
    group_by(across(all_of(group_vars))) %>%
@@ -195,13 +199,16 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
      across(all_of(variables), list(
        mean = ~mean(., na.rm = TRUE),
        median = ~median(., na.rm = TRUE),
-        max = ~ifelse(all(is.na(.)), NA, max(., na.rm = TRUE)),
+        max = ~max(., na.rm = TRUE),
-        min = ~ifelse(all(is.na(.)), NA, min(., na.rm = TRUE)),
+        min = ~min(., na.rm = TRUE),
-        sd = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE), 0),  # If N == 1, sd is set to 0
+        sd = ~sd(., na.rm = TRUE),
-        se = ~ifelse(sum(!is.na(.)) > 1, sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1), 0)  # If N == 1, se is set to 0
+        se = ~sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)) - 1) # TODO unsure why this is - 1
      ), .names = "{.fn}_{.col}")
    )
  print("Summary stats:")
  print(head(summary_stats))
  # Remove existing stats columns from df if they already exist
  stat_columns <- setdiff(names(summary_stats), group_vars)
  df_cleaned <- df %>% select(-any_of(stat_columns))
@@ -215,10 +222,10 @@ calculate_summary_stats <- function(df, variables, group_vars = c("conc_num", "c
 # Calculate interaction scores
 calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c("OrfRep", "Gene", "num")) {
-  if (nrow(df) == 0) {
+  # if (nrow(df) == 0) {
-    message("Dataframe is empty after filtering")
+  #   message("Dataframe is empty after filtering")
-    return(NULL)  # Or handle the empty dataframe case as needed
+  #   return(NULL)  # Or handle the empty dataframe case as needed
-  }
+  # }
  # Calculate total concentration variables
  total_conc_num <- length(unique(df$conc_num))
@@ -304,6 +311,9 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
    ) %>%
    ungroup()
  print("Interaction scores:")
  print(head(interaction_scores))
  # Calculate linear models and interaction scores per gene
  print("Calculating interaction scores part 2")
  interaction_scores_all <- interaction_scores %>%
@@ -329,6 +339,107 @@ calculate_interaction_scores <- function(df, max_conc, variables, group_vars = c
 }
 generate_plot <- function(df, x_var, y_var = NULL, plot_type = "scatter", color_var = "conc_num",
                          title = "", x_label = NULL, y_label = NULL, ylim_vals = NULL) {
  # Initialize plot with dynamic x and color variables
  plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var))))
  if (!is.null(y_var)) {
    plot <- plot + aes(y = !!sym(y_var))
    # Add scatter points with summary stats if `y_var` is present
    y_mean_col <- paste0("mean_", y_var)
    y_sd_col <- paste0("sd_", y_var)
    plot <- plot + geom_point(shape = 3, size = 0.2) +
      geom_errorbar(aes(ymin = !!sym(y_mean_col) - !!sym(y_sd_col), 
                        ymax = !!sym(y_mean_col) + !!sym(y_sd_col)), width = 0.1) +
      geom_point(aes(y = !!sym(y_mean_col)), size = 0.6)
  }
  # Add the required plot type (scatter, box, density, bar)
  plot <- switch(plot_type,
    "box" = plot + geom_boxplot(),
    "density" = plot + geom_density(),
    "bar" = plot + geom_bar(stat = "identity"),
    plot + geom_point() + geom_smooth(method = "lm", se = FALSE)  # Default is scatter
  )
  # Apply y-axis limits if specified
  if (!is.null(ylim_vals)) {
    plot <- plot + coord_cartesian(ylim = ylim_vals)
  }
  # Add title and axis labels
  plot <- plot + ggtitle(title) + theme_publication()
  if (!is.null(x_label)) plot <- plot + xlab(x_label)
  if (!is.null(y_label)) plot <- plot + ylab(y_label)
  return(plot)
 }
 generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) {
  plots <- list()
  if (nrow(df) == 0) {
    message("The dataframe is empty, skipping plots")
    return()
  }
  message("Generating plots for dataframe")
  for (config in plot_configs) {
    # Generate the plot using the configurations
    plot <- generate_plot(
      df = df,
      x_var = config$x_var,
      y_var = config$y_var,
      plot_type = config$plot_type,
      title = config$title,
      ylim_vals = config$ylim_vals
    )
    # Add custom annotations if provided
    if (!is.null(config$annotations)) {
      for (annotation in config$annotations) {
        plot <- plot + annotate("text", x = annotation$x, y = annotation$y, label = annotation$label)
      }
    }
    # Store the plot with variable name and plot type
    plots[[paste0(config$y_var, "_", config$plot_type)]] <- plot
  }
  # Save plots to PDF and HTML
  save_plots(file_prefix, plots, output_dir)
 }
 # Ensure all plots are saved and printed to PDF
 save_plots <- function(file_name, plot_list, output_dir) {
  # Save all plots to a single PDF
  pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
  lapply(plot_list, print)
  dev.off()
  # Save each plot as an interactive HTML file
  lapply(names(plot_list), function(plot_name) {
    pgg <- tryCatch({
      suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
        layout(legend = list(orientation = "h")))
    }, error = function(e) {
      message("Error generating plot: ", plot_name)
      return(NULL)
    })
    if (!is.null(pgg)) {
      saveWidget(pgg, file = file.path(output_dir, paste0(file_name, "_", plot_name, ".html")), selfcontained = TRUE)
    }
  })
 }
 interaction_plot_configs <- function(df, variable) {
@@ -361,102 +472,6 @@ interaction_plot_configs <- function(df, variable) {
  )
 }
 generate_plot <- function(df, x_var, y_var = NULL, plot_type, color_var = "conc_num",
  title, x_label = NULL, y_label = NULL, ylim_vals = NULL) {
  # Use tidy evaluation with aes() and !!sym() for dynamic column names
  plot <- ggplot(df, aes(x = !!sym(x_var), color = as.factor(!!sym(color_var))))
  if (!is.null(y_var)) {
    plot <- plot + aes(y = !!sym(y_var))
  }
  # Set up the plot based on the requested plot type
  plot <- switch(plot_type,
    "scatter" = plot + geom_point() + geom_smooth(method = "lm", se = FALSE),
    "box" = plot + geom_boxplot(),
    "density" = plot + geom_density(),
    "bar" = plot + geom_bar(),
    plot  # Default: return the plot as is
  )
  if (!is.null(ylim_vals)) {
    plot <- plot + coord_cartesian(ylim = ylim_vals)
  }
  # Add titles and labels if available
  plot <- plot + ggtitle(title) + theme_publication()
  if (!is.null(x_label)) plot <- plot + xlab(x_label)
  if (!is.null(y_label)) plot <- plot + ylab(y_label)
  return(plot)
 }
 generate_and_save_plots <- function(df, output_dir, file_prefix, plot_configs) {
  plots <- list()
  if (nrow(df) == 0) {
    message("The \"", deparse(substitute(df)), "\" dataframe is empty, skipping plots")
    return()
  }
  message("Generating plots for \"", deparse(substitute(df)), "\" dataframe")
  for (config in plot_configs) {
    plot <- generate_plot(
      df = df,
      x_var = config$x_var,
      y_var = config$y_var,
      plot_type = config$plot_type,
      title = config$title,
      ylim_vals = config$ylim_vals
    )
    # If custom annotations for interaction plots are required
    if (!is.null(config$annotations)) {
      for (annotation in config$annotations) {
        plot <- plot +
          annotate(
            "text", x = annotation$x, y = annotation$y, label = annotation$label
          )
      }
    }
    # Store the plot with the variable name and plot type
    plots[[paste0(config$y_var, "_", config$plot_type)]] <- plot
  }
  save_plots(file_prefix, plots, output_dir)
 }
 # Ensure all plots are saved and printed to PDF
 save_plots <- function(file_name, plot_list, output_dir) {
  # Save to PDF
  pdf(file.path(output_dir, paste0(file_name, ".pdf")), width = 14, height = 9)
  lapply(plot_list, function(plot) {
    print(plot)
  })
  dev.off()
  # Save to HTML with horizontal legend orientation
  lapply(names(plot_list), function(plot_name) {
    message("Generating plot: ", plot_name)
    pgg <- tryCatch({
      suppressWarnings(ggplotly(plot_list[[plot_name]]) %>%
        layout(legend = list(orientation = "h")))
    }, error = function(e) {
      message("Error in plot: ", plot_name, "\n", e)
      return(NULL)
    })
    if (!is.null(pgg)) {
      saveWidget(pgg,
        file = file.path(output_dir,
        paste0(file_name, "_", plot_name, ".html")),
        selfcontained = TRUE)
    }
  })
 }
 generate_interaction_plots <- function(df, output_file) {
  message("Generating interaction plots")
@@ -503,21 +518,27 @@ generate_interaction_plots <- function(df, output_file) {
 generate_cpp_correlation_plots <- function(df_na_rm, lm_list, output_dir) {
  lm_summaries <- lapply(lm_list, summary)
  plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC",
                   "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC")
-  plot_list <- lapply(seq_along(lm_list), function(i) {
+  # Define plot titles and annotation based on R-squared values from the lm_list
-    ggplot(df_na_rm, aes_string(x = names(lm_list)[i][1], y = names(lm_list)[i][2])) +
+  plot_configs <- lapply(seq_along(lm_list), function(i) {
-      geom_point(shape = 3, color = "gray70") +
+    r_squared <- round(lm_summaries[[i]]$r.squared, 3)
-      geom_smooth(method = "lm", color = "tomato3") +
+    plot_titles <- c("Interaction L vs. Interaction K", "Interaction L vs. Interaction r", "Interaction L vs. Interaction AUC",
-      ggtitle(plot_titles[i]) +
+                     "Interaction K vs. Interaction r", "Interaction K vs. Interaction AUC", "Interaction r vs. Interaction AUC")
-      annotate("text", x = 0, y = 0, label = paste("R-squared = ", round(lm_summaries[[i]]$r.squared, 3))) +
+    
-      theme_Publication_legend_right()
+    list(
      x_var = names(lm_list)[i][1],
      y_var = names(lm_list)[i][2],
      plot_type = "scatter",
      title = plot_titles[i],
      annotations = list(list(x = 0, y = 0, label = paste("R-squared =", r_squared)))
    )
  })
-  save_plots("Correlation_CPPs", plot_list, output_dir)
+  # Generate and save the plots using the new system
  generate_and_save_plots(df_na_rm, output_dir, "Correlation_CPPs", plot_configs)
 }
 # Adjust missing values and calculate ranks
 adjust_missing_and_rank <- function(df, variables) {
@@ -570,39 +591,38 @@ generate_plots <- function(df, x_var, y_vars, plot_type, color_var = "conc_num",
 }
 # Function to generate rank plots for the provided dataframe
-create_ranked_plots <- function(df, output_dir) {
+# create_ranked_plots <- function(df, output_dir) {
-  # List of variables for which we need to generate rank plots
+#   # List of variables for which we need to generate rank plots
-  variables <- c("L", "K", "r", "AUC")
+#   variables <- c("L", "K", "r", "AUC")
-  # Adjust missing values and calculate ranks
+#   # Adjust missing values and calculate ranks
-  df_adjusted <- adjust_missing_and_rank(df, variables)
+#   df_adjusted <- adjust_missing_and_rank(df, variables)
-  # Generate rank plots for Avg_Zscore and Z_lm
+#   # Generate rank plots
-  for (var in variables) {
+#   for (var in variables) {
-    plot_rank_avg <- generate_plot(
+#     plot_rank_avg <- generate_plot(
-      df = df_adjusted,
+#       df = df_adjusted,
-      x_var = paste0(var, "_Rank"),
+#       x_var = paste0(var, "_Rank"),
-      y_var = paste0("Avg_Zscore_", var),
+#       y_var = paste0("Avg_Zscore_", var),
-      plot_type = "scatter",
+#       plot_type = "scatter",
-      title = paste("Average Z score vs Rank for", var)
+#       title = paste("Average Z score vs Rank for", var)
-    )
+#     )
-    plot_rank_lm <- generate_plot(
+#     plot_rank_lm <- generate_plot(
-      df = df_adjusted,
+#       df = df_adjusted,
-      x_var = paste0(var, "_Rank_lm"),
+#       x_var = paste0(var, "_Rank_lm"),
-      y_var = paste0("Z_lm_", var),
+#       y_var = paste0("Z_lm_", var),
-      plot_type = "scatter",
+#       plot_type = "scatter",
-      title = paste("Interaction Z score vs Rank for", var)
+#       title = paste("Interaction Z score vs Rank for", var)
-    )
+#     )
-    # Save the plots for Avg_Zscore and Z_lm
+#     save_plots(paste0("RankPlots_", var), list(
-    save_plots(paste0("RankPlots_", var), list(
+#       plot_rank_avg = plot_rank_avg,
-      plot_rank_avg = plot_rank_avg,
+#       plot_rank_lm = plot_rank_lm
-      plot_rank_lm = plot_rank_lm
+#     ), output_dir)
-    ), output_dir)
+#   }
-  }
+# }
 }
 main <- function() {
  lapply(names(args$experiments), function(exp_name) {
@@ -622,8 +642,6 @@ main <- function() {
    # QC steps and filtering
    df_above_tolerance <- df %>% filter(DB == 1)
    df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
    df_no_zeros <- df_na %>% filter(L > 0)
    # Calculate the half-medians for `L` and `K` for rows above tolerance
    L_half_median <- (median(df_above_tolerance$L, na.rm = TRUE)) / 2
@@ -632,7 +650,24 @@ main <- function() {
    # Get the number of rows that are above tolerance
    rows_to_remove <- nrow(df_above_tolerance)
-    # Additional filtering for non-finite values in df_na
+    # Set L, r, K, and AUC to NA for rows that are above tolerance
    df_na <- df %>% mutate(across(c(L, r, AUC, K), ~ ifelse(DB == 1, NA, .)))
    # Calculate summary statistics for all strains, including both background and the deletions
    message("Calculating summary statistics for all strains")
    variables <- c("L", "K", "r", "AUC")
    ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
    summary_stats <- ss$summary_stats
    df_na_stats <- ss$df_with_stats
    write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
    print("Summary stats:")
    print(head(summary_stats), width = 200)
    # Remove rows with 0 values in L
    df_no_zeros <- df_na %>% filter(L > 0)
    # Additional filtering for non-finite values
    df_na_filtered <- df_na %>%
      filter(if_any(c(L), ~ !is.finite(.))) %>%
      {
@@ -647,7 +682,7 @@ main <- function() {
    qc_plot_configs <- list(
      list(x_var = "L", y_var = "K", plot_type = "scatter", title = "Raw L vs K before QC", ylim_vals = NULL),
      list(x_var = "delta_bg", y_var = NULL, plot_type = "density", title = "Density plot for Delta Background", ylim_vals = NULL),
-      list(x_var = "delta_bg", y_var = NULL, plot_type = "bar", title = "Bar plot for Delta Background", ylim_vals = NULL)
+      list(x_var = "as.factor(delta_bg)", y_var = NULL, plot_type = "bar", title = "Bar plot for Delta Background", ylim_vals = NULL)
    )
    above_tolerance_plot_configs <- list(
@@ -659,7 +694,16 @@ main <- function() {
    )
    no_zeros_plot_configs <- list(
-      list(x_var = "L", y_var = "K", plot_type = "scatter", title = "L vs K for Non-Zero L Values", ylim_vals = NULL)
+      list(x_var = "scan", y_var = "L", plot_type = "scatter", title = "L vs Scan for Non-Zero L Values"),
      list(x_var = "scan", y_var = "K", plot_type = "scatter", title = "K vs Scan for Non-Zero L Values"),
      list(x_var = "scan", y_var = "r", plot_type = "scatter", title = "r vs Scan for Non-Zero L Values"),
      list(x_var = "scan", y_var = "AUC", plot_type = "scatter", title = "AUC vs Scan for Non-Zero L Values"),
      list(x_var = "scan", y_var = "delta_bg", plot_type = "scatter", title = "Delta background vs Scan for Non-Zero L Values"),
      list(x_var = "as.factor(scan)", y_var = "L", plot_type = "box", title = "L vs Scan for Non-Zero L Values"),
      list(x_var = "as.factor(scan)", y_var = "K", plot_type = "box", title = "K vs Scan for Non-Zero L Values"),
      list(x_var = "as.factor(scan)", y_var = "r", plot_type = "box", title = "r vs Scan for Non-Zero L Values"),
      list(x_var = "as.factor(scan)", y_var = "AUC", plot_type = "box", title = "AUC vs Scan for Non-Zero L Values"),
      list(x_var = "as.factor(scan)", y_var = "delta_bg", plot_type = "box", title = "Delta background vs Scan for Non-Zero L Values")
    )
    # Generate and save plots for each QC step
@@ -672,17 +716,6 @@ main <- function() {
    # Clean up
    rm(df, df_above_tolerance, df_no_zeros)
    # Calculate summary statistics
    message("Calculating summary statistics for all strains")
    variables <- c("L", "K", "r", "AUC")
    ss <- calculate_summary_stats(df_na, variables, group_vars = c("OrfRep", "conc_num", "conc_num_factor"))
    summary_stats <- ss$summary_stats
    df_na_stats <- ss$df_with_stats
    write.csv(summary_stats, file = file.path(out_dir, "SummaryStats_ALLSTRAINS.csv"), row.names = FALSE)
    print("Summary stats:")
    print(head(summary_stats), width = 200)
    # TODO: Originally this filtered L NA's
    # Let's try to avoid for now since stats have already been calculated
@@ -695,6 +728,7 @@ main <- function() {
    # Summary statistics for within and outside 2SD of K
    message("Calculating summary statistics for L within 2SD of K")
    # TODO We're omitting the original z_max calculation, not sure if needed?
    ss <- calculate_summary_stats(df_na_within_2sd_k, "L", group_vars = c("conc_num", "conc_num_factor"))
    l_within_2sd_k_stats <- ss$summary_stats
    df_na_l_within_2sd_k_stats <- ss$df_with_stats
@@ -844,8 +878,7 @@ main <- function() {
      zscores_interactions_filtered <- zscores_interactions %>%
        filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L))
-      # Generate summary and correlation plots
+
      generate_summary_plots(df, out_dir)
      lm_list <- list(
        lm(Z_lm_K ~ Z_lm_L, data = zscores_interactions_filtered),
        lm(Z_lm_r ~ Z_lm_L, data = zscores_interactions_filtered),
@@ -856,10 +889,37 @@ main <- function() {
      )
      # Generate cpp correlation plots
-      generate_cpp_correlation_plots(zscores_interactions_filtered, lm_list, out_dir)
+      correlation_plot_config <- list(
        list(x_var = "Z_lm_L", y_var = "Z_lm_K", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_K",
          annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ),
        list(x_var = "Z_lm_L", y_var = "Z_lm_r", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_r",
          annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ),
        list(x_var = "Z_lm_L", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_L and Z_lm_AUC",
          annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ),
        list(x_var = "Z_lm_K", y_var = "Z_lm_r", plot_type = "scatter", title = "Correlation between Z_lm_K and Z_lm_r",
          annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ),
        list(x_var = "Z_lm_K", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_K and Z_lm_AUC",
          annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), ),
        list(x_var = "Z_lm_r", y_var = "Z_lm_AUC", plot_type = "scatter", title = "Correlation between Z_lm_r and Z_lm_AUC",
          annotations = list(list(x = 0, y = 0, label = paste("R-squared =", round(lm_summaries[[1]]$r.squared, 3)))), )
      )
      generate_and_save_plots(zscores_interactions_filtered, output_dir, "CorrelationPlots", correlation_plot_config)
      # Generate cpp correlation plots
      #generate_cpp_correlation_plots(zscores_interactions_filtered, lm_list, out_dir)
      # Generate ranked plots
-      create_ranked_plots(zscores_interactions_filtered, out_dir)
+      rank_plot_config <- list(
        list(x_var = "L_Rank", y_var = "Avg_Zscore_L", plot_type = "scatter", title = "Rank vs Avg Z score for L"),
        list(x_var = "K_Rank", y_var = "Avg_Zscore_K", plot_type = "scatter", title = "Rank vs Avg Z score for K"),
        list(x_var = "r_Rank", y_var = "Avg_Zscore_r", plot_type = "scatter", title = "Rank vs Avg Z score for r"),
        list(x_var = "AUC_Rank", y_var = "Avg_Zscore_AUC", plot_type = "scatter", title = "Rank vs Avg Z score for AUC")
      )
      # Generate and save rank plots using the existing plotting framework
      generate_and_save_plots(zscores_interactions_filtered, output_dir, "RankPlots", rank_plot_config)
    })
  })
 }