Refine df filtering in calculate_interaction_zscores.R

This commit is contained in:
2024-08-24 00:12:56 -04:00
parent dc5dea8bfc
commit 3cb6e89360

View File

@@ -113,33 +113,37 @@ load_and_preprocess_data <- function(easy_results_file, genes) {
stop("Error reading file: ", easy_results_file, "\n", e$message) stop("Error reading file: ", easy_results_file, "\n", e$message)
}) })
# Convert specific columns to numeric
numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd") numeric_columns <- c("Col", "Row", "l", "K", "r", "Scan", "AUC96", "LstBackgrd", "X1stBackgrd")
df[numeric_columns[numeric_columns %in% colnames(df)]] <- df[numeric_columns[numeric_columns %in% colnames(df)]] <-
suppressWarnings(lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric)) suppressWarnings(lapply(df[numeric_columns[numeric_columns %in% colnames(df)]], as.numeric))
# Apply initial filters and mutating steps as in the original script
df <- df %>% df <- df %>%
filter(!.[[1]] %in% c("", "Scan")) %>% mutate(L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}) %>%
filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>% mutate(AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}) %>%
filter(Drug != "BMH21") %>% filter(!.[[1]] %in% c("", "Scan")) %>% # Filter out empty or Scan rows
mutate( # {if ("Conc" %in% colnames(.)) filter(., Conc != "0ug/mL") else .} %>%
L = if ("l" %in% colnames(.)) l else {warning("Missing column: l"); NA}, filter(Gene != "BLANK" & Gene != "Blank" & ORF != "Blank" & Gene != "blank") %>% # Remove blank genes
AUC = if ("AUC96" %in% colnames(.)) AUC96 else {warning("Missing column: AUC96"); NA}, filter(Drug != "BMH21") %>% # Filter out specific drugs if necessary
conc_num = if ("Conc" %in% colnames(.)) as.numeric(gsub("[^0-9\\.]", "", Conc)) else NA, filter(!is.na(ORF) & ORF != "") %>% # Ensure ORF is not NA or empty
delta_bg = if (all(c("X1stBackgrd", "LstBackgrd") %in% colnames(.))) mutate(OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep)) %>%
LstBackgrd - X1stBackgrd else {warning("Missing columns for delta_bg calculation"); NA}, mutate(conc_num = as.numeric(gsub("[^0-9\\.]", "", Conc))) %>% # Extract numeric drug concentrations
GeneName = vapply(ORF, function(orf) { filter(!is.na(LstBackgrd) & !is.na(X1stBackgrd)) %>% # Ensure finite background values
mutate(delta_bg = LstBackgrd - X1stBackgrd) %>% # Calculate delta background
filter(!is.na(L) & !is.na(K) & !is.na(r) & !is.na(AUC96)) %>% # Ensure finite measurements
mutate(GeneName = vapply(ORF, function(orf) {
gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName) gene_name <- genes %>% filter(ORF == orf) %>% pull(GeneName)
ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name) ifelse(is.null(gene_name) || gene_name == "" || length(gene_name) == 0, orf, gene_name)
}, character(1)), }, character(1)))
OrfRep = ifelse(ORF == "YDL227C", "YDL227C", OrfRep)
) # %>%
# mutate(across(everything(), ~na_if(., "")))
if (nrow(df) == 0) warning("Dataframe is empty after filtering") if (nrow(df) == 0) warning("Dataframe is empty after filtering")
return(df) return(df)
} }
create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") { create_and_publish_plot <- function(df, var, plot_type, out_dir_qc, suffix = "") {
if (!("Scan" %in% colnames(df))) { if (!("Scan" %in% colnames(df))) {
warning("'Scan' column is not present in the data. Skipping this plot.") warning("'Scan' column is not present in the data. Skipping this plot.")
@@ -182,7 +186,6 @@ publish_summary_stats <- function(df, variables, out_dir) {
fwrite(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE) fwrite(summary_stats_df, file.path(out_dir, "summary_stats_all_strains.csv"), row.names = FALSE)
} }
# Interaction scores publication
publish_interaction_scores <- function(df, out_dir) { publish_interaction_scores <- function(df, out_dir) {
interaction_scores <- df %>% interaction_scores <- df %>%
dplyr::group_by(OrfRep) %>% dplyr::group_by(OrfRep) %>%
@@ -239,6 +242,8 @@ deletion_suppressors_K <- interaction_scores[interaction_scores$mean_K >= 2, ]
return(interaction_scores) # Return the updated data frame with rank columns return(interaction_scores) # Return the updated data frame with rank columns
} }
publish_zscores <- function(df, out_dir) { publish_zscores <- function(df, out_dir) {
zscores <- df %>% zscores <- df %>%
dplyr::mutate( dplyr::mutate(
@@ -282,6 +287,8 @@ generate_and_publish_qc <- function(df, delta_bg_tolerance, out_dir_qc) {
}) })
} }
# Create rank plots # Create rank plots
create_rank_plots <- function(interaction_scores, out_dir) { create_rank_plots <- function(interaction_scores, out_dir) {
rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank") rank_vars <- c("l_rank", "k_rank", "r_rank", "auc_rank")
@@ -305,7 +312,6 @@ create_rank_plots <- function(interaction_scores, out_dir) {
}) })
} }
create_correlation_plot <- function(interaction_scores, out_dir) { create_correlation_plot <- function(interaction_scores, out_dir) {
# Check for non-finite values and remove them from the dataset # Check for non-finite values and remove them from the dataset
interaction_scores <- interaction_scores %>% interaction_scores <- interaction_scores %>%
@@ -348,6 +354,7 @@ process_experiment <- function(exp_name, exp_dir, genes, output_dir) {
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(out_dir_qc, showWarnings = FALSE) dir.create(out_dir_qc, showWarnings = FALSE)
# Load and preprocess the data
data <- load_and_preprocess_data(args$easy_results_file, genes) data <- load_and_preprocess_data(args$easy_results_file, genes)
# Calculate delta background tolerance # Calculate delta background tolerance