Refactor data filtering
This commit is contained in:
@@ -793,9 +793,12 @@ generate_rank_plot_configs <- function(df_filtered, is_lm = FALSE, adjust = FALS
|
|||||||
return(configs)
|
return(configs)
|
||||||
}
|
}
|
||||||
|
|
||||||
generate_correlation_plot_configs <- function(df, variables) {
|
generate_correlation_plot_configs <- function(df) {
|
||||||
|
|
||||||
configs <- list()
|
configs <- list()
|
||||||
|
|
||||||
|
variables <- c("r", "L", "K", "AUC")
|
||||||
|
|
||||||
for (variable in variables) {
|
for (variable in variables) {
|
||||||
z_lm_var <- paste0("Z_lm_", variable)
|
z_lm_var <- paste0("Z_lm_", variable)
|
||||||
avg_zscore_var <- paste0("Avg_Zscore_", variable)
|
avg_zscore_var <- paste0("Avg_Zscore_", variable)
|
||||||
@@ -821,47 +824,94 @@ generate_correlation_plot_configs <- function(df, variables) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE,
|
filter_data <- function(df, variables, nf = FALSE, missing = FALSE, adjust = FALSE,
|
||||||
limits_map = NULL, verbose = TRUE) {
|
rank = FALSE, limits_map = NULL, verbose = TRUE) {
|
||||||
|
|
||||||
for (variable in variables) {
|
# Precompute column names for efficiency
|
||||||
|
avg_zscore_cols <- paste0("Avg_Zscore_", variables)
|
||||||
|
z_lm_cols <- paste0("Z_lm_", variables)
|
||||||
|
|
||||||
avg_zscore_col <- paste0("Avg_Zscore_", var)
|
if (adjust) {
|
||||||
z_lm_col <- paste0("Z_lm_", var)
|
if (verbose) message("Replacing NA with 0.001 for Avg_Zscore_ and Z_lm_ columns")
|
||||||
rank_col <- paste0("Rank_", var)
|
df <- df %>%
|
||||||
rank_lm_col <- paste0("Rank_lm_", var)
|
mutate(
|
||||||
|
across(all_of(avg_zscore_cols), ~ replace_na(., 0.001)),
|
||||||
|
across(all_of(z_lm_cols), ~ replace_na(., 0.001))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
if (adjust) {
|
# Filter non-finite values
|
||||||
message("Replacing NA with 0.001 for interaction variables")
|
if (nf) {
|
||||||
df[[avg_zscore_col]] <- if_else(is.na(df[[avg_zscore_col]]), 0.001, df[[avg_zscore_col]])
|
non_finite_df <- df %>%
|
||||||
df[[z_lm_col]] <- if_else(is.na(df[[z_lm_col]]), 0.001, df[[z_lm_col]])
|
filter(across(all_of(variables), ~ !is.finite(.)))
|
||||||
|
|
||||||
|
if (verbose && nrow(non_finite_df) > 0) {
|
||||||
|
message("Non-finite rows for variables ", paste(variables, collapse = ", "), ":")
|
||||||
|
print(non_finite_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nf) {
|
# Keep only rows where all specified variables are finite
|
||||||
non_finite <- df %>% filter(!is.finite(.data[[variable]]))
|
df <- df %>%
|
||||||
if (verbose && nrow(non_finite) > 0) {
|
filter(across(all_of(variables), ~ is.finite(.)))
|
||||||
message("Non-finite rows for variable ", variable, ":")
|
}
|
||||||
print(non_finite)
|
|
||||||
}
|
# Filter missing malues
|
||||||
df <- df %>% filter(is.finite(.data[[variable]]))
|
if (missing) {
|
||||||
|
missing_df <- df %>%
|
||||||
|
filter(across(all_of(variables), ~ is.na(.)))
|
||||||
|
|
||||||
|
if (verbose && nrow(missing_df) > 0) {
|
||||||
|
message("Missing data for variables ", paste(variables, collapse = ", "), ":")
|
||||||
|
print(missing_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (missing) {
|
# Keep only rows where all specified variables are not missing
|
||||||
missing_data <- df %>% filter(is.na(.data[[variable]]))
|
df <- df %>%
|
||||||
if (verbose && nrow(missing_data) > 0) {
|
filter(across(all_of(variables), ~ !is.na(.)))
|
||||||
message("Missing data for variable ", variable, ":")
|
}
|
||||||
print(missing_data)
|
|
||||||
}
|
|
||||||
df <- df %>% filter(!is.na(.data[[variable]]))
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!is.null(limits_map) && !is.null(limits_map[[variable]])) {
|
# Filter data outside of y-limits (for plotting)
|
||||||
ylim_vals <- limits_map[[variable]]
|
if (!is.null(limits_map)) {
|
||||||
out_of_range_data <- df %>% filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
|
for (variable in names(limits_map)) {
|
||||||
if (verbose && nrow(out_of_range_data) > 0) {
|
if (variable %in% variables) {
|
||||||
message("Out-of-range data for variable ", variable, ":")
|
ylim_vals <- limits_map[[variable]]
|
||||||
print(out_of_range_data)
|
|
||||||
|
# Identify out-of-range data
|
||||||
|
out_of_range_df <- df %>%
|
||||||
|
filter(.data[[variable]] < ylim_vals[1] | .data[[variable]] > ylim_vals[2])
|
||||||
|
|
||||||
|
if (verbose && nrow(out_of_range_df) > 0) {
|
||||||
|
message("Out-of-range data for variable ", variable, ":")
|
||||||
|
print(out_of_range_df)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Keep only rows within the specified limits
|
||||||
|
df <- df %>%
|
||||||
|
filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2])
|
||||||
}
|
}
|
||||||
df <- df %>% filter(.data[[variable]] >= ylim_vals[1] & .data[[variable]] <= ylim_vals[2])
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rank) {
|
||||||
|
if (verbose) message("Calculating rank columns for variables: ", paste(variables, collapse = ", "))
|
||||||
|
|
||||||
|
# Create Rank and Rank_lm columns using mutate and across
|
||||||
|
df <- df %>%
|
||||||
|
mutate(
|
||||||
|
# Rank based on Avg_Zscore_
|
||||||
|
across(all_of(avg_zscore_cols), ~ rank(., na.last = "keep"), .names = "Rank_{.col}"),
|
||||||
|
# Rank_lm based on Z_lm_
|
||||||
|
across(all_of(z_lm_cols), ~ rank(., na.last = "keep"), .names = "Rank_lm_{.col}")
|
||||||
|
)
|
||||||
|
|
||||||
|
# Rename the newly created rank columns to match desired names
|
||||||
|
for (variable in variables) {
|
||||||
|
old_rank_col <- paste0("Rank_Avg_Zscore_", variable)
|
||||||
|
new_rank_col <- paste0("Rank_", variable)
|
||||||
|
df <- df %>% rename(!!new_rank_col := !!sym(old_rank_col))
|
||||||
|
|
||||||
|
old_rank_lm_col <- paste0("Rank_lm_Z_lm_", variable)
|
||||||
|
new_rank_lm_col <- paste0("Rank_lm_", variable)
|
||||||
|
df <- df %>% rename(!!new_rank_lm_col := !!sym(old_rank_lm_col))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1254,10 +1304,14 @@ main <- function() {
|
|||||||
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
|
file = file.path(out_dir, "ZScores_Interaction_Deletion_Suppressors_K_lm.csv"), row.names = FALSE)
|
||||||
|
|
||||||
message("Generating rank plots")
|
message("Generating rank plots")
|
||||||
zscores_interactions_joined_filtered <- filter_data(zscores_interactions_joined, variables, missing = TRUE, adjust = TRUE)
|
zscores_interactions_joined_filtered <- filter_data(
|
||||||
|
zscores_interactions_joined,
|
||||||
|
variables,
|
||||||
|
missing = TRUE,
|
||||||
|
adjust = TRUE,
|
||||||
|
rank = TRUE)
|
||||||
rank_plot_configs <- generate_rank_plot_configs(
|
rank_plot_configs <- generate_rank_plot_configs(
|
||||||
df = zscores_interactions_joined_filtered,
|
df = zscores_interactions_joined_filtered,
|
||||||
variables = interaction_vars,
|
|
||||||
is_lm = FALSE,
|
is_lm = FALSE,
|
||||||
adjust = TRUE
|
adjust = TRUE
|
||||||
)
|
)
|
||||||
@@ -1267,7 +1321,6 @@ main <- function() {
|
|||||||
message("Generating ranked linear model plots")
|
message("Generating ranked linear model plots")
|
||||||
rank_lm_plot_configs <- generate_rank_plot_configs(
|
rank_lm_plot_configs <- generate_rank_plot_configs(
|
||||||
df = zscores_interactions_joined_filtered,
|
df = zscores_interactions_joined_filtered,
|
||||||
variables = interaction_vars,
|
|
||||||
is_lm = TRUE,
|
is_lm = TRUE,
|
||||||
adjust = TRUE
|
adjust = TRUE
|
||||||
)
|
)
|
||||||
@@ -1276,7 +1329,7 @@ main <- function() {
|
|||||||
|
|
||||||
message("Filtering and reranking plots")
|
message("Filtering and reranking plots")
|
||||||
# Formerly X_NArm
|
# Formerly X_NArm
|
||||||
zscores_interactions_filtered <- zscores_interactions %>%
|
zscores_interactions_filtered <- zscores_interactions_joined %>%
|
||||||
group_by(across(all_of(orf_group_vars))) %>%
|
group_by(across(all_of(orf_group_vars))) %>%
|
||||||
filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
|
filter(!is.na(Z_lm_L) | !is.na(Avg_Zscore_L)) %>%
|
||||||
ungroup() %>%
|
ungroup() %>%
|
||||||
@@ -1299,13 +1352,24 @@ main <- function() {
|
|||||||
) %>%
|
) %>%
|
||||||
ungroup()
|
ungroup()
|
||||||
|
|
||||||
message("Generating filtered ranked plots")
|
# Re-rank
|
||||||
|
zscores_interactions_filtered <- filter_data(
|
||||||
|
df = zscores_interactions_filtered,
|
||||||
|
variables = interaction_vars,
|
||||||
|
missing = TRUE, # TODO what I'm currently having issues with
|
||||||
|
rank = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
rank_plot_filtered_configs <- generate_rank_plot_configs(
|
rank_plot_filtered_configs <- generate_rank_plot_configs(
|
||||||
df = zscores_interactions_filtered,
|
df = zscores_interactions_filtered,
|
||||||
is_lm = FALSE,
|
is_lm = FALSE,
|
||||||
adjust = FALSE
|
adjust = FALSE
|
||||||
)
|
)
|
||||||
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_na_rm",
|
|
||||||
|
message("Generating filtered ranked plots")
|
||||||
|
generate_and_save_plots(
|
||||||
|
output_dir = out_dir,
|
||||||
|
file_name = "RankPlots_na_rm",
|
||||||
plot_configs = rank_plot_filtered_configs,
|
plot_configs = rank_plot_filtered_configs,
|
||||||
grid_layout = list(ncol = 3, nrow = 2))
|
grid_layout = list(ncol = 3, nrow = 2))
|
||||||
|
|
||||||
@@ -1315,13 +1379,17 @@ main <- function() {
|
|||||||
is_lm = TRUE,
|
is_lm = TRUE,
|
||||||
adjust = FALSE
|
adjust = FALSE
|
||||||
)
|
)
|
||||||
generate_and_save_plots(output_dir = out_dir, file_name = "RankPlots_lm_na_rm",
|
generate_and_save_plots(
|
||||||
|
output_dir = out_dir,
|
||||||
|
file_name = "RankPlots_lm_na_rm",
|
||||||
plot_configs = rank_plot_lm_filtered_configs,
|
plot_configs = rank_plot_lm_filtered_configs,
|
||||||
grid_layout = list(ncol = 3, nrow = 2))
|
grid_layout = list(ncol = 3, nrow = 2))
|
||||||
|
|
||||||
message("Generating correlation plots")
|
message("Generating correlation plots")
|
||||||
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered, interaction_vars)
|
correlation_plot_configs <- generate_correlation_plot_configs(zscores_interactions_filtered)
|
||||||
generate_and_save_plots(output_dir = out_dir, file_name = "Avg_Zscore_vs_lm_NA_rm",
|
generate_and_save_plots(
|
||||||
|
output_dir = out_dir,
|
||||||
|
file_name = "Avg_Zscore_vs_lm_NA_rm",
|
||||||
plot_configs = correlation_plot_configs,
|
plot_configs = correlation_plot_configs,
|
||||||
grid_layout = list(ncol = 2, nrow = 2))
|
grid_layout = list(ncol = 2, nrow = 2))
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user