Rollup before configuration integration
This commit is contained in:
@@ -28,8 +28,6 @@ sgd_gene_list <- file.path(args[4])
|
||||
input_file <- file.path(args[5])
|
||||
out_dir <- file.path(args[6])
|
||||
|
||||
sprintf("The Standard Deviation value is: %d", delta_bg_factor)
|
||||
|
||||
out_dir_qc <- file.path(out_dir, "qc")
|
||||
|
||||
if (!dir.exists(out_dir)) {
|
||||
@@ -40,18 +38,6 @@ if (!dir.exists(out_dir_qc)) {
|
||||
dir.create(out_dir_qc)
|
||||
}
|
||||
|
||||
options(width = 1000)
|
||||
ls.str()
|
||||
|
||||
# Write delBGFactor to the StudyInfo file
|
||||
# TODO we probably shouldn't be doing this, need one source of truth
|
||||
# TODO disabling this for now
|
||||
# labels <- read.csv(file = study_info_file, stringsAsFactors = FALSE) # sep = ","
|
||||
# labels[exp_number, 3] <- delta_bg_factor
|
||||
# write.csv(Labels, file = study_info_file, row.names = FALSE)
|
||||
|
||||
# Begin User Data Selection Section
|
||||
|
||||
# Read in the data
|
||||
df <- read.delim(input_file, skip = 2, as.is = TRUE, row.names = 1, strip.white = TRUE)
|
||||
df <- df[!(df[[1]] %in% c("", "Scan")), ]
|
||||
@@ -61,9 +47,7 @@ df <- df[!(df[[1]] %in% c("", "Scan")), ]
|
||||
# df_end <- length(df[1, ]) - 2
|
||||
# df <- df[, c(1:42, df_end:df_length)]
|
||||
|
||||
# print(names(df))
|
||||
|
||||
# Use numeric data to perform operations
|
||||
# Use numbers
|
||||
df$col <- as.numeric(df$Col)
|
||||
df$row <- as.numeric(df$Row)
|
||||
df$l <- as.numeric(df$l)
|
||||
@@ -74,13 +58,17 @@ df$AUC <- as.numeric(df$AUC)
|
||||
df$last_bg <- as.numeric(df$LstBackgrd)
|
||||
df$first_bg <- as.numeric(df$X1stBackgrd)
|
||||
|
||||
# print(df)
|
||||
# Set delta background tolerance to delta_bg_factor standard deviations above the mean delta background
|
||||
df$delta_bg <- df$last_bg - df$first_bg
|
||||
delta_bg_tolerance <- mean(df$delta_bg) + (delta_bg_factor * sd(df$delta_bg))
|
||||
# delta_bg_tolerance <- mean(df$delta_bg)+(3*sd(df$delta_bg))
|
||||
sprintf("The delta_bg_factor is: %d", delta_bg_factor)
|
||||
sprintf("The delta_bg_tolerance is %f", delta_bg_tolerance)
|
||||
|
||||
# Sometimes the non-varying drug is in the 'Drug' col vs the 'Modifier1' col
|
||||
# as was the case in the Gemcitabine and Cytarabine experiments.
|
||||
# The following allows user to rename columns so as to get the appropriate
|
||||
# data where it needs to be for the script to run properly.
|
||||
|
||||
# colnames(df)[7] <- "Modifier1"
|
||||
# colnames(df)[8] <- "Conc1"
|
||||
# colnames(df)[10] <- "Drug"
|
||||
@@ -95,9 +83,9 @@ df[df$ORF == "YDL227C", ]$OrfRep <- "YDL227C"
|
||||
# df <- df[df$Conc1 != "0ug/ml", ]
|
||||
df <- df[df$Drug != "BMH21", ] # this removes data concerning BMH21 for this experiment
|
||||
|
||||
# Mert placed the "bad_spot" text in the ORF col. for particular spots in the RF1 and RF2 plates.
|
||||
# This code removes those spots from the data set used for the interaction analysis.
|
||||
# Dr.Hartman feels that these donot effect Zscores significantly and so "non-curated" files were used.
|
||||
# Mert placed the "bad_spot" text in the ORF col. for particular spots in the RF1 and RF2 plates
|
||||
# This code removes those spots from the data set used for the interaction analysis
|
||||
# Dr. Hartman feels that these do not affect Zscores significantly and so "non-curated" files were used
|
||||
# try(df <- df[df$ORF != "bad_spot", ])
|
||||
|
||||
# Get total number of drug concentrations
|
||||
@@ -127,10 +115,8 @@ df <- df[df$ORF != "Blank", ]
|
||||
df <- df[df$Gene != "blank", ]
|
||||
# df <- df[df$Gene != "HO", ]
|
||||
|
||||
# Use sgd_gene_list to update orfs and replace empty geneName cells with ORF name (adapted from Sean's Merge script).
|
||||
# Use sgd_gene_list to update orfs and replace empty geneName cells with ORF name
|
||||
# This is to 'fix' the naming for everything that follows (REMc, Heatmaps, etc.) rather than doing it piecemeal later
|
||||
# Sean's Match Script (which was adapted here) was fixed 2022_0608 so as not to overwrite the RF1&RF2 geneNames
|
||||
# in the z_lm_l, K, r&AUC output values. Values correlated well but were off by a multiplier factor.
|
||||
genes <- data.frame(read.delim(
|
||||
file = sgd_gene_list, quote = "", header = FALSE, colClasses = c(rep("NULL", 3), rep("character", 2), rep("NULL", 11))))
|
||||
for (i in 1:length(df[, 14])) {
|
||||
@@ -149,12 +135,10 @@ for (i in 1:length(df[, 14])) {
|
||||
# jlh confirmed to leave dAmps in so comment out this section
|
||||
# DAmPs_list <- "../Code/22_0602_Remy_DAmPsList.txt"
|
||||
# Damps <- read.delim(DAmPs_list, header = F)
|
||||
|
||||
# df <- df[!(df$ORF %in% Damps$V1), ] # fix this to Damps[, 1]
|
||||
# dfafterDampsRM = df # backup for debugging
|
||||
|
||||
# Begin Graphics Boiler Plate Section
|
||||
# theme elements for plots
|
||||
# Theme elements for plots
|
||||
theme_publication <- function(base_size = 14, base_family = "sans") {
|
||||
library(grid)
|
||||
library(ggthemes)
|
||||
@@ -235,15 +219,11 @@ scale_colour_publication <- function(...) {
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
# Begin QC Section
|
||||
# Part 2 - Quality control
|
||||
# Quality control
|
||||
# Print quality control graphs for each dataset before removing contaminated data
|
||||
# and before adjusting missing data to max theoretical values
|
||||
|
||||
# Plate analysis plot
|
||||
# Plate analysis is a quality check to identify plate effects containing anomalies
|
||||
|
||||
# Plate analysis plots to identify plate anomalies
|
||||
plate_analysis_l <-
|
||||
ggplot(df, aes(Scan, l, color = as.factor(conc_num))) +
|
||||
geom_point(shape = 3, size = 0.2) +
|
||||
@@ -316,11 +296,8 @@ plate_analysis_auc_box <-
|
||||
ggtitle("Plate analysis by Drug Conc for AUC before quality control") +
|
||||
theme_publication()
|
||||
|
||||
# Quality control - values with a high delta background likely have heavy contamination
|
||||
# Check the frequency of these values
|
||||
# Report the L and k values of these spots
|
||||
# Report the number to be removed based on the delta_background_tolerance
|
||||
df$delta_bg <- df$last_bg - df$first_bg
|
||||
# Report L and K values with a high delta background (likely contaminated)
|
||||
# Report the number to be removed based on the delta_bg_tolerance
|
||||
|
||||
# Raw l vs k before QC
|
||||
raw_l_vs_k_before_qc <-
|
||||
@@ -338,11 +315,6 @@ pgg <- ggplotly(raw_l_vs_k_before_qc)
|
||||
plotly_path <- file.path(out_dir_qc, "raw_l_vs_k_before_qc.html")
|
||||
saveWidget(pgg, file = plotly_path, selfcontained = TRUE)
|
||||
|
||||
# Set delta background tolerance based on 3 sds from the mean delta background
|
||||
delta_background_tolerance <- mean(df$delta_bg) + (delta_bg_factor * sd(df$delta_bg))
|
||||
# delta_background_tolerance <- mean(df$delta_bg)+(3*sd(df$delta_bg))
|
||||
sprintf("delta_background_tolerance is %f", delta_background_tolerance)
|
||||
|
||||
plate_analysis_delta_bg <-
|
||||
ggplot(df, aes(Scan, delta_bg, color = as.factor(conc_num))) +
|
||||
geom_point(shape = 3, size = 0.2, position = "jitter") +
|
||||
@@ -362,7 +334,7 @@ plate_analysis_delta_bg_box <-
|
||||
theme_publication()
|
||||
|
||||
x_delta_bg_above_tolerance <-
|
||||
df[df$delta_bg >= delta_background_tolerance, ]
|
||||
df[df$delta_bg >= delta_bg_tolerance, ]
|
||||
x_delta_bg_above_tolerance_k_halfmedian <-
|
||||
(median(x_delta_bg_above_tolerance$K, na.rm = TRUE)) / 2
|
||||
x_delta_bg_above_tolerance_l_halfmedian <-
|
||||
@@ -373,7 +345,7 @@ x_delta_bg_above_tolerance_to_remove <-
|
||||
x_delta_bg_above_tolerance_l_vs_k <-
|
||||
ggplot(x_delta_bg_above_tolerance, aes(l, K, color = as.factor(conc_num))) +
|
||||
geom_point(aes(ORF = ORF, Gene = Gene, delta_bg = delta_bg), shape = 3) +
|
||||
ggtitle(paste("Raw L vs K for strains above delta background threshold of", delta_background_tolerance, "or above")) +
|
||||
ggtitle(paste("Raw L vs K for strains above delta background threshold of", delta_bg_tolerance, "or above")) +
|
||||
annotate("text",
|
||||
x = x_delta_bg_above_tolerance_l_halfmedian,
|
||||
y = x_delta_bg_above_tolerance_k_halfmedian,
|
||||
@@ -418,13 +390,13 @@ try(df[df$l == 0 & !is.na(df$l), ]$NG <- 1)
|
||||
# 1 for removed data, 0 non removed data
|
||||
# Use DB to identify number of genes removed due to the DeltaBackground Threshold rather than "Removed"
|
||||
df$DB <- 0
|
||||
try(df[df$delta_bg >= delta_background_tolerance, ]$DB <- 1)
|
||||
try(df[df$delta_bg >= delta_bg_tolerance, ]$DB <- 1)
|
||||
|
||||
# Replace the CPPs for l, r, AUC and k (must be last!) for removed data
|
||||
try(df[df$delta_bg >= delta_background_tolerance, ]$l <- NA)
|
||||
try(df[df$delta_bg >= delta_background_tolerance, ]$r <- NA)
|
||||
try(df[df$delta_bg >= delta_background_tolerance, ]$AUC <- NA)
|
||||
try(df[df$delta_bg >= delta_background_tolerance, ]$K <- NA)
|
||||
# Replace the CPPs for l, r, AUC and K (must be last!) for removed data
|
||||
try(df[df$delta_bg >= delta_bg_tolerance, ]$l <- NA)
|
||||
try(df[df$delta_bg >= delta_bg_tolerance, ]$r <- NA)
|
||||
try(df[df$delta_bg >= delta_bg_tolerance, ]$AUC <- NA)
|
||||
try(df[df$delta_bg >= delta_bg_tolerance, ]$K <- NA)
|
||||
|
||||
# QC Plots
|
||||
plate_analysis_l_after_qc <-
|
||||
|
||||
Reference in New Issue
Block a user