In this vignette, we demonstrate how to use the OSAT score (Yan et al. (2012)).
library(designit)
library(tidyverse)
if (!requireNamespace("OSAT")) {
print("This vignette can only be rendered if `OSAT` package is installed.")
knitr::knit_exit()
}
#> Loading required namespace: OSAT
Loading samples. We add two dummy columns to demonstrate how to choose batch columns of interest.
osat_data_path <- system.file("extdata", package = "OSAT")
samples <- read_tsv(file.path(osat_data_path, "samples.txt"),
col_types = cols(SampleType = col_factor(), Race = col_factor(), AgeGrp = col_factor())
) |>
mutate(dummy_var1 = rnorm(n()), dummy_var2 = str_c(SampleType, Race, sep = " "))
Here we use OSAT to optimize setup.
gs <- OSAT::setup.sample(samples, optimal = c("SampleType", "Race", "AgeGrp"))
gc <- OSAT::setup.container(OSAT::IlluminaBeadChip96Plate, 7, batch = "plates")
set.seed(1234)
bench::system_time(
g_setup <- OSAT::create.optimized.setup(sample = gs, container = gc, nSim = params$iterations)
)
#> Warning in OSAT::create.optimized.setup(sample = gs, container = gc, nSim =
#> params$iterations): Using default optimization method: optimal.shuffle
#> process real
#> 480ms 476ms
OSAT::QC(g_setup)
#>
#> Test independence between "plates" and sample variables
#>
#> Pearson's Chi-squared test
#> Var X-squared df p.value
#> 1 SampleType 0.1126719 6 0.9999714
#> 2 Race 0.3062402 6 0.9994663
#> 3 AgeGrp 1.8220606 24 1.0000000
#>
Saving starting point of optimization
set.seed(1234)
g_setup_start <- OSAT::create.optimized.setup(sample = gs, container = gc, nSim = 1) |>
OSAT::get.experiment.setup()
#> Warning in OSAT::create.optimized.setup(sample = gs, container = gc, nSim = 1):
#> Using default optimization method: optimal.shuffle
Visualize various batch factors. OSAT score is optimized only for
plates
in this case.
OSAT::get.experiment.setup(g_setup) |>
select(AgeGrp, plates, chipRows, chipColumns, chips, rows, columns, wells) |>
pivot_longer(-AgeGrp) |>
count(AgeGrp, value, name) |>
ggplot(aes(AgeGrp, n, fill = factor(value))) +
geom_col(position = "dodge") +
facet_wrap(~name, scales = "free_y")
plot_batch <- function(df) {
df |>
select(plates, SampleType, Race, AgeGrp) |>
pivot_longer(c(SampleType, Race, AgeGrp), names_to = "variable", values_to = "level") |>
count(plates, variable, level) |>
ggplot(aes(level, n, fill = factor(plates))) +
geom_col(position = "dodge") +
facet_wrap(~variable, scales = "free", ncol = 1)
}
Before the optimization.
After the optimization.
Compare OSAT score generated using designit.
OSAT::getLayout(gc) |>
left_join(OSAT::get.experiment.setup(g_setup)) |>
data.table::data.table() |>
osat_score("plates", c("SampleType", "Race", "AgeGrp")) |>
with(score)
#> Joining with `by = join_by(plates, chipRows, chipColumns, chips, rows, columns,
#> wells)`
#> Warning in osat_score(data.table::data.table(left_join(OSAT::getLayout(gc), :
#> NAs in features / batch columns; they will be excluded from scoring
#> [1] 26.85714
# score using OSAT
g_setup@metadata$optValue |> tail(1)
#> [1] 28.85714
First let’s create a BatchContainer with same dimensions.
bc <- BatchContainer$new(
dimensions = c(plates = 7, chips = 8, rows = 6, columns = 2)
)
bc
#> Batch container with 672 locations.
#> Dimensions: plates, chips, rows, columns
bc$n_locations
#> [1] 672
Assign samples and get initial setup.
bc <- assign_in_order(bc, samples)
starting_assignment <- bc$get_locations() |>
left_join(g_setup_start) |>
pull(ID) |>
as.integer()
#> Joining with `by = join_by(plates, chips, rows, columns)`
bc$move_samples(location_assignment = starting_assignment)
bc$get_samples(remove_empty_locations = TRUE) |>
plot_batch()
scoring_f <- osat_score_generator("plates", c("SampleType", "Race", "AgeGrp"))
bc$score(scoring_f)
#> Warning in osat_score(bc, batch_vars = batch_vars, feature_vars = feature_vars,
#> : NAs in features / batch columns; they will be excluded from scoring
#> score_1
#> 360.8571
g_setup@metadata$optValue |> head(1)
#> [1] 360.8571
# should be identical
bench::system_time({
set.seed(123)
bc_reference <- optimize_design(bc, scoring = scoring_f, max_iter = params$iterations)
})
#> Checking variances of 1-dim. score vector.
#> ... (6749.04) - OK
#> Initial score: 360.857
#> Achieved score: 348.857 at iteration 7
#> Achieved score: 344.857 at iteration 8
#> Achieved score: 338.857 at iteration 10
#> Achieved score: 326.857 at iteration 19
#> Achieved score: 322.857 at iteration 22
#> Achieved score: 320.857 at iteration 24
#> Achieved score: 318.857 at iteration 26
#> Achieved score: 314.857 at iteration 27
#> Achieved score: 308.857 at iteration 28
#> Achieved score: 302.857 at iteration 32
#> Achieved score: 300.857 at iteration 33
#> Achieved score: 292.857 at iteration 36
#> Achieved score: 290.857 at iteration 38
#> Achieved score: 290.857 at iteration 39
#> Achieved score: 284.857 at iteration 40
#> Achieved score: 274.857 at iteration 45
#> Achieved score: 270.857 at iteration 50
#> Achieved score: 264.857 at iteration 54
#> Achieved score: 256.857 at iteration 55
#> Achieved score: 246.857 at iteration 60
#> Achieved score: 240.857 at iteration 62
#> Achieved score: 222.857 at iteration 71
#> Achieved score: 220.857 at iteration 72
#> Achieved score: 214.857 at iteration 73
#> Achieved score: 202.857 at iteration 78
#> Achieved score: 200.857 at iteration 79
#> Achieved score: 192.857 at iteration 80
#> Achieved score: 184.857 at iteration 84
#> Achieved score: 178.857 at iteration 85
#> Achieved score: 174.857 at iteration 86
#> Achieved score: 172.857 at iteration 95
#> Achieved score: 166.857 at iteration 96
#> Achieved score: 156.857 at iteration 98
#> Achieved score: 150.857 at iteration 99
#> Achieved score: 146.857 at iteration 103
#> Achieved score: 144.857 at iteration 108
#> Achieved score: 142.857 at iteration 111
#> Achieved score: 140.857 at iteration 113
#> Achieved score: 136.857 at iteration 114
#> Achieved score: 134.857 at iteration 128
#> Achieved score: 130.857 at iteration 133
#> Achieved score: 120.857 at iteration 138
#> Achieved score: 116.857 at iteration 140
#> Achieved score: 108.857 at iteration 149
#> Achieved score: 106.857 at iteration 152
#> Achieved score: 106.857 at iteration 164
#> Achieved score: 104.857 at iteration 171
#> Achieved score: 102.857 at iteration 175
#> Achieved score: 100.857 at iteration 176
#> Achieved score: 96.857 at iteration 179
#> Achieved score: 92.857 at iteration 190
#> Achieved score: 90.857 at iteration 191
#> Achieved score: 88.857 at iteration 200
#> Achieved score: 84.857 at iteration 205
#> Achieved score: 78.857 at iteration 209
#> Achieved score: 76.857 at iteration 223
#> Achieved score: 74.857 at iteration 250
#> Achieved score: 70.857 at iteration 265
#> Achieved score: 68.857 at iteration 268
#> Achieved score: 68.857 at iteration 283
#> Achieved score: 64.857 at iteration 295
#> Achieved score: 62.857 at iteration 336
#> Achieved score: 60.857 at iteration 350
#> Achieved score: 58.857 at iteration 368
#> Achieved score: 56.857 at iteration 389
#> Achieved score: 54.857 at iteration 406
#> Achieved score: 52.857 at iteration 418
#> Achieved score: 50.857 at iteration 428
#> Achieved score: 50.857 at iteration 431
#> Achieved score: 48.857 at iteration 433
#> Achieved score: 46.857 at iteration 452
#> Achieved score: 44.857 at iteration 496
#> Achieved score: 42.857 at iteration 553
#> Achieved score: 40.857 at iteration 562
#> Achieved score: 38.857 at iteration 612
#> Achieved score: 36.857 at iteration 717
#> Achieved score: 34.857 at iteration 727
#> Achieved score: 32.857 at iteration 828
#> Achieved score: 30.857 at iteration 853
#> Achieved score: 30.857 at iteration 859
#> Achieved score: 28.857 at iteration 904
#> Achieved score: 26.857 at iteration 970
#> process real
#> 3.56s 3.54s
data.table
Instead of relying on BatchContainer
, here we have a
manual optimization process using data.table
.
fast_osat_optimize <- function(bc, batch_vars, feature_vars, iterations) {
bc <- bc$copy()
ldf <- data.table::data.table(bc$get_locations())[, c("plates")][, ".sample_id" := bc$assignment]
fcols <- c(".sample_id", feature_vars)
smp <- data.table::data.table(bc$samples)[, ..fcols]
df <- smp[ldf, on = ".sample_id"]
v <- osat_score(df, batch_vars, feature_vars)
edf <- v$expected_dt
current_score <- v$score
scores <- numeric(length = iterations)
n_avail <- nrow(df)
for (i in 1:iterations) {
repeat {
pos <- sample(n_avail, 2)
# does not make sense to shuffle NAs
if (any(!is.na(df[pos, feature_vars[1]]))) {
break
}
}
val <- df[c(pos[2], pos[1]), fcols, with = FALSE]
df[c(pos[1], pos[2]), (fcols) := val]
new_score <- osat_score(df, batch_vars, feature_vars, edf)$score
if (new_score <= current_score) {
current_score <- new_score
} else {
df[c(pos[2], pos[1]), (fcols) := val]
}
scores[i] <- current_score
}
bc$assignment <- df$.sample_id
list(bc=bc, scores=scores)
}
bench::system_time({
set.seed(123)
opt_res <- fast_osat_optimize(bc, "plates", c("SampleType", "Race", "AgeGrp"), iterations = params$iterations)
})
#> Warning in osat_score(df, batch_vars, feature_vars): NAs in features / batch
#> columns; they will be excluded from scoring
#> Warning in (function (assignment) : this field might become read-only in the
#> future, please use $move_samples() instead
#> process real
#> 2.17s 2.15s
scoring_f <- osat_score_generator("plates", c("SampleType", "Race", "AgeGrp"))
burn_in_it <- floor(params$iterations * 0.1)
burn_in_it
#> [1] 100
bench::system_time({
set.seed(123)
bc_burn_in <- optimize_design(
bc,
scoring = scoring_f,
n_shuffle = c(
rep(20, burn_in_it),
rep(
2,
params$iterations - burn_in_it
)
),
max_iter = params$iterations
)
})
#> Warning in osat_score(bc, batch_vars = batch_vars, feature_vars = feature_vars,
#> : NAs in features / batch columns; they will be excluded from scoring
#> Checking variances of 1-dim. score vector.
#> ... (5018.697) - OK
#> Initial score: 360.857
#> Achieved score: 322.857 at iteration 1
#> Achieved score: 290.857 at iteration 5
#> Achieved score: 278.857 at iteration 7
#> Achieved score: 254.857 at iteration 8
#> Achieved score: 238.857 at iteration 15
#> Achieved score: 224.857 at iteration 28
#> Achieved score: 216.857 at iteration 32
#> Achieved score: 198.857 at iteration 34
#> Achieved score: 196.857 at iteration 50
#> Achieved score: 184.857 at iteration 54
#> Achieved score: 172.857 at iteration 59
#> Achieved score: 162.857 at iteration 75
#> Achieved score: 158.857 at iteration 80
#> Achieved score: 158.857 at iteration 93
#> Achieved score: 156.857 at iteration 101
#> Achieved score: 148.857 at iteration 102
#> Achieved score: 144.857 at iteration 119
#> Achieved score: 136.857 at iteration 125
#> Achieved score: 130.857 at iteration 136
#> Achieved score: 126.857 at iteration 139
#> Achieved score: 124.857 at iteration 140
#> Achieved score: 124.857 at iteration 141
#> Achieved score: 118.857 at iteration 145
#> Achieved score: 114.857 at iteration 147
#> Achieved score: 112.857 at iteration 150
#> Achieved score: 108.857 at iteration 161
#> Achieved score: 100.857 at iteration 166
#> Achieved score: 94.857 at iteration 168
#> Achieved score: 92.857 at iteration 169
#> Achieved score: 90.857 at iteration 176
#> Achieved score: 88.857 at iteration 186
#> Achieved score: 86.857 at iteration 188
#> Achieved score: 82.857 at iteration 191
#> Achieved score: 80.857 at iteration 194
#> Achieved score: 78.857 at iteration 195
#> Achieved score: 78.857 at iteration 199
#> Achieved score: 76.857 at iteration 206
#> Achieved score: 76.857 at iteration 213
#> Achieved score: 72.857 at iteration 226
#> Achieved score: 66.857 at iteration 231
#> Achieved score: 62.857 at iteration 233
#> Achieved score: 60.857 at iteration 237
#> Achieved score: 58.857 at iteration 245
#> Achieved score: 58.857 at iteration 249
#> Achieved score: 56.857 at iteration 252
#> Achieved score: 56.857 at iteration 256
#> Achieved score: 54.857 at iteration 267
#> Achieved score: 52.857 at iteration 290
#> Achieved score: 50.857 at iteration 296
#> Achieved score: 48.857 at iteration 300
#> Achieved score: 42.857 at iteration 318
#> Achieved score: 40.857 at iteration 348
#> Achieved score: 40.857 at iteration 357
#> Achieved score: 38.857 at iteration 384
#> Achieved score: 36.857 at iteration 420
#> Achieved score: 34.857 at iteration 485
#> Achieved score: 32.857 at iteration 636
#> Achieved score: 30.857 at iteration 783
#> process real
#> 3.37s 3.35s
assign_random(bc)
#> Batch container with 672 locations and 576 samples (assigned).
#> Dimensions: plates, chips, rows, columns
bc$get_samples()
#> # A tibble: 672 × 10
#> plates chips rows columns ID SampleType Race AgeGrp dummy_var1
#> <int> <int> <int> <int> <dbl> <fct> <fct> <fct> <dbl>
#> 1 1 1 1 1 563 Control Hispanic (30,40] 0.342
#> 2 1 1 1 2 488 Control European (30,40] 2.14
#> 3 1 1 2 1 215 Case European (50,60] -0.152
#> 4 1 1 2 2 268 Case Hispanic (40,50] 0.693
#> 5 1 1 3 1 165 Case European (60,100] 0.582
#> 6 1 1 3 2 86 Case Hispanic (60,100] 1.34
#> 7 1 1 4 1 264 Case European (60,100] 0.437
#> 8 1 1 4 2 451 Control European (30,40] -0.203
#> 9 1 1 5 1 250 Case Hispanic (60,100] -1.63
#> 10 1 1 5 2 NA <NA> <NA> <NA> NA
#> # ℹ 662 more rows
#> # ℹ 1 more variable: dummy_var2 <chr>
bc$get_samples(remove_empty_locations = TRUE)
#> # A tibble: 576 × 10
#> plates chips rows columns ID SampleType Race AgeGrp dummy_var1
#> <int> <int> <int> <int> <dbl> <fct> <fct> <fct> <dbl>
#> 1 1 1 1 1 563 Control Hispanic (30,40] 0.342
#> 2 1 1 1 2 488 Control European (30,40] 2.14
#> 3 1 1 2 1 215 Case European (50,60] -0.152
#> 4 1 1 2 2 268 Case Hispanic (40,50] 0.693
#> 5 1 1 3 1 165 Case European (60,100] 0.582
#> 6 1 1 3 2 86 Case Hispanic (60,100] 1.34
#> 7 1 1 4 1 264 Case European (60,100] 0.437
#> 8 1 1 4 2 451 Control European (30,40] -0.203
#> 9 1 1 5 1 250 Case Hispanic (60,100] -1.63
#> 10 1 1 6 1 191 Case Hispanic (60,100] 0.696
#> # ℹ 566 more rows
#> # ℹ 1 more variable: dummy_var2 <chr>
scoring_f <- list(
fc0 = function(samples) rnorm(1) + 2 * rexp(1),
fc1 = function(samples) rnorm(1, 100),
fc2 = function(samples) -7
)
bc$score(scoring_f)
#> fc0 fc1 fc2
#> 3.64428 100.78084 -7.00000