OSAT and scoring functions

In this vignette, we demonstrate how to use the OSAT score (Yan et al. (2012)).

library(designit)
library(tidyverse)
if (!requireNamespace("OSAT")) {
  print("This vignette can only be rendered if `OSAT` package is installed.")
  knitr::knit_exit()
}
#> Loading required namespace: OSAT

Loading samples. We add two dummy columns to demonstrate how to choose batch columns of interest.

osat_data_path <- system.file("extdata", package = "OSAT")
samples <- read_tsv(file.path(osat_data_path, "samples.txt"),
  col_types = cols(SampleType = col_factor(), Race = col_factor(), AgeGrp = col_factor())
) |>
  mutate(dummy_var1 = rnorm(n()), dummy_var2 = str_c(SampleType, Race, sep = " "))

Running OSAT optimization

Here we use OSAT to optimize setup.

gs <- OSAT::setup.sample(samples, optimal = c("SampleType", "Race", "AgeGrp"))

gc <- OSAT::setup.container(OSAT::IlluminaBeadChip96Plate, 7, batch = "plates")

set.seed(1234)

bench::system_time(
  g_setup <- OSAT::create.optimized.setup(sample = gs, container = gc, nSim = params$iterations)
)
#> Warning in OSAT::create.optimized.setup(sample = gs, container = gc, nSim =
#> params$iterations): Using default optimization method: optimal.shuffle
#> process    real 
#>   480ms   476ms

OSAT::QC(g_setup)
#> 
#> Test independence between "plates" and sample variables
#> 
#> Pearson's Chi-squared test
#>          Var X-squared df   p.value
#> 1 SampleType 0.1126719  6 0.9999714
#> 2       Race 0.3062402  6 0.9994663
#> 3     AgeGrp 1.8220606 24 1.0000000
#> 

Saving starting point of optimization

set.seed(1234)

g_setup_start <- OSAT::create.optimized.setup(sample = gs, container = gc, nSim = 1) |>
  OSAT::get.experiment.setup()
#> Warning in OSAT::create.optimized.setup(sample = gs, container = gc, nSim = 1):
#> Using default optimization method: optimal.shuffle

Visualize various batch factors. OSAT score is optimized only for plates in this case.

OSAT::get.experiment.setup(g_setup) |>
  select(AgeGrp, plates, chipRows, chipColumns, chips, rows, columns, wells) |>
  pivot_longer(-AgeGrp) |>
  count(AgeGrp, value, name) |>
  ggplot(aes(AgeGrp, n, fill = factor(value))) +
  geom_col(position = "dodge") +
  facet_wrap(~name, scales = "free_y")

Visualize for plates

plot_batch <- function(df) {
  df |>
    select(plates, SampleType, Race, AgeGrp) |>
    pivot_longer(c(SampleType, Race, AgeGrp), names_to = "variable", values_to = "level") |>
    count(plates, variable, level) |>
    ggplot(aes(level, n, fill = factor(plates))) +
    geom_col(position = "dodge") +
    facet_wrap(~variable, scales = "free", ncol = 1)
}

Before the optimization.

g_setup_start |> plot_batch()

After the optimization.

OSAT::get.experiment.setup(g_setup) |>
  plot_batch()

Compare scores with various implementations

Compare OSAT score generated using designit.

OSAT::getLayout(gc) |>
  left_join(OSAT::get.experiment.setup(g_setup)) |>
  data.table::data.table() |>
  osat_score("plates", c("SampleType", "Race", "AgeGrp")) |>
  with(score)
#> Joining with `by = join_by(plates, chipRows, chipColumns, chips, rows, columns,
#> wells)`
#> Warning in osat_score(data.table::data.table(left_join(OSAT::getLayout(gc), :
#> NAs in features / batch columns; they will be excluded from scoring
#> [1] 26.85714

# score using OSAT
g_setup@metadata$optValue |> tail(1)
#> [1] 28.85714

Run using BatchContainer

First let’s create a BatchContainer with same dimensions.

bc <- BatchContainer$new(
  dimensions = c(plates = 7, chips = 8, rows = 6, columns = 2)
)
bc
#> Batch container with 672 locations.
#>   Dimensions: plates, chips, rows, columns

bc$n_locations
#> [1] 672

Assign samples and get initial setup.

bc <- assign_in_order(bc, samples)

starting_assignment <- bc$get_locations() |>
  left_join(g_setup_start) |>
  pull(ID) |>
  as.integer()
#> Joining with `by = join_by(plates, chips, rows, columns)`

bc$move_samples(location_assignment = starting_assignment)

bc$get_samples(remove_empty_locations = TRUE) |>
  plot_batch()

Using designit OSAT score implementation

scoring_f <- osat_score_generator("plates", c("SampleType", "Race", "AgeGrp"))

bc$score(scoring_f)
#> Warning in osat_score(bc, batch_vars = batch_vars, feature_vars = feature_vars,
#> : NAs in features / batch columns; they will be excluded from scoring
#>  score_1 
#> 360.8571
g_setup@metadata$optValue |> head(1)
#> [1] 360.8571
# should be identical

bench::system_time({
  set.seed(123)
  bc_reference <- optimize_design(bc, scoring = scoring_f, max_iter = params$iterations)
})
#> Checking variances of 1-dim. score vector.
#> ... (6749.04) - OK
#> Initial score: 360.857
#> Achieved score: 348.857 at iteration 7
#> Achieved score: 344.857 at iteration 8
#> Achieved score: 338.857 at iteration 10
#> Achieved score: 326.857 at iteration 19
#> Achieved score: 322.857 at iteration 22
#> Achieved score: 320.857 at iteration 24
#> Achieved score: 318.857 at iteration 26
#> Achieved score: 314.857 at iteration 27
#> Achieved score: 308.857 at iteration 28
#> Achieved score: 302.857 at iteration 32
#> Achieved score: 300.857 at iteration 33
#> Achieved score: 292.857 at iteration 36
#> Achieved score: 290.857 at iteration 38
#> Achieved score: 290.857 at iteration 39
#> Achieved score: 284.857 at iteration 40
#> Achieved score: 274.857 at iteration 45
#> Achieved score: 270.857 at iteration 50
#> Achieved score: 264.857 at iteration 54
#> Achieved score: 256.857 at iteration 55
#> Achieved score: 246.857 at iteration 60
#> Achieved score: 240.857 at iteration 62
#> Achieved score: 222.857 at iteration 71
#> Achieved score: 220.857 at iteration 72
#> Achieved score: 214.857 at iteration 73
#> Achieved score: 202.857 at iteration 78
#> Achieved score: 200.857 at iteration 79
#> Achieved score: 192.857 at iteration 80
#> Achieved score: 184.857 at iteration 84
#> Achieved score: 178.857 at iteration 85
#> Achieved score: 174.857 at iteration 86
#> Achieved score: 172.857 at iteration 95
#> Achieved score: 166.857 at iteration 96
#> Achieved score: 156.857 at iteration 98
#> Achieved score: 150.857 at iteration 99
#> Achieved score: 146.857 at iteration 103
#> Achieved score: 144.857 at iteration 108
#> Achieved score: 142.857 at iteration 111
#> Achieved score: 140.857 at iteration 113
#> Achieved score: 136.857 at iteration 114
#> Achieved score: 134.857 at iteration 128
#> Achieved score: 130.857 at iteration 133
#> Achieved score: 120.857 at iteration 138
#> Achieved score: 116.857 at iteration 140
#> Achieved score: 108.857 at iteration 149
#> Achieved score: 106.857 at iteration 152
#> Achieved score: 106.857 at iteration 164
#> Achieved score: 104.857 at iteration 171
#> Achieved score: 102.857 at iteration 175
#> Achieved score: 100.857 at iteration 176
#> Achieved score: 96.857 at iteration 179
#> Achieved score: 92.857 at iteration 190
#> Achieved score: 90.857 at iteration 191
#> Achieved score: 88.857 at iteration 200
#> Achieved score: 84.857 at iteration 205
#> Achieved score: 78.857 at iteration 209
#> Achieved score: 76.857 at iteration 223
#> Achieved score: 74.857 at iteration 250
#> Achieved score: 70.857 at iteration 265
#> Achieved score: 68.857 at iteration 268
#> Achieved score: 68.857 at iteration 283
#> Achieved score: 64.857 at iteration 295
#> Achieved score: 62.857 at iteration 336
#> Achieved score: 60.857 at iteration 350
#> Achieved score: 58.857 at iteration 368
#> Achieved score: 56.857 at iteration 389
#> Achieved score: 54.857 at iteration 406
#> Achieved score: 52.857 at iteration 418
#> Achieved score: 50.857 at iteration 428
#> Achieved score: 50.857 at iteration 431
#> Achieved score: 48.857 at iteration 433
#> Achieved score: 46.857 at iteration 452
#> Achieved score: 44.857 at iteration 496
#> Achieved score: 42.857 at iteration 553
#> Achieved score: 40.857 at iteration 562
#> Achieved score: 38.857 at iteration 612
#> Achieved score: 36.857 at iteration 717
#> Achieved score: 34.857 at iteration 727
#> Achieved score: 32.857 at iteration 828
#> Achieved score: 30.857 at iteration 853
#> Achieved score: 30.857 at iteration 859
#> Achieved score: 28.857 at iteration 904
#> Achieved score: 26.857 at iteration 970
#> process    real 
#>   3.56s   3.54s
# final score
bc_reference$score(scoring_f)
#>  score_1 
#> 26.85714
bc_reference$plot_trace() +
  ggtitle(str_glue("Final score={bc$score(scoring_f)}"))

bc$get_samples(remove_empty_locations = TRUE) |>
  plot_batch()

Manually work with data.table

Instead of relying on BatchContainer, here we have a manual optimization process using data.table.

fast_osat_optimize <- function(bc, batch_vars, feature_vars, iterations) {
  bc <- bc$copy()
  ldf <- data.table::data.table(bc$get_locations())[, c("plates")][, ".sample_id" := bc$assignment]
  fcols <- c(".sample_id", feature_vars)
  smp <- data.table::data.table(bc$samples)[, ..fcols]
  df <- smp[ldf, on = ".sample_id"]

  v <- osat_score(df, batch_vars, feature_vars)
  edf <- v$expected_dt
  current_score <- v$score
  scores <- numeric(length = iterations)
  n_avail <- nrow(df)

  for (i in 1:iterations) {
    repeat {
      pos <- sample(n_avail, 2)

      # does not make sense to shuffle NAs
      if (any(!is.na(df[pos, feature_vars[1]]))) {
        break
      }
    }

    val <- df[c(pos[2], pos[1]), fcols, with = FALSE]
    df[c(pos[1], pos[2]), (fcols) := val]

    new_score <- osat_score(df, batch_vars, feature_vars, edf)$score
    if (new_score <= current_score) {
      current_score <- new_score
    } else {
      df[c(pos[2], pos[1]), (fcols) := val]
    }

    scores[i] <- current_score
  }

  bc$assignment <- df$.sample_id

  list(bc=bc, scores=scores)
}

bench::system_time({
  set.seed(123)
  opt_res <- fast_osat_optimize(bc, "plates", c("SampleType", "Race", "AgeGrp"), iterations = params$iterations)
})
#> Warning in osat_score(df, batch_vars, feature_vars): NAs in features / batch
#> columns; they will be excluded from scoring
#> Warning in (function (assignment) : this field might become read-only in the
#> future, please use $move_samples() instead
#> process    real 
#>   2.17s   2.15s

Shuffle optimization with burn-in

scoring_f <- osat_score_generator("plates", c("SampleType", "Race", "AgeGrp"))

burn_in_it <- floor(params$iterations * 0.1)
burn_in_it
#> [1] 100

bench::system_time({
  set.seed(123)
  bc_burn_in <- optimize_design(
    bc,
    scoring = scoring_f,
    n_shuffle = c(
      rep(20, burn_in_it),
      rep(
        2,
        params$iterations - burn_in_it
      )
    ),
    max_iter = params$iterations
  )
})
#> Warning in osat_score(bc, batch_vars = batch_vars, feature_vars = feature_vars,
#> : NAs in features / batch columns; they will be excluded from scoring
#> Checking variances of 1-dim. score vector.
#> ... (5018.697) - OK
#> Initial score: 360.857
#> Achieved score: 322.857 at iteration 1
#> Achieved score: 290.857 at iteration 5
#> Achieved score: 278.857 at iteration 7
#> Achieved score: 254.857 at iteration 8
#> Achieved score: 238.857 at iteration 15
#> Achieved score: 224.857 at iteration 28
#> Achieved score: 216.857 at iteration 32
#> Achieved score: 198.857 at iteration 34
#> Achieved score: 196.857 at iteration 50
#> Achieved score: 184.857 at iteration 54
#> Achieved score: 172.857 at iteration 59
#> Achieved score: 162.857 at iteration 75
#> Achieved score: 158.857 at iteration 80
#> Achieved score: 158.857 at iteration 93
#> Achieved score: 156.857 at iteration 101
#> Achieved score: 148.857 at iteration 102
#> Achieved score: 144.857 at iteration 119
#> Achieved score: 136.857 at iteration 125
#> Achieved score: 130.857 at iteration 136
#> Achieved score: 126.857 at iteration 139
#> Achieved score: 124.857 at iteration 140
#> Achieved score: 124.857 at iteration 141
#> Achieved score: 118.857 at iteration 145
#> Achieved score: 114.857 at iteration 147
#> Achieved score: 112.857 at iteration 150
#> Achieved score: 108.857 at iteration 161
#> Achieved score: 100.857 at iteration 166
#> Achieved score: 94.857 at iteration 168
#> Achieved score: 92.857 at iteration 169
#> Achieved score: 90.857 at iteration 176
#> Achieved score: 88.857 at iteration 186
#> Achieved score: 86.857 at iteration 188
#> Achieved score: 82.857 at iteration 191
#> Achieved score: 80.857 at iteration 194
#> Achieved score: 78.857 at iteration 195
#> Achieved score: 78.857 at iteration 199
#> Achieved score: 76.857 at iteration 206
#> Achieved score: 76.857 at iteration 213
#> Achieved score: 72.857 at iteration 226
#> Achieved score: 66.857 at iteration 231
#> Achieved score: 62.857 at iteration 233
#> Achieved score: 60.857 at iteration 237
#> Achieved score: 58.857 at iteration 245
#> Achieved score: 58.857 at iteration 249
#> Achieved score: 56.857 at iteration 252
#> Achieved score: 56.857 at iteration 256
#> Achieved score: 54.857 at iteration 267
#> Achieved score: 52.857 at iteration 290
#> Achieved score: 50.857 at iteration 296
#> Achieved score: 48.857 at iteration 300
#> Achieved score: 42.857 at iteration 318
#> Achieved score: 40.857 at iteration 348
#> Achieved score: 40.857 at iteration 357
#> Achieved score: 38.857 at iteration 384
#> Achieved score: 36.857 at iteration 420
#> Achieved score: 34.857 at iteration 485
#> Achieved score: 32.857 at iteration 636
#> Achieved score: 30.857 at iteration 783
#> process    real 
#>   3.37s   3.35s
tibble(
  i = bc_burn_in$trace$scores[[1]]$step,
  normal = bc_reference$trace$scores[[1]]$score_1,
  burnin = bc_burn_in$trace$scores[[1]]$score_1
) |>
  pivot_longer(-i, names_to = "method", values_to = "score") |>
  ggplot(aes(i, score, col = method)) +
  geom_line()

Score demonstration

bc$score(scoring_f)
#>  score_1 
#> 360.8571
assign_random(bc)
#> Batch container with 672 locations and 576 samples (assigned).
#>   Dimensions: plates, chips, rows, columns

bc$get_samples()
#> # A tibble: 672 × 10
#>    plates chips  rows columns    ID SampleType Race     AgeGrp   dummy_var1
#>     <int> <int> <int>   <int> <dbl> <fct>      <fct>    <fct>         <dbl>
#>  1      1     1     1       1   563 Control    Hispanic (30,40]       0.342
#>  2      1     1     1       2   488 Control    European (30,40]       2.14 
#>  3      1     1     2       1   215 Case       European (50,60]      -0.152
#>  4      1     1     2       2   268 Case       Hispanic (40,50]       0.693
#>  5      1     1     3       1   165 Case       European (60,100]      0.582
#>  6      1     1     3       2    86 Case       Hispanic (60,100]      1.34 
#>  7      1     1     4       1   264 Case       European (60,100]      0.437
#>  8      1     1     4       2   451 Control    European (30,40]      -0.203
#>  9      1     1     5       1   250 Case       Hispanic (60,100]     -1.63 
#> 10      1     1     5       2    NA <NA>       <NA>     <NA>         NA    
#> # ℹ 662 more rows
#> # ℹ 1 more variable: dummy_var2 <chr>
bc$get_samples(remove_empty_locations = TRUE)
#> # A tibble: 576 × 10
#>    plates chips  rows columns    ID SampleType Race     AgeGrp   dummy_var1
#>     <int> <int> <int>   <int> <dbl> <fct>      <fct>    <fct>         <dbl>
#>  1      1     1     1       1   563 Control    Hispanic (30,40]       0.342
#>  2      1     1     1       2   488 Control    European (30,40]       2.14 
#>  3      1     1     2       1   215 Case       European (50,60]      -0.152
#>  4      1     1     2       2   268 Case       Hispanic (40,50]       0.693
#>  5      1     1     3       1   165 Case       European (60,100]      0.582
#>  6      1     1     3       2    86 Case       Hispanic (60,100]      1.34 
#>  7      1     1     4       1   264 Case       European (60,100]      0.437
#>  8      1     1     4       2   451 Control    European (30,40]      -0.203
#>  9      1     1     5       1   250 Case       Hispanic (60,100]     -1.63 
#> 10      1     1     6       1   191 Case       Hispanic (60,100]      0.696
#> # ℹ 566 more rows
#> # ℹ 1 more variable: dummy_var2 <chr>

scoring_f <- list(
  fc0 = function(samples) rnorm(1) + 2 * rexp(1),
  fc1 = function(samples) rnorm(1, 100),
  fc2 = function(samples) -7
)

bc$score(scoring_f)
#>       fc0       fc1       fc2 
#>   3.64428 100.78084  -7.00000