## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# Set a flag to determine if this is being run on CRAN
on_cran <- !identical(Sys.getenv("NOT_CRAN"), "true")

## ----setup--------------------------------------------------------------------
library(diseasystore)

## ----benchmark data, include = FALSE------------------------------------------
# Our benchmark data is the mtcars data set, repeated to increase the data size.
# If updated, please also update the version in data-raw/benchmark.R
data_generator <- function(repeats) {
  purrr::map(
    seq(repeats),
    \(i) {
      dplyr::mutate(
        mtcars,
        "row_id" = dplyr::row_number() + (i - 1) * nrow(mtcars),
        "car" = paste(rownames(mtcars), .data$row_id)
      )
    }
  ) |>
    purrr::reduce(rbind) |>
    dplyr::rename_with(~ tolower(gsub(".", "_", .x, fixed = TRUE)))
}

benchmark_data <- tibble::as_tibble(data_generator(1000)) |>
  dplyr::select("row_id", "car", dplyr::everything())

## ----benchmark data print------------------------------------------------------
benchmark_data

## ----eval = FALSE---------------------------------------------------------------
# DiseasystoreMtcars <- R6::R6Class(
#   classname = "DiseasystoreMtcars",
#   inherit = DiseasystoreBase,
#   private = list(
#     .ds_map = list("n_cyl" = "mtcars_cyl", "vs" = "mtcars_vs"),
#     mtcars_cyl = FeatureHandler$new(
#       compute = function(start_date, end_date, slice_ts, source_conn) {
#         out <- benchmark_data |>
#           dplyr::transmute(
#             "key_car" = .data$car, "n_cyl" = .data$cyl,
#             "valid_from" = Sys.Date() - lubridate::days(2 * .data$row_id - 1),
#             "valid_until" = .data$valid_from + lubridate::days(2)
#           )
#         return(out)
#       },
#       key_join = key_join_sum
#     ),
#     mtcars_vs = FeatureHandler$new(
#       compute = function(start_date, end_date, slice_ts, source_conn) {
#         out <- benchmark_data |>
#           dplyr::transmute(
#             "key_car" = .data$car, .data$vs,
#             "valid_from" = Sys.Date() - lubridate::days(2 * .data$row_id),
#             "valid_until" = .data$valid_from + lubridate::days(2)
#           )
#         return(out)
#       },
#       key_join = key_join_sum
#     )
#   )
# )
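
## ----eval = FALSE---------------------------------------------------------------
# # A minimal usage sketch of the class above (not part of the benchmarks).
# # Assumptions: `{RSQLite}` is installed, and the constructor arguments follow
# # the `DiseasystoreBase` interface; the date range is an arbitrary placeholder.
# ds <- DiseasystoreMtcars$new(
#   source_conn = NULL,
#   target_conn = DBI::dbConnect(RSQLite::SQLite()),
#   start_date = Sys.Date() - 30,
#   end_date = Sys.Date()
# )
#
# # Features declared in `.ds_map` can then be retrieved by name
# ds$get_feature("n_cyl")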
## ----benchmark context, results = "asis", include = FALSE----------------------
if (on_cran) {
  cat(
    "The results of the benchmark are shown graphically below (mean and standard deviation), where we measure the",
    "performance of `diseasystore`."
  )
} else {
  cat(
    "The results of the benchmark are shown graphically below (mean and standard deviation), where we compare the",
    "current development version of `diseasystore` with the current CRAN version. In addition, we run the benchmarks",
    "with the current development version of `{SCDB}` versus the current CRAN version since `{SCDB}` is used",
    "internally in `diseasystore`."
  )
}

## ----echo = FALSE, eval = rlang::is_installed("here")---------------------------
benchmark_location <- c(
  devtools::package_file("inst", "extdata", "benchmarks.rds"),
  system.file("extdata", "benchmarks.rds", package = "diseasystore"),
  here::here("inst", "extdata", "benchmarks.rds")
) |>
  purrr::discard(~ identical(., "")) |>
  purrr::pluck(1)

benchmarks <- readRDS(benchmark_location) |>
  dplyr::mutate("version" = as.character(.data$version))

# The benchmark contains data for the CRAN version, the main branch, and whatever branch the workflow was run on.
# We need to render the vignette differently depending on where it is rendered:
# - On CRAN, we only show benchmarks for the current CRAN version of SCDB together with the newest development
#   benchmarks, but we mark these benchmarks as belonging to the new version of diseasystore
#   (since we are releasing a new version of diseasystore, these benchmarks will belong to the new CRAN version).
# - On main, we render the benchmarks from the non-main branch as the main results.
#   That is because these benchmarks will be newer than those on main, and the code that generates the results
#   will have been merged onto main -- causing the vignette to be rendered.
# - On a non-main branch, we render all of the benchmarks.

# Determine if the SHA is on main
sha <- benchmarks |>
  dplyr::distinct(.data$version) |>
  dplyr::filter(!startsWith(.data$version, "diseasystore"), .data$version != "main") |>
  dplyr::pull("version")

# Check the local git history
on_main <- tryCatch({
  system(glue::glue("git branch main --contains {sha}"), intern = TRUE) |>
    stringr::str_detect(stringr::fixed("main")) |>
    isTRUE()
}, warning = function(w) {
  # If on GitHub, git is not installed and we assume TRUE.
  # This will render the vignette as it will look once merged onto main.
  return(identical(Sys.getenv("CI"), "true"))
})

# If we are on CRAN, use the newest benchmark (version = sha) and keep only the CRAN SCDB results.
# This benchmark is then labelled with the newest version number of diseasystore (the one we just deployed to CRAN).
if (on_cran) {
  benchmarks <- benchmarks |>
    dplyr::filter(.data$version == !!sha, .data$SCDB != "main") |>
    dplyr::mutate("version" = paste0("diseasystore v", packageVersion("diseasystore")))
} else if (on_main) {
  # If the SHA has been merged, use it as the "main" version and remove the other, older, main version
  benchmarks <- benchmarks |>
    dplyr::filter(.data$version != "main") |>
    dplyr::mutate("version" = dplyr::if_else(.data$version == sha, "development", .data$version))
}

## ----Generate alt text for benchmarks, echo = FALSE, results = "hide", eval = rlang::is_installed("here")----
benchmark_alt_text <- benchmarks |>
  dplyr::filter(startsWith(.data$version, "diseasystore v"), .data$SCDB != "main") |>
  dplyr::transmute(
    .data$benchmark_function,
    .data$database,
    "time_seconds" = .data$time / 1e9
  ) |>
  dplyr::mutate("backend" = stringr::str_remove(.data$database, r"{\sv[\w+\.]+$}")) |>
  dplyr::summarise(
    "mean_time_seconds" = round(mean(.data$time_seconds)),
    .by = c("benchmark_function", "backend")
  ) |>
  dplyr::summarise(
    "backends" = toString(.data$backend),
    .by = c("benchmark_function", "mean_time_seconds")
  ) |>
  dplyr::arrange(.data$benchmark_function, .data$mean_time_seconds) |>
  purrr::pmap_chr(
    \(benchmark_function, mean_time_seconds, backends) {
      glue::glue(
        "The {benchmark_function} benchmark takes {mean_time_seconds} seconds to run on the {backends} backends."
      )
    }
  ) |>
  purrr::reduce(paste, .init = "The results of the benchmarks for the CRAN versions are as follows:")
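
## ----eval = FALSE---------------------------------------------------------------
# # For intuition, the same pmap/reduce collapse pattern on toy data
# # (illustrative sketch only; the column names and values are made up):
# tibble::tibble(f = c("a", "b"), t = c(1, 2)) |>
#   purrr::pmap_chr(\(f, t) glue::glue("The {f} benchmark takes {t} seconds.")) |>
#   purrr::reduce(paste, .init = "Results:")
# #> [1] "Results: The a benchmark takes 1 seconds. The b benchmark takes 2 seconds."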
## ----echo = FALSE, fig.alt = benchmark_alt_text, eval = rlang::is_installed("here")----
# Mean and standard deviation (see ggplot2::mean_se())
mean_sd <- function(x) {
  mu <- mean(x)
  std_dev <- stats::sd(x)
  data.frame(y = mu, ymin = mu - std_dev, ymax = mu + std_dev)
}

# Determine sub-groups
groups <- setdiff(colnames(benchmarks), c("expr", "time", "benchmark_function", "database", "version", "n"))

# Apply "dodging" to sub-groups to show them graphically
dodge <- ggplot2::position_dodge(width = 0.6)

# Insert a newline into the database name to improve the rendering of the figures
labeller <- ggplot2::as_labeller(\(l) stringr::str_replace_all(l, stringr::fixed(" v"), "\nv"))

g <- ggplot2::ggplot(
  benchmarks,
  ggplot2::aes(
    x = version,
    y = time / 1e9,
    # Only map group/colour when sub-groups exist (see the note after this chunk)
    group = !!switch(length(groups) > 0, as.symbol(groups)),
    color = !!switch(length(groups) > 0, as.symbol(groups))
  )
) +
  ggplot2::stat_summary(fun.data = mean_sd, geom = "pointrange", size = 0.5, linewidth = 1, position = dodge) +
  ggplot2::facet_grid(
    rows = ggplot2::vars(benchmark_function),
    cols = ggplot2::vars(database),
    scales = "free_y",
    labeller = labeller
  ) +
  ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(n.dodge = 2)) +
  ggplot2::labs(x = "Codebase version", y = "Time (s)") +
  ggplot2::theme(
    legend.position = "bottom",
    legend.justification = "left"
  ) +
  ggplot2::ylim(0, NA)
g
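
## ----eval = FALSE---------------------------------------------------------------
# # Side note on the `!!switch()` pattern in the chunk above (illustrative
# # sketch, not run; the column name "SCDB" is just an example of a sub-group):
# # `switch()` coerces a logical condition to an integer, so `FALSE` becomes
# # `0L`, matches no alternative, and returns `NULL` -- and injecting `NULL`
# # with `!!` drops the aesthetic from `aes()` entirely.
# ggplot2::aes(color = !!switch(TRUE, as.symbol("SCDB")))   # colour mapped to SCDB
# ggplot2::aes(color = !!switch(FALSE, as.symbol("SCDB")))  # colour unmapped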