This vignette demonstrates application of glmnet and varbvs to the Leukemia data set. The main aim of this script is to illustrate some of the different properties of Bayesian variable selection and penalized sparse regression (as implemented by varbvs and glmnet, respectively).
We use the preprocessed data of Dettling (2004) retrieved from the
supplementary materials accompanying Friedman et al (2010). The
data are represented as a 72 x 3,571 matrix of gene expression values
(variable X
), and a vector of 72 binary disease outcomes
(variable y
).
Begin by loading these packages into your R environment.
library(lattice)
library(latticeExtra)
library(glmnet)
library(varbvs)
Specify settings for the glmnet analysis.
<- 20 # Number of cross-validation folds.
nfolds <- 0.95 # Elastic net mixing parameter.
alpha <- 10^(seq(0,-2,-0.05)) # Lambda sequence. lambda
Also set the random number generator seed.
data(leukemia)
<- leukemia$x
X <- leukemia$y
y set.seed(1)
Here, we also run 20-fold cross-validation to select the largest setting of the L1-penalty strength (lambda) that is within 1 standard error of the minimum classification error.
# This is the model fitting step.
<- system.time(fit.glmnet <-
r glmnet(X,y,family = "binomial",lambda = lambda,alpha = alpha))
cat(sprintf("Model fitting took %0.2f seconds.\n",r["elapsed"]))
# Model fitting took 0.03 seconds.
# This is the cross-validation step.
<- system.time(out.cv.glmnet <-
r cv.glmnet(X,y,family = "binomial",type.measure = "class",
alpha = alpha,nfolds = nfolds,lambda = lambda))
<- out.cv.glmnet$lambda
lambda cat(sprintf("Cross-validation took %0.2f seconds.\n",r["elapsed"]))
# Cross-validation took 0.59 seconds.
# Choose the largest value of lambda that is within 1 standard error
# of the smallest misclassification error.
<- out.cv.glmnet$lambda.1se lambda.opt
Compute estimates of the disease outcome using the fitted model, and compare against the observed values.
cat("classification results with lambda = ",lambda.opt,":\n",sep="")
<- c(predict(fit.glmnet,X,s = lambda.opt,type = "class"))
y.glmnet print(table(true = factor(y),pred = factor(y.glmnet)))
# classification results with lambda = 0.02818383:
# pred
# true 0 1
# 0 47 0
# 1 0 25
The first plot shows the evolution of regression coefficients at different settings of lambda. (Note that the intercept is not shown.) Only the curves for the variables that are selected at the optimal setting of lambda (“lambda.opt”“) are labeled.
The second plot shows the classification error at different settings of lambda.
The third plot shows the number of nonzero regression coefficients at different settings of lambda.
trellis.par.set(par.xlab.text = list(cex = 0.85),
par.ylab.text = list(cex = 0.85),
axis.text = list(cex = 0.75))
# Choose the largest value of lambda that is within 1 standard error
# of the smallest misclassification error.
<- out.cv.glmnet$lambda.1se
lambda.opt
# Plot regression coefficients.
<- fit.glmnet$lambda
lambda <- setdiff(which(rowSums(abs(coef(fit.glmnet))) > 0),1)
vars <- length(vars)
n <- as.matrix(t(coef(fit.glmnet)[vars,]))
b <- coef(fit.glmnet,s = lambda.opt)
i <- rownames(i)[which(i != 0)]
i <- i[-1]
i <- colnames(b)
vars.opt !is.element(vars.opt,i)] <- ""
vars.opt[<- substring(vars.opt,2)
vars.opt <- expression("more complex" %<-% paste(log[10],lambda) %->%
lab "less complex")
<- xyplot(y ~ x,data.frame(x = log10(lambda),y = b[,1]),type = "l",
r col = "blue",xlab = lab,ylab = "regression coefficient",
scales = list(x = list(limits = c(-2.35,0.1)),
y = list(limits = c(-0.8,1.2))),
panel = function(x, y, ...) {
panel.xyplot(x,y,...);
panel.abline(v = log10(lambda.opt),col = "orangered",
lwd = 2,lty = "dotted");
ltext(x = -2,y = b[nrow(b),],labels = vars.opt,pos = 2,
offset = 0.5,cex = 0.5);
})for (i in 2:n)
<- r + as.layer(xyplot(y ~ x,data.frame(x = log10(lambda),y = b[,i]),
r type = "l",col = "blue"))
print(r,split = c(2,1,2,1),more = TRUE)
# Plot classification error.
<- predict(fit.glmnet,X,type = "class")
Y mode(Y) <- "numeric"
print(with(out.cv.glmnet,
xyplot(y ~ x,data.frame(x = log10(lambda),y = cvm),type = "l",
col = "blue",xlab = lab,
ylab = "20-fold cross-validation \n classification error",
scales = list(y = list(limits = c(-0.02,0.45))),
panel = function(x, y, ...) {
panel.xyplot(x,y,...);
panel.abline(v = log10(lambda.opt),col = "orangered",
lwd = 2,lty = "dotted");
+
}) as.layer(xyplot(y ~ x,data.frame(x = log10(lambda),y = cvm),
pch = 20,cex = 0.6,col = "blue")) +
as.layer(xyplot(y ~ x,data.frame(x = log10(lambda),y = cvup),
type = "l",col = "blue",lty = "solid")) +
as.layer(xyplot(y ~ x,data.frame(x = log10(lambda),y = cvlo),
type = "l",col = "blue",lty = "solid")) +
as.layer(xyplot(y ~ x,data.frame(x = log10(lambda),
y = colMeans(abs(Y - y))),
type = "l",col = "darkorange",lwd = 2,
lty = "solid"))),
split = c(1,1,2,2),more = TRUE)
# Plot number of non-zero regression coefficients.
print(with(out.cv.glmnet,
xyplot(y ~ x,data.frame(x = log10(lambda),y = nzero),type = "l",
col = "blue",xlab = lab,
ylab = "number of non-zero \n coefficients",
panel = function(x, y, ...) {
panel.xyplot(x,y,...)
panel.abline(v = log10(lambda.opt),col = "orangered",
lwd = 2,lty = "dotted")
+
}) as.layer(xyplot(y ~ x,data.frame(x = log10(lambda),y = nzero),
pch = 20,cex = 0.6,col = "blue"))),
split = c(1,2,2,2),more = FALSE)
Fit the fully-factorized variational approximation to the posterior distribution of the coefficients for a logistic regression model of the binary outcome (the type of leukemia), with spike-and-slab priors on the coefficients.
<- system.time(fit.varbvs <- varbvs(X,NULL,y,"binomial",verbose = FALSE))
r cat(sprintf("Model fitting took %0.2f seconds.\n",r["elapsed"]))
# Model fitting took 2.84 seconds.
Compute estimates of the disease outcome using the fitted model, and compare against the observed values.
<- predict(fit.varbvs,X,type = "class")
y.varbvs print(table(true = factor(y),pred = factor(y.varbvs)))
# pred
# true 0 1
# 0 46 1
# 1 3 22
The first plot shows the classification error at each setting of the prior log-odds.
The second plot shows the evolution of the posterior mean regression coefficients (the beta’s) at different settings of the prior log-odds, for the top 6 variables ranked by posterior inclusion probability (averaged over settings of the hyperparameters).
The top-ranked variable (by posterior inclusion probability) has a much larger coefficient than all the others, so it is shown in a separate plot.
The third plot shows the (approximate) probability density of the prior log-odds parameter.
trellis.par.set(par.xlab.text = list(cex = 0.85),
par.ylab.text = list(cex = 0.85),
axis.text = list(cex = 0.75))
# Get the normalized importance weights.
<- fit.varbvs$w
w
# Plot classification error at each hyperparameter setting.
<- function (x)
sigmoid10 1/(1 + 10^(-x))
<- fit.varbvs$logodds
logodds <- log10(sigmoid10(logodds))
log10q <- length(logodds)
m <- rep(0,m)
err for (i in 1:m) {
<- subset(fit.varbvs,logodds == logodds[i])
r <- predict(r,X)
ypred <- mean(y != ypred)
err[i]
}<- expression("more complex" %<-% paste(log[10],pi) %->% "less complex")
lab print(xyplot(y ~ x,data.frame(x = log10q,y = err),type = "l",
col = "blue",xlab = lab,ylab = "classification error",
scales = list(x = list(limits = c(-0.9,-3.65)))) +
as.layer(xyplot(y ~ x,data.frame(x = log10q,y = err),
col = "blue",pch = 20,cex = 0.65)),
split = c(1,1,2,2),more = TRUE)
# Plot expected number of included variables at each hyperparameter
# setting.
<- colSums(fit.varbvs$alpha)
r print(xyplot(y ~ x,data.frame(x = log10q,y = r),type = "l",col = "blue",
xlab = lab,ylab = "expected number of\nincluded variables",
scales = list(x = list(limits = c(-0.9,-3.65)),
y = list(log = 10,at = c(1,10,100)))) +
as.layer(xyplot(y ~ x,data.frame(x = log10q,y = r),
col = "blue",pch = 20,cex = 0.65,
scales = list(x = list(limits = c(-0.9,-3.65)),
y = list(log = 10)))),
split = c(1,2,2,2),more = TRUE)
# Plot density of prior inclusion probability hyperparameter.
print(xyplot(y ~ x,data.frame(x = log10q,y = w),type = "l",col = "blue",
xlab = lab,
ylab = expression(paste("posterior probability of ",pi)),
scales = list(x = list(limits = c(-0.9,-3.65)))) +
as.layer(xyplot(y ~ x,data.frame(x = log10q,y = w),
col = "blue",pch = 20,cex = 0.65)),
split = c(2,1,2,1),more = FALSE)
Dettling, M. (2004). BagBoosting for tumor classification with gene expression data. Bioinformatics 20, 3583–3593.
Friedman, J., Hastie, T., Tibshirani, R. (2010) Regularization paths for generalized linear models via coordinate descent. Journal of Statistical Software 33, 1–22.
This is the version of R and the packages that were used to generate these results.
sessionInfo()
# R version 3.6.2 (2019-12-12)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.7
#
# Matrix products: default
# BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] glmnet_4.1-6 Matrix_1.3-4 latticeExtra_0.6-29
# [4] curl_4.3 varbvs_2.6-10 lattice_0.20-38
#
# loaded via a namespace (and not attached):
# [1] Rcpp_1.0.8 highr_0.8 bslib_0.3.1 compiler_3.6.2
# [5] RColorBrewer_1.1-2 jquerylib_0.1.4 iterators_1.0.12 tools_3.6.2
# [9] digest_0.6.23 jsonlite_1.7.2 evaluate_0.14 png_0.1-7
# [13] rlang_1.0.6 foreach_1.4.7 cli_3.5.0 yaml_2.2.0
# [17] xfun_0.39.1 fastmap_1.1.0 stringr_1.4.0 knitr_1.37
# [21] sass_0.4.0 grid_3.6.2 R6_2.4.1 jpeg_0.1-8.1
# [25] survival_3.1-8 rmarkdown_2.21 magrittr_2.0.1 codetools_0.2-16
# [29] htmltools_0.5.4 splines_3.6.2 shape_1.4.4 nor1mix_1.3-0
# [33] stringi_1.4.3