See the vignette “Airports.rmd” for a walk-through of the package and how it is used.
Here is a toy example which is also found in the help page for
biclustermd()
.
::install_github("jreisner/biclustermd")
devtoolslibrary(biclustermd)
?biclustermd
# EXAMPLE 1 ----
data("synthetic")
# default parameters -- col_clusters = sqrt(ncol(synthetic)), row_clusters = sqrt(nrow(synthetic))
<- biclustermd(synthetic)
bc
bcautoplot(bc)
# providing the true number of row and column clusters
<- biclustermd(synthetic, col_clusters = 3, row_clusters = 2)
bc
bcautoplot(bc)
# plot the similarity indices
autoplot(bc$Similarites)
autoplot(bc$Similarites, facet = FALSE)
# view the decrease in SSE from iteration to iteration
autoplot(bc$SSE)
# one could use a linear model to predict the missing values in cell (1, 1):
<- gather(bc) %>% filter(row_group == 1, col_group == 2)
bc_subset <- lm(value ~ col_name + row_name, data = bc_subset)
bc_subset_model summary(bc_subset_model)
predict(bc_subset_model, bc_subset)
# this is a perfect biclustering so the variation in the cell is zero.
# Another synthetic dataset with noise can demonstrate:
# EXAMPLE 2 ----
# first argument to kronecker() defines 4 row clusters and 4 column clusters
<- kronecker(matrix(1:16, nrow = 4, ncol = 4), matrix(5, nrow = 4, ncol = 4))
dat set.seed(29)
# make 35% of values missing
sample(1:length(dat), 0.35 * length(dat))] <- NA
dat[# randomly shuffle the matrix
<- dat[sample(1:nrow(dat), nrow(dat)), sample(1:ncol(dat), ncol(dat))]
dat # add N(0, 1) noise to each observation
<- dat + rnorm(prod(dim(dat)))
dat
# repeat biclustering 50 times and keep the lowest SSE result
<- rep_biclustermd(dat, nrep = 50, col_clusters = 4, row_clusters = 4)
rep_dat_bc
rep_dat_bc
plot(rep_dat_bc$rep_sse, type = 'o')
plot(cummin(rep_dat_bc$rep_sse), type = 'o')
autoplot(rep_dat_bc$best_bc)
# could choose any cell, but I'm picking (1, 3) since all rows and columns have
# at least 2 observations
<- gather(rep_dat_bc$best_bc) %>%
dat_bc_sub filter(row_group == 1, col_group == 3)
<- lm(value ~ col_name + row_name, data = dat_bc_sub)
dat_bc_sub_model summary(dat_bc_sub_model)
anova(dat_bc_sub_model)
# neither covariate is useful, but that's because the biclustering did it's job
<- dat_bc_sub %>%
dat_bc_sub ::add_predictions(dat_bc_sub_model) %>%
modelr::add_residuals(dat_bc_sub_model)
modelrsqrt(mean(dat_bc_sub$resid ^ 2, na.rm = TRUE))
%>%
dat_bc_sub ggplot(aes(row_name, col_name, fill = pred)) +
geom_tile() +
ggtitle("predicted values")
%>%
dat_bc_sub ggplot(aes(row_name, col_name, fill = resid)) +
geom_tile() +
ggtitle("linear model residuals")