MOCHA is intended for use after clustering and cell labeling, since all functions of MOCHA operate on per-sample basis within each cell population. MOCHA can be used with an ArchR Project as input, as well as with a generic input. This tutorial has two sections: demonstrating MOCHA with generic inputs - Importing from Signac and Importing from SnapATAC.
This vignette will follow the Signac tutorial to generate an Signac object and label cell types within it. If you already have a Signac object with cell types labelled, skip to section Extract Fragments from Signac
library(Signac)
library(Seurat)
library(SeuratDisk)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
set.seed(1234)
# Download files
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5')
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz')
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi')
# Load in ATAC and RNA data
counts <- Read10X_h5("pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5")
fragpath <- "pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"
# create a Seurat object containing the RNA adata
pbmc <- CreateSeuratObject(
counts = counts$`Gene Expression`,
assay = "RNA"
)
# create ATAC assay and add it to the object
pbmc[["ATAC"]] <- CreateChromatinAssay(
counts = counts$Peaks,
sep = c(":", "-"),
fragments = fragpath,
annotation = annotation
)
DefaultAssay(pbmc) <- "ATAC"
pbmc <- NucleosomeSignal(pbmc)
pbmc <- TSSEnrichment(pbmc)
pbmc <- subset(
x = pbmc,
subset = nCount_ATAC < 100000 &
nCount_RNA < 25000 &
nCount_ATAC > 1000 &
nCount_RNA > 1000 &
nucleosome_signal < 2 &
TSS.enrichment > 1
)
# Transform RNA
DefaultAssay(pbmc) <- "RNA"
pbmc <- SCTransform(pbmc)
pbmc <- RunPCA(pbmc)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
Label cell types:
# load PBMC reference
system('wget https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat')
reference <- LoadH5Seurat("pbmc_multimodal.h5seurat")
DefaultAssay(pbmc) <- "SCT"
# transfer cell type labels from reference to query
transfer_anchors <- FindTransferAnchors(
reference = reference,
query = pbmc,
normalization.method = "SCT",
reference.reduction = "spca",
recompute.residuals = FALSE,
dims = 1:50
)
predictions <- TransferData(
anchorset = transfer_anchors,
refdata = reference$celltype.l2,
weight.reduction = pbmc[['pca']],
dims = 1:50
)
pbmc <- AddMetaData(
object = pbmc,
metadata = predictions
)
# set the cell identities to the cell type predictions
Idents(pbmc) <- "predicted.id"
# set a reasonable order for cell types to be displayed when plotting
levels(pbmc) <- c("CD4 Naive", "CD4 TCM", "CD4 CTL", "CD4 TEM", "CD4 Proliferating",
"CD8 Naive", "dnT",
"CD8 TEM", "CD8 TCM", "CD8 Proliferating", "MAIT", "NK", "NK_CD56bright",
"NK Proliferating", "gdT",
"Treg", "B naive", "B intermediate", "B memory", "Plasmablast",
"CD14 Mono", "CD16 Mono",
"cDC1", "cDC2", "pDC", "HSPC", "Eryth", "ASDC", "ILC", "Platelet")
saveRDS(pbmc, 'pbmc_Signac_tutorial.rds')
Signac’s training dataset only has one sample, so to simulate multiple samples, we will duplicate the data here. This should not be necessary with a large dataset.
full_pbmc <- merge(
pbmc,
y = c(pbmc, pbmc),
add.cell.ids = c('Sample1', 'Sample2', 'Sample3'),
project = 'DuplicateData'
)
For a larger dataset, you would interact over the fragObj list and extract each fragments.tsv.gz file.
Instead, we’re just duplicating data for the sake of this tutorial.
More importantly, you do need to modify the barcode in the fragment file so it matches the barcodes in Seurat’s metadata.
Generate your sample/cell type list by finding all combinations of Samples and Cell Populations
Here we must also rename the GRangesList to have names in the format
CellPopulation#Sample
.
celltype_sample_list <- apply(expand.grid(unique(pbmc@meta.data$predicted.id), names(fragList)), 1, paste, collapse = "#")
# Extract metadata, add the Sample column, as well as CellBarcode.
fullMeta <- full_pbmc@meta.data
fullMeta$Sample = gsub("_.*","", rownames(full_pbmc@meta.data))
fullMeta$CellBarcode = gsub(".*_","", rownames(full_pbmc@meta.data))
# Change the CellPopulation_Sample format to CellPopulation#Sample for MOCHA
rownames(fullMeta) <- gsub("_","#", rownames(fullMeta))
CellType_GRanges <- pbapply::pblapply(cl = 30, celltype_sample_list, function(x){
celltype <- gsub("#.*","", x)
sample <- gsub(".*#","", x)
sortedMeta <- dplyr::filter(fullMeta, predicted.id == celltype, Sample == sample)
sortedFrags <- dplyr::filter(fragList[[sample]], barcode %in% rownames(sortedMeta))
makeGRangesFromDataFrame(sortedFrags, keep.extra.columns = TRUE)
})
names(CellType_GRanges) <- celltype_sample_list
Calculate the study signal (the median number of fragments per cell)
# Our blacklist comes included with Signac
blackList <- blacklist_hg38_unified
# Call Open Tiles
tileResults <- MOCHA::callOpenTiles(ATACFragments = CellType_GRanges,
cellColData = fullMeta,
blackList = blackList,
genome = 'BSgenome.Hsapiens.UCSC.hg38',
cellPopLabel = 'predicted.id',
cellPopulations = fullMeta$predicted.id,
studySignal = studySignal,
cellCol = 'barcode',
TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene",
Org = "org.Hs.eg.db",
outDir = paste(getwd(),'/MOCHA_Out', sep = ''), numCores= 35)
TSAM <- MOCHA::getSampleTileMatrix(tileResults, threshold = 0.2, numCores = 3, verbose = TRUE)
In this example, we follow the SnapATAC 10X PBMC tutorial through the clustering step before extracting the fragments and cell metadata necessary for MOCHA.
If you have a fully-formed Snap file for your analysis with
clustering results and sample information added to the metadata, skip to
section Formatting Snap File Metadata. If
following the vignette, we STRONGLY recommending installing this patched
version of SnapATAC from this repository with
devtools::install_github("imran-aifi/SnapATAC")
.
Download the all the SnapATAC Tutorial Data (linked above) to your working directory, and load it:
# CLI tool for installing from Google Drive share links
pip install gdown
# https://drive.google.com/file/d/1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE/view?usp=share_link
gdown https://drive.google.com/uc?id=1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE
# https://drive.google.com/file/d/1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5/view?usp=share_link
gdown https://drive.google.com/uc?id=1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5
# https://drive.google.com/file/d/1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd/view?usp=share_link
gdown https://drive.google.com/uc?id=1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd
# https://drive.google.com/file/d/1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6/view?usp=share_link
gdown https://drive.google.com/uc?id=1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6
# https://drive.google.com/file/d/1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379/view?usp=share_link
gdown https://drive.google.com/uc?id=1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379
# https://drive.google.com/file/d/1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp/view?usp=share_link
gdown https://drive.google.com/uc?id=1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp
library(SnapATAC);
snap.files = c(
"atac_pbmc_5k_nextgem.snap",
"atac_pbmc_10k_nextgem.snap"
);
sample.names = c(
"PBMC 5K",
"PBMC 10K"
);
barcode.files = c(
"atac_pbmc_5k_nextgem_singlecell.csv",
"atac_pbmc_10k_nextgem_singlecell.csv"
);
x.sp.ls = lapply(seq(snap.files), function(i){
createSnap(
file=snap.files[i],
sample=sample.names[i]
);
})
names(x.sp.ls) = sample.names;
barcode.ls = lapply(seq(snap.files), function(i){
barcodes = read.csv(
barcode.files[i],
head=TRUE
);
barcodes = barcodes[2:nrow(barcodes),];
barcodes$logUMI = log10(barcodes$passed_filters + 1);
barcodes$promoter_ratio = (barcodes$promoter_region_fragments+1) / (barcodes$passed_filters + 1);
barcodes
})
x.sp.ls
# for both datasets, we identify usable barcodes using [3.5-5] for log10(UMI) and [0.4-0.8] for promoter ratio as cutoff.
cutoff.logUMI.low = c(3.5, 3.5);
cutoff.logUMI.high = c(5, 5);
cutoff.FRIP.low = c(0.4, 0.4);
cutoff.FRIP.high = c(0.8, 0.8);
barcode.ls = lapply(seq(snap.files), function(i){
barcodes = barcode.ls[[i]];
idx = which(
barcodes$logUMI >= cutoff.logUMI.low[i] &
barcodes$logUMI <= cutoff.logUMI.high[i] &
barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
barcodes$promoter_ratio <= cutoff.FRIP.high[i]
);
barcodes[idx,]
});
x.sp.ls = lapply(seq(snap.files), function(i){
barcodes = barcode.ls[[i]];
x.sp = x.sp.ls[[i]];
barcode.shared = intersect(x.sp@barcode, barcodes$barcode);
x.sp = x.sp[match(barcode.shared, x.sp@barcode),];
barcodes = barcodes[match(barcode.shared, barcodes$barcode),];
x.sp@metaData = barcodes;
x.sp
})
names(x.sp.ls) = sample.names;
x.sp.ls
# combine two snap object
x.sp = Reduce(snapRbind, x.sp.ls);
x.sp@metaData["Sample"] = x.sp@sample;
print(table(x.sp@sample))
x.sp
# Step 2. Add cell-by-bin matrix
x.sp = addBmatToSnap(x.sp, bin.size=5000);
# Step 3. Matrix binarization
x.sp = makeBinary(x.sp, mat="bmat");
# Step 4. Bin filtering
library(GenomicRanges);
black_list = read.table("hg19.blacklist.bed.gz");
black_list.gr = GRanges(
black_list[,1],
IRanges(black_list[,2], black_list[,3])
);
idy = queryHits(
findOverlaps(x.sp@feature, black_list.gr)
);
if(length(idy) > 0){
x.sp = x.sp[,-idy, mat="bmat"];
};
x.sp
# Remove unwanted chromosomes
chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))];
idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature);
if(length(idy) > 0){
x.sp = x.sp[,-idy, mat="bmat"]
};
x.sp
# The coverage of bins roughly obeys a log normal distribution. We remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters.
bin.cov = log10(Matrix::colSums(x.sp@bmat)+1);
hist(
bin.cov[bin.cov > 0],
xlab="log10(bin cov)",
main="log10(Bin Cov)",
col="lightblue",
xlim=c(0, 5)
);
bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95);
idy = which(bin.cov <= bin.cutoff & bin.cov > 0);
x.sp = x.sp[, idy, mat="bmat"];
x.sp
# We will further remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommended.
idx = which(Matrix::rowSums(x.sp@bmat) > 1000);
x.sp = x.sp[idx,];
x.sp
# Step 5. Dimensionality reduction
row.covs.dens <- density(
x = x.sp@metaData[,"logUMI"],
bw = 'nrd', adjust = 1
);
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps);
set.seed(1);
idx.landmark.ds <- base::sort(sample(x = seq(nrow(x.sp)), size = 10000, prob = sampling_prob));
x.landmark.sp = x.sp[idx.landmark.ds,];
x.query.sp = x.sp[-idx.landmark.ds,];
x.landmark.sp = runDiffusionMaps(
obj= x.landmark.sp,
input.mat="bmat",
num.eigs=50
);
x.landmark.sp@metaData$landmark = 1;
x.query.sp = runDiffusionMapsExtension(
obj1=x.landmark.sp,
obj2=x.query.sp,
input.mat="bmat"
);
x.query.sp@metaData$landmark = 0;
x.sp = snapRbind(x.landmark.sp, x.query.sp);
x.sp = x.sp[order(x.sp@metaData["sample"])];
x.sp = runKNN(
obj=x.sp,
eigs.dims=1:20,
k=15
);
x.sp=runCluster(
obj=x.sp,
tmp.folder=tempdir(),
louvain.lib="R-igraph", #"leiden" preferred, but may cause issues. Requires 'library(leiden)'.
seed.use=10,
resolution=0.7
)
Add the computed clusters to the Snap object metadata.
The Snap object contains two samples, “PBMC 5K” and “PBMC 10K”. Let’s add a “Sample” column to the metadata. Let’s also add a column “files” pointing to the original .snap files from which each cell came.
# Add clusters (from SnapATAC::runCluster) to metadata
x.sp@metaData$cluster = x.sp@cluster
# Add Sample name to metadata (if not done previously)
x.sp@metaData$Sample = x.sp@sample
# Add files to metadata, indicating the original snap file each cell belongs to.
snap.files <- c(
"atac_pbmc_5k_nextgem.snap",
"atac_pbmc_10k_nextgem.snap"
)
fileList <- unlist(lapply(x.sp@metaData$Sample, function(x){
ifelse(x == "PBMC 5K", snap.files[[1]],snap.files[[2]])
}))
x.sp@metaData$files <- fileList
# SAVE this metadata to disk
write.csv(x.sp@metaData, "./snapMetadataforMOCHA.csv")
Now we have a Snap object with metadata containing barcodes, unique
cell ids (column cell_id), sample names, and cell populations (cluster).
We also have our HG19 blackList, black_list.gr
.
Note: Following the tutorial from SnapATAC can often result in a segfault when extracting fragments with
SnapATAC::extractReads
. We recommend running on a machine with large RAM and avoiding parallelization.
Next we extract reads by sample and cell population, ensuring our
final GRanges list is named following the pattern
CellPopulation#Sample
.
snapMetadata <- read.csv("./snapMetadataforMOCHA.csv")
cellCol <- "barcode"
cellPopLabel <- "cluster"
cellPopulations <- unique(snapMetadata$cluster)
allSamples <- unique(snapMetadata$Sample)
fragmentsGRangesList <- unlist(lapply(allSamples, function(sample){
barcodesList <- lapply(cellPopulations, function(cellPop) {
snapMetadata[snapMetadata$Sample == sample,]
# Extract barcodes for a single cell population
cellPopIdx <- snapMetadata$cluster == cellPop
cellPopBarcodes <- snapMetadata[cellPopIdx,]$barcode
# Build the file list for the selected cell barcode
files <- snapMetadata[cellPopIdx,]$files
# Extract fragments
cellPopFrags <- SnapATAC::extractReads(cellPopBarcodes, files, do.par = FALSE)
})
names(barcodesList) <- paste(cellPopulations, sample, sep="#")
barcodesList
}))
Calculate the study signal (the median number of fragments per cell)
# Call Open Tiles
tileResults <- MOCHA::callOpenTiles(
ATACFragments = fragmentsGRangesList,
cellColData = snapMetadata,
blackList = black_list.gr,
genome = "BSgenome.Hsapiens.UCSC.hg38",
cellPopLabel = cellPopLabel,
cellPopulations = cellPopulations,
studySignal = studySignal,
cellCol = cellCol,
TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene",
Org = "org.Hs.eg.db",
outDir = paste(getwd(),'/MOCHA_Out', sep = ''),
numCores = 5
)
TSAM <- MOCHA::getSampleTileMatrix(tileResults, threshold = 0.2, numCores = 3, verbose = TRUE)