# If any package is missing, install it using the following commands and then load it.
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("SingleCellExperiment")
# library(SingleCellExperiment)
# Loading libraries
suppressPackageStartupMessages({
library(Seurat)
library(SingleCellExperiment)
library(scuttle)
library(scran)
library(ggplot2)
library(ggridges)
library(scater)
library(pheatmap)
library(gridExtra)
library(batchelor)
library(igraph)
library(bluster)
library(celldex)
library(SingleR)
library(AnnotationDbi)
library(EnsDb.Hsapiens.v86)
library(DESeq2)
})
Single cell mRNASeq 2: batch correction, annotation, clustering, marker finding and differential gene expression
Install necessary libraries
Goal
Analyze a publicly available scRNA-Seq data set using the following steps:
- Data preparation
- Batch correction
- Cell type annotation
- Clustering
- Finding marker genes
- Differential expression analysis
Data preparation
The data set that we analyze in this session was generated using 10x Genomics technology and contains T cells and innate lymphoid cells (35233 cells in total) from tumors of 26 patients diagnosed with breast cancer. The data set is visualized in Fig. 3a of this paper and can be downloaded from this database (called “A single-cell and spatially-resolved atlas of human breast cancers - T_cells”).
Download the data set
Set a working directory:
setwd("~/Teaching/analysisOfGenomicsDataWithR")
Within the working directory create a directory, where you will download the data set:
dir.create("data/scRNAseq_2",recursive=TRUE)
Download the data set and rename it by running the following command:
#download.file("https://datasets.cellxgene.cziscience.com/1bed6d79-cd96-4eb9-8bdc-e67cb6da2589.rds", "data/scRNAseq_2/seurat_T_cells.rds")
Load the data set into R and explore it:
<- readRDS("data/scRNAseq_2/seurat_T_cells.rds")
seur_obj seur_obj
An object of class Seurat
29067 features across 35214 samples within 1 assay
Active assay: RNA (29067 features, 0 variable features)
2 layers present: counts, data
2 dimensional reductions calculated: Three.D, umap
As you can see, it is a Seurat
object. In this course we work with SingleCellExperiment
objects. So let us convert the Seurat
object into the SingleCellExperiment
object:
<- as.SingleCellExperiment(seur_obj)
sce_orig sce_orig
class: SingleCellExperiment
dim: 29067 35214
metadata(0):
assays(2): counts logcounts
rownames(29067): ENSG00000238009 ENSG00000279928 ...
ENSG00000239886 ENSG00000253874
rowData names(0):
colnames(35214): CID3586_ACTTTCAAGCTCCCAG
CID3586_AGGGATGTCGTTACAG ... CID4398_GATGAGGAGAAACCTA
CID4398_GGGAGATTCTATCCTA
colData names(47): donor_id percent_mito ...
observation_joinid ident
reducedDimNames(2): THREE.D UMAP
mainExpName: RNA
altExpNames(0):
Data exploration
Let us explore the sce_orig
object. What metadata is available for the data set? Check colData
:
colnames(colData(sce_orig))
[1] "donor_id"
[2] "percent_mito"
[3] "nCount_RNA"
[4] "nFeature_RNA"
[5] "celltype_major"
[6] "celltype_minor"
[7] "celltype_subset"
[8] "subtype"
[9] "gene_module"
[10] "calls"
[11] "normal_cell_call"
[12] "CNA_value"
[13] "batch_run"
[14] "multiplexed"
[15] "cryo_state"
[16] "development_stage_ontology_term_id"
[17] "cancer_type"
[18] "ER"
[19] "PR"
[20] "HER2_IHC"
[21] "HER2_ISH"
[22] "HER2_ISH_ratio"
[23] "Ki67"
[24] "subtype_by_IHC"
[25] "treatment_status"
[26] "treatment_details"
[27] "assay_ontology_term_id"
[28] "organism_ontology_term_id"
[29] "tissue_ontology_term_id"
[30] "suspension_type"
[31] "sex_ontology_term_id"
[32] "self_reported_ethnicity_ontology_term_id"
[33] "is_primary_data"
[34] "disease_ontology_term_id"
[35] "grade"
[36] "cell_type_ontology_term_id"
[37] "tissue_type"
[38] "cell_type"
[39] "assay"
[40] "disease"
[41] "organism"
[42] "sex"
[43] "tissue"
[44] "self_reported_ethnicity"
[45] "development_stage"
[46] "observation_joinid"
[47] "ident"
Let us explore some of these values. How many tissue donors are there and how many cells per donor are present in the data set?
table(sce_orig$donor_id)
CID3586 CID3838 CID3921 CID3941 CID3946 CID3948 CID3963
4596 1351 1473 266 92 1465 2672
CID4040 CID4066 CID4067 CID4290A CID4398 CID4461 CID4463
1512 2171 689 542 3733 68 217
CID4465 CID4471 CID4495 CID4513 CID4515 CID4523 CID4530N
116 641 3504 1466 419 177 292
CID4535 CID44041 CID44971 CID44991 CID45171
396 742 4366 902 1346
In how many batches were donor cells sequenced?
table(sce_orig$batch_run,sce_orig$donor_id)
CID3586 CID3838 CID3921 CID3941 CID3946 CID3948 CID3963 CID4040
1 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0
12 0 0 1473 0 0 0 2672 0
13 4596 0 0 266 0 1465 0 0
14 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 0
16 0 1351 0 0 92 0 0 1512
2 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0
CID4066 CID4067 CID4290A CID4398 CID4461 CID4463 CID4465 CID4471
1 0 0 0 3733 0 0 0 0
10 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0
12 2171 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0 0
14 0 689 542 0 0 0 0 0
15 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0
3 0 0 0 0 68 0 116 0
4 0 0 0 0 0 217 0 0
5 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 641
7 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0
CID4495 CID4513 CID4515 CID4523 CID4530N CID4535 CID44041
1 0 0 0 0 0 0 0
10 0 0 0 0 292 0 0
11 0 0 0 177 0 0 0
12 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0
15 0 0 0 0 0 396 0
16 0 0 0 0 0 0 0
2 0 0 0 0 0 0 742
3 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0
6 3504 0 0 0 0 0 0
7 0 1466 0 0 0 0 0
8 0 0 419 0 0 0 0
9 0 0 0 0 0 0 0
CID44971 CID44991 CID45171
1 0 0 0
10 0 0 0
11 0 0 0
12 0 0 0
13 0 0 0
14 0 0 0
15 0 0 0
16 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
5 4366 0 0
6 0 0 0
7 0 902 0
8 0 0 0
9 0 0 1346
What type of breast cancer did donors have?
table(sce_orig$subtype)
ER+ HER2+ TNBC
9821 10937 14456
What is the grade of the disease across donors?
table(sce_orig$donor_id,sce_orig$grade)
2 3
CID3586 0 4596
CID3838 0 1351
CID3921 0 1473
CID3941 266 0
CID3946 0 92
CID3948 0 1465
CID3963 0 2672
CID4040 0 1512
CID4066 2171 0
CID4067 689 0
CID4290A 542 0
CID4398 0 3733
CID4461 68 0
CID4463 217 0
CID4465 0 116
CID4471 641 0
CID4495 0 3504
CID4513 0 1466
CID4515 0 419
CID4523 0 177
CID4530N 292 0
CID4535 396 0
CID44041 0 742
CID44971 0 4366
CID44991 0 902
CID45171 0 1346
Which cell types are present in the data set?
table(sce_orig$celltype_minor)
Cycling T-cells NK cells NKT cells T cells CD4+
1528 1846 1122 19231
T cells CD8+
11487
What are more granular cell types?
table(sce_orig$celltype_subset)
T_cells_c0_CD4+_CCR7 T_cells_c1_CD4+_IL7R
4952 7786
T_cells_c2_CD4+_T-regs_FOXP3 T_cells_c3_CD4+_Tfh_CXCL13
4246 2247
T_cells_c4_CD8+_ZFP36 T_cells_c5_CD8+_GZMK
5063 284
T_cells_c6_IFIT1 T_cells_c7_CD8+_IFNG
1017 3168
T_cells_c8_CD8+_LAG3 T_cells_c9_NK_cells_AREG
1955 1846
T_cells_c10_NKT_cells_FCGR3A T_cells_c11_MKI67
1122 1528
What QC statistics are available for each cell? For example, check the sum of counts per cell:
$nCount_RNA[1:10] sce_orig
[1] 1357 996 1372 1245 1914 1314 2181 1230 1252 1719
Plot the distribution of the sum of counts across all cells from the data set:
ggplot(data.frame(sum=sce_orig$nCount_RNA), aes(x=sum)) +
geom_histogram(bins=40) +
scale_x_continuous(trans = 'log10')+
scale_y_continuous(trans='log10')+
xlab("Sum of counts")+
ylab("# of cells") +
theme_bw() +
theme(panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Let us have a look at reduced dimensions of the data set represented by UMAP (Uniform Manifold Approximation and Projection) coordinates. UMAP is a dimension reduction technique that can be used for the visualization similarly to t-SNE, but also for general non-linear dimension reduction.
plotReducedDim(sce_orig, dimred="UMAP",colour_by="celltype_minor")
plotReducedDim(sce_orig, dimred="UMAP",colour_by="celltype_subset")
Does it correspond to Fig. 3a from the manuscript?
Batch correction
Imagine that this is your data set, which was recently generated, and you have to analyze it from scratch in a similar way how it was presented in the publication. So far you performed the initial QC of the data set as presented in the previous session, normalized counts and ended up with the following SingleCellExperiment
object:
<- SingleCellExperiment(assays = list(counts = counts(sce_orig),
sce logcounts=logcounts(sce_orig)),
colData=colData(sce_orig)[,c("donor_id","batch_run","subtype","grade","nCount_RNA")])
We will try to re-produce some of the analysis presented in the paper.
Principal component analysis
Let us start with performing principal component analysis.
# Quantifying per gene variation
set.seed(100)
<- modelGeneVarByPoisson(sce)
model_gen_var
# Visualizing the fit
plot(model_gen_var$mean, model_gen_var$total, xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(metadata(model_gen_var)$trend(x), col="dodgerblue", add=TRUE, lwd=2)
abline(h=0.1) # this will be a threshold for the variance for HVGs
# Get top variable genes
<- model_gen_var[order(model_gen_var$bio, decreasing=TRUE),]
model_gen_var_ord <- getTopHVGs(model_gen_var_ord,fdr.threshold=0.01,var.threshold=0.1)
hvg
# Perform PCA
set.seed(100)
= runPCA(sce, exprs_values = "logcounts", ncomponents = 50,
sce subset_row = hvg)
# Visualize PCA
plotReducedDim(sce, dimred="PCA",colour_by="batch_run")
Create a UMAP
Next, we will create a UMAP.
set.seed(100)
<- runUMAP(sce, dimred="PCA") sce
What is the difference of the generated UMAP in comparison to the original plot?
<- plotReducedDim(sce, dimred="UMAP",colour_by="batch_run") + ggtitle("Re-run")
p1 <- plotReducedDim(sce_orig, dimred="UMAP",colour_by="batch_run") + ggtitle("Original")
p2 grid.arrange(p1,p2,nrow=1)
It is noticeable that the data set has a strong batch effect, which was removed by authors of the paper using the package Seurat
(see here). The batch effect refers to technical sources of variation introduced to the data during handling/preparation/processing, when data are prepared in different batches. This variation distorts measurements resulting in the separation of cells of the same type across batches.
Below you will find a figure presenting a toy example of a scRNA-Seq data set composed of three subsets originating from three different labs. The lab of origin creates a batch effect forcing cells from the same lab to cluster together (on the left). Removing the batch effect allows cells to cluster based on cell types (on the right).
The most common batch effect in human research stems from the variation between individual donors. Let us also explore sequencing depth in this data set by plotting the distribution of sum of counts per cell across batches.
ggplot(data.frame(nCount_RNA=log10(sce$nCount_RNA),batch_run=sce$batch_run), aes(x = nCount_RNA, y = batch_run)) +
geom_density_ridges() +
xlab("log10(Sum of counts)")+
ylab("# of cells")
Indeed, the distribution of sum of counts looks different for different batches. For example, the majority of cells from the batch 16 have more counts than cells from other batches. It could explain, why these cells end up in a separate cluster on the UMAP plot.
Remove batch effect
How can we get rid of the batch effect? We will apply a technique called matching mutual nearest neighbours (MNN) first proposed by Haghverdi et al. 2018. This method attempts to identify similar cells across batches after projecting them in a space of reduced dimensions (see figure below taken from Haghverdi et al. 2018).
This method has the following assumptions:
- There is at least one cell population (or cell type) that is common across all batches.
- The batch effect variation is almost orthogonal to the biological variation (no high level of confounding) or at least its effects are smaller than the biological variation effect between different cell types.
The function that implements this method in R
is called fastMNN()
from the batchelor
package.
set.seed(100)
<- fastMNN(sce, batch=sce$batch_run,subset.row=hvg)
sce_mnn
# Save corrected PC coordinates
reducedDim(sce, "MNN_Batch") <- reducedDim(sce_mnn, "corrected")
# Make UMAP with corrected values
set.seed(100)
<- runUMAP(sce_mnn, dimred="corrected")
sce_mnn
# Save corrected UMAP coordinates
reducedDim(sce, "UMAP_MNN_Batch") <- reducedDim(sce_mnn,"UMAP")
Compare now resulting UMAP with corrected coordinates to the original UMAP from the paper. Cells from different batches appear all over the UMAP plot and do not form separate clusters anymore.
<- plotReducedDim(sce, dimred="UMAP_MNN_Batch",colour_by="batch_run") + ggtitle("Re-run")
p1 <- plotReducedDim(sce_orig, dimred="UMAP",colour_by="batch_run") + ggtitle("Original")
p2 grid.arrange(p1,p2,nrow=1)
Is it a problem that the newly generated UMAP plot does not look identical to the original one?
Cell type annotation
Now we would like to investigate, what subtypes of T cells are present in the data set. One way to annotate cells in our data set is to compare their expression profiles with previously annotated reference datasets. Labels (cell types) can then be assigned to each cell based on the most similar reference sample(s).
As the reference data sets we will use two publicly available data sets describing major subtypes of CD4+ and CD8+ tumor infiltrating T cells, respectively (see Zheng et al. 2021). Seurat
object of CD4+ tumor infiltrating T cells can be retrieved from here. Seurat
object of CD8+ tumor infiltrating T cells can be retrieved from here.
To facilitate calculations, reference data sets were aggregated by averaging counts from cells of the same sample and subtype. Download resulting aggregated SummarizedExperiment
objects called zheng_et_al_CD4_agg.rds
and zheng_et_al_CD8.rds
from the course web page and put them in the directory data/scRNAseq_2
. These files can be generated by executing the following code.
# download.file("https://figshare.com/ndownloader/files/39012395", "data/scRNAseq_2/zheng_et_al_CD4.rds")
# cd4 <- readRDS("data/scRNAseq_2/zheng_et_al_CD4.rds")
# cd4_sce <- as.SingleCellExperiment(cd4)
# cd4_agg <- aggregateReference(cd4_sce,labels=cd4_sce$functional.cluster)
# saveRDS(cd4_agg,"data/scRNAseq_2/zheng_et_al_CD4_agg.rds")
# download.file("https://figshare.com/ndownloader/files/41414556", "data/scRNAseq_2/zheng_et_al_CD8.rds")
# cd8 <- readRDS("data/scRNAseq_2/zheng_et_al_CD8.rds")
# cd8_sce <- as.SingleCellExperiment(cd8)
# cd8_agg <- aggregateReference(cd8_sce,labels=cd8_sce$functional.cluster)
# saveRDS(cd8_agg,"data/scRNAseq_2/zheng_et_al_CD8_agg.rds")
Load zheng_et_al_CD4_agg.rds
and zheng_et_al_CD8_agg.rds
and explore them:
<- readRDS("data/scRNAseq_2/zheng_et_al_CD4_agg.rds")
cd4_agg cd4_agg
class: SummarizedExperiment
dim: 800 274
metadata(0):
assays(1): logcounts
rownames(800): CXCL13 CCL4 ... S100A10 GOLIM4
rowData names(0):
colnames(274): CD4.NaiveLike.1 CD4.NaiveLike.2 ... CD4.Treg.53
CD4.Treg.54
colData names(1): label
table(cd4_agg$label)
CD4.NaiveLike CD4.CTL_EOMES CD4.CTL_GNLY CD4.CTL_Exh CD4.Tfh
54 39 18 37 54
CD4.Th17 CD4.Treg
18 54
Subtypes of CD4+ T cells have the following annotation:
- NaiveLike - T cells with naive-like phenotype
- Tfh - T follicular helper cells
- Th17 - Th17 helper cells
- Treg - T regulatory cells
- CTL_EOMES - Cytotoxic CD4 T cells expressing EOMES and GZMK
- CTL_GNLY - Cytotoxic CD4 T cells expressing GNLY
- CTL_Exh - Cytotoxic CD4 T cells with exhaustion phenotype
<- readRDS("data/scRNAseq_2/zheng_et_al_CD8_agg.rds")
cd8_agg cd8_agg
class: SummarizedExperiment
dim: 800 255
metadata(0):
assays(1): logcounts
rownames(800): GNLY CCL4 ... HBA2 HBA1
rowData names(0):
colnames(255): CD8.NaiveLike.1 CD8.NaiveLike.2 ... CD8.MAIT.22
CD8.MAIT.23
colData names(1): label
table(cd8_agg$label)
CD8.NaiveLike CD8.CM CD8.EM CD8.TEMRA CD8.TPEX
40 44 44 31 29
CD8.TEX CD8.MAIT
44 23
Subtypes of CD8+ T cells have the following annotation:
- NaiveLike T cells with naive-like phenotype
- CM Central memory
- EM Effector memory
- TEMRA Effector memory re-expressing CD45RA
- TPEX Precursor exhausted T cells
- TEX Terminally exhausted T cells
- MAIT Mucosal-associated invariant T cells
Before we proceed, we have to match the annotation of features, in our case genes, between reference data sets and our data set.
# Ensembl IDs were used in our data set
rownames(sce)[1:10]
[1] "ENSG00000238009" "ENSG00000279928" "ENSG00000279457"
[4] "ENSG00000228463" "ENSG00000237094" "ENSG00000230021"
[7] "ENSG00000237491" "ENSG00000225880" "ENSG00000230368"
[10] "ENSG00000223764"
# Symbols were used in reference data sets
rownames(cd4_agg)[1:10]
[1] "CXCL13" "CCL4" "GNLY" "GZMB" "CCL5" "FOS"
[7] "NKG7" "TNFRSF4" "GZMK" "CCL20"
rownames(cd8_agg)[1:10]
[1] "GNLY" "CCL4" "HBB" "CCL4L2" "CXCL13" "XCL1" "XCL2"
[8] "FOS" "GZMB" "CCL3"
# Match gene symbols to Ensembl IDs in our data set
= AnnotationDbi::mapIds(EnsDb.Hsapiens.v86, keys=rownames(sce),column="SYMBOL", keytype="GENEID") ens_symb
Warning: Unable to map 8 of 29067 requested IDs.
1:10] ens_symb[
ENSG00000238009 ENSG00000279928 ENSG00000279457 ENSG00000228463
"RP11-34P13.7" "FO538757.2" "FO538757.1" "AP006222.2"
ENSG00000237094 ENSG00000230021 ENSG00000237491 ENSG00000225880
"RP4-669L17.10" "RP5-857K21.4" "RP11-206L10.9" "LINC00115"
ENSG00000230368 ENSG00000223764
"FAM41C" "RP11-54O7.3"
# Add the annotation to the rowData
rowData(sce)$GENEID <- names(ens_symb)
rowData(sce)$SYMBOL <- ens_symb
# Keep only genes with unique not NA symbols
<- sce[!duplicated(rowData(sce)$SYMBOL),]
sce <- sce[!is.na(rowData(sce)$SYMBOL),]
sce
# Rename rows of the SingleCellExperiment object
rownames(sce) <- rowData(sce)$SYMBOL
# Check if it worked out
rownames(sce)[1:10]
[1] "RP11-34P13.7" "FO538757.2" "FO538757.1" "AP006222.2"
[5] "RP4-669L17.10" "RP5-857K21.4" "RP11-206L10.9" "LINC00115"
[9] "FAM41C" "RP11-54O7.3"
Now we will apply the SingleR
method for cell type annotation. This method assigns labels to cells based on the reference samples with the highest Spearman rank correlations, using only the marker genes between pairs of labels to focus on the relevant differences between cell types.
<- SingleR(test = sce,
pred_agg ref = list(cd4=cd4_agg,cd8=cd8_agg),
labels = list(cd4_agg$label,cd8_agg$label))
# Explore predicted labels
table(pred_agg$labels)
CD4.CTL_EOMES CD4.CTL_Exh CD4.CTL_GNLY CD4.NaiveLike CD4.Tfh
1040 836 1225 4869 2034
CD4.Th17 CD4.Treg CD8.CM CD8.EM CD8.MAIT
271 4431 7569 2299 1070
CD8.NaiveLike CD8.TEMRA CD8.TEX CD8.TPEX
4616 1190 2858 906
# Store predicted labels in SingleCellExperiment object
$celltype_zheng_agg <- pred_agg$labels
sce
# Plot UMAP and color cells based on predicted cell types
plotReducedDim(sce, dimred="UMAP_MNN_Batch",colour_by="celltype_zheng_agg")
# Plot UMAP and color cells based on predicted cell types from the original paper
plotReducedDim(sce, dimred="UMAP_MNN_Batch",colour_by=data.frame(celltype_orig=sce_orig$celltype_subset))
# Split cells based on the cancer type
plotReducedDim(sce, dimred="UMAP_MNN_Batch",colour_by="celltype_zheng_agg") + facet_wrap(~sce$subtype,nrow=1)
The above figure indicates that CD8+ terminally exhausted T cells (CD8.TEX) are much more abundant in TNBC patients in comparison to other cancer types. By performing this analysis we were able to re-produce one of the most interesting findings concerning T cells from the original paper. See the citation from the paper below:
Exercise: Re-produce the stacked barplot from Fig. 3a from the original manuscript using the new annotation of cell types.
<- as.data.frame(table(sce$subtype,sce$celltype_zheng_agg))
df ggplot(data=df, aes(x=Var1, y=Freq, fill=Var2)) +
geom_bar(stat="identity",position="fill")
Clustering and detecting marker genes
Imagine that you can not find any appropriate reference data set, but you would still need to annotate your cells. Then you can cluster your cells, find marker genes of each cluster and study them.
Clustering
Clustering aims to identify cell populations with similar expression profiles. These populations, i.e. clusters, are assumed to reflect distinct biological cell types (or subtypes) present in the dataset. It is important to keep in mind that defining the desired resolution of the clustering procedure (i.e the ideal number of clusters) is often subjective and context-dependent.
For clustering we will apply a graph-based approach. It models cell populations as nodes on a network and tries to identify well connected node communities (cell groups). Graph (network) construction is based on drawing edges (connections) between nodes (cells) according to some predefined measure of node to node similarity. A wide range of community-detection algorithms is then available to partition the resulting graph in communities of cells that are better connected to each other then to the rest of the cells.
Graph-based approaches have become extremely popular in the context of scRNA-Seq analysis, because they tend to be robust to unwanted variance that produces continuous distortions to the signal (such as sampling variance). They make no assumptions on the shape or the distribution of cells within clusters and they are scalable.
We will now build a Shared Nearest Neighbour Graph (SNN graph) using scran
. In such a graph two cells are connected only if at least one of their k
nearest neighbours is common.
<- 5
kc set.seed(100)
<- buildSNNGraph(sce, k=kc, use.dimred = "MNN_Batch") gf_corrected
We will then identify clusters using a community detection method from the package igraph
.
<- factor(cluster_louvain(gf_corrected)$membership)
clusters $clusters <- clusters
sceplotReducedDim(sce, dimred="UMAP_MNN_Batch",colour_by="clusters")
plotReducedDim(sce, dimred="UMAP_MNN_Batch",colour_by="clusters") + facet_wrap(~sce$subtype,nrow=1)
Exercise: How will clustering change if you increase or decrease the value of the parameter k
?
Finding marker genes
Here, we will use the scoreMarkers
function from the scran
package to identify marker genes, i.e. genes differentially expressed in each cluster. We have observed in this data set a strong batch effect. It is therefore meaningful to include the factor describing it into scoreMarkers
call as block
argument.
<- scran::scoreMarkers(sce, groups = sce$clusters,
markers block = sce$batch_run,
assay.type="logcounts")
The function returns a list of data frames containing statistics about marker genes for each cluster. Explore statistics about marker genes for cluster 1:
head(markers[[1]])
DataFrame with 6 rows and 19 columns
self.average other.average self.detected other.detected
<numeric> <numeric> <numeric> <numeric>
RP11-34P13.7 0.000484738 -0.000177503 0.000389536 0.000122165
FO538757.2 -0.000248498 0.000118197 0.000000000 0.000359458
FO538757.1 0.013729176 0.017727509 0.004601125 0.006908780
AP006222.2 0.012866114 0.012334720 0.005716305 0.006858298
RP4-669L17.10 -0.000223192 0.000554051 0.000000000 0.000390457
RP5-857K21.4 0.000196388 0.000186872 0.000116964 0.000260553
mean.logFC.cohen min.logFC.cohen median.logFC.cohen
<numeric> <numeric> <numeric>
RP11-34P13.7 0.019421236 -0.0043064 0.023138296
FO538757.2 -0.008307062 -0.0374611 -0.001708197
FO538757.1 -0.010235590 -0.1393497 -0.016756129
AP006222.2 -0.008345321 -0.0888047 -0.000383154
RP4-669L17.10 0.000814364 -0.0474061 0.007084308
RP5-857K21.4 0.006255245 -0.0213692 0.014080842
max.logFC.cohen rank.logFC.cohen mean.AUC min.AUC
<numeric> <integer> <numeric> <numeric>
RP11-34P13.7 0.0358773 1544 0.500209 0.499709
FO538757.2 0.0000000 8490 0.499827 0.499193
FO538757.1 0.0941306 1977 0.494178 0.455037
AP006222.2 0.0431537 2298 0.499255 0.489974
RP4-669L17.10 0.0183923 3984 0.499641 0.498499
RP5-857K21.4 0.0248822 2084 0.500033 0.499686
median.AUC max.AUC rank.AUC mean.logFC.detected
<numeric> <numeric> <integer> <numeric>
RP11-34P13.7 0.500258 0.500555 1003 0.1785225
FO538757.2 0.499966 0.500000 7865 -0.1241229
FO538757.1 0.496771 0.507437 1999 -0.1134618
AP006222.2 0.500626 0.502524 2034 -0.1031370
RP4-669L17.10 0.499884 0.500164 5711 0.0657416
RP5-857K21.4 0.500126 0.500283 1659 -0.0135459
min.logFC.detected median.logFC.detected
<numeric> <numeric>
RP11-34P13.7 -0.189701 0.1941676
FO538757.2 -0.532366 -0.0084530
FO538757.1 -1.168715 -0.1468747
AP006222.2 -1.097909 -0.0531754
RP4-669L17.10 -0.375317 0.0743984
RP5-857K21.4 -0.413718 0.0896409
max.logFC.detected rank.logFC.detected
<numeric> <integer>
RP11-34P13.7 6.32325e-01 467
FO538757.2 5.87737e-17 8250
FO538757.1 6.18295e-01 2025
AP006222.2 4.42284e-01 2564
RP4-669L17.10 5.09743e-01 3334
RP5-857K21.4 1.96494e-01 600
Explore top 10 marker genes for each cluster using the ranking by mean.logFC.cohen
measure. This measure corresponds to Cohen’s d
, which is a standardized log-fold change, where the difference in the mean log-expression between groups is scaled by the average standard deviation across groups. In other words, it is the number of standard deviations that separate the means of the two groups. The interpretation is similar to the log-fold change:
- positive values indicate that the gene is upregulated in our cluster of interest,
- negative values indicate downregulation,
- values close to zero indicate that there is little difference.
<- lapply(markers,function(x) {
markers_top10 <- order(x$mean.logFC.cohen,decreasing=TRUE)
ord rownames(x[ord,][1:10,])
})
Create a heatmap of unique marker genes:
<- unique(unlist(markers_top10))
markers_top10_merged ::plotHeatmap(sce, features = markers_top10_merged,
scatercluster_cols = FALSE,
order_columns_by="clusters",
center=TRUE,zlim=c(-3,3))
Exercise: Explore marker genes of the cluster 5. Would you manage to annotate cells from this cluster?
Differential expression analysis
For ER+ patients cluster 4 is mostly composed of CD8+ effector memory T cells (CD8.EM):
<- sce[,which(sce$subtype=="ER+" & sce$clusters==4)]
sce_er table(sce_er$celltype_zheng_agg)
CD4.CTL_EOMES CD4.CTL_Exh CD4.CTL_GNLY CD4.Tfh CD4.Th17
35 3 75 3 3
CD4.Treg CD8.CM CD8.EM CD8.MAIT CD8.NaiveLike
3 149 384 7 3
CD8.TEMRA CD8.TEX CD8.TPEX
55 119 42
CD8+ effector memory T cells are crucial for anti-tumor immunity and most forms of effective cancer immunotherapy involve CD8+ T cell effector function (see more here). Let us investigate if there are any differentially expressed (DE) genes in cells from cluster 4 between 2nd and 3rd grade of ER+ patients.
table(sce_er$donor_id,sce_er$grade)
2 3
CID3586 0 0
CID3838 0 0
CID3921 0 0
CID3941 35 0
CID3946 0 0
CID3948 0 38
CID3963 0 0
CID4040 0 66
CID4066 0 0
CID4067 62 0
CID4290A 45 0
CID4398 0 416
CID4461 7 0
CID4463 64 0
CID4465 0 0
CID4471 32 0
CID4495 0 0
CID4513 0 0
CID4515 0 0
CID4523 0 0
CID4530N 84 0
CID4535 32 0
CID44041 0 0
CID44971 0 0
CID44991 0 0
CID45171 0 0
The DE analysis for scRNA-Seq data sets is usually performed on “pseudo-bulk” expression profiles (Tung et al. 2017), which are generated by summing counts for all cells with the same combination of label and sample. We have already subset the SingleCellEperiment
object by keeping only cells from ER+ patients originating from cluster 4. We should only aggregate them by donor IDs:
<- aggregateAcrossCells(sce_er, id=colData(sce_er)$donor_id)
sce_er_agg colnames(sce_er_agg) <- sce_er_agg$donor_id
Investigate if we have to account for the batch effect. Each sample was run in a separate batch, so no correction can be done.
table(sce_er_agg$donor_id,sce_er_agg$batch_run)
1 10 11 12 13 14 15 16 2 3 4 5 6 7 8 9
CID3586 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID3838 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID3921 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID3941 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
CID3946 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID3948 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
CID3963 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4040 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
CID4066 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4067 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
CID4290A 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
CID4398 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4461 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
CID4463 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
CID4465 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4471 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
CID4495 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4513 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4515 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4523 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4530N 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID4535 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
CID44041 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID44971 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID44991 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CID45171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Further analysis follows the same steps as it was introduced in the session 5 dedicated to the differential expression analysis.
Create a DESeq dataset.
<- DESeqDataSet(sce_er_agg, design = ~ grade) dds
converting counts to integer mode
Pre-filter the dataset.
# Keep only genes that have at least 3 counts across all samples
<- rowSums(counts(dds)) >= 3
keep table(keep)
keep
FALSE TRUE
17974 11082
<- dds[keep,]
dds
# Discard donors having less than 20 cells
<- dds$ncells < 20
discarded table(discarded)
discarded
FALSE TRUE
10 1
<- dds[,!discarded] dds
Estimate size factors for each sample.
<- estimateSizeFactors(dds)
dds summary(dds$sizeFactor)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.3953 0.6000 0.8021 1.4918 1.6066 4.4645
Estimate dispersion for genes.
<- estimateDispersions(dds) dds
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
plotDispEsts(dds)
Fit the model.
<- DESeq(dds) dds
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 10 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
Test for the differential expression.
<- results(dds)
res <- order(res$padj)
ord head(res[ord,])
log2 fold change (MLE): grade 3 vs 2
Wald test p-value: grade 3 vs 2
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
RP11-291B21.2 14.744312 3.43673 0.658760 5.216968
IFI44L 7.556052 3.10101 0.796644 3.892588
CUTA 18.685625 -1.62536 0.447601 -3.631269
JUND 30.354067 2.74219 0.755759 3.628395
FO538757.1 2.999083 -0.96239 0.899932 -1.069403
AP006222.2 0.540472 -0.60270 1.670749 -0.360736
pvalue padj
<numeric> <numeric>
RP11-291B21.2 1.81876e-07 0.00201373
IFI44L 9.91806e-05 0.54906380
CUTA 2.82031e-04 0.78940106
JUND 2.85188e-04 0.78940106
FO538757.1 2.84888e-01 0.99964808
AP006222.2 7.18297e-01 0.99964808
There was only one long noncoding RNA (lncRNA) RP11-291B21.2 found to be significantly (with adjusted p-value < 0.05) upregulated in ER+ patients with grade 3 in comparison to grade 2. Interestingly, RP11-291B21.2 was found to be dominantly expressed in CD8+ T cells, especially in exhausted T cells, see here.
Let us plot the expression of RP11-291B21.2 in aggregated samples split by the grade.
<- data.frame(expr=counts(dds,normalized=TRUE)["RP11-291B21.2",],
df grade=dds$grade)
head(df)
expr grade
CID3941 1.787300 2
CID3948 65.794164 3
CID4040 24.336572 3
CID4067 2.772643 2
CID4290A 2.529630 2
CID4398 31.582642 3
ggplot(df,aes(x=grade,y=expr)) +
geom_dotplot(binaxis='y', stackdir='center',dotsize = 0.3)
Bin width defaults to 1/30 of the range of the data. Pick better
value with `binwidth`.
Exercise: Plot the expression of RP11-291B21.2 in individual cells used for this analysis and split by the grade.
<- data.frame(expr=logcounts(sce_er)["RP11-291B21.2",],
df cell_type=sce_er$celltype_zheng_agg,
grade=sce_er$grade)
head(df)
expr cell_type grade
CID4461_CGTCAGGCAACTGGCC 0 CD4.CTL_GNLY 2
CID4461_GCATACAAGGATGCGT 0 CD4.CTL_GNLY 2
CID4461_TCGTACCGTCTAGGTT 0 CD8.CM 2
CID4463_AGTGAGGGTTTGGGCC 0 CD8.CM 2
CID4463_CAGTCCTTCCCTGACT 0 CD8.EM 2
CID4463_GGCTGGTGTCTAGAGG 0 CD4.CTL_GNLY 2
ggplot(df,aes(x=grade,y=expr)) +
geom_jitter(aes(colour = cell_type))