# loading the libraries
suppressPackageStartupMessages({
library(tidyverse)
library(DT)
library(tximeta)
library(macrophage)
library(SummarizedExperiment)
library(DESeq2)
})
mRNASeq 3: Differential Gene Expression with DESeq2
Goal
Identify differentially expressed (DE) genes between experimental conditions.
- Data Preparation
- Differential expression analysis
- Plotting results
- Annotating and exporting results
Data Preparation
We continue to use the data from the previous lecture. The package macrophage provides the output of running Salmon on a set of 24 RNA-seq samples from Alasoo, et al. “Shared genetic effects on chromatin and gene expression indicate a role for enhancer priming in immune response”, published in Nature Genetics, in January 2018.
They profiled gene expression (with RNA-seq) in a subset of 86 successfully differentiated lines in four experimental conditions:
- naive
- 18-h stimulation with the cytokine IFNγ
- 5-h infection with Salmonella enterica serovar Typhimurium (Salmonella)
- IFNγ stimulation followed by Salmonella infection.
We start by importing and preparing the data (using the code from last lecture)
# preparing table with samples
<- system.file("extdata", package = "macrophage")
dir <- read_csv(file.path(dir, "coldata.csv")) |>
coldata ::select(c(names, sample_id, line_id, condition_name)) dplyr
Rows: 24 Columns: 13
── Column specification ──────────────────────────────────────────────
Delimiter: ","
chr (10): names, sample_id, line_id, condition_name, macrophage_ha...
dbl (3): replicate, ng_ul_mean, rna_auto
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
coldata
# A tibble: 24 × 4
names sample_id line_id condition_name
<chr> <chr> <chr> <chr>
1 SAMEA103885102 diku_A diku_1 naive
2 SAMEA103885347 diku_B diku_1 IFNg
3 SAMEA103885043 diku_C diku_1 SL1344
4 SAMEA103885392 diku_D diku_1 IFNg_SL1344
5 SAMEA103885182 eiwy_A eiwy_1 naive
6 SAMEA103885136 eiwy_B eiwy_1 IFNg
7 SAMEA103885413 eiwy_C eiwy_1 SL1344
8 SAMEA103884967 eiwy_D eiwy_1 IFNg_SL1344
9 SAMEA103885368 fikt_A fikt_3 naive
10 SAMEA103885218 fikt_B fikt_3 IFNg
# ℹ 14 more rows
# check the number of replicates
# or with stats::xtabs(~ condition_name, data=coldata)
|>
coldata group_by(condition_name) |>
summarize(n_replicates=n())
# A tibble: 4 × 2
condition_name n_replicates
<chr> <int>
1 IFNg 6
2 IFNg_SL1344 6
3 SL1344 6
4 naive 6
# adding file path
<- coldata |>
coldata mutate(files = file.path(dir, "quants", names, "quant.sf.gz"))
# read in the quantification
<- tximeta(coldata = coldata, type = "salmon", dropInfReps = TRUE) se
# summarize on gene level
<- summarizeToGene(se) seg
# simplify gene ids (ensembl)
# "ENSG00000000003.14" -> "ENSG00000000003"
# for info how to use regular expression see `?base::regex`
rownames(seg) <- str_replace(rownames(seg), "\\.[:digit:]+$", "")
library(org.Hs.eg.db)
# add gene symbol
<- addIds(seg, "SYMBOL")
seg # add GO annotation, handle 1:many mappings
<- addIds(seg, "GO", multiVals = "list") seg
We now create a DESeqDataSet
object, which is an extension of SummarizedExperiment
and it is the main data-container for DESeq2 package. We will use it to identify the differentially expressed genes across the conditions.
Before we do that, we first convert the column condition_name
to factor. In this step we can also define the reference condition by setting it as a first level.
# convert condition_name to a factor with defined ("ordered") levels
$condition_name <- factor(seg$condition_name,
seglevels=c("naive", "IFNg", "SL1344", "IFNg_SL1344"))
$condition_name seg
[1] naive IFNg SL1344 IFNg_SL1344 naive
[6] IFNg SL1344 IFNg_SL1344 naive IFNg
[11] SL1344 IFNg_SL1344 naive IFNg SL1344
[16] IFNg_SL1344 naive IFNg SL1344 IFNg_SL1344
[21] naive IFNg SL1344 IFNg_SL1344
Levels: naive IFNg SL1344 IFNg_SL1344
We specify the experimental design (experimental groups) by setting the argument design
.
We can use the formula interface ~ condition_name
to model the observed expression levels of every gene by condition definition by the factor condition_name
.
<- DESeqDataSet(seg, design = ~ condition_name) dds
using counts and average transcript lengths from tximeta
dds
class: DESeqDataSet
dim: 58294 24
metadata(8): tximetaInfo quantInfo ... assignRanges version
assays(3): counts abundance avgTxLength
rownames(58294): ENSG00000000003 ENSG00000000005 ...
ENSG00000285993 ENSG00000285994
rowData names(4): gene_id tx_ids SYMBOL GO
colnames(24): SAMEA103885102 SAMEA103885347 ... SAMEA103885308
SAMEA103884949
colData names(4): names sample_id line_id condition_name
Pre-filtering the dataset
The count matrix within DESeqDataSet
typically contains many rows with only zeros (in some other cases NA
s, representing missing values), and additionally many rows with only very few counts. By removing these genes with no information or nearly no information, we can reduce the size of the object and increase the speed of transformation and testing functions.
Please note, that more strict filtering to increase statistical power is automatically applied via independent filtering on the mean of normalized counts within the results
function.
We can apply minimal filtering by removing rows (genes) with less that 10 counts across samples.
nrow(dds)
[1] 58294
<- rowSums(counts(dds)) >= 10
keep sum(keep)
[1] 29060
<- dds[keep,]
dds nrow(dds)
[1] 29060
Differential expression analysis
Pipeline
As we have already specified the experimental design in the “condition_name” variable of colData(dds)
, we can proceed with the differential expression analysis on the raw, not-normalized, counts by calling function DESeq
. The applied procedure can be summarized into these three steps:
- Estimation of size factors
- controlling for differences in the sequencing depth of the samples
dds <- estimateSizeFactors(dds)
- Estimation of dispersion values for each gene
- moderated version, borrows information from other genes
dds <- estimateDispersions(dds)
- Fitting a generalized linear model and Wald test or LRT (likelihood ratio test)
- using the negative binomial distribution
dds <- nbinomWaldTest(dds)
<- DESeq(dds) dds
estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
dds
class: DESeqDataSet
dim: 29060 24
metadata(8): tximetaInfo quantInfo ... assignRanges version
assays(7): counts abundance ... H cooks
rownames(29060): ENSG00000000003 ENSG00000000419 ...
ENSG00000285992 ENSG00000285994
rowData names(34): gene_id tx_ids ... deviance maxCooks
colnames(24): SAMEA103885102 SAMEA103885347 ... SAMEA103885308
SAMEA103884949
colData names(4): names sample_id line_id condition_name
Estimation of size factors
Default method provided in DESeq2 uses the standard median ratio
method introduced in DESeq package. The size factor is the median ratio of the sample over a “pseudosample”: for each gene, the geometric mean of all samples. It defines size factors as being the median of these ratios for each sample. Median is used so any outlier genes will not affect the normalization.
Moderated estimation of dispersion
We typically do not have high enough number of samples in groups to directly estimate the dispersion. Therefore we are borrowing the information from other genes (fitted line in the plot below) and shrinking the estimates of individual genes towards trend. Resulting dispersion estimate is more robust and help us better detect true differentially expressed genes. This procedure is sometimes called “moderated” estimation of dispersion.
Plotting the dispersion estimates is a useful diagnostic. The dispersion plot below shows the final estimates shrunk from the gene-wise estimates towards the fitted line. Some gene-wise estimates are flagged as outliers and are not shrunk towards the fitted value, (this outlier detection is described in the manual page for estimateDispersionsMAP
). The amount of shrinkage can be more or less seen here, depending on the sample size, the number of coefficients, the row mean and the variability of the gene-wise estimates.
plotDispEsts(dds)
Fitting the model and testing for DE
The DESeq
function fits the generelized linear model using the negative binomial distribution (with dispersion estimate from the step before) and test for differences across compared groups.
DESeq2 offers two kinds of hypothesis tests: the Wald test, where it uses the estimated standard error of a log2 fold change to test if it is equal to zero, and the likelihood ratio test (LRT). The LRT examines two models for the counts, a full model with a certain number of terms and a reduced model, in which some of the terms of the full model are removed. The test determines if the increased likelihood of the data using the extra terms in the full model is more than expected if those extra terms are truly zero.
The LRT can test multiple terms at once and is similar to analysis of variance (ANOVA) in classical linear regression, however here it uses analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model.
Results table
We can extract the differential expression results by calling the results
function on the DESeqDataSet
object. It will extract the log2 fold changes and p-values for the last variable in the design formula. As our design formula uses only the “condition_name” variable (that has 4 levels: “naive”, “IFNg”, “SL1344”, “IFNg_SL1344”), the following function will extract the results table for a comparison of the last level (“IFNg_SL1344”) over the first level (“naive”).
<- results(dds)
res head(res)
log2 fold change (MLE): condition name IFNg SL1344 vs naive
Wald test p-value: condition name IFNg SL1344 vs naive
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000000003 171.782 -0.0477888 0.617722 -0.0773629
ENSG00000000419 967.527 0.1612613 0.099028 1.6284426
ENSG00000000457 681.637 1.4934804 0.272596 5.4787322
ENSG00000000460 263.282 -2.1884074 0.260095 -8.4138824
ENSG00000000938 2646.887 3.7391200 0.752400 4.9695904
ENSG00000000971 3045.742 4.6401864 0.555993 8.3457679
pvalue padj
<numeric> <numeric>
ENSG00000000003 9.38335e-01 9.64720e-01
ENSG00000000419 1.03431e-01 1.93139e-01
ENSG00000000457 4.28384e-08 2.83016e-07
ENSG00000000460 3.96658e-17 8.18852e-16
ENSG00000000938 6.70945e-07 3.67471e-06
ENSG00000000971 7.07527e-17 1.41107e-15
Description of individual columns
The column
baseMean
is the average of the normalized count values, divided by the size factors, taken over all samples in theDESeqDataSet
.The remaining four columns refer to a specific contrast, namely the comparison of the “IFNg SL1344” level over the “naive” for the factor variable condition_name.
The column
log2FoldChange
is the effect size estimate. It represents how much the gene expression is different between “IFNg SL1344” level and the “naive” condition.The uncertainty associated with estimating
log2FoldChange
is available in the columnlfcSE
, the standard error estimate for the log2 fold change estimate.For the Wald test,
stat
is the Wald statistic: thelog2FoldChange
divided bylfcSE
, which is compared to a standard normal distribution to generate a two-tailed p-values.For each gene, DESeq2 performs a hypothesis test to see whether evidence is sufficient to decide against the null hypothesis that there is zero effect of the infection on the gene and that the observed difference between infection and control was merely caused by experimental variability. It reports a p-value that indicates the probability that a fold change as strong as the observed one, or even stronger, would be seen under the situation described by the null hypothesis.
We can also specify a particular contrast of our interest in the ‘contrast’ argument. Note that the lfcThreshold = 0 & alpha = 0.1
are default values. The DESeq2 software automatically performs independent filtering that maximizes the number of genes with adjusted p-value less than a critical value (by default, alpha is set to 0.1). lfcThreshold = 0
corresponding to a test that the log2 fold changes are equal to zero.
<- results(dds,
res contrast=c("condition_name","IFNg_SL1344","naive"),
lfcThreshold = 0,
alpha = 0.1)
head(res)
log2 fold change (MLE): condition_name IFNg_SL1344 vs naive
Wald test p-value: condition name IFNg SL1344 vs naive
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000000003 171.782 -0.0477888 0.617722 -0.0773629
ENSG00000000419 967.527 0.1612613 0.099028 1.6284426
ENSG00000000457 681.637 1.4934804 0.272596 5.4787322
ENSG00000000460 263.282 -2.1884074 0.260095 -8.4138824
ENSG00000000938 2646.887 3.7391200 0.752400 4.9695904
ENSG00000000971 3045.742 4.6401864 0.555993 8.3457679
pvalue padj
<numeric> <numeric>
ENSG00000000003 9.38335e-01 9.65223e-01
ENSG00000000419 1.03431e-01 1.93139e-01
ENSG00000000457 4.28384e-08 2.83016e-07
ENSG00000000460 3.96658e-17 8.18852e-16
ENSG00000000938 6.70945e-07 3.67471e-06
ENSG00000000971 7.07527e-17 1.41107e-15
The output ‘res’ is a DataFrame object and carries metadata with information on the meaning of the columns:
mcols(res, use.names = TRUE)
DataFrame with 6 rows and 2 columns
type description
<character> <character>
baseMean intermediate mean of normalized c..
log2FoldChange results log2 fold change (ML..
lfcSE results standard error: cond..
stat results Wald statistic: cond..
pvalue results Wald test p-value: c..
padj results BH adjusted p-values
Summary of the results
summary(res)
out of 29060 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 7333, 25%
LFC < 0 (down) : 5740, 20%
outliers [1] : 291, 1%
low counts [2] : 1127, 3.9%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Note that there are many genes with differential expression due to infection at the FDR level of 10%. This number is dependent on alpha
and lfcThreshold
arguments:
alpha = 0.1
corresponds to the significance cut-off used for optimizing the independent filtering. The above results are at the FDR level of 10%. By specifyingalpha = 0.05
, we can lower the false discovery rate to 5%, thereby being more strict about which set of genes are considered significant.
table(res$padj < 0.1)
FALSE TRUE
14569 13073
.05 <- results(dds, alpha = 0.05)
restable(res.05$padj < 0.05)
FALSE TRUE
14712 11804
lfcThreshold = 0
corresponds to a test that the log2 fold changes for the contrast are equal to zero. By specifyinglfcThreshold = 1
, we test for genes that show significant effects of treatment on gene counts more than doubling or less than halving, because 2^1=2. Thereby being more strict about which set of genes are considered significant. A combination of stricterlfcThreshold
andalpha
gives a more narrow list of genes that are considered significantly changed.
<- results(dds, alpha = 0.05, lfcThreshold=1)
res1 table(res1$padj < 0.05)
FALSE TRUE
20923 4470
Further: In case all counts for a particular gene were zeros, or it was excluded from analysis because it contained an extreme count outlier, DESeq will report the p-value of the corresponding gene as NA
.
For more information, see the outlier detection section of the DESeq2 vignette.
Multiple testing
As you would have noticed we have been using padj
and not pvalue
column of results. In high-throughput experiments, we correct for multiple testing and not use the p values directly as evidence against the null hypothesis.
If we were to use threshold on pvalue
(instead of padj
), there would be 13163 genes with a p value below 0.05 among the 28769 genes for which the test succeeded in reporting a p-value:
# Using Benjamini-Hochberg (BH) adjusted p value
sum(res$padj < 0.05, na.rm=TRUE)
[1] 11737
# Using p value
sum(res$pvalue < 0.05, na.rm=TRUE)
[1] 13163
# Non NA p values reported
sum(!is.na(res$pvalue))
[1] 28769
Now, assume for a moment that the null hypothesis is true for all genes, i.e., no gene is affected by the infection. By the definition of the p-value, we expect up to 5% of the genes to have a p-value below 0.05. This amounts to 1438.45 genes. If we just considered the list of genes with a p-value below 0.05 as differentially expressed, this list should therefore be expected to contain up to 1438.45/ 13163 = 10.9% false positives.
DESeq2 uses by default the Benjamini-Hochberg (BH) adjustment (Benjamini and Hochberg 1995) as implemented in the base R p.adjust
function; in brief, this method calculates for each gene an adjusted p-value that answers the following question: if one called significant all genes with an adjusted p-value less than or equal to this adjusted p-value threshold, what would be the fraction of false positives (the false discovery rate, FDR) among them, in the sense of the calculation outlined above? These values, called the BH-adjusted p-values, are given in the column padj of the res
object. (Example: By using FDR value of 5% we expect a maximum of 5% false positives among genes with FDR < 5%.)
As we are often interested in reporting or focusing on a set of “the most” interesting genes, we would like to put an upper bound on the percent of false positives in this set. For that, FDR is a useful statistic.
Let’s go back to our original results. If we consider a fraction of 10% false positives acceptable, we can consider all genes with an adjusted p-value below 10% = 0.1 as significant. How many such genes are there?
sum(res$padj < 0.1, na.rm=TRUE)
[1] 13073
We subset the results table to these genes and then sort it by the log2 fold change to get the 10 significant genes with the strongest down-regulation:
<- subset(res, padj < 0.1)
resSig head(resSig[ order(resSig$log2FoldChange), ], n=10)
log2 fold change (MLE): condition_name IFNg_SL1344 vs naive
Wald test p-value: condition name IFNg SL1344 vs naive
DataFrame with 10 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000186439 96.2517 -9.43725 1.166455 -8.09054
ENSG00000224968 28.3890 -7.83434 0.968664 -8.08778
ENSG00000172399 716.1258 -7.52285 0.868495 -8.66194
ENSG00000267303 21.9876 -7.49600 2.478208 -3.02477
ENSG00000230387 15.0181 -7.30843 1.276882 -5.72365
ENSG00000213876 26.5609 -7.10450 0.973074 -7.30109
ENSG00000282608 1600.4708 -7.08214 0.653736 -10.83334
ENSG00000168329 43.9092 -6.63120 0.817875 -8.10784
ENSG00000255819 37.0581 -6.59499 0.885527 -7.44753
ENSG00000176659 54.3549 -6.54496 0.951709 -6.87705
pvalue padj
<numeric> <numeric>
ENSG00000186439 5.94023e-16 1.06210e-14
ENSG00000224968 6.07625e-16 1.08361e-14
ENSG00000172399 4.63793e-18 1.03139e-16
ENSG00000267303 2.48825e-03 7.36158e-03
ENSG00000230387 1.04260e-08 7.56221e-08
ENSG00000213876 2.85453e-13 3.71842e-12
ENSG00000282608 2.39265e-27 1.16439e-25
ENSG00000168329 5.15265e-16 9.30305e-15
ENSG00000255819 9.51049e-14 1.30856e-12
ENSG00000176659 6.11032e-12 6.71577e-11
…and 10 with the strongest up-regulation:
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ], n=10)
log2 fold change (MLE): condition_name IFNg_SL1344 vs naive
Wald test p-value: condition name IFNg SL1344 vs naive
DataFrame with 10 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
ENSG00000233816 41.5227 23.7682 3.841298 6.18755
ENSG00000182393 8135.6863 14.8851 0.853324 17.44368
ENSG00000269826 1299.8459 14.2025 1.300911 10.91732
ENSG00000197919 250.9935 12.9056 2.187350 5.90010
ENSG00000113302 20674.7997 12.4956 0.837587 14.91853
ENSG00000126353 20504.9414 11.8942 0.853637 13.93354
ENSG00000279965 232.2369 11.8268 1.005227 11.76529
ENSG00000213886 2076.7114 11.7119 0.853912 13.71554
ENSG00000011590 363.2393 11.6376 1.110763 10.47715
ENSG00000138755 343321.6407 11.4230 0.935540 12.21009
pvalue padj
<numeric> <numeric>
ENSG00000233816 6.11051e-10 5.22446e-09
ENSG00000182393 3.84525e-68 3.03687e-65
ENSG00000269826 9.52682e-28 4.76203e-26
ENSG00000197919 3.63282e-09 2.78631e-08
ENSG00000113302 2.49706e-50 6.06338e-48
ENSG00000126353 3.96241e-44 6.63812e-42
ENSG00000279965 5.89263e-32 4.13411e-30
ENSG00000213886 8.19493e-43 1.19223e-40
ENSG00000011590 1.10013e-25 4.75154e-24
ENSG00000138755 2.74596e-34 2.23246e-32
Plotting the data and the results
Counts plot
The plotCounts
function allows a quick visualization of normalized counts for a single gene over conditions.
<- rownames(res)[which.min(res$padj)]
topGene plotCounts(dds, gene = topGene, intgroup=c("condition_name"))
We can also make similar plots using the ggplot function from the ggplot2 package to plot normalized counts.
<- plotCounts(dds, gene = topGene, intgroup = c("condition_name"), returnData = TRUE)
geneCounts ggplot(geneCounts, aes(x = condition_name, y = count, colour = condition_name)) +
geom_point(position=position_jitter(width=0.1, height=0, seed=123), size=4) +
scale_y_log10() +
labs(x="group", y="normalized count") +
scale_colour_discrete(guide="none") +
theme_bw(base_size=12)
MA-plot
An MA-plot (Dudoit et al. 2002) provides a useful overview for the distribution of the estimated coefficients in the model, e.g. the comparisons of interest, across all genes.
- y-axis: “M” stands for “minus” – subtraction of log values is equivalent to the log of the ratio
- x-axis: “A” stands for “average”.
The plot is also referred to as a mean-difference plot.
<- results(dds, contrast=c("condition_name","IFNg_SL1344","naive"))
res.noshr plotMA(res.noshr)
The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by size factor is shown on the x-axis. Each gene is represented with a dot. Genes with an adjusted p value below a threshold (here 0.1, the default) are shown in red.
As can be seen the on the left side of the MA-plot, log2 fold changes from genes with very low counts and highly variable counts lead to high vertical spread of points.
We can use the lfcShrink
function to shrink the log2 fold changes. In short, lfcShrink
looks at the largest fold changes that are not due to low counts and uses these to inform a prior distribution. The large fold changes from genes with lots of statistical information are not shrunk. Imprecise fold changes or any non-informative differences in the small count genes are shrunk. This allows you to compare all estimated LFC across experiments.
There are three types of shrinkage estimators in DESeq2, which are covered in the DESeq2 vignette. Here we specify the apeglm
method for shrinking coefficients, which is good for shrinking the noisy LFC estimates while giving low bias LFC estimates for true large differences (Zhu, Ibrahim, and Love 2018).
library("apeglm")
resultsNames(dds)
[1] "Intercept"
[2] "condition_name_IFNg_vs_naive"
[3] "condition_name_SL1344_vs_naive"
[4] "condition_name_IFNg_SL1344_vs_naive"
<- lfcShrink(dds,
res coef="condition_name_IFNg_SL1344_vs_naive",
type="apeglm")
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(res)
We can also label individual points on the MA-plot. Here we use the with R function to plot a circle and text for a selected row of the results object. Within the with function, only the baseMean and log2FoldChange values for the selected rows of res are used.
plotMA(res)
<- "ENSG00000000938"
selGene with(res[selGene, ], {
points(baseMean, log2FoldChange, col="black", cex=2, lwd=2)
text(baseMean, log2FoldChange, selGene, pos=2, col="black")
})
Another useful diagnostic plot is the histogram of the p values (figure below). This plot is best formed by excluding genes with very small counts, which otherwise generate spikes in the histogram.
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
col = "grey50", border = "white")
This is a very good looking plot. We have (on the surface) a set of well-behaved p-values.
That flat distribution along the bottom is all our null p-values, which are uniformly distributed between 0 and 1. Why are null p-values uniformly distributed? Because that’s part of a definition of a p-value: under the null, it has a 5% chance of being less than .05, a 10% chance of being less than .1, etc. This describes a uniform distribution.
That peak close to 0 is where our alternative hypotheses live (these are the p-values for the genes for whom the null hypothesis was rejected), along with some potential false positives.
Gene clustering (heatmap)
For plotting of genes in a heatmap, or general in visualization we do not work with original raw counts but we prefer to use transformed normalized values. DESeq2 package offers multiple possibilities how to obtain them, below we will use vst
(variance stabilizing transformation).
<- varianceStabilizingTransformation(dds, blind = TRUE) vsd
We can select of highly variable genes across the dataset and perform clustering for these selected genes. Here, for demonstration, let us select the 20 genes with the highest variance across samples.
library(genefilter)
Attaching package: 'genefilter'
The following objects are masked from 'package:MatrixGenerics':
rowSds, rowVars
The following objects are masked from 'package:matrixStats':
rowSds, rowVars
The following object is masked from 'package:readr':
spec
<- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20) topVarGenes
Rather than simply looking at the absolute expression of these selected genes, the heatmap becomes more interesting if we look at the amount by which each gene deviates in a specific sample from the gene average across all samples. Therefore, we center each genes’ values across samples, and plot a heatmap (figure below). We provide a data.frame that instructs the pheatmap
function how to label the columns.
<- assay(vsd)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
mat <- as.data.frame(colData(vsd)[, c("line_id","condition_name")])
anno library(pheatmap)
pheatmap(mat, annotation_col = anno)
Annotating and exporting results
So far, our result table contains only Ensembl gene IDs. We can add other gene identifiers using the AnnotationDbi package and the annotation package org.Hs.eg.db:
library("AnnotationDbi")
library("org.Hs.eg.db")
This is the organism annotation package (“org”) for Homo sapiens (“Hs”), organized as an AnnotationDbi database package (“db”), using Entrez Gene IDs (“eg”) as primary key. To get a list of all available key types, use:
columns(org.Hs.eg.db)
[1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
[5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
[9] "EVIDENCEALL" "GENENAME" "GENETYPE" "GO"
[13] "GOALL" "IPI" "MAP" "OMIM"
[17] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
[21] "PMID" "PROSITE" "REFSEQ" "SYMBOL"
[25] "UCSCKG" "UNIPROT"
We can use the mapIds
function to add individual columns to our results table.
The argument keytype=ENSEMBL
means that ensembl ids will be used as key to map our gene-list to annotation database. The multiVals
argument tells the function what to do if there are multiple possible values for a single input value. Here we ask to just give us back the first one that occurs in the database. We obtain gene symbol and Entrez ID.
$symbol <- mapIds(org.Hs.eg.db,
reskeys=rownames(res),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
'select()' returned 1:many mapping between keys and columns
$entrez <- mapIds(org.Hs.eg.db,
reskeys=rownames(res),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
'select()' returned 1:many mapping between keys and columns
Now the results have the desired external gene IDs:
<- res[order(res$pvalue),]
resOrdered head(resOrdered)
log2 fold change (MAP): condition name IFNg SL1344 vs naive
Wald test p-value: condition name IFNg SL1344 vs naive
DataFrame with 6 rows and 7 columns
baseMean log2FoldChange lfcSE pvalue
<numeric> <numeric> <numeric> <numeric>
ENSG00000125347 30487.254 6.46005 0.259757 9.82508e-138
ENSG00000162645 36639.987 8.15281 0.339327 8.13214e-129
ENSG00000258738 509.215 4.05406 0.169910 6.30382e-127
ENSG00000073756 128512.234 7.70550 0.331903 4.66252e-121
ENSG00000104312 7429.960 4.65592 0.207653 1.87331e-112
ENSG00000173846 9248.951 5.53364 0.256022 1.65497e-104
padj symbol entrez
<numeric> <character> <character>
ENSG00000125347 2.71585e-133 IRF1 3659
ENSG00000162645 1.12394e-124 GBP2 2634
ENSG00000258738 5.80834e-123 BAZ1A-AS1 112268124
ENSG00000073756 3.22204e-117 PTGS2 5743
ENSG00000104312 1.03564e-108 RIPK2 8767
ENSG00000173846 7.62445e-101 PLK3 1263
Exporting results
You can easily save the results table in a TSV file that you can then share or load with a spreadsheet program such as Excel.
<- as.data.frame(resOrdered)
resOrderedDF write.table(resOrderedDF, file = "data/05/results.tsv", sep="\t")
You can also export the results using the Bioconductor package ReportingTools (Huntley et al. 2013). ReportingTools generates dynamic HTML documents, including links to external databases using gene identifiers and boxplots summarizing the normalized counts across groups. See the ReportingTools vignettes for full details. The simplest version of creating a dynamic ReportingTools report is performed with the following code:
library(ReportingTools)
library(XML)
<- HTMLReport(shortName="report",
htmlRep title="My report",
reportDirectory = "data/05/report")
publish(resOrderedDF, htmlRep)
Here showing the table for top 500 genes.
Gene set enrichment analysis
Very often, we also want to explore, if there is an enrichment for a particular pathway (i.e. Reactome), or for a gene list from previous work, or for a category from Gene Ontology database.
There are essentially two ways how to perform such analysis:
- competitive: comparison of genes belonging to the pathway to all other genes: Gene Set Enrichment Analysis, ORA: over-representation analysis (implemented in various packages)
- non-competitive, test for “average” behavior of genes belonging to pathway (
fry
from edgeR package)
Here we will apply Gene Set Enrichment Analysis, resp. fast version provided in fgsea package.
For that we need as input
- list of genes attributed to different pathways (here we use Reactome).
- some measure to rank the genes, log2 fold change, t-statistics
library(reactome.db)
library(fgsea)
# get mapping ensembl -> entrez id
<- AnnotationDbi::select(org.Hs.eg.db, keys=rownames(res), columns="ENTREZID", keytype="ENSEMBL") # KEGG pathway ens2eg
'select()' returned 1:many mapping between keys and columns
# get mapping entrez id -> reactome
<- AnnotationDbi::select(reactome.db, keys=unique(ens2eg$ENTREZID), columns=c("PATHID","PATHNAME")) # KEGG pathway eg2reactome
'select()' returned 1:many mapping between keys and columns
# clean-up reactome table
head(eg2reactome)
ENTREZID PATHID
1 7105 <NA>
2 8813 R-HSA-162699
3 8813 R-HSA-163125
4 8813 R-HSA-1643685
5 8813 R-HSA-3781865
6 8813 R-HSA-392499
PATHNAME
1 <NA>
2 Homo sapiens: Synthesis of dolichyl-phosphate mannose
3 Homo sapiens: Post-translational modification: synthesis of GPI-anchored proteins
4 Homo sapiens: Disease
5 Homo sapiens: Diseases of glycosylation
6 Homo sapiens: Metabolism of proteins
<- eg2reactome[!is.na(eg2reactome$PATHID),]
eg2reactome $PATHNAME <- str_replace(eg2reactome$PATHNAME, "^Homo sapiens: ", "")
eg2reactome$REACTOME <- str_c(eg2reactome$PATHID, eg2reactome$PATHNAME, sep=": ")
eg2reactome<- eg2reactome[, c("ENTREZID", "REACTOME")]
eg2reactome head(eg2reactome)
ENTREZID
2 8813
3 8813
4 8813
5 8813
6 8813
7 8813
REACTOME
2 R-HSA-162699: Synthesis of dolichyl-phosphate mannose
3 R-HSA-163125: Post-translational modification: synthesis of GPI-anchored proteins
4 R-HSA-1643685: Disease
5 R-HSA-3781865: Diseases of glycosylation
6 R-HSA-392499: Metabolism of proteins
7 R-HSA-446193: Biosynthesis of the N-glycan precursor (dolichol lipid-linked oligosaccharide, LLO) and transfer to a nascent protein
# combine both tables
<- merge(ens2eg, eg2reactome)
ens2reactome
# create pathway list
<- split(ens2reactome$ENSEMBL, ens2reactome$REACTOME)
pathwayList str(pathwayList)
List of 2577
$ R-HSA-1059683: Interleukin-6 signaling : chr [1:11] "ENSG00000136244" "ENSG00000160712" "ENSG00000134352" "ENSG00000162434" ...
$ R-HSA-109581: Apoptosis : chr [1:176] "ENSG00000117020" "ENSG00000153094" "ENSG00000251201" "ENSG00000197822" ...
$ R-HSA-109582: Hemostasis : chr [1:570] "ENSG00000121410" "ENSG00000111252" "ENSG00000112984" "ENSG00000172575" ...
$ R-HSA-109606: Intrinsic Pathway for Apoptosis : chr [1:55] "ENSG00000117020" "ENSG00000153094" "ENSG00000147889" "ENSG00000134308" ...
$ R-HSA-109703: PKB-mediated events : chr [1:2] "ENSG00000105221" "ENSG00000152270"
$ R-HSA-109704: PI3K Cascade : chr [1:40] "ENSG00000166225" "ENSG00000159445" "ENSG00000134962" "ENSG00000105221" ...
$ R-HSA-110056: MAPK3 (ERK1) activation : chr [1:10] "ENSG00000136244" "ENSG00000160712" "ENSG00000134352" "ENSG00000162434" ...
$ R-HSA-110312: Translesion synthesis by REV1 : chr [1:16] "ENSG00000116670" "ENSG00000132646" "ENSG00000135945" "ENSG00000009413" ...
$ R-HSA-110313: Translesion synthesis by Y family DNA polymerases bypasses lesions on DNA template : chr [1:39] "ENSG00000116670" "ENSG00000077514" "ENSG00000101751" "ENSG00000154914" ...
$ R-HSA-110314: Recognition of DNA damage by PCNA-containing replication complex : chr [1:30] "ENSG00000077514" "ENSG00000167986" "ENSG00000132646" "ENSG00000143476" ...
$ R-HSA-110320: Translesion Synthesis by POLH : chr [1:19] "ENSG00000163743" "ENSG00000132646" "ENSG00000170734" "ENSG00000182446" ...
$ R-HSA-110328: Recognition and association of DNA glycosylase with site containing an affected pyrimidine : chr [1:50] "ENSG00000274559" "ENSG00000197837" "ENSG00000196890" "ENSG00000123415" ...
$ R-HSA-110329: Cleavage of the damaged pyrimidine : chr [1:50] "ENSG00000274559" "ENSG00000197837" "ENSG00000196890" "ENSG00000123415" ...
$ R-HSA-110330: Recognition and association of DNA glycosylase with site containing an affected purine : chr [1:45] "ENSG00000274559" "ENSG00000197837" "ENSG00000196890" "ENSG00000128513" ...
$ R-HSA-110331: Cleavage of the damaged purine : chr [1:45] "ENSG00000274559" "ENSG00000197837" "ENSG00000196890" "ENSG00000128513" ...
$ R-HSA-110357: Displacement of DNA glycosylase by APEX1 : chr [1:9] "ENSG00000123415" "ENSG00000100823" "ENSG00000103152" "ENSG00000132781" ...
$ R-HSA-110362: POLB-Dependent Long Patch Base Excision Repair : chr [1:9] "ENSG00000129484" "ENSG00000143799" "ENSG00000168496" "ENSG00000100823" ...
$ R-HSA-110373: Resolution of AP sites via the multiple-nucleotide patch replacement pathway : chr [1:26] "ENSG00000129484" "ENSG00000077514" "ENSG00000143799" "ENSG00000168496" ...
$ R-HSA-110381: Resolution of AP sites via the single-nucleotide replacement pathway : chr [1:4] "ENSG00000100823" "ENSG00000005156" "ENSG00000070501" "ENSG00000073050"
$ R-HSA-111367: SLBP independent Processing of Histone Pre-mRNAs : chr [1:10] "ENSG00000155858" "ENSG00000114503" "ENSG00000142528" "ENSG00000136937" ...
$ R-HSA-111446: Activation of BIM and translocation to mitochondria : chr [1:3] "ENSG00000153094" "ENSG00000107643" "ENSG00000088986"
$ R-HSA-111447: Activation of BAD and translocation to mitochondria : chr [1:15] "ENSG00000117020" "ENSG00000134308" "ENSG00000142208" "ENSG00000105221" ...
$ R-HSA-111448: Activation of NOXA and translocation to mitochondria : chr [1:5] "ENSG00000101412" "ENSG00000141682" "ENSG00000198176" "ENSG00000114126" ...
$ R-HSA-111452: Activation and oligomerization of BAK protein : chr [1:2] "ENSG00000030110" "ENSG00000015475"
$ R-HSA-111453: BH3-only proteins associate with and inactivate anti-apoptotic BCL-2 members : chr [1:9] "ENSG00000153094" "ENSG00000105327" "ENSG00000141682" "ENSG00000002330" ...
$ R-HSA-111457: Release of apoptotic factors from the mitochondria : chr [1:7] "ENSG00000105928" "ENSG00000108387" "ENSG00000172115" "ENSG00000184047" ...
$ R-HSA-111458: Formation of apoptosome : chr [1:11] "ENSG00000105483" "ENSG00000120868" "ENSG00000101966" "ENSG00000149089" ...
$ R-HSA-111459: Activation of caspases through apoptosome-mediated cleavage : chr [1:6] "ENSG00000120868" "ENSG00000101966" "ENSG00000172115" "ENSG00000164305" ...
$ R-HSA-111461: Cytochrome c-mediated apoptotic response : chr [1:13] "ENSG00000105483" "ENSG00000120868" "ENSG00000101966" "ENSG00000149089" ...
$ R-HSA-111463: SMAC (DIABLO) binds to IAPs : chr [1:7] "ENSG00000120868" "ENSG00000101966" "ENSG00000172115" "ENSG00000184047" ...
$ R-HSA-111464: SMAC(DIABLO)-mediated dissociation of IAP:caspase complexes : chr [1:7] "ENSG00000120868" "ENSG00000101966" "ENSG00000172115" "ENSG00000184047" ...
$ R-HSA-111465: Apoptotic cleavage of cellular proteins : chr [1:37] "ENSG00000197822" "ENSG00000185825" "ENSG00000087274" "ENSG00000168036" ...
$ R-HSA-111469: SMAC, XIAP-regulated apoptotic response : chr [1:8] "ENSG00000120868" "ENSG00000101966" "ENSG00000108387" "ENSG00000172115" ...
$ R-HSA-111471: Apoptotic factor-mediated response : chr [1:20] "ENSG00000147889" "ENSG00000105928" "ENSG00000105483" "ENSG00000120868" ...
$ R-HSA-111885: Opioid Signalling : chr [1:88] "ENSG00000164885" "ENSG00000110931" "ENSG00000069966" "ENSG00000164742" ...
$ R-HSA-111931: PKA-mediated phosphorylation of CREB : chr [1:21] "ENSG00000164742" "ENSG00000078295" "ENSG00000138031" "ENSG00000173175" ...
$ R-HSA-111932: CaMK IV-mediated phosphorylation of CREB : chr [1:12] "ENSG00000110931" "ENSG00000118260" "ENSG00000182481" "ENSG00000198668" ...
$ R-HSA-111933: Calmodulin induced events : chr [1:36] "ENSG00000110931" "ENSG00000164742" "ENSG00000078295" "ENSG00000138031" ...
$ R-HSA-111957: Cam-PDE 1 activation : chr [1:6] "ENSG00000115252" "ENSG00000154678" "ENSG00000123360" "ENSG00000198668" ...
$ R-HSA-111995: phospho-PLA2 pathway : chr [1:2] "ENSG00000116711" "ENSG00000100030"
$ R-HSA-111996: Ca-dependent events : chr [1:38] "ENSG00000110931" "ENSG00000164742" "ENSG00000078295" "ENSG00000138031" ...
$ R-HSA-111997: CaM pathway : chr [1:36] "ENSG00000110931" "ENSG00000164742" "ENSG00000078295" "ENSG00000138031" ...
$ R-HSA-112040: G-protein mediated events : chr [1:54] "ENSG00000110931" "ENSG00000164742" "ENSG00000168710" "ENSG00000078295" ...
$ R-HSA-112043: PLC beta mediated events : chr [1:50] "ENSG00000110931" "ENSG00000164742" "ENSG00000168710" "ENSG00000078295" ...
$ R-HSA-112122: ALKBH2 mediated reversal of alkylation damage : chr "ENSG00000189046"
$ R-HSA-112126: ALKBH3 mediated reversal of alkylation damage : chr [1:4] "ENSG00000112249" "ENSG00000166199" "ENSG00000138303" "ENSG00000100325"
$ R-HSA-112303: Electric Transmission Across Gap Junctions : chr [1:4] "ENSG00000182963" "ENSG00000110218" "ENSG00000073150" "ENSG00000159248"
$ R-HSA-112307: Transmission across Electrical Synapses : chr [1:4] "ENSG00000182963" "ENSG00000110218" "ENSG00000073150" "ENSG00000159248"
$ R-HSA-112308: Presynaptic depolarization and calcium channel opening : chr [1:12] "ENSG00000166862" "ENSG00000075461" "ENSG00000157445" "ENSG00000141837" ...
$ R-HSA-112310: Neurotransmitter release cycle : chr [1:49] "ENSG00000198722" "ENSG00000144746" "ENSG00000168993" "ENSG00000070748" ...
$ R-HSA-112311: Neurotransmitter clearance : chr [1:10] "ENSG00000093010" "ENSG00000111275" "ENSG00000284922" "ENSG00000189221" ...
$ R-HSA-112313: Neurotransmitter uptake and metabolism In glial cells : chr [1:4] "ENSG00000135821" "ENSG00000110436" "ENSG00000079215" "ENSG00000111371"
$ R-HSA-112314: Neurotransmitter receptors and postsynaptic signal transmission : chr [1:190] "ENSG00000006116" "ENSG00000166862" "ENSG00000123416" "ENSG00000258947" ...
$ R-HSA-112315: Transmission across Chemical Synapses : chr [1:253] "ENSG00000006116" "ENSG00000166862" "ENSG00000123416" "ENSG00000258947" ...
$ R-HSA-112316: Neuronal System : chr [1:385] "ENSG00000138622" "ENSG00000182963" "ENSG00000069431" "ENSG00000173338" ...
$ R-HSA-112382: Formation of RNA Pol II elongation complex : chr [1:57] "ENSG00000134058" "ENSG00000136807" "ENSG00000092201" "ENSG00000166477" ...
$ R-HSA-112399: IRS-mediated signalling : chr [1:44] "ENSG00000166225" "ENSG00000159445" "ENSG00000134962" "ENSG00000105221" ...
$ R-HSA-112409: RAF-independent MAPK1/3 activation : chr [1:23] "ENSG00000143507" "ENSG00000120129" "ENSG00000158050" "ENSG00000120875" ...
$ R-HSA-112411: MAPK1 (ERK2) activation : chr [1:9] "ENSG00000136244" "ENSG00000160712" "ENSG00000134352" "ENSG00000162434" ...
$ R-HSA-112412: SOS-mediated signalling : chr [1:7] "ENSG00000177885" "ENSG00000174775" "ENSG00000169047" "ENSG00000133703" ...
$ R-HSA-113418: Formation of the Early Elongation Complex : chr [1:33] "ENSG00000134058" "ENSG00000104884" "ENSG00000163161" "ENSG00000114503" ...
$ R-HSA-113501: Inhibition of replication initiation of damaged DNA by RB1/E2F1 : chr [1:13] "ENSG00000101412" "ENSG00000014138" "ENSG00000167393" "ENSG00000101868" ...
$ R-HSA-113507: E2F-enabled inhibition of pre-replication complex formation : chr [1:9] "ENSG00000091651" "ENSG00000135336" "ENSG00000085840" "ENSG00000115942" ...
$ R-HSA-113510: E2F mediated regulation of DNA replication : chr [1:22] "ENSG00000101412" "ENSG00000091651" "ENSG00000135336" "ENSG00000014138" ...
$ R-HSA-114294: Activation, translocation and oligomerization of BAX : chr [1:2] "ENSG00000087088" "ENSG00000015475"
$ R-HSA-114452: Activation of BH3-only proteins : chr [1:30] "ENSG00000117020" "ENSG00000153094" "ENSG00000134308" "ENSG00000264364" ...
$ R-HSA-114508: Effects of PIP2 hydrolysis : chr [1:25] "ENSG00000172575" "ENSG00000068831" "ENSG00000074416" "ENSG00000065357" ...
$ R-HSA-114516: Disinhibition of SNARE formation : chr [1:5] "ENSG00000154229" "ENSG00000166501" "ENSG00000126583" "ENSG00000103496" ...
$ R-HSA-114604: GPVI-mediated activation cascade : chr [1:35] "ENSG00000134215" "ENSG00000162493" "ENSG00000108821" "ENSG00000164692" ...
$ R-HSA-114608: Platelet degranulation : chr [1:120] "ENSG00000121410" "ENSG00000145685" "ENSG00000125257" "ENSG00000196937" ...
$ R-HSA-1168372: Downstream signaling events of B Cell Receptor (BCR) : chr [1:81] "ENSG00000172575" "ENSG00000131467" "ENSG00000115233" "ENSG00000172175" ...
$ R-HSA-1169091: Activation of NF-kappaB in B cells : chr [1:65] "ENSG00000131467" "ENSG00000115233" "ENSG00000172175" "ENSG00000213341" ...
$ R-HSA-1169092: Activation of RAS in B cells : chr [1:5] "ENSG00000172575" "ENSG00000152689" "ENSG00000174775" "ENSG00000133703" ...
$ R-HSA-1169408: ISG15 antiviral mechanism : chr [1:72] "ENSG00000093000" "ENSG00000136243" "ENSG00000184979" "ENSG00000163002" ...
$ R-HSA-1169410: Antiviral mechanism by IFN-stimulated genes : chr [1:144] "ENSG00000251503" "ENSG00000160710" "ENSG00000123416" "ENSG00000258947" ...
$ R-HSA-1170546: Prolactin receptor signaling : chr [1:11] "ENSG00000178188" "ENSG00000112964" "ENSG00000096968" "ENSG00000113494" ...
$ R-HSA-1181150: Signaling by NODAL : chr [1:18] "ENSG00000175550" "ENSG00000243709" "ENSG00000123612" "ENSG00000179284" ...
$ R-HSA-1187000: Fertilization : chr [1:19] "ENSG00000099840" "ENSG00000175294" "ENSG00000166762" "ENSG00000161652" ...
$ R-HSA-1221632: Meiotic synapsis : chr [1:66] "ENSG00000274559" "ENSG00000118007" "ENSG00000196074" "ENSG00000066923" ...
$ R-HSA-1222449: Mtb iron assimilation by chelation : chr "ENSG00000012223"
$ R-HSA-1222499: Latent infection - Other responses of Mtb to phagocytosis : chr "ENSG00000012223"
$ R-HSA-1222556: ROS and RNS production in phagocytes : chr [1:36] "ENSG00000110719" "ENSG00000151418" "ENSG00000051523" "ENSG00000165168" ...
$ R-HSA-1226099: Signaling by FGFR in disease : chr [1:59] "ENSG00000166225" "ENSG00000111605" "ENSG00000119397" "ENSG00000213066" ...
$ R-HSA-1227986: Signaling by ERBB2 : chr [1:50] "ENSG00000117020" "ENSG00000181852" "ENSG00000103266" "ENSG00000185737" ...
$ R-HSA-1227990: Signaling by ERBB2 in Cancer : chr [1:26] "ENSG00000185737" "ENSG00000105401" "ENSG00000169752" "ENSG00000113070" ...
$ R-HSA-1234158: Regulation of gene expression by Hypoxia-inducible Factor : chr [1:11] "ENSG00000164442" "ENSG00000005339" "ENSG00000100393" "ENSG00000116016" ...
$ R-HSA-1234174: Cellular response to hypoxia : chr [1:73] "ENSG00000131467" "ENSG00000115233" "ENSG00000164442" "ENSG00000269858" ...
$ R-HSA-1234176: Oxygen-dependent proline hydroxylation of Hypoxia-inducible Factor Alpha : chr [1:64] "ENSG00000131467" "ENSG00000115233" "ENSG00000269858" "ENSG00000129521" ...
$ R-HSA-1236382: Constitutive Signaling by Ligand-Responsive EGFR Cancer Variants : chr [1:19] "ENSG00000105401" "ENSG00000138798" "ENSG00000146648" "ENSG00000109458" ...
$ R-HSA-1236394: Signaling by ERBB4 : chr [1:57] "ENSG00000137693" "ENSG00000123933" "ENSG00000185737" "ENSG00000105963" ...
$ R-HSA-1236973: Cross-presentation of particulate exogenous antigens (phagosomes) : chr [1:8] "ENSG00000051523" "ENSG00000165168" "ENSG00000138448" "ENSG00000082781" ...
$ R-HSA-1236974: ER-Phagosome pathway : chr [1:85] "ENSG00000131467" "ENSG00000115233" "ENSG00000174130" "ENSG00000106803" ...
$ R-HSA-1236975: Antigen processing-Cross presentation : chr [1:100] "ENSG00000131467" "ENSG00000115233" "ENSG00000174130" "ENSG00000106803" ...
$ R-HSA-1236977: Endosomal/Vacuolar pathway : chr [1:11] "ENSG00000135047" "ENSG00000136943" "ENSG00000163131" "ENSG00000206503" ...
$ R-HSA-1236978: Cross-presentation of soluble exogenous antigens (endosomes) : chr [1:47] "ENSG00000131467" "ENSG00000115233" "ENSG00000150337" "ENSG00000068878" ...
$ R-HSA-1237044: Erythrocytes take up carbon dioxide and release oxygen : chr [1:9] "ENSG00000240583" "ENSG00000065615" "ENSG00000166394" "ENSG00000159348" ...
$ R-HSA-1237112: Methionine salvage pathway : chr [1:6] "ENSG00000120053" "ENSG00000099810" "ENSG00000149089" "ENSG00000182551" ...
$ R-HSA-1247673: Erythrocytes take up oxygen and release carbon dioxide : chr [1:5] "ENSG00000240583" "ENSG00000112077" "ENSG00000004939" "ENSG00000104267" ...
$ R-HSA-1250196: SHC1 events in ERBB2 signaling : chr [1:22] "ENSG00000185737" "ENSG00000169752" "ENSG00000113070" "ENSG00000138798" ...
[list output truncated]
# ranks (log2FC)
head(res$log2FoldChange)
ENSG00000000003 ENSG00000000419 ENSG00000000457 ENSG00000000460
-0.03336711 0.16022327 1.44232740 -2.14343410
ENSG00000000938 ENSG00000000971
3.46461375 4.51618605
We often also limit the size of pathways (remove small and very large ones).
set.seed(29)
<- fgsea(pathways = pathwayList,
fgseaRes stats = res$log2FoldChange,
eps = 0.0,
minSize = 10,
maxSize = 500)
Warning in fgseaMultilevel(pathways = pathways, stats = stats,
minSize = minSize, : There were 8 pathways for which P-values were
not calculated properly due to unbalanced (positive and negative)
gene-level statistic values. For such pathways pval, padj, NES,
log2err are set to NA. You can try to increase the value of the
argument nPermSimple (for example set it nPermSimple = 10000)
Warning in fgseaMultilevel(pathways = pathways, stats = stats,
minSize = minSize, : For some of the pathways the P-values were
likely overestimated. For such pathways log2err is set to NA.
head(fgseaRes[order(pval), ])
pathway
<char>
1: R-HSA-449147: Signaling by Interleukins
2: R-HSA-68877: Mitotic Prometaphase
3: R-HSA-72312: rRNA processing
4: R-HSA-8868773: rRNA processing in the nucleus and cytosol
5: R-HSA-877300: Interferon gamma signaling
6: R-HSA-141424: Amplification of signal from the kinetochores
pval padj log2err ES NES size
<num> <num> <num> <num> <num> <int>
1: 3.984076e-25 6.087668e-22 1.303093 0.6289297 1.919019 449
2: 4.664202e-23 3.563450e-20 1.246233 -0.6051415 -2.620406 199
3: 2.777444e-19 1.414645e-16 1.133090 -0.5678031 -2.439566 203
4: 3.361600e-17 1.284131e-14 1.067210 -0.5509338 -2.355017 192
5: 2.565946e-16 7.841532e-14 1.037696 0.7762564 2.233968 93
6: 6.654425e-16 1.452566e-13 1.027670 -0.6746151 -2.770113 95
leadingEdge
<list>
1: ENSG0000....
2: ENSG0000....
3: ENSG0000....
4: ENSG0000....
5: ENSG0000....
6: ENSG0000....
We can plot so called “barcode” - “worm” plot.
plotEnrichment(pathwayList[["R-HSA-449147: Signaling by Interleukins"]], res$log2FoldChange) +
labs(title="R-HSA-449147: Signaling by Interleukins")
Or make short overview with top 10 up- and down-regulated pathways:
<- fgseaRes[ES > 0][head(order(pval), n=10), pathway]
topPathwaysUp <- fgseaRes[ES < 0][head(order(pval), n=10), pathway]
topPathwaysDown <- c(topPathwaysUp, rev(topPathwaysDown))
topPathways plotGseaTable(pathwayList[topPathways], res$log2FoldChange, fgseaRes, gseaParam=0.5)
Further Reading
https://bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html
http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
https://www.bioconductor.org/packages/release/bioc/vignettes/GeneTonic/inst/doc/GeneTonic_manual.html
Exercises
- Repeat the analysis with one of the other comparisons.
- Modify the individual steps, i.e. plot different gene or make heatmap with different set of genes…
- Run GSEA with different type of annotation (e.g. GO:BP)
Session info
::session_info()
sessioninfo## ─ Session info ─────────────────────────────────────────────────────
## setting value
## version R Under development (unstable) (2024-03-18 r86148)
## os macOS Sonoma 14.4.1
## system aarch64, darwin23.4.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz Europe/Zurich
## date 2024-03-27
## pandoc 3.1.12.3 @ /opt/homebrew/bin/ (via rmarkdown)
##
## ─ Packages ─────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 4.4.0)
## annotate 1.81.2 2024-01-26 [1] Bioconductor 3.19 (R 4.4.0)
## AnnotationDbi * 1.65.2 2023-11-03 [1] Bioconductor
## AnnotationFilter 1.27.0 2023-10-24 [1] Bioconductor
## AnnotationHub 3.11.3 2024-03-19 [1] Bioconductor 3.19 (R 4.4.0)
## apeglm * 1.25.0 2023-10-24 [1] Bioconductor
## bbmle 1.0.25.1 2023-12-09 [1] CRAN (R 4.4.0)
## bdsmatrix 1.3-7 2024-03-02 [1] CRAN (R 4.4.0)
## Biobase * 2.63.0 2023-10-24 [1] Bioconductor
## BiocFileCache 2.11.1 2023-10-26 [1] Bioconductor
## BiocGenerics * 0.49.1 2023-11-01 [1] Bioconductor
## BiocIO 1.13.0 2023-10-24 [1] Bioconductor
## BiocManager 1.30.22 2023-08-08 [1] CRAN (R 4.4.0)
## BiocParallel 1.37.1 2024-03-04 [1] Bioconductor 3.19 (R 4.4.0)
## BiocStyle * 2.31.0 2023-10-24 [1] Bioconductor
## BiocVersion 3.19.1 2023-10-26 [1] Bioconductor
## biomaRt 2.59.1 2024-01-24 [1] Bioconductor 3.19 (R 4.4.0)
## Biostrings 2.71.5 2024-03-22 [1] Bioconductor 3.19 (R 4.4.0)
## bit 4.0.5 2022-11-15 [1] CRAN (R 4.4.0)
## bit64 4.0.5 2020-08-30 [1] CRAN (R 4.4.0)
## bitops 1.0-7 2021-04-24 [1] CRAN (R 4.4.0)
## blob 1.2.4 2023-03-17 [1] CRAN (R 4.4.0)
## bslib 0.6.2 2024-03-22 [1] CRAN (R 4.4.0)
## cachem 1.0.8 2023-05-01 [1] CRAN (R 4.4.0)
## cli 3.6.2 2023-12-11 [1] CRAN (R 4.4.0)
## coda 0.19-4.1 2024-01-31 [1] CRAN (R 4.4.0)
## codetools 0.2-19 2023-02-01 [1] CRAN (R 4.4.0)
## colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.4.0)
## cowplot 1.1.3 2024-01-22 [1] CRAN (R 4.4.0)
## crayon 1.5.2 2022-09-29 [1] CRAN (R 4.4.0)
## crosstalk 1.2.1 2023-11-23 [1] CRAN (R 4.4.0)
## curl 5.2.1 2024-03-01 [1] CRAN (R 4.4.0)
## data.table 1.15.2 2024-02-29 [1] CRAN (R 4.4.0)
## DBI 1.2.2 2024-02-16 [1] CRAN (R 4.4.0)
## dbplyr 2.5.0 2024-03-19 [1] CRAN (R 4.4.0)
## DelayedArray 0.29.9 2024-03-01 [1] Bioconductor 3.19 (R 4.4.0)
## DESeq2 * 1.43.4 2024-03-06 [1] Bioconductor 3.19 (R 4.4.0)
## digest 0.6.35 2024-03-11 [1] CRAN (R 4.4.0)
## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.0)
## DT * 0.32 2024-02-19 [1] CRAN (R 4.4.0)
## emdbook 1.3.13 2023-07-03 [1] CRAN (R 4.4.0)
## ensembldb 2.27.1 2023-11-20 [1] Bioconductor 3.19 (R 4.4.0)
## evaluate 0.23 2023-11-01 [1] CRAN (R 4.4.0)
## fansi 1.0.6 2023-12-08 [1] CRAN (R 4.4.0)
## farver 2.1.1 2022-07-06 [1] CRAN (R 4.4.0)
## fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.4.0)
## fastmatch 1.1-4 2023-08-18 [1] CRAN (R 4.4.0)
## fgsea * 1.29.0 2023-10-24 [1] Bioconductor
## filelock 1.0.3 2023-12-11 [1] CRAN (R 4.4.0)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.4.0)
## genefilter * 1.85.1 2024-01-19 [1] Bioconductor 3.19 (R 4.4.0)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.0)
## GenomeInfoDb * 1.39.9 2024-03-13 [1] Bioconductor 3.19 (R 4.4.0)
## GenomeInfoDbData 1.2.11 2023-11-02 [1] Bioconductor
## GenomicAlignments 1.39.4 2024-02-15 [1] Bioconductor 3.19 (R 4.4.0)
## GenomicFeatures * 1.55.4 2024-03-12 [1] Bioconductor 3.19 (R 4.4.0)
## GenomicRanges * 1.55.4 2024-03-18 [1] Bioconductor 3.19 (R 4.4.0)
## ggplot2 * 3.5.0 2024-02-23 [1] CRAN (R 4.4.0)
## glue 1.7.0 2024-01-09 [1] CRAN (R 4.4.0)
## gtable 0.3.4 2023-08-21 [1] CRAN (R 4.4.0)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.4.0)
## htmltools 0.5.8 2024-03-25 [1] CRAN (R 4.4.0)
## htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.4.0)
## httr 1.4.7 2023-08-15 [1] CRAN (R 4.4.0)
## httr2 1.0.0 2023-11-14 [1] CRAN (R 4.4.0)
## IRanges * 2.37.1 2024-01-19 [1] Bioconductor 3.19 (R 4.4.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.4.0)
## jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.4.0)
## KEGGREST 1.43.0 2023-10-24 [1] Bioconductor
## knitr * 1.45 2023-10-30 [1] CRAN (R 4.4.0)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.4.0)
## lattice 0.22-6 2024-03-20 [1] CRAN (R 4.4.0)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.4.0)
## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0)
## locfit 1.5-9.9 2024-03-01 [1] CRAN (R 4.4.0)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.4.0)
## macrophage * 1.19.0 2023-10-26 [1] Bioconductor
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.0)
## MASS 7.3-60.2 2024-03-18 [1] local
## Matrix 1.7-0 2024-03-22 [1] CRAN (R 4.4.0)
## MatrixGenerics * 1.15.0 2023-10-24 [1] Bioconductor
## matrixStats * 1.2.0 2023-12-11 [1] CRAN (R 4.4.0)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.4.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.4.0)
## mvtnorm 1.2-4 2023-11-27 [1] CRAN (R 4.4.0)
## numDeriv 2016.8-1.1 2019-06-06 [1] CRAN (R 4.4.0)
## org.Hs.eg.db * 3.18.0 2023-11-02 [1] Bioconductor
## pheatmap * 1.0.12 2019-01-04 [1] CRAN (R 4.4.0)
## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.4.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.4.0)
## png * 0.1-8 2022-11-29 [1] CRAN (R 4.4.0)
## prettyunits 1.2.0 2023-09-24 [1] CRAN (R 4.4.0)
## progress 1.2.3 2023-12-06 [1] CRAN (R 4.4.0)
## ProtGenerics 1.35.4 2024-03-12 [1] Bioconductor 3.19 (R 4.4.0)
## purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.4.0)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0)
## rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.4.0)
## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.4.0)
## Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.4.0)
## RCurl 1.98-1.14 2024-01-09 [1] CRAN (R 4.4.0)
## reactome.db * 1.86.2 2023-11-28 [1] Bioconductor
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.4.0)
## restfulr 0.0.15 2022-06-16 [1] CRAN (R 4.4.0)
## rjson 0.2.21 2022-01-09 [1] CRAN (R 4.4.0)
## rlang 1.1.3 2024-01-10 [1] CRAN (R 4.4.0)
## rmarkdown 2.26 2024-03-05 [1] CRAN (R 4.4.0)
## Rsamtools 2.19.4 2024-03-18 [1] bioc_xgit (@d4c43e8)
## RSQLite 2.3.5 2024-01-21 [1] CRAN (R 4.4.0)
## rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.0)
## rtracklayer 1.63.1 2024-03-13 [1] Bioconductor 3.19 (R 4.4.0)
## S4Arrays 1.3.6 2024-03-04 [1] Bioconductor 3.19 (R 4.4.0)
## S4Vectors * 0.41.5 2024-03-20 [1] Bioconductor 3.19 (R 4.4.0)
## sass 0.4.9 2024-03-15 [1] CRAN (R 4.4.0)
## scales 1.3.0 2023-11-28 [1] CRAN (R 4.4.0)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0)
## SparseArray 1.3.4 2024-02-11 [1] Bioconductor 3.19 (R 4.4.0)
## stringi 1.8.3 2023-12-11 [1] CRAN (R 4.4.0)
## stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.4.0)
## SummarizedExperiment * 1.33.3 2024-01-23 [1] Bioconductor 3.19 (R 4.4.0)
## survival 3.5-8 2024-02-14 [1] CRAN (R 4.4.0)
## tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.4.0)
## tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.4.0)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.0)
## timechange 0.3.0 2024-01-18 [1] CRAN (R 4.4.0)
## txdbmaker 0.99.7 2024-03-22 [1] Bioconductor 3.19 (R 4.4.0)
## tximeta * 1.21.5 2024-03-19 [1] Bioconductor 3.19 (R 4.4.0)
## tximport 1.31.1 2024-03-24 [1] Bioconductor 3.19 (R 4.4.0)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.4.0)
## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.4.0)
## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.0)
## vroom 1.6.5 2023-12-05 [1] CRAN (R 4.4.0)
## withr 3.0.0 2024-01-16 [1] CRAN (R 4.4.0)
## xfun 0.43 2024-03-25 [1] CRAN (R 4.4.0)
## XML 3.99-0.16.1 2024-01-22 [1] CRAN (R 4.4.0)
## xml2 1.3.6 2023-12-04 [1] CRAN (R 4.4.0)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.4.0)
## XVector 0.43.1 2024-01-10 [1] Bioconductor 3.19 (R 4.4.0)
## yaml 2.3.8 2023-12-11 [1] CRAN (R 4.4.0)
## zlibbioc 1.49.3 2024-03-13 [1] Bioconductor 3.19 (R 4.4.0)
##
## [1] /Library/Frameworks/R.framework/Versions/4.4/Resources/library
##
## ────────────────────────────────────────────────────────────────────