set.seed(123) # for reproducibility
# Load packages
library(here)
library(bears)
library(GenomicFeatures)
library(SummarizedExperiment)
library(SingleCellExperiment)
library(scater)
library(scran)
library(DESeq2)
library(tidyverse)
library(patchwork)
library(ggplot2)
2 Analyzing quantitative data
Here, I will describe the code to:
- Read and parse quantitative data from salmon output files.
- Perform dimensionality reduction with UMAP and t-SNE.
First of all, let’s load required packages and data.
# Load data
load(here("data", "ds.rda"))
<- read.csv(
final_metadata here("products", "tables", "final_metadata_classified_atlas_v2.tsv"),
header = TRUE, sep = "\t"
|>
) ::select(-c(Run, Experiment)) dplyr
2.1 From salmon quant.sf
files to SummarizedExperiment
Here, we will obtain a SummarizedExperiment
object containing gene-level transcript abundances in TPM and bias-corrected counts. Counts will be obtained using the “bias correction without an offset” method from the Bioconductor package tximport
.
To create the SummarizedExperiment
object, we will need a 2-column data frame of transcript-to-gene mapping. Let’s create it.
# Create a data frame of transcript-to-gene mapping
<- Biostrings::readDNAStringSet(
tx here("data", "gmax_transcriptome.fa.gz")
)<- data.frame(
tx2gene TXNAME = gsub(" \\|.*", "", names(tx)),
GENEID = gsub(".*\\| ", "", names(tx))
)
save(
compress = "xz",
tx2gene, file = here("products", "result_files", "tx2gene.rda")
)
Now, we can get the SummarizedExperiment
object.
# Get gene-level transcript abundance estimates from salmon
## "Bias correction without an offset" method
<- salmon2se(
se_atlas_gene
final_metadata,level = "gene",
salmondir = ds$salmondir,
tx2gene = tx2gene
)assay(se_atlas_gene, "gene_counts") <- round(
assay(se_atlas_gene, "gene_counts")
)
## "Original counts and offset" method
<- file.path(ds$salmondir, final_metadata$BioSample, "quant.sf")
files <- tximport::tximport(
se_atlas_gene_offset type = "salmon", tx2gene = tx2gene
files,
)colnames(se_atlas_gene_offset$abundance) <- final_metadata$BioSample
colnames(se_atlas_gene_offset$counts) <- final_metadata$BioSample
colnames(se_atlas_gene_offset$length) <- final_metadata$BioSample
# Get transcript-level transcript abundance estimates from salmon
<- salmon2se(
se_atlas_transcript
final_metadata,level = "transcript",
salmondir = ds$salmondir
)
# Save gene-level and transcript-level
save(
compress = "xz",
se_atlas_gene, file = here("products", "result_files", "se_atlas_gene.rda")
)
save(
compress = "xz",
se_atlas_gene_offset, file = here("products", "result_files", "se_atlas_gene_offset.rda")
)
save(
compress = "xz",
se_atlas_transcript, file = here("products", "result_files", "se_atlas_transcript.rda")
)
# Save final and complete sample metadata data frame
<- as.data.frame(colData(se_atlas_gene))
sample_metadata_complete
save(
compress = "xz",
sample_metadata_complete, file = here("products", "result_files", "sample_metadata_complete.rda")
)
2.2 Dimensionality reduction
Now, we will perform dimensionality reduction on highly variable genes with PCA, t-SNE, and UMAP. To increase speed and avoid noise, we will use the PCs computed with the PCA as input to t-SNE and UMAP.
# Load SummarizedExperiment object containing counts per gene
load(here("products", "result_files", "se_atlas_gene.rda"))
2.2.1 Feature selection
To maximize biological signal and reduce noise, we will only use highly variable genes for dimensionality reduction. Here, we will pick the top 5000 of genes with the highest biological components.
# Create a SingleCellExperiment with counts and log-normalized counts
<- SingleCellExperiment(
atlas_counts_sce assays = list(
counts = assay(se_atlas_gene, "gene_counts"),
logcounts = log2(assay(se_atlas_gene, "gene_counts") + 1)
), colData = colData(se_atlas_gene)
)
# Modeling the mean-variance relationship and visualizing the fit
<- modelGeneVar(atlas_counts_sce)
mean_var_model <- metadata(mean_var_model)
fit_mean_var
<- data.frame(
p_fit_mean_var mean = fit_mean_var$mean,
var = fit_mean_var$var,
trend = fit_mean_var$trend(fit_mean_var$mean)
|>
) ggplot(aes(x = mean, y = var)) +
geom_point(alpha = 0.4) +
geom_line(aes(y = trend), color = "steelblue3", linewidth = 1.5) +
labs(
title = "Per-gene mean-variance relationship",
subtitle = "Counts were normalized by library size and log-transformed",
x = "Mean of log-expression", y = "Variance of log-expression"
+
) theme_minimal()
p_fit_mean_var
# Extract the top 5000 of genes with the highest biological components
<- getTopHVGs(mean_var_model, n = 5000) hvg
The object hvg
is a character vector containing the IDs of the top 5000 genes with the highest biological components.
2.2.2 Principal components analysis (PCA)
Now, we will perform PCA using the genes in hvg
.
# Perform PCA
<- fixedPCA(
atlas_counts_sce subset.row = hvg
atlas_counts_sce,
)
# Plot proportion of variance explained by each PC
<- attr(reducedDim(atlas_counts_sce), "percentVar")
percent_var
<- data.frame(
p_pca_percent_var Variance = round(percent_var, 2),
PC = factor(1:50, levels = 1:50)
|>
) ggplot(aes(x = PC, y = Variance)) +
geom_col(fill = "grey40") +
geom_text(aes(label = Variance), hjust = -0.3) +
labs(
title = "Proportion of variance explained by each PC",
x = "PC", y = "Variance explained (%)"
+
) coord_flip() +
theme_minimal() +
ylim(0, 60)
p_pca_percent_var
Based on the plot, we will use only the top 8 PCs for t-SNE and UMAP.
2.2.3 t-stochastic neighbor embedding (t-SNE)
Now, we will perform dimensionality reduction with t-SNE using the top 8 PCs obtained previously. We will first test running a t-SNE with 6 different perplexity values: 10, 20, 30, 40, 50, 60. Then, we will select the best.
# Get and plot t-SNE coordinates (perplexity = 10, 20, 30, 40, 50)
<- c(10, 20, 30, 40, 50, 60)
perplexities <- lapply(perplexities, function(x) {
p_tsne
<- runTSNE(
tsne_coord perplexity = x,
atlas_counts_sce, dimred = "PCA", n_dimred = 8
)
# Color by the variable "Part"
<- plotReducedDim(tsne_coord, dimred = "TSNE", colour_by = "Part") +
p labs(
x = "t-SNE 1", y = "t-SNE 2",
title = paste0("Perplexity = ", x)
)
return(p)
})
# Visualize all plots
<- wrap_plots(p_tsne, nrow = 2) +
p_tsne_all_perplexities_panel plot_layout(guides = "collect") &
::scale_color_d3("category20") &
ggscilabs(color = "Part")
p_tsne_all_perplexities_panel
Based on the plots, we chose perplexity = 60
as the best option. Now, let’s create an object containing only the plot for this perplexity value and give it a better title.
# Plot t-SNE with perplexity = 60
<- p_tsne_all_perplexities_panel[[6]] +
p_tsne_optimal_perplexity labs(
title = "t-SNE plot of all samples in the SEA 2.0",
subtitle = "Coordinates were constructed from the top 8 principal components, with perplexity = 60"
)
p_tsne_optimal_perplexity
2.2.4 Uniform manifold approximation and projection (UMAP)
Lastly, we will perform dimensionality reduction with UMAP using the top 8 PCs identified before. Similarly to what we did for t-SNE, we will run UMAP with 6 different values for the “number of neighbors” parameter: 10, 20, 30, 40, 50, and 60. Then, we will look at each plot to choose the best.
# Run UMAP with n_neighbors = 10, 20, 30, 40, 50
<- c(10, 20, 30, 40, 50, 60)
n_neighbors <- lapply(n_neighbors, function(x) {
p_umap
<- runUMAP(
umap_coord n_neighbors = x,
atlas_counts_sce, dimred = "PCA", n_dimred = 8
)
# Color by the variable "Part"
<- plotReducedDim(umap_coord, dimred = "UMAP", colour_by = "Part") +
p labs(
x = "UMAP 1", y = "UMAP 2",
title = paste0("Number of nearest neighbors = ", x)
)
return(p)
})
# Visualize all plots
<- wrap_plots(p_umap, nrow = 2) +
p_umap_all_nneighbors_panel plot_layout(guides = "collect") &
::scale_color_d3("category20") &
ggscilabs(color = "Part")
p_umap_all_nneighbors_panel
Based on the plots, we chose n_neighbors = 30
as the best option. Now, let’s create an object containing the final plot.
# Plot UMAP with n_neighbors = 30
<- p_umap_all_nneighbors_panel[[3]] +
p_umap_optimal_nneighbors labs(
title = "UMAP plot of all samples in the SEA 2.0",
subtitle = "Coordinates were constructed from the top 8 principal components, with n_neighbors = 30"
)
p_umap_optimal_nneighbors
Session info
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 20.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2023-06-23
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
AnnotationDbi * 1.62.0 2023-04-25 [1] Bioconductor
beachmat 2.16.0 2023-04-25 [1] Bioconductor
bears * 0.99.0 2023-06-23 [1] Github (almeidasilvaf/bears@2dbba3d)
beeswarm 0.4.0 2021-06-01 [1] CRAN (R 4.3.0)
Biobase * 2.60.0 2023-04-25 [1] Bioconductor
BiocFileCache 2.8.0 2023-04-25 [1] Bioconductor
BiocGenerics * 0.46.0 2023-04-25 [1] Bioconductor
BiocIO 1.10.0 2023-04-25 [1] Bioconductor
BiocNeighbors 1.18.0 2023-04-25 [1] Bioconductor
BiocParallel 1.34.0 2023-04-25 [1] Bioconductor
BiocSingular 1.16.0 2023-04-25 [1] Bioconductor
biomaRt 2.56.0 2023-04-25 [1] Bioconductor
Biostrings 2.68.0 2023-04-25 [1] Bioconductor
bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.0)
bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.0)
bitops 1.0-7 2021-04-24 [1] CRAN (R 4.3.0)
blob 1.2.4 2023-03-17 [1] CRAN (R 4.3.0)
bluster 1.10.0 2023-04-25 [1] Bioconductor
cachem 1.0.8 2023-05-01 [1] CRAN (R 4.3.0)
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
cluster 2.1.4 2022-08-22 [4] CRAN (R 4.2.1)
codetools 0.2-19 2023-02-01 [4] CRAN (R 4.2.2)
colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0)
crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.0)
curl 5.0.0 2023-01-12 [1] CRAN (R 4.3.0)
DBI 1.1.3 2022-06-18 [1] CRAN (R 4.3.0)
dbplyr 2.3.2 2023-03-21 [1] CRAN (R 4.3.0)
DelayedArray 0.26.1 2023-05-01 [1] Bioconductor
DelayedMatrixStats 1.22.1 2023-06-09 [1] Bioconductor
DESeq2 * 1.40.1 2023-05-02 [1] Bioconductor
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
downloader 0.4 2015-07-09 [1] CRAN (R 4.3.0)
dplyr * 1.1.2 2023-04-20 [1] CRAN (R 4.3.0)
dqrng 0.3.0 2021-05-01 [1] CRAN (R 4.3.0)
edgeR 3.42.0 2023-04-25 [1] Bioconductor
evaluate 0.20 2023-01-17 [1] CRAN (R 4.3.0)
fansi 1.0.4 2023-01-22 [1] CRAN (R 4.3.0)
farver 2.1.1 2022-07-06 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
filelock 1.0.2 2018-10-05 [1] CRAN (R 4.3.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
fs 1.6.2 2023-04-25 [1] CRAN (R 4.3.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
GenomeInfoDb * 1.36.0 2023-04-25 [1] Bioconductor
GenomeInfoDbData 1.2.10 2023-04-28 [1] Bioconductor
GenomicAlignments 1.36.0 2023-04-25 [1] Bioconductor
GenomicFeatures * 1.52.0 2023-04-25 [1] Bioconductor
GenomicRanges * 1.52.0 2023-04-25 [1] Bioconductor
ggbeeswarm 0.7.2 2023-04-29 [1] CRAN (R 4.3.0)
ggplot2 * 3.4.1 2023-02-10 [1] CRAN (R 4.3.0)
ggrepel 0.9.3 2023-02-03 [1] CRAN (R 4.3.0)
glue 1.6.2 2022-02-24 [1] CRAN (R 4.3.0)
gridExtra 2.3 2017-09-09 [1] CRAN (R 4.3.0)
gtable 0.3.3 2023-03-21 [1] CRAN (R 4.3.0)
here * 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
httr 1.4.5 2023-02-24 [1] CRAN (R 4.3.0)
igraph 1.4.2 2023-04-07 [1] CRAN (R 4.3.0)
IRanges * 2.34.0 2023-04-25 [1] Bioconductor
irlba 2.3.5.1 2022-10-03 [1] CRAN (R 4.3.0)
jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.3.0)
KEGGREST 1.40.0 2023-04-25 [1] Bioconductor
knitr 1.42 2023-01-25 [1] CRAN (R 4.3.0)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.3.0)
lattice 0.20-45 2021-09-22 [4] CRAN (R 4.2.0)
lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.3.0)
limma 3.56.0 2023-04-25 [1] Bioconductor
locfit 1.5-9.7 2023-01-02 [1] CRAN (R 4.3.0)
lubridate * 1.9.2 2023-02-10 [1] CRAN (R 4.3.0)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
Matrix 1.5-1 2022-09-13 [4] CRAN (R 4.2.1)
MatrixGenerics * 1.12.2 2023-06-09 [1] Bioconductor
matrixStats * 1.0.0 2023-06-02 [1] CRAN (R 4.3.0)
memoise 2.0.1 2021-11-26 [1] CRAN (R 4.3.0)
metapod 1.8.0 2023-04-25 [1] Bioconductor
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.0)
patchwork * 1.1.2 2022-08-19 [1] CRAN (R 4.3.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
png 0.1-8 2022-11-29 [1] CRAN (R 4.3.0)
prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.3.0)
progress 1.2.2 2019-05-16 [1] CRAN (R 4.3.0)
purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.3.0)
Rcpp 1.0.10 2023-01-22 [1] CRAN (R 4.3.0)
RCurl 1.98-1.12 2023-03-27 [1] CRAN (R 4.3.0)
readr * 2.1.4 2023-02-10 [1] CRAN (R 4.3.0)
rentrez 1.2.3 2020-11-10 [1] CRAN (R 4.3.0)
restfulr 0.0.15 2022-06-16 [1] CRAN (R 4.3.0)
rjson 0.2.21 2022-01-09 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.3.0)
rprojroot 2.0.3 2022-04-02 [1] CRAN (R 4.3.0)
Rsamtools 2.16.0 2023-04-25 [1] Bioconductor
RSQLite 2.3.1 2023-04-03 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
Rsubread 2.14.2 2023-05-22 [1] Bioconductor
rsvd 1.0.5 2021-04-16 [1] CRAN (R 4.3.0)
rtracklayer 1.60.0 2023-04-25 [1] Bioconductor
S4Arrays 1.0.1 2023-05-01 [1] Bioconductor
S4Vectors * 0.38.0 2023-04-25 [1] Bioconductor
ScaledMatrix 1.8.1 2023-05-03 [1] Bioconductor
scales 1.2.1 2022-08-20 [1] CRAN (R 4.3.0)
scater * 1.28.0 2023-04-25 [1] Bioconductor
scran * 1.28.1 2023-05-02 [1] Bioconductor
scuttle * 1.10.1 2023-05-02 [1] Bioconductor
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
SingleCellExperiment * 1.22.0 2023-04-25 [1] Bioconductor
sparseMatrixStats 1.12.1 2023-06-20 [1] Bioconductor
statmod 1.5.0 2023-01-06 [1] CRAN (R 4.3.0)
stringi 1.7.12 2023-01-11 [1] CRAN (R 4.3.0)
stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.3.0)
SummarizedExperiment * 1.30.1 2023-05-01 [1] Bioconductor
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.3.0)
tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.0)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.0)
timechange 0.2.0 2023-01-11 [1] CRAN (R 4.3.0)
tximport 1.28.0 2023-04-25 [1] Bioconductor
tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.3.0)
utf8 1.2.3 2023-01-31 [1] CRAN (R 4.3.0)
vctrs 0.6.2 2023-04-19 [1] CRAN (R 4.3.0)
vipor 0.4.5 2017-03-22 [1] CRAN (R 4.3.0)
viridis 0.6.2 2021-10-13 [1] CRAN (R 4.3.0)
viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.3.0)
withr 2.5.0 2022-03-03 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
XML 3.99-0.14 2023-03-19 [1] CRAN (R 4.3.0)
xml2 1.3.4 2023-04-27 [1] CRAN (R 4.3.0)
XVector 0.40.0 2023-04-25 [1] Bioconductor
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
zlibbioc 1.46.0 2023-04-25 [1] Bioconductor
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────