library(cogeqc)
library(here)
library(tidyverse)
library(syntenet)
library(tidytext)
6 Assessing synteny detection in Fabaceae
Here, we will assess synteny detection using a network-based approach. The anchor pairs from synteny identification will be interpreted as edges of an unweighted undirected graph (i.e., a synteny network), and the best synteny detection will be identified based on the graphs’ clustering coefficients and node number.
We will demonstrate our network-based synteny assessment using genomic data on Fabaceae species available on PLAZA 5.0 (Van Bel et al. 2022).
6.1 Data acquisition
In this section, we will download whole-genome protein sequences and gene annotation from PLAZA 5.0, and then we will preprocess the data with syntenet::process_input()
.
<- c("mtr", "tpr", "psa", "car", "lja", "gma", "vmu", "lal", "arhy") species
<- "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/"
base_url
# Get proteomes
<- paste0(
seq_url "Fasta/proteome.selected_transcript.",
base_url, ".fasta.gz"
species,
)
## Import files and clean gene IDs
<- lapply(seq_url, function(x) {
seq <- Biostrings::readAAStringSet(x)
s names(s) <- gsub(".* | ", "", names(s))
return(s)
})names(seq) <- species
# Get gene annotation
<- paste0(
annot_url "GFF/", species, "/annotation.selected_transcript.exon_features.",
base_url, ".gff3.gz"
species,
)
## Import files and keep only relevant fields
<- lapply(annot_url, function(x) {
annot <- rtracklayer::import(x)
a <- a[, c("type", "gene_id")]
a <- a[a$type == "gene"]
a return(a)
})names(annot) <- species
# Process data
<- process_input(seq, annot)
pdata
# Remove unprocessed data to clean the working environment
rm(annot)
rm(seq)
6.2 Network-based synteny assessment
We will infer synteny networks using the Bioconductor package syntenet. This package detects synteny using the MCScanX algorithm (Wang et al. 2012), which can produce different results based on 2 main parameters:
- anchors: minimum required number of genes to call a syntenic block. Default: 5.
- max_gaps: number of upstream and downstream genes to search for anchors. Default: 25.
We will infer synteny networks with 5 combinations of parameters, similarly to Zhao and Schranz (2019), using two approaches:
- A single Fabaceae synteny network;
- Species-specific synteny networks for each Fabaceae species.
To start with, let’s define the combinations of parameters we will use.
# Define combinations of parameters: anchors (a), max_gaps (m)
<- list(
synteny_params c(3, 25),
c(5, 15),
c(5, 25),
c(5, 35),
c(7, 25)
)
6.2.1 Assessing the Fabaceae synteny network
First, we will perform similarity searches with DIAMOND.
# Define wrapper function to run DIAMOND with different top_hits
<- file.path(tempdir(), "diamond_all")
out <- run_diamond(seq = seq, top_hits = 5, outdir = out) d5
With the DIAMOND list, we can detect synteny.
# Define helper function to detect synteny with multiple combinations of params
<- function(diamond, annotation, params) {
synteny_wrapper
<- lapply(params, function(x) {
syn
<- x[1]
anchors <- x[2]
max_gaps <- file.path(tempdir(), paste0("syn_a", anchors, "_m", max_gaps))
outdir
<- infer_syntenet(
s blast_list = diamond,
annotation = pdata$annotation,
outdir = outdir,
anchors = anchors,
max_gaps = max_gaps
)return(s)
})return(syn)
}
# Detect synteny
<- synteny_wrapper(d5, pdata$annotation, synteny_params)
syn_fabaceae names(syn_fabaceae) <- unlist(
lapply(synteny_params, function(x) paste0("a", x[1], "_m", x[2]))
)
Now, let’s use the network-based synteny assessment to see which combination of parameters is the best.
# Assess networks
<- assess_synnet_list(syn_fabaceae)
fabaceae_scores
# Look at scores, ranked from highest to lowest
%>%
fabaceae_scores arrange(-Score) |>
::kable(
knitrcaption = "Scores for each synteny network."
)
CC | Node_count | Rsquared | Score | Network |
---|---|---|---|---|
0.8253002 | 237723 | 0.6227916 | 122187.2 | a3_m25 |
0.8290880 | 235290 | 0.6156847 | 120105.4 | a5_m35 |
0.8392223 | 226657 | 0.6026291 | 114629.5 | a5_m15 |
0.8412602 | 224325 | 0.5972865 | 112717.3 | a7_m25 |
0.8347725 | 231820 | 0.5795957 | 112161.6 | a5_m25 |
As we can see, the combination of parameters a = 3; m = 25
is the best for this data set.
Finally, let’s visualize scores. To make visualization better, we will scale scores by the maximum value, so that values range from 0 to 1.
# Plot scores
<- fabaceae_scores %>%
synteny_scores_fabaceae arrange(Score) %>%
mutate(Score = Score / max(Score)) %>%
mutate(Parameters = str_replace_all(Network, "_", ", ")) %>%
mutate(Parameters = factor(Parameters, levels = unique(Parameters))) %>%
ggplot(., aes(x = Parameters, y = Score)) +
geom_col(fill = "grey60", color = "black") +
theme_bw() +
labs(
title = "Assessment of the Fabaceae synteny network",
subtitle = "a = minimum # of anchors; m = maximum # of gaps",
y = "Scaled score"
)
synteny_scores_fabaceae
6.2.2 Assessing species-specific synteny networks
In this section, we will infer species-specific synteny networks and assess each of them with our network-based approach.
This time, as we already have synteny networks for the whole Fabaceae family, we don’t need to infer them again; we will simply subset edges of the network that contain nodes from the same species.
# Create species-specific networks
<- substr(species, start = 1, stop = 3)
species_ids
<- lapply(species_ids, function(x) {
species_networks
<- lapply(syn_fabaceae, function(y) {
nets <- y[startsWith(y$Anchor1, x) & startsWith(y$Anchor2, x), ]
edges return(edges)
})return(nets)
})names(species_networks) <- species_ids
# Exploring data
names(species_networks)
[1] "mtr" "tpr" "psa" "car" "lja" "gma" "vmu" "lal" "arh"
names(species_networks$mtr)
[1] "a3_m25" "a5_m15" "a5_m25" "a5_m35" "a7_m25"
# Rename `species_networks` to keep full name
names(species_networks) <- c(
"M. truncatula", "T. pratense", "P. sativum", "C. arietinum",
"L. japonicus", "G. max", "V. mungo", "L. albus", "A. hypogaea"
)
For each species, we will assess the networks inferred with different combinations of parameters.
# Assess species-specific networks
<- lapply(seq_along(species_networks), function(x) {
scores_species_nets
<- names(species_networks)[x]
species <- assess_synnet_list(species_networks[[species]])
scores $Score[is.nan(scores$Score)] <- 0
scores<- scores[order(scores$Score, decreasing = TRUE), ]
scores $Species <- species
scores$Score <- scores$Score / max(scores$Score)
scoresreturn(scores)
})<- Reduce(rbind, scores_species_nets)
scores_species_nets
# Plot data
<- scores_species_nets %>%
synteny_scores_species mutate(
Parameters = as.factor(str_replace_all(Network, "_", ", ")),
Species = as.factor(Species)
%>%
) mutate(Network = reorder_within(Parameters, Score, Species)) %>%
ggplot(., aes(x = Network, y = Score, fill = Parameters)) +
geom_bar(stat = "identity", color = "grey90") +
facet_wrap(~Species, ncol = 3, scales = "free") +
scale_x_reordered() +
::scale_fill_jama() +
ggscitheme_bw() +
theme(axis.text.x = element_blank()) +
labs(
title = "Assessment of species-specific synteny networks",
subtitle = "a = minimum # of anchors; m = maximum # of gaps",
y = "Scaled score (by species)", x = ""
)
synteny_scores_species
The figure demonstrates that the best combination of parameters depends on the species, so there is no “universally” best combination. However, some patterns emerge. The combinations a = 7; m= 25
and a = 5; m = 15
are typically the worst. In some cases, they even lead to zero scores due to clustering coefficients of zero. Thus, if users want to test multiple combinations of parameters for their own data set, they should only test the combinations a = 3; m = 25
, a = 5; m = 25
, and a = 5; m = 35
, which lead to the best score in 45%, 33%, and 22% of the species-specific networks, respectively. Interestingly, the combination that leads to the best score in most networks (a = 3; m = 25
) is also the best when considering the whole Fabaceae synteny network (see previous section).
Session info
This document was created under the following conditions:
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 20.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2023-10-06
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
ape 5.7-1 2023-03-13 [1] CRAN (R 4.3.0)
aplot 0.1.10 2023-03-08 [1] CRAN (R 4.3.0)
beeswarm 0.4.0 2021-06-01 [1] CRAN (R 4.3.0)
Biobase 2.60.0 2023-04-25 [1] Bioconductor
BiocGenerics 0.46.0 2023-04-25 [1] Bioconductor
BiocIO 1.10.0 2023-04-25 [1] Bioconductor
BiocManager 1.30.21.1 2023-07-18 [1] CRAN (R 4.3.0)
BiocParallel 1.34.0 2023-04-25 [1] Bioconductor
BiocStyle 2.29.1 2023-08-04 [1] Github (Bioconductor/BiocStyle@7c0e093)
Biostrings 2.68.0 2023-04-25 [1] Bioconductor
bitops 1.0-7 2021-04-24 [1] CRAN (R 4.3.0)
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
coda 0.19-4 2020-09-30 [1] CRAN (R 4.3.0)
codetools 0.2-19 2023-02-01 [4] CRAN (R 4.2.2)
cogeqc * 1.4.0 2023-04-25 [1] Bioconductor
colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0)
crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.0)
DelayedArray 0.26.1 2023-05-01 [1] Bioconductor
digest 0.6.33 2023-07-07 [1] CRAN (R 4.3.0)
dplyr * 1.1.2 2023-04-20 [1] CRAN (R 4.3.0)
evaluate 0.21 2023-05-05 [1] CRAN (R 4.3.0)
fansi 1.0.4 2023-01-22 [1] CRAN (R 4.3.0)
farver 2.1.1 2022-07-06 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
GenomeInfoDb 1.36.0 2023-04-25 [1] Bioconductor
GenomeInfoDbData 1.2.10 2023-04-28 [1] Bioconductor
GenomicAlignments 1.36.0 2023-04-25 [1] Bioconductor
GenomicRanges 1.52.0 2023-04-25 [1] Bioconductor
ggbeeswarm 0.7.2 2023-04-29 [1] CRAN (R 4.3.0)
ggfun 0.0.9 2022-11-21 [1] CRAN (R 4.3.0)
ggnetwork 0.5.12 2023-03-06 [1] CRAN (R 4.3.0)
ggplot2 * 3.4.1 2023-02-10 [1] CRAN (R 4.3.0)
ggplotify 0.1.0 2021-09-02 [1] CRAN (R 4.3.0)
ggsci 3.0.0 2023-03-08 [1] CRAN (R 4.3.0)
ggtree 3.8.0 2023-04-25 [1] Bioconductor
glue 1.6.2 2022-02-24 [1] CRAN (R 4.3.0)
gridGraphics 0.5-1 2020-12-13 [1] CRAN (R 4.3.0)
gtable 0.3.3 2023-03-21 [1] CRAN (R 4.3.0)
here * 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
igraph 1.4.2 2023-04-07 [1] CRAN (R 4.3.0)
intergraph 2.0-2 2016-12-05 [1] CRAN (R 4.3.0)
IRanges 2.34.0 2023-04-25 [1] Bioconductor
janeaustenr 1.0.0 2022-08-26 [1] CRAN (R 4.3.0)
jsonlite 1.8.7 2023-06-29 [1] CRAN (R 4.3.0)
knitr 1.43 2023-05-25 [1] CRAN (R 4.3.0)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.3.0)
lattice 0.20-45 2021-09-22 [4] CRAN (R 4.2.0)
lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.3.0)
lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.3.0)
lubridate * 1.9.2 2023-02-10 [1] CRAN (R 4.3.0)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
Matrix 1.5-1 2022-09-13 [4] CRAN (R 4.2.1)
MatrixGenerics 1.12.2 2023-06-09 [1] Bioconductor
matrixStats 1.0.0 2023-06-02 [1] CRAN (R 4.3.0)
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.0)
network 1.18.1 2023-01-24 [1] CRAN (R 4.3.0)
networkD3 0.4 2017-03-18 [1] CRAN (R 4.3.0)
nlme 3.1-162 2023-01-31 [4] CRAN (R 4.2.2)
patchwork 1.1.2 2022-08-19 [1] CRAN (R 4.3.0)
pheatmap 1.0.12 2019-01-04 [1] CRAN (R 4.3.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
plyr 1.8.8 2022-11-11 [1] CRAN (R 4.3.0)
purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.0)
Rcpp 1.0.10 2023-01-22 [1] CRAN (R 4.3.0)
RCurl 1.98-1.12 2023-03-27 [1] CRAN (R 4.3.0)
readr * 2.1.4 2023-02-10 [1] CRAN (R 4.3.0)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.3.0)
restfulr 0.0.15 2022-06-16 [1] CRAN (R 4.3.0)
rjson 0.2.21 2022-01-09 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.23 2023-07-01 [1] CRAN (R 4.3.0)
rprojroot 2.0.3 2022-04-02 [1] CRAN (R 4.3.0)
Rsamtools 2.16.0 2023-04-25 [1] Bioconductor
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
rtracklayer 1.60.0 2023-04-25 [1] Bioconductor
S4Arrays 1.0.1 2023-05-01 [1] Bioconductor
S4Vectors 0.38.0 2023-04-25 [1] Bioconductor
scales 1.2.1 2022-08-20 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
SnowballC 0.7.1 2023-04-25 [1] CRAN (R 4.3.0)
statnet.common 4.8.0 2023-01-24 [1] CRAN (R 4.3.0)
stringi 1.7.12 2023-01-11 [1] CRAN (R 4.3.0)
stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.3.0)
SummarizedExperiment 1.30.1 2023-05-01 [1] Bioconductor
syntenet * 1.3.3 2023-06-15 [1] Bioconductor
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.3.0)
tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.0)
tidytext * 0.4.1 2023-01-07 [1] CRAN (R 4.3.0)
tidytree 0.4.2 2022-12-18 [1] CRAN (R 4.3.0)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.0)
timechange 0.2.0 2023-01-11 [1] CRAN (R 4.3.0)
tokenizers 0.3.0 2022-12-22 [1] CRAN (R 4.3.0)
treeio 1.24.1 2023-05-31 [1] Bioconductor
tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.3.0)
utf8 1.2.3 2023-01-31 [1] CRAN (R 4.3.0)
vctrs 0.6.3 2023-06-14 [1] CRAN (R 4.3.0)
vipor 0.4.5 2017-03-22 [1] CRAN (R 4.3.0)
withr 2.5.0 2022-03-03 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
XML 3.99-0.14 2023-03-19 [1] CRAN (R 4.3.0)
XVector 0.40.0 2023-04-25 [1] Bioconductor
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
yulab.utils 0.0.6 2022-12-20 [1] CRAN (R 4.3.0)
zlibbioc 1.46.0 2023-04-25 [1] Bioconductor
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────