2 Obtaining duplicate genes and gene pairs

In this chapter, we will obtain duplicated gene pairs (classified by duplication mode) in the genomes of each species used in this project. If duplicates are avaiable on doubletroubledb (Almeida-Silva and Van de Peer 2024), we will simply download them, otherwise we will infer them de novo.

To start, let’s load required packages.

library(here)
library(tidyverse)
library(doubletrouble)
library(syntenet)

set.seed(123) # for reproducibility
options(timeout = 1e6)

# Load helper functions
source(here("code", "utils.R"))

2.1 Retrieving duplicate pairs

First, we will download duplicate pairs for species that are present in doubletroubledb.

# A. thaliana ----
dup_file <- tempfile(fileext = ".zip")
download.file(
    "https://figshare.com/ndownloader/files/44689777",
    destfile = dup_file
)

## Genes
ath_dups <- readr::read_tsv(
    unzip(dup_file, "arabidopsis_thaliana_genes.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(gene = str_replace_all(gene, "ara_", "")) |>
    filter(!str_detect(gene, "^ATC|^ATM")) |>
    as.data.frame()

## Pairs
ath_pairs <- readr::read_tsv(
    unzip(dup_file, "arabidopsis_thaliana_pairs.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(
        dup1 = str_replace_all(dup1, "ara_", ""),
        dup2 = str_replace_all(dup2, "ara_", "")
    ) |>
    filter(!str_detect(dup1, "^ATC|^ATM")) |>
    filter(!str_detect(dup2, "^ATC|^ATM")) |>
    as.data.frame()
    
# Z. mays ----
dup_file <- tempfile(fileext = ".zip")
download.file(
    "https://figshare.com/ndownloader/files/44690185",
    destfile = dup_file
)

## Genes
zma_dups <- readr::read_tsv(
    unzip(dup_file, "zea_mays_genes.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(gene = str_replace_all(gene, "zea_", "")) |>
    as.data.frame()

## Pairs
zma_pairs <- readr::read_tsv(
    unzip(dup_file, "zea_mays_pairs.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(
        dup1 = str_replace_all(dup1, "zea_", ""),
        dup2 = str_replace_all(dup2, "zea_", "")
    ) |>
    as.data.frame()
    
# G. max ----
dup_file <- tempfile(fileext = ".zip")
download.file(
    "https://figshare.com/ndownloader/files/44689888",
    destfile = dup_file
)

## Genes
gma_dups <- readr::read_tsv(
    unzip(dup_file, "glycine_max_genes.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(gene = str_replace_all(gene, "^gly_", "")) |>
    as.data.frame()

## Pairs
gma_pairs <- readr::read_tsv(
    unzip(dup_file, "glycine_max_pairs.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(
        dup1 = str_replace_all(dup1, "^gly_", ""),
        dup2 = str_replace_all(dup2, "^gly_", "")
    ) |>
    as.data.frame()


# H. vulgare ----
dup_file <- tempfile(fileext = ".zip")
download.file(
    "https://figshare.com/ndownloader/files/44689900",
    destfile = dup_file
)

## Genes
hvu_dups <- readr::read_tsv(
    unzip(dup_file, "hordeum_vulgare_genes.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(gene = str_replace_all(gene, "^hor_", "")) |>
    as.data.frame()

## Pairs
hvu_pairs <- readr::read_tsv(
    unzip(dup_file, "hordeum_vulgare_pairs.tsv.gz"),
    show_col_types = FALSE
) |>
    mutate(
        dup1 = str_replace_all(dup1, "^hor_", ""),
        dup2 = str_replace_all(dup2, "^hor_", "")
    ) |>
    as.data.frame()

Next, let’s identify and classify duplicate pairs de novo for the P. aphrodite genome using doubletrouble.

# Get sequence and annotation for P. aphrodite and Amborella trichopoda (outgroup)
## P. aphrodite
pap_annot <- rtracklayer::import(
    "https://orchidstra2.abrc.sinica.edu.tw/orchidstra2/pagenome/padownload/P_aphrodite_genomic_scaffold_v1.0_gene.gff.gz"
)
pap_annot$gene_id <- pap_annot$ID

pap_seq <- Biostrings::readAAStringSet(
    "https://orchidstra2.abrc.sinica.edu.tw/orchidstra2/pagenome/padownload/P_aphrodite_genomic_scaffold_v1.0_protein.fa.gz"
)
names(pap_seq) <- gsub(".* gene=", "", names(pap_seq))
genes <- intersect(names(pap_seq), pap_annot$gene_id[pap_annot$type == "gene"])
pap_seq <- pap_seq[genes]

## A. trichopoda
atr_annot <- rtracklayer::import(
    "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-57/plants/gff3/amborella_trichopoda/Amborella_trichopoda.AMTR1.0.57.gff3.gz"
)

atr_seq <- Biostrings::readAAStringSet(
    "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-57/plants/fasta/amborella_trichopoda/pep/Amborella_trichopoda.AMTR1.0.pep.all.fa.gz"
)
names(atr_seq) <- gsub(" .*", "", gsub(".*gene:", "", names(atr_seq)))

atr_seq <- atr_seq[order(Biostrings::width(atr_seq), decreasing = TRUE), ]
atr_seq <- atr_seq[!duplicated(names(atr_seq)), ]

# Create seq and annotation lists
seq <- list(pap = pap_seq, atr = atr_seq)
annot <- list(pap = pap_annot, atr = atr_annot)

# Process input data
pdata <- syntenet::process_input(seq, annot, filter_annotation = TRUE)

# Run DIAMOND to get whole paranome
diamond_intra <- syntenet::run_diamond(
    seq = pdata$seq["pap"],
    compare = "intraspecies",
    threads = 4,
    ... = "--sensitive"
)

# Run DIAMOND search between orchid and outgroup
diamond_inter <- syntenet::run_diamond(
    seq = pdata$seq,
    compare = data.frame(query = "pap", outgroup = "atr"),
    threads = 4,
    ... = "--sensitive"
)["pap_atr"]

# Get intron counts
txdb_list <- lapply(annot, GenomicFeatures::makeTxDbFromGRanges)
ic <- lapply(txdb_list, get_intron_counts)

# Classify duplicate pairs
orchid_pairs <- classify_gene_pairs(
    blast_list = diamond_intra,
    annotation = pdata$annotation,
    blast_inter = diamond_inter,
    intron_counts = ic,
    scheme = "full",
    collinearity_dir = here("products", "result_files")
)[[1]]

orchid_dups <- classify_genes(list(pap = orchid_pairs))[["pap"]]

# Remove species IDs
pap_pairs <- orchid_pairs |>
    mutate(
        dup1 = str_replace_all(dup1, "^pap_", ""),
        dup2 = str_replace_all(dup2, "^pap_", "")
    )

pap_dups <- orchid_dups |>
    mutate(gene = str_replace_all(gene, "^pap_", ""))

2.2 Calculating substitution rates

Next, we will also calculate substitution rates (Ks, Ka, and Ka/Ks) for duplicate pairs using doubletrouble.

# Define function to clean (remove redundancy) in ENSEMBL-derived CDS
ensembl_longest_isoform <- function(proteome = NULL) {

    pnames <- gsub(".*gene:", "", names(proteome))
    pnames <- gsub(" .*", "", pnames)

    names(proteome) <- pnames
    proteome <- proteome[order(Biostrings::width(proteome), decreasing = TRUE),]
    proteome <- proteome[!duplicated(names(proteome)), ]
    return(proteome)
}

# A. thaliana ----
## Get CDS
ath_cds <- Biostrings::readDNAStringSet("https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-59/plants/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz") |>
    ensembl_longest_isoform()
ath_cds <- ath_cds[names(ath_cds) %in% unique(c(ath_pairs$dup1, ath_pairs$dup2))]

## Calculate rates
ath_kaks <- pairs2kaks(
    list(Ath = ath_pairs), list(Ath = ath_cds), threads = 4
)$Ath

# Glycine max ----
## Get CDS
gma_cds <- Biostrings::readDNAStringSet("https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-59/plants/fasta/glycine_max/cds/Glycine_max.Glycine_max_v2.1.cds.all.fa.gz") |>
    ensembl_longest_isoform()
gma_cds <- gma_cds[names(gma_cds) %in% unique(c(gma_pairs$dup1, gma_pairs$dup2))]

## Calculate rates
gma_kaks <- pairs2kaks(
    gene_pairs_list = list(
        gma = gma_pairs |>
            mutate(
                dup1 = str_c("gma_", dup1), 
                dup2 = str_c("gma_", dup2),
            )
    ),
    cds = list(gma = gma_cds), 
    threads = 4
)$gma


# Z. mays ----
## Get CDS
zma_cds <- Biostrings::readDNAStringSet("https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-59/plants/fasta/zea_mays/cds/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cds.all.fa.gz") |>
    ensembl_longest_isoform()
zma_cds <- zma_cds[names(zma_cds) %in% unique(c(zma_pairs$dup1, zma_pairs$dup2))]

## Calculate rates
zma_kaks <- pairs2kaks(
    gene_pairs_list = list(
        zma = zma_pairs |>
            mutate(
                dup1 = str_c("zma_", dup1), 
                dup2 = str_c("zma_", dup2),
            )
    ),
    cds = list(zma = zma_cds), 
    threads = 2
)$zma

# H. vulgare ----
## Get CDS
hvu_cds <- Biostrings::readDNAStringSet("https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-59/plants/fasta/hordeum_vulgare/cds/Hordeum_vulgare.MorexV3_pseudomolecules_assembly.cds.all.fa.gz") |>
    ensembl_longest_isoform()
hvu_cds <- hvu_cds[names(hvu_cds) %in% unique(c(hvu_pairs$dup1, hvu_pairs$dup2))]

## Calculate rates
hvu_kaks <- pairs2kaks(
    gene_pairs_list = list(hvu = hvu_pairs),
    cds = list(hvu = hvu_cds), 
    threads = 2
)$hvu


# P. aphrodite ----
## Get CDS
pap_cds <- Biostrings::readDNAStringSet(
    "https://orchidstra2.abrc.sinica.edu.tw/orchidstra2/pagenome/padownload/P_aphrodite_genomic_scaffold_v1.0_gene.fa.gz"
)
names(pap_cds) <- gsub(".* gene=", "", names(pap_cds))
names(pap_cds) <- gsub(" .*", "", names(pap_cds))
pap_cds <- pap_cds[order(Biostrings::width(pap_cds), decreasing = TRUE)]
pap_cds <- pap_cds[!duplicated(names(pap_cds))]

## Calculate rates
pap_kaks <- pairs2kaks(
    gene_pairs_list = list(
        pap = pap_pairs |>
            mutate(
                dup1 = str_c("pap_", dup1), 
                dup2 = str_c("pap_", dup2),
            )
    ),
    cds = list(pap = pap_cds), 
    threads = 4
)$pap


# Create a list with duplicate pairs and genes for each species
dup_list <- list(
    ath = list(pairs = ath_kaks, genes = ath_dups),
    gma = list(pairs = gma_kaks, genes = gma_dups),
    pap = list(pairs = pap_kaks, genes = pap_dups),
    zma = list(pairs = zma_kaks, genes = zma_dups),
    hvu = list(pairs = hvu_kaks, genes = hvu_dups)
)

Then, we will use Ks values to split gene pairs by age groups, so that age is not a confounder when performing comparisons between duplication modes. Here, we will not use pairs with \(K_s>2\) to avoid saturation at greater \(K_s\) values. Based on previous literature, we will set pre-defined numbers of \(K_s\) peaks, which are:

Glycine max: 2 peaks at \(K_s\) ~0.1 and ~0.56.
Zea mays: 2 peaks at \(K_s\) ~0.16 and 0.9. The second peak cannot be distinguished from the \(K_s\) distro of Zea mays because anchor pairs originating from this ancient WGD event cannot be detected anymore. Hence, we calculated the mean of \(K_s\) peaks estimates for all species in which this WGD event could be detected (N = 11), using data from AngioWGD (Chen et al. 2024). Following this approach, we will consider a peak at 0.9 (0.51-1.89).
Arabidopsis thaliana: 2 peaks at \(K_s\) ~0.73 and 1.61. The second peak cannot be distinguished from the \(K_s\) distro of Arabidopsis thaliana because peaks largely overlap. To distinguish between the two peaks, we used prior data from (Chen et al. 2024), which identified the second peak more clearly (1.61, 0.90-3.00) in the \(K_s\) distro of Cleome violacea.
Phalaenopsis aphrodite: 1 peak at \(K_s\) ~1.5.
Hordeum vulgare: 1 peak at \(K_s\) ~0.8.

# Classify gene pairs by age groups
## G. max - 2 peaks
gma_peaks <- gma_kaks |>
    filter(Ks <=2) |>
    pull(Ks) |>
    find_ks_peaks(npeaks = 2, max_ks = 1)

gma_spairs <- split_pairs_by_peak(gma_kaks, peaks = gma_peaks) |>
    purrr::pluck(1) |>
    dplyr::select(1, 2, type = 6, 7)

## Z. mays - 1 peak
zma_peaks <- zma_kaks |>
    filter(type == "SD") |>
    filter(Ks <=2) |>
    pull(Ks) |>
    find_ks_peaks(npeaks = 2, max_ks = 2)

zma_spairs <- split_pairs_by_peak(zma_kaks, peaks = zma_peaks) |>
    purrr::pluck(1) |>
    dplyr::select(1, 2, type = 6, 7)

## A. thaliana - 1 peak
ath_peaks <- ath_kaks |>
    filter(Ks <=3.0, type == "SD") |>
    pull(Ks) |>
    find_ks_peaks(npeaks = 2, max_ks = 3)
ath_peaks$mean <- c(`1` = 0.73, `2` = 1.61)

ath_spairs <- split_pairs_by_peak(ath_kaks, peaks = ath_peaks) |>
    purrr::pluck(1) |>
    dplyr::select(1, 2, type = 6, 7)

## P. aphrodite
pap_peaks <- pap_kaks |>
    filter(Ks <=2.5) |>
    pull(Ks) |>
    find_ks_peaks(npeaks = 1, max_ks = 2.5)

pap_spairs <- split_pairs_by_peak(pap_kaks, peaks = pap_peaks) |>
    purrr::pluck(1) |>
    dplyr::select(1, 2, type = 6, 7)

## H. vulgare - 1 peak
hvu_peaks <- hvu_kaks |>
    filter(Ks <=2, type == "SD") |> 
    pull(Ks) |>
    find_ks_peaks(npeaks = 1, max_ks = 2)

hvu_spairs <- split_pairs_by_peak(hvu_kaks, peaks = hvu_peaks) |>
    purrr::pluck(1) |>
    dplyr::select(1, 2, type = 6, 7)

# Create a single list with all classified duplicates and their age groups
pairs_byage <- list(
    ath = ath_spairs,
    gma = gma_spairs,
    pap = pap_spairs,
    zma = zma_spairs,
    hvu = hvu_spairs
)

And finally, we will label peaks with the names of their corresponding WGDs (following AngioWGD nomenclature).

# Name peaks
## Ath
pairs_byage$ath <- pairs_byage$ath |>
    mutate(
        peak_name = case_when(
            peak == 1 ~ "BRAS a",
            peak == 2 ~ "BRAS b"
        ),
        peak_name = factor(peak_name, levels = c("BRAS a", "BRAS b"))
    )

## Gma
pairs_byage$gma <- pairs_byage$gma |>
    mutate(
        peak_name = case_when(
            peak == 1 ~ "GLYC",
            peak == 2 ~ "PAPI"
        ),
        peak_name = factor(peak_name, levels = c("GLYC", "PAPI"))
    )

## Pap
pairs_byage$pap <- pairs_byage$pap |>
    mutate(
        peak_name = case_when(
            peak == 1 ~ "ORCH"
        ),
        peak_name = factor(peak_name, levels = "ORCH")
    )

## Zma
pairs_byage$zma <- pairs_byage$zma |>
    mutate(
        peak_name = case_when(
            peak == 1 ~ "ZEAM",
            peak == 2 ~ "POAC"
        ),
        peak_name = factor(peak_name, levels = c("ZEAM", "POAC"))
    )

## Hvu
pairs_byage$hvu <- pairs_byage$hvu |>
    mutate(
        peak_name = case_when(
            peak == 1 ~ "POAC"
        ),
        peak_name = factor(peak_name, levels = "POAC")
    )

2.3 Obtaining gene family information

We will also obtain gene family assignments for each gene using data from PLAZA 5.0 (Van Bel et al. 2022). Let’s start with species that are included in PLAZA. These are A. thaliana, G. max, and Z. mays.

# Get gene family assignment from PLAZA
fams_dicots <- read_tsv(
    "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/GeneFamilies/genefamily_data.HOMFAM.csv.gz", show_col_types = FALSE, comment = "# "
)

fams_monocots <- read_tsv(
    "https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/GeneFamilies/genefamily_data.HOMFAM.csv.gz", show_col_types = FALSE, comment = "# "
)

names(fams_dicots) <- c("family", "species", "gene")
names(fams_monocots) <- c("family", "species", "gene")

# Get gene sets
sp <- c("ath", "gma", "pap", "zma", "hvu")
gene_sets <- lapply(sp, function(x) {
    
    spe <- readRDS(
        here("products", "result_files", "spe", paste0("spe_", x, ".rds"))
    )
    gs <- lapply(spe, rownames) |> unlist() |> unique()
    return(gs)
})
names(gene_sets) <- sp

# Get assignments for species in PLAZA
fam_ath <- fams_dicots |> filter(gene %in% gene_sets$ath)

fam_gma <- fams_dicots |>
    mutate(gene = str_replace_all(gene, "Glyma\\.", "GLYMA_")) |>
    filter(gene %in% gene_sets$gma)

fam_zma <- fams_monocots |> filter(gene %in% gene_sets$zma)

For H. vulgare, the genome assembly on PLAZA is different from the one we’re using, and the P. aphrodite genome is not available on PLAZA. Thus, for these two species, we will get gene family assignments using the following approach:

Perform a DIAMOND search and get the top hit in target species (H. vulgare for H. vulgare; Allium sativum for P. aphrodite).
Use the family assignment of top hits.

# 1) Get sequence data ----
## Query sequences (used in this study)
query <- list(
    hvu = Biostrings::readAAStringSet("https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/hordeum_vulgare/pep/Hordeum_vulgare.MorexV3_pseudomolecules_assembly.pep.all.fa.gz"),
    pap = Biostrings::readAAStringSet("https://orchidstra2.abrc.sinica.edu.tw/orchidstra2/pagenome/padownload/P_aphrodite_genomic_scaffold_v1.0_protein.fa.gz")
)
names(query$hvu) <- gsub(".* gene:", "", names(query$hvu))
names(query$hvu) <- gsub(" .*", "", names(query$hvu))
names(query$pap) <- gsub(".* gene=", "", names(query$pap))

## Target sequences (from PLAZA)
target <- list(
    hvu_target = Biostrings::readAAStringSet("https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/Fasta/proteome.selected_transcript.hvu.fasta.gz"),
    pap_target = Biostrings::readAAStringSet("https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/Fasta/proteome.selected_transcript.asa.fasta.gz")
)
names(target$hvu_target) <- gsub(".* | ", "", names(target$hvu_target))
names(target$pap_target) <- gsub(".* | ", "", names(target$pap_target))

# 2) Perform DIAMOND searches
compare <- data.frame(
    query = c("hvu", "pap"), 
    target = c("hvu_target", "pap_target")
)
dmd <- run_diamond(
    c(query, target), 
    outdir = file.path(tempdir(), "dmd"),
    compare = compare
)

# 3) Get top hits for each query gene
top_hits <- lapply(dmd, function(x) {
    
    df <- x |>
        group_by(query) |>
        arrange(evalue) |>
        slice_head(n = 1) |>
        ungroup() |>
        as.data.frame()
    
    return(df)
})

# Create a knee plot to define a similarity threshold (if needed)
knees <- lapply(top_hits, function(x) {
    x |>
        arrange(-perc_identity) |> 
        mutate(idx = row_number()) |> 
        ggplot(aes(x = idx, y = perc_identity)) +
        geom_point() + 
        scale_y_continuous(breaks = seq(0, 100, by = 10))
})

Based on knee plots of ECDF for sequence identity, we will define similarity cut-offs of 35% for the comparison pap-asa, and 90% for hvu-vhu. Then, we will use the ID of the target gene to get gene family assignments from PLAZA.

# Get gene family assignments
fam_hvu <- top_hits$hvu_hvu_target |>
    filter(perc_identity >=90) |>
    select(query, db) |>
    inner_join(fams_monocots, by = c("db" = "gene")) |>
    select(family, species, gene = query) |>
    arrange(family)

fam_pap <- top_hits$pap_pap_target |>
    filter(perc_identity >=35) |>
    select(query, db) |>
    inner_join(fams_monocots, by = c("db" = "gene")) |>
    select(family, species, gene = query) |>
    arrange(family) |>
    mutate(species = "pap")

Now, combining gene-by-family tables in a single object.

gene_fams <- bind_rows(fam_ath, fam_gma, fam_pap, fam_hvu, fam_zma) |>
    as.data.frame()

2.4 Obtaining functional annotation

Here, we will obtain functional annotations for all genes in each species.

# Get GO annotation for each species
read_plaza_go <- function(url) {
    
    df <- read_tsv(
        url,
        show_col_types = FALSE, comment = "# "
    ) |>
        dplyr::select(gene = `#gene_id`, term = description)
    
    return(df)
}
ath_go <- read_plaza_go("https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/GO/go.ath.csv.gz")

gma_go <- read_plaza_go("https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/GO/go.gma.csv.gz") |>
    mutate(gene = str_replace_all(gene, "Glyma\\.", "GLYMA_"))

zma_go <- read_plaza_go("https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/GO/go.zma.csv.gz")

pap_go <- read_tsv(
    "https://orchidstra2.abrc.sinica.edu.tw/orchidstra2/pagenome/padownload/P_aphrodite_v1.0_gene_def_GO_KEGG.gz", show_col_types = FALSE
) |>    
    dplyr::select(gene = `Gene ID`, term = `GO Names`) |>
    mutate(term = tidyr::replace_na(term, "None")) |>
    separate_longer_delim(term, delim = ";") |>
    mutate(term = str_squish(term))

hvu_go <- read_tsv(here("data", "hvu_biomart.txt"), show_col_types = FALSE) |>
    dplyr::select(gene = 1, term = `GO term name`) |>
    mutate(term = tidyr::replace_na(term, "None")) |>
    dplyr::distinct()

# Get annotation in a list
functional_annot <- list(
    Ath = ath_go,
    Gma = gma_go, 
    Pap = pap_go,
    Zma = zma_go,
    Hvu = hvu_go
)

2.5 Saving objects

Now that we have classified duplicate gene pairs and genes, we will save them to separate .rds files for each species. Each .rds file will contain a list with elements pairs and genes containing the duplicate pairs and genes, respectively.

# Save objects
## List with dup pairs (+ substitution rates) and genes for each species
saveRDS(
    dup_list, compress = "xz",
    file = here("products", "result_files", "dup_list.rds")
)

## Duplicate pairs classified by age groups
saveRDS(
    pairs_byage, compress = "xz",
    file = here("products", "result_files", "pairs_by_age_group_expanded.rds")
)

## Gene family assignments
saveRDS(
    gene_fams, compress = "xz",
    file = here("products", "result_files", "gene_families.rds")
)

## Functional annotation
saveRDS(
    functional_annot, compress = "xz",
    file = here("products", "result_files", "functional_annotation.rds")
)

Session info

This document was created under the following conditions:

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       Ubuntu 22.04.4 LTS
 system   x86_64, linux-gnu
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Europe/Brussels
 date     2025-08-12
 pandoc   3.2 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/x86_64/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package              * version   date (UTC) lib source
 abind                  1.4-5     2016-07-21 [1] CRAN (R 4.4.1)
 ade4                   1.7-22    2023-02-06 [1] CRAN (R 4.4.1)
 AnnotationDbi          1.66.0    2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)
 ape                    5.8       2024-04-11 [1] CRAN (R 4.4.1)
 Biobase                2.64.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 BiocGenerics           0.50.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 BiocIO                 1.14.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 BiocManager            1.30.23   2024-05-04 [1] CRAN (R 4.4.1)
 BiocParallel           1.38.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 BiocStyle              2.32.1    2024-06-16 [1] Bioconductor 3.19 (R 4.4.1)
 Biostrings             2.72.1    2024-06-02 [1] Bioconductor 3.19 (R 4.4.1)
 bit                    4.0.5     2022-11-15 [1] CRAN (R 4.4.1)
 bit64                  4.0.5     2020-08-30 [1] CRAN (R 4.4.1)
 bitops                 1.0-7     2021-04-24 [1] CRAN (R 4.4.1)
 blob                   1.2.4     2023-03-17 [1] CRAN (R 4.4.1)
 cachem                 1.1.0     2024-05-16 [1] CRAN (R 4.4.1)
 cli                    3.6.3     2024-06-21 [1] CRAN (R 4.4.1)
 coda                   0.19-4.1  2024-01-31 [1] CRAN (R 4.4.1)
 codetools              0.2-20    2024-03-31 [1] CRAN (R 4.4.1)
 colorspace             2.1-0     2023-01-23 [1] CRAN (R 4.4.1)
 crayon                 1.5.3     2024-06-20 [1] CRAN (R 4.4.1)
 curl                   5.2.1     2024-03-01 [1] CRAN (R 4.4.1)
 DBI                    1.2.3     2024-06-02 [1] CRAN (R 4.4.1)
 DelayedArray           0.30.1    2024-05-07 [1] Bioconductor 3.19 (R 4.4.1)
 digest                 0.6.36    2024-06-23 [1] CRAN (R 4.4.1)
 doParallel             1.0.17    2022-02-07 [1] CRAN (R 4.4.1)
 doubletrouble        * 1.9.1     2025-07-23 [1] Bioconductor
 dplyr                * 1.1.4     2023-11-17 [1] CRAN (R 4.4.1)
 evaluate               0.24.0    2024-06-10 [1] CRAN (R 4.4.1)
 fastmap                1.2.0     2024-05-15 [1] CRAN (R 4.4.1)
 forcats              * 1.0.0     2023-01-29 [1] CRAN (R 4.4.1)
 foreach                1.5.2     2022-02-02 [1] CRAN (R 4.4.1)
 generics               0.1.3     2022-07-05 [1] CRAN (R 4.4.1)
 GenomeInfoDb           1.40.1    2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
 GenomeInfoDbData       1.2.12    2024-07-24 [1] Bioconductor
 GenomicAlignments      1.40.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 GenomicFeatures        1.56.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 GenomicRanges          1.56.1    2024-06-12 [1] Bioconductor 3.19 (R 4.4.1)
 ggnetwork              0.5.13    2024-02-14 [1] CRAN (R 4.4.1)
 ggplot2              * 3.5.1     2024-04-23 [1] CRAN (R 4.4.1)
 glue                   1.8.0     2024-09-30 [1] https://cran.r-universe.dev (R 4.4.1)
 gtable                 0.3.5     2024-04-22 [1] CRAN (R 4.4.1)
 here                 * 1.0.1     2020-12-13 [1] CRAN (R 4.4.1)
 hms                    1.1.3     2023-03-21 [1] CRAN (R 4.4.1)
 htmltools              0.5.8.1   2024-04-04 [1] CRAN (R 4.4.1)
 htmlwidgets            1.6.4     2023-12-06 [1] CRAN (R 4.4.1)
 httr                   1.4.7     2023-08-15 [1] CRAN (R 4.4.1)
 igraph                 2.1.4     2025-01-23 [1] CRAN (R 4.4.1)
 intergraph             2.0-4     2024-02-01 [1] CRAN (R 4.4.1)
 IRanges                2.38.1    2024-07-03 [1] Bioconductor 3.19 (R 4.4.1)
 iterators              1.0.14    2022-02-05 [1] CRAN (R 4.4.1)
 jsonlite               1.8.8     2023-12-04 [1] CRAN (R 4.4.1)
 KEGGREST               1.44.1    2024-06-19 [1] Bioconductor 3.19 (R 4.4.1)
 knitr                  1.48      2024-07-07 [1] CRAN (R 4.4.1)
 lattice                0.22-6    2024-03-20 [1] CRAN (R 4.4.1)
 lifecycle              1.0.4     2023-11-07 [1] CRAN (R 4.4.1)
 lubridate            * 1.9.3     2023-09-27 [1] CRAN (R 4.4.1)
 magrittr               2.0.3     2022-03-30 [1] CRAN (R 4.4.1)
 MASS                   7.3-61    2024-06-13 [1] CRAN (R 4.4.1)
 Matrix                 1.7-0     2024-04-26 [1] CRAN (R 4.4.1)
 MatrixGenerics         1.16.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 matrixStats            1.3.0     2024-04-11 [1] CRAN (R 4.4.1)
 mclust                 6.1.1     2024-04-29 [1] CRAN (R 4.4.1)
 memoise                2.0.1     2021-11-26 [1] CRAN (R 4.4.1)
 MSA2dist               1.11.1    2025-03-13 [1] Github (kullrich/MSA2dist@b5ab038)
 munsell                0.5.1     2024-04-01 [1] CRAN (R 4.4.1)
 network                1.18.2    2023-12-05 [1] CRAN (R 4.4.1)
 nlme                   3.1-165   2024-06-06 [1] CRAN (R 4.4.1)
 pheatmap               1.0.12    2019-01-04 [1] CRAN (R 4.4.1)
 pillar                 1.10.2    2025-04-05 [1] https://cran.r-universe.dev (R 4.4.1)
 pkgconfig              2.0.3     2019-09-22 [1] CRAN (R 4.4.1)
 png                    0.1-8     2022-11-29 [1] CRAN (R 4.4.1)
 purrr                * 1.0.2     2023-08-10 [1] CRAN (R 4.4.1)
 pwalign                1.0.0     2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 R6                     2.5.1     2021-08-19 [1] CRAN (R 4.4.1)
 RColorBrewer           1.1-3     2022-04-03 [1] CRAN (R 4.4.1)
 Rcpp                   1.0.13    2024-07-17 [1] CRAN (R 4.4.1)
 RCurl                  1.98-1.16 2024-07-11 [1] CRAN (R 4.4.1)
 readr                * 2.1.5     2024-01-10 [1] CRAN (R 4.4.1)
 restfulr               0.0.15    2022-06-16 [1] CRAN (R 4.4.1)
 rjson                  0.2.21    2022-01-09 [1] CRAN (R 4.4.1)
 rlang                  1.1.4     2024-06-04 [1] CRAN (R 4.4.1)
 rmarkdown              2.27      2024-05-17 [1] CRAN (R 4.4.1)
 rprojroot              2.0.4     2023-11-05 [1] CRAN (R 4.4.1)
 Rsamtools              2.20.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 RSQLite                2.3.7     2024-05-27 [1] CRAN (R 4.4.1)
 rstudioapi             0.16.0    2024-03-24 [1] CRAN (R 4.4.1)
 rtracklayer            1.64.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 S4Arrays               1.4.1     2024-05-20 [1] Bioconductor 3.19 (R 4.4.1)
 S4Vectors              0.42.1    2024-07-03 [1] Bioconductor 3.19 (R 4.4.1)
 scales                 1.3.0     2023-11-28 [1] CRAN (R 4.4.1)
 seqinr                 4.2-36    2023-12-08 [1] CRAN (R 4.4.1)
 sessioninfo            1.2.2     2021-12-06 [1] CRAN (R 4.4.1)
 SparseArray            1.4.8     2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
 statnet.common         4.9.0     2023-05-24 [1] CRAN (R 4.4.1)
 stringi                1.8.4     2024-05-06 [1] CRAN (R 4.4.1)
 stringr              * 1.5.1     2023-11-14 [1] CRAN (R 4.4.1)
 SummarizedExperiment   1.34.0    2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)
 syntenet             * 1.11.0    2025-07-23 [1] Bioconductor
 tibble               * 3.2.1     2023-03-20 [1] CRAN (R 4.4.1)
 tidyr                * 1.3.1     2024-01-24 [1] CRAN (R 4.4.1)
 tidyselect             1.2.1     2024-03-11 [1] CRAN (R 4.4.1)
 tidyverse            * 2.0.0     2023-02-22 [1] CRAN (R 4.4.1)
 timechange             0.3.0     2024-01-18 [1] CRAN (R 4.4.1)
 tzdb                   0.4.0     2023-05-12 [1] CRAN (R 4.4.1)
 UCSC.utils             1.0.0     2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 vctrs                  0.6.5     2023-12-01 [1] CRAN (R 4.4.1)
 withr                  3.0.0     2024-01-16 [1] CRAN (R 4.4.1)
 xfun                   0.51      2025-02-19 [1] CRAN (R 4.4.1)
 XML                    3.99-0.17 2024-06-25 [1] CRAN (R 4.4.1)
 XVector                0.44.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
 yaml                   2.3.9     2024-07-05 [1] CRAN (R 4.4.1)
 zlibbioc               1.50.0    2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)

 [1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.4
 [2] /usr/local/lib/R/site-library
 [3] /usr/lib/R/site-library
 [4] /usr/lib/R/library

──────────────────────────────────────────────────────────────────────────────

References

Almeida-Silva, Fabricio, and Yves Van de Peer. 2024. “Doubletrouble: An r/Bioconductor Package for the Identification, Classification, and Analysis of Gene and Genome Duplications.” bioRxiv, 2024–02.

Chen, Hengchi, Fabricio Almeida-Silva, Garben Logghe, Dries Bonte, and Yves Van de Peer. 2024. “The Rise of Polyploids During Environmental Catastrophes.” bioRxiv, 2024–11.

Van Bel, Michiel, Francesca Silvestri, Eric M Weitz, Lukasz Kreft, Alexander Botzki, Frederik Coppens, and Klaas Vandepoele. 2022. “PLAZA 5.0: Extending the Scope and Power of Comparative and Functional Genomics in Plants.” Nucleic Acids Research 50 (D1): D1468–74.