library(SummarizedExperiment)
library(tidyverse)
# Get count matrix
# Read the tab-separated count table from the Wendellab GitHub repository,
# using the first column as row names, and drop the first five columns.
# NOTE(review): the five dropped columns are presumably annotation columns
# (not sample counts) -- confirm against the remote file.
counts <- read.table(
  file.path(
    "https://raw.githubusercontent.com/Wendellab",
    "SaltStressTranscriptome/master/salt.counts"
  ),
  header = TRUE, sep = "\t", row.names = 1
)[, -c(1:5)]

# Rename "sort.bam" columns to "sort.T.bam", then strip the ".sort"/".bam"
# suffixes and the "_5.22"/"_5.23"/"6.11" tags from the column names.
# NOTE(review): the dots in these patterns are unescaped, so they match any
# character; left unchanged to preserve the resulting names.
names(counts) <- gsub("sort.bam", "sort.T.bam", names(counts))
names(counts) <- gsub(".sort|.bam|_5.22|_5.23|6.11", "", names(counts))
# Get subset of data for each species
# Keep only the columns whose names contain "A2".
A2count <- counts[, grep("A2", names(counts))]

# Rebuild the A2 table group by group. For Control 1, Control 3, Salt 2 and
# Salt 3, two adjacent five-column blocks are added element-wise; Control 2
# and Salt 1 are taken as-is by name.
# NOTE(review): the summed blocks presumably hold two runs of the same
# libraries -- confirm. The hard-coded indices rely on the column order of
# the remote file.
A2count <- cbind(
  A2count[, 1:5] + A2count[, 6:10], # Control 1
  A2count[, grep("CK2", names(A2count))], # Control 2
  A2count[, 16:20] + A2count[, 21:25], # Control 3
  A2count[, grep("Salt1", names(A2count))], # Salt 1
  A2count[, 31:35] + A2count[, 36:40], # Salt 2
  A2count[, 41:45] + A2count[, 46:50] # Salt 3
)
# Combine count matrices
# Bind the per-species count tables column-wise into a single table.
# NOTE(review): D5count, TM1count and AD4count are not defined in the code
# shown here; they must exist (presumably built like A2count) before this
# line runs.
c_counts <- cbind(A2count, D5count, TM1count, AD4count)
# Get total counts
# Diploids (A2, D5): keep the columns whose names end in "T".
diploid_total <- c_counts[, grep("A2.*T$|D5.*T$", names(c_counts))]

# TM1: select the columns ending in "A" or "D", then add every odd-numbered
# column to the even-numbered column that follows it (12 columns -> 6).
# NOTE(review): this assumes the A and D columns of each sample are adjacent
# -- confirm the column order.
TM1_total <- c_counts[, grep("TM1.*A$|TM1.*D$", names(c_counts))]
TM1_total <- TM1_total[, seq(1, 11, by = 2)] + TM1_total[, seq(2, 12, by = 2)]

# AD4: same pairwise sum as for TM1.
AD4_total <- c_counts[, grep("AD4.*A$|AD4.*D$", names(c_counts))]
AD4_total <- AD4_total[, seq(1, 11, by = 2)] + AD4_total[, seq(2, 12, by = 2)]

# Final table: diploid, TM1 and AD4 totals side by side.
counts_total <- cbind(diploid_total, TM1_total, AD4_total)
# Polish column names
# Inner call: drop the ".T"/".A" (optionally "_"-prefixed) suffixes.
# Outer call: relabel "TM1" as "AD1".
names(counts_total) <- gsub(
  "TM1", "AD1",
  gsub("\\.T|\\.A|_\\.T|_\\.A", "", names(counts_total))
)
# Create colData
# One row per column of `counts_total`: 4 species x 2 conditions x 3
# replicates = 24 samples, in the column order of `counts_total`.
coldata <- data.frame(
  row.names = names(counts_total),
  species = rep(c("A2", "D5", "AD1", "AD4"), each = 6),
  condition = rep(rep(c("Control", "Salt"), each = 3), 4),
  rep = rep(1:3, 8)
) |>
  dplyr::mutate(
    # Group label combining species and condition, e.g. "A2_Control"
    sample = paste(species, condition, sep = "_"),
    species_name = rep(
      c("Garboreum", "Graimondii", "Ghirsutum_TM1", "Gmustelinum"),
      each = 6
    ),
    # First 12 samples (A2, D5) are "di"; last 12 (AD1, AD4) are "allo"
    ploidy = rep(c("di", "allo"), each = 12)
  ) |>
  # Explicit namespace avoids `select()` being masked by other packages
  dplyr::select(species_name, species, ploidy, condition, sample, rep)
# Creating the SummarizedExperiment object
# The count table is converted to a matrix and stored as the "counts" assay;
# `coldata` supplies the per-sample metadata.
se_cotton <- SummarizedExperiment(
  assays = list(counts = as.matrix(counts_total)),
  colData = coldata
)
# Save object to file
# Writes `se_cotton` to data/se_cotton.rda with xz compression; the path is
# built with here::here().
save(
  se_cotton,
  file = here::here("data", "se_cotton.rda"),
  compress = "xz"
)
Appendix: Data acquisition
Here, you can find the code used to obtain the benchmark data.
se_cotton.rda
This data set was obtained from Dong et al. (2022), and it comprises RNA-seq data on cotton (Gossypium) species of different ploidy levels (i.e., allopolyploids and their diploid progenitors) under salt stress. The `SummarizedExperiment` object was created with the code below:
cotton_functions.rda
This object contains a list of data frames with GO, InterPro, and MapMan annotations for genes in the G. raimondii genome. Data were obtained from PLAZA Dicots 5.0 (Van Bel et al. 2022).
# Get data frames
# Each PLAZA Dicots 5.0 annotation file is read as TSV after skipping the
# first 8 lines, then reduced to two columns: `gene` and `description`.
# NOTE(review): the 8 skipped lines are presumably a comment header in the
# PLAZA files -- confirm.

# GO annotation
go_df <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/GO/go.gra.csv.gz",
  skip = 8
) |>
  dplyr::select(gene = `#gene_id`, description)

# InterPro annotation
interpro_df <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/InterPro/interpro.gra.csv.gz",
  skip = 8
) |>
  dplyr::select(gene = `#gene_id`, description)

# MapMan annotation (its columns are named `gene_id` and `desc`)
mapman_df <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/MapMan/mapman.gra.csv.gz",
  skip = 8
) |>
  dplyr::select(gene = gene_id, description = desc)
# Create list
# Named list holding one annotation data frame per source.
cotton_functions <- list(
  GO = go_df,
  InterPro = interpro_df,
  MapMan = mapman_df
)
# Save object to .rda file
# `here` is never attached with library(), so the function is called with an
# explicit namespace, as in the other save() calls of this script.
save(
  cotton_functions,
  compress = "xz",
  file = here::here("data", "cotton_functions.rda")
)
se_rice.rda
This file contains a `SummarizedExperiment` object with data from Zhai et al. (2013), obtained from GEO under accession number GSE41797.
# Read data set from GEO
# Downloads the gzipped, tab-separated results table of series GSE41797.
rice <- readr::read_tsv(
  "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41797/suppl/GSE41797%5Frice%5Fall%5Fresults.txt.gz"
)
# Get count matrix
# Keep the gene ID and the 12 sample columns, move the gene IDs into the row
# names, and convert the result to a matrix.
rice_exp <- rice |>
  dplyr::select(
    Gene_id, R1, R2, R3, R4, X1, X2, X3, X4, F1, F2, F3, F4
  ) |>
  tibble::column_to_rownames("Gene_id") |>
  as.matrix()
# Get sample metadata
# One row per column of `rice_exp` (12 samples): three lines with four
# samples each, in the column order R1-R4, X1-X4, F1-F4.
rice_coldata <- data.frame(
  row.names = colnames(rice_exp),
  Line = rep(c("R9308", "Xieqingzao B", "Xieyou 9308"), each = 4),
  # Within each line: two "Tillering" samples followed by two "Heading"
  Stage = rep(c("Tillering", "Tillering", "Heading", "Heading"), 3),
  Generation = rep(c("P1", "P2", "F1"), each = 4)
)
# Create SummarizedExperiment object
# `rice_exp` is stored as the "counts" assay; `rice_coldata` supplies the
# per-sample metadata.
se_rice <- SummarizedExperiment::SummarizedExperiment(
  assays = list(counts = rice_exp),
  colData = rice_coldata
)
# Save object to file
# Writes `se_rice` to data/se_rice.rda with xz compression; the path is
# built with here::here().
save(
  se_rice,
  file = here::here("data", "se_rice.rda"),
  compress = "xz"
)
rice_functions.rda
This object contains a list of 2-column data frames with functional annotation for rice (Oryza sativa ssp. japonica). List names are `GO`, `InterPro`, and `MapMan`, and each table has columns named `gene` (gene ID as in the count matrix in `se_rice`) and `description` (term description).
# Get a table of tx-to-gene mapping
# Read the PLAZA Monocots 5.0 ID conversion table (first 8 lines skipped),
# keep only rows with id_type "tid", and return two columns: `tx` (the
# converted ID) and `gene` (the PLAZA gene ID).
# NOTE(review): "tid" presumably stands for transcript ID -- confirm.
tx2gene <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/IdConversion/id_conversion.osa.csv.gz",
  skip = 8, show_col_types = FALSE
) |>
  dplyr::filter(id_type == "tid") |>
  dplyr::select(tx = id, gene = `#gene_id`)
# Get functional annotation
# Each table is read from PLAZA Monocots 5.0 (first 8 lines skipped), reduced
# to `gene` + `description`, and joined to `tx2gene` so that the final `gene`
# column holds the ID from `tx2gene$tx` instead of the PLAZA gene ID.
# The join key is given explicitly (`by = "gene"`): it is the only column
# shared by both tables, so the result is unchanged and the "Joining with"
# message is avoided.

## GO
go_df <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/GO/go.osa.csv.gz",
  skip = 8, show_col_types = FALSE
) |>
  dplyr::select(gene = `#gene_id`, description) |>
  dplyr::inner_join(tx2gene, by = "gene") |>
  dplyr::select(gene = tx, description)

## InterPro
interpro_df <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/InterPro/interpro.osa.csv.gz",
  skip = 8, show_col_types = FALSE
) |>
  dplyr::select(gene = `#gene_id`, description) |>
  dplyr::inner_join(tx2gene, by = "gene") |>
  dplyr::select(gene = tx, description)

## MapMan
mapman_df <- readr::read_tsv(
  "https://ftp.psb.ugent.be/pub/plaza/plaza_public_monocots_05/MapMan/mapman.osa.csv.gz",
  skip = 8, show_col_types = FALSE
) |>
  dplyr::select(gene = gene_id, description = desc) |>
  dplyr::inner_join(tx2gene, by = "gene") |>
  dplyr::select(gene = tx, description)
# Create list
# Named list holding one annotation data frame per source.
rice_functions <- list(
  GO = go_df,
  InterPro = interpro_df,
  MapMan = mapman_df
)
# Save object to file
# Writes `rice_functions` to data/rice_functions.rda with xz compression; the
# path is built with here::here().
save(
  rice_functions,
  file = here::here("data", "rice_functions.rda"),
  compress = "xz"
)
Session info
This document was created under the following conditions:
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.2 (2023-10-31)
os Ubuntu 22.04.3 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2024-02-19
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.2)
digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.2)
evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.2)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.2)
htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.2)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.2)
jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.2)
knitr 1.45 2023-10-30 [1] CRAN (R 4.3.2)
rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.2)
rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.2)
rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.2)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.2)
xfun 0.42 2024-02-08 [1] CRAN (R 4.3.2)
yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.2)
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────