set.seed(123) # for reproducibility
# Load required packages
library(here)
library(SummarizedExperiment)
library(arrow)
library(tidyverse)
library(rvest)
4 Creating a directory with data for the Shiny app
Here, we will describe the code to create the files that will be required to run the Shiny app. These files will be stored in a directory named app_data
.
4.1 .parquet files in parquet_dir
Gene-level transcript abundances in TPM and bias-corrected counts will be stored in a partitioned .parquet directory, so that expression data can be accessed in the app back-end with Apache Arrow via the BiocStyle::CRANpkg("arrow")
package.
This directory contains partitioned .parquet files with a gene expression data frame in long format with the following variables:
Gene
: character, gene ID.Sample
: character, sample name.TPM
: numeric, gene-level transcript abundances in TPM.Count
: numeric, gene-level transcript abundances in bias-corrected counts.BioProject
: factor, BioProject IDs.Part
: character, plant part.
# Load SummarizedExperiment object
load(here("products", "result_files", "se_atlas_gene.rda"))
# Get expression data in long format
## TPM
<- assay(se_atlas_gene, "gene_TPM") |>
exp_tpm ::melt() |>
reshape2mutate(
Gene = as.character(Var1),
Sample = as.character(Var2),
TPM = as.numeric(value)
|>
) ::select(Gene, Sample, TPM)
dplyr
hist(log_sorted_tpm)
## Counts
<- assay(se_atlas_gene, "gene_counts") |>
exp_counts ::melt() |>
reshape2mutate(
Gene = as.character(Var1),
Sample = as.character(Var2),
Count = as.numeric(value)
|>
) ::select(Gene, Sample, Count)
dplyr
names(exp_counts) <- c("Gene", "Sample", "Count")
## Combine data frames
identical(exp_counts$Sample, exp_tpm$Sample)
identical(exp_counts$Gene, exp_tpm$Gene)
<- cbind(exp_tpm, exp_counts[, "Count", drop = FALSE])
exp_final
# Export data with BioProject and Part info
<- colData(se_atlas_gene) |>
sample_metadata as.data.frame() |>
::rownames_to_column("BioSample")
tibble
<- data.frame(
sample_and_additional_info Sample = sample_metadata$BioSample,
BioProject = sample_metadata$BioProject,
Part = sample_metadata$Part
)
<- left_join(
exp_final2
exp_final,
sample_and_additional_info|>
) mutate(
BioProject = as.factor(BioProject),
Sample = as.factor(Sample),
BioProject = as.factor(BioProject),
Part = as.factor(Part)
)
<- here("app_data", "parquet_dir")
parquet_dir_partitioned ::dir_create(parquet_dir_partitioned)
fs
::write_dataset(
arrow
exp_final2, path = parquet_dir_partitioned,
format = "parquet",
partitioning = c("BioProject", "Part")
)
4.2 expression_by_body_part/
directory
This directory contains static .tsv
files with gene expression information by body part, and it is used by the “Download by body part” tab to avoid having to load large volumes of data in memory, which is problematic when there are too many users.
First, let’s export gene-level abundances.
load(here("products", "result_files", "se_atlas_gene.rda"))
# Get a list of character vectors with samples per body part
<- colData(se_atlas_gene) |>
samples_per_tissue as.data.frame() |>
::rownames_to_column("BioSample")
tibble
<- split(
samples_per_tissue $BioSample, samples_per_tissue$Part
samples_per_tissue
)
# Get expression data frames in TPM
<- assay(se_atlas_gene, "gene_TPM")
tpm <- lapply(samples_per_tissue, function(x) {
tpm_matrices
return(tpm[, x] |> as.data.frame() |> tibble::rownames_to_column("Gene"))
})
# Get expression data frames in counts
<- assay(se_atlas_gene, "gene_counts")
counts <- lapply(samples_per_tissue, function(x) {
count_matrices
return(counts[, x] |> as.data.frame() |> tibble::rownames_to_column("Gene"))
})
# Export data to .tsv files
<- here::here("app_data", "expression_by_body_part")
outdir if(!dir.exists(outdir)) { dir.create(outdir, recursive = TRUE) }
invisible(lapply(seq_along(count_matrices), function(x) {
<- names(count_matrices)[x]
tissue <- file.path(outdir, paste0(tissue, "_count.tsv"))
file
<- readr::write_tsv(
w
count_matrices[[x]], file = file
)return(w)
}))
invisible(lapply(seq_along(tpm_matrices), function(x) {
<- names(tpm_matrices)[x]
tissue <- file.path(outdir, paste0(tissue, "_TPM.tsv"))
file
<- readr::write_tsv(
w
tpm_matrices[[x]], file = file
)return(w)
}))
Now, we will export transcript-level abundances.
load(here("products", "result_files", "se_atlas_transcript.rda"))
# Get expression data frames in TPM
<- assay(se_atlas_transcript, "tx_TPM")
tpm <- lapply(samples_per_tissue, function(x) {
tpm_matrices
return(
|>
tpm[, x] as.data.frame() |>
::rownames_to_column("Transcript")
tibble
)
})
# Get expression data frames in counts
<- assay(se_atlas_transcript, "tx_counts")
counts <- lapply(samples_per_tissue, function(x) {
count_matrices
return(
|>
counts[, x] as.data.frame() |>
::rownames_to_column("Transcript")
tibble
)
})
# Export data to .tsv files
<- here::here("app_data", "expression_by_body_part")
outdir if(!dir.exists(outdir)) { dir.create(outdir, recursive = TRUE) }
invisible(lapply(seq_along(count_matrices), function(x) {
<- names(count_matrices)[x]
tissue <- file.path(outdir, paste0(tissue, "_count_tx.tsv"))
file
<- readr::write_tsv(
w
count_matrices[[x]], file = file
)return(w)
}))
invisible(lapply(seq_along(tpm_matrices), function(x) {
<- names(tpm_matrices)[x]
tissue <- file.path(outdir, paste0(tissue, "_TPM_tx.tsv"))
file
<- readr::write_tsv(
w
tpm_matrices[[x]], file = file
)return(w)
}))
4.3 R objects
The following R objects are small enough to be stored as .rda files, so that they can be directly loaded when the app starts without compromising performance. These .rda
objects will be stored in the data/
directory of the app.
4.3.1 project_metadata.rda
This object stores metadata at the BioProject level.
#' Create a project table to display in the "Search by project" tab
#'
#' @param metadata Data frame of sample metadata.
#'
#' @return A data frame with the variables:
#' \itemize{
#' \item
#' \item
#' }
#' @importFrom dplyr add_count select rename distinct group_by filter
#' summarise arrange
#' @importFrom stringr str_c
#' @noRd
<- function(metadata = NULL) {
create_project_table <- metadata %>%
table ::filter(startsWith(BioProject, "PRJ")) %>%
dplyr::add_count(BioProject) %>%
dplyr::select(BioProject, n, Study_title, Study_abstract) %>%
dplyr::rename(
dplyrN = n,
`Study title` = Study_title,
`Study abstract` = Study_abstract
%>%
) ::distinct()
dplyr
<- metadata %>%
tissue_count ::filter(startsWith(BioProject, "PRJ")) %>%
dplyrgroup_by(BioProject, Part) %>%
summarise(n = n()) %>%
ungroup() %>%
arrange(-n) %>%
group_by(BioProject) %>%
summarise(part_count = stringr::str_c(
": ", n, collapse = " | ")
Part,
)
<- dplyr::inner_join(
final_table by = "BioProject"
table, tissue_count, %>%
) ::rename(Part = part_count) %>%
dplyr::select(
dplyr`Study title`, `Study abstract`
BioProject, N, Part,
)return(final_table)
}
# Combine sample metadata into project metadata
<- create_project_table(sample_metadata)
project_metadata
# Create a data frame with PMID and DOI of publications associated with projects
<- unique(project_metadata$BioProject)
all_bioprojects <- Reduce(rbind, lapply(all_bioprojects, function(x) {
pub_info message(x)
<- read_html(
pubs paste0("https://www.ncbi.nlm.nih.gov/bioproject/?term=", x)
|>
) html_nodes(".RegularLink") |>
html_attr("href")
# Get PMID
<- pubs[grepl("/pubmed/", pubs)]
pmid <- unique(gsub("/pubmed/", "", pmid))
pmid
<- NULL
id_table if(length(pmid) != 0) {
# Use PMID to extract DOI
<- sapply(pmid, function(y) {
doi <- read_html(
d paste0("https://pubmed.ncbi.nlm.nih.gov/", y)
|>
) html_nodes("a") |>
html_attr("href")
<- unique(d[grepl("doi\\.org/", d)])[1]
d return(d)
})
<- data.frame(
id_table BioProject = x,
PMID = pmid,
DOI = doi
)
}
return(id_table)
}))
<- pub_info |>
pub_table mutate(DOI = str_replace_all(DOI, "https://doi.org/", "")) |>
group_by(BioProject) |>
summarise(
DOI = paste0(DOI, collapse = ", "),
PMID = paste0(PMID, collapse = ", ")
|>
) mutate(
DOI = as.factor(DOI),
PMID = as.factor(PMID)
)
<- left_join(project_metadata, pub_table) |>
pmeta ::select(
dplyr`Study title`, `Study abstract`, DOI, PMID
BioProject, N, Part,
)
<- pmeta
project_metadata
# Save object
save(
compress = "xz",
project_metadata, file = here("app_data", "project_metadata.rda")
)
4.3.2 sample_metadata.rda
This file contains a data frame of sample metadata with the following fields:
- BioProject
- BioSample
- Part
- Treatment
- Cultivar
- Study_title
- Study_abstract
- DOI
This file was generated with the following code:
load(here("app_data", "project_metadata.rda"))
# Get a data frame of sample metadata
<- as.data.frame(colData(se_atlas_gene)) |>
sample_metadata ::rownames_to_column("BioSample") |>
tibble::select(
dplyr
BioProject, BioSample, Part, Treatment, Cultivar,
Study_title, Study_abstract|>
) inner_join(
|>
project_metadata ::select(BioProject, DOI)
dplyr
)
# Save to file
save(
compress = "xz",
sample_metadata, file = here("app_data", "sample_metadata.rda")
)
4.3.3 genes.rda
This object contains a character vector of all genes in the Soybean Expression Atlas. Not all genes in the genome are included here, as genes with no detectable expression were not included in the expression matrix.
<- rownames(se_atlas_gene)
genes
save(
compress = "xz",
genes, file = here("app_data", "genes.rda")
)
4.3.4 gene_descriptions.rda
This file contains a 2-column data frame with genes and their short descriptions. Descriptions will be obtained from PLAZA Dicots 5.0.
# Create a data frame of all genes
<- data.frame(
genes_df Gene = sort(genes)
)
# Get descriptions from PLAZA Dicots 5.0
<- read_tsv(
gene_descriptions file.path(
"https://ftp.psb.ugent.be/pub/plaza/",
"plaza_public_dicots_05/Descriptions/gene_description.gma.csv.gz"
),show_col_types = FALSE, skip = 8
|>
) select(
Gene = `#gene_id`, Description = id
|>
) mutate(Description = str_replace(Description, ".* - ", "")) |>
right_join(genes_df) |>
arrange(Gene)
# Save object
save(
compress = "xz",
gene_descriptions, file = here("app_data", "gene_descriptions.rda")
)
4.3.5 tsne_coordinates.rda
This object contains t-SNE coordinates in a data frame with the following variables:
tSNE1
: numeric, x-axis coordinates.tSNE2
: numeric, y-axis coordinates.BioSample
: factor, BioSample ID.Part
: factor, plant part.Treatment
: factor, treatment.Cultivar
: factor, cultivar name.DOI
: factor, publication DOI.
# Load tSNE plot
load(here("products", "plots", "p_tsne_optimal_perplexity.rda"))
load(here("products", "result_files", "sample_metadata.rda"))
# Create data frame
<- p_tsne_optimal_perplexity$data |>
tsne_coordinates ::rownames_to_column("BioSample") |>
tibblerename(
tSNE1 = X,
tSNE2 = Y,
Part = colour_by
|>
) select(tSNE1, tSNE2, BioSample, Part) |>
mutate(
Part = str_to_title(Part),
Part = as.factor(Part),
BioSample = as.factor(BioSample)
|>
) inner_join(
|>
sample_metadata select(BioSample, Treatment, Cultivar, DOI)
)
# Save object
save(
compress = "xz",
tsne_coordinates, file = here("app_data", "tsne_coordinates.rda")
)
4.3.6 gene_metadata.rda
This file contains a data frame with metadata on genes, including \(\tau\) indices of tissue-specificity, expression-based classification, specific parts where the gene is expressed, protein domains, and a1.v1 IDs.
<- "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05"
base_url
# Get ID correspondence between a4.v1 and a1.v1
<- read_tsv(
id_correspondence file.path(base_url, "IdConversion/id_conversion.gma.csv.gz"),
skip = 8,
show_col_types = FALSE
|>
) filter(id_type == "synonym") |>
select(Gene = `#gene_id`, ID_a1.v1 = id)
# Get InterPro domain annotation from PLAZA 5.0 Dicots
<- read_tsv(
interpro file.path(base_url, "InterPro/interpro.gma.csv.gz"), skip = 8
|>
) select(Gene = `#gene_id`, Domain = motif_id) |>
group_by(Gene) |>
summarise(Domain = str_c(Domain, collapse = ","))
# Get expression groups info
load(here("products", "result_files", "final_classified_genes.rda"))
<- final_classified_genes |>
expression_groups mutate(
Classification = as.factor(Classification),
Specific_parts = replace_na(Specific_parts, "-"),
Specific_parts = as.factor(Specific_parts)
)
# Combine everything in a single data frame
<- expression_groups |>
gene_metadata left_join(interpro) |>
left_join(id_correspondence) |>
mutate(
Domain = replace_na(Domain, "-"),
ID_a1.v1 = replace_na(ID_a1.v1, "-"),
Tau = signif(Tau, 3)
|>
) as.data.frame()
save(
compress = "xz",
gene_metadata, file = here("app_data", "gene_metadata.rda")
)
Session information
This document was created under the following conditions:
::session_info() sessioninfo
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 20.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2023-06-23
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
evaluate 0.20 2023-01-17 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.3.0)
knitr 1.42 2023-01-25 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────