set.seed(123) # for reproducibility
# Load required packages
library(here)
library(bears)
library(tidyverse)
1 Creating the Soybean Expression Atlas v2
Here, I will describe the code I used to create the Soybean Expression Atlas v2 using the R package bears.
1.1 Downloading data
We will download samples that have not been downloaded yet. The data frames with sample metadata are stored in the data/ directory.
options(timeout = 1e+10)

#----Get sample metadata--------------------------------------------------------
term <- "Glycine max[ORGN] AND RNA-seq[STRA]"
metadata_all <- create_sample_info(term, retmax = 10000)

# Remove unsupported technologies
metadata_all <- metadata_all[!grepl(
    "454|SOLiD|PacBio|Torrent|AB 310|Complete Genomics|ION",
    metadata_all$Instrument
), ]

## Save metadata object
save(
    metadata_all,
    file = here("data", "metadata_all.rda"),
    compress = "xz"
)

## Create directory structure
rootdir <- here("results")
ds <- create_dir_structure(rootdir = rootdir)
save(ds, file = here("data", "ds.rda"), compress = "xz")

#----Downloading samples----------------------------------------------------
urls <- get_url_ena(metadata_all)
download <- download_from_ena(urls = urls, fastqdir = ds$fastqdir)
Now, let’s check the file integrity of downloaded files. Here, I will do it for the whole atlas, including previous versions, just as a sanity check.
#----Check file integrity with md5sum check-------------------------------------
# Remove runs that were not downloaded
downloaded <- fastq_exists(metadata_all, fastqdir = ds$fastqdir)
downloaded <- downloaded$Run[!is.na(downloaded$Status)]
metadata_atlas_v2 <- metadata_all[metadata_all$Run %in% downloaded, ]

## Check md5sum
integrity <- check_md5(
    run_accessions = metadata_all$Run,
    fastqdir = ds$fastqdir
)
failed_corrupt <- integrity[integrity$Status == FALSE, "Run"]
failed_corrupt <- unique(as.character(failed_corrupt))
The failed_corrupt object is a character vector containing run accessions that failed the integrity check and, thus, must be re-downloaded. Now, let's also check for files that were not downloaded at all. Here, we will only consider for re-download runs that belong to BioProjects with at least one successfully downloaded run. In other words, if none of the runs of a BioProject were downloaded, I will ignore them (they are probably not available on EBI).
# Get download status
dstatus <- fastq_exists(metadata_all, fastqdir = ds$fastqdir)

# Get BioProject info of failed runs
failed_bioproject <- dstatus %>%
    full_join(., metadata_all)

# Get BioProjects where the fraction of missing runs (m) satisfies 0 < m < 1
bioprojects_download <- failed_bioproject %>%
    group_by(BioProject) %>%
    summarise(perc = sum(is.na(Status)) / length(BioProject)) %>%
    filter(perc != 0 & perc != 1)

# Get vector of runs from these BioProjects to re-download
failed_nd <- failed_bioproject %>%
    filter(is.na(Status) & BioProject %in% bioprojects_download$BioProject) %>%
    select(Run)
failed_nd <- unique(as.character(failed_nd$Run))
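As a quick sanity check (not part of the original output), we can count how many runs fall into each category before re-downloading:

# Number of corrupt runs and runs that were never downloaded
length(failed_corrupt)
length(failed_nd)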
In the end, 33 runs failed the integrity check (i.e., they were downloaded, but their md5sums differed from the reference), and 33 runs failed to download at all (i.e., other runs from the same BioProject were downloaded, but these were not). Re-downloading failed runs:
# Metadata data frame of failed runs only
todownload <- c(failed_corrupt, failed_nd)
metadata_failed <- metadata_all[metadata_all$Run %in% todownload, ]

# Download again
urls_missing <- get_url_ena(metadata_failed)

options(timeout = 1e10)
download_missing <- download_from_ena(
    metadata_failed,
    urls = urls_missing,
    fastqdir = ds$fastqdir
)
Done! Finally, let’s create a final metadata data frame with all samples we have:
dstatus <- fastq_exists(metadata_all, fastqdir = ds$fastqdir)
ok_runs <- unique(dstatus$Run[dstatus$Status == "OK"])

metadata_atlas_v2_downloaded <- metadata_all[metadata_all$Run %in% ok_runs, ]

## Save a data frame containing metadata for all samples of SEA v2
save(
    metadata_atlas_v2_downloaded,
    file = here::here("data", "metadata_atlas_v2_downloaded.rda"),
    compress = "xz"
)
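If the analysis is resumed in a fresh R session, the saved object can be restored before proceeding (a minimal sketch; the path matches the save() call above):

# Restore the metadata of successfully downloaded samples
load(here("data", "metadata_atlas_v2_downloaded.rda"))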
1.2 Sequence QC and filtering
Now, we will remove sequence adapters and low-quality bases with fastp.
metadata_atlas_v2 <- metadata_atlas_v2_downloaded

runs_fastp <- fastq_exists(metadata_atlas_v2, ds$fastqdir) %>%
    dplyr::filter(!is.na(Status)) %>%
    dplyr::pull(Run)

metadata_fastp <- metadata_atlas_v2[metadata_atlas_v2$Run %in% runs_fastp, ]

# Run fastp
fastp_status <- trim_reads(
    metadata_fastp,
    fastqdir = ds$fastqdir,
    filtdir = ds$filtdir,
    qcdir = ds$qcdir,
    threads = 16,
    delete_raw = TRUE
)

# Get a metadata data frame with only reads that have undergone filtering
filtered_reads <- unique(
    gsub(
        "(\\.fastq.*)|(_.*)", "",
        basename(list.files(ds$filtdir, pattern = "fastq.gz"))
    )
)
metadata_atlas_v2_filtered <- metadata_atlas_v2[
    metadata_atlas_v2$Run %in% filtered_reads,
]

save(
    metadata_atlas_v2_filtered,
    file = here("data", "metadata_atlas_v2_filtered.rda"),
    compress = "xz"
)
Now, let’s import and save read filtering stats.
# Get a data frame of summary stats from fastp
fastp_stats <- summary_stats_fastp(ds$qcdir)

save(
    fastp_stats,
    file = here("products", "result_files", "fastp_stats.rda"),
    compress = "xz"
)
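Before applying thresholds, it can help to inspect the distributions of the two statistics used in the next step (an optional check, not part of the original pipeline):

# Post-filtering mean read length and Q20 rate across samples
summary(fastp_stats$after_meanlength)
summary(fastp_stats$after_q20rate)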
Finally, we will remove low-quality files based on the following criteria:
- Mean read length after filtering <40
- Q20 rate after filtering <80%
# Remove files with mean length <40 or Q20 rate <80% after filtering
keep <- fastp_stats %>%
    filter(after_meanlength >= 40) %>%
    filter(after_q20rate >= 0.8) %>%
    pull(Sample)

filtered_metadata <- metadata_atlas_v2_filtered[
    metadata_atlas_v2_filtered$Run %in% keep,
]
rownames(filtered_metadata) <- 1:nrow(filtered_metadata)
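As a quick, optional sanity check, we can count how many runs survived the read-level filters:

# Number of runs before and after the quality-based removal
nrow(metadata_atlas_v2_filtered)
nrow(filtered_metadata)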
1.3 Quantifying transcript abundance
Next, we will quantify transcript abundance with salmon. To do that, we first need to index the reference transcriptome.
# Index transcriptome
transcriptome_path <- here("data", "gmax_transcriptome.fa.gz")

idx_salmon <- salmon_index(
    salmonindex = ds$salmonindex,
    transcriptome_path = transcriptome_path
)
idx_salmon
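Optionally, one can verify that the reference FASTA loads correctly and count its transcripts. This is a sketch assuming the Bioconductor package Biostrings is installed; it is not part of the original pipeline:

# Read the (gzipped) FASTA and count transcript sequences
tx <- Biostrings::readDNAStringSet(transcriptome_path)
length(tx)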
Then, we can quantify transcript abundance.
# Quantify transcript abundance
quant_salmon <- salmon_quantify(
    filtered_metadata,
    filtdir = ds$filtdir,
    salmonindex = ds$salmonindex,
    salmondir = ds$salmondir,
    threads = 40
)

# Check percentage of samples that ran successfully
n_ok <- nrow(quant_salmon[!is.na(quant_salmon$status), ])
n_ok / nrow(quant_salmon)
salmon ran successfully for 100% of the samples. Great! Now, let's obtain mapping rates for each BioSample to see whether or not we need to discard samples. Here, we will remove samples with mapping rate <50% (i.e., samples in which less than 50% of the reads mapped to the reference transcriptome).
# Get a data frame of mapping rate per BioSample
biosamples <- unique(filtered_metadata$BioSample)
mapping_rate <- summary_stats_salmon(ds$salmondir, biosamples)

save(
    mapping_rate,
    file = here("products", "result_files", "mapping_rate_salmon.rda"),
    compress = "xz"
)

# Remove BioSamples with mapping rate <50%
biosamples_to_keep <- mapping_rate %>%
    filter(Mapping_rate >= 50) %>%
    pull(BioSample)

# Update metadata data frame to keep only samples that passed the filtering step
final_metadata_atlas_v2 <- filtered_metadata %>%
    filter(BioSample %in% biosamples_to_keep)
82% of the samples (5481/6644) passed the filtering step (i.e., had mapping rates >=50%). Thus, this is the final number of samples in the Soybean Expression Atlas v2.
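That percentage can be recomputed directly from the objects above (a sketch, assuming summary_stats_salmon returns one row per BioSample):

# Fraction of BioSamples with mapping rate >= 50%
length(biosamples_to_keep) / length(biosamples)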
To conclude, let’s just fix and standardize tissue names in the metadata data frame and save it.
final_metadata_classified_tissues <- final_metadata_atlas_v2 %>%
    mutate(Tissue_PO = str_to_lower(Tissue)) %>%
    mutate(Tissue_PO = str_replace_all(
        Tissue_PO,
        c(
            ".*nodule.*" = "nodule",
            ".*leaves and roots.*" = "whole plant",
            ".*leaves.*" = "leaf",
            ".*leaf.*" = "leaf",
            ".*trifoliate.*" = "leaf",
            ".*seed coat.*" = "seed coat",
            ".*crown.*" = "root",
            ".*shoot.*" = "shoot",
            ".*stem.*" = "shoot",
            ".*hypocotyl.*" = "hypocotyl",
            ".*pod.*" = "pod",
            ".*root.*" = "root",
            ".*embryo.*" = "embryo",
            ".*seedling.*" = "seedling",
            ".*cytoledon.*" = "cotyledon", # catches a misspelling of 'cotyledon'
            ".*cotyledon.*" = "cotyledon",
            ".*seed.*" = "seed",
            ".*flower.*" = "flower",
            ".*petiole.*" = "petiole",
            ".*meristem.*" = "meristem",
            ".*radicle.*" = "radicle",
            ".*epicotyl.*" = "epicotyl",
            ".*whole.*" = "whole plant",
            ".*roots + leaves.*" = "whole plant",
            ".*sprout.*" = "seedling",
            ".*ovule.*" = "flower",
            ".*suspensor.*" = "suspensor",
            ".*floral.*" = "flower",
            ".*leaf bud.*" = "leaf",
            ".*anther.*" = "flower",
            ".*stem and leaf.*" = "whole plant",
            ".*ovary.*" = "flower"
        )
    ))
write_tsv(
    final_metadata_classified_tissues,
    file = here("products", "tables", "final_metadata_classified_tissues.tsv")
)
The file final_metadata_classified_tissues.tsv was manually edited to include classifications for missing tissues, and then it was renamed to final_metadata_classified_atlas_v2.tsv. This file only contains BioSample-level information (runs were not considered).
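For reference, a BioSample-level table like that one can be derived from the run-level metadata by keeping a single row per BioSample (a minimal sketch; the manual curation of tissue labels is not reproduced here):

# Collapse run-level metadata to one row per BioSample
biosample_metadata <- final_metadata_classified_tissues %>%
    dplyr::distinct(BioSample, .keep_all = TRUE)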
Session information
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 20.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2023-06-23
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
evaluate 0.20 2023-01-17 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.3.0)
knitr 1.42 2023-01-25 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────