set.seed(123)
# Required packages
library(BioNERO)
library(SummarizedExperiment)
library(tidyverse)
library(here)
Appendix: Data acquisition
Here, I will describe how I obtained each example data set used in this course.
se_PRJNA800609.rda
This experiment comprises soybean pods infected with Colletotrichum truncatum, and data were downloaded from The Soybean Expression Atlas v2 using the “Download by project” tab.
# Load expression matrix
<- read.csv(
exp_matrix "~/Downloads/PRJNA800609_TPM.tsv", header = TRUE, sep = "\t",
row.names = 1
)
# Load sample metadata
<- read.csv(
sample_metadata "~/Downloads/PRJNA800609_metadata.tsv", header = TRUE, sep = "\t"
|>
) inner_join(
read.csv(
"~/Downloads/SraRunTable.txt", header = TRUE
) |>
) select(BioSample, Part, Cultivar, Name = `Sample.Name`) |>
mutate(
Treatment = case_when(
str_detect(Name, "CK") ~ "control",
str_detect(Name, "IN") ~ "infected"
),Timepoint = str_extract(Name, "[0-9]*h")
|>
) select(-Name) |>
column_to_rownames("BioSample")
# Create a SummarizedExperiment object
<- SummarizedExperiment(
se_PRJNA800609 assays = list(TPM = exp_matrix),
colData = sample_metadata[colnames(exp_matrix), ]
|>
) ::remove_nonexp(method = "median", min_exp = 1)
BioNERO
# Save object to .rda file
save(
compress = "xz",
se_PRJNA800609, file = here::here("data", "se_PRJNA800609.rda")
)
se_PRJNA574764.rda
This experiment comprises soybean roots infected with Phytophthora sojae, and data were downloaded from The Soybean Expression Atlas v2 using the “Download by project” tab.
# Load expression matrix
<- read.csv(
exp_matrix "~/Downloads/PRJNA574764_TPM.tsv", header = TRUE, sep = "\t",
row.names = 1
)
# Load sample metadata
<- read.csv(
sample_metadata "~/Downloads/PRJNA574764_metadata.tsv", header = TRUE, sep = "\t"
|>
) inner_join(
read.csv(
"~/Downloads/SraRunTable_PRJNA574764.txt", header = TRUE
) |>
) select(BioSample, Part, Cultivar, Age) |>
mutate(
Treatment = case_when(
str_detect(Age, "0 dpi") ~ "control", TRUE ~ "infected"
),Timepoint = str_replace_all(Age, " rep.*", "")
|>
) select(-Age) |>
column_to_rownames("BioSample")
# Create a SummarizedExperiment object
<- SummarizedExperiment(
se_PRJNA574764 assays = list(TPM = exp_matrix),
colData = sample_metadata[colnames(exp_matrix), ]
|>
) ::remove_nonexp(method = "median", min_exp = 1)
BioNERO
# Save object to .rda file
save(
compress = "xz",
se_PRJNA574764, file = here::here("data", "se_PRJNA574764.rda")
)
gma_annotation.rda
This object is a list of data frames with the following elements:
MapMan
: A 2-column data frame with gene IDs and their associated MapMan bins.InterPro
: A 2-column data frame with gene IDs and their associated InterPro protein domains.
# Get MapMan annotation
<- readr::read_tsv(
gma_mapman "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/MapMan/mapman.gma.csv.gz",
skip = 8, show_col_types = FALSE
|>
) ::select(Gene = gene_id, MapMan = desc) |>
dplyr::distinct() |>
dplyras.data.frame()
# Get InterPro annotation
<- readr::read_tsv(
gma_interpro "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/InterPro/interpro.gma.csv.gz",
skip = 8, show_col_types = FALSE
|>
) ::select(Gene = `#gene_id`, Interpro = description) |>
dplyr::distinct() |>
dplyras.data.frame()
# Combine annotations in a list and save it
<- list(MapMan = gma_mapman, InterPro = gma_interpro)
gma_annotation
save(
compress = "xz",
gma_annotation, file = here("data", "gma_annotation.rda")
)
gcn_soyfungi.rda, and se_soyfungi.rda
These files were obtained from Almeida-Silva and Venancio (2021), and they contain:
se_soyfungi
: ASummarizedExperiment
object containing gene expression data on soybean tissues infected with multiple pathogenic fungi. Data were obtained the Soybean Expression Atlas (Almeida-Silva, Pedrosa-Silva, and Venancio 2023) and processed with BioNERO (Almeida-Silva and Venancio 2022).gcn_soyfungi
: A gene coexpression network inferred withBioNERO::exp2gcn()
usingse_soyfungi
as input.
These files were created with the code below. First, I identified which BioProjects to download from the Soybean Expression Atlas.
# Get a data frame of sample metadata for fungi-infected samples
<- "https://github.com/almeidasilvaf/SoyFungi_GWAS_GCN/raw/main/data/"
burl
<- bind_rows(
fungi_samples read_tsv(file.path(burl, "atlas_metadata_stress_samples.tsv")),
read_tsv(file.path(burl, "newsamples_metadata_stress_samples.tsv"))
|>
) filter(Stress_info == "fungus")
# List BioProjects to download
unique(fungi_samples$BioProject)
Next, I downloaded the TPM-normalized expression matrices for each BioProject by searching in the Download by project tab. Then, I loaded the expression matrices to the R session, created a SummarizedExperiment
object, and processed it.
# Load expression data
<- list.files("~/Downloads", pattern = "PRJ", full.names = TRUE)
exp_files <- lapply(exp_files, function(x) {
exp_list return(read.csv(x, header = TRUE, sep = "\t"))
})<- Reduce(function(x, y) merge(x, y, by = "Gene"), exp_list) |>
exp ::column_to_rownames("Gene")
tibble
# Keep only samples present in both `exp` and `fungi_samples$BioSample`
<- intersect(names(exp), unique(fungi_samples$BioSample))
shared <- exp[, shared]
exp
# Create a data frame of sample metadata
<- fungi_samples |>
coldata filter(BioSample %in% shared) |>
column_to_rownames("BioSample") |>
select(Pathogen, Treatment = Sample_description, Tissue)
<- coldata[names(exp), ] # reorder rows based on colnames of `exp`
coldata
# Create `SummarizedExperiment` object
<- SummarizedExperiment(
se_soyfungi assays = list(exp_TPM = exp),
colData = coldata
)
# Process the `SummarizedExperiment` object
<- BioNERO::exp_preprocess(
se_soyfungi
se_soyfungi, min_exp = 5,
Zk_filtering = FALSE
)
# Add another metadata variable that combines `Pathogen` and `Treatment`
$Pathogen_Treatment <- paste0(
se_soyfungi$Pathogen, "_", se_soyfungi$Treatment
se_soyfungi
)
# Save object
save(
compress = "xz",
se_soyfungi, file = here("data", "se_soyfungi.rda")
)
Finally, I used the process expression data to infer a GCN.
# Infer GCN
<- SFT_fit(
sft
se_soyfungi, net_type = "unsigned",
cor_method = "pearson"
)
<- exp2gcn(
gcn_soyfungi
se_soyfungi, net_type = "unsigned",
cor_method = "pearson",
SFTpower = sft$power
)
# Save network
$adjacency_matrix <- NULL # to reduce file size
gcn_soyfungi$correlation_matrix <- NULL # to reduce file size
gcn_soyfungi
save(
compress = "xz",
gcn_soyfungi, file = here("data", "gcn_soyfungi.rda")
)
snps_soyfungi.rda
This object contains a GRangesList
with genomic coordinates of soybean SNPs associated with resistance to phytopathogenic fungi, and they were obtained from Almeida-Silva and Venancio (2021) with the following code:
# Download file
<- "https://github.com/almeidasilvaf/SoyFungi_GWAS_GCN/raw/main/products/result_files/snp_granges.rda"
furl <- tempfile(fileext = ".rda")
outfile
download.file(
destfile = outfile
furl,
)
# Load file
load(outfile)
<- snp_grangeslist
snps_soyfungi
# Save object
save(
compress = "xz",
snps_soyfungi, file = here("data", "snps_soyfungi.rda")
)
guides_soyfungi.rda
This object contains a data frame of resistance-related guide genes, and it was obtained from Almeida-Silva and Venancio (2021) with the following code:
# Get data of guide genes
<- read_tsv(
guides_soyfungi "https://github.com/almeidasilvaf/SoyFungi_GWAS_GCN/raw/main/products/tables/sup_table3.tsv",
show_col_types = FALSE
)
# Save object
save(
compress = "xz",
guides_soyfungi, file = here("data", "guides_soyfungi.rda")
)
gma_primary_transcripts.gff.gz
This file contains genomic ranges for primary transcripts.
# Download GFF file
download.file(
"https://github.com/almeidasilvaf/SoyFungi_GWAS_GCN/raw/main/data/PLAZA_selected.transcripts.gff.gz",
destfile = here("data", "gma_primary_transcripts.gff3.gz")
)
gma_chrlen.rda
This file contains a GRanges
object with chromosome lengths for the soybean genome, and it was created with the code below.
# Download file temporarily
<- "https://github.com/almeidasilvaf/SoyFungi_GWAS_GCN/raw/main/data/chr_size_soybean.rda"
url <- tempfile(fileext = ".rda")
out download.file(url, destfile = out)
# Load object and create GRanges
load(out)
<- data.frame(
gma_chrlen seqnames = rownames(chr_size),
start = 1,
end = chr_size[, 1]
)<- GenomicRanges::makeGRangesFromDataFrame(gma_chrlen)
gma_chrlen
# Save object
save(
compress = "xz",
gma_chrlen, file = here("data", "gma_chrlen.rda")
)
gma_tfs.rda
This object is a character vector of gene IDs for soybean TFs, and it was obtained with the following code:
# Get a vector of TF gene IDs
<- readr::read_tsv(
gma_tfs "http://planttfdb.gao-lab.org/download/TF_list/Gma_TF_list.txt.gz"
|>
) ::pull(Gene_ID) |>
dplyrunique()
# Save object
save(
compress = "xz",
gma_tfs, file = here("data", "gma_tfs.rda")
)