library(tidyverse)
library(patchwork)
library(here)
library(maps)
library(magick)
library(ggplotify)
library(ggpubr)
set.seed(123)
# ---- Figure 1 ----
## Load data
load(here("data", "metadata_atlas_v2_downloaded.rda"))
load(here("products", "result_files", "fastp_stats.rda"))
load(here("products", "result_files", "mapping_rate_salmon.rda"))
<- left_join(
fastp_sample_stats c("BioSample", "Run")],
fastp_stats, metadata_atlas_v2_downloaded[, by = c("Sample" = "Run")
|>
) distinct(BioSample, .keep_all = TRUE)
##----1A: Workflow----
<- ggplot() + theme_void() # workflow image will be manually added later
p_1a
##----1B: Barplot of samples that passed and failed QC----
<- data.frame(
data_1b Group = c("Initial", "Checkpoint 1", "Checkpoint 2"),
Frequency = c(
# Initial
length(unique(metadata_atlas_v2_downloaded$BioSample)),
# Passed checkpoint 1 (sequence QC)
|>
fastp_stats filter(after_meanlength >= 40) |>
filter(after_q20rate >= 0.8) |>
left_join(metadata_atlas_v2_downloaded, by = c("Sample" = "Run")) |>
pull(BioSample) |>
unique() |>
length(),
# Passed checkpoint 2 (mapping rates)
|>
mapping_rate filter(Mapping_rate >= 50) |>
pull(BioSample) |>
unique() |>
length()
)|>
) mutate(Difference = c(NA, diff(Frequency)))
<- ggplot(data_1b, aes(y = fct_reorder(Group, Frequency), x = Frequency)) +
p_1b geom_bar(
stat = "identity", fill = c("dodgerblue3", "brown2", "firebrick"),
color = "black"
+
) labs(
title = "Number of BioSamples after each filtering step",
subtitle = "Numbers inside bars indicate removed samples",
y = "", x = "Number of samples"
+
) geom_text(aes(label = Frequency), hjust = -0.3) +
geom_text(aes(label = Difference), hjust = 1.5, color = "white", fontface = "bold") +
theme_bw() +
xlim(0, 8000)
##----1C: Distribution of read length
<- fastp_sample_stats |>
p_1c filter(after_meanlength <= 150) |>
ggplot(aes(x = after_meanlength)) +
geom_density(fill = "grey85", color = "gray30") +
theme_bw() +
labs(
title = "Distribution of mean read lengths per sample",
subtitle = "Dashed line indicates the minimum length in checkpoint 1",
x = "", y = "Density"
+
) geom_vline(xintercept = 40, color = "firebrick", linetype = "dashed")
##----1D: Distribution of Q20 rates----
<- fastp_sample_stats |>
p_1d filter(after_meanlength <= 150) |>
mutate(after_q20rate = as.numeric(after_q20rate)) |>
ggplot(aes(x = after_q20rate)) +
geom_density(fill = "lightskyblue1", color = "gray30") +
theme_bw() +
geom_vline(xintercept = 0.8, color = "firebrick", linetype = "dashed") +
labs(
title = "Distribution of Q20 rates per sample",
subtitle = "Dashed line indicates the minimum rate in checkpoint 1",
x = "", y = "Density"
)
p_1d
##----1E: Distribution of quasi-mapping rates----
<- mapping_rate |>
p_1e mutate(Mapping_rate = Mapping_rate / 100) |>
ggplot(aes(x = Mapping_rate)) +
geom_density(fill = "lemonchiffon2", color = "gray30") +
theme_bw() +
geom_vline(xintercept = 0.5, color = "firebrick", linetype = "dashed") +
labs(
title = "Distribution of mapping rates per sample",
subtitle = "Dashed line indicates the rate in checkpoint 2",
x = "", y = "Density"
)
##----1F: Distribution of number of reads---
<- fastp_sample_stats |>
p_1f filter(after_meanlength <= 150) |>
filter(after_nreads <= 1600 * 1e6 & after_nreads > 0) |>
mutate(after_nreads = after_nreads / 1e6) |>
ggplot(aes(x = after_nreads)) +
geom_density(fill = "darkseagreen3", color = "gray30") +
theme_bw() +
labs(
title = "Distribution of number of reads (in millions) per sample",
x = "", y = "Density"
)
## Combine plots
<- wrap_plots(
fig1 wrap_plots(
p_1a, + theme(
p_1b axis.text.y = element_text(size = 12)
), heights = c(4, 1), nrow = 2
), wrap_plots(
p_1c, p_1d, p_1e, p_1f,ncol = 1
)+
) plot_annotation(tag_levels = "A")
## Combine workflow (fig 1A) and other panels
<- tempfile(fileext = ".pdf")
temp_fig1 ggsave(
filename = temp_fig1,
fig1, width = 11, height = 13
)
<- image_read_svg(
fig1a here("products", "figs", "atlas_workflow_paper.svg")
)<- image_read_pdf(temp_fig1)
fig1rest
<- image_composite(
fig1_final image_scale(fig1a, "x2900")
fig1rest,
)
## Save figure 1
image_write(
fig1_final, path = here("products", "figs", "figure1.pdf"),
format = "pdf"
)
image_write(
fig1_final, path = here("products", "figs", "figure1.png"),
format = "png",
density = 300
)
#----Figure 2-----
load(here("products", "plots", "p_tsne_optimal_perplexity.rda"))
load(here("products", "plots", "p_umap_optimal_nneighbors.rda"))
load(here("products", "plots", "p_samples_per_body_part.rda"))
##----2A: Barplot of number of samples per tissue
<- png::readPNG(
parts here("products", "figs", "soybean_parts_colored.png"),
native = TRUE
)
<- p_samples_per_body_part$data |>
p_2a left_join(
data.frame(
Part = c(
"Leaf", "Seed", "Seed Coat", "Root", "Embryo", "Cotyledon",
"Shoot", "Hypocotyl", "Pod", "Flower", "Endosperm",
"Suspensor", "Seedling", "Nodule"
),n_v1 = c(
601, 158, 125, 100, 81, 48, 42, 24, 23, 18, 10, 9, 3, 2
)
)|>
) mutate(
new_n = n - n_v1,
new_n = replace_na(new_n, 0),
Part = factor(Part, levels = levels(p_samples_per_body_part$data$Part))
|>
) mutate(
label = case_when(
> 0 ~ paste0("**", n, "**", " (+", new_n, ")"),
new_n TRUE ~ paste0("**", n, "**", " ")
)|>
) ggplot(aes(x = n, y = Part)) +
::background_image(
ggpubr::readPNG(
pnghere("products", "figs", "bg_gradient_left_right.png"),
native = TRUE
)+
) geom_bar(stat = "identity", fill = "#719f2d") +
labs(
title = "Number of samples per body part",
subtitle = "Numbers in parentheses represent newly added samples",
x = "Number of samples", y = ""
+
) ::geom_richtext(
ggtextaes(label = label), hjust = -0.1, fill = NA, label.color = NA,
size = 4.5
+
) ::theme_pubr() +
ggpubrscale_x_continuous(limits = c(0, 2300), expand = c(0, 0)) +
inset_element(
parts,left = 0.25,
top = 0.95,
right = 1,
bottom = 0.25
+
) theme_void()
##----2B: t-SNE representation of samples----
<- p_tsne_optimal_perplexity$data |>
p_2b mutate(colour_by = str_to_title(colour_by)) |>
# Plot
ggplot(aes(x = X, y = Y, color = colour_by)) +
::background_image(
ggpubr::readPNG(
pnghere("products", "figs", "bg_gradient_bottom_top.png"),
native = TRUE
)+
) geom_point(alpha = 0.5) +
::scale_color_d3(palette = "category20") +
ggscitheme(
legend.key = element_blank(),
legend.position = "right"
+
) labs(
color = "Body part",
title = "t-SNE representation of all samples",
subtitle = "Top 8 principal components, perplexity = 60",
x = "t-SNE 1", y = "t-SNE 2"
)
p_2b
##----2C: UMAP representation of samples----
<- p_umap_optimal_nneighbors$data |>
p_2c mutate(colour_by = str_to_title(colour_by)) |>
# Plot
ggplot(aes(x = X, y = Y, color = colour_by)) +
::background_image(
ggpubr::readPNG(
pnghere("products", "figs", "bg_gradient_bottom_top.png"),
native = TRUE
)+
) geom_point(alpha = 0.5) +
::scale_color_d3(palette = "category20") +
ggscitheme(
legend.key = element_blank(),
legend.position = "right"
+
) labs(
color = "Body part",
x = "UMAP 1", y = "UMAP 2",
title = "UMAP representation of all samples",
subtitle = "Top 8 principal components, n_neighbors = 30"
)
p_2c
## Combine plots
<- wrap_plots(
fig2
p_2a,wrap_plots(p_2b, p_2c, ncol = 1) +
plot_layout(guides = "collect"),
ncol = 2, widths = c(1.5, 1)
+
) plot_annotation(tag_levels = list(c("A", "", "B", "C")))
fig2
## Save figure 2
ggsave(
filename = here("products", "figs", "figure2.png"),
fig2, width = 16, height = 10, dpi = 300
)
ggsave(
filename = here("products", "figs", "figure2.pdf"),
fig2, width = 14, height = 10
)
#----Figure 3----
## Load data
load(here("products", "plots", "p_heatmap_median.rda"))
load(here("products", "plots", "p_heatmap_specific_tfs.rda"))
load(here("products", "plots", "p_upset.rda"))
load(here("products", "plots", "p_genes_per_group.rda"))
## Enhance graphical details
### Change title of barplot
<- p_genes_per_group +
p_genes_per_group labs(title = "Genes per expression group") +
xlim(0, 40000)
### Add title to UpSet plot
@column_title <- "Overlap among body part-specific genes"
p_upset
### Change title and fontface of heatmap from bold to plain
@column_title_param$gp$font <- 1
p_heatmap_median@column_title <- "Expression of body part-specific genes"
p_heatmap_median
### Change fontface of heatmap from bold to plain
@column_title_param$gp$font <- 1
p_heatmap_specific_tfs
## Combine plots
<- ggarrange(
fig3 ## First row
ggarrange(
as.ggplot(p_upset), nrow = 1,
p_genes_per_group, widths = c(1,3), labels = list("A", "B")
),## Second row
ggarrange(
as.ggplot(p_heatmap_median), as.ggplot(p_heatmap_specific_tfs),
nrow = 1, widths = c(1, 2), labels = list("C", "D")
),nrow = 2
)
## Save figure 3
ggsave(
fig3, filename = here("products", "figs", "figure3.pdf"),
width = 15, height = 12
)
image_read_pdf(here("products", "figs", "figure3.pdf")) |>
image_write(
path = here("products", "figs", "figure3.png"),
density = 300
)
#----Figure 4-----
## Load data
load(here("products", "plots", "p_map_samples_per_country.rda"))
load(here("products", "plots", "p_barplot_samples_per_country.rda"))
load(here("products", "plots", "p_nsamples_over_time_cumsum.rda"))
load(here("products", "plots", "p_seqtech_layout_count.rda"))
<- c(
top_producers "Brazil", "USA", "Argentina", "China", "India", "Paraguay", "Canada"
)
##----4A and 4B: World map + samples per country----
<- wrap_plots(
p_4ab
p_map_samples_per_country, +
p_barplot_samples_per_country labs(
subtitle = "Blue bars indicate world leaders in soybean production"
+
) theme(
axis.text.y = element_text(size = 11)
),widths = c(2, 1)
)
##----4C: Time-series: number of samples over time-----
<- p_nsamples_over_time_cumsum
p_4c $layers[[3]] <- NULL
p_4c<- p_4c +
p_4c labs(
subtitle = "Samples that were filtered out in the SEA v2 were not included"
+
) annotate(
"text",
x = as.Date("2015-01-01"), y = 3500,
label = "Last download for the SEA v1",
color = "gray20"
)
##----4D: sequencing stats----
<- p_seqtech_layout_count +
p_4d labs(title = "Summary sequencing statistics") +
theme(
axis.text.y = element_text(size = 11),
strip.text.y = element_text(size = 11)
)
## Combine plots
<- wrap_plots(
fig4
p_4ab,wrap_plots(p_4c, p_4d),
nrow = 2
+
) plot_annotation(tag_levels = list(c("A", "B", "C", "D")))
## Save figure 4
ggsave(
filename = here("products", "figs", "figure4.png"),
fig4, width = 15, height = 12, dpi = 300
)
ggsave(
filename = here("products", "figs", "figure4.pdf"),
fig4, width = 13, height = 12
)
#----Supplementary Figure S1----------------------------------------------------
## Load data
load(here("products", "plots", "p_pca_percent_var.rda"))
load(here("products", "plots", "p_fit_mean_var.rda"))
## Combine plots
<- wrap_plots(p_fit_mean_var, p_pca_percent_var) +
sf1 plot_annotation(tag_levels = "A")
## Save figure
ggsave(
sf1, file = here("products", "figs", "supplementary_figure1.png"),
width = 15, height = 9, dpi = 300
)
ggsave(
sf1, file = here("products", "figs", "supplementary_figure1.pdf"),
width = 15, height = 9
)
#----Supplementary Figure S2----------------------------------------------------
## Load data
load(here("products", "plots", "p_heatmap_terms_pav.rda"))
<- as.ggplot(p_heatmap_terms_pav)
sf2
ggsave(
sf2, file = here("products", "figs", "supplementary_figure2.pdf"),
width = 10, height = 12
)
image_read_pdf(here("products", "figs", "supplementary_figure2.pdf")) |>
image_write(
path = here("products", "figs", "supplementary_figure2.png"),
density = 300
)
#----Supplementary Figure S3----------------------------------------------------
## Load data
load(here("products", "plots", "p_bodypart_timeseries.rda"))
load(here("products", "plots", "p_layout_timeseries.rda"))
load(here("products", "plots", "p_seqtech_timeseries.rda"))
## Combine plots
<- wrap_plots(
sf3 +
p_bodypart_timeseries guides(color = guide_legend(ncol = 1)) +
theme(legend.position = "right"),
+
p_layout_timeseries guides(color = guide_legend(ncol = 1)) +
theme(legend.position = "right"),
+
p_seqtech_timeseries guides(color = guide_legend(ncol = 1)) +
theme(legend.position = "right"),
nrow = 3
+
) plot_annotation(tag_levels = "A")
## Save figure
ggsave(
sf3, file = here("products", "figs", "supplementary_figure3.png"),
width = 9, height = 11, dpi = 300
)
ggsave(
sf2, file = here("products", "figs", "supplementary_figure2.pdf"),
width = 9, height = 11
)
Appendices
Creating paper figures
Below you can find the code used to create the figures in the paper.
Creating paper tables
Below you can find the code used to create the tables in the paper.
library(here)
library(tidyverse)
set.seed(123)
load(here("data", "metadata_atlas_v2_downloaded.rda"))
#----Supplementary Table S1-----------------------------------------------------
## Sequence QC stats
load(here("products", "result_files", "fastp_stats.rda"))
<- fastp_stats |>
sup_table1 ::rename(Run = Sample) |>
dplyrleft_join(
|>
metadata_atlas_v2_downloaded select(BioSample, Run, BioProject)
|>
) ::select(BioProject, BioSample, Run, 3:24)
dplyr
write_tsv(
sup_table1,file = here("products", "tables", "supplementary_table_S1.tsv")
)
#----Supplementary Table S2----------------------------------------------------
## Mapping stats
load(here("products", "result_files", "mapping_rate_salmon.rda"))
<- metadata_atlas_v2_downloaded |>
sup_table2 select(BioSample, BioProject) |>
inner_join(mapping_rate)
write_tsv(
sup_table2,file = here("products", "tables", "supplementary_table_S2.tsv")
)