#' In this document, you can find the code to combine individual plots into
#' panels that will be used in the main text
#----Load data------------------------------------------------------------------
library(here)
library(patchwork)
library(tidyverse)
library(cogeqc)
# Part 0
## A
<- system.file("extdata", package = "cogeqc")
output_dir <- read_busco(output_dir)
busco_summary
data(batch_summary)
## B
<- get_genome_stats(taxon = "Zea mays")
maize_stats <- data.frame(
my_stats accession = "my_lovely_maize",
sequence_length = 2.4 * 1e9,
gene_count_total = 50000,
CC_ratio = 2
)
## C
<- system.file("extdata", package = "cogeqc")
stats_dir <- read_orthofinder_stats(stats_dir)
ortho_stats data(tree)
# Part 1
load(here("products", "plots", "combined_tree_chlorophyta.rda"))
# Part 2
## A
load(here("products", "plots", "plot_homogeneity_scores_dbs.rda")) # distros
## B
load(here("products", "plots", "p_distros_global.rda"))
load(here("products", "plots", "p_distros_mcl.rda"))
load(here("products", "plots", "p_distros_diamond.rda"))
load(here("products", "plots", "p_distros_by_species.rda"))
load(here("products", "plots", "p_treeplot.rda"))
load(here("products", "plots", "p_distros_groups.rda"))
load(here("products", "plots", "p_association_length_homogeneity.rda"))
load(here("products", "plots", "relationship_og_number_and_size.rda"))
# Part 3
load(here("products", "plots", "synteny_scores_fabaceae.rda"))
load(here("products", "plots", "synteny_scores_species.rda"))
#----Combine plots and create figs----------------------------------------------
## Figure 0: Example of figures that can be created with graphical functions
<- plot_genome_stats(ncbi_stats = maize_stats, user_stats = my_stats)
fig0_p1
<- wrap_plots(
fig0_p2 plot_busco(busco_summary),
plot_busco(batch_summary),
ncol = 2
+
) theme(legend.position = "bottom")
<- plot_og_overlap(ortho_stats) +
og_overlap scale_fill_gradient(
low = "#E5F5E0", high = "#00441B", name = "Overlap size (K)",
label = scales::unit_format(unit = "", scale = 1e-3)
+
) theme(legend.position = "bottom")
<- wrap_plots(
fig0_p3 plot_orthofinder_stats(
stats_list = ortho_stats, xlim = c(-0.1, 2)
tree,
),
og_overlap,widths = c(4, 1),
nrow = 1
)
<- wrap_plots(
fig0
fig0_p1,
fig0_p2,
fig0_p3,ncol = 1,
heights = c(2, 1, 1.5)
+
) plot_annotation(tag_levels = list(
c("A", rep("", 11), "B", "C", "D", rep("", 3), "E")
))
## Figure 1: BUSCO assessment
<- combined +
fig1 ggtitle("Percentage of BUSCOs for each species") +
plot_layout(guides = "collect", widths = c(2,3)) &
theme(legend.position = "bottom") &
::scale_fill_manual(labels = c(
ggplot2"Complete & SC", "Complete & duplicate", "Fragmented", "Missing"
),values = c(
"#32709a", "#59AAE1", "darkgoldenrod2", "#db5850"
))
## Figure 2: Orthogroup assessment
<- p_distros_global +
p_distros_global scale_x_discrete(
labels = c(
"default_1" = "Default, 1",
"default_1_5" = "Default, 1.5",
"default_2" = "Default, 2",
"default_3" = "Default, 3",
"ultra_1" = "Ultra, 1",
"ultra_1_5" = "Ultra, 1.5",
"ultra_2" = "Ultra, 2",
"ultra_3" = "Ultra, 3"
)
)
<- p_distros_diamond +
p_distros_diamond ggtitle("Effect of DIAMOND mode on orthogroup inference")
<- p_association_length_homogeneity +
p_association_length_homogeneity ggtitle("Relationship between OG homogeneity and gene length")
<- percentage_plot + scale_y_discrete(
percentage_bars labels = c(
"default_1" = "Default, 1",
"default_1_5" = "Default, 1.5",
"default_2" = "Default, 2",
"default_3" = "Default, 3",
"ultra_1" = "Ultra, 1",
"ultra_1_5" = "Ultra, 1.5",
"ultra_2" = "Ultra, 2",
"ultra_3" = "Ultra, 3"
)+
) ::theme_pubr()
ggpubr
<- patchwork::wrap_plots(
fig2 wrap_plots(distros, p_distros_global, nrow = 1), # row 1
+
p_distros_by_species theme(axis.text.x = element_text(angle = 60, vjust = 0.5)), # row 2
wrap_plots(p_distros_mcl, p_distros_diamond, nrow = 1), # row 3
# row 4
percentage_bars, ncol = 1
+
) plot_annotation(tag_levels = "A")
## Figure 3: sequence and score relationship + functional analysis of OG clusters
### Change font size of tip labels
<- p_treeplot
ptree 1]]$layers[[6]]$aes_params$size <- 3.5
ptree[[2]]$layers[[6]]$aes_params$size <- 3.5
ptree[[3]]$layers[[6]]$aes_params$size <- 3.5
ptree[[2]]$data$label <- gsub(" \\(.*", "", ptree[[2]]$data$label)
ptree[[2]]$data$label <- gsub(
ptree[["signaling pathway", "signaling", ptree[[2]]$data$label
)
<- wrap_plots(p_association_length_homogeneity, p_distros_groups, widths = c(1, 1.5))
upper <- ptree + theme(plot.margin = margin(0, 0, 0, 0, "pt"))
bottom
<- wrap_plots(upper, plot_spacer(), heights = c(1, 2)) +
fig3 inset_element(bottom, 0, 0, 1, 1, align_to = "full") +
plot_annotation(tag_levels = "A")
## Figure 4: synteny
<- wrap_plots(
fig4
synteny_scores_fabaceae, + theme(legend.position = "bottom"),
synteny_scores_species ncol = 1, heights = c(1, 3)
+
) plot_annotation(tag_levels = "A")
#----Save plots in figs/ -------------------------------------------------------
## Figure 0
ggsave(
fig0, filename = here("products", "figs", "figure_0_graphical_functions.png"),
width = 18, height = 12, dpi = 400
)
ggsave(
fig0, filename = here("products", "figs", "figure_0_graphical_functions.pdf"),
width = 19, height = 12
)
## Figure 1
ggsave(
fig1,filename = here("products", "figs", "figure_1_busco_assessment.png"),
width = 13, height = 8, dpi = 600
)
ggsave(
fig1,filename = here("products", "figs", "figure_1_busco_assessment.pdf"),
width = 13, height = 8
)
## Figure 2
ggsave(
fig2,filename = here("products", "figs", "figure_2_homogeneity_assessment.png"),
width = 17, height = 18, dpi = 300
)
ggsave(
fig2,filename = here("products", "figs", "figure_2_homogeneity_assessment.pdf"),
width = 17, height = 18
)
# Figure 3
ggsave(
fig3, filename = here("products", "figs", "figure_3_ogs_brassicaceae.png"),
width = 23, height = 12, dpi = 300
)
ggsave(
fig3, filename = here("products", "figs", "figure_3_ogs_brassicaceae.pdf"),
width = 23, height = 12
)
# Figure 4
ggsave(
fig4, filename = here("products", "figs", "figure_4_synteny_assessment.png"),
width = 8, height = 10, dpi = 300
)
ggsave(
fig4, filename = here("products", "figs", "figure_4_synteny_assessment.pdf"),
width = 8, height = 10
)
Appendices
Here, you can find other pieces of code (Bash and R scripts) that were used in the main chapters. These pieces of code were stored in separate files to make the main text (i.e., book chapters) more readable.
Creating paper figures
Below you can find the code used to create the figures in the paper.
Utility functions
Below you can find the source code for some helper functions/wrappers that are used in the main chapters.
<- function(proteins = NULL) {
extract_ogs_uniprot
<- lapply(proteins, function(x) {
ogs
<- paste0("https://www.ebi.ac.uk/proteins/api/proteins/", x)
url <- httr::GET(url)
request <- httr::content(request, type = "application/json",
annot as = "parsed", encoding = "UTF-8")
<- lapply(annot$dbReferences, as.data.frame)
dbref names(dbref) <- vapply(dbref, function(x) return(x$type), character(1))
<- NA
orthodb <- NA
eggnog <- NA
inparanoid <- NA
phylomedb <- NA
hogenom if("eggNOG" %in% names(dbref)) {
<- dbref$eggNOG$id
eggnog
}if("OrthoDB" %in% names(dbref)) {
<- dbref$OrthoDB$id
orthodb
}if("InParanoid" %in% names(dbref)) {
<- dbref$InParanoid$id
inparanoid
}if("PhylomeDB" %in% names(dbref)) {
<- dbref$PhylomeDB$id
phylomedb
}if("HOGENOM" %in% names(dbref)) {
<- dbref$HOGENOM$id
hogenom
}if(!"Araport" %in% names(dbref)) {
<- NULL
df else {
} <- data.frame(
df Gene = dbref$Araport$id,
OrthoDB = orthodb,
eggNOG = eggnog,
OrthoDB = orthodb,
InParanoid = inparanoid,
PhylomeDB = phylomedb,
HOGENOM = hogenom
)
}return(df)
})<- Reduce(rbind, ogs)
og_df return(og_df)
}
<- function(data, form, ref = NULL) {
compare # Wilcoxon test - greater and less alternatives
<- tibble::as_tibble(data) %>%
wilcoxtest_greater ::wilcox_test(
rstatixformula(form), p.adjust.method = "BH", ref.group = ref,
alternative = "greater"
)<- ifelse("p.adj" %in% names(wilcoxtest_greater), "p.adj", "p")
pg <- wilcoxtest_greater %>% dplyr::select(
wilcoxtest_greater padj_greater = all_of(pg)
group1, group2, n1, n2,
)
<- tibble::as_tibble(data) %>%
wilcoxtest_less ::wilcox_test(
rstatixformula(form), p.adjust.method = "BH", ref.group = ref,
alternative = "less"
)<- ifelse("p.adj" %in% names(wilcoxtest_less), "p.adj", "p")
pl <- wilcoxtest_less %>% dplyr::select(
wilcoxtest_less padj_less = all_of(pl)
group1, group2, n1, n2,
)
<- dplyr::inner_join(wilcoxtest_greater, wilcoxtest_less) %>%
wilcox_summary ::mutate(padj_interpretation = dplyr::case_when(
dplyr< 0.05 ~ "less",
padj_less < 0.05 ~ "greater",
padj_greater TRUE ~ "ns"
))
# Effect sizes for Wilcoxon test - greater and less alternatives
<- tibble::as_tibble(data) %>%
effsize ::wilcox_effsize(
rstatixformula(form), ref.group = ref,
%>%
) ::select(
dplyr
group1, group2, effsize, magnitude
)
<- as.data.frame(inner_join(wilcox_summary, effsize))
result
return(result)
}
<- function(compare_output) {
filter_comparison
<- compare_output |>
filtered_df ::filter(padj_interpretation != "ns") |>
dplyrmutate(padj = case_when(
== "greater" ~ padj_greater,
padj_interpretation == "less" ~ padj_less
padj_interpretation |>
)) ::select(group1, group2, n1, n2, padj, effsize, magnitude)
dplyr
return(filtered_df)
}
Bash code
Below, you can find the Bash code I used to run OrthoFinder.
#!/bin/bash
# Define paths
workdir='/home/faalm/projects/cogeqc_benchmark'
outdir='/home/faalm/projects/cogeqc_benchmark/products/result_files'
# Run OrthoFinder - default DIAMOND
orthofinder -f "$workdir/data" -S diamond -I 1.5 -o "$outdir/default_1_5" -og
orthofinder -f "$workdir/data" -S diamond -I 2 -o "$outdir/default_2" -og
orthofinder -f "$workdir/data" -S diamond -I 3 -o "$outdir/default_3" -og
orthofinder -f "$workdir/data" -S diamond -I 1 -o "$outdir/default_1" -og
# Run OrthoFinder - ultrasensitive DIAMOND
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 1.5 -t 8 -o "$outdir/ultra_1_5" -og
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 2 -t 8 -o "$outdir/ultra_2" -og
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 3 -t 8 -o "$outdir/ultra_3" -og
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 1 -t 8 -o "$outdir/ultra_1" -og
Session information
This document was created under the following conditions:
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 20.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2023-08-04
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
evaluate 0.20 2023-01-17 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.3.0)
knitr 1.42 2023-01-25 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────