#' In this document, you can find the code to combine individual plots into
#' panels that will be used in the main text
#----Load data------------------------------------------------------------------
library(here)
library(patchwork)
library(tidyverse)
library(cogeqc)
# Part 0
## A
output_dir <- system.file("extdata", package = "cogeqc")
busco_summary <- read_busco(output_dir)
data(batch_summary)
## B
maize_stats <- get_genome_stats(taxon = "Zea mays")
my_stats <- data.frame(
accession = "my_lovely_maize",
sequence_length = 2.4 * 1e9,
gene_count_total = 50000,
CC_ratio = 2
)
## C
stats_dir <- system.file("extdata", package = "cogeqc")
ortho_stats <- read_orthofinder_stats(stats_dir)
data(tree)
# Part 1
load(here("products", "plots", "combined_tree_chlorophyta.rda"))
# Part 2
## A
load(here("products", "plots", "plot_homogeneity_scores_dbs.rda")) # distros
## B
load(here("products", "plots", "p_distros_global.rda"))
load(here("products", "plots", "p_distros_mcl.rda"))
load(here("products", "plots", "p_distros_diamond.rda"))
load(here("products", "plots", "p_distros_by_species.rda"))
load(here("products", "plots", "p_treeplot.rda"))
load(here("products", "plots", "p_distros_groups.rda"))
load(here("products", "plots", "p_association_length_homogeneity.rda"))
load(here("products", "plots", "relationship_og_number_and_size.rda"))
# Part 3
load(here("products", "plots", "synteny_scores_fabaceae.rda"))
load(here("products", "plots", "synteny_scores_species.rda"))
#----Combine plots and create figs----------------------------------------------
## Figure 0: Example of figures that can be created with graphical functions
fig0_p1 <- plot_genome_stats(ncbi_stats = maize_stats, user_stats = my_stats)
fig0_p2 <- wrap_plots(
plot_busco(busco_summary),
plot_busco(batch_summary),
ncol = 2
) +
theme(legend.position = "bottom")
og_overlap <- plot_og_overlap(ortho_stats) +
scale_fill_gradient(
low = "#E5F5E0", high = "#00441B", name = "Overlap size (K)",
label = scales::unit_format(unit = "", scale = 1e-3)
) +
theme(legend.position = "bottom")
fig0_p3 <- wrap_plots(
plot_orthofinder_stats(
tree, stats_list = ortho_stats, xlim = c(-0.1, 2)
),
og_overlap,
widths = c(4, 1),
nrow = 1
)
fig0 <- wrap_plots(
fig0_p1,
fig0_p2,
fig0_p3,
ncol = 1,
heights = c(2, 1, 1.5)
) +
plot_annotation(tag_levels = list(
c("A", rep("", 11), "B", "C", "D", rep("", 3), "E")
))
## Figure 1: BUSCO assessment
fig1 <- combined +
ggtitle("Percentage of BUSCOs for each species") +
plot_layout(guides = "collect", widths = c(2,3)) &
theme(legend.position = "bottom") &
ggplot2::scale_fill_manual(labels = c(
"Complete & SC", "Complete & duplicate", "Fragmented", "Missing"
),
values = c(
"#32709a", "#59AAE1", "darkgoldenrod2", "#db5850"
))
## Figure 2: Orthogroup assessment
p_distros_global <- p_distros_global +
scale_x_discrete(
labels = c(
"default_1" = "Default, 1",
"default_1_5" = "Default, 1.5",
"default_2" = "Default, 2",
"default_3" = "Default, 3",
"ultra_1" = "Ultra, 1",
"ultra_1_5" = "Ultra, 1.5",
"ultra_2" = "Ultra, 2",
"ultra_3" = "Ultra, 3"
)
)
p_distros_diamond <- p_distros_diamond +
ggtitle("Effect of DIAMOND mode on orthogroup inference")
p_association_length_homogeneity <- p_association_length_homogeneity +
ggtitle("Relationship between OG homogeneity and gene length")
percentage_bars <- percentage_plot + scale_y_discrete(
labels = c(
"default_1" = "Default, 1",
"default_1_5" = "Default, 1.5",
"default_2" = "Default, 2",
"default_3" = "Default, 3",
"ultra_1" = "Ultra, 1",
"ultra_1_5" = "Ultra, 1.5",
"ultra_2" = "Ultra, 2",
"ultra_3" = "Ultra, 3"
)
) +
ggpubr::theme_pubr()
fig2 <- patchwork::wrap_plots(
wrap_plots(distros, p_distros_global, nrow = 1), # row 1
p_distros_by_species +
theme(axis.text.x = element_text(angle = 60, vjust = 0.5)), # row 2
wrap_plots(p_distros_mcl, p_distros_diamond, nrow = 1), # row 3
percentage_bars, # row 4
ncol = 1
) +
plot_annotation(tag_levels = "A")
## Figure 3: sequence and score relationship + functional analysis of OG clusters
### Change font size of tip labels
ptree <- p_treeplot
ptree[[1]]$layers[[6]]$aes_params$size <- 3.5
ptree[[2]]$layers[[6]]$aes_params$size <- 3.5
ptree[[3]]$layers[[6]]$aes_params$size <- 3.5
ptree[[2]]$data$label <- gsub(" \\(.*", "", ptree[[2]]$data$label)
ptree[[2]]$data$label <- gsub(
"signaling pathway", "signaling", ptree[[2]]$data$label
)
upper <- wrap_plots(p_association_length_homogeneity, p_distros_groups, widths = c(1, 1.5))
bottom <- ptree + theme(plot.margin = margin(0, 0, 0, 0, "pt"))
fig3 <- wrap_plots(upper, plot_spacer(), heights = c(1, 2)) +
inset_element(bottom, 0, 0, 1, 1, align_to = "full") +
plot_annotation(tag_levels = "A")
## Figure 4: synteny
fig4 <- wrap_plots(
synteny_scores_fabaceae,
synteny_scores_species + theme(legend.position = "bottom"),
ncol = 1, heights = c(1, 3)
) +
plot_annotation(tag_levels = "A")
#----Save plots in figs/ -------------------------------------------------------
## Figure 0
ggsave(
fig0,
filename = here("products", "figs", "figure_0_graphical_functions.png"),
width = 18, height = 12, dpi = 400
)
ggsave(
fig0,
filename = here("products", "figs", "figure_0_graphical_functions.pdf"),
width = 19, height = 12
)
## Figure 1
ggsave(
fig1,
filename = here("products", "figs", "figure_1_busco_assessment.png"),
width = 13, height = 8, dpi = 600
)
ggsave(
fig1,
filename = here("products", "figs", "figure_1_busco_assessment.pdf"),
width = 13, height = 8
)
## Figure 2
ggsave(
fig2,
filename = here("products", "figs", "figure_2_homogeneity_assessment.png"),
width = 17, height = 18, dpi = 300
)
ggsave(
fig2,
filename = here("products", "figs", "figure_2_homogeneity_assessment.pdf"),
width = 17, height = 18
)
# Figure 3
ggsave(
fig3,
filename = here("products", "figs", "figure_3_ogs_brassicaceae.png"),
width = 23, height = 12, dpi = 300
)
ggsave(
fig3,
filename = here("products", "figs", "figure_3_ogs_brassicaceae.pdf"),
width = 23, height = 12
)
# Figure 4
ggsave(
fig4,
filename = here("products", "figs", "figure_4_synteny_assessment.png"),
width = 8, height = 10, dpi = 300
)
ggsave(
fig4,
filename = here("products", "figs", "figure_4_synteny_assessment.pdf"),
width = 8, height = 10
)Appendices
Here, you can find other pieces of code (Bash and R scripts) that were used in the main chapters. These pieces of code were stored in separate files to make the main text (i.e., book chapters) more readable.
Creating paper figures
Below you can find the code used to create the figures in the paper.
Utility functions
Below you can find the source code for some helper functions/wrappers that are used in the main chapters.
extract_ogs_uniprot <- function(proteins = NULL) {
ogs <- lapply(proteins, function(x) {
url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins/", x)
request <- httr::GET(url)
annot <- httr::content(request, type = "application/json",
as = "parsed", encoding = "UTF-8")
dbref <- lapply(annot$dbReferences, as.data.frame)
names(dbref) <- vapply(dbref, function(x) return(x$type), character(1))
orthodb <- NA
eggnog <- NA
inparanoid <- NA
phylomedb <- NA
hogenom <- NA
if("eggNOG" %in% names(dbref)) {
eggnog <- dbref$eggNOG$id
}
if("OrthoDB" %in% names(dbref)) {
orthodb <- dbref$OrthoDB$id
}
if("InParanoid" %in% names(dbref)) {
inparanoid <- dbref$InParanoid$id
}
if("PhylomeDB" %in% names(dbref)) {
phylomedb <- dbref$PhylomeDB$id
}
if("HOGENOM" %in% names(dbref)) {
hogenom <- dbref$HOGENOM$id
}
if(!"Araport" %in% names(dbref)) {
df <- NULL
} else {
df <- data.frame(
Gene = dbref$Araport$id,
OrthoDB = orthodb,
eggNOG = eggnog,
OrthoDB = orthodb,
InParanoid = inparanoid,
PhylomeDB = phylomedb,
HOGENOM = hogenom
)
}
return(df)
})
og_df <- Reduce(rbind, ogs)
return(og_df)
}
compare <- function(data, form, ref = NULL) {
# Wilcoxon test - greater and less alternatives
wilcoxtest_greater <- tibble::as_tibble(data) %>%
rstatix::wilcox_test(
formula(form), p.adjust.method = "BH", ref.group = ref,
alternative = "greater"
)
pg <- ifelse("p.adj" %in% names(wilcoxtest_greater), "p.adj", "p")
wilcoxtest_greater <- wilcoxtest_greater %>% dplyr::select(
group1, group2, n1, n2, padj_greater = all_of(pg)
)
wilcoxtest_less <- tibble::as_tibble(data) %>%
rstatix::wilcox_test(
formula(form), p.adjust.method = "BH", ref.group = ref,
alternative = "less"
)
pl <- ifelse("p.adj" %in% names(wilcoxtest_less), "p.adj", "p")
wilcoxtest_less <- wilcoxtest_less %>% dplyr::select(
group1, group2, n1, n2, padj_less = all_of(pl)
)
wilcox_summary <- dplyr::inner_join(wilcoxtest_greater, wilcoxtest_less) %>%
dplyr::mutate(padj_interpretation = dplyr::case_when(
padj_less < 0.05 ~ "less",
padj_greater < 0.05 ~ "greater",
TRUE ~ "ns"
))
# Effect sizes for Wilcoxon test - greater and less alternatives
effsize <- tibble::as_tibble(data) %>%
rstatix::wilcox_effsize(
formula(form), ref.group = ref,
) %>%
dplyr::select(
group1, group2, effsize, magnitude
)
result <- as.data.frame(inner_join(wilcox_summary, effsize))
return(result)
}
filter_comparison <- function(compare_output) {
filtered_df <- compare_output |>
dplyr::filter(padj_interpretation != "ns") |>
mutate(padj = case_when(
padj_interpretation == "greater" ~ padj_greater,
padj_interpretation == "less" ~ padj_less
)) |>
dplyr::select(group1, group2, n1, n2, padj, effsize, magnitude)
return(filtered_df)
}Bash code
Below, you can find the Bash code I used to run OrthoFinder.
#!/bin/bash
# Define paths
workdir='/home/faalm/projects/cogeqc_benchmark'
outdir='/home/faalm/projects/cogeqc_benchmark/products/result_files'
# Run OrthoFinder - default DIAMOND
orthofinder -f "$workdir/data" -S diamond -I 1.5 -o "$outdir/default_1_5" -og
orthofinder -f "$workdir/data" -S diamond -I 2 -o "$outdir/default_2" -og
orthofinder -f "$workdir/data" -S diamond -I 3 -o "$outdir/default_3" -og
orthofinder -f "$workdir/data" -S diamond -I 1 -o "$outdir/default_1" -og
# Run OrthoFinder - ultrasensitive DIAMOND
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 1.5 -t 8 -o "$outdir/ultra_1_5" -og
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 2 -t 8 -o "$outdir/ultra_2" -og
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 3 -t 8 -o "$outdir/ultra_3" -og
orthofinder -f "$workdir/data" -S diamond_ultra_sens -I 1 -t 8 -o "$outdir/ultra_1" -ogSession information
This document was created under the following conditions:
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.0 (2023-04-21)
os Ubuntu 20.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Brussels
date 2023-08-04
pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
evaluate 0.20 2023-01-17 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
jsonlite 1.8.4 2022-12-06 [1] CRAN (R 4.3.0)
knitr 1.42 2023-01-25 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.21 2023-03-26 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
[1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library
──────────────────────────────────────────────────────────────────────────────