Data from Conceição-Neto et al. (2015)
R setup
library(tidyverse)
library(fs)
library(here)
# library(speedyseq)
# library(furrr)
# plan(multisession, workers = 3)
# plotting helpers
library(cowplot)
library(patchwork)
library(ggbeeswarm)
theme_set(theme_cowplot())
# Okabe Ito color scheme with amber for yellow; see https://easystats.github.io/see/reference/scale_color_okabeito.html
colors_oi <- grDevices::palette.colors()
colors_oi['yellow'] <- "#F5C710"
# today <- format(Sys.time(), '%Y-%m-%d')
Import the MGS read count data from Table S4 from the supplementary pdf file,
# library(tabulizer)
supplement_path <- '~/Zotero/storage/B867AJV4/Conceição-Neto et al. - 2015 - supplement.pdf'
x <- tabulizer::extract_tables(supplement_path, pages = 4, method = 'lattice')
x <- x[[1]]
x[1,1] <- 'taxon'
# x <- tabulizer::extract_areas(supplement_path, pages = 4)
nms <- x[1,]
x <- x %>%
tail(-1) %>%
head(-1) %>%
as_tibble()
names(x) <- nms
x
# A tibble: 15 × 9
taxon Control `0.8 PC` `0.8 PES` `0.45` `0.22` `17000g`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "Circovirus" 114327… 105066 … 467836 (… 28819… 37063… 162676 …
2 "Parvovirus" 902175… 929352 … 2304080 … 16456… 31897… 1008512…
3 "Polyomavirus" 130761… 72391 (… 88285 (0… 29444… 11187… 47493 (…
4 "Pepino mosaic\r… 146832… 639869 … 2660872 … 16232… 22791… 3710870…
5 "Rotavirus" 113219… 535543 … 1751352 … 10783… 97106… 2619165…
6 "Coronavirus" 44067 … 15438 (… 79082 (0… 38486… 40884… 104041 …
7 "LIMEstone virus" 960756… 2125479… 4860630 … 52520… 66120… 3024671…
8 "Herpesvirus" 11958 … 4007 (0… 32653 (0… 25407… 23196… 7903 (0…
9 "Mimivirus" 103091… 211129 … 3711 (0.… 2809 … 5240 … 3133 (0…
10 "Bacteroides rRN… 179167… 18923 (… 1694 (0.… 966 (… 559 (… 1026 (0…
11 "Bacteroides" 465543… 4172023… 34282 (0… 27823… 26408… 12152 (…
12 "Bifidobacterium" 38488 … 2474 (0… 2748 (0.… 2935 … 2508 … 988 (0.…
13 "Lactobacillus" 107010… 5289 (0… 5170 (0.… 4693 … 5894 … 3957 (0…
14 "E.coli" 572592… 357029 … 51191 (0… 44481… 55943… 29121 (…
15 "unmapped" 859446… 403677 … 1152082 … 91492… 16292… 998115 …
# ℹ 2 more variables: `17000g+0.8 PES` <chr>, `17000g+0.45` <chr>
x0 <- x
# Parse
x <- x0 %>%
pivot_longer(-taxon, names_to = 'sample') %>%
separate(value, into = c('reads', 'percent'), sep = '\\s+') %>%
mutate(
across(reads, as.numeric),
across(percent, ~str_extract(.x, '\\d+\\.\\d+')),
across(taxon, ~str_replace(.x, '\\r', ' ')),
taxon_type = case_when(
taxon == 'unmapped' ~ 'unmapped',
str_detect(taxon, 'virus') ~ 'virus',
TRUE ~ 'bacteria'
),
filter = str_extract(sample, '0.22|0.45|0.8 PES|0.8 PC') %>%
fct_explicit_na('None'),
centrifuge = str_detect(sample, '17000g')
) %>%
glimpse
Rows: 120
Columns: 7
$ taxon <chr> "Circovirus", "Circovirus", "Circovirus", "Circov…
$ sample <chr> "Control", "0.8 PC", "0.8 PES", "0.45", "0.22", "…
$ reads <dbl> 114327, 105066, 467836, 288199, 370639, 162676, 1…
$ percent <chr> "1.01", "1.09", "3.47", "2.56", "2.42", "1.39", "…
$ taxon_type <chr> "virus", "virus", "virus", "virus", "virus", "vir…
$ filter <fct> None, 0.8 PC, 0.8 PES, 0.45, 0.22, None, 0.8 PES,…
$ centrifuge <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TR…
Relative abundances as centered ratios (abundance of taxon / gm mean of abundances of all taxa in that sample)
x %>%
filter(taxon != 'unmapped') %>%
mutate(.by = sample, value = center_elts(reads)) %>%
ggplot(aes(x = taxon, y = value, color = sample)) +
scale_color_manual(values = colors_oi %>% unname) +
facet_grid(~taxon_type, scales = 'free_x', space = 'free') +
scale_y_log10() +
geom_point() +
# rotate x-axis labels
labs(y = 'Centered ratio') +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
Relative abundances doubly centered: First, centered within samples as above. Then, center the taxon abundance across samples. This view helps to focus on the effect of protocols by adjusting for the difference in abundance among taxa.
x %>%
filter(taxon != 'unmapped') %>%
mutate(.by = sample, value = center_elts(reads)) %>%
mutate(.by = taxon, value = center_elts(value)) %>%
ggplot(aes(x = taxon, y = value, color = sample)) +
scale_color_manual(values = colors_oi %>% unname) +
facet_grid(~taxon_type, scales = 'free_x', space = 'free') +
scale_y_log10() +
geom_point() +
labs(y = 'Doubly-centered ratio') +
# rotate x-axis labels
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
Let’s look at the gm-average relative abundance of viruses to bacteria across samples,
p1 <- ratios %>%
ggplot(aes(x = ratio, y = sample)) +
scale_x_log10() +
geom_point()
p1
This figure suggests that the .45 filter and the 0.8 PES filter perform similarly.
Interesting that the .8 PC ratio is so much lower than the other filters.
What if we don’t care about mimivirus? Does the advantage of 0.8PES over .45 with centrifugation still hold?
p2 <- ratios_no_mimi %>%
ggplot(aes(x = ratio, y = sample)) +
scale_x_log10() +
geom_point()
# p1 / p2
p2
The modest advantage remains and overall everything looks very similar.
Might be useful to look at the variance at the level of viruses. Could normalize everything relative to the GM of bacteria, and quantify the increase relative to the control. Doing this could give us some sense of variability, using variation among viruses as a proxy for variation among samples. It also let’s us look for the effect on larger viruses.
Here, I’ll look at viral abundance as the ratio of that virus relative to the gm-average of bacterial species.
ratios_spp %>%
ggplot(aes(x = value, y = sample, color = taxon)) +
scale_color_brewer(type = 'qual', palette = 3) +
scale_x_log10() +
# geom_point() +
geom_quasirandom(width = 0.1) +
geom_point(data = avg, color = 'black', shape = 3)
avg <- ratios_spp %>%
filter(filter %in% c('0.45', '0.8 PES')) %>%
summarize(.by = c(sample, filter, centrifuge), across(value, gm_mean))
ratios_spp %>%
filter(filter %in% c('0.45', '0.8 PES')) %>%
ggplot(aes(x = value, y = sample, color = taxon)) +
facet_wrap(~centrifuge, ncol = 1, scales = 'free_y',
labeller = 'label_both') +
scale_color_brewer(type = 'qual', palette = 3) +
scale_x_log10() +
geom_quasirandom(width = 0.1) +
geom_point(data = avg, color = 'black', shape = 3)
What is the magnitude of increase due to 0.8PES versus 0.45?
Rows: 18
Columns: 6
$ taxon <chr> "Circovirus", "Circovirus", "Parvovirus", "Parvov…
$ taxon_type <chr> "virus", "virus", "virus", "virus", "virus", "vir…
$ centrifuge <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRU…
$ `0.8 PES` <dbl> 55.5849922, 66.2281995, 273.7546254, 373.9504363,…
$ `0.45` <dbl> 41.3399182, 55.6332580, 236.0586371, 256.1308699,…
$ ratio <dbl> 1.3445840, 1.1904426, 1.1596891, 1.4599975, 0.248…
ratios_spp_filter %>%
ggplot(aes(x = ratio, y = taxon, color = centrifuge)) +
theme_minimal_grid() +
# facet_wrap(~centrifuge, ncol = 1, scales = 'free_y') +
scale_color_brewer(type = 'qual', palette = 2) +
scale_x_log10(breaks = c(0.3, 1, 1.5, 3)) +
expand_limits(x = 3) +
# geom_vline(xintercept=1.4, color = 'darkred', size = 0.2) +
geom_point() +
labs(
x = 'Ratio of virus to gm-mean of all bacteria'
) +
plot_annotation(
title = 'Advantage of 0.8 PES over 0.45, with and without centrifugation',
)
We’re generally talking about a ~1.4-fold difference, not something huge.
I’m not sure what is going on with the Polyomavirus without centrifugation, but without replication we shouldn’t treat it too seriously.