Reanalysis of Conceição-Neto et al 2015 protocol testing

R MGS protocols
Michael R. McLaren true
2023-06-17

Data from Conceição-Neto et al. (2015)

R setup

Show code
library(tidyverse)
library(fs)
library(here)
# library(speedyseq)
# library(furrr)
# plan(multisession, workers = 3)

# plotting helpers
library(cowplot)
library(patchwork)
library(ggbeeswarm)

theme_set(theme_cowplot())

# Okabe Ito color scheme with amber for yellow; see https://easystats.github.io/see/reference/scale_color_okabeito.html
colors_oi <- grDevices::palette.colors()  
colors_oi['yellow'] <- "#F5C710"

# today <- format(Sys.time(), '%Y-%m-%d')

Data import

Import the MGS read count data from Table S4 from the supplementary pdf file,

Show code
# library(tabulizer)
supplement_path <- '~/Zotero/storage/B867AJV4/Conceição-Neto et al. - 2015 - supplement.pdf'
x <- tabulizer::extract_tables(supplement_path, pages = 4, method = 'lattice')
x <- x[[1]]
x[1,1] <- 'taxon'
# x <- tabulizer::extract_areas(supplement_path, pages = 4)

nms <- x[1,]
x <- x %>%
  tail(-1) %>%
  head(-1) %>%
  as_tibble()
names(x) <- nms
x
# A tibble: 15 × 9
   taxon             Control `0.8 PC` `0.8 PES` `0.45` `0.22` `17000g`
   <chr>             <chr>   <chr>    <chr>     <chr>  <chr>  <chr>   
 1 "Circovirus"      114327… 105066 … 467836 (… 28819… 37063… 162676 …
 2 "Parvovirus"      902175… 929352 … 2304080 … 16456… 31897… 1008512…
 3 "Polyomavirus"    130761… 72391 (… 88285 (0… 29444… 11187… 47493 (…
 4 "Pepino mosaic\r… 146832… 639869 … 2660872 … 16232… 22791… 3710870…
 5 "Rotavirus"       113219… 535543 … 1751352 … 10783… 97106… 2619165…
 6 "Coronavirus"     44067 … 15438 (… 79082 (0… 38486… 40884… 104041 …
 7 "LIMEstone virus" 960756… 2125479… 4860630 … 52520… 66120… 3024671…
 8 "Herpesvirus"     11958 … 4007 (0… 32653 (0… 25407… 23196… 7903 (0…
 9 "Mimivirus"       103091… 211129 … 3711 (0.… 2809 … 5240 … 3133 (0…
10 "Bacteroides rRN… 179167… 18923 (… 1694 (0.… 966 (… 559 (… 1026 (0…
11 "Bacteroides"     465543… 4172023… 34282 (0… 27823… 26408… 12152 (…
12 "Bifidobacterium" 38488 … 2474 (0… 2748 (0.… 2935 … 2508 … 988 (0.…
13 "Lactobacillus"   107010… 5289 (0… 5170 (0.… 4693 … 5894 … 3957 (0…
14 "E.coli"          572592… 357029 … 51191 (0… 44481… 55943… 29121 (…
15 "unmapped"        859446… 403677 … 1152082 … 91492… 16292… 998115 …
# ℹ 2 more variables: `17000g+0.8 PES` <chr>, `17000g+0.45` <chr>
Show code
x0 <- x
# Parse
x <- x0 %>%
  pivot_longer(-taxon, names_to = 'sample') %>%
  separate(value, into = c('reads', 'percent'), sep = '\\s+') %>%
  mutate(
    across(reads, as.numeric),
    across(percent, ~str_extract(.x, '\\d+\\.\\d+')),
    across(taxon, ~str_replace(.x, '\\r', ' ')),
    taxon_type = case_when(
      taxon == 'unmapped' ~ 'unmapped',
      str_detect(taxon, 'virus') ~ 'virus',
      TRUE ~ 'bacteria'
    ),
    filter = str_extract(sample, '0.22|0.45|0.8 PES|0.8 PC') %>%
      fct_explicit_na('None'),
    centrifuge = str_detect(sample, '17000g')
  ) %>%
  glimpse
Rows: 120
Columns: 7
$ taxon      <chr> "Circovirus", "Circovirus", "Circovirus", "Circov…
$ sample     <chr> "Control", "0.8 PC", "0.8 PES", "0.45", "0.22", "…
$ reads      <dbl> 114327, 105066, 467836, 288199, 370639, 162676, 1…
$ percent    <chr> "1.01", "1.09", "3.47", "2.56", "2.42", "1.39", "…
$ taxon_type <chr> "virus", "virus", "virus", "virus", "virus", "vir…
$ filter     <fct> None, 0.8 PC, 0.8 PES, 0.45, 0.22, None, 0.8 PES,…
$ centrifuge <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TR…

Plots

Show code
gm_mean <- function(x) exp(mean(log(x)))
center_elts <- function(x) x / gm_mean(x)

Relative abundances as centered ratios (abundance of taxon / gm mean of abundances of all taxa in that sample)

Show code
x %>%
  filter(taxon != 'unmapped') %>%
  mutate(.by = sample, value = center_elts(reads)) %>%
  ggplot(aes(x = taxon, y = value, color = sample)) +
  scale_color_manual(values = colors_oi %>% unname) +
  facet_grid(~taxon_type, scales = 'free_x', space = 'free') +
  scale_y_log10() +
  geom_point() +
  # rotate x-axis labels
  labs(y = 'Centered ratio') +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Relative abundances doubly centered: First, centered within samples as above. Then, center the taxon abundance across samples. This view helps to focus on the effect of protocols by adjusting for the difference in abundance among taxa.

Show code
x %>%
  filter(taxon != 'unmapped') %>%
  mutate(.by = sample, value = center_elts(reads)) %>%
  mutate(.by = taxon, value = center_elts(value)) %>%
  ggplot(aes(x = taxon, y = value, color = sample)) +
  scale_color_manual(values = colors_oi %>% unname) +
  facet_grid(~taxon_type, scales = 'free_x', space = 'free') +
  scale_y_log10() +
  geom_point() +
  labs(y = 'Doubly-centered ratio') +
  # rotate x-axis labels
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Let’s look at the gm-average relative abundance of viruses to bacteria across samples,

Show code
ratios <- x %>%
  summarize(.by = c(sample, taxon_type), value = gm_mean(reads)) %>%
  select(taxon_type, sample, value) %>%
  pivot_wider(names_from = taxon_type, values_from = value) %>%
  mutate(ratio = virus / bacteria)
Show code
p1 <- ratios %>%
  ggplot(aes(x = ratio, y = sample)) +
  scale_x_log10() +
  geom_point()
p1

This figure suggests that the .45 filter and the 0.8 PES filter perform similarly.

Interesting that the .8 PC ratio is so much lower than the other filters.

What if we don’t care about mimivirus? Does the advantage of 0.8PES over .45 with centrifugation still hold?

Show code
ratios_no_mimi <- x %>%
  filter(taxon != 'Mimivirus') %>%
  summarize(.by = c(sample, taxon_type), value = gm_mean(reads)) %>%
  select(taxon_type, sample, value) %>%
  pivot_wider(names_from = taxon_type, values_from = value) %>%
  mutate(ratio = virus / bacteria)
Show code
p2 <- ratios_no_mimi %>%
  ggplot(aes(x = ratio, y = sample)) +
  scale_x_log10() +
  geom_point()
# p1 / p2
p2

The modest advantage remains and overall everything looks very similar.

Might be useful to look at the variance at the level of viruses. Could normalize everything relative to the GM of bacteria, and quantify the increase relative to the control. Doing this could give us some sense of variability, using variation among viruses as a proxy for variation among samples. It also let’s us look for the effect on larger viruses.

Here, I’ll look at viral abundance as the ratio of that virus relative to the gm-average of bacterial species.

Show code
ratios_spp <- x %>%
  mutate(.by = sample,
    ref = reads[taxon_type == 'bacteria'] %>% gm_mean,
    value = reads / ref
  ) %>%
  filter(taxon_type == 'virus')
avg <- ratios_spp %>%
  summarize(.by = sample, across(value, gm_mean))
Show code
ratios_spp %>%
  ggplot(aes(x = value, y = sample, color = taxon)) +
  scale_color_brewer(type = 'qual', palette = 3) +
  scale_x_log10() +
  # geom_point() +
  geom_quasirandom(width = 0.1) +
  geom_point(data = avg, color = 'black', shape = 3)

Show code
avg <- ratios_spp %>%
  filter(filter %in% c('0.45', '0.8 PES')) %>%
  summarize(.by = c(sample, filter, centrifuge), across(value, gm_mean))
ratios_spp %>%
  filter(filter %in% c('0.45', '0.8 PES')) %>%
  ggplot(aes(x = value, y = sample, color = taxon)) +
  facet_wrap(~centrifuge, ncol = 1, scales = 'free_y', 
             labeller = 'label_both') +
  scale_color_brewer(type = 'qual', palette = 3) +
  scale_x_log10() +
  geom_quasirandom(width = 0.1) +
  geom_point(data = avg, color = 'black', shape = 3)

What is the magnitude of increase due to 0.8PES versus 0.45?

Show code
ratios_spp_filter <- ratios_spp %>%
  filter(filter %in% c('0.45', '0.8 PES')) %>%
  select(taxon, taxon_type, centrifuge, filter, value) %>%
  pivot_wider(names_from = filter) %>%
  mutate(ratio = `0.8 PES` / `0.45`) %>%
  glimpse
Rows: 18
Columns: 6
$ taxon      <chr> "Circovirus", "Circovirus", "Parvovirus", "Parvov…
$ taxon_type <chr> "virus", "virus", "virus", "virus", "virus", "vir…
$ centrifuge <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRU…
$ `0.8 PES`  <dbl> 55.5849922, 66.2281995, 273.7546254, 373.9504363,…
$ `0.45`     <dbl> 41.3399182, 55.6332580, 236.0586371, 256.1308699,…
$ ratio      <dbl> 1.3445840, 1.1904426, 1.1596891, 1.4599975, 0.248…
Show code
ratios_spp_filter %>%
  ggplot(aes(x = ratio, y = taxon, color = centrifuge)) +
  theme_minimal_grid() +
  # facet_wrap(~centrifuge, ncol = 1, scales = 'free_y') +
  scale_color_brewer(type = 'qual', palette = 2) +
  scale_x_log10(breaks = c(0.3, 1, 1.5, 3)) +
  expand_limits(x = 3) +
  # geom_vline(xintercept=1.4, color = 'darkred', size = 0.2) +
  geom_point() +
  labs(
    x = 'Ratio of virus to gm-mean of all bacteria'
  ) +
  plot_annotation(
    title = 'Advantage of 0.8 PES over 0.45, with and without centrifugation',
  )

We’re generally talking about a ~1.4-fold difference, not something huge.

I’m not sure what is going on with the Polyomavirus without centrifugation, but without replication we shouldn’t treat it too seriously.

Conceição-Neto, Nádia, Mark Zeller, Hanne Lefrère, Pieter De Bruyn, Leen Beller, Ward Deboutte, Claude Kwe Yinda, et al. 2015. “Modular Approach to Customise Sample Preparation Procedures for Viral Metagenomics: A Reproducible Protocol for Virome Analysis.” Scientific Reports 5 (1): 16532. https://doi.org/10.1038/srep16532.

References