Analysing 16S rRNA data with R

The goal of this tutorial is to analyse 16S rRNA gene data. Our input file is an OTU/ASV table that already contains some taxa information and will go through the following steps:

Reading the data into R
Filtering the data
Normalizing the data
Visualizing the data (i.e. alpha/beta diversity, barplots, …)

In the example we look at an OTU table of 21 samples. DNA was extracted Winogradsky using either wood or paper as substrate. DNA was collected on two separate dates, and for each treatment three different columns were sampled each with 3 replicates.

Setup

We start with setting a path to working directory and setting a seed seed for normalization protocol.

Setting a seed is not essential but this way we make sure that we get the same results when normalizing our OTU table. If we randomly select some observations for any task in R or in any statistical software it results in different values all the time and this happens because of randomization. If we want to keep the values that are produced at first random selection then we can do this by storing them in an object after randomization or we can fix the randomization procedure so that we get the same results all the time.

Load packages

library(tidyverse) #general parsing
library(data.table) #general parsing
library(phyloseq) #phyloseq object loading
library(vegan) #rarefaction
library(microbiome) #normalization
library(ALDEx2) #stats
library(DESeq2) #stats
library(grid) #organizing multiple plots
library(ggrepel) # deal with overlapping labels
library(gridExtra) #organizing multiple plots
library(scales) #plot aesthetics
library(ANCOMBC) #stats
library(rstatix) #stats
library(ggvenn) # vendiagram
library(gplots) #vendiagram

Print package versions:

# List of packages loaded in the script
packages <- c("tidyverse", "data.table", "phyloseq", "vegan", "microbiome", 
              "ALDEx2", "DESeq2", "grid", "gridExtra", "scales", "ANCOMBC", 
              "rstatix", "ggvenn", "gplots", "ggrepel")

# Format package version numbers
package_versions <- sapply(packages, function(pkg) {
  version <- packageVersion(pkg)
  paste0(pkg, " v", paste(version, collapse = "."))
})

# Print each package version on a new line
cat(package_versions, sep = "\n")

tidyverse v2.0.0
data.table v1.15.4
phyloseq v1.44.0
vegan v2.6.4
microbiome v1.20.0
ALDEx2 v1.30.0
DESeq2 v1.38.3
grid v4.2.2
gridExtra v2.3
scales v1.3.0
ANCOMBC v2.0.3
rstatix v0.7.2
ggvenn v0.1.16
gplots v3.1.3.1
ggrepel v0.9.5

Custom functions

Next, we read in some custom functions that allow us to, among others, print summary statistics for an OTU table, define custom functions to plot the same kind of plot multiple times, or to define a custom theme for plotting.

Defining functions can be useful because it means that instead of re-writing the commands for things that use the same command but on a different dataframe over and over again, we can just use the custom functions instead.

# Define function to calculate summary statistics
print_summary <- function(reads_per_OTU) {
  total_reads <- sum(reads_per_OTU)
  otu_number <- length(reads_per_OTU)
  num_singletons <- length(reads_per_OTU[reads_per_OTU == 1])
  num_doubletons <- length(reads_per_OTU[reads_per_OTU == 2])
  num_less_than_10 <- length(reads_per_OTU[reads_per_OTU < 10])
  total_reads_less_than_10 <- sum(reads_per_OTU[reads_per_OTU < 10])
  perc_reads_less_than_10 <- (total_reads_less_than_10 / sum(reads_per_OTU)) * 100
  
  cat("Total number of reads:", format(total_reads, big.mark = ","), "\n")
  cat("Number of OTUs",  format(otu_number, big.mark = ","), "\n")
  cat("Number of singleton OTUs:",  format(num_singletons, big.mark = ","), "\n")
  cat("Number of doubleton OTUs:",  format(num_doubletons, big.mark = ","), "\n")
  cat("Number of OTUs with less than 10 seqs:",  format(num_less_than_10, big.mark = ","), "\n")
  cat("Total reads for OTUs with less than 10 seqs:",  format(total_reads_less_than_10, big.mark = ","), "\n")
  cat("Percentage of reads for OTUs with less than 10 seqs:",  sprintf("%.2f%%", perc_reads_less_than_10), "\n")
  
}

# Define custom theme for generating figures
custom_theme <- function() {
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(), 
    panel.border =element_blank(),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    strip.text.x = element_text(size = 7),
    strip.text.y = element_text(size = 7),
    strip.background = element_rect(fil="#FFFFFF", color = "black", linewidth = 0.5),
    axis.text.x = element_text(size = 7),
    legend.text = element_text(size = 8), legend.title = element_text(size = 10)
  )
}

# Flexible function to create a faceted plot for multiple phyloseq objects
plot_faceted_phyloseq <- function(...) {
  # Capture the list of phyloseq objects and their method names
  physeq_list <- list(...)
  
  # Initialize an empty data frame to store the combined data
  combined_df <- data.frame(Sample = character(), Sums = numeric(), Method = character())

  # Iterate over the list and extract sample sums for each phyloseq object
  for (name in names(physeq_list)) {
    physeq_obj <- physeq_list[[name]]
    sample_sums_df <- data.frame(Sample = names(sample_sums(physeq_obj)),
                                 Sums = sample_sums(physeq_obj),
                                 Method = name)
    combined_df <- bind_rows(combined_df, sample_sums_df)
  }

  # Set the factor levels for 'Method' in the order they were provided
  combined_df$Method <- factor(combined_df$Method, levels = names(physeq_list))

  # Create the faceted plot
  p <- ggplot(combined_df, aes(x = Sample, y = Sums)) +
    geom_bar(stat = "identity") +
    facet_wrap(~ Method, scales = "free_y") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    xlab("Sample") +
    ylab("Sample Sums") +
    ggtitle("Faceted Plot of Sample Sums")

  return(p)
}

# Custom function for plotting OTU table analysis
plot_otu_histogram <- function(physeq_obj, metadata, bins = 100, title_suffix = "") {
  
  # Combine taxonomy and OTU table into a single dataframe
  df <- cbind(
    as.data.frame(as(tax_table(physeq_obj), "matrix")),
    as.data.frame(as(otu_table(physeq_obj), "matrix"))
  )
  
  # Reshape the data into long format
  df_long <- reshape2::melt(df)
  
  # Merge with metadata
  df_long <- merge(df_long, metadata, by.x = "variable", by.y = "sample_id")
  
  # Summarize the data by Genus and Carbon_source
  df_summarized <- df_long %>%
    group_by(Genus, Carbon_source) %>%
    summarize(total_value = sum(value, na.rm = TRUE)) %>%
    filter(!is.na(Genus) & Genus != "uncultured")
  
  # Create the plot
  p <- ggplot(df_summarized, aes(x = total_value)) +
    geom_histogram(bins = bins) +
    custom_theme() +
    facet_grid(cols = vars(Carbon_source), scales = "free") +
    labs(title = paste("Histogram of taxa counts", title_suffix, "per Genus and Carbon Source"),
         x = "Summed Value",
         y = "Count",
         fill = "Carbon Source") 
  
  # Return the plot
  return(p)
}

# Generate color scheme  for barplots
c25 <- c("dodgerblue2", "#E31A1C", "green4", "#6A3D9A", "#FF7F00", "black", "gold1", "skyblue2", "#FB9A99", 
        "palegreen2", "#CAB2D6", "#FDBF6F", "gray70", "khaki2", "maroon", "orchid1", "deeppink1", "blue1", 
        "steelblue4", "darkturquoise", "green1", "yellow4", "yellow3","darkorange4", "brown")

# Generate color scheme for treatments
palette1 <- c(wood = "orange", paper = "purple")

Read in the data

OTU table

An OTU table contains a column with the OTUs (taxonomic ranks in our case) and one column per sample with the counts how often OTU is found in the sample. It might look something like this:

#NAME	EP1910	RMT	KJB3	TJR
Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales;Acidobacteriaceae (Subgroup 1);NA	0	0	0	0
Bacteria;Acidobacteria;Acidobacteriia;Solibacterales;Solibacteraceae (Subgroup 3);PAUC26f	0	5	3	1

# Provide the path to the otu table
file_paths <- c("../data/for_ma/otu-table-forMA.txt")

# Read in otu table
merged_otu_table <- read.table(file_paths, header = T, sep = '\t', comment = "")
colnames(merged_otu_table)[1] <- "taxid"

# Replace NA with 0
merged_otu_table[is.na(merged_otu_table)] <- 0

# R does not like - and will replace this symbol with dots,
# therefore, we first clean up the sample names
colnames(merged_otu_table) <- gsub("\\.", "_", colnames(merged_otu_table)) 

# Use the taxonIDs as rownames
rownames(merged_otu_table) <- merged_otu_table$taxid
merged_otu_table$taxid <- NULL

# Check how many otus and samples we have
dim(merged_otu_table)

[1] 1248   20

# Define list to define a custom order for the sample names
# Here, I want to order the sample names based on the pattern after the _ (i.e.)
# LK_P2, SB_P1 should become SB_P1, LK_P2 and thus orders our samples by replicates/Winogradsky column
names_vec <- colnames(merged_otu_table)
sorted_names <- names_vec[order(sub("^[^_]+_", "", names_vec))]

With this example OTU table, we work with dim(merged_otu_table)[2] samples and 1248 OTUs.

Metadata

The metadata table contains information about our samples and can look something like this:

#NAME	treatment	Date
EP1910	wood	2023_1
RMT	paper	2023_1
KJB3	mix	2023_1
TJR	paper	2023_1
IB5	wood	2023_1
ALIDNA	wood	2023_1
IG7	paper	2023_1
B314	mix	2023_1

# Read in metadata file
metadata_combined <- read.table("../data/for_ma/sample_table.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "")

# Replace dashes with underscores in row names
rownames(metadata_combined) <- gsub("-", "_", rownames(metadata_combined))

# Ensure that the group data is categorical 
metadata_combined$Practical_group <- gsub("^", "Day", metadata_combined$Practical_group)

# Add extra column for sample names
metadata_combined$name <- paste0(metadata_combined$Carbon_source, "_", rownames(metadata_combined))
metadata_combined$sample_id <- rownames(metadata_combined)

# Order the factors for our names column
metadata_combined <- metadata_combined |> 
  arrange(desc(Carbon_source))

# View output
head(metadata_combined)

	Barcode	Carbon_source	Wino_Column	Practical_group	name	sample_id
EP_SD1	BC10	wood	SD1	Day1	wood_EP_SD1	EP_SD1
RK_SD1	BC12	wood	SD1	Day1	wood_RK_SD1	RK_SD1
EW_SD1	BC23	wood	SD1	Day2	wood_EW_SD1	EW_SD1
DG_SD1	BC24	wood	SD1	Day2	wood_DG_SD1	DG_SD1
AW_SD2	BC05	wood	SD2	Day2	wood_AW_SD2	AW_SD2
UNZ_SD2	BC06	wood	SD2	Day2	wood_UNZ_SD2	UNZ_SD2

Generate taxonomy file

Next, we generate a table that list the taxonomy information for each taxonomic rank. We do this by taking the information from our OTU table. Depending on how you analysed your 16S rRNA gene sequences, you might have an OTU table with IDs (ASV1, ASV2, … or OTU1, OTU2, …) and a separate table with the taxonomy information.

If that is the case, you can read in the taxonomy information separate.

# Extract taxonomy string
temp <- as.data.frame(rownames(merged_otu_table))
colnames(temp) <- "OTU"

# Separate the taxonomic headers                      
taxonomy_file <- temp |> 
  distinct(OTU) |> 
  separate(OTU,
           c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus"), 
           sep = ";", remove = FALSE) |> 
  column_to_rownames(var = "OTU")

# View file
head(taxonomy_file)

	Kingdom	Phylum	Class	Order	Family	Genus
Bacteria;10bav-F6;NA;NA;NA;NA	Bacteria	10bav-F6	NA	NA	NA	NA
Bacteria;Abditibacteriota;Abditibacteria;Abditibacteriales;Abditibacteriaceae;Abditibacterium	Bacteria	Abditibacteriota	Abditibacteria	Abditibacteriales	Abditibacteriaceae	Abditibacterium
Bacteria;Acetothermia;Acetothermiia;NA;NA;NA	Bacteria	Acetothermia	Acetothermiia	NA	NA	NA
Bacteria;Acidobacteriota;Acidobacteriae;AKIW659;NA;NA	Bacteria	Acidobacteriota	Acidobacteriae	AKIW659	NA	NA
Bacteria;Acidobacteriota;Acidobacteriae;Bryobacterales;Bryobacteraceae;Bryobacter	Bacteria	Acidobacteriota	Acidobacteriae	Bryobacterales	Bryobacteraceae	Bryobacter
Bacteria;Acidobacteriota;Acidobacteriae;PAUC26f;NA;NA	Bacteria	Acidobacteriota	Acidobacteriae	PAUC26f	NA	NA

Generate phyloseq object

A phyloseq object combines different elements of an analysis (i.e. the OTU table, the list of taxa and the mapping file) into one single object. We can easily generate such an object with the three dataframes we have generated above:

# Combine data into a phyloseq object
OTU = otu_table(merged_otu_table, taxa_are_rows = TRUE)
TAX = tax_table(as.matrix(taxonomy_file))
physeq = phyloseq(OTU, TAX, sample_data(metadata_combined))

# View structure
physeq

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 1248 taxa and 20 samples ]
sample_data() Sample Data:       [ 20 samples by 6 sample variables ]
tax_table()   Taxonomy Table:    [ 1248 taxa by 6 taxonomic ranks ]

Exploring the raw data

Get some summary statistics

Below, we write a custom function to calculate some summary statistics. We easily could do this without a function, however, since we want to compare the statistics before and after filtering the OTU table, the function is useful to have, since we do not need to copy-paste the exact same code in two spots of the workflow:

# Calculate the number of reads found per otu
reads_per_OTU <- taxa_sums(physeq)

# Summarize the data
print_summary(reads_per_OTU)

Total number of reads: 366,981 
Number of OTUs 1,248 
Number of singleton OTUs: 245 
Number of doubleton OTUs: 95 
Number of OTUs with less than 10 seqs: 609 
Total reads for OTUs with less than 10 seqs: 1,728 
Percentage of reads for OTUs with less than 10 seqs: 0.47%

For this workflow, we define *singletons** as reads/OTUs with a sequence that is present exactly once in the dataset.

Notice that another definition of singletons can be as taxa/OTU present in a single sample.

In amplicon data analyses it is useful to remove reads with low counts because they are very likely due to sequencing errors. We generally assume that sequencing errors are independent and randomly distributed, and we can assume that erroneous sequences will occur much less often than the true sequence. We will remove such sequences during the data filtering step.

Explore the sequencing depth

Next, let’s explore how many reads we have per sample:

# Count the number of reads per sample
sample_counts <- as.data.frame(colSums(merged_otu_table))
                  
# Clean up the dataframe
names(sample_counts)[1] <- "counts"
sample_counts$sampleID <- rownames(sample_counts)

# Plot counts
p_counts <-
  ggplot(data = sample_counts, aes(x = reorder(sampleID, counts, FUN=sum, decreasing = TRUE), y = counts)) +
  geom_point() +
  geom_text(aes(x = , sampleID, y = counts, label = counts),  hjust = 0, nudge_y = 400 , size = 2.5) +
  coord_flip() +
  xlab("") + 
  ylab("Read counts") +
  custom_theme()

Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.

p_counts

In this example, we see one sample with almost no reads and we want to make sure to remove this sample. We also see that we have a large difference between different samples. To be able to compare for example sample SB_P1 (~71,000 reads) with sample AW_SD2 (~4,000 reads) we need to normalize our data after the data filtering step.

Filter data

Next, we filter the data. Specifically, we:

Remove OTUs that are not assigned to anything at Phylum rank. The subset_taxa function can be used to remove any taxa you want, i.e. if you have plant DNA in your sample, you could use this to remove chloroplast sequences as well.
Remove samples with total read counts less than 20. This cutoff is arbitrary and depends a bit on your data. To choose a good value, explore the read counts you have per sample and define a cutoff based on that. In this example, we mainly want to remove the two low read samples we have seen in our plot.
Remove low count OTUs: The threshold is up to you; removing singletons or doubletons is common, but you can be more conservative and remove any counts less than 10.

In our example, we want to remove samples with 20 or less reads, remove taxa with less than 3 hits only and remove OTUs that occur in less than 10% of our samples (v1). Since there are many different thoughts about OTU table filtering, you can also find two other options (v2 and v3) on how to filter a table.

For most analyses we will work with the filtered data but there are some diversity metrics which rely on the presence of singletons within a sample (richness estimates, i.e. Chao), so you might choose to leave them in for those sorts of analyses only.

# Define cutoffs
counts_per_sample <- 20
otu_nr_cutoff <- 3
min_percentage_samples <- 10

# Remove taxa without tax assignment at Phylum rank
physeq_filt <- subset_taxa(physeq, Phylum != "NA")

# Remove samples with less than 20 reads
physeq_filt <- prune_samples(sample_sums(physeq)>= counts_per_sample, physeq)

# Calculate the minimum number of samples an otu should be present in
min_samples <- ceiling((min_percentage_samples / 100) * nsamples(physeq_filt))

# Remove otus that occur only rarely (v1)
# here, we calculate the total abundance of each otu across all samples and checks if it's greater than the specified otu_nr_cutoff AND we check if the otu occurs in at least <min_percentage_samples>% of samples
# we only retain OTUs that satisfy this condition 
physeq_filt <- prune_taxa(taxa_sums(physeq_filt) > otu_nr_cutoff & taxa_sums(physeq_filt) >= min_samples, physeq_filt)
physeq_filt

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 824 taxa and 19 samples ]
sample_data() Sample Data:       [ 19 samples by 6 sample variables ]
tax_table()   Taxonomy Table:    [ 824 taxa by 6 taxonomic ranks ]

# Remove otus that occur only rarely (v2)
# here, we count the number of samples where the abundance of an otu is > 0. 
# If this count is greater than the specified otu_nr_cutoff, the taxon is retained.
#physeq_filt <- filter_taxa(physeq, function (x) {sum(x > 0) > otu_nr_cutoff}, prune=TRUE)
#physeq_filt

# Remove otus that occur only rarely (v3)
# here, we remove taxa not seen more than 1 times in at least 10% of the samples
#physeq_filt = filter_taxa(physeq, function(x) sum(x > 1) > (0.1*length(x)), TRUE)
#physeq_filt

Next, we can calculate the summary statistics with the custom taxa_sums function we have defined before:

# Calculate the number of reads found per otu
reads_per_OTU_filt <- taxa_sums(physeq_filt)

# Summarize the data
print_summary(reads_per_OTU_filt)

Total number of reads: 366,333 
Number of OTUs 824 
Number of singleton OTUs: 0 
Number of doubleton OTUs: 0 
Number of OTUs with less than 10 seqs: 185 
Total reads for OTUs with less than 10 seqs: 1,083 
Percentage of reads for OTUs with less than 10 seqs: 0.30%

Normalize data

Below, we generate three new phyloseq objects using three different normalization approaches:

Compositional: transforms the data into relative abundance, similar to total sum scaling (TSS)
CLR: stands for centerd log-ratio transform and allows us to compare proportions of OTUs within each sample. After this transformation the values will no longer be counts, but rather the dominance (or lack thereof) for each taxa relative to the geometric mean of all taxa on the logarithmic scale
Rarefaction: scales all of your reads down to the lowest total sequencing depth. Notice, that this might drop many OTUs in higher read samples and might lead to under-estimation of low-abundant OTUs

Generating different phyloseq objects for different normalization approaches allows us to easily compare analysis steps with different inputs.

physeq_rel_abundance <- microbiome::transform(physeq_filt, "compositional")
physeq_clr <- microbiome::transform(physeq_filt, "rclr")
physeq_rarified <- rarefy_even_depth(physeq_filt)

Plot how the data changed:

faceted_plot <- plot_faceted_phyloseq(
  Raw = physeq_filt,
  Rarified = physeq_rarified,
  TSS = physeq_rel_abundance,
  CLR = physeq_clr
)

faceted_plot

If we plot the sums of each sample, we see that the rarefied data normalized to counts of approximately 4500 while during the relative abundance normalization the data was “scaled” to 0-1 (or 0-100%). The CLR-transformed data looks quite different because, after CLR transformation, we no longer work with raw counts. Instead, we focus on the dominance of taxa relative to the geometric mean of all taxa in a sample. This results in values like -2.1, 0.4, and 6.2, which makes plotting sample sums less meaningful. CLR is useful for comparing the relative abundance of taxa within each sample but not for direct comparisons of sample sums.

We can also generate histograms:

# Raw data 
df_filt <- cbind(as.data.frame(as(tax_table(physeq_filt), "matrix")),as.data.frame(as(taxa_sums(physeq_filt), "matrix")))

ps1_df_plot <- 
  ggplot(df_filt, aes(V1)) + 
  geom_histogram() + 
  custom_theme() +
  ggtitle("Distribution of reads (raw, filtered) per taxa") + 
  ylab("Frequency") +
  xlab("Total reads")

ps1_df_plot

# Relative abundance data 
df_ra <- cbind(as.data.frame(as(tax_table(physeq_rel_abundance), "matrix")),as.data.frame(as(taxa_sums(physeq_rel_abundance), "matrix")))

ps2_df_plot <- 
  ggplot(df_ra, aes(V1)) + 
  geom_histogram() + 
  custom_theme() +
  ggtitle("Distribution of reads (relative abundance) per taxa") + 
  ylab("Frequency") +
  xlab("Total reads")

ps2_df_plot

# Rarefied data 
df_clr <- cbind(as.data.frame(as(tax_table(physeq_clr), "matrix")),as.data.frame(as(taxa_sums(physeq_clr), "matrix")))

ps3_df_plot <- 
  ggplot(df_clr, aes(V1)) + 
  geom_histogram() + 
  custom_theme() +
  ggtitle("Distribution of reads (clr-transformed) per taxa") + 
  ylab("Frequency") +
  xlab("Total reads")

ps3_df_plot

We can see that for none of the transformations we moved into a normal distribution and should keep this in mind for when we do the statistical analyses.

Data exploration

Rarefaction

Rarefactions illustrate how well your sample was sampled. The rarefaction function takes a random subsample of each column in your OTU table of a given size, starting with a very small subsample, and counts how many unique OTUs were observed in that subsample. The analysis is repeated with increasing subsample sizes until the maximum actual read depth for your table is reached.

In an ideal dataset, each curve should reach a plateau, i.e. horizontal asymptote, ideally around the same depth. While this almost never is the case, exploring these graphs gives us an idea how well each sample was sampled.

# Create a df from our otu table
df <- as.data.frame(otu_table(physeq_filt))

# Rarefy data with a step size of 50, using tidy = TRUE will return a dataframe instead of a plot
df_rate <- rarecurve(t(df), step=50, cex=0.5, label = FALSE, tidy = TRUE)

# Add metadata
df_rate <- merge(df_rate, metadata_combined, by.x = "Site", by.y = "sample_id")

# Plot
df_rate %>%
  group_by(Site) %>%
  mutate(max_sample = max(Sample)) %>%
  mutate(label = if_else(Sample == max_sample, as.character(Site), NA_character_)) %>%
  ggplot(aes(x = Sample, y = Species, color = Carbon_source, group = interaction(Site, Carbon_source))) + 
  geom_line() + 
  facet_grid(cols = vars(Carbon_source)) +
  scale_color_manual(values = palette1) +
  geom_text_repel(aes(label = label),
            position = position_nudge(x = 6000),
            na.rm = TRUE, size = 3) +
  custom_theme()

In this example we can see that almost none of the samples reach a plateau, SB_P1 is the closest but not there yet. This suggests that we mainly sampled the abundant members of our community and might miss many rare taxa. This is something to keep in mind for other analyses, such as alpha diversity analyses and the statistics.

Alpha diversity

Alpha diversity measures the diversity within our sample and we distinguish between species richness (i.e. the number of species) and species richness (i.e. how relatively abundant each of the species are).

# Calculate different alpha diversity indicators
richness_meta <-microbiome::alpha(physeq_filt, index = "all")

# Add the sample id to table
richness_meta$sample_id <- rownames(richness_meta)

# Add other metadata data
richness_meta  <- merge(richness_meta, metadata_combined, by = "sample_id")

# Check what parameters are calculated
colnames(richness_meta)

 [1] "sample_id"                  "observed"                  
 [3] "chao1"                      "diversity_inverse_simpson" 
 [5] "diversity_gini_simpson"     "diversity_shannon"         
 [7] "diversity_fisher"           "diversity_coverage"        
 [9] "evenness_camargo"           "evenness_pielou"           
[11] "evenness_simpson"           "evenness_evar"             
[13] "evenness_bulla"             "dominance_dbp"             
[15] "dominance_dmn"              "dominance_absolute"        
[17] "dominance_relative"         "dominance_simpson"         
[19] "dominance_core_abundance"   "dominance_gini"            
[21] "rarity_log_modulo_skewness" "rarity_low_abundance"      
[23] "rarity_rare_abundance"      "Barcode"                   
[25] "Carbon_source"              "Wino_Column"               
[27] "Practical_group"            "name"

Next, we can generate a plot.

# Generate figure
alpha_plot <-
  ggplot(richness_meta, aes(x = Carbon_source, y = chao1, colour = Carbon_source)) +
    geom_boxplot(outliers = FALSE) +
    geom_jitter(aes(shape = Practical_group), width = 0.2, size = 2) +
    scale_color_manual(values = palette1) +
    labs(x = "", y = "Chao1 index") +
    custom_theme() +
    theme(axis.title.y = element_text(size = 16),
          axis.text.x = element_text(size = 16),
          axis.text.y = element_text(size = 12),
          legend.key=element_blank(), 
          legend.text = element_text(size=12),
          legend.title = element_text(size=16)
          ) 

alpha_plot

#save the figure to our computer
#ggsave(paste0("results/plots/alpha-div.png"), plot=alpha_plot, device="png")

We see that there is not a difference when comparing our different samples but that we have two potential outliers.

Beta diversity

In contrast to alpha diversity, beta diversity quantifies the dissimilarity between communities (multiple samples).

Commonly used metrics include:

the Bray-Curtis index (for compositional/abundance data)
Jaccard index (for presence/absence data, ignoring abundance information)
Aitchison distance (Euclidean distance for clr transformed abundances, aiming to avoid the compositionality bias)
Unifrac distance (that takes into account the phylogenetic tree information)
…

Avoid methods that use Euclidean distance as microbiome data are sparse datasets and better suited for the above mentioned methods. One exception of this when you calculate the Aitchison distance.

Based on the type of algorithm, ordination methods can be divided in two categories:

unsupervised, which measure variation in the data without additional information on covariates or other supervision of the model, including:
- Principal Coordinate Analysis (PCoA)
- Principal Component Analysis (PCA)
- Uniform Manifold Approximation and Projection for Dimension Reduction (UMAP)
supervised ordination:
- distance-based Redundancy Analysis (dbRDA)

Bray-Curtis

For Bray-Curtis our data should not have negative entries, so we will use the relative abundance (or rarefied) data:

# Convert otu table to df
data_otu_filt_rar <- t(data.frame(otu_table(physeq_rarified)))

# Calculate bray-curtis 
dist_bc <- as.matrix(vegdist(data_otu_filt_rar, method = "bray")) 

# Calculate PCOA using Phyloseq package
pcoa_bc <- ordinate(physeq_rarified, "PCoA", "bray") 

# Plot
plot_ordination(physeq_rarified, pcoa_bc, color = "Carbon_source", shape = "Practical_group") + 
  geom_point(size = 3) +
  stat_ellipse(aes(group = Carbon_source), linetype = 2) +
  scale_color_manual(values = palette1) +
  custom_theme()

First, this two-dimensions PCOA plot show 35% of the total variance between the samples. Next, we see that our samples are forming distinct clusters, i.e. microbiomes from the paper and wood communities seem quite different.

Next, we can do a statistical test:

# Extract metdatata
metadata2 <- data.frame(meta(physeq_rarified))

# Permanova test to test for differences between practical day
adonis2(data_otu_filt_rar~Practical_group, data = metadata2, permutations=9999, method="bray")

	Df	SumOfSqs	R2	F	Pr(>F)
Practical_group	1	0.1673426	0.0504851	0.9038793	0.4844
Residual	17	3.1473500	0.9495149	NA	NA
Total	18	3.3146926	1.0000000	NA	NA

# Permanova test to test for differences between carbon source
adonis2(data_otu_filt_rar~Carbon_source, data = metadata2, permutations=9999, method="bray")

	Df	SumOfSqs	R2	F	Pr(>F)
Carbon_source	1	0.6935776	0.2092434	4.498399	0.0011
Residual	17	2.6211149	0.7907566	NA	NA
Total	18	3.3146926	1.0000000	NA	NA

There seems to be a difference between our samples.

Aitchison

Next, we generate a beta-diversity ordination using the Aitchison distance and see if we get comparable results.

# PCA via phyloseq
# RDA without constraints is a PCA
ord_clr <- phyloseq::ordinate(physeq_clr, "RDA", distance = "euclidian")

# Scale axes
clr1 <- ord_clr$CA$eig[1] / sum(ord_clr$CA$eig)
clr2 <- ord_clr$CA$eig[2] / sum(ord_clr$CA$eig)

# Plot
phyloseq::plot_ordination(physeq_clr, ord_clr, color = "Carbon_source", shape = "Practical_group") + 
  geom_point(size = 2,) +
  coord_fixed(clr2 / clr1) +
  stat_ellipse(aes(group = Carbon_source), linetype = 2) +
  scale_color_manual(values = palette1) +
  custom_theme()

# Generate distance matrix
clr_dist_matrix <- phyloseq::distance(physeq_clr, method = "euclidean") 

# ADONIS test for experiment day --> No difference
adonis2(clr_dist_matrix~Practical_group, data = metadata2, permutations=9999, method="bray")

	Df	SumOfSqs	R2	F	Pr(>F)
Practical_group	1	727.4548	0.0632343	1.147547	0.2523
Residual	17	10776.6678	0.9367657	NA	NA
Total	18	11504.1226	1.0000000	NA	NA

# ADONIS test for carbon source --> We see a difference
adonis2(clr_dist_matrix~Carbon_source, data = metadata2, permutations=9999, method="bray")

	Df	SumOfSqs	R2	F	Pr(>F)
Carbon_source	1	2097.024	0.1822846	3.789629	1e-04
Residual	17	9407.098	0.8177154	NA	NA
Total	18	11504.123	1.0000000	NA	NA

We get consistent results with our two approaches.

Taxonomic distribution

Next, we want to plot the taxa distribution. Let us first look at the most abundant phyla and check how similar our different samples are:

# Condense data at specific tax rank, i.e. on phylum level
grouped_taxa <- tax_glom(physeq_rel_abundance, "Phylum")
  
# Find top19 most abundant taxa 
top_taxa <- names(sort(taxa_sums(grouped_taxa), TRUE)[1:19])

# Make a list for the remaining taxa
other_taxa <- names(taxa_sums(grouped_taxa))[which(!names(taxa_sums(grouped_taxa)) %in% top_taxa)]

# Group the low abundant taxa into one group
merged_physeq = merge_taxa(grouped_taxa, other_taxa, 2)
  
# Transform phyloseq object into dataframe
df <- psmelt(merged_physeq)

# Cleanup the names in the df
names(df)[names(df) == "Phylum"] <- "tax_level"

# Replace NAs, with other (NAs are generated for the other category)
df$tax_level[which(is.na(df$tax_level))] <- "Other"
  
# Create a df to sort taxa labels by abundance
sorted_labels <- df |> 
  group_by(tax_level) |> 
  summarise(sum = sum(Abundance)) |> 
  arrange(desc(sum))
  
# Get list of sorted levels excluding "Other"
desired_levels <- setdiff(sorted_labels$tax_level, "Other")
  
# Sort df using the sorted levels and ensure that "Other" is the last category
df$tax_level2 <- factor(df$tax_level, levels = c(desired_levels, "Other"))
df$Sample2 <- factor(df$Sample, levels = sorted_names)

# Generate color scheme
cols <- c25[1:length(unique(df$tax_level2))]
cols[levels(df$tax_level2) == "Other"] <- "#CCCCCC"
  
# Plot
fig <-
  ggplot(df, aes(x = Sample2, y = Abundance, fill = tax_level2) ) +
    geom_bar(position = "stack", stat = "identity", width = 0.9) +
    labs(y = "Relative abundance", x = "", title = paste0("Relative abundance at ", "Phylum", " rank")) +
    scale_fill_manual(name = paste0("Phylum","_rank"), labels = levels(df$tax_level2), values = cols) +
    facet_grid(cols =  vars(Carbon_source), scales = "free", space = "free") +
    scale_y_continuous(expand = c(0, 0), limits = c(0, 1.01)) +
    custom_theme() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1 ) ) +
    guides(fill=guide_legend(title = "Phylum"))

fig

Since we want to generate one plot for each taxonomic rank, i.e. Phylum, Class, Order,…, we can do this in a loop. The figures will be generated in the folder results/plots/.

If you do not feel comfortable with a lob, you can also do this step by step by removing the for statement and replacing all instances of level with the taxonomic rank you want to investigate.

# If not there already, create output folder
img_path="../results/r_analyses/images/"
dir.create(img_path, recursive = TRUE)

# Generate one barplot for each taxonomic level
for (level in colnames(taxonomy_file)){
  
  # Condense data at specific tax rank, i.e. on phylum level
  grouped_taxa <- tax_glom(physeq_rel_abundance, level)
  
  # Find top19 most abundant taxa 
  top_taxa <- names(sort(taxa_sums(grouped_taxa), TRUE)[1:19])
  
  # Make a list for the remaining taxa
  other_taxa <- names(taxa_sums(grouped_taxa))[which(!names(taxa_sums(grouped_taxa)) %in% top_taxa)]
  
  # Group the low abundant taxa into one group
  merged_physeq = merge_taxa(grouped_taxa, other_taxa, 2)
  
  # Transform phyloseq object into dataframe
  df <- psmelt(merged_physeq) 
  
  # Cleanup the dataframe
  names(df)[names(df) == level] <- "tax_level"
  df$tax_level[which(is.na(df$tax_level))] <- "Other"
  
  # Create a df to sort taxa labels by abundance
  sorted_labels <- df |> 
    group_by(tax_level) |> 
    summarise(sum = sum(Abundance)) |> 
    arrange(desc(sum))
  
  # Get list of sorted levels excluding "Other"
  desired_levels <- setdiff(sorted_labels$tax_level, "Other")
  
  # Sort df using the sorted levels and ensure that "Other" is the last category
  df$tax_level2 <- factor(df$tax_level, levels = c(desired_levels, "Other"))
  df$Sample2 <- factor(df$Sample, levels = sorted_names)

  # Generate color scheme
  cols <- c25[1:length(unique(df$tax_level2))]
  cols[levels(df$tax_level2) == "Other"] <- "#CCCCCC"
  
  # Plot
  fig <-
    ggplot(df, aes(x = Sample2, y = Abundance, fill = tax_level2) ) +
      geom_bar(position = "stack", stat = "identity", width = 0.9) +
      labs(y = "Relative abundance", x = "", title = paste0("Relative abundance at ", level, " rank")) +
      scale_fill_manual(name = paste0(level,"_rank"), labels = levels(df$tax_level2), values = cols) +
      facet_grid(cols =  vars(Carbon_source), scales = "free", space = "free") +
      scale_y_continuous(expand = c(0, 0), limits = c(0, 1.01)) +
      custom_theme() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1 ) ) +
      guides(fill=guide_legend(title=level))
  
  ggsave(paste0(img_path, level, "_barplot_ra.png"), plot = fig, device="png")
  }

Generated plots:

Differential abundance testing

Notice: This is still in development and needs to be optimized, use with care!

Wilcox

First, let us compare for differences for a category where we compare two factors. For example, we might want to ask whether there are any significant differences when comparing our wood versus paper samples.

Let’s first test this using the non-parametric Wilcoxon rank-sum test.

# Extract OTU table for intersect
df <- cbind(as.data.frame(as(tax_table(physeq_clr), "matrix")),as.data.frame(as(otu_table(physeq_clr), "matrix")))

df_long <- reshape2::melt(df)

Using Kingdom, Phylum, Class, Order, Family, Genus as id variables

df_long <- merge(df_long, metadata2, by.x = "variable", by.y = "sample_id")

# Run non-parametric stats
temp <- df_long |> 
  group_by(Phylum, Class, Order, Family, Genus) |> 
  wilcox_test(value ~ Carbon_source) |> 
  mutate(BH_FDR = p.adjust(p, "BH"))

# Extract significant results
wilcox_results <- temp |> 
  arrange(BH_FDR) |> 
  filter(BH_FDR < 0.05) |> 
  mutate(tax = paste("Bacteria", Phylum, Class, Order, Family, Genus, sep = ";"))

Wilcoxon has some down-sites when it comes to treating zero values, an alternative approach is to use the ANOVA-like differential expression (ALDEx2) method. The aldex function is a wrapper that performs log-ratio transformation and statistical testing in a single line of code, which is why we feed the non-normalized data into it:

ALDEx2

aldex2_da <- ALDEx2::aldex(data.frame(phyloseq::otu_table(physeq_filt)), phyloseq::sample_data(physeq_filt)$Carbon_source, test="t", effect = TRUE, denom="iqlr")

aldex.clr: generating Monte-Carlo instances and clr values

operating in serial mode

computing iqlr centering

aldex.ttest: doing t-test

aldex.effect: calculating effect sizes

Specifically, this function:

generates Monte Carlo samples of the Dirichlet distribution for each sample,
converts each instance using a log-ratio transform,
returns test results for two sample (Welch’s t, Wilcoxon) or multi-sample (glm, Kruskal-Wallace) tests.

iqlr” accounts for data with systematic variation and centers the features on the set features that have variance that is between the lower and upper quartile of variance. This provides results that are more robust to asymmetric features between groups.

Next, we can plot the effect size. The effect size plot shows the median log2 fold difference by the median log2 dispersion. This is a measure of the effect size by the variability. Differentially abundant taxon will be those where the difference most exceeds the dispersion. Points toward the top of the figure are more abundant in CF samples while those towards the bottom are more abundant in healthy controls. Taxa with BH-FDR corrected p-values are shown in red.

# Plot effect sizes
ALDEx2::aldex.plot(aldex2_da, type="MW", test="wilcox", called.cex = 1, cutoff.pval = 0.001)

Finally, we can print the output with the taxa information added.

# Clean up presentation
sig_aldex2 <- aldex2_da %>%
  rownames_to_column(var = "OTU") %>%
  filter(wi.eBH < 0.05) %>%
  arrange(effect, wi.eBH) %>%
  dplyr::select(OTU, diff.btw, diff.win, effect, wi.ep, wi.eBH)

head(sig_aldex2)

OTU	diff.btw	diff.win	effect	wi.ep	wi.eBH
Bacteria;Firmicutes;Clostridia;Oscillospirales;Hungateiclostridiaceae;Ruminiclostridium	-7.567409	2.1694829	-3.528451	0.0000265	0.0043687
Bacteria;Firmicutes;Clostridia;Oscillospirales;Hungateiclostridiaceae;Acetivibrio	-9.304527	3.1653667	-2.884341	0.0000265	0.0043687
Bacteria;Firmicutes;Clostridia;Peptostreptococcales-Tissierellales;NA;JTB215	-1.988928	0.9905788	-2.025016	0.0000606	0.0059223
Bacteria;SAR324 clade(Marine group B);NA;NA;NA;NA	-2.541417	1.2263866	-1.843190	0.0003946	0.0146265
Bacteria;Firmicutes;Clostridia;Oscillospirales;Hungateiclostridiaceae;uncultured	-2.047430	1.1253063	-1.817536	0.0001089	0.0075088
Bacteria;Verrucomicrobiota;Lentisphaeria;Oligosphaerales;Lenti-02;NA	-2.290918	1.2910443	-1.790067	0.0001856	0.0084944

If an empty dataframe is returned, no significant differences were detected.

DESeq2

DESeq2 performs a differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution. DeSeq normalizes the data throughout its analysis, so we input only the filtered data. For more theory, visit this page.

# Convert treatment column to a factor
sample_data(physeq_filt)$Carbon_source <- as.factor(sample_data(physeq_filt)$Carbon_source)

# Convert the phyloseq object to a DESeqDataSet and run DESeq2:
ds <- phyloseq_to_deseq2(physeq_filt, ~ Carbon_source)

converting counts to integer mode

# Since our samples contain a lot of 0s (something DeSeq is NOT designed for) we use some alternative means to estimate the size factor
ds <-estimateSizeFactors(ds, type = 'poscounts')
ds <- DESeq(ds)

using pre-existing size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

-- replacing outliers and refitting for 55 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

estimating dispersions

fitting model and testing

# Extract the results and filter by a FDR cutoff of 0.01 and
# find significantly differentially abundant OTU between the seasons “paper” and “wood”
alpha = 0.05
res_desq = results(ds, contrast=c("Carbon_source", "paper", "wood"), alpha=alpha)
res_desq = res_desq[order(res_desq$padj, na.last=NA), ]
res_sig_deseq = as.data.frame(res_desq[(res_desq$padj < alpha), ])

# Print number of significant results
dim(res_sig_deseq)

[1] 136   6

Plot significant OTUs (counts):

# Extract relevant data
df <- cbind(as.data.frame(as(tax_table(physeq_filt)[rownames(res_sig_deseq), ], "matrix")),as.data.frame(as(otu_table(physeq_filt)[rownames(res_sig_deseq), ], "matrix")))

# Convert dataframe into long format
df_long <- reshape2::melt(df)

Using Kingdom, Phylum, Class, Order, Family, Genus as id variables

# Add metadata
df_long <- merge(df_long, metadata2, by.x = "variable", by.y = "sample_id")

# Plot
ggplot(df_long, aes(x=Genus, y=value, color = Carbon_source)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_jitterdodge(jitter.width=0.1), size = 0.9)  +
  custom_theme() +
  scale_color_manual(values = palette1) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Plot significant OTUs (relative abundance):

# Extract the relevant data
df <- cbind(as.data.frame(as(tax_table(physeq_rel_abundance)[rownames(res_sig_deseq), ], "matrix")),as.data.frame(as(otu_table(physeq_rel_abundance)[rownames(res_sig_deseq), ], "matrix")))

# Convert to long df
df_long <- reshape2::melt(df)

Using Kingdom, Phylum, Class, Order, Family, Genus as id variables

# Add metadata
df_long <- merge(df_long, metadata2, by.x = "variable", by.y = "sample_id")

# Plot
ggplot(df_long, aes(x=Genus, y=value, color = Carbon_source)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_jitterdodge(jitter.width=0.1), size = 0.9)  +
  custom_theme() +
  scale_color_manual(values = palette1) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ANCOM

# Run ancom-bc2
# Settings:
# prv_cut: a numerical fraction between 0 and 1. Taxa with prevalences less than prv_cut will be excluded in the analysis
# a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than lib_cut
ancom_out = ancombc2(data = physeq_filt,
               fix_formula = "Carbon_source",
               rand_formula = NULL,
               p_adj_method = "holm", 
               pseudo_sens = FALSE,  #FALSE to speed things up for testing, but makes things less sensitive
               prv_cut = 0.10, lib_cut = 1000, s0_perc = 0.05,
               group = "Carbon_source",
               alpha = 0.05, n_cl = 2, verbose = TRUE,
               global = TRUE)

`tax_level` is not speficified 
No agglomeration will be performed
Otherwise, please speficy `tax_level` by one of the following: 
Kingdom, Phylum, Class, Order, Family, Genus

Warning: The group variable has < 3 categories 
The multi-group comparisons (global/pairwise/dunnet/trend) will be deactivated

Obtaining initial estimates ...

Estimating sample-specific biases ...

ANCOM-BC2 primary results ...

# Store resukts
res_ancom = ancom_out$res

# Extract data for figure
res_f_fig = res_ancom %>%
    dplyr::filter(diff_Carbon_sourcewood == TRUE) %>% 
    dplyr::arrange(desc(lfc_Carbon_sourcewood)) %>%
    dplyr::mutate(direct = ifelse(lfc_Carbon_sourcewood > 0, "Positive LFC", "Negative LFC"),
                  color = ifelse(lfc_Carbon_sourcewood > 0, "aquamarine3", "black"))

res_f_fig$taxon = factor(res_f_fig$taxon, levels = res_f_fig$taxon)
res_f_fig$direct = factor(res_f_fig$direct, 
                           levels = c("Positive LFC", "Negative LFC"))

# Plot
fig_ancom = res_f_fig %>%
    ggplot(aes(x = lfc_Carbon_sourcewood, y = taxon, fill = direct)) + 
    geom_bar(stat = "identity", width = 0.7, color = "black", 
             position = position_dodge(width = 0.4)) +
    geom_errorbar(aes(xmin = lfc_Carbon_sourcewood - se_Carbon_sourcewood, xmax = lfc_Carbon_sourcewood + se_Carbon_sourcewood), 
                  width = 0.2, position = position_dodge(0.05), color = "black") + 
    labs(y = NULL, x = "Log fold change Wood versus Paper", 
         title = "Log fold changes") + 
    scale_fill_discrete(name = NULL) +
    scale_color_discrete(name = NULL) +
    custom_theme() + 
    theme(plot.title = element_text(hjust = 0.5),
          panel.grid.minor.y = element_blank(),
          axis.text.y = element_text(size=6))

fig_ancom

Combine results

# For each statistical analysis make a list of significant hits
x <- list(
  ancom = as.character(res_f_fig$taxon), 
  deseq = as.character(rownames(res_sig_deseq)), 
  wilcox = as.character(wilcox_results$tax),
  aldex = sig_aldex2$OTU
  )

# Plot Vendiagram
ggvenn(
  x, 
  fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF"),
  stroke_size = 0.5, set_name_size = 4
  )

# Get intersection of all methods
v_table <- venn(x, show.plot=FALSE)
intersections<-attr(v_table,"intersections")
sig_otus_ven <- intersections$`ancom:deseq:wilcox:aldex`

# Extract OTU table for intersect
df <- cbind(as.data.frame(as(tax_table(physeq_clr)[sig_otus_ven, ], "matrix")),as.data.frame(as(otu_table(physeq_clr)[sig_otus_ven, ], "matrix")))

df_long <- reshape2::melt(df)

Using Kingdom, Phylum, Class, Order, Family, Genus as id variables

df_long <- merge(df_long, metadata2, by.x = "variable", by.y = "sample_id")
df_long$tax <- paste("P_", df_long$Phylum, " ", "O_", df_long$Order, " ", "G_", df_long$Genus, sep = "")

# Plot
ggplot(df_long, aes(x=tax, y=value, color = Carbon_source)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_jitterdodge(jitter.width=0.1), size = 0.9)  +
  custom_theme() +
   facet_wrap(~tax, scales = "free", labeller = label_wrap_gen()) +
  scale_color_manual(values = palette1) +
  theme(axis.text.x = element_blank(),
        strip.text.x = element_text(size = 4)) +
  ylab("CLR transformed counts")

How to interpret the CLR values for i.e. Acetovibrio:

Acetovibrio is more abundant in Condition A (with a positive CLR value of 5) than in Condition B (where it has a negative CLR value of -1)
CLR value of 5 on paper indicates that the abundance of Acetovibrio is higher than the geometric mean of the other genera in the composition in paper samples
CLR value of -1 on wood indicates that the abundance of Acetovibrio is slightly lower than the geometric mean of the other genera. In this case, Acetovibrio is less abundant under these growth conditions relative to other microbial taxa.