#check if wdir is correct
#getwd()
#set seed
set.seed(1)
Analysing OTU tables with R
The goal of this tutorial is to analyse 16S rRNA gene data. Our input file is an OTU/ASV table that already contains some taxa information and will go through the following steps:
- Reading the data into R
- Filtering the data
- Normalizing the data
- Visualizing the data (i.e. alpha/beta diversity, barplots, …)
In the example we look at an OTU table of 28 samples. These 28 samples represent three distinct microbiomes from three Winogradsky column experiments in which columns were created using wood, paper and a wood/paper mix as substrate. DNA was collected on two separate dates, so another category we can compare is the sampling date.
If you want to follow this tutorial, then you can find the required input files here.
This tutorial covers only a small fraction of the possible methods for filtering, normalizing, and analyzing OTU tables. There are many alternative approaches and additional steps that can be taken based on your specific data and research questions.
See this tutorial as a starting point for your analyses and ensure that you check out other resources as well!
Setup
We start with setting a seed seed for the normalization protocol.
Setting a set is not essential but this way we make sure that we get the same results when normalizing our OTU table. If we randomly select some observations for any task in R, this results in different values all the time because of randomization. If we want to keep the values that are produced at first random selection then we can do this by storing them in an object after randomization or we can fix the randomization procedure so that we get the same results all the time.
Installation notes
Some packages required for this workflow are installed with BiocManager or devtools, if you need to install any of these tools, remove the #
from the code and run it.
#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("phyloseq")
#BiocManager::install("microbiome")
#BiocManager::install("ALDEx2")
#BiocManager::install("DESeq2")
#BiocManager::install("metagenomeSeq")
#if (!requireNamespace("devtools")) install.packages("devtools")
#devtools::install_github("Russel88/DAtest")
Load packages
library(tidyverse) #general parsing
library(data.table) #general parsing
library(phyloseq) #phyloseq object loading
library(vegan) #rarefaction
library(microbiome) #normalization
library(ALDEx2) #stats
library(DESeq2) #stats
library(grid) #organizing multiple plots
library(gridExtra) #organizing multiple plots
library(scales) #plot aesthetic, comma setting
library(DAtest) #stats, tba
library(metagenomeSeq) # stats
library(dendextend) #control colors for dendrograms
Define custom functions
Next, we read in some custom things such as:
- a custom theme that we will use for plotting our graphs
- a custom function that we use to calculate some summary statistics for our otu table
- a color vector to have more control over what colors are used in our graphs
Defining custom themes for our plots and custom functions is useful because it means that instead of having to write the code for our plots over and over again if we want to use them more than once, we can just use these functions instead.
You can add custom functions closer to where you use them in the code, but I prefer to have them organized in one section of the workflow to keep my code organized.
#define custom theme for generating figures
<- function() {
custom_theme theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border =element_blank(),
axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5),
strip.text.x = element_text(size = 7),
strip.text.y = element_text(size = 7),
strip.background = element_rect(fil="#FFFFFF", color = "black", linewidth = 0.5),
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7),
legend.text = element_text(size = 8), legend.title = element_text(size = 10)
)
}
#generate color scheme
<- c("dodgerblue2", "#E31A1C", "green4", "#6A3D9A", "#FF7F00", "black", "gold1", "skyblue2", "#FB9A99",
c25 "palegreen2", "#CAB2D6", "#FDBF6F", "gray70", "khaki2", "maroon", "orchid1", "deeppink1", "blue1",
"steelblue4", "darkturquoise", "green1", "yellow4", "yellow3","darkorange4", "brown")
#define a custom function to summarize several aspects of our otu table
<- function(df) {
summarize_table <- sum(df)
total_reads <- length(df)
otu_number <- length(df[df == 1])
num_singletons <- length(df[df == 2])
num_doubletons <- length(df[df < 10])
num_less_than_10 <- sum(df[df < 10])
total_reads_less_than_10 <- (total_reads_less_than_10 / sum(df)) * 100
perc_reads_less_than_10
cat("Total number of reads:", format(total_reads, big.mark = ","), "\n")
cat("Number of OTUs", format(otu_number, big.mark = ","), "\n")
cat("Number of singleton OTUs:", format(num_singletons, big.mark = ","), "\n")
cat("Number of doubleton OTUs:", format(num_doubletons, big.mark = ","), "\n")
cat("Number of OTUs with less than 10 seqs:", format(num_less_than_10, big.mark = ","), "\n")
cat("Total reads for OTUs with less than 10 seqs:", format(total_reads_less_than_10, big.mark = ","), "\n")
cat("Percentage of reads for OTUs with less than 10 seqs:", sprintf("%.2f%%", perc_reads_less_than_10), "\n")
}
# Flexible function to create a faceted plot for multiple phyloseq objects
<- function(...) {
plot_faceted_phyloseq # Capture the list of phyloseq objects and their method names
<- list(...)
physeq_list
# Initialize an empty data frame to store the combined data
<- data.frame(Sample = character(), Sums = numeric(), Method = character())
combined_df
# Iterate over the list and extract sample sums for each phyloseq object
for (name in names(physeq_list)) {
<- physeq_list[[name]]
physeq_obj <- data.frame(Sample = names(sample_sums(physeq_obj)),
sample_sums_df Sums = sample_sums(physeq_obj),
Method = name)
<- bind_rows(combined_df, sample_sums_df)
combined_df
}
# Set the factor levels for 'Method' in the order they were provided
$Method <- factor(combined_df$Method, levels = names(physeq_list))
combined_df
# Create the faceted plot
<- ggplot(combined_df, aes(x = Sample, y = Sums)) +
p geom_bar(stat = "identity") +
facet_wrap(~ Method, scales = "free_y") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("Sample") +
ylab("Sample Sums") +
ggtitle("Faceted Plot of Sample Sums")
return(p)
}
Read in the data
OTU table
An OTU table contains a column with the OTUs (taxonomic ranks in our case) and one column per sample with the counts how often OTU is found in the sample. It might look something like this:
#NAME | EP1910 | RMT | KJB3 | TJR |
---|---|---|---|---|
Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales;Acidobacteriaceae (Subgroup 1);NA | 0 | 0 | 0 | 0 |
Bacteria;Acidobacteria;Acidobacteriia;Solibacterales;Solibacteraceae (Subgroup 3);PAUC26f | 0 | 5 | 3 | 1 |
If you follow this tutorial, ensure that your OTU table is in a folder called input and that this folder is located where your R script is. If your OTU table is somewhere else, change the path stored in filepaths
accordingly.
#provide the path to the OTU table
<- c("input/otu_table.txt")
file_paths
#read in otu table
<- read.table(file_paths, header = T, sep = '\t', comment = "")
merged_otu_table colnames(merged_otu_table)[1] <- "taxid"
#replace NA with 0
is.na(merged_otu_table)] <- 0
merged_otu_table[
#use the taxonID (or OTU IT) as rownames
rownames(merged_otu_table) <- merged_otu_table$taxid
$taxid <- NULL
merged_otu_table
#check how many otus and samples we have
dim(merged_otu_table)
[1] 1530 28
With this example OTU table, we work with 28 samples and 1530 OTUs.
OTU tables (optional)
If you have more than one table, for example if you generated an OTU table using different classifiers, you could read in the data as follows:
Show the code
<- c("results/classification/minimap2/ITS1.merged.outmat.tsv",
file_paths "results/classification/kraken2/ITS1.merged.outmat.tsv")
<- character(0)
methods
for (i in 1:length(file_paths)){
<- read.table(file_paths[i], header = T, sep = '\t', comment = "")
table <- gsub(".*/([^/]+)/.*", "\\1", file_paths[i])
method names(table)[1] <- "taxid"
<- c(methods, method)
methods colnames(table)[-1] <- paste0(colnames(table)[-1], "_", method)
ifelse (i == 1, merged_otu_table <- table,
<- merge(merged_otu_table, table, by = "taxid", all=TRUE))
merged_otu_table
}
# Replace NA with 0
is.na(merged_otu_table)] <- 0
merged_otu_table[
# Restore row names
rownames(merged_otu_table) <- merged_otu_table$taxid
$taxid <- NULL
merged_otu_table
dim(merged_otu_table)
Metadata
Next, we read in another table that contains more information about our samples. Such a table could look something like this:
#NAME | treatment | Date |
---|---|---|
EP1910 | wood | 2023_1 |
RMT | paper | 2023_1 |
KJB3 | mix | 2023_1 |
TJR | paper | 2023_1 |
IB5 | wood | 2023_1 |
ALIDNA | wood | 2023_1 |
IG7 | paper | 2023_1 |
B314 | mix | 2023_1 |
#read in metadata file
<- read.table("input/sample_table.txt", header = TRUE, row.names = 1, sep = "\t", comment.char = "")
metadata_combined
#add extra column for sample names
$name <- paste0(metadata_combined$treatment, "_", rownames(metadata_combined))
metadata_combined$sample_id <- rownames(metadata_combined)
metadata_combined
#order the factors for our names column
<- metadata_combined |>
metadata_combined arrange(desc(treatment))
#view output
head(metadata_combined)
treatment | Date | name | sample_id | |
---|---|---|---|---|
EP1910 | wood | 2023_1 | wood_EP1910 | EP1910 |
IB5 | wood | 2023_1 | wood_IB5 | IB5 |
ALIDNA | wood | 2023_1 | wood_ALIDNA | ALIDNA |
VS15G | wood | 2023_1 | wood_VS15G | VS15G |
NZC319 | wood | 2023_1 | wood_NZC319 | NZC319 |
E321 | wood | 2023_1 | wood_E321 | E321 |
Generate taxonomy file
Next, we generate a table that lists the taxonomy information for each taxonomic rank. We do this by taking the information from our OTU table. Depending on how you analysed your 16S rRNA gene sequences, you might have:
- an OTU table with the taxonomy information as row names instead of IDs. That is what we are working with for this tutorial. In our case, the different taxonomic ranks are separated with a semicolon
- an OTU table with IDs (ASV1, ASV2, … or OTU1, OTU2, …) and a separate table with the taxonomy information. If that is the case, you can read in the taxonomy information separately
#extract taxonomy information and convert to separate df
<- as.data.frame(rownames(merged_otu_table))
df colnames(df) <- "OTU"
#separate the taxonomic headers into separate rows
<- df |>
taxonomy_file distinct(OTU) |>
separate(OTU,
c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus"),
sep = ";", remove = FALSE) |>
column_to_rownames(var = "OTU")
#view file
head(taxonomy_file)
Kingdom | Phylum | Class | Order | Family | Genus | |
---|---|---|---|---|---|---|
Bacteria;Acetothermia;Acetothermiia;bacterium enrichment culture clone 73(2013);bacterium enrichment culture clone 73(2013);bacterium enrichment culture clone 73(2013) | Bacteria | Acetothermia | Acetothermiia | bacterium enrichment culture clone 73(2013) | bacterium enrichment culture clone 73(2013) | bacterium enrichment culture clone 73(2013) |
Bacteria;Acetothermia;Acetothermiia;uncultured bacterium;uncultured bacterium;uncultured bacterium | Bacteria | Acetothermia | Acetothermiia | uncultured bacterium | uncultured bacterium | uncultured bacterium |
Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales;Acidobacteriaceae (Subgroup 1);NA | Bacteria | Acidobacteria | Acidobacteriia | Acidobacteriales | Acidobacteriaceae (Subgroup 1) | NA |
Bacteria;Acidobacteria;Acidobacteriia;Solibacterales;Solibacteraceae (Subgroup 3);PAUC26f | Bacteria | Acidobacteria | Acidobacteriia | Solibacterales | Solibacteraceae (Subgroup 3) | PAUC26f |
Bacteria;Acidobacteria;Aminicenantia;Aminicenantales;NA;NA | Bacteria | Acidobacteria | Aminicenantia | Aminicenantales | NA | NA |
Bacteria;Acidobacteria;Aminicenantia;Aminicenantales;uncultured Aminicenantes bacterium;uncultured Aminicenantes bacterium | Bacteria | Acidobacteria | Aminicenantia | Aminicenantales | uncultured Aminicenantes bacterium | uncultured Aminicenantes bacterium |
Generate phyloseq object
A phyloseq object combines different elements of an analysis (i.e. the OTU table, the list of taxa and the mapping file) into one single object. We can easily generate such an object with the three dataframes we have generated above:
#combine data
= otu_table(merged_otu_table, taxa_are_rows = TRUE)
OTU = tax_table(as.matrix(taxonomy_file))
TAX = phyloseq(OTU, TAX, sample_data(metadata_combined))
physeq
#view structure
physeq
phyloseq-class experiment-level object
otu_table() OTU Table: [ 1530 taxa and 28 samples ]
sample_data() Sample Data: [ 28 samples by 4 sample variables ]
tax_table() Taxonomy Table: [ 1530 taxa by 6 taxonomic ranks ]
Explore the raw data
Get some summary statistics
Below, we use our custom function summarize_table
calculate some summary statistics. As said above, we could easily do this without a function, however, since we want to compare the statistics before and after filtering the OTU table, the function is useful to have, since we do not need to copy-paste the exact same code in two spots of the workflow:
#calculate the number of reads found per otu
<- taxa_sums(physeq)
reads_per_OTU
#summarize the data
summarize_table(reads_per_OTU)
Total number of reads: 191,661
Number of OTUs 1,530
Number of singleton OTUs: 359
Number of doubleton OTUs: 148
Number of OTUs with less than 10 seqs: 867
Total reads for OTUs with less than 10 seqs: 2,483
Percentage of reads for OTUs with less than 10 seqs: 1.30%
For this workflow, we define singletons as reads/OTUs with a sequence that is present exactly once in the dataset.
Notice that another definition of singletons can be as taxa/OTU present in a single sample.
In amplicon data analysis it is useful to remove reads with low counts because they are very likely due to sequencing errors. Here, we make the assumption that sequencing errors are independent and randomly distributed and that erroneous sequences will occur much less often than a biologically true sequence. For our analyses not to be affected by potentially erroneous sequences we will remove them during the data filtering step.
Explore the OTU counts (optional)
Next, we calculate the cumulative counts of our reads to get a feeling for how much data we would loose if we remove singletons, doubletons, etc. To do this, we calculate something like this:
For OTUs with an read count/abundance of 1, there are 359 occurrences (N), so the cumulative sum is 359.
For OTUs with an read count/abundance of 2, there are 148 occurrences (N), so the cumulative sum becomes 359 + 148 = 507.
This would tell us that if we would remove doubletons, 507 OTUs total to be removed, and so on … We do this, to get an idea about how many OTUs would be removed in total if we want to remove singletons, doubletons, 5-tons, etc…
#extract the relevant data into a data table
= data.table(tax_table(physeq), OTUabundance = taxa_sums(physeq), OTU = taxa_names(physeq))
dt_taxa
#create a new df with the abundances and the occurrences (N)
= dt_taxa[, .N, by = OTUabundance]
dt_cumsum
#sort the df
setkey(dt_cumsum, OTUabundance)
#add a new column with the cumulative sum for the otus
:= cumsum(N)]
dt_cumsum[, CumSumOTUs
#add a new column with the cumulative sum for the reads
:= cumsum(dt_cumsum$OTUabundance * dt_cumsum$N )]
dt_cumsum[, CumSumReads
#get the total read nr
<- sum(dt_cumsum$OTUabundance * dt_cumsum$N)
total_abundance_cumsum
#calc perc of reads removed
$perc_read_removed <- dt_cumsum$CumSumReads/total_abundance_cumsum*100
dt_cumsum
#view the df
head(dt_cumsum)
OTUabundance | N | CumSumOTUs | CumSumReads | perc_read_removed |
---|---|---|---|---|
1 | 359 | 359 | 359 | 0.1873099 |
2 | 148 | 507 | 655 | 0.3417492 |
3 | 100 | 607 | 955 | 0.4982756 |
4 | 65 | 672 | 1215 | 0.6339318 |
5 | 64 | 736 | 1535 | 0.8008932 |
6 | 47 | 783 | 1817 | 0.9480280 |
Next, we can plot this and zoom into how many OTUs we would remove if we remove singletons, doubletons, 5-tons, … :
# Define the plot
<- ggplot(dt_cumsum, aes(OTUabundance, CumSumOTUs)) +
p1 geom_point() +
xlab("Filtering threshold, minimum total counts") +
ylab("OTUs Filtered") +
ggtitle("OTUs that would be filtered vs. the minimum count threshold" ) +
custom_theme() +
theme(plot.title = element_text(size = 8, face = "bold"))
#find the max value to scale our plot
<- max(dt_cumsum$OTUabundance)
max
# Specify zoom levels
<- c(1, 2, 5, 10, 25, 50, 100, max)
zoom_levels
# Create and arrange plots in a 2x2 grid
<- list()
plot_list for (zoom in zoom_levels) {
#subset data to our zoom level
<- dt_cumsum[OTUabundance <= zoom]
subset_data
#define the max value to scale each plot
<- max(subset_data$CumSumOTUs)
max_cumsum
#generate a plot
<- p1 +
p2 xlim(0, zoom) +
ggtitle("") +
custom_theme() +
scale_y_continuous(limits = c(0, max_cumsum),
breaks = seq(0, max_cumsum,
by = max_cumsum / 4),
labels = label_number())
length(plot_list) + 1]] <- p2
plot_list[[
}
# Arrange plots in a 2x2 grid
grid.arrange(grobs = plot_list, ncol = 2,
top = textGrob("OTUs that would be filtered vs. the minimum count threshold",gp=gpar(fontsize=12,font=3)))
We can also check the percentage of reads filtered:
# Define the plot
<- ggplot(dt_cumsum, aes(OTUabundance, perc_read_removed)) +
p1 geom_point() +
xlab("Filtering threshold, minimum total counts") +
ylab("% reads filtered") +
ggtitle("OTUs that would be filtered vs. the minimum count threshold" ) +
custom_theme() +
theme(plot.title = element_text(size = 8, face = "bold"))
<- max(dt_cumsum$OTUabundance)
max
# Specify zoom levels
<- c(1, 2, 5, 10, 25, 50, 100, max)
zoom_levels
# Create and arrange plots in a 2x2 grid
<- list()
plot_list for (zoom in zoom_levels) {
<- dt_cumsum[OTUabundance <= zoom]
subset_data <- max(subset_data$perc_read_removed)
max_perc
<- p1 +
p2 xlim(0, zoom) +
ggtitle("") +
custom_theme() +
scale_y_continuous(limits = c(0, max_perc),
breaks = seq(0, max_perc,
by = max_perc / 4),
labels = label_comma())
length(plot_list) + 1]] <- p2
plot_list[[
}
# Arrange plots in a 2x2 grid
grid.arrange(grobs = plot_list, ncol = 2,
top = textGrob("% of reads that would be filtered vs. the minimum count threshold",gp=gpar(fontsize=12,font=3)))
Overall, we can see that even if we are conservative and for example remove 5-tons that while we remove quite a high number of OTUs (~740), however, we remove less than 1% of the reads.
Finding a good cut-off can be tricky and if you are unsure and do not want to remove rare taxa you might only want to remove singletons or play around with different thresholds.
Explore the sequencing depth
Next, let’s explore a bit closer how our reads are distributed among our different samples:
#count the number of reads per sample
<- as.data.frame(colSums(merged_otu_table))
sample_counts
#clean up the dataframe
names(sample_counts)[1] <- "counts"
$sampleID <- rownames(sample_counts)
sample_counts
#plot counts
<-
p_counts ggplot(data = sample_counts, aes(x = reorder(sampleID, counts, FUN=sum, decreasing = TRUE), y = counts)) +
geom_point() +
geom_text(aes(x = , sampleID, y = counts, label = counts), hjust = 0, nudge_y = 200 , size = 2.5) +
coord_flip() +
xlab("") +
ylab("Read counts") +
custom_theme()
p_counts
In this example, we see two samples with almost no reads and we want to make sure to remove these samples in our filtering step.
We also see that we have a large difference between different samples and that our samples have read depths ranging from 2 to 24,368. To be able to compare for example sample IV (~25,000 reads) with sample MN (~1,000 reads) we need to normalize our data after the data filtering step.
Filter data
Next, we filter the data. Specifically, we:
- Remove OTUs that are not assigned to anything at Phylum rank. The
subset_taxa
function can be used to remove any taxa you want, i.e. if you have plant DNA in your sample, you could use this to remove chloroplast sequences as well.
- Remove samples with total read counts less than 20. This cutoff is arbitrary and depends a bit on your data. To choose a good value, explore the read counts you have per sample and define a cutoff based on that. In this example, we mainly want to remove the two low read samples we have seen in our plot.
- Remove low count OTUs: The threshold is up to you; removing singletons or doubletons is common, but you can be more conservative and remove any counts less than 5 or 10, … . To decide on a cutoff you can look at the plots we generated before to get a feeling for how many OTUs/reads will be removed.
In our example, we want to remove samples with 20 or less reads, remove singletons only and remove OTUs that occur in less than 10% of our samples (v1). Since there are many different thoughts about OTU table filtering, you can also find two other options (v2 and v3) on how to filter a table and feel free to use your own judgement on how to best filter the data.
For most analyses we will work with the filtered data but there are some diversity metrics which rely on the presence of singletons within a sample (richness estimates, i.e. Chao), so you might choose to leave them in for those sorts of analyses only.
#define cutoffs
<- 20
counts_per_sample <- 1
otu_nr_cutoff <- 10
min_percentage_samples
#remove samples with less than 20 reads
#if we view the physeq_filt, we see that 2 samples get removed because the have a low read count
<- prune_samples(sample_sums(physeq)>= counts_per_sample, physeq)
physeq_pruned
#remove taxa without tax assignment at Phylum rank
#if we view the physeq_filt, we see that 9 taxa get removed because they are not assigned at Phylum rank
<- subset_taxa(physeq_pruned, Phylum != "NA")
physeq_filt
#calculate the minimum number of samples an otu should be present in
#in our case, an OTU should be at least be in 3 samples to be considered (notice, that this will remove singletons and doubletons)
<- ceiling((min_percentage_samples / 100) * nsamples(physeq_filt))
min_samples
#remove otus that occur only rarely (v1)
#here, we calculate the total abundance of each otu across all samples and checks if it's greater than the specified otu_nr_cutoff AND we check if the otu occurs in at least <min_percentage_samples>% of samples
#we only retain OTUs that satisfy this condition
<- prune_taxa(taxa_sums(physeq_filt) > otu_nr_cutoff & taxa_sums(physeq_filt) >= min_samples, physeq_filt)
physeq_filt physeq_filt
phyloseq-class experiment-level object
otu_table() OTU Table: [ 1016 taxa and 26 samples ]
sample_data() Sample Data: [ 26 samples by 4 sample variables ]
tax_table() Taxonomy Table: [ 1016 taxa by 6 taxonomic ranks ]
Here, you find some alternative ideas on how to filter your data:
#remove otus that occur only rarely (v2)
#here, we count the number of samples where the abundance of an otu is > 0.
#If this count is greater than the specified otu_nr_cutoff, the taxon is retained.
#physeq_filt <- filter_taxa(physeq, function (x) {sum(x > 0) > otu_nr_cutoff}, prune=TRUE)
#physeq_filt
#remove otus that occur only rarely (v3)
#here, we remove taxa not seen more than 1 times in at least 10% of the samples
#physeq_filt = filter_taxa(physeq, function(x) sum(x > 1) > (0.1*length(x)), TRUE)
#physeq_filt
Next, we can calculate the summary statistics with the custom summarize_table
function we have defined before:
#calculate the number of reads found per otu
<- taxa_sums(physeq_filt)
reads_per_OTU_filt
#summarize the data
summarize_table(reads_per_OTU_filt)
Total number of reads: 190,732
Number of OTUs 1,016
Number of singleton OTUs: 0
Number of doubleton OTUs: 0
Number of OTUs with less than 10 seqs: 358
Total reads for OTUs with less than 10 seqs: 1,814
Percentage of reads for OTUs with less than 10 seqs: 0.95%
Normalize data
Below, we generate three new phyloseq objects using three different normalization approaches:
1. Compositional: transforms the data into relative abundance
2. CLR: stands for centered log-ratio transform and allows us to compare proportions of OTUs within each sample. After this transformation the values will no longer be counts, but rather the dominance (or lack thereof) for each taxa relative to the geometric mean of all taxa on the logarithmic scale
3. Rarefaction: scales all of your reads down to the lowest total sequencing depth. Notice, that this might drop many OTUs in higher read samples and might lead to under-estimation of low-abundant OTUs. Here, we use the non-filtered data as rarefaction expects singletons, and for that purpose they should be retained, unless you have reason to think they are wrong (and which of them are wrong). Depending on whether you want to filter taxa (such as chloroplast), you might want to filter those before doing rarefaction.
Generating different phyloseq objects for different normalization approaches allows us to easily compare analysis steps with different inputs.
Important Note
If you want to familiarize yourself more about why and how to normalize data, feel free to check out the following resources that give some more insights into the topic. As you will see, there are many different ways to normalize OTU tables, and there is a lot of debate about data treatment. It is crucial to test and compare different approaches with your data.
Also keep in mind that this tutorial provides an introduction to some common normalization methods, but it does not cover all possibilities. You might need to explore different methods depending on the specifics of your dataset and research goals.
- Microbiome Datasets Are Compositional: And This Is Not Optional
- Understanding sequencing data as compositions: an outlook and review
- Waste Not, Want Not: Why Rarefying Microbiome Data is Inadmissible
- To rarefy or not to rarefy: robustness and efficiency trade-offs of rarefying microbiome data
- Normalization and microbial differential abundance strategies depend upon data characteristics
<- microbiome::transform(physeq_filt, "compositional")
physeq_rel_abundance <- microbiome::transform(physeq_filt, "clr")
physeq_clr
#for physeq we run this one the pruned samples, in which we removed low-count samples
<- rarefy_even_depth(physeq_pruned) physeq_rarified
Notice:
If you compare the physeq_filt
with the newly generated tables, we see that the first two still contain 1016 taxa found across 26 samples. For the rarefied data, we work with ~800 taxa across 26 samples and since we lost some low abundant OTUs we work with less OTUs. We can quickly summarize this as well:
#calculate the number of reads found per otu
<- taxa_sums(physeq_rarified)
reads_per_OTU_filt
#summarize the data
summarize_table(reads_per_OTU_filt)
Total number of reads: 28,600
Number of OTUs 913
Number of singleton OTUs: 215
Number of doubleton OTUs: 121
Number of OTUs with less than 10 seqs: 602
Total reads for OTUs with less than 10 seqs: 1,788
Percentage of reads for OTUs with less than 10 seqs: 6.25%
If we plot the sums of each sample, we see that the rarefied data normalized to counts of approximately 900 while during the relative abundance normalization the data was “scaled” to 0-1 (or 0-100%). The CLR-transformed data looks quite different because, after CLR transformation, we no longer work with raw counts. Instead, we focus on the dominance of taxa relative to the geometric mean of all taxa in a sample. This results in values like -2.1, 0.4, and 6.2, which makes plotting sample sums less meaningful. CLR is useful for comparing the relative abundance of taxa within each sample but not for direct comparisons of sample sums.
<- plot_faceted_phyloseq(
faceted_plot Raw = physeq_pruned,
Rarified = physeq_rarified,
`Relative Abundance` = physeq_rel_abundance,
CLR = physeq_clr
)
faceted_plot
Visualize the data
Generate rarefaction curves
Rarefaction curves illustrate how well your sample was sampled. The rarefaction function takes a random subsample of each column in your OTU table of a given size, starting with a very small subsample, and counts how many unique OTUs were observed in that subsample. The analysis is repeated with increasing subsample sizes until the maximum actual read depth for your table is reached.
In an ideal dataset, each curve should reach a plateau, i.e. horizontal asymptote, ideally around the same depth. While this almost never is the case, exploring these graphs gives us an idea how well each sample was sampled.
In our dataset below, we will use the unfiltered data, since rarefaction is highly dependent on the number of singletons in a sample – the rarer the sequence, the more likely it is that it will be observed only once in a sample.
#create a df from the pruned otu table
<- as.data.frame(otu_table(physeq_pruned))
df
#rarefy data with a step size of 50, using tidy = TRUE will return a dataframe instead of a plot
#the step size in the rarefaction function can affect the smoothness of your curves. A smaller step size provides more detailed curves but increases computation time.
<- rarecurve(t(df), step=50, cex=0.5, label = FALSE, tidy = TRUE)
df_rare
#add metadata
<- merge(df_rare, metadata_combined, by.x = "Site", by.y = "sample_id")
df_rare
#transform df and plot
<-
plot_rare |>
df_rare group_by(Site) |>
#find the max value per site
mutate(max_sample = max(Sample)) |>
#for each site add a label for the max value only (we do this to create a single label to add to the plot)
mutate(label = if_else(Sample == max_sample, as.character(Site), NA_character_)) |>
#plot
ggplot(aes(x = Sample, y = Species, color = treatment, group = interaction(Site, treatment))) +
geom_line() +
facet_grid(cols = vars(treatment)) +
geom_text(aes(label = label),
position = position_nudge(x = 2000),
na.rm = TRUE, size = 3) +
custom_theme()
plot_rare
In this example we can see that almost none of the samples reach a plateau, RH is the closest but not there yet. This suggests that we mainly sampled the abundant members of our community and might miss many rare taxa. This is something to keep in mind for other analyses, such as alpha diversity analyses.
Hierarchical clustering
Hierarchical clustering allows to examine how samples cluster on some measure of taxonomic (dis)similarity. As always, there are many ways to do this and below we see one way by clustering samples based on their Bray-Curtis dissimilarity. Very simplyy, if a two samples share fewer taxa then we will the dissimilarity will increase. The Bray-Curtis dissimilarity is zero for samples that have the exact same composition and one for those sharing no taxa.
<- phyloseq::distance(physeq_rel_abundance, method="bray", type="samples")
bray <- as.dendrogram(hclust(bray, method = "ward.D2"))
ward
#Provide color codes
<- data.frame(phyloseq::sample_data(physeq_rel_abundance))
meta <- c(wood = "brown", paper = "blue", mix = "black")
colorCode labels_colors(ward) <- colorCode[meta$treatment][order.dendrogram(ward)]
plot(ward)
In this example, we do not see samples from the same treatment clustering together.
Explore alpha diversity
Alpha diversity measures the diversity within our sample and we distinguish between species richness (i.e. the number of species) and species richness (i.e. how relatively abundant each of the species are).
Something to keep in mind: Generally, richness-based alpha diversity metrics require singletons to estimate OTUs/ASVs observed zero times in the data. These methods try to to estimate how many rare things are around that you didn’t see, and the rare things you did see (that inform you about the unseen others) will be seen 1 or 2 times. Different methods (e.g. rarefaction or Chao’s S1) all have that basic dependence.
However, working with sequencing data we have one caveat: Some sequences are being missclassified as new OTUs/ASVs that simply don’t really exist. More specifically, errors/chimeras/artefacts/contaminants get interpreted as real OTUs (because they are different enough) and those show up largely in the singleton/doubletons and can almost entirely drive richness estimates to nonsensical values. Therefore, interpret your results carefully when using physeq_rarified
#calculate different alpha diversity indicators
<-microbiome::alpha(physeq_rarified, index = "all")
richness_meta
#add the sample id to table
$sample_id <- rownames(richness_meta)
richness_meta
#add other metadata data
<- merge(richness_meta, metadata_combined, by = "sample_id")
richness_meta
#check what parameters are calculated
colnames(richness_meta)
[1] "sample_id" "observed"
[3] "chao1" "diversity_inverse_simpson"
[5] "diversity_gini_simpson" "diversity_shannon"
[7] "diversity_fisher" "diversity_coverage"
[9] "evenness_camargo" "evenness_pielou"
[11] "evenness_simpson" "evenness_evar"
[13] "evenness_bulla" "dominance_dbp"
[15] "dominance_dmn" "dominance_absolute"
[17] "dominance_relative" "dominance_simpson"
[19] "dominance_core_abundance" "dominance_gini"
[21] "rarity_log_modulo_skewness" "rarity_low_abundance"
[23] "rarity_rare_abundance" "treatment"
[25] "Date" "name"
Next, we can generate a plot. Based on our discussion on singletons keep in mind that if you used physeq_filt
to calculate diversity estimates after removing singletons, then you should focus on evenness indices, such as the Shannon index.
#Plot adiv measures
<-
alpha_plot %>%
richness_meta gather(key = metric, value = value, c("observed", "evenness_simpson", "diversity_shannon")) %>%
mutate(metric = factor(metric, levels = c("observed", "evenness_simpson", "diversity_shannon"))) %>%
ggplot(aes(x = treatment, y = value)) +
geom_boxplot(outlier.color = NA) +
geom_jitter(aes(color = Date), height = 0, width = .2) +
labs(x = "", y = "") +
facet_wrap(~ metric, scales = "free") +
#geom_text(aes(label = sample_id), vjust = -1, hjust = 1, size = 3, color = "black") +
theme(legend.position="none")
alpha_plot
#save the figure to our computer
#ggsave(paste0("results/plots/alpha-div.png"), plot=alpha_plot, device="png")
We see that there is no difference when comparing our different samples but that we have some samples with low indices. When we remove the #
before the code chunk starting with geom_text
, we can see that samples with a low index are often the samples with a low read count such as BB in the Shannon plot if we check these via sample_counts[sample_counts$sampleID == "BB", ]
.
Explore beta diversity
In contrast to alpha diversity, beta diversity quantifies the dissimilarity between communities (multiple samples).
Commonly used beta-diversity measures include the:
- Bray-Curtis index (for compositional/abundance data)
- Jaccard index (for presence/absence data)
- Aitchison distance (use Euclidean distance for clr transformed abundances, aiming to avoid the compositionality bias)
- Unifrac distance (that takes into account the phylogenetic tree information)
- …
In general it is best to avoid methods that use Euclidean distance as microbiome data are compositional and represent sparse datasets that are better suited for the above mentioned methods. Aitchison is an exception as it calculates an Euclidean distance only after clr transformation (after which our data is suited for an Euclidean space).
The methods above generate distance matrices that can be used for ordination, which we use to reduce the dimensionality of our data for a more efficient analysis and visualization. Based on the type of algorithm, ordination methods can be divided in two categories:
- unsupervised, which measure variation in the data without additional information on co-variates or other supervision of the model, including:
- Principal Coordinate Analysis (PCoA)
- Principal Component Analysis (PCA)
- Uniform Manifold Approximation and Projection for Dimension Reduction (UMAP)
- Non-metric Multidimensional Scaling (NMDS)
- Principal Coordinate Analysis (PCoA)
- supervised ordination:
- distance-based Redundancy Analysis (dbRDA)
Bray-Curtis
For Bray-Curtis our data should not have negative entries, so we will use the relative abundance physeq object (alternatively, you could also look at the rarefied) data:
#calculate PCOA using Phyloseq package
<- ordinate(physeq_rel_abundance, "PCoA", "bray")
pcoa_bc <- pcoa_bc$values$Eigenvalues
evals
<-
plot_beta plot_ordination(physeq_rel_abundance, pcoa_bc, color = "treatment") +
geom_point(size = 3) +
labs(x = sprintf("PCoA1 [%s%%]", round(evals/sum(evals)*100,1)[1]),
y = sprintf("PCoA2 [%s%%]", round(evals/sum(evals)*100, 2)[2])) +
#add a 95% confidence level for a multivariate t-distribution
stat_ellipse(aes(group = treatment), linetype = 2) +
custom_theme()
plot_beta
This two-dimensions PCOA plot shows 36% of the total variance between the samples. We can also see that our samples are not forming distinct clusters, i.e. microbiomes from the mixed, paper and wood communities appear very similar.
We can also perform our statistics to confirm this visual finding:
#Generate distance matrix
<- phyloseq::distance(physeq_rel_abundance, method = "bray")
dist_matrix
#ADONIS test
adonis2(dist_matrix ~ phyloseq::sample_data(physeq_rel_abundance)$treatment, permutations=9999, method="bray")
Df | SumOfSqs | R2 | F | Pr(>F) | |
---|---|---|---|---|---|
phyloseq::sample_data(physeq_rel_abundance)$treatment | 2 | 0.5411427 | 0.0969948 | 1.235253 | 0.1875 |
Residual | 23 | 5.0379471 | 0.9030052 | NA | NA |
Total | 25 | 5.5790898 | 1.0000000 | NA | NA |
We see that the Pr is greater than 0.05, confirming that there do not seem to be any statistically significant differences between our samples.
NMDS
#NMDS via phyloseq
<- phyloseq::ordinate(physeq_rel_abundance, method="NMDS", distance = "bray") ord_nmds
#view stress value
#a value above 0.2 is deemed suspect and a stress value approaching 0.3 indicates that the ordination is arbitrary.
$stress ord_nmds
[1] 0.160681
<-
plot_nmds plot_ordination(physeq_rel_abundance, ord_nmds, color = "treatment") +
geom_point(size = 3) +
stat_ellipse(aes(group = treatment), linetype = 2) +
custom_theme()
plot_nmds
Aitchison
Next, we generate a beta-diversity ordination using the Aitchison distance as an alternative way to look at beta diversity (and well suited for investigating compositional data). We do this by applying PCA to our centered log-ratio (clr) transformed counts.
#PCA via phyloseq
#RDA without constraints is a PCA
<- phyloseq::distance(physeq_clr, method = "euclidean")
distance_matrix <- phyloseq::ordinate(physeq_clr, method = "RDA", distance = "euclidian")
ord_clr
#Plot scree plot to plot eigenvalues, i.e.the total amount of variance that can be explained by a given principal component
::plot_scree(ord_clr) +
phyloseqgeom_bar(stat="identity") +
custom_theme() +
labs(x = "", y = "Proportion of Variance\n")
#scale axes based on the eigenvalues
<- ord_clr$CA$eig[1] / sum(ord_clr$CA$eig)
clr1 <- ord_clr$CA$eig[2] / sum(ord_clr$CA$eig)
clr2
#and plot
::plot_ordination(physeq_filt, ord_clr, color = "treatment") +
phyloseqgeom_point(size = 2) +
coord_fixed(clr2 / clr1) +
stat_ellipse(aes(group = treatment), linetype = 2) +
custom_theme()
While PCA is an exploratory data visualization tool, we can test whether the samples cluster beyond that expected by sampling variability using permutational multivariate analysis of variance (PERMANOVA) with the adonis package.
#Generate distance matrix
<- phyloseq::distance(physeq_clr, method = "euclidean")
clr_dist_matrix
#ADONIS test
adonis2(clr_dist_matrix ~ phyloseq::sample_data(physeq_clr)$treatment, permutations=9999, method="bray")
Df | SumOfSqs | R2 | F | Pr(>F) | |
---|---|---|---|---|---|
phyloseq::sample_data(physeq_clr)$treatment | 2 | 4294.13 | 0.0913237 | 1.155772 | 0.2003 |
Residual | 23 | 42726.85 | 0.9086763 | NA | NA |
Total | 25 | 47020.98 | 1.0000000 | NA | NA |
Visualize the taxonomic distribution
Next, we want to plot the taxa distribution. Let us first look at the most abundant phyla and check how similar our different samples are:
#condense data at specific tax rank, i.e. on phylum level
<- tax_glom(physeq_rel_abundance, "Phylum", NArm=FALSE)
grouped_taxa
#find top19 most abundant taxa
<- names(sort(taxa_sums(grouped_taxa), TRUE)[1:19])
top_taxa
#make a list for the remaining taxa
<- names(taxa_sums(grouped_taxa))[which(!names(taxa_sums(grouped_taxa)) %in% top_taxa)]
other_taxa
#group the low abundant taxa into one group
= merge_taxa(grouped_taxa, other_taxa, 2)
merged_physeq
#transform phyloseq object into dataframe
<- psmelt(merged_physeq)
df
#cleanup the names in the df
names(df)[names(df) == "Phylum"] <- "tax_level"
#replace NAs, with other (NAs are generated for the other category)
$tax_level[which(is.na(df$tax_level))] <- "Other"
df
#create a df to sort taxa labels by abundance
<- df |>
sorted_labels group_by(tax_level) |>
summarise(sum = sum(Abundance)) |>
arrange(desc(sum))
#Get list of sorted levels excluding "Other"
<- setdiff(sorted_labels$tax_level, "Other")
desired_levels
#sort df using the sorted levels and ensure that "Other" is the last category
$tax_level2 <- factor(df$tax_level, levels = c(desired_levels, "Other"))
df
#generate color scheme
<- c25[1:length(unique(df$tax_level2))]
cols levels(df$tax_level2) == "Other"] <- "#CCCCCC"
cols[
#plot
<-
fig ggplot(df, aes(x = Sample, y = Abundance, fill = tax_level2) ) +
geom_bar(position = "stack", stat = "identity", width = 0.9) +
labs(y = "Relative abundance", x = "", title = paste0("Relative abundance at ", "Phylum", " rank")) +
scale_fill_manual(name = paste0("Phylum","_rank"), labels = levels(df$tax_level2), values = cols) +
facet_grid(cols = vars(treatment), scales = "free", space = "free") +
#scale_y_continuous(expand = c(0, 0), limits = c(0, 1.01)) +
custom_theme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1 ) ) +
guides(fill=guide_legend(title = "Phylum"))
fig
Since we want to generate one plot for each taxonomic rank, i.e. Phylum, Class, Order,…, we can do this in a loop. The figures will be generated in the folder results/plots/
.
#if not there already, create output folder
="results/plots/"
img_pathdir.create(img_path, recursive = TRUE)
#generate one barplot for each taxonomic level
for (level in colnames(taxonomy_file)){
#condense data at specific tax rank, i.e. on phylum level
<- tax_glom(physeq_rel_abundance, level)
grouped_taxa
#find top19 most abundant taxa
<- names(sort(taxa_sums(grouped_taxa), TRUE)[1:19])
top_taxa
#make a list for the remaining taxa
<- names(taxa_sums(grouped_taxa))[which(!names(taxa_sums(grouped_taxa)) %in% top_taxa)]
other_taxa
#group the low abundant taxa into one group
= merge_taxa(grouped_taxa, other_taxa, 2)
merged_physeq
#transform phyloseq object into dataframe
<- psmelt(merged_physeq)
df
#cleanup the dataframe
names(df)[names(df) == level] <- "tax_level"
$tax_level[which(is.na(df$tax_level))] <- "Other"
df
#create a df to sort taxa labels by abundance
<- df |>
sorted_labels group_by(tax_level) |>
summarise(sum = sum(Abundance)) |>
arrange(desc(sum))
#Get list of sorted levels excluding "Other"
<- setdiff(sorted_labels$tax_level, "Other")
desired_levels
#sort df using the sorted levels and ensure that "Other" is the last category
$tax_level2 <- factor(df$tax_level, levels = c(desired_levels, "Other"))
df
#generate color scheme
<- c25[1:length(unique(df$tax_level2))]
cols levels(df$tax_level2) == "Other"] <- "#CCCCCC"
cols[
#plot
<-
fig ggplot(df, aes(x = Sample, y = Abundance, fill = tax_level2) ) +
geom_bar(position = "stack", stat = "identity", width = 0.9) +
labs(y = "Relative abundance", x = "", title = paste0("Relative abundance at ", level, " rank")) +
scale_fill_manual(name = paste0(level,"_rank"), labels = levels(df$tax_level2), values = cols) +
facet_grid(cols = vars(treatment), scales = "free", space = "free") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 1.01)) +
custom_theme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1 ) ) +
guides(fill=guide_legend(title=level))
ggsave(paste0(img_path, level, "_barplot_ra.png"), plot = fig, device="png")
}
Generated plots:
Test for differental abundances
Notice: This is still in development and needs to be optimized, use with care!
Next, we want to test, whether any OTUs differ among our different conditions, i.e. sampling dates or treatment.
Notice, that there is no gold standard how to do these analyses and there are many challenges analysing this kind of data leading to multiple approaches to solve these challenges. The issues we face are:
- Compositionality of the data and that abundances of the taxa are not independent of each other
- There are many zeros in our data, i.e. many taxa are absent in the majority of samples
- Skewed distributions since most taxa are rare and only a few are abundant
Two-factor testing
First, let us compare for differences for a category where we compare two factors. For example, we might want to ask whether there are any significant differences when comparing our samples isolated at different dates.
Let’s first test this using the non-parametric Wilcoxon rank-sum test.
#Generate data.frame with OTUs and metadata
<- as.data.frame(t(as.data.frame(otu_table(physeq_clr))))
ps_wilcox $Date <- sample_data(physeq_clr)$Date
ps_wilcox
#Define functions to pass to map
<- function(df){
wilcox_model wilcox.test(abund ~ Date, data = df)
}
<- function(df){
wilcox_pval wilcox.test(abund ~ Date, data = df)$p.value
}
#Create nested data frames by OTU and loop over each using map
<- ps_wilcox |>
wilcox_results gather(key = OTU, value = abund, -Date) |>
group_by(OTU) |>
nest() |>
mutate(wilcox_test = map(data, wilcox_model),
p_value = map(data, wilcox_pval))
#Show results
$wilcox_test[[1]] wilcox_results
Wilcoxon rank sum exact test
data: abund by Date
W = 124, p-value = 0.04412
alternative hypothesis: true location shift is not equal to 0
Ok, the p-value is below 0.05, so there seems to be something going on, let’s zoom in and look for differences per OTU level (genus rank in our case). To do this we first will correct for multiple comparison testing via false discovery rate (FDR). We also will filter the table to only include values with an FDR < 0.001.
#unnest df
<- wilcox_results |>
wilcox_results ::select(OTU, p_value) |>
dplyrunnest(cols = c(p_value))
#Computing FDR corrected p-values (since we do multiple statistical comparisons)
<- wilcox_results |>
wilcox_results arrange(p_value) |>
mutate(BH_FDR = p.adjust(p_value, "fdr")) |>
filter(BH_FDR < 0.001) |>
::select(OTU, p_value, BH_FDR, everything())
dplyr
head(wilcox_results)
OTU | p_value | BH_FDR |
---|---|---|
Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Ika33;uncultured bacterium | 0.0000523 | 0.0000523 |
Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;NA;uncultured bacterium | 0.0001304 | 0.0001304 |
Bacteria;Bacteroidetes;Chlorobia;Chlorobiales;Chlorobiaceae;Prosthecochloris | 0.0003840 | 0.0003840 |
Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiaceae 1;Clostridium sensu stricto 1 | 0.0004930 | 0.0004930 |
Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Marinilabiliaceae;Marinilabilia | 0.0006280 | 0.0006280 |
Bacteria;Proteobacteria;Deltaproteobacteria;Bdellovibrionales;Bacteriovoracaceae;uncultured | 0.0006280 | 0.0006280 |
There, seem to be a few OTUs that are distinct between sampling dates, lets explore them further by generating some plots:
#extract oTUs of interest from the phyloseq tax and otu tables
<- cbind(as.data.frame(as(tax_table(physeq_clr)[as.character(wilcox_results$OTU), ], "matrix")),
df as.data.frame(as(otu_table(physeq_clr)[as.character(wilcox_results$OTU), ], "matrix")))
#convert the long into a wide dataframe
<- reshape2::melt(df)
df_long
#add the metadata
<- merge(df_long, metadata_combined, by.x = "variable", by.y = "sample_id")
df_long
#plot
ggplot(df_long, aes(x=Genus, y=value, color = Date)) +
geom_boxplot(outlier.shape = NA) +
geom_point(position = position_jitterdodge(jitter.width=0.1), size = 0.9) +
custom_theme() +
facet_wrap(vars(Genus), scales = "free") +
scale_color_manual(values = c("orange", "purple")) +
xlab("") +
ylab("CLR abundance") +
theme(axis.text.x = element_blank())
We can see that all of the OTUs that showed differences between sampling dates are very low abundant OTUs.
Wilcoxon has some down-sites when it comes to treating zero values, an alternative approach is to use the ANOVA-like differential expression (ALDEx2) method. The aldex function is a wrapper that performs log-ratio transformation and statistical testing in a single line of code, which is why we feed the non-normalized data into it:
<- ALDEx2::aldex(data.frame(phyloseq::otu_table(physeq_filt)),
aldex2_da ::sample_data(physeq_filt)$Date, test="t", effect = TRUE, denom="iqlr") phyloseq
Specifically, this function:
- generates Monte Carlo samples of the Dirichlet distribution for each sample,
- converts each instance using a log-ratio transform,
- returns test results for two sample (Welch’s t, Wilcoxon) or multi-sample (glm, Kruskal-Wallace) tests.
Bu using iqlr
the function accounts for data with systematic variation and centers the features on the set features that have variance that is between the lower and upper quartile of variance. This provides results that are more robust to asymmetric features between groups.
Next, we can plot the effect size. This plot shows the median log2 fold difference by the median log2 dispersion (a measure of the effect size by the variability). Differentially abundant taxa will be those where the difference exceeds the dispersion. Points toward the top of the figure are more abundant in our samples from sampling date 2 while those towards the bottom are more abundant in sampling date 1. Taxa with BH-FDR corrected p-values would be shown in red.
#plot effect sizes
::aldex.plot(aldex2_da, type="MW", test="wilcox", called.cex = 1, cutoff.pval = 0.001) ALDEx2
Finally, we can print the output with the taxa information added.
#Clean up presentation
<- aldex2_da |>
sig_aldex2 rownames_to_column(var = "OTU") |>
filter(wi.eBH < 0.001) |>
arrange(effect, wi.eBH) |>
::select(OTU, diff.btw, diff.win, effect, wi.ep, wi.eBH)
dplyr
head(sig_aldex2)
OTU | diff.btw | diff.win | effect | wi.ep | wi.eBH |
---|
In our case an empty dataframe is returned, no significant differences were detected when using this statistical approach.
Mutli-factor testing using phyloseq’s mt function
Next, we want to test, if there are any OTUs that differ between our three distinct growth conditions, i.e. treatments. One way to do this is to use phyloseqs mt
function.
The default test applied to each taxa is a two-sample two-sided t.test. In our case this would fail with an error since we want to look at a data variable that contains three classes. To run this, we add test = "f"
to run a F-test. Additionally, we use FDR as multiple-hypthesis correction. B specifies the number of permutations.
#F-test, by specifying test="f"
<- mt(physeq_clr, "treatment", method = "fdr", test = "f", B = 300) res
B=300
b=3 b=6 b=9 b=12 b=15 b=18 b=21 b=24 b=27 b=30
b=33 b=36 b=39 b=42 b=45 b=48 b=51 b=54 b=57 b=60
b=63 b=66 b=69 b=72 b=75 b=78 b=81 b=84 b=87 b=90
b=93 b=96 b=99 b=102 b=105 b=108 b=111 b=114 b=117 b=120
b=123 b=126 b=129 b=132 b=135 b=138 b=141 b=144 b=147 b=150
b=153 b=156 b=159 b=162 b=165 b=168 b=171 b=174 b=177 b=180
b=183 b=186 b=189 b=192 b=195 b=198 b=201 b=204 b=207 b=210
b=213 b=216 b=219 b=222 b=225 b=228 b=231 b=234 b=237 b=240
b=243 b=246 b=249 b=252 b=255 b=258 b=261 b=264 b=267 b=270
b=273 b=276 b=279 b=282 b=285 b=288 b=291 b=294 b=297 b=300
r=10 r=20 r=30 r=40 r=50 r=60 r=70 r=80 r=90 r=100
r=110 r=120 r=130 r=140 r=150 r=160 r=170 r=180 r=190 r=200
r=210 r=220 r=230 r=240 r=250 r=260 r=270 r=280 r=290 r=300
r=310 r=320 r=330 r=340 r=350 r=360 r=370 r=380 r=390 r=400
r=410 r=420 r=430 r=440 r=450 r=460 r=470 r=480 r=490 r=500
r=510 r=520 r=530 r=540 r=550 r=560 r=570 r=580 r=590 r=600
r=610 r=620 r=630 r=640 r=650 r=660 r=670 r=680 r=690 r=700
r=710 r=720 r=730 r=740 r=750 r=760 r=770 r=780 r=790 r=800
r=810 r=820 r=830 r=840 r=850 r=860 r=870 r=880 r=890 r=900
r=910 r=920 r=930 r=940 r=950 r=960 r=970 r=980 r=990 r=1000
r=1010
#identify significant otus
<- res |>
res_sig filter(adjp < 0.001)
head(res_sig)
index | teststat | rawp | adjp | plower | Kingdom | Phylum | Class | Order | Family | Genus | fdr |
---|
We get an empty dataframe, which means that no significant differences were detected.
Mutli-factor testing using DeSeq2
DESeq2 performs a differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution. DeSeq normalizes the data throughout its analysis, so we input only the filtered data.
#convert treatment column to a factor
sample_data(physeq_filt)$treatment <- as.factor(sample_data(physeq_filt)$treatment)
#Convert the phyloseq object to a DESeqDataSet object:
<- phyloseq_to_deseq2(physeq_filt, ~ treatment) ds
converting counts to integer mode
#run DESeq
= DESeq(ds) ds
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 82 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
# pulling out our results table, we specify the object, the p-value we are going to use to filter our results, and what contrast we want to consider by first naming the column, then the two groups we care about
<- results(ds, alpha=0.01, contrast=c("treatment", "paper", "wood"))
ds_paper_vs_wood
#extract only the ones that pass
<- ds_paper_vs_wood[which(ds_paper_vs_wood$padj < 0.01), ]
ds_paper_vs_wood_sig
#add the taxonomy levels
<- cbind(as(ds_paper_vs_wood_sig, "data.frame"),
ds_paper_vs_wood_sig_with_tax as(tax_table(physeq_filt)[row.names(ds_paper_vs_wood_sig), ], "matrix"))
Plot significant OTUs (log2fold change):
#plot
ggplot(ds_paper_vs_wood_sig_with_tax, aes(x=Genus, y=log2FoldChange, color=Phylum)) +
geom_jitter(size=3, width = 0.2) +
custom_theme() +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
Plot significant OTUs (relative abundance):
#extract relevant data for significant otus
<- cbind(as.data.frame(as(tax_table(physeq_clr)[rownames(ds_paper_vs_wood_sig_with_tax), ], "matrix")),as.data.frame(as(otu_table(physeq_clr)[rownames(ds_paper_vs_wood_sig_with_tax), ], "matrix")))
df
#convert to long df
<- reshape2::melt(df)
df_long
#add metadata
<- merge(df_long, metadata_combined, by.x = "variable", by.y = "sample_id")
df_long
#plot
ggplot(df_long, aes(x=Genus, y=value, color = treatment)) +
geom_boxplot(outlier.shape = NA) +
geom_point(position = position_jitterdodge(jitter.width=0.1), size = 0.9) +
custom_theme() +
facet_wrap(vars(Genus), scales = "free") +
scale_color_manual(values = c("grey", "orange", "purple")) +
xlab("") +
ylab("CLR abundance") +
theme(axis.text.x = element_blank())
Notice, how in our plots there seems to be no major difference between the wood and paper samples for these OTUs. Considering that both OTUs have marked outliers, it is likely that our statistics are driven a lot by these outliers and we should interpret our results with care.
Also, remember that these two genera are also not returned with the other statistical test we did. If we stay conservative we would say that we see no differences between the different sample types.