Get Started with Phyloseq

We are using the Bioconductor package phyloseq for most parts of the data analysis. It is a convenient package to hand and analyse high-throughput microbiome census data. It comes with a detail manual, a compact vignette, and even an interactive web application. Besides the clear advantages of storing your sequencing experiment as a phyloseq class object, it might be dismissed as being difficult to get used to. Believe us, it is worth the effort. Following a few basic steps for you to get familiar with phyloseq.

Clean/Reset Environment

Make it a habit and start a new R session with the broom.

rm(list = ls())

Bioconductor Packages

library("phyloseq"); packageVersion("phyloseq")
library("microbiome"); packageVersion("microbiome")
library("descr"); packageVersion("descr")

Get Data

## Where am I?
getwd()

## My document path
# setwd("my/folder/") # adjust accordingly

## Working directory for MDA workshop
# dir.create("MDA"); setwd("MDA")
# Careful: you might have already created one in the previous section!

## Create Working Directory
dir.create("Dummy"); setwd("Dummy")

## Download data (R data image file)
dummy.url <- "https://www.gdc-docs.ethz.ch/MDA/data/DummyData.zip"
utils::download.file(dummy.url, destfile = "DummyData.zip")
unzip("DummyData.zip")

## View Content
list.files()
# [1] "all.rda"                     "Dummy_MapFile.tsv"           "Dummy_MapFile.txt"           "Dummy_OTU_ref.fa"           
# [5] "Dummy_OTU_rooted_tree.nwk"   "Dummy_OTU_table.tsv"         "Dummy_OTU_table.txt"         "Dummy_OTU_taxa.tsv"         
# [9] "Dummy_OTU_tree.tre"          "Dummy_OTU_unrooted_tree.nwk" "DummyData.zip"               "test.rda"                   

## View File Content
descr::file.head("Dummy_MapFile.tsv", n = 14)
descr::file.head("Dummy_OTU_table.txt", n = 3)
descr::file.head("Dummy_OTU_rooted_tree.nwk", n = 1)

Import Help

?import
?read_tree

Import from tab separated files

map.A <- read.table("Dummy_MapFile.tsv")
otu.A <- as.matrix(read.table("Dummy_OTU_table.tsv"))
tax.A <- as.matrix(read.table("Dummy_OTU_taxa.tsv"))
tre.A <- read_tree("Dummy_OTU_unrooted_tree.nwk")
#tre.A <- read_tree("Dummy_OTU_rooted_tree.nwk")

dA <- phyloseq(
  otu_table(otu.A, taxa_are_rows = TRUE),
  sample_data(map.A),
  tax_table(tax.A),
  phy_tree(tre.A)

)

dA
otu_table(dA)
tax_table(dA)

Alternative import from text files

map.B <- "Dummy_MapFile.txt"
otu.B <- "Dummy_OTU_table.txt"
#tre.B <- "Dummy_OTU_unrooted_tree.nwk"
tre.B <- "Dummy_OTU_rooted_tree.nwk"
ref.B <- "Dummy_OTU_ref.fa"

dB <-
  import_qiime(
    mapfilename = map.B,
    otufilename = otu.B,
    treefilename = tre.B,
    refseqfile = ref.B,
    verbose = TRUE
  )

dB
otu_table(dB)
summary(dB)
mode(dB)
tax_table(dB)

Phyloseq object

# Summarize phyloseq object
microbiome::summarize_phyloseq(dA)
microbiome::summarize_phyloseq(dB)

refseq(dA) # missing reference sequences > empty slot
refseq(dB) # reference are present 

# Dataset dA and dB are more or less indetical. The biggest difference is that
# dataset dB contains the reference (OTU) sequences.
# I would not import the reference sequences by default. It increases the object
# size and might not be needed at the end.

Phyloseq Components - Extraction

## Extract the OTU table plus taxonomic information
otu.tax.table <- data.frame(OTU = rownames(phyloseq::otu_table(dB)@.Data),
                            phyloseq::otu_table(dB)@.Data,
                            phyloseq::tax_table(dB)@.Data,
                            check.names = FALSE
                            )
otu.tax.table

## Extract and transposed OTU table
otu.tab <- t(data.frame(phyloseq::otu_table(dB),
                        check.names = FALSE)
             )
otu.tab

## Extract meta information
meta <- data.frame(phyloseq::sample_data(dB), 
                   check.names = FALSE
                   )
meta

## Combine count table and meta information
otu.meta.tab <- as.data.frame(cbind(otu.tab, meta))
otu.meta.tab

## Import tree (Newark format) 
u.tree <- ape::read.tree("Dummy_OTU_unrooted_tree.nwk")
r.tree <- ape::read.tree("Dummy_OTU_rooted_tree.nwk")

u.tree
r.tree

par(mfrow = c(1,2))
  plot(u.tree, edge.width = 3, cex = 1,
       edge.color = "tomato", 
       label.offset = 0.01, 
       tip.color = "blue",

       main = "Unrooted Tree")
  plot(r.tree, edge.width = 3, cex = 1,
       edge.color = "tomato", 
       label.offset = 0.01, 
       tip.color = "blue",
       main = "Rooted Tree (OTU15)")
par(mfrow = c(1,1))

## Re-root tree
tree_rerooted <- root(u.tree, outgroup = "OTU15", resolve.root = TRUE)

par(mfrow = c(1,3))
  plot(u.tree, edge.width = 3, cex = 1.5,
     edge.color = "tomato", 
     label.offset = 0.01, 
     tip.color = "blue", 
     main = "Unrooted Tree")
  plot(r.tree, edge.width = 3, cex = 1.5, 
     edge.color = "tomato", 
     label.offset = 0.01, 
     tip.color = "blue",
     main = "Rooted Tree (OTU15)")
  plot(tree_rerooted, edge.width = 3, cex = 1.5,
       edge.color = "tomato", 
       label.offset = 0.01,
       tip.color = "blue",
       main = "Re-Rooted Tree (OTU15)")
par(mfrow = c(1,1))

## Circular tree (plot_tree {phyloseq})
plot_tree(dB, color = "GID", 
          label.tips = "taxa_names", 
          shape = "GID", ladderize = "left") + 
  coord_polar(theta = "y")

Phyloseq Accessors

## We simplify 
d <- dB
## Overview
ntaxa(d)         # number of OTUs
nsamples(d)      # number of samples

taxa_names(d)    # OTU labels
sample_names(d)  # sample id

taxa_sums(d)     # Count per OTU
taxa_sums(d)[c("OTU4","OTU12")]

sample_sums(d)   # Counts per sample - sequencing depth
sample_sums(d)[c("SA1","SA2")]

rank_names(d)    # Taxonomic rank names

sample_variables(d) # Variable defined in mapfile

get_taxa(d)         # Count table
get_sample(d)       # Count table
get_variable(d)     # Meta info

Write phyloseq

# write_phyloseq(d, "OTU")
# write_phyloseq(d, "TAXONOMY")
# write_phyloseq(d, "METADATA")