<!-- amplicon_workflow.html -->

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<meta name="author" content="Author: Linton Freund (hfreu002@ucr.edu)" />


<title>Amplicon Analysis Workflow</title>

<script src="site_libs/header-attrs-2.20/header-attrs.js"></script>
<script src="site_libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>
/* Heading scale, inline-code tint, and a white background for pre blocks without a class. */
h1 { font-size: 34px; }
h1.title { font-size: 38px; }
h2 { font-size: 30px; }
h3 { font-size: 24px; }
h4 { font-size: 18px; }
h5 { font-size: 16px; }
h6 { font-size: 12px; }
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
pre:not([class]) {
  background-color: white;
}
</style>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<script src="site_libs/navigation-1.1/codefolding.js"></script>
<link href="site_libs/font-awesome-5.1.0/css/all.css" rel="stylesheet" />
<link href="site_libs/font-awesome-5.1.0/css/v4-shims.css" rel="stylesheet" />

<style type="text/css">
/* Base helper classes: wrapped code, small caps, underline, two-column layout,
   hanging indents and bullet-less task lists. */
code {
  white-space: pre-wrap;
}
span.smallcaps {
  font-variant: small-caps;
}
span.underline {
  text-decoration: underline;
}
div.column {
  display: inline-block;
  vertical-align: top;
  width: 50%;
}
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}
ul.task-list {
  list-style: none;
}
</style>


<style type="text/css">
/* keep whitespace inside code exactly as written (no wrapping) */
code { white-space: pre; }
/* do not clip content that extends past a .sourceCode box */
.sourceCode { overflow: visible; }
</style>
<style type="text/css" data-origin="pandoc">
/*
 * Syntax-highlighting stylesheet for code listings.
 * The data-origin="pandoc" attribute is the hook the inline script right after
 * this block uses to find this sheet: it looks here for a div.sourceCode rule
 * that sets color or background-color and re-targets it at pre.sourceCode.
 */

/* Layout of highlighted listings: one inline-block span per source line. */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
/* on screen the wrapper scrolls; in print long lines wrap with a hanging indent */
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
/* Line numbers for pre.numberSource, generated with the source-line CSS counter. */
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
/* the number is drawn as ::before content of each line's first anchor and is not selectable */
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
/* empty rule: it sets neither color nor background-color, so the script
   following this block skips it */
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
/* Token colours, one rule per highlighting class. */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */

/* Overrides for code chunks: full-width rows, horizontal scrolling, and a
   negative right margin on the code-folding button. */
.sourceCode .row {
  width: 100%;
}
.sourceCode {
  overflow-x: auto;
}
.code-folding-btn {
  margin-right: -30px;
}
</style>
<script>
// Move pandoc's coloured div.sourceCode styling onto pre.sourceCode.
(function() {
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    // only the stylesheet tagged data-origin="pandoc" is rewritten
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;
    var ruleList;
    // reading cssRules may throw; such sheets are skipped
    try { ruleList = sheet.cssRules; } catch (err) { continue; }
    for (var r = 0; r < ruleList.length; r++) {
      var current = ruleList[r];
      // look for a plain style rule whose selector is exactly div.sourceCode
      var isDivRule = current.type === current.STYLE_RULE &&
                      current.selectorText === "div.sourceCode";
      if (!isDivRule) continue;
      // leave the rule alone unless it sets color or background-color
      var setsColour = current.style.color !== '' ||
                       current.style.backgroundColor !== '';
      if (!setsColour) continue;
      // swap the rule in place for an equivalent pre.sourceCode rule
      var declarations = current.style.cssText;
      sheet.deleteRule(r);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', r);
    }
  }
})();
</script>


<style type="text/css">
/* Bibliography layout (for pandoc --citeproc since 2.11). */
div.csl-bib-body { }
div.csl-entry { clear: both; }
/* hanging-indent variant: first line flush left, the rest indented */
.hanging div.csl-entry {
  margin-left: 2em;
  text-indent: -2em;
}
/* floated left column with the entry text inline to its right */
div.csl-left-margin {
  min-width: 2em;
  float: left;
}
div.csl-right-inline {
  margin-left: 2em;
  padding-left: 1em;
}
div.csl-indent { margin-left: 2em; }
</style>


<style type="text/css">
/* Page-level layout: centred main column, fluid images, widget spacing. */
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
img {
  max-width:100%;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
/* the default focus outline is removed from the code-folding button ... */
button.code-folding-btn:focus {
  outline: none;
}
/* ... but keyboard users still need a visible focus indicator, so restore one
   for :focus-visible (browsers without :focus-visible ignore this rule) */
button.code-folding-btn:focus-visible {
  outline: 2px solid currentColor;
  outline-offset: 2px;
}
/* details/summary: keep the disclosure marker and keep a lone paragraph inline with it */
summary {
  display: list-item;
}
details > summary > p:only-child {
  display: inline;
}
pre code {
  padding: 0;
}
</style>


<style type="text/css">
/* Nested dropdown menus (.dropdown-submenu). */
.dropdown-submenu {
  position: relative;
}
/* the nested menu is placed to the right of its parent entry */
.dropdown-submenu>.dropdown-menu {
  top: 0;
  left: 100%;
  margin-top: -6px;
  margin-left: -1px;
  border-radius: 0 6px 6px 6px;
}
/* show the nested menu while its parent entry is hovered */
.dropdown-submenu:hover>.dropdown-menu {
  display: block;
}
/* right-pointing triangle drawn with borders on a zero-size box */
.dropdown-submenu>a:after {
  display: block;
  content: " ";
  float: right;
  width: 0;
  height: 0;
  border-color: transparent;
  border-style: solid;
  border-width: 5px 0 5px 5px;
  border-left-color: #cccccc;
  margin-top: 5px;
  margin-right: -10px;
}
/* the triangle changes colour on hover */
.dropdown-submenu:hover>a:after {
  border-left-color: #adb5bd;
}
/* .pull-left variant: the nested menu is placed to the left of its parent instead */
.dropdown-submenu.pull-left {
  float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
  left: -100%;
  margin-left: 10px;
  border-radius: 6px 0 6px 6px;
}
</style>

<script type="text/javascript">
// manage active state of menu based on current page
$(document).ready(function () {
  // File name of the current page (last segment of the URL path).
  // `href` is declared with `var` so it stays local to this handler instead of
  // becoming an implicit global; slice() replaces the deprecated substr().
  var pathname = window.location.pathname;
  var href = pathname.slice(pathname.lastIndexOf('/') + 1);
  // a path ending in "/" has no file name: treat it as the index page
  if (href === "")
    href = "index.html";
  // every link whose href attribute is exactly the current file name
  var menuAnchor = $('a[href="' + href + '"]');

  // mark the anchor link active (and if it's in a dropdown, also mark that active)
  var dropdown = menuAnchor.closest('li.dropdown');
  if (window.bootstrap) { // Bootstrap 4+
    menuAnchor.addClass('active');
    dropdown.find('> .dropdown-toggle').addClass('active');
  } else { // Bootstrap 3
    menuAnchor.parent().addClass('active');
    dropdown.addClass('active');
  }

  // Navbar adjustments: height of the first .navbar plus 15px of breathing room
  var navHeight = $(".navbar").first().height() + 15;
  var style = document.createElement('style');
  var pt = "padding-top: " + navHeight + "px; ";
  var mt = "margin-top: -" + navHeight + "px; ";
  var css = "";
  // offset scroll position for anchor links (for fixed navbar):
  // each section heading gets padding-top and an equal negative margin-top
  for (var i = 1; i <= 6; i++) {
    css += ".section h" + i + "{ " + pt + mt + "}\n";
  }
  // pad the body by the same amount and append the generated rules to <head>
  style.innerHTML = "body {" + pt + "padding-bottom: 40px; }\n" + css;
  document.head.appendChild(style);
});
</script>

<!-- tabsets -->

<style type="text/css">
/* Tabsets rendered as a dropdown (.tabset-dropdown): the tab list becomes a
   bordered box with a capped height that scrolls vertically. */
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  border: 1px solid #ddd;
  border-radius: 4px;
}

/* indicator glyph (Glyphicons Halflings font) before the active tab, or before the list when it is open */
.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "\e259";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

/* a different glyph, without the separator border, on the active tab while the list is open */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  content: "\e258";
  font-family: 'Glyphicons Halflings';
  border: none;
}

/* the active tab is always shown */
.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

/* tab links: no border or background in any state */
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
  background-color: transparent;
}

/* open state: every tab is shown, stacked vertically */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

/* default state: tabs are hidden (the more specific rules above show the active/open ones) */
.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<!-- code folding -->
<style type="text/css">
/* bottom margin under each code-folding button */
.code-folding-btn {
  margin-bottom: 4px;
}
</style>


<style type="text/css">

/* Table-of-contents sidebar (#TOC / .tocify) and the content column beside it. */
#TOC {
  margin: 25px 0px 20px 0px;
}
/* viewports up to 768px wide: the TOC is positioned relatively and spans the full width */
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}

@media print {
.toc-content {
  /* see https://github.com/w3c/csswg-drafts/issues/4434 */
  float: right;
}
}

/* horizontal padding of the main content column */
.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

/* div.main-container is capped at 1200px here */
div.main-container {
  max-width: 1200px;
}

/* sidebar size: 20% of the width, at most 260px wide and 85% tall */
div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

/* 768px-991px wide: the sidebar takes a quarter of the width */
@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

/* up to 767px wide: the sidebar takes the full width with no cap */
@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

/* sub-entries are set slightly smaller than top-level entries */
.tocify-subheader .tocify-item {
  font-size: 0.90em;
}

/* square corners on TOC list items */
.tocify .list-group-item {
  border-radius: 0px;
}


</style>


</head>

<body>


<div class="container-fluid main-container">


<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">


<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <!-- icon-only toggle: aria-label supplies the accessible name; aria-controls and aria-expanded describe the collapsible #navbar it drives -->
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-bs-toggle="collapse" data-target="#navbar" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">The Resources</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Workflows
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="amplicon_workflow.html">
        <span class="fa fa-solid fa-dna"></span>
         
        Amplicon Sequencing Workflow
      </a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Tutorials
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="Basic_Bash_Tutorial.html">
        <span class="fa fa-solid fa-terminal"></span>
         
        Bash Basics Tutorial
      </a>
    </li>
  </ul>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div id="header">

<div class="btn-group pull-right float-right">
<button type="button" class="btn btn-default btn-xs btn-secondary btn-sm dropdown-toggle" data-toggle="dropdown" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"><span>Code</span> <span class="caret"></span></button>
<ul class="dropdown-menu dropdown-menu-right" style="min-width: 50px;">
<li><a id="rmd-show-all-code" href="#">Show All Code</a></li>
<li><a id="rmd-hide-all-code" href="#">Hide All Code</a></li>
</ul>
</div>


<h1 class="title toc-ignore">Amplicon Analysis Workflow</h1>
<h4 class="author">Author: Linton Freund (<a
href="mailto:hfreu002@ucr.edu" class="email">hfreu002@ucr.edu</a>)</h4>
<h4 class="date">Last update: 02 October, 2023</h4>

</div>


<style type="text/css">
/* base text size for the whole document */
body {
  font-size: 13pt;
}
</style>
<!--
Render from R:
rmarkdown::render("Amplicon_Workflow.Rmd", clean=TRUE, output_format="html_document")
R

Note: if you render as a PDF then try to render a website, change the date format above (see here https://stackoverflow.com/questions/39415294/avoid-yaml-header-change-when-switching-knitr-output-format)

Rendering from the command-line. To render to PDF format, use the argument setting: output_format="pdf_document".
$ Rscript -e "rmarkdown::render('Amplicon_Workflow.Rmd', output_format='html_document', clean=TRUE)"

Add logo:
htmltools::img(src = knitr::image_uri("mylogo.png"), 
               alt = 'logo', 
               style = 'position:absolute; top:0; center:0; padding:10px;')
-->
<div id="background" class="section level1" number="1">
<h1><span class="header-section-number">1</span> Background</h1>
<p>This is a tutorial on how to process your 16S or ITS1/ITS2 amplicon
sequences and determine the taxonomic identity of the ASVs (i.e.,
amplicon sequence variants, also known as zOTUs for zero-radius OTUs or ESVs
for exact sequence variants) in your sequence data.</p>
<p>To create this tutorial, I have assembled scripts I’ve used to
analyze 16S amplicon sequence data provided by Dr. Emma Aronson’s lab.
The data I am working with to create this workflow comes from a project
that examined soil microbial community composition in Mount Saint
Helens. The target region was the V4 region within the 16S gene, and
sequencing was performed with an Illumina MiSeq (2x300).</p>
<p>This tutorial would not have been possible without <a
href="https://callahanlab.cvm.ncsu.edu/">Dr. Benjamin Callahan’s</a>
<code>DADA2</code> <a
href="https://github.com/benjjneb/dada2">program</a> <span
class="citation">(Callahan et al. 2016)</span> and <a
href="https://benjjneb.github.io/dada2/tutorial.html">tutorials</a>.
Additionally, I would like to especially thank <a
href="https://astrobiomike.github.io/research/">Dr. Mike Lee</a> for his
guidance, his patience, and his Happy Belly Bioinformatics tutorial
called <a
href="https://astrobiomike.github.io/amplicon/dada2_workflow_ex#removing-likely-contaminants"><strong>Amplicon
Analysis</strong></a> <span class="citation">(Lee
2019)</span>.</p>
<p><strong>You can find all of the scripts used in this workflow in this
<a
href="https://github.com/hlfreund/Amplicon_Sequencing_Worfklow">GitHub
repository</a> or you can download it from SourceForge or Zenodo, which
is linked in the <a href="#about-me">About Me</a> section.</strong></p>
<p><strong>If you do use this workflow, please cite it using the DOI,
which is included in the <a href="#about-me">About Me</a>
section.</strong></p>
<div id="considerations-before-you-begin" class="section level2"
number="1.1">
<h2><span class="header-section-number">1.1</span> Considerations before
you begin</h2>
<p>I was able to analyze these sequences on a High Performance Computing
cluster (HPCC) that uses a Slurm scheduler. The minimum amount of total
memory I used (not per CPU, but overall) for each step in this workflow
(i.e., each step as a separate ‘job’ via Slurm) was 400GB. Having enough
memory is essential to running most of these programs, so please keep
this in mind before embarking on this workflow!</p>
<p>These steps are also time consuming depending on your memory
constraints, so do not be concerned if this process takes a while. If
you plan to run through multiple steps in the workflow in one sitting,
then I suggest loading <strong>tmux</strong> before you run your
scripts. Here is a handy <a href="https://tmuxcheatsheet.com/">tmux
cheat sheet</a> that I refer to often. For more information on what tmux
is and how to utilize it, check this <a
href="https://thoughtbot.com/blog/a-tmux-crash-course">tmux crash
course</a> by Josh Clayton.</p>
<p>I also suggest exploring a program called <strong>neovim</strong>
(aka nvim) that allows you to use Vim (a text editor) to edit R code and
run the code simultaneously. Though nvim is not necessary to run through
this workflow, I find that it makes my life a bit easier when running
through the <code>DADA2</code> portion of the workflow. I will get more
into the usage of nvim once we get to the <code>DADA2</code> step(s),
but for more information please view the <a
href="https://github.com/jalvesaq/Nvim-R">Neovim Github</a> as well as
its <a
href="https://github.com/jamespeapen/Nvim-R/wiki#overview">documentation</a>.
You can also find a helpful nvim tutorial <a
href="https://girke.bioinformatics.ucr.edu/GEN242/tutorials/linux/linux/#nvim-r-tmux-essentials">here</a>
created by Dr. Thomas Girke from UC Riverside.</p>
<p>Additionally, you will need to change your path to each of these
programs depending on where they are stored in your computer or HPCC. If
you are running these steps locally (which, if you are, then you have
one badass computer!), then you can skip the module loading lines in
each step – loading modules is specifically for running these scripts on
a HPCC that uses a Slurm Workload Manager.</p>
</div>
<div id="submitting-scripts-as-jobs-with-slurm" class="section level2"
number="1.2">
<h2><span class="header-section-number">1.2</span> Submitting Scripts as
Jobs with Slurm</h2>
<p>If you are unsure as to how to set up the script for submitting on
your HPCC, check the code chunk below. This is the information I use
when submitting a job to our Slurm system. Again, this is specifically
for a system that uses the Slurm scheduler. For more information on what
the arguments mean and how to set up your job submissions, please refer
to this handy <a
href="https://slurm.schedmd.com/pdfs/summary.pdf">cheatsheet</a> made by
Slurm.</p>
<p><strong>NOTE</strong>: If you are running these scripts on an HPCC,
please load the module you need before running, or add
<code>module load name_of_module</code> to your script before you call
on the program you want to use.</p>
<div class="sourceCode" id="cb1"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="co">#!/bin/bash -l</span></span>
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a></span>
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a><span class="co">#SBATCH --nodes=1</span></span>
<span id="cb1-4"><a href="#cb1-4" tabindex="-1"></a><span class="co">#SBATCH --ntasks=1</span></span>
<span id="cb1-5"><a href="#cb1-5" tabindex="-1"></a><span class="co">#SBATCH --cpus-per-task=4 # must match the # of threads if program allows threading (-t #)</span></span>
<span id="cb1-6"><a href="#cb1-6" tabindex="-1"></a><span class="co">##SBATCH --mem-per-cpu=500G # memory per cpu - * if threading, do not let this line run (use ##). Cannot ask for too much memory per cpu!</span></span>
<span id="cb1-7"><a href="#cb1-7" tabindex="-1"></a><span class="co">#SBATCH --mem=500GB # overall memory - if you&#39;re threading, keep this line</span></span>
<span id="cb1-8"><a href="#cb1-8" tabindex="-1"></a><span class="co">#SBATCH --time=1-00:00:00     # time requested; this example is 1 day, 0 hrs</span></span>
<span id="cb1-9"><a href="#cb1-9" tabindex="-1"></a><span class="co">#SBATCH --output=name_of_log_file_6.27.21.stdout # name of your log file</span></span>
<span id="cb1-10"><a href="#cb1-10" tabindex="-1"></a><span class="co">#SBATCH --mail-user=email_address@gmail.com # your email address </span></span>
<span id="cb1-11"><a href="#cb1-11" tabindex="-1"></a><span class="co">#SBATCH --mail-type=ALL # will send you email updates when job starts and ends (and if it runs successfully or not)</span></span>
<span id="cb1-12"><a href="#cb1-12" tabindex="-1"></a><span class="co">#SBATCH --job-name=&quot;Name of Job 1/1/21&quot; # name of your job for Slurm</span></span>
<span id="cb1-13"><a href="#cb1-13" tabindex="-1"></a><span class="co">#SBATCH -p node_name_here # partition node name</span></span></code></pre></div>
<p>When I don’t know exactly what a program’s output will look like, I
will run the program via an interactive job on the HPCC. I also suggest
running programs interactively if the program requires multiple lines of
code to run, and you want to make sure each step has the correct input
(whether it be a file, an object, or the output of a previous step in
the code). For some more information on interactive jobs in a Slurm
system, check out this <a
href="https://yunmingzhang.wordpress.com/2015/06/29/how-to-use-srun-to-get-an-interactive-node/">blog
post</a> by Yunming Zhang. This is how I set up an interactive job on
the HPCC (that uses Slurm).</p>
<div class="sourceCode" id="cb2"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="ex">srun</span> <span class="at">-p</span> node_name_here <span class="at">--mem</span><span class="op">=</span>500gb <span class="at">--time</span><span class="op">=</span>1-00:00:00 <span class="at">--pty</span> bash <span class="at">-l</span></span>
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="co"># -p = partition</span></span>
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="co"># --mem = overall memory being requested, not memory per CPU</span></span>
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="co"># --time = overall time requested; 1 day, 0 hrs</span></span>
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a><span class="co"># --pty = gives you a pseudo terminal that runs bash</span></span>
<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a><span class="co"># bash -l = start bash as a login shell</span></span></code></pre></div>
</div>
<div id="a-bash-scripting-tip-for-before-we-start"
class="section level2" number="1.3">
<h2><span class="header-section-number">1.3</span> A bash scripting tip
for before we start</h2>
<p>I wanted to share a bit of code that you will see being implemented
in every script throughout the tutorial. This little bit of code will
help you pull out the sample names from your files, allowing you to easily
run through your files while also keeping track of which samples those
files belong to.</p>
<div class="sourceCode" id="cb3"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="cf">for</span> FILE <span class="kw">in</span> path/<span class="pp">*</span>.fastq<span class="kw">;</span></span>
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="cf">do</span></span>
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a>    <span class="va">f</span><span class="op">=</span><span class="va">$(</span><span class="fu">basename</span> <span class="va">$FILE)</span></span>
<span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a>    <span class="va">SAMPLE</span><span class="op">=</span><span class="va">${f</span><span class="op">%</span>.fastq<span class="pp">*</span><span class="va">}</span></span>
<span id="cb3-5"><a href="#cb3-5" tabindex="-1"></a>    <span class="bu">echo</span> <span class="va">$SAMPLE</span></span>
<span id="cb3-6"><a href="#cb3-6" tabindex="-1"></a><span class="cf">done</span></span></code></pre></div>
<p>Here I am using a for loop to loop through each fastq file in
a specific directory. In each iteration of the loop, an <code>$f</code>
variable is created, which uses the <code>basename</code> function to
get the file name of the <code>$FILE</code> variable. Then
<code>$SAMPLE</code> is created by using <code>%</code> to remove the
<code>.fastq</code> extension and everything that follows, keeping only
the file name (minus the extension) and calling that
<code>$SAMPLE</code>. Then we can use the <code>$SAMPLE</code> variable
to substitute the file names, which comes in handy for running these
scripts over multiple samples at one time. This concept should become
clearer as we move through the workflow. If you’d like more information
on string substitution (i.e., using <code>%</code> to remove parts of a
string), please see this helpful <a
href="https://tldp.org/LDP/abs/html/string-manipulation.html">link</a>.</p>
</div>
</div>
<div id="sample-pre-processing" class="section level1" number="2">
<h1><span class="header-section-number">2</span> Sample
pre-processing</h1>
<div id="demultiplex-your-samples" class="section level2" number="2.1">
<h2><span class="header-section-number">2.1</span> Demultiplex your
samples</h2>
<p>When preparing sequencing libraries, we typically multiplex our
samples. This means that during library preparation, we’ve attached
barcodes to our sequences that help us trace the sample that these
sequences came from. This allows us to pool multiple libraries together
in one sequencing run. After sequencing, the sequences are
<em>demultiplexed</em>, meaning the individual sequences are separated
out by sample into individual FASTQ files.</p>
<p>Typically your samples will be returned to you already demultiplexed.
However, if your samples are still pooled into one large FASTQ file, do
not panic! You can follow the <a
href="https://astrobiomike.github.io/amplicon/demultiplexing"><strong>demultiplexing
tutorial</strong></a> by <a
href="https://astrobiomike.github.io/research/">Dr. Mike Lee</a> which
utilizes the <a
href="https://github.com/najoshi/sabre"><code>sabre</code> tool</a>. Or,
you can use <code>bcl2fastq2</code> by Illumina (more information <a
href="https://support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq2-v2-20-software-guide-15051736-03.pdf">here</a>).</p>
</div>
<div id="sequence-quality-and-where-to-trim" class="section level2"
number="2.2">
<h2><span class="header-section-number">2.2</span> Sequence Quality and
Where to Trim</h2>
<div id="check-the-quality-of-your-sequences-with-fastqc"
class="section level3" number="2.2.1">
<h3><span class="header-section-number">2.2.1</span> Check the quality
of your sequences with <code>FastQC</code></h3>
<p>It’s always a good idea to check the quality of your sequences before
you start your analysis, regardless of the type of sequences they are
(metagenomes, RNA-seq data, etc). <code>FastQC</code> <span
class="citation">(Andrews, n.d.)</span> provides a comprehensive report
on the quality of your sequences and is helpful for the following:
identifying primers or adapters still attached to your sequences;
determining the quality of your reverse reads; etc. You can also use the
FastQC reports to determine if you should attempt to merge your forward
and reverse reads, or just proceed with only the forward reads.</p>
<div class="sourceCode" id="cb4"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="co"># my 16S and ITS2 sequences are in separate directories, which is why I loop through them separately below</span></span>
<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a></span>
<span id="cb4-3"><a href="#cb4-3" tabindex="-1"></a><span class="co"># create a directory to store your FastQC results in</span></span>
<span id="cb4-4"><a href="#cb4-4" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./FastQC_Results <span class="kw">]];</span> <span class="cf">then</span></span>
<span id="cb4-5"><a href="#cb4-5" tabindex="-1"></a>    <span class="fu">mkdir</span> FastQC_Results</span>
<span id="cb4-6"><a href="#cb4-6" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb4-7"><a href="#cb4-7" tabindex="-1"></a></span>
<span id="cb4-8"><a href="#cb4-8" tabindex="-1"></a><span class="co"># create directory within results directory for 16S FastQC Results</span></span>
<span id="cb4-9"><a href="#cb4-9" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./FastQC_Results/16S_FastQC <span class="kw">]];</span> <span class="cf">then</span></span>
<span id="cb4-10"><a href="#cb4-10" tabindex="-1"></a>    <span class="fu">mkdir</span> FastQC_Results/16S_FastQC</span>
<span id="cb4-11"><a href="#cb4-11" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb4-12"><a href="#cb4-12" tabindex="-1"></a></span>
<span id="cb4-13"><a href="#cb4-13" tabindex="-1"></a><span class="co"># create directory within results directory for ITS2 (or ITS1) FastQC Results</span></span>
<span id="cb4-14"><a href="#cb4-14" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./FastQC_Results/ITS2_FastQC <span class="kw">]];</span> <span class="cf">then</span></span>
<span id="cb4-15"><a href="#cb4-15" tabindex="-1"></a>    <span class="fu">mkdir</span> FastQC_Results/ITS2_FastQC</span>
<span id="cb4-16"><a href="#cb4-16" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb4-17"><a href="#cb4-17" tabindex="-1"></a></span>
<span id="cb4-18"><a href="#cb4-18" tabindex="-1"></a><span class="co"># loop through each 16S fastq.gz file and run through FastQC</span></span>
<span id="cb4-19"><a href="#cb4-19" tabindex="-1"></a><span class="cf">for</span> FILE <span class="kw">in</span> 16S_Seqs/<span class="pp">*</span>.fastq.gz<span class="kw">;</span></span>
<span id="cb4-20"><a href="#cb4-20" tabindex="-1"></a><span class="cf">do</span></span>
<span id="cb4-21"><a href="#cb4-21" tabindex="-1"></a>    <span class="co"># extract out just the sample name from the file name</span></span>
<span id="cb4-22"><a href="#cb4-22" tabindex="-1"></a>    <span class="va">f</span><span class="op">=</span><span class="va">$(</span><span class="fu">basename</span> <span class="va">$FILE)</span></span>
<span id="cb4-23"><a href="#cb4-23" tabindex="-1"></a>    <span class="va">SAMPLE</span><span class="op">=</span><span class="va">${f</span><span class="op">%</span>.fastq<span class="pp">*</span><span class="va">}</span> <span class="co">#string manipulation to drop .fastq and everything that comes after</span></span>
<span id="cb4-24"><a href="#cb4-24" tabindex="-1"></a>    </span>
<span id="cb4-25"><a href="#cb4-25" tabindex="-1"></a>    <span class="ex">fastqc</span> <span class="va">$FILE</span> <span class="at">--outdir</span><span class="op">=</span>./FastQC_Results/16S_FastQC</span>
<span id="cb4-26"><a href="#cb4-26" tabindex="-1"></a>    </span>
<span id="cb4-27"><a href="#cb4-27" tabindex="-1"></a><span class="cf">done</span></span>
<span id="cb4-28"><a href="#cb4-28" tabindex="-1"></a></span>
<span id="cb4-29"><a href="#cb4-29" tabindex="-1"></a><span class="co"># loop through each ITS2 fastq.gz file and run through FastQC</span></span>
<span id="cb4-30"><a href="#cb4-30" tabindex="-1"></a><span class="cf">for</span> FILE <span class="kw">in</span> ITS2_Seqs/<span class="pp">*</span>.fastq.gz<span class="kw">;</span></span>
<span id="cb4-31"><a href="#cb4-31" tabindex="-1"></a><span class="cf">do</span></span>
<span id="cb4-32"><a href="#cb4-32" tabindex="-1"></a>    <span class="va">f</span><span class="op">=</span><span class="va">$(</span><span class="fu">basename</span> <span class="va">$FILE)</span></span>
<span id="cb4-33"><a href="#cb4-33" tabindex="-1"></a>    <span class="va">SAMPLE</span><span class="op">=</span><span class="va">${f</span><span class="op">%</span>.fastq<span class="pp">*</span><span class="va">}</span></span>
<span id="cb4-34"><a href="#cb4-34" tabindex="-1"></a>    <span class="ex">fastqc</span> <span class="va">$FILE</span> <span class="at">--outdir</span><span class="op">=</span>./FastQC_Results/ITS2_FastQC</span>
<span id="cb4-35"><a href="#cb4-35" tabindex="-1"></a>    </span>
<span id="cb4-36"><a href="#cb4-36" tabindex="-1"></a><span class="cf">done</span></span></code></pre></div>
<p>FastQC will return a report assessing the per base and per sequence
quality of your sequences, as well as the GC and N (i.e., unidentified
base) content across your sequences, the distribution of your sequence
lengths, and whether or not adapters are still attached to your
sequences. The second tab of the report details the per base sequence
quality across all of your sequences. The per base quality score
(<strong>Q score</strong>), also known as a <strong>Phred
score</strong>, is a logarithmic measure of the estimated probability that
the base call is wrong. The following equation is used for calculating the Q score: <span
class="math display">\[
Q = -10 \log_{10} E
\]</span> Here, E is the estimated probability of the base call being
wrong. The higher the Q score, the smaller the probability of a base
call error. A quality score of 30 (Q30) means that the probability of an
incorrect base call is 1 in 1000, and that the base call accuracy (1 -
probability of incorrect base call) is 99.9%. For more information on
quality scores, please see this info from <a
href="https://www.illumina.com/science/technology/next-generation-sequencing/plan-experiments/quality-scores.html">Illumina</a>.</p>
<p>Below is an example of the “per base sequence quality” portion of the
report. This portion of the report helps me to determine where I should
trim my sequences as I move forward with the analysis. This part of the
report can also give you a sense of whether there was an error in your
sequencing run. For example, if the average quality score (i.e., the
blue line in the report) across all of the bases dips below 30 for half
of the sequence length in all of my samples, that could indicate that
there was an error with the sequencing run itself.</p>
<center>
<img src="amplicon_workflow/fastqc_base_seq_qual_plot.png" alt="Per base quality scores from a FastQC report" />
</center>
<div align="center">
Figure 1: Per Base Quality Scores from FastQC Report
</div>
<p><br /></p>
<p>Another useful piece of the FastQC report is the adapter content tab,
which is the very last tab in the report. This portion of the report
tells us the percentage of reads that have adapter sequences at specific
base positions along the reads. The following snapshot from a FastQC
report shows that the Small RNA 3’ adapter sequence is found in ~2% of
the sequences starting at around the 160th base. We can use this
information to then decide exactly which adapter sequences to cut from
our samples in the trimming step.</p>
<center>
<img src="amplicon_workflow/fastqc_adapter_content.png" alt="Frequency of adapter sequences from a FastQC report" />
</center>
<div align="center">
Figure 2: Frequency of Adapter Sequences from FastQC Report
</div>
<p><br /></p>
<p>For more on how to interpret FastQC reports, please check out this
helpful <a
href="https://rtsf.natsci.msu.edu/genomics/tech-notes/fastqc-tutorial-and-faq/">FastQC
tutorial</a> from Michigan State University.</p>
</div>
<div id="expected-error-filtering-of-sequences-with-eestats"
class="section level3" number="2.2.2">
<h3><span class="header-section-number">2.2.2</span> Expected Error
Filtering of Sequences with <code>eestats</code></h3>
<p>The <code>eestats2</code> program <span class="citation">(Edgar and
Flyvbjerg 2015)</span> creates a report detailing the percentage of
reads that will pass through an expected error filter when the reads are
at different lengths. Specifically, the program will determine how many
reads at each specific length (e.g., 50 bp, 100 bp, 150 bp, etc.) have
good enough quality to pass three maximum expected error thresholds
(by default, 0.5, 1, and 2 expected errors per read).</p>
<p>Before you run the <code>eestats</code> program, be sure to
<em>gunzip</em> (aka decompress) your fastq.gz files! You can do that by
running the following command:
<code>gunzip /path/to/*.fastq.gz</code>.</p>
<div class="sourceCode" id="cb5"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a><span class="co"># Create directory to store eestats results</span></span>
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./EEstats_Results <span class="kw">]];</span> <span class="cf">then</span></span>
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a>    <span class="fu">mkdir</span> EEstats_Results</span>
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a></span>
<span id="cb5-6"><a href="#cb5-6" tabindex="-1"></a><span class="co"># Create specific directory within eestats results for 16S eestats results</span></span>
<span id="cb5-7"><a href="#cb5-7" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./EEstats_Results/16S_EEstats <span class="kw">]];</span> <span class="cf">then</span></span>
<span id="cb5-8"><a href="#cb5-8" tabindex="-1"></a>    <span class="fu">mkdir</span> EEstats_Results/16S_EEstats</span>
<span id="cb5-9"><a href="#cb5-9" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb5-10"><a href="#cb5-10" tabindex="-1"></a></span>
<span id="cb5-11"><a href="#cb5-11" tabindex="-1"></a><span class="co"># Create specific directory within eestats results for ITS2 (or ITS1) eestats results</span></span>
<span id="cb5-12"><a href="#cb5-12" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./EEstats_Results/ITS2_EEstats <span class="kw">]];</span> <span class="cf">then</span></span>
<span id="cb5-13"><a href="#cb5-13" tabindex="-1"></a>    <span class="fu">mkdir</span> EEstats_Results/ITS2_EEstats</span>
<span id="cb5-14"><a href="#cb5-14" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb5-15"><a href="#cb5-15" tabindex="-1"></a></span>
<span id="cb5-16"><a href="#cb5-16" tabindex="-1"></a><span class="co"># Run eestats2 in loop with 16S fastq files</span></span>
<span id="cb5-17"><a href="#cb5-17" tabindex="-1"></a><span class="cf">for</span> FILE <span class="kw">in</span> 16S_Seqs/<span class="pp">*</span>.fastq<span class="kw">;</span></span>
<span id="cb5-18"><a href="#cb5-18" tabindex="-1"></a><span class="cf">do</span></span>
<span id="cb5-19"><a href="#cb5-19" tabindex="-1"></a>    <span class="va">f</span><span class="op">=</span><span class="va">$(</span><span class="fu">basename</span> <span class="va">$FILE)</span></span>
<span id="cb5-20"><a href="#cb5-20" tabindex="-1"></a>    <span class="va">SAMPLE</span><span class="op">=</span><span class="va">${f</span><span class="op">%</span>.fastq<span class="pp">*</span><span class="va">}</span></span>
<span id="cb5-21"><a href="#cb5-21" tabindex="-1"></a>    </span>
<span id="cb5-22"><a href="#cb5-22" tabindex="-1"></a>    <span class="ex">usearch</span> <span class="at">-fastq_eestats2</span> <span class="va">$FILE</span> <span class="at">-output</span> <span class="va">${SAMPLE}</span>_eestats2.txt</span>
<span id="cb5-23"><a href="#cb5-23" tabindex="-1"></a>    <span class="co"># move results to EEstats_Results directory</span></span>
<span id="cb5-24"><a href="#cb5-24" tabindex="-1"></a>    <span class="fu">mv</span> <span class="va">${SAMPLE}</span>_eestats2.txt EEstats_Results/16S_EEstats</span>
<span id="cb5-25"><a href="#cb5-25" tabindex="-1"></a><span class="cf">done</span></span>
<span id="cb5-26"><a href="#cb5-26" tabindex="-1"></a></span>
<span id="cb5-27"><a href="#cb5-27" tabindex="-1"></a><span class="co"># Run eestats2 in loop with ITS2 fastq files</span></span>
<span id="cb5-28"><a href="#cb5-28" tabindex="-1"></a><span class="cf">for</span> FILE <span class="kw">in</span> ITS2_Seqs/<span class="pp">*</span>.fastq<span class="kw">;</span></span>
<span id="cb5-29"><a href="#cb5-29" tabindex="-1"></a><span class="cf">do</span></span>
<span id="cb5-30"><a href="#cb5-30" tabindex="-1"></a>    <span class="va">f</span><span class="op">=</span><span class="va">$(</span><span class="fu">basename</span> <span class="va">$FILE)</span></span>
<span id="cb5-31"><a href="#cb5-31" tabindex="-1"></a>    <span class="va">SAMPLE</span><span class="op">=</span><span class="va">${f</span><span class="op">%</span>.fastq<span class="pp">*</span><span class="va">}</span></span>
<span id="cb5-32"><a href="#cb5-32" tabindex="-1"></a>    </span>
<span id="cb5-33"><a href="#cb5-33" tabindex="-1"></a>    <span class="ex">usearch</span> <span class="at">-fastq_eestats2</span> <span class="va">$FILE</span> <span class="at">-output</span> <span class="va">${SAMPLE}</span>_eestats2.txt</span>
<span id="cb5-34"><a href="#cb5-34" tabindex="-1"></a>    <span class="co"># move results to EEstats_Results directory</span></span>
<span id="cb5-35"><a href="#cb5-35" tabindex="-1"></a>    <span class="fu">mv</span> <span class="va">${SAMPLE}</span>_eestats2.txt EEstats_Results/ITS2_EEstats</span>
<span id="cb5-36"><a href="#cb5-36" tabindex="-1"></a>    </span>
<span id="cb5-37"><a href="#cb5-37" tabindex="-1"></a><span class="cf">done</span></span></code></pre></div>
The following image shows a sample eestats2 report. For this specific
sample, the ideal trimming length of the sequences would be around 250
base pairs. This is because, when considering the maximum expected error
threshold of 1, more than 80.8% of the sequences pass this threshold.
Though the 300 bp length also allows high retention of reads, we know
that the per base quality of this sample drops as we approach the 300 bp
position. Thus, it seems like trimming these sequences to 250 bps would
be ideal moving forward.
<center>
<img src="amplicon_workflow/eestats_example_report.png" alt="Example eestats report" />
</center>
<div align="center">
Figure 3: eestats Report Example
</div>
<p><br /> For more information on the <code>eestats2</code> program by
<a href="https://www.drive5.com/usearch/">USEARCH</a>, please read the
documentation <a
href="https://www.drive5.com/usearch/manual/cmd_fastq_eestats2.html">here</a>.</p>
</div>
</div>
<div id="decontaminate-trim-sequences-and-cut-adapters-primers-etc"
class="section level2" number="2.3">
<h2><span class="header-section-number">2.3</span> Decontaminate &amp;
Trim sequences and cut adapters, primers, etc</h2>
<p>There are plenty of programs out there that can be used for trimming,
and the following three are the most popular for amplicon analyses: <a
href="https://cutadapt.readthedocs.io/en/stable/"><code>cutadapt</code></a>,
<a
href="http://www.usadellab.org/cms/?page=trimmomatic"><code>trimmomatic</code></a>,
and <a
href="https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/"><code>bbduk</code></a>
<span class="citation">(Bushnell, n.d.)</span>. All of these programs
are reputable, but I personally prefer <code>bbduk</code> and will
use this tool for trimming and adapter removal.</p>
<p>Before I trim my sequences, I refer to the FastQC reports to find out
exactly which adapters I should remove from my sequences. For example,
when looking at the adapter content portion of the FastQC report above,
I can see that the Nextera Transposase Sequence is still present in that
particular sample. Thankfully Illumina shares their adapter sequences on
their <a
href="https://support-docs.illumina.com/SHARE/AdapterSeq/Content/SHARE/AdapterSeq/AdapterSequencesIntro.htm">website</a>,
allowing us to easily find common adapters in sequences, like the <a
href="https://support-docs.illumina.com/SHARE/AdapterSeq/Content/SHARE/AdapterSeq/Nextera/SequencesNXTILMPrepAndPCR.htm">Nextera
Transposase Sequence</a> for example.</p>
<p>I also know that with the sequences I am analyzing, the PCR primers
are still attached (FastQC may identify these primers in your report’s
Overrepresented Sequences tab, but not necessarily the origin of these
sequences). I can either remove these primer sequences by supplying the actual
sequences to the <code>literal=</code> flag, or I can trim from the
right (<code>ftr=</code>) and/or the left (<code>ftl=</code>) of the
sequences if I know exactly how long the primer sequences were.</p>
<p>It is recommended to check the overrepresented sequences from the
FastQC report to see if there are contaminating sequences present in
your data. I suggest taking the most frequent overrepresented sequence
and running it through <a
href="https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&amp;BLAST_SPEC=GeoBlast&amp;PAGE_TYPE=BlastSearch">BLASTn</a>
if the source of this overrepresented sequence says “No Hit” (meaning
that FastQC cannot attribute this sequence to its list of adapter
sequences). If the sequence comes up as a contaminant (i.e., a different
gene than the amplicon you’re looking at) or adapter/primer of some
kind, you can add this to the <code>literal=</code> flag in
<code>bbduk</code> to remove the contaminant.</p>
<p>In addition to removing adapter and primer sequences using the
<code>literal=</code> flag, I also include a reference file provided by
<code>bbduk</code> (referenced in the <code>ref=</code> flag) that
contains all of the Illumina TruSeq adapters. The sequences in the
reference file, in addition to the given adapters and primers, will be
removed from the sequences. Additionally, bbduk decontaminates sequences
by matching kmers (aka subsequences of a specific length k) to reference
genomes. If the kmers match the reference genome, then the kmer is kept.
The longer the kmer, the higher the specificity - but there is a limit
to this, seeing as long kmers are unlikely to be shared across
multiple reads. If your kmer length is too short, you could
be keeping sequences that are adapters, primers, etc by accident (this
is why using the literal sequence flag and the adapter reference file is
helpful in <code>bbduk</code>).</p>
<p>Below the shell script is a description of all of the flags used by
<code>bbduk</code> and exactly what they mean. For more information on
the <code>bbduk</code> flags, please see the <code>bbduk</code> <a
href="https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/">documentation</a>.</p>
<div class="sourceCode" id="cb6"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a><span class="va">path</span><span class="op">=</span>/path/to/sequences/here <span class="co"># replace with the path to your files</span></span>
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a><span class="co"># my sequence files are in $path/16S_Seqs/ -- see for loop below</span></span>
<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a></span>
<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./Trimmed_Seqs <span class="kw">]];</span> <span class="cf">then</span> <span class="co"># creating directory to store trimmed sequences in</span></span>
<span id="cb6-5"><a href="#cb6-5" tabindex="-1"></a>    <span class="fu">mkdir</span> Trimmed_Seqs</span>
<span id="cb6-6"><a href="#cb6-6" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb6-7"><a href="#cb6-7" tabindex="-1"></a></span>
<span id="cb6-8"><a href="#cb6-8" tabindex="-1"></a><span class="cf">if</span> <span class="kw">[[</span> <span class="ot">!</span> <span class="ot">-d</span> ./Trimmed_Seqs/16S_Trimmed <span class="kw">]];</span> <span class="cf">then</span> <span class="co"># creating directory for specifically trimmed 16S sequences</span></span>
<span id="cb6-9"><a href="#cb6-9" tabindex="-1"></a>    <span class="fu">mkdir</span> Trimmed_Seqs/16S_Trimmed</span>
<span id="cb6-10"><a href="#cb6-10" tabindex="-1"></a><span class="cf">fi</span></span>
<span id="cb6-11"><a href="#cb6-11" tabindex="-1"></a></span>
<span id="cb6-12"><a href="#cb6-12" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="va">${path}</span>/16S_Seqs/<span class="pp">*</span>_R1.fastq<span class="kw">;</span></span>
<span id="cb6-13"><a href="#cb6-13" tabindex="-1"></a><span class="cf">do</span></span>
<span id="cb6-14"><a href="#cb6-14" tabindex="-1"></a>    <span class="va">f</span><span class="op">=</span><span class="va">$(</span><span class="fu">basename</span> <span class="va">$i)</span></span>
<span id="cb6-15"><a href="#cb6-15" tabindex="-1"></a>    <span class="va">SAMPLE</span><span class="op">=</span><span class="va">${f</span><span class="op">%</span>_R<span class="pp">*</span><span class="va">}</span></span>
<span id="cb6-16"><a href="#cb6-16" tabindex="-1"></a>    </span>
<span id="cb6-17"><a href="#cb6-17" tabindex="-1"></a>    <span class="ex">bbduk.sh</span> <span class="at">-Xmx10g</span> in1=<span class="va">${path}</span>/16S_Seqs/<span class="va">${SAMPLE}</span>_R1.fastq in2=<span class="va">${path}</span>/16S_Seqs/<span class="va">${SAMPLE}</span>_R2.fastq out1=<span class="va">${path}</span>/Trimmed_Seqs/16S_Trimmed/<span class="va">${SAMPLE}</span>_R1_clean.fastq out2=<span class="va">${path}</span>/Trimmed_Seqs/16S_Trimmed/<span class="va">${SAMPLE}</span>_R2_clean.fastq literal=TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG,GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG ref=/bigdata/aronsonlab/shared/bbmap_resources/adapters.fa rcomp=t ktrim=r k=23 maq=10 minlength=200 mink=13 hdist=1 tpe tbo</span>
<span id="cb6-18"><a href="#cb6-18" tabindex="-1"></a>    </span>
<span id="cb6-19"><a href="#cb6-19" tabindex="-1"></a><span class="cf">done</span></span>
<span id="cb6-20"><a href="#cb6-20" tabindex="-1"></a></span>
<span id="cb6-21"><a href="#cb6-21" tabindex="-1"></a><span class="co"># ref ---&gt; file provided by bbduk that holds collection of Illumina TruSeq adapters</span></span>
<span id="cb6-22"><a href="#cb6-22" tabindex="-1"></a><span class="co"># literal=(sequence here) ---&gt; literal adapter sequences to remove; &quot;N&quot; represents any base -- in this case, they are indexes within the adapters</span></span>
<span id="cb6-23"><a href="#cb6-23" tabindex="-1"></a><span class="co"># rcomp=t ---&gt; Rcomp looks for kmers and their reverse-complements, rather than just forward kmer, if set to true</span></span>
<span id="cb6-24"><a href="#cb6-24" tabindex="-1"></a><span class="co"># ktrim=r ---&gt; “ktrim=r” is for right-trimming (3′ adapters)</span></span>
<span id="cb6-25"><a href="#cb6-25" tabindex="-1"></a><span class="co"># k=23 ---&gt; look for kmer that is 23 bp long</span></span>
<span id="cb6-26"><a href="#cb6-26" tabindex="-1"></a><span class="co"># mink=13 ---&gt; in addition to kmers of x length, look for shorter kmers with lengths 23 to 13 (in this case)</span></span>
<span id="cb6-27"><a href="#cb6-27" tabindex="-1"></a><span class="co"># maq=10 ---&gt; This will discard reads with average quality below 10</span></span>
<span id="cb6-28"><a href="#cb6-28" tabindex="-1"></a><span class="co"># hdist=1 ---&gt; hamming distance of 1</span></span>
<span id="cb6-29"><a href="#cb6-29" tabindex="-1"></a><span class="co"># mlf=50 ---&gt; (minlengthfraction=50) would discard reads under 50% of their original length after trimming</span></span>
<span id="cb6-30"><a href="#cb6-30" tabindex="-1"></a><span class="co"># trimq=10 ---&gt; quality-trim to Q10 using the Phred algorithm, which is more accurate than naive trimming.</span></span>
<span id="cb6-31"><a href="#cb6-31" tabindex="-1"></a><span class="co"># qtrim=r ---&gt; means it will quality trim the right side only</span></span>
<span id="cb6-32"><a href="#cb6-32" tabindex="-1"></a><span class="co"># tpe ---&gt; which specifies to trim both reads to the same length</span></span>
<span id="cb6-33"><a href="#cb6-33" tabindex="-1"></a><span class="co"># tbo ---&gt; which specifies to also trim adapters based on pair overlap detection using BBMerge (which does not require known adapter sequences)</span></span>
<span id="cb6-34"><a href="#cb6-34" tabindex="-1"></a><span class="co"># mm ----&gt; Maskmiddle ignores the middle base of a kmer, can turn off with mm=f</span></span></code></pre></div>
<p>To be extra cautious and ensure that the trimming step was
successful, I will run the trimmed sequences through FastQC and compare
the reports. If the per base and per sequence qualities have improved
and/or the adapters are absent, then I will move forward with the
workflow. However, if I am still not happy with the quality of the
trimmed reads, I will then run the <em>trimmed</em> reads through
<code>bbduk</code> in hopes of removing persistent, unwanted sequences.
I will also check the overrepresented sequences and their frequencies
again, and run the most frequent overrepresented sequence(s) in
BLASTn.</p>
</div>
</div>
<div id="asv-assignment-with-dada2" class="section level1" number="3">
<h1><span class="header-section-number">3</span> ASV Assignment with
<code>DADA2</code></h1>
<p>All of the steps in this portion of the workflow (excluding the tmux
and nvim code chunks) have been adapted from Dr. Callahan’s
<code>DADA2</code> <a
href="https://benjjneb.github.io/dada2/tutorial.html">tutorial</a> and
Dr. Lee’s amplicon <a
href="https://astrobiomike.github.io/amplicon/dada2_workflow_ex">tutorial</a>.</p>
<p>To prepare for running <code>DADA2</code>, I want to separate our
sequence files by locus and region. For example, you do not want to
analyze your 16S and ITS2 sequences together in DADA2 – combining loci
and even different regions of the same locus can interfere with the
<code>DADA2</code> algorithm. For example, even if you have 16S
sequences of just the V3 region, and a set of 16S sequences with the
V3-V4 region, you would want to run these regions separately through the
<code>DADA2</code> pipeline. The reason for this will become clearer as
we get to the filtering and trimming step and the error rate prediction
step.</p>
<p>To create separate directories for your sequence data, I first ensure
that the file names include the amplicon that’s been sequenced for
that particular sample (e.g., the 16S V4 data for Sample1 is in the file
<code>Sample1_16S.V4_R1_001.fastq</code>). Then I would run the
following line of code.</p>
<div class="sourceCode" id="cb7"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a><span class="co"># make sure you are in the correct directory before doing this</span></span>
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a><span class="fu">mv</span> <span class="pp">*</span>_16S.V4_<span class="pp">*</span> 16S.V4_Seqs</span>
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a><span class="co"># format of move command: mv file_name directory_name</span></span></code></pre></div>
<p>Here I am using a <code>*</code> which is a special character that
can be used to represent any character or set of characters. In this
case, I am telling the <code>mv</code> command to move any files that
have the <code>_16S.V4_</code> pattern anywhere in the file name to a
directory called <code>16S.V4_Seqs</code>. After running this command, I
make sure that my script containing the following <code>DADA2</code> R
code is in the directory with the specific files you want to analyze. My
<code>DADA2</code> R script is called
<code>DADA2_tutorial_16S_pipeline.R</code>, which you will see me
reference in a couple of code chunks.</p>
<div id="run-interactive-job-tmux-on-hpcc" class="section level2"
number="3.1">
<h2><span class="header-section-number">3.1</span> Run Interactive Job +
<code>tmux</code> on HPCC</h2>
<p>Personally, I like to run through <code>DADA2</code> via an
interactive job on our HPCC. This will allow us to run scripts line by
line and check the output, rather than submitting a job to run in the
cluster without our supervision. Basically, this is an easy way to
constantly check our progress and (ideally) catch errors as soon as they
happen. Again, your HPCC must use Slurm to run an interactive job in
this manner.</p>
<div class="sourceCode" id="cb8"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a><span class="ex">srun</span> <span class="at">--partition</span><span class="op">=</span>node_name_here <span class="at">--mem</span><span class="op">=</span>400gb <span class="at">--cpus-per-task</span> 4 <span class="at">--ntasks</span> 1 <span class="at">--time</span> 08:00:00 <span class="at">--pty</span> bash <span class="at">-l</span></span>
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a><span class="co"># --cpus-per-task and --ntasks are not necessary</span></span>
<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a><span class="co"># --cpus-per-task is needed for multithreading</span></span></code></pre></div>
<p>Once the interactive job is running, we can use <strong>tmux</strong>
and <strong>nvim</strong> to start running through the
<code>DADA2</code> R script.</p>
<div class="sourceCode" id="cb9"><pre
class="sourceCode bash"><code class="sourceCode bash"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a><span class="ex">tmux</span> new <span class="at">-s</span> mysession <span class="co"># start new tmux session named mysession; do this if you have not started running tmux already</span></span>
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a><span class="ex">nvim</span> DADA2_tutorial_16S_pipeline.R <span class="co"># load R script using nvim</span></span></code></pre></div>
<center>
<img src="amplicon_workflow/nvim_openR.png" alt="Neovim display of the R script" />
</center>
<div align="center">
Figure 4a: Neovim Display of R script
</div>
<p><br /></p>
Almost immediately after the R script opens in nvim, I type
<code>\rf</code>. This will open another window showing your terminal.
You can toggle the horizontal versus vertical alignment of the windows
by typing <code>Ctrl-w shift-H</code> for a horizontal alignment or
<code>Ctrl-w shift-V</code> for a vertical alignment. Below is what the
screen should look like after typing <code>\rf</code> followed by
<code>Ctrl-w shift-H</code>. You can see the R script is open in the
left window, and my terminal is open in the right window.
<center>
<img src="amplicon_workflow/nvim_horizontal.png" alt="Neovim display with the R script and terminal windows side by side" />
</center>
<div align="center">
Figure 4b: View of Neovim Display after typing <code>\rf</code>
</div>
<p><br /> Now that we are in nvim, all you need to do to run a line of
code is to just hit the <code>space</code> bar! You can also toggle
between windows using <code>Ctrl-w w</code>, edit or type code by
pressing <code>i</code> to enter insert mode, and leave the editing mode by
pressing <code>esc</code>. To save changes to your R file and quit, just
type <code>:wq</code>, or to quit without saving changes to your file,
just type <code>:q!</code>. When using nvim, I keep Dr. Girke’s handy <a
href="https://girke.bioinformatics.ucr.edu/GEN242/tutorials/linux/linux/#basic-usage-of-nvim-r-tmux">tmux/nvim
tutorial</a> open as a reference just in case.</p>
<div id="load-the-path-fastq-files" class="section level3"
number="3.1.1">
<h3><span class="header-section-number">3.1.1</span> Load the path &amp;
FASTQ files</h3>
<p>We can start by loading the libraries we need as well as the path to
the sequences you want to analyze. In this example I will be analyzing
16S V3-V4 sequences, so I set the path object to be the path to those
specific sequences.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a><span class="fu">getwd</span>() <span class="co"># double check that we are in the correct directory, where our trimmed sequences are stored.</span></span>
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a></span>
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a><span class="fu">packageVersion</span>(<span class="st">&quot;dada2&quot;</span>) <span class="co"># see which version of DADA2 you have installed</span></span>
<span id="cb10-4"><a href="#cb10-4" tabindex="-1"></a></span>
<span id="cb10-5"><a href="#cb10-5" tabindex="-1"></a><span class="fu">suppressPackageStartupMessages</span>({ <span class="co"># load packages quietly</span></span>
<span id="cb10-6"><a href="#cb10-6" tabindex="-1"></a>  <span class="fu">library</span>(dada2)</span>
<span id="cb10-7"><a href="#cb10-7" tabindex="-1"></a>  <span class="fu">library</span>(tidyr)</span>
<span id="cb10-8"><a href="#cb10-8" tabindex="-1"></a>  <span class="fu">library</span>(ggpubr)</span>
<span id="cb10-9"><a href="#cb10-9" tabindex="-1"></a>  <span class="fu">library</span>(decontam)</span>
<span id="cb10-10"><a href="#cb10-10" tabindex="-1"></a>})</span>
<span id="cb10-11"><a href="#cb10-11" tabindex="-1"></a></span>
<span id="cb10-12"><a href="#cb10-12" tabindex="-1"></a>path <span class="ot">&lt;-</span> <span class="st">&quot;/path/to/fastq/files&quot;</span> <span class="co"># CHANGE ME to the directory containing the fastq files after </span></span>
<span id="cb10-13"><a href="#cb10-13" tabindex="-1"></a><span class="fu">list.files</span>(path)</span>
<span id="cb10-14"><a href="#cb10-14" tabindex="-1"></a></span>
<span id="cb10-15"><a href="#cb10-15" tabindex="-1"></a><span class="do">## Read in sample names</span></span>
<span id="cb10-16"><a href="#cb10-16" tabindex="-1"></a>fnFs <span class="ot">&lt;-</span> <span class="fu">sort</span>(<span class="fu">list.files</span>(path, <span class="at">pattern=</span><span class="st">&quot;_R1_clean.fastq&quot;</span>, <span class="at">full.names =</span> <span class="cn">TRUE</span>))</span>
<span id="cb10-17"><a href="#cb10-17" tabindex="-1"></a>fnFs <span class="co"># sanity check to see what the file names are</span></span>
<span id="cb10-18"><a href="#cb10-18" tabindex="-1"></a></span>
<span id="cb10-19"><a href="#cb10-19" tabindex="-1"></a>fnRs <span class="ot">&lt;-</span> <span class="fu">sort</span>(<span class="fu">list.files</span>(path, <span class="at">pattern=</span><span class="st">&quot;_R2_clean.fastq&quot;</span>, <span class="at">full.names =</span> <span class="cn">TRUE</span>)); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>) <span class="co"># saves all objects in global env.; runs after portion of code before &quot;;&quot;</span></span>
<span id="cb10-20"><a href="#cb10-20" tabindex="-1"></a></span>
<span id="cb10-21"><a href="#cb10-21" tabindex="-1"></a><span class="co"># Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq</span></span>
<span id="cb10-22"><a href="#cb10-22" tabindex="-1"></a>sample.names <span class="ot">&lt;-</span> <span class="fu">sapply</span>(<span class="fu">strsplit</span>(<span class="fu">basename</span>(fnFs), <span class="st">&quot;_R1&quot;</span>), <span class="st">`</span><span class="at">[</span><span class="st">`</span>, <span class="dv">1</span>) <span class="co">#pattern where you want to split the name; i.e., at the _R1 level</span></span>
<span id="cb10-23"><a href="#cb10-23" tabindex="-1"></a><span class="do">## want to split by R1 so that you do not get duplicate sample names for each R1/R2 paired-end sequences</span></span>
<span id="cb10-24"><a href="#cb10-24" tabindex="-1"></a><span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>) <span class="co"># save global env.</span></span>
<span id="cb10-25"><a href="#cb10-25" tabindex="-1"></a></span>
<span id="cb10-26"><a href="#cb10-26" tabindex="-1"></a>sample.names <span class="co"># sanity check</span></span></code></pre></div>
<p>If you have already started the <code>DADA2</code> workflow and want
to pick up from where you left off, then you can run this next code
chunk to load everything that was in your global environment and saved
to an .Rdata object (I will explain this code when we save our first
.Rdata file).</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a><span class="do">## If you are picking up where you left off, load your mydada_16S.V4.Rdata file now</span></span>
<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a><span class="fu">load</span>(<span class="st">&quot;/path/to/fastq/files/mydada_16S.V4.Rdata&quot;</span>)</span></code></pre></div>
</div>
</div>
<div id="check-sequence-quality" class="section level2" number="3.2">
<h2><span class="header-section-number">3.2</span> Check sequence
quality</h2>
<p>Though we have already done this with <code>FastQC</code>, there is a
step here to check the quality of our forward and reverse reads. These
per base sequence quality reports do look nicer than the
<code>FastQC</code> output and tell you the total number of reads in
that particular sample. We use <code>ggsave()</code> from the
<code>ggpubr</code> <a
href="https://rpkgs.datanovia.com/ggpubr/index.html">package</a> <span
class="citation">(Kassambara 2020)</span> to save these reports as high
quality PDFs.</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" tabindex="-1"></a>plot1<span class="ot">&lt;-</span><span class="fu">plotQualityProfile</span>(fnFs[<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>]) <span class="co"># check quality of Forward reads (2 samples)</span></span>
<span id="cb12-2"><a href="#cb12-2" tabindex="-1"></a>plot2<span class="ot">&lt;-</span><span class="fu">plotQualityProfile</span>(fnRs[<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>]) <span class="co"># check quality of Reverse reads (2 samples)</span></span>
<span id="cb12-3"><a href="#cb12-3" tabindex="-1"></a></span>
<span id="cb12-4"><a href="#cb12-4" tabindex="-1"></a><span class="fu">ggsave</span>(plot1,<span class="at">filename =</span> <span class="st">&quot;16S_pretrim_DADA2_F_quality.pdf&quot;</span>, <span class="at">width=</span><span class="dv">15</span>, <span class="at">height=</span><span class="dv">12</span>, <span class="at">dpi=</span><span class="dv">600</span>) </span>
<span id="cb12-5"><a href="#cb12-5" tabindex="-1"></a><span class="fu">ggsave</span>(plot2,<span class="at">filename =</span> <span class="st">&quot;16S_pretrim_DADA2_R_quality.pdf&quot;</span>, <span class="at">width=</span><span class="dv">15</span>, <span class="at">height=</span><span class="dv">12</span>, <span class="at">dpi=</span><span class="dv">600</span>) </span></code></pre></div>
<center>
<img src="amplicon_workflow/16S_pretrim_DADA2_F_quality.png" />
</center>
<div align="center">
Figure 5a: Per Base Sequence Quality Reports for 2 Samples (R1 Only)
</div>
<br>
<center>
<img src="amplicon_workflow/16S_pretrim_DADA2_R_quality.png" />
</center>
<div align="center">
Figure 5b: Per Base Sequence Quality Reports for 2 Samples (R2 Only)
</div>
<p><br></p>
</div>
<div id="filter-and-trim" class="section level2" number="3.3">
<h2><span class="header-section-number">3.3</span> Filter and Trim</h2>
<p>Now that we’ve set up our file paths and checked the quality of our
sequences, we can set them up for the <code>DADA2</code> filter and trim
step. First, we create objects that will hold the file names of filtered
sequences based on the sample names we have provided. Then the
<code>filterAndTrim</code> command will filter the reads based upon the
following: read quality, read length, the number of Ns (i.e., unknown
bases) in a read, the maximum number of expected errors after truncating
the reads, and whether or not reads in your sample match the PhiX genome
(i.e., a small virus genome used as a control in Illumina sequencing
runs; more information <a
href="https://dornsife.usc.edu/uscgenomecore/faq/">here</a>). The
maximum expected errors is calculated by solving for E in the Quality
Score equation (see above). We have also specified here that we would
like our output FASTQ files to be compressed, and that we can
multithread this filter and trimming process. Keep in mind that if you
are using a <strong>Windows</strong> machine to run your analyses locally,
then multithreading is not available for this step.</p>
<p>Some crucial things to consider at this step are your read lengths
(i.e., 2x250, 2x300), the locus and region(s) you’re sequencing, the per
base quality of your sequences, and whether you are using paired-end
reads or just forward reads.</p>
<p>For the <strong>ITS1</strong> and <strong>ITS2</strong> genes, their
lengths vary so wildly that truncating the sequences based on a specific
length is not recommended. However, for the <strong>16S gene</strong>,
its regions (i.e., V1-V9) are a bit more reliable in their average
lengths. The <strong>16S V4</strong> region varies between ~250 - 283
nucleotides in length <span class="citation">(Illumina, n.d.;
Vargas-Albores et al. 2017)</span>, whereas the <strong>V3</strong>
region varies between ~130 to 190 nucleotides <span
class="citation">(Vargas-Albores et al. 2017)</span>. The
<strong>V3-V4</strong> region ranges between ~400 - 470 nucleotides in
length. For more information, check out this <code>DADA2</code> GitHub
<a href="https://github.com/benjjneb/dada2/issues/1033">issue</a> as
well as <span class="citation">Rausch et al. (2019)</span> and <span
class="citation">Bukin et al. (2019)</span>.</p>
<p>Forward and reverse reads are merged if they have at least a <em>12
base pair</em> overlap. If you are using paired-end reads, then your
merged read lengths (considering the 12 nucleotide overlap) need to
total up to these region lengths. For example, let’s say you’re
truncating your 16S V3-V4 forward reads to 250 base pairs (bp) long and
your reverse reads to 160 bp long. If your reads are merged, the total
length will be 250 + (160-12) = 398 bp long. This total read length of
398 bp would be a decent minimum read length considering that the range
of the 16S V3-V4 region is ~400 - 470 bp.</p>
<p>Lastly, when setting your expected errors per forward and reverse
reads (<code>maxEE=c(R1,R2)</code>), it is important to consider the per
base sequence quality of your reads. Because reverse reads typically
have lower per base sequence quality than your forward reads, you may
want to relax the expected errors for your reverse reads.</p>
<p>If too few reads are surviving this step, then consider changing your
expected errors per read parameter or adjusting the
<code>truncLen</code> of your reads. Referring to your FastQC and
eestats2 reports may provide even more clarity for how you want to
define these parameters. For more information on the
<code>filterAndTrim</code> function, please view this <a
href="https://rdrr.io/github/benjjneb/dada2/man/filterAndTrim.html">documentation</a>.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" tabindex="-1"></a>path <span class="co"># double check that your path is correct</span></span>
<span id="cb13-2"><a href="#cb13-2" tabindex="-1"></a></span>
<span id="cb13-3"><a href="#cb13-3" tabindex="-1"></a><span class="co"># Create objects that will hold filtered file names in directory called &quot;Filtered&quot;</span></span>
<span id="cb13-4"><a href="#cb13-4" tabindex="-1"></a><span class="do">## these files will be created in the filter + trim step w/ filterAndTrim command</span></span>
<span id="cb13-5"><a href="#cb13-5" tabindex="-1"></a>filtFs <span class="ot">&lt;-</span> <span class="fu">file.path</span>(path, <span class="st">&quot;Filtered&quot;</span>, <span class="fu">paste0</span>(sample.names, <span class="st">&quot;_F_filtered.fastq.gz&quot;</span>))</span>
<span id="cb13-6"><a href="#cb13-6" tabindex="-1"></a>filtRs <span class="ot">&lt;-</span> <span class="fu">file.path</span>(path, <span class="st">&quot;Filtered&quot;</span>, <span class="fu">paste0</span>(sample.names, <span class="st">&quot;_R_filtered.fastq.gz&quot;</span>))</span>
<span id="cb13-7"><a href="#cb13-7" tabindex="-1"></a></span>
<span id="cb13-8"><a href="#cb13-8" tabindex="-1"></a><span class="co"># giving these file.name elements the names of the samples</span></span>
<span id="cb13-9"><a href="#cb13-9" tabindex="-1"></a><span class="fu">names</span>(filtFs) <span class="ot">&lt;-</span> sample.names</span>
<span id="cb13-10"><a href="#cb13-10" tabindex="-1"></a><span class="fu">names</span>(filtRs) <span class="ot">&lt;-</span> sample.names; <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb13-11"><a href="#cb13-11" tabindex="-1"></a></span>
<span id="cb13-12"><a href="#cb13-12" tabindex="-1"></a>filtFs <span class="co"># let&#39;s see what this object looks like</span></span>
<span id="cb13-13"><a href="#cb13-13" tabindex="-1"></a></span>
<span id="cb13-14"><a href="#cb13-14" tabindex="-1"></a><span class="co"># Filter &amp; Trim! </span></span>
<span id="cb13-15"><a href="#cb13-15" tabindex="-1"></a>out <span class="ot">&lt;-</span> <span class="fu">filterAndTrim</span>(fnFs, filtFs, fnRs, filtRs, <span class="at">truncLen=</span><span class="fu">c</span>(<span class="dv">250</span>,<span class="dv">235</span>),</span>
<span id="cb13-16"><a href="#cb13-16" tabindex="-1"></a>                     <span class="at">maxN=</span><span class="dv">0</span>, <span class="at">maxEE=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">3</span>), <span class="at">truncQ=</span><span class="dv">2</span>, <span class="at">rm.phix=</span><span class="cn">TRUE</span>,</span>
<span id="cb13-17"><a href="#cb13-17" tabindex="-1"></a>                     <span class="at">compress=</span><span class="cn">TRUE</span>, <span class="at">multithread=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>) <span class="co"># On Windows set multithread=FALSE</span></span>
<span id="cb13-18"><a href="#cb13-18" tabindex="-1"></a></span>
<span id="cb13-19"><a href="#cb13-19" tabindex="-1"></a><span class="co"># filterAndTrim notes:</span></span>
<span id="cb13-20"><a href="#cb13-20" tabindex="-1"></a><span class="do">## The maxEE parameter sets the maximum number of “expected errors” allowed in a read, which is a better filter than simply averaging quality scores.</span></span>
<span id="cb13-21"><a href="#cb13-21" tabindex="-1"></a><span class="do">## Standard filtering parameters: maxN=0 (DADA2 requires no Ns), truncQ=2, rm.phix=TRUE and maxEE=2.</span></span>
<span id="cb13-22"><a href="#cb13-22" tabindex="-1"></a><span class="co"># truncLen=c(240,230) -- trim F reads to 240 bp, trim R reads to 230 bp</span></span>
<span id="cb13-23"><a href="#cb13-23" tabindex="-1"></a><span class="do">## Notes for trunc length of 2x300 PE reads: https://github.com/benjjneb/dada2/issues/236</span></span>
<span id="cb13-24"><a href="#cb13-24" tabindex="-1"></a></span>
<span id="cb13-25"><a href="#cb13-25" tabindex="-1"></a><span class="fu">head</span>(out)</span>
<span id="cb13-26"><a href="#cb13-26" tabindex="-1"></a><span class="co"># * if you are only doing F reads, remove the &quot;truncLen&quot; command - truncLen=c(240,160) [for PE reads]</span></span>
<span id="cb13-27"><a href="#cb13-27" tabindex="-1"></a><span class="co"># sometimes there is a trimLeft=15 argument here, but I removed this because I already trimmed my sequences with bbduk</span></span></code></pre></div>
<div id="learn-the-error-rates" class="section level3" number="3.3.1">
<h3><span class="header-section-number">3.3.1</span> Learn the Error
Rates</h3>
<p>Dr. Callahan developed an algorithm for a parametric error model that
can use both inference and estimations to determine the error rates for
each sample. Here is the excerpt of Dr. Callahan describing this
function in his <code>DADA2</code> <a
href="https://benjjneb.github.io/dada2/tutorial.html">tutorial</a>.</p>
<blockquote>
<p>The DADA2 algorithm makes use of a parametric error model
(<code>err</code>) and every amplicon dataset has a different set of
error rates. The learnErrors method learns this error model from the
data, by alternating estimation of the error rates and inference of
sample composition until they converge on a jointly consistent solution.
As in many machine-learning problems, the algorithm must begin with an
initial guess, for which the maximum possible error rates in this data
are used (the error rates if only the most abundant sequence is correct
and all the rest are errors).</p>
</blockquote>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" tabindex="-1"></a>errF <span class="ot">&lt;-</span> <span class="fu">learnErrors</span>(filtFs, <span class="at">multithread=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb14-2"><a href="#cb14-2" tabindex="-1"></a>errR <span class="ot">&lt;-</span> <span class="fu">learnErrors</span>(filtRs, <span class="at">multithread=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb14-3"><a href="#cb14-3" tabindex="-1"></a><span class="co"># The learnErrors method learns this error model from the data by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution.</span></span>
<span id="cb14-4"><a href="#cb14-4" tabindex="-1"></a><span class="co"># As in many machine-learning (ML) problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors)</span></span>
<span id="cb14-5"><a href="#cb14-5" tabindex="-1"></a></span>
<span id="cb14-6"><a href="#cb14-6" tabindex="-1"></a>plot_error<span class="ot">&lt;-</span><span class="fu">plotErrors</span>(errF, <span class="at">nominalQ=</span><span class="cn">TRUE</span>)<span class="do">## sanity check by visualizing estimated error rates -- should see error rates drop w/ increased quality</span></span>
<span id="cb14-7"><a href="#cb14-7" tabindex="-1"></a><span class="fu">ggsave</span>(plot_error,<span class="at">filename =</span> <span class="st">&quot;16S_errormodel_DADA2.pdf&quot;</span>, <span class="at">width=</span><span class="dv">15</span>, <span class="at">height=</span><span class="dv">15</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
<center>
<img src="amplicon_workflow/16S_errormodel_DADA2.png" />
</center>
<div align="center">
Figure 6: Estimated Error Rates for Forward Reads (DADA2 Error Model)
</div>
<p><br></p>
<p>Once we have constructed the error model from the reads, we can plot
the observed frequency of each transition from one base to another (e.g.,
A2C indicates an A that was read as a C) as a function of the consensus
quality score at that position in the read. The individual points in black
represent the observed error rates for each consensus quality score. The
black line shows estimated error rates after convergence of the ML
algorithm, and the red line shows error rates expected under the nominal
definition of the Q-score. One thing we notice is that as consensus
quality score increases, the error rates (black lines) decrease, which
is expected.</p>
<p>If it seems like we are doing a lot of sanity checks throughout this
workflow, it’s because we are! This process can take a while and require
some troubleshooting, so it’s good to constantly check your work as you
make your way through the workflow. Coding without sanity checks is
never recommended.</p>
</div>
<div id="identify-asvs-in-reads" class="section level3" number="3.3.2">
<h3><span class="header-section-number">3.3.2</span> Identify ASVs in
Reads</h3>
<p>Before we run the Divisive Amplicon Denoising Algorithm
(<code>dada()</code>), we have to remove the files from our
<code>filtFs/filtRs</code> objects that were dropped in the filtering
step. The algorithm will not work if we include file names of files that
do not actually exist in the <strong>Filtered</strong> directory.</p>
<p>Now it’s time to use the <code>dada</code> algorithm to infer our
amplicon sequence variants (ASVs) from our sequences. For more
information on how the Divisive Amplicon Denoising Algorithm works,
please see <span class="citation">Callahan et al. (2016)</span> and
<span class="citation">Rosen et al. (2012)</span>. To increase the
signal of sequence variants with very low abundance across samples, you
can choose to pool (<code>pool=TRUE</code>) or pseudo-pool
(<code>pool=&quot;pseudo&quot;</code>) your sample sequences together.</p>
<p>The output of the algorithm will be a <code>dada-class</code> object,
containing the number of ASVs inferred out of the number of total input
sequences.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" tabindex="-1"></a>filtFs <span class="ot">&lt;-</span> filtFs[<span class="fu">file.exists</span>(filtFs)] <span class="co"># removes files that were not included in output because 0 reads passed filter step</span></span>
<span id="cb15-2"><a href="#cb15-2" tabindex="-1"></a>filtRs <span class="ot">&lt;-</span> filtRs[<span class="fu">file.exists</span>(filtRs)]</span>
<span id="cb15-3"><a href="#cb15-3" tabindex="-1"></a>dadaFs <span class="ot">&lt;-</span> <span class="fu">dada</span>(filtFs, <span class="at">err=</span>errF, <span class="at">multithread=</span><span class="cn">TRUE</span>, <span class="at">pool=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>) <span class="co"># pseudo pooling is computationally more efficient but similar in results to pooling; pool = True will pool samples together before sample inference</span></span>
<span id="cb15-4"><a href="#cb15-4" tabindex="-1"></a>dadaRs <span class="ot">&lt;-</span> <span class="fu">dada</span>(filtRs, <span class="at">err=</span>errR, <span class="at">multithread=</span><span class="cn">TRUE</span>, <span class="at">pool=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb15-5"><a href="#cb15-5" tabindex="-1"></a></span>
<span id="cb15-6"><a href="#cb15-6" tabindex="-1"></a>dadaFs[<span class="dv">1</span>] <span class="co"># Returns first section of dada-class object {one sample}</span></span></code></pre></div>
<p>The wonderful thing about ASVs is that because they are resolved down
to single-nucleotide differences (rather than clustered at a fixed
sequence identity threshold), they are true representatives of
biological sequences and thus directly comparable across workflows <span
class="citation">(Prodan et al. 2020; Callahan, McMurdie, and Holmes
2017)</span>. I highly recommend reading these papers for more
information on the benefits of using ASVs/ESVs/zOTUs.</p>
</div>
</div>
<div id="merge-forward-reverse-reads" class="section level2"
number="3.4">
<h2><span class="header-section-number">3.4</span> Merge Forward +
Reverse Reads</h2>
<p>At this point in the workflow, we are now going to merge our denoised
Forward and Reverse reads to get our contiguous sequences (i.e.,
contigs). Sequences will be merged if they share at least 12
nucleotides. These sequences must be identical to each other in these
overlapping regions or else they will not be merged.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" tabindex="-1"></a>mergers <span class="ot">&lt;-</span> <span class="fu">mergePairs</span>(dadaFs, filtFs, dadaRs, filtRs, <span class="at">verbose=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb16-2"><a href="#cb16-2" tabindex="-1"></a><span class="fu">head</span>(mergers[[<span class="dv">1</span>]])</span></code></pre></div>
<p>The <code>mergers</code> object is a list of <code>data.frame</code>s
(one per sample), each containing the merged sequence, its abundance, and
some statistics about
the sequences themselves. If most of your reads do not merge, then you
should revisit the filter and trimming step. It could be that you cut
too much off of your sequencing reads, making it more difficult to
successfully merge your reads.</p>
</div>
<div id="create-sequence-table-remove-chimeras" class="section level2"
number="3.5">
<h2><span class="header-section-number">3.5</span> Create Sequence Table
&amp; Remove Chimeras</h2>
<p>Let’s make an ASV table (similar to an OTU table but using ASVs),
which will have our samples as rows and our ASVs as columns.</p>
<p>Using this table, we get a sense of how long our ASVs are and the
distribution of these ASV lengths. We can also determine the percentage
of reads that fall within our desired range of region lengths.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" tabindex="-1"></a>seqtab <span class="ot">&lt;-</span> <span class="fu">makeSequenceTable</span>(mergers); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb17-2"><a href="#cb17-2" tabindex="-1"></a><span class="fu">dim</span>(seqtab)</span>
<span id="cb17-3"><a href="#cb17-3" tabindex="-1"></a><span class="fu">table</span>(<span class="fu">nchar</span>(<span class="fu">getSequences</span>(seqtab)))</span>
<span id="cb17-4"><a href="#cb17-4" tabindex="-1"></a></span>
<span id="cb17-5"><a href="#cb17-5" tabindex="-1"></a><span class="co"># we can filter out ASVs that are not within our target range of lengths</span></span>
<span id="cb17-6"><a href="#cb17-6" tabindex="-1"></a>seqtab2 <span class="ot">&lt;-</span> seqtab[,<span class="fu">nchar</span>(<span class="fu">colnames</span>(seqtab)) <span class="sc">%in%</span> <span class="dv">250</span><span class="sc">:</span><span class="dv">290</span>] <span class="co"># here looking at ASVs that are between 250 - 290 nucleotides long</span></span>
<span id="cb17-7"><a href="#cb17-7" tabindex="-1"></a><span class="fu">dim</span>(seqtab2)</span>
<span id="cb17-8"><a href="#cb17-8" tabindex="-1"></a><span class="fu">table</span>(<span class="fu">nchar</span>(<span class="fu">getSequences</span>(seqtab2)))</span>
<span id="cb17-9"><a href="#cb17-9" tabindex="-1"></a></span>
<span id="cb17-10"><a href="#cb17-10" tabindex="-1"></a><span class="co"># how many reads fall within our desired length range</span></span>
<span id="cb17-11"><a href="#cb17-11" tabindex="-1"></a><span class="fu">sum</span>(seqtab2)<span class="sc">/</span><span class="fu">sum</span>(seqtab) </span>
<span id="cb17-12"><a href="#cb17-12" tabindex="-1"></a></span>
<span id="cb17-13"><a href="#cb17-13" tabindex="-1"></a><span class="co"># Look at merged sequences in a plot -- see their distribution and frequency of sequences of certain length</span></span>
<span id="cb17-14"><a href="#cb17-14" tabindex="-1"></a><span class="do">## x axis - total number of reads per sample; y axis - density of samples w/ specific # of total reads</span></span>
<span id="cb17-15"><a href="#cb17-15" tabindex="-1"></a>compare_reads_plot1 <span class="ot">=</span> <span class="fu">ggdensity</span>(<span class="fu">rowSums</span>(seqtab), <span class="at">fill =</span> <span class="st">&quot;blue4&quot;</span>, <span class="at">alpha =</span> <span class="fl">0.7</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb17-16"><a href="#cb17-16" tabindex="-1"></a>compare_reads_plot2 <span class="ot">=</span> <span class="fu">ggdensity</span>(<span class="fu">rowSums</span>(seqtab2), <span class="at">fill =</span> <span class="st">&quot;red4&quot;</span>, <span class="at">alpha =</span> <span class="fl">0.7</span>)</span>
<span id="cb17-17"><a href="#cb17-17" tabindex="-1"></a>comp_plots<span class="ot">&lt;-</span><span class="fu">ggarrange</span>(compare_reads_plot1, compare_reads_plot2,<span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;All Reads&quot;</span>, <span class="st">&quot;Reads of Desired Length&quot;</span>),<span class="at">ncol=</span><span class="dv">1</span>, <span class="at">nrow=</span><span class="dv">2</span>)</span>
<span id="cb17-18"><a href="#cb17-18" tabindex="-1"></a><span class="fu">ggsave</span>(comp_plots,<span class="at">filename =</span> <span class="st">&quot;16S.V4_compare_total_reads.pdf&quot;</span>, <span class="at">width=</span><span class="dv">10</span>, <span class="at">height=</span><span class="dv">20</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span>
<span id="cb17-19"><a href="#cb17-19" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
</div>
<div id="remove-chimeras" class="section level2" number="3.6">
<h2><span class="header-section-number">3.6</span> Remove Chimeras</h2>
<p>Now we are going to remove all chimeric sequences from our data.
<strong>Chimeras</strong> are the result of two or more biological
sequences incorrectly joining together. This is often a result of PCR
for a number of reasons. In <code>DADA2</code> specifically, chimeras
are identified if they can be reconstructed from right and left segments
from two or more “parent” sequences. The object
<code>seqtab.nochim</code> will be our sequence table with the chimeras
removed. Most of your reads should not be removed during this step.
However, according to the <code>DADA2</code> <a
href="https://benjjneb.github.io/dada2/faq.html">FAQ page</a> if you
have more than 25% of your reads removed, then it is likely that primers
are still attached to your sequences. Be sure to remove these primers in
the trimming step with either <code>bbduk</code>, <code>cutadapt</code>,
or <code>trimmomatic</code> and begin the workflow again.</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" tabindex="-1"></a>seqtab.nochim <span class="ot">&lt;-</span> <span class="fu">removeBimeraDenovo</span>(seqtab2, <span class="at">method=</span><span class="st">&quot;consensus&quot;</span>, <span class="at">multithread=</span><span class="cn">TRUE</span>, <span class="at">verbose=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb18-2"><a href="#cb18-2" tabindex="-1"></a><span class="co"># Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences</span></span>
<span id="cb18-3"><a href="#cb18-3" tabindex="-1"></a><span class="fu">dim</span>(seqtab.nochim)</span>
<span id="cb18-4"><a href="#cb18-4" tabindex="-1"></a><span class="fu">dim</span>(seqtab)</span>
<span id="cb18-5"><a href="#cb18-5" tabindex="-1"></a><span class="fu">dim</span>(seqtab2)</span>
<span id="cb18-6"><a href="#cb18-6" tabindex="-1"></a></span>
<span id="cb18-7"><a href="#cb18-7" tabindex="-1"></a><span class="fu">sum</span>(seqtab.nochim)<span class="sc">/</span><span class="fu">sum</span>(seqtab) <span class="co"># comparing reads after chimera removal over total reads (after filtering)</span></span>
<span id="cb18-8"><a href="#cb18-8" tabindex="-1"></a><span class="fu">sum</span>(seqtab.nochim)<span class="sc">/</span><span class="fu">sum</span>(seqtab2) <span class="co"># comparing reads after chimera removal over reads that are our desired length</span></span></code></pre></div>
</div>
<div id="track-the-reads---sanity-check" class="section level2"
number="3.7">
<h2><span class="header-section-number">3.7</span> Track the Reads -
Sanity Check</h2>
<p>Time for a sanity check to see how many reads we have at this point
in our workflow. This is a great place to see if we have lost any reads,
and at which steps they were lost - which can really help us determine
if we trimmed our reads to the appropriate length. If a lot of reads are
lost, it is recommended to check if primers and adapters are still
attached to your sequences, and the truncation length of your sequences
for the filter and trimming step.</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" tabindex="-1"></a>getN <span class="ot">&lt;-</span> <span class="cf">function</span>(x) <span class="fu">sum</span>(<span class="fu">getUniques</span>(x)) <span class="co"># function to get the total number of reads per object (sum of the abundances of its unique sequences)</span></span>
<span id="cb19-2"><a href="#cb19-2" tabindex="-1"></a>track <span class="ot">&lt;-</span> <span class="fu">cbind</span>(out, <span class="fu">sapply</span>(dadaFs, getN), <span class="fu">sapply</span>(dadaRs, getN), <span class="fu">sapply</span>(mergers, getN), <span class="fu">rowSums</span>(seqtab.nochim)); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb19-3"><a href="#cb19-3" tabindex="-1"></a><span class="co"># If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs), and do the same for sapply(dadaRs, getN) and sapply(mergers, getN)</span></span>
<span id="cb19-4"><a href="#cb19-4" tabindex="-1"></a><span class="fu">head</span>(track)</span>
<span id="cb19-5"><a href="#cb19-5" tabindex="-1"></a></span>
<span id="cb19-6"><a href="#cb19-6" tabindex="-1"></a><span class="fu">colnames</span>(track) <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;input&quot;</span>, <span class="st">&quot;filtered&quot;</span>, <span class="st">&quot;denoisedF&quot;</span>, <span class="st">&quot;denoisedR&quot;</span>, <span class="st">&quot;merged&quot;</span>, <span class="st">&quot;nonchim&quot;</span>) <span class="co"># remove whichever labels you didn&#39;t include</span></span>
<span id="cb19-7"><a href="#cb19-7" tabindex="-1"></a><span class="fu">rownames</span>(track) <span class="ot">&lt;-</span> sample.names</span>
<span id="cb19-8"><a href="#cb19-8" tabindex="-1"></a><span class="fu">head</span>(track)</span>
<span id="cb19-9"><a href="#cb19-9" tabindex="-1"></a></span>
<span id="cb19-10"><a href="#cb19-10" tabindex="-1"></a><span class="fu">write.table</span>(track,<span class="st">&quot;16S_tracking_reads_dada2.txt&quot;</span>,<span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>,<span class="at">col.names=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span></code></pre></div>
</div>
<div id="assign-taxonomy-to-asvs" class="section level2" number="3.8">
<h2><span class="header-section-number">3.8</span> Assign Taxonomy to
ASVs</h2>
<p>Now it’s time to assign taxonomic identities to our ASVs.
Dr. Callahan utilizes the <a
href="https://pubmed.ncbi.nlm.nih.gov/17586664/">naive Bayesian
classifier</a> in his <code>assignTaxonomy</code> function, which is the
same classifier used by the <a href="https://rdp.cme.msu.edu/">Ribosomal
Database Project</a> (RDP) for taxonomic assignment. For more
information on how this classifier works, please read <span
class="citation">Wang et al. (2007)</span>.</p>
<p>In order to assign the taxonomic IDs to our ASVs, we need to have a
reference database FASTA file to use as our known training data. These
training data with known references will help the classifier determine
which taxa our ASVs belong to. Currently the options for reference
databases include the latest versions of the <strong>Silva</strong>
database (for 16S), the <strong>Ribosomal Database Project</strong>
database (for 16S), and the <strong>UNITE</strong> database (which
should be used specifically for ITS sequences). A <strong>Green
Genes</strong> database file is also included (for 16S analyses), but
Green Genes has not been updated in a long time and thus is not the best
choice for our reference training dataset. Dr. Callahan has included
these reference database files <a
href="https://benjjneb.github.io/dada2/training.html">here</a>.</p>
<p>The <code>assignTaxonomy()</code> command has multiple arguments that
can be adjusted as described <a
href="https://rdrr.io/bioc/dada2/man/assignTaxonomy.html">here</a>. One
specific argument is <code>tryRC</code>, which is an option to try the
reverse complement of your sequences to query against your database of
choice. If <code>tryRC=TRUE</code>, then the reverse complement of the
query sequence will be used for taxonomic classification of the ASVs if
it is a better match to the reference than the forward (or original)
sequence. By default <code>tryRC=FALSE</code>, however <strong>if you
are analyzing ITS2 sequences and used the 5.8S-Fun/ITS4-Fun primer set
created by <span class="citation">Taylor et al. (2016)</span></strong>,
you must set <code>tryRC=TRUE</code>. This is because the “Illumina
forward adaptor and barcodes were added to the ITS4-Fun primer rather
than the 5.8S-Fun primer to avoid excessive hairpin formation” <span
class="citation">(Taylor et al. 2016)</span>. That means that your F and
R reads are in reverse orientation, and that the reverse complement of
these reads should be used when comparing these reads to the UNITE
database. Thank you to <a
href="https://scholar.google.com/citations?user=EqvoNK4AAAAJ&amp;hl=en">Dr. Fabiola
Pulido-Chavez</a> for sharing this helpful info with me!</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" tabindex="-1"></a>taxa <span class="ot">&lt;-</span> <span class="fu">assignTaxonomy</span>(seqtab.nochim, <span class="st">&quot;/bigdata/aronsonlab/shared/DADA2_Silva_Files/silva_nr99_v138.1_wSpecies_train_set.fa.gz&quot;</span>, <span class="at">multithread=</span><span class="cn">TRUE</span>); <span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span>
<span id="cb20-2"><a href="#cb20-2" tabindex="-1"></a></span>
<span id="cb20-3"><a href="#cb20-3" tabindex="-1"></a><span class="co"># ITS2 w/ Taylor et al 2016 5.8S-Fun/ITS4-Fun primers</span></span>
<span id="cb20-4"><a href="#cb20-4" tabindex="-1"></a><span class="co"># taxa &lt;- assignTaxonomy(seqtab.nochim, &quot;/bigdata/aronsonlab/shared/DADA2_Silva_Files/silva_nr99_v138.1_wSpecies_train_set.fa.gz&quot;, multithread=TRUE,tryRC=TRUE); save.image(file = &quot;mydada_16S.V4.Rdata&quot;)</span></span>
<span id="cb20-5"><a href="#cb20-5" tabindex="-1"></a></span>
<span id="cb20-6"><a href="#cb20-6" tabindex="-1"></a>taxa.print <span class="ot">&lt;-</span> taxa <span class="co"># Removing sequence rownames for display only</span></span>
<span id="cb20-7"><a href="#cb20-7" tabindex="-1"></a><span class="fu">rownames</span>(taxa.print) <span class="ot">&lt;-</span> <span class="cn">NULL</span></span>
<span id="cb20-8"><a href="#cb20-8" tabindex="-1"></a>taxa.print<span class="ot">&lt;-</span><span class="fu">as.data.frame</span>(<span class="fu">apply</span>(taxa.print,<span class="dv">2</span>, <span class="cf">function</span>(x) <span class="fu">gsub</span>(<span class="st">&quot;[^.]__&quot;</span>, <span class="st">&quot;&quot;</span>, x))) <span class="co"># remove leading letters and __ with gsub</span></span>
<span id="cb20-9"><a href="#cb20-9" tabindex="-1"></a><span class="fu">head</span>(taxa.print)</span></code></pre></div>
<p>If you are only seeing taxonomic identification at the Phylum level
(i.e., the rest of the columns are filled with <code>NAs</code>), then
this could indicate that we have not trimmed our sequences correctly.
For example, let’s say we have reads that are 300bp long (from 2x300 PE
sequencing), but we are interested in the 16S V3 region which ranges
from ~ 130-190 nucleotides in length. If we have not trimmed our
sequences down to our desired region length (here 130-190 nucleotides),
then our merged reads are no longer reliable, and the classifier will
incorrectly identify our ASVs. Remember, the reads need to have at least
a 12 nucleotide overlap to merge - so if we are not trimming our reads
correctly, we could create merged sequences that are not accurate
representations of the regions we are trying to identify, which will
hurt us in the taxonomic assignment step.</p>
</div>
<div id="save-dada2-output-for-future-analysis" class="section level2"
number="3.9">
<h2><span class="header-section-number">3.9</span> Save DADA2 Output for
future analysis</h2>
<p>We have finished the <code>DADA2</code> portion of the workflow! We
can save the output from <code>DADA2</code> as R objects, text files,
and tsv files for future import into R.</p>
<p>First we create a vector of the ASV labels called
<code>asv_headers</code> that we will use to make the ASV IDs easier to
read. We then use a for loop to add an “ASV” prefix to our
<code>asv_headers</code> so that they are easily identifiable by an ASV
number instead of just a number to represent each ASV. We then combine
<code>asv_headers</code> with the sequences themselves to make an object
called <code>asv_fasta</code>, which now holds our ASV sequences and
their respective IDs.</p>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" tabindex="-1"></a><span class="co"># giving our seq headers more manageable names (ASV_1, ASV_2...)</span></span>
<span id="cb21-2"><a href="#cb21-2" tabindex="-1"></a>asv_seqs <span class="ot">&lt;-</span> <span class="fu">colnames</span>(seqtab.nochim)</span>
<span id="cb21-3"><a href="#cb21-3" tabindex="-1"></a>asv_headers <span class="ot">&lt;-</span> <span class="fu">vector</span>(<span class="fu">dim</span>(seqtab.nochim)[<span class="dv">2</span>], <span class="at">mode=</span><span class="st">&quot;character&quot;</span>)</span>
<span id="cb21-4"><a href="#cb21-4" tabindex="-1"></a><span class="fu">head</span>(seqtab.nochim)</span>
<span id="cb21-5"><a href="#cb21-5" tabindex="-1"></a><span class="fu">head</span>(asv_headers)</span>
<span id="cb21-6"><a href="#cb21-6" tabindex="-1"></a></span>
<span id="cb21-7"><a href="#cb21-7" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span><span class="fu">dim</span>(seqtab.nochim)[<span class="dv">2</span>]) {</span>
<span id="cb21-8"><a href="#cb21-8" tabindex="-1"></a>  asv_headers[i] <span class="ot">&lt;-</span> <span class="fu">paste</span>(<span class="st">&quot;&gt;ASV&quot;</span>, i, <span class="at">sep=</span><span class="st">&quot;_&quot;</span>)</span>
<span id="cb21-9"><a href="#cb21-9" tabindex="-1"></a>}</span>
<span id="cb21-10"><a href="#cb21-10" tabindex="-1"></a>asv_fasta <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="fu">rbind</span>(asv_headers, asv_seqs))</span></code></pre></div>
<p>Now we can save our ASV count table, our ASV taxonomy table, and the
ASV sequences themselves as separate files and R objects. I wanted to
provide several file options because some people have a preference as to
how they import data into R.</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" tabindex="-1"></a><span class="co"># making and writing out a fasta file of our final ASV seqs w/ ASV IDs:</span></span>
<span id="cb22-2"><a href="#cb22-2" tabindex="-1"></a><span class="fu">write</span>(asv_fasta, <span class="st">&quot;16S_ASVs_dada2.fa&quot;</span>) <span class="co"># write fasta file</span></span>
<span id="cb22-3"><a href="#cb22-3" tabindex="-1"></a><span class="fu">write.table</span>(asv_fasta,<span class="st">&quot;16S_ASVs_dada2.txt&quot;</span>,<span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>,<span class="at">col.names=</span><span class="cn">TRUE</span>)</span>
<span id="cb22-4"><a href="#cb22-4" tabindex="-1"></a></span>
<span id="cb22-5"><a href="#cb22-5" tabindex="-1"></a><span class="co"># ASV count table:</span></span>
<span id="cb22-6"><a href="#cb22-6" tabindex="-1"></a>asv_counts <span class="ot">&lt;-</span> <span class="fu">t</span>(seqtab.nochim)</span>
<span id="cb22-7"><a href="#cb22-7" tabindex="-1"></a><span class="fu">row.names</span>(asv_counts) <span class="ot">&lt;-</span> <span class="fu">sub</span>(<span class="st">&quot;&gt;&quot;</span>, <span class="st">&quot;&quot;</span>, asv_headers)</span>
<span id="cb22-8"><a href="#cb22-8" tabindex="-1"></a></span>
<span id="cb22-9"><a href="#cb22-9" tabindex="-1"></a><span class="co"># For Vegan format: sample IDs as rows, ASVs as columns</span></span>
<span id="cb22-10"><a href="#cb22-10" tabindex="-1"></a>asv_tab<span class="ot">&lt;-</span><span class="fu">t</span>(asv_counts)</span>
<span id="cb22-11"><a href="#cb22-11" tabindex="-1"></a><span class="fu">write.table</span>(asv_tab, <span class="st">&quot;16S.V4_ASVs_Table_dada2.tsv&quot;</span>, <span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>, <span class="at">quote=</span>F, <span class="at">col.names=</span><span class="cn">NA</span>)</span>
<span id="cb22-12"><a href="#cb22-12" tabindex="-1"></a><span class="fu">write.table</span>(asv_tab,<span class="st">&quot;16S.V4_ASVs_Table_dada2.txt&quot;</span>,<span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>,<span class="at">col.names=</span><span class="cn">TRUE</span>)</span>
<span id="cb22-13"><a href="#cb22-13" tabindex="-1"></a><span class="co"># For Phyloseq format: ASVs as row IDs, sample IDs as columns</span></span>
<span id="cb22-14"><a href="#cb22-14" tabindex="-1"></a><span class="fu">write.table</span>(asv_counts, <span class="st">&quot;16S.V4_ASVs_Counts_dada2.tsv&quot;</span>, <span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>, <span class="at">quote=</span>F, <span class="at">col.names=</span><span class="cn">NA</span>)</span>
<span id="cb22-15"><a href="#cb22-15" tabindex="-1"></a><span class="fu">write.table</span>(asv_counts,<span class="st">&quot;16S.V4_ASVs_Counts_dada2.txt&quot;</span>,<span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>,<span class="at">col.names=</span><span class="cn">TRUE</span>)</span>
<span id="cb22-16"><a href="#cb22-16" tabindex="-1"></a></span>
<span id="cb22-17"><a href="#cb22-17" tabindex="-1"></a><span class="co"># taxa ID table:</span></span>
<span id="cb22-18"><a href="#cb22-18" tabindex="-1"></a>asv_tax <span class="ot">&lt;-</span> taxa</span>
<span id="cb22-19"><a href="#cb22-19" tabindex="-1"></a><span class="fu">row.names</span>(asv_tax) <span class="ot">&lt;-</span> <span class="fu">sub</span>(<span class="st">&quot;&gt;&quot;</span>, <span class="st">&quot;&quot;</span>, asv_headers)</span>
<span id="cb22-20"><a href="#cb22-20" tabindex="-1"></a><span class="fu">write.table</span>(asv_tax, <span class="st">&quot;16S.V4_ASVs_Taxonomy_dada2.tsv&quot;</span>, <span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>, <span class="at">quote=</span>F, <span class="at">col.names=</span><span class="cn">NA</span>)</span>
<span id="cb22-21"><a href="#cb22-21" tabindex="-1"></a><span class="fu">write.table</span>(asv_tax,<span class="st">&quot;16S.V4_ASVs_Taxonomy_dada2.txt&quot;</span>,<span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>,<span class="at">col.names=</span><span class="cn">TRUE</span>)</span>
<span id="cb22-22"><a href="#cb22-22" tabindex="-1"></a></span>
<span id="cb22-23"><a href="#cb22-23" tabindex="-1"></a><span class="do">#### Save all ASV objects as R objects ####</span></span>
<span id="cb22-24"><a href="#cb22-24" tabindex="-1"></a><span class="fu">saveRDS</span>(asv_tax, <span class="at">file =</span> <span class="st">&quot;16S.V4_ASVs_Taxonomy_dada2_Robject.rds&quot;</span>, <span class="at">ascii =</span> <span class="cn">FALSE</span>, <span class="at">version =</span> <span class="cn">NULL</span>,</span>
<span id="cb22-25"><a href="#cb22-25" tabindex="-1"></a>        <span class="at">compress =</span> <span class="cn">TRUE</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>)</span>
<span id="cb22-26"><a href="#cb22-26" tabindex="-1"></a><span class="fu">saveRDS</span>(asv_tab, <span class="at">file =</span> <span class="st">&quot;16S.V4_ASVs_Counts_dada2_Robject.rds&quot;</span>, <span class="at">ascii =</span> <span class="cn">FALSE</span>, <span class="at">version =</span> <span class="cn">NULL</span>,</span>
<span id="cb22-27"><a href="#cb22-27" tabindex="-1"></a>        <span class="at">compress =</span> <span class="cn">TRUE</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>)</span>
<span id="cb22-28"><a href="#cb22-28" tabindex="-1"></a><span class="fu">saveRDS</span>(asv_fasta, <span class="at">file =</span> <span class="st">&quot;16S.V4_ASV_Sequences_dada2_Robject.rds&quot;</span>, <span class="at">ascii =</span> <span class="cn">FALSE</span>, <span class="at">version =</span> <span class="cn">NULL</span>,</span>
<span id="cb22-29"><a href="#cb22-29" tabindex="-1"></a>        <span class="at">compress =</span> <span class="cn">TRUE</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>)</span>
<span id="cb22-30"><a href="#cb22-30" tabindex="-1"></a></span>
<span id="cb22-31"><a href="#cb22-31" tabindex="-1"></a><span class="do">#### Save everything from global environment into .Rdata file</span></span>
<span id="cb22-32"><a href="#cb22-32" tabindex="-1"></a><span class="fu">save.image</span>(<span class="at">file =</span> <span class="st">&quot;mydada_16S.V4.Rdata&quot;</span>)</span></code></pre></div>
<p>Personally, I like using R objects (file extension .rds) in my
analyses. In order to import R objects into R, you can run
<code>data.frame(readRDS("path/to/Robject.rds", refhook = NULL))</code>
to create a data frame holding the contents of your R object file.</p>
</div>
</div>
<div id="statistical-analysis" class="section level1" number="4">
<h1><span class="header-section-number">4</span> Statistical
Analysis</h1>
<p>At this point you should have R objects, text files, and/or
tsv files containing the following: 1. your ASV sequences in FASTA
format, 2. your ASV count table, and 3. your ASV taxonomy table. You
should also have some metadata for your samples that will allow for
deeper investigation into your microbial data.</p>
<div id="import-and-prepare-data-for-analyses" class="section level2"
number="4.1">
<h2><span class="header-section-number">4.1</span> Import and Prepare
Data for Analyses</h2>
<p>First before we import any data, let’s make sure that we are in the
right directory (where our <code>DADA2</code> files are stored) and that
we have all of the necessary R libraries loaded.</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" tabindex="-1"></a><span class="fu">getwd</span>() <span class="co"># use setwd(&quot;path/to/files&quot;) if you are not in the right directory</span></span>
<span id="cb23-2"><a href="#cb23-2" tabindex="-1"></a><span class="fu">suppressPackageStartupMessages</span>({ <span class="co"># load packages quietly</span></span>
<span id="cb23-3"><a href="#cb23-3" tabindex="-1"></a>  <span class="fu">library</span>(phyloseq)</span>
<span id="cb23-4"><a href="#cb23-4" tabindex="-1"></a>  <span class="fu">library</span>(ggplot2)</span>
<span id="cb23-5"><a href="#cb23-5" tabindex="-1"></a>  <span class="fu">library</span>(vegan)</span>
<span id="cb23-6"><a href="#cb23-6" tabindex="-1"></a>  <span class="fu">library</span>(dendextend)</span>
<span id="cb23-7"><a href="#cb23-7" tabindex="-1"></a>  <span class="fu">library</span>(ggpubr)</span>
<span id="cb23-8"><a href="#cb23-8" tabindex="-1"></a>  <span class="fu">library</span>(scales)</span>
<span id="cb23-9"><a href="#cb23-9" tabindex="-1"></a>  <span class="fu">library</span>(grid)</span>
<span id="cb23-10"><a href="#cb23-10" tabindex="-1"></a>  <span class="fu">library</span>(ape)</span>
<span id="cb23-11"><a href="#cb23-11" tabindex="-1"></a>  <span class="fu">library</span>(plyr)</span>
<span id="cb23-12"><a href="#cb23-12" tabindex="-1"></a>  <span class="fu">library</span>(dplyr)</span>
<span id="cb23-13"><a href="#cb23-13" tabindex="-1"></a>  <span class="fu">library</span>(readxl)</span>
<span id="cb23-14"><a href="#cb23-14" tabindex="-1"></a>  <span class="fu">library</span>(dplyr)</span>
<span id="cb23-15"><a href="#cb23-15" tabindex="-1"></a>  <span class="fu">library</span>(tidyr)</span>
<span id="cb23-16"><a href="#cb23-16" tabindex="-1"></a>  <span class="fu">library</span>(reshape)</span>
<span id="cb23-17"><a href="#cb23-17" tabindex="-1"></a>  <span class="fu">library</span>(reshape2)</span>
<span id="cb23-18"><a href="#cb23-18" tabindex="-1"></a>  <span class="fu">library</span>(shades)</span>
<span id="cb23-19"><a href="#cb23-19" tabindex="-1"></a>  <span class="fu">library</span>(microbiome)</span>
<span id="cb23-20"><a href="#cb23-20" tabindex="-1"></a>  <span class="fu">library</span>(devtools)</span>
<span id="cb23-21"><a href="#cb23-21" tabindex="-1"></a>  <span class="fu">library</span>(decontam)</span>
<span id="cb23-22"><a href="#cb23-22" tabindex="-1"></a>  <span class="fu">library</span>(pairwiseAdonis)</span>
<span id="cb23-23"><a href="#cb23-23" tabindex="-1"></a>  <span class="fu">library</span>(corrplot)</span>
<span id="cb23-24"><a href="#cb23-24" tabindex="-1"></a>})</span></code></pre></div>
<p>Now let’s import the <code>DADA2</code> output into R for some
statistical analyses. We will import our ASV count table, our ASV
taxonomic table, and our metadata for this dataset. We are also going to
create an object called <code>colorset1</code> to contain the color
labels for each of our categories. This will help us keep the colors
consistent for each category in all of our figures.</p>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" tabindex="-1"></a><span class="do">## Import bacterial ASV count data</span></span>
<span id="cb24-2"><a href="#cb24-2" tabindex="-1"></a>bac.ASV_counts<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="fu">readRDS</span>(<span class="st">&quot;16S.V4_MSH_ASVs_Counts_dada2_9.20.2021_Robject.rds&quot;</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>))</span>
<span id="cb24-3"><a href="#cb24-3" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_counts)</span>
<span id="cb24-4"><a href="#cb24-4" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_counts)</span>
<span id="cb24-5"><a href="#cb24-5" tabindex="-1"></a><span class="fu">colnames</span>(bac.ASV_counts)<span class="ot">&lt;-</span><span class="fu">gsub</span>(<span class="st">&quot;X1&quot;</span>, <span class="st">&quot;1&quot;</span>, <span class="fu">colnames</span>(bac.ASV_counts)) <span class="co"># shorten sample names to match sample names in metadata file</span></span>
<span id="cb24-6"><a href="#cb24-6" tabindex="-1"></a>bac.ASV_counts<span class="sc">$</span>ASV_ID<span class="ot">&lt;-</span><span class="fu">rownames</span>(bac.ASV_counts)</span>
<span id="cb24-7"><a href="#cb24-7" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_counts)</span>
<span id="cb24-8"><a href="#cb24-8" tabindex="-1"></a></span>
<span id="cb24-9"><a href="#cb24-9" tabindex="-1"></a><span class="do">## Import metadata</span></span>
<span id="cb24-10"><a href="#cb24-10" tabindex="-1"></a>metadata<span class="ot">&lt;-</span><span class="fu">as.data.frame</span>(<span class="fu">read_excel</span>(<span class="st">&quot;MSH_MappingFile_for_Workflow.xlsx&quot;</span>), <span class="at">header=</span><span class="cn">TRUE</span>)</span>
<span id="cb24-11"><a href="#cb24-11" tabindex="-1"></a><span class="fu">head</span>(metadata)</span>
<span id="cb24-12"><a href="#cb24-12" tabindex="-1"></a><span class="co">#metadata&lt;-na.omit(metadata) # drop NAs from metadata</span></span>
<span id="cb24-13"><a href="#cb24-13" tabindex="-1"></a><span class="fu">head</span>(metadata)</span>
<span id="cb24-14"><a href="#cb24-14" tabindex="-1"></a>metadata<span class="sc">$</span>SampleID<span class="ot">&lt;-</span><span class="fu">gsub</span>(<span class="st">&quot;(</span><span class="sc">\\</span><span class="st">_.*?)</span><span class="sc">\\</span><span class="st">_([0-9])&quot;</span>,<span class="st">&quot;</span><span class="sc">\\</span><span class="st">1.</span><span class="sc">\\</span><span class="st">2&quot;</span>, metadata<span class="sc">$</span>SampleID) <span class="co"># replace second _ with .</span></span>
<span id="cb24-15"><a href="#cb24-15" tabindex="-1"></a><span class="fu">rownames</span>(metadata)<span class="ot">&lt;-</span>metadata<span class="sc">$</span>SampleID</span>
<span id="cb24-16"><a href="#cb24-16" tabindex="-1"></a></span>
<span id="cb24-17"><a href="#cb24-17" tabindex="-1"></a><span class="co"># create color variable(s) to identify variables by colors</span></span>
<span id="cb24-18"><a href="#cb24-18" tabindex="-1"></a><span class="do">## color for Category</span></span>
<span id="cb24-19"><a href="#cb24-19" tabindex="-1"></a>colorset1 <span class="ot">=</span> <span class="fu">melt</span>(<span class="fu">c</span>(<span class="at">ClearCutSoil=</span><span class="st">&quot;#D00000&quot;</span>,<span class="at">Gopher=</span><span class="st">&quot;#f8961e&quot;</span>,<span class="at">NoGopher=</span><span class="st">&quot;#4ea8de&quot;</span>,<span class="at">OldGrowth=</span><span class="st">&quot;#283618&quot;</span>))</span>
<span id="cb24-20"><a href="#cb24-20" tabindex="-1"></a></span>
<span id="cb24-21"><a href="#cb24-21" tabindex="-1"></a>colorset1<span class="sc">$</span>Category<span class="ot">&lt;-</span><span class="fu">rownames</span>(colorset1)</span>
<span id="cb24-22"><a href="#cb24-22" tabindex="-1"></a><span class="fu">colnames</span>(colorset1)[<span class="fu">which</span>(<span class="fu">names</span>(colorset1) <span class="sc">==</span> <span class="st">&quot;value&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Category_col&quot;</span></span>
<span id="cb24-23"><a href="#cb24-23" tabindex="-1"></a>colorset1</span>
<span id="cb24-24"><a href="#cb24-24" tabindex="-1"></a></span>
<span id="cb24-25"><a href="#cb24-25" tabindex="-1"></a>metadata<span class="ot">&lt;-</span><span class="fu">merge</span>(metadata, colorset1, <span class="at">by=</span><span class="st">&quot;Category&quot;</span>)</span>
<span id="cb24-26"><a href="#cb24-26" tabindex="-1"></a><span class="fu">head</span>(metadata)</span>
<span id="cb24-27"><a href="#cb24-27" tabindex="-1"></a>metadata<span class="sc">$</span>Category_col <span class="ot">&lt;-</span> <span class="fu">as.character</span>(metadata<span class="sc">$</span>Category_col)</span>
<span id="cb24-28"><a href="#cb24-28" tabindex="-1"></a><span class="fu">rownames</span>(metadata)<span class="ot">&lt;-</span>metadata<span class="sc">$</span>SampleID</span>
<span id="cb24-29"><a href="#cb24-29" tabindex="-1"></a></span>
<span id="cb24-30"><a href="#cb24-30" tabindex="-1"></a><span class="do">## Import ASV taxonomic data</span></span>
<span id="cb24-31"><a href="#cb24-31" tabindex="-1"></a>bac.ASV_taxa<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="fu">readRDS</span>(<span class="st">&quot;16S.V4_MSH_ASVs_Taxonomy_dada2_9.20.2021_Robject.rds&quot;</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>))</span>
<span id="cb24-32"><a href="#cb24-32" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_taxa)</span>
<span id="cb24-33"><a href="#cb24-33" tabindex="-1"></a></span>
<span id="cb24-34"><a href="#cb24-34" tabindex="-1"></a>bac.ASV_taxa[<span class="fu">is.na</span>(bac.ASV_taxa)]<span class="ot">&lt;-</span> <span class="st">&quot;Unknown&quot;</span> <span class="co"># turn all NAs into &quot;Unknowns&quot;</span></span>
<span id="cb24-35"><a href="#cb24-35" tabindex="-1"></a>bac.ASV_taxa<span class="sc">$</span>Species<span class="ot">&lt;-</span><span class="fu">gsub</span>(<span class="st">&quot;Unknown&quot;</span>, <span class="st">&quot;unknown&quot;</span>, bac.ASV_taxa<span class="sc">$</span>Species) <span class="co"># change uppercase Unknown to lowercase unknown for unknown species classification</span></span>
<span id="cb24-36"><a href="#cb24-36" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_taxa)</span>
<span id="cb24-37"><a href="#cb24-37" tabindex="-1"></a>bac.ASV_taxa<span class="sc">$</span>ASV_ID<span class="ot">&lt;-</span><span class="fu">rownames</span>(bac.ASV_taxa) <span class="co"># create ASV ID column to use for merging data frames</span></span>
<span id="cb24-38"><a href="#cb24-38" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_taxa)</span></code></pre></div>
<p>With our data imported, we now need to remove any potential
contaminants from our ASV table. These are ASVs that were identified in
your positive and negative controls. Fortunately we can use the
<code>decontam()</code> package to do this and create new, “clean” ASV
count and taxonomy tables <span class="citation">(Davis et al.
2018)</span>. Be sure to have a column in your metadata that tells you
exactly which samples are controls. For information on how to properly
use <code>decontam()</code>, view the tutorial <a
href="https://benjjneb.github.io/decontam/vignettes/decontam_intro.html">here</a>.</p>
<p><strong>NOTE</strong>: In the particular project we are using for
this workflow, there were NO PCR or sequencing controls included.
Running the code below will lead to an error. Please use the following
section of code as a guide for removing contaminants in your ASV
counts data frame.</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" tabindex="-1"></a><span class="do">## Identify &amp; Remove Contaminants </span></span>
<span id="cb25-2"><a href="#cb25-2" tabindex="-1"></a></span>
<span id="cb25-3"><a href="#cb25-3" tabindex="-1"></a><span class="co"># Create a df that contains which samples in your data set are positive/negative controls</span></span>
<span id="cb25-4"><a href="#cb25-4" tabindex="-1"></a>ControlDF<span class="ot">&lt;-</span>metadata[metadata<span class="sc">$</span>SampleType<span class="sc">==</span><span class="st">&quot;Control&quot;</span>,]</span>
<span id="cb25-5"><a href="#cb25-5" tabindex="-1"></a></span>
<span id="cb25-6"><a href="#cb25-6" tabindex="-1"></a><span class="co"># create a vector for the decontam() package that tells us which samples are controls (TRUE) or not (FALSE)</span></span>
<span id="cb25-7"><a href="#cb25-7" tabindex="-1"></a>vector_for_decontam<span class="ot">&lt;-</span>metadata<span class="sc">$</span>Sample_or_Control <span class="co"># use for decontam package</span></span>
<span id="cb25-8"><a href="#cb25-8" tabindex="-1"></a></span>
<span id="cb25-9"><a href="#cb25-9" tabindex="-1"></a><span class="co">#convert bac.ASV_counts data frame to numeric</span></span>
<span id="cb25-10"><a href="#cb25-10" tabindex="-1"></a>bac.ASV_counts[,<span class="sc">-</span><span class="fu">length</span>(bac.ASV_counts)] <span class="ot">&lt;-</span> <span class="fu">as.data.frame</span>(<span class="fu">sapply</span>(bac.ASV_counts[,<span class="sc">-</span><span class="fu">length</span>(bac.ASV_counts)], as.numeric)) </span>
<span id="cb25-11"><a href="#cb25-11" tabindex="-1"></a></span>
<span id="cb25-12"><a href="#cb25-12" tabindex="-1"></a><span class="co"># transpose so that rows are Samples and columns are ASVs</span></span>
<span id="cb25-13"><a href="#cb25-13" tabindex="-1"></a>bac.ASV_c2<span class="ot">&lt;-</span><span class="fu">t</span>(bac.ASV_counts[,<span class="sc">-</span><span class="fu">length</span>(bac.ASV_counts)])</span>
<span id="cb25-14"><a href="#cb25-14" tabindex="-1"></a></span>
<span id="cb25-15"><a href="#cb25-15" tabindex="-1"></a><span class="co"># create data frame of which ASVs are contaminants or not</span></span>
<span id="cb25-16"><a href="#cb25-16" tabindex="-1"></a>contam_df <span class="ot">&lt;-</span> <span class="fu">isContaminant</span>(bac.ASV_c2, <span class="at">neg=</span>vector_for_decontam)</span>
<span id="cb25-17"><a href="#cb25-17" tabindex="-1"></a></span>
<span id="cb25-18"><a href="#cb25-18" tabindex="-1"></a><span class="fu">table</span>(contam_df<span class="sc">$</span>contaminant) <span class="co"># identify contaminants aka TRUE</span></span>
<span id="cb25-19"><a href="#cb25-19" tabindex="-1"></a></span>
<span id="cb25-20"><a href="#cb25-20" tabindex="-1"></a><span class="co"># pull out ASV IDs for contaminating ASVs</span></span>
<span id="cb25-21"><a href="#cb25-21" tabindex="-1"></a>contam_asvs <span class="ot">&lt;-</span> (contam_df[contam_df<span class="sc">$</span>contaminant <span class="sc">==</span> <span class="cn">TRUE</span>, ]) </span>
<span id="cb25-22"><a href="#cb25-22" tabindex="-1"></a></span>
<span id="cb25-23"><a href="#cb25-23" tabindex="-1"></a><span class="co"># see which taxa are contaminants</span></span>
<span id="cb25-24"><a href="#cb25-24" tabindex="-1"></a>bac.ASV_taxa[<span class="fu">row.names</span>(bac.ASV_taxa) <span class="sc">%in%</span> <span class="fu">row.names</span>(contam_asvs),] </span>
<span id="cb25-25"><a href="#cb25-25" tabindex="-1"></a></span>
<span id="cb25-26"><a href="#cb25-26" tabindex="-1"></a><span class="do">## Create new files that EXCLUDE contaminants!!!</span></span>
<span id="cb25-27"><a href="#cb25-27" tabindex="-1"></a></span>
<span id="cb25-28"><a href="#cb25-28" tabindex="-1"></a><span class="co"># making new fasta file (if you want)</span></span>
<span id="cb25-29"><a href="#cb25-29" tabindex="-1"></a><span class="co">#contam_indices &lt;- which(asv_fasta %in% paste0(&quot;&gt;&quot;, contam_asvs))</span></span>
<span id="cb25-30"><a href="#cb25-30" tabindex="-1"></a><span class="co">#dont_want &lt;- sort(c(contam_indices, contam_indices + 1))</span></span>
<span id="cb25-31"><a href="#cb25-31" tabindex="-1"></a><span class="co">#asv_fasta_no_contam &lt;- asv_fasta[- dont_want]</span></span>
<span id="cb25-32"><a href="#cb25-32" tabindex="-1"></a></span>
<span id="cb25-33"><a href="#cb25-33" tabindex="-1"></a><span class="co"># making new count table</span></span>
<span id="cb25-34"><a href="#cb25-34" tabindex="-1"></a>bac.ASV_counts_no.contam <span class="ot">&lt;-</span> bac.ASV_counts[<span class="sc">!</span><span class="fu">row.names</span>(bac.ASV_counts) <span class="sc">%in%</span> <span class="fu">row.names</span>(contam_asvs), ] <span class="co"># drop ASVs found in contam_asvs</span></span>
<span id="cb25-35"><a href="#cb25-35" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_counts_no.contam)</span>
<span id="cb25-36"><a href="#cb25-36" tabindex="-1"></a></span>
<span id="cb25-37"><a href="#cb25-37" tabindex="-1"></a><span class="co"># making new taxonomy table</span></span>
<span id="cb25-38"><a href="#cb25-38" tabindex="-1"></a>bac.ASV_taxa.no.contam <span class="ot">&lt;-</span> bac.ASV_taxa[<span class="sc">!</span><span class="fu">row.names</span>(bac.ASV_taxa) <span class="sc">%in%</span> <span class="fu">row.names</span>(contam_asvs), ] <span class="co"># drop ASVs found in contam_asvs</span></span>
<span id="cb25-39"><a href="#cb25-39" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_taxa.no.contam)</span>
<span id="cb25-40"><a href="#cb25-40" tabindex="-1"></a></span>
<span id="cb25-41"><a href="#cb25-41" tabindex="-1"></a><span class="co"># Remove ASVs found in Controls from samples (in addition to contaminants previously ID&#39;d)</span></span>
<span id="cb25-42"><a href="#cb25-42" tabindex="-1"></a></span>
<span id="cb25-43"><a href="#cb25-43" tabindex="-1"></a>Control_counts<span class="ot">&lt;-</span>bac.ASV_counts_no.contam[,<span class="fu">colnames</span>(bac.ASV_counts_no.contam) <span class="sc">%in%</span> ControlDF<span class="sc">$</span>SampleID] <span class="co"># keep only the columns for control samples</span></span>
<span id="cb25-44"><a href="#cb25-44" tabindex="-1"></a>Control_counts</span>
<span id="cb25-45"><a href="#cb25-45" tabindex="-1"></a>Control_counts<span class="ot">&lt;-</span>Control_counts[<span class="fu">which</span>(<span class="fu">rowSums</span>(Control_counts) <span class="sc">&gt;</span> <span class="dv">0</span>),] <span class="co"># drop ASVs that don&#39;t appear in Controls</span></span>
<span id="cb25-46"><a href="#cb25-46" tabindex="-1"></a><span class="fu">dim</span>(Control_counts)</span>
<span id="cb25-47"><a href="#cb25-47" tabindex="-1"></a><span class="fu">head</span>(Control_counts)</span>
<span id="cb25-48"><a href="#cb25-48" tabindex="-1"></a></span>
<span id="cb25-49"><a href="#cb25-49" tabindex="-1"></a>bac.ASV_counts_CLEAN<span class="ot">&lt;-</span>bac.ASV_counts_no.contam[<span class="sc">!</span>bac.ASV_counts_no.contam<span class="sc">$</span>ASV_ID <span class="sc">%in%</span> <span class="fu">row.names</span>(Control_counts),<span class="sc">!</span><span class="fu">colnames</span>(bac.ASV_counts_no.contam) <span class="sc">%in%</span> <span class="fu">colnames</span>(Control_counts)]</span>
<span id="cb25-50"><a href="#cb25-50" tabindex="-1"></a>bac.ASV_taxa_CLEAN<span class="ot">&lt;-</span>bac.ASV_taxa.no.contam[<span class="sc">!</span>bac.ASV_taxa.no.contam<span class="sc">$</span>ASV_ID <span class="sc">%in%</span> <span class="fu">row.names</span>(Control_counts),]</span>
<span id="cb25-51"><a href="#cb25-51" tabindex="-1"></a></span>
<span id="cb25-52"><a href="#cb25-52" tabindex="-1"></a><span class="co"># sanity check</span></span>
<span id="cb25-53"><a href="#cb25-53" tabindex="-1"></a><span class="fu">colnames</span>(bac.ASV_counts_CLEAN) <span class="co"># check for control sample IDs</span></span>
<span id="cb25-54"><a href="#cb25-54" tabindex="-1"></a></span>
<span id="cb25-55"><a href="#cb25-55" tabindex="-1"></a><span class="do">## and now writing them out to files</span></span>
<span id="cb25-56"><a href="#cb25-56" tabindex="-1"></a><span class="co">#write(asv_fasta_no_contam, &quot;ASVs-no-contam.fa&quot;)</span></span>
<span id="cb25-57"><a href="#cb25-57" tabindex="-1"></a><span class="fu">write.table</span>(bac.ASV_counts_CLEAN, <span class="st">&quot;data/EnvMiSeq_W23_16S.V3V4_ASVs_Counts_NoContam.tsv&quot;</span>,</span>
<span id="cb25-58"><a href="#cb25-58" tabindex="-1"></a>            <span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>, <span class="at">quote=</span>F, <span class="at">col.names=</span><span class="cn">NA</span>)</span>
<span id="cb25-59"><a href="#cb25-59" tabindex="-1"></a><span class="fu">saveRDS</span>(bac.ASV_counts_CLEAN, <span class="at">file =</span> <span class="st">&quot;data/EnvMiSeq_W23_16S.V3V4_ASVs_Counts_NoContam_Robject.rds&quot;</span>, <span class="at">ascii =</span> <span class="cn">FALSE</span>, <span class="at">version =</span> <span class="cn">NULL</span>,</span>
<span id="cb25-60"><a href="#cb25-60" tabindex="-1"></a>        <span class="at">compress =</span> <span class="cn">TRUE</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>)</span>
<span id="cb25-61"><a href="#cb25-61" tabindex="-1"></a></span>
<span id="cb25-62"><a href="#cb25-62" tabindex="-1"></a><span class="fu">write.table</span>(bac.ASV_taxa_CLEAN, <span class="st">&quot;data/EnvMiSeq_W23_16S.V3V4_ASVs_Taxa_NoContam.tsv&quot;</span>,</span>
<span id="cb25-63"><a href="#cb25-63" tabindex="-1"></a>            <span class="at">sep=</span><span class="st">&quot;</span><span class="sc">\t</span><span class="st">&quot;</span>, <span class="at">quote=</span>F, <span class="at">col.names=</span><span class="cn">NA</span>)</span>
<span id="cb25-64"><a href="#cb25-64" tabindex="-1"></a><span class="fu">saveRDS</span>(bac.ASV_taxa_CLEAN, <span class="at">file =</span> <span class="st">&quot;data/EnvMiSeq_W23_16S.V3V4_ASVs_Taxa_NoContam_Robject.rds&quot;</span>, <span class="at">ascii =</span> <span class="cn">FALSE</span>, <span class="at">version =</span> <span class="cn">NULL</span>,</span>
<span id="cb25-65"><a href="#cb25-65" tabindex="-1"></a>        <span class="at">compress =</span> <span class="cn">TRUE</span>, <span class="at">refhook =</span> <span class="cn">NULL</span>)</span></code></pre></div>
<div id="data-formatting-filtering-and-transformation"
class="section level3" number="4.1.1">
<h3><span class="header-section-number">4.1.1</span> Data Formatting,
Filtering, and Transformation</h3>
<p>Now that we have imported our ASV count table, our ASV taxonomy
table, and our metadata, we can start to reformat the actual data
objects in R to get them ready for running through the
<code>vegan</code> suite of tools. First we are going to merge our ASV
count table and our ASV taxonomy tables together and filter out some
unwanted taxa.</p>
<p>Even though we are analyzing bacterial data, sometimes chloroplast
and mitochondrial sequences are attributed to 16S genes. For example, in
the Silva database, Chloroplast sequences attributed to Eukaryotes are
found within the database’s set of Cyanobacteria sequences. Some
sequences within this Chloroplast distinction are actually labeled as
bacteria, but they have not been phylogenetically connected to a
reference genome. It’s important to filter out these eukaryotic
sequences before we start playing with statistical analyses. We are also
going to drop any ASVs that do not have any counts as well as
“singletons”, which are ASVs with only 1 count in the entire data
set.</p>
<p><strong>NOTE</strong>: Here we are merging the original ASV counts
and ASV taxa data frames. If you followed the <code>decontam()</code>
step, you should be merging the CLEAN versions of these objects. There
is a commented-out line of code in the section below that shows this
step.</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" tabindex="-1"></a><span class="co"># first we merge the ASV count object and the ASV taxonomy object together by column called &quot;ASV_ID&quot;</span></span>
<span id="cb26-2"><a href="#cb26-2" tabindex="-1"></a><span class="co"># merge CLEAN aka contaminants/controls removed count &amp; taxa tables</span></span>
<span id="cb26-3"><a href="#cb26-3" tabindex="-1"></a></span>
<span id="cb26-4"><a href="#cb26-4" tabindex="-1"></a><span class="co">#bac.ASV_all&lt;-merge(bac.ASV_counts_CLEAN,bac.ASV_taxa_CLEAN, by=&quot;ASV_ID&quot;)</span></span>
<span id="cb26-5"><a href="#cb26-5" tabindex="-1"></a>bac.ASV_all<span class="ot">&lt;-</span><span class="fu">merge</span>(bac.ASV_counts,bac.ASV_taxa, <span class="at">by=</span><span class="st">&quot;ASV_ID&quot;</span>)</span>
<span id="cb26-6"><a href="#cb26-6" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_all)</span>
<span id="cb26-7"><a href="#cb26-7" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_all)</span>
<span id="cb26-8"><a href="#cb26-8" tabindex="-1"></a>bac.ASV_all<span class="ot">&lt;-</span>bac.ASV_all[, <span class="sc">!</span><span class="fu">duplicated</span>(<span class="fu">colnames</span>(bac.ASV_all))] <span class="co"># remove col duplicates</span></span>
<span id="cb26-9"><a href="#cb26-9" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_all)</span>
<span id="cb26-10"><a href="#cb26-10" tabindex="-1"></a></span>
<span id="cb26-11"><a href="#cb26-11" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span><span class="fu">melt</span>(bac.ASV_all)</span>
<span id="cb26-12"><a href="#cb26-12" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_dat)</span>
<span id="cb26-13"><a href="#cb26-13" tabindex="-1"></a></span>
<span id="cb26-14"><a href="#cb26-14" tabindex="-1"></a><span class="co"># rename columns</span></span>
<span id="cb26-15"><a href="#cb26-15" tabindex="-1"></a><span class="fu">colnames</span>(bac.ASV_dat)[<span class="fu">which</span>(<span class="fu">names</span>(bac.ASV_dat) <span class="sc">==</span> <span class="st">&quot;variable&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;SampleID&quot;</span></span>
<span id="cb26-16"><a href="#cb26-16" tabindex="-1"></a><span class="fu">colnames</span>(bac.ASV_dat)[<span class="fu">which</span>(<span class="fu">names</span>(bac.ASV_dat) <span class="sc">==</span> <span class="st">&quot;value&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Count&quot;</span></span>
<span id="cb26-17"><a href="#cb26-17" tabindex="-1"></a></span>
<span id="cb26-18"><a href="#cb26-18" tabindex="-1"></a><span class="co"># Drop all Zero counts &amp; singletons ASVs</span></span>
<span id="cb26-19"><a href="#cb26-19" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_dat)</span>
<span id="cb26-20"><a href="#cb26-20" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span>bac.ASV_dat[<span class="fu">which</span>(bac.ASV_dat<span class="sc">$</span>Count <span class="sc">&gt;</span> <span class="dv">0</span>),]</span>
<span id="cb26-21"><a href="#cb26-21" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_dat)</span>
<span id="cb26-22"><a href="#cb26-22" tabindex="-1"></a></span>
<span id="cb26-23"><a href="#cb26-23" tabindex="-1"></a><span class="co"># Drop Unknowns and Eukaryotic hits</span></span>
<span id="cb26-24"><a href="#cb26-24" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span><span class="fu">subset</span>(bac.ASV_dat, Kingdom<span class="sc">!=</span><span class="st">&quot;Unknown&quot;</span>) <span class="do">## drop Unknowns from Kingdom</span></span>
<span id="cb26-25"><a href="#cb26-25" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span><span class="fu">subset</span>(bac.ASV_dat, Phylum<span class="sc">!=</span><span class="st">&quot;Unknown&quot;</span>) <span class="do">## drop Unknowns from Phylum</span></span>
<span id="cb26-26"><a href="#cb26-26" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_dat)</span>
<span id="cb26-27"><a href="#cb26-27" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_dat)</span>
<span id="cb26-28"><a href="#cb26-28" tabindex="-1"></a></span>
<span id="cb26-29"><a href="#cb26-29" tabindex="-1"></a><span class="co"># Save a copy that still includes chloroplast &amp; mitochondria hits (taken before they are dropped below) - for later use just in case</span></span>
<span id="cb26-30"><a href="#cb26-30" tabindex="-1"></a>bac.ASV_dat.with.euks<span class="ot">&lt;-</span>bac.ASV_dat</span>
<span id="cb26-31"><a href="#cb26-31" tabindex="-1"></a><span class="fu">colnames</span>(bac.ASV_dat.with.euks)</span>
<span id="cb26-32"><a href="#cb26-32" tabindex="-1"></a></span>
<span id="cb26-33"><a href="#cb26-33" tabindex="-1"></a><span class="co"># Drop chloroplast &amp; mitochondria seqs</span></span>
<span id="cb26-34"><a href="#cb26-34" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span><span class="fu">subset</span>(bac.ASV_dat, Class<span class="sc">!=</span><span class="st">&quot;Chloroplast&quot;</span>) <span class="do">## exclude Chloroplast sequences</span></span>
<span id="cb26-35"><a href="#cb26-35" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span><span class="fu">subset</span>(bac.ASV_dat, Order<span class="sc">!=</span><span class="st">&quot;Chloroplast&quot;</span>) <span class="do">## exclude Chloroplast sequences</span></span>
<span id="cb26-36"><a href="#cb26-36" tabindex="-1"></a>bac.ASV_dat<span class="ot">&lt;-</span><span class="fu">subset</span>(bac.ASV_dat, Family<span class="sc">!=</span><span class="st">&quot;Mitochondria&quot;</span>) <span class="do">## exclude Mitochondrial sequences just in case</span></span>
<span id="cb26-37"><a href="#cb26-37" tabindex="-1"></a></span>
<span id="cb26-38"><a href="#cb26-38" tabindex="-1"></a><span class="co"># check if Eukaryotic and Unknowns are still in your data, this may take a while to run!</span></span>
<span id="cb26-39"><a href="#cb26-39" tabindex="-1"></a><span class="st">&#39;Chloroplast&#39;</span> <span class="sc">%in%</span> bac.ASV_dat <span class="co"># check if Chloroplast counts are still in df, should be false because they&#39;ve been removed</span></span>
<span id="cb26-40"><a href="#cb26-40" tabindex="-1"></a><span class="st">&#39;Mitochondria&#39;</span> <span class="sc">%in%</span> bac.ASV_dat <span class="co"># check if Mitochondria counts are still in df, should be false because they&#39;ve been removed</span></span>
<span id="cb26-41"><a href="#cb26-41" tabindex="-1"></a><span class="st">&#39;Undetermined&#39;</span> <span class="sc">%in%</span> bac.ASV_dat <span class="co"># check if undetermined taxa in data frame</span></span>
<span id="cb26-42"><a href="#cb26-42" tabindex="-1"></a><span class="co">#NA %in% bac.ASV_dat</span></span>
<span id="cb26-43"><a href="#cb26-43" tabindex="-1"></a></span>
<span id="cb26-44"><a href="#cb26-44" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_dat)</span></code></pre></div>
<p>After dropping unknown or undesired sequences from our combined ASV
data frame, it’s time to create an ASV table that is properly formatted
for the <code>vegan</code> package. This ASV table must be a
<strong>Samples x Species</strong> matrix, in which our Sample IDs are
our row names and our ASV IDs are our column names.</p>
<p><strong>NOTE</strong>: We could have made this ASV table earlier
immediately after importing the ASV count data by transposing the table
with the <code>t()</code> function. However, I want to have an ASV table
that excludes taxa I do not want in my data set, like ASVs attributed to
Chloroplast sequences or ASVs attributed to unknown Phyla.</p>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" tabindex="-1"></a>bac.ASV_table<span class="ot">&lt;-</span><span class="fu">as.data.frame</span>(<span class="fu">dcast</span>(bac.ASV_dat, SampleID<span class="sc">~</span>ASV_ID, <span class="at">value.var=</span><span class="st">&quot;Count&quot;</span>, <span class="at">fun.aggregate=</span>sum)) <span class="do">###</span></span>
<span id="cb27-2"><a href="#cb27-2" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_table)</span>
<span id="cb27-3"><a href="#cb27-3" tabindex="-1"></a><span class="fu">rownames</span>(bac.ASV_table)<span class="ot">&lt;-</span>bac.ASV_table<span class="sc">$</span>SampleID</span>
<span id="cb27-4"><a href="#cb27-4" tabindex="-1"></a>bac.ASV_table<span class="ot">&lt;-</span><span class="fu">subset</span>(bac.ASV_table, <span class="at">select=</span><span class="sc">-</span><span class="fu">c</span>(SampleID))</span>
<span id="cb27-5"><a href="#cb27-5" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_table)</span></code></pre></div>
<p>Now we can reformat our metadata to be in the same order (by rows) as
our ASV table. <strong>This is a crucial step!</strong> Though it may
appear minor, certain functions (such as <code>adonis2()</code> for
example) will not correctly analyze your data if your metadata and your
ASV table are not arranged in the same order by rows. This next step
will only work if the two data frames we are reordering have the same
number of rows AND the same row names.</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" tabindex="-1"></a><span class="co"># double check dimensions of metadata and ASV table</span></span>
<span id="cb28-2"><a href="#cb28-2" tabindex="-1"></a><span class="fu">dim</span>(metadata)</span>
<span id="cb28-3"><a href="#cb28-3" tabindex="-1"></a><span class="fu">dim</span>(bac.ASV_table)</span>
<span id="cb28-4"><a href="#cb28-4" tabindex="-1"></a><span class="co"># double check that the rownames exist + match</span></span>
<span id="cb28-5"><a href="#cb28-5" tabindex="-1"></a><span class="fu">rownames</span>(metadata)</span>
<span id="cb28-6"><a href="#cb28-6" tabindex="-1"></a><span class="fu">rownames</span>(bac.ASV_table)</span>
<span id="cb28-7"><a href="#cb28-7" tabindex="-1"></a></span>
<span id="cb28-8"><a href="#cb28-8" tabindex="-1"></a><span class="co"># Find rows in metadata that are not in ASV table, and vice versa --&gt; sanity check</span></span>
<span id="cb28-9"><a href="#cb28-9" tabindex="-1"></a><span class="fu">setdiff</span>(<span class="fu">rownames</span>(metadata), <span class="fu">rownames</span>(bac.ASV_table)) <span class="co"># check rows in metadata not in bac.ASV_table</span></span>
<span id="cb28-10"><a href="#cb28-10" tabindex="-1"></a><span class="fu">setdiff</span>(<span class="fu">rownames</span>(bac.ASV_table), <span class="fu">rownames</span>(metadata)) <span class="co"># check rows in bac.ASV_table not in metadata</span></span>
<span id="cb28-11"><a href="#cb28-11" tabindex="-1"></a></span>
<span id="cb28-12"><a href="#cb28-12" tabindex="-1"></a><span class="co"># reorder metadata based off of ASV table</span></span>
<span id="cb28-13"><a href="#cb28-13" tabindex="-1"></a>metadata<span class="ot">=</span>metadata[<span class="fu">rownames</span>(bac.ASV_table),]</span>
<span id="cb28-14"><a href="#cb28-14" tabindex="-1"></a><span class="co"># here we are reordering our metadata by rows, using the rownames from our ASV table as a guide</span></span>
<span id="cb28-15"><a href="#cb28-15" tabindex="-1"></a><span class="co"># this indexing method will only work if the two dfs have the same # of rows AND the same row names!</span></span>
<span id="cb28-16"><a href="#cb28-16" tabindex="-1"></a></span>
<span id="cb28-17"><a href="#cb28-17" tabindex="-1"></a><span class="co"># sanity check to see if this indexing step worked</span></span>
<span id="cb28-18"><a href="#cb28-18" tabindex="-1"></a><span class="fu">head</span>(metadata)</span>
<span id="cb28-19"><a href="#cb28-19" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_table)</span></code></pre></div>
<p>Before we jump into statistically analyzing our sequence data, we
will want to standardize our environmental data. You may have multiple
environmental variables that you’ve recorded (i.e., temperature, pH,
dissolved oxygen concentration, etc.), all of which could be in
different units or vary widely in their relative concentrations,
variances, etc. It’s important that we scale and center our
environmental variables so that we can compare variables of different
units/concentrations/etc.</p>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" tabindex="-1"></a><span class="fu">head</span>(metadata)</span>
<span id="cb29-2"><a href="#cb29-2" tabindex="-1"></a>meta_scaled<span class="ot">&lt;-</span>metadata</span>
<span id="cb29-3"><a href="#cb29-3" tabindex="-1"></a>meta_scaled[,<span class="dv">17</span><span class="sc">:</span><span class="dv">19</span>]<span class="ot">&lt;-</span><span class="fu">scale</span>(meta_scaled[,<span class="dv">17</span><span class="sc">:</span><span class="dv">19</span>],<span class="at">center=</span><span class="cn">TRUE</span>,<span class="at">scale=</span><span class="cn">TRUE</span>) <span class="co"># only scale chem env data</span></span>
<span id="cb29-4"><a href="#cb29-4" tabindex="-1"></a><span class="fu">head</span>(meta_scaled)</span></code></pre></div>
<p>Now that all of our files are in R and correctly formatted, we can
start some statistical analyses!</p>
</div>
</div>
<div id="alpha-diversity-species-richness" class="section level2"
number="4.2">
<h2><span class="header-section-number">4.2</span> Alpha Diversity &amp;
Species Richness</h2>
<div id="rarefaction-curves" class="section level3" number="4.2.1">
<h3><span class="header-section-number">4.2.1</span> Rarefaction
Curves</h3>
<p>To calculate species richness and alpha diversity (using the
Shannon-Wiener index), we will use functions from the <code>vegan</code>
<a
href="https://cran.r-project.org/web/packages/vegan/vegan.pdf">package</a>
<span class="citation">(Oksanen et al. 2020)</span>. Before I get to the
alpha diversity and species richness, I will calculate a rarefaction
curve for my ASV table. The rarefaction curve tells us that after
resampling a pool of N individuals per sample (x-axis), we will identify
a certain number of species in said sample (y-axis). This can give us an
idea if any sample is more/less species rich than other samples, which
can be useful to identify outliers.</p>
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" tabindex="-1"></a><span class="fu">sort</span>(<span class="fu">colSums</span>(bac.ASV_table))</span>
<span id="cb30-2"><a href="#cb30-2" tabindex="-1"></a><span class="fu">png</span>(<span class="st">&#39;rarecurve_example.png&#39;</span>,,<span class="at">width =</span> <span class="dv">1000</span>, <span class="at">height =</span> <span class="dv">900</span>, <span class="at">res=</span><span class="dv">100</span>)</span>
<span id="cb30-3"><a href="#cb30-3" tabindex="-1"></a><span class="fu">rarecurve</span>(<span class="fu">as.matrix</span>(bac.ASV_table), <span class="at">col=</span>metadata<span class="sc">$</span>Category_col, <span class="at">step=</span><span class="dv">1000</span>, <span class="at">label=</span>F,<span class="at">ylab=</span><span class="st">&quot;ASVs&quot;</span>)</span>
<span id="cb30-4"><a href="#cb30-4" tabindex="-1"></a><span class="co"># to show sample labels per curve, change label=T</span></span>
<span id="cb30-5"><a href="#cb30-5" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/rarecurve_example.png" alt="Rarefaction curves for each sample, colored by sample category" />
</center>
<div align="center">
Figure 7: Rarefaction Curve Example, Colored by Sample Category
</div>
<p><br /></p>
<p>In this rarefaction curve, each curve is colored based on its sample
category: red represents “Clear Cut Soil”, yellow represents the
“Gopher” category, light blue represents the “No Gopher” category, and
dark green represents “Old Growth” soil. Based on this rarefaction
curve, it appears that samples within the Old Growth category have a
smaller average sample size compared to the other sample
categories.</p>
</div>
<div id="shannon-diversity-species-richness" class="section level3"
number="4.2.2">
<h3><span class="header-section-number">4.2.2</span> Shannon Diversity
&amp; Species Richness</h3>
<p>Now that I’ve viewed the rarefaction curve and looked for outliers,
we can move onto the alpha diversity and species richness steps.</p>
<p>For alpha diversity and species richness measurements, we are going
to use raw data. The use of raw data for any kind of analysis is quite
controversial because not all of the samples have the same number of
observations: for example, one of our samples may have thousands of ASV
counts, whereas other samples can be much smaller or larger than that.
Transforming our data can allow us to view the actual distribution of
our data, revealing patterns that may have been difficult to observe in
the raw data. You can find a helpful overview of data transformations and
their benefits on the corresponding Wikipedia <a
href="https://en.wikipedia.org/wiki/Data_transformation_(statistics)">page</a>.</p>
<p>Some microbiologists would tell you that we should rarefy our data
before moving onto any diversity assessments or downstream analyses.
<em>Rarefying</em> is a type of data transformation that involves
finding the sample with the minimum number of counts in all of your
samples, then scaling all of your sample counts down to this size. As
described with the rarefaction curve, rarefying allows you to see 1. the
number of species across samples and 2. the abundance of said species
across samples when sampling based on a given minimum. Historically,
rarefaction was the strategy used to transform microbial data. However,
more recently many statisticians have advised against rarefaction as we
tend to lose a lot of information regarding low abundance OTUs/ASVs. For
more information on why rarefaction is not a useful transformation
method, please read “Waste Not, Want Not: Why Rarefying Microbiome Data
Is Inadmissible” by <span class="citation">McMurdie and Holmes
(2014)</span>.</p>
<p>When we get to the section on calculating <a
href="#beta-div-section">beta diversity</a>, I will provide more insight
into which transformation method(s) I use and why.</p>
<p>Alpha diversity is a way to measure within-sample diversity, using an
equation that considers the richness of certain species as well as the
evenness of those species. The <code>vegan</code> package has a
<code>diversity()</code> function that allows one to specify which
diversity index the user would like to use for calculating alpha
diversity. Here I use the Shannon-Wiener index for calculating alpha
diversity. In order to calculate Shannon Diversity, we have to calculate
Shannon Entropy, then calculate the exponential value of the
Shannon Entropy (<em>e</em> to the power of Shannon Entropy). We can
also calculate species richness (i.e., how many species are present in
each sample) using the <code>specnumber()</code> function from the vegan
package. Once we’ve found species richness and Shannon diversity, we can
combine these values into one data frame, then merge this data frame
with our metadata to create one dataframe containing: Shannon entropy,
Shannon diversity, species richness, and your sample metadata.</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" tabindex="-1"></a><span class="co"># if you have another package loaded that has a diversity function, you can specify that you want to use vegan&#39;s diversity function as shown below</span></span>
<span id="cb31-2"><a href="#cb31-2" tabindex="-1"></a>Shan_ent<span class="fl">.16</span>s<span class="ot">&lt;-</span>vegan<span class="sc">::</span><span class="fu">diversity</span>(bac.ASV_table, <span class="at">index=</span><span class="st">&quot;shannon&quot;</span>) <span class="co"># Shannon entropy</span></span>
<span id="cb31-3"><a href="#cb31-3" tabindex="-1"></a>Shan_div<span class="fl">.16</span>s<span class="ot">&lt;-</span> <span class="fu">exp</span>(Shan_ent<span class="fl">.16</span>s) <span class="co"># Shannon Diversity aka Hill number 1</span></span>
<span id="cb31-4"><a href="#cb31-4" tabindex="-1"></a></span>
<span id="cb31-5"><a href="#cb31-5" tabindex="-1"></a><span class="co"># create data frame with Shannon entropy and Shannon diversity values</span></span>
<span id="cb31-6"><a href="#cb31-6" tabindex="-1"></a>div_16s<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="at">Bac_Shannon_Entropy=</span>Shan_ent<span class="fl">.16</span>s,<span class="at">Bac_Shannon_Diversity=</span>Shan_div<span class="fl">.16</span>s)</span>
<span id="cb31-7"><a href="#cb31-7" tabindex="-1"></a><span class="fu">class</span>(div_16s)</span>
<span id="cb31-8"><a href="#cb31-8" tabindex="-1"></a>div_16s<span class="sc">$</span>SampleID<span class="ot">&lt;-</span><span class="fu">rownames</span>(div_16s)</span>
<span id="cb31-9"><a href="#cb31-9" tabindex="-1"></a><span class="fu">head</span>(div_16s)</span>
<span id="cb31-10"><a href="#cb31-10" tabindex="-1"></a></span>
<span id="cb31-11"><a href="#cb31-11" tabindex="-1"></a><span class="co"># create a data frame with species richness</span></span>
<span id="cb31-12"><a href="#cb31-12" tabindex="-1"></a>S_16s<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="at">Bac_Species_Richness=</span><span class="fu">specnumber</span>(bac.ASV_table), <span class="at">SampleID=</span><span class="fu">rownames</span>(bac.ASV_table)) <span class="co"># finds # of species per sample using RAW count data; if MARGIN = 2 it finds frequencies of species</span></span>
<span id="cb31-13"><a href="#cb31-13" tabindex="-1"></a></span>
<span id="cb31-14"><a href="#cb31-14" tabindex="-1"></a><span class="co"># merge richness and diversity dataframes together</span></span>
<span id="cb31-15"><a href="#cb31-15" tabindex="-1"></a>d.r_16s<span class="ot">&lt;-</span><span class="fu">merge</span>(div_16s, S_16s, <span class="at">by.x=</span><span class="st">&quot;SampleID&quot;</span>, <span class="at">by.y=</span><span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb31-16"><a href="#cb31-16" tabindex="-1"></a></span>
<span id="cb31-17"><a href="#cb31-17" tabindex="-1"></a><span class="co"># merge w/ metadata</span></span>
<span id="cb31-18"><a href="#cb31-18" tabindex="-1"></a>bac.div.metadat <span class="ot">&lt;-</span> <span class="fu">merge</span>(d.r_16s,metadata, <span class="at">by.x=</span><span class="st">&quot;SampleID&quot;</span>, <span class="at">by.y=</span><span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb31-19"><a href="#cb31-19" tabindex="-1"></a><span class="fu">head</span>(bac.div.metadat)</span>
<span id="cb31-20"><a href="#cb31-20" tabindex="-1"></a><span class="fu">class</span>(bac.div.metadat) <span class="co"># want data frame</span></span></code></pre></div>
<p>We can now use the data frame we made with our alpha diversity,
species richness, and our metadata to create some nice figures. First, we
want to ensure that the category of interest (i.e., in this example that
will be “Category”) is the right <code>class</code> of variable for
generating this figure. Because we are using a categorical identifier,
it is wise for us to make sure that our <code>Category</code> variable
is in the <code>factor</code> format.</p>
<div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" tabindex="-1"></a><span class="fu">unique</span>(bac.div.metadat<span class="sc">$</span>Category) <span class="co"># see how many elements there are in the Category variable</span></span>
<span id="cb32-2"><a href="#cb32-2" tabindex="-1"></a>bac.div.metadat<span class="sc">$</span>Category <span class="ot">&lt;-</span> <span class="fu">factor</span>(bac.div.metadat<span class="sc">$</span>Category, <span class="at">levels =</span> <span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span>,<span class="st">&quot;Gopher&quot;</span>,<span class="st">&quot;NoGopher&quot;</span>,<span class="st">&quot;OldGrowth&quot;</span>))</span>
<span id="cb32-3"><a href="#cb32-3" tabindex="-1"></a><span class="fu">class</span>(bac.div.metadat<span class="sc">$</span>Category)</span></code></pre></div>
<p>Now let’s make some pretty figures with <code>ggplot2</code> <span
class="citation">(Wickham 2016)</span>! Using <code>ggplot2</code>, we
can specify what type of plot to make, the color palette you’ll use, the
size(s) of your font, etc. If you’re interested in everything that
<code>ggplot2</code> can do, please check out this amazing
<code>ggplot2</code> <a
href="https://www.maths.usyd.edu.au/u/UG/SM/STAT3022/r/current/Misc/data-visualization-2.1.pdf">Cheat
Sheet</a>. We are also using <code>ggpubr</code>, a wrapper for
<code>ggplot2</code> that allows for easy manipulation and export of
<code>ggplot</code> figures. For more information on
<code>ggpubr</code>, please check out the package <a
href="https://rpkgs.datanovia.com/ggpubr/#:~:text=ggplot2%2C%20by%20Hadley%20Wickham%2C%20is,elegant%20data%20visualization%20in%20R.&amp;text=The%20&#39;ggpubr&#39;%20package%20provides%20some,&#39;%2D%20based%20publication%20ready%20plots">website</a>.</p>
<p>Here we are going to create box-and-whisker plots of our alpha
diversity and species richness data. The first plot will display the
alpha diversity across our groups, and the second plot will display
the species richness of these groups. The <code>y-axis</code> will show
the Shannon diversity in the first plot, and the species richness in the
second plot. For both plots, the <code>x-axis</code> will display the
Category labels.</p>
<p>Each of the individual box-and-whisker plots will be assigned a
different color based on the Category variable using the
<code>$Category_col</code> variable we created earlier.</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" tabindex="-1"></a><span class="co"># shannon diversity by sample category</span></span>
<span id="cb33-2"><a href="#cb33-2" tabindex="-1"></a>bac.a.div<span class="ot">&lt;-</span><span class="fu">ggplot</span>(bac.div.metadat, <span class="fu">aes</span>(<span class="at">x=</span>Category, <span class="at">y=</span>Bac_Shannon_Diversity, <span class="at">fill=</span>Category)) <span class="sc">+</span></span>
<span id="cb33-3"><a href="#cb33-3" tabindex="-1"></a>  <span class="fu">geom_boxplot</span>(<span class="at">color=</span><span class="st">&quot;black&quot;</span>)<span class="sc">+</span><span class="fu">scale_x_discrete</span>(<span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>))<span class="sc">+</span></span>
<span id="cb33-4"><a href="#cb33-4" tabindex="-1"></a>  <span class="fu">scale_fill_manual</span>( <span class="at">values=</span><span class="fu">unique</span>(bac.div.metadat<span class="sc">$</span>Category_col[<span class="fu">order</span>(bac.div.metadat<span class="sc">$</span>Category)]), <span class="at">name =</span><span class="st">&quot;Sample Category&quot;</span>, <span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>), )<span class="sc">+</span></span>
<span id="cb33-5"><a href="#cb33-5" tabindex="-1"></a>  <span class="fu">theme_classic</span>()<span class="sc">+</span></span>
<span id="cb33-6"><a href="#cb33-6" tabindex="-1"></a>  </span>
<span id="cb33-7"><a href="#cb33-7" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">&quot;Bacterial Shannon Diversity by Sample Category&quot;</span>, <span class="at">x=</span><span class="st">&quot;Category&quot;</span>, <span class="at">y=</span><span class="st">&quot;Shannon Diversity&quot;</span>, <span class="at">fill=</span><span class="st">&quot;Category&quot;</span>)<span class="sc">+</span><span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>),<span class="at">legend.title.align=</span><span class="fl">0.5</span>, <span class="at">legend.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">hjust=</span><span class="fl">0.5</span>, <span class="at">size=</span><span class="dv">15</span>))</span>
<span id="cb33-8"><a href="#cb33-8" tabindex="-1"></a></span>
<span id="cb33-9"><a href="#cb33-9" tabindex="-1"></a><span class="fu">ggsave</span>(bac.a.div,<span class="at">filename =</span> <span class="st">&quot;Bacterial_alpha_diversity.png&quot;</span>, <span class="at">width=</span><span class="dv">12</span>, <span class="at">height=</span><span class="dv">10</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span>
<span id="cb33-10"><a href="#cb33-10" tabindex="-1"></a></span>
<span id="cb33-11"><a href="#cb33-11" tabindex="-1"></a><span class="co"># ggplot code break down:</span></span>
<span id="cb33-12"><a href="#cb33-12" tabindex="-1"></a><span class="co"># ggplot(bac.div.metadat, aes(x=Category, y=Bac_Shannon_Diversity, fill=Category)) -- dataset is bac.div.metadat; set aesthetics aka x variable, y variable, and variable for filling in box-whisker plots</span></span>
<span id="cb33-13"><a href="#cb33-13" tabindex="-1"></a><span class="co"># geom_boxplot(color=&quot;black&quot;) -- outline of box-whisker plots will be black</span></span>
<span id="cb33-14"><a href="#cb33-14" tabindex="-1"></a><span class="co"># scale_x_discrete(labels=c()) -- fix the labels of groups in x-axis</span></span>
<span id="cb33-15"><a href="#cb33-15" tabindex="-1"></a><span class="co"># theme_classic() -- removes grid lines in background of figure</span></span>
<span id="cb33-16"><a href="#cb33-16" tabindex="-1"></a><span class="co"># labs() -- fix plot labels</span></span>
<span id="cb33-17"><a href="#cb33-17" tabindex="-1"></a><span class="co"># theme(...) -- these are changing the settings of the fig: setting axis title size, legend title size, alignment of axes and legend labels, etc</span></span></code></pre></div>
<center>
<img src="amplicon_workflow/Bacterial_alpha_diversity.png" alt="Boxplot of bacterial Shannon diversity by sample category" />
</center>
<div align="center">
Figure 8a: Boxplot of Alpha Diversity by Sample Category
</div>
<p><br /></p>
<p>We observe the highest average Shannon diversity within the Clear Cut
Soil category, followed by the Gopher category, then the No Gopher
category, and the Old Growth category. Though this figure is helpful for
comparing these categories to one another, we cannot really glean
meaningful statistical information from this boxplot.</p>
<p>Not only does <code>ggpubr</code> help us with arranging and saving
figures, but we can also use some of its functions to add statistics
directly onto our figures with ease. In the next code chunk we use a
function called <code>stat_compare_means()</code> which allows you to
compare different sets of samples to each other. Because we have already
assigned our samples to Categories, we can compare the means across our
multiple samples.</p>
<p>We can compare the means of each sample to each other in a pair-wise
fashion by using either a <strong>T-test</strong> (<code>t.test</code>)
or a <strong>Wilcoxon test</strong> (<code>wilcox.test</code>),
or compare the means across all of our samples at once using an
<strong>Analysis of Variance</strong> aka ANOVA (<code>anova</code>) or
a <strong>Kruskal-Wallis test</strong> (<code>kruskal.test</code>). For
more information on how to use the <code>stat_compare_means()</code>
function to add statistics to your plots, please see this <a
href="http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/76-add-p-values-and-significance-levels-to-ggplots/">website</a>.</p>
<p>Deciding on whether to use a T-test versus a Wilcoxon test, or an
ANOVA versus a Kruskal-Wallis test, depends on whether your data
fulfills certain assumptions held by these statistical tests. One of the
assumptions for a T-test and an ANOVA is that the data is <em>normally
distributed</em>. We can test for normality using the
<strong>Shapiro-Wilk test</strong>.</p>
<p>The null hypothesis for the Shapiro-Wilk test is that the data is
normally distributed. This means that if your p-value for the
Shapiro-Wilk test is &gt; 0.05, then we fail to reject the null
hypothesis and can treat the data as normally distributed. However, if p
is &lt; 0.05, then the null hypothesis is rejected and your data are not
normally distributed.</p>
<p>We can also use a <strong>Q-Q plot</strong> to compare our data with
a theoretical normal distribution. These plots show the quantiles for
our data in the y-axis, and the theoretical quantiles for a normal
distribution on the x-axis. If our data points lie on the line in the
Q-Q plot, then the data is considered normally distributed. Skewed data
will contain points that are further from the line, curving one way or
another.</p>
<p>Let’s run a Shapiro-Wilk test using our species richness results,
and use a Q-Q plot to see the distribution of these richness values.</p>
<p><strong>NOTE:</strong> diversity and richness are usually
<em>not</em> normally distributed, but it’s still important to always
see how the data are distributed if you plan on running statistical
tests that <a
href="https://www.statisticshowto.com/assumption-of-normality-test/">assume
normality</a>.</p>
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1" tabindex="-1"></a><span class="do">## Using Shapiro-Wilk test for normality</span></span>
<span id="cb34-2"><a href="#cb34-2" tabindex="-1"></a><span class="fu">shapiro.test</span>(bac.div.metadat<span class="sc">$</span>Bac_Species_Richness) <span class="co"># what is the p-value?</span></span>
<span id="cb34-3"><a href="#cb34-3" tabindex="-1"></a><span class="co"># my example p-value was p-value = 0.4429</span></span>
<span id="cb34-4"><a href="#cb34-4" tabindex="-1"></a></span>
<span id="cb34-5"><a href="#cb34-5" tabindex="-1"></a><span class="co"># visualize Q-Q plot for species richness</span></span>
<span id="cb34-6"><a href="#cb34-6" tabindex="-1"></a><span class="fu">png</span>(<span class="st">&#39;qqplot.png&#39;</span>,,<span class="at">width =</span> <span class="dv">1000</span>, <span class="at">height =</span> <span class="dv">900</span>, <span class="at">res=</span><span class="dv">100</span>)</span>
<span id="cb34-7"><a href="#cb34-7" tabindex="-1"></a><span class="fu">qqnorm</span>(bac.div.metadat<span class="sc">$</span>Bac_Species_Richness, <span class="at">pch =</span> <span class="dv">1</span>, <span class="at">frame =</span> <span class="cn">FALSE</span>) </span>
<span id="cb34-8"><a href="#cb34-8" tabindex="-1"></a><span class="fu">qqline</span>(bac.div.metadat<span class="sc">$</span>Bac_Species_Richness, <span class="at">col =</span> <span class="st">&quot;steelblue&quot;</span>, <span class="at">lwd =</span> <span class="dv">2</span>)</span>
<span id="cb34-9"><a href="#cb34-9" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/qqplot.png" alt="Normal Q-Q plot of bacterial species richness values" />
</center>
<div align="center">
Figure 8b: Normal Q-Q Plot: Species Richness
</div>
<p><br /></p>
<p>Because our p-value for the Shapiro-Wilk test (p = 0.4429) is &gt;
0.05, we cannot reject the null hypothesis, so in this example the
species richness values are consistent with a normal distribution.
However, because diversity and richness are usually <em>not</em> normally
distributed, we will still use a Wilcoxon test (rather than a T-test),
which does not assume normality, to compare our sample groups in a
pairwise fashion. Because we have four groups in this example, we can
also run a Kruskal-Wallis test to compare all of the groups at once.
Kruskal-Wallis tests and ANOVAs are used when comparing three or more
groups.</p>
<p>In the boxplot below, I have only included a few pairwise group
comparisons so as not to overwhelm the plot. The code below uses
<code>*</code> as indicators of statistical significance; if you’d rather
display the p-values themselves, you can change the label parameter in
the <code>stat_compare_means()</code> function from
<code>"p.signif"</code> to <code>"p.format"</code>.</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" tabindex="-1"></a>bac.spec.rich<span class="ot">&lt;-</span><span class="fu">ggplot</span>(bac.div.metadat, <span class="fu">aes</span>(<span class="at">x=</span>Category, <span class="at">y=</span>Bac_Species_Richness, <span class="at">fill=</span>Category)) <span class="sc">+</span><span class="fu">geom_boxplot</span>(<span class="at">color=</span><span class="st">&quot;black&quot;</span>)<span class="sc">+</span><span class="fu">scale_x_discrete</span>(<span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>))<span class="sc">+</span><span class="fu">theme_bw</span>()<span class="sc">+</span><span class="fu">scale_fill_manual</span>( <span class="at">values=</span><span class="fu">unique</span>(bac.div.metadat<span class="sc">$</span>Category_col[<span class="fu">order</span>(bac.div.metadat<span class="sc">$</span>Category)]), <span class="at">name =</span><span class="st">&quot;Sample Category&quot;</span>, <span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span 
class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>), )<span class="sc">+</span><span class="fu">theme_classic</span>()<span class="sc">+</span><span class="fu">labs</span>(<span class="at">title =</span> <span class="st">&quot;Bacterial Species Richness by Sample Category&quot;</span>, <span class="at">x=</span><span class="st">&quot;Category&quot;</span>, <span class="at">y=</span><span class="st">&quot;Species Richness&quot;</span>, <span class="at">fill=</span><span class="st">&quot;Category&quot;</span>)<span class="sc">+</span><span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>),<span class="at">legend.title.align=</span><span class="fl">0.5</span>, <span class="at">legend.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">hjust=</span><span class="fl">0.5</span>, <span class="at">size=</span><span class="dv">15</span>))<span class="sc">+</span><span class="fu">stat_compare_means</span>(<span class="at">comparisons =</span> <span class="fu">list</span>(<span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">4</span>), <span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">3</span>), <span class="fu">c</span>(<span class="dv">1</span>,<span 
class="dv">4</span>)), <span class="at">hide.ns =</span> <span class="cn">FALSE</span>,<span class="at">label =</span> <span class="st">&quot;p.signif&quot;</span>,<span class="at">method=</span><span class="st">&quot;wilcox.test&quot;</span>)<span class="sc">+</span><span class="fu">stat_compare_means</span>(<span class="at">label.y=</span><span class="dv">3600</span>)</span>
<span id="cb35-2"><a href="#cb35-2" tabindex="-1"></a></span>
<span id="cb35-3"><a href="#cb35-3" tabindex="-1"></a><span class="fu">ggsave</span>(bac.spec.rich,<span class="at">filename =</span> <span class="st">&quot;Bacterial_species_richness.png&quot;</span>, <span class="at">width=</span><span class="dv">12</span>, <span class="at">height=</span><span class="dv">10</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
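<center>
<img src="amplicon_workflow/Bacterial_species_richness.png" alt="Boxplot of bacterial species richness by sample category with pairwise Wilcoxon and Kruskal-Wallis test results" />
</center>
<div align="center">
Figure 8c: Boxplot of Species Richness by Sample Category
</div>
<p><br /></p>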
<p>I did not want to overwhelm you with multiple pairwise group
comparisons on this figure, so you are only seeing the results of three
Wilcoxon tests comparing Clear Cut Soil to Old Growth samples, Gopher to
No Gopher samples, and No Gopher to Old Growth Samples. The
<code>*</code> indicate significance levels, with <code>*</code> being
<em>p &lt;= 0.05</em>, <code>**</code> being <em>p &lt;= 0.01</em>,
<code>***</code> is <em>p &lt;= 0.001</em>, and <code>****</code> is
<em>p &lt;= 0.0001</em>. The symbol <code>ns</code> stands for <em>not
significant</em>. Figure 8c shows us that the Clear Cut Soil samples
have a significantly higher average Species Richness than the Old Growth
samples. We can also see that our No Gopher samples are significantly
higher in species richness than the Old Growth samples, but the
difference between the species richness averages in the Gopher versus No
Gopher samples is not statistically significant. You can also see a
printed value for the Kruskal-Wallis test, which here is p = 0.00008.
This test indicates that there is a significant difference in average
species richness across all of the sample categories.</p>
</div>
</div>
<div id="beta-div-section" class="section level2" number="4.3">
<h2><span class="header-section-number">4.3</span> Beta Diversity</h2>
<div id="data-transformation" class="section level3" number="4.3.1">
<h3><span class="header-section-number">4.3.1</span> Data
Transformation</h3>
<p>Before going any further, we should transform our data. Data
transformation helps us to better interpret our data by changing the
scales in which we view our data, as well as reducing the impact of
skewed data and/or outliers in our data set. We can also perform a
transformation that normalizes our data, aka changing the distribution
of our data to be a normal (i.e., Gaussian) distribution, which is
useful for running certain statistical tests that assume normality (like
T-tests, ANOVAs). For more on why you should transform your data and
what kind of transformations are out there, check out the resources
included in this very helpful <a
href="https://medium.com/analytics-vidhya/a-guide-to-data-transformation-9e5fa9ae1ca3">Medium
article</a>. I have also found this <a
href="https://medium.com/@kyawsawhtoon/log-transformation-purpose-and-interpretation-9444b4b049c9#:~:text=Log%20transformation%20is%20a%20data,on%20the%20natural%20log%20transformation.">Medium
article</a> on log transformations helpful as well.</p>
<p>Two useful transformations I have seen used are the <strong>variance
stabilizing transformation</strong> (i.e., <strong>VST</strong>) and the
<strong>centered log-ratio</strong> transformation (i.e.,
<strong>CLR</strong>). For information on how to employ the VST,
please check out this <a
href="https://astrobiomike.github.io/amplicon/dada2_workflow_ex#beta-diversity">tutorial</a>
by the legendary bioinformatician <a
href="https://astrobiomike.github.io/research/">Dr. Mike Lee</a>. Though
I won’t be using the VST transformation, I have not found any literature
saying that the CLR transformation is better than VST. The CLR
transformation appears to be popular among statisticians, which is why I
am choosing to go this route.</p>
<p>We will use the <code>vegan</code> package to CLR transform our count
data for creating clustering dendrograms and ordinations. The CLR
transformation is recommended in the paper “Microbiome Datasets Are
Compositional: And This Is Not Optional” by <span class="citation">Gloor
et al. (2017)</span>, which proposes that microbiome data sets are
compositional, meaning they describe relationships between multiple
components. <span class="citation">Gloor et al. (2017)</span> argues
that the reason that CLR transformations are ideal for compositional
data is because 1. ratio transformations are useful for detecting
relationships in both count data and proportion data, and 2. because
log-ratios specifically make the data symmetric in a log space.</p>
<p>It’s important to change all of the 0s in our ASV table to a very
small number so that the log of 0 is not taken in our transformation
(<span class="citation">Quinn and Erb (2021)</span>). To account for
this, we include the <code>pseudocount = 1</code> argument in the
<code>decostand()</code> function, which adds a pseudocount of 1 to every
cell in our matrix (not only the cells containing <code>0s</code>), so
that no cell is 0 when the log is taken. To better understand
how CLR transformation works and why it’s a useful transformation for
microbiome data, watch this great <a
href="https://www.youtube.com/watch?v=fQPCeV4MUe4">YouTube video</a>
created by <a href="https://tpq.github.io/">Dr. Thomas Quinn</a> and
read “Normalization and microbial differential abundance strategies
depend upon data characteristics” by <span class="citation">Weiss et al.
(2017)</span>. For more information on the pros and cons of the CLR
transformation, please read “A field guide for the compositional
analysis of any-omics data” by <span class="citation">Quinn et al.
(2019)</span>.</p>
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" tabindex="-1"></a><span class="co"># CLR transformation of ASV table</span></span>
<span id="cb36-2"><a href="#cb36-2" tabindex="-1"></a><span class="co"># df must have SampleIDs as rownames and ASV IDs as columns for vegan functions below</span></span>
<span id="cb36-3"><a href="#cb36-3" tabindex="-1"></a>b.clr<span class="ot">&lt;-</span><span class="fu">decostand</span>(bac.ASV_table[,<span class="sc">-</span><span class="dv">1</span>],<span class="at">method =</span> <span class="st">&quot;clr&quot;</span>, <span class="at">pseudocount =</span> <span class="dv">1</span>) <span class="co">#CLR transformation</span></span>
<span id="cb36-4"><a href="#cb36-4" tabindex="-1"></a>b.clr[<span class="dv">1</span><span class="sc">:</span><span class="dv">4</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">4</span>]</span></code></pre></div>
</div>
<div id="hierarchical-clustering" class="section level3" number="4.3.2">
<h3><span class="header-section-number">4.3.2</span> Hierarchical
Clustering</h3>
<p>Now that we have our CLR-transformed ASV counts, we can create a
Euclidean distance matrix that will describe how close (aka similar) our
samples are to each other based on their microbial composition. The
Euclidean distance between CLR-Transformed compositional data is known
as <strong>Aitchison Distance</strong> <span class="citation">(Quinn et
al. 2018)</span>.</p>
<div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" tabindex="-1"></a><span class="co"># check rownames of CLR transformed ASV data &amp; metadata</span></span>
<span id="cb37-2"><a href="#cb37-2" tabindex="-1"></a><span class="fu">rownames</span>(b.clr) <span class="sc">%in%</span> <span class="fu">rownames</span>(meta_scaled)</span>
<span id="cb37-3"><a href="#cb37-3" tabindex="-1"></a>meta_scaled<span class="ot">=</span>meta_scaled[<span class="fu">rownames</span>(b.clr),] <span class="do">## reorder metadata to match order of CLR data</span></span>
<span id="cb37-4"><a href="#cb37-4" tabindex="-1"></a></span>
<span id="cb37-5"><a href="#cb37-5" tabindex="-1"></a><span class="co"># calculate our Euclidean distance matrix using CLR data (aka Aitchison distance)</span></span>
<span id="cb37-6"><a href="#cb37-6" tabindex="-1"></a>b.euc_dist <span class="ot">&lt;-</span> <span class="fu">dist</span>(b.clr, <span class="at">method =</span> <span class="st">&quot;euclidean&quot;</span>)</span>
<span id="cb37-7"><a href="#cb37-7" tabindex="-1"></a></span>
<span id="cb37-8"><a href="#cb37-8" tabindex="-1"></a><span class="co"># creating our hierarchical clustering dendrogram</span></span>
<span id="cb37-9"><a href="#cb37-9" tabindex="-1"></a>b.euc_clust <span class="ot">&lt;-</span> <span class="fu">hclust</span>(b.euc_dist, <span class="at">method=</span><span class="st">&quot;ward.D2&quot;</span>)</span>
<span id="cb37-10"><a href="#cb37-10" tabindex="-1"></a></span>
<span id="cb37-11"><a href="#cb37-11" tabindex="-1"></a><span class="co"># let&#39;s make it a little nicer...</span></span>
<span id="cb37-12"><a href="#cb37-12" tabindex="-1"></a>b.euc_dend <span class="ot">&lt;-</span> <span class="fu">as.dendrogram</span>(b.euc_clust, <span class="at">hang=</span><span class="fl">0.1</span>)</span>
<span id="cb37-13"><a href="#cb37-13" tabindex="-1"></a>b.dend_cols <span class="ot">&lt;-</span> <span class="fu">as.character</span>(metadata<span class="sc">$</span>Category_col[<span class="fu">order.dendrogram</span>(b.euc_dend)])</span>
<span id="cb37-14"><a href="#cb37-14" tabindex="-1"></a><span class="fu">labels_colors</span>(b.euc_dend) <span class="ot">&lt;-</span> b.dend_cols</span>
<span id="cb37-15"><a href="#cb37-15" tabindex="-1"></a></span>
<span id="cb37-16"><a href="#cb37-16" tabindex="-1"></a><span class="fu">png</span>(<span class="at">file=</span><span class="st">&quot;16S_CLR_cluster_Category.png&quot;</span>,<span class="at">width =</span> <span class="dv">1000</span>, <span class="at">height =</span> <span class="dv">900</span>, <span class="at">res=</span><span class="dv">100</span>)</span>
<span id="cb37-17"><a href="#cb37-17" tabindex="-1"></a><span class="fu">par</span>(<span class="at">cex=</span><span class="dv">1</span>)</span>
<span id="cb37-18"><a href="#cb37-18" tabindex="-1"></a><span class="fu">plot</span>(b.euc_dend, <span class="at">ylab=</span><span class="st">&quot;CLR Euclidean Distance&quot;</span>) <span class="sc">+</span> <span class="fu">title</span>(<span class="at">main =</span> <span class="st">&quot;Bacteria/Archaea Clustering Dendrogram&quot;</span>, <span class="at">sub =</span> <span class="st">&quot;Colored by Sample Category&quot;</span>, <span class="at">cex.main =</span> <span class="dv">2</span>, <span class="at">font.main=</span> <span class="dv">2</span>, <span class="at">cex.sub =</span> <span class="fl">0.8</span>, <span class="at">font.sub =</span> <span class="dv">3</span>)</span>
<span id="cb37-19"><a href="#cb37-19" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topright&quot;</span>,<span class="at">legend =</span> <span class="fu">c</span>(<span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;Old Growth&quot;</span>),<span class="at">cex=</span>.<span class="dv">8</span>,<span class="at">col =</span> <span class="fu">c</span>(<span class="st">&quot;#D00000&quot;</span>, <span class="st">&quot;#f8961e&quot;</span>, <span class="st">&quot;#4ea8de&quot;</span>, <span class="st">&quot;#283618&quot;</span>),<span class="at">pch =</span> <span class="dv">15</span>, <span class="at">bty =</span> <span class="st">&quot;n&quot;</span>)</span>
<span id="cb37-20"><a href="#cb37-20" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/16S_CLR_cluster_Category.png" />
</center>
<div align="center">
Figure 9: Hierarchical Clustering (with Centered Log-Ratio Transformed
Data)
</div>
<p><br></p>
<p>Though there are some samples not clustered within their sample
categories, overall it appears that samples from specific categories
form distinct clusters. This indicates that, generally, samples from the
same category have similar microbial community composition. We can also
see that most of the samples in the Clear Cut Soil category are more
similar to the Gopher and No Gopher samples than to the Old Growth
samples. Though this dendrogram is helpful, it’s not as informative as
other visualizations we can do.</p>
</div>
<div id="principal-coordinate-analysis-pcoa" class="section level3"
number="4.3.3">
<h3><span class="header-section-number">4.3.3</span> Principal
Coordinate Analysis (PCoA)</h3>
<p>To learn more about how these sample categories’ microbiomes compare
to one another, we can use our Euclidean distance matrix (created from
CLR transformed ASV counts) to generate an ordination known as
<strong>Principal Coordinate Analysis</strong>, aka a
<strong>PCoA</strong>.</p>
<p>I am not going to get into the math behind a PCoA, but you can learn
more by watching this excellent <a
href="https://youtu.be/GEn-_dAyYME">StatQuest YouTube video</a> and this
helpful <a
href="https://mb3is.megx.net/gustame/dissimilarity-based-methods/principal-coordinates-analysis">link</a>
that describes what a PCoA is and its uses. If you’re interested in
learning more about ordinations in general and the impacts they can have
on microbiome data, please read “Uncovering the Horseshoe Effect in
Microbial Analyses” by <span class="citation">Morton et al.
(2017)</span>.</p>
<p>Generally, a PCoA is used to look at how similar your samples are
to each other, and the variability exhibited by your samples, in a
reduced dimensional space. The closer two points are in your ordination,
the more similar they are. PCoAs yield multiple axes (i.e., principal
coordinates) that capture the variation within your data set and are
associated with certain values (i.e., <strong>eigenvalues</strong>) that
represent the magnitude of the variation for each axis. These
eigenvalues are relative representations of how important each axis of
variation is for describing the data set.</p>
<p>PCoAs were developed so that we can create these ordinations using
distances that are NOT Euclidean, such as Bray-Curtis
dissimilarities. PCoAs come from <strong>Principal Component
Analysis</strong>, which is specifically used for Euclidean distances.
For more information on PCAs, check out this <a
href="https://youtu.be/FgakZw6K1QQ">StatQuest YouTube video</a> as well
as this <a
href="https://ourcodingclub.github.io/tutorials/ordination/">tutorial</a>
that compares PCAs to PCoAs.</p>
<p>One reason PCoAs are often preferred over PCAs in microbial ecology
is that PCAs are very sensitive to the presence of 0s, which is
something we experience with microbial amplicon sequence data. For
example, if two samples are missing ASV 1 and ASV 2, these two samples
could be interpreted as being similar based on the species they are
<em>missing</em>, not the species they share. For more on comparing PCAs
to PCoAs, please check out this <a
href="https://ourcodingclub.github.io/tutorials/ordination/#:~:text=While%20PCA%20is%20based%20on,to%20the%20data%20in%20question.">link</a>.</p>
<p>Let’s generate our PCoA and a PCA and check out the proportion of
variance explained by our axes. The reason I am showing you how to
generate both is because a PCoA using Euclidean distances is equivalent
to a PCA. This is a useful way to confirm that our data is actually
Euclidean in nature.</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" tabindex="-1"></a><span class="co"># let&#39;s use our Euclidean distance matrix from before to generate a PCoA</span></span>
<span id="cb38-2"><a href="#cb38-2" tabindex="-1"></a>b.pcoa <span class="ot">&lt;-</span> <span class="fu">pcoa</span>(b.euc_dist)</span>
<span id="cb38-3"><a href="#cb38-3" tabindex="-1"></a></span>
<span id="cb38-4"><a href="#cb38-4" tabindex="-1"></a><span class="co"># Variance explained by each axis is the Relative eigen (values$Relative_eig)</span></span>
<span id="cb38-5"><a href="#cb38-5" tabindex="-1"></a>b.pcoa<span class="sc">$</span>values<span class="sc">$</span>Relative_eig</span>
<span id="cb38-6"><a href="#cb38-6" tabindex="-1"></a></span>
<span id="cb38-7"><a href="#cb38-7" tabindex="-1"></a>b.pca <span class="ot">=</span> <span class="fu">prcomp</span>(b.clr)</span>
<span id="cb38-8"><a href="#cb38-8" tabindex="-1"></a><span class="co"># Variance explained by each axis is the Proportion of Variance</span></span>
<span id="cb38-9"><a href="#cb38-9" tabindex="-1"></a>b.pca.sum<span class="ot">&lt;-</span><span class="fu">summary</span>(b.pca)</span>
<span id="cb38-10"><a href="#cb38-10" tabindex="-1"></a>b.pca.sum<span class="sc">$</span>importance </span></code></pre></div>
<p>The first axis (PC1) of variation describes 8.94% of the variance in
the entire data set. The second axis (PC2) describes 5.69% of the
variation. Our PC axes generated by our PCoA are equivalent to our axes
generated by the PCA, which is to be expected using Euclidean distances.
The first 2-3 axes describe the greatest amount of variation in the data
set, and are included in the visualization of the PCoA.</p>
<p>To visualize our PCoA with <code>ggplot2</code>, we have to extract
the principal coordinates for each sample across our axes of variation
and combine these values with our metadata. Then we can make a PCoA
ordination and color each sample ID with our variable of interest (in
this case, the Sample Category, aka <code>metadata$Category</code>). We
can also include the relative variation for each axis in our x-axis and
y-axis labels.</p>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1" tabindex="-1"></a><span class="co"># extract principal coordinates</span></span>
<span id="cb39-2"><a href="#cb39-2" tabindex="-1"></a>b.pcoa.vectors<span class="ot">&lt;-</span><span class="fu">data.frame</span>(b.pcoa<span class="sc">$</span>vectors)</span>
<span id="cb39-3"><a href="#cb39-3" tabindex="-1"></a>b.pcoa.vectors<span class="sc">$</span>SampleID<span class="ot">&lt;-</span><span class="fu">rownames</span>(b.pcoa<span class="sc">$</span>vectors)</span>
<span id="cb39-4"><a href="#cb39-4" tabindex="-1"></a></span>
<span id="cb39-5"><a href="#cb39-5" tabindex="-1"></a><span class="co"># merge pcoa coordinates w/ metadata</span></span>
<span id="cb39-6"><a href="#cb39-6" tabindex="-1"></a>b.pcoa.meta<span class="ot">&lt;-</span><span class="fu">merge</span>(b.pcoa.vectors, metadata, <span class="at">by.x=</span><span class="st">&quot;SampleID&quot;</span>, <span class="at">by.y=</span><span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb39-7"><a href="#cb39-7" tabindex="-1"></a><span class="fu">head</span>(b.pcoa.meta)</span>
<span id="cb39-8"><a href="#cb39-8" tabindex="-1"></a></span>
<span id="cb39-9"><a href="#cb39-9" tabindex="-1"></a>b.pcoa<span class="sc">$</span>values<span class="sc">$</span>Relative_eig <span class="co"># pull out relative variation % to add to axes labels</span></span>
<span id="cb39-10"><a href="#cb39-10" tabindex="-1"></a></span>
<span id="cb39-11"><a href="#cb39-11" tabindex="-1"></a><span class="co"># create PCoA ggplot fig</span></span>
<span id="cb39-12"><a href="#cb39-12" tabindex="-1"></a>pcoa1<span class="ot">&lt;-</span><span class="fu">ggplot</span>(b.pcoa.meta, <span class="fu">aes</span>(<span class="at">x=</span>Axis<span class="fl">.1</span>, <span class="at">y=</span>Axis<span class="fl">.2</span>)) <span class="sc">+</span><span class="fu">geom_point</span>(<span class="fu">aes</span>(<span class="at">color=</span><span class="fu">factor</span>(Category)), <span class="at">size=</span><span class="dv">4</span>)<span class="sc">+</span><span class="fu">theme_bw</span>()<span class="sc">+</span><span class="fu">labs</span>(<span class="at">title=</span><span class="st">&quot;PCoA: Bacteria/Archaea&quot;</span>,<span class="at">subtitle=</span><span class="st">&quot;Using Centered-Log Ratio Data&quot;</span>,<span class="at">xlab=</span><span class="st">&quot;Axis 1&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Axis 2&quot;</span>,<span class="at">color=</span><span class="st">&quot;Sample Category&quot;</span>)<span class="sc">+</span><span class="fu">theme_classic</span>()<span class="sc">+</span> <span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">15</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">15</span>),<span class="at">legend.title.align=</span><span class="fl">0.5</span>, <span class="at">legend.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">15</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">12</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>),<span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span 
class="dv">12</span>),,<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">17</span>))<span class="sc">+</span><span class="fu">guides</span>(<span class="at">shape =</span> <span class="fu">guide_legend</span>(<span class="at">override.aes =</span> <span class="fu">list</span>(<span class="at">size =</span> <span class="dv">5</span>)))<span class="sc">+</span><span class="fu">scale_color_manual</span>(<span class="at">name =</span><span class="st">&quot;Sample Category&quot;</span>, <span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>),<span class="at">values=</span><span class="fu">unique</span>(b.pcoa.meta<span class="sc">$</span>Category_col[<span class="fu">order</span>(b.pcoa.meta<span class="sc">$</span>Category)])) <span class="sc">+</span><span class="fu">xlab</span>(<span class="st">&quot;Axis 1 [8.94%]&quot;</span>) <span class="sc">+</span> <span class="fu">ylab</span>(<span class="st">&quot;Axis 2 [5.69%]&quot;</span>)</span>
<span id="cb39-13"><a href="#cb39-13" tabindex="-1"></a></span>
<span id="cb39-14"><a href="#cb39-14" tabindex="-1"></a><span class="fu">ggsave</span>(pcoa1,<span class="at">filename =</span> <span class="st">&quot;16S_pcoa_CLR.png&quot;</span>, <span class="at">width=</span><span class="dv">12</span>, <span class="at">height=</span><span class="dv">10</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span>
<span id="cb39-15"><a href="#cb39-15" tabindex="-1"></a></span>
<span id="cb39-16"><a href="#cb39-16" tabindex="-1"></a><span class="do">## ggplot code break down:</span></span>
<span id="cb39-17"><a href="#cb39-17" tabindex="-1"></a><span class="co"># guides(shape = guide_legend(override.aes = list(size = 5))) -- make shapes in legend bigger</span></span>
<span id="cb39-18"><a href="#cb39-18" tabindex="-1"></a><span class="co"># scale_color_manual(name =, labels=c(),values=unique(b.pcoa.meta$Category_col[order(b.pcoa.meta$Category)])) -- organize color variable in our data frame by the category its associated with</span></span>
<span id="cb39-19"><a href="#cb39-19" tabindex="-1"></a><span class="co"># xlab() -- x-axis label</span></span>
<span id="cb39-20"><a href="#cb39-20" tabindex="-1"></a><span class="co"># ylab() -- y-axis label</span></span></code></pre></div>
<center>
<img src="amplicon_workflow/16S_pcoa_CLR.png" />
</center>
<div align="center">
Figure 10: Principal Coordinates Analysis, Colored by Sample Category
</div>
<p><br></p>
<p>From this PCoA we can tell that the microbial community compositions
of the Old Growth samples are similar to one another, forming a tight
cluster in the PCoA (points in dark green). The microbial compositions of
the Clear Cut Soil samples are also similar to one another (points in
red). Interestingly, the Gopher and No Gopher samples cluster together,
indicating that their microbial communities are similar to each other,
regardless of whether a gopher was introduced to the soil or not. It is
important to keep in mind that though we see distinct clusters by sample
category, the variation explained by these two axes is quite low (Axis 1
- 8.94%, Axis 2 - 5.69%).</p>
<p>So this information is helpful, but we are not sure if our categories
are significantly different from each other. To test this, we
first need to check the dispersion (aka variance) of the composition
data within each group to see if we can even compare these groups to
each other. Basically we are finding the spatial median or the
<em>centroid</em> of each group in multivariate space, and calculating
the distance from each point to the centroid within a respective group
or category. The actual distances are reduced to principal coordinates
(as is done in a PCA or PCoA) before the distances from each group are
compared. We then can use an ANOVA as well as a <strong>Tukey’s Honest
Significant Difference Test</strong> (aka Tukey’s HSD) to statistically
compare the group dispersions.</p>
<p>To check out our group dispersions and whether or not they are
homogeneous (equal/similar to each other in their variance), we will use
the <code>betadisper()</code> function from the <code>vegan</code>
package. We can then compare the axes of dispersion with the
<code>anova()</code> function and compare the spatial means with the
<code>TukeyHSD()</code> function.</p>
<div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" tabindex="-1"></a><span class="co"># create CLR Sample x Species matrix for input into dist()</span></span>
<span id="cb40-2"><a href="#cb40-2" tabindex="-1"></a>b.clr<span class="ot">&lt;-</span><span class="fu">as.matrix</span>(<span class="fu">t</span>(asv_clr))</span>
<span id="cb40-3"><a href="#cb40-3" tabindex="-1"></a><span class="fu">rownames</span>(b.clr)</span>
<span id="cb40-4"><a href="#cb40-4" tabindex="-1"></a><span class="fu">rownames</span>(metadata)</span>
<span id="cb40-5"><a href="#cb40-5" tabindex="-1"></a><span class="co"># reorder the transformed ASV table to match order of metadata data frame</span></span>
<span id="cb40-6"><a href="#cb40-6" tabindex="-1"></a>b.clr<span class="ot">=</span>b.clr[<span class="fu">rownames</span>(metadata),] <span class="co"># reorder both dfs by row names</span></span>
<span id="cb40-7"><a href="#cb40-7" tabindex="-1"></a><span class="co"># sanity check</span></span>
<span id="cb40-8"><a href="#cb40-8" tabindex="-1"></a><span class="fu">rownames</span>(b.clr)</span>
<span id="cb40-9"><a href="#cb40-9" tabindex="-1"></a><span class="fu">rownames</span>(metadata)</span>
<span id="cb40-10"><a href="#cb40-10" tabindex="-1"></a></span>
<span id="cb40-11"><a href="#cb40-11" tabindex="-1"></a><span class="co"># calculate our Euclidean distance matrix using CLR data (aka Aitchison distance)</span></span>
<span id="cb40-12"><a href="#cb40-12" tabindex="-1"></a>b.euc_dist <span class="ot">&lt;-</span> <span class="fu">dist</span>(b.clr, <span class="at">method =</span> <span class="st">&quot;euclidean&quot;</span>)</span>
<span id="cb40-13"><a href="#cb40-13" tabindex="-1"></a>b.disper<span class="ot">&lt;-</span><span class="fu">betadisper</span>(b.euc_dist, metadata<span class="sc">$</span>Category)</span>
<span id="cb40-14"><a href="#cb40-14" tabindex="-1"></a>b.disper</span>
<span id="cb40-15"><a href="#cb40-15" tabindex="-1"></a></span>
<span id="cb40-16"><a href="#cb40-16" tabindex="-1"></a><span class="fu">permutest</span>(b.disper, <span class="at">pairwise=</span><span class="cn">TRUE</span>) <span class="co"># compare dispersions to each other via permutation test to see significant differences in dispersion by pairwise comparisons</span></span>
<span id="cb40-17"><a href="#cb40-17" tabindex="-1"></a></span>
<span id="cb40-18"><a href="#cb40-18" tabindex="-1"></a><span class="fu">anova</span>(b.disper) <span class="co"># p = 0.0001394 --&gt; reject the Null H, group dispersions (distances to centroid) are significantly different across Categories</span></span>
<span id="cb40-19"><a href="#cb40-19" tabindex="-1"></a></span>
<span id="cb40-20"><a href="#cb40-20" tabindex="-1"></a><span class="fu">TukeyHSD</span>(b.disper) <span class="co"># tells us which Category&#39;s dispersion MEANS are significantly different than each other</span></span></code></pre></div>
<p>The ANOVA results tell us that our dispersions by category are
significantly different from each other (p=0.0001394), meaning the
variance within each category is not homogeneous. We can visualize this
comparison as well via an ordination (calculated by
<code>betadisper()</code>) and a boxplot based on the distance to the
centroid for each group.</p>
<div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb41-1"><a href="#cb41-1" tabindex="-1"></a><span class="co"># Visualize dispersions</span></span>
<span id="cb41-2"><a href="#cb41-2" tabindex="-1"></a><span class="fu">png</span>(<span class="st">&#39;pcoa_betadispersion.png&#39;</span>,<span class="at">width =</span> <span class="dv">700</span>, <span class="at">height =</span> <span class="dv">600</span>, <span class="at">res=</span><span class="dv">100</span>)</span>
<span id="cb41-3"><a href="#cb41-3" tabindex="-1"></a><span class="fu">plot</span>(b.disper,<span class="at">main =</span> <span class="st">&quot;Centroids and Dispersion based on Aitchison Distance&quot;</span>, <span class="at">col=</span>colorset1<span class="sc">$</span>Category_col)</span>
<span id="cb41-4"><a href="#cb41-4" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/pcoa_betadispersion.png" />
</center>
<div align="center">
Figure 11a: Principal Coordinates Analysis w/ <code>betadisper()</code>.
Colored by Sample Category
</div>
<p><br></p>
<div class="sourceCode" id="cb42"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" tabindex="-1"></a><span class="fu">png</span>(<span class="st">&#39;boxplot_centroid_distance.png&#39;</span>,<span class="at">width =</span> <span class="dv">700</span>, <span class="at">height =</span> <span class="dv">600</span>, <span class="at">res=</span><span class="dv">100</span>)</span>
<span id="cb42-2"><a href="#cb42-2" tabindex="-1"></a><span class="fu">boxplot</span>(b.disper,<span class="at">xlab=</span><span class="st">&quot;Sample Category&quot;</span>, <span class="at">main =</span> <span class="st">&quot;Distance to Centroid by Category&quot;</span>, <span class="at">sub=</span><span class="st">&quot;Based on Aitchison Distance&quot;</span>, <span class="at">names=</span><span class="fu">c</span>(<span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;Old Growth&quot;</span>), <span class="at">col=</span>colorset1<span class="sc">$</span>Category_col)</span>
<span id="cb42-3"><a href="#cb42-3" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/boxplot_centroid_distance.png" />
</center>
<div align="center">
Figure 11b: Distance to Centroid of Dispersion. Colored by Sample
Category
</div>
<p><br></p>
<p>The reason that our dispersion results are problematic is that if we
try to compare the groups using a <strong>Permutational Multivariate
Analysis of Variance</strong> (aka a <strong>PERMANOVA</strong>), the significant
differences we may see between groups could be attributed to their
unequal variances (i.e., dispersion effects) rather than actual
differences in the community compositions by category.</p>
</div>
<div id="permanova-w-aitchison-distance" class="section level3"
number="4.3.4">
<h3><span class="header-section-number">4.3.4</span> PERMANOVA w/
Aitchison Distance</h3>
<p>A PERMANOVA is similar to an ANOVA in that both analyses compare
differences between groups (using <em>sum-of-squares</em>), but a
PERMANOVA runs multiple permutations to compare these <em>distances</em>
to each other - whereas an ANOVA is comparing group <em>averages</em> to
each other without the use of permutations. Another difference is that
while ANOVAs assume that the data is normally distributed, the PERMANOVA
assumes that the groups have equal variance (dispersion). For more
information on PERMANOVAs and comparing group variances, check out this
very helpful <a
href="https://archetypalecology.wordpress.com/2018/02/21/permutational-multivariate-analysis-of-variance-permanova-in-r-preliminary/">link</a>
by <a
href="https://scholar.google.com/citations?user=0U4m9BUAAAAJ&amp;hl=en">Dr. Joshua
Ebner</a>.</p>
<p>Even though we should not run a PERMANOVA with these data, let’s go
over how we would run a PERMANOVA. <strong>The most crucial thing about
running a PERMANOVA in R is that your feature table and your metadata
need to be in the same order by row!</strong> The program does not know
to match up sample IDs or labels to each other, so you have to confirm
that your data frames are arranged in the same way by row before running
the PERMANOVA. We can then run our PERMANOVA including multiple
variables of interest. We can test the separate (additive) effect of
each variable on our compositional data with
<code>+</code> (e.g., <code>var1 + var2</code>), or we can additionally
test for interactions between our variables with
<code>*</code> (e.g., <code>var1 * var2</code>).</p>
<p><strong>NOTE</strong>: Using <code>adonis2()</code> with
<em>Continuous</em> Variables! The variation explained is directly
analogous to that of <strong>general linear models</strong>. With a
continuous variable, the PERMANOVA acts like simple linear regression,
where each point is associated with its own “centroid” which is the best
fit linear approximation. For a more in-depth explanation of this,
please check out this <a
href="https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permanova/">link</a>.</p>
<div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" tabindex="-1"></a><span class="co"># check your rownames and the order of the rownames for ordering step</span></span>
<span id="cb43-2"><a href="#cb43-2" tabindex="-1"></a><span class="fu">rownames</span>(b.clr)</span>
<span id="cb43-3"><a href="#cb43-3" tabindex="-1"></a><span class="fu">rownames</span>(metadata)</span>
<span id="cb43-4"><a href="#cb43-4" tabindex="-1"></a><span class="co"># in case you need to reorder</span></span>
<span id="cb43-5"><a href="#cb43-5" tabindex="-1"></a>b.clr<span class="ot">=</span>b.clr[<span class="fu">rownames</span>(metadata),]</span>
<span id="cb43-6"><a href="#cb43-6" tabindex="-1"></a><span class="co"># sanity check</span></span>
<span id="cb43-7"><a href="#cb43-7" tabindex="-1"></a><span class="fu">rownames</span>(b.clr)</span>
<span id="cb43-8"><a href="#cb43-8" tabindex="-1"></a><span class="fu">rownames</span>(metadata)</span>
<span id="cb43-9"><a href="#cb43-9" tabindex="-1"></a></span>
<span id="cb43-10"><a href="#cb43-10" tabindex="-1"></a><span class="co"># PERMANOVA requires assumption of homogeneous within-group dispersions to ensure observed differences in groups are real</span></span>
<span id="cb43-11"><a href="#cb43-11" tabindex="-1"></a>perm1<span class="ot">&lt;-</span><span class="fu">adonis2</span>(b.clr <span class="sc">~</span> Category, <span class="at">data =</span> metadata, <span class="at">permutations =</span> <span class="dv">999</span>, <span class="at">method=</span><span class="st">&quot;euclid&quot;</span>, <span class="at">by=</span><span class="st">&#39;terms&#39;</span>) <span class="co"># single predictor: tests the effect of Category on community composition</span></span>
<span id="cb43-12"><a href="#cb43-12" tabindex="-1"></a>perm1</span>
<span id="cb43-13"><a href="#cb43-13" tabindex="-1"></a></span>
<span id="cb43-14"><a href="#cb43-14" tabindex="-1"></a>perm2<span class="ot">&lt;-</span><span class="fu">adonis2</span>(b.clr <span class="sc">~</span> Category<span class="sc">*</span>layer, <span class="at">data =</span> metadata, <span class="at">permutations =</span> <span class="dv">999</span>, <span class="at">method=</span><span class="st">&quot;euclid&quot;</span>, <span class="at">by=</span><span class="st">&#39;terms&#39;</span>)</span>
<span id="cb43-15"><a href="#cb43-15" tabindex="-1"></a><span class="co"># * -&gt; for interactions between predictor variables; + -&gt; interactions with multiple variables but not between them or combined interactions</span></span>
<span id="cb43-16"><a href="#cb43-16" tabindex="-1"></a>perm2</span>
<span id="cb43-17"><a href="#cb43-17" tabindex="-1"></a><span class="fu">adonis2</span>(b.clr <span class="sc">~</span> Category<span class="sc">+</span>layer, <span class="at">data =</span> metadata, <span class="at">permutations =</span> <span class="dv">999</span>, <span class="at">method=</span><span class="st">&quot;euclid&quot;</span>, <span class="at">by=</span><span class="st">&#39;terms&#39;</span>)</span>
<span id="cb43-18"><a href="#cb43-18" tabindex="-1"></a></span>
<span id="cb43-19"><a href="#cb43-19" tabindex="-1"></a><span class="co"># export PERMANOVA results to csv</span></span>
<span id="cb43-20"><a href="#cb43-20" tabindex="-1"></a>perm1_results<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="at">DF=</span>perm1<span class="sc">$</span>Df, <span class="at">SumofSqs=</span>perm1<span class="sc">$</span>SumOfSqs, <span class="at">R2=</span>perm1<span class="sc">$</span>R2, <span class="at">F=</span>perm1<span class="sc">$</span>F, <span class="at">p=</span>perm1<span class="sc">$</span><span class="st">`</span><span class="at">Pr(&gt;F)</span><span class="st">`</span>)</span>
<span id="cb43-21"><a href="#cb43-21" tabindex="-1"></a><span class="fu">rownames</span>(perm1_results)<span class="ot">&lt;-</span><span class="fu">rownames</span>(perm1)</span>
<span id="cb43-22"><a href="#cb43-22" tabindex="-1"></a>perm1_results</span>
<span id="cb43-23"><a href="#cb43-23" tabindex="-1"></a><span class="fu">write.csv</span>(perm1_results,<span class="st">&quot;16S_PERMANOVA_Results.csv&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>)</span>
<span id="cb43-24"><a href="#cb43-24" tabindex="-1"></a></span>
<span id="cb43-25"><a href="#cb43-25" tabindex="-1"></a><span class="co"># save PERMANOVA results in a nice table</span></span>
<span id="cb43-26"><a href="#cb43-26" tabindex="-1"></a>tab <span class="ot">&lt;-</span> <span class="fu">ggtexttable</span>(perm1_results, <span class="at">theme =</span> <span class="fu">ttheme</span>(<span class="st">&quot;light&quot;</span>))</span>
<span id="cb43-27"><a href="#cb43-27" tabindex="-1"></a>tab2<span class="ot">&lt;-</span> tab <span class="sc">%&gt;%</span></span>
<span id="cb43-28"><a href="#cb43-28" tabindex="-1"></a>  <span class="fu">tab_add_title</span>(<span class="at">text =</span> <span class="st">&quot;PERMANOVA Results: Composition ~ Category&quot;</span>, <span class="at">face =</span> <span class="st">&quot;bold&quot;</span>, <span class="at">padding =</span> <span class="fu">unit</span>(<span class="dv">1</span>, <span class="st">&quot;line&quot;</span>)) <span class="sc">%&gt;%</span></span>
<span id="cb43-29"><a href="#cb43-29" tabindex="-1"></a>  <span class="fu">tab_add_footnote</span>(<span class="at">text =</span> <span class="st">&quot;Using Euclidean Distance of CLR-Transformed Data (Aitchison Distance)&quot;</span>, <span class="at">size =</span> <span class="dv">10</span>, <span class="at">face =</span> <span class="st">&quot;italic&quot;</span>)</span>
<span id="cb43-30"><a href="#cb43-30" tabindex="-1"></a></span>
<span id="cb43-31"><a href="#cb43-31" tabindex="-1"></a><span class="co"># save table as png</span></span>
<span id="cb43-32"><a href="#cb43-32" tabindex="-1"></a><span class="fu">png</span>(<span class="st">&#39;permanova_table_test.png&#39;</span>,<span class="at">width =</span> <span class="dv">700</span>, <span class="at">height =</span> <span class="dv">600</span>, <span class="at">res=</span><span class="dv">200</span>)</span>
<span id="cb43-33"><a href="#cb43-33" tabindex="-1"></a>tab2</span>
<span id="cb43-34"><a href="#cb43-34" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/permanova_results_screenshot.png" />
</center>
<div align="center">
Figure 12: Table of PERMANOVA Results (by Category)
</div>
<p><br /></p>
<p>As predicted by the <code>betadisper()</code> results, we are seeing
a significant difference in community composition between our groups.
Again, for this data set we can’t really know if these differences are
meaningful because the within-group dispersions (aka variances) are NOT
homogeneous - so the significance we are seeing here is likely due to
dispersion effects rather than real differences between groups.</p>
</div>
</div>
<div id="relative-abundance" class="section level2" number="4.4">
<h2><span class="header-section-number">4.4</span> Relative
Abundance</h2>
<p>So we know our microbial communities are similar to each other if
they’re from the same sample category, and that the microbiomes from
each category are quite different from each other…but what about the
actual taxa within those communities? Calculating the relative abundance
allows us to determine how abundant certain taxa are compared to the
rest of the taxa within a specific sample. In order to get a better
sense of the microbial community in our samples, we need to calculate
the relative abundance at varying taxonomic levels. We can also
calculate the relative abundance by group; for example, we can determine
the relative contribution of taxa 1 compared to the rest of the
microbial community in Group A.</p>
<p>Let’s merge our ASV taxonomy table to our metadata table. This will
help us create a data frame that we can use to get the sum of our ASV
counts across specific taxonomic levels.</p>
<p><strong>NOTE</strong>: Here we are merging the original ASV counts
and ASV taxa data frames. If you followed the <code>decontam()</code>
step, you should be merging the CLEAN versions of these objects. There
is a commented-out line of code in the section below that shows this
step.</p>
<div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1" tabindex="-1"></a><span class="co">#bac.all&lt;-merge(bac.ASV_counts_CLEAN, bac.ASV_taxa_CLEAN, by=&quot;ASV_ID&quot;)</span></span>
<span id="cb44-2"><a href="#cb44-2" tabindex="-1"></a><span class="co"># bac.all&lt;-merge(bac.ASV_counts, bac.ASV_taxa, by=&quot;ASV_ID&quot;)</span></span>
<span id="cb44-3"><a href="#cb44-3" tabindex="-1"></a><span class="fu">head</span>(bac.ASV_all)</span>
<span id="cb44-4"><a href="#cb44-4" tabindex="-1"></a>bac_melt<span class="ot">&lt;-</span><span class="fu">melt</span>(bac.all, <span class="at">id.vars =</span> <span class="fu">c</span>(<span class="st">&quot;ASV_ID&quot;</span>, <span class="st">&quot;Kingdom&quot;</span>, <span class="st">&quot;Phylum&quot;</span>, <span class="st">&quot;Class&quot;</span>, <span class="st">&quot;Order&quot;</span>, <span class="st">&quot;Family&quot;</span>, <span class="st">&quot;Genus&quot;</span>, <span class="st">&quot;Species&quot;</span>))</span>
<span id="cb44-5"><a href="#cb44-5" tabindex="-1"></a><span class="fu">head</span>(bac_melt)</span>
<span id="cb44-6"><a href="#cb44-6" tabindex="-1"></a><span class="fu">names</span>(bac_melt)[<span class="fu">which</span>(<span class="fu">names</span>(bac_melt) <span class="sc">==</span> <span class="st">&quot;variable&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;SampleID&quot;</span></span>
<span id="cb44-7"><a href="#cb44-7" tabindex="-1"></a><span class="fu">names</span>(bac_melt)[<span class="fu">which</span>(<span class="fu">names</span>(bac_melt) <span class="sc">==</span> <span class="st">&quot;value&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Counts&quot;</span></span>
<span id="cb44-8"><a href="#cb44-8" tabindex="-1"></a><span class="fu">head</span>(bac_melt)</span>
<span id="cb44-9"><a href="#cb44-9" tabindex="-1"></a></span>
<span id="cb44-10"><a href="#cb44-10" tabindex="-1"></a><span class="fu">head</span>(metadata)</span>
<span id="cb44-11"><a href="#cb44-11" tabindex="-1"></a></span>
<span id="cb44-12"><a href="#cb44-12" tabindex="-1"></a>all_bac<span class="ot">&lt;-</span><span class="fu">merge</span>(bac_melt, metadata, <span class="at">by =</span> <span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb44-13"><a href="#cb44-13" tabindex="-1"></a><span class="fu">head</span>(all_bac) <span class="co"># contains metadata, ASV counts, and taxonomic IDs for ASVs</span></span></code></pre></div>
<p>Using the <code>all_bac</code> data frame containing our data and
metadata altogether and the <code>dcast()</code> function from the
<code>reshape2</code> package <span class="citation">(Wickham
2007)</span>, we can create data frames that contain counts by taxonomic
levels. We will then use these data frames to get the relative abundance
at specific taxonomic levels. Because I am using a bacterial data set, I
am only going to calculate the relative abundance at the phyla and class
levels. Though we could get deeper taxonomic resolution, the
visualization at these levels can be really overwhelming because there
are SO many taxa present.</p>
<p>Remember that when calculating your relative abundance, each count
per taxa in each sample is divided by the sum of all the counts for that
sample. This means that when you sum up all of the relativized values
for each sample, they should sum up to 1. We can use this fact to help
us check whether our relativization worked.</p>
<div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1" tabindex="-1"></a><span class="co"># use dcast to count up ASVs within each Phylum across all of the samples</span></span>
<span id="cb45-2"><a href="#cb45-2" tabindex="-1"></a>b.phyla_counts <span class="ot">&lt;-</span> <span class="fu">as.data.frame</span>(<span class="fu">dcast</span>(all_bac, SampleID<span class="sc">~</span>Phylum, <span class="at">value.var=</span><span class="st">&quot;Counts&quot;</span>, <span class="at">fun.aggregate=</span>sum)) <span class="do">###</span></span>
<span id="cb45-3"><a href="#cb45-3" tabindex="-1"></a><span class="fu">head</span>(b.phyla_counts) <span class="co"># counts by phyla per sample</span></span>
<span id="cb45-4"><a href="#cb45-4" tabindex="-1"></a><span class="fu">rownames</span>(b.phyla_counts)<span class="ot">&lt;-</span>b.phyla_counts<span class="sc">$</span>SampleID</span>
<span id="cb45-5"><a href="#cb45-5" tabindex="-1"></a></span>
<span id="cb45-6"><a href="#cb45-6" tabindex="-1"></a>b.phyla_RelAb<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="fu">decostand</span>(b.phyla_counts[,<span class="sc">-</span><span class="dv">1</span>], <span class="at">method=</span><span class="st">&quot;total&quot;</span>, <span class="at">MARGIN=</span><span class="dv">1</span>, <span class="at">na.rm=</span><span class="cn">TRUE</span>))  </span>
<span id="cb45-7"><a href="#cb45-7" tabindex="-1"></a><span class="co"># relative abundance of taxa data where everything is divided by row total (b/c Margin=1 meaning rows == SAMPLES in this case)</span></span>
<span id="cb45-8"><a href="#cb45-8" tabindex="-1"></a><span class="fu">rowSums</span>(b.phyla_RelAb) <span class="co"># sanity check to make sure the transformation worked! All rowsums should = 1</span></span>
<span id="cb45-9"><a href="#cb45-9" tabindex="-1"></a></span>
<span id="cb45-10"><a href="#cb45-10" tabindex="-1"></a>b.phyla_RelAb<span class="sc">$</span>SampleID<span class="ot">&lt;-</span><span class="fu">rownames</span>(b.phyla_RelAb)</span>
<span id="cb45-11"><a href="#cb45-11" tabindex="-1"></a><span class="fu">head</span>(b.phyla_RelAb)</span>
<span id="cb45-12"><a href="#cb45-12" tabindex="-1"></a><span class="fu">write.csv</span>(b.phyla_RelAb,<span class="st">&quot;16S_Phyla_Relative_Abundance.csv&quot;</span>,<span class="at">row.names=</span><span class="cn">TRUE</span>) <span class="co"># good to save just in case</span></span>
<span id="cb45-13"><a href="#cb45-13" tabindex="-1"></a></span>
<span id="cb45-14"><a href="#cb45-14" tabindex="-1"></a><span class="co"># melt down relativized data to merge with metadata</span></span>
<span id="cb45-15"><a href="#cb45-15" tabindex="-1"></a>b.phyla_m<span class="ot">&lt;-</span><span class="fu">melt</span>(b.phyla_RelAb)</span>
<span id="cb45-16"><a href="#cb45-16" tabindex="-1"></a></span>
<span id="cb45-17"><a href="#cb45-17" tabindex="-1"></a><span class="fu">head</span>(b.phyla_m)</span>
<span id="cb45-18"><a href="#cb45-18" tabindex="-1"></a><span class="fu">colnames</span>(b.phyla_m)[<span class="fu">which</span>(<span class="fu">names</span>(b.phyla_m) <span class="sc">==</span> <span class="st">&quot;variable&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Phylum&quot;</span></span>
<span id="cb45-19"><a href="#cb45-19" tabindex="-1"></a><span class="fu">colnames</span>(b.phyla_m)[<span class="fu">which</span>(<span class="fu">names</span>(b.phyla_m) <span class="sc">==</span> <span class="st">&quot;value&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Counts&quot;</span></span>
<span id="cb45-20"><a href="#cb45-20" tabindex="-1"></a><span class="fu">head</span>(b.phyla_m) <span class="do">## relative abundance based on sum of counts by phyla!</span></span>
<span id="cb45-21"><a href="#cb45-21" tabindex="-1"></a></span>
<span id="cb45-22"><a href="#cb45-22" tabindex="-1"></a>b.phyla_RA_meta<span class="ot">&lt;-</span><span class="fu">merge</span>(b.phyla_m,metadata, <span class="at">by=</span><span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb45-23"><a href="#cb45-23" tabindex="-1"></a><span class="fu">head</span>(b.phyla_RA_meta) <span class="do">## relative abundance based on sum of counts by phyla!</span></span></code></pre></div>
<p>Now that we have our relativized phyla counts, let’s visualize the
relativized counts by phyla using a stacked barplot.</p>
<div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" tabindex="-1"></a>b.phy_RA<span class="ot">&lt;-</span><span class="fu">ggplot</span>(b.phyla_RA_meta, <span class="fu">aes</span>(<span class="at">x=</span>SampleID, <span class="at">y=</span>Counts, <span class="at">fill=</span>Phylum))<span class="sc">+</span><span class="fu">geom_bar</span>(<span class="at">stat=</span><span class="st">&quot;identity&quot;</span>,<span class="at">colour=</span><span class="st">&quot;black&quot;</span>)<span class="sc">+</span><span class="fu">scale_x_discrete</span>()<span class="sc">+</span><span class="fu">theme_classic</span>()<span class="sc">+</span></span>
<span id="cb46-2"><a href="#cb46-2" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">&quot;Microbial Phylum Relative Abundance&quot;</span>, <span class="at">x=</span><span class="st">&quot;SampleID&quot;</span>, <span class="at">y=</span><span class="st">&quot;Relative Abundance&quot;</span>, <span class="at">fill=</span><span class="st">&quot;Phylum&quot;</span>)<span class="sc">+</span></span>
<span id="cb46-3"><a href="#cb46-3" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>,<span class="at">angle=</span><span class="dv">90</span>),<span class="at">legend.title.align=</span><span class="fl">0.5</span>, <span class="at">legend.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">15</span>))</span>
<span id="cb46-4"><a href="#cb46-4" tabindex="-1"></a></span>
<span id="cb46-5"><a href="#cb46-5" tabindex="-1"></a><span class="fu">ggsave</span>(b.phy_RA,<span class="at">filename =</span> <span class="st">&quot;16S_phyla_RA.png&quot;</span>, <span class="at">width=</span><span class="dv">17</span>, <span class="at">height=</span><span class="dv">10</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
<center>
<img src="amplicon_workflow/16S_phyla_RA.png" alt="Stacked barplot of relative abundance by phyla across samples" />
</center>
<div align="center">
Figure 13: Relative Abundance by Phyla Across Samples
</div>
<p><br /></p>
<p>We can see that across samples, the two most relatively abundant
phyla are Abditibacteriota in coral, and Proteobacteria in light purple.
Though this is helpful, it’s hard to determine which categories our
samples belong to.</p>
<div class="sourceCode" id="cb47"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1" tabindex="-1"></a><span class="fu">head</span>(all_bac)</span>
<span id="cb47-2"><a href="#cb47-2" tabindex="-1"></a><span class="co"># by class + elevation</span></span>
<span id="cb47-3"><a href="#cb47-3" tabindex="-1"></a>bac.cls <span class="ot">&lt;-</span> <span class="fu">as.data.frame</span>(<span class="fu">dcast</span>(all_bac,SampleID<span class="sc">~</span>Class, <span class="at">value.var=</span><span class="st">&quot;Counts&quot;</span>, <span class="at">fun.aggregate=</span>sum)) <span class="do">### </span></span>
<span id="cb47-4"><a href="#cb47-4" tabindex="-1"></a><span class="fu">head</span>(bac.cls) <span class="co"># counts by class + elevation</span></span>
<span id="cb47-5"><a href="#cb47-5" tabindex="-1"></a><span class="fu">rownames</span>(bac.cls)<span class="ot">&lt;-</span>bac.cls<span class="sc">$</span>SampleID</span>
<span id="cb47-6"><a href="#cb47-6" tabindex="-1"></a></span>
<span id="cb47-7"><a href="#cb47-7" tabindex="-1"></a>b.RA_cls<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="fu">decostand</span>(bac.cls[,<span class="sc">-</span><span class="dv">1</span>], <span class="at">method=</span><span class="st">&quot;total&quot;</span>, <span class="at">MARGIN=</span><span class="dv">1</span>, <span class="at">na.rm=</span><span class="cn">TRUE</span>))  </span>
<span id="cb47-8"><a href="#cb47-8" tabindex="-1"></a><span class="co"># relative abundance of taxa data where everything is divided by margin total (default MARGIN = 1 = rows) -- rows = samples</span></span>
<span id="cb47-9"><a href="#cb47-9" tabindex="-1"></a><span class="co"># bac.cls[,-1] -- drop first column to not be included in relative abundance calculation</span></span>
<span id="cb47-10"><a href="#cb47-10" tabindex="-1"></a></span>
<span id="cb47-11"><a href="#cb47-11" tabindex="-1"></a><span class="fu">rowSums</span>(b.RA_cls) <span class="co"># sanity check</span></span>
<span id="cb47-12"><a href="#cb47-12" tabindex="-1"></a>b.RA_cls<span class="sc">$</span>SampleID<span class="ot">&lt;-</span><span class="fu">rownames</span>(b.RA_cls)</span>
<span id="cb47-13"><a href="#cb47-13" tabindex="-1"></a><span class="fu">head</span>(b.RA_cls)</span>
<span id="cb47-14"><a href="#cb47-14" tabindex="-1"></a></span>
<span id="cb47-15"><a href="#cb47-15" tabindex="-1"></a><span class="co">#melt down relativized data to merge with metadata</span></span>
<span id="cb47-16"><a href="#cb47-16" tabindex="-1"></a>b.cls_m<span class="ot">&lt;-</span><span class="fu">melt</span>(b.RA_cls, <span class="at">by=</span><span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb47-17"><a href="#cb47-17" tabindex="-1"></a></span>
<span id="cb47-18"><a href="#cb47-18" tabindex="-1"></a><span class="fu">head</span>(b.cls_m)</span>
<span id="cb47-19"><a href="#cb47-19" tabindex="-1"></a><span class="fu">colnames</span>(b.cls_m)[<span class="fu">which</span>(<span class="fu">names</span>(b.cls_m) <span class="sc">==</span> <span class="st">&quot;variable&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Class&quot;</span></span>
<span id="cb47-20"><a href="#cb47-20" tabindex="-1"></a><span class="fu">colnames</span>(b.cls_m)[<span class="fu">which</span>(<span class="fu">names</span>(b.cls_m) <span class="sc">==</span> <span class="st">&quot;value&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Counts&quot;</span></span>
<span id="cb47-21"><a href="#cb47-21" tabindex="-1"></a><span class="fu">head</span>(b.cls_m) <span class="do">## relative abundance based on sum of counts by class!</span></span></code></pre></div>
<div id="taxonomic-summaries" class="section level3" number="4.4.1">
<h3><span class="header-section-number">4.4.1</span> Taxonomic
Summaries</h3>
<p>Now that we have calculated the relativized counts by class according
to our sample categories, let’s visualize it! Instead of a stacked
barplot, we could visualize this in a different way - shout out to
Dr. Mike Lee for sharing this technique in his amplicon <a
href="https://astrobiomike.github.io/amplicon/dada2_workflow_ex">tutorial</a>!</p>
<p>Dr. Lee visualized the proportions of gene copies recovered in a way
that he calls <strong>taxonomic summaries</strong>. These summaries
allow us to view the relative abundance (or gene copy number, or any
abundance measurement) of our microbes by having our x-axis be the taxa
themselves rather than the sample IDs or the group labels. Personally I
find these taxonomic summaries useful for comparing the relative
abundance of specific taxa across multiple groups. To provide an example
that’s easy to look at, we are going to view microbial classes by
category that have a relative abundance greater than 1% per
sample.</p>
<div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" tabindex="-1"></a>cat_meta<span class="ot">&lt;-</span><span class="fu">unique</span>(<span class="fu">data.frame</span>(<span class="st">&quot;Category&quot;</span><span class="ot">=</span>metadata<span class="sc">$</span>Category, <span class="st">&quot;Category_col&quot;</span><span class="ot">=</span>metadata<span class="sc">$</span>Category_col, <span class="st">&quot;SampleID&quot;</span><span class="ot">=</span>metadata<span class="sc">$</span>SampleID))</span>
<span id="cb48-2"><a href="#cb48-2" tabindex="-1"></a>cls_cat_meta<span class="ot">&lt;-</span><span class="fu">merge</span>(cat_meta,b.cls_m, <span class="at">by=</span><span class="st">&quot;SampleID&quot;</span>)</span>
<span id="cb48-3"><a href="#cb48-3" tabindex="-1"></a><span class="fu">head</span>(cls_cat_meta)</span>
<span id="cb48-4"><a href="#cb48-4" tabindex="-1"></a></span>
<span id="cb48-5"><a href="#cb48-5" tabindex="-1"></a><span class="co"># Subset data to include only classes with relative abundance &gt; 1%</span></span>
<span id="cb48-6"><a href="#cb48-6" tabindex="-1"></a>cls_cat<span class="fl">.1</span><span class="ot">&lt;-</span><span class="fu">subset</span>(cls_cat_meta, cls_cat_meta<span class="sc">$</span>Counts<span class="sc">&gt;</span><span class="fl">0.01</span>)</span>
<span id="cb48-7"><a href="#cb48-7" tabindex="-1"></a></span>
<span id="cb48-8"><a href="#cb48-8" tabindex="-1"></a>ts1<span class="ot">&lt;-</span><span class="fu">ggplot</span>(cls_cat<span class="fl">.1</span>, <span class="fu">aes</span>(Class, Counts)) <span class="sc">+</span></span>
<span id="cb48-9"><a href="#cb48-9" tabindex="-1"></a>  <span class="fu">geom_jitter</span>(<span class="fu">aes</span>(<span class="at">color=</span><span class="fu">factor</span>(Category)), <span class="at">size=</span><span class="dv">2</span>, <span class="at">width=</span><span class="fl">0.15</span>, <span class="at">height=</span><span class="dv">0</span>) <span class="sc">+</span> <span class="fu">scale_color_manual</span>(<span class="at">name =</span><span class="st">&quot;Sample Category&quot;</span>, <span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>),<span class="at">values=</span><span class="fu">unique</span>(cls_cat_meta<span class="sc">$</span>Category_col[<span class="fu">order</span>(cls_cat_meta<span class="sc">$</span>Category)])) <span class="sc">+</span> <span class="fu">geom_boxplot</span>(<span class="at">fill=</span><span class="cn">NA</span>, <span class="at">outlier.color=</span><span class="cn">NA</span>) <span class="sc">+</span> <span class="fu">theme_classic</span>() <span class="sc">+</span> <span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span 
class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>,<span class="at">angle=</span><span class="dv">90</span>),<span class="at">legend.title.align=</span><span class="fl">0.5</span>, <span class="at">legend.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">15</span>),<span class="at">plot.subtitle =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">12</span>)) <span class="sc">+</span> <span class="fu">labs</span>(<span class="at">x=</span><span class="st">&quot;Microbial Class&quot;</span>, <span class="at">y=</span><span class="st">&quot;Relative Abundance&quot;</span>, <span class="at">title=</span><span class="st">&quot;Bacteria/Archaea &amp; Sample Category&quot;</span>, <span class="at">subtitle=</span><span class="st">&quot;Only Including Taxa with Relative Abudance &gt; 1%&quot;</span>)</span>
<span id="cb48-10"><a href="#cb48-10" tabindex="-1"></a></span>
<span id="cb48-11"><a href="#cb48-11" tabindex="-1"></a><span class="fu">ggsave</span>(ts1,<span class="at">filename =</span> <span class="st">&quot;16S_Class_1percent_RA_category.png&quot;</span>, <span class="at">width=</span><span class="dv">12</span>, <span class="at">height=</span><span class="dv">10</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
<center>
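<img src="amplicon_workflow/16S_Class_1percent_RA_category.png" alt="Taxonomic summary of bacterial relative abundance (&gt; 1%) by class per sample, colored by sample category" />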
</center>
<div align="center">
Figure 14: Taxonomic Summary of Bacterial Relative Abundance (&gt; 1%)
by Class Per Sample. Colored by Sample Category.
</div>
<p><br /></p>
<p>This taxonomic summary tells us that Old Growth samples have a higher
relative abundance (&gt;20-40%) of the Acidimicrobiia class compared to
samples belonging to the other categories. Gammaproteobacteria appears
to be the most abundant class in Clear Cut Soil samples, with some
samples having a relative abundance of this taxa that is greater than
20%. Interestingly, these samples from the Clear Cut Soil category are
the only samples that contain Bacilli, though this could be due to our 1%
relative abundance threshold for this figure.</p>
</div>
<div id="relative-abundance-by-category" class="section level3"
number="4.4.2">
<h3><span class="header-section-number">4.4.2</span> Relative Abundance
by Category</h3>
<p>Maybe it would be helpful to know the relative abundance of taxa by
the categories themselves rather than the relative abundance by sample.
We can easily calculate this and visualize a stacked barplot to compare
sample categories.</p>
<div class="sourceCode" id="cb49"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1" tabindex="-1"></a><span class="fu">head</span>(all_bac)</span>
<span id="cb49-2"><a href="#cb49-2" tabindex="-1"></a><span class="co"># by class + elevation</span></span>
<span id="cb49-3"><a href="#cb49-3" tabindex="-1"></a>bac.cls.cat <span class="ot">&lt;-</span> <span class="fu">as.data.frame</span>(<span class="fu">dcast</span>(all_bac,Category<span class="sc">~</span>Class, <span class="at">value.var=</span><span class="st">&quot;Counts&quot;</span>, <span class="at">fun.aggregate=</span>sum)) <span class="do">### </span></span>
<span id="cb49-4"><a href="#cb49-4" tabindex="-1"></a><span class="fu">head</span>(bac.cls.cat) <span class="co"># counts by class + elevation</span></span>
<span id="cb49-5"><a href="#cb49-5" tabindex="-1"></a><span class="fu">rownames</span>(bac.cls.cat)<span class="ot">&lt;-</span>bac.cls.cat<span class="sc">$</span>Category</span>
<span id="cb49-6"><a href="#cb49-6" tabindex="-1"></a></span>
<span id="cb49-7"><a href="#cb49-7" tabindex="-1"></a>b.RA_cls.cat<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="fu">decostand</span>(bac.cls.cat[,<span class="sc">-</span><span class="dv">1</span>], <span class="at">method=</span><span class="st">&quot;total&quot;</span>, <span class="at">MARGIN=</span><span class="dv">1</span>, <span class="at">na.rm=</span><span class="cn">TRUE</span>))  </span>
<span id="cb49-8"><a href="#cb49-8" tabindex="-1"></a><span class="co"># relative abundance of taxa data where everything is divided by margin total (default MARGIN = 1 = rows) -- rows = categories</span></span>
<span id="cb49-9"><a href="#cb49-9" tabindex="-1"></a><span class="fu">rowSums</span>(b.RA_cls.cat) <span class="co"># sanity check</span></span>
<span id="cb49-10"><a href="#cb49-10" tabindex="-1"></a>b.RA_cls.cat<span class="sc">$</span>Category<span class="ot">&lt;-</span><span class="fu">rownames</span>(b.RA_cls.cat)</span>
<span id="cb49-11"><a href="#cb49-11" tabindex="-1"></a><span class="fu">head</span>(b.RA_cls.cat)</span>
<span id="cb49-12"><a href="#cb49-12" tabindex="-1"></a></span>
<span id="cb49-13"><a href="#cb49-13" tabindex="-1"></a><span class="co">#melt down relativized data to merge with metadata</span></span>
<span id="cb49-14"><a href="#cb49-14" tabindex="-1"></a>b.cls.cat_m<span class="ot">&lt;-</span><span class="fu">melt</span>(b.RA_cls.cat, <span class="at">by=</span><span class="st">&quot;Category&quot;</span>)</span>
<span id="cb49-15"><a href="#cb49-15" tabindex="-1"></a></span>
<span id="cb49-16"><a href="#cb49-16" tabindex="-1"></a><span class="fu">head</span>(b.cls.cat_m)</span>
<span id="cb49-17"><a href="#cb49-17" tabindex="-1"></a><span class="fu">colnames</span>(b.cls.cat_m)[<span class="fu">which</span>(<span class="fu">names</span>(b.cls.cat_m) <span class="sc">==</span> <span class="st">&quot;variable&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Class&quot;</span></span>
<span id="cb49-18"><a href="#cb49-18" tabindex="-1"></a><span class="fu">colnames</span>(b.cls.cat_m)[<span class="fu">which</span>(<span class="fu">names</span>(b.cls.cat_m) <span class="sc">==</span> <span class="st">&quot;value&quot;</span>)] <span class="ot">&lt;-</span> <span class="st">&quot;Counts&quot;</span></span>
<span id="cb49-19"><a href="#cb49-19" tabindex="-1"></a><span class="fu">head</span>(b.cls.cat_m) <span class="do">## relative abundance based on sum of counts by class!</span></span>
<span id="cb49-20"><a href="#cb49-20" tabindex="-1"></a></span>
<span id="cb49-21"><a href="#cb49-21" tabindex="-1"></a><span class="co"># Subset data to include only classes with relative abundance &gt; 1%</span></span>
<span id="cb49-22"><a href="#cb49-22" tabindex="-1"></a>b.cls.cat<span class="fl">.1</span><span class="ot">&lt;-</span><span class="fu">subset</span>(b.cls.cat_m, b.cls.cat_m<span class="sc">$</span>Counts<span class="sc">&gt;</span><span class="fl">0.01</span>)</span>
<span id="cb49-23"><a href="#cb49-23" tabindex="-1"></a></span>
<span id="cb49-24"><a href="#cb49-24" tabindex="-1"></a>c1<span class="ot">&lt;-</span><span class="fu">ggplot</span>(b.cls.cat<span class="fl">.1</span>, <span class="fu">aes</span>(<span class="at">x=</span>Category, <span class="at">y=</span>Counts, <span class="at">fill=</span>Class))<span class="sc">+</span><span class="fu">geom_bar</span>(<span class="at">stat=</span><span class="st">&quot;identity&quot;</span>,<span class="at">colour=</span><span class="st">&quot;black&quot;</span>)<span class="sc">+</span><span class="fu">scale_x_discrete</span>(<span class="at">labels=</span><span class="fu">c</span>(<span class="st">&quot;ClearCutSoil&quot;</span><span class="ot">=</span><span class="st">&quot;Clear Cut Soil&quot;</span>, <span class="st">&quot;Gopher&quot;</span><span class="ot">=</span><span class="st">&quot;Gopher&quot;</span>, <span class="st">&quot;NoGopher&quot;</span><span class="ot">=</span><span class="st">&quot;No Gopher&quot;</span>, <span class="st">&quot;OldGrowth&quot;</span><span class="ot">=</span><span class="st">&quot;Old Growth&quot;</span>))<span class="sc">+</span><span class="fu">theme_classic</span>()<span class="sc">+</span></span>
<span id="cb49-25"><a href="#cb49-25" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">title =</span> <span class="st">&quot;Microbial Class Relative Abundance&quot;</span>, <span class="at">x=</span><span class="st">&quot;Sample Category&quot;</span>, <span class="at">y=</span><span class="st">&quot;Relative Abundance&quot;</span>, <span class="at">fill=</span><span class="st">&quot;Class&quot;</span>, <span class="at">subtitle=</span><span class="st">&quot;Only Including Taxa with Relative Abudance &gt; 1%&quot;</span>)<span class="sc">+</span></span>
<span id="cb49-26"><a href="#cb49-26" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>),<span class="at">legend.title.align=</span><span class="fl">0.5</span>, <span class="at">legend.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">legend.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">plot.title =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">15</span>),<span class="at">plot.subtitle =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>))</span>
<span id="cb49-27"><a href="#cb49-27" tabindex="-1"></a></span>
<span id="cb49-28"><a href="#cb49-28" tabindex="-1"></a><span class="fu">ggsave</span>(c1,<span class="at">filename =</span> <span class="st">&quot;16S_Class_1percent_RA_byCategory.png&quot;</span>, <span class="at">width=</span><span class="dv">12</span>, <span class="at">height=</span><span class="dv">10</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
<center>
<img src="amplicon_workflow/16S_Class_1percent_RA_byCategory.png" />
</center>
<div align="center">
Figure 15: Relative Abundance (&gt; 1%) of Bacterial Classes by Sample
Category
</div>
<p><br /></p>
<p>Instead of viewing the relative abundance of classes by sample as we
did in the taxonomic summary, we are now looking at the relative
abundance of taxa within the categories themselves. For example, we can
tell that out of all of the categories, samples within the Old Growth
category have the highest relative abundance of the Acidobacteriae
class. Gammaproteobacteria appears to be the most abundant in the Clear
Cut Soil category, whereas Bacteroidia is the most abundant class in the
No Gopher category. Though our taxonomic summaries helped us to see
these trends, these stacked bar plots offer some more insight into
category-specific trends.</p>
</div>
</div>
<div id="environmental-drivers" class="section level2" number="4.5">
<h2><span class="header-section-number">4.5</span> Environmental
Drivers</h2>
<p>The last step in this workflow is to determine how our microbial
community composition corresponds to environmental variables. To do
this, we can run either a <strong>Canonical Correspondence
Analysis</strong> (aka CCA) or a <strong>Redundancy Analysis</strong>
(aka RDA). These are ordination techniques that allow us to correlate
our composition data with our environmental variables to explain the
variance in our microbial community structures. CCAs and RDAs utilize
eigenanalysis, which is quite out of the scope of this tutorial. I
highly recommend watching this <a
href="https://www.youtube.com/watch?v=PFDu9oVAE-g">YouTube video</a>
detailing what <em>eigenvectors</em> and <em>eigenvalues</em> are and
why they’re essential in linear algebra and statistics.</p>
<p>A CCA considers how community composition relates to a set of defined
constraints, which would be your environmental variables of interest.
CCAs assume that your composition data will have a
<strong>unimodal</strong> (aka have one maximum value) relationship with
the environmental variables. Typically for a CCA you already know
exactly what environmental variables you will be considering. It’s
important to note that if your model (how your environmental variables
influence your community composition) is not statistically significant,
then your results are not meaningful enough to discuss.</p>
<p>An RDA can tell us whether our environmental variables explain the
variation we observe in our microbial communities, or whether or not
their impact on community structure is redundant (i.e., the variation is
explained by something else). RDAs assume that your composition data
will have a <strong>linear</strong> relationship with your environmental
data. Variance described by our environmental explanatory variables is
known as <em>constrained variance</em>, whereas variance described by
unknown variables (or variables excluded from our explanatory variables) is
called <em>unconstrained variance</em>.</p>
<p>So at this point you’re probably asking yourself “how do I know if my
data has a unimodal or linear relationship with my environmental
variables?”, and that’s a great question! We can use a <strong>Detrended
Correspondence Analysis</strong> (aka DCA) to help us decide whether we
should use a CCA or an RDA for our composition data. If the length of
our first DCA axis is <em>longer than 4</em>, that indicates that our
data is heterogeneous and requires unimodal methods like a CCA. If the
length of our first DCA axis is <em>less than 3</em>, the data is
considered homogeneous and thus requires linear methods like an RDA. If
your first axis length is between 3 and 4, then it’s up to you to choose
whether to use an RDA or a CCA.</p>
<p>For more information on how CCAs and RDAs are calculated and their
applications, please view this helpful <a
href="http://dmcglinn.github.io/quant_methods/lessons/multivariate_models.html">tutorial</a>
for a deep look into different Euclidean-based ordinations. I also
recommend watching Dr. Ralf Shaefer’s <a
href="https://www.youtube.com/watch?v=AjU6s7-EBGY&amp;t=1s">YouTube
video on RDAs</a> as well as this helpful <a
href="https://youtu.be/tVnnG7mFeqA">YouTube video</a> that walks you
through different ordination techniques using the <code>vegan</code>
package in R.</p>
<h3>Prep Step 1: Check for Correlations Among Environmental
Variables</h3>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" tabindex="-1"></a><span class="do">## check the assumption</span></span>
<span id="cb50-2"><a href="#cb50-2" tabindex="-1"></a>chem_data<span class="ot">&lt;-</span><span class="fu">subset</span>(metadata, <span class="at">select=</span><span class="fu">c</span>(Cu, Mn, P))</span>
<span id="cb50-3"><a href="#cb50-3" tabindex="-1"></a><span class="fu">pairs</span>(<span class="fu">c</span>(b.clr[<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>],chem_data))</span></code></pre></div>
<div id="prep-step-2-detrended-correspondence-analysis-dca"
class="section level3" number="4.5.1">
<h3><span class="header-section-number">4.5.1</span> Prep Step 2:
Detrended Correspondence Analysis (DCA)</h3>
<p>Let’s first run a DCA to determine the length of our first axis and
decide which ordination technique to use. For this we are going to use
our <strong>CLR-transformed ASV table</strong> as our site x species
matrix. The <code>decorana()</code> function from vegan performs the
DCA, which requires that the row sums are greater than zero. Because clr
transformed data can have negative values, I’ve decided to add a
pseudocount of 1 to the matrix to fulfill the requirements of the
<code>decorana()</code> function.</p>
<p><strong>Note: The chemical data I will be using for this portion of
the workflow is not real data from this project. These data come from a
different project and are being used just for this example.</strong></p>
<div class="sourceCode" id="cb51"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1" tabindex="-1"></a><span class="do">## remember, CCA assumes that our species have a unimodal relationship with our variables.</span></span>
<span id="cb51-2"><a href="#cb51-2" tabindex="-1"></a><span class="do">### unimodal = one maximum, think bell curve (an upside-down U) or something</span></span>
<span id="cb51-3"><a href="#cb51-3" tabindex="-1"></a><span class="do">## RDA assumes a linear relationship</span></span>
<span id="cb51-4"><a href="#cb51-4" tabindex="-1"></a></span>
<span id="cb51-5"><a href="#cb51-5" tabindex="-1"></a><span class="do">## The length of first DCA axis:</span></span>
<span id="cb51-6"><a href="#cb51-6" tabindex="-1"></a><span class="do">## &gt; 4 indicates heterogeneous dataset on which unimodal methods should be used (CCA), </span></span>
<span id="cb51-7"><a href="#cb51-7" tabindex="-1"></a><span class="do">##  &lt; 3 indicates homogeneous dataset for which linear methods are suitable (RDA) </span></span>
<span id="cb51-8"><a href="#cb51-8" tabindex="-1"></a><span class="do">## between 3 and 4 both linear and unimodal methods are OK.</span></span>
<span id="cb51-9"><a href="#cb51-9" tabindex="-1"></a></span>
<span id="cb51-10"><a href="#cb51-10" tabindex="-1"></a><span class="fu">head</span>(b.clr)</span>
<span id="cb51-11"><a href="#cb51-11" tabindex="-1"></a></span>
<span id="cb51-12"><a href="#cb51-12" tabindex="-1"></a><span class="co"># add pseudocount so row sums are &gt; 0</span></span>
<span id="cb51-13"><a href="#cb51-13" tabindex="-1"></a>b.clr.pseudo<span class="ot">&lt;-</span>b.clr<span class="sc">+</span><span class="dv">1</span></span>
<span id="cb51-14"><a href="#cb51-14" tabindex="-1"></a>b.dca <span class="ot">=</span> <span class="fu">decorana</span>(b.clr.pseudo)</span>
<span id="cb51-15"><a href="#cb51-15" tabindex="-1"></a><span class="co">#plot(b.dca) # may take too long to load</span></span>
<span id="cb51-16"><a href="#cb51-16" tabindex="-1"></a><span class="fu">summary</span>(b.dca) <span class="co">#DCA1 axis length = 1.7990; use RDA</span></span></code></pre></div>
<p>Our first DCA axis has a length of 1.7990, which suggests that we
should run an RDA with our data.</p>
</div>
<div id="prep-step-3-subset-environmental-data" class="section level3"
number="4.5.2">
<h3><span class="header-section-number">4.5.2</span> Prep Step 3: Subset
Environmental Data</h3>
<p>We have already scaled our environmental data (the
<code>meta_scaled</code> data frame) so that they are comparable to each
other, despite their initial measurements being vastly different (i.e.,
your mg of Cu and mg of P are not immediately comparable). At this
point, we could jump into doing an RDA…but it may be more informative if
we look at the environmental drivers of each category or site one by one
rather than considering the drivers across sites.</p>
<p>To subset our data, we will use the <code>lapply()</code> function:
if <code>meta_scaled$Category</code> is the same as (<code>==</code>) the
element(s) in <code>site_list</code>, subset the metadata data frame by
the <code>site_list</code> element into a new list called
<code>site_subsets</code>. After this step, we will have a list where
each element in the list (<code>site_subsets[[i]]</code>) will contain
all of the metadata for each respective Category.</p>
<p>I have also provided examples on how to subset lists for practice -
indexing in R can be tricky in the beginning, and personally I need to
review it often.</p>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" tabindex="-1"></a><span class="co"># recall our scaled metadata df</span></span>
<span id="cb52-2"><a href="#cb52-2" tabindex="-1"></a><span class="fu">head</span>(meta_scaled)</span>
<span id="cb52-3"><a href="#cb52-3" tabindex="-1"></a></span>
<span id="cb52-4"><a href="#cb52-4" tabindex="-1"></a><span class="fu">unique</span>(meta_scaled<span class="sc">$</span>Category)</span>
<span id="cb52-5"><a href="#cb52-5" tabindex="-1"></a></span>
<span id="cb52-6"><a href="#cb52-6" tabindex="-1"></a>site_list<span class="ot">&lt;-</span><span class="fu">unique</span>(meta_scaled<span class="sc">$</span>Category) <span class="co">#define an array of string values</span></span>
<span id="cb52-7"><a href="#cb52-7" tabindex="-1"></a></span>
<span id="cb52-8"><a href="#cb52-8" tabindex="-1"></a><span class="co"># go through metadata &amp; create a list of data frames </span></span>
<span id="cb52-9"><a href="#cb52-9" tabindex="-1"></a><span class="do">## when metadata$Category == element in site_list (aka x in this case), subset metadata by said element into elements of a list</span></span>
<span id="cb52-10"><a href="#cb52-10" tabindex="-1"></a>site_subsets<span class="ot">&lt;-</span><span class="fu">lapply</span>(site_list, <span class="cf">function</span>(x) {<span class="fu">subset</span>(meta_scaled, Category<span class="sc">==</span>x)})</span>
<span id="cb52-11"><a href="#cb52-11" tabindex="-1"></a><span class="co"># here the function(x) is using site_list aka x to subset metadata, when $Category column == site_list</span></span>
<span id="cb52-12"><a href="#cb52-12" tabindex="-1"></a></span>
<span id="cb52-13"><a href="#cb52-13" tabindex="-1"></a>site_subsets <span class="co"># sanity check1 (should see all elements in list)</span></span>
<span id="cb52-14"><a href="#cb52-14" tabindex="-1"></a>site_subsets[[<span class="dv">1</span>]] <span class="co"># sanity check2 (see 1st element in list)</span></span>
<span id="cb52-15"><a href="#cb52-15" tabindex="-1"></a><span class="co">#rename the list elements</span></span>
<span id="cb52-16"><a href="#cb52-16" tabindex="-1"></a></span>
<span id="cb52-17"><a href="#cb52-17" tabindex="-1"></a><span class="co"># name each element in list</span></span>
<span id="cb52-18"><a href="#cb52-18" tabindex="-1"></a><span class="fu">names</span>(site_subsets)<span class="ot">&lt;-</span>site_list <span class="co"># * only do this if the order of names in site_list match order of the elements in site_subsets!</span></span>
<span id="cb52-19"><a href="#cb52-19" tabindex="-1"></a>site_subsets<span class="sc">$</span>ClearCutSoil <span class="co"># sanity check3 - should be able to pull dataframes by names rather than index now</span></span>
<span id="cb52-20"><a href="#cb52-20" tabindex="-1"></a></span>
<span id="cb52-21"><a href="#cb52-21" tabindex="-1"></a><span class="co"># example of subsetting</span></span>
<span id="cb52-22"><a href="#cb52-22" tabindex="-1"></a>site_subsets[[<span class="dv">2</span>]][<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>]</span>
<span id="cb52-23"><a href="#cb52-23" tabindex="-1"></a>site_subsets<span class="sc">$</span>Gopher[<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>] <span class="co"># should produce same output as line above</span></span>
<span id="cb52-24"><a href="#cb52-24" tabindex="-1"></a></span>
<span id="cb52-25"><a href="#cb52-25" tabindex="-1"></a>site_subsets[[<span class="dv">2</span>]][<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>] <span class="co"># another example</span></span>
<span id="cb52-26"><a href="#cb52-26" tabindex="-1"></a></span>
<span id="cb52-27"><a href="#cb52-27" tabindex="-1"></a><span class="co"># ^ subsetting to [[second dataframe]], [[row #, column #]]</span></span>
<span id="cb52-28"><a href="#cb52-28" tabindex="-1"></a>site_subsets[[<span class="dv">2</span>]][[<span class="dv">1</span>,<span class="dv">2</span>]] <span class="co"># [[second dataframe]], [[row 1, column 2]]</span></span></code></pre></div>
<p>The next step is to take each element in our list and make a new data
frame, so that we have data frames of the metadata separated by
Category. I have written a custom function called
<code>df_specific.subset</code> to do this, but if you know an easier
way of doing this, feel free to use that instead!</p>
<p>At the end of this step, we should have data frames of each category
containing all of their respective metadata.</p>
<div class="sourceCode" id="cb53"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb53-1"><a href="#cb53-1" tabindex="-1"></a><span class="co"># set up the function and run this to store it in our Global environment</span></span>
<span id="cb53-2"><a href="#cb53-2" tabindex="-1"></a>df_specific.subset<span class="ot">&lt;-</span><span class="cf">function</span>(var_vec,var_subsets){ </span>
<span id="cb53-3"><a href="#cb53-3" tabindex="-1"></a>  <span class="co"># var_vec = vector of variable elements from specific categorical variable; </span></span>
<span id="cb53-4"><a href="#cb53-4" tabindex="-1"></a>  <span class="do">## e.g. vector of names from Site categorical variable (metadata sites)</span></span>
<span id="cb53-5"><a href="#cb53-5" tabindex="-1"></a>  <span class="co"># var_subsets = list of dataframes subsetted by column$element from original dataframe;</span></span>
<span id="cb53-6"><a href="#cb53-6" tabindex="-1"></a>  <span class="do">## e.g. list of dataframes (each df = element of list) subsetted from metadata using vector of metadata$Site names</span></span>
<span id="cb53-7"><a href="#cb53-7" tabindex="-1"></a>  <span class="cf">for</span>(i <span class="cf">in</span> <span class="fu">seq_along</span>(var_vec)){</span>
<span id="cb53-8"><a href="#cb53-8" tabindex="-1"></a>    <span class="co"># print(var_vec[i]) -- var_vec[i] = each element in var_vec</span></span>
<span id="cb53-9"><a href="#cb53-9" tabindex="-1"></a>    <span class="co"># print(var_subsets[[i]]) -- var_subsets[[i]] = each sub</span></span>
<span id="cb53-10"><a href="#cb53-10" tabindex="-1"></a>    df<span class="ot">&lt;-</span><span class="fu">paste</span>(var_vec[i])</span>
<span id="cb53-11"><a href="#cb53-11" tabindex="-1"></a>    <span class="co">#print(df)</span></span>
<span id="cb53-12"><a href="#cb53-12" tabindex="-1"></a>    <span class="fu">assign</span>(df, var_subsets[[i]], <span class="at">envir =</span> .GlobalEnv)</span>
<span id="cb53-13"><a href="#cb53-13" tabindex="-1"></a>    <span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">&quot;Dataframe&quot;</span>, var_vec[i] ,<span class="st">&quot;done&quot;</span>))</span>
<span id="cb53-14"><a href="#cb53-14" tabindex="-1"></a>    </span>
<span id="cb53-15"><a href="#cb53-15" tabindex="-1"></a>  }</span>
<span id="cb53-16"><a href="#cb53-16" tabindex="-1"></a>  </span>
<span id="cb53-17"><a href="#cb53-17" tabindex="-1"></a>}</span>
<span id="cb53-18"><a href="#cb53-18" tabindex="-1"></a></span>
<span id="cb53-19"><a href="#cb53-19" tabindex="-1"></a><span class="co"># run the function</span></span>
<span id="cb53-20"><a href="#cb53-20" tabindex="-1"></a><span class="fu">df_specific.subset</span>(site_list, site_subsets) <span class="co"># used scaled metadata quantitative values</span></span>
<span id="cb53-21"><a href="#cb53-21" tabindex="-1"></a></span>
<span id="cb53-22"><a href="#cb53-22" tabindex="-1"></a><span class="fu">head</span>(ClearCutSoil) <span class="co"># sanity check</span></span>
<span id="cb53-23"><a href="#cb53-23" tabindex="-1"></a>ClearCutSoil[<span class="dv">1</span><span class="sc">:</span><span class="dv">5</span>,<span class="dv">17</span><span class="sc">:</span><span class="dv">19</span>] <span class="co"># double check that our new Category data frames still have scaled chemical data</span></span></code></pre></div>
<p>Now that we have data frames containing all of the metadata by
category, we need to match these data frames up with our composition
data (our transformed ASV table). We are going to use another custom
function I wrote called <code>match_dat</code> to do this. First we have
to run the function to make sure it’s in our Global Environment, and
then we can use it. To save time, I am only going to use one of our
category data frames to match up with our transformed composition
data.</p>
<div class="sourceCode" id="cb54"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" tabindex="-1"></a><span class="co"># matching data with user defined function</span></span>
<span id="cb54-2"><a href="#cb54-2" tabindex="-1"></a>match_dat<span class="ot">&lt;-</span><span class="cf">function</span>(compdata, subset_metadata){</span>
<span id="cb54-3"><a href="#cb54-3" tabindex="-1"></a>  subset_comp_data <span class="ot">=</span> pullrow<span class="ot">&lt;-</span>(<span class="fu">is.element</span>(<span class="fu">row.names</span>(compdata), <span class="fu">row.names</span>(subset_metadata)))</span>
<span id="cb54-4"><a href="#cb54-4" tabindex="-1"></a>  <span class="do">### * comp data and metadata need to have row names - rownames should be Sample IDs</span></span>
<span id="cb54-5"><a href="#cb54-5" tabindex="-1"></a>  subset_comp_data<span class="ot">=</span>compdata[pullrow,]</span>
<span id="cb54-6"><a href="#cb54-6" tabindex="-1"></a>  <span class="fu">return</span>(subset_comp_data)</span>
<span id="cb54-7"><a href="#cb54-7" tabindex="-1"></a>}</span>
<span id="cb54-8"><a href="#cb54-8" tabindex="-1"></a></span>
<span id="cb54-9"><a href="#cb54-9" tabindex="-1"></a><span class="co"># check that our data frames are ready for this function, aka that they both have the same rownames</span></span>
<span id="cb54-10"><a href="#cb54-10" tabindex="-1"></a><span class="do">## row #s do not have to be the same, but their row names should be in the same format and be able to match up</span></span>
<span id="cb54-11"><a href="#cb54-11" tabindex="-1"></a><span class="fu">rownames</span>(b.clr)</span>
<span id="cb54-12"><a href="#cb54-12" tabindex="-1"></a><span class="fu">rownames</span>(ClearCutSoil)</span>
<span id="cb54-13"><a href="#cb54-13" tabindex="-1"></a></span>
<span id="cb54-14"><a href="#cb54-14" tabindex="-1"></a><span class="co"># run the function</span></span>
<span id="cb54-15"><a href="#cb54-15" tabindex="-1"></a>b.clr_C.C.S<span class="ot">&lt;-</span><span class="fu">match_dat</span>(b.clr,ClearCutSoil)</span>
<span id="cb54-16"><a href="#cb54-16" tabindex="-1"></a></span>
<span id="cb54-17"><a href="#cb54-17" tabindex="-1"></a><span class="co"># did the function work the way we wanted it to? let&#39;s check!</span></span>
<span id="cb54-18"><a href="#cb54-18" tabindex="-1"></a><span class="fu">head</span>(b.clr_C.C.S)</span>
<span id="cb54-19"><a href="#cb54-19" tabindex="-1"></a><span class="fu">rownames</span>(ClearCutSoil) <span class="sc">%in%</span> <span class="fu">rownames</span>(b.clr_C.C.S) <span class="co"># hopefully all of the rownames match, aka will get output of TRUE</span></span></code></pre></div>
</div>
<div id="redundancy-analysis-rda" class="section level3" number="4.5.3">
<h3><span class="header-section-number">4.5.3</span> Redundancy Analysis
(RDA)</h3>
<p>Now that we have the metadata and the corresponding transformed ASV
counts for the samples in the Clear Cut Soil category, we can run an
<strong>RDA</strong> with the <code>rda()</code> function from the
<code>vegan</code> package to see if our variables of interest drive our
microbial community composition.</p>
<p>If your DCA 1 axis was longer than 3 and you needed to run a
<strong>CCA</strong>, all of these steps would be the same
<em>except</em> that instead of using the <code>rda()</code> function by
<code>vegan</code>, you would use the <code>cca()</code> function.
Similar to the <code>decorana</code> function, you may need to add a
small pseudocount to your transformed feature table before you run the
<code>cca()</code> function.</p>
<p>To view the results of our RDA, we will use the
<code>summary()</code> function, and we can check the amount of
variation explained by the model (i.e., R<sup>2</sup>) using the
<code>RsquareAdj()</code> function. It’s wise to use the adjusted
R<sup>2</sup> value because the more variables included in the model,
the more inflated your R<sup>2</sup> value will be. We can also check
out the individual terms of the model and their respective impact via an
<code>anova()</code>. Lastly, we can see exactly which variables are the
drivers of variation in our model using the <code>ordistep()</code>
function. Based on the results of this last step, we can adjust our
model and re-run the RDA.</p>
<div class="sourceCode" id="cb55"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb55-1"><a href="#cb55-1" tabindex="-1"></a>rda.ccs<span class="ot">&lt;-</span><span class="fu">rda</span>(b.clr_C.C.S <span class="sc">~</span> Cu <span class="sc">+</span> Mn <span class="sc">+</span> P, <span class="at">data=</span>ClearCutSoil)</span>
<span id="cb55-2"><a href="#cb55-2" tabindex="-1"></a></span>
<span id="cb55-3"><a href="#cb55-3" tabindex="-1"></a><span class="co"># plot RDA</span></span>
<span id="cb55-4"><a href="#cb55-4" tabindex="-1"></a><span class="co">#plot(rda.ccs) # depending on how many species you have, this step may take a while</span></span>
<span id="cb55-5"><a href="#cb55-5" tabindex="-1"></a><span class="fu">plot</span>(rda.ccs, <span class="at">scaling =</span> <span class="dv">1</span>)</span>
<span id="cb55-6"><a href="#cb55-6" tabindex="-1"></a><span class="do">## scaling = 1 -&gt; emphasizes relationships among sites</span></span>
<span id="cb55-7"><a href="#cb55-7" tabindex="-1"></a><span class="fu">plot</span>(rda.ccs, <span class="at">scaling =</span> <span class="dv">2</span>)</span>
<span id="cb55-8"><a href="#cb55-8" tabindex="-1"></a><span class="do">## scaling = 2 -&gt; emphasizes relationships among species</span></span>
<span id="cb55-9"><a href="#cb55-9" tabindex="-1"></a></span>
<span id="cb55-10"><a href="#cb55-10" tabindex="-1"></a><span class="co"># check summary of RDA</span></span>
<span id="cb55-11"><a href="#cb55-11" tabindex="-1"></a><span class="fu">summary</span>(rda.ccs)</span>
<span id="cb55-12"><a href="#cb55-12" tabindex="-1"></a></span>
<span id="cb55-13"><a href="#cb55-13" tabindex="-1"></a><span class="co"># how much variation does our model explain?</span></span>
<span id="cb55-14"><a href="#cb55-14" tabindex="-1"></a><span class="do">## reminder: R^2 = % of variation in dependent variable explained by model</span></span>
<span id="cb55-15"><a href="#cb55-15" tabindex="-1"></a><span class="fu">RsquareAdj</span>(rda.ccs) <span class="co"># 2.94%</span></span>
<span id="cb55-16"><a href="#cb55-16" tabindex="-1"></a><span class="do">## ^^ use this b/c chance correlations can inflate R^2</span></span>
<span id="cb55-17"><a href="#cb55-17" tabindex="-1"></a></span>
<span id="cb55-18"><a href="#cb55-18" tabindex="-1"></a><span class="do">## we can then test for significance of the model by permutation</span></span>
<span id="cb55-19"><a href="#cb55-19" tabindex="-1"></a><span class="do">## if it is not significant, it doesn&#39;t matter how much of the variation is explained</span></span>
<span id="cb55-20"><a href="#cb55-20" tabindex="-1"></a><span class="fu">anova</span>(rda.ccs, <span class="at">permutations =</span> <span class="fu">how</span>(<span class="at">nperm=</span><span class="dv">999</span>))</span>
<span id="cb55-21"><a href="#cb55-21" tabindex="-1"></a></span>
<span id="cb55-22"><a href="#cb55-22" tabindex="-1"></a><span class="do">## we can also do a permutation test by axis </span></span>
<span id="cb55-23"><a href="#cb55-23" tabindex="-1"></a><span class="fu">anova</span>(rda.ccs, <span class="at">by =</span> <span class="st">&quot;axis&quot;</span>, <span class="at">permutations =</span> <span class="fu">how</span>(<span class="at">nperm=</span><span class="dv">999</span>)) <span class="do">### by RDA axis</span></span>
<span id="cb55-24"><a href="#cb55-24" tabindex="-1"></a><span class="do">## or by terms</span></span>
<span id="cb55-25"><a href="#cb55-25" tabindex="-1"></a><span class="fu">anova</span>(rda.ccs, <span class="at">by =</span> <span class="st">&quot;terms&quot;</span>, <span class="at">permutations =</span> <span class="fu">how</span>(<span class="at">nperm=</span><span class="dv">999</span>)) <span class="do">### by variables</span></span>
<span id="cb55-26"><a href="#cb55-26" tabindex="-1"></a><span class="do">## this will help us interpret our RDA and we can see some variables are not significant</span></span>
<span id="cb55-27"><a href="#cb55-27" tabindex="-1"></a></span>
<span id="cb55-28"><a href="#cb55-28" tabindex="-1"></a><span class="do">## we can use model selection instead of picking variables we think are important</span></span>
<span id="cb55-29"><a href="#cb55-29" tabindex="-1"></a>rda.ccs.a <span class="ot">=</span> <span class="fu">ordistep</span>(<span class="fu">rda</span>(b.clr_C.C.S <span class="sc">~</span> <span class="dv">1</span>, <span class="at">data =</span> ClearCutSoil[,<span class="dv">17</span><span class="sc">:</span><span class="dv">19</span>]),</span>
<span id="cb55-30"><a href="#cb55-30" tabindex="-1"></a>                  <span class="at">scope=</span><span class="fu">formula</span>(rda.ccs),</span>
<span id="cb55-31"><a href="#cb55-31" tabindex="-1"></a>                  <span class="at">direction =</span> <span class="st">&quot;forward&quot;</span>,</span>
<span id="cb55-32"><a href="#cb55-32" tabindex="-1"></a>                  <span class="at">permutations =</span> <span class="fu">how</span>(<span class="at">nperm=</span><span class="dv">999</span>))</span></code></pre></div>
<p>Because our model was not significant, normally we would stop here
and not visualize this RDA. However, the point of this workflow is to
provide you with the code to run and visualize these analyses
yourselves, so I am going to show you how I would visualize this
RDA.</p>
<p>The <code>ggvegan</code> <a
href="https://github.com/gavinsimpson/ggvegan">package</a> is a useful
package that works around some of the difficulties of
<code>ggplot2</code> when it comes to plotting ordinations. This package
has a function called <code>autoplot</code> which helps us easily plot
an RDA.</p>
<div class="sourceCode" id="cb56"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb56-1"><a href="#cb56-1" tabindex="-1"></a><span class="fu">png</span>(<span class="st">&#39;autoplot_rda_example.png&#39;</span>,<span class="at">width =</span> <span class="dv">700</span>, <span class="at">height =</span> <span class="dv">600</span>, <span class="at">res=</span><span class="dv">100</span>)</span>
<span id="cb56-2"><a href="#cb56-2" tabindex="-1"></a><span class="fu">autoplot</span>(rda.ccs, <span class="at">arrows =</span> <span class="cn">TRUE</span>,<span class="at">data =</span> rda.ccs ,<span class="at">layers=</span><span class="fu">c</span>(<span class="st">&quot;biplot&quot;</span>,<span class="st">&quot;sites&quot;</span>),<span class="at">label =</span> <span class="cn">FALSE</span>, <span class="at">label.size =</span> <span class="dv">3</span>, <span class="at">shape =</span> <span class="cn">FALSE</span>, <span class="at">loadings =</span> <span class="cn">TRUE</span>, <span class="at">loadings.colour =</span> <span class="st">&#39;blue&#39;</span>, <span class="at">loadings.label =</span> <span class="cn">TRUE</span>, <span class="at">loadings.label.size =</span> <span class="dv">3</span>, <span class="at">scale=</span> <span class="dv">0</span>)<span class="sc">+</span><span class="fu">theme_classic</span>()</span>
<span id="cb56-3"><a href="#cb56-3" tabindex="-1"></a><span class="fu">dev.off</span>()</span></code></pre></div>
<center>
<img src="amplicon_workflow/autoplot_rda_example.png" alt="Redundancy Analysis autoplot for the Clear Cut Soil category produced by ggvegan" />
</center>
<div align="center">
Figure 16: Redundancy Analysis Autoplot for the Clear Cut Soil category
by <code>ggvegan</code>
</div>
<p><br /></p>
<p>We can also extract data from the <code>summary(rda)</code> object
and use this information to create our own RDA plot with
<code>ggplot2</code>. I am going to show you two versions of the plot:
one version that has not been altered in any way (Figure 17a), and an
RDA where the axes pointing to our environmental variables have been
amplified to match the autoplot (Figure 17b).</p>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb57-1"><a href="#cb57-1" tabindex="-1"></a>rda.sum<span class="ot">&lt;-</span><span class="fu">summary</span>(rda.ccs)</span>
<span id="cb57-2"><a href="#cb57-2" tabindex="-1"></a>rda.sum<span class="sc">$</span>sites[,<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>]</span>
<span id="cb57-3"><a href="#cb57-3" tabindex="-1"></a></span>
<span id="cb57-4"><a href="#cb57-4" tabindex="-1"></a><span class="co"># create data frame w/ RDA axes for sites</span></span>
<span id="cb57-5"><a href="#cb57-5" tabindex="-1"></a>rda.axes<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="at">RDA1=</span>rda.sum<span class="sc">$</span>sites[,<span class="dv">1</span>], <span class="at">RDA2=</span>rda.sum<span class="sc">$</span>sites[,<span class="dv">2</span>], <span class="at">SampleID=</span><span class="fu">rownames</span>(rda.sum<span class="sc">$</span>sites))</span>
<span id="cb57-6"><a href="#cb57-6" tabindex="-1"></a></span>
<span id="cb57-7"><a href="#cb57-7" tabindex="-1"></a><span class="co"># create data frame w/ RDA axes for variables</span></span>
<span id="cb57-8"><a href="#cb57-8" tabindex="-1"></a>arrows<span class="ot">&lt;-</span><span class="fu">data.frame</span>(<span class="at">RDA1=</span>rda.sum<span class="sc">$</span>biplot[,<span class="dv">1</span>], <span class="at">RDA2=</span>rda.sum<span class="sc">$</span>biplot[,<span class="dv">2</span>], <span class="at">Label=</span><span class="fu">rownames</span>(rda.sum<span class="sc">$</span>biplot))</span>
<span id="cb57-9"><a href="#cb57-9" tabindex="-1"></a></span>
<span id="cb57-10"><a href="#cb57-10" tabindex="-1"></a>rda.plot1<span class="ot">&lt;-</span><span class="fu">ggplot</span>(rda.axes, <span class="fu">aes</span>(<span class="at">x =</span> RDA1, <span class="at">y =</span> RDA2)) <span class="sc">+</span> <span class="fu">geom_point</span>(<span class="at">size=</span><span class="dv">2</span>) <span class="sc">+</span> </span>
<span id="cb57-11"><a href="#cb57-11" tabindex="-1"></a>  <span class="fu">geom_segment</span>(<span class="at">data =</span> arrows,<span class="at">mapping =</span> <span class="fu">aes</span>(<span class="at">x =</span> <span class="dv">0</span>, <span class="at">y =</span> <span class="dv">0</span>, <span class="at">xend =</span> RDA1, <span class="at">yend =</span> RDA2),<span class="at">lineend =</span> <span class="st">&quot;round&quot;</span>, <span class="co"># See available arrow types in example above</span></span>
<span id="cb57-12"><a href="#cb57-12" tabindex="-1"></a>               <span class="at">linejoin =</span> <span class="st">&quot;round&quot;</span>,</span>
<span id="cb57-13"><a href="#cb57-13" tabindex="-1"></a>               <span class="at">size =</span> <span class="fl">0.5</span>, </span>
<span id="cb57-14"><a href="#cb57-14" tabindex="-1"></a>               <span class="at">arrow =</span> <span class="fu">arrow</span>(<span class="at">length =</span> <span class="fu">unit</span>(<span class="fl">0.15</span>, <span class="st">&quot;inches&quot;</span>)),</span>
<span id="cb57-15"><a href="#cb57-15" tabindex="-1"></a>               <span class="at">colour =</span> <span class="st">&quot;#7400b8&quot;</span>) <span class="sc">+</span></span>
<span id="cb57-16"><a href="#cb57-16" tabindex="-1"></a>  <span class="fu">geom_label</span>(<span class="at">data =</span> arrows,<span class="fu">aes</span>(<span class="at">label =</span> Label, <span class="at">x =</span> RDA1, <span class="at">y =</span> RDA2, <span class="at">fontface=</span><span class="st">&quot;bold&quot;</span>))<span class="sc">+</span></span>
<span id="cb57-17"><a href="#cb57-17" tabindex="-1"></a>  <span class="fu">coord_fixed</span>() <span class="sc">+</span> <span class="fu">theme_classic</span>() <span class="sc">+</span></span>
<span id="cb57-18"><a href="#cb57-18" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>))</span>
<span id="cb57-19"><a href="#cb57-19" tabindex="-1"></a></span>
<span id="cb57-20"><a href="#cb57-20" tabindex="-1"></a><span class="fu">ggsave</span>(rda.plot1,<span class="at">filename =</span> <span class="st">&quot;16S_RDA_example1.png&quot;</span>, <span class="at">width=</span><span class="dv">15</span>, <span class="at">height=</span><span class="dv">12</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
<center>
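<img src="amplicon_workflow/16S_RDA_example1.png" alt="Redundancy Analysis of Clear Cut Soil samples drawn with ggplot2, example 1 (biplot arrows not amplified)" />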
</center>
<div align="center">
Figure 17a: Redundancy Analysis of Clear Cut Soil samples w/
<code>ggplot2</code> Example 1
</div>
<p><br /></p>
<div class="sourceCode" id="cb58"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb58-1"><a href="#cb58-1" tabindex="-1"></a>rda.plot2<span class="ot">&lt;-</span><span class="fu">ggplot</span>(rda.axes, <span class="fu">aes</span>(<span class="at">x =</span> RDA1, <span class="at">y =</span> RDA2)) <span class="sc">+</span> <span class="fu">geom_point</span>(<span class="at">size=</span><span class="dv">3</span>) <span class="sc">+</span> </span>
<span id="cb58-2"><a href="#cb58-2" tabindex="-1"></a>  <span class="fu">geom_segment</span>(<span class="at">data =</span> arrows,<span class="at">mapping =</span> <span class="fu">aes</span>(<span class="at">x =</span> <span class="dv">0</span>, <span class="at">y =</span> <span class="dv">0</span>, <span class="at">xend =</span> RDA1<span class="sc">*</span><span class="dv">19</span>, <span class="at">yend =</span> RDA2<span class="sc">*</span><span class="dv">19</span>),<span class="at">lineend =</span> <span class="st">&quot;round&quot;</span>, <span class="co"># See available arrow types in example above</span></span>
<span id="cb58-3"><a href="#cb58-3" tabindex="-1"></a>               <span class="at">linejoin =</span> <span class="st">&quot;round&quot;</span>,</span>
<span id="cb58-4"><a href="#cb58-4" tabindex="-1"></a>               <span class="at">size =</span> <span class="fl">0.8</span>, </span>
<span id="cb58-5"><a href="#cb58-5" tabindex="-1"></a>               <span class="at">arrow =</span> <span class="fu">arrow</span>(<span class="at">length =</span> <span class="fu">unit</span>(<span class="fl">0.15</span>, <span class="st">&quot;inches&quot;</span>)),</span>
<span id="cb58-6"><a href="#cb58-6" tabindex="-1"></a>               <span class="at">colour =</span> <span class="st">&quot;#7400b8&quot;</span>) <span class="sc">+</span></span>
<span id="cb58-7"><a href="#cb58-7" tabindex="-1"></a>  <span class="fu">geom_label</span>(<span class="at">data =</span> arrows,<span class="fu">aes</span>(<span class="at">label =</span> Label, <span class="at">x =</span> RDA1<span class="sc">*</span><span class="dv">21</span>, <span class="at">y =</span> RDA2<span class="sc">*</span><span class="dv">21</span>, <span class="at">fontface=</span><span class="st">&quot;bold&quot;</span>), <span class="at">size=</span><span class="dv">6</span>)<span class="sc">+</span></span>
<span id="cb58-8"><a href="#cb58-8" tabindex="-1"></a>  <span class="fu">coord_fixed</span>() <span class="sc">+</span> <span class="fu">theme_classic</span>() <span class="sc">+</span> </span>
<span id="cb58-9"><a href="#cb58-9" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">axis.title.x =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.title.y =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">13</span>),<span class="at">axis.text =</span> <span class="fu">element_text</span>(<span class="at">size=</span><span class="dv">11</span>),<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">vjust=</span><span class="dv">1</span>))</span>
<span id="cb58-10"><a href="#cb58-10" tabindex="-1"></a></span>
<span id="cb58-11"><a href="#cb58-11" tabindex="-1"></a><span class="fu">ggsave</span>(rda.plot2,<span class="at">filename =</span> <span class="st">&quot;16S_RDA_example2.png&quot;</span>, <span class="at">width=</span><span class="dv">15</span>, <span class="at">height=</span><span class="dv">12</span>, <span class="at">dpi=</span><span class="dv">600</span>)</span></code></pre></div>
<center>
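<img src="amplicon_workflow/16S_RDA_example2.png" alt="Redundancy Analysis of Clear Cut Soil samples drawn with ggplot2, example 2 (biplot arrows amplified)" />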
</center>
<div align="center">
Figure 17b: Redundancy Analysis of Clear Cut Soil samples w/
<code>ggplot2</code> Example 2
</div>
<p><br /></p>
<p>The first ggplot example does not amplify the biplot axes from our
RDA as we see with the RDA created by <code>autoplot()</code>. The
second plot looks more like what <code>autoplot()</code> produced, which
contains axes that were amplified to clearly show the trends. It’s
important to be cautious of this type of ordination manipulation because
you do not want to “identify” relationships in your data that do not
actually exist. That is why I recommend comparing your RDA axes from the
RDA <code>summary()</code> (i.e., your site, biplot, and species axes)
so that you know exactly how much variation is being described by your
environmental variables.</p>
<p>If our RDA had shown these variables as being significant drivers of
community composition in the Clear Cut Soil samples, that is what I
would describe here. It is also wise to include a table that describes
the significance of your RDA or CCA as well as the variation (i.e.,
adjusted R<sup>2</sup> value) explained by your model. However, because
the chemical data used for this portion of the workflow is fictional, I
won’t describe this figure in depth.</p>
<p>Because none of my environmental variables were significant drivers
of community composition, I should <em>not</em> have included them in my
RDA. Remember that it’s important to <em>only</em> show environmental
variables on your RDAs or CCAs that <strong>significantly</strong>
describe the variation in your compositional data that’s explained by
your model.</p>
</div>
</div>
</div>
<div id="conclusion" class="section level1" number="5">
<h1><span class="header-section-number">5</span> Conclusion</h1>
<p>Congratulations, we have reached the end of this workflow! By now you
should have some analyses and figures for your data set, and you can
start to get to work on interpreting your results. We can do ourselves a
favor and save everything in our global environment by running the
following line of code. To load this data into your global environment,
just run <code>load("amplicon_WF_environment.Rdata")</code> in your
console, and your global environment will be populated with your
data.</p>
<div class="sourceCode" id="cb59"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb59-1"><a href="#cb59-1" tabindex="-1"></a><span class="fu">save.image</span>(<span class="st">&quot;amplicon_WF_environment.Rdata&quot;</span>)</span>
<span id="cb59-2"><a href="#cb59-2" tabindex="-1"></a><span class="co"># to load saved objects/packages/functions, run: load(&quot;amplicon_WF_environment.Rdata&quot;)</span></span></code></pre></div>
<p>Please keep in mind that this is a collection of scripts and analyses
that I have created, and this is by no means a Bible for microbial
ecology! You do not have to take all of these steps with your own data,
and if you have more efficient ways of doing some of these steps, then I
encourage you to use those methods. This workflow is meant to serve as a
jumping off point rather than a final destination, so please do whatever
you think is necessary to investigate the patterns you’re observing in
your microbial communities.</p>
<p>If you’d like to get a hold of me to offer me feedback about this
workflow, or if you’d like to discuss more of these steps/programs/stats
in depth, do not hesitate to reach out. My contact information is in
the <a href="#about-me">About Me</a> section.</p>
<p>Thank you for following along!</p>
</div>
<div id="version-information" class="section level1" number="6">
<h1><span class="header-section-number">6</span> Version
Information</h1>
<div class="sourceCode" id="cb60"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb60-1"><a href="#cb60-1" tabindex="-1"></a><span class="fu">sessionInfo</span>()</span></code></pre></div>
<pre><code>## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Ventura 13.5.2
## 
## Matrix products: default
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.31    R6_2.5.1         jsonlite_1.8.4   lifecycle_1.0.3  magrittr_2.0.3   evaluate_0.20    scales_1.2.1     stringi_1.7.12  
##  [9] cachem_1.0.7     rlang_1.1.0      cli_3.6.1        rstudioapi_0.14  jquerylib_0.1.4  bslib_0.4.2      vctrs_0.6.1      rmarkdown_2.20  
## [17] tools_4.2.2      stringr_1.5.0    glue_1.6.2       munsell_0.5.0    xfun_0.40        yaml_2.3.7       fastmap_1.1.1    compiler_4.2.2  
## [25] colorspace_2.1-0 htmltools_0.5.4  knitr_1.42       sass_0.4.5</code></pre>
</div>
<div id="about-me" class="section level1" number="7">
<h1><span class="header-section-number">7</span> About Me</h1>
<p>My name is Linton and my pronouns are they/them. I am currently a PhD
Student at UC Riverside in the <a href="https://ggb.ucr.edu/">Genetics,
Genomics, and Bioinformatics</a> PhD program and a member of Dr. Emma
Aronson’s lab.</p>
<p>If you have any questions regarding this workflow and the scripts I
used, do not hesitate to contact <a
href="mailto:hannahfreund3@gmail.com?subject=Amplicon%20Workflow">me</a>.
Or, if you’d like to talk bioinformatics and all things ’omics, I would
love that too!</p>
<div align="center">
<strong>Thank you so much for checking out my workflow!</strong>
</div>
<center>
<a
href="https://sourceforge.net/projects/amplicon-sequencing-worfklow/files/latest/download"><img
src="https://a.fsdn.com/con/app/sf-download-button"
alt="Download AmpliconSequencingWorfklow" /></a>
</center>
<center>
<a href="https://doi.org/10.5281/zenodo.8264886"><img
src="https://zenodo.org/badge/DOI/10.5281/zenodo.8264886.svg"
alt="DOI" /></a>
</center>
</div>
<div id="references" class="section level1 unnumbered">
<h1 class="unnumbered">References</h1>
<div id="refs" class="references csl-bib-body hanging-indent">
<div id="ref-Andrews" class="csl-entry">
Andrews, S. n.d. <span>“<span class="nocase">FastQC: A Quality Control
Tool for High Throughput Sequence Data</span>.”</span>
</div>
<div id="ref-Bukin2019" class="csl-entry">
Bukin, Yu S., Yu P. Galachyants, I. V. Morozov, S. V. Bukin, A. S.
Zakharenko, and T. I. Zemskaya. 2019. <span>“<span class="nocase">The
effect of 16s rRNA region choice on bacterial community metabarcoding
results</span>.”</span> <em>Scientific Data</em> 6: 1–14. <a
href="https://doi.org/10.1038/sdata.2019.7">https://doi.org/10.1038/sdata.2019.7</a>.
</div>
<div id="ref-Bushnell" class="csl-entry">
Bushnell, Brian. n.d. <span>“<span>BBMap</span>.”</span> <a
href="https://sourceforge.net/projects/bbmap/">sourceforge.net/projects/bbmap/</a>.
</div>
<div id="ref-Callahan2017" class="csl-entry">
Callahan, Benjamin J., Paul J. McMurdie, and Susan P. Holmes. 2017.
<span>“<span class="nocase">Exact sequence variants should replace
operational taxonomic units in marker-gene data analysis</span>.”</span>
<em>ISME Journal</em> 11 (12): 2639–43. <a
href="https://doi.org/10.1038/ismej.2017.119">https://doi.org/10.1038/ismej.2017.119</a>.
</div>
<div id="ref-Callahan2016" class="csl-entry">
Callahan, Benjamin J., Paul J. McMurdie, Michael J. Rosen, Andrew W.
Han, Amy Jo A. Johnson, and Susan P. Holmes. 2016. <span>“<span
class="nocase">DADA2: High-resolution sample inference from Illumina
amplicon data</span>.”</span> <em>Nature Methods</em> 13 (7): 581–83. <a
href="https://doi.org/10.1038/nmeth.3869">https://doi.org/10.1038/nmeth.3869</a>.
</div>
<div id="ref-Davis2018" class="csl-entry">
Davis, Nicole M., Diana M. Proctor, Susan P. Holmes, David A. Relman,
and Benjamin J. Callahan. 2018. <span>“Simple Statistical Identification
and Removal of Contaminant Sequences in Marker-Gene and Metagenomics
Data.”</span> <em>Microbiome</em> 6 (December). <a
href="https://doi.org/10.1186/s40168-018-0605-2">https://doi.org/10.1186/s40168-018-0605-2</a>.
</div>
<div id="ref-Edgar2015" class="csl-entry">
Edgar, Robert C., and Henrik Flyvbjerg. 2015. <span>“<span
class="nocase">Error filtering, pair assembly and error correction for
next-generation sequencing reads</span>.”</span> <em>Bioinformatics</em>
31 (21): 3476–82. <a
href="https://doi.org/10.1093/bioinformatics/btv401">https://doi.org/10.1093/bioinformatics/btv401</a>.
</div>
<div id="ref-Gloor2017" class="csl-entry">
Gloor, Gregory B., Jean M. Macklaim, Vera Pawlowsky-Glahn, and Juan J.
Egozcue. 2017. <span>“<span class="nocase">Microbiome datasets are
compositional: And this is not optional</span>.”</span> <em>Frontiers in
Microbiology</em> 8 (NOV): 1–6. <a
href="https://doi.org/10.3389/fmicb.2017.02224">https://doi.org/10.3389/fmicb.2017.02224</a>.
</div>
<div id="ref-Illumina" class="csl-entry">
Illumina. n.d. <span>“<span class="nocase">High-Speed, Multiplexed 16S
Microbial Sequencing on the MiSeq<span></span> System</span>.”</span> <a
href="http://files/998/High-Speed, Multiplexed 16S Microbial Sequencing o.pdf">http://files/998/High-Speed,
Multiplexed 16S Microbial Sequencing o.pdf</a>.
</div>
<div id="ref-Kassambara2020" class="csl-entry">
Kassambara, Alboukadel. 2020. <span>“<span class="nocase">ggpubr:
’ggplot2’ Based Publication Ready Plots</span>.”</span> CRAN. <a
href="https://rpkgs.datanovia.com/ggpubr/">https://rpkgs.datanovia.com/ggpubr/</a>.
</div>
<div id="ref-Lee2019" class="csl-entry">
Lee, Michael. 2019. <span>“<span class="nocase">Happy Belly
Bioinformatics: an open-source resource dedicated to helping biologists
utilize bioinformatics</span>.”</span> <em>Journal of Open Source
Education</em> 2 (19): 53. <a
href="https://doi.org/10.21105/jose.00053">https://doi.org/10.21105/jose.00053</a>.
</div>
<div id="ref-McMurdie2014" class="csl-entry">
McMurdie, Paul J., and Susan Holmes. 2014. <span>“<span>Waste Not, Want
Not: Why Rarefying Microbiome Data Is Inadmissible</span>.”</span>
<em>PLoS Computational Biology</em> 10 (4). <a
href="https://doi.org/10.1371/journal.pcbi.1003531">https://doi.org/10.1371/journal.pcbi.1003531</a>.
</div>
<div id="ref-Morton2017" class="csl-entry">
Morton, James T., Liam Toran, Anna Edlund, Jessica L. Metcalf, Christian
Lauber, and Rob Knight. 2017. <span>“<span class="nocase">Uncovering the
Horseshoe Effect in Microbial Analyses</span>.”</span> <em>mSystems</em>
2 (1): 1–8. <a
href="https://doi.org/10.1128/msystems.00166-16">https://doi.org/10.1128/msystems.00166-16</a>.
</div>
<div id="ref-Oksanen2020" class="csl-entry">
Oksanen, Jari, F. Guillaume Blanchet, Michael Friendly, Roeland Kindt,
Pierre Legendre, Dan McGlinn, Petter R. Minchin, et al. 2020.
<span>“<span class="nocase">vegan: Community Ecology
Package</span>.”</span> CRAN. <a
href="https://cran.r-project.org/package=vegan">https://cran.r-project.org/package=vegan</a>.
</div>
<div id="ref-Prodan2020" class="csl-entry">
Prodan, Andrei, Valentina Tremaroli, Harald Brolin, Aeilko H.
Zwinderman, Max Nieuwdorp, and Evgeni Levin. 2020. <span>“<span
class="nocase">Comparing bioinformatic pipelines for microbial 16S rRNA
amplicon sequencing</span>.”</span> <em>PLoS ONE</em> 15 (1): 1–19. <a
href="https://doi.org/10.1371/journal.pone.0227434">https://doi.org/10.1371/journal.pone.0227434</a>.
</div>
<div id="ref-Quinn2021" class="csl-entry">
Quinn, Thomas P., and Ionas Erb. 2021. <span>“<span
class="nocase">Examining microbe–metabolite correlations by linear
methods</span>.”</span> <em>Nature Methods</em> 18 (1): 37–39. <a
href="https://doi.org/10.1038/s41592-020-01006-1">https://doi.org/10.1038/s41592-020-01006-1</a>.
</div>
<div id="ref-Quinn2019" class="csl-entry">
Quinn, Thomas P., Ionas Erb, Greg Gloor, Cedric Notredame, Mark F.
Richardson, and Tamsyn M. Crowley. 2019. <span>“<span class="nocase">A
field guide for the compositional analysis of any-omics
data</span>.”</span> <em>GigaScience</em> 8 (9): 1–14. <a
href="https://doi.org/10.1093/gigascience/giz107">https://doi.org/10.1093/gigascience/giz107</a>.
</div>
<div id="ref-Quinn2018" class="csl-entry">
Quinn, Thomas P., Ionas Erb, Mark F. Richardson, and Tamsyn M. Crowley.
2018. <span>“<span class="nocase">Understanding sequencing data as
compositions: An outlook and review</span>.”</span>
<em>Bioinformatics</em> 34 (16): 2870–78. <a
href="https://doi.org/10.1093/bioinformatics/bty175">https://doi.org/10.1093/bioinformatics/bty175</a>.
</div>
<div id="ref-Rausch2019" class="csl-entry">
Rausch, Philipp, Malte Rühlemann, Britt M. Hermes, Shauni Doms, Tal
Dagan, Katja Dierking, Hanna Domin, et al. 2019. <span>“<span
class="nocase">Comparative analysis of amplicon and metagenomic
sequencing methods reveals key features in the evolution of animal
metaorganisms</span>.”</span> <em>Microbiome</em> 7 (1): 1–19. <a
href="https://doi.org/10.1186/s40168-019-0743-1">https://doi.org/10.1186/s40168-019-0743-1</a>.
</div>
<div id="ref-Rosen2012" class="csl-entry">
Rosen, Michael J., Benjamin J. Callahan, Daniel S. Fisher, and Susan P.
Holmes. 2012. <span>“<span class="nocase">Denoising PCR-amplified
metagenome data</span>.”</span> <em>BMC Bioinformatics</em> 13 (1). <a
href="https://doi.org/10.1186/1471-2105-13-283">https://doi.org/10.1186/1471-2105-13-283</a>.
</div>
<div id="ref-Taylor2016" class="csl-entry">
Taylor, D. Lee, William A. Walters, Niall J. Lennon, James Bochicchio,
Andrew Krohn, J. Gregory Caporaso, and Taina Pennanen. 2016.
<span>“<span class="nocase">Accurate estimation of fungal diversity and
abundance through improved lineage-specific primers optimized for
Illumina amplicon sequencing</span>.”</span> <em>Applied and
Environmental Microbiology</em> 82 (24): 7217–26. <a
href="https://doi.org/10.1128/AEM.02576-16">https://doi.org/10.1128/AEM.02576-16</a>.
</div>
<div id="ref-Vargas-Albores2017" class="csl-entry">
Vargas-Albores, Francisco, Luis Enrique Ortiz-Suárez, Enrique
Villalpando-Canchola, and Marcel Martínez-Porchas. 2017. <span>“<span
class="nocase">Size-variable zone in V3 region of 16S
rRNA</span>.”</span> <em>RNA Biology</em> 14 (11): 1514–21. <a
href="https://doi.org/10.1080/15476286.2017.1317912">https://doi.org/10.1080/15476286.2017.1317912</a>.
</div>
<div id="ref-Wang2007" class="csl-entry">
Wang, Qiong, George M. Garrity, James M. Tiedje, and James R. Cole.
2007. <span>“<span class="nocase">Na<span class="nocase">ï</span>ve
Bayesian classifier for rapid assignment of rRNA sequences into the new
bacterial taxonomy</span>.”</span> <em>Applied and Environmental
Microbiology</em> 73 (16): 5261–67. <a
href="https://doi.org/10.1128/AEM.00062-07">https://doi.org/10.1128/AEM.00062-07</a>.
</div>
<div id="ref-Weiss2017" class="csl-entry">
Weiss, Sophie, Zhenjiang Zech Xu, Shyamal Peddada, Amnon Amir, Kyle
Bittinger, Antonio Gonzalez, Catherine Lozupone, et al. 2017.
<span>“<span class="nocase">Normalization and microbial differential
abundance strategies depend upon data characteristics</span>.”</span>
<em>Microbiome</em> 5 (1): 1–18. <a
href="https://doi.org/10.1186/s40168-017-0237-y">https://doi.org/10.1186/s40168-017-0237-y</a>.
</div>
<div id="ref-Wickham2007" class="csl-entry">
Wickham, Hadley. 2007. <span>“<span class="nocase">Reshaping Data with
the reshape Package</span>.”</span> <em>Journal of Statistical
Software</em> 21 (12): 6–17. <a
href="https://doi.org/10.18637/jss.v021.i12">https://doi.org/10.18637/jss.v021.i12</a>.
</div>
<div id="ref-Wickham2016" class="csl-entry">
———. 2016. <span>“<span class="nocase">ggplot2: Elegant Graphics for
Data Analysis</span>.”</span> Springer-Verlag New York.
</div>
</div>
</div>


</div>
</div>

</div>

<script>

// Apply Bootstrap table classes to the tables pandoc generates.
// Tables are located through their `tr.odd` rows: the <table> that is the
// parent of each such row's parent <tbody> receives the classes.
function bootstrapStylePandocTables() {
  var $oddRows = $('tr.odd');
  var $pandocTables = $oddRows.parent('tbody').parent('table');
  $pandocTables.addClass('table table-condensed');
}

// Run the styling pass once the DOM is ready.
$(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- tabsets -->

<script>
// Once the DOM is ready, call window.buildTabsets with the "TOC" id.
$(function () {
  window.buildTabsets("TOC");
});

// Once the DOM is ready, make each entry of a dropdown-style tabset toggle
// the `nav-tabs-open` class on its parent tab list when clicked.
$(function () {
  var $dropdownEntries = $('.tabset-dropdown > .nav-tabs > li');
  $dropdownEntries.on('click', function () {
    var $tabList = $(this).parent();
    $tabList.toggleClass('nav-tabs-open');
  });
});
</script>

<!-- code folding -->
<script>
// Once the DOM is ready, start code folding. The flag passed along is the
// result of comparing two string literals (presumably filled in by the page
// template; confirm), which evaluates to true here.
$(function () {
  var foldingFlag = "show" === "show";
  window.initializeCodeFolding(foldingFlag);
});
</script>

<script>
$(function () {

    // Elements carrying both .unlisted and .unnumbered get the toc-ignore
    // marker, for consistency with Pandoc
    $('.unlisted.unnumbered').addClass('toc-ignore');

    // A toc-ignore marker placed on a section wrapper is transferred to the
    // wrapper's direct h1-h5 children
    var $markedSections = $('div.section.toc-ignore');
    $markedSections.removeClass('toc-ignore');
    $markedSections.children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // Derive an anchor hash from heading text: strip the listed punctuation
    // characters, then replace every whitespace character with an underscore
    function makeHash(text) {
      var stripped = text.replace(/[.\\/?&!#<>]/g, '');
      return stripped.replace(/\s/g, '_');
    }

    // All tocify options collected in a single literal
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: makeHash,
      ignoreSelector: ".toc-ignore",
      scrollTo: 0,
      showAndHide: true,
      smoothScroll: true
    };

    // Build the table of contents inside #TOC; the widget handle is looked up
    // but not used any further in this block
    var tocWidget = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Create the MathJax <script> element at runtime and append it to <head>,
  // instead of declaring it statically in the markup.
  (function () {
    var headEl = document.getElementsByTagName("head")[0];
    var mathjaxTag = document.createElement("script");
    mathjaxTag.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    mathjaxTag.type = "text/javascript";
    headEl.appendChild(mathjaxTag);
  })();
</script>

</body>
</html>