Skip to content

Commit

Permalink
Merge pull request #39 from will-rowe/babygroot
Browse files Browse the repository at this point in the history
accuracy and memory improvements
  • Loading branch information
Will Rowe authored May 5, 2020
2 parents fffdefa + 3f1429b commit 7a0977c
Show file tree
Hide file tree
Showing 33 changed files with 569 additions and 502 deletions.
2 changes: 1 addition & 1 deletion cmd/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ var alignCmd = &cobra.Command{
func init() {
fastq = alignCmd.Flags().StringSliceP("fastq", "f", []string{}, "FASTQ file(s) to align")
fasta = alignCmd.Flags().Bool("fasta", false, "if set, the input will be treated as fasta sequence(s) (experimental feature)")
containmentThreshold = alignCmd.Flags().Float64P("contThresh", "t", 0.95, "containment threshold for the LSH ensemble")
containmentThreshold = alignCmd.Flags().Float64P("contThresh", "t", 0.99, "containment threshold for the LSH ensemble")
minKmerCoverage = alignCmd.Flags().Float64P("minKmerCov", "c", 1.0, "minimum number of k-mers covering each base of a graph segment")
graphDir = alignCmd.PersistentFlags().StringP("graphDir", "g", defaultGraphDir, "directory to save variation graphs to")
RootCmd.AddCommand(alignCmd)
Expand Down
36 changes: 20 additions & 16 deletions cmd/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ import (

// the command line arguments
var (
kmerSize *int // size of k-mer
sketchSize *int // size of MinHash sketch
windowSize *int // length of query reads (used during alignment subcommand), needed as window length should ~= read length
numPart *int // number of partitions in the LSH Ensemble
maxK *int // maxK in the LSH Ensemble
msaDir *string // directory containing the input MSA files
msaList []string // the collected MSA files
kmerSize *int // size of k-mer
sketchSize *int // size of MinHash sketch
windowSize *int // length of query reads (used during alignment subcommand), needed as window length should ~= read length
numPart *int // number of partitions in the LSH Ensemble
maxK *int // maxK in the LSH Ensemble
maxSketchSpan *int // max distance between merged sketches
msaDir *string // directory containing the input MSA files
msaList []string // the collected MSA files
)

// the index command (used by cobra)
Expand All @@ -41,11 +42,12 @@ var indexCmd = &cobra.Command{

// a function to initialise the command line arguments
func init() {
kmerSize = indexCmd.Flags().IntP("kmerSize", "k", 21, "size of k-mer")
sketchSize = indexCmd.Flags().IntP("sketchSize", "s", 42, "size of MinHash sketch")
kmerSize = indexCmd.Flags().IntP("kmerSize", "k", 31, "size of k-mer")
sketchSize = indexCmd.Flags().IntP("sketchSize", "s", 21, "size of MinHash sketch")
windowSize = indexCmd.Flags().IntP("windowSize", "w", 100, "size of window to sketch graph traversals with")
numPart = indexCmd.Flags().IntP("numPart", "x", 8, "number of partitions in the LSH Ensemble")
maxK = indexCmd.Flags().IntP("maxK", "y", 4, "maxK in the LSH Ensemble")
maxSketchSpan = indexCmd.Flags().Int("maxSketchSpan", 30, "max number of identical neighbouring sketches permitted in any graph traversal")
msaDir = indexCmd.Flags().StringP("msaDir", "m", "", "directory containing the clustered references (MSA files) - required")
indexCmd.MarkFlagRequired("msaDir")
RootCmd.AddCommand(indexCmd)
Expand Down Expand Up @@ -89,16 +91,18 @@ func runIndex() {
log.Printf("\tgraph window size: %d", *windowSize)
log.Printf("\tnum. partitions: %d", *numPart)
log.Printf("\tmax. K: %d", *maxK)
log.Printf("\tmax. sketch span: %d", *maxSketchSpan)

// record the runtime information for the index sub command
info := &pipeline.Info{
Version: version.GetVersion(),
KmerSize: *kmerSize,
SketchSize: *sketchSize,
WindowSize: *windowSize,
NumPart: *numPart,
MaxK: *maxK,
IndexDir: *indexDir,
Version: version.GetVersion(),
KmerSize: *kmerSize,
SketchSize: *sketchSize,
WindowSize: *windowSize,
NumPart: *numPart,
MaxK: *maxK,
MaxSketchSpan: *maxSketchSpan,
IndexDir: *indexDir,
}

// create the pipeline
Expand Down
Binary file added db/clustered-ARG-databases/1.1/arg-annot.90.tar
Binary file not shown.
Binary file added db/clustered-ARG-databases/1.1/card.90.tar
Binary file not shown.
Binary file not shown.
Binary file added db/clustered-ARG-databases/1.1/groot-db.90.tar
Binary file not shown.
Binary file added db/clustered-ARG-databases/1.1/resfinder.90.tar
Binary file not shown.
9 changes: 6 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
author = 'Will Rowe'

# The short X.Y version
version = '1.0'
version = '1.1'
# The full version, including alpha/beta/rc tags
release = '1.0.2'
release = '1.1.0'


# -- General configuration ---------------------------------------------------
Expand Down Expand Up @@ -79,8 +79,11 @@
#
html_theme = 'bootstrap'
html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()


def setup(app):
app.add_stylesheet("my-styles.css") # also can be a full URL
app.add_stylesheet("my-styles.css") # also can be a full URL


html_logo = "groot-logo.png"
html_theme_options = {
Expand Down
1 change: 1 addition & 0 deletions docs/using-groot.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Some more flags that can be used:
- `-s`: size of MinHash sketch
- `-x`: number of partitions in the LSH Ensemble index
- `-y`: maxK in the LSH Ensemble index
- `--maxSketchSpan`: max number of identical neighbouring sketches permitted in any graph traversal

> Important: GROOT can only accept MSAs as input. You can cluster your own database or use `groot get` to obtain a pre-clustered one.
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/will-rowe/groot
go 1.14

require (
github.com/adam-hanna/arrayOperations v0.2.6
github.com/biogo/biogo v1.0.2
github.com/biogo/hts v1.1.0
github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076 // indirect
Expand All @@ -13,5 +14,5 @@ require (
github.com/spf13/cobra v1.0.0
github.com/spf13/pflag v1.0.5
github.com/will-rowe/gfa v0.0.0-20190502084819-05c93955478b
github.com/will-rowe/ntHash v0.0.0-20190624153018-541592fc7931
github.com/will-rowe/nthash v0.2.0
)
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/adam-hanna/arrayOperations v0.2.6 h1:QZC99xC8MgUawXnav7bFMejs/dm7YySnDpMx3oZzz2Y=
github.com/adam-hanna/arrayOperations v0.2.6/go.mod h1:iIzkSjP91FnE66cUFNAjjUJVmjyAwCH0SXnWsx2nbdk=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
Expand Down Expand Up @@ -117,8 +119,8 @@ github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGr
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
github.com/will-rowe/gfa v0.0.0-20190502084819-05c93955478b h1:FPqzpx98kL1aOLiZ5ZjwfI+b/nh60qoSKr+RDhYctVA=
github.com/will-rowe/gfa v0.0.0-20190502084819-05c93955478b/go.mod h1:jSVX6uZtpiVv7aToIC1BEtuh2is2B4/GCkFvprNHgkY=
github.com/will-rowe/ntHash v0.0.0-20190624153018-541592fc7931 h1:QCE806NstAGzZoHhyq+YZNOexOiRj9uNOGf/007yybg=
github.com/will-rowe/ntHash v0.0.0-20190624153018-541592fc7931/go.mod h1:iT1gPVtz/gRasyESdx5RSSLGFoQkzxqP9c6uVszzobU=
github.com/will-rowe/nthash v0.2.0 h1:qLf6wKNZn+FBNspBAJgUvjmhgK/0Wls6r/rRNTZwHQA=
github.com/will-rowe/nthash v0.2.0/go.mod h1:5ezweuK0J5j+/7lih/RkrSmnxI3hoaPpQiVWJ7rd960=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
Expand Down
37 changes: 29 additions & 8 deletions src/graph/alignment.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
func (GrootGraph *GrootGraph) AlignRead(read *seqio.FASTQread, mapping *lshforest.Key, references []*sam.Reference) ([]*sam.Record, error) {

// TODO: move this hardcoded value to CLI options
MaxClip := 2
MaxClip := 1

// store the ID of the first node in the seed
seedNodeID := mapping.Node
Expand All @@ -29,27 +29,48 @@ func (GrootGraph *GrootGraph) AlignRead(read *seqio.FASTQread, mapping *lshfores
startPos := make(map[int]int)
startClippedBases := 0
endClippedBases := 0
origOffSet := mapping.OffSet

// 1. exact alignment and seed offset shuffling
var shuffles int
for shuffles = 0; shuffles <= int(mapping.MergeSpan+3); shuffles++ {
for shuffles = 0; shuffles <= int(mapping.MergeSpan+mapping.WindowSize); shuffles++ {
IDs, startPos = GrootGraph.performAlignment(nodeLookup, &read.Seq, int(mapping.OffSet))
if len(IDs) > 0 {
break
}
mapping.OffSet++
}

/*
TODO:
// 2. reverse seed shuffling (start at last base of the input edge)
*/
// reset the offset
mapping.OffSet = origOffSet

// 3. hard clipping the start of the read
// 2. exact alignment and seed node shuffling
if len(IDs) == 0 {
for shuffledNode := range mapping.ContainedNodes {
var shuffles int
mapping.OffSet = 0
for shuffles = 0; shuffles <= 10; shuffles++ {
nodeLookup, ok := GrootGraph.NodeLookup[shuffledNode]
if !ok {
return nil, fmt.Errorf("could not perform node lookup during alignment - possible incorrect seed")
}
IDs, startPos = GrootGraph.performAlignment(nodeLookup, &read.Seq, int(mapping.OffSet))
if len(IDs) > 0 {
break
}
mapping.OffSet++
}
if len(IDs) > 0 {
break
}
}

// reset the offset
mapping.OffSet -= uint32(shuffles)
mapping.OffSet = origOffSet
}

// 3. hard clipping the start of the read
if len(IDs) == 0 {

// make a copy of the sequence for clipping
clippedSeq := read.Seq
Expand Down
Loading

0 comments on commit 7a0977c

Please sign in to comment.