Skip to content

Commit

Permalink
genbank parser now compatible
Browse files Browse the repository at this point in the history
  • Loading branch information
Keoni Gandall committed Sep 13, 2023
1 parent 90316d3 commit 7b2cd52
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 1,158 deletions.
30 changes: 24 additions & 6 deletions bio/bio.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

"github.com/TimothyStiles/poly/bio/fasta"
"github.com/TimothyStiles/poly/bio/fastq"
"github.com/TimothyStiles/poly/bio/genbank"
"github.com/TimothyStiles/poly/bio/pileup"
"github.com/TimothyStiles/poly/bio/slow5"
"golang.org/x/sync/errgroup"
Expand All @@ -29,6 +30,7 @@ type Format int
const (
// Values are assigned by iota in declaration order, so inserting a format
// in the middle of this list renumbers every format declared after it.
Fasta Format = iota
Fastq
Genbank
Slow5
Pileup
)
Expand All @@ -41,10 +43,11 @@ const (
// particular reason to use a different number.
const defaultMaxLineLength int = bufio.MaxScanTokenSize // 64kB is a magic number often used by the Go stdlib for parsing.
var DefaultMaxLengths = map[Format]int{
Fasta: defaultMaxLineLength,
Fastq: 8 * 1024 * 1024, // The longest single nanopore sequencing read so far is 4Mb. A 8mb buffer should be large enough for any sequencing.
Slow5: 128 * 1024 * 1024, // 128mb is used because slow5 lines can be massive, since a single read can be many millions of base pairs.
Pileup: defaultMaxLineLength,
Fasta: defaultMaxLineLength,
Fastq: 8 * 1024 * 1024, // The longest single nanopore sequencing read so far is 4Mb. A 8mb buffer should be large enough for any sequencing.
Genbank: defaultMaxLineLength,
Slow5: 128 * 1024 * 1024, // 128mb is used because slow5 lines can be massive, since a single read can be many millions of base pairs.
Pileup: defaultMaxLineLength,
}

/******************************************************************************
Expand All @@ -66,18 +69,22 @@ Lower level interfaces
// for this is needed at the last Next(), when it returns an io.EOF error. A
// pointer is used to represent the difference between a null DataType and an
// empty DataType.
type parserInterface[DataType fasta.Record | fastq.Read | slow5.Read | pileup.Line, DataTypeHeader fasta.Header | fastq.Header | slow5.Header | pileup.Header] interface {
type parserInterface[DataType fasta.Record | fastq.Read | genbank.Genbank | slow5.Read | pileup.Line, DataTypeHeader fasta.Header | fastq.Header | genbank.Header | slow5.Header | pileup.Header] interface {
// Header returns a pointer to the format's header, or an error.
Header() (*DataTypeHeader, error)
// Next returns a pointer to the next record, or an error. The last call
// returns an io.EOF error; the pointer result distinguishes a null
// DataType from an empty one.
Next() (*DataType, error)
}

// The following compile-time checks assert that all DataType and
// DataTypeHeader types implement the io.WriterTo interface.
var _ io.WriterTo = (*fasta.Record)(nil)
var _ io.WriterTo = (*fastq.Read)(nil)

// NOTE(review): there is no assertion for *genbank.Genbank here yet —
// presumably pending a WriteTo method on that type; confirm and add it.
var _ io.WriterTo = (*slow5.Read)(nil)
var _ io.WriterTo = (*pileup.Line)(nil)
var _ io.WriterTo = (*fasta.Header)(nil)
var _ io.WriterTo = (*fastq.Header)(nil)

// NOTE(review): there is no assertion for *genbank.Header here yet —
// presumably pending a WriteTo method on that type; confirm and add it.
var _ io.WriterTo = (*slow5.Header)(nil)
var _ io.WriterTo = (*pileup.Header)(nil)

Expand All @@ -90,7 +97,7 @@ Higher level parse
// Parser is generic bioinformatics file parser. It contains a LowerLevelParser
// and implements useful functions on top of it: such as Parse(), ParseToChannel(), and
// ParseWithHeader().
type Parser[DataType fasta.Record | fastq.Read | slow5.Read | pileup.Line, DataTypeHeader fasta.Header | fastq.Header | slow5.Header | pileup.Header] struct {
type Parser[DataType fasta.Record | fastq.Read | genbank.Genbank | slow5.Read | pileup.Line, DataTypeHeader fasta.Header | fastq.Header | genbank.Header | slow5.Header | pileup.Header] struct {
// parserInterface is the format-specific lower-level parser that supplies
// Header() and Next() for the chosen DataType/DataTypeHeader pair.
parserInterface parserInterface[DataType, DataTypeHeader]
}

Expand All @@ -116,6 +123,17 @@ func NewFastqParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[fa
return &Parser[fastq.Read, fastq.Header]{parserInterface: fastq.NewParser(r, maxLineLength)}, nil
}

// NewGenbankParser initiates a new Genbank parser from an io.Reader, using
// the default maximum line length registered for the Genbank format.
func NewGenbankParser(r io.Reader) (*Parser[genbank.Genbank, genbank.Header], error) {
	defaultLength := DefaultMaxLengths[Genbank]
	return NewGenbankParserWithMaxLineLength(r, defaultLength)
}

// NewGenbankParserWithMaxLineLength initiates a new Genbank parser from an
// io.Reader, passing the caller-supplied maxLineLength to the underlying
// genbank parser. The returned error is always nil.
func NewGenbankParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[genbank.Genbank, genbank.Header], error) {
	lowLevel := genbank.NewParser(r, maxLineLength)
	wrapped := &Parser[genbank.Genbank, genbank.Header]{parserInterface: lowLevel}
	return wrapped, nil
}

// NewSlow5Parser initiates a new SLOW5 parser from an io.Reader.
func NewSlow5Parser(r io.Reader) (*Parser[slow5.Read, slow5.Header], error) {
return NewSlow5ParserWithMaxLineLength(r, DefaultMaxLengths[Slow5])
Expand Down
82 changes: 55 additions & 27 deletions bio/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,8 @@ import (

"github.com/TimothyStiles/poly/bio"
"github.com/TimothyStiles/poly/bio/fasta"
"github.com/TimothyStiles/poly/bio/genbank"
"github.com/TimothyStiles/poly/bio/gff"
"github.com/TimothyStiles/poly/bio/polyjson"
)

// Example collects the integration-style examples that would otherwise
// introduce cyclic dependencies between the format packages.
func Example() {
	// Poly reads basic gff, gbk, fasta, and JSON inputs, and every format has
	// a matching Write function for output.
	// The json package is called "pson" (poly JSON) to prevent a namespace
	// collision with Go's standard json package.

	// GFF round trip.
	gffRecord, _ := gff.Read("../data/ecoli-mg1655-short.gff")
	_ = gff.Write(gffRecord, "test.gff")

	// Genbank round trip.
	gbkRecord, _ := genbank.Read("../data/puc19.gbk")
	_ = genbank.Write(gbkRecord, "test.gbk")

	// Fasta round trip (currently disabled).
	//fastaInput, _ := fasta.Read("fasta/data/base.fasta")
	//_ = fasta.WriteFile(fastaInput, "test.fasta")

	// JSON round trip.
	jsonRecord, _ := polyjson.Read("../data/cat.json")
	_ = polyjson.Write(jsonRecord, "test.json")

	// Extra tips:

	// 1. All of these file formats can be read and written in JSON format using their native schemas.
	// 2. To convert from one format to another (e.g. genbank to polyjson), use a for-loop and some field mapping.
	// 3. Every file format is unique, but they all share a common interface, so they work with almost every native function in Poly.
}

// ExampleRead shows an example of reading a file from disk.
func ExampleRead() {

Check failure on line 16 in bio/example_test.go

View workflow job for this annotation

GitHub Actions / lint

tests: ExampleRead refers to unknown identifier: Read (govet)
// Read lets you read files from disk into a parser.
Expand All @@ -50,8 +24,8 @@ func ExampleRead() {
// Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK*
}

// Example_readGz shows an example of reading and parsing a gzipped file.
func Example_readGz() {
// ReadGz lets you read gzipped files into a parser.
fileGz, _ := os.Open("fasta/data/base.fasta.gz")
file, _ := gzip.NewReader(fileGz)
parser, _ := bio.NewFastaParser(file)
Expand Down Expand Up @@ -200,6 +174,60 @@ $$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0
// Output:GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT
}

// ExampleNewGenbankParser parses a small Genbank record from an in-memory
// reader and prints the translation attribute of its third feature (the lacZ
// CDS).
//
// The example is named after bio.NewGenbankParser because go vet's tests
// analyzer requires the suffix of an Example function to be an identifier
// declared in the package; "GenbankParser" is not one.
func ExampleNewGenbankParser() {
	// The following can be replaced with any io.Reader. For example,
	// `file, err := os.Open(path)` for file would also work.
	file := strings.NewReader(`LOCUS pUC19_lacZ 336 bp DNA linear UNA 12-SEP-2023
DEFINITION natural linear DNA
ACCESSION .
VERSION .
KEYWORDS .
SOURCE natural DNA sequence
ORGANISM unspecified
REFERENCE 1 (bases 1 to 336)
AUTHORS Keoni Gandall
TITLE Direct Submission
JOURNAL Exported Sep 12, 2023 from SnapGene 6.2.2
https://www.snapgene.com
FEATURES Location/Qualifiers
source 1..336
/mol_type="genomic DNA"
/organism="unspecified"
primer_bind 1..17
/label=M13 rev
/note="common sequencing primer, one of multiple similar
variants"
CDS 13..336
/codon_start=1
/gene="lacZ"
/product="LacZ-alpha fragment of beta-galactosidase"
/label=lacZ-alpha
/translation="MTMITPSLHACRSTLEDPRVPSSNSLAVVLQRRDWENPGVTQLNR
LAAHPPFASWRNSEEARTDRPSQQLRSLNGEWRLMRYFLLTHLCGISHRIWCTLSTICS
DAA"
misc_feature 30..86
/label=MCS
/note="pUC19 multiple cloning site"
primer_bind complement(87..103)
/label=M13 fwd
/note="common sequencing primer, one of multiple similar
variants"
ORIGIN
1 caggaaacag ctatgaccat gattacgcca agcttgcatg cctgcaggtc gactctagag
61 gatccccggg taccgagctc gaattcactg gccgtcgttt tacaacgtcg tgactgggaa
121 aaccctggcg ttacccaact taatcgcctt gcagcacatc cccctttcgc cagctggcgt
181 aatagcgaag aggcccgcac cgatcgccct tcccaacagt tgcgcagcct gaatggcgaa
241 tggcgcctga tgcggtattt tctccttacg catctgtgcg gtatttcaca ccgcatatgg
301 tgcactctca gtacaatctg ctctgatgcc gcatag
//
`)
	// Errors are ignored here for brevity, matching the other examples in
	// this file; real callers should check both.
	parser, _ := bio.NewGenbankParser(file)
	records, _ := parser.Parse()

	// Features[2] is the CDS feature, the third entry in the FEATURES table.
	fmt.Println(records[0].Features[2].Attributes["translation"])
	// Output: MTMITPSLHACRSTLEDPRVPSSNSLAVVLQRRDWENPGVTQLNRLAAHPPFASWRNSEEARTDRPSQQLRSLNGEWRLMRYFLLTHLCGISHRIWCTLSTICSDAA
}

func ExampleNewSlow5Parser() {
// The following can be replaced with any io.Reader. For example,
// `file, err := os.Open(path)` for file would also work.
Expand Down
168 changes: 0 additions & 168 deletions bio/genbank/example_test.go

This file was deleted.

Loading

0 comments on commit 7b2cd52

Please sign in to comment.