-
-
Notifications
You must be signed in to change notification settings - Fork 73
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
249 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
// TODO: add Alphabet and KmerCounter for codons | ||
// TODO: create benchmark for space and time costs | ||
// TODO: create efficient storage for children + counts in top node via multidimensional array | ||
// TODO: integrate with IO parsers | ||
// TODO: enable reading and writing to binary format | ||
// TODO: iterate over observed kmers (what is the appropriate iteration method?) | ||
// TODO: comparison between two KmerCounter's | ||
// TODO: initialize upper levels while respecting cache locality | ||
// TODO: add counts from other KmerCounter, to enable reducing parallel counts | ||
|
||
package alphabet | ||
|
||
import ( | ||
"fmt" | ||
) | ||
|
||
type KmerCounter struct { | ||
alphabet *Alphabet | ||
num_symbols uint8 | ||
max_k uint8 | ||
total uint64 | ||
children []Node | ||
} | ||
|
||
type Node struct { | ||
succ *Node | ||
child *Node | ||
encoding uint8 | ||
count uint32 | ||
} | ||
|
||
func NewKmerCounter(a *Alphabet, max_k uint8) *KmerCounter { | ||
kc := new(KmerCounter) | ||
kc.alphabet = a | ||
kc.num_symbols = uint8(len(a.symbols)) | ||
kc.max_k = max_k | ||
|
||
kc.children = make([]Node, kc.num_symbols) | ||
for i := uint8(1); i < kc.num_symbols; i++ { | ||
kc.children[i].encoding = i | ||
kc.children[i-1].succ = &kc.children[i] | ||
} | ||
return kc | ||
} | ||
|
||
func lookupChild(n *Node, encoding uint8) *Node { | ||
for c := n.child; c != nil && encoding <= c.encoding; c = c.succ { | ||
if encoding == c.encoding { return c } | ||
} | ||
return nil | ||
} | ||
|
||
func insertChild(p *Node, encoding uint8, n *Node) { | ||
if p.child == nil { | ||
p.child = n | ||
} else if (n.encoding < p.child.encoding) { | ||
n.succ = p.child | ||
p.child = n | ||
} else { | ||
for c := p.child; c.encoding < encoding; c = c.succ { | ||
if encoding < c.succ.encoding { | ||
n.succ = c.succ | ||
c.succ = n | ||
} | ||
} | ||
} | ||
n.count++ // not sure why this is needed | ||
n.encoding = encoding | ||
} | ||
|
||
func Observe(kc *KmerCounter, seq string) error { | ||
cbuf, err := kc.alphabet.EncodeAll(seq[:kc.max_k]) | ||
if err != nil { return err } | ||
|
||
CreateChildren := func (index int, remaining int) *Node { | ||
if remaining == 0 { return nil } // this condition should never happen | ||
nodes := make([]Node, remaining) | ||
|
||
for j, n := range nodes { | ||
n.count = 1 | ||
if j != 0 { | ||
nodes[j-1].child = &n | ||
n.encoding = cbuf[(index+j) % int(kc.max_k)] | ||
} | ||
} | ||
|
||
return &nodes[0] | ||
} | ||
|
||
UpdateBuffer := func (index int) (max_k int, err error) { | ||
max_k = int(kc.max_k) | ||
lookahead := index + max_k - 1 | ||
var encoding uint8 | ||
if lookahead < len(seq) { | ||
next := string(seq[lookahead]) | ||
encoding, err = kc.alphabet.Encode(next) | ||
if err != nil { | ||
err = fmt.Errorf("in position %d: %w", index, err) | ||
return | ||
} | ||
cbuf[lookahead % int(kc.max_k)] = encoding | ||
} else { | ||
max_k = len(seq) - index | ||
} | ||
return | ||
} | ||
|
||
var encoding uint8 | ||
for i, _ := range seq { | ||
max_k, err := UpdateBuffer(i) | ||
if err != nil { return err } | ||
|
||
p := &kc.children[cbuf[i % int(kc.max_k)]] | ||
kc.total++ | ||
for k := 1; k <= max_k; k++ { | ||
p.count++ | ||
// fmt.Printf("%d %d %v\n", i, k, p) | ||
|
||
if k != max_k { | ||
encoding = cbuf[(i+k) % int(kc.max_k)] | ||
c := lookupChild(p, encoding) | ||
if c == nil { | ||
insertChild(p, encoding, CreateChildren(i+k, max_k-k)) | ||
break // inserted nodes already have count = 1 added | ||
} | ||
p = c | ||
} | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
func LookupCount(kc *KmerCounter, kmer string) (count uint32, err error) { | ||
if len(kmer) > int(kc.max_k) { | ||
err = fmt.Errorf("kmer_counter: attempted to lookup count of %d-mer which exceeds max supported length %d", len(kmer), kc.max_k) | ||
return | ||
} | ||
if len(kmer) == 0 { | ||
err = fmt.Errorf("kmer_counter: attempted to lookup count of 0mer") | ||
return | ||
} | ||
encoded, err := kc.alphabet.EncodeAll(kmer) | ||
if err != nil { return } | ||
|
||
var k int | ||
var p *Node | ||
for k, p = 1, &kc.children[encoded[0]]; k < len(kmer) && p != nil; k++ { | ||
p = lookupChild(p, encoded[k]) | ||
} | ||
if p == nil { return } | ||
count = p.count | ||
return | ||
} | ||
|
||
func LookupFrequency(kc *KmerCounter, kmer string) (float64, error) { | ||
count, err := LookupCount(kc, kmer) | ||
if err != nil { return 0, err } | ||
return float64(count) / float64(kc.total - uint64(len(kmer)-1)), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package alphabet | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
|
||
"github.com/TimothyStiles/poly/transform/variants" | ||
) | ||
|
||
func Assertf (t *testing.T, cond bool, format string, args ...any) { | ||
if !cond { | ||
t.Errorf(format, args...) | ||
} | ||
} | ||
|
||
func TestKmerCounterEmpty(t *testing.T) { | ||
kc := NewKmerCounter(DNA, 3) | ||
|
||
count, err := LookupCount(kc, "A") | ||
assert.EqualValues(t, 0, count) | ||
assert.NoError(t, err) | ||
count, err = LookupCount(kc, "X") | ||
assert.Error(t, err) | ||
} | ||
|
||
func TestKmerCounterRepeated(t *testing.T) { | ||
kc := NewKmerCounter(DNA, 3) | ||
seq := strings.Repeat("A", 12) | ||
assert.Equal(t, 12, len(seq)) | ||
err := Observe(kc, seq) | ||
assert.NoError(t, err) | ||
|
||
assert.Equal(t, 12, int(kc.total)) | ||
|
||
onemers, _ := variants.AllVariantsIUPAC("N") | ||
for _, kmer := range onemers { | ||
count, err := LookupCount(kc, kmer) | ||
assert.NoError(t, err) | ||
if kmer == "A" { | ||
assert.EqualValues(t, 12, count, "Wrong count") | ||
} else { | ||
assert.EqualValues(t, 0, count, "Reports nonzero count for 1mer not present in sequence %v", kmer) | ||
} | ||
} | ||
twomers, _ := variants.AllVariantsIUPAC("NN") | ||
for _, kmer := range twomers { | ||
count, err := LookupCount(kc, kmer) | ||
assert.NoError(t, err) | ||
if kmer == "AA" { | ||
assert.EqualValues(t, 11, count, "Wrong count") | ||
} else { | ||
assert.EqualValues(t, 0, count, "Reports nonzero count for 2mer not present in sequence %v", kmer) | ||
} | ||
} | ||
threemers, _ := variants.AllVariantsIUPAC("NNN") | ||
for _, kmer := range threemers { | ||
count, err := LookupCount(kc, kmer) | ||
assert.NoError(t, err) | ||
if kmer == "AAA" { | ||
assert.EqualValues(t, 10, count, "Wrong count") | ||
} else { | ||
assert.EqualValues(t, 0, count, "Reports nonzero count for 3mer not present in sequence %v", kmer) | ||
} | ||
} | ||
} |