From 53bd8df47c276720fbfddb90d18f47997c4dc716 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 14 Sep 2023 22:08:39 -0700 Subject: [PATCH 01/14] created a super minimal mash function for sketching sequences. --- go.mod | 1 + go.sum | 2 ++ mash/mash.go | 92 +++++++++++++++++++++++++++++++++++++++++++++++ mash/mash_test.go | 20 +++++++++++ 4 files changed, 115 insertions(+) create mode 100644 mash/mash.go create mode 100644 mash/mash_test.go diff --git a/go.mod b/go.mod index 0d403922..41892465 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/mattn/go-sqlite3 v1.14.13 // indirect + github.com/spaolacci/murmur3 v1.1.0 // indirect ) require ( diff --git a/go.sum b/go.sum index 16976009..d47c80d5 100644 --- a/go.sum +++ b/go.sum @@ -22,6 +22,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= diff --git a/mash/mash.go b/mash/mash.go new file mode 100644 index 00000000..3f59ea79 --- /dev/null +++ b/mash/mash.go @@ -0,0 +1,92 @@ +/* +Package mash is for sketching sequence data to make it easier to search for and against. + +The package is named mash after the mash sketching algorithm, which is based on the MinHash algorithm. 
+ +Mash: fast genome and metagenome distance estimation using MinHash. +Ondov, B.D., Treangen, T.J., Melsted, P. et al. +Genome Biol 17, 132 (2016). +https://doi.org/10.1186/s13059-016-0997-x + +Mash Screen: high-throughput sequence containment estimation for genome discovery. +Ondov, B., Starrett, G., Sappington, A. et al. +Genome Biol 20, 232 (2019). +https://doi.org/10.1186/s13059-019-1841-x + +The idea is that we can sketch a sequence of data, and then compare the sketch to other sketches to see how similar they are. +This saves a bunch of computation time and memory, because we don't have to compare the entire sequence to another sequence. + +TTFN, +Tim +*/ +package mash + +import "github.com/spaolacci/murmur3" + +// sketch algorithm +// slide a window of size k along the sequence +// for each kmer in the window, hash it to a 32 or 64 bit number +// keep the minimum hash value of all the kmers in the window up to a given sketch size +// the sketch is a vector of the minimum hash values + +// what are mash's inputs and outputs? 
+// inputs: a + +type Mash struct { + KmerSize uint + SketchSize uint + Sketches []uint32 +} + +func NewMash(kmerSize uint, sketchSize uint) *Mash { + return &Mash{ + KmerSize: kmerSize, + SketchSize: sketchSize, + Sketches: make([]uint32, sketchSize), + } +} + +func (m *Mash) Sketch(sequence string) { + // slide a window of size k along the sequence + for kmerStart := 0; kmerStart < len(sequence)-int(m.KmerSize); kmerStart++ { + kmer := sequence[kmerStart : kmerStart+int(m.KmerSize)] + // hash the kmer to a 32 bit number + hash := murmur3.Sum32([]byte(kmer)) + // keep the minimum hash value of all the kmers in the window up to a given sketch size + // the sketch is a vector of the minimum hash values + var biggestHashIndex int + + // find the biggest hash value in the sketch + for i := 0; i < len(m.Sketches); i++ { + if m.Sketches[i] == 0 { + biggestHashIndex = i + break + } else if m.Sketches[i] > m.Sketches[biggestHashIndex] { + biggestHashIndex = i + } + } + m.Sketches[biggestHashIndex] = hash + } +} + +// distance algorithm +// compare the sketches of two sequences +// count the number of hashes that are the same +// divide the number of hashes that are the same by the total number of hashes +// the result is the distance between the two sequences +func (m *Mash) Distance(other *Mash) float64 { + var sameHashes int + for i := 0; i < len(m.Sketches); i++ { + for j := 0; j < len(other.Sketches); j++ { + if m.Sketches[i] == other.Sketches[j] { + sameHashes++ + break + } + } + } + return 1 - (float64(sameHashes) / float64(len(m.Sketches))) +} + +// for each kmer in the window, hash it to a 32 bit number +// keep the minimum hash value of all the kmers in the window up to a given sketch size +// the sketch is a vector of the minimum hash values diff --git a/mash/mash_test.go b/mash/mash_test.go new file mode 100644 index 00000000..95eafc79 --- /dev/null +++ b/mash/mash_test.go @@ -0,0 +1,20 @@ +package mash_test + +import ( + "testing" + + 
"github.com/TimothyStiles/poly/mash" +) + +func TestMash(t *testing.T) { + fingerprint1 := mash.NewMash(17, 10) + fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA") + + fingerprint2 := mash.NewMash(17, 10) + fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA") + + distance := fingerprint1.Distance(fingerprint2) + if distance != 0 { + t.Errorf("Expected distance to be 0, got %f", distance) + } +} From b66bb6beae9fab888b168c7a9ad28c7e84863bb5 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Tue, 19 Sep 2023 10:08:24 -0700 Subject: [PATCH 02/14] removed cruft. --- mash/mash.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index 3f59ea79..85316adf 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -86,7 +86,3 @@ func (m *Mash) Distance(other *Mash) float64 { } return 1 - (float64(sameHashes) / float64(len(m.Sketches))) } - -// for each kmer in the window, hash it to a 32 bit number -// keep the minimum hash value of all the kmers in the window up to a given sketch size -// the sketch is a vector of the minimum hash values From ca6c6970b884220762ec173797041bbed4d1f255 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Sat, 23 Sep 2023 11:53:55 -0700 Subject: [PATCH 03/14] added comments and removed cruft. --- mash/mash.go | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index 85316adf..cc0e303e 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -21,21 +21,13 @@ Tim */ package mash -import "github.com/spaolacci/murmur3" - -// sketch algorithm -// slide a window of size k along the sequence -// for each kmer in the window, hash it to a 32 or 64 bit number -// keep the minimum hash value of all the kmers in the window up to a given sketch size -// the sketch is a vector of the minimum hash values - -// what are mash's inputs and outputs? 
-// inputs: a +import "github.com/spaolacci/murmur3" // murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench +// Mash is a collection of hashes of kmers from a given sequence. type Mash struct { - KmerSize uint - SketchSize uint - Sketches []uint32 + KmerSize uint // The kmer size is the size of the sliding window that is used to generate the hashes. + SketchSize uint // The sketch size is the number of hashes to store. + Sketches []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches. } func NewMash(kmerSize uint, sketchSize uint) *Mash { @@ -69,11 +61,6 @@ func (m *Mash) Sketch(sequence string) { } } -// distance algorithm -// compare the sketches of two sequences -// count the number of hashes that are the same -// divide the number of hashes that are the same by the total number of hashes -// the result is the distance between the two sequences func (m *Mash) Distance(other *Mash) float64 { var sameHashes int for i := 0; i < len(m.Sketches); i++ { From f73adda6297470233d054ea6b2ea789e3cc8b5d7 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Fri, 29 Sep 2023 12:57:44 -0700 Subject: [PATCH 04/14] updated tl;dr for mash at the top of mash.go --- mash/mash.go | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index cc0e303e..5354c502 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -1,5 +1,5 @@ /* -Package mash is for sketching sequence data to make it easier to search for and against. +Package mash is for sketching sequence data to make it easier to compare to other sequence. The package is named mash after the mash sketching algorithm, which is based on the MinHash algorithm. @@ -13,8 +13,27 @@ Ondov, B., Starrett, G., Sappington, A. et al. Genome Biol 20, 232 (2019). 
https://doi.org/10.1186/s13059-019-1841-x -The idea is that we can sketch a sequence of data, and then compare the sketch to other sketches to see how similar they are. -This saves a bunch of computation time and memory, because we don't have to compare the entire sequence to another sequence. +tl;dr for the papers above: + +Comparing biological sequences is really hard because similar sequences can have frameshifts that make it impossible +to measure similarity using more common metric distances like hamming distance or levenshtein distance. + +Bioinformatics and nlp researchers have come up with tons of strings comparison algorithms that are better suited for +comparing biological sequences. For example poly already implements a few of them like the Needleman-Wunsch and Smith-Waterman +algorithms in our "align" package. + +Mash is a different approach to comparing biological sequences. It uses a technique called sketching to reduce the +complexity of the sequence to a vector of hashes. The hashes are generated by sliding a window of size k along the +sequence and hashing each kmer. The hash is then stored in a vector of size s. The vector is sorted and the smallest +hash is kept. The process is repeated until the vector is full. The vector of hashes is the sketch. + +The sketch is then compared to other sketches by counting the number of hashes that are the same between the two sketches. +The number of hashes that are the same is divided by the size of the sketch to get a distance between 0 and 1. + +Hash vectors can only be compared to other hash vectors that use the same sliding window size K. +Sketch size S limits how many hashes can be stored in the vector and the return vector +will always be of size S containing the lexographically smallest hashes that were generated. +The larger the sketch size the more accurate the distance calculation will be but the longer it will take to calculate. 
TTFN, Tim From 0698669b397b2fb2bb061e40e6064276e4d17c2d Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 13:57:33 -0700 Subject: [PATCH 05/14] cleaned up control flow for sketch method. Maybe faster? --- mash/mash.go | 64 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index 5354c502..389349b5 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -18,7 +18,7 @@ tl;dr for the papers above: Comparing biological sequences is really hard because similar sequences can have frameshifts that make it impossible to measure similarity using more common metric distances like hamming distance or levenshtein distance. -Bioinformatics and nlp researchers have come up with tons of strings comparison algorithms that are better suited for +Bioinformatics and nlp researchers have come up with tons of string comparison algorithms that are better suited for comparing biological sequences. For example poly already implements a few of them like the Needleman-Wunsch and Smith-Waterman algorithms in our "align" package. @@ -40,16 +40,20 @@ Tim */ package mash -import "github.com/spaolacci/murmur3" // murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench +import ( + "sort" + + "github.com/spaolacci/murmur3" +) // murmur3 is a fast non-cryptographic hash algorithm that was also used in the original papers-> https://github.com/shenwei356/go-hashing-kmer-bench // Mash is a collection of hashes of kmers from a given sequence. type Mash struct { - KmerSize uint // The kmer size is the size of the sliding window that is used to generate the hashes. - SketchSize uint // The sketch size is the number of hashes to store. + KmerSize int // The kmer size is the size of the sliding window that is used to generate the hashes. + SketchSize int // The sketch size is the number of hashes to store. 
Sketches []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches. } -func NewMash(kmerSize uint, sketchSize uint) *Mash { +func NewMash(kmerSize int, sketchSize int) *Mash { return &Mash{ KmerSize: kmerSize, SketchSize: sketchSize, @@ -57,38 +61,52 @@ func NewMash(kmerSize uint, sketchSize uint) *Mash { } } -func (m *Mash) Sketch(sequence string) { +func (mash *Mash) Sketch(sequence string) { + + // the sketch size is the number of hashes to store. Pre-shifted to avoid off-by-one errors. + maxShiftedSketchSize := mash.SketchSize - 1 + // slide a window of size k along the sequence - for kmerStart := 0; kmerStart < len(sequence)-int(m.KmerSize); kmerStart++ { - kmer := sequence[kmerStart : kmerStart+int(m.KmerSize)] + for kmerStart := 0; kmerStart < len(sequence)-int(mash.KmerSize); kmerStart++ { + kmer := sequence[kmerStart : kmerStart+int(mash.KmerSize)] // hash the kmer to a 32 bit number hash := murmur3.Sum32([]byte(kmer)) // keep the minimum hash value of all the kmers in the window up to a given sketch size // the sketch is a vector of the minimum hash values - var biggestHashIndex int - // find the biggest hash value in the sketch - for i := 0; i < len(m.Sketches); i++ { - if m.Sketches[i] == 0 { - biggestHashIndex = i - break - } else if m.Sketches[i] > m.Sketches[biggestHashIndex] { - biggestHashIndex = i - } + // if the sketch is not full, store the hash in the sketch + if kmerStart < maxShiftedSketchSize { + mash.Sketches[kmerStart] = uint32(hash) + continue } - m.Sketches[biggestHashIndex] = hash + + if kmerStart == maxShiftedSketchSize { + // sort the sketch from smallest to largest + mash.Sketches[maxShiftedSketchSize] = hash + sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] }) + continue + } + + // if the sketch is full and the new hash is smaller than the largest hash in the sketch, + // replace the largest hash with the new hash and sort the sketch + if kmerStart > 
maxShiftedSketchSize && mash.Sketches[maxShiftedSketchSize] > hash { + mash.Sketches[maxShiftedSketchSize] = hash + sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] }) + continue + } + } } -func (m *Mash) Distance(other *Mash) float64 { +func (mash *Mash) Distance(other *Mash) float64 { var sameHashes int - for i := 0; i < len(m.Sketches); i++ { - for j := 0; j < len(other.Sketches); j++ { - if m.Sketches[i] == other.Sketches[j] { + for hashIndex := 0; hashIndex < len(mash.Sketches); hashIndex++ { + for otherHashIndex := 0; otherHashIndex < len(other.Sketches); otherHashIndex++ { + if mash.Sketches[hashIndex] == other.Sketches[otherHashIndex] { sameHashes++ break } } } - return 1 - (float64(sameHashes) / float64(len(m.Sketches))) + return 1 - (float64(sameHashes) / float64(len(mash.Sketches))) } From 3367e083bc412ba6d62f6cc49573829765b1cfc5 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 14:09:12 -0700 Subject: [PATCH 06/14] added check to only sort sketch on insert if the hash is less than both the largest and second to largest hashes. --- mash/mash.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index 389349b5..3f5ffc52 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -62,7 +62,6 @@ func NewMash(kmerSize int, sketchSize int) *Mash { } func (mash *Mash) Sketch(sequence string) { - // the sketch size is the number of hashes to store. Pre-shifted to avoid off-by-one errors. 
maxShiftedSketchSize := mash.SketchSize - 1 @@ -91,7 +90,9 @@ func (mash *Mash) Sketch(sequence string) { // replace the largest hash with the new hash and sort the sketch if kmerStart > maxShiftedSketchSize && mash.Sketches[maxShiftedSketchSize] > hash { mash.Sketches[maxShiftedSketchSize] = hash - sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] }) + if hash < mash.Sketches[maxShiftedSketchSize-1] { // if the new hash is smaller than the second largest hash in the sketch, sort the sketch + sort.Slice(mash.Sketches, func(i, j int) bool { return mash.Sketches[i] < mash.Sketches[j] }) + } continue } From a47f96ce44ee1883e3e80c4e3225775f28fb65fb Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 14:16:00 -0700 Subject: [PATCH 07/14] added comments and fixed lint issues. --- mash/mash.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index 3f5ffc52..e839cd9d 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -66,8 +66,8 @@ func (mash *Mash) Sketch(sequence string) { maxShiftedSketchSize := mash.SketchSize - 1 // slide a window of size k along the sequence - for kmerStart := 0; kmerStart < len(sequence)-int(mash.KmerSize); kmerStart++ { - kmer := sequence[kmerStart : kmerStart+int(mash.KmerSize)] + for kmerStart := 0; kmerStart < len(sequence)-mash.KmerSize; kmerStart++ { + kmer := sequence[kmerStart : kmerStart+mash.KmerSize] // hash the kmer to a 32 bit number hash := murmur3.Sum32([]byte(kmer)) // keep the minimum hash value of all the kmers in the window up to a given sketch size @@ -75,10 +75,11 @@ func (mash *Mash) Sketch(sequence string) { // if the sketch is not full, store the hash in the sketch if kmerStart < maxShiftedSketchSize { - mash.Sketches[kmerStart] = uint32(hash) + mash.Sketches[kmerStart] = hash continue } + // if the sketch has just been filled add the hash to the sketch and sort the sketch if kmerStart == maxShiftedSketchSize { // sort 
the sketch from smallest to largest mash.Sketches[maxShiftedSketchSize] = hash @@ -87,7 +88,7 @@ func (mash *Mash) Sketch(sequence string) { } // if the sketch is full and the new hash is smaller than the largest hash in the sketch, - // replace the largest hash with the new hash and sort the sketch + // replace the largest hash with the new hash and sort the sketch if the new hash is smaller than the second largest hash in the sketch if kmerStart > maxShiftedSketchSize && mash.Sketches[maxShiftedSketchSize] > hash { mash.Sketches[maxShiftedSketchSize] = hash if hash < mash.Sketches[maxShiftedSketchSize-1] { // if the new hash is smaller than the second largest hash in the sketch, sort the sketch From ba3d892979f4d5d7112009978a46848cfe873dea Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 14:21:39 -0700 Subject: [PATCH 08/14] fixed trailing whitespace. --- mash/mash.go | 1 - 1 file changed, 1 deletion(-) diff --git a/mash/mash.go b/mash/mash.go index e839cd9d..4df38330 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -96,7 +96,6 @@ func (mash *Mash) Sketch(sequence string) { } continue } - } } From bc48aa2837164942e8ff9f2a5c6241cb9fcedd49 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 15:18:31 -0700 Subject: [PATCH 09/14] added a little bit of tests and made seperate distance and similarity methods. --- mash/mash.go | 47 +++++++++++++++++++++++++++++++++++++++-------- mash/mash_test.go | 7 ++++++- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index 4df38330..23d95c49 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -53,6 +53,7 @@ type Mash struct { Sketches []uint32 // The sketches are the hashes of the kmers that we can compare to other sketches. } +// NewMash initializes a new mash sketch. 
func NewMash(kmerSize int, sketchSize int) *Mash { return &Mash{ KmerSize: kmerSize, @@ -61,6 +62,7 @@ func NewMash(kmerSize int, sketchSize int) *Mash { } } +// Sketch generates a mash sketch of the sequence. func (mash *Mash) Sketch(sequence string) { // the sketch size is the number of hashes to store. Pre-shifted to avoid off-by-one errors. maxShiftedSketchSize := mash.SketchSize - 1 @@ -99,15 +101,44 @@ func (mash *Mash) Sketch(sequence string) { } } -func (mash *Mash) Distance(other *Mash) float64 { +// Similarity returns the Jaccard similarity between two sketches (number of matching hashes / sketch size) +func (mash *Mash) Similarity(other *Mash) float64 { var sameHashes int - for hashIndex := 0; hashIndex < len(mash.Sketches); hashIndex++ { - for otherHashIndex := 0; otherHashIndex < len(other.Sketches); otherHashIndex++ { - if mash.Sketches[hashIndex] == other.Sketches[otherHashIndex] { - sameHashes++ - break - } + + var largerSketch *Mash + var smallerSketch *Mash + + if mash.SketchSize > other.SketchSize { + largerSketch = mash + smallerSketch = other + } else { + largerSketch = other + smallerSketch = mash + } + + largerSketchSizeShifted := largerSketch.SketchSize - 1 + smallerSketchSizeShifted := smallerSketch.SketchSize - 1 + + // if the largest hash in the larger sketch is smaller than the smallest hash in the smaller sketch, the distance is 1 + if largerSketch.Sketches[largerSketchSizeShifted] < smallerSketch.Sketches[0] { + return 1 + } + + // if the largest hash in the smaller sketch is smaller than the smallest hash in the larger sketch, the distance is 1 + if smallerSketch.Sketches[smallerSketchSizeShifted] < largerSketch.Sketches[0] { + return 1 + } + + for _, hash := range smallerSketch.Sketches { + ind := sort.Search(largerSketchSizeShifted, func(ind int) bool { return largerSketch.Sketches[ind] <= hash }) + if largerSketch.Sketches[ind] == hash { + sameHashes++ } } - return 1 - (float64(sameHashes) / float64(len(mash.Sketches))) + + return 
float64(sameHashes) / float64(len(mash.Sketches)) +} + +func (mash *Mash) Distance(other *Mash) float64 { + return 1 - mash.Similarity(other) } diff --git a/mash/mash_test.go b/mash/mash_test.go index 95eafc79..87e329fe 100644 --- a/mash/mash_test.go +++ b/mash/mash_test.go @@ -10,11 +10,16 @@ func TestMash(t *testing.T) { fingerprint1 := mash.NewMash(17, 10) fingerprint1.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA") - fingerprint2 := mash.NewMash(17, 10) + fingerprint2 := mash.NewMash(17, 9) fingerprint2.Sketch("ATGCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA") distance := fingerprint1.Distance(fingerprint2) if distance != 0 { t.Errorf("Expected distance to be 0, got %f", distance) } + + distance = fingerprint2.Distance(fingerprint1) + if distance != 0 { + t.Errorf("Expected distance to be 0, got %f", distance) + } } From 528129db04e3a0d31ffea66556926733847ba2c3 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 15:26:07 -0700 Subject: [PATCH 10/14] set denominator for jaccard similarity to smaller of two sketch sizes. --- mash/mash.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mash/mash.go b/mash/mash.go index 23d95c49..398a3593 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -136,7 +136,7 @@ func (mash *Mash) Similarity(other *Mash) float64 { } } - return float64(sameHashes) / float64(len(mash.Sketches)) + return float64(sameHashes) / float64(smallerSketch.SketchSize) } func (mash *Mash) Distance(other *Mash) float64 { From 56abb6620eb713a8b946f274707afb0620477168 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 15:26:32 -0700 Subject: [PATCH 11/14] added comment to distance. 
--- mash/mash.go | 1 + 1 file changed, 1 insertion(+) diff --git a/mash/mash.go b/mash/mash.go index 398a3593..a9aeef9c 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -139,6 +139,7 @@ func (mash *Mash) Similarity(other *Mash) float64 { return float64(sameHashes) / float64(smallerSketch.SketchSize) } +// Distance returns the Jaccard distance between two sketches (1 - similarity) func (mash *Mash) Distance(other *Mash) float64 { return 1 - mash.Similarity(other) } From 40323db64760e4b67e619e9b4ff2d7bb37e3694d Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 15:30:13 -0700 Subject: [PATCH 12/14] spoofed fingerprint test added. Test coverage 100% --- mash/mash_test.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mash/mash_test.go b/mash/mash_test.go index 87e329fe..b450cd65 100644 --- a/mash/mash_test.go +++ b/mash/mash_test.go @@ -22,4 +22,19 @@ func TestMash(t *testing.T) { if distance != 0 { t.Errorf("Expected distance to be 0, got %f", distance) } + + spoofedFingerprint := mash.NewMash(17, 10) + spoofedFingerprint.Sketches[0] = 0 + + distance = fingerprint1.Distance(spoofedFingerprint) + if distance != 1 { + t.Errorf("Expected distance to be 1, got %f", distance) + } + + spoofedFingerprint = mash.NewMash(17, 9) + + distance = fingerprint1.Distance(spoofedFingerprint) + if distance != 1 { + t.Errorf("Expected distance to be 1, got %f", distance) + } } From f5982b6a54c62fc1e1d880a90281b0be3b494822 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 15:34:21 -0700 Subject: [PATCH 13/14] fixed return value bug. 
--- mash/mash.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index a9aeef9c..f718bd9d 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -121,12 +121,12 @@ func (mash *Mash) Similarity(other *Mash) float64 { // if the largest hash in the larger sketch is smaller than the smallest hash in the smaller sketch, the distance is 1 if largerSketch.Sketches[largerSketchSizeShifted] < smallerSketch.Sketches[0] { - return 1 + return 0 } // if the largest hash in the smaller sketch is smaller than the smallest hash in the larger sketch, the distance is 1 if smallerSketch.Sketches[smallerSketchSizeShifted] < largerSketch.Sketches[0] { - return 1 + return 0 } for _, hash := range smallerSketch.Sketches { From 396a4db6b4beaae9486fbc19650299f199144517 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Thu, 5 Oct 2023 15:44:25 -0700 Subject: [PATCH 14/14] updated package level comment. --- mash/mash.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mash/mash.go b/mash/mash.go index f718bd9d..2b7c90d9 100644 --- a/mash/mash.go +++ b/mash/mash.go @@ -30,9 +30,11 @@ hash is kept. The process is repeated until the vector is full. The vector of ha The sketch is then compared to other sketches by counting the number of hashes that are the same between the two sketches. The number of hashes that are the same is divided by the size of the sketch to get a distance between 0 and 1. -Hash vectors can only be compared to other hash vectors that use the same sliding window size K. -Sketch size S limits how many hashes can be stored in the vector and the return vector -will always be of size S containing the lexographically smallest hashes that were generated. +Hash vectors can only be compared to other hash vectors that use the same sliding window size. 
+Sketch size limits how many hashes can be stored in the vector, and the returned vector +will always have a length equal to the sketch size, filled with the smallest hashes that were generated +and sorted from smallest to largest. + The larger the sketch size the more accurate the distance calculation will be but the longer it will take to calculate. TTFN,