-
Notifications
You must be signed in to change notification settings - Fork 112
/
minhash.go
104 lines (91 loc) · 2.46 KB
/
minhash.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package boom
import (
"math"
"math/rand"
)
// MinHash is a variation of the technique for estimating similarity between
// two sets as presented by Broder in On the resemblance and containment of
// documents:
//
// http://gatekeeper.dec.com/ftp/pub/dec/SRC/publications/broder/positano-final-wpnums.pdf
//
// This can be used to cluster or compare documents by splitting the corpus
// into a bag of words. MinHash returns the approximated similarity ratio of
// the two bags. The similarity is less accurate for very small bags of words.
func MinHash(bag1, bag2 []string) float32 {
k := len(bag1) + len(bag2)
hashes := make([]int, k)
for i := 0; i < k; i++ {
a := uint(rand.Int())
b := uint(rand.Int())
c := uint(rand.Int())
x := computeHash(a*b*c, a, b, c)
hashes[i] = int(x)
}
bitMap := bitMap(bag1, bag2)
minHashValues := hashBuckets(2, k)
minHash(bag1, 0, minHashValues, bitMap, k, hashes)
minHash(bag2, 1, minHashValues, bitMap, k, hashes)
return similarity(minHashValues, k)
}
func minHash(bag []string, bagIndex int, minHashValues [][]int,
bitArray map[string][]bool, k int, hashes []int) {
index := 0
for element := range bitArray {
for i := 0; i < k; i++ {
if contains(bag, element) {
hindex := hashes[index]
if hindex < minHashValues[bagIndex][index] {
minHashValues[bagIndex][index] = hindex
}
}
}
index++
}
}
func contains(bag []string, element string) bool {
for _, e := range bag {
if e == element {
return true
}
}
return false
}
func bitMap(bag1, bag2 []string) map[string][]bool {
bitArray := map[string][]bool{}
for _, element := range bag1 {
bitArray[element] = []bool{true, false}
}
for _, element := range bag2 {
if _, ok := bitArray[element]; ok {
bitArray[element] = []bool{true, true}
} else if _, ok := bitArray[element]; !ok {
bitArray[element] = []bool{false, true}
}
}
return bitArray
}
func hashBuckets(numSets, k int) [][]int {
minHashValues := make([][]int, numSets)
for i := 0; i < numSets; i++ {
minHashValues[i] = make([]int, k)
}
for i := 0; i < numSets; i++ {
for j := 0; j < k; j++ {
minHashValues[i][j] = math.MaxInt32
}
}
return minHashValues
}
func computeHash(x, a, b, u uint) uint {
return (a*x + b) >> (32 - u)
}
func similarity(minHashValues [][]int, k int) float32 {
identicalMinHashes := 0
for i := 0; i < k; i++ {
if minHashValues[0][i] == minHashValues[1][i] {
identicalMinHashes++
}
}
return (1.0 * float32(identicalMinHashes)) / float32(k)
}