Improve the formula for scoring possible duplicates for active learning

Should address issue #1077.
dedupeio · Feb 17, 2023 · dd67074 · dd67074
1 parent 89a8973
commit dd67074
Showing 1 changed file with 5 additions and 6 deletions.
diff --git a/dedupe/labeler.py b/dedupe/labeler.py
@@ -179,19 +179,18 @@ def _sample_indices(
             # if a predicate only covers a few record pairs, the value of
             # the vote it puts on those few pairs will be worth more than
             # a predicate that covers almost all the record pairs
-            proportion = len(covered) / max_cover
-            weight: float = numpy.exp(-10 * proportion)
-            if weight and proportion != 1:
+            if len(covered) < max_cover:
+                weight = 1 / len(covered)
                 for pair in covered:
                     weights[pair] = weights.get(pair, 0.0) + weight
 
         sample_ids: Iterable[RecordIDPair]
         if sample_size < len(weights):
             # consider using a reservoir sampling strategy, which would
             # be more memory efficient and probably about as fast
-            normalized_weights = numpy.fromiter(weights.values(), dtype=float) / sum(
-                weights.values()
-            )
+            normalized_weights = numpy.fromiter(
+                weights.values(), dtype=float, count=len(weights)
+            ) / sum(weights.values())
             rng = numpy.random.default_rng()
             sample_indices = rng.choice(
                 len(weights), size=sample_size, replace=False, p=normalized_weights