From dd670742f77b3e354d9f15d9b42fe4a7ecb1d8d2 Mon Sep 17 00:00:00 2001
From: Forest Gregg <fgregg@datamade.us>
Date: Fri, 17 Feb 2023 11:29:12 -0500
Subject: [PATCH] improve formula for scoring possible dupes for active
 learning

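Weight each predicate's vote by 1 / len(covered) instead of
numpy.exp(-10 * len(covered) / max_cover); a predicate only votes
when it covers fewer than max_cover pairs. Also pass count= to
numpy.fromiter when building the normalized weights.

Should address #1077.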
---
 dedupe/labeler.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
index 3e774bfe..042c180b 100644
--- a/dedupe/labeler.py
+++ b/dedupe/labeler.py
@@ -179,9 +179,8 @@ def _sample_indices(
             # if a predicate only covers a few record pairs, the value of
             # the vote it puts on those few pairs will be worth more than
             # a predicate that covers almost all the record pairs
-            proportion = len(covered) / max_cover
-            weight: float = numpy.exp(-10 * proportion)
-            if weight and proportion != 1:
+            if len(covered) < max_cover:
+                weight = 1 / len(covered)
                 for pair in covered:
                     weights[pair] = weights.get(pair, 0.0) + weight
 
@@ -189,9 +188,9 @@ def _sample_indices(
         if sample_size < len(weights):
             # consider using a reservoir sampling strategy, which would
             # be more memory efficient and probably about as fast
-            normalized_weights = numpy.fromiter(weights.values(), dtype=float) / sum(
-                weights.values()
-            )
+            normalized_weights = numpy.fromiter(
+                weights.values(), dtype=float, count=len(weights)
+            ) / sum(weights.values())
             rng = numpy.random.default_rng()
             sample_indices = rng.choice(
                 len(weights), size=sample_size, replace=False, p=normalized_weights