From dd670742f77b3e354d9f15d9b42fe4a7ecb1d8d2 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Fri, 17 Feb 2023 11:29:12 -0500 Subject: [PATCH] improve formula for scoring possible dupes for active learning should address #1077 --- dedupe/labeler.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 3e774bfe..042c180b 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -179,9 +179,8 @@ def _sample_indices( # if a predicate only covers a few record pairs, the value of # the vote it puts on those few pairs will be worth more than # a predicate that covers almost all the record pairs - proportion = len(covered) / max_cover - weight: float = numpy.exp(-10 * proportion) - if weight and proportion != 1: + if len(covered) < max_cover: + weight = 1 / len(covered) for pair in covered: weights[pair] = weights.get(pair, 0.0) + weight @@ -189,9 +188,9 @@ def _sample_indices( if sample_size < len(weights): # consider using a reservoir sampling strategy, which would # be more memory efficient and probably about as fast - normalized_weights = numpy.fromiter(weights.values(), dtype=float) / sum( - weights.values() - ) + normalized_weights = numpy.fromiter( + weights.values(), dtype=float, count=len(weights) + ) / sum(weights.values()) rng = numpy.random.default_rng() sample_indices = rng.choice( len(weights), size=sample_size, replace=False, p=normalized_weights