Skip to content

Commit

Permalink
improve formula for scoring possible dupes for active learning
Browse files Browse the repository at this point in the history
should address #1077
  • Loading branch information
fgregg committed Feb 17, 2023
1 parent 89a8973 commit dd67074
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions dedupe/labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,19 +179,18 @@ def _sample_indices(
# if a predicate only covers a few record pairs, the vote it
# puts on those few pairs will be worth more than the vote of
# a predicate that covers almost all the record pairs
- proportion = len(covered) / max_cover
- weight: float = numpy.exp(-10 * proportion)
- if weight and proportion != 1:
+ if len(covered) < max_cover:
+ weight = 1 / len(covered)
for pair in covered:
weights[pair] = weights.get(pair, 0.0) + weight

sample_ids: Iterable[RecordIDPair]
if sample_size < len(weights):
# consider using a reservoir sampling strategy, which would
# be more memory efficient and probably about as fast
- normalized_weights = numpy.fromiter(weights.values(), dtype=float) / sum(
- weights.values()
- )
+ normalized_weights = numpy.fromiter(
+ weights.values(), dtype=float, count=len(weights)
+ ) / sum(weights.values())
rng = numpy.random.default_rng()
sample_indices = rng.choice(
len(weights), size=sample_size, replace=False, p=normalized_weights
Expand Down

0 comments on commit dd67074

Please sign in to comment.