Skip to content

Commit

Permalink
update via advice from RapidFuzz#291
Browse files Browse the repository at this point in the history
  • Loading branch information
Zeroto521 committed Dec 7, 2022
1 parent 5007c10 commit 22c7e25
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 48 deletions.
43 changes: 6 additions & 37 deletions dtoolkit/accessor/series/textdistance.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from functools import lru_cache
from functools import wraps
from typing import Callable
from warnings import warn

Expand All @@ -21,6 +20,7 @@ def textdistance(
other: None | str | pd.Series = None,
method: Callable = None,
align: bool = True,
**kwargs,
) -> pd.Series:
"""
Return a ``Series`` containing the text distance to aligned ``other``.
Expand Down Expand Up @@ -60,10 +60,6 @@ def textdistance(
--------
textdistance_matrix
Notes
-----
The distance of any value compared to nan or None is 0.
Examples
--------
>>> import dtoolkit
Expand All @@ -87,11 +83,12 @@ def textdistance(
raise TypeError(f"Expected string dtype, but got {s.dtype!r}.")

if method is None:
method = __import__("rapidfuzz.fuzz").fuzz.ratio
method = lru_cache(check_none(check_nan(method)))
method = __import__("rapidfuzz").fuzz.ratio
method = lru_cache(method)

if isinstance(other, str):
return s.apply(method, args=(other,))
return s.apply(method, args=(other,), **kwargs)

elif isinstance(other, pd.Series):
if not is_string_dtype(other):
raise TypeError(f"Expected Series(string), but got {other.dtype!r}.")
Expand All @@ -104,37 +101,9 @@ def textdistance(
raise ValueError(f"{s.size=} != {other.size=}.")

return pd.Series(
(method(*xy) for xy in zip(s, other)),
(method(*xy, **kwargs) for xy in zip(s, other)),
name=s.name,
index=s.index,
)
elif other is None or (not is_list_like(other) and pd.isna(other)):
# NOTE:
# - pd.isna(Series) returns an array-like of bool, so ``other`` must not
#   be list-like for pd.isna(other) to return a single bool.
# - Comparing to None or nan always returns 0; this follows the
#   behavior of rapidfuzz.fuzz.ratio.
return pd.Series(np.zeros(s.size), name=s.name, index=s.index)

raise TypeError(f"Expected Series(string), but got {type(other).__name__!r}.")


def check_none(func):
    """
    Wrap ``func`` so that a ``None`` operand short-circuits to ``0``.

    Parameters
    ----------
    func : callable
        Function whose first two positional arguments are the values to
        compare.

    Returns
    -------
    callable
        Wrapper returning ``0`` when either of the first two positional
        arguments is ``None``; otherwise the result of
        ``func(*args, **kwargs)``.
    """

    @wraps(func)
    def decorator(*args, **kwargs):
        # NOTE: comparing to None always returns 0,
        # the behavior follows rapidfuzz.fuzz.ratio.
        # Slice rather than index: a call with fewer than two positional
        # arguments is forwarded to ``func`` instead of raising IndexError
        # from inside this wrapper.
        if any(arg is None for arg in args[:2]):
            return 0

        return func(*args, **kwargs)

    return decorator


def check_nan(func):
    """
    Wrap ``func`` so that a missing operand short-circuits to ``0``.

    Parameters
    ----------
    func : callable
        Function whose first two positional arguments are the values to
        compare.

    Returns
    -------
    callable
        Wrapper returning ``0`` when ``pd.isna`` is true for either of the
        first two positional arguments; otherwise the result of
        ``func(*args, **kwargs)``.
    """

    @wraps(func)
    def decorator(*args, **kwargs):
        # NOTE: comparing to nan always returns 0,
        # the behavior follows rapidfuzz.fuzz.ratio.
        # Slice rather than index: a call with fewer than two positional
        # arguments is forwarded to ``func`` instead of raising IndexError
        # from inside this wrapper.
        if any(pd.isna(arg) for arg in args[:2]):
            return 0

        return func(*args, **kwargs)

    return decorator
4 changes: 0 additions & 4 deletions dtoolkit/accessor/series/textdistance_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@ def textdistance_matrix(
--------
textdistance
Notes
-----
Cannot handle nan or None values.
Examples
--------
>>> import dtoolkit
Expand Down
7 changes: 0 additions & 7 deletions test/accessor/series/test_textdistance_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,6 @@
rapidfuzz.string_metric.levenshtein,
pd.DataFrame([[4, 9], [6, 9]]),
),
# other elements contain None or nan
(
pd.Series(["hello", "world", "!"]),
pd.Series(["hi!", None, float("nan")]),
None,
pd.DataFrame([[25, 0, 0], [0, 0, 0], [50, 0, 0]]),
),
],
)
def test_work(s, other, method, expected):
Expand Down

0 comments on commit 22c7e25

Please sign in to comment.