From 297e2e2e2659136396ff47f40faf59388ad323c3 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Mon, 18 Dec 2023 23:30:48 +0000 Subject: [PATCH 1/3] Add Levenshtein distance --- src/trustyai/metrics/distance.py | 98 ++++++++++++++++++++++++++++++++ src/trustyai/metrics/language.py | 15 +---- 2 files changed, 101 insertions(+), 12 deletions(-) create mode 100644 src/trustyai/metrics/distance.py diff --git a/src/trustyai/metrics/distance.py b/src/trustyai/metrics/distance.py new file mode 100644 index 0000000..7bbf312 --- /dev/null +++ b/src/trustyai/metrics/distance.py @@ -0,0 +1,98 @@ +""""Distance metrics""" +from trustyai import _default_initializer # pylint: disable=unused-import +from dataclasses import dataclass + +# pylint: disable = import-error +from typing import List, Optional, Union, Callable + +from org.kie.trustyai.metrics.language.distance import ( + Levenshtein as _Levenshtein, + LevenshteinResult as _LevenshteinResult, + LevenshteinCounters as _LevenshteinCounters +) +from opennlp.tools.tokenize import Tokenizer +import numpy as np +import matplotlib.pyplot as plt + +@dataclass +class LevenshteinCounters: + """LevenshteinCounters Counters""" + + substitutions: int + insertions: int + deletions: int + correct: int + + @staticmethod + def convert(result: _LevenshteinCounters): + return LevenshteinCounters(substitutions=result.getSubstitutions(), + insertions=result.getInsertions(), + deletions=result.getDeletions(), + correct=result.getCorrect()) + +@dataclass +class LevenshteinResult: + """Levenshtein Result""" + + distance: float + counters: LevenshteinCounters + matrix: np.ndarray + reference: List[str] + hypothesis: List[str] + + @staticmethod + def convert(result: _LevenshteinResult): + """Converts a Java LevenshteinResult to a Python LevenshteinResult""" + distance = result.getDistance() + counters = LevenshteinCounters.convert(result.getCounters()) + data = result.getDistanceMatrix().getData() + numpy_array = np.array(data)[1:, 1:] + reference = result.getReferenceTokens() + hypothesis = result.getHypothesisTokens() + + return LevenshteinResult( + distance=distance, + counters=counters, + matrix=numpy_array, + reference=reference, + hypothesis=hypothesis + ) + + def plot(self): + cmap = plt.cm.viridis + + fig, ax = plt.subplots() + cax = ax.imshow(self.matrix, cmap=cmap, interpolation='nearest') + + plt.colorbar(cax) + + ax.set_xticks(np.arange(len(self.reference))) + ax.set_yticks(np.arange(len(self.hypothesis))) + ax.set_xticklabels(self.reference) + ax.set_yticklabels(self.hypothesis) + + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") + + for i in range(len(self.hypothesis)): + for j in range(len(self.reference)): + color = 'white' if self.matrix[i, j] < self.matrix.max() / 2 else 'black' + text = ax.text(j, i, self.matrix[i, j], ha="center", va="center", color=color) + + plt.show() + +def levenshtein( + reference: str, + hypothesis: str, + tokenizer: Optional[Union[Tokenizer, Callable[[str], List[str]]]] = None, +) -> LevenshteinResult: + """Calculate Levenshtein distance between two strings""" + if not tokenizer: + return LevenshteinResult.convert(_Levenshtein.calculateToken(reference, hypothesis)) + elif isinstance(tokenizer, Tokenizer): + return LevenshteinResult.convert(_Levenshtein.calculateToken(reference, hypothesis, tokenizer)) + elif callable(tokenizer): + tokenized_reference = tokenizer(reference) + tokenized_hypothesis = tokenizer(hypothesis) + return 
LevenshteinResult.convert(_Levenshtein.calculateToken(tokenized_reference, tokenized_hypothesis)) + else: + raise ValueError("Unsupported tokenizer") diff --git a/src/trustyai/metrics/language.py b/src/trustyai/metrics/language.py index b80891a..7d74b20 100644 --- a/src/trustyai/metrics/language.py +++ b/src/trustyai/metrics/language.py @@ -1,4 +1,5 @@ -""""Group fairness metrics""" +""""Language metrics""" +from trustyai import _default_initializer # pylint: disable=unused-import from dataclasses import dataclass # pylint: disable = import-error @@ -9,17 +10,7 @@ ErrorRateResult as _ErrorRateResult, ) from opennlp.tools.tokenize import Tokenizer - - -@dataclass -class LevenshteinCounters: - """LevenshteinCounters Counters""" - - substitutions: int - insertions: int - deletions: int - correct: int - +from .distance import LevenshteinCounters @dataclass class ErrorRateResult: From 63c5e6c218f6346d398e151d955d440edc221869 Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Tue, 19 Dec 2023 12:09:05 +0000 Subject: [PATCH 2/3] Fix linting and formatting --- src/trustyai/metrics/distance.py | 74 ++++++++++++++++++++------------ src/trustyai/metrics/language.py | 6 ++- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/src/trustyai/metrics/distance.py b/src/trustyai/metrics/distance.py index 7bbf312..7000276 100644 --- a/src/trustyai/metrics/distance.py +++ b/src/trustyai/metrics/distance.py @@ -1,18 +1,18 @@ """"Distance metrics""" -from trustyai import _default_initializer # pylint: disable=unused-import -from dataclasses import dataclass - # pylint: disable = import-error +from dataclasses import dataclass from typing import List, Optional, Union, Callable from org.kie.trustyai.metrics.language.distance import ( Levenshtein as _Levenshtein, LevenshteinResult as _LevenshteinResult, - LevenshteinCounters as _LevenshteinCounters + LevenshteinCounters as _LevenshteinCounters, ) from opennlp.tools.tokenize import Tokenizer import numpy as np import matplotlib.pyplot as plt +from trustyai import _default_initializer # pylint: disable=unused-import + @dataclass class LevenshteinCounters: @@ -25,10 +25,14 @@ class LevenshteinCounters: @staticmethod def convert(result: _LevenshteinCounters): - return LevenshteinCounters(substitutions=result.getSubstitutions(), - insertions=result.getInsertions(), - deletions=result.getDeletions(), - correct=result.getCorrect()) + """Converts a Java LevenshteinCounters to a Python LevenshteinCounters""" + return LevenshteinCounters( + substitutions=result.getSubstitutions(), + insertions=result.getInsertions(), + deletions=result.getDeletions(), + correct=result.getCorrect(), + ) + @dataclass class LevenshteinResult: @@ -55,44 +59,58 @@ def convert(result: _LevenshteinResult): counters=counters, matrix=numpy_array, reference=reference, - hypothesis=hypothesis + hypothesis=hypothesis, ) def plot(self): + """Plot the Levenshtein distance matrix""" cmap = plt.cm.viridis - fig, ax = plt.subplots() - cax = ax.imshow(self.matrix, cmap=cmap, interpolation='nearest') + _, axes = plt.subplots() + cax = axes.imshow(self.matrix, cmap=cmap, interpolation="nearest") plt.colorbar(cax) - ax.set_xticks(np.arange(len(self.reference))) - ax.set_yticks(np.arange(len(self.hypothesis))) - ax.set_xticklabels(self.reference) - ax.set_yticklabels(self.hypothesis) + axes.set_xticks(np.arange(len(self.reference))) + axes.set_yticks(np.arange(len(self.hypothesis))) + axes.set_xticklabels(self.reference) + axes.set_yticklabels(self.hypothesis) - plt.setp(ax.get_xticklabels(), 
rotation=45, ha="right", rotation_mode="anchor") + plt.setp( + axes.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor" + ) for i in range(len(self.hypothesis)): for j in range(len(self.reference)): - color = 'white' if self.matrix[i, j] < self.matrix.max() / 2 else 'black' - text = ax.text(j, i, self.matrix[i, j], ha="center", va="center", color=color) + color = ( + "white" if self.matrix[i, j] < self.matrix.max() / 2 else "black" + ) + _ = axes.text( + j, i, self.matrix[i, j], ha="center", va="center", color=color + ) plt.show() + def levenshtein( - reference: str, - hypothesis: str, - tokenizer: Optional[Union[Tokenizer, Callable[[str], List[str]]]] = None, + reference: str, + hypothesis: str, + tokenizer: Optional[Union[Tokenizer, Callable[[str], List[str]]]] = None, ) -> LevenshteinResult: """Calculate Levenshtein distance between two strings""" if not tokenizer: - return LevenshteinResult.convert(_Levenshtein.calculateToken(reference, hypothesis)) - elif isinstance(tokenizer, Tokenizer): - return LevenshteinResult.convert(_Levenshtein.calculateToken(reference, hypothesis, tokenizer)) - elif callable(tokenizer): + return LevenshteinResult.convert( + _Levenshtein.calculateToken(reference, hypothesis) + ) + if isinstance(tokenizer, Tokenizer): + return LevenshteinResult.convert( + _Levenshtein.calculateToken(reference, hypothesis, tokenizer) + ) + if callable(tokenizer): tokenized_reference = tokenizer(reference) tokenized_hypothesis = tokenizer(hypothesis) - return LevenshteinResult.convert(_Levenshtein.calculateToken(tokenized_reference, tokenized_hypothesis)) - else: - raise ValueError("Unsupported tokenizer") + return LevenshteinResult.convert( + _Levenshtein.calculateToken(tokenized_reference, tokenized_hypothesis) + ) + + raise ValueError("Unsupported tokenizer") diff --git a/src/trustyai/metrics/language.py b/src/trustyai/metrics/language.py index 7d74b20..c921f7b 100644 --- a/src/trustyai/metrics/language.py +++ b/src/trustyai/metrics/language.py @@ -1,8 +1,7 @@ """"Language metrics""" -from trustyai import _default_initializer # pylint: disable=unused-import +# pylint: disable = import-error from dataclasses import dataclass -# pylint: disable = import-error from typing import List, Optional, Union, Callable from org.kie.trustyai.metrics.language.levenshtein import ( @@ -10,8 +9,11 @@ ErrorRateResult as _ErrorRateResult, ) from opennlp.tools.tokenize import Tokenizer +from trustyai import _default_initializer # pylint: disable=unused-import + from .distance import LevenshteinCounters + @dataclass class ErrorRateResult: """Word Error Rate Result""" From 44b4d7355214bf43168dab7044e0362325daee4d Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Tue, 19 Dec 2023 19:47:18 +0000 Subject: [PATCH 3/3] Fix matrix plot ranges --- src/trustyai/metrics/distance.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/trustyai/metrics/distance.py b/src/trustyai/metrics/distance.py index 7000276..b802973 100644 --- a/src/trustyai/metrics/distance.py +++ b/src/trustyai/metrics/distance.py @@ -80,13 +80,14 @@ def plot(self): axes.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor" ) - for i in range(len(self.hypothesis)): - for j in range(len(self.reference)): + nrows, ncols = self.matrix.shape + for i in range(nrows): + for j in range(ncols): color = ( "white" if self.matrix[i, j] < self.matrix.max() / 2 else "black" ) - _ = axes.text( - j, i, self.matrix[i, j], ha="center", va="center", color=color + axes.text( + j, i, int(self.matrix[i, 
j]), ha="center", va="center", color=color ) plt.show()