diff --git a/Pipfile b/Pipfile index ec2736b..cef4773 100644 --- a/Pipfile +++ b/Pipfile @@ -12,11 +12,12 @@ isort = "*" [packages] numpy = "*" +scikit-learn = "*" [requires] python_version = "3.8" [scripts] isort = "isort . -c" -test = "pytest tests/test_scheme.py --cov=seqeval --cov-report=term-missing -vv" +test = "pytest tests/test_scheme.py tests/test_reporters.py tests/test_v1.py --cov=seqeval --cov-report=term-missing -vv" flake8 = "flake8 seqeval --ignore=F401,E741" diff --git a/Pipfile.lock b/Pipfile.lock index b2cdceb..b2a9257 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b4e22269a630553874e70082ad4ec752d455c4a5da5beed5e7cebcdb06dd35ab" + "sha256": "ed3e34c865fd90593db1c52563d09c3f8328d8821cec176a467fcf2b3a7da7fa" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,14 @@ ] }, "default": { + "joblib": { + "hashes": [ + "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72", + "sha256:9e284edd6be6b71883a63c9b7f124738a3c16195513ad940eae7e3438de885d5" + ], + "markers": "python_version >= '3.6'", + "version": "==0.17.0" + }, "numpy": { "hashes": [ "sha256:04c7d4ebc5ff93d9822075ddb1751ff392a4375e5885299445fcebf877f179d5", @@ -47,6 +55,58 @@ ], "index": "pypi", "version": "==1.19.2" + }, + "scikit-learn": { + "hashes": [ + "sha256:0a127cc70990d4c15b1019680bfedc7fec6c23d14d3719fdf9b64b22d37cdeca", + "sha256:0d39748e7c9669ba648acf40fb3ce96b8a07b240db6888563a7cb76e05e0d9cc", + "sha256:1b8a391de95f6285a2f9adffb7db0892718950954b7149a70c783dc848f104ea", + "sha256:20766f515e6cd6f954554387dfae705d93c7b544ec0e6c6a5d8e006f6f7ef480", + "sha256:2aa95c2f17d2f80534156215c87bee72b6aa314a7f8b8fe92a2d71f47280570d", + "sha256:5ce7a8021c9defc2b75620571b350acc4a7d9763c25b7593621ef50f3bd019a2", + "sha256:6c28a1d00aae7c3c9568f61aafeaad813f0f01c729bee4fd9479e2132b215c1d", + "sha256:7671bbeddd7f4f9a6968f3b5442dac5f22bf1ba06709ef888cc9132ad354a9ab", + "sha256:914ac2b45a058d3f1338d7736200f7f3b094857758895f8667be8a81ff443b5b", + "sha256:98508723f44c61896a4e15894b2016762a55555fbf09365a0bb1870ecbd442de", + "sha256:a64817b050efd50f9abcfd311870073e500ae11b299683a519fbb52d85e08d25", + "sha256:cb3e76380312e1f86abd20340ab1d5b3cc46a26f6593d3c33c9ea3e4c7134028", + "sha256:d0dcaa54263307075cb93d0bee3ceb02821093b1b3d25f66021987d305d01dce", + "sha256:d9a1ce5f099f29c7c33181cc4386660e0ba891b21a60dc036bf369e3a3ee3aec", + "sha256:da8e7c302003dd765d92a5616678e591f347460ac7b53e53d667be7dfe6d1b10", + "sha256:daf276c465c38ef736a79bd79fc80a249f746bcbcae50c40945428f7ece074f8" + ], + "index": "pypi", + "version": "==0.23.2" + }, + "scipy": { + "hashes": [ + "sha256:066c513d90eb3fd7567a9e150828d39111ebd88d3e924cdfc9f8ce19ab6f90c9", + "sha256:07e52b316b40a4f001667d1ad4eb5f2318738de34597bd91537851365b6c61f1", + "sha256:0a0e9a4e58a4734c2eba917f834b25b7e3b6dc333901ce7784fd31aefbd37b2f", + "sha256:1c7564a4810c1cd77fcdee7fa726d7d39d4e2695ad252d7c86c3ea9d85b7fb8f", + "sha256:315aa2165aca31375f4e26c230188db192ed901761390be908c9b21d8b07df62", + "sha256:6e86c873fe1335d88b7a4bfa09d021f27a9e753758fd75f3f92d714aa4093768", + "sha256:8e28e74b97fc8d6aa0454989db3b5d36fc27e69cef39a7ee5eaf8174ca1123cb", + "sha256:92eb04041d371fea828858e4fff182453c25ae3eaa8782d9b6c32b25857d23bc", + "sha256:a0afbb967fd2c98efad5f4c24439a640d39463282040a88e8e928db647d8ac3d", + "sha256:a785409c0fa51764766840185a34f96a0a93527a0ff0230484d33a8ed085c8f8", + "sha256:cca9fce15109a36a0a9f9cfc64f870f1c140cb235ddf27fe0328e6afb44dfed0", + "sha256:d56b10d8ed72ec1be76bf10508446df60954f08a41c2d40778bc29a3a9ad9bce", + 
"sha256:dac09281a0eacd59974e24525a3bc90fa39b4e95177e638a31b14db60d3fa806", + "sha256:ec5fe57e46828d034775b00cd625c4a7b5c7d2e354c3b258d820c6c72212a6ec", + "sha256:eecf40fa87eeda53e8e11d265ff2254729d04000cd40bae648e76ff268885d66", + "sha256:fc98f3eac993b9bfdd392e675dfe19850cc8c7246a8fd2b42443e506344be7d9" + ], + "markers": "python_version >= '3.6'", + "version": "==1.5.2" + }, + "threadpoolctl": { + "hashes": [ + "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725", + "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b" + ], + "markers": "python_version >= '3.5'", + "version": "==2.1.0" } }, "develop": { @@ -122,11 +182,11 @@ }, "isort": { "hashes": [ - "sha256:36f0c6659b9000597e92618d05b72d4181104cf59472b1c6a039e3783f930c95", - "sha256:ba040c24d20aa302f78f4747df549573ae1eaf8e1084269199154da9c483f07f" + "sha256:2f510f34ae18a8d0958c53eec51ef84fd099f07c4c639676525acbcd7b5bd3ff", + "sha256:dd3211f513f4a92ec1ec1876fc1dc3c686649c349d49523f5b5adbb0814e5960" ], "index": "pypi", - "version": "==5.5.4" + "version": "==5.6.1" }, "mccabe": { "hashes": [ diff --git a/README.md b/README.md index f0382cf..24ad774 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # seqeval + seqeval is a Python framework for sequence labeling evaluation. seqeval can evaluate the performance of chunking tasks such as named-entity recognition, part-of-speech tagging, semantic role labeling and so on. @@ -6,6 +7,7 @@ This is well-tested by using the Perl script [conlleval](https://www.clips.uantw which can be used for measuring the performance of a system that has processed the CoNLL-2000 shared task data. ## Support features + seqeval supports following formats: * IOB1 * IOB2 @@ -24,6 +26,7 @@ and supports following metrics: | classification_report(y\_true, y\_pred, digits=2) | Build a text report showing the main classification metrics. `digits` is number of digits for formatting output floating point values. Default value is `2`. | ## Usage + Behold, the power of seqeval: ```python @@ -39,41 +42,36 @@ Behold, the power of seqeval: >>> accuracy_score(y_true, y_pred) 0.80 >>> classification_report(y_true, y_pred) - precision recall f1-score support + precision recall f1-score support - MISC 0.00 0.00 0.00 1 - PER 1.00 1.00 1.00 1 + MISC 0.00 0.00 0.00 1 + PER 1.00 1.00 1.00 1 - micro avg 0.50 0.50 0.50 2 - macro avg 0.50 0.50 0.50 2 + micro avg 0.50 0.50 0.50 2 + macro avg 0.50 0.50 0.50 2 +weighted avg 0.50 0.50 0.50 2 ``` -### Keras Callback - -Seqeval provides a callback for Keras: +If you want to explicitly specify the evaluation scheme, use `mode='strict'`: ```python -from seqeval.callbacks import F1Metrics +>>> from seqeval.scheme import IOB2 +>>> classification_report(y_true, y_pred, mode='strict', scheme=IOB2) + precision recall f1-score support + + MISC 0.00 0.00 0.00 1 + PER 1.00 1.00 1.00 1 -id2label = {0: '', 1: 'B-LOC', 2: 'I-LOC'} -callbacks = [F1Metrics(id2label)] -model.fit(x, y, validation_data=(x_val, y_val), callbacks=callbacks) + micro avg 0.50 0.50 0.50 2 + macro avg 0.50 0.50 0.50 2 +weighted avg 0.50 0.50 0.50 2 ``` +Note: The behavior of the strict mode is different from the default one which is designed to simulate conlleval. 
+ ## Installation To install seqeval, simply run: ``` -$ pip install seqeval[cpu] +$ pip install seqeval ``` - -If you want to install seqeval on GPU environment, please run: - -```bash -$ pip install seqeval[gpu] -``` - -## Requirement - -* numpy >= 1.14.0 -* tensorflow(optional) \ No newline at end of file diff --git a/seqeval/callbacks.py b/seqeval/callbacks.py deleted file mode 100644 index 704f5d9..0000000 --- a/seqeval/callbacks.py +++ /dev/null @@ -1,99 +0,0 @@ -import numpy as np -from keras.callbacks import Callback -from seqeval.metrics import f1_score, classification_report - - -class F1Metrics(Callback): - - def __init__(self, id2label, pad_value=0, validation_data=None, digits=4): - """ - Args: - id2label (dict): id to label mapping. - (e.g. {1: 'B-LOC', 2: 'I-LOC'}) - pad_value (int): padding value. - digits (int or None): number of digits in printed classification report - (use None to print only F1 score without a report). - """ - super(F1Metrics, self).__init__() - self.id2label = id2label - self.pad_value = pad_value - self.validation_data = validation_data - self.digits = digits - self.is_fit = validation_data is None - - def convert_idx_to_name(self, y, array_indexes): - """Convert label index to name. - - Args: - y (np.ndarray): label index 2d array. - array_indexes (list): list of valid index arrays for each row. - - Returns: - y: label name list. - """ - y = [[self.id2label[idx] for idx in row[row_indexes]] for - row, row_indexes in zip(y, array_indexes)] - return y - - def predict(self, X, y): - """Predict sequences. - - Args: - X (np.ndarray): input data. - y (np.ndarray): tags. - - Returns: - y_true: true sequences. - y_pred: predicted sequences. - """ - y_pred = self.model.predict_on_batch(X) - - # reduce dimension. - y_true = np.argmax(y, -1) - y_pred = np.argmax(y_pred, -1) - - non_pad_indexes = [np.nonzero(y_true_row != self.pad_value)[0] for y_true_row in y_true] - - y_true = self.convert_idx_to_name(y_true, non_pad_indexes) - y_pred = self.convert_idx_to_name(y_pred, non_pad_indexes) - - return y_true, y_pred - - def score(self, y_true, y_pred): - """Calculate f1 score. - - Args: - y_true (list): true sequences. - y_pred (list): predicted sequences. - - Returns: - score: f1 score. 
- """ - score = f1_score(y_true, y_pred) - print(' - f1: {:04.2f}'.format(score * 100)) - if self.digits: - print(classification_report(y_true, y_pred, digits=self.digits)) - return score - - def on_epoch_end(self, epoch, logs={}): - if self.is_fit: - self.on_epoch_end_fit(epoch, logs) - else: - self.on_epoch_end_fit_generator(epoch, logs) - - def on_epoch_end_fit(self, epoch, logs={}): - X = self.validation_data[0] - y = self.validation_data[1] - y_true, y_pred = self.predict(X, y) - score = self.score(y_true, y_pred) - logs['f1'] = score - - def on_epoch_end_fit_generator(self, epoch, logs={}): - y_true = [] - y_pred = [] - for X, y in self.validation_data: - y_true_batch, y_pred_batch = self.predict(X, y) - y_true.extend(y_true_batch) - y_pred.extend(y_pred_batch) - score = self.score(y_true, y_pred) - logs['f1'] = score diff --git a/seqeval/metrics/sequence_labeling.py b/seqeval/metrics/sequence_labeling.py index dc6d78e..d954e72 100644 --- a/seqeval/metrics/sequence_labeling.py +++ b/seqeval/metrics/sequence_labeling.py @@ -13,6 +13,7 @@ import numpy as np from seqeval.reporters import DictReporter, StringReporter +from seqeval.metrics.v1 import classification_report as cr def get_entities(seq, suffix=False): @@ -303,15 +304,42 @@ def performance_measure(y_true, y_pred): return performance_dict -def classification_report(y_true, y_pred, digits=2, suffix=False, output_dict=False): +def classification_report(y_true, y_pred, + digits=2, + suffix=False, + output_dict=False, + mode=None, + sample_weight=None, + zero_division='warn', + scheme=None): """Build a text report showing the main classification metrics. Args: y_true : 2d array. Ground truth (correct) target values. + y_pred : 2d array. Estimated targets as returned by a classifier. + digits : int. Number of digits for formatting output floating point values. + output_dict : bool(default=False). If True, return output as dict else str. + mode : str. If mode="strict", use new classification_report. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + scheme : Token, [IOB2, IOE2, IOBES] + + suffix : bool, False by default. + Returns: report : string/dict. Summary of the precision, recall, F1 score for each class. 
@@ -330,6 +358,16 @@ def classification_report(y_true, y_pred, digits=2, suffix=False, output_dict=Fa weighted avg 0.50 0.50 0.50 2 """ + if mode == 'strict': + return cr(y_true, y_pred, + digits=digits, + output_dict=output_dict, + sample_weight=sample_weight, + zero_division=zero_division, + scheme=scheme, + suffix=suffix + ) + true_entities = set(get_entities(y_true, suffix)) pred_entities = set(get_entities(y_pred, suffix)) diff --git a/seqeval/metrics/v1.py b/seqeval/metrics/v1.py new file mode 100644 index 0000000..f06440c --- /dev/null +++ b/seqeval/metrics/v1.py @@ -0,0 +1,379 @@ +import warnings +from typing import List, Union, Tuple, Type, Optional + +import numpy as np +from sklearn.exceptions import UndefinedMetricWarning + +from seqeval.reporters import DictReporter, StringReporter +from seqeval.scheme import Entities, Token, auto_detect + +PER_CLASS_SCORES = Tuple[List[float], List[float], List[float], List[int]] +AVERAGE_SCORES = Tuple[float, float, float, int] +SCORES = Union[PER_CLASS_SCORES, AVERAGE_SCORES] + + +def _prf_divide(numerator, denominator, metric, + modifier, average, warn_for, zero_division='warn'): + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements equal to + 0 or 1 (according to ``zero_division``). Plus, if + ``zero_division != "warn"`` raises a warning. + + The metric, modifier and average arguments are used only for determining + an appropriate warning. + """ + mask = denominator == 0.0 + denominator = denominator.copy() + denominator[mask] = 1 # avoid infs/nans + result = numerator / denominator + + if not np.any(mask): + return result + + # if ``zero_division=1``, set those with denominator == 0 equal to 1 + result[mask] = 0.0 if zero_division in ['warn', 0] else 1.0 + + # the user will be removing warnings if zero_division is set to something + # different than its default value. If we are computing only f-score + # the warning will be raised only if precision and recall are ill-defined + if zero_division != 'warn' or metric not in warn_for: + return result + + # build appropriate warning + # E.g. "Precision and F-score are ill-defined and being set to 0.0 in + # labels with no predicted samples. Use ``zero_division`` parameter to + # control this behavior." + + if metric in warn_for and 'f-score' in warn_for: + msg_start = '{0} and F-score are'.format(metric.title()) + elif metric in warn_for: + msg_start = '{0} is'.format(metric.title()) + elif 'f-score' in warn_for: + msg_start = 'F-score is' + else: + return result + + _warn_prf(average, modifier, msg_start, len(result)) + + return result + + +def _warn_prf(average, modifier, msg_start, result_size): + axis0, axis1 = 'sample', 'label' + if average == 'samples': + axis0, axis1 = axis1, axis0 + msg = ('{0} ill-defined and being set to 0.0 {{0}} ' + 'no {1} {2}s. 
Use `zero_division` parameter to control' + ' this behavior.'.format(msg_start, modifier, axis0)) + if result_size == 1: + msg = msg.format('due to') + else: + msg = msg.format('in {0}s with'.format(axis1)) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + + +def unique_labels(y_true: List[List[str]], y_pred: List[List[str]], + scheme: Type[Token], suffix: bool = False) -> List[str]: + sequences_true = Entities(y_true, scheme, suffix) + sequences_pred = Entities(y_pred, scheme, suffix) + unique_tags = sequences_true.unique_tags | sequences_pred.unique_tags + return sorted(unique_tags) + + +def check_consistent_length(y_true: List[List[str]], y_pred: List[List[str]]): + """Check that all arrays have consistent first and second dimensions. + + Checks whether all objects in arrays have the same shape or length. + + Args: + y_true : 2d array. + y_pred : 2d array. + """ + len_true = list(map(len, y_true)) + len_pred = list(map(len, y_pred)) + is_list = set(map(type, y_true + y_pred)) + if len(y_true) != len(y_pred) or len_true != len_pred or not is_list == {list}: + raise ValueError("Found input variables with inconsistent numbers of samples:\n{}\n{}".format(len_true, len_pred)) + + +def precision_recall_fscore_support(y_true: List[List[str]], + y_pred: List[List[str]], + *, + average: Optional[str] = None, + warn_for=('precision', 'recall', 'f-score'), + beta: float = 1.0, + sample_weight=None, + zero_division: str = 'warn', + scheme: Type[Token] = None, + suffix: bool = False) -> SCORES: + """Compute precision, recall, F-measure and support for each class. + + Args: + y_true : 2d array. Ground truth (correct) target values. + + y_pred : 2d array. Estimated targets as returned by a tagger. + + beta : float, 1.0 by default + The strength of recall versus precision in the F-score. + + average : string, [None (default), 'micro', 'macro', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + scheme : Token, [IOB2, IOE2, IOBES] + + suffix : bool, False by default. 
+ + Returns: + precision : float (if average is not None) or array of float, shape = [n_unique_labels] + + recall : float (if average is not None) or array of float, , shape = [n_unique_labels] + + fbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels] + + support : int (if average is not None) or array of int, shape = [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + Examples: + >>> from seqeval.metrics.v1 import precision_recall_fscore_support + >>> from seqeval.scheme import IOB2 + >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> precision_recall_fscore_support(y_true, y_pred, average='macro', scheme=IOB2) + (0.5, 0.5, 0.5, 2) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro', scheme=IOB2) + (0.5, 0.5, 0.5, 2) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted', scheme=IOB2) + (0.5, 0.5, 0.5, 2) + + It is possible to compute per-label precisions, recalls, F1-scores and + supports instead of averaging: + + >>> precision_recall_fscore_support(y_true, y_pred, average=None, scheme=IOB2) + (array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1, 1])) + + Notes: + When ``true positive + false positive == 0``, precision is undefined; + When ``true positive + false negative == 0``, recall is undefined. + In such cases, by default the metric will be set to 0, as will f-score, + and ``UndefinedMetricWarning`` will be raised. This behavior can be + modified with ``zero_division``. + """ + if beta < 0: + raise ValueError('beta should be >=0 in the F-beta score') + + average_options = (None, 'micro', 'macro', 'weighted') + if average not in average_options: + raise ValueError('average has to be one of {}'.format(average_options)) + + check_consistent_length(y_true, y_pred) + + target_names = unique_labels(y_true, y_pred, scheme, suffix) + entities_true = Entities(y_true, scheme, suffix) + entities_pred = Entities(y_pred, scheme, suffix) + + tp_sum = np.array([], dtype=np.int32) + pred_sum = np.array([], dtype=np.int32) + true_sum = np.array([], dtype=np.int32) + for type_name in target_names: + entities_true_type = entities_true.filter(type_name) + entities_pred_type = entities_pred.filter(type_name) + tp_sum = np.append(tp_sum, len(entities_true_type & entities_pred_type)) + pred_sum = np.append(pred_sum, len(entities_pred_type)) + true_sum = np.append(true_sum, len(entities_true_type)) + + if average == 'micro': + tp_sum = np.array([tp_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + + # Finally, we have all our sufficient statistics. Divide! 
# + beta2 = beta ** 2 + + # Divide, and on zero-division, set scores and/or warn according to + # zero_division: + precision = _prf_divide( + numerator=tp_sum, + denominator=pred_sum, + metric='precision', + modifier='predicted', + average=average, + warn_for=warn_for, + zero_division=zero_division + ) + recall = _prf_divide( + numerator=tp_sum, + denominator=true_sum, + metric='recall', + modifier='true', + average=average, + warn_for=warn_for, + zero_division=zero_division + ) + + # warn for f-score only if zero_division is warn, it is in warn_for + # and BOTH prec and rec are ill-defined + if zero_division == 'warn' and ('f-score',) == warn_for: + if (pred_sum[true_sum == 0] == 0).any(): + _warn_prf( + average, 'true nor predicted', 'F-score is', len(true_sum) + ) + + # if tp == 0 F will be 1 only if all predictions are zero, all labels are + # zero, and zero_division=1. In all other case, 0 + if np.isposinf(beta): + f_score = recall + else: + denom = beta2 * precision + recall + + denom[denom == 0.] = 1 # avoid division by 0 + f_score = (1 + beta2) * precision * recall / denom + + # Average the results + if average == 'weighted': + weights = true_sum + if weights.sum() == 0: + zero_division_value = 0.0 if zero_division in ['warn', 0] else 1.0 + # precision is zero_division if there are no positive predictions + # recall is zero_division if there are no positive labels + # fscore is zero_division if all labels AND predictions are + # negative + return (zero_division_value if pred_sum.sum() == 0 else 0.0, + zero_division_value, + zero_division_value if pred_sum.sum() == 0 else 0.0, + sum(true_sum)) + + elif average == 'samples': + weights = sample_weight + else: + weights = None + + if average is not None: + precision = np.average(precision, weights=weights) + recall = np.average(recall, weights=weights) + f_score = np.average(f_score, weights=weights) + true_sum = sum(true_sum) + + return precision, recall, f_score, true_sum + + +def classification_report(y_true: List[List[str]], + y_pred: List[List[str]], + *, + sample_weight=None, + digits: int = 2, + output_dict: bool = False, + zero_division: str = 'warn', + suffix: bool = False, + scheme: Type[Token] = None) -> Union[str, dict]: + """Build a text report showing the main tagging metrics. + + Args: + y_true : 2d array. Ground truth (correct) target values. + + y_pred : 2d array. Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + digits : int. Number of digits for formatting output floating point values. + + output_dict : bool(default=False). If True, return output as dict else str. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + scheme : Token, [IOB2, IOE2, IOBES] + + suffix : bool, False by default. + + Returns: + report : string/dict. Summary of the precision, recall, F1 score for each class. 
+ + Examples: + >>> from seqeval.metrics.v1 import classification_report + >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> print(classification_report(y_true, y_pred)) + precision recall f1-score support + + MISC 0.00 0.00 0.00 1 + PER 1.00 1.00 1.00 1 + + micro avg 0.50 0.50 0.50 2 + macro avg 0.50 0.50 0.50 2 + weighted avg 0.50 0.50 0.50 2 + + """ + if scheme is None or not issubclass(scheme, Token): + scheme = auto_detect(y_true, suffix) + target_names = unique_labels(y_true, y_pred, scheme, suffix) + + if output_dict: + reporter = DictReporter() + else: + name_width = max(map(len, target_names)) + avg_width = len('weighted avg') + width = max(name_width, avg_width, digits) + reporter = StringReporter(width=width, digits=digits) + + # compute per-class scores. + p, r, f1, s = precision_recall_fscore_support( + y_true, y_pred, + average=None, + sample_weight=sample_weight, + zero_division=zero_division, + scheme=scheme, + suffix=suffix + ) + for row in zip(target_names, p, r, f1, s): + reporter.write(*row) + reporter.write_blank() + + # compute average scores. + average_options = ('micro', 'macro', 'weighted') + for average in average_options: + avg_p, avg_r, avg_f1, support = precision_recall_fscore_support( + y_true, y_pred, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + scheme=scheme, + suffix=suffix + ) + reporter.write('{} avg'.format(average), avg_p, avg_r, avg_f1, support) + reporter.write_blank() + + return reporter.report() diff --git a/seqeval/scheme.py b/seqeval/scheme.py index 571395e..0357f87 100644 --- a/seqeval/scheme.py +++ b/seqeval/scheme.py @@ -1,25 +1,28 @@ import enum +from itertools import chain from typing import List, Set, Tuple, Type class Entity: - def __init__(self, start: int, end: int, tag: str): + def __init__(self, sent_id: int, start: int, end: int, tag: str): + self.sent_id = sent_id self.start = start self.end = end self.tag = tag def __repr__(self): - return '({}, {}, {})'.format(self.tag, self.start, self.end) + return '({}, {}, {}, {})'.format(self.sent_id, self.tag, self.start, self.end) def __eq__(self, other: 'Entity'): - return self.start == other.start and self.end == other.end and self.tag == other.tag + return self.sent_id == other.sent_id and self.start == other.start and\ + self.end == other.end and self.tag == other.tag def __hash__(self): return hash(self.to_tuple()) def to_tuple(self): - return self.tag, self.start, self.end + return self.sent_id, self.tag, self.start, self.end class Prefix(enum.Flag): @@ -205,10 +208,12 @@ class IOBES(Token): class Tokens: - def __init__(self, tokens: List[str], scheme: Type[Token], suffix: bool = False, delimiter: str = '-'): + def __init__(self, tokens: List[str], scheme: Type[Token], + suffix: bool = False, delimiter: str = '-', sent_id: int = None): self.tokens = [scheme(token, suffix=suffix, delimiter=delimiter) for token in tokens] self.scheme = scheme self.outside_token = scheme('O', suffix=suffix, delimiter=delimiter) + self.sent_id = sent_id @property def entities(self): @@ -231,8 +236,8 @@ def entities(self): if token.is_start(prev): end = self._forward(start=i + 1, prev=token) if self._is_end(end): - entity = Entity(start=i, end=end, tag=token.tag) - entities.append(entity.to_tuple()) + entity = Entity(sent_id=self.sent_id, start=i, end=end, tag=token.tag) + entities.append(entity) i = end else: i += 1 @@ -259,6 
+264,26 @@ def extended_tokens(self): return tokens +class Entities: + + def __init__(self, sequences: List[List[str]], scheme: Type[Token], suffix: bool = False, delimiter: str = '-'): + self.entities = [ + Tokens(seq, scheme=scheme, suffix=suffix, delimiter=delimiter, sent_id=sent_id).entities + for sent_id, seq in enumerate(sequences) + ] + + def filter(self, tag_name: str): + entities = {entity for entity in chain(*self.entities) if entity.tag == tag_name} + return entities + + @property + def unique_tags(self): + tags = { + entity.tag for entity in chain(*self.entities) + } + return tags + + def auto_detect(sequences: List[List[str]], suffix: bool = False, delimiter: str = '-'): """Detects scheme automatically. @@ -277,14 +302,34 @@ def auto_detect(sequences: List[List[str]], suffix: bool = False, delimiter: str except KeyError: raise ValueError(error_message.format(token)) - iob_prefixes = {Prefix.I, Prefix.O, Prefix.B} - ioe_prefixes = {Prefix.I, Prefix.O, Prefix.E} - iobes_prefixes = {Prefix.I, Prefix.O, Prefix.B, Prefix.E, Prefix.S} - if prefixes == iob_prefixes or prefixes == iob_prefixes - {Prefix.O}: + allowed_iob2_prefixes = [ + {Prefix.I, Prefix.O, Prefix.B}, + {Prefix.I, Prefix.B}, + {Prefix.B, Prefix.O}, + {Prefix.B} + ] + allowed_ioe2_prefixes = [ + {Prefix.I, Prefix.O, Prefix.E}, + {Prefix.I, Prefix.E}, + {Prefix.E, Prefix.O}, + {Prefix.E} + ] + allowed_iobes_prefixes = [ + {Prefix.I, Prefix.O, Prefix.B, Prefix.E, Prefix.S}, + {Prefix.I, Prefix.B, Prefix.E, Prefix.S}, + {Prefix.I, Prefix.O, Prefix.B, Prefix.E}, + {Prefix.O, Prefix.B, Prefix.E, Prefix.S}, + {Prefix.I, Prefix.B, Prefix.E}, + {Prefix.B, Prefix.E, Prefix.S}, + {Prefix.O, Prefix.B, Prefix.E}, + {Prefix.B, Prefix.E}, + {Prefix.S} + ] + if prefixes in allowed_iob2_prefixes: return IOB2 - elif prefixes == ioe_prefixes or prefixes == ioe_prefixes - {Prefix.O}: + elif prefixes in allowed_ioe2_prefixes: return IOE2 - elif prefixes == iobes_prefixes or prefixes == iobes_prefixes - {Prefix.O}: + elif prefixes in allowed_iobes_prefixes: return IOBES else: raise ValueError(error_message.format(prefixes)) diff --git a/setup.py b/setup.py index 80c0373..d5785c0 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ os.system('python setup.py sdist bdist_wheel upload') sys.exit() -required = ['numpy>=1.14.0', 'Keras>=2.2.4'] +required = ['numpy==1.19.2', 'scikit-learn==0.23.2'] setup( name=NAME, @@ -38,8 +38,8 @@ packages=find_packages(exclude=('tests',)), install_requires=required, extras_require={ - 'cpu': ['tensorflow>=1.13.1'], - 'gpu': ['tensorflow-gpu'], + 'cpu': [], + 'gpu': [], }, include_package_data=True, license=LICENSE, @@ -56,4 +56,4 @@ 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy' ], -) \ No newline at end of file +) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 83bd5a8..27055ac 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -7,15 +7,6 @@ import subprocess import unittest -import numpy -from keras import Sequential -from keras.backend import constant -from keras.layers import Lambda -from keras.preprocessing.sequence import pad_sequences -from keras.preprocessing.text import Tokenizer -from keras.utils import to_categorical - -from seqeval.callbacks import F1Metrics from seqeval.metrics import (f1_score, accuracy_score, classification_report, precision_score, recall_score, performance_measure) @@ -120,43 +111,6 @@ def test_statistical_tests(self): os.remove(filepath) - def 
test_keras_callback(self): - expected_score = f1_score(self.y_true, self.y_pred) - tokenizer = Tokenizer(lower=False) - tokenizer.fit_on_texts(self.y_true) - maxlen = max((len(row) for row in self.y_true)) - - def prepare(y, padding): - indexes = tokenizer.texts_to_sequences(y) - padded = pad_sequences(indexes, maxlen=maxlen, padding=padding, truncating=padding) - categorical = to_categorical(padded) - return categorical - - for padding in ('pre', 'post'): - callback = F1Metrics(id2label=tokenizer.index_word) - y_true_cat = prepare(self.y_true, padding) - y_pred_cat = prepare(self.y_pred, padding) - - input_shape = (1,) - layer = Lambda(lambda _: constant(y_pred_cat), input_shape=input_shape) - fake_model = Sequential(layers=[layer]) - callback.set_model(fake_model) - - X = numpy.zeros((y_true_cat.shape[0], 1)) - - # Verify that the callback translates sequences correctly by itself - y_true_cb, y_pred_cb = callback.predict(X, y_true_cat) - self.assertEqual(y_pred_cb, self.y_pred) - self.assertEqual(y_true_cb, self.y_true) - - # Verify that the callback stores the correct number in logs - fake_model.compile(optimizer='adam', loss='categorical_crossentropy') - history = fake_model.fit(x=X, batch_size=y_true_cat.shape[0], y=y_true_cat, - validation_data=(X, y_true_cat), - callbacks=[callback]) - actual_score = history.history['f1'][0] - self.assertAlmostEqual(actual_score, expected_score) - def load_labels(self, filename): y_true, y_pred = [], [] with open(filename) as f: diff --git a/tests/test_reporters.py b/tests/test_reporters.py index 4a85be5..d20835a 100644 --- a/tests/test_reporters.py +++ b/tests/test_reporters.py @@ -1,26 +1,61 @@ import pytest -from seqeval.reporters import DictReporter - - -@pytest.mark.parametrize( - 'rows, expected', - [ - ([], {}), - ( - [['PERSON', 0.82, 0.79, 0.81, 24]], - { - 'PERSON': { - 'precision': 0.82, - 'recall': 0.79, - 'f1-score': 0.81, - 'support': 24 +from seqeval.reporters import DictReporter, StringReporter + + +class TestDictReporter: + + def test_write_empty(self): + reporter = DictReporter() + reporter.write_blank() + assert reporter.report_dict == {} + + @pytest.mark.parametrize( + 'rows, expected', + [ + ([], {}), + ( + [['PERSON', 0.82, 0.79, 0.81, 24]], + { + 'PERSON': { + 'precision': 0.82, + 'recall': 0.79, + 'f1-score': 0.81, + 'support': 24 + } } - } - ) - ] -) -def test_dict_reporter_output(rows, expected): - reporter = DictReporter() - for row in rows: - reporter.write(*row) - assert reporter.report() == expected + ) + ] + ) + def test_dict_reporter_output(self, rows, expected): + reporter = DictReporter() + for row in rows: + reporter.write(*row) + assert reporter.report() == expected + + +class TestStringReporter: + + def test_write_empty(self): + reporter = StringReporter() + reporter.write_blank() + assert reporter.buffer == [''] + + def test_write_header(self): + reporter = StringReporter() + report = reporter.write_header() + assert 'precision' in report + assert 'recall' in report + assert 'f1-score' in report + assert 'support' in report + + def test_write(self): + reporter = StringReporter() + reporter.write('XXX', 0, 0, 0, 0) + assert 'XXX' in reporter.buffer[0] + + def test_report(self): + reporter = StringReporter() + reporter.write('XXX', 0, 0, 0, 0) + report = reporter.report() + assert 'XXX' in report + assert 'precision' in report diff --git a/tests/test_scheme.py b/tests/test_scheme.py index 179ec0b..ff9ba50 100644 --- a/tests/test_scheme.py +++ b/tests/test_scheme.py @@ -1,6 +1,43 @@ import pytest -from 
seqeval.scheme import IOB1, IOB2, IOBES, IOE1, IOE2, Prefix, Tokens, Token, auto_detect +from seqeval.scheme import IOB1, IOB2, IOBES, IOE1, IOE2, Prefix, Tokens, Token, auto_detect, Entity, Entities + + +def test_entity_repr(): + data = (0, 0, 0, 0) + entity = Entity(*data) + assert str(data) == str(entity) + + +@pytest.mark.parametrize( + 'data1, data2, expected', + [ + ((0, 0, 0, 0), (0, 0, 0, 0), True), + ((1, 0, 0, 0), (0, 0, 0, 0), False), + ((0, 1, 0, 0), (0, 0, 0, 0), False), + ((0, 0, 1, 0), (0, 0, 0, 0), False), + ((0, 0, 0, 1), (0, 0, 0, 0), False) + ] +) +def test_entity_equality(data1, data2, expected): + entity1 = Entity(*data1) + entity2 = Entity(*data2) + is_equal = entity1 == entity2 + assert is_equal == expected + + +@pytest.mark.parametrize( + 'sequences, tag_name, expected', + [ + ([['B-PER', 'B-ORG']], '', set()), + ([['B-PER', 'B-ORG']], 'ORG', {Entity(0, 1, 2, 'ORG')}), + ([['B-PER', 'B-ORG']], 'PER', {Entity(0, 0, 1, 'PER')}) + ] +) +def test_entities_filter(sequences, tag_name, expected): + entities = Entities(sequences, IOB2) + filtered = entities.filter(tag_name) + assert filtered == expected @pytest.mark.parametrize( @@ -196,7 +233,7 @@ def test_iobes_start_inside_end(prev, token, expected): ) def test_iob1_tokens(tokens, expected): tokens = Tokens(tokens, IOB1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -220,7 +257,7 @@ def test_iob1_tokens(tokens, expected): ) def test_iob1_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOB1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -247,7 +284,7 @@ def test_iob1_tokens_without_tag(tokens, expected): ) def test_iob2_tokens(tokens, expected): tokens = Tokens(tokens, IOB2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -271,7 +308,7 @@ def test_iob2_tokens(tokens, expected): ) def test_iob2_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOB2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -298,7 +335,7 @@ def test_iob2_tokens_without_tag(tokens, expected): ) def test_ioe1_tokens(tokens, expected): tokens = Tokens(tokens, IOE1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -322,7 +359,7 @@ def test_ioe1_tokens(tokens, expected): ) def test_ioe1_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOE1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -349,7 +386,7 @@ def test_ioe1_tokens_without_tag(tokens, expected): ) def test_ioe2_tokens(tokens, expected): tokens = Tokens(tokens, IOE2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -373,7 +410,7 @@ def test_ioe2_tokens(tokens, expected): ) def test_ioe2_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOE2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -418,7 +455,7 @@ def test_ioe2_tokens_without_tag(tokens, expected): ) def test_iobes_tokens(tokens, expected): tokens = Tokens(tokens, IOBES) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] 
assert entities == expected @@ -459,7 +496,7 @@ def test_iobes_tokens(tokens, expected): ) def test_iobes_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOBES) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -530,10 +567,21 @@ class TestAutoDetect: [ ([['B', 'I', 'O']], IOB2), ([['B', 'I']], IOB2), + ([['B', 'O']], IOB2), + ([['B']], IOB2), ([['I', 'O', 'E']], IOE2), ([['I', 'E']], IOE2), + ([['E', 'O']], IOE2), + ([['E']], IOE2), ([['I', 'O', 'B', 'E', 'S']], IOBES), - ([['I', 'B', 'E', 'S']], IOBES) + ([['I', 'B', 'E', 'S']], IOBES), + ([['I', 'O', 'B', 'E']], IOBES), + ([['O', 'B', 'E', 'S']], IOBES), + ([['I', 'B', 'E']], IOBES), + ([['B', 'E', 'S']], IOBES), + ([['O', 'B', 'E']], IOBES), + ([['B', 'E']], IOBES), + ([['S']], IOBES) ] ) def test_valid_scheme(self, sequences, expected): diff --git a/tests/test_v1.py b/tests/test_v1.py new file mode 100644 index 0000000..d03b295 --- /dev/null +++ b/tests/test_v1.py @@ -0,0 +1,161 @@ +import numpy as np +import pytest +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal + +from seqeval.metrics.v1 import unique_labels, precision_recall_fscore_support, classification_report +from seqeval.scheme import IOB2 + + +@pytest.mark.parametrize( + 'y_true, y_pred, expected', + [ + ([[]], [[]], []), + ([['B-PER']], [[]], ['PER']), + ([[]], [['B-PER']], ['PER']), + ([['B-PER']], [['B-PER']], ['PER']), + ([['B-PER', 'O']], [[]], ['PER']), + ([['B-PER', 'I-PER']], [[]], ['PER']), + ([['B-PER']], [['B-ORG']], ['ORG', 'PER']) + ] +) +def test_unique_labels(y_true, y_pred, expected): + labels = unique_labels(y_true, y_pred, IOB2) + assert labels == expected + + +class TestPrecisionRecallFscoreSupport: + + def test_bad_beta(self): + y_true, y_pred = [[]], [[]] + with pytest.raises(ValueError): + precision_recall_fscore_support(y_true, y_pred, beta=-0.1, scheme=IOB2) + + def test_bad_average_option(self): + y_true, y_pred = [[]], [[]] + with pytest.raises(ValueError): + precision_recall_fscore_support(y_true, y_pred, average='mega', scheme=IOB2) + + @pytest.mark.parametrize( + 'average', [None, 'macro', 'weighted'] + ) + def test_warning(self, average): + y_true = [['B-PER']] + y_pred = [['B-Test']] + with pytest.warns(UndefinedMetricWarning): + precision_recall_fscore_support(y_true, y_pred, average=average, beta=1.0, scheme=IOB2) + + def test_fscore_warning(self): + with pytest.warns(UndefinedMetricWarning): + precision_recall_fscore_support([[]], [[]], average='micro', scheme=IOB2, warn_for=('f-score', )) + + def test_length(self): + y_true = [['B-PER']] + y_pred = [['B-PER', 'O']] + with pytest.raises(ValueError): + precision_recall_fscore_support(y_true, y_pred, scheme=IOB2) + + def test_weighted_true_sum_zero(self): + res = precision_recall_fscore_support([['O']], [['O']], average='weighted', scheme=IOB2) + assert res == (0.0, 0.0, 0.0, 0) + + def test_scores(self): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None, scheme=IOB2) + assert_array_almost_equal(p, [1.00, 1.00, 0.00], 2) + assert_array_almost_equal(r, [0.50, 1.00, 0.00], 2) + assert_array_almost_equal(f, [0.67, 1.00, 0.00], 2) + assert_array_equal(s, [2, 1, 0]) + + @pytest.mark.parametrize( + 'average, expected', + [ + ('micro', [0.67, 0.67, 0.67, 3]), + ('macro', [0.67, 0.50, 0.56, 3]), + ('weighted', [1.00, 
0.67, 0.78, 3]) + ] + ) + def test_average_scores(self, average, expected): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + scores = precision_recall_fscore_support(y_true, y_pred, average=average, scheme=IOB2) + assert_array_almost_equal(scores, expected, 2) + + @pytest.mark.parametrize( + 'average, expected', + [ + ('micro', [0.67, 0.67, 0.67, 3]), + ('macro', [0.67, 0.50, 0.50, 3]), + ('weighted', [1.00, 0.67, 0.67, 3]) + ] + ) + def test_average_scores_beta_inf(self, average, expected): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + scores = precision_recall_fscore_support(y_true, y_pred, average=average, scheme=IOB2, beta=np.inf) + assert_array_almost_equal(scores, expected, 2) + + +class TestClassificationReport: + + def test_output_dict(self): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + report = classification_report(y_true, y_pred, output_dict=True) + expected_report = { + 'A': { + 'f1-score': 0.6666666666666666, + 'precision': 1.0, + 'recall': 0.5, + 'support': 2 + }, + 'B': { + 'f1-score': 1.0, + 'precision': 1.0, + 'recall': 1.0, + 'support': 1 + }, + 'C': { + 'f1-score': 0.0, + 'precision': 0.0, + 'recall': 0.0, + 'support': 0 + }, + 'macro avg': { + 'f1-score': 0.5555555555555555, + 'precision': 0.6666666666666666, + 'recall': 0.5, + 'support': 3 + }, + 'micro avg': { + 'f1-score': 0.6666666666666666, + 'precision': 0.6666666666666666, + 'recall': 0.6666666666666666, + 'support': 3 + }, + 'weighted avg': { + 'f1-score': 0.7777777777777777, + 'precision': 1.0, + 'recall': 0.6666666666666666, + 'support': 3 + } + } + assert report == expected_report + + def test_output_string(self): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + report = classification_report(y_true, y_pred) + expected_report = """\ + precision recall f1-score support + + A 1.00 0.50 0.67 2 + B 1.00 1.00 1.00 1 + C 0.00 0.00 0.00 0 + + micro avg 0.67 0.67 0.67 3 + macro avg 0.67 0.50 0.56 3 +weighted avg 1.00 0.67 0.78 3 +""" + assert report == expected_report
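The tests above lean on two lower-level pieces added in this patch: the `Entities` container in `seqeval.scheme` and `precision_recall_fscore_support` in `seqeval.metrics.v1`. A minimal usage sketch of how they fit together (assuming this branch of seqeval is installed; the tag names and values are illustrative, not taken from the test suite):

```python
from seqeval.metrics.v1 import precision_recall_fscore_support
from seqeval.scheme import Entities, IOB2

y_true = [['B-PER', 'I-PER', 'O', 'B-ORG']]
y_pred = [['B-PER', 'I-PER', 'B-ORG', 'B-ORG']]

# Entities extracts spans as (sent_id, tag, start, end) under the given scheme.
true_entities = Entities(y_true, IOB2)
print(true_entities.unique_tags)    # {'ORG', 'PER'} (set order may vary)
print(true_entities.filter('PER'))  # {(0, PER, 0, 2)}

# Per-class scores are ordered by sorted tag name: ['ORG', 'PER'].
p, r, f1, s = precision_recall_fscore_support(
    y_true, y_pred, average=None, scheme=IOB2)
print(p)  # [0.5 1. ]  -> two ORG spans predicted, only one correct
print(r)  # [1. 1.]
print(s)  # [1 1]
```

The per-class ordering follows `unique_labels`, which is also why the rows of the strict-mode `classification_report` come out sorted by tag name.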