diff --git a/Pipfile b/Pipfile index ec2736b..cef4773 100644 --- a/Pipfile +++ b/Pipfile @@ -12,11 +12,12 @@ isort = "*" [packages] numpy = "*" +scikit-learn = "*" [requires] python_version = "3.8" [scripts] isort = "isort . -c" -test = "pytest tests/test_scheme.py --cov=seqeval --cov-report=term-missing -vv" +test = "pytest tests/test_scheme.py tests/test_reporters.py tests/test_v1.py --cov=seqeval --cov-report=term-missing -vv" flake8 = "flake8 seqeval --ignore=F401,E741" diff --git a/Pipfile.lock b/Pipfile.lock index b2cdceb..b2a9257 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b4e22269a630553874e70082ad4ec752d455c4a5da5beed5e7cebcdb06dd35ab" + "sha256": "ed3e34c865fd90593db1c52563d09c3f8328d8821cec176a467fcf2b3a7da7fa" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,14 @@ ] }, "default": { + "joblib": { + "hashes": [ + "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72", + "sha256:9e284edd6be6b71883a63c9b7f124738a3c16195513ad940eae7e3438de885d5" + ], + "markers": "python_version >= '3.6'", + "version": "==0.17.0" + }, "numpy": { "hashes": [ "sha256:04c7d4ebc5ff93d9822075ddb1751ff392a4375e5885299445fcebf877f179d5", @@ -47,6 +55,58 @@ ], "index": "pypi", "version": "==1.19.2" + }, + "scikit-learn": { + "hashes": [ + "sha256:0a127cc70990d4c15b1019680bfedc7fec6c23d14d3719fdf9b64b22d37cdeca", + "sha256:0d39748e7c9669ba648acf40fb3ce96b8a07b240db6888563a7cb76e05e0d9cc", + "sha256:1b8a391de95f6285a2f9adffb7db0892718950954b7149a70c783dc848f104ea", + "sha256:20766f515e6cd6f954554387dfae705d93c7b544ec0e6c6a5d8e006f6f7ef480", + "sha256:2aa95c2f17d2f80534156215c87bee72b6aa314a7f8b8fe92a2d71f47280570d", + "sha256:5ce7a8021c9defc2b75620571b350acc4a7d9763c25b7593621ef50f3bd019a2", + "sha256:6c28a1d00aae7c3c9568f61aafeaad813f0f01c729bee4fd9479e2132b215c1d", + "sha256:7671bbeddd7f4f9a6968f3b5442dac5f22bf1ba06709ef888cc9132ad354a9ab", + "sha256:914ac2b45a058d3f1338d7736200f7f3b094857758895f8667be8a81ff443b5b", + "sha256:98508723f44c61896a4e15894b2016762a55555fbf09365a0bb1870ecbd442de", + "sha256:a64817b050efd50f9abcfd311870073e500ae11b299683a519fbb52d85e08d25", + "sha256:cb3e76380312e1f86abd20340ab1d5b3cc46a26f6593d3c33c9ea3e4c7134028", + "sha256:d0dcaa54263307075cb93d0bee3ceb02821093b1b3d25f66021987d305d01dce", + "sha256:d9a1ce5f099f29c7c33181cc4386660e0ba891b21a60dc036bf369e3a3ee3aec", + "sha256:da8e7c302003dd765d92a5616678e591f347460ac7b53e53d667be7dfe6d1b10", + "sha256:daf276c465c38ef736a79bd79fc80a249f746bcbcae50c40945428f7ece074f8" + ], + "index": "pypi", + "version": "==0.23.2" + }, + "scipy": { + "hashes": [ + "sha256:066c513d90eb3fd7567a9e150828d39111ebd88d3e924cdfc9f8ce19ab6f90c9", + "sha256:07e52b316b40a4f001667d1ad4eb5f2318738de34597bd91537851365b6c61f1", + "sha256:0a0e9a4e58a4734c2eba917f834b25b7e3b6dc333901ce7784fd31aefbd37b2f", + "sha256:1c7564a4810c1cd77fcdee7fa726d7d39d4e2695ad252d7c86c3ea9d85b7fb8f", + "sha256:315aa2165aca31375f4e26c230188db192ed901761390be908c9b21d8b07df62", + "sha256:6e86c873fe1335d88b7a4bfa09d021f27a9e753758fd75f3f92d714aa4093768", + "sha256:8e28e74b97fc8d6aa0454989db3b5d36fc27e69cef39a7ee5eaf8174ca1123cb", + "sha256:92eb04041d371fea828858e4fff182453c25ae3eaa8782d9b6c32b25857d23bc", + "sha256:a0afbb967fd2c98efad5f4c24439a640d39463282040a88e8e928db647d8ac3d", + "sha256:a785409c0fa51764766840185a34f96a0a93527a0ff0230484d33a8ed085c8f8", + "sha256:cca9fce15109a36a0a9f9cfc64f870f1c140cb235ddf27fe0328e6afb44dfed0", + "sha256:d56b10d8ed72ec1be76bf10508446df60954f08a41c2d40778bc29a3a9ad9bce", + 
"sha256:dac09281a0eacd59974e24525a3bc90fa39b4e95177e638a31b14db60d3fa806", + "sha256:ec5fe57e46828d034775b00cd625c4a7b5c7d2e354c3b258d820c6c72212a6ec", + "sha256:eecf40fa87eeda53e8e11d265ff2254729d04000cd40bae648e76ff268885d66", + "sha256:fc98f3eac993b9bfdd392e675dfe19850cc8c7246a8fd2b42443e506344be7d9" + ], + "markers": "python_version >= '3.6'", + "version": "==1.5.2" + }, + "threadpoolctl": { + "hashes": [ + "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725", + "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b" + ], + "markers": "python_version >= '3.5'", + "version": "==2.1.0" } }, "develop": { @@ -122,11 +182,11 @@ }, "isort": { "hashes": [ - "sha256:36f0c6659b9000597e92618d05b72d4181104cf59472b1c6a039e3783f930c95", - "sha256:ba040c24d20aa302f78f4747df549573ae1eaf8e1084269199154da9c483f07f" + "sha256:2f510f34ae18a8d0958c53eec51ef84fd099f07c4c639676525acbcd7b5bd3ff", + "sha256:dd3211f513f4a92ec1ec1876fc1dc3c686649c349d49523f5b5adbb0814e5960" ], "index": "pypi", - "version": "==5.5.4" + "version": "==5.6.1" }, "mccabe": { "hashes": [ diff --git a/README.md b/README.md index f0382cf..24ad774 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # seqeval + seqeval is a Python framework for sequence labeling evaluation. seqeval can evaluate the performance of chunking tasks such as named-entity recognition, part-of-speech tagging, semantic role labeling and so on. @@ -6,6 +7,7 @@ This is well-tested by using the Perl script [conlleval](https://www.clips.uantw which can be used for measuring the performance of a system that has processed the CoNLL-2000 shared task data. ## Support features + seqeval supports following formats: * IOB1 * IOB2 @@ -24,6 +26,7 @@ and supports following metrics: | classification_report(y\_true, y\_pred, digits=2) | Build a text report showing the main classification metrics. `digits` is number of digits for formatting output floating point values. Default value is `2`. | ## Usage + Behold, the power of seqeval: ```python @@ -39,41 +42,36 @@ Behold, the power of seqeval: >>> accuracy_score(y_true, y_pred) 0.80 >>> classification_report(y_true, y_pred) - precision recall f1-score support + precision recall f1-score support - MISC 0.00 0.00 0.00 1 - PER 1.00 1.00 1.00 1 + MISC 0.00 0.00 0.00 1 + PER 1.00 1.00 1.00 1 - micro avg 0.50 0.50 0.50 2 - macro avg 0.50 0.50 0.50 2 + micro avg 0.50 0.50 0.50 2 + macro avg 0.50 0.50 0.50 2 +weighted avg 0.50 0.50 0.50 2 ``` -### Keras Callback - -Seqeval provides a callback for Keras: +If you want to explicitly specify the evaluation scheme, use `mode='strict'`: ```python -from seqeval.callbacks import F1Metrics +>>> from seqeval.scheme import IOB2 +>>> classification_report(y_true, y_pred, mode='strict', scheme=IOB2) + precision recall f1-score support + + MISC 0.00 0.00 0.00 1 + PER 1.00 1.00 1.00 1 -id2label = {0: '', 1: 'B-LOC', 2: 'I-LOC'} -callbacks = [F1Metrics(id2label)] -model.fit(x, y, validation_data=(x_val, y_val), callbacks=callbacks) + micro avg 0.50 0.50 0.50 2 + macro avg 0.50 0.50 0.50 2 +weighted avg 0.50 0.50 0.50 2 ``` +Note: The behavior of the strict mode is different from the default one which is designed to simulate conlleval. 
+ ## Installation To install seqeval, simply run: ``` -$ pip install seqeval[cpu] +$ pip install seqeval ``` - -If you want to install seqeval on GPU environment, please run: - -```bash -$ pip install seqeval[gpu] -``` - -## Requirement - -* numpy >= 1.14.0 -* tensorflow(optional) \ No newline at end of file diff --git a/seqeval/callbacks.py b/seqeval/callbacks.py deleted file mode 100644 index 704f5d9..0000000 --- a/seqeval/callbacks.py +++ /dev/null @@ -1,99 +0,0 @@ -import numpy as np -from keras.callbacks import Callback -from seqeval.metrics import f1_score, classification_report - - -class F1Metrics(Callback): - - def __init__(self, id2label, pad_value=0, validation_data=None, digits=4): - """ - Args: - id2label (dict): id to label mapping. - (e.g. {1: 'B-LOC', 2: 'I-LOC'}) - pad_value (int): padding value. - digits (int or None): number of digits in printed classification report - (use None to print only F1 score without a report). - """ - super(F1Metrics, self).__init__() - self.id2label = id2label - self.pad_value = pad_value - self.validation_data = validation_data - self.digits = digits - self.is_fit = validation_data is None - - def convert_idx_to_name(self, y, array_indexes): - """Convert label index to name. - - Args: - y (np.ndarray): label index 2d array. - array_indexes (list): list of valid index arrays for each row. - - Returns: - y: label name list. - """ - y = [[self.id2label[idx] for idx in row[row_indexes]] for - row, row_indexes in zip(y, array_indexes)] - return y - - def predict(self, X, y): - """Predict sequences. - - Args: - X (np.ndarray): input data. - y (np.ndarray): tags. - - Returns: - y_true: true sequences. - y_pred: predicted sequences. - """ - y_pred = self.model.predict_on_batch(X) - - # reduce dimension. - y_true = np.argmax(y, -1) - y_pred = np.argmax(y_pred, -1) - - non_pad_indexes = [np.nonzero(y_true_row != self.pad_value)[0] for y_true_row in y_true] - - y_true = self.convert_idx_to_name(y_true, non_pad_indexes) - y_pred = self.convert_idx_to_name(y_pred, non_pad_indexes) - - return y_true, y_pred - - def score(self, y_true, y_pred): - """Calculate f1 score. - - Args: - y_true (list): true sequences. - y_pred (list): predicted sequences. - - Returns: - score: f1 score. 
- """ - score = f1_score(y_true, y_pred) - print(' - f1: {:04.2f}'.format(score * 100)) - if self.digits: - print(classification_report(y_true, y_pred, digits=self.digits)) - return score - - def on_epoch_end(self, epoch, logs={}): - if self.is_fit: - self.on_epoch_end_fit(epoch, logs) - else: - self.on_epoch_end_fit_generator(epoch, logs) - - def on_epoch_end_fit(self, epoch, logs={}): - X = self.validation_data[0] - y = self.validation_data[1] - y_true, y_pred = self.predict(X, y) - score = self.score(y_true, y_pred) - logs['f1'] = score - - def on_epoch_end_fit_generator(self, epoch, logs={}): - y_true = [] - y_pred = [] - for X, y in self.validation_data: - y_true_batch, y_pred_batch = self.predict(X, y) - y_true.extend(y_true_batch) - y_pred.extend(y_pred_batch) - score = self.score(y_true, y_pred) - logs['f1'] = score diff --git a/seqeval/metrics/sequence_labeling.py b/seqeval/metrics/sequence_labeling.py index dc6d78e..d954e72 100644 --- a/seqeval/metrics/sequence_labeling.py +++ b/seqeval/metrics/sequence_labeling.py @@ -13,6 +13,7 @@ import numpy as np from seqeval.reporters import DictReporter, StringReporter +from seqeval.metrics.v1 import classification_report as cr def get_entities(seq, suffix=False): @@ -303,15 +304,42 @@ def performance_measure(y_true, y_pred): return performance_dict -def classification_report(y_true, y_pred, digits=2, suffix=False, output_dict=False): +def classification_report(y_true, y_pred, + digits=2, + suffix=False, + output_dict=False, + mode=None, + sample_weight=None, + zero_division='warn', + scheme=None): """Build a text report showing the main classification metrics. Args: y_true : 2d array. Ground truth (correct) target values. + y_pred : 2d array. Estimated targets as returned by a classifier. + digits : int. Number of digits for formatting output floating point values. + output_dict : bool(default=False). If True, return output as dict else str. + mode : str. If mode="strict", use new classification_report. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + scheme : Token, [IOB2, IOE2, IOBES] + + suffix : bool, False by default. + Returns: report : string/dict. Summary of the precision, recall, F1 score for each class. 
@@ -330,6 +358,16 @@ def classification_report(y_true, y_pred, digits=2, suffix=False, output_dict=Fa weighted avg 0.50 0.50 0.50 2 """ + if mode == 'strict': + return cr(y_true, y_pred, + digits=digits, + output_dict=output_dict, + sample_weight=sample_weight, + zero_division=zero_division, + scheme=scheme, + suffix=suffix + ) + true_entities = set(get_entities(y_true, suffix)) pred_entities = set(get_entities(y_pred, suffix)) diff --git a/seqeval/metrics/v1.py b/seqeval/metrics/v1.py new file mode 100644 index 0000000..f06440c --- /dev/null +++ b/seqeval/metrics/v1.py @@ -0,0 +1,379 @@ +import warnings +from typing import List, Union, Tuple, Type, Optional + +import numpy as np +from sklearn.exceptions import UndefinedMetricWarning + +from seqeval.reporters import DictReporter, StringReporter +from seqeval.scheme import Entities, Token, auto_detect + +PER_CLASS_SCORES = Tuple[List[float], List[float], List[float], List[int]] +AVERAGE_SCORES = Tuple[float, float, float, int] +SCORES = Union[PER_CLASS_SCORES, AVERAGE_SCORES] + + +def _prf_divide(numerator, denominator, metric, + modifier, average, warn_for, zero_division='warn'): + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements equal to + 0 or 1 (according to ``zero_division``). Plus, if + ``zero_division != "warn"`` raises a warning. + + The metric, modifier and average arguments are used only for determining + an appropriate warning. + """ + mask = denominator == 0.0 + denominator = denominator.copy() + denominator[mask] = 1 # avoid infs/nans + result = numerator / denominator + + if not np.any(mask): + return result + + # if ``zero_division=1``, set those with denominator == 0 equal to 1 + result[mask] = 0.0 if zero_division in ['warn', 0] else 1.0 + + # the user will be removing warnings if zero_division is set to something + # different than its default value. If we are computing only f-score + # the warning will be raised only if precision and recall are ill-defined + if zero_division != 'warn' or metric not in warn_for: + return result + + # build appropriate warning + # E.g. "Precision and F-score are ill-defined and being set to 0.0 in + # labels with no predicted samples. Use ``zero_division`` parameter to + # control this behavior." + + if metric in warn_for and 'f-score' in warn_for: + msg_start = '{0} and F-score are'.format(metric.title()) + elif metric in warn_for: + msg_start = '{0} is'.format(metric.title()) + elif 'f-score' in warn_for: + msg_start = 'F-score is' + else: + return result + + _warn_prf(average, modifier, msg_start, len(result)) + + return result + + +def _warn_prf(average, modifier, msg_start, result_size): + axis0, axis1 = 'sample', 'label' + if average == 'samples': + axis0, axis1 = axis1, axis0 + msg = ('{0} ill-defined and being set to 0.0 {{0}} ' + 'no {1} {2}s. 
Use `zero_division` parameter to control' + ' this behavior.'.format(msg_start, modifier, axis0)) + if result_size == 1: + msg = msg.format('due to') + else: + msg = msg.format('in {0}s with'.format(axis1)) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + + +def unique_labels(y_true: List[List[str]], y_pred: List[List[str]], + scheme: Type[Token], suffix: bool = False) -> List[str]: + sequences_true = Entities(y_true, scheme, suffix) + sequences_pred = Entities(y_pred, scheme, suffix) + unique_tags = sequences_true.unique_tags | sequences_pred.unique_tags + return sorted(unique_tags) + + +def check_consistent_length(y_true: List[List[str]], y_pred: List[List[str]]): + """Check that all arrays have consistent first and second dimensions. + + Checks whether all objects in arrays have the same shape or length. + + Args: + y_true : 2d array. + y_pred : 2d array. + """ + len_true = list(map(len, y_true)) + len_pred = list(map(len, y_pred)) + is_list = set(map(type, y_true + y_pred)) + if len(y_true) != len(y_pred) or len_true != len_pred or not is_list == {list}: + raise ValueError("Found input variables with inconsistent numbers of samples:\n{}\n{}".format(len_true, len_pred)) + + +def precision_recall_fscore_support(y_true: List[List[str]], + y_pred: List[List[str]], + *, + average: Optional[str] = None, + warn_for=('precision', 'recall', 'f-score'), + beta: float = 1.0, + sample_weight=None, + zero_division: str = 'warn', + scheme: Type[Token] = None, + suffix: bool = False) -> SCORES: + """Compute precision, recall, F-measure and support for each class. + + Args: + y_true : 2d array. Ground truth (correct) target values. + + y_pred : 2d array. Estimated targets as returned by a tagger. + + beta : float, 1.0 by default + The strength of recall versus precision in the F-score. + + average : string, [None (default), 'micro', 'macro', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + + warn_for : tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + scheme : Token, [IOB2, IOE2, IOBES] + + suffix : bool, False by default. 
+ + Returns: + precision : float (if average is not None) or array of float, shape = [n_unique_labels] + + recall : float (if average is not None) or array of float, , shape = [n_unique_labels] + + fbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels] + + support : int (if average is not None) or array of int, shape = [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + Examples: + >>> from seqeval.metrics.v1 import precision_recall_fscore_support + >>> from seqeval.scheme import IOB2 + >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> precision_recall_fscore_support(y_true, y_pred, average='macro', scheme=IOB2) + (0.5, 0.5, 0.5, 2) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro', scheme=IOB2) + (0.5, 0.5, 0.5, 2) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted', scheme=IOB2) + (0.5, 0.5, 0.5, 2) + + It is possible to compute per-label precisions, recalls, F1-scores and + supports instead of averaging: + + >>> precision_recall_fscore_support(y_true, y_pred, average=None, scheme=IOB2) + (array([0., 1.]), array([0., 1.]), array([0., 1.]), array([1, 1])) + + Notes: + When ``true positive + false positive == 0``, precision is undefined; + When ``true positive + false negative == 0``, recall is undefined. + In such cases, by default the metric will be set to 0, as will f-score, + and ``UndefinedMetricWarning`` will be raised. This behavior can be + modified with ``zero_division``. + """ + if beta < 0: + raise ValueError('beta should be >=0 in the F-beta score') + + average_options = (None, 'micro', 'macro', 'weighted') + if average not in average_options: + raise ValueError('average has to be one of {}'.format(average_options)) + + check_consistent_length(y_true, y_pred) + + target_names = unique_labels(y_true, y_pred, scheme, suffix) + entities_true = Entities(y_true, scheme, suffix) + entities_pred = Entities(y_pred, scheme, suffix) + + tp_sum = np.array([], dtype=np.int32) + pred_sum = np.array([], dtype=np.int32) + true_sum = np.array([], dtype=np.int32) + for type_name in target_names: + entities_true_type = entities_true.filter(type_name) + entities_pred_type = entities_pred.filter(type_name) + tp_sum = np.append(tp_sum, len(entities_true_type & entities_pred_type)) + pred_sum = np.append(pred_sum, len(entities_pred_type)) + true_sum = np.append(true_sum, len(entities_true_type)) + + if average == 'micro': + tp_sum = np.array([tp_sum.sum()]) + pred_sum = np.array([pred_sum.sum()]) + true_sum = np.array([true_sum.sum()]) + + # Finally, we have all our sufficient statistics. Divide! 
# + beta2 = beta ** 2 + + # Divide, and on zero-division, set scores and/or warn according to + # zero_division: + precision = _prf_divide( + numerator=tp_sum, + denominator=pred_sum, + metric='precision', + modifier='predicted', + average=average, + warn_for=warn_for, + zero_division=zero_division + ) + recall = _prf_divide( + numerator=tp_sum, + denominator=true_sum, + metric='recall', + modifier='true', + average=average, + warn_for=warn_for, + zero_division=zero_division + ) + + # warn for f-score only if zero_division is warn, it is in warn_for + # and BOTH prec and rec are ill-defined + if zero_division == 'warn' and ('f-score',) == warn_for: + if (pred_sum[true_sum == 0] == 0).any(): + _warn_prf( + average, 'true nor predicted', 'F-score is', len(true_sum) + ) + + # if tp == 0 F will be 1 only if all predictions are zero, all labels are + # zero, and zero_division=1. In all other case, 0 + if np.isposinf(beta): + f_score = recall + else: + denom = beta2 * precision + recall + + denom[denom == 0.] = 1 # avoid division by 0 + f_score = (1 + beta2) * precision * recall / denom + + # Average the results + if average == 'weighted': + weights = true_sum + if weights.sum() == 0: + zero_division_value = 0.0 if zero_division in ['warn', 0] else 1.0 + # precision is zero_division if there are no positive predictions + # recall is zero_division if there are no positive labels + # fscore is zero_division if all labels AND predictions are + # negative + return (zero_division_value if pred_sum.sum() == 0 else 0.0, + zero_division_value, + zero_division_value if pred_sum.sum() == 0 else 0.0, + sum(true_sum)) + + elif average == 'samples': + weights = sample_weight + else: + weights = None + + if average is not None: + precision = np.average(precision, weights=weights) + recall = np.average(recall, weights=weights) + f_score = np.average(f_score, weights=weights) + true_sum = sum(true_sum) + + return precision, recall, f_score, true_sum + + +def classification_report(y_true: List[List[str]], + y_pred: List[List[str]], + *, + sample_weight=None, + digits: int = 2, + output_dict: bool = False, + zero_division: str = 'warn', + suffix: bool = False, + scheme: Type[Token] = None) -> Union[str, dict]: + """Build a text report showing the main tagging metrics. + + Args: + y_true : 2d array. Ground truth (correct) target values. + + y_pred : 2d array. Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + digits : int. Number of digits for formatting output floating point values. + + output_dict : bool(default=False). If True, return output as dict else str. + + zero_division : "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + If set to "warn", this acts as 0, but warnings are also raised. + + scheme : Token, [IOB2, IOE2, IOBES] + + suffix : bool, False by default. + + Returns: + report : string/dict. Summary of the precision, recall, F1 score for each class. 
+ + Examples: + >>> from seqeval.metrics.v1 import classification_report + >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> print(classification_report(y_true, y_pred)) + precision recall f1-score support + + MISC 0.00 0.00 0.00 1 + PER 1.00 1.00 1.00 1 + + micro avg 0.50 0.50 0.50 2 + macro avg 0.50 0.50 0.50 2 + weighted avg 0.50 0.50 0.50 2 + + """ + if scheme is None or not issubclass(scheme, Token): + scheme = auto_detect(y_true, suffix) + target_names = unique_labels(y_true, y_pred, scheme, suffix) + + if output_dict: + reporter = DictReporter() + else: + name_width = max(map(len, target_names)) + avg_width = len('weighted avg') + width = max(name_width, avg_width, digits) + reporter = StringReporter(width=width, digits=digits) + + # compute per-class scores. + p, r, f1, s = precision_recall_fscore_support( + y_true, y_pred, + average=None, + sample_weight=sample_weight, + zero_division=zero_division, + scheme=scheme, + suffix=suffix + ) + for row in zip(target_names, p, r, f1, s): + reporter.write(*row) + reporter.write_blank() + + # compute average scores. + average_options = ('micro', 'macro', 'weighted') + for average in average_options: + avg_p, avg_r, avg_f1, support = precision_recall_fscore_support( + y_true, y_pred, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + scheme=scheme, + suffix=suffix + ) + reporter.write('{} avg'.format(average), avg_p, avg_r, avg_f1, support) + reporter.write_blank() + + return reporter.report() diff --git a/seqeval/scheme.py b/seqeval/scheme.py index 571395e..0357f87 100644 --- a/seqeval/scheme.py +++ b/seqeval/scheme.py @@ -1,25 +1,28 @@ import enum +from itertools import chain from typing import List, Set, Tuple, Type class Entity: - def __init__(self, start: int, end: int, tag: str): + def __init__(self, sent_id: int, start: int, end: int, tag: str): + self.sent_id = sent_id self.start = start self.end = end self.tag = tag def __repr__(self): - return '({}, {}, {})'.format(self.tag, self.start, self.end) + return '({}, {}, {}, {})'.format(self.sent_id, self.tag, self.start, self.end) def __eq__(self, other: 'Entity'): - return self.start == other.start and self.end == other.end and self.tag == other.tag + return self.sent_id == other.sent_id and self.start == other.start and\ + self.end == other.end and self.tag == other.tag def __hash__(self): return hash(self.to_tuple()) def to_tuple(self): - return self.tag, self.start, self.end + return self.sent_id, self.tag, self.start, self.end class Prefix(enum.Flag): @@ -205,10 +208,12 @@ class IOBES(Token): class Tokens: - def __init__(self, tokens: List[str], scheme: Type[Token], suffix: bool = False, delimiter: str = '-'): + def __init__(self, tokens: List[str], scheme: Type[Token], + suffix: bool = False, delimiter: str = '-', sent_id: int = None): self.tokens = [scheme(token, suffix=suffix, delimiter=delimiter) for token in tokens] self.scheme = scheme self.outside_token = scheme('O', suffix=suffix, delimiter=delimiter) + self.sent_id = sent_id @property def entities(self): @@ -231,8 +236,8 @@ def entities(self): if token.is_start(prev): end = self._forward(start=i + 1, prev=token) if self._is_end(end): - entity = Entity(start=i, end=end, tag=token.tag) - entities.append(entity.to_tuple()) + entity = Entity(sent_id=self.sent_id, start=i, end=end, tag=token.tag) + entities.append(entity) i = end else: i += 1 @@ -259,6 
+264,26 @@ def extended_tokens(self): return tokens +class Entities: + + def __init__(self, sequences: List[List[str]], scheme: Type[Token], suffix: bool = False, delimiter: str = '-'): + self.entities = [ + Tokens(seq, scheme=scheme, suffix=suffix, delimiter=delimiter, sent_id=sent_id).entities + for sent_id, seq in enumerate(sequences) + ] + + def filter(self, tag_name: str): + entities = {entity for entity in chain(*self.entities) if entity.tag == tag_name} + return entities + + @property + def unique_tags(self): + tags = { + entity.tag for entity in chain(*self.entities) + } + return tags + + def auto_detect(sequences: List[List[str]], suffix: bool = False, delimiter: str = '-'): """Detects scheme automatically. @@ -277,14 +302,34 @@ def auto_detect(sequences: List[List[str]], suffix: bool = False, delimiter: str except KeyError: raise ValueError(error_message.format(token)) - iob_prefixes = {Prefix.I, Prefix.O, Prefix.B} - ioe_prefixes = {Prefix.I, Prefix.O, Prefix.E} - iobes_prefixes = {Prefix.I, Prefix.O, Prefix.B, Prefix.E, Prefix.S} - if prefixes == iob_prefixes or prefixes == iob_prefixes - {Prefix.O}: + allowed_iob2_prefixes = [ + {Prefix.I, Prefix.O, Prefix.B}, + {Prefix.I, Prefix.B}, + {Prefix.B, Prefix.O}, + {Prefix.B} + ] + allowed_ioe2_prefixes = [ + {Prefix.I, Prefix.O, Prefix.E}, + {Prefix.I, Prefix.E}, + {Prefix.E, Prefix.O}, + {Prefix.E} + ] + allowed_iobes_prefixes = [ + {Prefix.I, Prefix.O, Prefix.B, Prefix.E, Prefix.S}, + {Prefix.I, Prefix.B, Prefix.E, Prefix.S}, + {Prefix.I, Prefix.O, Prefix.B, Prefix.E}, + {Prefix.O, Prefix.B, Prefix.E, Prefix.S}, + {Prefix.I, Prefix.B, Prefix.E}, + {Prefix.B, Prefix.E, Prefix.S}, + {Prefix.O, Prefix.B, Prefix.E}, + {Prefix.B, Prefix.E}, + {Prefix.S} + ] + if prefixes in allowed_iob2_prefixes: return IOB2 - elif prefixes == ioe_prefixes or prefixes == ioe_prefixes - {Prefix.O}: + elif prefixes in allowed_ioe2_prefixes: return IOE2 - elif prefixes == iobes_prefixes or prefixes == iobes_prefixes - {Prefix.O}: + elif prefixes in allowed_iobes_prefixes: return IOBES else: raise ValueError(error_message.format(prefixes)) diff --git a/setup.py b/setup.py index 80c0373..d5785c0 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ os.system('python setup.py sdist bdist_wheel upload') sys.exit() -required = ['numpy>=1.14.0', 'Keras>=2.2.4'] +required = ['numpy==1.19.2', 'scikit-learn==0.23.2'] setup( name=NAME, @@ -38,8 +38,8 @@ packages=find_packages(exclude=('tests',)), install_requires=required, extras_require={ - 'cpu': ['tensorflow>=1.13.1'], - 'gpu': ['tensorflow-gpu'], + 'cpu': [], + 'gpu': [], }, include_package_data=True, license=LICENSE, @@ -56,4 +56,4 @@ 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy' ], -) \ No newline at end of file +) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 83bd5a8..27055ac 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -7,15 +7,6 @@ import subprocess import unittest -import numpy -from keras import Sequential -from keras.backend import constant -from keras.layers import Lambda -from keras.preprocessing.sequence import pad_sequences -from keras.preprocessing.text import Tokenizer -from keras.utils import to_categorical - -from seqeval.callbacks import F1Metrics from seqeval.metrics import (f1_score, accuracy_score, classification_report, precision_score, recall_score, performance_measure) @@ -120,43 +111,6 @@ def test_statistical_tests(self): os.remove(filepath) - def 
test_keras_callback(self): - expected_score = f1_score(self.y_true, self.y_pred) - tokenizer = Tokenizer(lower=False) - tokenizer.fit_on_texts(self.y_true) - maxlen = max((len(row) for row in self.y_true)) - - def prepare(y, padding): - indexes = tokenizer.texts_to_sequences(y) - padded = pad_sequences(indexes, maxlen=maxlen, padding=padding, truncating=padding) - categorical = to_categorical(padded) - return categorical - - for padding in ('pre', 'post'): - callback = F1Metrics(id2label=tokenizer.index_word) - y_true_cat = prepare(self.y_true, padding) - y_pred_cat = prepare(self.y_pred, padding) - - input_shape = (1,) - layer = Lambda(lambda _: constant(y_pred_cat), input_shape=input_shape) - fake_model = Sequential(layers=[layer]) - callback.set_model(fake_model) - - X = numpy.zeros((y_true_cat.shape[0], 1)) - - # Verify that the callback translates sequences correctly by itself - y_true_cb, y_pred_cb = callback.predict(X, y_true_cat) - self.assertEqual(y_pred_cb, self.y_pred) - self.assertEqual(y_true_cb, self.y_true) - - # Verify that the callback stores the correct number in logs - fake_model.compile(optimizer='adam', loss='categorical_crossentropy') - history = fake_model.fit(x=X, batch_size=y_true_cat.shape[0], y=y_true_cat, - validation_data=(X, y_true_cat), - callbacks=[callback]) - actual_score = history.history['f1'][0] - self.assertAlmostEqual(actual_score, expected_score) - def load_labels(self, filename): y_true, y_pred = [], [] with open(filename) as f: diff --git a/tests/test_reporters.py b/tests/test_reporters.py index 4a85be5..d20835a 100644 --- a/tests/test_reporters.py +++ b/tests/test_reporters.py @@ -1,26 +1,61 @@ import pytest -from seqeval.reporters import DictReporter - - -@pytest.mark.parametrize( - 'rows, expected', - [ - ([], {}), - ( - [['PERSON', 0.82, 0.79, 0.81, 24]], - { - 'PERSON': { - 'precision': 0.82, - 'recall': 0.79, - 'f1-score': 0.81, - 'support': 24 +from seqeval.reporters import DictReporter, StringReporter + + +class TestDictReporter: + + def test_write_empty(self): + reporter = DictReporter() + reporter.write_blank() + assert reporter.report_dict == {} + + @pytest.mark.parametrize( + 'rows, expected', + [ + ([], {}), + ( + [['PERSON', 0.82, 0.79, 0.81, 24]], + { + 'PERSON': { + 'precision': 0.82, + 'recall': 0.79, + 'f1-score': 0.81, + 'support': 24 + } } - } - ) - ] -) -def test_dict_reporter_output(rows, expected): - reporter = DictReporter() - for row in rows: - reporter.write(*row) - assert reporter.report() == expected + ) + ] + ) + def test_dict_reporter_output(self, rows, expected): + reporter = DictReporter() + for row in rows: + reporter.write(*row) + assert reporter.report() == expected + + +class TestStringReporter: + + def test_write_empty(self): + reporter = StringReporter() + reporter.write_blank() + assert reporter.buffer == [''] + + def test_write_header(self): + reporter = StringReporter() + report = reporter.write_header() + assert 'precision' in report + assert 'recall' in report + assert 'f1-score' in report + assert 'support' in report + + def test_write(self): + reporter = StringReporter() + reporter.write('XXX', 0, 0, 0, 0) + assert 'XXX' in reporter.buffer[0] + + def test_report(self): + reporter = StringReporter() + reporter.write('XXX', 0, 0, 0, 0) + report = reporter.report() + assert 'XXX' in report + assert 'precision' in report diff --git a/tests/test_scheme.py b/tests/test_scheme.py index 179ec0b..ff9ba50 100644 --- a/tests/test_scheme.py +++ b/tests/test_scheme.py @@ -1,6 +1,43 @@ import pytest -from 
seqeval.scheme import IOB1, IOB2, IOBES, IOE1, IOE2, Prefix, Tokens, Token, auto_detect +from seqeval.scheme import IOB1, IOB2, IOBES, IOE1, IOE2, Prefix, Tokens, Token, auto_detect, Entity, Entities + + +def test_entity_repr(): + data = (0, 0, 0, 0) + entity = Entity(*data) + assert str(data) == str(entity) + + +@pytest.mark.parametrize( + 'data1, data2, expected', + [ + ((0, 0, 0, 0), (0, 0, 0, 0), True), + ((1, 0, 0, 0), (0, 0, 0, 0), False), + ((0, 1, 0, 0), (0, 0, 0, 0), False), + ((0, 0, 1, 0), (0, 0, 0, 0), False), + ((0, 0, 0, 1), (0, 0, 0, 0), False) + ] +) +def test_entity_equality(data1, data2, expected): + entity1 = Entity(*data1) + entity2 = Entity(*data2) + is_equal = entity1 == entity2 + assert is_equal == expected + + +@pytest.mark.parametrize( + 'sequences, tag_name, expected', + [ + ([['B-PER', 'B-ORG']], '', set()), + ([['B-PER', 'B-ORG']], 'ORG', {Entity(0, 1, 2, 'ORG')}), + ([['B-PER', 'B-ORG']], 'PER', {Entity(0, 0, 1, 'PER')}) + ] +) +def test_entities_filter(sequences, tag_name, expected): + entities = Entities(sequences, IOB2) + filtered = entities.filter(tag_name) + assert filtered == expected @pytest.mark.parametrize( @@ -196,7 +233,7 @@ def test_iobes_start_inside_end(prev, token, expected): ) def test_iob1_tokens(tokens, expected): tokens = Tokens(tokens, IOB1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -220,7 +257,7 @@ def test_iob1_tokens(tokens, expected): ) def test_iob1_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOB1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -247,7 +284,7 @@ def test_iob1_tokens_without_tag(tokens, expected): ) def test_iob2_tokens(tokens, expected): tokens = Tokens(tokens, IOB2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -271,7 +308,7 @@ def test_iob2_tokens(tokens, expected): ) def test_iob2_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOB2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -298,7 +335,7 @@ def test_iob2_tokens_without_tag(tokens, expected): ) def test_ioe1_tokens(tokens, expected): tokens = Tokens(tokens, IOE1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -322,7 +359,7 @@ def test_ioe1_tokens(tokens, expected): ) def test_ioe1_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOE1) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -349,7 +386,7 @@ def test_ioe1_tokens_without_tag(tokens, expected): ) def test_ioe2_tokens(tokens, expected): tokens = Tokens(tokens, IOE2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -373,7 +410,7 @@ def test_ioe2_tokens(tokens, expected): ) def test_ioe2_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOE2) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -418,7 +455,7 @@ def test_ioe2_tokens_without_tag(tokens, expected): ) def test_iobes_tokens(tokens, expected): tokens = Tokens(tokens, IOBES) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] 
assert entities == expected @@ -459,7 +496,7 @@ def test_iobes_tokens(tokens, expected): ) def test_iobes_tokens_without_tag(tokens, expected): tokens = Tokens(tokens, IOBES) - entities = tokens.entities + entities = [entity.to_tuple()[1:] for entity in tokens.entities] assert entities == expected @@ -530,10 +567,21 @@ class TestAutoDetect: [ ([['B', 'I', 'O']], IOB2), ([['B', 'I']], IOB2), + ([['B', 'O']], IOB2), + ([['B']], IOB2), ([['I', 'O', 'E']], IOE2), ([['I', 'E']], IOE2), + ([['E', 'O']], IOE2), + ([['E']], IOE2), ([['I', 'O', 'B', 'E', 'S']], IOBES), - ([['I', 'B', 'E', 'S']], IOBES) + ([['I', 'B', 'E', 'S']], IOBES), + ([['I', 'O', 'B', 'E']], IOBES), + ([['O', 'B', 'E', 'S']], IOBES), + ([['I', 'B', 'E']], IOBES), + ([['B', 'E', 'S']], IOBES), + ([['O', 'B', 'E']], IOBES), + ([['B', 'E']], IOBES), + ([['S']], IOBES) ] ) def test_valid_scheme(self, sequences, expected): diff --git a/tests/test_v1.py b/tests/test_v1.py new file mode 100644 index 0000000..d03b295 --- /dev/null +++ b/tests/test_v1.py @@ -0,0 +1,161 @@ +import numpy as np +import pytest +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal + +from seqeval.metrics.v1 import unique_labels, precision_recall_fscore_support, classification_report +from seqeval.scheme import IOB2 + + +@pytest.mark.parametrize( + 'y_true, y_pred, expected', + [ + ([[]], [[]], []), + ([['B-PER']], [[]], ['PER']), + ([[]], [['B-PER']], ['PER']), + ([['B-PER']], [['B-PER']], ['PER']), + ([['B-PER', 'O']], [[]], ['PER']), + ([['B-PER', 'I-PER']], [[]], ['PER']), + ([['B-PER']], [['B-ORG']], ['ORG', 'PER']) + ] +) +def test_unique_labels(y_true, y_pred, expected): + labels = unique_labels(y_true, y_pred, IOB2) + assert labels == expected + + +class TestPrecisionRecallFscoreSupport: + + def test_bad_beta(self): + y_true, y_pred = [[]], [[]] + with pytest.raises(ValueError): + precision_recall_fscore_support(y_true, y_pred, beta=-0.1, scheme=IOB2) + + def test_bad_average_option(self): + y_true, y_pred = [[]], [[]] + with pytest.raises(ValueError): + precision_recall_fscore_support(y_true, y_pred, average='mega', scheme=IOB2) + + @pytest.mark.parametrize( + 'average', [None, 'macro', 'weighted'] + ) + def test_warning(self, average): + y_true = [['B-PER']] + y_pred = [['B-Test']] + with pytest.warns(UndefinedMetricWarning): + precision_recall_fscore_support(y_true, y_pred, average=average, beta=1.0, scheme=IOB2) + + def test_fscore_warning(self): + with pytest.warns(UndefinedMetricWarning): + precision_recall_fscore_support([[]], [[]], average='micro', scheme=IOB2, warn_for=('f-score', )) + + def test_length(self): + y_true = [['B-PER']] + y_pred = [['B-PER', 'O']] + with pytest.raises(ValueError): + precision_recall_fscore_support(y_true, y_pred, scheme=IOB2) + + def test_weighted_true_sum_zero(self): + res = precision_recall_fscore_support([['O']], [['O']], average='weighted', scheme=IOB2) + assert res == (0.0, 0.0, 0.0, 0) + + def test_scores(self): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None, scheme=IOB2) + assert_array_almost_equal(p, [1.00, 1.00, 0.00], 2) + assert_array_almost_equal(r, [0.50, 1.00, 0.00], 2) + assert_array_almost_equal(f, [0.67, 1.00, 0.00], 2) + assert_array_equal(s, [2, 1, 0]) + + @pytest.mark.parametrize( + 'average, expected', + [ + ('micro', [0.67, 0.67, 0.67, 3]), + ('macro', [0.67, 0.50, 0.56, 3]), + ('weighted', [1.00, 
0.67, 0.78, 3]) + ] + ) + def test_average_scores(self, average, expected): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + scores = precision_recall_fscore_support(y_true, y_pred, average=average, scheme=IOB2) + assert_array_almost_equal(scores, expected, 2) + + @pytest.mark.parametrize( + 'average, expected', + [ + ('micro', [0.67, 0.67, 0.67, 3]), + ('macro', [0.67, 0.50, 0.50, 3]), + ('weighted', [1.00, 0.67, 0.67, 3]) + ] + ) + def test_average_scores_beta_inf(self, average, expected): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + scores = precision_recall_fscore_support(y_true, y_pred, average=average, scheme=IOB2, beta=np.inf) + assert_array_almost_equal(scores, expected, 2) + + +class TestClassificationReport: + + def test_output_dict(self): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + report = classification_report(y_true, y_pred, output_dict=True) + expected_report = { + 'A': { + 'f1-score': 0.6666666666666666, + 'precision': 1.0, + 'recall': 0.5, + 'support': 2 + }, + 'B': { + 'f1-score': 1.0, + 'precision': 1.0, + 'recall': 1.0, + 'support': 1 + }, + 'C': { + 'f1-score': 0.0, + 'precision': 0.0, + 'recall': 0.0, + 'support': 0 + }, + 'macro avg': { + 'f1-score': 0.5555555555555555, + 'precision': 0.6666666666666666, + 'recall': 0.5, + 'support': 3 + }, + 'micro avg': { + 'f1-score': 0.6666666666666666, + 'precision': 0.6666666666666666, + 'recall': 0.6666666666666666, + 'support': 3 + }, + 'weighted avg': { + 'f1-score': 0.7777777777777777, + 'precision': 1.0, + 'recall': 0.6666666666666666, + 'support': 3 + } + } + assert report == expected_report + + def test_output_string(self): + y_true = [['B-A', 'B-B', 'O', 'B-A']] + y_pred = [['O', 'B-B', 'B-C', 'B-A']] + report = classification_report(y_true, y_pred) + expected_report = """\ + precision recall f1-score support + + A 1.00 0.50 0.67 2 + B 1.00 1.00 1.00 1 + C 0.00 0.00 0.00 0 + + micro avg 0.67 0.67 0.67 3 + macro avg 0.67 0.50 0.56 3 +weighted avg 1.00 0.67 0.78 3 +""" + assert report == expected_report
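The tests above lean on two lower-level pieces added in this patch: the `Entities` container in `seqeval.scheme` and `precision_recall_fscore_support` in `seqeval.metrics.v1`. A minimal usage sketch of how they fit together (assuming this branch of seqeval is installed; the tag names and values are illustrative, not taken from the test suite):

```python
from seqeval.metrics.v1 import precision_recall_fscore_support
from seqeval.scheme import Entities, IOB2

y_true = [['B-PER', 'I-PER', 'O', 'B-ORG']]
y_pred = [['B-PER', 'I-PER', 'B-ORG', 'B-ORG']]

# Entities extracts spans as (sent_id, tag, start, end) under the given scheme.
true_entities = Entities(y_true, IOB2)
print(true_entities.unique_tags)    # {'ORG', 'PER'} (set order may vary)
print(true_entities.filter('PER'))  # {(0, PER, 0, 2)}

# Per-class scores are ordered by sorted tag name: ['ORG', 'PER'].
p, r, f1, s = precision_recall_fscore_support(
    y_true, y_pred, average=None, scheme=IOB2)
print(p)  # [0.5 1. ]  -> two ORG spans predicted, only one correct
print(r)  # [1. 1.]
print(s)  # [1 1]
```

The per-class ordering follows `unique_labels`, which is also why the rows of the strict-mode `classification_report` come out sorted by tag name.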