Merge branch 'main' into add-start-of-line
tomeras91 committed Jan 2, 2024
2 parents 7b7bef2 + 081dda3 · commit c9e0f62
Showing 6 changed files with 137 additions and 35 deletions.
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -2,10 +2,37 @@



## v0.7.0 (2024-01-02)

### Feature

* feat: Init tokenizer from filehandle (#76)

* feat: allow creating JurassicTokenizer from model file handle

* fix: Add default for model_path and model_file_handle

* feat: Add JurassicTokenizer.from_file_path classmethod

* fix: remove model_path=None in JurassicTokenizer.from_file_handle

* fix: rename _assert_exactly_one to _validate_init and make it not static

* refactor: semantics

* test: Added tests

---------

Co-authored-by: Asaf Gardin <[email protected]> ([`dcb73a7`](https://github.com/AI21Labs/ai21-tokenizer/commit/dcb73a72348e576b06cd4a066e06141ceae37a44))


## v0.6.0 (2023-12-28)

### Chore

* chore(release): v0.6.0 [skip ci] ([`7b8348d`](https://github.com/AI21Labs/ai21-tokenizer/commit/7b8348d303eb54c4a75ca1c58be5c08c35ec3de8))

* chore: add test case for encode with is_start=False (#74)

* chore: add test case for encode with is_start=False
28 changes: 25 additions & 3 deletions ai21_tokenizer/jurassic_tokenizer.py
@@ -2,7 +2,7 @@

import re
from dataclasses import dataclass
from typing import List, Union, Optional, Dict, Any, Tuple
from typing import List, Union, Optional, Dict, Any, Tuple, BinaryIO

import sentencepiece as spm

@@ -19,11 +19,16 @@ class SpaceSymbol:
class JurassicTokenizer(BaseTokenizer):
def __init__(
self,
model_path: PathLike,
model_path: Optional[PathLike] = None,
model_file_handle: Optional[BinaryIO] = None,
config: Optional[Dict[str, Any]] = None,
):
self._validate_init(model_path=model_path, model_file_handle=model_file_handle)

model_proto = load_binary(model_path) if model_path else model_file_handle.read()

# noinspection PyArgumentList
self._sp = spm.SentencePieceProcessor(model_proto=load_binary(model_path))
self._sp = spm.SentencePieceProcessor(model_proto=model_proto)
config = config or {}

self.pad_id = config.get("pad_id")
@@ -52,6 +57,13 @@ def __init__(
self._space_mode = config.get("space_mode")
self._space_tokens = self._map_space_tokens()

def _validate_init(self, model_path: Optional[PathLike], model_file_handle: Optional[BinaryIO]) -> None:
if model_path is None and model_file_handle is None:
raise ValueError("Must provide exactly one of model_path or model_file_handle. Got none.")

if model_path is not None and model_file_handle is not None:
raise ValueError("Must provide exactly one of model_path or model_file_handle. Got both.")

def _map_space_tokens(self) -> List[SpaceSymbol]:
res = []
for count in range(32, 0, -1):
@@ -226,3 +238,13 @@ def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
return self._id_to_token(token_ids)

return [self._id_to_token(token_id) for token_id in token_ids]

@classmethod
def from_file_handle(
cls, model_file_handle: BinaryIO, config: Optional[Dict[str, Any]] = None
) -> JurassicTokenizer:
return cls(model_file_handle=model_file_handle, config=config)

@classmethod
def from_file_path(cls, model_path: PathLike, config: Optional[Dict[str, Any]] = None) -> JurassicTokenizer:
return cls(model_path=model_path, config=config)
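
Taken together, the two classmethods above give equivalent construction paths for the now-optional model source. A minimal usage sketch, assuming a local j2-tokenizer.model file (the config keys mirror those passed in tests/test_jurassic_tokenizer.py below; the file-handle path accepts any open BinaryIO, including an in-memory io.BytesIO):

from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer

# Config keys as used by the tests in this commit.
config = {
    "vocab_size": 262144,
    "pad_id": 0,
    "bos_id": 1,
    "eos_id": 2,
    "unk_id": 3,
    "add_dummy_prefix": False,
    "newline_piece": "<|newline|>",
    "number_mode": "right_keep",
    "space_mode": "left",
}

# Path-based construction.
tokenizer = JurassicTokenizer.from_file_path(model_path="j2-tokenizer.model", config=config)

# Handle-based construction: the model proto is read once from the open binary stream.
with open("j2-tokenizer.model", "rb") as fh:
    tokenizer = JurassicTokenizer.from_file_handle(model_file_handle=fh, config=config)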
2 changes: 1 addition & 1 deletion ai21_tokenizer/version.py
@@ -1 +1 @@
VERSION = "0.6.0"
VERSION = "0.7.0"
34 changes: 6 additions & 28 deletions poetry.lock

Some generated files are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -59,7 +59,7 @@ schema_pattern = "(build|ci|docs|feat|fix|perf|refactor|style|test|chore|revert|

[tool.poetry]
name = "ai21-tokenizer"
version = "0.6.0"
version = "0.7.0"
description = ""
authors = ["AI21 Labs"]
readme = "README.md"
@@ -111,9 +111,10 @@ newline_sequence = "\n"




[tool.poetry.group.test.dependencies]
coverage = "^7.1.0"
pytest = "7.2.1"
pytest = "7.4.4"
pytest-cov = "4.0.0"
pytest-mock = "3.11.1"

76 changes: 75 additions & 1 deletion tests/test_jurassic_tokenizer.py
@@ -1,10 +1,13 @@
import json
from pathlib import Path
from typing import Union, List
from typing import Union, List, BinaryIO, Optional

import pytest

from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer
from ai21_tokenizer.utils import PathLike

_LOCAL_RESOURCES_PATH = Path(__file__).parents[1] / "ai21_tokenizer" / "resources" / "j2-tokenizer"


def test_tokenizer_encode_decode(tokenizer: JurassicTokenizer):
@@ -106,3 +109,74 @@ def test_tokenizer__decode_with_start_of_line(
actual_text = tokenizer.decode(tokens, start_of_line=start_of_line)

assert actual_text == expected_text


def test_tokenizer__from_file_handle():
text = "Hello world!"
model_config = {
"vocab_size": 262144,
"pad_id": 0,
"bos_id": 1,
"eos_id": 2,
"unk_id": 3,
"add_dummy_prefix": False,
"newline_piece": "<|newline|>",
"number_mode": "right_keep",
"space_mode": "left",
}

with (_LOCAL_RESOURCES_PATH / "j2-tokenizer.model").open("rb") as tokenizer_file:
tokenizer = JurassicTokenizer.from_file_handle(model_file_handle=tokenizer_file, config=model_config)

encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

assert decoded == text


def test_tokenizer__from_file_path():
text = "Hello world!"
model_config = {
"vocab_size": 262144,
"pad_id": 0,
"bos_id": 1,
"eos_id": 2,
"unk_id": 3,
"add_dummy_prefix": False,
"newline_piece": "<|newline|>",
"number_mode": "right_keep",
"space_mode": "left",
}

tokenizer = JurassicTokenizer.from_file_path(
model_path=(_LOCAL_RESOURCES_PATH / "j2-tokenizer.model"), config=model_config
)

encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

assert decoded == text


@pytest.mark.parametrize(
ids=[
"when_model_path_and_file_handle_are_none__should_raise_value_error",
"when_model_path_and_file_handle_are_not_none__should_raise_value_error",
],
argnames=["model_path", "model_file_handle", "expected_error_message"],
argvalues=[
(None, None, "Must provide exactly one of model_path or model_file_handle. Got none."),
(
Path("some_path"),
"some_file_handle",
"Must provide exactly one of model_path or model_file_handle. Got both.",
),
],
)
def test_tokenizer__(
model_path: Optional[PathLike], model_file_handle: Optional[BinaryIO], expected_error_message: str
):
with pytest.raises(ValueError) as error:
JurassicTokenizer(model_file_handle=model_file_handle, model_path=model_path, config={})

assert error.value.args[0] == expected_error_message
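
The two parametrized cases above pin down the exactly-one-of contract. A quick sketch reproducing the same failures directly (error strings verbatim from _validate_init; the model path is illustrative):

import pytest

from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer

# Neither source given -> "Must provide exactly one of model_path or model_file_handle. Got none."
with pytest.raises(ValueError, match="Got none"):
    JurassicTokenizer(config={})

# Both sources given -> "... Got both." Validation runs before either source is read.
with open("j2-tokenizer.model", "rb") as fh:
    with pytest.raises(ValueError, match="Got both"):
        JurassicTokenizer(model_path="j2-tokenizer.model", model_file_handle=fh, config={})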
