From 23ecd6038c477898522349a4977a522df8388b06 Mon Sep 17 00:00:00 2001
From: Miri Bar
Date: Sun, 16 Jun 2024 14:47:48 +0300
Subject: [PATCH] feat: fix and add tests, examples, update readme

---
 README.md                                  | 44 +++++++++++++-----
 ai21_tokenizer/jamba_instruct_tokenizer.py |  8 +++-
 ai21_tokenizer/jurassic_tokenizer.py       |  5 +++
 examples/async_jamba_tokenizer.py          | 18 ++++++++
 examples/async_jurassic_tokenizer.py       |  4 +-
 examples/jamba_tokenizer.py                | 11 +++++
 examples/use_tokenizer_async.py            |  2 +-
 pyproject.toml                             |  2 +
 tests/conftest.py                          |  9 ++--
 tests/test_jamba_tokenizer.py              | 52 +++++-----------------
 tests/test_jurassic_tokenizer.py           | 14 ++++++
 11 files changed, 107 insertions(+), 62 deletions(-)
 create mode 100644 examples/async_jamba_tokenizer.py
 create mode 100644 examples/jamba_tokenizer.py

diff --git a/README.md b/README.md
index c0d7c00..1f2c194 100644
--- a/README.md
+++ b/README.md
@@ -35,40 +35,60 @@ poetry add ai21-tokenizer
 
 ### Tokenizer Creation
 
+### Jamba-Instruct Tokenizer
+
 ```python
-from ai21_tokenizer import Tokenizer
+from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
 
-tokenizer = Tokenizer.get_tokenizer()
+tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
 # Your code here
 ```
 
-Another way would be to use our Jurassic model directly:
+Another way would be to use our Jamba model directly:
 
 ```python
-from ai21_tokenizer import JurassicTokenizer
+from ai21_tokenizer import JambaInstructTokenizer
 
-model_path = ""
-config = {} # "dictionary object of your config.json file"
-tokenizer = JurassicTokenizer(model_path=model_path, config=config)
+model_path = ""
+tokenizer = JambaInstructTokenizer(model_path=model_path)
+# Your code here
+```
+
+#### Async usage
+
+```python
+from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
+
+tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
+# Your code here
+```
 
-### Async usage
+### J2 Tokenizer
 
 ```python
 from ai21_tokenizer import Tokenizer
 
-tokenizer = Tokenizer.get_tokenizer(is_async=True)
+tokenizer = Tokenizer.get_tokenizer()
 # Your code here
 ```
 
-Direct usage of async Jurassic model:
+Another way would be to use our Jurassic model directly:
 
 ```python
-from ai21_tokenizer import AsyncJurassicTokenizer
+from ai21_tokenizer import JurassicTokenizer
 
 model_path = ""
 config = {} # "dictionary object of your config.json file"
-tokenizer = AsyncJurassicTokenizer(model_path=model_path, config=config)
+tokenizer = JurassicTokenizer(model_path=model_path, config=config)
+```
+
+#### Async usage
+
+```python
+from ai21_tokenizer import Tokenizer
+
+tokenizer = await Tokenizer.get_async_tokenizer()
+# Your code here
 ```
 
 ### Functions
diff --git a/ai21_tokenizer/jamba_instruct_tokenizer.py b/ai21_tokenizer/jamba_instruct_tokenizer.py
index f3f4f3a..a0eb98e 100644
--- a/ai21_tokenizer/jamba_instruct_tokenizer.py
+++ b/ai21_tokenizer/jamba_instruct_tokenizer.py
@@ -79,13 +79,12 @@ def __init__(
         """
         self._model_path = model_path
         self._cache_dir = cache_dir or _DEFAULT_MODEL_CACHE_DIR
-        # BaseJambaInstructTokenizer.__init__(self, model_path=model_path, cache_dir=cache_dir)
 
     async def __aenter__(self):
         await self._init_tokenizer()
         return self
 
-    def __aexit__(self, exc_type, exc_val, exc_tb):
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
         pass
 
     async def encode(self, text: str, **kwargs) -> List[int]:
@@ -110,6 +109,11 @@ async def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs
 
     @property
     def vocab_size(self) -> int:
+        if not self._tokenizer:
+            raise ValueError(
+                "Tokenizer not properly initialized. Please do not initialize the tokenizer directly. Use "
+                "Tokenizer.get_async_tokenizer instead."
+            )
         return self._tokenizer.get_vocab_size()
 
     async def _init_tokenizer(self):
diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py
index 6d90e23..b904e9c 100644
--- a/ai21_tokenizer/jurassic_tokenizer.py
+++ b/ai21_tokenizer/jurassic_tokenizer.py
@@ -98,6 +98,11 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
 
     @property
     def vocab_size(self) -> int:
+        if not self._sp:
+            raise ValueError(
+                "Tokenizer not properly initialized. Please do not initialize the tokenizer directly. Use "
+                "Tokenizer.get_async_tokenizer instead."
+            )
         return self._sp.vocab_size()
 
     async def encode(self, text: str, **kwargs) -> List[int]:
diff --git a/examples/async_jamba_tokenizer.py b/examples/async_jamba_tokenizer.py
new file mode 100644
index 0000000..f255bca
--- /dev/null
+++ b/examples/async_jamba_tokenizer.py
@@ -0,0 +1,18 @@
+import asyncio
+
+from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
+
+
+async def main():
+    tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
+
+    example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
+    encoded = await tokenizer.encode(example_sentence)
+    decoded = await tokenizer.decode(encoded)
+
+    assert decoded == example_sentence
+    print("Example sentence: " + example_sentence)
+    print("Encoded and decoded: " + decoded)
+
+
+asyncio.run(main())
diff --git a/examples/async_jurassic_tokenizer.py b/examples/async_jurassic_tokenizer.py
index a50032e..3cb246d 100644
--- a/examples/async_jurassic_tokenizer.py
+++ b/examples/async_jurassic_tokenizer.py
@@ -1,7 +1,7 @@
 import asyncio
 from pathlib import Path
 
-from ai21_tokenizer import AsyncJurassicTokenizer
+from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
 from ai21_tokenizer.utils import load_json
 
 resource_path = Path(__file__).parent.parent / "ai21_tokenizer" / "resources"
@@ -12,7 +12,7 @@
 
 
 async def main():
-    tokenizer = AsyncJurassicTokenizer(model_path=model_path, config=config)
+    tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.J2_TOKENIZER)
 
     example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
     encoded = await tokenizer.encode(example_sentence)
diff --git a/examples/jamba_tokenizer.py b/examples/jamba_tokenizer.py
new file mode 100644
index 0000000..497f3bf
--- /dev/null
+++ b/examples/jamba_tokenizer.py
@@ -0,0 +1,11 @@
+from ai21_tokenizer import JambaInstructTokenizer
+
+model_path = "ai21labs/Jamba-v0.1"
+
+tokenizer = JambaInstructTokenizer(model_path=model_path)
+
+example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
+encoded = tokenizer.encode(example_sentence)
+decoded = tokenizer.decode(encoded)
+
+assert decoded == example_sentence
diff --git a/examples/use_tokenizer_async.py b/examples/use_tokenizer_async.py
index 8643195..d07fdfc 100644
--- a/examples/use_tokenizer_async.py
+++ b/examples/use_tokenizer_async.py
@@ -4,7 +4,7 @@
 
 
 async def main():
-    tokenizer = Tokenizer.get_tokenizer(is_async=True)
+    tokenizer = await Tokenizer.get_async_tokenizer()
 
     example_sentence = "This sentence should be encoded and then decoded. Hurray!!"
     encoded = await tokenizer.encode(example_sentence)
     decoded = await tokenizer.decode(encoded)
diff --git a/pyproject.toml b/pyproject.toml
index c6c2afc..30c83df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,6 +106,8 @@ newline_sequence = "\n"
+
+
 [tool.poetry.group.test.dependencies]
 coverage = "^7.1.0"
 pytest = "7.4.4"
diff --git a/tests/conftest.py b/tests/conftest.py
index b54ba9b..64619e7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -29,9 +29,9 @@ def tokenizer() -> JurassicTokenizer:
     raise ValueError("JurassicTokenizer not found")
 
 
-@pytest.fixture(scope="session")
-def async_tokenizer() -> AsyncJurassicTokenizer:
-    jurassic_tokenizer = Tokenizer.get_tokenizer(tokenizer_name=PreTrainedTokenizers.J2_TOKENIZER, is_async=True)
+@pytest.fixture()
+async def async_tokenizer() -> AsyncJurassicTokenizer:
+    jurassic_tokenizer = await Tokenizer.get_async_tokenizer(tokenizer_name=PreTrainedTokenizers.J2_TOKENIZER)
 
     if isinstance(jurassic_tokenizer, AsyncJurassicTokenizer):
         return jurassic_tokenizer
@@ -49,8 +49,7 @@ def jamba_instruct_tokenizer() -> JambaInstructTokenizer:
     raise ValueError("JambaInstructTokenizer not found")
 
 
-@pytest.mark.asyncio
-@pytest.fixture()
+@pytest.fixture
 async def async_jamba_instruct_tokenizer() -> AsyncJambaInstructTokenizer:
     jamba_tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
 
diff --git a/tests/test_jamba_tokenizer.py b/tests/test_jamba_tokenizer.py
index 113d7ab..6b7698d 100644
--- a/tests/test_jamba_tokenizer.py
+++ b/tests/test_jamba_tokenizer.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 from typing import List, Union
-from unittest.mock import patch, AsyncMock
+from unittest.mock import patch
 
 import pytest
 from ai21_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
@@ -179,46 +179,18 @@ async def test_async_tokenizer_encode_caches_tokenizer__should_have_tokenizer_in
 
 
 @pytest.mark.asyncio
-@patch("ai21_tokenizer.jamba_instruct_tokenizer._load_from_cache", new_callable=AsyncMock)
-async def test_async_tokenizer_when_cache_dir_exists__should_load_from_cache(
+async def test_async_tokenizer_initialized_directly_and_uses_vocab_size__should_raise_error(
     tmp_path: Path,
-    mock_async_jamba_instruct_tokenizer: AsyncJambaInstructTokenizer,
 ):
-    # Creating tokenizer once from repo
-    assert not (tmp_path / "tokenizer.json").exists()
-    tokenizer = AsyncJambaInstructTokenizer(JAMBA_TOKENIZER_HF_PATH, tmp_path)
-    _ = await tokenizer.encode("Hello world!")
-
-    assert (tmp_path / "tokenizer.json").exists()
+    with pytest.raises(ValueError):
+        tokenizer = AsyncJambaInstructTokenizer(model_path=JAMBA_TOKENIZER_HF_PATH, cache_dir=tmp_path)
+        _ = tokenizer.vocab_size
 
-    tokenizer2 = AsyncJambaInstructTokenizer(JAMBA_TOKENIZER_HF_PATH, tmp_path)
-    assert (tmp_path / "tokenizer.json").exists()
-    _ = await tokenizer2.encode("Hello world!")
-
-    # Assert that _load_from_cache was called once
-    mock_async_jamba_instruct_tokenizer._load_from_cache.assert_called_once()
-
-
-# @pytest.mark.asyncio
-# async def test_async_tokenizer__when_cache_dir_not_exists__should_save_tokenizer_in_cache_dir(tmp_path: Path):
-#     assert not (tmp_path / "tokenizer.json").exists()
-#     AsyncJambaInstructTokenizer(JAMABA_TOKENIZER_HF_PATH, tmp_path)
-#
-#     assert (tmp_path / "tokenizer.json").exists()
-
-
-# @pytest.mark.asyncio
-# async def test_async_tokenizer__when_cache_dir_exists__should_load_from_cache(tmp_path: Path):
-#     # Creating tokenizer once from repo
-#     assert not (tmp_path / "tokenizer.json").exists()
-#     AsyncJambaInstructTokenizer(JAMABA_TOKENIZER_HF_PATH, tmp_path)
-#
-#     # Creating tokenizer again to load from cache
-#     with patch.object(
-#         AsyncJambaInstructTokenizer, AsyncJambaInstructTokenizer._load_from_cache.__name__
-#     ) as mock_load_from_cache:
-#         AsyncJambaInstructTokenizer(JAMABA_TOKENIZER_HF_PATH, tmp_path)
-#
-#         # assert load_from_cache was called
-#         mock_load_from_cache.assert_called_once()
+@pytest.mark.asyncio
+async def test_async_tokenizer_initialized_with_manager_and_uses_vocab_size__should_not_raise_error(
+    tmp_path: Path,
+):
+    tokenizer = AsyncJambaInstructTokenizer(model_path=JAMBA_TOKENIZER_HF_PATH, cache_dir=tmp_path)
+    async with tokenizer:
+        assert tokenizer.vocab_size > 0
diff --git a/tests/test_jurassic_tokenizer.py b/tests/test_jurassic_tokenizer.py
index d1976a2..1ec9e53 100644
--- a/tests/test_jurassic_tokenizer.py
+++ b/tests/test_jurassic_tokenizer.py
@@ -344,3 +344,17 @@ async def test_async_init__when_model_path_is_a_file__should_support_backwards_c
     decoded = await async_tokenizer.decode(encoded)
 
     assert decoded == TEXT
+
+
+@pytest.mark.asyncio
+async def test_async_tokenizer_initialized_directly_and_uses_vocab_size__should_raise_error():
+    with pytest.raises(ValueError):
+        tokenizer = AsyncJurassicTokenizer(model_path=_LOCAL_RESOURCES_PATH / "j2-tokenizer.model")
+        _ = tokenizer.vocab_size
+
+
+@pytest.mark.asyncio
+async def test_async_tokenizer_initialized_with_manager_and_uses_vocab_size__should_not_raise_error():
+    tokenizer = AsyncJurassicTokenizer(model_path=_LOCAL_RESOURCES_PATH / "j2-tokenizer.model")
+    async with tokenizer:
+        assert tokenizer.vocab_size > 0
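
Taken together, the new `vocab_size` guards and the `__aenter__`/`__aexit__` fixes mean an async tokenizer is only usable after `Tokenizer.get_async_tokenizer` has been awaited, or after the instance has been entered as an async context manager. Below is a minimal sketch of both supported paths. It is illustrative and not part of the diff; it borrows the `ai21labs/Jamba-v0.1` model path from examples/jamba_tokenizer.py, and the first run needs network access to fetch the tokenizer files.

```python
import asyncio

from ai21_tokenizer import (
    AsyncJambaInstructTokenizer,
    PreTrainedTokenizers,
    Tokenizer,
)

# Model path borrowed from examples/jamba_tokenizer.py in this patch.
MODEL_PATH = "ai21labs/Jamba-v0.1"


async def main():
    # Path 1: the factory awaits initialization before returning the instance.
    tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
    print(tokenizer.vocab_size)  # safe: the inner tokenizer is already loaded

    # Path 2: direct construction defers loading to __aenter__.
    async with AsyncJambaInstructTokenizer(model_path=MODEL_PATH) as direct:
        print(direct.vocab_size)  # safe inside the context manager

    # Outside either path, reading vocab_size on a bare instance raises
    # ValueError, which is exactly what the new tests assert.
    bare = AsyncJambaInstructTokenizer(model_path=MODEL_PATH)
    try:
        _ = bare.vocab_size
    except ValueError as e:
        print(f"expected: {e}")


asyncio.run(main())
```

The same contract applies to `AsyncJurassicTokenizer`, as exercised by the new tests in tests/test_jurassic_tokenizer.py.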