From 4ec43a04e875d0fcb4dc557607c17a6ff8734176 Mon Sep 17 00:00:00 2001
From: miri-bar <160584887+miri-bar@users.noreply.github.com>
Date: Thu, 20 Jun 2024 14:27:00 +0300
Subject: [PATCH] fix: add missing space tokens init on jurassic (#89)

* fix: missing space tokens init on jurassic, fix import in example

* test: add test for space_tokens init
---
 ai21_tokenizer/jurassic_tokenizer.py |  2 ++
 examples/jurassic_tokenizer.py       |  2 +-
 tests/test_jurassic_tokenizer.py     | 39 ++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py
index c44125a..0257277 100644
--- a/ai21_tokenizer/jurassic_tokenizer.py
+++ b/ai21_tokenizer/jurassic_tokenizer.py
@@ -36,6 +36,7 @@ def __init__(
             self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)])
         )
         self.newline_id = self._token_to_id(self._newline_piece)
+        self._space_tokens = self._map_space_tokens()
 
     @property
     def vocab_size(self) -> int:
@@ -194,3 +195,4 @@ def _set_model_proto_related_variables(self, model_proto: bytes):
             self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)])
         )
         self.newline_id = self._token_to_id(self._newline_piece)
+        self._space_tokens = self._map_space_tokens()
diff --git a/examples/jurassic_tokenizer.py b/examples/jurassic_tokenizer.py
index cab23ad..3b8c872 100644
--- a/examples/jurassic_tokenizer.py
+++ b/examples/jurassic_tokenizer.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 
 from ai21_tokenizer import JurassicTokenizer
-from ai21_tokenizer.utils import load_json
+from ai21_tokenizer.file_utils import load_json
 
 resource_path = Path(__file__).parent.parent / "ai21_tokenizer" / "resources"
 
diff --git a/tests/test_jurassic_tokenizer.py b/tests/test_jurassic_tokenizer.py
index 657640c..6a81738 100644
--- a/tests/test_jurassic_tokenizer.py
+++ b/tests/test_jurassic_tokenizer.py
@@ -178,6 +178,24 @@ def test_init__when_model_path_is_a_file__should_support_backwards_compatability
     assert decoded == TEXT
 
 
+def test_tokenizer__initializes_space_tokens_properly__should_encode_without_errors():
+    tokenizer = JurassicTokenizer(model_path=_LOCAL_RESOURCES_PATH / "j2-tokenizer.model", config=MODEL_CONFIG)
+    prompt = """
+Question: who is Albert Einstein?
+Thought:
+Action: Search
+Action Input: Albert Einstein
+Thought:
+Action:
+Action Input:
+Observation: is not a valid tool, try one of [Search].
+Thought:"""
+    try:
+        _ = tokenizer.encode(prompt)
+    except Exception as e:
+        pytest.fail(f"Encoding failed with error: {e}")
+
+
 @pytest.mark.asyncio
 async def test_async_tokenizer_encode_decode(async_tokenizer: AsyncJurassicTokenizer):
     encoded = await async_tokenizer.encode(TEXT)
@@ -350,3 +368,24 @@ async def test_async_init__when_model_path_is_a_file__should_support_backwards_c
 async def test_async_tokenizer_initialized_directly__should_raise_error():
     with pytest.raises(ValueError):
         AsyncJurassicTokenizer()
+
+
+@pytest.mark.asyncio
+async def test_async_tokenizer__initializes_space_tokens_properly__should_encode_without_errors():
+    jurassic_tokenizer = await AsyncJurassicTokenizer.create(
+        model_path=_LOCAL_RESOURCES_PATH / "j2-tokenizer.model", config=MODEL_CONFIG
+    )
+    prompt = """
+Question: who is Albert Einstein?
+Thought:
+Action: Search
+Action Input: Albert Einstein
+Thought:
+Action:
+Action Input:
+Observation: is not a valid tool, try one of [Search].
+Thought:"""
+    try:
+        _ = await jurassic_tokenizer.encode(prompt)
+    except Exception as e:
+        pytest.fail(f"Encoding failed with error: {e}")