fix: add missing space tokens init on jurassic (#89)

* fix: missing space tokens init on jurassic, fix import in example
* test: add test for space_tokens init
AI21Labs · Jun 20, 2024 · 4ec43a0 · 4ec43a0
1 parent 07bf0cd
commit 4ec43a0
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 1 deletion.
diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py
@@ -36,6 +36,7 @@ def __init__(
             self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)])
         )
         self.newline_id = self._token_to_id(self._newline_piece)
+        self._space_tokens = self._map_space_tokens()
 
     @property
     def vocab_size(self) -> int:
@@ -194,3 +195,4 @@ def _set_model_proto_related_variables(self, model_proto: bytes):
             self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)])
         )
         self.newline_id = self._token_to_id(self._newline_piece)
+        self._space_tokens = self._map_space_tokens()
diff --git a/examples/jurassic_tokenizer.py b/examples/jurassic_tokenizer.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 
 from ai21_tokenizer import JurassicTokenizer
-from ai21_tokenizer.utils import load_json
+from ai21_tokenizer.file_utils import load_json
 
 resource_path = Path(__file__).parent.parent / "ai21_tokenizer" / "resources"
 

diff --git a/tests/test_jurassic_tokenizer.py b/tests/test_jurassic_tokenizer.py
@@ -178,6 +178,24 @@ def test_init__when_model_path_is_a_file__should_support_backwards_compatability
     assert decoded == TEXT
 
 
+def test_tokenizer__initializes_space_tokens_properly__should_encode_without_errors():
+    tokenizer = JurassicTokenizer(model_path=_LOCAL_RESOURCES_PATH / "j2-tokenizer.model", config=MODEL_CONFIG)
+    prompt = """
+Question: who is Albert Einstein?
+Thought:
+Action: Search
+Action Input: Albert Einstein
+Thought:
+Action:
+Action Input:
+Observation:  is not a valid tool, try one of [Search].
+Thought:"""
+    try:
+        _ = tokenizer.encode(prompt)
+    except Exception as e:
+        pytest.fail(f"Encoding failed with error: {e}")
+
+
 @pytest.mark.asyncio
 async def test_async_tokenizer_encode_decode(async_tokenizer: AsyncJurassicTokenizer):
     encoded = await async_tokenizer.encode(TEXT)
@@ -350,3 +368,24 @@ async def test_async_init__when_model_path_is_a_file__should_support_backwards_c
 async def test_async_tokenizer_initialized_directly__should_raise_error():
     with pytest.raises(ValueError):
         AsyncJurassicTokenizer()
+
+
+@pytest.mark.asyncio
+async def test_async_tokenizer__initializes_space_tokens_properly__should_encode_without_errors():
+    jurassic_tokenizer = await AsyncJurassicTokenizer.create(
+        model_path=_LOCAL_RESOURCES_PATH / "j2-tokenizer.model", config=MODEL_CONFIG
+    )
+    prompt = """
+Question: who is Albert Einstein?
+Thought:
+Action: Search
+Action Input: Albert Einstein
+Thought:
+Action:
+Action Input:
+Observation:  is not a valid tool, try one of [Search].
+Thought:"""
+    try:
+        _ = await jurassic_tokenizer.encode(prompt)
+    except Exception as e:
+        pytest.fail(f"Encoding failed with error: {e}")