Skip to content

Commit

Permalink
fix: add missing space tokens init on jurassic (#89)
Browse files Browse the repository at this point in the history
* fix: missing space tokens init on jurassic, fix import in example

* test: add test for space_tokens init
  • Loading branch information
miri-bar authored Jun 20, 2024
1 parent 07bf0cd commit 4ec43a0
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 1 deletion.
2 changes: 2 additions & 0 deletions ai21_tokenizer/jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)])
)
self.newline_id = self._token_to_id(self._newline_piece)
self._space_tokens = self._map_space_tokens()

@property
def vocab_size(self) -> int:
Expand Down Expand Up @@ -194,3 +195,4 @@ def _set_model_proto_related_variables(self, model_proto: bytes):
self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)])
)
self.newline_id = self._token_to_id(self._newline_piece)
self._space_tokens = self._map_space_tokens()
2 changes: 1 addition & 1 deletion examples/jurassic_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path

from ai21_tokenizer import JurassicTokenizer
-from ai21_tokenizer.utils import load_json
+from ai21_tokenizer.file_utils import load_json

resource_path = Path(__file__).parent.parent / "ai21_tokenizer" / "resources"

Expand Down
39 changes: 39 additions & 0 deletions tests/test_jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,24 @@ def test_init__when_model_path_is_a_file__should_support_backwards_compatability
assert decoded == TEXT


def test_tokenizer__initializes_space_tokens_properly__should_encode_without_errors():
    """Encode a multi-line prompt with a tokenizer built from the local model file.

    The tokenizer is constructed from ``j2-tokenizer.model`` with
    ``MODEL_CONFIG``; the test fails if ``encode`` raises any exception.
    """
    model_file = _LOCAL_RESOURCES_PATH / "j2-tokenizer.model"
    jurassic = JurassicTokenizer(model_path=model_file, config=MODEL_CONFIG)
    # The literal starts at column 0 on purpose: indentation would become part of the prompt.
    react_prompt = """
Question: who is Albert Einstein?
Thought:
Action: Search
Action Input: Albert Einstein
Thought:
Action:
Action Input:
Observation: is not a valid tool, try one of [Search].
Thought:"""
    try:
        # Only the absence of an exception matters; the encoded ids are discarded.
        jurassic.encode(react_prompt)
    except Exception as exc:
        pytest.fail(f"Encoding failed with error: {exc}")


@pytest.mark.asyncio
async def test_async_tokenizer_encode_decode(async_tokenizer: AsyncJurassicTokenizer):
encoded = await async_tokenizer.encode(TEXT)
Expand Down Expand Up @@ -350,3 +368,24 @@ async def test_async_init__when_model_path_is_a_file__should_support_backwards_c
async def test_async_tokenizer_initialized_directly__should_raise_error():
    """Calling ``AsyncJurassicTokenizer()`` with no arguments must raise ``ValueError``."""
    with pytest.raises(ValueError):
        AsyncJurassicTokenizer()


@pytest.mark.asyncio
async def test_async_tokenizer__initializes_space_tokens_properly__should_encode_without_errors():
    """Encode a multi-line prompt with a tokenizer from ``AsyncJurassicTokenizer.create``.

    The tokenizer is created from ``j2-tokenizer.model`` with
    ``MODEL_CONFIG``; the test fails if awaiting ``encode`` raises any
    exception.
    """
    model_file = _LOCAL_RESOURCES_PATH / "j2-tokenizer.model"
    async_jurassic = await AsyncJurassicTokenizer.create(model_path=model_file, config=MODEL_CONFIG)
    # The literal starts at column 0 on purpose: indentation would become part of the prompt.
    react_prompt = """
Question: who is Albert Einstein?
Thought:
Action: Search
Action Input: Albert Einstein
Thought:
Action:
Action Input:
Observation: is not a valid tool, try one of [Search].
Thought:"""
    try:
        # Only the absence of an exception matters; the encoded ids are discarded.
        await async_jurassic.encode(react_prompt)
    except Exception as exc:
        pytest.fail(f"Encoding failed with error: {exc}")

0 comments on commit 4ec43a0

Please sign in to comment.