-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add async tokenizer, add detokenize method (#144)
* feat: add detokenize method, add async tokenizer * chore: update pyproject and poetry.lock * fix: fix tokenizer name in examples and readme, add example
- Loading branch information
Showing 10 changed files with 223 additions and 19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Public API of the tokenizers package: the tokenizer class plus the
# sync and async factory functions.
from .ai21_tokenizer import AI21Tokenizer
from .factory import get_async_tokenizer, get_tokenizer

__all__ = ["AI21Tokenizer", "get_tokenizer", "get_async_tokenizer"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
"""Example: count tokens asynchronously with the Jamba tokenizer."""
import asyncio

from ai21.tokenizers import get_async_tokenizer

# Long few-shot customer-support prompt used as the sample input.
prompt = (
    "The following is a conversation between a user of an eCommerce store and a user operation"
    " associate called Max. Max is very kind and keen to help."
    " The following are important points about the business policies:\n- "
    "Delivery takes up to 5 days\n- There is no return option\n\nUser gender:"
    " Male.\n\nConversation:\nUser: Hi, had a question\nMax: "
    "Hi there, happy to help!\nUser: Is there no way to return a product?"
    " I got your blue T-Shirt size small but it doesn't fit.\n"
    "Max: I'm sorry to hear that. Unfortunately we don't have a return policy. \n"
    "User: That's a shame. \nMax: Is there anything else i can do for you?\n\n"
    "##\n\nThe following is a conversation between a user of an eCommerce store and a user operation"
    " associate called Max. Max is very kind and keen to help. The following are important points about"
    " the business policies:\n- Delivery takes up to 5 days\n- There is no return option\n\n"
    'User gender: Female.\n\nConversation:\nUser: Hi, I was wondering when you\'ll have the "Blue & White" '
    "t-shirt back in stock?\nMax: Hi, happy to assist! We currently don't have it in stock. Do you want me"
    " to send you an email once we do?\nUser: Yes!\nMax: Awesome. What's your email?\nUser: [email protected]\n"
    "Max: Great. I'll send you an email as soon as we get it.\n\n##\n\nThe following is a conversation between"
    " a user of an eCommerce store and a user operation associate called Max. Max is very kind and keen to help."
    " The following are important points about the business policies:\n- Delivery takes up to 5 days\n"
    "- There is no return option\n\nUser gender: Female.\n\nConversation:\nUser: Hi, how much time does it"
    " take for the product to reach me?\nMax: Hi, happy to assist! It usually takes 5 working"
    " days to reach you.\nUser: Got it! thanks. Is there a way to shorten that delivery time if i pay extra?\n"
    "Max: I'm sorry, no.\nUser: Got it. How do i know if the White Crisp t-shirt will fit my size?\n"
    "Max: The size charts are available on the website.\nUser: Can you tell me what will fit a young women.\n"
    "Max: Sure. Can you share her exact size?\n\n##\n\nThe following is a conversation between a user of an"
    " eCommerce store and a user operation associate called Max. Max is very kind and keen to help. The following"
    " are important points about the business policies:\n- Delivery takes up to 5 days\n"
    "- There is no return option\n\nUser gender: Female.\n\nConversation:\n"
    "User: Hi, I have a question for you"
)


async def main():
    """Fetch the async Jamba tokenizer and print the token count for ``prompt``."""
    # get_async_tokenizer is itself awaitable: it may download/load the
    # tokenizer assets before returning the tokenizer instance.
    tokenizer = await get_async_tokenizer(name="jamba-tokenizer")
    token_count = await tokenizer.count_tokens(prompt)
    print(token_count)


asyncio.run(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from typing import List | ||
|
||
import pytest | ||
from ai21.tokenizers.factory import get_async_tokenizer | ||
|
||
|
||
class TestAsyncAI21Tokenizer:
    """Integration tests for ``get_async_tokenizer`` and the async tokenizer API."""

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        argnames=["tokenizer_name", "expected_tokens"],
        argvalues=[
            ("j2-tokenizer", 8),
            ("jamba-tokenizer", 9),
        ],
        ids=[
            "when_j2_tokenizer",
            "when_jamba_instruct_tokenizer",
        ],
    )
    async def test__count_tokens__should_return_number_of_tokens(self, tokenizer_name: str, expected_tokens: int):
        """count_tokens reports the expected total for a fixed sample text."""
        tok = await get_async_tokenizer(tokenizer_name)

        token_count = await tok.count_tokens("Text to Tokenize - Hello world!")

        assert token_count == expected_tokens

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        argnames=["tokenizer_name", "expected_tokens"],
        argvalues=[
            ("j2-tokenizer", ["▁Text", "▁to", "▁Token", "ize", "▁-", "▁Hello", "▁world", "!"]),
            (
                "jamba-tokenizer",
                ["<|startoftext|>", "Text", "▁to", "▁Token", "ize", "▁-", "▁Hello", "▁world", "!"],
            ),
        ],
        ids=[
            "when_j2_tokenizer",
            "when_jamba_instruct_tokenizer",
        ],
    )
    async def test__tokenize__should_return_list_of_tokens(self, tokenizer_name: str, expected_tokens: List[str]):
        """tokenize yields the exact token strings for a fixed sample text."""
        tok = await get_async_tokenizer(tokenizer_name)

        tokens = await tok.tokenize("Text to Tokenize - Hello world!")

        assert tokens == expected_tokens

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        argnames=["tokenizer_name"],
        argvalues=[
            ("j2-tokenizer",),
            ("jamba-tokenizer",),
        ],
        ids=[
            "when_j2_tokenizer",
            "when_jamba_instruct_tokenizer",
        ],
    )
    async def test__detokenize__should_return_list_of_tokens(self, tokenizer_name: str):
        """tokenize followed by detokenize round-trips the original text."""
        tok = await get_async_tokenizer(tokenizer_name)
        source_text = "Text to Tokenize - Hello world!"

        round_tripped = await tok.detokenize(await tok.tokenize(source_text))

        assert source_text == round_tripped

    @pytest.mark.asyncio
    async def test__tokenizer__should_be_singleton__when_called_twice(self):
        """Repeated factory calls with the default name return the same instance."""
        first = await get_async_tokenizer()
        second = await get_async_tokenizer()

        assert first is second

    @pytest.mark.asyncio
    async def test__get_tokenizer__when_called_with_different_tokenizer_name__should_return_different_tokenizer(self):
        """Different tokenizer names must not share the underlying tokenizer object."""
        j2 = await get_async_tokenizer("j2-tokenizer")
        jamba = await get_async_tokenizer("jamba-tokenizer")

        assert j2._tokenizer is not jamba._tokenizer

    @pytest.mark.asyncio
    async def test__get_tokenizer__when_tokenizer_name_not_supported__should_raise_error(self):
        """An unknown tokenizer name is rejected with ValueError."""
        with pytest.raises(ValueError):
            await get_async_tokenizer("some-tokenizer")