Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add start_of_line to decode #77

Merged
merged 3 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ai21_tokenizer/jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,14 @@ def decode(self, token_ids: List[int], **kwargs) -> str:
"""
Transforms token ids into text
"""
res_text, _ = self.decode_with_offsets(token_ids)
res_text, _ = self.decode_with_offsets(token_ids, **kwargs)
return res_text

def decode_with_offsets(self, token_ids: List[int]) -> Tuple[str, List[Tuple[int, int]]]:
def decode_with_offsets(self, token_ids: List[int], **kwargs) -> Tuple[str, List[Tuple[int, int]]]:
"""
Transforms token ids into text, and returns the offsets of each token as well
"""
start_of_line = True
start_of_line = kwargs.get("start_of_line", True)

res_text = ""
offsets = []
Expand Down
19 changes: 19 additions & 0 deletions tests/test_jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,25 @@ def test_tokenizer__convert_tokens_to_ids(
assert actual_ids == expected_ids


@pytest.mark.parametrize(
    argnames=["tokens", "start_of_line", "expected_text"],
    argvalues=[
        ([30671], True, "hello"),
        ([30671], False, " hello"),
    ],
    ids=[
        "when_start_of_line__should_return_no_leading_whitespace",
        "when_not_start_of_line__should_return_leading_whitespace",
    ],
)
def test_tokenizer__decode_with_start_of_line(
    tokens: List[int], start_of_line: bool, expected_text: str, tokenizer: JurassicTokenizer
):
    """Decoding honors the start_of_line kwarg: a leading space appears only when not at line start."""
    decoded = tokenizer.decode(tokens, start_of_line=start_of_line)

    assert decoded == expected_text


def test_tokenizer__from_file_handle():
text = "Hello world!"
model_config = {
Expand Down
Loading