Skip to content

Commit

Permalink
Merge branch 'main' into add-decode-with-offsets
Browse files Browse the repository at this point in the history
  • Loading branch information
tomeras91 committed Dec 28, 2023
2 parents 5b1d4f1 + 296bda5 commit ecc2c36
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion ai21_tokenizer/jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def encode(self, text: str, **kwargs) -> List[int]:
"""
Tokenizes the input text and returns its token ids
"""
is_start = kwargs.get("is_start", True)
lines = text.split("\n")
toks = []

Expand All @@ -163,7 +164,7 @@ def encode(self, text: str, **kwargs) -> List[int]:
if not line:
continue
# We add the dummy prefix on every newline, and also for the 1st line if it's a 'start'
if self._manual_add_dummy_prefix and i >= 0:
if self._manual_add_dummy_prefix and (i > 0 or (i == 0 and is_start)):
line = " " + line
toks.extend(self._encode(line))

Expand Down

0 comments on commit ecc2c36

Please sign in to comment.