Merge branch 'main' into add-decode-with-offsets

AI21Labs · Dec 28, 2023 · ecc2c36
2 parents 5b1d4f1 + 296bda5
commit ecc2c36
Showing 1 changed file with 2 additions and 1 deletion.
diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py
@@ -154,6 +154,7 @@ def encode(self, text: str, **kwargs) -> List[int]:
         """
         Tokenizes the input text and returns it's token ids
         """
+        is_start = kwargs.get("is_start", True)
         lines = text.split("\n")
         toks = []
 
@@ -163,7 +164,7 @@ def encode(self, text: str, **kwargs) -> List[int]:
             if not line:
                 continue
             # We add the dummy prefix on every newline, and also for the 1st line if it's a 'start'
-            if self._manual_add_dummy_prefix and i >= 0:
+            if self._manual_add_dummy_prefix and (i > 0 or (i == 0 and is_start)):
                 line = " " + line
             toks.extend(self._encode(line))