From 375e0aa2c2055bf36bd0e0a759d8f31d5a3dcb8b Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 28 Dec 2023 11:59:19 +0200 Subject: [PATCH 1/6] fix: commitizen tag starts with "v" --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6c75298..e5fad94 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: - id: git-check files: "CHANGELOG.md" - repo: https://github.com/commitizen-tools/commitizen - rev: 3.5.3 + rev: v3.5.3 hooks: - id: commitizen name: Lint commit message From 5d40a180145c81a29bf48348ff1a1cdcceaadcd3 Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 28 Dec 2023 12:06:45 +0200 Subject: [PATCH 2/6] feat: add eos_id --- ai21_tokenizer/jurassic_tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py index 2b7a7d5..fc1b0b6 100644 --- a/ai21_tokenizer/jurassic_tokenizer.py +++ b/ai21_tokenizer/jurassic_tokenizer.py @@ -30,6 +30,7 @@ def __init__( self.unk_id = config.get("unk_id") self.eop_id = config.get("eop_id") self.bos_id = config.get("bos_id") + self.eos_id = config.get("eos_id") self._newline_piece = config.get("newline_piece") self._mask_pieces = config.get("mask_pieces", []) From 4fcf820ba3b6273db69c1ed7cec833cb9bc77c0a Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 28 Dec 2023 12:07:57 +0200 Subject: [PATCH 3/6] feat: Add newline_id --- ai21_tokenizer/jurassic_tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py index fc1b0b6..30f7af1 100644 --- a/ai21_tokenizer/jurassic_tokenizer.py +++ b/ai21_tokenizer/jurassic_tokenizer.py @@ -33,6 +33,8 @@ def __init__( self.eos_id = config.get("eos_id") self._newline_piece = config.get("newline_piece") + self.newline_id = self.convert_tokens_to_ids(self.newline_piece) + self._mask_pieces = config.get("mask_pieces", []) self._manual_add_dummy_prefix = not (config.get("add_dummy_prefix", True)) From 1a4fe70b1ea8861bc5b323e740e55af3fbad9659 Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 28 Dec 2023 12:14:34 +0200 Subject: [PATCH 4/6] fix: typo "_newline_piece" instead of "newline_piece" --- ai21_tokenizer/jurassic_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py index 30f7af1..85ec85a 100644 --- a/ai21_tokenizer/jurassic_tokenizer.py +++ b/ai21_tokenizer/jurassic_tokenizer.py @@ -33,7 +33,7 @@ def __init__( self.eos_id = config.get("eos_id") self._newline_piece = config.get("newline_piece") - self.newline_id = self.convert_tokens_to_ids(self.newline_piece) + self.newline_id = self.convert_tokens_to_ids(self._newline_piece) self._mask_pieces = config.get("mask_pieces", []) From b59a5d3796ef8a207b3cb69dfe430de3ce73972a Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 28 Dec 2023 12:52:50 +0200 Subject: [PATCH 5/6] fix: newline_id already existed as "private". Just make it "public" --- ai21_tokenizer/jurassic_tokenizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py index 85ec85a..b819eac 100644 --- a/ai21_tokenizer/jurassic_tokenizer.py +++ b/ai21_tokenizer/jurassic_tokenizer.py @@ -33,8 +33,6 @@ def __init__( self.eos_id = config.get("eos_id") self._newline_piece = config.get("newline_piece") - self.newline_id = self.convert_tokens_to_ids(self._newline_piece) - self._mask_pieces = config.get("mask_pieces", []) self._manual_add_dummy_prefix = not (config.get("add_dummy_prefix", True)) @@ -45,7 +43,7 @@ def __init__( self._convert_ids_to_tokens([i for i in range(self.vocab_size) if self._sp.IsControl(i)]) ) - self._newline_id = self._token_to_id(self._newline_piece) + self.newline_id = self._token_to_id(self._newline_piece) self._sample_split = re.compile(r"▁*[^▁]+|▁") self._space_split = re.compile("( {2,})") # Split by 2 or more consecutive spaces From 9c5a507c8471450eb67a162345b8c99f64e19b59 Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 28 Dec 2023 13:00:56 +0200 Subject: [PATCH 6/6] fix: forgot to rename everywhere --- ai21_tokenizer/jurassic_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai21_tokenizer/jurassic_tokenizer.py b/ai21_tokenizer/jurassic_tokenizer.py index b819eac..a718cf1 100644 --- a/ai21_tokenizer/jurassic_tokenizer.py +++ b/ai21_tokenizer/jurassic_tokenizer.py @@ -159,7 +159,7 @@ def encode(self, text: str, **kwargs) -> List[int]: for i, line in enumerate(lines): if i > 0: - toks.append(self._newline_id) + toks.append(self.newline_id) if not line: continue # We add the dummy prefix on every newline, and also for the 1st line if it's a 'start'