From e5d5e4357aa8f558f9ac0e2287e29cbe919f8564 Mon Sep 17 00:00:00 2001 From: HRUSHIKESH DOKALA <96101829+Hk669@users.noreply.github.com> Date: Wed, 5 Jun 2024 20:28:56 +0530 Subject: [PATCH] feat: from_pretrained enabled with wi17k_base (#6) * feat: from_pretrained enabled with wi17k_base * fix: test_train_bpe_w_special_tokens --- bpetokenizer/base.py | 3 +- .../pretrained/wi17k_base}/wi17k_base.json | 2 +- bpetokenizer/tokenizer.py | 31 ++++++++++++++++--- bpetokenizer/version.py | 2 +- setup.py | 3 ++ tests/test_tokenizer.py | 4 +-- 6 files changed, 35 insertions(+), 10 deletions(-) rename {pretrained => bpetokenizer/pretrained/wi17k_base}/wi17k_base.json (99%) diff --git a/bpetokenizer/base.py b/bpetokenizer/base.py index f88cb2d..f6f7cd8 100644 --- a/bpetokenizer/base.py +++ b/bpetokenizer/base.py @@ -171,6 +171,7 @@ def load(self, file_name, mode="json"): self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()} vocab = data["vocab"] self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()} + self.inverse_vocab = {v.decode("utf-8"): k for k, v in self.vocab.items()} @@ -197,7 +198,7 @@ def decode(self, ids): text = bytes_str.decode("utf-8", errors="replace") return text - def train(self, texts, vocab_size, verbose=False, min_frequency=2): + def train(self, texts, vocab_size, verbose=False, min_frequency=1): """ Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256. 
params: diff --git a/pretrained/wi17k_base.json b/bpetokenizer/pretrained/wi17k_base/wi17k_base.json similarity index 99% rename from pretrained/wi17k_base.json rename to bpetokenizer/pretrained/wi17k_base/wi17k_base.json index 113c462..45fe235 100644 --- a/pretrained/wi17k_base.json +++ b/bpetokenizer/pretrained/wi17k_base/wi17k_base.json @@ -1,5 +1,5 @@ { - "version": "1.0.32", + "version": "1.0.4", "pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", "special_tokens": { "": 17311, diff --git a/bpetokenizer/tokenizer.py b/bpetokenizer/tokenizer.py index adfe829..505eb5e 100644 --- a/bpetokenizer/tokenizer.py +++ b/bpetokenizer/tokenizer.py @@ -16,6 +16,7 @@ from .base import Tokenizer, get_stats, merge import regex as re +import os # from the openai/tiktoken (used in gpt4 tokenizer) GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" # raw string @@ -32,7 +33,22 @@ def __init__(self, pattern=None, special_tokens=None): self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()} - def train(self, texts, vocab_size, verbose=False, min_frequency=2) -> None: + @classmethod + def from_pretrained(cls, + tokenizer_name: str, + verbose=False): + tokenizer = cls() + pretrained_dir = 'bpetokenizer/pretrained' + tokenizer_file = os.path.join(pretrained_dir, tokenizer_name, f'{tokenizer_name}.json') + if verbose: + print(f"loading tokenizer from: {tokenizer_file}") + if not os.path.exists(tokenizer_file): + raise FileNotFoundError(f"tokenizer file not found: {tokenizer_file}. Please check the tokenizer name") + tokenizer.load(tokenizer_file, mode="json") + return tokenizer + + + def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None: """ Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256. 
params: @@ -92,9 +108,12 @@ def encode_ord(self, text) -> list: text_chunks = re.findall(self.compiled_pattern, text) ids = [] for chunk in text_chunks: - _bytes = chunk.encode("utf-8") - chunk_ids = self._encode(_bytes) - ids.extend(chunk_ids) + if chunk in self.vocab: + ids.append(self.vocab[chunk]) + else: + _bytes = chunk.encode("utf-8") + chunk_ids = self._encode(_bytes) + ids.extend(chunk_ids) return ids @@ -164,6 +183,8 @@ def tokens(self, text, verbose=False) -> list: chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids] _tokens.extend(chunk_tokens) if verbose: + print(f"---\nlength: {len(text_chunks)}\n") print(f"---\ntext chunks: {text_chunks}\n") print(f"---\npattern: {self.pattern}\n") - return _tokens \ No newline at end of file + return _tokens + \ No newline at end of file diff --git a/bpetokenizer/version.py b/bpetokenizer/version.py index 3986d08..87a7cf5 100644 --- a/bpetokenizer/version.py +++ b/bpetokenizer/version.py @@ -1 +1 @@ -__version__ = "1.0.32" \ No newline at end of file +__version__ = "1.0.4" \ No newline at end of file diff --git a/setup.py b/setup.py index 574d5b4..60f95e3 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,9 @@ author_email="hrushi669@gmail.com", license="MIT", packages=find_packages(include=["bpetokenizer"]), + package_data={ + 'bpetokenizer': ['pretrained/wi17k_base/wi17k_base.json'], + }, classifiers=[ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index bafea1e..834e64e 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -61,8 +61,8 @@ def test_train_bpe_w_special_tokens(): texts = "<|startoftext|> Hello, World! 
This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>" tokenizer.train(texts, vocab_size=310, verbose=False) - assert len(tokenizer.vocab) == 281 - assert len(tokenizer.merges) == 25 + assert len(tokenizer.vocab) == 310 + assert len(tokenizer.merges) == 54 assert tokenizer.decode(tokenizer.encode(texts)) == texts assert tokenizer.inverse_special_tokens == {v: k for k,v in special_tokens.items()} assert tokenizer.special_tokens == special_tokens