diff --git a/README.md b/README.md
index 8e0bc80..0ea0930 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # bpetokenizer
 
-A Byte Pair Encoding (BPE) tokenizer, which algorithmically follows along the GPT tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization(includes the gpt4 regex pattern). supports `save` and `load` tokenizers in the `json` and `file` format.
+A Byte Pair Encoding (BPE) tokenizer that algorithmically follows the GPT tokenizer (tiktoken) and lets you train your own tokenizer. The tokenizer handles special tokens and uses a customizable regex pattern for tokenization (the gpt4 regex pattern is included). Supports `save` and `load` of tokenizers in the `json` and `file` formats. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.
 
 ### Overview
 
@@ -31,7 +31,7 @@ Every LLM(LLama, Gemini, Mistral..) use their own Tokenizers trained on their ow
 
 2. [BPETokenizer](bpetokenizer/tokenizer.py): This class emphasizes the real power of the tokenizer(used in gpt4 tokenizer..[tiktoken](https://github.com/openai/tiktoken)), uses the `GPT4_SPLIT_PATTERN` to split the text as mentioned in the gpt4 tokenizer. also handles the `special_tokens` (refer [sample_bpetokenizer](sample/bpetokenizer/sample_bpetokenizer.py)). which inherits the `save` and `load` functionlities to save and load the tokenizer respectively.
 
-3. [PreTrained Tokenizer](pretrained/wi17k_base.json): PreTrained Tokenizer wi17k_base, has a 17316 vocabulary. trained with the wikitext dataset (len: 1000000). with 6 special_tokens.
+3. [PreTrained Tokenizer](bpetokenizer/pretrained/wi17k_base): The pretrained tokenizer wi17k_base has a vocabulary of 17316, trained on the wikitext dataset (len: 1000000) with 6 special_tokens.
 
 ### Usage
 
@@ -121,6 +121,28 @@ print("tokens: ", tokens)
 ```
 refer to the [load_json_vocab](sample/load_json_vocab/) and run the `bpetokenizer_json` to get an overview of `vocab`, `merges`, `special_tokens` and to view the tokens that are split by the tokenizer using pattern, look at [tokens](sample/load_json_vocab/tokens.py)
 
+
+#### To load the pretrained tokenizers
+
+```py
+from bpetokenizer import BPETokenizer
+
+tokenizer = BPETokenizer.from_pretrained("wi17k_base", verbose=True)
+
+texts = """
+def get_stats(tokens, counts=None) -> dict:
+    "Get statistics of the tokens. Includes the frequency of each consecutive pair of tokens"
+    counts = {} if counts is None else counts
+    for pair in zip(tokens, tokens[1:]):
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+"""
+tokenizer.tokens(texts, verbose=True)
+
+```
+For now, we only have a single 17k-vocab pretrained tokenizer, `wi17k_base`, at [pretrained](/bpetokenizer/pretrained/).
+
+
 ### Run Tests
 
 the tests folder `tests/` include the tests of the tokenizer, uses pytest.
@@ -138,7 +160,7 @@ Contributions to the BPE Tokenizer are most welcomed! If you would like to contr
 
 - Star and Fork the repository.
 - Create a new branch (git checkout -b feature/your-feature).
-- Commit your changes (git commit -am 'Add some feature').
+- Commit your changes (git commit -m 'Add some feature').
 - Push to the branch (git push origin feature/your-feature).
 - Create a new Pull Request.
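Since the README changes above emphasize training your own tokenizer alongside loading the pretrained `wi17k_base`, a minimal training sketch may help reviewers. It relies only on the `train` signature visible later in this diff (`train(texts, vocab_size, verbose=False, min_frequency=1)`); the default split pattern, the `save("my_tokenizer", mode="json")` call, and the output name are assumptions (mirroring `load(file_name, mode="json")` in `base.py`), not confirmed API.

```py
from bpetokenizer import BPETokenizer

# A tiny illustrative corpus; the shipped wi17k_base tokenizer was trained on wikitext.
texts = "Byte Pair Encoding merges the most frequent pair of tokens, step by step, until the target vocab size is reached."

# Assumed to default to the GPT-4 split pattern, as the README describes.
tokenizer = BPETokenizer()

# vocab_size must be >= 256 (the raw byte alphabet); 260 means just 4 merges here.
tokenizer.train(texts, vocab_size=260, verbose=True, min_frequency=1)

# Assumed to mirror load(file_name, mode="json"); adjust if save() differs.
tokenizer.save("my_tokenizer", mode="json")
```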
diff --git a/bpetokenizer/base.py b/bpetokenizer/base.py
index e0d8123..01b5a0a 100644
--- a/bpetokenizer/base.py
+++ b/bpetokenizer/base.py
@@ -67,6 +67,7 @@ def __init__(self, special_tokens=None):
         self.compiled_pattern = re.compile(self.pattern) if self.pattern else ""
         self.special_tokens = special_tokens if special_tokens else {}
         self.vocab = self._build_vocab() if self.merges else {}
+        self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}
 
     def _build_vocab(self) -> dict:
         """Build the vocab from the merges and special tokens. This will be used to encode/decode the tokens."""
@@ -169,7 +170,7 @@ def load(self, file_name, mode="json"):
             self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
             vocab = data["vocab"]
             self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
-            self.inverse_vocab = {v.decode("utf-8"): k for k, v in self.vocab.items()}
+            self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}
diff --git a/bpetokenizer/pretrained/wi17k_base/wi17k_base.json b/bpetokenizer/pretrained/wi17k_base/wi17k_base.json
index 45fe235..aca5ff9 100644
--- a/bpetokenizer/pretrained/wi17k_base/wi17k_base.json
+++ b/bpetokenizer/pretrained/wi17k_base/wi17k_base.json
@@ -2,12 +2,13 @@
     "version": "1.0.4",
     "pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
     "special_tokens": {
-        "": 17311,
-        "": 17312,
-        "": 17313,
-        "": 17314,
+        "": 17317,
+        "": 17318,
+        "": 17319,
+        "": 17320,
         "<|startoftext|>": 17315,
-        "<|endoftext|>": 17316
+        "<|endoftext|>": 17316,
+        "\n": 17317
     },
     "merges": {
         "(32, 116)": 256,
@@ -18091,7 +18092,7 @@
         "1021": " pers",
         "1022": "pect",
         "1023": " mov",
-        "1024": " def",
+        "1024": "def",
         "1025": "view",
         "1026": " several",
         "1027": "ros",
@@ -34377,6 +34378,8 @@
         "17307": " Lourinh�",
         "17308": " Lourinhã",
         "17309": " differs",
-        "17310": " allosaurid"
+        "17311": " def",
+        "17312": "_stats",
+        "17313": " get"
     }
 }
\ No newline at end of file
diff --git a/bpetokenizer/tokenizer.py b/bpetokenizer/tokenizer.py
index 688adf6..7d7be60 100644
--- a/bpetokenizer/tokenizer.py
+++ b/bpetokenizer/tokenizer.py
@@ -46,6 +46,9 @@ def from_pretrained(cls,
         if not os.path.exists(tokenizer_file):
             raise FileNotFoundError(f"tokenizer file not found: {tokenizer_file}. Please check the tokenizer name")
         tokenizer.load(tokenizer_file, mode="json")
+        if verbose:
+            print('---\nSpecial tokens: ', tokenizer.special_tokens)
+            print('---\nLength of Vocab: ', len(tokenizer.vocab))
         return tokenizer
 
 
@@ -60,7 +63,8 @@ def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None:
         """
         assert vocab_size >= 256
         num_merges = vocab_size - 256
-
+        assert num_merges > 0
+
         text_chunks = re.findall(self.compiled_pattern, texts) # handles the desired pattern of tokens with regex pattern
 
         ids = [list(tokens.encode("utf-8")) for tokens in text_chunks] # List[List[int]]
@@ -119,6 +123,8 @@ def encode_ord(self, text) -> list:
         for chunk in text_chunks:
             if chunk in self.vocab:
                 ids.append(self.vocab[chunk])
+            elif chunk in self.special_tokens:
+                ids.append(self.special_tokens[chunk])
             else:
                 _bytes = chunk.encode("utf-8")
                 chunk_ids = self._encode(_bytes)
@@ -143,19 +149,18 @@ def encode(self, text, special_tokens="none") -> list:
             assert all(token not in text for token in self.special_tokens)
         else:
             raise ValueError(f"invalid special tokens argument: {special_tokens}")
+
-        if not special:
-            return self.encode_ord(text)
-
-        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
-        text_chunks = re.split(special_pattern, text)
+        text_chunks = re.findall(self.compiled_pattern, text)
         ids = []
         for chunk in text_chunks:
-            if chunk in special:
-                ids.append(special[chunk])
+            if chunk in self.inverse_vocab:
+                ids.append(self.inverse_vocab[chunk])
+            elif chunk in self.special_tokens:
+                ids.append(self.special_tokens[chunk])
             else:
-                chunkids = self._encode(chunk.encode("utf-8"))
-                ids.extend(chunkids)
+                chunk_ids = self._encode(chunk.encode("utf-8"))
+                ids.extend(chunk_ids)
         return ids
@@ -184,16 +189,11 @@ def _special_tokens(self, special_tokens) -> None:
 
     def tokens(self, text, verbose=False) -> list:
         text_chunks = re.findall(self.compiled_pattern, text)
-
-        _tokens = []
-        for chunk in text_chunks:
-            _bytes = chunk.encode("utf-8")
-            chunk_ids = self._encode(_bytes)
-            chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
-            _tokens.extend(chunk_tokens)
+        ids = self.encode(text, special_tokens="all")
         if verbose:
-            print(f"---\nlength: {len(text_chunks)}\n")
-            print(f"---\ntext chunks: {text_chunks}\n")
-            print(f"---\npattern: {self.pattern}\n")
-        return _tokens
+            print(f"---\nText chunks: {text_chunks}\n")
+            print(f"---\nLength Text chunks: {len(text_chunks)}\n")
+            print(f"---\nIDs: {ids}")
+            print(f"---\nLength: {len(ids)}\n")
+        return ids
\ No newline at end of file
diff --git a/bpetokenizer/version.py b/bpetokenizer/version.py
index 4a2bfa8..42cf7cd 100644
--- a/bpetokenizer/version.py
+++ b/bpetokenizer/version.py
@@ -1 +1 @@
-__version__ = "1.2.0"
\ No newline at end of file
+__version__ = "1.2.1"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 60f95e3..f80d535 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ setup(
     name="bpetokenizer",
     version=__version__,
-    description="Byte Pair Encoding Tokenizer with special tokens and regex pattern",
+    description="A Byte Pair Encoding (BPE) tokenizer that algorithmically follows the GPT tokenizer (tiktoken) and lets you train your own tokenizer. The tokenizer handles special tokens and uses a customizable regex pattern for tokenization (the gpt4 regex pattern is included). Supports `save` and `load` of tokenizers in the `json` and `file` formats. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/Hk669/bpetokenizer",
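A closing note for reviewers on the `encode` rework in `bpetokenizer/tokenizer.py` above: the method no longer splits the input on a special-token pattern; it regex-chunks the whole text, resolves each chunk against `inverse_vocab`, then `special_tokens`, and only then falls back to byte-level BPE merges. The sketch below restates that lookup order in a self-contained form; the function and parameter names are illustrative stand-ins, not part of the package's API.

```py
import regex as re  # the tokenizer's split pattern needs the `regex` module (\p{L}, \p{N}, ...)

def encode_sketch(text, pattern, inverse_vocab, special_tokens, bpe_merge_bytes):
    """Illustrative only: mirrors the chunk-resolution order of the revised encode()."""
    ids = []
    for chunk in re.findall(pattern, text):
        if chunk in inverse_vocab:        # whole chunk is already a single known token
            ids.append(inverse_vocab[chunk])
        elif chunk in special_tokens:     # chunk exactly matches a registered special token
            ids.append(special_tokens[chunk])
        else:                             # fall back to byte-level BPE merges on the chunk
            ids.extend(bpe_merge_bytes(chunk.encode("utf-8")))
    return ids
```

Whole-chunk vocabulary hits take priority, so a chunk that already exists as a single token is emitted in one step before any byte-level merging.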