Skip to content

Commit

Permalink
deprecate: file mode in load and save (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
Hk669 authored Jun 7, 2024
1 parent c66cab7 commit 9e736da
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 59 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# bpetokenizer

A Byte Pair Encoding (BPE) tokenizer, which algorithmically follows along the GPT tokenizer(tiktoken), allows you to train your own tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization(includes the gpt4 regex pattern). supports `save` and `load` tokenizers in the `json` and `file` format. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.
A Byte Pair Encoding (BPE) tokenizer that algorithmically follows the GPT tokenizer (tiktoken) and allows you to train your own tokenizer. The tokenizer is capable of handling special tokens and uses a customizable regex pattern for tokenization (including the GPT-4 regex pattern). It supports `save` and `load` of tokenizers in the `json` format. The `bpetokenizer` also supports [pretrained](bpetokenizer/pretrained/) tokenizers.


### Overview
Expand Down Expand Up @@ -79,7 +79,7 @@ print(ids)
decode_text = tokenizer.decode(ids)
print(decode_text)

tokenizer.save("sample_bpetokenizer", mode="json") # mode: default is file
tokenizer.save("sample_bpetokenizer", mode="json")
```

Refer to [sample_bpetokenizer](sample/bpetokenizer) to understand the `vocab` and `model` files of the tokenizer trained on the above texts.
Expand Down
62 changes: 7 additions & 55 deletions bpetokenizer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,38 +82,9 @@ def _build_vocab(self) -> dict:
def save(self, file_name, mode="json"):
"""
Writes metadata and vocabulary information to the model and vocab files.
mode: str, default="json" | "file" to save the model and vocab in file format.
mode: str, default="json" to save the model and vocab in json format.
"""
if mode == "file":
model_file = file_name + ".model"
with open(model_file, 'w') as f:
f.write(f"{__version__}\n")
f.write(f"{self.pattern}\n")
f.write(f"{len(self.special_tokens)}\n")
if self.special_tokens:
for special, idx in self.special_tokens.items():
f.write(f"{special} {idx}\n")

for idx1, idx2 in self.merges: # this will give the tokens of pair which are merged
f.write(f"{idx1} {idx2}\n")

vocab_file = file_name + ".vocab"
inverted_merges = {idx: pair for pair, idx in self.merges.items()}
with open(vocab_file, "w", encoding="utf-8") as f:
for idx, token in self.vocab.items():
s = render_token(token)
# find the children of this token, if any
if idx in inverted_merges:
# if this token has children, render it nicely as a merge
idx0, idx1 = inverted_merges[idx]
s0 = render_token(self.vocab[idx0])
s1 = render_token(self.vocab[idx1])
f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
else:
# otherwise this is leaf token, just print it
# (this should just be the first 256 tokens, the bytes)
f.write(f"[{s}] {idx}\n")
elif mode == "json":
if mode == "json":
import json
data = {
"version": __version__,
Expand All @@ -125,35 +96,15 @@ def save(self, file_name, mode="json"):
with open(file_name + ".json", "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=4)
else:
raise ValueError("mode should be either 'file' or 'json'")
raise ValueError("mode should be 'json' only.")


def load(self, file_name, mode="json"):
"""
Load the model and vocab files to the tokenizer.
mode: str, default="json" | "file" to load the model and vocab in file format.
mode: str, default="json" to load the model and vocab in json format.
"""
if mode == "file":
assert file_name.endswith(".model")
merges = {}
special_tokens = {}
idx = 256
with open(file_name, 'r', encoding="utf-8") as f:
self.pattern = f.readline().strip().split()
num_special = int(f.readline().strip()) # no of lines of special_tokens
for _ in range(num_special):
special, idx = f.readline().strip().split()
special_tokens[special] = int(idx)
for line in f:
idx1, idx2 = map(int, line.strip().split())
merges[(idx1, idx2)] = idx
idx += 1

self.merges = merges
self.special_tokens = special_tokens
self.vocab = self._build_vocab()

elif mode == "json":
if mode == "json":
assert file_name.endswith(".json")

import json
Expand All @@ -171,7 +122,8 @@ def load(self, file_name, mode="json"):
vocab = data["vocab"]
self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
self.inverse_vocab = {str(v.decode("utf-8")): k for k, v in self.vocab.items()} if self.vocab else {}

else:
raise ValueError("mode should be 'json' only.")


def encode(self, texts):
Expand Down
4 changes: 2 additions & 2 deletions bpetokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"""

from .base import Tokenizer, get_stats, merge
from .base import Tokenizer, get_stats, merge, render_token, replace_control_characters
import regex as re
import os
import time
Expand Down Expand Up @@ -186,7 +186,7 @@ def decode(self, ids, verbose=False) -> str:
text_bytes = b"".join(part_bytes)
if verbose:
print("---\nText bytes: ", text_bytes)
text = text_bytes.decode("utf-8", errors="replace")
text = render_token(text_bytes)
return text


Expand Down

0 comments on commit 9e736da

Please sign in to comment.