
Commit

add custom_preprocessing arg
ccdv-ai committed Jan 17, 2024
1 parent b77aad0 commit a9d3c4f
Showing 3 changed files with 109 additions and 15 deletions.
37 changes: 36 additions & 1 deletion README.md
@@ -1,6 +1,9 @@
# Tokenizer Adapter

A simple tool for adapting a pre-trained Huggingface model to a new vocabulary with (almost) no training. \
A simple tool for adapting a pre-trained Huggingface model to a new vocabulary with (almost) no training.

This technique can significantly reduce sequence lengths when a language model is used on data with a specialized vocabulary (biology, medicine, law, other languages, etc.).

Should work for most Huggingface Hub language models (requires further testing). \
**Everything is run on CPU.**
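
As a rough illustration of the sequence-length point above (not part of this commit; the corpus, example text, and vocabulary size are placeholders), a tokenizer retrained on representative domain text should split specialized terms into far fewer pieces than the base tokenizer:

```python
from transformers import AutoTokenizer

# Base tokenizer and a hypothetical domain corpus (placeholders)
base_tokenizer = AutoTokenizer.from_pretrained("camembert-base")
domain_corpus = ["aspirine paracétamol ibuprofène posologie", "..."]

# Train a new tokenizer on the domain corpus (tiny vocab, just for the example)
domain_tokenizer = base_tokenizer.train_new_from_iterator(domain_corpus, vocab_size=300)

text = "aspirine paracétamol ibuprofène posologie"
print(len(base_tokenizer(text).input_ids))    # many sub-word pieces
print(len(domain_tokenizer(text).input_ids))  # fewer pieces, domain words stay (mostly) whole
```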

@@ -36,6 +39,38 @@ adapter = TokenizerAdapter()
# Patch the model with the new tokenizer
model = adapter.adapt_from_pretrained(new_tokenizer, model, tokenizer)

# Save the model and the new tokenizer
model.save_pretrained("my_new_model/")
new_tokenizer.save_pretrained("my_new_model/")
```
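
Not shown in the diff, but once saved, the adapted model and tokenizer should reload like any other Huggingface checkpoint; the masked-LM auto class here is an assumption based on the CamemBERT example below:

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Reload the adapted model and its new tokenizer from the saved directory
model = AutoModelForMaskedLM.from_pretrained("my_new_model/")
tokenizer = AutoTokenizer.from_pretrained("my_new_model/")
```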

If you want to use a custom or different style of tokenizer (**experimental**), you may need the `custom_preprocessing` argument. \
Example using a RoBERTa-style tokenizer (similar to Phi-2's) with a CamemBERT model:

```python
from tokenizer_adapter import TokenizerAdapter
from transformers import AutoTokenizer, AutoModelForMaskedLM

BASE_MODEL_PATH = "camembert-base"
NEW_CUSTOM_TOKENIZER = "roberta-base"

# A simple corpus
corpus = ["A first sentence", "A second sentence", "blablabla"]

# Load model and tokenizer
model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

# Load the custom-style tokenizer and train the new one from it
new_tokenizer = AutoTokenizer.from_pretrained(NEW_CUSTOM_TOKENIZER)
new_tokenizer = new_tokenizer.train_new_from_iterator(corpus, vocab_size=300)

# CamemBERT tokenizer relies on '▁' while the RoBERTa one relies on 'Ġ'
adapter = TokenizerAdapter(custom_preprocessing=lambda x: x.replace('Ġ', ''))

# Patch the model with the new tokenizer
model = adapter.adapt_from_pretrained(new_tokenizer, model, tokenizer)

# Save the model and the new tokenizer
model.save_pretrained("my_new_model/")
new_tokenizer.save_pretrained("my_new_model/")
```
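
Not part of the commit, but to make the `custom_preprocessing` step concrete: before a new-vocabulary token is looked up in the old vocabulary, the adapter passes it through this function, so a RoBERTa-style token such as `'Ġsentence'` (a hypothetical example) becomes a plain string the CamemBERT tokenizer can resolve. Reusing `tokenizer` (the CamemBERT one) from the example above:

```python
preprocess = lambda x: x.replace('Ġ', '')

new_token = "Ġsentence"         # RoBERTa-style token from the new vocabulary (illustrative)
lookup = preprocess(new_token)  # -> "sentence"

# First step of what the adapter does internally: look the preprocessed string up
# in the old (CamemBERT) vocabulary; adapter.py (shown below) falls back to
# re-encoding the detokenized string when this lookup only returns the unk id.
old_ids = tokenizer.convert_tokens_to_ids([lookup])
print(old_ids)
```
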
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = tokenizer-adapter
version = 0.1.1
version = 0.1.2
author = Charles Condevaux
author_email = [email protected]
description = A simple tool to adapt a pretrained language model to a new vocabulary
85 changes: 72 additions & 13 deletions tokenizer_adapter/adapter.py
@@ -11,14 +11,17 @@ class TokenizerAdapter():
def __init__(self, method="average", clean_tokenizer=False, custom_preprocessing=None) -> None:
"""
Adapt an existing model with a new tokenizer
Args:
method (`str`, *optional*, defaults to 'average'):
Method to use to merge tokens. In ["average", "bos", "frequency", "reverse_frequency", "inverse_frequency"]
Method to use to merge tokens. In `['average', 'bos', 'frequency', 'reverse_frequency', 'inverse_frequency']`
clean_tokenizer (`bool`, *optional*, defaults to False):
Remove the normalizer, the pre_tokenizer and the decoder in the old tokenizer (experimental)
custom_preprocessing (`function`, *optional*, defaults to None):
A custom function to apply some normalization before feeding tokens from the new vocabulary to the old tokenizer.
Example replacing a metaspace by a RoBERTa separator: lambda x: x.replace("▁", "Ġ")
A function applied to each token of the new vocabulary before it is fed to the old tokenizer to look up its ids.
Example, using a RoBERTa-style ('Ġ') tokenizer with a Llama-style ('▁' metaspace) model:
`custom_preprocessing=lambda x: x.replace('Ġ', '▁')`
"""
assert method in ["average", "bos", "frequency", "reverse_frequency", "inverse_frequency"]
self.method = method
@@ -30,6 +33,7 @@ def __init__(self, method="average", clean_tokenizer=False, custom_preprocessing
"inverse_frequency": self.process_inverse_frequency
}[self.method]
self.clean_tokenizer = clean_tokenizer
self.custom_preprocessing = custom_preprocessing

def get_state_dict_keys_to_update(self, state_dict, vocab_size):

@@ -39,41 +43,93 @@ def get_state_dict_keys_to_update(self, state_dict, vocab_size):
state_dict_to_update[key] = tensor
return state_dict_to_update

def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer):
def get_unk_token_id(self, old_tokenizer):

vocab_size = len(new_tokenizer.vocab.keys())
old_vocab_size = len(old_tokenizer.vocab.keys())
frequency_matrix = None

unk_token_id = old_tokenizer.unk_token_id
if unk_token_id is None:
unk_token_id = old_tokenizer.pad_token_id
if unk_token_id is None:
unk_token_id = old_tokenizer.eos_token_id
if unk_token_id is None:
unk_token_id = old_tokenizer.bos_token_id
return unk_token_id

def prepare_special_token_ids(self, correspondance_dict, new_tokenizer, old_tokenizer, unk_token_id):

if new_tokenizer.bos_token_id is not None:
correspondance_dict["pairs"][str(new_tokenizer.bos_token_id)] = [
old_tokenizer.bos_token_id if old_tokenizer.bos_token_id is not None else unk_token_id]

if new_tokenizer.eos_token_id is not None:
correspondance_dict["pairs"][str(new_tokenizer.eos_token_id)] = [
old_tokenizer.eos_token_id if old_tokenizer.eos_token_id is not None else unk_token_id]

if new_tokenizer.pad_token_id is not None:
correspondance_dict["pairs"][str(new_tokenizer.pad_token_id)] = [
old_tokenizer.pad_token_id if old_tokenizer.pad_token_id is not None else unk_token_id]

if new_tokenizer.sep_token_id is not None:
correspondance_dict["pairs"][str(new_tokenizer.sep_token_id)] = [
old_tokenizer.sep_token_id if old_tokenizer.sep_token_id is not None else unk_token_id]

if new_tokenizer.unk_token_id is not None:
correspondance_dict["pairs"][str(new_tokenizer.unk_token_id)] = [
old_tokenizer.unk_token_id if old_tokenizer.unk_token_id is not None else unk_token_id]

if new_tokenizer.mask_token_id is not None:
correspondance_dict["pairs"][str(new_tokenizer.mask_token_id)] = [
old_tokenizer.mask_token_id if old_tokenizer.mask_token_id is not None else unk_token_id]

return correspondance_dict

def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer):

vocab_size = len(new_tokenizer.vocab.keys())
old_vocab_size = len(old_tokenizer.vocab.keys())
frequency_matrix = None

unk_token_id = self.get_unk_token_id(old_tokenizer)

# Track old-token frequencies when using a frequency-based method
if self.method in ["frequency", "reverse_frequency", "inverse_frequency"]:
frequency_matrix = torch.zeros(old_vocab_size)

correspondance_dict = {"pairs": {}, "meta": {}}

# Loop over the new vocabulary
for new_token, i in tqdm.tqdm(new_tokenizer.vocab.items()):

old_token_ids = old_tokenizer.convert_tokens_to_ids([new_token])
# if token doesnt exist in old vocab

# Apply custom preprocessing, if any, to adapt the token to the old tokenizer
if self.custom_preprocessing is not None:
old_token_ids = old_tokenizer.convert_tokens_to_ids([self.custom_preprocessing(new_token)])
else:
# Try to find the token in the old tokenizer
old_token_ids = old_tokenizer.convert_tokens_to_ids([new_token])

# If the token doesn't exist in the old vocab
if len(old_token_ids) == 0 or (len(old_token_ids) == 1 and old_token_ids[0] == unk_token_id):
# untokenize new_token
# Detokenize new_token
new_token = new_tokenizer.convert_tokens_to_string([new_token])
old_token_ids = old_tokenizer.encode(new_token, add_special_tokens=False)

# Get old ids
old_token_ids = old_tokenizer.encode(new_token, add_special_tokens=False)

# Replace out-of-range ids with the unk id
old_token_ids = [t if t < old_vocab_size else unk_token_id for t in old_token_ids]
if len(old_token_ids) == 0:
old_token_ids = [unk_token_id]

# Add pair
correspondance_dict["pairs"][str(i)] = old_token_ids

# Fill frequency matrix
if frequency_matrix is not None and len(old_token_ids) > 1:
for t in old_token_ids:
frequency_matrix[t] += 1

# Process special tokens
correspondance_dict = self.prepare_special_token_ids(correspondance_dict, new_tokenizer, old_tokenizer, unk_token_id)

correspondance_dict["meta"]["vocab_size"] = vocab_size
correspondance_dict["meta"]["old_vocab_size"] = old_vocab_size
correspondance_dict["meta"]["frequency_matrix"] = frequency_matrix
@@ -186,13 +242,16 @@ def adapt_from_pretrained(self, new_tokenizer, model, tokenizer, **kwargs):

"""
Adapt a new model from a pretrained model and a pretrained tokenizer
Args:
new_tokenizer (`PreTrainedTokenizer`):
The new tokenizer trained on a specific corpus
model (`PreTrainedModel`):
The pretrained model to modify
tokenizer (`PreTrainedTokenizer`):
The tokenizer of the pretrained model
Returns: `PreTrainedModel`
"""

if self.clean_tokenizer:
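
For readers skimming the adapter.py diff above: the adapter first builds `correspondance_dict["pairs"]`, mapping each new-vocabulary id to one or more old-vocabulary ids, and the `method` argument then decides how the matching old embedding rows are merged into a single new row. A rough sketch of the default `'average'` case (illustrative only, with a hypothetical helper name, not the package's actual implementation):

```python
import torch

def merge_embeddings_average(old_embeddings: torch.Tensor, pairs: dict) -> torch.Tensor:
    """Average the old embedding rows mapped to each new id.

    `pairs` is assumed to map every new id (as a string) in [0, new_vocab_size)
    to a non-empty list of old-vocabulary ids, as built by the adapter.
    """
    new_vocab_size = len(pairs)
    hidden_size = old_embeddings.size(1)
    new_embeddings = torch.zeros(new_vocab_size, hidden_size)
    for new_id, old_ids in pairs.items():
        # One new row = mean of the old rows it corresponds to
        new_embeddings[int(new_id)] = old_embeddings[old_ids].mean(dim=0)
    return new_embeddings

# Toy usage: 2 new tokens, one mapped to a single old id, one to a pair of old ids
old_emb = torch.randn(10, 4)
pairs = {"0": [7], "1": [2, 5]}
print(merge_embeddings_average(old_emb, pairs).shape)  # torch.Size([2, 4])
```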