From a9d3c4f74ac98b6a21cad205647b361730599b4f Mon Sep 17 00:00:00 2001
From: ccdv-ai
Date: Wed, 17 Jan 2024 14:59:28 +0000
Subject: [PATCH] add custom_preprocessing arg

---
 README.md                    | 37 +++++++++++++++-
 setup.cfg                    |  2 +-
 tokenizer_adapter/adapter.py | 85 ++++++++++++++++++++++++++++++------
 3 files changed, 109 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 856ca6e..1a9148a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,9 @@
 # Tokenizer Adapter
 
-A simple tool for adapting a pre-trained Huggingface model to a new vocabulary with (almost) no training. \
+A simple tool for adapting a pre-trained Huggingface model to a new vocabulary with (almost) no training.
+
+This technique can significantly reduce sequence length when a language model is used on data with a specific vocabulary (biology, medicine, law, other languages, etc.).
+
 Should work for most Huggingface Hub language models (requires further testing). \
 **Everything is run on CPU.**
 
@@ -36,6 +39,38 @@ adapter = TokenizerAdapter()
 # Patch the model with the new tokenizer
 model = adapter.adapt_from_pretrained(new_tokenizer, model, tokenizer)
 
+# Save the model and the new tokenizer
+model.save_pretrained("my_new_model/")
+new_tokenizer.save_pretrained("my_new_model/")
+```
+
+If you want to use a custom/different tokenizer (**experimental**), you may need to use the `custom_preprocessing` argument. \
+Example using a RoBERTa-style tokenizer (similar to Phi-2's) for a CamemBERT model:
+
+```python
+from tokenizer_adapter import TokenizerAdapter
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+BASE_MODEL_PATH = "camembert-base"
+NEW_CUSTOM_TOKENIZER = "roberta-base"
+
+# A simple corpus
+corpus = ["A first sentence", "A second sentence", "blablabla"]
+
+# Load model and tokenizer
+model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL_PATH)
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
+
+# Load the custom tokenizer and train the new one from it
+new_tokenizer = AutoTokenizer.from_pretrained(NEW_CUSTOM_TOKENIZER)
+new_tokenizer = new_tokenizer.train_new_from_iterator(corpus, vocab_size=300)
+
+# The CamemBERT tokenizer relies on '▁' while the RoBERTa one relies on 'Ġ'
+adapter = TokenizerAdapter(custom_preprocessing=lambda x: x.replace('Ġ', '▁'))
+
+# Patch the model with the new tokenizer
+model = adapter.adapt_from_pretrained(new_tokenizer, model, tokenizer)
+
 # Save the model and the new tokenizer
 model.save_pretrained("my_new_model/")
 new_tokenizer.save_pretrained("my_new_model/")
diff --git a/setup.cfg b/setup.cfg
index 5e0ec1f..0d28669 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = tokenizer-adapter
-version = 0.1.1
+version = 0.1.2
 author = Charles Condevaux
 author_email = charles.condevaux@gmail.com
 description = A simple to adapt a pretrained language model to a new vocabulary
diff --git a/tokenizer_adapter/adapter.py b/tokenizer_adapter/adapter.py
index 5ec8dbe..06781b6 100644
--- a/tokenizer_adapter/adapter.py
+++ b/tokenizer_adapter/adapter.py
@@ -11,14 +11,17 @@ class TokenizerAdapter():
     def __init__(self, method="average", clean_tokenizer=False, custom_preprocessing=None) -> None:
         """
         Adapter an existing model with a new tokenizer
+
         Args:
             method (`str`, *optional*, defaults to 'average'):
-                Method to use to merge tokens. In ["average", "bos", "frequency", "reverse_frequency", "inverse_frequency"]
+                Method to use to merge tokens. One of `['average', 'bos', 'frequency', 'reverse_frequency', 'inverse_frequency']`
             clean_tokenizer (`bool`, *optional*, defaults to False):
                 Remove the normalizer, the pre_tokenizer and the decoder in the old tokenizer (experimental)
             custom_preprocessing (`function`, *optional*, defaults to None):
-                A custom function to apply some normalization before feeding tokens from the new vocabulary to the old tokenizer.
-                Example replacing a metaspace by a RoBERTa separator: lambda x: x.replace("▁", "Ġ")
+                A function that normalizes tokens from the new vocabulary before they are fed to the old tokenizer to look up their ids.
+
+                Example when replacing a Llama-style tokenizer with a RoBERTa-style one:
+                `custom_preprocessing=lambda x: x.replace('Ġ', '▁')`
         """
         assert method in ["average", "bos", "frequency", "reverse_frequency", "inverse_frequency"]
         self.method = method
@@ -30,6 +33,7 @@ def __init__(self, method="average", clean_tokenizer=False, custom_preprocessing
             "inverse_frequency": self.process_inverse_frequency
             }[self.method]
         self.clean_tokenizer = clean_tokenizer
+        self.custom_preprocessing = custom_preprocessing
 
     def get_state_dict_keys_to_update(self, state_dict, vocab_size):
 
@@ -39,18 +43,54 @@ def get_state_dict_keys_to_update(self, state_dict, vocab_size):
                 state_dict_to_update[key] = tensor
         return state_dict_to_update
 
-    def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer):
+    def get_unk_token_id(self, old_tokenizer):
 
-        vocab_size = len(new_tokenizer.vocab.keys())
-        old_vocab_size = len(old_tokenizer.vocab.keys())
-        frequency_matrix = None
-
         unk_token_id = old_tokenizer.unk_token_id
         if unk_token_id is None:
             unk_token_id = old_tokenizer.pad_token_id
         if unk_token_id is None:
             unk_token_id = old_tokenizer.eos_token_id
+        if unk_token_id is None:
+            unk_token_id = old_tokenizer.bos_token_id
+        return unk_token_id
+
+    def prepare_special_token_ids(self, correspondance_dict, new_tokenizer, old_tokenizer, unk_token_id):
+
+        if new_tokenizer.bos_token_id is not None:
+            correspondance_dict["pairs"][str(new_tokenizer.bos_token_id)] = [
+                old_tokenizer.bos_token_id if old_tokenizer.bos_token_id is not None else unk_token_id]
+
+        if new_tokenizer.eos_token_id is not None:
+            correspondance_dict["pairs"][str(new_tokenizer.eos_token_id)] = [
+                old_tokenizer.eos_token_id if old_tokenizer.eos_token_id is not None else unk_token_id]
+
+        if new_tokenizer.pad_token_id is not None:
+            correspondance_dict["pairs"][str(new_tokenizer.pad_token_id)] = [
+                old_tokenizer.pad_token_id if old_tokenizer.pad_token_id is not None else unk_token_id]
+
+        if new_tokenizer.sep_token_id is not None:
+            correspondance_dict["pairs"][str(new_tokenizer.sep_token_id)] = [
+                old_tokenizer.sep_token_id if old_tokenizer.sep_token_id is not None else unk_token_id]
+
+        if new_tokenizer.unk_token_id is not None:
+            correspondance_dict["pairs"][str(new_tokenizer.unk_token_id)] = [
+                old_tokenizer.unk_token_id if old_tokenizer.unk_token_id is not None else unk_token_id]
+
+        if new_tokenizer.mask_token_id is not None:
+            correspondance_dict["pairs"][str(new_tokenizer.mask_token_id)] = [
+                old_tokenizer.mask_token_id if old_tokenizer.mask_token_id is not None else unk_token_id]
+
+        return correspondance_dict
+
+    def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer):
+
+        vocab_size = len(new_tokenizer.vocab.keys())
+        old_vocab_size = len(old_tokenizer.vocab.keys())
+        frequency_matrix = None
+
+        unk_token_id = self.get_unk_token_id(old_tokenizer)
+
         # Keep track if using 'frequency' method
         if self.method in ["frequency", "reverse_frequency", "inverse_frequency"]:
"reverse_frequency", "inverse_frequency"]: frequency_matrix = torch.zeros(old_vocab_size) @@ -58,15 +98,28 @@ def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer): # Loop over the new vocabulary for new_token, i in tqdm.tqdm(new_tokenizer.vocab.items()): - - old_token_ids = old_tokenizer.convert_tokens_to_ids([new_token]) - # if token doesnt exist in old vocab + + # Do custom preprocessing if any before to adapt to the old tokenizer + if self.custom_preprocessing is not None: + old_token_ids = old_tokenizer.convert_tokens_to_ids([self.custom_preprocessing(new_token)]) + else: + # Try to find the token in the old tokenizer + old_token_ids = old_tokenizer.convert_tokens_to_ids([new_token]) + + # If token doesnt exist in old vocab if len(old_token_ids) == 0 or (len(old_token_ids) == 1 and old_token_ids[0] == unk_token_id): - # untokenize new_token + # Detokenize new_token new_token = new_tokenizer.convert_tokens_to_string([new_token]) - old_token_ids = old_tokenizer.encode(new_token, add_special_tokens=False) + # Get old ids + old_token_ids = old_tokenizer.encode(new_token, add_special_tokens=False) + + # Remove unk ids old_token_ids = [t if t < old_vocab_size else unk_token_id for t in old_token_ids] + if len(old_token_ids) == 0: + old_token_ids = [unk_token_id] + + # Add pair correspondance_dict["pairs"][str(i)] = old_token_ids # Fill frequency matrix @@ -74,6 +127,9 @@ def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer): for t in old_token_ids: frequency_matrix[t] += 1 + # Process special tokens + correspondance_dict = self.prepare_special_token_ids(correspondance_dict, new_tokenizer, old_tokenizer, unk_token_id) + correspondance_dict["meta"]["vocab_size"] = vocab_size correspondance_dict["meta"]["old_vocab_size"] = old_vocab_size correspondance_dict["meta"]["frequency_matrix"] = frequency_matrix @@ -186,6 +242,7 @@ def adapt_from_pretrained(self, new_tokenizer, model, tokenizer, **kwargs): """ Adapt a new model from a pretrained model and a pretrained tokenizer + Args: new_tokenizer (`PreTrainedTokenizer`): The new tokenizer trained on a specific corpus @@ -193,6 +250,8 @@ def adapt_from_pretrained(self, new_tokenizer, model, tokenizer, **kwargs): The pretrained model to modify tokenizer (`PreTrainedTokenizer`): The tokenizer of the pretrained model + + Returns: `PreTrainedModel` """ if self.clean_tokenizer: