Commit

first

ccdv-ai committed Jan 15, 2024
1 parent ec04885 commit cb3559f
Showing 9 changed files with 342 additions and 1 deletion.
42 changes: 42 additions & 0 deletions .github/workflows/python-publish.yml
@@ -0,0 +1,42 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
release:
types: [published]

permissions:
contents: read

env:
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}

jobs:
deploy:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.8'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade build
pip install twine
- name: Build package
run: |
python -m build
    - name: Upload to PyPI
run: |
twine upload dist/* -u __token__ -p "$PYPI_API_TOKEN"
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
build
tokenizer_adapter.egg-info
__pycache__
*/**/__pycache__
tokenizer_adapter/eval_adapter.py
tokenizer_adapter/test_adapter.py
2 changes: 1 addition & 1 deletion LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright [2024] [Charles Condevaux]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
38 changes: 38 additions & 0 deletions README.md
@@ -0,0 +1,38 @@
# Tokenizer Adapter

A simple tool to adapt a pretrained Hugging Face model to a new, domain-specific vocabulary with (almost) no training. \
Should work with almost all language models from the Hugging Face Hub (needs more testing).

## Install

```
pip install tokenizer-adapter
```

## Usage
```python
from tokenizer_adapter import TokenizerAdapter
from transformers import AutoTokenizer, AutoModelForMaskedLM

BASE_MODEL_PATH = "camembert-base"

# A simple corpus
corpus = ["A first sentence", "A second sentence", "blablabla"]

# Load model and tokenizer
model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

# Train new vocabulary from the old tokenizer
new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=300)

# Default params should work in most cases
adapter = TokenizerAdapter()

# Patch the model with the new tokenizer
model = adapter.adapt_from_pretrained(new_tokenizer, model, tokenizer)

# Save the model and the new tokenizer
model.save_pretrained("my_new_model/")
new_tokenizer.save_pretrained("my_new_model/")
```
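
The merge strategy and an optional token preprocessing step are configurable. A minimal sketch (illustrative values; see the docstrings in `tokenizer_adapter/adapter.py` for details):

```python
from tokenizer_adapter import TokenizerAdapter

# Weight each old sub-token by how often it appears when re-tokenizing the new
# vocabulary, and map metaspaces to a RoBERTa-style separator beforehand.
adapter = TokenizerAdapter(
    method="frequency",  # "average", "bos", "frequency", "reverse_frequency" or "inverse_frequency"
    custom_preprocessing=lambda x: x.replace("▁", "Ġ"),
)
```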
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=61.0.0", "torch>=1.8", "tokenizers>=0.15.0", "tqdm"]
build-backend = 'setuptools.build_meta'
Empty file added requirements
Empty file.
23 changes: 23 additions & 0 deletions setup.cfg
@@ -0,0 +1,23 @@
[metadata]
name = tokenizer-adapter
version = 0.1.0
author = Charles Condevaux
author_email = [email protected]
description = Tools to adapt a pretrained model to a new vocabulary
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/ccdv-ai/tokenizer-adapter
classifiers =
Programming Language :: Python :: 3
    License :: OSI Approved :: Apache Software License
Operating System :: OS Independent

[options]
packages = find:
python_requires = >=3.8
include_package_data = True
install_requires =
torch>=1.8
tokenizers>=0.15.0
tqdm

1 change: 1 addition & 0 deletions tokenizer_adapter/__init__.py
@@ -0,0 +1 @@
from .adapter import *
228 changes: 228 additions & 0 deletions tokenizer_adapter/adapter.py
@@ -0,0 +1,228 @@
import torch
import tqdm
from copy import deepcopy
from math import sqrt
from tokenizers import normalizers
from tokenizers import pre_tokenizers
from tokenizers import decoders

class TokenizerAdapter():

def __init__(self, method="average", clean_tokenizer=False, custom_preprocessing=None) -> None:
"""
        Adapt an existing model to a new tokenizer
Args:
method (`str`, *optional*, defaults to 'average'):
Method to use to merge tokens. In ["average", "bos", "frequency", "reverse_frequency", "inverse_frequency"]
clean_tokenizer (`bool`, *optional*, defaults to False):
Remove the normalizer, the pre_tokenizer and the decoder in the old tokenizer (experimental)
custom_preprocessing (`function`, *optional*, defaults to None):
A custom function to apply some normalization before feeding tokens from the new vocabulary to the old tokenizer.
                Example: replacing a metaspace with a RoBERTa separator, lambda x: x.replace("▁", "Ġ")
"""
assert method in ["average", "bos", "frequency", "reverse_frequency", "inverse_frequency"]
self.method = method
self.process_function = {
"average": self.process_average,
"bos": self.process_bos,
"frequency": self.process_frequency,
"reverse_frequency": self.process_reverse_frequency,
"inverse_frequency": self.process_inverse_frequency
}[self.method]
        self.clean_tokenizer = clean_tokenizer
        self.custom_preprocessing = custom_preprocessing
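
    # A quick construction sketch (hypothetical values, mirroring the docstring):
    #   adapter = TokenizerAdapter(method="frequency",
    #                              custom_preprocessing=lambda x: x.replace("▁", "Ġ"))
    # With method="average", a new token the old tokenizer splits into ["foo", "bar"]
    # receives the mean of the old "foo" and "bar" embedding rows; the other methods
    # replace the mean with a weighted combination (see the process_* methods below).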

def get_state_dict_keys_to_update(self, state_dict, vocab_size):

state_dict_to_update = {}
for key, tensor in state_dict.items():
if vocab_size in tensor.shape:
state_dict_to_update[key] = tensor
return state_dict_to_update
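
    # Example (hypothetical key names): for a checkpoint with vocab_size=32005 and
    # hidden_size=768 this picks up tensors such as "embeddings.word_embeddings.weight"
    # (32005 x 768), a tied or untied LM head weight (32005 x 768 or 768 x 32005) and
    # its bias (32005,), since each has the vocabulary size somewhere in its shape.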

def prepare_correspondance_dict(self, new_tokenizer, old_tokenizer):

vocab_size = len(new_tokenizer.vocab.keys())
old_vocab_size = len(old_tokenizer.vocab.keys())
frequency_matrix = None

unk_token_id = old_tokenizer.unk_token_id
if unk_token_id is None:
unk_token_id = old_tokenizer.pad_token_id
if unk_token_id is None:
unk_token_id = old_tokenizer.eos_token_id

if self.method in ["frequency", "reverse_frequency", "inverse_frequency"]:
frequency_matrix = torch.zeros(old_vocab_size)

correspondance_dict = {"pairs": {}, "meta": {}}

# Loop over the new vocabulary
for new_token, i in tqdm.tqdm(new_tokenizer.vocab.items()):

            # apply the optional user normalization before querying the old vocabulary
            if self.custom_preprocessing is not None:
                new_token = self.custom_preprocessing(new_token)

            old_token_ids = old_tokenizer.convert_tokens_to_ids([new_token])
            # the token does not exist in the old vocabulary
            if len(old_token_ids) == 0 or (len(old_token_ids) == 1 and old_token_ids[0] == unk_token_id):
                # convert the token back to a string and re-tokenize it with the old tokenizer
                new_token = new_tokenizer.convert_tokens_to_string([new_token])
                old_token_ids = old_tokenizer.encode(new_token, add_special_tokens=False)

old_token_ids = [t if t < old_vocab_size else unk_token_id for t in old_token_ids]
correspondance_dict["pairs"][str(i)] = old_token_ids

# Fill frequency matrix
if frequency_matrix is not None and len(old_token_ids) > 1:
for t in old_token_ids:
frequency_matrix[t] += 1

correspondance_dict["meta"]["vocab_size"] = vocab_size
correspondance_dict["meta"]["old_vocab_size"] = old_vocab_size
correspondance_dict["meta"]["frequency_matrix"] = frequency_matrix

correspondance_dict["meta"]["old_bos_token_id"] = old_tokenizer.bos_token_id
correspondance_dict["meta"]["bos_token_id"] = new_tokenizer.bos_token_id
correspondance_dict["meta"]["old_eos_token_id"] = old_tokenizer.eos_token_id
correspondance_dict["meta"]["eos_token_id"] = new_tokenizer.eos_token_id
correspondance_dict["meta"]["old_pad_token_id"] = old_tokenizer.pad_token_id
correspondance_dict["meta"]["pad_token_id"] = new_tokenizer.pad_token_id

return correspondance_dict
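
    # Shape of the returned mapping on a toy example (hypothetical ids):
    #   {"pairs": {"0": [12], "1": [412, 9], ...},  # new token id -> old sub-token ids
    #    "meta": {"vocab_size": 300, "old_vocab_size": 32005,
    #             "frequency_matrix": ..., "bos_token_id": ..., ...}}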

def process_tensors(self, state_dict, correspondance_dict):
vocab_size = correspondance_dict["meta"]["old_vocab_size"]

for tensor_key, tensor in state_dict.items():

print("Processing: ", tensor_key)
do_transpose = False
if len(tensor.size()) > 1 and tensor.size()[-1] == vocab_size:
do_transpose = True
tensor = tensor.T

new_tensor = self.process_single_tensor(tensor, correspondance_dict)
state_dict[tensor_key] = new_tensor.T if do_transpose else new_tensor

return state_dict

def process_single_tensor(self, tensor, correspondance_dict):

vocab_size = correspondance_dict["meta"]["vocab_size"]

if len(tensor.size()) > 1:
new_tensor = torch.zeros(vocab_size, tensor.size()[-1], dtype=tensor.dtype)
else:
new_tensor = torch.zeros(vocab_size, dtype=tensor.dtype)

for new_idx, old_idx in tqdm.tqdm(correspondance_dict["pairs"].items()):
new_idx = int(new_idx)
value = self.process_function(old_idx, tensor, correspondance_dict["meta"])
new_tensor[new_idx] = value
return new_tensor

def process_average(self, old_idx, tensor, meta_dict):
new_tensor = tensor[old_idx].mean(dim=0)
return new_tensor

def process_bos(self, old_idx, tensor, meta_dict):

bos = tensor[meta_dict["old_bos_token_id"]]
new_tensor = tensor[old_idx]
if len(bos.size()) == 0:
bos = bos.unsqueeze(-1)
new_tensor = new_tensor.unsqueeze(-1)
new_tensor = torch.softmax(bos @ new_tensor.T / sqrt(bos.size()[0]), dim=-1) @ new_tensor
return new_tensor
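
    # The "bos" method is a single attention step: with b the old BOS embedding and
    # E the stacked old sub-token rows, it returns softmax(b @ E.T / sqrt(d)) @ E,
    # i.e. a convex combination of the sub-token embeddings weighted by their
    # similarity to the BOS token.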

def process_frequency(self, old_idx, tensor, meta_dict):

frequencies = meta_dict["frequency_matrix"] / meta_dict["frequency_matrix"].sum() + 1e-8
frequencies = frequencies[old_idx]
frequencies = frequencies / frequencies.sum()
new_tensor = tensor[old_idx]

if len(new_tensor.size()) == 1:
new_tensor = new_tensor.unsqueeze(-1)
new_tensor = (new_tensor * frequencies.unsqueeze(-1)).sum(dim=0)
return new_tensor

def process_reverse_frequency(self, old_idx, tensor, meta_dict):

frequencies = meta_dict["frequency_matrix"] / meta_dict["frequency_matrix"].sum() + 1e-8
frequencies = 1 - frequencies[old_idx]
frequencies = frequencies / frequencies.sum()
new_tensor = tensor[old_idx]

if len(new_tensor.size()) == 1:
new_tensor = new_tensor.unsqueeze(-1)
new_tensor = (new_tensor * frequencies.unsqueeze(-1)).sum(dim=0)
return new_tensor

def process_inverse_frequency(self, old_idx, tensor, meta_dict):

frequencies = meta_dict["frequency_matrix"] / meta_dict["frequency_matrix"].sum() + 1e-8
frequencies = 1 / frequencies[old_idx]
frequencies = frequencies / frequencies.sum()
new_tensor = tensor[old_idx]

if len(new_tensor.size()) == 1:
new_tensor = new_tensor.unsqueeze(-1)
new_tensor = (new_tensor * frequencies.unsqueeze(-1)).sum(dim=0)
return new_tensor
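
    # The three frequency variants differ only in the weights applied to the old
    # sub-token rows: w = f ("frequency"), w = 1 - f ("reverse_frequency") and
    # w = 1 / f ("inverse_frequency"), each renormalized to sum to 1, where f is the
    # relative frequency of each sub-token across the re-tokenized new vocabulary.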

    def merge_dict(self, state_dict, state_dict_keys_updated):

        # overwrite the vocabulary-dependent tensors, leave everything else untouched
        for key, value in state_dict_keys_updated.items():
            state_dict[key] = value
        return state_dict

def prepare_new_config(self, config, correspondance_dict):
config.pad_token_id = correspondance_dict["meta"]["pad_token_id"]
config.bos_token_id = correspondance_dict["meta"]["bos_token_id"]
config.eos_token_id = correspondance_dict["meta"]["eos_token_id"]
config.vocab_size = correspondance_dict["meta"]["vocab_size"]
return config

def adapt_from_pretrained(self, new_tokenizer, model, tokenizer, **kwargs):

"""
Adapt a new model from a pretrained model and a pretrained tokenizer
Args:
new_tokenizer (`PreTrainedTokenizer`):
The new tokenizer trained on a specific corpus
model (`PreTrainedModel`):
The pretrained model to modify
tokenizer (`PreTrainedTokenizer`):
The tokenizer of the pretrained model
"""

if self.clean_tokenizer:
tokenizer._tokenizer.normalizer = normalizers.Sequence([])
tokenizer._tokenizer.pre_tokenizer = pre_tokenizers.Sequence([])
tokenizer._tokenizer.decoder = decoders.Sequence([])

with torch.no_grad():
#state_dict = deepcopy(model.state_dict())
state_dict = model.state_dict()
config = deepcopy(model.config)
config_vocab_size = model.config.vocab_size
#del model

# Select keys to update
state_dict_keys_to_update = self.get_state_dict_keys_to_update(state_dict, config_vocab_size)
if len(state_dict_keys_to_update.keys()) == 0:
state_dict_keys_to_update = self.get_state_dict_keys_to_update(state_dict, len(tokenizer.vocab.keys()))

            # Create the correspondence table
correspondance_dict = self.prepare_correspondance_dict(new_tokenizer, tokenizer)

# Update config
config = self.prepare_new_config(config, correspondance_dict)
# Update tensors
state_dict_keys_to_update = self.process_tensors(state_dict_keys_to_update, correspondance_dict)
# Merge in state dict
state_dict = self.merge_dict(state_dict, state_dict_keys_to_update)

model = model.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=state_dict)

return model
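
# Note on the final step (standard transformers behaviour, not specific to this repo):
# calling from_pretrained with pretrained_model_name_or_path=None and an explicit
# config + state_dict rebuilds the architecture with the new vocab_size and loads the
# patched weights, so the resized embedding and output layers line up.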
