Skip to content

Commit

Permalink
Merge pull request #248 from CaptainVee/refactor-sentence-encoder
Browse files Browse the repository at this point in the history
refactor: modified the sentence encoder to tokenize a text before encoding it
  • Loading branch information
heffernankevin authored Sep 22, 2023
2 parents 94bc7aa + 0858676 commit 51b4293
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 7 deletions.
17 changes: 15 additions & 2 deletions laser_encoders/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ encoder = initialize_encoder(lang="igbo")
embeddings = encoder.encode_sentences([tokenized_sentence])
```

When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it:
```py
encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True)
embeddings = encoder("nnọọ, kedu ka ị mere")
```
> Setting the `spm` flag to `True` tells the encoder to also download the accompanying SPM model.
**Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo").

## Downloading the pre-trained models
Expand All @@ -61,13 +68,19 @@ python -m laser_encoders.download_models --model-dir=path/to/model/directory

> For a comprehensive list of available arguments, you can use the `--help` command with the download_models script.
Once you have successfully downloaded the models, you can utilize the `LaserTokenizer` to tokenize text in your desired language. Here's an example of how you can achieve this:
Once you have successfully downloaded the models, you can utilize the `SentenceEncoder` to tokenize and encode your text in your desired language. Here's an example of how you can achieve this:

```py
from laser_encoders.laser_tokenizer import LaserTokenizer
from laser_encoders.models import SentenceEncoder
from pathlib import Path

encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_model=Path(path/to/spm_model), spm_vocab=path/to/cvocab)
embeddings = encoder("This is a test sentence.")
```
If you want to perform tokenization separately, you can do so as shown below:
```py
from laser_encoders.laser_tokenizer import LaserTokenizer

tokenizer = LaserTokenizer(spm_model=Path(path/to/spm_model))

tokenized_sentence = tokenizer.tokenize("This is a test sentence.")
Expand Down
18 changes: 13 additions & 5 deletions laser_encoders/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def initialize_encoder(
model_dir: str = None,
spm: bool = True,
laser: str = None,
tokenize: bool = False,
):
downloader = LaserModelDownloader(model_dir)
if laser is not None:
Expand Down Expand Up @@ -146,12 +147,19 @@ def initialize_encoder(

model_dir = downloader.model_dir
model_path = os.path.join(model_dir, f"{file_path}.pt")
spm_path = os.path.join(model_dir, f"{file_path}.cvocab")

if not os.path.exists(spm_path):
spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab")
spm_model = None
if not os.path.exists(spm_vocab):
# if there is no cvocab for the laser3 lang use laser2 cvocab
spm_path = os.path.join(model_dir, "laser2.cvocab")
return SentenceEncoder(model_path=model_path, spm_vocab=spm_path)
spm_vocab = os.path.join(model_dir, "laser2.cvocab")
if tokenize:
spm_model = os.path.join(model_dir, f"{file_path}.spm")
if not os.path.exists(spm_model):
spm_model = os.path.join(model_dir, "laser2.spm")

return SentenceEncoder(
model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model
)


def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None):
Expand Down
17 changes: 17 additions & 0 deletions laser_encoders/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import re
import sys
from collections import namedtuple
from pathlib import Path

import numpy as np
import torch
Expand All @@ -25,6 +26,8 @@
from fairseq.models.transformer import Embedding, TransformerEncoder
from fairseq.modules import LayerNorm

from laser_encoders.laser_tokenizer import LaserTokenizer

SPACE_NORMALIZER = re.compile(r"\s+")
Batch = namedtuple("Batch", "srcs tokens lengths")

Expand All @@ -43,13 +46,18 @@ def __init__(
max_sentences=None,
max_tokens=None,
spm_vocab=None,
spm_model=None,
cpu=False,
fp16=False,
verbose=False,
sort_kind="quicksort",
):
if verbose:
logger.info(f"loading encoder: {model_path}")
self.spm_model = spm_model
if self.spm_model:
self.tokenizer = LaserTokenizer(spm_model=Path(self.spm_model))

self.use_cuda = torch.cuda.is_available() and not cpu
self.max_sentences = max_sentences
self.max_tokens = max_tokens
Expand Down Expand Up @@ -83,6 +91,15 @@ def __init__(
self.encoder.eval()
self.sort_kind = sort_kind

def __call__(self, sentences):
    """Tokenize and encode ``sentences`` in one step.

    Requires the encoder to have been constructed with an ``spm_model``;
    without one, callers must pre-tokenize and use ``encode_sentences``
    directly.

    :param sentences: raw (untokenized) input text
    :return: embeddings produced by ``encode_sentences``
    :raises ValueError: if no spm model was supplied at construction time
    """
    # Guard clause: refuse to run without a tokenizer attached.
    if not self.spm_model:
        raise ValueError(
            "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method."
        )
    tokenized = self.tokenizer(sentences)
    return self.encode_sentences(tokenized)

def _process_batch(self, batch):
tokens = batch.tokens
lengths = batch.lengths
Expand Down

0 comments on commit 51b4293

Please sign in to comment.