diff --git a/laser_encoders/README.md b/laser_encoders/README.md index b6022628..7a45d929 100644 --- a/laser_encoders/README.md +++ b/laser_encoders/README.md @@ -43,6 +43,13 @@ encoder = initialize_encoder(lang="igbo") embeddings = encoder.encode_sentences([tokenized_sentence]) ``` +When initializing the encoder, you have the option to enable both tokenization and encoding by setting the `tokenize` flag to `True`. Below is an example of how to use it: +```py +encoder = initialize_encoder(lang="igbo", spm=True, tokenize=True) +embeddings = encoder("nnọọ, kedu ka ị mere") +``` +> Setting the `spm` flag to `True` tells the encoder to also download the accompanying SPM model. + **Supported Languages:** You can specify any language from the [FLORES200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200) dataset. This includes both languages identified by their full codes (like "ibo_Latn") and simpler alternatives (like "igbo"). ## Downloading the pre-trained models @@ -61,13 +68,19 @@ python -m laser_encoders.download_models --model-dir=path/to/model/directory > For a comprehensive list of available arguments, you can use the `--help` command with the download_models script. -Once you have successfully downloaded the models, you can utilize the `LaserTokenizer` to tokenize text in your desired language. Here's an example of how you can achieve this: +Once you have successfully downloaded the models, you can utilize the `SentenceEncoder` to tokenize and encode your text in your desired language. 
Here's an example of how you can achieve this: ```py -from laser_encoders.laser_tokenizer import LaserTokenizer from laser_encoders.models import SentenceEncoder from pathlib import Path +encoder = SentenceEncoder(model_path=path/to/downloaded/model, spm_model=Path(path/to/spm_model), spm_vocab=path/to/cvocab) +embeddings = encoder("This is a test sentence.") ``` +If you want to perform tokenization separately, you can do so as shown below: +```py +from laser_encoders.laser_tokenizer import LaserTokenizer + tokenizer = LaserTokenizer(spm_model=Path(path/to/spm_model)) tokenized_sentence = tokenizer.tokenize("This is a test sentence.") diff --git a/laser_encoders/download_models.py b/laser_encoders/download_models.py index ccfc31e9..17a5db35 100644 --- a/laser_encoders/download_models.py +++ b/laser_encoders/download_models.py @@ -117,6 +117,7 @@ def initialize_encoder( model_dir: str = None, spm: bool = True, laser: str = None, + tokenize: bool = False, ): downloader = LaserModelDownloader(model_dir) if laser is not None: @@ -146,12 +147,19 @@ model_dir = downloader.model_dir model_path = os.path.join(model_dir, f"{file_path}.pt") - spm_path = os.path.join(model_dir, f"{file_path}.cvocab") - - if not os.path.exists(spm_path): + spm_vocab = os.path.join(model_dir, f"{file_path}.cvocab") + spm_model = None + if not os.path.exists(spm_vocab): # if there is no cvocab for the laser3 lang use laser2 cvocab - spm_path = os.path.join(model_dir, "laser2.cvocab") - return SentenceEncoder(model_path=model_path, spm_vocab=spm_path) + spm_vocab = os.path.join(model_dir, "laser2.cvocab") + if tokenize: + spm_model = os.path.join(model_dir, f"{file_path}.spm") + if not os.path.exists(spm_model): + spm_model = os.path.join(model_dir, "laser2.spm") + + return SentenceEncoder( + model_path=model_path, spm_vocab=spm_vocab, spm_model=spm_model + ) def initialize_tokenizer(lang: str = None, model_dir: str = None, laser: str = None): diff --git 
a/laser_encoders/models.py b/laser_encoders/models.py index 7ce0e326..e2a81ef9 100644 --- a/laser_encoders/models.py +++ b/laser_encoders/models.py @@ -17,6 +17,7 @@ import re import sys from collections import namedtuple +from pathlib import Path import numpy as np import torch @@ -25,6 +26,8 @@ from fairseq.models.transformer import Embedding, TransformerEncoder from fairseq.modules import LayerNorm +from laser_encoders.laser_tokenizer import LaserTokenizer + SPACE_NORMALIZER = re.compile(r"\s+") Batch = namedtuple("Batch", "srcs tokens lengths") @@ -43,6 +46,7 @@ def __init__( max_sentences=None, max_tokens=None, spm_vocab=None, + spm_model=None, cpu=False, fp16=False, verbose=False, @@ -50,6 +54,10 @@ def __init__( ): if verbose: logger.info(f"loading encoder: {model_path}") + self.spm_model = spm_model + if self.spm_model: + self.tokenizer = LaserTokenizer(spm_model=Path(self.spm_model)) + self.use_cuda = torch.cuda.is_available() and not cpu self.max_sentences = max_sentences self.max_tokens = max_tokens @@ -83,6 +91,15 @@ def __init__( self.encoder.eval() self.sort_kind = sort_kind + def __call__(self, sentences): + if self.spm_model: + sentences = self.tokenizer(sentences) + return self.encode_sentences(sentences) + else: + raise ValueError( + "Either initialize the encoder with an spm_model or pre-tokenize and use the encode_sentences method." + ) + def _process_batch(self, batch): tokens = batch.tokens lengths = batch.lengths