Skip to content

Commit

Permalink
test: Add test for LaserEncoderPipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
Paulooh007 committed Oct 26, 2023
1 parent d56a3a8 commit 3fc5ea2
Showing 1 changed file with 59 additions and 1 deletion.
60 changes: 59 additions & 1 deletion laser_encoders/test_laser_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@
import numpy as np
import pytest

from laser_encoders import initialize_encoder, initialize_tokenizer
from laser_encoders import (
LaserEncoderPipeline,
initialize_encoder,
initialize_tokenizer,
)


@pytest.fixture
Expand All @@ -35,6 +39,27 @@ def input_text() -> str:
return "This is a test sentence."


@pytest.fixture
def test_readme_params() -> dict:
    """Provide the language, input batch and reference values for encoder tests.

    Returns:
        A mapping with four entries: ``"lang"`` (language name),
        ``"input_sentences"`` (a one-element list of raw text),
        ``"expected_embedding_shape"`` (shape the tests compare the output
        against) and ``"expected_array"`` (ten reference floats).
    """
    sentences = ["nnọọ, kedu ka ị mere"]

    # Ten reference values; the tests in this module compare them against the
    # slice ``embeddings[:, :10]`` with an absolute tolerance.
    reference_values = [
        0.3807628,
        -0.27941525,
        -0.17819545,
        0.44144684,
        -0.38985375,
        0.04719935,
        0.20238206,
        -0.03934783,
        0.0118901,
        0.28986093,
    ]

    return {
        "lang": "igbo",
        "input_sentences": sentences,
        "expected_embedding_shape": (1, 1024),
        "expected_array": reference_values,
    }


def test_tokenize(tokenizer, input_text: str):
expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
assert tokenizer.tokenize(input_text) == expected_output
Expand Down Expand Up @@ -175,3 +200,36 @@ def test_sentence_encoder(
assert isinstance(sentence_embedding, np.ndarray)
assert sentence_embedding.shape == (1, 1024)
assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)


def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
    """Encode the fixture sentences through ``LaserEncoderPipeline`` and verify them.

    Args:
        tmp_path: Directory handed to the pipeline as ``model_dir``.
        test_readme_params: Mapping supplying the language, the input
            sentences, the expected output shape and the reference values.
    """
    # Read every fixture entry up front so a missing key fails before the
    # pipeline is constructed.
    language, sentences, wanted_shape, wanted_values = (
        test_readme_params[key]
        for key in (
            "lang",
            "input_sentences",
            "expected_embedding_shape",
            "expected_array",
        )
    )

    pipeline = LaserEncoderPipeline(model_dir=tmp_path, lang=language)
    result = pipeline.encode_sentences(sentences)

    assert isinstance(result, np.ndarray)
    assert result.shape == wanted_shape
    # Only the first ten components of each embedding are compared.
    leading_components = result[:, :10]
    assert np.allclose(wanted_values, leading_components, atol=1e-3)


def test_separate_initialization_and_encoding(
    tmp_path, tokenizer, test_readme_params: dict
):
    """Tokenize and encode in two explicit steps, then verify the embeddings.

    Args:
        tmp_path: Directory handed to ``initialize_encoder`` as ``model_dir``.
        tokenizer: Object whose ``tokenize`` method is applied to the first
            input sentence.
        test_readme_params: Mapping supplying the language, the input
            sentences, the expected output shape and the reference values.
    """
    # Read every fixture entry up front so a missing key fails before any
    # tokenizing or encoder initialisation happens.
    language, sentences, wanted_shape, wanted_values = (
        test_readme_params[key]
        for key in (
            "lang",
            "input_sentences",
            "expected_embedding_shape",
            "expected_array",
        )
    )

    # Step 1: tokenize the first sentence; step 2: build the encoder.
    tokens = tokenizer.tokenize(sentences[0])
    encoder = initialize_encoder(model_dir=tmp_path, lang=language)

    # The encoder receives a batch, so the single tokenized sentence is wrapped
    # in a list.
    token_batch = [tokens]
    result = encoder.encode_sentences(token_batch)

    assert isinstance(result, np.ndarray)
    assert result.shape == wanted_shape
    # Only the first ten components of each embedding are compared.
    leading_components = result[:, :10]
    assert np.allclose(wanted_values, leading_components, atol=1e-3)

0 comments on commit 3fc5ea2

Please sign in to comment.