From 7b7bef2caa735b6120d8443fddc26925386b67ee Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Tue, 2 Jan 2024 18:05:40 +0200
Subject: [PATCH] test: added unittest with start_of_line=True and False

---
Note (not part of the commit): the added parametrized test calls
tokenizer.decode([30671], start_of_line=...) and expects "hello" when
start_of_line is True and " hello" (leading space) when it is False.

 tests/test_jurassic_tokenizer.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_jurassic_tokenizer.py b/tests/test_jurassic_tokenizer.py
index f37f6b7..0b22d17 100644
--- a/tests/test_jurassic_tokenizer.py
+++ b/tests/test_jurassic_tokenizer.py
@@ -87,3 +87,22 @@ def test_tokenizer__convert_tokens_to_ids(
     actual_ids = tokenizer.convert_tokens_to_ids(tokens)
 
     assert actual_ids == expected_ids
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "when_start_of_line__should_return_no_leading_whitespace",
+        "when_not_start_of_line__should_return_leading_whitespace",
+    ],
+    argnames=["tokens", "start_of_line", "expected_text"],
+    argvalues=[
+        ([30671], True, "hello"),
+        ([30671], False, " hello"),
+    ],
+)
+def test_tokenizer__decode_with_start_of_line(
+    tokens: List[int], start_of_line: bool, expected_text: str, tokenizer: JurassicTokenizer
+):
+    actual_text = tokenizer.decode(tokens, start_of_line=start_of_line)
+
+    assert actual_text == expected_text