From a111c6769abf36ecf38d711b96dd2a9e59ef37e0 Mon Sep 17 00:00:00 2001 From: "Eli C. Lowry" <83078660+Enkidu93@users.noreply.github.com> Date: Thu, 29 Aug 2024 12:40:25 -0400 Subject: [PATCH] Ignore ellipses (#469) * Filter out ellipses segments * Test pretranslation content --- .../Services/PreprocessBuildJob.cs | 15 ++-- .../Services/PreprocessBuildJobTests.cs | 68 +++++++++++++++++-- .../Services/data/pt-source1/41MATTe1.SFM | 3 + .../Services/data/pt-target1/41MATTe2.SFM | 3 +- 4 files changed, 78 insertions(+), 11 deletions(-) diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 97103a1f..1a8da401 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -268,8 +268,8 @@ ITextCorpus trgCorpus IEnumerable? textIds = corpus.TrainOnChapters is not null ? corpus.TrainOnChapters.Keys : corpus.TrainOnTextIds; - srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds)).ToArray(); - trgCorpus = trgCorpus.FilterTexts(textIds); + srcCorpora = srcCorpora.Select(sc => sc.FilterTexts(textIds).Transform(CleanSegment)).ToArray(); + trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment); if (trgCorpus.IsScripture()) { @@ -389,8 +389,8 @@ private static IEnumerable AlignPretranslateCorpus(Corpus corpus, ITextCorp IEnumerable? textIds = corpus.PretranslateChapters is not null ? corpus.PretranslateChapters.Keys : corpus.PretranslateTextIds; - srcCorpus = srcCorpus.FilterTexts(textIds); - trgCorpus = trgCorpus.FilterTexts(textIds); + srcCorpus = srcCorpus.FilterTexts(textIds).Transform(CleanSegment); + trgCorpus = trgCorpus.FilterTexts(textIds).Transform(CleanSegment); int rowCount = 0; StringBuilder srcSegBuffer = new(); StringBuilder trgSegBuffer = new(); @@ -446,4 +446,11 @@ protected virtual bool ResolveLanguageCodeForBaseModel(string languageCode, out resolvedCode = languageCode; return true; } + + private static TextRow CleanSegment(TextRow row) + { + if (row.Text == "...") + row.Segment = []; + return row; + } } diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 874ec6ce..7c347603 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -188,12 +188,12 @@ public async Task RunAsync_MixedSource_Paratext() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(4)); + Assert.That(src1Count, Is.EqualTo(5)); Assert.That(src2Count, Is.EqualTo(12)); Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(12)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(13)); } [Test] @@ -242,6 +242,46 @@ public async Task RunAsync_UnknownLanguageTagsNoDataSmtTransfer() await env.RunBuildJobAsync(corpus1, engineId: "engine2", engineType: TranslationEngineType.SmtTransfer); } + [Test] + public async Task RunAsync_RemoveFreestandingEllipses() + { + using TestEnvironment env = new(); + Corpus corpus1 = env.DefaultParatextCorpus with + { + TrainOnChapters = new Dictionary> + { + { + "MAT", + new HashSet() { 2 } + } + }, + PretranslateChapters = new Dictionary> + { + { + "MAT", + new HashSet() { 2 } + } + } + }; + await env.RunBuildJobAsync(corpus1, useKeyTerms: false); + string sourceExtract = await env.GetSourceExtractAsync(); + Assert.That( + sourceExtract, + Is.EqualTo("Source one, chapter two, verse one.\nSource one, chapter two, verse two.\n\n"), + sourceExtract + ); + string targetExtract = await env.GetTargetExtractAsync(); + Assert.That( + targetExtract, + Is.EqualTo("Target one, chapter two, verse one.\n\nTarget one, chapter two, verse three.\n"), + targetExtract + ); + JsonArray? pretranslations = await env.GetPretranslationAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations.Count, Is.EqualTo(1)); + Assert.That(pretranslations[0]!["translation"]!.ToString(), Is.EqualTo("Source one, chapter two, verse two.")); + } + [Test] public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() { @@ -581,6 +621,18 @@ public Task RunBuildJobAsync( .RunAsync(engineId, "build1", [corpus], useKeyTerms ? null : "{\"use_key_terms\":false}", default); } + public async Task GetSourceExtractAsync() + { + using StreamReader srcReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.src.txt")); + return await srcReader.ReadToEndAsync(); + } + + public async Task GetTargetExtractAsync() + { + using StreamReader trgReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.trg.txt")); + return await trgReader.ReadToEndAsync(); + } + public async Task<(int Source1Count, int Source2Count, int TargetCount, int TermCount)> GetTrainCountAsync() { using StreamReader srcReader = new(await SharedFileService.OpenReadAsync("builds/build1/train.src.txt")); @@ -610,12 +662,16 @@ public Task RunBuildJobAsync( return (src1Count, src2Count, trgCount, termCount); } - public async Task GetPretranslateCountAsync() + public async Task GetPretranslationAsync() { using StreamReader reader = new(await SharedFileService.OpenReadAsync("builds/build1/pretranslate.src.json")); - JsonArray? pretranslationJsonObject = JsonSerializer.Deserialize(await reader.ReadToEndAsync()); - return pretranslationJsonObject?.Count ?? 0; + return JsonSerializer.Deserialize(await reader.ReadToEndAsync()); + } + + public async Task GetPretranslateCountAsync() + { + return (await GetPretranslationAsync())?.Count ?? 0; } private void ZipParatextProject(string name) @@ -659,7 +715,7 @@ private class DummyCorpus(IEnumerable books, IEnumerable failsOn new List() { new TextRow(b, new ScriptureRef(new VerseRef("MAT", "1", "1", ScrVers.English))) } )); - public bool IsTokenized => throw new NotImplementedException(); + public bool IsTokenized => false; public ScrVers Versification => ScrVers.English; diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/41MATTe1.SFM b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/41MATTe1.SFM index ccf166e2..bca7c590 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/41MATTe1.SFM +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/41MATTe1.SFM @@ -14,3 +14,6 @@ \c 2 \p \v 1 Source one, chapter two, verse one. +\v 2 Source one, chapter two, verse two. +\v 3 ... +\v 4 ... diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/41MATTe2.SFM b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/41MATTe2.SFM index 69f46250..202c3ae1 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/41MATTe2.SFM +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/41MATTe2.SFM @@ -14,4 +14,5 @@ \c 2 \p \v 1 Target one, chapter two, verse one. -\v 2 Target one, chapter two, verse two. +\v 2 ... +\v 3 Target one, chapter two, verse three.