diff --git a/Serval.sln b/Serval.sln index edd3f075..12c0aaaf 100644 --- a/Serval.sln +++ b/Serval.sln @@ -86,6 +86,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65 EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -180,6 +184,10 @@ Global {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -215,6 +223,8 @@ Global {10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D} {C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98} {0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51} + {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {EA69B41C-49EF-4017-A687-44B9DF37FF98} + {C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370} diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml index 7106e030..5d7d1ecf 100644 --- a/deploy/qa-ext-values.yaml +++ b/deploy/qa-ext-values.yaml @@ -1,6 +1,6 @@ externalHost: qa.serval-api.org environment: Production -deploymentVersion: '1.7.QA7' +deploymentVersion: '1.8.QA1' alertEmail: ext-qa-serval-alerts@languagetechnology.org emailsToAlert: john_lambert@sil.org enableTls: true @@ -8,8 +8,8 @@ namespace: serval auth0Domain: dev-sillsdev.auth0.com lokiTenent: serval-tenant lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local -servalImage: ghcr.io/sillsdev/serval:1.7.7 -ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.7.2 +servalImage: ghcr.io/sillsdev/serval:1.8.1 +ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.8.1 ClearMLQueue: production MongoConnectionPrefix: qa_ SharedFileLocation: s3://silnlp/ext-qa/ diff --git a/samples/ApiExample/ApiExample.csproj b/samples/ApiExample/ApiExample.csproj index 9d56d539..9a87fdcc 100644 --- a/samples/ApiExample/ApiExample.csproj +++ b/samples/ApiExample/ApiExample.csproj @@ -22,7 +22,7 @@ - + diff --git a/src/Echo/src/EchoTranslationEngine/Program.cs b/src/Echo/src/EchoTranslationEngine/Program.cs index a679dfb5..352c536a 100644 --- a/src/Echo/src/EchoTranslationEngine/Program.cs +++ b/src/Echo/src/EchoTranslationEngine/Program.cs @@ -10,6 +10,8 @@ builder.Services.AddHostedService(); builder.Services.AddSingleton(); +builder.Services.AddParallelCorpusPreprocessor(); + builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy()); builder.Services.Configure(builder.Configuration.GetSection("Bugsnag")); diff --git a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs index 254fe0af..720a0126 100644 --- a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs @@ -1,10 +1,16 @@ namespace EchoTranslationEngine; -public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase +public class TranslationEngineServiceV1( + BackgroundTaskQueue taskQueue, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService +) : TranslationEngineApi.TranslationEngineApiBase { private static readonly Empty Empty = new(); private readonly BackgroundTaskQueue _taskQueue = taskQueue; + private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = + parallelCorpusPreprocessingService; + public override Task Create(CreateRequest request, ServerCallContext context) { if (request.SourceLanguage != request.TargetLanguage) @@ -75,159 +81,35 @@ await client.BuildStartedAsync( try { + List pretranslationsRequests = []; + await _parallelCorpusPreprocessingService.Preprocess( + request.Corpora.Select(Map).ToList(), + row => Task.CompletedTask, + (row, corpus) => + { + pretranslationsRequests.Add( + new InsertPretranslationsRequest + { + EngineId = request.EngineId, + CorpusId = corpus.Id, + TextId = row.TextId, + Refs = { row.Refs.Select(r => r.ToString()) }, + Translation = row.SourceSegment + } + ); + return Task.CompletedTask; + }, + false + ); using ( AsyncClientStreamingCall call = client.InsertPretranslations(cancellationToken: cancellationToken) ) { - foreach (ParallelCorpus corpus in request.Corpora) + foreach (InsertPretranslationsRequest request in pretranslationsRequests) { - var sourceFiles = corpus - .SourceCorpora.SelectMany(sc => - sc.Files.Where(f => - ( - sc.PretranslateAll - || sc.PretranslateTextIds is null - || sc.PretranslateTextIds.Contains(f.TextId) - ) - && f.Format == FileFormat.Text - ) - ) - .ToDictionary(f => f.TextId, f => f.Location); - var targetFiles = corpus - .TargetCorpora.SelectMany(tc => - tc.Files.Where(f => - ( - tc.PretranslateAll - || tc.PretranslateTextIds is null - || tc.PretranslateTextIds.Contains(f.TextId) - ) - && f.Format == FileFormat.Text - ) - ) - .ToDictionary(f => f.TextId, f => f.Location); - - foreach (KeyValuePair sourceFile in sourceFiles) - { - string[] sourceLines = await File.ReadAllLinesAsync( - sourceFile.Value, - cancellationToken - ); - - if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath)) - { - string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken); - bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/'); - if (!isTabSeparated) - { - int lineNum = 1; - foreach ( - (string sourceLine, string targetLine) in sourceLines - .Select(l => l.Trim()) - .Zip(targetLines.Select(l => l.Trim())) - ) - { - if (sourceLine.Length > 0 && targetLine.Length == 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{lineNum}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - lineNum++; - } - } - else - { - var sourceLinesDict = sourceLines.ToDictionary( - l => l.Split('\t')[0].Trim(), - l => l.Split('\t')[1].Trim() - ); - var targetLinesDict = targetLines.ToDictionary( - l => l.Split('\t')[0].Trim(), - l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty - ); - foreach (KeyValuePair targetLineKVPair in targetLinesDict) - { - string? sourceLine = null; - sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine); - sourceLine ??= string.Empty; - string? targetLine = targetLineKVPair.Value; - if (sourceLine.Length > 0 && targetLine.Length == 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - } - } - } - else - { - bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/'); - if (!isTabSeparated) - { - int lineNum = 1; - foreach (string sourceLine in sourceLines.Select(l => l.Trim())) - { - if (sourceLine.Length > 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{lineNum}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - lineNum++; - } - } - else - { - foreach (string sourceLine in sourceLines.Select(l => l.Trim())) - { - if (sourceLine.Length > 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" }, - Translation = sourceLine.Contains('\t') - ? sourceLine.Split('\t')[1].Trim() - : string.Empty - }, - cancellationToken - ); - } - } - } - } - } + await call.RequestStream.WriteAsync(request, cancellationToken); } - await call.RequestStream.CompleteAsync(); await call; } @@ -325,4 +207,78 @@ ServerCallContext context new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, } ); } + + private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceCorpora.Select(Map).ToList(), + TargetCorpora = source.TargetCorpora.Select(Map).ToList() + }; + } + + private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) + { + var trainOnChapters = source.TrainOnChapters.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Chapters.ToHashSet() + ); + var trainOnTextIds = source.TrainOnTextIds.ToHashSet(); + FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds, source.TrainOnAll); + + var pretranslateChapters = source.PretranslateChapters.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Chapters.ToHashSet() + ); + var pretranslateTextIds = source.PretranslateTextIds.ToHashSet(); + FilterChoice pretranslateFilter = GetFilterChoice( + pretranslateChapters, + pretranslateTextIds, + source.PretranslateAll + ); + + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = source.Language, + Files = source.Files.Select(Map).ToList(), + TrainOnChapters = trainingFilter == FilterChoice.Chapters ? trainOnChapters : null, + TrainOnTextIds = trainingFilter == FilterChoice.TextIds ? trainOnTextIds : null, + PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null, + PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null + }; + } + + private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) + { + return new SIL.ServiceToolkit.Models.CorpusFile + { + Location = source.Location, + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, + TextId = source.TextId + }; + } + + private enum FilterChoice + { + Chapters, + TextIds, + None + } + + private static FilterChoice GetFilterChoice( + IReadOnlyDictionary> chapters, + HashSet textIds, + bool noFilter + ) + { + // Only either textIds or Scripture Range will be used at a time + // TextIds may be an empty array, so prefer that if both are empty (which applies to both scripture and text) + if (noFilter || (chapters is null && textIds is null)) + return FilterChoice.None; + if (chapters is null || chapters.Count == 0) + return FilterChoice.TextIds; + return FilterChoice.Chapters; + } } diff --git a/src/Echo/src/EchoTranslationEngine/Usings.cs b/src/Echo/src/EchoTranslationEngine/Usings.cs index b7f3ba2d..0404305b 100644 --- a/src/Echo/src/EchoTranslationEngine/Usings.cs +++ b/src/Echo/src/EchoTranslationEngine/Usings.cs @@ -5,3 +5,4 @@ global using Grpc.Core; global using Microsoft.Extensions.Diagnostics.HealthChecks; global using Serval.Translation.V1; +global using SIL.ServiceToolkit.Utils; diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs index c00fd45e..67b8ef3d 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs @@ -50,6 +50,12 @@ public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, I return builder; } + public static IMachineBuilder AddServiceToolkitServices(this IMachineBuilder builder) + { + builder.Services.AddParallelCorpusPreprocessor(); + return builder; + } + public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder) { return builder.AddThotSmtModel(builder.Configuration.GetSection(ThotSmtModelOptions.Key)); diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs index c72302b9..8fcaced4 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs @@ -15,11 +15,11 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf services.AddTransient(); services.AddScoped(); - services.AddSingleton(); services.AddStartupTask( (sp, cancellationToken) => sp.GetRequiredService().InitAsync(cancellationToken) ); + services.AddParallelCorpusPreprocessor(); var builder = new MachineBuilder(services, configuration); builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key)); diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj index b9985198..f9756293 100644 --- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj +++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj @@ -36,9 +36,9 @@ - - - + + + diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs b/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs deleted file mode 100644 index bbcc9de3..00000000 --- a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace Serval.Machine.Shared.Services; - -public interface ICorpusService -{ - IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); -} diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index 3c46a34e..2e79d09a 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -7,8 +7,8 @@ public class NmtPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService, - ILanguageTagService languageTagService + ILanguageTagService languageTagService, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService ) : PreprocessBuildJob( platformService, @@ -17,7 +17,7 @@ ILanguageTagService languageTagService logger, buildJobService, sharedFileService, - corpusService + parallelCorpusPreprocessingService ) { private readonly ILanguageTagService _languageTagService = languageTagService; diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 6d8506a0..de630b0c 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -1,49 +1,35 @@ namespace Serval.Machine.Shared.Services; -public class PreprocessBuildJob : HangfireBuildJob> +public class PreprocessBuildJob( + IPlatformService platformService, + IRepository engines, + IDataAccessContext dataAccessContext, + ILogger logger, + IBuildJobService buildJobService, + ISharedFileService sharedFileService, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService +) + : HangfireBuildJob>( + platformService, + engines, + dataAccessContext, + buildJobService, + logger + ) { private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true }; internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML; - private readonly ISharedFileService _sharedFileService; - private readonly ICorpusService _corpusService; - private int _seed = 1234; - private Random _random; - - public PreprocessBuildJob( - IPlatformService platformService, - IRepository engines, - IDataAccessContext dataAccessContext, - ILogger logger, - IBuildJobService buildJobService, - ISharedFileService sharedFileService, - ICorpusService corpusService - ) - : base(platformService, engines, dataAccessContext, buildJobService, logger) - { - _sharedFileService = sharedFileService; - _corpusService = corpusService; - _random = new Random(_seed); - } + private readonly ISharedFileService _sharedFileService = sharedFileService; - internal int Seed - { - get => _seed; - set - { - if (_seed != value) - { - _seed = value; - _random = new Random(_seed); - } - } - } + private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = + parallelCorpusPreprocessingService; protected override async Task DoWorkAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList data, string? buildOptions, CancellationToken cancellationToken ) @@ -114,11 +100,11 @@ CancellationToken cancellationToken JsonObject? buildOptionsObject = null; if (buildOptions is not null) buildOptionsObject = JsonSerializer.Deserialize(buildOptions); + await using StreamWriter sourceTrainWriter = new(await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.src.txt", cancellationToken)); await using StreamWriter targetTrainWriter = new(await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken)); - await using Stream pretranslateStream = await _sharedFileService.OpenWriteAsync( $"builds/{buildId}/pretranslate.src.json", cancellationToken @@ -128,166 +114,44 @@ CancellationToken cancellationToken int trainCount = 0; int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); - foreach (ParallelCorpus corpus in corpora) - { - (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus - .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) - .ToArray(); - ITextCorpus[] sourceTrainingCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus; - if (sc.Corpus.TrainOnTextIds is not null) - textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.TrainOnChapters is null - || IsInChapters(sr, sc.Corpus.TrainOnChapters) - ); - }) - .ToArray(); - ITextCorpus? sourcePretranslateCorpus = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus; - if (sc.Corpus.PretranslateTextIds is not null) - { - textCorpus = textCorpus.FilterTexts( - sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new()) - ); - } - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.PretranslateChapters is null - || ( - IsInChapters(sr, sc.Corpus.PretranslateChapters) - && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new()) - ) - ); - }) - .ToArray() - .FirstOrDefault(); - - (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus - .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) - .ToArray(); - ITextCorpus[] targetTrainingCorpora = targetCorpora - .Select(tc => - { - ITextCorpus textCorpus = tc.TextCorpus; - if (tc.Corpus.TrainOnTextIds is not null) - textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || tc.Corpus.TrainOnChapters is null - || IsInChapters(sr, tc.Corpus.TrainOnChapters) - ); - }) - .ToArray(); - - if (sourceCorpora.Length == 0) - continue; - - int skipCount = 0; - foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora)) + await _parallelCorpusPreprocessingService.Preprocess( + corpora, + async row => { - if (skipCount > 0) - { - skipCount--; - continue; - } - - Row[] trainRows = rows.Where(r => r is not null).Cast().ToArray(); - if (trainRows.Length > 0) + if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0) { - Row row = trainRows[0]; - if (rows.Length > 1) - { - Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray(); - Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray(); - if (targetNonEmptyRows.Length > 0) - nonEmptyRows = targetNonEmptyRows; - if (nonEmptyRows.Length > 0) - { - nonEmptyRows = nonEmptyRows - .GroupBy(r => r.SourceSegment) - .Select(group => group.First()) - .ToArray(); - { - nonEmptyRows = nonEmptyRows - .GroupBy(r => r.SourceSegment) - .Select(group => group.First()) - .ToArray(); - row = nonEmptyRows[_random.Next(nonEmptyRows.Length)]; - } - } - } - await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n"); await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n"); - skipCount = row.RowCount - 1; - if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) - trainCount++; - } - } - - if ((bool?)buildOptionsObject?["use_key_terms"] ?? true) - { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) - { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); - foreach (ParallelTextRow row in parallelKeyTermsCorpus) - { - await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); - await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); - trainCount++; - } } - } - void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList refs, string translation) - { - writer.WriteStartObject(); - writer.WriteString("corpusId", corpus.Id); - writer.WriteString("textId", textId); - writer.WriteStartArray("refs"); - foreach (object rowRef in refs) - writer.WriteStringValue(rowRef.ToString()); - writer.WriteEndArray(); - writer.WriteString("translation", translation); - writer.WriteEndObject(); - pretranslateCount++; - } - - ITextCorpus targetCorpus = - targetCorpora.Length > 0 ? targetCorpora[0].TextCorpus : new DictionaryTextCorpus(); - if (sourcePretranslateCorpus != null) + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + trainCount++; + }, + async (row, corpus) => { - foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpus, targetCorpus)) + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) { - if (row.SourceSegment.Length > 0 && (row.TargetSegment.Length == 0 || !targetCorpus.Any())) - WriteRow(pretranslateWriter, row.TextId, row.Refs, row.SourceSegment); + pretranslateWriter.WriteStartObject(); + pretranslateWriter.WriteString("corpusId", corpus.Id); + pretranslateWriter.WriteString("textId", row.TextId); + pretranslateWriter.WriteStartArray("refs"); + foreach (object rowRef in row.Refs) + pretranslateWriter.WriteStringValue(rowRef.ToString()); + pretranslateWriter.WriteEndArray(); + pretranslateWriter.WriteString("translation", row.SourceSegment); + pretranslateWriter.WriteEndObject(); + pretranslateCount++; } - } - } + if (pretranslateWriter.BytesPending > 1024 * 1024) + await pretranslateWriter.FlushAsync(); + }, + (bool?)buildOptionsObject?["use_key_terms"] ?? true + ); pretranslateWriter.WriteEndArray(); return (trainCount, pretranslateCount); } - private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) - { - return selection.TryGetValue(sr.Book, out HashSet? chapters) - && chapters != null - && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); - } - protected override async Task CleanupAsync( string engineId, string buildId, @@ -308,194 +172,9 @@ JobCompletionStatus completionStatus } } - private static IEnumerable AlignTrainCorpus( - IReadOnlyList srcCorpora, - IReadOnlyList trgCorpora - ) - { - srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray(); - trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray(); - - if (trgCorpora.All(tc => tc.IsScripture())) - { - return srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc))) - .ZipMany(rows => rows.ToArray()) - // filter out every list that only contains completely empty rows - .Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); - } - - IEnumerable sourceOnlyRows = srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true))) - .ZipMany(rows => - rows.Where(r => r.TargetSegment.Count == 0) - .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) - .ToArray() - ); - - IEnumerable targetRows = srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true))) - .ZipMany(rows => - rows.Where(r => r.TargetSegment.Count > 0) - .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) - .ToArray() - ); - - return sourceOnlyRows - .Concat(targetRows) - // filter out every list that only contains completely empty rows - .Where(rows => rows.Any(r => r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); - } - - private static IEnumerable AlignScripture(ITextCorpus srcCorpus, ITextCorpus trgCorpus) - { - int rowCount = 0; - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - HashSet vrefs = []; - foreach ( - (VerseRef vref, string srcSegment, string trgSegment) in srcCorpus - .ExtractScripture() - .Select(r => (r.CorpusVerseRef, r.Text)) - .Zip( - trgCorpus.ExtractScripture().Select(r => r.Text), - (s, t) => (VerseRef: s.CorpusVerseRef, SourceSegment: s.Text, TargetSegment: t) - ) - ) - { - if (srcSegment == "" && trgSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - rowCount++; - } - else if (srcSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - if (trgSegment.Length > 0) - { - if (trgSegBuffer.Length > 0) - trgSegBuffer.Append(' '); - trgSegBuffer.Append(trgSegment); - } - rowCount++; - } - else if (trgSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - if (srcSegment.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(srcSegment); - } - rowCount++; - } - else - { - if (rowCount > 0) - { - yield return new( - vrefs.First().Book, - vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - rowCount - ); - for (int i = 0; i < rowCount - 1; i++) - yield return null; - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - vrefs.Clear(); - rowCount = 0; - } - vrefs.UnionWith(vref.AllVerses()); - srcSegBuffer.Append(srcSegment); - trgSegBuffer.Append(trgSegment); - rowCount++; - } - } - - if (rowCount > 0) - { - yield return new( - vrefs.First().Book, - vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - rowCount - ); - for (int i = 0; i < rowCount - 1; i++) - yield return null; - } - } - - private static IEnumerable AlignPretranslateCorpus(ITextCorpus srcCorpus, ITextCorpus trgCorpus) - { - int rowCount = 0; - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - List refs = []; - string textId = ""; - - srcCorpus = srcCorpus.Transform(CleanSegment); - trgCorpus = trgCorpus.Transform(CleanSegment); - - foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true)) - { - if (!row.IsTargetRangeStart && row.IsTargetInRange) - { - refs.AddRange(row.TargetRefs); - if (row.SourceText.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(row.SourceText); - } - rowCount++; - } - else - { - if (rowCount > 0) - { - if (trgSegBuffer.Length == 0) - yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - textId = ""; - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - refs.Clear(); - rowCount = 0; - } - - textId = row.TextId; - refs.AddRange(row.TargetRefs); - srcSegBuffer.Append(row.SourceText); - trgSegBuffer.Append(row.TargetText); - rowCount++; - } - } - - if (rowCount > 0) - yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - } - - private record Row( - string TextId, - IReadOnlyList Refs, - string SourceSegment, - string TargetSegment, - int RowCount - ); - protected virtual bool ResolveLanguageCodeForBaseModel(string languageCode, out string resolvedCode) { resolvedCode = languageCode; return true; } - - private static TextRow CleanSegment(TextRow row) - { - if (row.Text == "...") - row.Segment = []; - return row; - } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs b/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs index 4b623d6d..e1ba3494 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs @@ -15,6 +15,9 @@ ILoggerFactory loggerFactory private readonly List _uploadResponses = new List(); private readonly ILogger _logger = loggerFactory.CreateLogger(); + private readonly Stream _stream = new MemoryStream(); + private int _bytesWritten = 0; + public const int MaxPartSize = 5 * 1024 * 1024; public override bool CanRead => false; @@ -23,7 +26,7 @@ ILoggerFactory loggerFactory public override bool CanWrite => true; - public override long Length => 0; + public override long Length => _stream.Length; public override long Position { @@ -48,47 +51,60 @@ public override async ValueTask WriteAsync( CancellationToken cancellationToken = default ) { - try - { - using Stream stream = buffer.AsStream(); + // S3 buckets can only be written to in chunks of MaxPartSize + // therefore, break it into chunks, resetting the stream each time - int bytesWritten = 0; + while (buffer.Length + _stream.Position > MaxPartSize) + { + int toWrite = MaxPartSize - (int)_stream.Position; + await _stream.WriteAsync(buffer[..toWrite], cancellationToken); + await UploadPartAsync(cancellationToken); + buffer = buffer[toWrite..]; + } + // save the remaining buffer for future calls + await _stream.WriteAsync(buffer, cancellationToken); + } - while (stream.Length > bytesWritten) - { - int partNumber = _uploadResponses.Count + 1; - UploadPartRequest request = - new() - { - BucketName = _bucketName, - Key = _key, - UploadId = _uploadId, - PartNumber = partNumber, - InputStream = stream, - PartSize = MaxPartSize - }; - request.StreamTransferProgress += new EventHandler( - (_, e) => - { - _logger.LogDebug( - "Transferred {e.TransferredBytes}/{e.TotalBytes}", - e.TransferredBytes, - e.TotalBytes - ); - } - ); - UploadPartResponse response = await _client.UploadPartAsync(request, cancellationToken); - if (response.HttpStatusCode != HttpStatusCode.OK) + private async Task UploadPartAsync(CancellationToken cancellationToken = default) + { + if (_stream.Length == 0) + return; + try + { + _stream.Position = 0; + int partNumber = _uploadResponses.Count + 1; + UploadPartRequest request = + new() { - throw new HttpRequestException( - $"Tried to upload part {partNumber} of upload {_uploadId} to {_bucketName}/{_key} but received response code {response.HttpStatusCode}" + BucketName = _bucketName, + Key = _key, + UploadId = _uploadId, + PartNumber = partNumber, + InputStream = _stream, + PartSize = MaxPartSize + }; + request.StreamTransferProgress += new EventHandler( + (_, e) => + { + _logger.LogDebug( + "Transferred {e.TransferredBytes}/{e.TotalBytes}", + e.TransferredBytes, + e.TotalBytes ); } + ); + UploadPartResponse response = await _client.UploadPartAsync(request, cancellationToken); + if (response.HttpStatusCode != HttpStatusCode.OK) + { + throw new HttpRequestException( + $"Tried to upload part {partNumber} of upload {_uploadId} to {_bucketName}/{_key} but received response code {response.HttpStatusCode}" + ); + } - _uploadResponses.Add(response); + _uploadResponses.Add(response); - bytesWritten += MaxPartSize; - } + _bytesWritten += MaxPartSize; + _stream.SetLength(0); } catch (Exception e) { @@ -104,6 +120,7 @@ public override async Task WriteAsync(byte[] buffer, int offset, int count, Canc protected override void Dispose(bool disposing) { + UploadPartAsync().WaitAndUnwrapException(); try { if (disposing) @@ -164,6 +181,7 @@ protected override void Dispose(bool disposing) public override async ValueTask DisposeAsync() { + await UploadPartAsync(); try { if (_uploadResponses.Count == 0) diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs index dfc52263..336d98ae 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs @@ -91,7 +91,7 @@ await engineService.TrainSegmentPairAsync( public override async Task StartBuild(StartBuildRequest request, ServerCallContext context) { ITranslationEngineService engineService = GetEngineService(request.EngineType); - Models.ParallelCorpus[] corpora = request.Corpora.Select(Map).ToArray(); + SIL.ServiceToolkit.Models.ParallelCorpus[] corpora = request.Corpora.Select(Map).ToArray(); try { await engineService.StartBuildAsync( @@ -269,9 +269,9 @@ private static Translation.V1.Phrase Map(SIL.Machine.Translation.Phrase source) }; } - private static Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) + private static SIL.ServiceToolkit.Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) { - return new Models.ParallelCorpus + return new SIL.ServiceToolkit.Models.ParallelCorpus { Id = source.Id, SourceCorpora = source.SourceCorpora.Select(Map).ToList(), @@ -279,7 +279,7 @@ private static Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) }; } - private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus source) + private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus source) { var trainOnChapters = source.TrainOnChapters.ToDictionary( kvp => kvp.Key, @@ -299,7 +299,7 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou source.PretranslateAll ); - var corpus = new Models.MonolingualCorpus + return new SIL.ServiceToolkit.Models.MonolingualCorpus { Id = source.Id, Language = source.Language, @@ -309,15 +309,14 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null, PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null }; - return corpus; } - private static Models.CorpusFile Map(Translation.V1.CorpusFile source) + private static SIL.ServiceToolkit.Models.CorpusFile Map(Translation.V1.CorpusFile source) { - return new Models.CorpusFile + return new SIL.ServiceToolkit.Models.CorpusFile { Location = source.Location, - Format = (Models.FileFormat)source.Format, + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, TextId = source.TextId }; } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs index b9393e9b..7e1627a6 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs @@ -7,9 +7,9 @@ public class SmtTransferPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService, IDistributedReaderWriterLockFactory lockFactory, - IRepository trainSegmentPairs + IRepository trainSegmentPairs, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService ) : PreprocessBuildJob( platformService, @@ -18,7 +18,7 @@ IRepository trainSegmentPairs logger, buildJobService, sharedFileService, - corpusService + parallelCorpusPreprocessingService ) { private readonly IDistributedReaderWriterLockFactory _lockFactory = lockFactory; diff --git a/src/Machine/src/Serval.Machine.Shared/Usings.cs b/src/Machine/src/Serval.Machine.Shared/Usings.cs index ea49e89d..bb148b80 100644 --- a/src/Machine/src/Serval.Machine.Shared/Usings.cs +++ b/src/Machine/src/Serval.Machine.Shared/Usings.cs @@ -54,7 +54,7 @@ global using SIL.Machine.Translation; global using SIL.Machine.Translation.Thot; global using SIL.Machine.Utils; -global using SIL.Scripture; +global using SIL.ServiceToolkit.Models; global using SIL.ServiceToolkit.Services; global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs index 67145c01..f05a8cb3 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs @@ -301,8 +301,8 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For(), - new LanguageTagService() + new LanguageTagService(), + new ParallelCorpusPreprocessingService(new CorpusService()) ); } if (jobType == typeof(PostprocessBuildJob)) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index d29f2213..13785191 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -76,7 +76,7 @@ public async Task RunAsync_PretranslateAll() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); } [Test] @@ -90,6 +90,23 @@ public async Task RunAsync_PretranslateTextIds() Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); } + [Test] + public async Task RunAsync_PretranslateTextIdsOverlapWithTrainOnTextIds() + { + using TestEnvironment env = new(); + ParallelCorpus corpus1 = TestEnvironment.TextFileCorpus( + pretranslateTextIds: ["textId1"], + trainOnTextIds: ["textId1"] + ); + + await env.RunBuildJobAsync(corpus1); + Assert.Multiple(async () => + { + Assert.That((await env.GetTrainCountAsync()).Source1Count, Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); + }); + } + [Test] public async Task RunAsync_EnableKeyTerms() { @@ -101,10 +118,10 @@ public async Task RunAsync_EnableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); - Assert.That(termCount, Is.EqualTo(5726)); + Assert.That(trgCount, Is.EqualTo(1)); + Assert.That(termCount, Is.EqualTo(166)); }); } @@ -119,9 +136,9 @@ public async Task RunAsync_DisableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); + Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); } @@ -143,7 +160,11 @@ public async Task RunAsync_PretranslateChapters() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That( + await env.GetPretranslateCountAsync(), + Is.EqualTo(4), + JsonSerializer.Serialize(await env.GetPretranslationsAsync()) + ); } [Test] @@ -184,16 +205,12 @@ public async Task RunAsync_MixedSource_Paratext() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(5)); - Assert.That(src2Count, Is.EqualTo(12)); + Assert.That(src1Count, Is.EqualTo(7)); + Assert.That(src2Count, Is.EqualTo(13)); Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That( - await env.GetPretranslateCountAsync(), - Is.EqualTo(13), - (await env.GetPretranslationsAsync())?.ToJsonString() - ); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(15)); } [Test] @@ -207,16 +224,12 @@ public async Task RunAsync_MixedSource_Text() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(3)); - Assert.That(src2Count, Is.EqualTo(2)); + Assert.That(src1Count, Is.EqualTo(1)); + Assert.That(src2Count, Is.EqualTo(4)); Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That( - await env.GetPretranslateCountAsync(), - Is.EqualTo(2), - (await env.GetPretranslationsAsync())?.ToJsonString() - ); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(3)); } [Test] @@ -275,7 +288,7 @@ public async Task RunAsync_RemoveFreestandingEllipses() ); JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.That(pretranslations, Is.Not.Null); - Assert.That(pretranslations.Count, Is.EqualTo(0)); + Assert.That(pretranslations!.Count, Is.EqualTo(1)); } [Test] @@ -346,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -396,6 +409,13 @@ public async Task ParallelCorpusLogic() new() { } } }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } }, }, TargetCorpora = new List() @@ -442,26 +462,29 @@ public async Task ParallelCorpusLogic() } }; await env.RunBuildJobAsync(corpora, useKeyTerms: false); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.Multiple(async () => { + string src = await env.GetSourceExtractAsync(); Assert.That( - await env.GetSourceExtractAsync(), + src, Is.EqualTo( @"Source one, chapter fourteen, verse fifty-five. Segment b. Source one, chapter fourteen, verse fifty-six. -Source one, chapter one, verse one. +Source two, chapter one, verse one. Source two, chapter one, verse two. Source two, chapter one, verse three. -Source two, chapter one, verse four. +Source one, chapter one, verse four. Source two, chapter one, verse five. Source two, chapter one, verse six. -Source two, chapter one, verse seven. Source two, chapter one, verse eight. -Source two, chapter one, verse nine. Source two, chapter one, verse ten. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. Source two, chapter one, verse one. " - ) + ), + src ); + string trg = await env.GetTargetExtractAsync(); Assert.That( - await env.GetTargetExtractAsync(), + trg, Is.EqualTo( @"Target two, chapter fourteen, verse fifty-five. Target two, chapter fourteen, verse fifty-six. @@ -470,12 +493,251 @@ await env.GetTargetExtractAsync(), Target one, chapter one, verse three. Target one, chapter one, verse five and six. -Target one, chapter one, verse seven and eight. -Target one, chapter one, verse nine and ten. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. " - ) + ), + trg + ); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7)); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + }); + } + + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + PretranslateChapters = new() { } + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source two, chapter one, verse one. +", + source ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms_TextIds() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnTextIds = ["MAT", "LEV"], + PretranslateTextIds = ["1CH"] + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnTextIds = ["MAT", "MRK"], + PretranslateTextIds = [] + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnTextIds = ["MAT", "MRK"] + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnTextIds = ["MAT", "MRK", "LEV"] + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source one, chapter two, verse one. +Source one, chapter two, verse two. + +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +Target one, chapter two, verse one. + +Target one, chapter two, verse three. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); }); JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.That(pretranslations, Is.Not.Null); @@ -591,8 +853,8 @@ public TestEnvironment() Id = "src_1", Language = "es", Files = [ParatextFile("pt-source1")], - TrainOnTextIds = [], - PretranslateTextIds = [] + TrainOnTextIds = null, + PretranslateTextIds = null } }, TargetCorpora = new List() @@ -602,7 +864,7 @@ public TestEnvironment() Id = "trg_1", Language = "en", Files = [ParatextFile("pt-target1")], - TrainOnTextIds = [] + TrainOnTextIds = null } } }; @@ -789,12 +1051,9 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType) Substitute.For>(), BuildJobService, SharedFileService, - CorpusService, - new LanguageTagService() - ) - { - Seed = 1234 - }; + new LanguageTagService(), + new ParallelCorpusPreprocessingService(CorpusService) + ); } case TranslationEngineType.SmtTransfer: { @@ -805,13 +1064,10 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType) Substitute.For>(), BuildJobService, SharedFileService, - CorpusService, LockFactory, - TrainSegmentPairs - ) - { - Seed = 1234 - }; + TrainSegmentPairs, + new ParallelCorpusPreprocessingService(CorpusService) + ); } default: throw new InvalidOperationException("Unknown engine type."); diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs index 6b888794..17c89ed4 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs @@ -687,9 +687,9 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For(), _env._lockFactory, - _env.TrainSegmentPairs + _env.TrainSegmentPairs, + new ParallelCorpusPreprocessingService(new CorpusService()) ) { TrainJobRunnerType = _env._trainJobRunnerType diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs index f58cb973..3ccb5537 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs @@ -28,4 +28,6 @@ global using SIL.Machine.Utils; global using SIL.ObjectModel; global using SIL.Scripture; +global using SIL.ServiceToolkit.Models; +global using SIL.ServiceToolkit.Services; global using SIL.WritingSystems; diff --git a/src/Serval/src/Serval.Client/Serval.Client.csproj b/src/Serval/src/Serval.Client/Serval.Client.csproj index 66ed8ebe..13feff18 100644 --- a/src/Serval/src/Serval.Client/Serval.Client.csproj +++ b/src/Serval/src/Serval.Client/Serval.Client.csproj @@ -2,7 +2,7 @@ netstandard2.1 - 1.7.3 + 1.8.0 Client classes for Serval. Serval.Client Serval diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 0974a424..f2607b7b 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 443b2d23..a8bb3a05 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -727,12 +727,12 @@ pretranslateCorpus is not null ); } } - return new V1.ParallelCorpus - { - Id = source.Id, - SourceCorpora = { sourceCorpus }, - TargetCorpora = { targetCorpus } - }; + V1.ParallelCorpus corpus = new() { Id = source.Id }; + if (sourceCorpus.Files.Count > 0) + corpus.SourceCorpora.Add(sourceCorpus); + if (targetCorpus.Files.Count > 0) + corpus.TargetCorpora.Add(targetCorpus); + return corpus; } private V1.ParallelCorpus Map( diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 1bf552fb..516e634e 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -100,8 +100,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: false, - preferExistingText: true + behavior: UpdateUsfmBehavior.PreferExisting ) ?? ""; break; case PretranslationUsfmTextOrigin.PreferPretranslated: @@ -110,8 +109,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: false, - preferExistingText: false + behavior: UpdateUsfmBehavior.PreferNew ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyExisting: @@ -120,8 +118,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, [], // don't put any pretranslations, we only want the existing text. fullName: targetSettings.FullName, - stripAllText: false, - preferExistingText: false + behavior: UpdateUsfmBehavior.PreferNew ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyPretranslated: @@ -130,8 +127,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: true, - preferExistingText: false + behavior: UpdateUsfmBehavior.StripExisting ) ?? ""; break; } @@ -155,16 +151,14 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: true, - preferExistingText: true + behavior: UpdateUsfmBehavior.StripExisting ) ?? ""; case PretranslationUsfmTextOrigin.OnlyExisting: return updater.UpdateUsfm( textId, [], // don't pass the pretranslations, we only want the existing text. fullName: targetSettings.FullName, - stripAllText: true, - preferExistingText: true + behavior: UpdateUsfmBehavior.StripExisting ) ?? ""; } } diff --git a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs index d489cf9a..87f54a13 100644 --- a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs +++ b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs @@ -179,12 +179,22 @@ public async Task AddTextCorpusToEngineAsync( bool pretranslate ) { - List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); + List sourceFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + sourceLanguage, + isTarget: false + ); var targetFileConfig = new List(); if (!pretranslate) { - List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); + List targetFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + targetLanguage, + isTarget: true + ); foreach (var item in targetFiles.Select((file, i) => new { i, file })) { targetFileConfig.Add( @@ -195,20 +205,11 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) - { - // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) - // if pretranslating, we need to upload the source separately - // if different languages, we are not echoing. - } - else + for (int i = 0; i < sourceFiles.Count; i++) { - for (int i = 0; i < sourceFiles.Count; i++) - { - sourceFileConfig.Add( - new TranslationCorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] } - ); - } + sourceFileConfig.Add( + new TranslationCorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] } + ); } TranslationCorpus response = await TranslationEnginesClient.AddCorpusAsync( @@ -240,12 +241,22 @@ public async Task MakeParallelTextCorpus( bool pretranslate ) { - List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); + List sourceFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + sourceLanguage, + isTarget: false + ); var targetFileConfig = new List(); if (!pretranslate) { - List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); + List targetFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + targetLanguage, + isTarget: true + ); foreach (var item in targetFiles.Select((file, i) => new { i, file })) { targetFileConfig.Add(new CorpusFileConfig { FileId = item.file.Id, TextId = filesToAdd[item.i] }); @@ -264,18 +275,9 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) - { - // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) - // if pretranslating, we need to upload the source separately - // if different languages, we are not echoing. - } - else + for (int i = 0; i < sourceFiles.Count; i++) { - for (int i = 0; i < sourceFiles.Count; i++) - { - sourceFileConfig.Add(new CorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] }); - } + sourceFileConfig.Add(new CorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] }); } CorpusConfig sourceCorpusConfig = @@ -315,7 +317,8 @@ bool pretranslate public async Task> UploadFilesAsync( IEnumerable filesToAdd, FileFormat fileFormat, - string language + string language, + bool isTarget ) { string languageFolder = Path.GetFullPath( @@ -335,7 +338,7 @@ string language foreach (string fileName in filesToAdd) { - string fullName = _prefix + language + "_" + fileName; + string fullName = _prefix + language + "_" + fileName + (isTarget ? "_trg" : "_src"); //delete files that have the name name if (filenameToId.Contains(fullName)) diff --git a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs b/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs index b4dc6841..42d70339 100644 --- a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs +++ b/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs @@ -17,7 +17,7 @@ public void GetZipParatextProjectTextUpdater() TestEnvironment env = new(); using ZipParatextProjectTextUpdater updater = env.Service.GetZipParatextProjectTextUpdater("file1.zip"); Assert.That( - updater.UpdateUsfm("MAT", [], preferExistingText: true).ReplaceLineEndings("\n"), + updater.UpdateUsfm("MAT", [], behavior: UpdateUsfmBehavior.PreferExisting).ReplaceLineEndings("\n"), Is.EqualTo( $@"\id MAT - PROJ \h {Canon.BookIdToEnglishName("MAT")} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs index 83fd6a21..14e4ba2a 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs @@ -1,6 +1,4 @@ -using SIL.ServiceToolkit.Services; - -namespace Microsoft.Extensions.DependencyInjection; +namespace Microsoft.Extensions.DependencyInjection; public static class IHealthChecksBuilderExtensions { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs new file mode 100644 index 00000000..d5a6424f --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs @@ -0,0 +1,11 @@ +namespace Microsoft.Extensions.DependencyInjection; + +public static class IServiceCollectionExtensions +{ + public static IServiceCollection AddParallelCorpusPreprocessor(this IServiceCollection services) + { + services.AddSingleton(); + services.AddSingleton(); + return services; + } +} diff --git a/src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs similarity index 84% rename from src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs index a84bf7f6..65e45202 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public enum FileFormat { diff --git a/src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs similarity index 92% rename from src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs index 2b4a1612..c0323727 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public record MonolingualCorpus { diff --git a/src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs similarity index 87% rename from src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs index a28dfc14..83374162 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public record ParallelCorpus { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs new file mode 100644 index 00000000..5b43e1fe --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs @@ -0,0 +1,3 @@ +namespace SIL.ServiceToolkit.Models; + +public record Row(string TextId, IReadOnlyList Refs, string SourceSegment, string TargetSegment, int RowCount); diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index a84edf58..ced38ebc 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -16,6 +16,12 @@ + + + + + + diff --git a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs similarity index 83% rename from src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 17d562ad..793e5046 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class CorpusService : ICorpusService { @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary> chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs new file mode 100644 index 00000000..3f19fccc --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -0,0 +1,9 @@ +namespace SIL.ServiceToolkit.Services; + +public interface ICorpusService +{ + IEnumerable CreateTextCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora + ); +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs new file mode 100644 index 00000000..1be70d5e --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs @@ -0,0 +1,13 @@ +using Nito.AsyncEx; + +namespace SIL.ServiceToolkit.Utils; + +public interface IParallelCorpusPreprocessingService +{ + Task Preprocess( + IReadOnlyList corpora, + Func train, + Func pretranslate, + bool useKeyTerms = false + ); +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs new file mode 100644 index 00000000..71769985 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -0,0 +1,246 @@ +namespace SIL.ServiceToolkit.Services; + +public class ParallelCorpusPreprocessingService : IParallelCorpusPreprocessingService +{ + private readonly ICorpusService _corpusService; + private int _seed = 1234; + private Random _random; + + public ParallelCorpusPreprocessingService(ICorpusService corpusService) + { + _corpusService = corpusService; + _random = new Random(_seed); + } + + internal int Seed + { + get => _seed; + set + { + if (_seed != value) + { + _seed = value; + _random = new Random(_seed); + } + } + } + + public async Task Preprocess( + IReadOnlyList corpora, + Func train, + Func pretranslate, + bool useKeyTerms = false + ) + { + foreach (ParallelCorpus corpus in corpora) + { + (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus + .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) + .ToArray(); + + if (sourceCorpora.Length == 0) + continue; + + ITextCorpus[] sourceTrainingCorpora = sourceCorpora + .Select(sc => FilterTrainingCorpora(sc.Corpus, sc.TextCorpus)) + .ToArray(); + + ITextCorpus[] sourcePretranslateCorpora = sourceCorpora + .Select(sc => FilterPretranslateCorpora(sc.Corpus, sc.TextCorpus)) + .ToArray(); + + (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus + .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) + .ToArray(); + + ITextCorpus[] targetTrainingCorpora = targetCorpora + .Select(tc => FilterTrainingCorpora(tc.Corpus, tc.TextCorpus)) + .ToArray(); + + ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); + if (sourceTrainingCorpus.IsScripture()) + { + sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow); + } + + ITextCorpus targetCorpus = targetTrainingCorpora.ChooseFirst(); + + ITextCorpus targetTrainingCorpus = targetCorpus; + if (targetTrainingCorpus.IsScripture()) + { + targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow); + } + + ParallelTextRow[] trainingRows = sourceTrainingCorpus + .AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true) + .ToArray(); + + foreach (Row row in CollapseRanges(trainingRows)) + { + await train(row); + } + + if (useKeyTerms) + { + ITextCorpus[]? sourceTermCorpora = _corpusService + .CreateTermCorpora( + sourceCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) + .ToArray(); + ITextCorpus[]? targetTermCorpora = _corpusService + .CreateTermCorpora( + targetCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) + .ToArray(); + if (sourceTermCorpora is not null && targetTermCorpora is not null) + { + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); + foreach (ParallelTextRow row in parallelKeyTermsCorpus) + { + await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); + } + } + } + ITextCorpus sourcePretranslateCorpus = sourcePretranslateCorpora.ChooseFirst(); + + IParallelTextCorpus pretranslateCorpus = sourcePretranslateCorpus.AlignRows( + targetCorpus, + allSourceRows: true + ); + + foreach (Row row in CollapseRanges(pretranslateCorpus.ToArray())) + { + await pretranslate(row, corpus); + } + } + } + + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( + MonolingualCorpus mc, + ITextCorpus tc + ) + { + Dictionary>? chapters = mc.TrainOnChapters; + if (chapters is null && mc.TrainOnTextIds is not null) + { + chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); + } + chapters ??= tc.Texts.Select(t => (t.Id, new HashSet() { })).ToDictionary(); + return mc.Files.Select(f => (f, chapters)); + } + + private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.PretranslateTextIds is not null) + { + return textCorpus.FilterTexts(corpus.PretranslateTextIds); + } + if (corpus.PretranslateChapters is not null) + { + return textCorpus + .FilterTexts(corpus.PretranslateChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.PretranslateChapters)); + } + return textCorpus; + } + + private static ITextCorpus FilterTrainingCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.TrainOnTextIds is not null) + { + return textCorpus.FilterTexts(corpus.TrainOnTextIds); + } + if (corpus.TrainOnChapters is not null) + { + return textCorpus + .FilterTexts(corpus.TrainOnChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.TrainOnChapters)); + } + return textCorpus; + } + + private static IEnumerable CollapseRanges(ParallelTextRow[] rows) + { + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + List refs = []; + string textId = ""; + bool hasUnfinishedRange = false; + + foreach (ParallelTextRow row in rows) + { + if ( + hasUnfinishedRange + && (!row.IsTargetInRange || row.IsTargetRangeStart) + && (!row.IsSourceInRange || row.IsSourceRangeStart) + ) + { + yield return new Row(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + refs.Clear(); + + hasUnfinishedRange = false; + } + + textId = row.TextId; + refs.AddRange(row.TargetRefs); + if (row.SourceText.Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(row.SourceText); + } + if (row.TargetText.Length > 0) + { + if (trgSegBuffer.Length > 0) + trgSegBuffer.Append(' '); + trgSegBuffer.Append(row.TargetText); + } + + if (row.IsTargetInRange || row.IsSourceInRange) + { + hasUnfinishedRange = true; + continue; + } + + yield return new Row(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + refs.Clear(); + } + if (hasUnfinishedRange) + { + yield return new Row(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + } + } + + private static bool IsScriptureRow(TextRow parallelTextRow) + { + return parallelTextRow.Ref is ScriptureRef sr && sr.IsVerse; + } + + private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) + { + return selection.TryGetValue(sr.Book, out HashSet? chapters) + && chapters != null + && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); + } + + private static TextRow CleanSegment(TextRow row) + { + if (row.Text == "...") + row.Segment = []; + return row; + } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs index 0d9630d6..a5800d9f 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs @@ -1,4 +1,5 @@ global using System.Diagnostics.CodeAnalysis; +global using System.Text; global using System.Text.Json.Nodes; global using System.Text.RegularExpressions; global using Grpc.Core; @@ -9,4 +10,8 @@ global using Microsoft.Extensions.Hosting; global using Microsoft.Extensions.Logging; global using Microsoft.Extensions.Options; +global using SIL.Machine.Corpora; +global using SIL.ServiceToolkit.Models; +global using SIL.ServiceToolkit.Services; +global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj new file mode 100644 index 00000000..0b5ceff0 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj @@ -0,0 +1,33 @@ + + + + net8.0 + enable + enable + SIL.ServiceToolkit + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs new file mode 100644 index 00000000..033467f4 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs @@ -0,0 +1,98 @@ +namespace SIL.ServiceToolkit.Services; + +[TestFixture] +public class ParallelCorpusPreprocessingServiceTests +{ + private static readonly string TestDataPath = Path.Combine( + AppContext.BaseDirectory, + "..", + "..", + "..", + "Services", + "data" + ); + + [Test] + public async Task TestParallelCorpusPreprocessor() + { + ParallelCorpusPreprocessingService processor = new(new CorpusService()); + List corpora = + [ + new() + { + Id = "corpus1", + SourceCorpora = + [ + new() + { + Id = "source-corpus1", + Language = "en", + Files = + [ + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source1.txt") + } + ] + }, + new() + { + Id = "source-corpus2", + Language = "en", + Files = + [ + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source2.txt") + } + ] + } + ], + TargetCorpora = + [ + new() + { + Id = "target-corpus1", + Language = "en", + Files = + [ + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "target1.txt") + } + ] + } + ] + } + ]; + int trainCount = 0; + int pretranslateCount = 0; + await processor.Preprocess( + corpora, + row => + { + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + trainCount++; + return Task.CompletedTask; + }, + (row, _) => + { + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) + pretranslateCount++; + return Task.CompletedTask; + }, + false + ); + Assert.Multiple(() => + { + Assert.That(trainCount, Is.EqualTo(2)); + Assert.That(pretranslateCount, Is.EqualTo(3)); + }); + } +} diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt new file mode 100644 index 00000000..2aeb971c --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt @@ -0,0 +1,7 @@ +Source one, Line 1 +Source one, Line 2 + +Source one, Line 4 + +Source one, Line 6 + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt new file mode 100644 index 00000000..7f4a0669 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt @@ -0,0 +1,7 @@ +Source two, Line 1 +Source two, Line 2 + +Source two, Line 4 +Source two, Line 5 +Source two, Line 6 + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt new file mode 100644 index 00000000..816e9435 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt @@ -0,0 +1,7 @@ +Target one, Line 1 + + +Target one, Line 4 + + +Target one, Line 7 diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs new file mode 100644 index 00000000..e1c24c5f --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs @@ -0,0 +1,2 @@ +global using NUnit.Framework; +global using SIL.ServiceToolkit.Models;