diff --git a/Serval.sln b/Serval.sln
index edd3f075..12c0aaaf 100644
--- a/Serval.sln
+++ b/Serval.sln
@@ -86,6 +86,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -180,6 +184,10 @@ Global
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -215,6 +223,8 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
+ {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
+ {C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370}
diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml
index 7106e030..5d7d1ecf 100644
--- a/deploy/qa-ext-values.yaml
+++ b/deploy/qa-ext-values.yaml
@@ -1,6 +1,6 @@
externalHost: qa.serval-api.org
environment: Production
-deploymentVersion: '1.7.QA7'
+deploymentVersion: '1.8.QA1'
alertEmail: ext-qa-serval-alerts@languagetechnology.org
emailsToAlert: john_lambert@sil.org
enableTls: true
@@ -8,8 +8,8 @@ namespace: serval
auth0Domain: dev-sillsdev.auth0.com
lokiTenent: serval-tenant
lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
-servalImage: ghcr.io/sillsdev/serval:1.7.7
-ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.7.2
+servalImage: ghcr.io/sillsdev/serval:1.8.1
+ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.8.1
ClearMLQueue: production
MongoConnectionPrefix: qa_
SharedFileLocation: s3://silnlp/ext-qa/
diff --git a/samples/ApiExample/ApiExample.csproj b/samples/ApiExample/ApiExample.csproj
index 9d56d539..9a87fdcc 100644
--- a/samples/ApiExample/ApiExample.csproj
+++ b/samples/ApiExample/ApiExample.csproj
@@ -22,7 +22,7 @@
-
+
diff --git a/src/Echo/src/EchoTranslationEngine/Program.cs b/src/Echo/src/EchoTranslationEngine/Program.cs
index a679dfb5..352c536a 100644
--- a/src/Echo/src/EchoTranslationEngine/Program.cs
+++ b/src/Echo/src/EchoTranslationEngine/Program.cs
@@ -10,6 +10,8 @@
builder.Services.AddHostedService();
builder.Services.AddSingleton();
+builder.Services.AddParallelCorpusPreprocessor();
+
builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy());
builder.Services.Configure(builder.Configuration.GetSection("Bugsnag"));
diff --git a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
index 254fe0af..720a0126 100644
--- a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
+++ b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
@@ -1,10 +1,16 @@
namespace EchoTranslationEngine;
-public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase
+public class TranslationEngineServiceV1(
+ BackgroundTaskQueue taskQueue,
+ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
+) : TranslationEngineApi.TranslationEngineApiBase
{
private static readonly Empty Empty = new();
private readonly BackgroundTaskQueue _taskQueue = taskQueue;
+ private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
+ parallelCorpusPreprocessingService;
+
public override Task Create(CreateRequest request, ServerCallContext context)
{
if (request.SourceLanguage != request.TargetLanguage)
@@ -75,159 +81,35 @@ await client.BuildStartedAsync(
try
{
+ List pretranslationsRequests = [];
+ await _parallelCorpusPreprocessingService.Preprocess(
+ request.Corpora.Select(Map).ToList(),
+ row => Task.CompletedTask,
+ (row, corpus) =>
+ {
+ pretranslationsRequests.Add(
+ new InsertPretranslationsRequest
+ {
+ EngineId = request.EngineId,
+ CorpusId = corpus.Id,
+ TextId = row.TextId,
+ Refs = { row.Refs.Select(r => r.ToString()) },
+ Translation = row.SourceSegment
+ }
+ );
+ return Task.CompletedTask;
+ },
+ false
+ );
using (
AsyncClientStreamingCall call =
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
- foreach (ParallelCorpus corpus in request.Corpora)
+ foreach (InsertPretranslationsRequest insertRequest in pretranslationsRequests) // must not be named 'request': that name is already in scope here (see request.Corpora / request.EngineId above)
{
- var sourceFiles = corpus
- .SourceCorpora.SelectMany(sc =>
- sc.Files.Where(f =>
- (
- sc.PretranslateAll
- || sc.PretranslateTextIds is null
- || sc.PretranslateTextIds.Contains(f.TextId)
- )
- && f.Format == FileFormat.Text
- )
- )
- .ToDictionary(f => f.TextId, f => f.Location);
- var targetFiles = corpus
- .TargetCorpora.SelectMany(tc =>
- tc.Files.Where(f =>
- (
- tc.PretranslateAll
- || tc.PretranslateTextIds is null
- || tc.PretranslateTextIds.Contains(f.TextId)
- )
- && f.Format == FileFormat.Text
- )
- )
- .ToDictionary(f => f.TextId, f => f.Location);
-
- foreach (KeyValuePair sourceFile in sourceFiles)
- {
- string[] sourceLines = await File.ReadAllLinesAsync(
- sourceFile.Value,
- cancellationToken
- );
-
- if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath))
- {
- string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken);
- bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
- if (!isTabSeparated)
- {
- int lineNum = 1;
- foreach (
- (string sourceLine, string targetLine) in sourceLines
- .Select(l => l.Trim())
- .Zip(targetLines.Select(l => l.Trim()))
- )
- {
- if (sourceLine.Length > 0 && targetLine.Length == 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{lineNum}" },
- Translation = sourceLine
- },
- cancellationToken
- );
- }
- lineNum++;
- }
- }
- else
- {
- var sourceLinesDict = sourceLines.ToDictionary(
- l => l.Split('\t')[0].Trim(),
- l => l.Split('\t')[1].Trim()
- );
- var targetLinesDict = targetLines.ToDictionary(
- l => l.Split('\t')[0].Trim(),
- l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty
- );
- foreach (KeyValuePair targetLineKVPair in targetLinesDict)
- {
- string? sourceLine = null;
- sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine);
- sourceLine ??= string.Empty;
- string? targetLine = targetLineKVPair.Value;
- if (sourceLine.Length > 0 && targetLine.Length == 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" },
- Translation = sourceLine
- },
- cancellationToken
- );
- }
- }
- }
- }
- else
- {
- bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
- if (!isTabSeparated)
- {
- int lineNum = 1;
- foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
- {
- if (sourceLine.Length > 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{lineNum}" },
- Translation = sourceLine
- },
- cancellationToken
- );
- }
- lineNum++;
- }
- }
- else
- {
- foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
- {
- if (sourceLine.Length > 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" },
- Translation = sourceLine.Contains('\t')
- ? sourceLine.Split('\t')[1].Trim()
- : string.Empty
- },
- cancellationToken
- );
- }
- }
- }
- }
- }
+ await call.RequestStream.WriteAsync(insertRequest, cancellationToken); // forward each buffered request on the InsertPretranslations request stream
}
-
await call.RequestStream.CompleteAsync();
await call;
}
@@ -325,4 +207,78 @@ ServerCallContext context
new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, }
);
}
+
+ private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) // Converts a request ParallelCorpus into the SIL.ServiceToolkit model of the same name.
+ {
+ return new SIL.ServiceToolkit.Models.ParallelCorpus
+ {
+ Id = source.Id, // the id is carried over unchanged
+ SourceCorpora = source.SourceCorpora.Select(Map).ToList(), // each monolingual corpus goes through the MonolingualCorpus overload of Map
+ TargetCorpora = source.TargetCorpora.Select(Map).ToList()
+ };
+ }
+
+ private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) // Converts a monolingual corpus into the SIL.ServiceToolkit model, populating at most one training filter and at most one pretranslation filter.
+ {
+ var trainOnChapters = source.TrainOnChapters.ToDictionary( // copy into a dictionary of chapter sets, keeping the source keys
+ kvp => kvp.Key,
+ kvp => kvp.Value.Chapters.ToHashSet()
+ );
+ var trainOnTextIds = source.TrainOnTextIds.ToHashSet();
+ FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds, source.TrainOnAll); // TrainOnAll is passed as noFilter
+
+ var pretranslateChapters = source.PretranslateChapters.ToDictionary( // same conversion as above, for the pretranslation side
+ kvp => kvp.Key,
+ kvp => kvp.Value.Chapters.ToHashSet()
+ );
+ var pretranslateTextIds = source.PretranslateTextIds.ToHashSet();
+ FilterChoice pretranslateFilter = GetFilterChoice( // PretranslateAll is passed as noFilter
+ pretranslateChapters,
+ pretranslateTextIds,
+ source.PretranslateAll
+ );
+
+ return new SIL.ServiceToolkit.Models.MonolingualCorpus
+ {
+ Id = source.Id,
+ Language = source.Language,
+ Files = source.Files.Select(Map).ToList(), // each file goes through the CorpusFile overload of Map
+ TrainOnChapters = trainingFilter == FilterChoice.Chapters ? trainOnChapters : null, // only the selected filter is set; the other stays null
+ TrainOnTextIds = trainingFilter == FilterChoice.TextIds ? trainOnTextIds : null, // both stay null when the choice is FilterChoice.None
+ PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null,
+ PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null
+ };
+ }
+
+ private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) // Converts a corpus file into the SIL.ServiceToolkit model.
+ {
+ return new SIL.ServiceToolkit.Models.CorpusFile
+ {
+ Location = source.Location,
+ Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, // NOTE(review): direct enum cast; assumes both FileFormat enums use the same numeric values - confirm
+ TextId = source.TextId
+ };
+ }
+
+ private enum FilterChoice // Which of the two mutually exclusive corpus filters GetFilterChoice selected.
+ {
+ Chapters, // use the chapters dictionary
+ TextIds, // use the text id set
+ None // use neither filter
+ }
+
+ private static FilterChoice GetFilterChoice( // Decides which filter applies given the chapters dictionary, the text id set and the "no filter" flag.
+ IReadOnlyDictionary> chapters,
+ HashSet textIds,
+ bool noFilter
+ )
+ {
+ // Only one of textIds or Scripture Range (presumably the chapters dictionary) is used at a time.
+ // textIds may be an empty set, so prefer TextIds when both are empty (this applies to both scripture and text).
+ if (noFilter || (chapters is null && textIds is null)) // NOTE(review): the Map caller visible in this diff always passes non-null collections, so the null test looks unreachable from it
+ return FilterChoice.None;
+ if (chapters is null || chapters.Count == 0) // no chapter entries: fall back to text ids, even if that set is empty too
+ return FilterChoice.TextIds;
+ return FilterChoice.Chapters; // at least one chapter entry is present
+ }
}
diff --git a/src/Echo/src/EchoTranslationEngine/Usings.cs b/src/Echo/src/EchoTranslationEngine/Usings.cs
index b7f3ba2d..0404305b 100644
--- a/src/Echo/src/EchoTranslationEngine/Usings.cs
+++ b/src/Echo/src/EchoTranslationEngine/Usings.cs
@@ -5,3 +5,4 @@
global using Grpc.Core;
global using Microsoft.Extensions.Diagnostics.HealthChecks;
global using Serval.Translation.V1;
+global using SIL.ServiceToolkit.Utils;
diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs
index c00fd45e..67b8ef3d 100644
--- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs
@@ -50,6 +50,12 @@ public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, I
return builder;
}
+ public static IMachineBuilder AddServiceToolkitServices(this IMachineBuilder builder) // Registers the parallel corpus preprocessor on builder.Services and returns the same builder for chaining.
+ {
+ builder.Services.AddParallelCorpusPreprocessor(); // NOTE(review): AddMachine in IServiceCollectionExtensions also calls AddParallelCorpusPreprocessor - confirm the repeated registration is harmless
+ return builder;
+ }
+
public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder)
{
return builder.AddThotSmtModel(builder.Configuration.GetSection(ThotSmtModelOptions.Key));
diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs
index c72302b9..8fcaced4 100644
--- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs
@@ -15,11 +15,11 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
services.AddTransient();
services.AddScoped();
- services.AddSingleton();
services.AddStartupTask(
(sp, cancellationToken) =>
sp.GetRequiredService().InitAsync(cancellationToken)
);
+ services.AddParallelCorpusPreprocessor();
var builder = new MachineBuilder(services, configuration);
builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key));
diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
index b9985198..f9756293 100644
--- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
+++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
@@ -36,9 +36,9 @@
-
-
-
+
+
+
diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs b/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs
deleted file mode 100644
index bbcc9de3..00000000
--- a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs
+++ /dev/null
@@ -1,7 +0,0 @@
-namespace Serval.Machine.Shared.Services;
-
-public interface ICorpusService
-{
- IEnumerable CreateTextCorpora(IReadOnlyList files);
- IEnumerable CreateTermCorpora(IReadOnlyList files);
-}
diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs
index 3c46a34e..2e79d09a 100644
--- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs
@@ -7,8 +7,8 @@ public class NmtPreprocessBuildJob(
ILogger logger,
IBuildJobService buildJobService,
ISharedFileService sharedFileService,
- ICorpusService corpusService,
- ILanguageTagService languageTagService
+ ILanguageTagService languageTagService,
+ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
)
: PreprocessBuildJob(
platformService,
@@ -17,7 +17,7 @@ ILanguageTagService languageTagService
logger,
buildJobService,
sharedFileService,
- corpusService
+ parallelCorpusPreprocessingService
)
{
private readonly ILanguageTagService _languageTagService = languageTagService;
diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
index 6d8506a0..de630b0c 100644
--- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
@@ -1,49 +1,35 @@
namespace Serval.Machine.Shared.Services;
-public class PreprocessBuildJob : HangfireBuildJob>
+public class PreprocessBuildJob(
+ IPlatformService platformService,
+ IRepository engines,
+ IDataAccessContext dataAccessContext,
+ ILogger logger,
+ IBuildJobService buildJobService,
+ ISharedFileService sharedFileService,
+ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
+)
+ : HangfireBuildJob>(
+ platformService,
+ engines,
+ dataAccessContext,
+ buildJobService,
+ logger
+ )
{
private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true };
internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML;
- private readonly ISharedFileService _sharedFileService;
- private readonly ICorpusService _corpusService;
- private int _seed = 1234;
- private Random _random;
-
- public PreprocessBuildJob(
- IPlatformService platformService,
- IRepository engines,
- IDataAccessContext dataAccessContext,
- ILogger logger,
- IBuildJobService buildJobService,
- ISharedFileService sharedFileService,
- ICorpusService corpusService
- )
- : base(platformService, engines, dataAccessContext, buildJobService, logger)
- {
- _sharedFileService = sharedFileService;
- _corpusService = corpusService;
- _random = new Random(_seed);
- }
+ private readonly ISharedFileService _sharedFileService = sharedFileService;
- internal int Seed
- {
- get => _seed;
- set
- {
- if (_seed != value)
- {
- _seed = value;
- _random = new Random(_seed);
- }
- }
- }
+ private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
+ parallelCorpusPreprocessingService;
protected override async Task DoWorkAsync(
string engineId,
string buildId,
- IReadOnlyList data,
+ IReadOnlyList data,
string? buildOptions,
CancellationToken cancellationToken
)
@@ -114,11 +100,11 @@ CancellationToken cancellationToken
JsonObject? buildOptionsObject = null;
if (buildOptions is not null)
buildOptionsObject = JsonSerializer.Deserialize(buildOptions);
+
await using StreamWriter sourceTrainWriter =
new(await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.src.txt", cancellationToken));
await using StreamWriter targetTrainWriter =
new(await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken));
-
await using Stream pretranslateStream = await _sharedFileService.OpenWriteAsync(
$"builds/{buildId}/pretranslate.src.json",
cancellationToken
@@ -128,166 +114,44 @@ CancellationToken cancellationToken
int trainCount = 0;
int pretranslateCount = 0;
pretranslateWriter.WriteStartArray();
- foreach (ParallelCorpus corpus in corpora)
- {
- (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus
- .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
- .ToArray();
- ITextCorpus[] sourceTrainingCorpora = sourceCorpora
- .Select(sc =>
- {
- ITextCorpus textCorpus = sc.TextCorpus;
- if (sc.Corpus.TrainOnTextIds is not null)
- textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds);
- return textCorpus.Where(row =>
- row.Ref is not ScriptureRef sr
- || sc.Corpus.TrainOnChapters is null
- || IsInChapters(sr, sc.Corpus.TrainOnChapters)
- );
- })
- .ToArray();
- ITextCorpus? sourcePretranslateCorpus = sourceCorpora
- .Select(sc =>
- {
- ITextCorpus textCorpus = sc.TextCorpus;
- if (sc.Corpus.PretranslateTextIds is not null)
- {
- textCorpus = textCorpus.FilterTexts(
- sc.Corpus.PretranslateTextIds.Except(sc.Corpus.TrainOnTextIds ?? new())
- );
- }
- return textCorpus.Where(row =>
- row.Ref is not ScriptureRef sr
- || sc.Corpus.PretranslateChapters is null
- || (
- IsInChapters(sr, sc.Corpus.PretranslateChapters)
- && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new())
- )
- );
- })
- .ToArray()
- .FirstOrDefault();
-
- (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus
- .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
- .ToArray();
- ITextCorpus[] targetTrainingCorpora = targetCorpora
- .Select(tc =>
- {
- ITextCorpus textCorpus = tc.TextCorpus;
- if (tc.Corpus.TrainOnTextIds is not null)
- textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds);
- return textCorpus.Where(row =>
- row.Ref is not ScriptureRef sr
- || tc.Corpus.TrainOnChapters is null
- || IsInChapters(sr, tc.Corpus.TrainOnChapters)
- );
- })
- .ToArray();
-
- if (sourceCorpora.Length == 0)
- continue;
-
- int skipCount = 0;
- foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora))
+ await _parallelCorpusPreprocessingService.Preprocess(
+ corpora,
+ async row =>
{
- if (skipCount > 0)
- {
- skipCount--;
- continue;
- }
-
- Row[] trainRows = rows.Where(r => r is not null).Cast().ToArray();
- if (trainRows.Length > 0)
+ if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0)
{
- Row row = trainRows[0];
- if (rows.Length > 1)
- {
- Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray();
- Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray();
- if (targetNonEmptyRows.Length > 0)
- nonEmptyRows = targetNonEmptyRows;
- if (nonEmptyRows.Length > 0)
- {
- nonEmptyRows = nonEmptyRows
- .GroupBy(r => r.SourceSegment)
- .Select(group => group.First())
- .ToArray();
- {
- nonEmptyRows = nonEmptyRows
- .GroupBy(r => r.SourceSegment)
- .Select(group => group.First())
- .ToArray();
- row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
- }
- }
- }
-
await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
- skipCount = row.RowCount - 1;
- if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
- trainCount++;
- }
- }
-
- if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
- {
- ITextCorpus? sourceTermCorpus = _corpusService
- .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList())
- .FirstOrDefault();
- ITextCorpus? targetTermCorpus = _corpusService
- .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList())
- .FirstOrDefault();
- if (sourceTermCorpus is not null && targetTermCorpus is not null)
- {
- IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus);
- foreach (ParallelTextRow row in parallelKeyTermsCorpus)
- {
- await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
- await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
- trainCount++;
- }
}
- }
- void WriteRow(Utf8JsonWriter writer, string textId, IReadOnlyList