Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ public class BuildJobOptions

public IList<ClearMLBuildQueue> ClearML { get; set; } = new List<ClearMLBuildQueue>();
public bool PreserveBuildFiles { get; set; } = false;
/// <summary>
/// Upper bound on the number of build warnings that are reported. When the NMT preprocess build job
/// collects more warnings than this, it keeps only the first <c>MaxWarnings</c> entries and prepends a
/// single notice stating how many warnings there were in total. Defaults to 1000.
/// </summary>
public int MaxWarnings { get; set; } = 1000;
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ public class NmtPreprocessBuildJob(
IBuildJobService<TranslationEngine> buildJobService,
ISharedFileService sharedFileService,
ILanguageTagService languageTagService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IOptionsMonitor<BuildJobOptions> options
)
: TranslationPreprocessBuildJob(
platformService,
Expand All @@ -17,7 +18,8 @@ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
logger,
buildJobService,
sharedFileService,
parallelCorpusPreprocessingService
parallelCorpusPreprocessingService,
options
)
{
private readonly ILanguageTagService _languageTagService = languageTagService;
Expand Down Expand Up @@ -87,6 +89,14 @@ CancellationToken cancellationToken
corpora
);

int maxWarnings = BuildJobOptions.MaxWarnings;
if (warnings.Count > maxWarnings)
{
string tooManyWarningsWarning =
$"There were {warnings.Count} warnings. Only the first {maxWarnings} are shown.";
warnings = [tooManyWarningsWarning, .. warnings.Take(maxWarnings)];
}

// Log summary of build data
JsonObject buildPreprocessSummary =
new()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ public abstract class PreprocessBuildJob<TEngine>(
ILogger<PreprocessBuildJob<TEngine>> logger,
IBuildJobService<TEngine> buildJobService,
ISharedFileService sharedFileService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IOptionsMonitor<BuildJobOptions> options
)
: HangfireBuildJob<TEngine, IReadOnlyList<ParallelCorpus>>(
platformService,
Expand All @@ -24,7 +25,7 @@ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
new() { Indented = true, Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping };

internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML;

protected readonly BuildJobOptions BuildJobOptions = options.CurrentValue;
protected readonly ISharedFileService SharedFileService = sharedFileService;
protected readonly IParallelCorpusPreprocessingService ParallelCorpusPreprocessingService =
parallelCorpusPreprocessingService;
Expand Down Expand Up @@ -148,7 +149,7 @@ IReadOnlyList<ParallelCorpus> corpora
foreach (UsfmVersificationError error in errors)
{
warnings.Add(
$"USFM does not match project versification for parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId}: Expected verse {error.ExpectedVerseRef}, Actual verse {error.ActualVerseRef}, Mismatch type {error.Type}"
$"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})"
);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ public class SmtTransferPreprocessBuildJob(
ISharedFileService sharedFileService,
IDistributedReaderWriterLockFactory lockFactory,
IRepository<TrainSegmentPair> trainSegmentPairs,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IOptionsMonitor<BuildJobOptions> options
)
: TranslationPreprocessBuildJob(
platformService,
Expand All @@ -18,7 +19,8 @@ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
logger,
buildJobService,
sharedFileService,
parallelCorpusPreprocessingService
parallelCorpusPreprocessingService,
options
)
{
private readonly IDistributedReaderWriterLockFactory _lockFactory = lockFactory;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ public class TranslationPreprocessBuildJob(
ILogger<PreprocessBuildJob<TranslationEngine>> logger,
IBuildJobService<TranslationEngine> buildJobService,
ISharedFileService sharedFileService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IOptionsMonitor<BuildJobOptions> options
)
: PreprocessBuildJob<TranslationEngine>(
platformService,
Expand All @@ -16,7 +17,8 @@ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
logger,
buildJobService,
sharedFileService,
parallelCorpusPreprocessingService
parallelCorpusPreprocessingService,
options
)
{
protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ public class WordAlignmentPreprocessBuildJob(
ILogger<WordAlignmentPreprocessBuildJob> logger,
IBuildJobService<WordAlignmentEngine> buildJobService,
ISharedFileService sharedFileService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IOptionsMonitor<BuildJobOptions> options
)
: PreprocessBuildJob<WordAlignmentEngine>(
platformService,
Expand All @@ -16,7 +17,8 @@ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
logger,
buildJobService,
sharedFileService,
parallelCorpusPreprocessingService
parallelCorpusPreprocessingService,
options
)
{
protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ public TestEnvironment()
.When(x => x.StopTaskAsync("job1", Arg.Any<CancellationToken>()))
.Do(_ => _cancellationTokenSource.Cancel());
SharedFileService = new SharedFileService(Substitute.For<ILoggerFactory>());
var buildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
buildJobOptions.CurrentValue.Returns(
BuildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
BuildJobOptions.CurrentValue.Returns(
new BuildJobOptions
{
ClearML =
Expand Down Expand Up @@ -181,7 +181,7 @@ public TestEnvironment()
Engines
)
],
buildJobOptions
BuildJobOptions
)
],
Engines
Expand All @@ -193,7 +193,7 @@ public TestEnvironment()
ClearMLService,
SharedFileService,
clearMLOptions,
buildJobOptions,
BuildJobOptions,
Substitute.For<ILogger<ClearMLMonitorService>>()
);
_jobServer = CreateJobServer();
Expand All @@ -207,6 +207,7 @@ public TestEnvironment()
public IClearMLService ClearMLService { get; }
public ISharedFileService SharedFileService { get; }
public IBuildJobService<TranslationEngine> BuildJobService { get; }
public IOptionsMonitor<BuildJobOptions> BuildJobOptions { get; }

public void PersistModel()
{
Expand Down Expand Up @@ -329,7 +330,8 @@ public override object ActivateJob(Type jobType)
_env.BuildJobService,
_env.SharedFileService,
new LanguageTagService(),
new ParallelCorpusPreprocessingService(new TextCorpusService())
new ParallelCorpusPreprocessingService(new TextCorpusService()),
_env.BuildJobOptions
);
}
if (jobType == typeof(TranslationPostprocessBuildJob))
Expand All @@ -343,7 +345,7 @@ public override object ActivateJob(Type jobType)
_env.BuildJobService,
Substitute.For<ILogger<TranslationPostprocessBuildJob>>(),
_env.SharedFileService,
buildJobOptions
_env.BuildJobOptions
);
}
return base.ActivateJob(jobType);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ public async Task RunAsync_BuildWarnings()

await env.RunBuildJobAsync(corpus1, useKeyTerms: true);
Assert.That(env.ExecutionData.Warnings, Has.Count.EqualTo(8));

env.BuildJobOptions.CurrentValue.Returns(new BuildJobOptions() { MaxWarnings = 2 });
await env.RunBuildJobAsync(corpus1, useKeyTerms: true);
// Two warnings after truncation + one warning mentioning that warnings were truncated
Assert.That(env.ExecutionData.Warnings, Has.Count.EqualTo(3));
}

[Test]
Expand Down Expand Up @@ -474,6 +479,11 @@ public async Task ParallelCorpusAsync()
pretranslations[2]!["translation"]!.ToString(),
Is.EqualTo("Source one, chapter twelve, verse one.")
);
Assert.That(
env.ExecutionData.Warnings,
Has.Count.EqualTo(16),
JsonSerializer.Serialize(env.ExecutionData.Warnings)
);
});
}

Expand Down Expand Up @@ -794,7 +804,8 @@ public PreprocessBuildJob<TranslationEngine> GetBuildJob(EngineType engineType)
BuildJobService,
SharedFileService,
new LanguageTagService(),
new ParallelCorpusPreprocessingService(TextCorpusService)
new ParallelCorpusPreprocessingService(TextCorpusService),
BuildJobOptions
);
}
case EngineType.SmtTransfer:
Expand All @@ -808,7 +819,8 @@ public PreprocessBuildJob<TranslationEngine> GetBuildJob(EngineType engineType)
SharedFileService,
LockFactory,
TrainSegmentPairs,
new ParallelCorpusPreprocessingService(TextCorpusService)
new ParallelCorpusPreprocessingService(TextCorpusService),
BuildJobOptions
);
}
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,8 @@ public TestEnvironment(BuildJobRunnerType trainJobRunnerType = BuildJobRunnerTyp
SharedFileService = new SharedFileService(Substitute.For<ILoggerFactory>());
var clearMLOptions = Substitute.For<IOptionsMonitor<ClearMLOptions>>();
clearMLOptions.CurrentValue.Returns(new ClearMLOptions());
var buildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
buildJobOptions.CurrentValue.Returns(
BuildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
BuildJobOptions.CurrentValue.Returns(
new BuildJobOptions
{
ClearML =
Expand Down Expand Up @@ -335,7 +335,7 @@ public TestEnvironment(BuildJobRunnerType trainJobRunnerType = BuildJobRunnerTyp
ClearMLService,
SharedFileService,
clearMLOptions,
buildJobOptions,
BuildJobOptions,
Substitute.For<ILogger<ClearMLMonitorService>>()
);
BuildJobService = new BuildJobService<TranslationEngine>(
Expand All @@ -344,7 +344,7 @@ public TestEnvironment(BuildJobRunnerType trainJobRunnerType = BuildJobRunnerTyp
new ClearMLBuildJobRunner(
ClearMLService,
[new SmtTransferClearMLBuildJobFactory(SharedFileService, Engines)],
buildJobOptions
BuildJobOptions
)
],
Engines
Expand All @@ -365,6 +365,7 @@ [new SmtTransferClearMLBuildJobFactory(SharedFileService, Engines)],
public ITruecaser Truecaser { get; }
public ITrainer TruecaserTrainer { get; }
public IPlatformService PlatformService { get; }
public IOptionsMonitor<BuildJobOptions> BuildJobOptions { get; }

public IClearMLService ClearMLService { get; }
public IClearMLQueueService ClearMLMonitorService { get; }
Expand Down Expand Up @@ -708,7 +709,8 @@ public override object ActivateJob(Type jobType)
_env.SharedFileService,
_env._lockFactory,
_env.TrainSegmentPairs,
new ParallelCorpusPreprocessingService(new TextCorpusService())
new ParallelCorpusPreprocessingService(new TextCorpusService()),
_env.BuildJobOptions
)
{
TrainJobRunnerType = _env._trainJobRunnerType
Expand All @@ -718,8 +720,6 @@ public override object ActivateJob(Type jobType)
{
var engineOptions = Substitute.For<IOptionsMonitor<SmtTransferEngineOptions>>();
engineOptions.CurrentValue.Returns(new SmtTransferEngineOptions());
var buildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
buildJobOptions.CurrentValue.Returns(new BuildJobOptions());
return new SmtTransferPostprocessBuildJob(
_env.PlatformService,
_env.Engines,
Expand All @@ -731,7 +731,7 @@ public override object ActivateJob(Type jobType)
_env.TrainSegmentPairs,
_env.SmtModelFactory,
_env._truecaserFactory,
buildJobOptions,
_env.BuildJobOptions,
engineOptions
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,8 @@ public TestEnvironment(BuildJobRunnerType trainJobRunnerType = BuildJobRunnerTyp
SharedFileService = new SharedFileService(Substitute.For<ILoggerFactory>());
var clearMLOptions = Substitute.For<IOptionsMonitor<ClearMLOptions>>();
clearMLOptions.CurrentValue.Returns(new ClearMLOptions());
var buildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
buildJobOptions.CurrentValue.Returns(
BuildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
BuildJobOptions.CurrentValue.Returns(
new BuildJobOptions
{
ClearML =
Expand Down Expand Up @@ -219,7 +219,7 @@ public TestEnvironment(BuildJobRunnerType trainJobRunnerType = BuildJobRunnerTyp
ClearMLService,
SharedFileService,
clearMLOptions,
buildJobOptions,
BuildJobOptions,
Substitute.For<ILogger<ClearMLMonitorService>>()
);
BuildJobService = new BuildJobService<WordAlignmentEngine>(
Expand All @@ -228,7 +228,7 @@ public TestEnvironment(BuildJobRunnerType trainJobRunnerType = BuildJobRunnerTyp
new ClearMLBuildJobRunner(
ClearMLService,
[new StatisticalClearMLBuildJobFactory(SharedFileService, Engines)],
buildJobOptions
BuildJobOptions
)
],
Engines
Expand All @@ -252,6 +252,7 @@ [new StatisticalClearMLBuildJobFactory(SharedFileService, Engines)],
public ISharedFileService SharedFileService { get; }

public IBuildJobService<WordAlignmentEngine> BuildJobService { get; }
public IOptionsMonitor<BuildJobOptions> BuildJobOptions { get; }

public async Task CommitAsync(TimeSpan inactiveTimeout)
{
Expand Down Expand Up @@ -455,7 +456,8 @@ public override object ActivateJob(Type jobType)
Substitute.For<ILogger<WordAlignmentPreprocessBuildJob>>(),
_env.BuildJobService,
_env.SharedFileService,
new ParallelCorpusPreprocessingService(new TextCorpusService())
new ParallelCorpusPreprocessingService(new TextCorpusService()),
_env.BuildJobOptions
)
{
TrainJobRunnerType = _env._trainJobRunnerType
Expand All @@ -465,8 +467,6 @@ public override object ActivateJob(Type jobType)
{
var engineOptions = Substitute.For<IOptionsMonitor<StatisticalEngineOptions>>();
engineOptions.CurrentValue.Returns(new StatisticalEngineOptions());
var buildJobOptions = Substitute.For<IOptionsMonitor<BuildJobOptions>>();
buildJobOptions.CurrentValue.Returns(new BuildJobOptions());
return new StatisticalPostprocessBuildJob(
_env.PlatformService,
_env.Engines,
Expand All @@ -476,7 +476,7 @@ public override object ActivateJob(Type jobType)
_env.SharedFileService,
_env._lockFactory,
_env.WordAlignmentModelFactory,
buildJobOptions,
_env.BuildJobOptions,
engineOptions
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ public record MonolingualCorpus
public Dictionary<string, HashSet<int>>? TrainOnChapters { get; set; }
public HashSet<string>? InferenceTextIds { get; set; }
public Dictionary<string, HashSet<int>>? InferenceChapters { get; set; }

/// <summary>
/// <c>true</c> when at least one of the training or inference selections
/// (<see cref="TrainOnTextIds"/>, <see cref="TrainOnChapters"/>, <see cref="InferenceTextIds"/>,
/// <see cref="InferenceChapters"/>) has been set, i.e. is non-null; <c>false</c> when all four are null.
/// </summary>
public bool IsFiltered =>
    TrainOnTextIds is not null
    || TrainOnChapters is not null
    || InferenceTextIds is not null
    || InferenceChapters is not null;
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ public interface IParallelCorpusPreprocessingService
{
QuoteConventionAnalysis? AnalyzeTargetCorpusQuoteConvention(ParallelCorpus corpus);
IReadOnlyList<(string CorpusId, IReadOnlyList<UsfmVersificationError> Errors)> AnalyzeUsfmVersification(
ParallelCorpus corpus
ParallelCorpus parallelCorpus
);

Task PreprocessAsync(
Expand Down
Loading
Loading