Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.IO;
using System.Text;

Expand All @@ -24,8 +25,10 @@ public string UpdateUsfm(
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
string fullName = null,
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve,
UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve,
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
ImmutableHashSet<string> preserveParagraphStyles = null
)
{
string fileName = _settings.GetBookFileName(bookId);
Expand All @@ -42,8 +45,10 @@ public string UpdateUsfm(
rows,
fullName is null ? null : $"- {fullName}",
textBehavior,
paragraphBehavior,
embedBehavior,
styleBehavior
styleBehavior,
preserveParagraphStyles
);
try
{
Expand Down
4 changes: 1 addition & 3 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ public enum ScriptureTextType
None,
NonVerse,
Verse,
Embed,
NoteText
}

Expand All @@ -20,7 +19,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
private readonly Stack<ScriptureElement> _curElements;
private readonly Stack<ScriptureTextType> _curTextType;
private bool _duplicateVerse = false;

private bool _inEmbed;
protected bool InNoteText { get; private set; }
private bool _inNestedEmbed;
Expand Down Expand Up @@ -409,7 +407,7 @@ protected static bool IsEmbedPartStyle(string marker)

/// <summary>
/// Determines whether a marker names one of the embed styles, ignoring any
/// <c>*</c> characters at either end of the marker.
/// </summary>
/// <param name="marker">The marker to test; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> if <paramref name="marker"/> is not <c>null</c> and, once trimmed of
/// <c>*</c>, is one of <c>EmbedStyles</c>; otherwise <c>false</c>.
/// </returns>
protected static bool IsEmbedStyle(string marker)
{
    // A single return: an earlier untrimmed check would make this one unreachable
    // and would fail to match markers that carry a '*'.
    return !(marker is null) && marker.Trim('*').IsOneOf(EmbedStyles);
}
}
}
108 changes: 89 additions & 19 deletions src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;

namespace SIL.Machine.Corpora
Expand All @@ -26,32 +27,44 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
private readonly IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> _rows;
private readonly List<UsfmToken> _tokens;
private readonly List<UsfmToken> _newTokens;
private readonly List<UsfmToken> _newEmbedTokens;
private readonly string _idText;
private readonly UpdateUsfmTextBehavior _textBehavior;
private readonly UpdateUsfmMarkerBehavior _paragraphBehavior;
private readonly UpdateUsfmMarkerBehavior _embedBehavior;
private readonly UpdateUsfmMarkerBehavior _styleBehavior;
private readonly ImmutableHashSet<string> _preserveParagraphStyles;
private readonly Stack<bool> _replace;
private int _rowIndex;
private int _tokenIndex;
private bool _embedUpdated;
private bool _inPreservedParagraph;
private List<string> _embedRowTexts;

/// <summary>
/// Creates a handler that rewrites USFM tokens using the supplied rows and behaviors.
/// </summary>
/// <param name="rows">Replacement rows; <c>null</c> is treated as an empty list.</param>
/// <param name="idText">Stored as the handler's id text; may be <c>null</c>.</param>
/// <param name="textBehavior">How existing text is weighed against new text.</param>
/// <param name="paragraphBehavior">Whether paragraph markers are preserved or stripped.</param>
/// <param name="embedBehavior">Whether embed markers are preserved or stripped.</param>
/// <param name="styleBehavior">Whether style markers are preserved or stripped.</param>
/// <param name="preserveParagraphStyles">
/// Paragraph markers treated as preserved; when <c>null</c>, defaults to <c>"r"</c> and <c>"rem"</c>.
/// </param>
public UpdateUsfmParserHandler(
    IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows = null,
    string idText = null,
    UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
    UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve,
    UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve,
    UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
    ImmutableHashSet<string> preserveParagraphStyles = null
)
{
    _rows = rows ?? Array.Empty<(IReadOnlyList<ScriptureRef>, string)>();
    _tokens = new List<UsfmToken>();
    _newTokens = new List<UsfmToken>();
    _newEmbedTokens = new List<UsfmToken>();
    _idText = idText;
    _replace = new Stack<bool>();
    _textBehavior = textBehavior;
    _paragraphBehavior = paragraphBehavior;
    _embedBehavior = embedBehavior;
    _styleBehavior = styleBehavior;
    // Fall back to the built-in preserved paragraph markers when the caller supplies none.
    _preserveParagraphStyles = preserveParagraphStyles ?? ImmutableHashSet.Create("r", "rem");
    _embedUpdated = false;
    _embedRowTexts = new List<string>();
}
Expand Down Expand Up @@ -89,11 +102,32 @@ public override void StartPara(
IReadOnlyList<UsfmAttribute> attributes
)
{
CollectTokens(state);
if (marker != null && _preserveParagraphStyles.Contains(marker))
{
_inPreservedParagraph = true;
}
if (
state.IsVerseText
&& (HasNewText() || _textBehavior == UpdateUsfmTextBehavior.StripExisting)
&& _paragraphBehavior == UpdateUsfmMarkerBehavior.Strip
)
{
SkipTokens(state);
}
else
{
CollectTokens(state);
}

base.StartPara(state, marker, unknown, attributes);
}

/// <summary>
/// Ends the current paragraph: delegates to the base handler, then clears the
/// preserved-paragraph flag.
/// </summary>
/// <param name="state">The parser state passed through to the base handler.</param>
/// <param name="marker">The paragraph marker passed through to the base handler.</param>
public override void EndPara(UsfmParserState state, string marker)
{
base.EndPara(state, marker);
// StartPara sets this flag when the paragraph marker is in _preserveParagraphStyles;
// resetting it here keeps the flag from carrying over into the next paragraph.
_inPreservedParagraph = false;
}

public override void StartRow(UsfmParserState state, string marker)
{
CollectTokens(state);
Expand Down Expand Up @@ -244,13 +278,13 @@ public override void Ref(UsfmParserState state, string marker, string display, s

/// <summary>
/// Handles a text token: the token is skipped when <c>ReplaceWithNewTokens</c>
/// returns <c>true</c> and collected otherwise.
/// </summary>
/// <param name="state">The current parser state.</param>
/// <param name="text">The text passed through to the base handler.</param>
public override void Text(UsfmParserState state, string text)
{
// NOTE(review): base.Text is invoked here and again at the end of this method.
// This looks like the before/after lines of a change merged together — confirm
// which single call is intended.
base.Text(state, text);

// strip out text in verses that are being replaced
if (ReplaceWithNewTokens(state))
SkipTokens(state);
else
CollectTokens(state);

base.Text(state, text);
}

public override void OptBreak(UsfmParserState state)
Expand Down Expand Up @@ -299,7 +333,7 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri

/// <summary>
/// Starts note text by queuing the pending embed row texts as new embed tokens,
/// each followed by a single space.
/// </summary>
/// <param name="state">The current parser state (not used here).</param>
protected override void StartNoteText(UsfmParserState state)
{
    // Queue the rows once, in the embed buffer only. Also pushing them through
    // PushNewTokens would add a second entry to _replace and duplicate the text
    // in the non-embed buffer.
    PushNewEmbedTokens(_embedRowTexts.Select(t => new UsfmToken(t + " ")));
}

protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef)
Expand Down Expand Up @@ -375,15 +409,9 @@ private void SkipTokens(UsfmParserState state)

private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true)
{
if (_textBehavior == UpdateUsfmTextBehavior.StripExisting)
{
AddNewTokens();
return true;
}

bool newText = _replace.Count > 0 && _replace.Peek();
string marker = state?.Token?.Marker;
bool inEmbed = IsInEmbed(marker);

bool inNestedEmbed = IsInNestedEmbed(marker);
bool isStyleTag = marker != null && !IsEmbedPartStyle(marker);

Expand All @@ -393,23 +421,36 @@ private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true)
.Any(t => t.Type == UsfmTokenType.Text && t.Text.Length > 0);

bool useNewTokens =
newText
&& (!existingText || _textBehavior == UpdateUsfmTextBehavior.PreferNew)
(
(_textBehavior == UpdateUsfmTextBehavior.StripExisting && !IsInPreservedParagraph(marker))
|| (HasNewText() && (!existingText || _textBehavior != UpdateUsfmTextBehavior.PreferExisting))
)
&& (!inEmbed || (InNoteText && !inNestedEmbed && _embedBehavior == UpdateUsfmMarkerBehavior.Preserve));

if (useNewTokens)
AddNewTokens();
{
if (inEmbed)
AddNewEmbedTokens();
else
AddNewTokens();
}

if (existingText && _textBehavior == UpdateUsfmTextBehavior.PreferExisting)
ClearNewTokens();
{
if (inEmbed)
ClearNewEmbedTokens();
else
ClearNewTokens();
}

// figure out when to skip the existing text
bool embedInNewVerseText = _replace.Any(r => r) && inEmbed;
bool embedInNewVerseText =
(_replace.Any(r => r) || _textBehavior == UpdateUsfmTextBehavior.StripExisting) && inEmbed;
if (embedInNewVerseText || _embedUpdated)
{
if (_embedBehavior == UpdateUsfmMarkerBehavior.Strip)
{
ClearNewTokens();
ClearNewEmbedTokens();
return true;
}

Expand All @@ -419,13 +460,18 @@ private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true)

bool skipTokens = useNewTokens && closed;

if (newText && isStyleTag)
if (useNewTokens && isStyleTag)
{
skipTokens = _styleBehavior == UpdateUsfmMarkerBehavior.Strip;
}
return skipTokens;
}

/// <summary>
/// Reports whether the innermost entry on the replace stack is flagged as having new text.
/// </summary>
/// <returns><c>false</c> when the stack is empty; otherwise the value on top of the stack.</returns>
private bool HasNewText()
{
    if (_replace.Count == 0)
        return false;

    return _replace.Peek();
}

private void PushNewTokens(IEnumerable<UsfmToken> tokens)
{
_replace.Push(tokens.Any());
Expand All @@ -444,9 +490,33 @@ private void ClearNewTokens()
_newTokens.Clear();
}

/// <summary>
/// Pushes a replace-stack entry for the given embed tokens and buffers them as
/// pending new embed tokens.
/// </summary>
/// <param name="tokens">
/// The tokens to queue. The sequence is enumerated exactly once, so a deferred
/// query is not re-run and does not create fresh token instances on each pass.
/// </param>
private void PushNewEmbedTokens(IEnumerable<UsfmToken> tokens)
{
    // Materialize up front: the emptiness check and the copy then share one enumeration.
    List<UsfmToken> pending = tokens.ToList();
    _replace.Push(pending.Count > 0);
    if (pending.Count > 0)
        _newEmbedTokens.AddRange(pending);
}

/// <summary>
/// Moves any pending new embed tokens into the output token list and empties the
/// pending buffer.
/// </summary>
private void AddNewEmbedTokens()
{
    bool hasPending = _newEmbedTokens.Count != 0;
    if (hasPending)
    {
        _tokens.AddRange(_newEmbedTokens);
    }

    // The buffer is always cleared, whether or not anything was copied.
    _newEmbedTokens.Clear();
}

/// <summary>
/// Discards all pending new embed tokens without emitting them.
/// </summary>
private void ClearNewEmbedTokens() => _newEmbedTokens.Clear();

/// <summary>
/// Removes the top entry from the replace stack.
/// </summary>
private void PopNewTokens() => _replace.Pop();

/// <summary>
/// Determines whether processing is currently inside a preserved paragraph, or
/// whether the given marker is itself a preserved paragraph style.
/// </summary>
/// <param name="marker">
/// The marker of the current token; may be <c>null</c> because callers pass
/// <c>state?.Token?.Marker</c>.
/// </param>
/// <returns>
/// <c>true</c> if the preserved-paragraph flag is set or <paramref name="marker"/>
/// is in the preserved paragraph styles; otherwise <c>false</c>.
/// </returns>
private bool IsInPreservedParagraph(string marker)
{
    if (_inPreservedParagraph)
        return true;

    // Same null guard StartPara applies before consulting the set.
    return marker != null && _preserveParagraphStyles.Contains(marker);
}
}
}
Loading
Loading