Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
|| !elements.Any(e =>
e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)
&& !e.MarkedForRemoval
&& e.Tokens.Count == 1
)
)
{
Expand Down Expand Up @@ -173,10 +174,24 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
sourceTokens,
targetTokens
);
int targetStringIndex =
adjacentTargetToken < targetTokenStarts.Count
? targetTokenStarts[adjacentTargetToken]
: targetSentence.Length;
int targetStringIndex;
if (
adjacentSourceToken > 0
&& element.Type == UsfmUpdateBlockElementType.Style
&& element.Tokens[0].Marker[element.Tokens[0].Marker.Length - 1] == '*'
)
{
targetStringIndex =
targetTokenStarts[adjacentTargetToken - 1] + targetTokens[adjacentTargetToken - 1].Length;
}
else if (adjacentTargetToken < targetTokenStarts.Count)
{
targetStringIndex = targetTokenStarts[adjacentTargetToken];
}
else
{
targetStringIndex = targetSentence.Length;
}
toInsert.Add((targetStringIndex, element));
}
toInsert.Sort((p1, p2) => p1.Index.CompareTo(p2.Index));
Expand Down
45 changes: 3 additions & 42 deletions src/SIL.Machine/Corpora/UsfmTextBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,33 +94,16 @@ private class TextRowCollector : ScriptureRefUsfmParserHandlerBase
private readonly List<TextRow> _rows;
private readonly Stack<StringBuilder> _rowTexts;
private bool _sentenceStart;
private readonly List<UsfmToken> _nextParaTokens;
private bool _nextParaTextStarted = false;

/// <summary>
/// Creates a collector bound to <paramref name="text"/> with an empty row list,
/// an empty stack of row text builders, and an empty list of held-back paragraph tokens.
/// </summary>
/// <param name="text">The text this collector stores in <c>_text</c>.</param>
public TextRowCollector(UsfmTextBase text)
{
_text = text;
_rows = new List<TextRow>();
_rowTexts = new Stack<StringBuilder>();
_nextParaTokens = new List<UsfmToken>();
}

public IEnumerable<TextRow> Rows => _rows;

/// <summary>
/// Handles a verse marker: forwards to the base handler, then resets the
/// held-back paragraph marker state kept by this collector.
/// </summary>
public override void Verse(
UsfmParserState state,
string number,
string marker,
string altNumber,
string pubNumber
)
{
base.Verse(state, number, marker, altNumber, pubNumber);

// Mark text as started and drop any paragraph tokens still being held back.
// NOTE(review): presumably this keeps markers queued before the verse from leaking into it - confirm.
_nextParaTextStarted = true;
_nextParaTokens.Clear();
}

public override void StartPara(
UsfmParserState state,
string marker,
Expand Down Expand Up @@ -244,16 +227,6 @@ public override void Text(UsfmParserState state, string text)
text = text.TrimEnd('\r', '\n');
if (text.Length > 0)
{
if (!text.IsWhiteSpace())
{
if (CurrentTextType == ScriptureTextType.Verse)
{
foreach (UsfmToken token in _nextParaTokens)
rowText.Append(token.ToString() + " ");
_nextParaTokens.Clear();
}
_nextParaTextStarted = true;
}
if (rowText.Length == 0 || char.IsWhiteSpace(rowText[rowText.Length - 1]))
text = text.TrimStart();
rowText.Append(text);
Expand Down Expand Up @@ -283,13 +256,6 @@ protected override void StartVerseText(UsfmParserState state, IReadOnlyList<Scri
/// <summary>
/// Finishes the current verse text: pops its builder off the row text stack,
/// appends any held-back paragraph tokens when markers are included, and adds
/// the rows created for <paramref name="scriptureRefs"/>.
/// </summary>
protected override void EndVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs)
{
string text = _rowTexts.Pop().ToString();
if (_text._includeMarkers)
{
// Each held-back token is written followed by a single space.
foreach (UsfmToken token in _nextParaTokens)
{
text += token.ToString() + " ";
}
}
_rows.AddRange(_text.CreateRows(scriptureRefs, text, _sentenceStart));
// The next row starts a sentence if the current token is a chapter marker ("c")
// or this row's text ends with sentence-ending punctuation.
_sentenceStart = state.Token.Marker == "c" || text.HasSentenceEnding();
}
Expand All @@ -310,11 +276,7 @@ private void OutputMarker(UsfmParserState state)
{
if (!_text._includeMarkers || _rowTexts.Count == 0)
return;

if (_nextParaTextStarted)
_rowTexts.Peek().Append(state.Token);
else
_nextParaTokens.Add(state.Token);
_rowTexts.Peek().Append(state.Token.ToString());
}

private void HandlePara(UsfmParserState state)
Expand All @@ -327,10 +289,9 @@ private void HandlePara(UsfmParserState state)
if (rowText.Length > 0 && !char.IsWhiteSpace(rowText[rowText.Length - 1]))
rowText.Append(" ");
}
if (CurrentTextType == ScriptureTextType.Verse)
if (CurrentTextType == ScriptureTextType.Verse && _text._includeMarkers)
{
_nextParaTokens.Add(state.Token);
_nextParaTextStarted = false;
_rowTexts.Peek().Append(state.Token.ToString() + " ");
}
if (!state.IsVersePara)
_sentenceStart = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,9 @@ public void UpdateUsfm_StyleMarkers()
string result =
@"\id MAT
\c 1
\v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo.
\v 1 Esta es la \w primera\w* oración. Este texto está en \w inglés\w* y esta prueba es \w para\w* marcadores de estilo.
";

// NOTE: the spacing before/after end markers is incorrect,
// but this is an issue with how the USFM is generated from the tokens
AssertUsfmEquals(target, result);

target = UpdateUsfm(
Expand Down Expand Up @@ -307,7 +305,7 @@ public void UpdateUsfm_ConsecutiveMarkers()
@"\id MAT
\c 1
\v 1 Old verse 1
\p \qt \+w word \+w* \qt*
\p \qt \+w word\+w*\qt*
";
IReadOnlyList<PlaceMarkersAlignmentInfo> alignInfo =
[
Expand All @@ -331,7 +329,7 @@ public void UpdateUsfm_ConsecutiveMarkers()
@"\id MAT
\c 1
\v 1 New verse 1
\p \qt \+w WORD \+w*\qt*
\p \qt \+w WORD\+w*\qt*
";

AssertUsfmEquals(target, result);
Expand Down
24 changes: 24 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,30 @@ public void GetRows_ParagraphBeforeNonVerseParagraph()
});
}

// Checks row extraction (all text, markers included) when an empty \b paragraph
// is followed by a non-verse \s1 paragraph that begins with a \w style marker.
[Test]
public void GetRows_StyleStartingNonVerseParagraphAfterEmptyParagraph()
{
TextRow[] rows = GetRows(
@"\id MAT - Test
\c 1
\p
\v 1 verse 1
\b
\s1 \w header\w*
\q1
\v 2 verse 2
",
includeAllText: true,
includeMarkers: true
);
Assert.Multiple(() =>
{
// On a count mismatch, the failure message lists every row's text.
Assert.That(rows, Has.Length.EqualTo(4), string.Join(",", rows.Select(tr => tr.Text)));
// The \b and \q1 paragraph markers are expected on the verse 1 row.
Assert.That(rows[1].Text, Is.EqualTo("verse 1 \\b \\q1"));
// The styled header is expected as its own row, with no space before the closing \w*.
Assert.That(rows[2].Text, Is.EqualTo("\\w header\\w*"));
});
}

private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false)
{
UsfmMemoryText text =
Expand Down
Loading