Skip to content

Commit 5468776

Browse files
authored
Correctly parse and place verse text in verse 0 (#344)
1 parent 15919fb commit 5468776

File tree

6 files changed

+145
-24
lines changed

6 files changed

+145
-24
lines changed

src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
using System.Collections.Generic;
1+
using System.Collections.Generic;
22
using System.Linq;
3-
using SIL.Extensions;
43
using SIL.Scripture;
54

65
namespace SIL.Machine.Corpora
@@ -67,31 +66,36 @@ public override void Verse(
6766
string pubNumber
6867
)
6968
{
70-
if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse)
69+
if (state.ChapterHasVerseZero && state.VerseRef.VerseNum == 0)
70+
{
71+
// Fall through for the special case of verse 0 being specified in the USFM
72+
}
73+
else if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse)
7174
{
7275
if (state.VerseRef.VerseNum > 0)
7376
{
7477
EndVerseText(state, CreateVerseRefs());
7578
// ignore duplicate verses
7679
DuplicateVerse = true;
7780
}
81+
82+
return;
7883
}
7984
else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse))
8085
{
8186
// merge overlapping verse ranges into one range
8287
VerseRef verseRef = _curVerseRef.Clone();
8388
verseRef.Verse = CorporaUtils.MergeVerseRanges(number, _curVerseRef.Verse);
8489
UpdateVerseRef(verseRef, marker);
90+
return;
8591
}
92+
93+
if (CurrentTextType == ScriptureTextType.NonVerse)
94+
EndNonVerseText(state);
8695
else
87-
{
88-
if (CurrentTextType == ScriptureTextType.NonVerse)
89-
EndNonVerseText(state);
90-
else
91-
EndVerseText(state);
92-
UpdateVerseRef(state.VerseRef, marker);
93-
StartVerseText(state);
94-
}
96+
EndVerseText(state);
97+
UpdateVerseRef(state.VerseRef, marker);
98+
StartVerseText(state);
9599
}
96100

97101
public override void StartPara(
@@ -259,9 +263,9 @@ private void StartVerseText(UsfmParserState state)
259263

260264
private void EndVerseText(UsfmParserState state)
261265
{
262-
if (!DuplicateVerse && _curVerseRef.VerseNum > 0)
266+
if (!DuplicateVerse && (_curVerseRef.VerseNum > 0 || state.ChapterHasVerseZero))
263267
EndVerseText(state, CreateVerseRefs());
264-
if (_curVerseRef.VerseNum > 0)
268+
if (_curVerseRef.VerseNum > 0 || state.ChapterHasVerseZero)
265269
_curTextType.Pop();
266270
}
267271

@@ -280,7 +284,14 @@ private void EndNonVerseText(UsfmParserState state)
280284

281285
private void UpdateVerseRef(VerseRef verseRef, string marker)
282286
{
283-
if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
287+
if (_curVerseRef.VerseNum == 0 && verseRef.VerseNum == 0 && marker == "v")
288+
{
289+
// As the verse 0 marker appears in the middle of verse 0,
290+
// we should not break the position of the current element stack by clearing it.
291+
// Instead, we just need to pop the current element off the stack.
292+
_curElements.Pop();
293+
}
294+
else if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
284295
{
285296
_curElements.Clear();
286297
_curElements.Push(new ScriptureElement(0, marker));
@@ -357,6 +368,7 @@ private void CheckConvertVerseParaToNonVerse(UsfmParserState state)
357368
&& paraTag.Marker != "tr"
358369
&& state.IsVersePara
359370
&& _curVerseRef.VerseNum == 0
371+
&& !state.ChapterHasVerseZero
360372
&& !IsPrivateUseMarker(paraTag.Marker)
361373
)
362374
{

src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using System;
1+
using System;
22
using System.Collections.Generic;
33
using System.Linq;
44
using SIL.Scripture;
@@ -107,7 +107,7 @@ public UpdateUsfmParserHandler(
107107
preserveParagraphStyles == null
108108
? new HashSet<string> { "r", "rem" }
109109
: new HashSet<string>(preserveParagraphStyles);
110-
_remarks = remarks == null ? new List<string>() : remarks.ToList();
110+
_remarks = remarks?.ToList() ?? new List<string>();
111111
_errorHandler = errorHandler;
112112
if (_errorHandler == null)
113113
_errorHandler = (error) => false;
@@ -457,6 +457,12 @@ IReadOnlyList<ScriptureRef> segScrRefs
457457
var rowTexts = new List<string>();
458458
Dictionary<string, object> rowMetadata = null;
459459
int sourceIndex = 0;
460+
461+
// Handle the special case of verse 0: although it is first in the rows,
462+
// it will be retrieved after some of the other segments in the verse.
463+
if (segScrRefs.Count > 0 && segScrRefs[0].VerseNum == 0 && segScrRefs[0].Path.Count == 0)
464+
_verseRowIndex = 0;
465+
460466
// search the sorted rows with updated text, starting from where we left off last.
461467
while (_verseRowIndex < _verseRows.Count && sourceIndex < segScrRefs.Count)
462468
{

src/SIL.Machine/Corpora/UsfmParser.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ public bool ProcessToken()
355355
vref = State.VerseRef;
356356
vref.Chapter = token.Data;
357357
vref.VerseNum = 0;
358+
State.ChapterHasVerseZero = false;
358359
State.VerseRef = vref;
359360
// Verse offset is not zeroed for chapter 1, as it is part of intro
360361
if (State.VerseRef.ChapterNum != 1)
@@ -391,6 +392,8 @@ public bool ProcessToken()
391392
// Verse
392393
vref = State.VerseRef;
393394
vref.Verse = token.Data;
395+
if (vref.VerseNum == 0)
396+
State.ChapterHasVerseZero = true;
394397
State.VerseRef = vref;
395398
State.VerseOffset = 0;
396399

src/SIL.Machine/Corpora/UsfmParserState.cs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn
7676
/// </summary>
7777
public int SpecialTokenCount { get; internal set; }
7878

79+
/// <summary>
80+
/// <c>true</c> if a chapter has verse 0 specified.
81+
/// </summary>
82+
public bool ChapterHasVerseZero { get; internal set; }
83+
7984
/// <summary>
8085
/// True if the token processed is a figure.
8186
/// </summary>
@@ -104,10 +109,7 @@ public UsfmTag ParaTag
104109
/// <summary>
105110
/// Innermost character tag or null for none
106111
/// </summary>
107-
public UsfmTag CharTag
108-
{
109-
get { return CharTags.FirstOrDefault(); }
110-
}
112+
public UsfmTag CharTag => CharTags.FirstOrDefault();
111113

112114
/// <summary>
113115
/// Current note tag or null for none
@@ -157,8 +159,8 @@ public bool IsVerseText
157159
{
158160
get
159161
{
160-
// Anything before verse 1 is not verse text
161-
if (VerseRef.VerseNum == 0)
162+
// Anything before verse 1 is not verse text, unless the USFM specified verse 0
163+
if (VerseRef.VerseNum == 0 && !ChapterHasVerseZero)
162164
return false;
163165

164166
// Sidebars and notes are not verse text

tests/SIL.Machine.Tests/Corpora/PlaceMarkersUsfmUpdateBlockHandlerTests.cs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -805,6 +805,63 @@ public void UpdateUsfm_StripParagraphsWithHeaders()
805805
AssertUsfmEquals(target, result);
806806
}
807807

808+
[Test]
809+
public void UpdateUsfm_SupportVerseZero()
810+
{
811+
// Note: Verse 0 has an empty paragraph because the paragraph occurs before the verse text,
812+
// so it is not included in the verse text, as the paragraphs for the other verses are.
813+
IReadOnlyList<UpdateUsfmRow> rows =
814+
[
815+
new UpdateUsfmRow(ScrRef("MAT 1:0"), "New verse 0"),
816+
new UpdateUsfmRow(ScrRef("MAT 1:0/1:mt"), "New book header"),
817+
new UpdateUsfmRow(ScrRef("MAT 1:0/2:s"), "New chapter header"),
818+
new UpdateUsfmRow(ScrRef("MAT 1:0/3:p"), ""),
819+
new UpdateUsfmRow(ScrRef("MAT 1:0/4:ms"), "New major section header"),
820+
new UpdateUsfmRow(ScrRef("MAT 1:0/5:s"), "New section header 1"),
821+
new UpdateUsfmRow(ScrRef("MAT 1:1"), "New verse 1"),
822+
new UpdateUsfmRow(ScrRef("MAT 1:1/1:s"), "New section header 2"),
823+
new UpdateUsfmRow(ScrRef("MAT 1:2"), "New verse 2"),
824+
new UpdateUsfmRow(ScrRef("MAT 1:3"), "New verse 3"),
825+
];
826+
string usfm =
827+
@"\id MAT
828+
\mt Old book header
829+
\c 1
830+
\s Old chapter header
831+
\p
832+
\v 0 Old verse 0
833+
\ms Old major section header
834+
\s Old section header 1
835+
\p
836+
\v 1 Old verse 1
837+
\s Old section header 2
838+
\p
839+
\v 2 Old verse 2
840+
\v 3 Old verse 3
841+
";
842+
843+
string target = UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [new PlaceMarkersUsfmUpdateBlockHandler()]);
844+
845+
string result =
846+
@"\id MAT
847+
\mt New book header
848+
\c 1
849+
\s New chapter header
850+
\p
851+
\v 0 New verse 0
852+
\ms New major section header
853+
\s New section header 1
854+
\p
855+
\v 1 New verse 1
856+
\s New section header 2
857+
\p
858+
\v 2 New verse 2
859+
\v 3 New verse 3
860+
";
861+
862+
AssertUsfmEquals(target, result);
863+
}
864+
808865
private static ScriptureRef[] ScrRef(params string[] refs)
809866
{
810867
return refs.Select(r => ScriptureRef.Parse(r)).ToArray();

tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,14 +294,55 @@ public void GetRows_VerseZero()
294294

295295
Assert.Multiple(() =>
296296
{
297-
Assert.That(rows, Has.Length.EqualTo(1));
297+
Assert.That(rows, Has.Length.EqualTo(2));
298+
299+
Assert.That(
300+
rows[0].Ref,
301+
Is.EqualTo(ScriptureRef.Parse("MAT 1:0")),
302+
string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString()))
303+
);
304+
Assert.That(rows[0].Text, Is.Empty, string.Join(",", rows.ToList().Select(tr => tr.Text)));
305+
306+
Assert.That(
307+
rows[1].Ref,
308+
Is.EqualTo(ScriptureRef.Parse("MAT 1:1")),
309+
string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString()))
310+
);
311+
Assert.That(rows[1].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text)));
312+
});
313+
}
314+
315+
[Test]
316+
public void GetRows_VerseZeroWithText()
317+
{
318+
TextRow[] rows = GetRows(
319+
@"\id MAT - Test
320+
\h
321+
\mt
322+
\c 1
323+
\p \v 0 Verse zero.
324+
\s
325+
\p \v 1 Verse one.
326+
"
327+
);
328+
329+
Assert.Multiple(() =>
330+
{
331+
Assert.That(rows, Has.Length.EqualTo(2));
298332

299333
Assert.That(
300334
rows[0].Ref,
335+
Is.EqualTo(ScriptureRef.Parse("MAT 1:0")),
336+
string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString()))
337+
);
338+
Assert.That(rows[0].Text, Is.EqualTo("Verse zero."), string.Join(",", rows.ToList().Select(tr => tr.Text)));
339+
340+
Assert.That(
341+
rows[1].Ref,
301342
Is.EqualTo(ScriptureRef.Parse("MAT 1:1")),
302343
string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString()))
303344
);
304-
Assert.That(rows[0].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text)));
345+
Assert.That(rows[1].Text, Is.EqualTo("Verse one."), string.Join(",", rows.ToList().Select(tr => tr.Text)));
305346
});
306347
}
307348

0 commit comments

Comments
 (0)