Skip to content

Commit 9d729bb

Browse files
committed
1 parent 87355e6 commit 9d729bb

File tree

8 files changed

+240
-25
lines changed

8 files changed

+240
-25
lines changed

src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.IO;
4+
using System.Linq;
35
using System.Text;
46
using SIL.Machine.Corpora;
7+
using SIL.Scripture;
58

69
namespace SIL.Machine.PunctuationAnalysis
710
{
@@ -19,11 +22,22 @@ protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBa
1922
_settings = settingsParser.Parse();
2023
}
2124

22-
public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
25+
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
26+
QuoteConventionDetector handler = null,
27+
Dictionary<int, List<int>> includeChapters = null
28+
)
2329
{
2430
handler = handler ?? new QuoteConventionDetector();
25-
foreach (string fileName in _settings.GetAllScriptureBookFileNames())
31+
foreach (
32+
string bookId in Canon
33+
.AllBookNumbers.Where(num => Canon.IsCanonical(num))
34+
.Select(num => Canon.BookNumberToId(num))
35+
)
2636
{
37+
if (includeChapters != null && includeChapters.ContainsKey(Canon.BookIdToNumber(bookId)))
38+
continue;
39+
40+
string fileName = _settings.GetBookFileName(bookId);
2741
if (!Exists(fileName))
2842
continue;
2943

@@ -47,7 +61,7 @@ public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetecto
4761
throw new InvalidOperationException(sb.ToString(), ex);
4862
}
4963
}
50-
return handler.DetectQuotationConvention();
64+
return handler.DetectQuoteConvention(includeChapters);
5165
}
5266

5367
/// <summary>
/// Implemented by subclasses. When this returns false for a book's file name,
/// <see cref="GetQuoteConventionAnalysis"/> skips that book.
/// </summary>
/// <param name="fileName">Book file name obtained from the project settings.</param>
protected abstract bool Exists(string fileName);

src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ private void CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet po
5656
_quotationMarkTabulator.Tabulate(resolvedQuotationMarks);
5757
}
5858

59-
public QuoteConventionAnalysis DetectQuotationConvention()
59+
public QuoteConventionAnalysis DetectQuoteConvention(Dictionary<int, List<int>> includeChapters = null)
6060
{
61-
CountQuotationMarksInChapters(GetChapters());
61+
CountQuotationMarksInChapters(GetChapters(includeChapters));
6262

6363
(QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention(
6464
_quotationMarkTabulator

src/SIL.Machine/PunctuationAnalysis/TextSegment.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ public string Text
1212
get => _surrogatePairString.ToString();
1313
private set => _surrogatePairString = new SurrogatePairString(value);
1414
}
15+
/// <summary>
/// Book code recorded on this segment through <see cref="Builder.SetBook"/>.
/// Consumers such as UsfmStructureExtractor.GetChapters check it for null before using it.
/// </summary>
public string Book { get; private set; }
16+
/// <summary>
/// Chapter number recorded on this segment through <see cref="Builder.SetChapter"/>.
/// UsfmStructureExtractor.GetChapters only uses values greater than zero.
/// </summary>
public int Chapter { get; private set; }
1517
public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
1618
public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
1719
public TextSegment PreviousSegment { get; set; }
@@ -139,6 +141,18 @@ public Builder AddPrecedingMarker(UsfmMarkerType marker)
139141
return this;
140142
}
141143

144+
/// <summary>Stores the given book code on the segment being built.</summary>
/// <param name="code">Book code to assign to <see cref="TextSegment.Book"/>.</param>
/// <returns>This builder, so that calls can be chained.</returns>
public Builder SetBook(string code)
145+
{
146+
_textSegment.Book = code;
147+
return this;
148+
}
149+
150+
/// <summary>Stores the given chapter number on the segment being built.</summary>
/// <param name="number">Chapter number to assign to <see cref="TextSegment.Chapter"/>.</param>
/// <returns>This builder, so that calls can be chained.</returns>
public Builder SetChapter(int number)
151+
{
152+
_textSegment.Chapter = number;
153+
return this;
154+
}
155+
142156
public Builder SetUsfmToken(UsfmToken token)
143157
{
144158
_textSegment.UsfmToken = token;

src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Collections.Generic;
22
using SIL.Machine.Corpora;
3+
using SIL.Scripture;
34

45
namespace SIL.Machine.PunctuationAnalysis
56
{
@@ -14,9 +15,15 @@ public UsfmStructureExtractor()
1415
_nextTextSegmentBuilder = new TextSegment.Builder();
1516
}
1617

18+
/// <summary>
/// Handles the start of a book by storing its code on the builder for the next text segment.
/// </summary>
/// <param name="state">Parser state; not used by this handler.</param>
/// <param name="marker">Marker that opened the book; not used by this handler.</param>
/// <param name="code">Book code passed on to the segment builder.</param>
public void StartBook(UsfmParserState state, string marker, string code)
19+
{
20+
_nextTextSegmentBuilder.SetBook(code);
21+
}
22+
1723
/// <summary>
/// Handles a chapter marker: flags the next text segment as following a chapter marker and,
/// when the chapter number is a valid integer, records that number on the segment builder.
/// </summary>
/// <param name="state">Parser state; not used by this handler.</param>
/// <param name="number">Chapter number text taken from the marker.</param>
/// <param name="marker">The chapter marker; not used by this handler.</param>
/// <param name="altNumber">Alternate chapter number; not used by this handler.</param>
/// <param name="pubNumber">Published chapter number; not used by this handler.</param>
public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber)
{
    _nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter);

    // Parse with the invariant culture so the result does not depend on the machine's locale,
    // and use TryParse so a null or non-numeric chapter token (malformed USFM) does not abort
    // the whole extraction. In that case the chapter number is simply left unset.
    if (
        int.TryParse(
            number,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int chapterNumber
        )
    )
    {
        _nextTextSegmentBuilder.SetChapter(chapterNumber);
    }
}
2128

2229
/// <summary>Intentionally empty: nothing is recorded when a book ends.</summary>
public void EndBook(UsfmParserState state, string marker) { }
@@ -65,8 +72,6 @@ public void Ref(UsfmParserState state, string marker, string display, string tar
6572
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
6673
}
6774

68-
public void StartBook(UsfmParserState state, string marker, string code) { }
69-
7075
/// <summary>Intentionally empty: nothing is recorded when a table cell starts.</summary>
public void StartCell(UsfmParserState state, string marker, string align, int colspan) { }
7176

7277
public void StartChar(
@@ -127,13 +132,26 @@ public void Verse(UsfmParserState state, string number, string marker, string al
127132
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse);
128133
}
129134

130-
public List<Chapter> GetChapters()
135+
public List<Chapter> GetChapters(Dictionary<int, List<int>> includeChapters = null)
131136
{
132137
var chapters = new List<Chapter>();
138+
int currentBook = 0;
139+
int currentChapter = 0;
133140
var currentChapterVerses = new List<Verse>();
134141
var currentVerseSegments = new List<TextSegment>();
135142
foreach (TextSegment textSegment in _textSegments)
136143
{
144+
if (textSegment.Book != null)
145+
currentBook = Canon.BookIdToNumber(textSegment.Book) - 1; //make 0-indexed
146+
if (textSegment.Chapter > 0)
147+
currentChapter = textSegment.Chapter;
148+
if (includeChapters != null && currentBook > 0)
149+
{
150+
if (!includeChapters.TryGetValue(currentBook, out List<int> bookChapters))
151+
continue;
152+
if (currentChapter > 0 && bookChapters.Count > 0 && !bookChapters.Contains(currentChapter))
153+
continue;
154+
}
137155
if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse))
138156
{
139157
if (currentVerseSegments.Count > 0)

tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs

Lines changed: 129 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ namespace SIL.Machine.Corpora;
88
[TestFixture]
99
public class ParatextProjectQuoteConventionDetectorTests
1010
{
11+
private static readonly QuoteConvention StandardEnglishQuoteConvention =
12+
QuoteConventions.Standard.GetQuoteConventionByName("standard_english");
13+
private static readonly QuoteConvention StandardFrenchQuoteConvention =
14+
QuoteConventions.Standard.GetQuoteConventionByName("standard_french");
15+
1116
[Test]
1217
public void TestGetQuotationAnalysis()
1318
{
@@ -16,18 +21,9 @@ public void TestGetQuotationAnalysis()
1621
{
1722
{
1823
"41MATTest.SFM",
19-
@"\id MAT
20-
\c 1
21-
\v 1 Someone said, “This is something I am saying!
22-
\v 2 This is also something I am saying” (that is, “something I am speaking”).
23-
\p
24-
\v 3 Other text, and someone else said,
25-
\q1
26-
\v 4 “Things
27-
\q2 someone else said!
28-
\q3 and more things someone else said.”
29-
\m That is why he said “things someone else said.”
30-
\v 5 Then someone said, “More things someone said.”"
24+
$@"\id MAT
25+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
26+
"
3127
}
3228
}
3329
);
@@ -37,6 +33,100 @@ public void TestGetQuotationAnalysis()
3733
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english"));
3834
}
3935

36+
// Two books are supplied: MAT written with the standard English convention and MRK with the
// standard French one. Restricting the analysis to the range "MRK" is expected to detect
// standard_french with a score above 0.8.
[Test]
37+
public void TestGetQuotationByBook()
38+
{
39+
var env = new TestEnvironment(
40+
files: new Dictionary<string, string>()
41+
{
42+
{
43+
"41MATTest.SFM",
44+
$@"\id MAT
45+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
46+
"
47+
},
48+
{
49+
"42MRKTest.SFM",
50+
$@"\id MRK
51+
{GetTestChapter(1, StandardFrenchQuoteConvention)}
52+
"
53+
}
54+
}
55+
);
56+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK");
57+
Assert.That(analysis, Is.Not.Null);
58+
Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8));
59+
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french"));
60+
}
61+
62+
// MRK chapters 1, 3 and 4 use the standard English convention; chapters 2 and 5 use the
// standard French one. The range "MRK2,4-5" presumably selects chapters 2, 4 and 5 (confirm
// against ScriptureRangeParser), so standard_french is expected with a score above 0.66.
[Test]
63+
public void TestGetQuotationConventionByChapter()
64+
{
65+
var env = new TestEnvironment(
66+
files: new Dictionary<string, string>()
67+
{
68+
{
69+
"41MATTest.SFM",
70+
$@"\id MAT
71+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
72+
"
73+
},
74+
{
75+
"42MRKTest.SFM",
76+
$@"\id MRK
77+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
78+
{GetTestChapter(2, StandardFrenchQuoteConvention)}
79+
{GetTestChapter(3, StandardEnglishQuoteConvention)}
80+
{GetTestChapter(4, StandardEnglishQuoteConvention)}
81+
{GetTestChapter(5, StandardFrenchQuoteConvention)}
82+
"
83+
}
84+
}
85+
);
86+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK2,4-5");
87+
Assert.That(analysis, Is.Not.Null);
88+
Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.66));
89+
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french"));
90+
}
91+
92+
// MAT chapters 1 and 3 are generated without a quote convention, so GetTestChapter inserts
// empty strings where the quotation marks would go; only chapter 2 has English quotes.
// Restricting the analysis to "MAT1,3" is expected to return a null analysis.
[Test]
93+
public void TestGetQuotationConventionByChapterIndeterminate()
94+
{
95+
var env = new TestEnvironment(
96+
files: new Dictionary<string, string>()
97+
{
98+
{
99+
"41MATTest.SFM",
100+
$@"\id MAT
101+
{GetTestChapter(1)}
102+
{GetTestChapter(2, StandardEnglishQuoteConvention)}
103+
{GetTestChapter(3)}
104+
"
105+
}
106+
}
107+
);
108+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT1,3");
109+
Assert.That(analysis, Is.Null);
110+
}
111+
112+
// The file is named as the MAT book but its id line declares LUK. Requesting the range
// "MAT" is expected to return a null analysis.
[Test]
113+
public void TestGetQuotationConventionInvalidBookCode()
114+
{
115+
var env = new TestEnvironment(
116+
files: new Dictionary<string, string>()
117+
{
118+
{
119+
"41MATTest.SFM",
120+
$@"\id LUK
121+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
122+
"
123+
}
124+
}
125+
);
126+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT");
127+
Assert.That(analysis, Is.Null);
128+
}
129+
40130
private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary<string, string>? files = null)
41131
{
42132
public ParatextProjectQuoteConventionDetector Detector { get; } =
@@ -45,12 +135,37 @@ private class TestEnvironment(ParatextProjectSettings? settings = null, Dictiona
45135
files ?? new()
46136
);
47137

48-
public QuoteConventionAnalysis GetQuoteConvention()
138+
// Runs the detector, optionally restricted to a scripture range. When a range is given it is
// parsed with ScriptureRangeParser.GetChapters and re-keyed by book number minus one before
// being passed as includeChapters; with no range, includeChapters is null.
// NOTE(review): the keys built here are reduced by 1, which matches the adjustment made in
// UsfmStructureExtractor.GetChapters, but ParatextProjectQuoteConventionDetector looks the
// dictionary up with the unadjusted Canon.BookIdToNumber(bookId) - confirm the two agree.
public QuoteConventionAnalysis GetQuoteConvention(string? scriptureRange = null)
49139
{
50-
return Detector.GetQuoteConventionAnalysis();
140+
Dictionary<int, List<int>>? chapters = null;
141+
if (scriptureRange != null)
142+
{
143+
chapters = ScriptureRangeParser
144+
.GetChapters(scriptureRange)
145+
.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key) - 1, kvp => kvp.Value); // make 0-indexed
146+
}
147+
return Detector.GetQuoteConventionAnalysis(includeChapters: chapters);
51148
}
52149
}
53150

151+
// Builds the USFM text of one test chapter with the given chapter number. The depth-1 opening
// and closing quotation marks of the supplied convention are substituted into the verse text;
// when no convention is supplied, empty strings are substituted instead, so the chapter
// contains no quotation marks.
private static string GetTestChapter(int number, QuoteConvention? quoteConvention = null)
152+
{
153+
string leftQuote = quoteConvention != null ? quoteConvention.GetOpeningQuotationMarkAtDepth(1) : "";
154+
string rightQuote = quoteConvention != null ? quoteConvention.GetClosingQuotationMarkAtDepth(1) : "";
155+
return $@"\c {number}
156+
\v 1 Someone said, {leftQuote}This is something I am saying!
157+
\v 2 This is also something I am saying{rightQuote} (that is, {leftQuote}something I am speaking{rightQuote}).
158+
\p
159+
\v 3 Other text, and someone else said,
160+
\q1
161+
\v 4 {leftQuote}Things
162+
\q2 someone else said!
163+
\q3 and more things someone else said.{rightQuote}
164+
\m That is why he said {leftQuote}things someone else said.{rightQuote}
165+
\v 5 Then someone said, {leftQuote}More things someone said.{rightQuote}
166+
";
167+
}
168+
54169
private class DefaultParatextProjectSettings(
55170
string name = "Test",
56171
string fullName = "TestProject",

tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,8 @@ public void AnalyzeCorporaQuoteConventions()
186186
var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2);
187187
quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler);
188188

189-
QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuotationConvention();
190-
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuotationConvention();
189+
QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuoteConvention();
190+
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuoteConvention();
191191

192192
Assert.Multiple(() =>
193193
{

tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,6 @@ public QuoteConventionAnalysis DetectQuotationConvention(string usfm)
368368
{
369369
var quoteConventionDetector = new QuoteConventionDetector();
370370
UsfmParser.Parse(usfm, quoteConventionDetector);
371-
return quoteConventionDetector.DetectQuotationConvention();
371+
return quoteConventionDetector.DetectQuoteConvention();
372372
}
373373
}

tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,60 @@ public void SetUp()
1616
_verseTextParserState.SetVerseNum(1);
1717
}
1818

19+
// Feeds a single GEN chapter 1 verse into the extractor and then asks for chapters using a
// filter that names a different book; no chapters are expected back.
// NOTE(review): GetChapters subtracts 1 from Canon.BookIdToNumber and only filters when the
// result is greater than 0. If GEN maps to 0 after that adjustment the filter would be
// bypassed for this input, and key 2 would not denote EXO in that scheme - confirm.
[Test]
20+
public void GetChaptersFilterByBook()
21+
{
22+
var usfmStructureExtractor = new UsfmStructureExtractor();
23+
usfmStructureExtractor.StartBook(_verseTextParserState, "id", "GEN");
24+
usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
25+
usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
26+
usfmStructureExtractor.Text(_verseTextParserState, "test");
27+
28+
Assert.That(
29+
usfmStructureExtractor.GetChapters(new Dictionary<int, List<int>> { { 2, [1] } }), // EXO 1
30+
Has.Count.EqualTo(0)
31+
);
32+
}
33+
34+
// Feeds three MAT chapters (texts "test", "test2", "test3") into the extractor and filters
// to chapter 2 only. The single returned segment is expected to be "test2" and to have no
// previous or next segment linked to it.
// NOTE(review): the filter key used below is 40, while GetChapters looks books up by
// Canon.BookIdToNumber minus 1 - confirm that this key matches MAT under that scheme.
[Test]
35+
public void GetChaptersFilterByChapter()
36+
{
37+
var usfmStructureExtractor = new UsfmStructureExtractor();
38+
usfmStructureExtractor.StartBook(_verseTextParserState, "id", "MAT");
39+
usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null);
40+
usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
41+
usfmStructureExtractor.Text(_verseTextParserState, "test");
42+
usfmStructureExtractor.Chapter(_verseTextParserState, "2", "c", null, null);
43+
usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
44+
usfmStructureExtractor.Text(_verseTextParserState, "test2");
45+
usfmStructureExtractor.Chapter(_verseTextParserState, "3", "c", null, null);
46+
usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null);
47+
usfmStructureExtractor.Text(_verseTextParserState, "test3");
48+
49+
List<Chapter> expectedChapters =
50+
[
51+
new Chapter(
52+
[
53+
new Verse(
54+
[
55+
new TextSegment.Builder()
56+
.SetText("test2")
57+
.AddPrecedingMarker(UsfmMarkerType.Chapter)
58+
.AddPrecedingMarker(UsfmMarkerType.Verse)
59+
.Build()
60+
]
61+
)
62+
]
63+
)
64+
];
65+
List<Chapter> actualChapters = usfmStructureExtractor.GetChapters(
66+
new Dictionary<int, List<int>> { { 40, [2] } }
67+
);
68+
AssertChapterEqual(expectedChapters, actualChapters);
69+
Assert.That(actualChapters[0].Verses[0].TextSegments[0].PreviousSegment, Is.Null);
70+
Assert.That(actualChapters[0].Verses[0].TextSegments[0].NextSegment, Is.Null);
71+
}
72+
1973
[Test]
2074
public void ChapterAndVerseMarkers()
2175
{

0 commit comments

Comments
 (0)