Skip to content

Commit e9868b8

Browse files
committed
1 parent ba5915c commit e9868b8

14 files changed

+164
-155
lines changed

src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,3 @@
1-
using System;
2-
using System.Collections.Generic;
3-
using System.Linq;
4-
using SIL.Extensions;
51
using SIL.Machine.Translation;
62

73
namespace SIL.Machine.Corpora

src/SIL.Machine/Corpora/QuotationMarkDenormalizationFirstPass.cs

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/SIL.Machine/Corpora/FallbackQuotationMarkResolver.cs renamed to src/SIL.Machine/PunctuationAnalysis/FallbackQuotationMarkResolver.cs

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,4 @@
1-
using System.Collections.Generic;
2-
using System.Linq;
3-
using SIL.Machine.PunctuationAnalysis;
4-
5-
namespace SIL.Machine.Corpora
1+
namespace SIL.Machine.PunctuationAnalysis
62
{
73
public class FallbackQuotationMarkResolver : IQuotationMarkResolver
84
{

src/SIL.Machine/Corpora/ParatextProjectQuoteConventionDetector.cs renamed to src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
using System;
2-
using System.IO;
3-
using System.Text;
4-
using SIL.Machine.PunctuationAnalysis;
5-
6-
namespace SIL.Machine.Corpora
1+
namespace SIL.Machine.PunctuationAnalysis
72
{
83
public abstract class ParatextProjectQuoteConventionDetector
94
{

src/SIL.Machine/PunctuationAnalysis/PreliminaryQuotationMarkAnalyzer.cs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,42 @@
66

77
namespace SIL.Machine.PunctuationAnalysis
88
{
9+
public class QuotationMarkCounter
10+
{
11+
private const double NegligibleProportionThreshold = 0.01;
12+
private Dictionary<string, int> _quotationMarkCounts;
13+
private int _totalQuotationMarkCount;
14+
15+
public QuotationMarkCounter()
16+
{
17+
Reset();
18+
}
19+
20+
public void Reset()
21+
{
22+
_quotationMarkCounts = new Dictionary<string, int>();
23+
_totalQuotationMarkCount = 0;
24+
}
25+
26+
public void CountQuotationMarks(List<QuotationMarkStringMatch> quotationMarks)
27+
{
28+
foreach (var quotationMarkMatch in quotationMarks)
29+
{
30+
string mark = quotationMarkMatch.QuotationMark;
31+
_quotationMarkCounts.UpdateValue(mark, () => 0, i => i + 1);
32+
_totalQuotationMarkCount++;
33+
}
34+
}
35+
36+
public bool IsQuotationMarkProportionNegligible(string quotationMark)
37+
{
38+
if (_totalQuotationMarkCount == 0)
39+
return true;
40+
int quotationMarkCount = _quotationMarkCounts.TryGetValue(quotationMark, out int count) ? count : 0;
41+
return ((double)quotationMarkCount / _totalQuotationMarkCount) < NegligibleProportionThreshold;
42+
}
43+
}
44+
945
public class ApostropheProportionStatistics
1046
{
1147
private int _numCharacters;
@@ -385,19 +421,22 @@ public class PreliminaryQuotationMarkAnalyzer
385421
private readonly QuoteConventionSet _quoteConventions;
386422
private readonly PreliminaryApostropheAnalyzer _apostropheAnalyzer;
387423
private readonly QuotationMarkSequences _quotationMarkSequences;
424+
private readonly QuotationMarkCounter _quotationMarkCounts;
388425

389426
public PreliminaryQuotationMarkAnalyzer(QuoteConventionSet quoteConventions)
390427
{
391428
_quoteConventions = quoteConventions;
392429
_apostropheAnalyzer = new PreliminaryApostropheAnalyzer();
393430
_quotationMarkSequences = new QuotationMarkSequences();
431+
_quotationMarkCounts = new QuotationMarkCounter();
394432
Reset();
395433
}
396434

397435
public void Reset()
398436
{
399437
_apostropheAnalyzer.Reset();
400438
_quotationMarkSequences.Reset();
439+
_quotationMarkCounts.Reset();
401440
}
402441

403442
public QuoteConventionSet NarrowDownPossibleQuoteConventions(List<Chapter> chapters)
@@ -420,6 +459,7 @@ private void AnalyzeQuotationMarksForVerse(Verse verse)
420459
).FindAllPotentialQuotationMarksInVerse(verse);
421460
AnalyzeQuotationMarkSequence(quotationMarks);
422461
_apostropheAnalyzer.ProcessQuotationMarks(verse.TextSegments.ToList(), quotationMarks);
462+
_quotationMarkCounts.CountQuotationMarks(quotationMarks);
423463
}
424464

425465
private void AnalyzeQuotationMarkSequence(List<QuotationMarkStringMatch> quotationMarks)
@@ -450,6 +490,9 @@ private List<string> FindOpeningQuotationMarks()
450490

451491
private bool IsOpeningQuotationMark(string quotationMark)
452492
{
493+
if (_quotationMarkCounts.IsQuotationMarkProportionNegligible(quotationMark))
494+
return false;
495+
453496
if (_apostropheAnalyzer.IsApostropheOnly(quotationMark))
454497
return false;
455498

@@ -475,6 +518,9 @@ private List<string> FindClosingQuotationMarks()
475518

476519
private bool IsClosingQuotationMark(string quotationMark)
477520
{
521+
if (_quotationMarkCounts.IsQuotationMarkProportionNegligible(quotationMark))
522+
return false;
523+
478524
if (_apostropheAnalyzer.IsApostropheOnly(quotationMark))
479525
return false;
480526

src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationFirstPass.cs

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
namespace SIL.Machine.PunctuationAnalysis
2+
{
3+
// This is a convenience class so that users don't have to know to normalize the source quote convention
4+
public class QuotationMarkDenormalizationFirstPass : QuotationMarkUpdateFirstPass
5+
{
6+
public QuotationMarkDenormalizationFirstPass(QuoteConvention targetQuoteConvention)
7+
: base(targetQuoteConvention.Normalize(), targetQuoteConvention) { }
8+
}
9+
}

src/SIL.Machine/Corpora/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs renamed to src/SIL.Machine/PunctuationAnalysis/QuotationMarkDenormalizationUsfmUpdateBlockHandler.cs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
1-
using SIL.Machine.PunctuationAnalysis;
2-
3-
namespace SIL.Machine.Corpora
1+
namespace SIL.Machine.PunctuationAnalysis
42
{
53
public class QuotationMarkDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler
64
{
75
// This is a convenience class so that users don't have to know to normalize the source quote convention
86
public QuotationMarkDenormalizationUsfmUpdateBlockHandler(
9-
QuoteConvention sourceQuoteConvention,
107
QuoteConvention targetQuoteConvention,
118
QuotationMarkUpdateSettings settings = null
129
)
1310
: base(
14-
sourceQuoteConvention.Normalize(),
11+
targetQuoteConvention.Normalize(),
1512
targetQuoteConvention,
1613
settings ?? new QuotationMarkUpdateSettings()
1714
) { }

src/SIL.Machine/Corpora/QuotationMarkUpdateFirstPass.cs renamed to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateFirstPass.cs

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
using System;
2-
using System.Collections.Generic;
3-
using System.Linq;
4-
using SIL.Machine.PunctuationAnalysis;
5-
6-
namespace SIL.Machine.Corpora
1+
namespace SIL.Machine.PunctuationAnalysis
72
{
83
// Determines the best strategy to take for each chapter
94
public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor
@@ -12,46 +7,37 @@ public class QuotationMarkUpdateFirstPass : UsfmStructureExtractor
127
private readonly DepthBasedQuotationMarkResolver _quotationMarkResolver;
138
public bool WillFallbackModeWork { get; set; }
149

15-
public QuotationMarkUpdateFirstPass(
16-
QuoteConvention sourceQuoteConvention,
17-
QuoteConvention targetQuoteConvention
18-
)
10+
public QuotationMarkUpdateFirstPass(QuoteConvention oldQuoteConvention, QuoteConvention newQuoteConvention)
1911
{
2012
_quotationMarkFinder = new QuotationMarkFinder(
21-
new QuoteConventionSet(new List<QuoteConvention> { sourceQuoteConvention, targetQuoteConvention })
13+
new QuoteConventionSet(new List<QuoteConvention> { oldQuoteConvention, newQuoteConvention })
2214
);
2315
_quotationMarkResolver = new DepthBasedQuotationMarkResolver(
24-
new QuotationMarkUpdateResolutionSettings(sourceQuoteConvention)
16+
new QuotationMarkUpdateResolutionSettings(oldQuoteConvention)
2517
);
26-
WillFallbackModeWork = CheckWhetherFallbackModeWillWork(sourceQuoteConvention, targetQuoteConvention);
18+
WillFallbackModeWork = CheckWhetherFallbackModeWillWork(oldQuoteConvention, newQuoteConvention);
2719
}
2820

2921
public bool CheckWhetherFallbackModeWillWork(
30-
QuoteConvention sourceQuoteConvention,
31-
QuoteConvention targetQuoteConvention
22+
QuoteConvention oldQuoteConvention,
23+
QuoteConvention newQuoteConvention
3224
)
3325
{
34-
var targetMarkBySourceMark = new Dictionary<string, string>();
26+
var newMarkByOldMark = new Dictionary<string, string>();
3527
foreach (
36-
int depth in Enumerable.Range(
37-
1,
38-
Math.Min(sourceQuoteConvention.NumLevels, targetQuoteConvention.NumLevels)
39-
)
28+
int depth in Enumerable.Range(1, Math.Min(oldQuoteConvention.NumLevels, newQuoteConvention.NumLevels))
4029
)
4130
{
42-
string openingQuotationMark = sourceQuoteConvention.GetOpeningQuotationMarkAtDepth(depth);
43-
string closingQuotationMark = targetQuoteConvention.GetClosingQuotationMarkAtDepth(depth);
31+
string openingQuotationMark = oldQuoteConvention.GetOpeningQuotationMarkAtDepth(depth);
32+
string closingQuotationMark = newQuoteConvention.GetClosingQuotationMarkAtDepth(depth);
4433
if (
45-
targetMarkBySourceMark.TryGetValue(
46-
openingQuotationMark,
47-
out string correspondingClosingQuotationMark
48-
)
34+
newMarkByOldMark.TryGetValue(openingQuotationMark, out string correspondingClosingQuotationMark)
4935
&& correspondingClosingQuotationMark != closingQuotationMark
5036
)
5137
{
5238
return false;
5339
}
54-
targetMarkBySourceMark[openingQuotationMark] = closingQuotationMark;
40+
newMarkByOldMark[openingQuotationMark] = closingQuotationMark;
5541
}
5642
return true;
5743
}

src/SIL.Machine/Corpora/QuotationMarkUpdateResolutionSettings.cs renamed to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateResolutionSettings.cs

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
1-
using System.Collections.Generic;
2-
using System.Text.RegularExpressions;
3-
41
namespace SIL.Machine.PunctuationAnalysis
52
{
63
public class QuotationMarkUpdateResolutionSettings : IQuotationMarkResolutionSettings
74
{
8-
private readonly QuoteConvention _sourceQuoteConvention;
5+
private readonly QuoteConvention _oldQuoteConvention;
96
private readonly QuoteConventionSet _quoteConventionSingletonSet;
107

11-
public QuotationMarkUpdateResolutionSettings(QuoteConvention sourceQuoteConvention)
8+
public QuotationMarkUpdateResolutionSettings(QuoteConvention oldQuoteConvention)
129
{
13-
_sourceQuoteConvention = sourceQuoteConvention;
14-
_quoteConventionSingletonSet = new QuoteConventionSet(new List<QuoteConvention> { sourceQuoteConvention });
10+
_oldQuoteConvention = oldQuoteConvention;
11+
_quoteConventionSingletonSet = new QuoteConventionSet(new List<QuoteConvention> { oldQuoteConvention });
1512
}
1613

1714
public bool AreMarksAValidPair(string openingMark, string closingMark)
@@ -31,7 +28,7 @@ public Regex GetOpeningQuotationMarkRegex()
3128

3229
public HashSet<int> GetPossibleDepths(string quotationMark, QuotationMarkDirection direction)
3330
{
34-
return _sourceQuoteConvention.GetPossibleDepths(quotationMark, direction);
31+
return _oldQuoteConvention.GetPossibleDepths(quotationMark, direction);
3532
}
3633

3734
public bool IsValidClosingQuotationMark(QuotationMarkStringMatch quotationMarkMatch)
@@ -46,7 +43,7 @@ public bool IsValidOpeningQuotationMark(QuotationMarkStringMatch quotationMarkMa
4643

4744
public bool MetadataMatchesQuotationMark(string quotationMark, int depth, QuotationMarkDirection direction)
4845
{
49-
return _sourceQuoteConvention.GetExpectedQuotationMark(depth, direction) == quotationMark;
46+
return _oldQuoteConvention.GetExpectedQuotationMark(depth, direction) == quotationMark;
5047
}
5148

5249
public bool ShouldRelyOnParagraphMarkers()

src/SIL.Machine/Corpora/QuotationMarkUpdateSettings.cs renamed to src/SIL.Machine/PunctuationAnalysis/QuotationMarkUpdateSettings.cs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
using System.Collections.Generic;
2-
3-
namespace SIL.Machine.Corpora
1+
namespace SIL.Machine.PunctuationAnalysis
42
{
53
public class QuotationMarkUpdateSettings
64
{

0 commit comments

Comments
 (0)