diff --git a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py index 3d52658..808a717 100644 --- a/machine/punctuation_analysis/paratext_project_quote_convention_detector.py +++ b/machine/punctuation_analysis/paratext_project_quote_convention_detector.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Optional, Union +from typing import BinaryIO, Dict, List, Optional, Union from ..corpora.paratext_project_settings import ParatextProjectSettings from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from ..corpora.usfm_parser import parse_usfm +from ..scripture.canon import book_id_to_number, get_scripture_books from ..utils.typeshed import StrPath from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector @@ -16,10 +17,13 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti self._settings = settings def get_quote_convention_analysis( - self, handler: Optional[QuoteConventionDetector] = None + self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None ) -> Optional[QuoteConventionAnalysis]: handler = QuoteConventionDetector() if handler is None else handler - for file_name in self._settings.get_all_scripture_book_file_names(): + for book_id in get_scripture_books(): + if include_chapters is not None and book_id_to_number(book_id) not in include_chapters: + continue + file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): continue with self._open(file_name) as sfm_file: @@ -33,7 +37,7 @@ def get_quote_convention_analysis( f". 
Error: '{e}'" ) raise RuntimeError(error_message) from e - return handler.detect_quote_convention() + return handler.detect_quote_convention(include_chapters) @abstractmethod def _exists(self, file_name: StrPath) -> bool: ... diff --git a/machine/punctuation_analysis/quote_convention_detector.py b/machine/punctuation_analysis/quote_convention_detector.py index 5a8d098..c37e813 100644 --- a/machine/punctuation_analysis/quote_convention_detector.py +++ b/machine/punctuation_analysis/quote_convention_detector.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Optional +from typing import Dict, List, Optional from .chapter import Chapter from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver @@ -51,8 +51,10 @@ def _count_quotation_marks_in_chapter( self._quotation_mark_tabulator.tabulate(resolved_quotation_marks) - def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]: - self._count_quotation_marks_in_chapters(self.get_chapters()) + def detect_quote_convention( + self, include_chapters: Optional[Dict[int, List[int]]] = None + ) -> Optional[QuoteConventionAnalysis]: + self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters)) (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( self._quotation_mark_tabulator diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index c8d44bd..bbccdbd 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -7,6 +7,8 @@ class TextSegment: def __init__(self): self._text = "" + self.book: Optional[str] = None + self.chapter: Optional[int] = None self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER self._markers_in_preceding_context: Set[UsfmMarkerType] = set() self.previous_segment: Optional[TextSegment] = None @@ -71,6 +73,14 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> 
"TextSegment.Builder": self._text_segment._markers_in_preceding_context.add(marker) return self + def set_book(self, code: str) -> "TextSegment.Builder": + self._text_segment.book = code + return self + + def set_chapter(self, number: int) -> "TextSegment.Builder": + self._text_segment.chapter = number + return self + + def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": self._text_segment._usfm_token = token return self diff --git a/machine/punctuation_analysis/usfm_structure_extractor.py b/machine/punctuation_analysis/usfm_structure_extractor.py index 42fb9c3..baeef06 100644 --- a/machine/punctuation_analysis/usfm_structure_extractor.py +++ b/machine/punctuation_analysis/usfm_structure_extractor.py @@ -1,8 +1,9 @@ -from typing import Optional, Sequence +from typing import Dict, List, Optional, Sequence from ..corpora.usfm_parser_handler import UsfmParserHandler from ..corpora.usfm_parser_state import UsfmParserState from ..corpora.usfm_token import UsfmAttribute +from ..scripture.canon import book_id_to_number from .chapter import Chapter from .text_segment import TextSegment from .usfm_marker_type import UsfmMarkerType @@ -14,6 +15,9 @@ def __init__(self): self._text_segments: list[TextSegment] = [] self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: + self._next_text_segment_builder.set_book(code) + def chapter( self, state: UsfmParserState, @@ -23,6 +27,9 @@ def chapter( pub_number: Optional[str], ) -> None: self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER) + # isdecimal(), not isdigit(): isdigit() also accepts e.g. superscript digits, which int() rejects. + chapter_number: int = int(number) if number.isdecimal() else 0 + self._next_text_segment_builder.set_chapter(chapter_number) def start_para( self, @@ -79,11 +86,26 @@ def text(self, state: UsfmParserState, text: str) -> None: self._text_segments.append(text_segment) self._next_text_segment_builder = TextSegment.Builder() - def get_chapters(self) -> list[Chapter]: 
+ def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) -> list[Chapter]: chapters: list[Chapter] = [] + current_book: int = 0 + current_chapter: int = 0 current_chapter_verses: list[Verse] = [] current_verse_segments: list[TextSegment] = [] for text_segment in self._text_segments: + if text_segment.book is not None: + current_book = book_id_to_number(text_segment.book) + if text_segment.chapter is not None: + current_chapter = text_segment.chapter + if include_chapters is not None and current_book > 0: + if current_book not in include_chapters: + continue + elif ( + current_chapter > 0 + and len(include_chapters[current_book]) > 0 + and current_chapter not in include_chapters[current_book] + ): + continue if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): if len(current_verse_segments) > 0: current_chapter_verses.append(Verse(current_verse_segments)) diff --git a/machine/scripture/canon.py b/machine/scripture/canon.py index 8bafaf1..04ae8f0 100644 --- a/machine/scripture/canon.py +++ b/machine/scripture/canon.py @@ -184,4 +184,4 @@ def is_canonical(book: Union[str, int]) -> bool: def get_scripture_books() -> Iterable[str]: - return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_ot_nt(kvp[1]), BOOK_NUMBERS.items()))) + return list(map(lambda kvp: kvp[0], filter(lambda kvp: is_canonical(kvp[1]), BOOK_NUMBERS.items()))) diff --git a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py index 7ffe2ba..36e0f9f 100644 --- a/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py +++ b/tests/punctuation_analysis/test_paratext_project_quote_convention_detector.py @@ -1,27 +1,29 @@ -from typing import Dict, Optional +from typing import Dict, List, Optional from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector from machine.corpora import 
ParatextProjectSettings, UsfmStylesheet -from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector, QuoteConventionAnalysis -from machine.scripture import ORIGINAL_VERSIFICATION, Versification +from machine.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, + ParatextProjectQuoteConventionDetector, + QuoteConvention, + QuoteConventionAnalysis, +) +from machine.scripture import ORIGINAL_VERSIFICATION, Versification, get_chapters + +standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( + "standard_english" +) +standard_french_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name( + "standard_french" +) def test_get_quote_convention() -> None: env = _TestEnvironment( files={ - "41MATTest.SFM": r"""\id MAT -\c 1 -\v 1 Someone said, “This is something I am saying! -\v 2 This is also something I am saying” (that is, “something I am speaking”). -\p -\v 3 Other text, and someone else said, -\q1 -\v 4 “Things -\q2 someone else said! 
-\q3 and more things someone else said.” -\m That is why he said “things someone else said.” -\v 5 Then someone said, “More things someone said.”""", + "41MATTest.SFM": rf"""\id MAT +{get_test_chapter(1, standard_english_quote_convention)}""", } ) analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention() @@ -30,6 +32,64 @@ def test_get_quote_convention() -> None: assert analysis.best_quote_convention.name == "standard_english" +def test_get_quote_convention_by_book() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": rf"""\id MAT +{get_test_chapter(1, standard_english_quote_convention)}""", + "42MRKTest.SFM": rf"""\id MRK +{get_test_chapter(1, standard_french_quote_convention)}""", + } + ) + analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK") + assert analysis is not None + assert analysis.best_quote_convention_score > 0.8 + assert analysis.best_quote_convention.name == "standard_french" + + +def test_get_quote_convention_by_chapter() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": rf"""\id MAT +{get_test_chapter(1, standard_english_quote_convention)}""", + "42MRKTest.SFM": rf"""\id MRK +{get_test_chapter(1, standard_english_quote_convention)} +{get_test_chapter(2, standard_french_quote_convention)} +{get_test_chapter(3, standard_english_quote_convention)} +{get_test_chapter(4, standard_english_quote_convention)} +{get_test_chapter(5, standard_french_quote_convention)}""", + } + ) + analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5") + assert analysis is not None + assert analysis.best_quote_convention_score > 0.66 + assert analysis.best_quote_convention.name == "standard_french" + + +def test_get_quote_convention_by_chapter_indeterminate() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": rf"""\id MAT +{get_test_chapter(1, None)} +{get_test_chapter(2, standard_english_quote_convention)} +{get_test_chapter(3, None)}""", + } + ) + analysis: 
Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT1,3") + assert analysis is None + + +def test_get_quote_convention_invalid_book_code() -> None: + env = _TestEnvironment( + files={ + "41MATTest.SFM": rf"""\id LUK +{get_test_chapter(1, standard_english_quote_convention)}""", + } + ) + analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT") + assert analysis is None + + class _TestEnvironment: def __init__( self, @@ -44,8 +104,27 @@ def __init__( def detector(self) -> ParatextProjectQuoteConventionDetector: return self._detector - def get_quote_convention(self) -> Optional[QuoteConventionAnalysis]: - return self.detector.get_quote_convention_analysis() + def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]: + chapters: Optional[Dict[int, List[int]]] = None + if scripture_range is not None: + chapters = get_chapters(scripture_range, ORIGINAL_VERSIFICATION) + return self.detector.get_quote_convention_analysis(include_chapters=chapters) + + +def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) -> str: + left_quote: str = quote_convention.get_opening_quotation_mark_at_depth(1) if quote_convention else "" + right_quote: str = quote_convention.get_closing_quotation_mark_at_depth(1) if quote_convention else "" + return rf"""\c {number} +\v 1 Someone said, {left_quote}This is something I am saying! +\v 2 This is also something I am saying{right_quote} (that is, {left_quote}something I am speaking{right_quote}). +\p +\v 3 Other text, and someone else said, +\q1 +\v 4 {left_quote}Things +\q2 someone else said! 
+\q3 and more things someone else said.{right_quote} +\m That is why he said {left_quote}things someone else said.{right_quote} +\v 5 Then someone said, {left_quote}More things someone said.{right_quote}""" class _DefaultParatextProjectSettings(ParatextProjectSettings): diff --git a/tests/punctuation_analysis/test_usfm_structure_extractor.py b/tests/punctuation_analysis/test_usfm_structure_extractor.py index 9c8941e..7249a6a 100644 --- a/tests/punctuation_analysis/test_usfm_structure_extractor.py +++ b/tests/punctuation_analysis/test_usfm_structure_extractor.py @@ -7,6 +7,52 @@ verse_text_parser_state.verse_ref.verse_num = 1 +def test_get_chapters_filter_by_book(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.start_book(verse_text_parser_state, "id", "GEN") + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + actual_chapters = usfm_structure_extractor.get_chapters({2: [1]}) # EXO 1 + assert len(actual_chapters) == 0 + + +def test_get_chapters_filter_by_chapter(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.start_book(verse_text_parser_state, "id", "MAT") + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.chapter(verse_text_parser_state, "2", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + usfm_structure_extractor.chapter(verse_text_parser_state, "3", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, 
"test3") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters({40: [2]}) + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + def test_chapter_and_verse_markers(): usfm_structure_extractor = UsfmStructureExtractor() usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)