Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Optional, Union
from typing import BinaryIO, Dict, List, Optional, Union

from ..corpora.paratext_project_settings import ParatextProjectSettings
from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from ..corpora.usfm_parser import parse_usfm
from ..scripture.canon import book_id_to_number, get_scripture_books
from ..utils.typeshed import StrPath
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector

Expand All @@ -16,10 +17,13 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
self._settings = settings

def get_quote_convention_analysis(
self, handler: Optional[QuoteConventionDetector] = None
self, handler: Optional[QuoteConventionDetector] = None, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
handler = QuoteConventionDetector() if handler is None else handler
for file_name in self._settings.get_all_scripture_book_file_names():
for book_id in get_scripture_books():
if include_chapters is not None and book_id_to_number(book_id) not in include_chapters:
continue
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
continue
with self._open(file_name) as sfm_file:
Expand All @@ -33,7 +37,7 @@ def get_quote_convention_analysis(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e
return handler.detect_quote_convention()
return handler.detect_quote_convention(include_chapters)

# Abstract hook implemented by subclasses. get_quote_convention_analysis calls it with a
# book's file name and skips that book when the result is falsy.
@abstractmethod
def _exists(self, file_name: StrPath) -> bool: ...
Expand Down
8 changes: 5 additions & 3 deletions machine/punctuation_analysis/quote_convention_detector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import List, Optional
from typing import Dict, List, Optional

from .chapter import Chapter
from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
Expand Down Expand Up @@ -51,8 +51,10 @@ def _count_quotation_marks_in_chapter(

self._quotation_mark_tabulator.tabulate(resolved_quotation_marks)

def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]:
self._count_quotation_marks_in_chapters(self.get_chapters())
def detect_quote_convention(
self, include_chapters: Optional[Dict[int, List[int]]] = None
) -> Optional[QuoteConventionAnalysis]:
self._count_quotation_marks_in_chapters(self.get_chapters(include_chapters))

(best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention(
self._quotation_mark_tabulator
Expand Down
10 changes: 10 additions & 0 deletions machine/punctuation_analysis/text_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
class TextSegment:
def __init__(self):
self._text = ""
self.book: Optional[str] = None
self.chapter: Optional[int] = None
self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
self.previous_segment: Optional[TextSegment] = None
Expand Down Expand Up @@ -71,6 +73,14 @@ def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder":
self._text_segment._markers_in_preceding_context.add(marker)
return self

def set_book(self, code: str) -> "TextSegment.Builder":
    """Store ``code`` as the ``book`` of the segment being built and return this builder."""
    segment = self._text_segment
    segment.book = code
    return self

def set_chapter(self, number: int) -> "TextSegment.Builder":
    """Store ``number`` as the ``chapter`` of the segment being built and return this builder."""
    segment = self._text_segment
    segment.chapter = number
    return self

def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
    """Attach ``token`` to the segment being built (as ``_usfm_token``) and return this builder."""
    segment = self._text_segment
    segment._usfm_token = token
    return self
Expand Down
25 changes: 23 additions & 2 deletions machine/punctuation_analysis/usfm_structure_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import Optional, Sequence
from typing import Dict, List, Optional, Sequence

from ..corpora.usfm_parser_handler import UsfmParserHandler
from ..corpora.usfm_parser_state import UsfmParserState
from ..corpora.usfm_token import UsfmAttribute
from ..scripture.canon import book_id_to_number
from .chapter import Chapter
from .text_segment import TextSegment
from .usfm_marker_type import UsfmMarkerType
Expand All @@ -14,6 +15,9 @@ def __init__(self):
self._text_segments: list[TextSegment] = []
self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder()

def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
    """Parser callback for the start of a book.

    Records ``code`` on the builder for the next text segment; ``state`` and
    ``marker`` are accepted but not used here.
    """
    pending_builder = self._next_text_segment_builder
    pending_builder.set_book(code)

def chapter(
self,
state: UsfmParserState,
Expand All @@ -23,6 +27,8 @@ def chapter(
pub_number: Optional[str],
) -> None:
self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER)
chapter_number: int = int(number) if number.isdigit() else 0
self._next_text_segment_builder.set_chapter(chapter_number)

def start_para(
self,
Expand Down Expand Up @@ -79,11 +85,26 @@ def text(self, state: UsfmParserState, text: str) -> None:
self._text_segments.append(text_segment)
self._next_text_segment_builder = TextSegment.Builder()

def get_chapters(self) -> list[Chapter]:
def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) -> list[Chapter]:
chapters: list[Chapter] = []
current_book: int = 0
current_chapter: int = 0
current_chapter_verses: list[Verse] = []
current_verse_segments: list[TextSegment] = []
for text_segment in self._text_segments:
if text_segment.book is not None:
current_book = book_id_to_number(text_segment.book)
if text_segment.chapter is not None:
current_chapter = text_segment.chapter
if include_chapters is not None and current_book > 0:
if current_book not in include_chapters:
continue
elif (
current_chapter > 0
and len(include_chapters[current_book]) > 0
and current_chapter not in include_chapters[current_book]
):
continue
if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE):
if len(current_verse_segments) > 0:
current_chapter_verses.append(Verse(current_verse_segments))
Expand Down
2 changes: 1 addition & 1 deletion machine/scripture/canon.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,4 +184,4 @@ def is_canonical(book: Union[str, int]) -> bool:


def get_scripture_books() -> Iterable[str]:
    """Return the keys of ``BOOK_NUMBERS`` whose value passes ``is_canonical``.

    The result is a freshly built list, in ``BOOK_NUMBERS`` iteration order.
    Only the post-change ``is_canonical`` filter is kept; the older
    ``is_ot_nt`` return that preceded it made this line unreachable.
    """
    # presumably BOOK_NUMBERS maps book id -> book number; verify against its definition
    return [book_id for book_id, book_num in BOOK_NUMBERS.items() if is_canonical(book_num)]
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
from typing import Dict, Optional
from typing import Dict, List, Optional

from testutils.memory_paratext_project_quote_convention_detector import MemoryParatextProjectQuoteConventionDetector

from machine.corpora import ParatextProjectSettings, UsfmStylesheet
from machine.punctuation_analysis import ParatextProjectQuoteConventionDetector, QuoteConventionAnalysis
from machine.scripture import ORIGINAL_VERSIFICATION, Versification
from machine.punctuation_analysis import (
STANDARD_QUOTE_CONVENTIONS,
ParatextProjectQuoteConventionDetector,
QuoteConvention,
QuoteConventionAnalysis,
)
from machine.scripture import ORIGINAL_VERSIFICATION, Versification, get_chapters

# Quote conventions looked up once, by name, from STANDARD_QUOTE_CONVENTIONS. The tests below
# pass them to get_test_chapter to choose the quotation marks of the generated USFM chapters.
# Both are annotated Optional, matching the lookup's declared use here.
standard_english_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
"standard_english"
)
standard_french_quote_convention: Optional[QuoteConvention] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(
"standard_french"
)


def test_get_quote_convention() -> None:
env = _TestEnvironment(
files={
"41MATTest.SFM": r"""\id MAT
\c 1
\v 1 Someone said, “This is something I am saying!
\v 2 This is also something I am saying” (that is, “something I am speaking”).
\p
\v 3 Other text, and someone else said,
\q1
\v 4 “Things
\q2 someone else said!
\q3 and more things someone else said.”
\m That is why he said “things someone else said.”
\v 5 Then someone said, “More things someone said.”""",
"41MATTest.SFM": rf"""\id MAT
{get_test_chapter(1, standard_english_quote_convention)}""",
}
)
analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention()
Expand All @@ -30,6 +32,64 @@ def test_get_quote_convention() -> None:
assert analysis.best_quote_convention.name == "standard_english"


def test_get_quote_convention_by_book() -> None:
    """Requesting only MRK must yield the convention used in the MRK file, not the MAT one."""
    matthew_usfm: str = rf"""\id MAT
{get_test_chapter(1, standard_english_quote_convention)}"""
    mark_usfm: str = rf"""\id MRK
{get_test_chapter(1, standard_french_quote_convention)}"""
    env = _TestEnvironment(files={"41MATTest.SFM": matthew_usfm, "42MRKTest.SFM": mark_usfm})

    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK")

    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.8
    assert analysis.best_quote_convention.name == "standard_french"


def test_get_quote_convention_by_chapter() -> None:
    """Requesting MRK 2 and 4-5 must favour French: two of those three chapters use it."""
    english = standard_english_quote_convention
    french = standard_french_quote_convention

    matthew_usfm: str = "\n".join((r"\id MAT", get_test_chapter(1, english)))
    # MRK chapters 1..5, in order; only chapters 2 and 5 use the French marks.
    mark_conventions = (english, french, english, english, french)
    mark_chapters = [
        get_test_chapter(chapter_number, convention)
        for chapter_number, convention in enumerate(mark_conventions, start=1)
    ]
    mark_usfm: str = "\n".join([r"\id MRK", *mark_chapters])

    env = _TestEnvironment(files={"41MATTest.SFM": matthew_usfm, "42MRKTest.SFM": mark_usfm})
    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MRK2,4-5")

    assert analysis is not None
    assert analysis.best_quote_convention_score > 0.66
    assert analysis.best_quote_convention.name == "standard_french"


def test_get_quote_convention_by_chapter_indeterminate() -> None:
    """Selecting only chapters generated without quotation marks must give no analysis."""
    # Chapters 1 and 3 are generated with no convention (empty quote strings);
    # only the unselected chapter 2 carries English marks.
    chapter_texts = [
        get_test_chapter(1, None),
        get_test_chapter(2, standard_english_quote_convention),
        get_test_chapter(3, None),
    ]
    matthew_usfm: str = "\n".join([r"\id MAT", *chapter_texts])
    env = _TestEnvironment(files={"41MATTest.SFM": matthew_usfm})

    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT1,3")

    assert analysis is None


def test_get_quote_convention_invalid_book_code() -> None:
    """A file named for MAT whose \\id line says LUK must give no analysis when MAT is requested."""
    mislabelled_usfm: str = "\n".join((r"\id LUK", get_test_chapter(1, standard_english_quote_convention)))
    env = _TestEnvironment(files={"41MATTest.SFM": mislabelled_usfm})

    analysis: Optional[QuoteConventionAnalysis] = env.get_quote_convention("MAT")

    assert analysis is None


class _TestEnvironment:
def __init__(
self,
Expand All @@ -44,8 +104,27 @@ def __init__(
def detector(self) -> ParatextProjectQuoteConventionDetector:
return self._detector

def get_quote_convention(self, scripture_range: Optional[str] = None) -> Optional[QuoteConventionAnalysis]:
    """Run quote-convention detection, optionally limited to a scripture range.

    Args:
        scripture_range: Range string such as ``"MRK"`` or ``"MRK2,4-5"``. When
            given, it is converted with ``get_chapters(scripture_range,
            ORIGINAL_VERSIFICATION)`` and passed on as ``include_chapters``.
            When ``None`` (the default), ``include_chapters=None`` is passed,
            so calls with no argument keep working.

    Returns:
        Whatever ``self.detector.get_quote_convention_analysis`` returns.
    """
    # A second, zero-argument definition of this method used to precede this one and was
    # shadowed by it; only this single definition is kept.
    chapters: Optional[Dict[int, List[int]]] = None
    if scripture_range is not None:
        chapters = get_chapters(scripture_range, ORIGINAL_VERSIFICATION)
    return self.detector.get_quote_convention_analysis(include_chapters=chapters)


def get_test_chapter(number: int, quote_convention: Optional[QuoteConvention]) -> str:
    """Build the USFM text of a five-verse test chapter.

    Args:
        number: Chapter number written after the chapter marker.
        quote_convention: Source of the depth-1 opening and closing quotation
            marks placed around the quoted passages. ``None`` produces the
            same text with no quotation marks at all.

    Returns:
        The chapter as a single string with no trailing newline.
    """
    # Test for None explicitly: a convention object that evaluates as falsy must
    # still supply its marks and not be treated like "no convention".
    if quote_convention is None:
        left_quote: str = ""
        right_quote: str = ""
    else:
        left_quote = quote_convention.get_opening_quotation_mark_at_depth(1)
        right_quote = quote_convention.get_closing_quotation_mark_at_depth(1)
    return rf"""\c {number}
\v 1 Someone said, {left_quote}This is something I am saying!
\v 2 This is also something I am saying{right_quote} (that is, {left_quote}something I am speaking{right_quote}).
\p
\v 3 Other text, and someone else said,
\q1
\v 4 {left_quote}Things
\q2 someone else said!
\q3 and more things someone else said.{right_quote}
\m That is why he said {left_quote}things someone else said.{right_quote}
\v 5 Then someone said, {left_quote}More things someone said.{right_quote}"""


class _DefaultParatextProjectSettings(ParatextProjectSettings):
Expand Down
46 changes: 46 additions & 0 deletions tests/punctuation_analysis/test_usfm_structure_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,52 @@
verse_text_parser_state.verse_ref.verse_num = 1


def test_get_chapters_filter_by_book():
    """A filter that names only book 2 must drop every segment recorded under GEN."""
    extractor = UsfmStructureExtractor()
    state = verse_text_parser_state
    extractor.start_book(state, "id", "GEN")
    extractor.chapter(state, "1", "c", None, None)
    extractor.verse(state, "1", "v", None, None)
    extractor.text(state, "test")

    exodus_chapter_one = {2: [1]}  # EXO 1
    assert len(extractor.get_chapters(exodus_chapter_one)) == 0


def test_get_chapters_filter_by_chapter():
    """Filtering MAT (book 40) to chapter 2 must return only that chapter's single segment."""
    extractor = UsfmStructureExtractor()
    state = verse_text_parser_state
    extractor.start_book(state, "id", "MAT")
    # Three chapters, each with one verse holding one text segment.
    for chapter_number, verse_text in (("1", "test"), ("2", "test2"), ("3", "test3")):
        extractor.chapter(state, chapter_number, "c", None, None)
        extractor.verse(state, "1", "v", None, None)
        extractor.text(state, verse_text)

    expected_segment = (
        TextSegment.Builder()
        .set_text("test2")
        .add_preceding_marker(UsfmMarkerType.CHAPTER)
        .add_preceding_marker(UsfmMarkerType.VERSE)
        .build()
    )
    expected_chapters = [Chapter([Verse([expected_segment])])]

    actual_chapters = extractor.get_chapters({40: [2]})
    assert_chapter_equal(expected_chapters, actual_chapters)
    # The surviving segment must not be linked to the segments that were filtered out.
    assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None
    assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None


def test_chapter_and_verse_markers():
usfm_structure_extractor = UsfmStructureExtractor()
usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None)
Expand Down