From 388e56cf16e7c5fe7569112d229fa628d5b81497 Mon Sep 17 00:00:00 2001 From: Matěj Cepl Date: Sun, 14 May 2017 00:02:50 +0200 Subject: All tests for the verse parser PASS!!! Yay! --- generate_reference.py | 83 ++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 38 deletions(-) (limited to 'generate_reference.py') diff --git a/generate_reference.py b/generate_reference.py index e182205..d437a3d 100755 --- a/generate_reference.py +++ b/generate_reference.py @@ -18,6 +18,7 @@ ENGL_BOOKS = ('Gen', 'Exod', 'Lev', 'Num', 'Deut', 'Josh', 'Judg', 'Ruth', 'Phil', 'Col', '1Thess', '2Thess', '1Tim', '2Tim', 'Titus', 'Phlm', 'Heb', 'Jas', '1Pet', '2Pet', '1John', '2John', '3John', 'Jude', 'Rev') +log = logging.getLogger(__name__) # 1Mak, 2Mak, 3Mak, 4Mak, ditto s Ma @@ -95,60 +96,72 @@ EN_BOOKS = tuple(TranslDict.values()) BIBLE_REF_PARSE_RE = re.compile(r''' (%s)? # Name of the Bible book (optional) \s*([0-9]+), # chapter number separated by whitespace - ([0-9—.-]+) # verse number(s) separated by (optional) comma + ([0-9n—.-]+) # verse(s) number ''' % '|'.join(CZ_BOOKS), flags=re.VERBOSE) ONLY_VERSE_PARSE_RE = re.compile(r''' - v\.\s+ # abbreviation of "verš" (verse) - ([0-9—.-]+) # verse number(s) separated by (optional) comma -''', flags=re.VERBOSE) -SPLIT_VERSE_RE = re.compile(''' - (\d+) - [—.-] - (\d+) + \bv\.\s+ # abbreviation of "verš" (verse) + ([0-9n—.-]+) # verse(s) number ''', flags=re.VERBOSE) +SPLIT_VERSE_RE = re.compile('[—.-]+') + + +def next_verses(v_str): + out = [] + v_int = int(v_str.rstrip('n')) + out.append(v_int) + for add_v in range(v_str.count('n')): + out.append(v_int + add_v + 1) + log.debug('out = %s', out) + return out + + +def verse_interval(verse_str): + log.debug('verse_str = %s', verse_str) + out = [] + + for vers_elem in verse_str.split('.'): + log.debug('vers_elem = %s', vers_elem) + i_matchs = SPLIT_VERSE_RE.split(vers_elem) + log.debug('i_matchs = %s', i_matchs) + if len(i_matchs) == 2: + up_limit = next_verses(i_matchs[1]) + for verse in range(int(i_matchs[0]), up_limit[-1] + 1): + out.append(verse) + elif len(i_matchs) == 1: + out.extend(next_verses(i_matchs[0])) + else: + raise ValueError('weird interval = %s' % vers_elem) + log.debug('out = %s', out) + return tuple(out) -def verse_interval(verse): - match = SPLIT_VERSE_RE.search(verse) - logging.debug('match = %s', match) - if match: - matches = match.groups() - verse = int(match.group(1)) - next_verse = int(match.group(2)) - logging.debug('verse, next_verse = %s, %s', verse, next_verse) - return (verse, next_verse) - else: - return (int(verse),) def parse_notes_test(instr): - logging.debug('%s\ninstr = %s', '-' * 30, instr) + log.debug('%s\ninstr = %s', '-' * 30, instr) matches = BIBLE_REF_PARSE_RE.findall(instr) - logging.debug('BIBLE_REF_PARSE_RE matches = %s', matches) + log.debug('BIBLE_REF_PARSE_RE matches = %s', matches) out_list = [] book = None for match in matches: - logging.debug('match = %s', match) + log.debug('match = %s', match) if match[0]: book = match[0] elif book is None: book = '' - logging.debug('match[1] = %s', match[1]) if match[1]: - chapter = int(match[1].rstrip(',')) + chapter = int(match[1]) else: chapter = 0 - logging.debug('match[2] = %s', match[2]) - verse = match[2] - out_list.append((book, chapter) + verse_interval(match[2])) + out_list.append((book, chapter, verse_interval(match[2]))) + matches = ONLY_VERSE_PARSE_RE.findall(instr) - logging.debug('ONLY_VERSE_PARSE_RE matches = %s', matches) + log.debug('ONLY_VERSE_PARSE_RE matches = %s', matches) for match in matches: - logging.debug('match = %s', match) + log.debug('match = %s', match) if match: - out = ('', 0) + verse_interval(match) - out_list.append(out) + out_list.append(('', 0, verse_interval(match))) return tuple(out_list) @@ -163,12 +176,6 @@ class GenerateReferencesFilter(XMLFilterBase): self._in_note = True self._note_content = "" -# def startDocument(self): -# pass - -# def endDocument(self): -# pass - def startElement(self, name, attrs): # noqa if name == "verse" and 'sID' in attrs: ref_elements = attrs['sID'].split('.') @@ -181,7 +188,7 @@ class GenerateReferencesFilter(XMLFilterBase): def endElement(self, name): # noqa if name == 'note' and self._in_note: - logging.debug('content:\n%s', self._note_content) + log.debug('content:\n%s', self._note_content) self._in_note = False self._note_content = "" self._downstream.endElement(name) -- cgit