diff options
author | Dominique Corbex <domcox@crosswire.org> | 2023-08-17 15:41:12 +0000 |
---|---|---|
committer | Dominique Corbex <domcox@crosswire.org> | 2023-08-17 15:41:12 +0000 |
commit | bc55005722f034ee61913842ff5ca3de97ec5bad (patch) | |
tree | 4cf057dd79095dfee80f71efb6ed200d7fb05f15 /modules | |
parent | cda6afbc5c9b2fc2d84a0d96daf166b8c28e379c (diff) | |
download | sword-tools-master.tar.gz |
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@564 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules')
-rwxr-xr-x | modules/conf/confmaker.py | 666 |
1 files changed, 337 insertions, 329 deletions
diff --git a/modules/conf/confmaker.py b/modules/conf/confmaker.py index 2ed9476..d3ba12a 100755 --- a/modules/conf/confmaker.py +++ b/modules/conf/confmaker.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- - # confmaker.py - Provides a initial conf file for a new module by analyzing # the related OSIS xml file. @@ -36,14 +35,18 @@ # Revision: # 2021-01-16 domcox <domcox@crosswire.org> # Changed language library from iso-639 to langtags +# 2023-07-30 domcox <domcox@crosswire.org> +# Full rewrite using ElementTree XML parsing module -# Requirements +# TODO: +# - EntrySize for verses that do not use milestone elements +# - EntrySize for book titles & introduction +# - Implement GlobalOptionsFilter=OSISReferenceLinks -import time -import re import argparse import sys +import time import xml.etree.ElementTree as ET from datetime import date from pathlib import Path @@ -61,10 +64,9 @@ except: # Variables -Version = '1.1' - -doc = [] +Version = '2.0' +# List of V11n and relative SWORD Minimum version versification = { 'KJV': '1.5.9', 'KJVA': '1.6.0', @@ -86,13 +88,12 @@ versification = { 'Calvin': '1.8.0' } - # Functions def die(msg): - ''' + """ Show an error message then exit on error - ''' + """ print('ERROR! ' + msg, file=sys.stderr) sys.exit(1) @@ -105,16 +106,23 @@ def get_parameters(): # Creating parser description = ''' - provides a conf file for a module by analysing given OSIS XML file and optionally including extra elements from a conf.in file. + provides a conf file for a module by analysing the given OSIS XML file. + Optionally include extra elements from a conf.in file. This option will + be removed in a future version. ''' parser = argparse.ArgumentParser(description=description) # Adding arguments - parser.add_argument("-i", "--infile", help="conf.in file containing extra elements to include, (default none)") - parser.add_argument("-o", "--outfile", help="name of generated conf file, (default to screen)") - parser.add_argument("-v", "--v11n", default='KJV', help="versification schema, (default: KJV)") - parser.add_argument("-s", "--size", default='2' , help="set -s 4 for modules with large entries > 64Kb, (default -s 2)") - parser.add_argument('osis', help='OSIS XML file') + parser.add_argument('osis', + help='name of the OSIS XML file') + parser.add_argument("-o", "--outfile", + help="name of generated conf file, (default to screen)") + parser.add_argument("-v", "--v11n", default='KJV', + help="versification schema, (default: KJV)") + parser.add_argument("-s", "--size", default='2', + help="set -s 4 for modules with large entries > 64Kb, (default -s 2)") + parser.add_argument("-i", "--infile", + help="conf.in file containing extra elements to include, (default none)") # Parsing arguments args = parser.parse_args() @@ -123,29 +131,24 @@ def get_parameters(): def check_parameters(params): - ''' - Check CLI parameters for validity - ''' + """ + Check command arguments + """ - # Checking OSIS file + # Check OSIS file value osisfile = params['osis'] fileObj = Path(osisfile) if not fileObj.is_file(): die(f"File '{osisfile}' does not exist.") - # Checking conf.in file in input + # Check conf.in file value if params['infile']: infile = params['infile'] fileObj = Path(infile) if not fileObj.is_file(): die(f"File '{infile}' does not exist.") - # Checking Size - size = params['size'] - if size not in ('2', '4'): - die(f"--size='{size}' Incorrect value.") - - # Chexcking versification schema + # Check versification schema v11n = params['v11n'] av11n = versification.keys() if v11n not in av11n: @@ -153,73 +156,11 @@ def check_parameters(params): return (True) -def get_osistext(osisfile): - """ - Read osisText node from osis file. - Returns dict containing osisIDWork, osisRefWork, osisLang - """ - - # Search for <osisText ... > node - start_tag = '<osisText' - end_tag = '>' - start_tag_identified = False - node_identified = False - # osisText content in XML - captured_line = '' - # osisText attributes - osistext = dict([]) - # open Osis - with open(osisfile) as f: - # Read lines until osisText is captured - while not node_identified: - line = f.readline() - if not line: - # End of File - die('osisText not found in osis file') - # Search for osisText tag - if start_tag in line: - start_tag_identified = True - if start_tag_identified: - # capture osisText content - captured_line += line - if end_tag in line: - # osisText is fully captured - node_identified = True - # Read attributes - for attribute in {'osisIDWork', 'osisRefWork', 'xml:lang'}: - value = (re.search(rf'{attribute}="(.+?)"', captured_line, flags=re.IGNORECASE)) - if value: - osistext[attribute] = value.group(1) - else: - die(f'osisText attribute missing: {attribute}') - return osistext - - -def check_osistext(osistext): - ''' - Check osisText attributes - ''' - # Check osisIDWork - module = osistext['osisIDWork'].lower() - if len(module) < 1: - die('FATAL: osisIDWork is empty.') - - # Check osisRefWork - moduletype = osistext['osisRefWork'] - if moduletype.lower() not in ['bible','commentary','genbook']: - die(f"FATAL: Invalid attribute osisRefWork: {osiswork}") - - # Check Language - lang = osistext['xml:lang'] - language = get_language(lang) - - return True - - def get_language(lang): """ - Search BCP-47 Languages Database for lang + Search BCP-47 Languages Database for the given lang """ + found = False try: @@ -232,69 +173,6 @@ def get_language(lang): return (tag.language.description.replace('\n', ' ')) -def is_tag(xml_file, tag): - """ - Search for 'tag' in OSIS file and returns True if 'tag' exists, False otherwise - """ - # Start searching after <header> tag to avoid confusion - end_header_tag = '</header>' - header_read = False - # Tag to search - start_tag = f'<{tag}' - tag_identified = False - line = True - with open(xml_file) as f: - # Read until tag is identified - while line and not tag_identified: - line = f.readline() - # Skip <header> section - if end_header_tag in line: - header_read = True - if header_read: - if start_tag in line: - tag_identified = True - return tag_identified - - -def is_attribute(xml_file, tag, attribute): - """ - Search for 'tag' + 'attribute' in OSIS file, - returns True if 'tag' + 'attribute' exists, False otherwise - """ - # Start searching after <header> tag to avoid confusion - end_header_tag = '</header>' - header_read = False - # Start and end tags defining the element that may have 'attribute' - start_tag = f'<{tag}' - end_tag = f'</{tag}>' - element = '' - start_tag_identified = False - attribute_identified = False - line = True - with open(xml_file) as f: - # Read lines until attribute is identified - while line and not attribute_identified: - line = f.readline() - # Skip <header> section - if end_header_tag in line: - header_read = True - if header_read: - # Search for tag - if start_tag in line: - start_tag_identified = True - if start_tag_identified: - # Read elemnt - element += line - if end_tag in line: - element += line - start_tag_identified = False - # Search for attribute - if attribute in line: - attribute_identified = True - element = '' - return attribute_identified - - def is_diacritic(xml_file, lang, diacritic): ''' Search for 'diacritic' in OSIS File @@ -336,208 +214,338 @@ def is_diacritic(xml_file, lang, diacritic): return(ref_text.c_str() != mod_text.c_str()) -def build_doc(conf): +def osis2conf_parser(args): ''' - Generate conf file + This function Parses the OSIS file. searches for specific tags + and creates the relevant conf elements that will be used to build the conf file. ''' - # Module Name - module = conf['osisIDWork'] - doc.append("[" + module + "]") - - # Module Type - moduletype = conf['osisRefWork'] - # Parameters related to moduletype - # Big entries - size = conf['size'] - block = '4' if size == '4' else '' - # mod - mod = module.lower() - # ModDrv + Datapath - if moduletype.lower() in 'bible': - doc.append("ModDrv=zText" + block) - doc.append("DataPath=./modules/texts/ztext" + block + "/" + mod + "/") - if moduletype.lower() in 'commentary': - doc.append("ModDrv=zCom" + block) - doc.append("DataPath=./modules/comments/zcom" + block + "/" + mod + "/") - if moduletype.lower() in 'genbook': - doc.append("ModDrv=RawGenBook" + block) - doc.append("DataPath=./modules/genbook/rawgenbook/" + block + "/" + mod + "/" + mod) - - # Compression - if moduletype.lower() in ['bible','commentary']: - doc.append('CompressType=ZIP') - - # misc. - doc.append('BlockType=BOOK') - doc.append('Encoding=UTF-8') - doc.append('SourceType=OSIS') - doc.append('OSISVersion=2.1.1') - doc.append('SwordVersionDate=' + str(date.today())) - - # Language - lang = conf['xml:lang'] - doc.append('Lang=' + lang) - - # GlobalOptionFilter - # Get Osis file name - osis = conf['osis'] - # We should have Footnotes before Headings on order to have - # working notes in titles - # Footnotes - if is_tag(osis, 'note'): - doc.append('GlobalOptionFilter=OSISFootnotes') - # Headings - if is_tag(osis, 'title'): - doc.append('GlobalOptionFilter=OSISHeadings') - # Scripref - if is_tag(osis, 'reference'): - doc.append('GlobalOptionFilter=OSISScripref') - # RedLetterWords - if is_tag(osis, 'q '): - doc.append('GlobalOptionFilter=OSISRedLetterWords') - # Variants - variants = False - if is_attribute(osis, 'seg', ' type="x-variant"'): - variants = True - if is_tag(osis, 'rdg'): - variants = True - if variants: - doc.append('GlobalOptionFilter=OSISVariants') - # MorphSegmentaton - osisMorphSegmentation = False - if is_attribute(osis, 'seg', 'type="morph"'): - osisMorphSegmentation = True - if is_attribute(osis, 'seg', 'type="x-morph"'): - osisMorphSegmentation = True - if osisMorphSegmentation: - doc.append('GlobalOptionFilter=OSISMorphSegmentation') - # Lemma - if is_attribute(osis, 'w', ' lemma='): - doc.append('GlobalOptionFilter=OSISLemma') - # Strong - strong = is_attribute(osis, 'w', 'strong:') - if strong: - doc.append('GlobalOptionFilter=OSISStrongs') - # Glosses - if is_attribute(osis, 'w', ' gloss='): - doc.append('GlobalOptionFilter=OSISGlosses') - # Morph - if is_attribute(osis, 'w', ' morph='): - doc.append('GlobalOptionFilter=OSISMorph') - # Enum - if is_attribute(osis, 'w', ' n='): - doc.append('GlobalOptionFilter=OSISEnum') - # Xlit - if is_attribute(osis, 'w', ' xlit='): - doc.append('GlobalOptionFilter=OSISXlit') - - # Diacritics + # Variables: + + # 1. List of Key elements of the resulting SWORD conf file + Elements = [] + + # 2. OSIS sections + Header = False + Chapter = False + Verse = False + + # 3. Big entry + if int(args['size']) > 2: + EntrySize = 655536 + else: + EntrySize=0 + + # 4. Key elements of a SWORD module.conf + Name = '' + Type = '' + Lang = '' + Description = '' + About = '' + TextSource = '' + DistributionLicense = '' + OSISFootnotes = False + OSISHeadings = False + OSISScripref = False + OSISRedLetterWords = False + OSISVariants = False + OSISMorphSegmentation = False + OSISLemma = False + OSISStrongs = False + OSISGlosses = False + OSISMorph = False + OSISEnum = False + OSISXlit = False + Images = False + NoParagraphs = True + Copyright = '' + CopyrightHolder = '' + CopyrightDate = '' + CopyrightNotes = '' + CopyrightContactName = '' + CopyrightContactNotes = '' + CopyrightContactAddress = '' + Abbreviation = '' + KeyType = '' + DisplayLevel = '' + CaseSensitiveKeys='' + PreferredCSSXHTML = '' + Obsoletes = '' + Companion = '' + + # Let's parse + for event, node in ET.iterparse(args['osis'], events=("start", "end")): + # OsisText content + if not Name: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}osisText': + # Get osisIDWork + Name = node.get('osisIDWork') + if not Name: + die('FATAL: osisIDWork is empty.') + # Get osisRefWork + Type = node.get('osisRefWork').lower() + if Type not in ['bible', 'commentary', 'genbook']: + die(f"FATAL: Invalid attribute osisRefWork: {osiswork}") + # Get Language + Lang = node.get('{http://www.w3.org/XML/1998/namespace}lang') + if not Lang: + die(f'FATAL: Missing lang element') + + # Select header + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}header' and event == "start": + Header = True + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}header' and event == "end": + Header = False + # Select Chapter + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}chapter' and event == "start": + Chapter = True + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}chapter' and event == "end": + Chapter = False + + # GlobalOptionFilters + if not Header: + # Footnotes + if not OSISFootnotes: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}note': + OSISFootnotes = True + # Headings + if not OSISHeadings: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}title': + OSISHeadings = True + # Scripref + if not OSISScripref: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}reference': + OSISScripref = True + # RedLetterWords + if not OSISRedLetterWords: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}q': + OSISRedLetterWords = True + # Variants + if not OSISVariants: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}seg': + if 'type' in node.keys(): + if 'x-variant' in node.get('type'): + OSISVariants = True + # MorphSegmentation + if not OSISMorphSegmentation: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}seg': + if 'type' in node.keys(): + if 'morph:' in node.get('type'): + OSISMorphSegmentation = True + # Lemma + if not OSISLemma: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}w': + if node.get('lemma') != None: + OSISLemma = True + # Strongs + if not OSISStrongs: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}w': + if 'lemma' in node.keys(): + if 'strong' in node.get('lemma'): + OSISStrongs = True + # Glosses + if not OSISGlosses: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}w': + if 'gloss' in node.keys(): + OSISGlosses = True + # Morph + if not OSISMorph: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}w': + if 'morph' in node.keys(): + OSISMorph = True + # Enum + if not OSISEnum: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}w': + if 'n' in node.keys(): + OSISEnum = True + # Xlit + if not OSISXlit: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}w': + if 'xlit' in node.keys(): + OSISXlit = True + + # Images + if not Images: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}figure': + Images = True + # Search only inside Chapters + if Chapter: + # NoParagraphs + if NoParagraphs: + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}p': + NoParagraphs = False + # Entries length - Get verse max size + if node.tag == '{http://www.bibletechnologies.net/2003/OSIS/namespace}verse': + if 'sID' in node.keys(): + Verse = True + rawtext = '' + if 'eID' in node.keys(): + Verse = False + if len(rawtext) > EntrySize: + EntrySize =len(rawtext) + # Entries length - Get verse text + if Verse: + if node.text != None: + rawtext = rawtext + node.text + if node.tail != None: + rawtext = rawtext + node.tail + + # Define Elements + # Set Name + Elements.append("[" + Name + "]") + # Derive module name + module = Name.lower() + # Set big entry option for entries greater than 64K bytes + big = '' + if EntrySize > 64000: + big='4' + # Set ModDrv + if Type in 'bible': + Elements.append("ModDrv=zText" + big) + if Type in 'commentary': + Elements.append("ModDrv=zCom" + big) + if Type in 'genbook': + Elements.append("ModDrv=RawGenBook" + big) + # Set Datapath + if Type in 'bible': + Elements.append("DataPath=./modules/texts/ztext" + big + "/" + module + "/") + if Type in 'commentary': + Elements.append("DataPath=./modules/comments/zcom" + big + "/" + module + "/") + if Type in 'genbook': + Elements.append("DataPath=./modules/genbook/rawgenbook/" + big + "/" + module + "/" + module) + # Set Compression + if Type in ['bible', 'commentary']: + Elements.append('CompressType=ZIP') + # Set misc. elements + Elements.append('BlockType=BOOK') + Elements.append('Encoding=UTF-8') + Elements.append('SourceType=OSIS') + Elements.append('OSISVersion=2.1.1') + Elements.append('SwordVersionDate=' + str(date.today())) + + # Set Lang + Elements.append('Lang=' + Lang) + # Set GlobalOptionFilters + if OSISFootnotes: + Elements.append('GlobalOptionFilter=OSISFootnotes') + if OSISHeadings: + Elements.append('GlobalOptionFilter=OSISHeadings') + if OSISScripref: + Elements.append('GlobalOptionFilter=OSISScripref') + if OSISRedLetterWords: + Elements.append('GlobalOptionFilter=OSISRedLetterWords') + if OSISVariants: + Elements.append('GlobalOptionFilter=OSISVariants') + if OSISMorphSegmentation: + Elements.append('GlobalOptionFilter=OSISMorphSegmentation') + if OSISLemma: + Elements.append('GlobalOptionFilter=OSISLemma') + if OSISStrongs: + Elements.append('GlobalOptionFilter=OSISStrongs') + if OSISGlosses: + Elements.append('GlobalOptionFilter=OSISGlosses') + if OSISMorph: + Elements.append('GlobalOptionFilter=OSISMorph') + if OSISEnum: + Elements.append('GlobalOptionFilter=OSISEnum') + if OSISXlit: + Elements.append('GlobalOptionFilter=OSISXlit') + # Set Diacritics # Hebrew Vowel Points - if is_diacritic(osis, lang, 'Hebrew Vowel Points'): - doc.append('GlobalOptionFilter=UTF8HebrewPoints') + if is_diacritic(args['osis'], Lang, 'Hebrew Vowel Points'): + Elements.append('GlobalOptionFilter=UTF8HebrewPoints') # Arabic Vowel Points - if is_diacritic(osis, lang, 'Arabic Vowel Points'): - doc.append('GlobalOptionFilter=UTF8ArabicPoints') + if is_diacritic(args['osis'], Lang, 'Arabic Vowel Points'): + Elements.append('GlobalOptionFilter=UTF8ArabicPoints') # Hebrew Cantillation - if is_diacritic(osis, lang, 'Hebrew Cantillation'): - doc.append('GlobalOptionFilter=UTF8Cantillation') + if is_diacritic(args['osis'], Lang, 'Hebrew Cantillation'): + Elements.append('GlobalOptionFilter=UTF8Cantillation') # Greek Accents - if is_diacritic(osis, lang, 'Greek Accents'): - doc.append('GlobalOptionFilter=UTF8GreekAccents ') - - # Features - # StrongsNumbers - if strong: - doc.append('Feature=StrongsNumbers') + if is_diacritic(args['osis'], Lang, 'Greek Accents'): + Elements.append('GlobalOptionFilter=UTF8GreekAccents ') + # Set Features + if OSISStrongs: + Elements.append('Feature=StrongsNumbers') # Images - if is_tag(osis, 'figure '): - doc.append('Feature=Images') + if Images: + Elements.append('Feature=Images') # NoParagraphs - if not is_tag(osis, 'p '): - doc.append('Feature=NoParagraphs') + if NoParagraphs: + Elements.append('Feature=NoParagraphs') + # Set LCSH + language = get_language(Lang) + if Type not in 'genbook': + Elements.append('LCSH=' + Type.capitalize() + '.' + language) - # LCSH - lang_name = get_language(lang) - if moduletype.lower() in ['bible','commentary']: - doc.append('LCSH=' + moduletype + '.' + lang_name) + # Set Sword Minimum Version + Elements.append('MinimumVersion=' + versification[args['v11n']]) + # Set Versification + if Type not in 'genbook': + Elements.append('Versification=' + args['v11n']) - # Sword Minimum Version - doc.append('MinimumVersion=' + versification[conf['v11n']]) - if moduletype.lower() in ['bible','commentary']: - doc.append('Versification=' + conf['v11n']) - return True - - -def include_file(conf): - ''' - Include conf.in file if it exists - ''' - # Get conf.in file if it exists - infile = conf['infile'] - if infile: - # Read and include conf.in contents - with open(infile, 'r', encoding='utf-8', newline='\n') as f: - for line in f: - doc.append(line.rstrip()) - else: - # No conf.in file -> generate default values - module = conf['osisIDWork'] - moduletype = conf['osisRefWork'] - language = get_language(conf['xml:lang']) - doc.append('DistributionLicense=Copyrighted') - doc.append(f'Description={module}, {moduletype} in {language}') - doc.append(f'About={module}, {moduletype} in {language}') - doc.append('Version=1.0') - doc.append('History_1.0=First release') - return True - - -def print_out(conf, doc): - ''' - Print generated conf file - ''' - # Get conf file name - outfile = conf['outfile'] - if not outfile: - # Default to screen - for element in doc: - print(element) - else: - # Write config to file - with open(outfile, 'w') as f: - for element in doc: - print(element, file=f) - return True + # End + print('EntrySize=',EntrySize) + return Elements def main(): ''' Main function ''' - # Start benchmark + # Start benchmark start_time = time.perf_counter() # Read CLI params params = get_parameters() check_parameters(params) - # Read OSIS attributes - osis_attributes = (get_osistext(params['osis'])) - #print(osis_attributes) - check_osistext(osis_attributes) + # Parse OSIS + conf = osis2conf_parser(params) + # print('conf=', conf) - # Generate conf - cf = {**params, **osis_attributes} - build_doc(cf) - include_file(cf) - print_out(cf, doc) + # Generate conf file + outfile = params['outfile'] + if not outfile: + # Default to screen + for key in conf: + print(key) + else: + # Write config to file + with open(outfile, 'w') as f: + for key in conf: + print(key, file=f) + + # Include conf.in file if it exists + infile = params['infile'] + if infile: + # Read and include conf.in contents + with open(infile, 'r', encoding='utf-8', newline='\n') as f: + for line in f: + if not outfile: + # Default to screen + print(line.rstrip()) + else: + # Write config to file + with open(outfile, 'a') as f: + print(line.rstrip(), file=f) + else: + # No conf.in file -> generate default values + if not outfile: + # Default to screen + print('DistributionLicense=Copyrighted') + print('Description=This is a new module') + print('Version=1.0') + print('History_1.0=First release') + else: + # Write config to file + with open(outfile, 'a') as f: + print('DistributionLicense=Copyrighted', file=f) + print('Description=This is a new module', file=f) + print('Version=1.0', file=f) + print('History_1.0=First release', file=f) # Benchmark results end_time = time.perf_counter() total_time = round(end_time - start_time, 1) print(f'-- Module Config generated in {total_time} s') - return True - - main() |