summaryrefslogtreecommitdiffstats
path: root/modules/conf/confmaker.py
diff options
context:
space:
mode:
Diffstat (limited to 'modules/conf/confmaker.py')
-rwxr-xr-xmodules/conf/confmaker.py543
1 files changed, 543 insertions, 0 deletions
diff --git a/modules/conf/confmaker.py b/modules/conf/confmaker.py
new file mode 100755
index 0000000..2ed9476
--- /dev/null
+++ b/modules/conf/confmaker.py
@@ -0,0 +1,543 @@
+#!/usr/bin/env python3
+
+# -*- coding: utf-8 -*-
+
+# confmaker.py - Provides an initial conf file for a new module by analyzing
+# the related OSIS xml file.
+
+## The programme searches for relevant tags and creates the GlobalOptionFilter
+# entries and other relevant conf entries. This is a port to Python from the
+# previous confmaker.pl Perl script we were using. It fixes detection of
+# diacritics and OSISMorphSegmentation (GlobalOptionFilters) and adds support
+# for genbook and modules with large entries > 64Kb.
+
+# Copyright (C) 2020 CrossWire Bible Society
+
+
+# Author: kris <kristof.szabo@lutheran.hu> & domcox <domcox@crosswire.org>
+
+# This file is part of Sword Modules
+
+# Sword Modules is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# Sword Modules is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Sword Modules. If not, see <https://www.gnu.org/licenses/>.
+
+# Created: 2021-01-08
+#
+# Revision:
+# 2021-01-16 domcox <domcox@crosswire.org>
+# Changed language library from iso-639 to langtags
+
+
+# Requirements
+
+import time
+import re
+import argparse
+import sys
+import xml.etree.ElementTree as ET
+from datetime import date
+from pathlib import Path
+try:
+ import langtags
+except:
+ sys.stderr.write("You do not have the Python langtags library installed. Please install it (pip install langtags).\n")
+ sys.exit(1)
+try:
+ import Sword
+except:
+ sys.stderr.write("You do not have the SWORD library installed. Please install it.\n")
+ sys.exit(1)
+
+
+# Variables
+
# Version of this script.
Version = '1.1'

# Lines of the conf file being generated, in output order. build_doc() and
# include_file() append to it; main() hands it to print_out().
doc = []

# Versification schema name -> value written as 'MinimumVersion=' when that
# schema is selected with --v11n. The keys are also the only accepted
# --v11n values.
versification = {
    'KJV': '1.5.9',
    'KJVA': '1.6.0',
    'NRSV': '1.6.0',
    'NRSVA': '1.6.0',
    'MT': '1.6.0',
    'Leningrad': '1.6.0',
    'Synodal': '1.6.1',
    'Vulg': '1.6.1',
    'Luther': '1.6.1',
    'German': '1.6.1',
    'Catholic': '1.6.2',
    'Catholic2': '1.6.2',
    'LXX': '1.7.2',
    'Orthodox': '1.7.2',
    'SynodalProt': '1.7.2',
    'DarbyFr': '1.8.0',
    'Segond': '1.8.0',
    'Calvin': '1.8.0',
}
+
+
+# Functions
+
def die(msg):
    """
    Report a fatal error and stop the programme.

    msg -- text of the error; it is written to stderr prefixed with 'ERROR! '.

    Never returns: the process exits with status 1 (SystemExit is raised).
    """
    print('ERROR! ' + msg, file=sys.stderr)
    sys.exit(1)
+
+
def get_parameters(argv=None):
    """
    Parse the command-line options.

    argv -- list of argument strings to parse. None (the default) lets
            argparse read them from sys.argv, which is what the script does.

    Returns a dict with the keys 'infile', 'outfile', 'v11n', 'size' and
    'osis'. Values are strings; 'infile' and 'outfile' are None when the
    option is not given. On invalid arguments argparse prints a usage
    message and exits.
    """
    description = (
        'provides a conf file for a module by analysing given OSIS XML file '
        'and optionally including extra elements from a conf.in file.'
    )
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("-i", "--infile", help="conf.in file containing extra elements to include, (default none)")
    parser.add_argument("-o", "--outfile", help="name of generated conf file, (default to screen)")
    parser.add_argument("-v", "--v11n", default='KJV', help="versification schema, (default: KJV)")
    # The size is kept as a string; check_parameters() validates it.
    parser.add_argument("-s", "--size", default='2', help="set -s 4 for modules with large entries > 64Kb, (default -s 2)")
    parser.add_argument('osis', help='OSIS XML file')

    return vars(parser.parse_args(argv))
+
+
def check_parameters(params):
    """
    Check the CLI parameters for validity.

    params -- dict as returned by get_parameters().

    Returns True when every check passes. Otherwise die() is called, which
    prints the reason and exits. The checks, in order: the OSIS file exists,
    the conf.in file (if given) exists, the size is '2' or '4', and the
    versification schema is a key of the module-level 'versification' table.
    """
    # Checking OSIS file
    osisfile = params['osis']
    if not Path(osisfile).is_file():
        die(f"File '{osisfile}' does not exist.")

    # Checking conf.in file in input (optional)
    infile = params['infile']
    if infile and not Path(infile).is_file():
        die(f"File '{infile}' does not exist.")

    # Checking size
    size = params['size']
    if size not in ('2', '4'):
        die(f"--size='{size}' Incorrect value.")

    # Checking versification schema
    v11n = params['v11n']
    if v11n not in versification:
        die(f"'{v11n}': Unknown versification schema.")

    return True
+
+
def get_osistext(osisfile):
    """
    Read the attributes of the <osisText ...> start tag of an OSIS file.

    osisfile -- path of the OSIS XML file, read as UTF-8.

    Returns a dict with the keys 'osisIDWork', 'osisRefWork' and 'xml:lang'.
    An attribute that is present but empty is returned as ''.

    Calls die() (which exits) when the file holds no complete <osisText ...>
    start tag or when one of the three attributes is missing.
    """
    start_tag = '<osisText'
    end_tag = '>'

    # Text of the start tag, from '<osisText' up to and including its '>'.
    # The tag may span several lines.
    captured = ''
    with open(osisfile, encoding='utf-8') as f:
        for line in f:
            if not captured:
                start = line.find(start_tag)
                if start < 0:
                    continue
                # Drop whatever precedes the tag on this line, so a '>' that
                # closes an earlier element is not taken for the tag's end.
                line = line[start:]
            captured += line
            end = captured.find(end_tag)
            if end >= 0:
                captured = captured[:end + 1]
                break
        else:
            # End of file reached without a complete start tag
            die('osisText not found in osis file')

    osistext = {}
    # A tuple rather than a set: the first missing attribute reported is
    # then always the same one.
    for attribute in ('osisIDWork', 'osisRefWork', 'xml:lang'):
        # Accept single- or double-quoted values. '.*?' up to the matching
        # quote lets an empty value stay empty instead of running on into
        # the next attribute.
        found = re.search(
            rf'''{re.escape(attribute)}\s*=\s*(["'])(.*?)\1''',
            captured,
            flags=re.IGNORECASE,
        )
        if found is None:
            die(f'osisText attribute missing: {attribute}')
        osistext[attribute] = found.group(2)
    return osistext
+
+
def check_osistext(osistext):
    """
    Check the osisText attributes read by get_osistext().

    osistext -- dict with the keys 'osisIDWork', 'osisRefWork', 'xml:lang'.

    Returns True when all three are acceptable. Otherwise die() is called
    (directly, or by get_language() for an unknown language) and the
    programme exits.
    """
    # Check osisIDWork
    if not osistext['osisIDWork']:
        die('FATAL: osisIDWork is empty.')

    # Check osisRefWork (compared case-insensitively)
    moduletype = osistext['osisRefWork']
    if moduletype.lower() not in ('bible', 'commentary', 'genbook'):
        die(f"FATAL: Invalid attribute osisRefWork: {moduletype}")

    # Check language: get_language() dies on an unknown tag; the name it
    # returns is not needed here.
    get_language(osistext['xml:lang'])

    return True
+
+
def get_language(lang):
    """
    Look up 'lang' with the langtags library.

    lang -- language tag, e.g. the xml:lang value of the OSIS file.

    Returns the language description with any newline replaced by a space.
    Calls die() (which exits) when langtags cannot build a Tag from 'lang'.
    """
    try:
        tag = langtags.Tag(lang)
    except Exception:
        # NOTE(review): the exact exception type raised by langtags for an
        # unknown tag is not visible here, hence the broad catch. Unlike a
        # bare 'except:', this still lets Ctrl-C and SystemExit through.
        die(f"Language '{lang}' not found in BCP 47 Languages Database")

    # Sometimes language description is multiline -> remove '\n'
    return tag.language.description.replace('\n', ' ')
+
+
def is_tag(xml_file, tag):
    """
    Tell whether an element named 'tag' occurs after the OSIS header.

    xml_file -- path of the OSIS XML file, read as UTF-8.
    tag      -- element name, e.g. 'note'. When it ends with a name
                character the match stops at the end of the name, so
                'title' does not match '<titlePage'. A tag ending with
                anything else (e.g. 'q ' with its trailing space, meaning
                "q followed by attributes") is searched literally.

    Returns True as soon as '<tag' is found after '</header>', False
    otherwise, including when the file has no '</header>' at all.
    """
    # Start searching after the header to avoid confusion with its elements
    end_header_tag = '</header>'

    pattern = '<' + re.escape(tag)
    if tag and (tag[-1].isalnum() or tag[-1] in '_:.-'):
        # The element name must end here: no further name character.
        pattern += r'(?![\w:.\-])'
    tag_re = re.compile(pattern)

    header_read = False
    with open(xml_file, encoding='utf-8') as f:
        for line in f:
            if not header_read:
                # Keep only what follows '</header>' on the line closing it
                _, found, line = line.partition(end_header_tag)
                if not found:
                    continue
                header_read = True
            if tag_re.search(line):
                return True
    return False
+
+
def is_attribute(xml_file, tag, attribute):
    """
    Tell whether some 'tag' element after the OSIS header carries 'attribute'.

    xml_file  -- path of the OSIS XML file, read as UTF-8.
    tag       -- element name, e.g. 'w' or 'seg'.
    attribute -- text to look for inside the element's start tag, e.g.
                 ' lemma=' or 'type="x-morph"'. It is a plain substring
                 test, so it may also be part of a value ('strong:').

    Only the start tag '<tag ...>' is inspected, so attributes of other
    elements on the same line do not count. A start tag may span several
    lines; runs of whitespace inside it are collapsed to one space before
    the test, so a needle with a leading space also matches an attribute
    that begins a new line.

    Returns True at the first match, False otherwise, including when the
    file has no '</header>'.
    """
    # Start searching after the header to avoid confusion with its elements
    end_header_tag = '</header>'

    open_tag = f'<{tag}'
    pattern = re.escape(open_tag)
    if tag and (tag[-1].isalnum() or tag[-1] in '_:.-'):
        # The element name must end here, so 'w' does not match '<work'.
        pattern += r'(?![\w:.\-])'
    # A complete start tag: from '<tag' to the next '>'
    start_tag_re = re.compile(pattern + r'[^>]*>')

    header_read = False
    # Beginning of a start tag whose closing '>' has not been read yet
    pending = ''
    with open(xml_file, encoding='utf-8') as f:
        for line in f:
            if not header_read:
                _, found, line = line.partition(end_header_tag)
                if not found:
                    continue
                header_read = True

            text = pending + line
            pending = ''
            for match in start_tag_re.finditer(text):
                if attribute in ' '.join(match.group(0).split()):
                    return True

            # Carry an unfinished start tag over to the next line
            last_open = text.rfind(open_tag)
            if last_open >= 0 and '>' not in text[last_open:]:
                pending = text[last_open:]
    return False
+
+
def is_diacritic(xml_file, lang, diacritic):
    """
    Tell whether the text of an OSIS file contains a given kind of diacritic.

    xml_file  -- path of the OSIS XML file.
    lang      -- language tag of the module; only its primary subtag is
                 used, so 'he-IL' is treated like 'he'.
    diacritic -- name of the SWORD global option to test: 'Arabic Vowel
                 Points', 'Greek Accents', 'Hebrew Cantillation' or
                 'Hebrew Vowel Points'.

    Returns False without reading the file when the language is not Arabic
    (ar), Ancient Greek (grc) or Hebrew (he, hbo), or when 'diacritic' does
    not belong to that language. Otherwise the bare text of the file is run
    through the SWORD filter named 'diacritic' and the result is True if
    the filter changed the text.
    """
    # Which languages each diacritic option applies to
    languages_for = {
        'Arabic Vowel Points': ('ar',),
        'Greek Accents': ('grc',),
        'Hebrew Cantillation': ('he', 'hbo'),
        'Hebrew Vowel Points': ('he', 'hbo'),
    }

    # Don't search OSIS targeting other languages than Hebrew, Greek, Arabic
    primary = lang.split('-')[0].lower()
    if primary not in ('ar', 'grc', 'he', 'hbo'):
        return False
    # An option name missing from the table is not filtered out here; it is
    # handed to SWORD as is.
    if primary not in languages_for.get(diacritic, (primary,)):
        return False

    # Grab the base SWORD manager and switch every diacritic option off
    mgr = Sword.SWMgr()
    for option in languages_for:
        mgr.setGlobalOption(option, "Off")

    # Parse XML, remove all tags and keep bare text only
    xml_root = ET.parse(xml_file).getroot()
    strip_text = ET.tostring(xml_root, encoding='unicode', method='text')

    # Two copies of the text: a reference and one to be filtered
    ref_text = Sword.SWBuf(strip_text)
    mod_text = Sword.SWBuf(strip_text)
    mgr.filterText(diacritic, mod_text)

    # True if the filter has made changes to the text, False otherwise
    return ref_text.c_str() != mod_text.c_str()
+
def build_doc(conf):
    """
    Generate the conf entries derived from the OSIS file.

    conf -- dict merging the CLI parameters and the osisText attributes;
            the keys read are 'osisIDWork', 'osisRefWork', 'xml:lang',
            'osis', 'size' and 'v11n'.

    The entries are appended to the module-level list 'doc', in this order:
    section name, driver and data path, general entries, language,
    GlobalOptionFilter entries, Feature entries, LCSH, minimum SWORD
    version and versification. Returns True.

    The OSIS file is scanned once per tag/attribute probed (is_tag,
    is_attribute, is_diacritic).
    """
    module = conf['osisIDWork']
    moduletype = conf['osisRefWork']
    kind = moduletype.lower()
    mod = module.lower()
    osis = conf['osis']
    lang = conf['xml:lang']
    # Bibles and commentaries are the versified, compressed module types
    versified = kind in ('bible', 'commentary')
    # Drivers for big entries (> 64Kb) carry a '4' suffix
    block = '4' if conf['size'] == '4' else ''

    # Module name
    doc.append("[" + module + "]")

    # ModDrv + DataPath
    if kind == 'bible':
        doc.append("ModDrv=zText" + block)
        doc.append("DataPath=./modules/texts/ztext" + block + "/" + mod + "/")
    elif kind == 'commentary':
        doc.append("ModDrv=zCom" + block)
        doc.append("DataPath=./modules/comments/zcom" + block + "/" + mod + "/")
    elif kind == 'genbook':
        # The size suffix is not applied here: it produced 'RawGenBook4'
        # and a 'rawgenbook//...' or 'rawgenbook/4/...' path.
        # NOTE(review): assumes SWORD has no 4-byte variant of RawGenBook -
        # confirm against the SWORD conf documentation.
        doc.append("ModDrv=RawGenBook")
        doc.append("DataPath=./modules/genbook/rawgenbook/" + mod + "/" + mod)

    # Compression
    if versified:
        doc.append('CompressType=ZIP')

    # misc.
    doc.append('BlockType=BOOK')
    doc.append('Encoding=UTF-8')
    doc.append('SourceType=OSIS')
    doc.append('OSISVersion=2.1.1')
    doc.append('SwordVersionDate=' + str(date.today()))

    # Language
    doc.append('Lang=' + lang)

    # GlobalOptionFilter
    # We should have Footnotes before Headings in order to have
    # working notes in titles
    if is_tag(osis, 'note'):
        doc.append('GlobalOptionFilter=OSISFootnotes')
    if is_tag(osis, 'title'):
        doc.append('GlobalOptionFilter=OSISHeadings')
    if is_tag(osis, 'reference'):
        doc.append('GlobalOptionFilter=OSISScripref')
    # 'q ' with a trailing space: only <q> elements carrying attributes
    if is_tag(osis, 'q '):
        doc.append('GlobalOptionFilter=OSISRedLetterWords')
    # Variants
    if is_attribute(osis, 'seg', ' type="x-variant"') or is_tag(osis, 'rdg'):
        doc.append('GlobalOptionFilter=OSISVariants')
    # MorphSegmentation
    if (is_attribute(osis, 'seg', 'type="morph"')
            or is_attribute(osis, 'seg', 'type="x-morph"')):
        doc.append('GlobalOptionFilter=OSISMorphSegmentation')

    # Filters triggered by something inside a <w ...> start tag
    word_filters = (
        (' lemma=', 'OSISLemma'),
        ('strong:', 'OSISStrongs'),
        (' gloss=', 'OSISGlosses'),
        (' morph=', 'OSISMorph'),
        (' n=', 'OSISEnum'),
        (' xlit=', 'OSISXlit'),
    )
    strong = False
    for needle, filter_name in word_filters:
        if is_attribute(osis, 'w', needle):
            doc.append('GlobalOptionFilter=' + filter_name)
            if filter_name == 'OSISStrongs':
                # Remembered for the StrongsNumbers feature below
                strong = True

    # Diacritics
    diacritic_filters = (
        ('Hebrew Vowel Points', 'UTF8HebrewPoints'),
        ('Arabic Vowel Points', 'UTF8ArabicPoints'),
        ('Hebrew Cantillation', 'UTF8Cantillation'),
        ('Greek Accents', 'UTF8GreekAccents'),
    )
    for diacritic, filter_name in diacritic_filters:
        if is_diacritic(osis, lang, diacritic):
            doc.append('GlobalOptionFilter=' + filter_name)

    # Features
    if strong:
        doc.append('Feature=StrongsNumbers')
    if is_tag(osis, 'figure '):
        doc.append('Feature=Images')
    # 'p' without a trailing space, so that plain <p> paragraphs (no
    # attributes) are seen as paragraphs too
    if not is_tag(osis, 'p'):
        doc.append('Feature=NoParagraphs')

    # LCSH
    lang_name = get_language(lang)
    if versified:
        doc.append('LCSH=' + moduletype + '.' + lang_name)

    # Sword minimum version, driven by the versification schema
    doc.append('MinimumVersion=' + versification[conf['v11n']])
    if versified:
        doc.append('Versification=' + conf['v11n'])
    return True
+
+
def include_file(conf):
    """
    Append the hand-written part of the conf to the module-level 'doc' list.

    conf -- dict with the key 'infile' and, when no conf.in file is given,
            also 'osisIDWork', 'osisRefWork' and 'xml:lang'.

    With a conf.in file, its lines are appended with trailing whitespace
    removed. Without one, default DistributionLicense, Description, About,
    Version and History entries are generated. Returns True.
    """
    infile = conf['infile']
    if infile:
        # 'utf-8-sig' also reads plain UTF-8; it only keeps a leading BOM
        # from being copied into the middle of the generated conf.
        with open(infile, 'r', encoding='utf-8-sig', newline='\n') as f:
            doc.extend(line.rstrip() for line in f)
        return True

    # No conf.in file -> generate default values
    module = conf['osisIDWork']
    moduletype = conf['osisRefWork']
    language = get_language(conf['xml:lang'])
    doc.append('DistributionLicense=Copyrighted')
    doc.append(f'Description={module}, {moduletype} in {language}')
    doc.append(f'About={module}, {moduletype} in {language}')
    doc.append('Version=1.0')
    doc.append('History_1.0=First release')
    return True
+
+
def print_out(conf, doc):
    """
    Print the generated conf file.

    conf -- dict with the key 'outfile': path of the file to write, or a
            false value (None, '') to print to standard output.
    doc  -- iterable of conf lines, without line terminators.

    The file is written as UTF-8, matching the 'Encoding=UTF-8' entry of
    the generated conf. Returns True.
    """
    outfile = conf['outfile']
    if not outfile:
        # Default to screen
        for element in doc:
            print(element)
        return True

    # Write config to file
    with open(outfile, 'w', encoding='utf-8') as f:
        for element in doc:
            print(element, file=f)
    return True
+
+
def main():
    """
    Main function: read the CLI parameters, analyse the OSIS file, then
    generate and print the conf file.

    Returns True. Any validation failure ends the programme through die().
    """
    # Start benchmark
    start_time = time.perf_counter()

    # Read CLI params
    params = get_parameters()
    check_parameters(params)

    # Read OSIS attributes
    osis_attributes = get_osistext(params['osis'])
    check_osistext(osis_attributes)

    # Generate conf from the merged parameters and osisText attributes
    cf = {**params, **osis_attributes}
    build_doc(cf)
    include_file(cf)
    print_out(cf, doc)

    # Benchmark results go to stderr: without --outfile the conf itself is
    # printed on stdout, and this line must not end up in a redirected conf.
    total_time = round(time.perf_counter() - start_time, 1)
    print(f'-- Module Config generated in {total_time} s', file=sys.stderr)

    return True
+
+
# Run only when executed as a script, so that importing the module (e.g. to
# reuse or test its functions) does not parse sys.argv or generate a conf.
if __name__ == '__main__':
    main()