#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# confmaker.py - Provides an initial conf file for a new module by analyzing
# the related OSIS xml file.
# The programme searches for relevant tags and creates the GlobalOptionFilter
# entries and other relevant conf entries. This is a port to Python from the
# previous confmaker.pl Perl script we were using. It fixes detection of
# diacritics and OSISMorphSegmentation (GlobalOptionFilters) and adds support
# for genbook and modules with large entries > 64Kb.
# Copyright (C) 2020 CrossWire Bible Society
# Author: kris <kristof.szabo@lutheran.hu> & domcox <domcox@crosswire.org>
# This file is part of Sword Modules
# Sword Modules is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# Sword Modules is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Sword Modules. If not, see <https://www.gnu.org/licenses/>.
# Created: 2021-01-08
#
# Revision:
# 2021-01-16 domcox <domcox@crosswire.org>
# Changed language library from iso-639 to langtags
# 2023-07-30 domcox <domcox@crosswire.org>
# Full rewrite using ElementTree XML parsing module
# TODO:
# - EntrySize for verses that do not use milestone elements
# - EntrySize for book titles & introduction
# - Implement GlobalOptionFilter=OSISReferenceLinks
import argparse
import sys
import time
import xml.etree.ElementTree as ET
from datetime import date
from pathlib import Path
try:
import langtags
except:
sys.stderr.write("You do not have the Python langtags library installed. Please install it (pip install langtags).\n")
sys.exit(1)
try:
import Sword
except:
sys.stderr.write("You do not have the SWORD library installed. Please install it.\n")
sys.exit(1)
# Module-level constants

# Version of this script
Version = "2.0"

# Supported versification schemas (v11n), each mapped to the minimum SWORD
# engine version listed for it (emitted as MinimumVersion in the conf file).
versification = {
    # SWORD 1.5.9
    "KJV": "1.5.9",
    # SWORD 1.6.0
    "KJVA": "1.6.0",
    "NRSV": "1.6.0",
    "NRSVA": "1.6.0",
    "MT": "1.6.0",
    "Leningrad": "1.6.0",
    # SWORD 1.6.1
    "Synodal": "1.6.1",
    "Vulg": "1.6.1",
    "Luther": "1.6.1",
    "German": "1.6.1",
    # SWORD 1.6.2
    "Catholic": "1.6.2",
    "Catholic2": "1.6.2",
    # SWORD 1.7.2
    "LXX": "1.7.2",
    "Orthodox": "1.7.2",
    "SynodalProt": "1.7.2",
    # SWORD 1.8.0
    "DarbyFr": "1.8.0",
    "Segond": "1.8.0",
    "Calvin": "1.8.0",
}
# Functions
def die(msg):
    """
    Report a fatal error on stderr, prefixed with 'ERROR! ', then terminate
    the program with exit status 1.
    """
    sys.stderr.write('ERROR! ' + msg + '\n')
    raise SystemExit(1)
def get_parameters(argv=None):
    """
    Parse command-line options.

    argv: optional list of argument strings (without the program name).
          When None (the default), argparse reads sys.argv[1:], so existing
          callers are unaffected.

    Returns a dict with the keys 'osis', 'outfile', 'v11n', 'size' and
    'infile'. 'size' is returned as a string ('2' by default); 'outfile'
    and 'infile' are None when not given.

    On invalid arguments argparse prints a usage message and exits.
    """
    # Creating parser
    description = '''
    provides a conf file for a module by analysing the given OSIS XML file.
    Optionally include extra elements from a conf.in file. This option will
    be removed in a future version.
    '''
    parser = argparse.ArgumentParser(description=description)

    # Adding arguments
    parser.add_argument('osis',
                        help='name of the OSIS XML file')
    parser.add_argument("-o", "--outfile",
                        help="name of generated conf file, (default to screen)")
    parser.add_argument("-v", "--v11n", default='KJV',
                        help="versification schema, (default: KJV)")
    parser.add_argument("-s", "--size", default='2',
                        help="set -s 4 for modules with large entries > 64Kb, (default -s 2)")
    parser.add_argument("-i", "--infile",
                        help="conf.in file containing extra elements to include, (default none)")

    # Parsing arguments
    return vars(parser.parse_args(argv))
def check_parameters(params):
    """
    Validate the parsed command-line parameters.

    Aborts through die() when the OSIS file is missing, when a conf.in file
    was given but does not exist, or when the versification schema is not
    one of the keys of `versification`. Returns True otherwise.
    """
    # The OSIS file is mandatory and must exist
    osis_path = params['osis']
    if not Path(osis_path).is_file():
        die(f"File '{osis_path}' does not exist.")

    # The conf.in file is optional, but must exist when given
    conf_in_path = params['infile']
    if conf_in_path and not Path(conf_in_path).is_file():
        die(f"File '{conf_in_path}' does not exist.")

    # The versification schema must be a known one
    schema = params['v11n']
    if schema not in versification:
        die(f"'{schema}': Unknown versification schema.")

    return True
def get_language(lang):
    """
    Look up the given language tag with the langtags library.

    Returns the language description with any embedded newlines replaced by
    spaces. Aborts through die() when langtags.Tag() rejects the tag.
    """
    try:
        tag = langtags.Tag(lang)
    except Exception:
        # Not a bare except: Ctrl-C (KeyboardInterrupt) must still propagate
        # instead of being reported as an unknown language.
        die(f"Language '{lang}' not found in BCP 47 Languages Database")
    # Sometimes language description is multiline -> remove '\n'
    return tag.language.description.replace('\n', ' ')
def is_diacritic(xml_file, lang, diacritic):
    """
    Tell whether the SWORD filter named `diacritic` changes the text of the
    OSIS file.

    xml_file:  path of the OSIS XML file
    lang:      language code of the module
    diacritic: SWORD global option name, e.g. 'Greek Accents'

    Returns False without reading the file when `lang` is not one of
    'ar', 'grc', 'he', 'hbo', or when a known diacritic option does not
    belong to `lang`. Otherwise returns True if filtering the bare text
    of the file with that option (switched off) alters it, False if not.
    """
    # Languages each known diacritic option applies to. Exact matches are
    # used on purpose: a test such as `lang in 'ar'` is a substring test and
    # would also accept 'a', 'r' or ''.
    option_langs = {
        'Arabic Vowel Points': ('ar',),
        'Greek Accents': ('grc',),
        'Hebrew Cantillation': ('he', 'hbo'),
        'Hebrew Vowel Points': ('he', 'hbo'),
    }

    # Don't search OSIS targeting other languages than Hebrew, Greek, Arabic
    if lang not in ('ar', 'grc', 'he', 'hbo'):
        return False

    # Skip options that belong to another language; unknown option names are
    # still passed on to SWORD.
    applicable = option_langs.get(diacritic)
    if applicable is not None and lang not in applicable:
        return False

    # Grab the base SWORD manager and switch every diacritic option off, so
    # that applying a filter strips the corresponding marks
    mgr = Sword.SWMgr()
    for option in option_langs:
        mgr.setGlobalOption(option, "Off")

    # Parse XML
    xml_root = ET.parse(xml_file).getroot()

    # Remove all tags and keep bare text only, make 2 copies
    strip_text = ET.tostring(xml_root, encoding='unicode', method='text')
    ref_text = Sword.SWBuf(strip_text)
    mod_text = Sword.SWBuf(strip_text)

    # Apply the filter to one copy only
    mgr.filterText(diacritic, mod_text)

    # True if the filter has made changes to the text, False otherwise
    return ref_text.c_str() != mod_text.c_str()
# Clark-notation names used by ElementTree for namespaced tags/attributes
_OSIS_NS = '{http://www.bibletechnologies.net/2003/OSIS/namespace}'
_XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'

# Entries larger than this many bytes select the "4" (big entry) drivers
_BIG_ENTRY_THRESHOLD = 64000

# GlobalOptionFilter values, in the order they are written to the conf file
_OSIS_FILTERS = (
    'OSISFootnotes',
    'OSISHeadings',
    'OSISScripref',
    'OSISRedLetterWords',
    'OSISVariants',
    'OSISMorphSegmentation',
    'OSISLemma',
    'OSISStrongs',
    'OSISGlosses',
    'OSISMorph',
    'OSISEnum',
    'OSISXlit',
)

# OSIS elements whose mere presence enables a filter or a feature
_ELEMENT_MARKERS = {
    'note': 'OSISFootnotes',
    'title': 'OSISHeadings',
    'reference': 'OSISScripref',
    'q': 'OSISRedLetterWords',
    'figure': 'Images',
}

# Attributes of <w> whose mere presence enables a filter
_W_ATTRIBUTE_MARKERS = (
    ('gloss', 'OSISGlosses'),
    ('morph', 'OSISMorph'),
    ('n', 'OSISEnum'),
    ('xlit', 'OSISXlit'),
)


def _osis_local_name(tag):
    """Return the local name of an OSIS-namespace tag, '' for anything else."""
    if isinstance(tag, str) and tag.startswith(_OSIS_NS):
        return tag[len(_OSIS_NS):]
    return ''


def _utf8_size(text):
    """Return the size of text in UTF-8 bytes (0 for None or '')."""
    return len(text.encode('utf-8')) if text else 0


def _record_markup(name, node, found):
    """Add to the set `found` the filters/features that `node` gives evidence of."""
    marker = _ELEMENT_MARKERS.get(name)
    if marker is not None:
        found.add(marker)
    elif name == 'seg':
        seg_type = node.get('type')
        if seg_type is not None:
            if 'x-variant' in seg_type:
                found.add('OSISVariants')
            if 'morph:' in seg_type:
                found.add('OSISMorphSegmentation')
    elif name == 'w':
        lemma = node.get('lemma')
        if lemma is not None:
            found.add('OSISLemma')
            if 'strong' in lemma:
                found.add('OSISStrongs')
        for attribute, filter_name in _W_ATTRIBUTE_MARKERS:
            if node.get(attribute) is not None:
                found.add(filter_name)


def osis2conf_parser(args):
    """
    Parse the OSIS file, search for specific tags and build the relevant
    conf elements that will be used to write the conf file.

    args: dict of parameters; the keys read are 'osis' (path of the OSIS
          file), 'size' (entry size option, '2' or '4') and 'v11n'
          (versification schema, a key of `versification`).

    Returns the list of conf lines (strings), starting with '[Name]'.

    Aborts through die() when the osisText element is missing or lacks a
    usable osisIDWork, osisRefWork or xml:lang attribute. Also prints the
    computed maximum entry size on stdout.
    """
    # Any -s value above 2 forces the big-entry drivers: the initial size is
    # already above the threshold.
    if int(args['size']) > 2:
        entry_size = 655536
    else:
        entry_size = 0

    name = ''
    mod_type = ''
    lang = ''
    found = set()          # filters/features detected in the text
    no_paragraphs = True   # no <p> seen inside a chapter so far
    in_header = False
    in_chapter = False
    in_verse = False
    verse_size = 0         # UTF-8 bytes of text gathered for the current verse

    # Let's parse
    for event, node in ET.iterparse(args['osis'], events=("start", "end")):
        tag = _osis_local_name(node.tag)

        # osisText attributes (available as soon as the start tag is seen)
        if not name and tag == 'osisText':
            name = node.get('osisIDWork')
            if not name:
                die('FATAL: osisIDWork is empty.')
            # A missing attribute is reported like an invalid one
            ref_work = node.get('osisRefWork') or ''
            mod_type = ref_work.lower()
            if mod_type not in ('bible', 'commentary', 'genbook'):
                die(f"FATAL: Invalid attribute osisRefWork: {ref_work}")
            lang = node.get(_XML_LANG)
            if not lang:
                die('FATAL: Missing lang element')

        # Track header and chapter sections
        if tag == 'header':
            in_header = (event == "start")
        elif tag == 'chapter':
            # Milestone chapters open with sID and close with eID; container
            # chapters open on their start tag and close on their end tag.
            if node.get('sID') is not None:
                in_chapter = True
            elif node.get('eID') is not None:
                in_chapter = False
            else:
                in_chapter = (event == "start")

        # GlobalOptionFilters and Images: markup inside the header is ignored
        if not in_header:
            _record_markup(tag, node, found)

        # Search only inside chapters
        if in_chapter:
            if tag == 'p':
                no_paragraphs = False

            # Text is only read on "end" events: iterparse leaves text and
            # tail undefined on "start", and reading both events would count
            # the same text twice.
            if event == "end":
                if tag == 'verse':
                    if node.get('sID') is not None:
                        in_verse = True
                        verse_size = 0
                    if node.get('eID') is not None:
                        in_verse = False
                        entry_size = max(entry_size, verse_size)
                # Approximation: only text is measured, not the markup
                if in_verse:
                    verse_size += _utf8_size(node.text) + _utf8_size(node.tail)

    if not name:
        die('FATAL: No osisText element found.')

    # Define Elements
    elements = ["[" + name + "]"]
    module = name.lower()

    # Set big entry option for entries greater than 64K bytes
    big = '4' if entry_size > _BIG_ENTRY_THRESHOLD else ''

    # Set ModDrv and DataPath
    if mod_type == 'bible':
        elements.append("ModDrv=zText" + big)
        elements.append("DataPath=./modules/texts/ztext" + big + "/" + module + "/")
    elif mod_type == 'commentary':
        elements.append("ModDrv=zCom" + big)
        elements.append("DataPath=./modules/comments/zcom" + big + "/" + module + "/")
    elif mod_type == 'genbook':
        # RawGenBook has no "4" variant, so the big suffix is not applied
        elements.append("ModDrv=RawGenBook")
        elements.append("DataPath=./modules/genbook/rawgenbook/" + module + "/" + module)

    # Set Compression
    if mod_type in ('bible', 'commentary'):
        elements.append('CompressType=ZIP')

    # Set misc. elements
    elements.append('BlockType=BOOK')
    elements.append('Encoding=UTF-8')
    elements.append('SourceType=OSIS')
    elements.append('OSISVersion=2.1.1')
    elements.append('SwordVersionDate=' + str(date.today()))

    # Set Lang
    elements.append('Lang=' + lang)

    # Set GlobalOptionFilters
    for filter_name in _OSIS_FILTERS:
        if filter_name in found:
            elements.append('GlobalOptionFilter=' + filter_name)

    # Set Diacritics
    for option, filter_name in (
        ('Hebrew Vowel Points', 'UTF8HebrewPoints'),
        ('Arabic Vowel Points', 'UTF8ArabicPoints'),
        ('Hebrew Cantillation', 'UTF8Cantillation'),
        ('Greek Accents', 'UTF8GreekAccents'),
    ):
        if is_diacritic(args['osis'], lang, option):
            elements.append('GlobalOptionFilter=' + filter_name)

    # Set Features
    if 'OSISStrongs' in found:
        elements.append('Feature=StrongsNumbers')
    if 'Images' in found:
        elements.append('Feature=Images')
    if no_paragraphs:
        elements.append('Feature=NoParagraphs')

    # Set LCSH (the language is looked up for every module type)
    language = get_language(lang)
    if mod_type != 'genbook':
        elements.append('LCSH=' + mod_type.capitalize() + '.' + language)

    # Set Sword Minimum Version
    elements.append('MinimumVersion=' + versification[args['v11n']])

    # Set Versification
    if mod_type != 'genbook':
        elements.append('Versification=' + args['v11n'])

    # End
    print('EntrySize=', entry_size)
    return elements
def main():
    """
    Main function: read the command-line parameters, analyse the OSIS file
    and write the resulting conf file.

    The conf lines produced by osis2conf_parser() are followed either by the
    contents of the conf.in file (trailing whitespace stripped from each
    line) or, when no conf.in file is given, by default DistributionLicense,
    Description, Version and History entries. Everything goes to the output
    file when one is given, to the screen otherwise.
    """
    # Start benchmark
    start_time = time.perf_counter()

    # Read CLI params
    params = get_parameters()
    check_parameters(params)

    # Parse OSIS
    lines = list(osis2conf_parser(params))

    infile = params['infile']
    if infile:
        # Read and include conf.in contents
        with open(infile, 'r', encoding='utf-8', newline='\n') as conf_in:
            lines.extend(line.rstrip() for line in conf_in)
    else:
        # No conf.in file -> generate default values
        lines.extend([
            'DistributionLicense=Copyrighted',
            'Description=This is a new module',
            'Version=1.0',
            'History_1.0=First release',
        ])

    # Generate conf file
    outfile = params['outfile']
    if not outfile:
        # Default to screen
        for line in lines:
            print(line)
    else:
        # The file is opened once, and as UTF-8 because the generated conf
        # declares Encoding=UTF-8 and conf.in is read as UTF-8.
        with open(outfile, 'w', encoding='utf-8') as conf_out:
            for line in lines:
                print(line, file=conf_out)

    # Benchmark results
    end_time = time.perf_counter()
    total_time = round(end_time - start_time, 1)
    print(f'-- Module Config generated in {total_time} s')


# Run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()