summaryrefslogblamecommitdiffstats
path: root/modules/conf/confmaker.py
blob: d3ba12a6be621673d59c512746c9512326c6d715 (plain) (tree)
1
2
3


                       

































                                                                              

                                                              

 



                                                           
 

               
           
















                                                                                                                           
               
 
                                                 




















                           


             
        
                                             
        











                                             


                                                                           



                                                             









                                                                                        







                              


                            
 
                            




                                                   
                               





                                                      
                                 






                                                         

                       
                                                       
       
 











                                                                         








































                                                                                   
                           
        

                                                                                     
        






































































































































































































































                                                                                                              
                          

                                                                
                          

                                                                
                          

                                                                
                    




                                                                 
             

                                           
                   





                                                                       
 




                                                                     
 


                                  





                  
                       





                                     


                                    
 







































                                                                      





                                                           
      
#!/usr/bin/env python3

# -*- coding: utf-8 -*-
# confmaker.py - Provides an initial conf file for a new module by analyzing
#                the related OSIS XML file.

## The programme searches for relevant tags and creates the GlobalOptionFilter
#  entries and other relevant conf entries. This is a port to Python of the
#  confmaker.pl Perl script we were using previously. It fixes the detection
#  of diacritics and OSISMorphSegmentation (GlobalOptionFilter entries) and
#  adds support for genbook and for modules with large entries > 64Kb.

# Copyright (C) 2020 CrossWire Bible Society


# Author: kris <kristof.szabo@lutheran.hu> & domcox <domcox@crosswire.org>

# This file is part of Sword Modules

# Sword Modules is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# Sword Modules is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with Sword Modules.  If not, see <https://www.gnu.org/licenses/>.

# Created:  2021-01-08
#
# Revision:
# 2021-01-16 domcox <domcox@crosswire.org>
#            Changed language library from iso-639 to langtags
# 2023-07-30 domcox <domcox@crosswire.org>
#            Full rewrite using ElementTree XML parsing module


# TODO:
# - EntrySize for verses that do not use milestone elements
# - EntrySize for book titles & introduction
# - Implement GlobalOptionFilter=OSISReferenceLinks

import argparse
import sys
import time
import xml.etree.ElementTree as ET
from datetime import date
from pathlib import Path
try:
     import langtags
except:
     sys.stderr.write("You do not have the Python langtags library installed. Please install it (pip install langtags).\n")
     sys.exit(1)
try:
     import Sword
except:
     sys.stderr.write("You do not have the SWORD library installed. Please install it.\n")
     sys.exit(1)


# Variables

Version = '2.0'

# Known versification schemas (V11n) mapped to the minimum SWORD version
# associated with each of them. The table is written grouped by version;
# the resulting dict keeps one entry per schema name.
versification = {
    schema: sword_version
    for sword_version, schemas in (
        ('1.5.9', ('KJV',)),
        ('1.6.0', ('KJVA', 'NRSV', 'NRSVA', 'MT', 'Leningrad')),
        ('1.6.1', ('Synodal', 'Vulg', 'Luther', 'German')),
        ('1.6.2', ('Catholic', 'Catholic2')),
        ('1.7.2', ('LXX', 'Orthodox', 'SynodalProt')),
        ('1.8.0', ('DarbyFr', 'Segond', 'Calvin')),
    )
    for schema in schemas
}

# Functions

def die(msg):
    """
    Report a fatal error and stop the programme.

    Writes 'ERROR! ' followed by msg on standard error, then terminates
    with exit status 1.
    """
    report = 'ERROR! ' + msg
    sys.stderr.write(report + '\n')
    raise SystemExit(1)


def get_parameters():
    """
    Read the command line.

    Returns a dict holding one entry per option (osis, outfile, v11n,
    size, infile) with the value given by the user or its default.
    """
    parser = argparse.ArgumentParser(description='''
    provides a conf file for a module by analysing the given OSIS XML file.
    Optionally include extra elements from a conf.in file. This option will
    be removed in a future version.
    ''')

    # One row per argument: (names or flags, keyword settings)
    option_table = (
        (('osis',),
         {'help': 'name of the OSIS XML file'}),
        (('-o', '--outfile'),
         {'help': "name of generated conf file, (default to screen)"}),
        (('-v', '--v11n'),
         {'default': 'KJV',
          'help': "versification schema, (default: KJV)"}),
        (('-s', '--size'),
         {'default': '2',
          'help': "set -s 4 for modules with large entries > 64Kb, (default -s 2)"}),
        (('-i', '--infile'),
         {'help': "conf.in file containing extra elements to include, (default none)"}),
    )
    for names, settings in option_table:
        parser.add_argument(*names, **settings)

    namespace = parser.parse_args()
    return vars(namespace)


def check_parameters(params):
    """
    Validate the command-line values.

    Stops the programme through die() when the OSIS file or the optional
    conf.in file is not an existing file, or when the versification schema
    is not listed in 'versification'. Returns True otherwise.
    """
    # The OSIS source must exist
    osis_name = params['osis']
    if not Path(osis_name).is_file():
        die(f"File '{osis_name}' does not exist.")

    # The conf.in file is optional, but must exist when one is named
    extra_name = params['infile']
    if extra_name and not Path(extra_name).is_file():
        die(f"File '{extra_name}' does not exist.")

    # The versification schema must be a known one
    schema = params['v11n']
    if schema not in versification:
        die(f"'{schema}': Unknown versification schema.")

    return True


def get_language(lang):
    """
    Search BCP-47 Languages Database for the given lang.

    Returns the description of the language with any line break replaced
    by a space. Stops the programme through die() when the langtags
    library cannot build a tag from 'lang'.
    """
    try:
        tag = langtags.Tag(lang)
    except Exception:
        # 'except Exception' rather than a bare 'except', so that Ctrl-C
        # or a SystemExit is not reported as an unknown language
        die(f"Language '{lang}' not found in BCP 47 Languages Database")

    # Sometimes language description is multiline -> remove '\n'
    return tag.language.description.replace('\n', ' ')


def is_diacritic(xml_file, lang, diacritic):
    '''
    Search for 'diacritic' in OSIS File.

    Parameters:
        xml_file:  path of the OSIS XML file
        lang:      language code of the OSIS text
        diacritic: name of the SWORD global option to test, one of
                   'Arabic Vowel Points', 'Greek Accents',
                   'Hebrew Cantillation', 'Hebrew Vowel Points'

    Returns True when applying the 'diacritic' filter modifies the bare
    text of the file, False otherwise. Always returns False, without
    reading the file, when 'lang' is not a language the diacritic applies to.
    '''
    # Languages each diacritic is looked for in
    relevant_langs = {
        'Arabic Vowel Points': ('ar',),
        'Greek Accents': ('grc',),
        'Hebrew Cantillation': ('he', 'hbo'),
        'Hebrew Vowel Points': ('he', 'hbo'),
    }

    # Don't search OSIS targeting other languages than Hebrew, Greek, Arabic
    if lang not in ('ar', 'grc', 'he', 'hbo'):
        return False
    # Exact comparison of the language code: a substring test such as
    # "lang in 'grc'" would only be right by accident
    if diacritic in relevant_langs and lang not in relevant_langs[diacritic]:
        return False

    # Grab the base SWORD manager and switch every diacritic option off
    mgr = Sword.SWMgr()
    for option in relevant_langs:
        mgr.setGlobalOption(option, "Off")

    # Parse XML
    xml_root = ET.parse(xml_file).getroot()

    # Remove all tags and keep bare text only, make 2 sets
    strip_text = ET.tostring(xml_root, encoding='unicode', method='text')
    ref_text = Sword.SWBuf(strip_text)
    mod_text = Sword.SWBuf(strip_text)

    # Apply filter on 1 text
    mgr.filterText(diacritic, mod_text)

    # Compare original bare text and filtered one:
    # True if the filter has made changes to the text, False otherwise
    return ref_text.c_str() != mod_text.c_str()


def osis2conf_parser(args):
    '''
    Parse the OSIS file and build the entries of the SWORD conf file.

    The file named by args['osis'] is streamed with ElementTree. The
    elements and attributes met outside the <header> decide which
    GlobalOptionFilter and Feature entries are generated.

    Parameters:
        args: dict of command-line values; the keys 'osis', 'size' and
              'v11n' are read here.

    Returns the list of conf lines (strings), starting with the
    '[Name]' section line.

    Stops the programme through die() when 'size' is not an integer, when
    no osisText element is found, or when its osisIDWork, osisRefWork or
    xml:lang attribute is missing or invalid.
    '''
    osis_ns = '{http://www.bibletechnologies.net/2003/OSIS/namespace}'
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    # GlobalOptionFilters, in the order they are written to the conf file
    filter_order = (
        'OSISFootnotes', 'OSISHeadings', 'OSISScripref',
        'OSISRedLetterWords', 'OSISVariants', 'OSISMorphSegmentation',
        'OSISLemma', 'OSISStrongs', 'OSISGlosses', 'OSISMorph',
        'OSISEnum', 'OSISXlit',
    )
    # Elements whose mere presence enables a filter
    presence_filters = {
        osis_ns + 'note': 'OSISFootnotes',
        osis_ns + 'title': 'OSISHeadings',
        osis_ns + 'reference': 'OSISScripref',
        osis_ns + 'q': 'OSISRedLetterWords',
    }
    # Attributes of <w> whose presence enables a filter
    word_filters = (
        ('lemma', 'OSISLemma'),
        ('gloss', 'OSISGlosses'),
        ('morph', 'OSISMorph'),
        ('n', 'OSISEnum'),
        ('xlit', 'OSISXlit'),
    )

    # Parser state
    filters = set()
    Images = False
    NoParagraphs = True
    Header = False
    Chapter = False
    Verse = False
    verse_length = 0
    Name = ''
    Type = ''
    Lang = ''

    # Big entry requested on the command line
    try:
        requested_size = int(args['size'])
    except (TypeError, ValueError):
        die(f"Invalid size '{args['size']}': an integer is expected.")
    # NOTE(review): 655536 looks like a typo for 65536 (64Kb); any value
    # above 64000 selects the big-entry drivers, so it is kept as is.
    EntrySize = 655536 if requested_size > 2 else 0

    # Let's parse
    for event, node in ET.iterparse(args['osis'], events=("start", "end")):
        tag = node.tag

        # OsisText content: module name, type and language
        if not Name and tag == osis_ns + 'osisText':
            Name = node.get('osisIDWork')
            if not Name:
                die('FATAL: osisIDWork is empty.')
            ref_work = node.get('osisRefWork')
            Type = (ref_work or '').lower()
            if Type not in ('bible', 'commentary', 'genbook'):
                die(f"FATAL: Invalid attribute osisRefWork: {ref_work}")
            Lang = node.get(xml_lang)
            if not Lang:
                die('FATAL: Missing lang element')

        # Track whether we are inside the header or inside a chapter
        if tag == osis_ns + 'header':
            Header = (event == "start")
        elif tag == osis_ns + 'chapter':
            Chapter = (event == "start")

        # Nothing inside the header is searched
        if Header:
            continue

        if tag in presence_filters:
            filters.add(presence_filters[tag])
        elif tag == osis_ns + 'seg':
            seg_type = node.get('type')
            if seg_type is not None:
                if 'x-variant' in seg_type:
                    filters.add('OSISVariants')
                if 'morph:' in seg_type:
                    filters.add('OSISMorphSegmentation')
        elif tag == osis_ns + 'w':
            for attribute, filter_name in word_filters:
                if attribute in node.attrib:
                    filters.add(filter_name)
            if 'strong' in node.get('lemma', ''):
                filters.add('OSISStrongs')
        elif tag == osis_ns + 'figure':
            Images = True
        elif tag == osis_ns + 'p':
            # Paragraphs are searched only inside Chapters
            if Chapter:
                NoParagraphs = False
        elif tag == osis_ns + 'verse':
            # Entries length - verses delimited by sID/eID milestones
            if 'sID' in node.attrib:
                Verse = True
                verse_length = 0
            if 'eID' in node.attrib:
                Verse = False
                EntrySize = max(EntrySize, verse_length)

        # Entries length - add up the text met while a verse is open.
        # NOTE(review): text and tail are read on both 'start' and 'end'
        # events, so content is probably counted twice, and the iterparse
        # documentation leaves them undefined on 'start' - confirm whether
        # this over-estimate is intended before changing it.
        if Verse:
            if node.text is not None:
                verse_length += len(node.text)
            if node.tail is not None:
                verse_length += len(node.tail)

    if not Name:
        die('FATAL: osisText element not found.')

    # Define Elements
    # Set Name
    Elements = ["[" + Name + "]"]
    # Derive module name
    module = Name.lower()
    # Set big entry option for entries greater than 64K bytes
    big = '4' if EntrySize > 64000 else ''
    # Set ModDrv and DataPath
    if Type == 'bible':
        Elements.append("ModDrv=zText" + big)
        Elements.append("DataPath=./modules/texts/ztext" + big + "/" + module + "/")
    elif Type == 'commentary':
        Elements.append("ModDrv=zCom" + big)
        Elements.append("DataPath=./modules/comments/zcom" + big + "/" + module + "/")
    else:
        # genbook
        # NOTE(review): confirm that SWORD provides a 'RawGenBook4' driver
        Elements.append("ModDrv=RawGenBook" + big)
        Elements.append("DataPath=./modules/genbook/rawgenbook" + big + "/" + module + "/" + module)
    # Set Compression
    if Type in ('bible', 'commentary'):
        Elements.append('CompressType=ZIP')
    # Set misc. elements
    Elements.append('BlockType=BOOK')
    Elements.append('Encoding=UTF-8')
    Elements.append('SourceType=OSIS')
    Elements.append('OSISVersion=2.1.1')
    Elements.append('SwordVersionDate=' + str(date.today()))

    # Set Lang
    Elements.append('Lang=' + Lang)
    # Set GlobalOptionFilters
    Elements.extend('GlobalOptionFilter=' + filter_name
                    for filter_name in filter_order
                    if filter_name in filters)
    # Set Diacritics
    for diacritic, utf8_filter in (
            ('Hebrew Vowel Points', 'UTF8HebrewPoints'),
            ('Arabic Vowel Points', 'UTF8ArabicPoints'),
            ('Hebrew Cantillation', 'UTF8Cantillation'),
            ('Greek Accents', 'UTF8GreekAccents'),
    ):
        if is_diacritic(args['osis'], Lang, diacritic):
            Elements.append('GlobalOptionFilter=' + utf8_filter)
    # Set Features
    if 'OSISStrongs' in filters:
        Elements.append('Feature=StrongsNumbers')
    if Images:
        Elements.append('Feature=Images')
    if NoParagraphs:
        Elements.append('Feature=NoParagraphs')
    # Set LCSH (the language is looked up, hence validated, for every type)
    language = get_language(Lang)
    if Type != 'genbook':
        Elements.append('LCSH=' + Type.capitalize() + '.' + language)

    # Set Sword Minimum Version
    Elements.append('MinimumVersion=' + versification[args['v11n']])
    # Set Versification
    if Type != 'genbook':
        Elements.append('Versification=' + args['v11n'])

    # End
    print('EntrySize=', EntrySize)
    return Elements


def main():
    '''
    Main function.

    Reads and checks the command-line parameters, builds the conf entries
    from the OSIS file, completes them with the contents of the conf.in
    file (or with default entries when none is given), then prints the
    result on screen or writes it to the requested output file.
    '''
    # Start benchmark
    start_time = time.perf_counter()

    # Read CLI params
    params = get_parameters()
    check_parameters(params)

    # Parse OSIS
    conf = list(osis2conf_parser(params))

    # Include conf.in file if it exists
    infile = params['infile']
    if infile:
        # Read and include conf.in contents
        with open(infile, 'r', encoding='utf-8', newline='\n') as extra:
            conf.extend(line.rstrip() for line in extra)
    else:
        # No conf.in file -> generate default values
        conf.extend([
            'DistributionLicense=Copyrighted',
            'Description=This is a new module',
            'Version=1.0',
            'History_1.0=First release',
        ])

    # Generate conf file
    outfile = params['outfile']
    if outfile:
        # Write config to file: opened once, and as UTF-8 to match the
        # encoding the conf.in file is read with
        with open(outfile, 'w', encoding='utf-8') as out:
            for entry in conf:
                print(entry, file=out)
    else:
        # Default to screen
        for entry in conf:
            print(entry)

    # Benchmark results
    end_time = time.perf_counter()
    total_time = round(end_time - start_time, 1)
    print(f'-- Module Config generated in {total_time} s')


# Run only when executed as a script, not when imported
if __name__ == '__main__':
    main()