summary refs log tree commit diff stats
path: root/modules/wikisource/wycliffe.py
diff options
context:
space:
mode:
Diffstat (limited to 'modules/wikisource/wycliffe.py')
-rw-r--r-- modules/wikisource/wycliffe.py 124
1 files changed, 124 insertions, 0 deletions
diff --git a/modules/wikisource/wycliffe.py b/modules/wikisource/wycliffe.py
new file mode 100644
index 0000000..409f44f
--- /dev/null
+++ b/modules/wikisource/wycliffe.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+import urllib, urllib2
+import re
+import codecs
+
+# Page-title prefix shared by every book; it already ends with '/'.
+WSbase = 'Bible_(Wycliffe)/'
+
+# Book titles exactly as they are appended to WSbase when requesting a page.
+# The order matters: entry N here corresponds to entry N of the OSIS and
+# USFM identifier tables defined further down.
+WSbooks = (
+    # Genesis through Ruth
+    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
+    'Joshua', 'Judges', 'Ruth',
+    # Kings, Paralipomenon, Esdras
+    '1 Kings', '2 Kings', '3 Kings', '4 Kings',
+    '1 Paralipomenon', '2 Paralipomenon',
+    '1 Esdras', '2 Esdras', '3 Esdras',
+    # Tobit, Judith, Esther
+    'Tobit', 'Judith', 'Esther',
+    # Job through Syrach
+    'Job', 'Psalms', 'Proverbs', 'Ecclesiastes', 'Songes of Songes',
+    'Wisdom', 'Syrach',
+    # Isaiah through Daniel
+    'Isaiah', 'Jeremiah', 'Lamentations', 'Preier of Jeremye', 'Baruk',
+    'Ezechiel', 'Daniel',
+    # Osee through Malachie
+    'Osee', 'Joel', 'Amos', 'Abdias', 'Jonas', 'Mychee',
+    'Naum', 'Abacuk', 'Sofonye', 'Aggey', 'Sacarie', 'Malachie',
+    # Machabeis
+    '1 Machabeis', '2 Machabeis',
+    # Gospels and Dedis of Apostlis
+    'Matheu', 'Mark', 'Luke', 'John', 'Dedis of Apostlis',
+    # Romaynes through Ebrews
+    'Romaynes', '1 Corinthis', '2 Corinthis', 'Galathies', 'Effesies',
+    'Filipensis', 'Colosencis',
+    '1 Thessalonycensis', '2 Thessalonycensis',
+    '1 Tymothe', '2 Tymothe', 'Tite', 'Filemon', 'Ebrews',
+    # James through Judas
+    'James', '1 Petre', '2 Petre', '1 Joon', '2 Joon', '3 Joon', 'Judas',
+    # Apocalips and Laodicensis
+    'Apocalips', 'Laodicensis',
+)
+
+# OSIS book identifiers, one per book, listed in the same order as the
+# book-title table so that a shared index selects matching entries.
+OSISbook = (
+    'Gen', 'Exod', 'Lev', 'Num', 'Deut', 'Josh', 'Judg', 'Ruth',
+    '1Sam', '2Sam', '1Kgs', '2Kgs', '1Chr', '2Chr', 'Ezra', 'Neh', '1Esd',
+    'Tob', 'Jdt', 'Esth',
+    'Job', 'Ps', 'Prov', 'Eccl', 'Song', 'Wis', 'Sir',
+    'Isa', 'Jer', 'Lam', 'EpJer', 'Bar', 'Ezek', 'Dan',
+    'Hos', 'Joel', 'Amos', 'Obad', 'Jonah', 'Mic',
+    'Nah', 'Hab', 'Zeph', 'Hag', 'Zech', 'Mal',
+    '1Macc', '2Macc',
+    'Matt', 'Mark', 'Luke', 'John', 'Acts',
+    'Rom', '1Cor', '2Cor', 'Gal', 'Eph', 'Phil', 'Col',
+    '1Thess', '2Thess', '1Tim', '2Tim', 'Titus', 'Phlm', 'Heb',
+    'Jas', '1Pet', '2Pet', '1John', '2John', '3John', 'Jude',
+    'Rev', 'EpLao',
+)
+
+# USFM book codes, one per book, listed in the same order as the book-title
+# table. The export loop writes the code into each file's \id line and uses
+# it as part of the output file name.
+USFMbook = (
+    'GEN', 'EXO', 'LEV', 'NUM', 'DEU', 'JOS', 'JDG', 'RUT',
+    '1SA', '2SA', '1KI', '2KI', '1CH', '2CH', 'EZR', 'NEH', '1ES',
+    'TOB', 'JDT', 'EST',
+    'JOB', 'PSA', 'PRO', 'ECC', 'SNG', 'WIS', 'SIR',
+    'ISA', 'JER', 'LAM', 'LJE', 'BAR', 'EZK', 'DAN',
+    'HOS', 'JOL', 'AMO', 'OBA', 'JON', 'MIC',
+    'NAM', 'HAB', 'ZEP', 'HAG', 'ZEC', 'MAL',
+    '1MA', '2MA',
+    'MAT', 'MRK', 'LUK', 'JHN', 'ACT',
+    'ROM', '1CO', '2CO', 'GAL', 'EPH', 'PHP', 'COL',
+    '1TH', '2TH', '1TI', '2TI', 'TIT', 'PHM', 'HEB',
+    'JAS', '1PE', '2PE', '1JN', '2JN', '3JN', 'JUD',
+    'REV', 'LAO',
+)
+
+# Disabled snippet, kept for reference only: it builds a newline-separated
+# list of every article title. WSbase already ends in '/', so the book name
+# is appended directly without an extra separator.
+#
+# articleList = ''
+# for WSbook in WSbooks:
+#     articleList += WSbase + WSbook + '\n'
+
+# Not read anywhere in this script; kept as a module-level name.
+OSISdoc = ''
+
+# Fetch each book's wikitext from Wikisource, convert the wiki markup to USFM
+# markers, and write one .usfm file per book into the current directory.
+
+# Loop-invariant values, hoisted out of the per-book loop.
+url = 'http://en.wikisource.org/wiki/Special:Export'
+dotallFlags = re.DOTALL | re.IGNORECASE
+
+for i, (WSbook, USFMcode) in enumerate(zip(WSbooks, USFMbook)):
+    # Progress indicator: name of the book being processed.
+    print(WSbook)
+
+    # Passing a data argument to Request makes this a POST.
+    vals = urllib.urlencode({'pages': WSbase + WSbook,
+                             'curonly': 1,
+                             'wpDownload': 0})
+    request = urllib2.Request(url, vals)
+    response = urllib2.urlopen(request)
+    try:
+        inputDoc = response.read().decode('utf-8')
+    finally:
+        # Release the connection even if reading or decoding fails.
+        response.close()
+
+    # Keep only what sits between <text ...> and </text> in the export XML.
+    inputDoc = re.sub(r'.*<text .+?>(.+?)</text>.*', r'\1', inputDoc, flags=dotallFlags)
+
+    # Collapse runs of spaces into a single space.
+    inputDoc = re.sub(' +', ' ', inputDoc)
+
+    # Strip page furniture: header / other-versions templates, the
+    # biblecontents template together with everything after it, category
+    # links, and the (entity-escaped) section and onlyinclude tags.
+    inputDoc = re.sub(r'{{header.+?}}', '', inputDoc, flags=dotallFlags)
+    inputDoc = re.sub(r'{{Other versions.+?}}', '', inputDoc, flags=dotallFlags)
+    inputDoc = re.sub(r'{{biblecontents.+?}}.*', '', inputDoc, flags=dotallFlags)
+    inputDoc = re.sub(r'\[\[Category.+?\]\]', '', inputDoc, flags=dotallFlags)
+    inputDoc = re.sub(r'&lt;section .+?&gt;', '', inputDoc, flags=dotallFlags)
+    # <onlyinclude>{{{name|text}}}</onlyinclude> is reduced to just "text".
+    inputDoc = re.sub(r'&lt;onlyinclude&gt;{{{.+?\|\s*(.+?)}}}&lt;/onlyinclude&gt;', r'\1', inputDoc, flags=dotallFlags)
+    inputDoc = re.sub(r'&lt;/?onlyinclude&gt; *', '', inputDoc, flags=dotallFlags)
+
+    # Chapter headings are dropped; the chapter template supplies the number.
+    inputDoc = re.sub(r'==Chapter \d+==', '', inputDoc, flags=re.IGNORECASE)
+
+    # {{chapter|N}} becomes a "\c N" line; {{verse|chapter=C|verse=N}}
+    # becomes an inline "\v N " marker.
+    inputDoc = re.sub(r'{{chapter\|(\d+)}}\s*', r'\\c \1\n', inputDoc, flags=re.IGNORECASE)
+    inputDoc = re.sub(r'{{verse\|chapter=\d+\|verse=(\d+)}}\s*', r'\\v \1 ', inputDoc, flags=re.IGNORECASE)
+
+    # Discard the external-links section and everything that follows it.
+    inputDoc = re.sub(r'==External links==.+', '', inputDoc, flags=dotallFlags)
+
+    # "[Note: ... ]" becomes a \f + ... \f* span. This must run before the
+    # generic bracket rule, which wraps any remaining "[...]" in \it ... \it*.
+    inputDoc = re.sub(r'\[Note:(.+?) +\]', r'\\f + \1\\f*', inputDoc, flags=dotallFlags)
+    inputDoc = re.sub(r'\[(.+?)\]', r'\\it \1\\it*', inputDoc, flags=dotallFlags)
+
+    # File name: two-digit, 1-based position in the book list plus USFM code.
+    fileName = 'wycliffe_' + str(i + 1).zfill(2) + '_' + USFMcode + '.usfm'
+
+    # The with-block closes (and so flushes) the file even if a write fails.
+    with codecs.open(fileName, 'w', 'utf-8') as USFMdoc:
+        # Backslashes are doubled so the markers do not depend on "\i" and
+        # "\m" being unrecognised escape sequences.
+        USFMdoc.write('\\id ' + USFMcode + '\n')
+        USFMdoc.write('\\mt1 ' + WSbook + '\n')
+
+        # Emit each non-blank line, trimmed of surrounding whitespace.
+        for l in inputDoc.split('\n'):
+            l = l.strip()
+            if l:
+                USFMdoc.write(l)
+                USFMdoc.write('\n')