1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
#!/usr/bin/python
import re
import codecs
from xml.etree import ElementTree as et
import sys
import logging
#logging.basicConfig(level=logging.DEBUG)
# Year used by parse_one() when it builds the "YYYY-MM-DD" id attribute of
# each generated <article> element; the input only supplies month and day.
cur_year = 2012
def csv2dict(filename):
    """Load a whitespace-separated table into a dict keyed by ``cs_abbr``.

    Every line of the UTF-8 encoded file *filename* is split on runs of
    whitespace and its columns are mapped, in order, onto the field names
    ``no``, ``test``, ``cs_abbr``, ``cs_name``, ``de_abbr`` and ``de_name``.
    Columns beyond the sixth are ignored; a row with fewer than six columns
    simply lacks the trailing keys.

    Blank lines and rows too short to contain the ``cs_abbr`` column are
    skipped instead of raising ``KeyError``.

    Returns:
        dict mapping each row's ``cs_abbr`` value to the dict of that row's
        fields.  When the same ``cs_abbr`` occurs more than once, the last
        row wins.
    """
    field_names = ['no', 'test', 'cs_abbr', 'cs_name',
                   'de_abbr', 'de_name']
    key_index = field_names.index('cs_abbr')
    out_dict = {}
    with codecs.open(filename, 'rb', 'utf8') as csvfile:
        for row in csvfile:
            # str.split() without arguments drops leading/trailing
            # whitespace (including the line terminator), so no empty
            # pseudo-columns are produced and columns cannot shift.
            fields = row.split()
            if len(fields) <= key_index:
                # Blank or truncated row: there is no key to store it under.
                continue
            line_dict = dict(zip(field_names, fields))
            out_dict[line_dict['cs_abbr']] = line_dict
    return out_dict
def parse_body(elem):
    """Turn one section element into a list of HTML ``<p>`` elements.

    The text of every ``<L>`` element found in *elem* (at any depth) becomes
    one line of a single ``<p>``; consecutive lines are separated by
    ``<br>`` elements.  If *elem* has a direct ``<SL>`` child, its text is
    emitted as a second paragraph, ``<p class="reference">``.

    Args:
        elem: the section element to convert, or ``None`` when the section
            is missing from the input.

    Returns:
        A list of ``Element`` objects: empty when *elem* is ``None``,
        otherwise the body paragraph optionally followed by the reference
        paragraph.
    """
    if elem is None:
        # A missing section contributes nothing rather than crashing.
        return []

    logging.debug("body elem = %s", elem)
    body = et.Element("p")

    # Element.iter() replaces getiterator(), which was removed in
    # Python 3.9; iter() is available from Python 2.7 on.
    verses = list(elem.iter("L"))
    logging.debug("verses = %s", verses)
    logging.debug("verses = len %s", len(verses))

    if verses:
        body.text = verses[0].text
        logging.debug("first line = %s", body.text)
        for line in verses[1:]:
            # Text following a <br> lives in the tail of that <br>.
            line_break = et.SubElement(body, "br")
            logging.debug("another line = %s", line.text)
            line_break.tail = line.text

    out = [body]

    ref = elem.find("SL")
    logging.debug("ref = %s", ref)
    # Compare against None explicitly: an Element without children is falsy.
    if ref is not None:
        # len() is only meaningful (and only safe) once ref is known to exist.
        logging.debug("ref = len %s", len(ref))
        sig = et.Element("p")
        sig.attrib['class'] = "reference"
        sig.text = ref.text
        out.append(sig)

    return out
def parse_one(elem):
    """Build the HTML ``<article>`` for a single entry element.

    The article's ``id`` is ``YYYY-MM-DD``, composed from the module-level
    ``cur_year`` and the entry's ``m`` and ``d`` attributes.  It contains a
    ``<header><h1>`` holding the text of the entry's ``TL`` child, followed
    by the paragraphs ``parse_body`` produces for the ``OT`` and then the
    ``NT`` child.

    Args:
        elem: the entry element; must carry ``m`` and ``d`` attributes and a
            ``TL`` child.

    Returns:
        The newly created ``article`` element.
    """
    logging.debug("losung = %s", elem)

    month = int(elem.attrib["m"])
    day = int(elem.attrib["d"])

    article = et.Element("article")
    article.set("id", "%4d-%02d-%02d" % (cur_year, month, day))

    heading = et.SubElement(et.SubElement(article, "header"), "h1")
    heading.text = elem.find("TL").text

    # Old Testament section first, New Testament second.
    for section_tag in ("OT", "NT"):
        article.extend(parse_body(elem.find(section_tag)))

    return article
def parse_file(filename):
    """Convert the XML file *filename* into a complete HTML document.

    A fixed ``<head>`` is generated (charset and viewport metadata, the
    ``screen.css`` stylesheet, a title and the ``hesla.js`` script), then
    every ``LOSUNG`` element found in the input is converted with
    ``parse_one`` and appended to ``<body>`` after a ``<noscript>`` notice.

    Args:
        filename: path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        The document as a native ``str``, starting with the HTML5 doctype.
        On Python 3 this is text; on Python 2 it is a UTF-8 encoded byte
        string, as before.
    """
    tree = et.parse(filename).getroot()

    doc = et.Element("html")

    head = et.SubElement(doc, "head")
    et.SubElement(head, "meta", attrib={"charset": "utf-8"})
    et.SubElement(head, "meta", attrib={
        "name": "viewport",
        "content": "width=device-width, initial-scale=1.0, " +
                   " maximum-scale=2.0, user-scalable=yes"})
    et.SubElement(head, "link", attrib={
        "rel": "stylesheet",
        "type": "text/css",
        "href": "screen.css"
    })
    title = et.SubElement(head, "title")
    title.text = "Title"
    script = et.SubElement(head, "script",
                           attrib={
                               "type": "text/javascript",
                               "src": "hesla.js",
                               "async": "async",
                               "defer": "defer"
                           })
    # script element cannot be empty, otherwise we would get self-closed
    # element, which is invalid for <script>
    script.text = "\n"

    body = et.SubElement(doc, "body")
    noscript = et.SubElement(body, "noscript")
    noscript.text = "I am sorry, this really doesn't work without JavaScript."

    # Element.iter() replaces getiterator(), which was removed in
    # Python 3.9; iter() is available from Python 2.7 on.
    for los in tree.iter("LOSUNG"):
        body.append(parse_one(los))

    markup = et.tostring(doc, encoding="utf-8")
    # On Python 3 tostring() returns bytes for a real encoding; decode so
    # that the doctype (a str) can be prepended.  On Python 2 the result is
    # already a str and is left untouched.
    if not isinstance(markup, str):
        markup = markup.decode("utf-8")
    return "<!DOCTYPE html>\n" + markup
if __name__ == "__main__":
    # Script entry point: convert the XML file named by the first
    # command-line argument and write the resulting HTML to stdout.
    source_path = sys.argv[1]
    html_document = parse_file(source_path)
    print(html_document)
|