diff options
Diffstat (limited to 'XParser.py')
-rw-r--r-- | XParser.py | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/XParser.py b/XParser.py new file mode 100644 index 0000000..dcfedc4 --- /dev/null +++ b/XParser.py @@ -0,0 +1,269 @@ +# Copyright (c) 2001 - 2005 +# Yuan Wang. All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Redistributions in any form must be accompanied by information on +# how to obtain complete source code for the X-Diff software and any +# accompanying software that uses the X-Diff software. The source code +# must either be included in the distribution or be available for no +# more than the cost of distribution plus a nominal fee, and must be +# freely redistributable under reasonable conditions. For an executable +# file, complete source code means the source code for all modules it +# contains. It does not include source code for modules or files that +# typically accompany the major components of the operating system on +# which the executable file runs. + +# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, +# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import xml.sax +_PARSER_NAME = "org.apache.xerces.parsers.SAXParser"; + +# FIXME +# This is interesting +# http://www.virtuousprogrammer.com/?page_id=183 +# http://docs.python.org/library/xml.sax.reader.html +# <code>XParser</code> parses an input XML document and constructs an +# <code>XTree</code> +# class XParser extends DefaultHandler implements LexicalHandler +class XParser(xml.sax.handler.ContentHandler): + _setValidation = False + _setNameSpaces = True + _setSchemaSupport = True + _setSchemaFullSupport = False + _setNameSpacePrefixes = True + + _STACK_SIZE = 100 + + private XMLReader _parser; + private XTree _xtree; + private int _idStack[], _lsidStack[]; // id and left sibling + private long _valueStack[]; + private int _stackTop, _currentNodeID; + private boolean _readElement; + private StringBuffer _elementBuffer; + +# Constructor. + def __init__(self): + { + XHash.initialize(); + try + { + _parser = (XMLReader)Class.forName(_PARSER_NAME).newInstance(); + _parser.setFeature("http://xml.org/sax/features/validation", _setValidation); + _parser.setFeature("http://xml.org/sax/features/namespaces", _setNameSpaces); + _parser.setFeature("http://apache.org/xml/features/validation/schema", _setSchemaSupport); + _parser.setFeature("http://apache.org/xml/features/validation/schema-full-checking", _setSchemaFullSupport); + _parser.setFeature("http://xml.org/sax/features/namespace-prefixes", _setNameSpacePrefixes); + + _parser.setContentHandler(this); + _parser.setErrorHandler(this); + _parser.setProperty("http://xml.org/sax/properties/lexical-handler", this); + } + catch (Exception e) + { + System.err.println(e.getMessage()); + System.exit(1); + } + + _idStack = new int[_STACK_SIZE]; + _lsidStack = new int[_STACK_SIZE]; + _valueStack = new long[_STACK_SIZE]; + _stackTop = 0; + _currentNodeID = XTree.NULL_NODE; + _elementBuffer = new StringBuffer(); + } + +# Parse an XML document +# @param uri input XML document +# @return the created XTree + def parse(String uri): + { + _xtree = new XTree(); + _idStack[_stackTop] = XTree.NULL_NODE; + _lsidStack[_stackTop] = XTree.NULL_NODE; + + try + { + _parser.parse(uri); + } + catch (Exception e) + { + System.err.println(e.getMessage()); + System.exit(1); + } + + return _xtree; + } + + // Document handler methods + + public void startElement(String uri, String local, String raw, + Attributes attrs) + { + // if text is mixed with elements + if (_elementBuffer.length() > 0) + { + String text = _elementBuffer.toString().trim(); + if (text.length() > 0) + { + long value = XHash.hash(text); + int tid = _xtree.addText(_idStack[_stackTop], _lsidStack[_stackTop], text, value); + _lsidStack[_stackTop] = tid; + _currentNodeID = tid; + _valueStack[_stackTop] += value; + } + } + + int eid = _xtree.addElement(_idStack[_stackTop], + _lsidStack[_stackTop], local); + + // Update last sibling info. + _lsidStack[_stackTop] = eid; + + // Push + _stackTop++; + _idStack[_stackTop] = eid; + _currentNodeID = eid; + _lsidStack[_stackTop] = XTree.NULL_NODE; + _valueStack[_stackTop] = XHash.hash(local); + + // Take care of attributes + if ((attrs != null) && (attrs.getLength() > 0)) + { + for (int i = 0; i < attrs.getLength(); i++) + { + String name = attrs.getQName(i); + String value = attrs.getValue(i); + long namehash = XHash.hash(name); + long valuehash = XHash.hash(value); + long attrhash = namehash * namehash + + valuehash * valuehash; + int aid = _xtree.addAttribute(eid, _lsidStack[_stackTop], name, value, namehash, attrhash); + + _lsidStack[_stackTop] = aid; + _currentNodeID = aid + 1; + _valueStack[_stackTop] += attrhash * attrhash; + } + } + + _readElement = True; + _elementBuffer = new StringBuffer(); + } + + def characters(char ch[], int start, int length): + { + _elementBuffer.append(ch, start, length); + } + + def endElement(String uri, String local, String raw): + { + if (_readElement) + { + if (_elementBuffer.length() > 0) + { + String text = _elementBuffer.toString(); + long value = XHash.hash(text); + _currentNodeID = + _xtree.addText(_idStack[_stackTop], + _lsidStack[_stackTop], + text, value); + _valueStack[_stackTop] += value; + } + else // an empty element + { + _currentNodeID = + _xtree.addText(_idStack[_stackTop], + _lsidStack[_stackTop], + "", 0); + } + _readElement = False; + } + else + { + if (_elementBuffer.length() > 0) + { + String text = _elementBuffer.toString().trim(); + // More text nodes before end of the element. + if (text.length() > 0) + { + long value = XHash.hash(text); + _currentNodeID = + _xtree.addText(_idStack[_stackTop], + _lsidStack[_stackTop], + text, value); + _valueStack[_stackTop] += value; + } + } + } + + _elementBuffer = new StringBuffer(); + _xtree.addHashValue(_idStack[_stackTop], + _valueStack[_stackTop]); + _valueStack[_stackTop-1] += _valueStack[_stackTop] * + _valueStack[_stackTop]; + _lsidStack[_stackTop-1] = _idStack[_stackTop]; + + // Pop + _stackTop--; + } + + // End of document handler methods + + // Lexical handler methods. + + def startCDATA(): + { + // The text node id should be the one next to the current + // node id. + int textid = _currentNodeID + 1; + String text = _elementBuffer.toString(); + _xtree.addCDATA(textid, text.length()); + } + + def endCDATA(): + { + int textid = _currentNodeID + 1; + String text = _elementBuffer.toString(); + _xtree.addCDATA(textid, text.length()); + } + + // Following functions are not implemented. + def comment(char[] ch, int start, int length): + { + } + + def startDTD(String name, String publicId, String systemId): + { + } + + def endDTD(): + { + } + + def startEntity(String name): + { + } + + def endEntity(String name): + { + } + + // End of lexical handler methods. +} |