# Copyright (c) 2001 - 2005 # Yuan Wang. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. Redistributions in any form must be accompanied by information on # how to obtain complete source code for the X-Diff software and any # accompanying software that uses the X-Diff software. The source code # must either be included in the distribution or be available for no # more than the cost of distribution plus a nominal fee, and must be # freely redistributable under reasonable conditions. For an executable # file, complete source code means the source code for all modules it # contains. It does not include source code for modules or files that # typically accompany the major components of the operating system on # which the executable file runs. # # THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, # ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. MATCH = 0 CHANGE = 1 NO_MATCH = -1 INSERT = -1 DELETE = -1 NULL_NODE = -1 NO_CONNECTION = 1048576 _TOP_LEVEL_CAPACITY = 16384 _BOT_LEVEL_CAPACITY = 4096 # XTree provides a DOM-like interface but somehow simplified # Ideally, it can be replaced by any other DOM parser output tree structures. class XTree: # private _topCap, _botCap # private _elementIndex, _tagIndex, self._valueCount # private self._firstChild[][], self._nextSibling[][] # private self._childrenCount[][], _valueIndex[][] # private boolean self._isAttribute[][] # private self._matching[][] # private long self._hashValue[][] # private String _value[][] # private Hashtable self._tagNames, _cdataTable _root = 0 _firstChild = [] _nextSibling = [] _isAttribute = [] _valueIndex = [] _matching = [] _childrenCount = [] _hashValue = [] _value = [] _value.append([]) _tagNames = [] # This hashtable is used to record CDATA section info. # The key is the text node id, the value is the list of # (start,end) position pair of each CDATA section. _cdataTable = {} _elementIndex = -1 _tagIndex = -1 _valueCount = 0 def __init__(self, topcap=None, botcap=None): self._topCap = _TOP_LEVEL_CAPACITY self._botCap = _BOT_LEVEL_CAPACITY if topcap: self._topCap = topcap if botcap: self._botCap = botcap self._initialize() # Initialization. def _initialize(self): self._root = 0 self._firstChild = [] self._nextSibling = [] self._isAttribute = [] self._valueIndex = [] self._matching = [] self._childrenCount = [] self._hashValue = [] self._value = [] self._value.append([]) self._tagNames = [] # This hashtable is used to record CDATA section info. # The key is the text node id, the value is the list of # (start,end) position pair of each CDATA section. self._cdataTable = {} self._elementIndex = -1 self._tagIndex = -1 self._valueCount = self._botCap - 1 # ID Expansion def _expand(self, topid): self._firstChild[topid] = [] self._nextSibling[topid] = [] self._childrenCount[topid] = [] self._matching[topid] = [] self._valueIndex[topid] = [] self._hashValue[topid] = [] self._isAttribute[topid] = [] for i in range(self._botCap): self._firstChild[topid][i] = NULL_NODE self._nextSibling[topid][i] = NULL_NODE self._childrenCount[topid][i] = 0 self._matching[topid][i] = MATCH self._valueIndex[topid][i] = -1 self._isAttribute[topid][i] = False # Start -- methods for constructing a tree. # Add a new element to the tree. # @param pid parent id # @param lsid left-side sibling id # @param tagName element name # @return the element id in the tree. def addElement(self, pid, lsid, tagName): self._elementIndex += 1 topid = self._elementIndex / self._botCap botid = self._elementIndex % self._botCap if (botid == 0): self._expand(topid) # Check if we've already had the tag tagID = self._tagNames[tagName] if (tagID != None): self._valueIndex[topid][botid] = tagID.intValue() else: self._tagIndex += 1 tagID = int(self._tagIndex) self._value[0][self._tagIndex] = tagName self._tagNames.append(tagName, tagID) self._valueIndex[topid][botid] = self._tagIndex if (pid == NULL_NODE): return self._elementIndex ptopid = pid / self._botCap pbotid = pid % self._botCap # parent-child relation or sibling-sibling relation if (lsid == NULL_NODE): self._firstChild[ptopid][pbotid] = self._elementIndex else: self._nextSibling[lsid / self._botCap][lsid % self._botCap] = self._elementIndex # update children count self._childrenCount[ptopid][pbotid] += 1 return self._elementIndex # Add a text node. # @param eid element id # @param lsid the sibling id on the left # @param text text value # @param value hash value def addText(self, eid, lsid, text, value): self._elementIndex += 1 topid = self._elementIndex / self._botCap botid = self._elementIndex % self._botCap if (botid == 0): self._expand(topid) etopid = eid / self._botCap ebotid = eid % self._botCap if (lsid == NULL_NODE): self._firstChild[etopid][ebotid] = self._elementIndex else: self._nextSibling[lsid / self._botCap][lsid % self._botCap] = self._elementIndex self._childrenCount[etopid][ebotid] += 1 self._hashValue[topid][botid] = value self._valueCount += 1 vtopid = self._valueCount / self._botCap vbotid = self._valueCount % self._botCap if (vbotid == 0): self._value[vtopid] = str[self._botCap] self._value[vtopid][vbotid] = text self._valueIndex[topid][botid] = self._valueCount return self._elementIndex # Add an attribute. # @param eid element id # @param lsid the sibling id on the left # @param name attribute name # @param value attribute value # @param valuehash hash value of the value # @param attrhash hash value of the entire attribute # @return the element id of the attribute def addAttribute(self, eid, lsid, name, value, valuehash, attrhash): # attribute name first. aid = self.addElement(eid, lsid, name) # attribute value second. self.addText(aid, NULL_NODE, value, valuehash) # hash value third atopid = aid / self._botCap abotid = aid % self._botCap self._isAttribute[atopid][abotid] = True self._hashValue[atopid][abotid] = attrhash return aid # Add more information (hash value) to an element node. # @param eid element id # @param value extra hash value def addHashValue(self, eid, value): self._hashValue[eid / self._botCap][eid % self._botCap] = value # Add a CDATA section (either a start or an end) to the CDATA # hashtable, in which each entry should have an even number of # position slots. # @param eid The text node id # @param position the section tag position def addCDATA(self, eid, position): key = int(eid) value = self._cdataTable[key] if (value == None): elem_list = [] elem_list.append(position) self._cdataTable[key] = elem_list else: elem_list = value elem_list.append(position) self._cdataTable[key] = elem_list # Add matching information. # @param eid element id # @param match ?match and matched element id def addMatching(self, eid, match): if (match[0] == NO_MATCH): self._matching[eid / self._botCap][eid % self._botCap] = NO_MATCH elif (match[0] == MATCH): self._matching[eid / self._botCap][eid % self._botCap] = MATCH else: self._matching[eid / self._botCap][eid % self._botCap] = match[1] + 1 # End -- methods for constructing a tree. # Start -- methods for accessing a tree. # Get matching information. # @param eid element id # @param match ?change and matched element id def getMatching(self, eid, match): mid = self._matching[eid / self._botCap][eid % self._botCap] if (mid == NO_MATCH): match[0] = NO_MATCH elif (mid == MATCH): match[0] = MATCH else: match[0] = CHANGE match[1] = mid - 1 # Get the root element id. def getRoot(self): return self._root # Get the first child of a node. # @param eid element id def getFirstChild(self, eid): cid = self._firstChild[eid / self._botCap][eid % self._botCap] while (cid > self._root): ctopid = cid / self._botCap cbotid = cid % self._botCap if (self._isAttribute[ctopid][cbotid]): cid = self._nextSibling[ctopid][cbotid] else: return cid return NULL_NODE # Get the next sibling of a node. # @param eid element id def getNextSibling(self, eid): return self._nextSibling[eid / self._botCap][eid % self._botCap] # Get the first attribute of a node. # @param eid element id def getFirstAttribute(self, eid): aid = self._firstChild[eid / self._botCap][eid % self._botCap] if ((aid > self._root) and (self._isAttribute[aid / self._botCap][aid % self._botCap])): return aid else: return NULL_NODE # Get the next attribute of a node. # @param aid attribute id def getNextAttribute(self, aid): aid1 = self._nextSibling[aid / self._botCap][aid % self._botCap] if ((aid1 > self._root) and (self._isAttribute[aid1 / self._botCap][aid1 % self._botCap])): return aid1 else: return NULL_NODE # Get the attribute value. # @param aid attribute id def getAttributeValue(self, aid): cid = self._firstChild[aid / self._botCap][aid % self._botCap] index = self._valueIndex[cid / self._botCap][cid % self._botCap] if (index > 0): return self._value[index / self._botCap][index % self._botCap] else: return "" # Get the hash value of a node. # @param eid element id def getHashValue(self, eid): return self._hashValue[eid / self._botCap][eid % self._botCap] # Get the CDATA section position list of a text node. # @param eid element id # @return position list which is a vector or None if no CDATA def getCDATA(self, eid): return self._cdataTable[eid] # Get the childern count of a node. # @param eid element id def getChildrenCount(self, eid): return self._childrenCount[eid / self._botCap][eid % self._botCap] # Get the # of all decendents of a node. # @param eid element id def getDecendentsCount(self, eid): topid = eid / self._botCap botid = eid % self._botCap count = self._childrenCount[topid][botid] if (count == 0): return 0 cid = self._firstChild[topid][botid] while (cid > NULL_NODE): count += self.getDecendentsCount(cid) cid = self._nextSibling[cid / self._botCap][cid % self._botCap] return count # Get the value index of a node # @param eid element id def getValueIndex(self, eid): return self._valueIndex[eid / self._botCap][eid % self._botCap] # Get the value of a leaf node # @param index value index def getValue(self, index): return self._value[index / self._botCap][index % self._botCap] # Get the tag of an element node # @param eid element id def getTag(self, eid): index = self._valueIndex[eid / self._botCap][eid % self._botCap] return self._value[0][index] # Get the text value of a leaf node # @param eid element id def getText(self, eid): index = self._valueIndex[eid / self._botCap][eid % self._botCap] if (index >= self._botCap): return self._value[index / self._botCap][index % self._botCap] else: return "" # Check if a node an element node. # @param eid element id def isElement(self, eid): vindex = self._valueIndex[eid / self._botCap][eid % self._botCap] if (vindex < self._botCap): return True else: return False # Check if a node is an attribute node. # @param eid element id def isAttribute(self, eid): return self._isAttribute[eid / self._botCap][eid % self._botCap] # Check if a node an leaf text node. # @param edi element id def isLeaf(self, eid): index = self._valueIndex[eid / self._botCap][eid % self._botCap] if (index < self._botCap): return False else: return True # End -- methods for accessing a tree. # For testing purpose. def dump(self, eid=None): if eid: topid = eid / self._botCap botid = eid % self._botCap vid = self._valueIndex[topid][botid] vtopid = vid / self._botCap vbotid = vid % self._botCap print eid + "\t" + \ self._firstChild[topid][botid] + "\t" + \ self._nextSibling[topid][botid] + "\t" + \ self._isAttribute[topid][botid] + "\t" + \ self._childrenCount[topid][botid] + "\t" + \ self._hashValue[topid][botid] + "\t" + \ self._matching[topid][botid] + "\t" + \ self._value[vtopid][vbotid] else: print "eid\tfirstC\tnextS\tattr?\tcCount\thash\tmatch\tvalue" for i in range(self._root, self._elementIndex + 1): topid = i / self._botCap botid = i % self._botCap vid = self._valueIndex[topid][botid] vtopid = vid / self._botCap vbotid = vid % self._botCap print i + "\t" + \ self._firstChild[topid][botid] + "\t" + \ self._nextSibling[topid][botid] + "\t" + \ self._isAttribute[topid][botid] + "\t" + \ self._childrenCount[topid][botid] + "\t" + \ self._hashValue[topid][botid] + "\t" + \ self._matching[topid][botid] + "\t" + \ self._value[vtopid][vbotid]