# Copyright (c) 2001 - 2005
# Yuan Wang. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Redistributions in any form must be accompanied by information on
# how to obtain complete source code for the X-Diff software and any
# accompanying software that uses the X-Diff software. The source code
# must either be included in the distribution or be available for no
# more than the cost of distribution plus a nominal fee, and must be
# freely redistributable under reasonable conditions. For an executable
# file, complete source code means the source code for all modules it
# contains. It does not include source code for modules or files that
# typically accompany the major components of the operating system on
# which the executable file runs.
#
# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
MATCH = 0
CHANGE = 1
NO_MATCH = -1
INSERT = -1
DELETE = -1
NULL_NODE = -1
NO_CONNECTION = 1048576
_TOP_LEVEL_CAPACITY = 16384
_BOT_LEVEL_CAPACITY = 4096
# <code>XTree</code> provides a DOM-like interface but somehow simplified
# Ideally, it can be replaced by any other DOM parser output tree structures.
class XTree:
# private _topCap, _botCap
# private _elementIndex, _tagIndex, self._valueCount
# private self._firstChild[][], self._nextSibling[][]
# private self._childrenCount[][], _valueIndex[][]
# private boolean self._isAttribute[][]
# private self._matching[][]
# private long self._hashValue[][]
# private String _value[][]
# private Hashtable self._tagNames, _cdataTable
_root = 0
_firstChild = []
_nextSibling = []
_isAttribute = []
_valueIndex = []
_matching = []
_childrenCount = []
_hashValue = []
_value = []
_value.append([])
_tagNames = []
# This hashtable is used to record CDATA section info.
# The key is the text node id, the value is the list of
# (start,end) position pair of each CDATA section.
_cdataTable = {}
_elementIndex = -1
_tagIndex = -1
_valueCount = 0
def __init__(self, topcap=None, botcap=None):
self._topCap = _TOP_LEVEL_CAPACITY
self._botCap = _BOT_LEVEL_CAPACITY
if topcap:
self._topCap = topcap
if botcap:
self._botCap = botcap
self._initialize()
# Initialization.
def _initialize(self):
self._root = 0
self._firstChild = []
self._nextSibling = []
self._isAttribute = []
self._valueIndex = []
self._matching = []
self._childrenCount = []
self._hashValue = []
self._value = []
self._value.append([])
self._tagNames = []
# This hashtable is used to record CDATA section info.
# The key is the text node id, the value is the list of
# (start,end) position pair of each CDATA section.
self._cdataTable = {}
self._elementIndex = -1
self._tagIndex = -1
self._valueCount = self._botCap - 1
# ID Expansion
def _expand(self, topid):
self._firstChild[topid] = []
self._nextSibling[topid] = []
self._childrenCount[topid] = []
self._matching[topid] = []
self._valueIndex[topid] = []
self._hashValue[topid] = []
self._isAttribute[topid] = []
for i in range(self._botCap):
self._firstChild[topid][i] = NULL_NODE
self._nextSibling[topid][i] = NULL_NODE
self._childrenCount[topid][i] = 0
self._matching[topid][i] = MATCH
self._valueIndex[topid][i] = -1
self._isAttribute[topid][i] = False
# Start -- methods for constructing a tree.
# Add a new element to the tree.
# @param pid parent id
# @param lsid left-side sibling id
# @param tagName element name
# @return the element id in the tree.
def addElement(self, pid, lsid, tagName):
self._elementIndex += 1
topid = self._elementIndex / self._botCap
botid = self._elementIndex % self._botCap
if (botid == 0):
self._expand(topid)
# Check if we've already had the tag
tagID = self._tagNames[tagName]
if (tagID != None):
self._valueIndex[topid][botid] = tagID.intValue()
else:
self._tagIndex += 1
tagID = int(self._tagIndex)
self._value[0][self._tagIndex] = tagName
self._tagNames.append(tagName, tagID)
self._valueIndex[topid][botid] = self._tagIndex
if (pid == NULL_NODE):
return self._elementIndex
ptopid = pid / self._botCap
pbotid = pid % self._botCap
# parent-child relation or sibling-sibling relation
if (lsid == NULL_NODE):
self._firstChild[ptopid][pbotid] = self._elementIndex
else:
self._nextSibling[lsid / self._botCap][lsid % self._botCap] = self._elementIndex
# update children count
self._childrenCount[ptopid][pbotid] += 1
return self._elementIndex
# Add a text node.
# @param eid element id
# @param lsid the sibling id on the left
# @param text text value
# @param value hash value
def addText(self, eid, lsid, text, value):
self._elementIndex += 1
topid = self._elementIndex / self._botCap
botid = self._elementIndex % self._botCap
if (botid == 0):
self._expand(topid)
etopid = eid / self._botCap
ebotid = eid % self._botCap
if (lsid == NULL_NODE):
self._firstChild[etopid][ebotid] = self._elementIndex
else:
self._nextSibling[lsid / self._botCap][lsid % self._botCap] = self._elementIndex
self._childrenCount[etopid][ebotid] += 1
self._hashValue[topid][botid] = value
self._valueCount += 1
vtopid = self._valueCount / self._botCap
vbotid = self._valueCount % self._botCap
if (vbotid == 0):
self._value[vtopid] = str[self._botCap]
self._value[vtopid][vbotid] = text
self._valueIndex[topid][botid] = self._valueCount
return self._elementIndex
# Add an attribute.
# @param eid element id
# @param lsid the sibling id on the left
# @param name attribute name
# @param value attribute value
# @param valuehash hash value of the value
# @param attrhash hash value of the entire attribute
# @return the element id of the attribute
def addAttribute(self, eid, lsid, name, value, valuehash, attrhash):
# attribute name first.
aid = self.addElement(eid, lsid, name)
# attribute value second.
self.addText(aid, NULL_NODE, value, valuehash)
# hash value third
atopid = aid / self._botCap
abotid = aid % self._botCap
self._isAttribute[atopid][abotid] = True
self._hashValue[atopid][abotid] = attrhash
return aid
# Add more information (hash value) to an element node.
# @param eid element id
# @param value extra hash value
def addHashValue(self, eid, value):
self._hashValue[eid / self._botCap][eid % self._botCap] = value
# Add a CDATA section (either a start or an end) to the CDATA
# hashtable, in which each entry should have an even number of
# position slots.
# @param eid The text node id
# @param position the section tag position
def addCDATA(self, eid, position):
key = int(eid)
value = self._cdataTable[key]
if (value == None):
elem_list = []
elem_list.append(position)
self._cdataTable[key] = elem_list
else:
elem_list = value
elem_list.append(position)
self._cdataTable[key] = elem_list
# Add matching information.
# @param eid element id
# @param match ?match and matched element id
def addMatching(self, eid, match):
if (match[0] == NO_MATCH):
self._matching[eid / self._botCap][eid % self._botCap] = NO_MATCH
elif (match[0] == MATCH):
self._matching[eid / self._botCap][eid % self._botCap] = MATCH
else:
self._matching[eid / self._botCap][eid % self._botCap] = match[1] + 1
# End -- methods for constructing a tree.
# Start -- methods for accessing a tree.
# Get matching information.
# @param eid element id
# @param match ?change and matched element id
def getMatching(self, eid, match):
mid = self._matching[eid / self._botCap][eid % self._botCap]
if (mid == NO_MATCH):
match[0] = NO_MATCH
elif (mid == MATCH):
match[0] = MATCH
else:
match[0] = CHANGE
match[1] = mid - 1
# Get the root element id.
def getRoot(self):
return self._root
# Get the first child of a node.
# @param eid element id
def getFirstChild(self, eid):
cid = self._firstChild[eid / self._botCap][eid % self._botCap]
while (cid > self._root):
ctopid = cid / self._botCap
cbotid = cid % self._botCap
if (self._isAttribute[ctopid][cbotid]):
cid = self._nextSibling[ctopid][cbotid]
else:
return cid
return NULL_NODE
# Get the next sibling of a node.
# @param eid element id
def getNextSibling(self, eid):
return self._nextSibling[eid / self._botCap][eid % self._botCap]
# Get the first attribute of a node.
# @param eid element id
def getFirstAttribute(self, eid):
aid = self._firstChild[eid / self._botCap][eid % self._botCap]
if ((aid > self._root) and (self._isAttribute[aid / self._botCap][aid % self._botCap])):
return aid
else:
return NULL_NODE
# Get the next attribute of a node.
# @param aid attribute id
def getNextAttribute(self, aid):
aid1 = self._nextSibling[aid / self._botCap][aid % self._botCap]
if ((aid1 > self._root) and (self._isAttribute[aid1 / self._botCap][aid1 % self._botCap])):
return aid1
else:
return NULL_NODE
# Get the attribute value.
# @param aid attribute id
def getAttributeValue(self, aid):
cid = self._firstChild[aid / self._botCap][aid % self._botCap]
index = self._valueIndex[cid / self._botCap][cid % self._botCap]
if (index > 0):
return self._value[index / self._botCap][index % self._botCap]
else:
return ""
# Get the hash value of a node.
# @param eid element id
def getHashValue(self, eid):
return self._hashValue[eid / self._botCap][eid % self._botCap]
# Get the CDATA section position list of a text node.
# @param eid element id
# @return position list which is a vector or None if no CDATA
def getCDATA(self, eid):
return self._cdataTable[eid]
# Get the childern count of a node.
# @param eid element id
def getChildrenCount(self, eid):
return self._childrenCount[eid / self._botCap][eid % self._botCap]
# Get the # of all decendents of a node.
# @param eid element id
def getDecendentsCount(self, eid):
topid = eid / self._botCap
botid = eid % self._botCap
count = self._childrenCount[topid][botid]
if (count == 0):
return 0
cid = self._firstChild[topid][botid]
while (cid > NULL_NODE):
count += self.getDecendentsCount(cid)
cid = self._nextSibling[cid / self._botCap][cid % self._botCap]
return count
# Get the value index of a node
# @param eid element id
def getValueIndex(self, eid):
return self._valueIndex[eid / self._botCap][eid % self._botCap]
# Get the value of a leaf node
# @param index value index
def getValue(self, index):
return self._value[index / self._botCap][index % self._botCap]
# Get the tag of an element node
# @param eid element id
def getTag(self, eid):
index = self._valueIndex[eid / self._botCap][eid % self._botCap]
return self._value[0][index]
# Get the text value of a leaf node
# @param eid element id
def getText(self, eid):
index = self._valueIndex[eid / self._botCap][eid % self._botCap]
if (index >= self._botCap):
return self._value[index / self._botCap][index % self._botCap]
else:
return ""
# Check if a node an element node.
# @param eid element id
def isElement(self, eid):
vindex = self._valueIndex[eid / self._botCap][eid % self._botCap]
if (vindex < self._botCap):
return True
else:
return False
# Check if a node is an attribute node.
# @param eid element id
def isAttribute(self, eid):
return self._isAttribute[eid / self._botCap][eid % self._botCap]
# Check if a node an leaf text node.
# @param edi element id
def isLeaf(self, eid):
index = self._valueIndex[eid / self._botCap][eid % self._botCap]
if (index < self._botCap):
return False
else:
return True
# End -- methods for accessing a tree.
# For testing purpose.
def dump(self, eid=None):
if eid:
topid = eid / self._botCap
botid = eid % self._botCap
vid = self._valueIndex[topid][botid]
vtopid = vid / self._botCap
vbotid = vid % self._botCap
print eid + "\t" + \
self._firstChild[topid][botid] + "\t" + \
self._nextSibling[topid][botid] + "\t" + \
self._isAttribute[topid][botid] + "\t" + \
self._childrenCount[topid][botid] + "\t" + \
self._hashValue[topid][botid] + "\t" + \
self._matching[topid][botid] + "\t" + \
self._value[vtopid][vbotid]
else:
print "eid\tfirstC\tnextS\tattr?\tcCount\thash\tmatch\tvalue"
for i in range(self._root, self._elementIndex + 1):
topid = i / self._botCap
botid = i % self._botCap
vid = self._valueIndex[topid][botid]
vtopid = vid / self._botCap
vbotid = vid % self._botCap
print i + "\t" + \
self._firstChild[topid][botid] + "\t" + \
self._nextSibling[topid][botid] + "\t" + \
self._isAttribute[topid][botid] + "\t" + \
self._childrenCount[topid][botid] + "\t" + \
self._hashValue[topid][botid] + "\t" + \
self._matching[topid][botid] + "\t" + \
self._value[vtopid][vbotid]