diff options
-rw-r--r-- | XDiff.py | 1996 | ||||
-rw-r--r-- | XHash.py | 310 | ||||
-rw-r--r-- | XLut.py | 67 | ||||
-rw-r--r-- | XParser.py | 269 | ||||
-rw-r--r-- | XTree.py | 423 |
5 files changed, 3065 insertions, 0 deletions
diff --git a/XDiff.py b/XDiff.py new file mode 100644 index 0000000..ad0e93c --- /dev/null +++ b/XDiff.py @@ -0,0 +1,1996 @@ +#!/usr/bin/env python +""" +java XDiff [-o|-g] [-p percent] [-e encoding] xml_file1 xml_file2 diff_result +Options: + The default setting is "-o -p 0.3 -e UTF8" + -o The optimal mode, to get the minimum editing distance. + -g The greedy mode, to find a difference quickly. + -p The maximum change percentage allowed. + Default value: 1.0 for -o mode; 0.3 for -g mode. + -e The encoding of the output file. + Default value: UTF8. +""" +# Copyright (c) 2001 - 2005 +# Yuan Wang. All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Redistributions in any form must be accompanied by information on +# how to obtain complete source code for the X-Diff software and any +# accompanying software that uses the X-Diff software. The source code +# must either be included in the distribution or be available for no +# more than the cost of distribution plus a nominal fee, and must be +# freely redistributable under reasonable conditions. For an executable +# file, complete source code means the source code for all modules it +# contains. It does not include source code for modules or files that +# typically accompany the major components of the operating system on +# which the executable file runs. + +# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, +# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +#import java.io.BufferedReader +#import java.io.FileReader +#import java.io.FileOutputStream +#import java.io.OutputStreamWriter +#import java.io.IOException +#import java.util.Random +#import java.util.Vector +import sys, time, codecs +import XTree, XLut +from XParser import XParser + +# <code>XDiff</code> computes the difference of two input XML documents. + +_CIRCUIT_SIZE = 2048 +_MATRIX_SIZE = 1024 +_ATTRIBUTE_SIZE = 1024 +_TEXT_SIZE = 1024 + + +class XDiff: + _oFlag = False + _NO_MATCH_THRESHOLD = 0.3 + _sampleCount = 3 + _DEBUG = False + _encoding = "UTF8" + +# self._xtree1, self._xtree2 +# private XLut _xlut +# private _leastCostMatrix[][], self._pathMatrix[][], self._circuit[] +# +# private _attrList1[], _attrList2[], _textList1[], _textList2[] +# private boolean _attrMatch[], _textMatch1[], _textMatch2[] +# private long _attrHash[], _textHash[] +# private String _attrTag[] +# +# private self._matchp[] +# private boolean self._needNewLine + + + + # Constructor + # @param input1 input file #1 + # @param input2 input file #2 + # @param output output file + + def __init__(self, input1, input2, output): + # Parse input files + parser = XParser() + t0 = time.time() + self._xtree1 = parser.parse(input1) + t1 = time.time() + parser = XParser() + self._xtree2 = parser.parse(input2) + t2 = time.time() + + # check both root nodes. + root1 = self._xtree1.getRoot() + root2 = self._xtree2.getRoot() + if (self._xtree1.getHashValue(root1) == self._xtree2.getHashValue(root2)): + print "No difference!" + print "Execution time: " + (t2 - t0) + " ms" + print "Parsing " + input1 + ": " + (t1 - t0) + " ms" + print "Parsing " + input2 + ": " + (t2 - t1) + " ms" + else: + self._xlut = XLut.XLut() + self._matchp = int[2] + + if (self._xtree1.getTag(root1).compareTo(self._xtree2.getTag(root2)) != 0): + print "The root is changed!" + self._matchp[0] = XTree.NO_MATCH + self._xtree1.addMatching(root1, self._matchp) + self._xtree2.addMatching(root2, self._matchp) + else: + # initialize data structures. + self._attrList1 = [] + self._attrList2 = [] + self._attrMatch = [] + self._attrHash = [] + self._attrTag = [] + + self._textList1 = [] + self._textList2 = [] + self._textMatch1 = [] + self._textMatch2 = [] + self._textHash = [] + + self._leastCostMatrix = [] + self._pathMatrix = [] + self._circuit = [] + + for i in range(_MATRIX_SIZE): + self._leastCostMatrix[i] = int[_MATRIX_SIZE] + self._pathMatrix[i] = int[_MATRIX_SIZE] + + self._matchp[0] = XTree.CHANGE + self._matchp[1] = root2 + self._xtree1.addMatching(root1, self._matchp) + self._matchp[1] = root1 + self._xtree2.addMatching(root2, self._matchp) + self.xdiff(root1, root2, False) + + t3 = time.time() + self.writeDiff(input1, output) + t4 = time.time() + + print "Difference detected!" + print "Execution time: " + (t4 - t0) + " ms" + print "Parsing " + input1 + ": " + (t1 - t0) + " ms" + print "Parsing " + input2 + ": " + (t2 - t1) + " ms" + print "Diffing: " + (t3 - t2) + " ms" + print "Writing result: " + (t4 - t3) + " ms" + + + # Diff two element lists + # This is the official one that records matching top-down + # @param pid1 parent id #1 + # @param pid2 parent id #2 + # @param matchFlag indicates if distance computation needed + def xdiff(self, pid1, pid2, matchFlag): + # diff attributes. + attrCount1 = 0 + attrCount2 = 0 + attr1 = self._xtree1.getFirstAttribute(pid1) + while (attr1 != XTree.NULL_NODE): + self._attrList1[attrCount1] = attr1 + attrCount1 += 1 + attr1 = self._xtree1.getNextAttribute(attr1) + attr2 = self._xtree2.getFirstAttribute(pid2) + while (attr2 != XTree.NULL_NODE): + self._attrList2[attrCount2] = attr2 + attrCount2 += 1 + attr2 = self._xtree2.getNextAttribute(attr2) + + if (attrCount1 > 0): + if (attrCount2 > 0): + self.diffAttributes(attrCount1, attrCount2) + else: + self._matchp[0] = XTree.NO_MATCH + for i in range(attrCount1): + self._xtree1.addMatching(self._attrList1[i], + self._matchp) + elif (attrCount2 > 0): # attrCount1 == 0 + self._matchp[0] = XTree.NO_MATCH + for i in range(attrCount2): + self._xtree2.addMatching(self._attrList2[i], self._matchp) + + # Match element nodes. + count1 = self._xtree1.getChildrenCount(pid1) - attrCount1 + count2 = self._xtree2.getChildrenCount(pid2) - attrCount2 + + if (count1 == 0): + self._matchp[0] = XTree.NO_MATCH + node2 = self._xtree2.getFirstChild(pid2) + self._xtree2.addMatching(node2, self._matchp) + for i in range(1,count2): + node2 = self._xtree2.getNextSibling(node2) + self._xtree2.addMatching(node2, self._matchp) + elif (count2 == 0): + self._matchp[0] = XTree.NO_MATCH + node1 = self._xtree1.getFirstChild(pid1) + self._xtree1.addMatching(node1, self._matchp) + for i in range(1, count1): + node1 = self._xtree1.getNextSibling(node1) + self._xtree1.addMatching(node1, self._matchp) + elif ((count1 == 1) and (count2 == 1)): + node1 = self._xtree1.getFirstChild(pid1) + node2 = self._xtree2.getFirstChild(pid2) + + if (self._xtree1.getHashValue(node1) == self._xtree2.getHashValue(node2)): + return + + isE1 = self._xtree1.isElement(node1) + isE2 = self._xtree2.isElement(node2) + + if (isE1 and isE2): + tag1 = self._xtree1.getTag(node1) + tag2 = self._xtree2.getTag(node2) + if (tag1.compareTo(tag2) == 0): + self._matchp[0] = XTree.CHANGE + self._matchp[1] = node2 + self._xtree1.addMatching(node1, self._matchp) + self._matchp[1] = node1 + self._xtree2.addMatching(node2, self._matchp) + + self.xdiff(node1, node2, matchFlag) + else: + self._matchp[0] = XTree.NO_MATCH + self._xtree1.addMatching(node1, self._matchp) + self._xtree2.addMatching(node2, self._matchp) + elif (not isE1 and not isE2): + self._matchp[0] = XTree.CHANGE + self._matchp[1] = node2 + self._xtree1.addMatching(node1, self._matchp) + self._matchp[1] = node1 + self._xtree2.addMatching(node2, self._matchp) + else: + self._matchp[0] = XTree.NO_MATCH + self._xtree1.addMatching(node1, self._matchp) + self._xtree2.addMatching(node2, self._matchp) + else: + elements1 = int[count1] + elements2 = int[count2] + elementCount1 = 0 + textCount1 = 0 + elementCount2 = 0 + textCount2 = 0 + + child1 = self._xtree1.getFirstChild(pid1) + if (self._xtree1.isElement(child1)): + elements1[elementCount1] = child1 + elementCount1 += 1 + else: + self._textList1[textCount1] = child1 + textCount1 += 1 + for i in range(1,count1): + child1 = self._xtree1.getNextSibling(child1) + if (self._xtree1.isElement(child1)): + elements1[elementCount1] = child1 + elementCount1 += 1 + else: + self._textList1[textCount1] = child1 + textCount1 += 1 + + child2 = self._xtree2.getFirstChild(pid2) + if (self._xtree2.isElement(child2)): + elements2[elementCount2] = child2 + elementCount2 += 1 + else: + self._textList2[textCount2] = child2 + textCount2 += 1 + for i in range(1,count2): + child2 = self._xtree2.getNextSibling(child2) + if (self._xtree2.isElement(child2)): + elements2[elementCount2] = child2 + elementCount2 += 1 + else: + self._textList2[textCount2] = child2 + textCount2 += 1 + + # Match text nodes. + if (textCount1 > 0): + if (textCount2 > 0): + self.diffText(textCount1, textCount2) + else: + self._matchp[0] = XTree.NO_MATCH + for i in range(textCount1): + self._xtree1.addMatching(self._textList1[i], self._matchp) + elif (textCount2 > 0): + self._matchp[0] = XTree.NO_MATCH + for i in (textCount2): + self._xtree2.addMatching(self._textList2[i], + self._matchp) + + matched1 = [] + matched2 = [] + mcount = self._matchFilter(elements1, elementCount1, + elements2, elementCount2, + matched1, matched2) + + if ((elementCount1 == mcount) and (elementCount2 == mcount)): + return + + if (elementCount1 == mcount): + self._matchp[0] = XTree.NO_MATCH + for i in range(elementCount2): + if (not matched2[i]): + self._xtree2.addMatching(elements2[i], self._matchp) + return + if (elementCount2 == mcount): + self._matchp[0] = XTree.NO_MATCH + for i in range(elementCount1): + if (not matched1[i]): + self._xtree1.addMatching(elements1[i], self._matchp) + return + + # Write the list of unmatched nodes. + ucount1 = elementCount1 - mcount + ucount2 = elementCount2 - mcount + unmatched1 = int[ucount1] + unmatched2 = int[ucount2] + muc1 = 0 + muc2 = 0 + start = 0 + + while ((muc1 < ucount1) and (muc2 < ucount2)): + while (start < elementCount1) and matched1[start]: + start += 1 + startTag = self._xtree1.getTag(elements1[start]) + uele1 = 0 + uele2 = 0 + muc1 += 1 + unmatched1[uele1] = elements1[start] + uele1 += 1 + matched1[start] = True + start += 1 + + i = start + while (i < elementCount1) and (muc1 < ucount1): + if (not matched1[i] and startTag.equals(self._xtree1.getTag(elements1[i]))): + matched1[i] = True + muc1 += 1 + unmatched1[uele1] = elements1[i] + uele1 += 1 + i += 1 + + i = 0 + while (i < elementCount2) and (muc2 < ucount2): + if (not matched2[i] and startTag.equals(self._xtree2.getTag(elements2[i]))): + matched2[i] = True + muc2 += 1 + unmatched2[uele2] = elements2[i] + uele2 += 1 + i += 1 + + if (uele2 == 0): + self._matchp[0] = XTree.NO_MATCH + for i in range(uele1): + self._xtree1.addMatching(unmatched1[i], self._matchp) + else: + if ((uele1 == 1) and (uele2 == 1)): + self._matchp[0] = XTree.CHANGE + self._matchp[1] = unmatched2[0] + self._xtree1.addMatching(unmatched1[0], self._matchp) + self._matchp[1] = unmatched1[0] + self._xtree2.addMatching(unmatched2[0], self._matchp) + self.xdiff(unmatched1[0], + unmatched2[0], + matchFlag) + # To find minimal-cost matching between those unmatched. + elif (uele1 >= uele2): + if ((uele2 <= self._sampleCount) or not self._gFlag): + self.matchListO(unmatched1, unmatched2, uele1, uele2, True, matchFlag) + else: + self.matchList(unmatched1, unmatched2, uele1, uele2, True, matchFlag) + else: + if ((uele1 <= self._sampleCount) or not self._gFlag): + self.matchListO(unmatched2, unmatched1, uele2, uele1, False, matchFlag) + else: + self.matchList(unmatched2, unmatched1, uele2, uele1, False, matchFlag) + + if (muc1 < ucount1): + self._matchp[0] = XTree.NO_MATCH + for i in range(start,elementCount1): + if (not matched1[i]): + self._xtree1.addMatching(elements1[i], self._matchp) + elif (muc2 < ucount2): + self._matchp[0] = XTree.NO_MATCH + for i in range(elementCount2): + if (not matched2[i]): + self._xtree2.addMatching(elements2[i], self._matchp) + + + # Diff and match two lists of attributes + # @param attrCount1 number of attributes in the 1st list + # @param attrCount2 number of attributes in the 2nd list + def diffAttributes(self, attrCount1, attrCount2): + if ((attrCount1 == 1) and (attrCount2 == 1)): + ah1 = self._xtree1.getHashValue(self._attrList1[0]) + ah2 = self._xtree2.getHashValue(self._attrList2[0]) + if (ah1 == ah2): + return + + tag1 = self._xtree1.getTag(self._attrList1[0]) + tag2 = self._xtree2.getTag(self._attrList2[0]) + if (tag1.compareTo(tag2) == 0): + self._matchp[0] = XTree.CHANGE + self._matchp[1] = self._attrList2[0] + self._xtree1.addMatching(self._attrList1[0], self._matchp) + + self._matchp[1] = self._attrList1[0] + self._xtree2.addMatching(self._attrList2[0], self._matchp) + + tid1 = self._xtree1.getFirstChild(self._attrList1[0]) + tid2 = self._xtree2.getFirstChild(self._attrList2[0]) + self._matchp[1] = tid2 + self._xtree1.addMatching(tid1, self._matchp) + + self._matchp[1] = tid1 + self._xtree2.addMatching(tid2, self._matchp) + + return + else: + self._matchp[0] = XTree.NO_MATCH + self._xtree1.addMatching(self._attrList1[0], self._matchp) + self._xtree2.addMatching(self._attrList2[0], self._matchp) + return + + for i in range(attrCount2): + self._attrHash[i] = self._xtree2.getHashValue(self._attrList2[i]) + self._attrTag[i] = self._xtree2.getTag(self._attrList2[i]) + self._attrMatch[i] = False + + matchCount = 0 + for i in range(attrCount1): + attr1 = self._attrList1[i] + ah1 = self._xtree1.getHashValue(attr1) + tag1 = self._xtree1.getTag(attr1) + + found = False + for j in range(attrCount2): + attr2 = self._attrList2[j] + if (self._attrMatch[j]): + continue + elif (ah1 == self._attrHash[j]): + self._attrMatch[j] = True + matchCount += 1 + found = True + break + elif (tag1.compareTo(self._attrTag[j]) == 0): + self._attrMatch[j] = True + matchCount += 1 + + self._matchp[0] = XTree.CHANGE + self._matchp[1] = attr2 + self._xtree1.addMatching(attr1, self._matchp) + + self._matchp[1] = attr1 + self._xtree2.addMatching(attr2, self._matchp) + + tid1 = self._xtree1.getFirstChild(attr1) + tid2 = self._xtree2.getFirstChild(attr2) + self._matchp[1] = tid2 + self._xtree1.addMatching(tid1, self._matchp) + + self._matchp[1] = tid1 + self._xtree2.addMatching(tid2, self._matchp) + + found = True + break + + if (not found): + self._matchp[0] = XTree.NO_MATCH + self._xtree1.addMatching(attr1, self._matchp) + + if (matchCount != attrCount2): + self._matchp[0] = XTree.NO_MATCH + for i in range(attrCount2): + if (not self._attrMatch[i]): + self._xtree2.addMatching(self._attrList2[i], + self._matchp) + + # Diff and match two lists of text nodes. + # XXX This is just a hack that treats text nodes as unordered, to + # be consistent with the entire algorithm. + # @param textCount1 number of text nodes in the 1st list + # @param textCount2 number of text nodes in the 2nd list + + def diffText(self, textCount1, textCount2): + for i in range(textCount1): + self._textMatch1[i] = False + for i in range(textCount2): + self._textMatch2[i] = False + self._textHash[i] = self._xtree2.getHashValue(self._textList2[i]) + + mcount = 0 + for i in range(textCount1): + hash1 = self._xtree1.getHashValue(self._textList1[i]) + for j in range(textCount2): + if (not self._textMatch2[j] and (hash1 == self._textHash[j])): + self._textMatch1[i] = True + self._textMatch2[j] = True + mcount += 1 + break + + if (mcount == textCount2): + break + + if ((mcount < textCount1) and (textCount1 <= textCount2)): + self._matchp[0] = XTree.CHANGE + i = 0 + j = 0 + while (i < textCount1) and (mcount < textCount1): + if (self._textMatch1[i]): + continue + while self._textMatch2[j]: + j += 1 + self._matchp[1] = self._textList2[j] + self._xtree1.addMatching(self._textList1[i], self._matchp) + self._textMatch1[i] = True + self._matchp[1] = self._textList1[i] + self._xtree2.addMatching(self._textList2[j], self._matchp) + self._textMatch2[j] = True + mcount += 1 + i += 1 + elif ((mcount < textCount2) and (textCount2 < textCount1)): + self._matchp[0] = XTree.CHANGE + i = 0 + j = 0 + while (i < textCount2) and (mcount < textCount2): + if (self._textMatch2[i]): + continue + while (self._textMatch1[j]): + j += 1 + self._matchp[1] = self._textList1[j] + self._xtree2.addMatching(self._textList2[i], self._matchp) + self._textMatch2[i] = True + self._matchp[1] = self._textList2[i] + self._xtree1.addMatching(self._textList1[j], self._matchp) + self._textMatch1[j] = True + mcount += 1 + i += 1 + + self._matchp[0] = XTree.NO_MATCH + if (mcount < textCount1): + for i in range(textCount1): + if (not self._textMatch1[i]): + self._xtree1.addMatching(self._textList1[i], + self._matchp) + elif (mcount < textCount2): + for i in range(textCount2): + if (not self._textMatch2[i]): + self._xtree2.addMatching(self._textList2[i], + self._matchp) + + + # Filter out matched nodepairs. + # @param elements1 node list #1 + # @param elements2 node list #2 + # @param matched1 match list #1 + # @param matched2 match list #2 + # @return how many matched pairs found + + def _matchFilter(self, elements1, count1, elements2, count2, matched1, matched2): + value1 = int[count1] + value2 = int[count2] + + for i in range(count1): + value1[i] = self._xtree1.getHashValue(elements1[i]) + matched1[i] = False + for i in range(count2): + value2[i] = self._xtree2.getHashValue(elements2[i]) + matched2[i] = False + + mcount = 0 + for i in range(count2): + for j in range (count1): + if (not matched1[j] and not matched2[i] and (value1[j] == value2[i])): + matched1[j] = True + matched2[i] = True + mcount += 1 + break + + return mcount + + + # Find minimal cost matching between two node lists + # Record the matching info back to the trees + # Using the original algorithm + # @param nodes1 node list #1 + # @param nodes2 node list #2 + # @param count1 # of nodes in node list #1 + # @param count2 # of nodes in node list #2 + # @param treeOrder True for original, False for inverse + # @param matchFlag indicates if distance computation needed + + def matchListO(self, nodes1, nodes2, count1, count2, treeOrder, matchFlag): + distance = [] + matching1 = [] + matching2 = [] + + # insert cost. + distance[count1] = int[count2+1] + for i in range(count2): + if treeOrder: + distance[count1][i] = self._xtree2.getDecendentsCount(nodes2[i]) + else: + distance[count1][i] = self._xtree1.getDecendentsCount(nodes2[i]) + 1 + + for i in range(count1): + distance[i] = int[count2+1] + if treeOrder: + deleteCost = self._xtree1.getDecendentsCount(nodes1[i]) + else: + deleteCost = self._xtree2.getDecendentsCount(nodes1[i]) + 1 + for j in range(count2): + dist = 0 + if (matchFlag): + if treeOrder: + dist = self._xlut.get(nodes1[i], nodes2[j]) + else: + dist = self._xlut.get(nodes2[j], nodes1[i]) + else: + if treeOrder: + dist = distance(nodes1[i], nodes2[j], True, XTree.NO_CONNECTION) + else: + dist = distance(nodes2[j], nodes1[i], True, XTree.NO_CONNECTION) + # the default mode. + if (not self._oFlag and (dist > 1) and (dist >= self._NO_MATCH_THRESHOLD * (deleteCost + distance[count1][j]))): + dist = XTree.NO_CONNECTION + if (dist < XTree.NO_CONNECTION): + if (treeOrder): + self._xlut.add(nodes1[i], + nodes2[j], + dist) + else: + self._xlut.add(nodes2[j], + nodes1[i], + dist) + distance[i][j] = dist + # delete cost. + distance[i][count2] = deleteCost + + # compute the minimal cost matching. + self.findMatching(count1, count2, distance, matching1, matching2) + + for i in range(count1): + if (matching1[i] == XTree.NO_MATCH): + self._matchp[0] = XTree.NO_MATCH + else: + self._matchp[0] = XTree.CHANGE + self._matchp[1] = nodes2[matching1[i]] + if (treeOrder): + self._xtree1.addMatching(nodes1[i], self._matchp) + else: + self._xtree2.addMatching(nodes1[i], self._matchp) + + for i in range(count2): + if (matching2[i] == XTree.NO_MATCH): + self._matchp[0] = XTree.NO_MATCH + else: + self._matchp[0] = XTree.CHANGE + self._matchp[1] = nodes1[matching2[i]] + if (treeOrder): + self._xtree2.addMatching(nodes2[i], self._matchp) + else: + self._xtree1.addMatching(nodes2[i], self._matchp) + + for i in range(count1): + if (matching1[i] != XTree.NO_MATCH): + todo1 = nodes1[i] + todo2 = nodes2[matching1[i]] + if (treeOrder): + if (self._xtree1.isElement(todo1) and self._xtree2.isElement(todo2)): + self.xdiff(todo1, todo2, True) + else: + if (self._xtree1.isElement(todo2) and self._xtree2.isElement(todo1)): + self.xdiff(todo2, todo1, True) + + + # Find minimal cost matching between two node lists + # Record the matching info back to the trees + # Do sampling. + # @param nodes1 node list #1 + # @param nodes2 node list #2 + # @param count1 # of nodes in node list #1 + # @param count2 # of nodes in node list #2 + # @param treeOrder True for original, False for inverse + # @param matchFlag indicates if distance computation needed + + def matchList(self, nodes1, nodes2, count1, count2, treeOrder, matchFlag): + matching1 = [] + matching2 = [] + for i in range(count1): + matching1[i] = XTree.NO_MATCH + for i in range(count2): + matching2[i] = XTree.NO_MATCH + + if (matchFlag): + for i in range(count1): + for j in range(count2): + if treeOrder: + d = self._xlut.get(nodes1[i], nodes2[j]) + else: + d = self._xlut.get(nodes2[j], nodes1[i]) + if (d != XTree.NO_CONNECTION): + matching1[i] = j + matching2[j] = i + break + else: + r = Random(time.time()) # FIXME + scount1 = 0 + scount2 = 0 + matchingThreshold = 0 + i = 0 + while (i < self._sampleCount) and (scount2 < count2): + snode = r.nextInt(count2 - scount2) + scount2 + dist = XTree.NO_CONNECTION + bestmatch = XTree.NO_MATCH + for j in range(scount1,count1): + if treeOrder: + d = self.distance(nodes1[j], nodes2[snode], False, dist) + else: + d = self.distance(nodes2[snode], nodes1[j], False, dist) + if (d < dist): + dist = d + bestmatch = j + if (d == 1): + break + scount2 += 1 + + if treeOrder: + deleteCost = self._xtree2.getDecendentsCount(nodes2[snode]) + 1 + else: + deleteCost = self._xtree1.getDecendentsCount(nodes2[snode]) + 1 + if ((dist > 1) and (dist > (self._NO_MATCH_THRESHOLD * deleteCost))): + tmp = nodes2[snode] + nodes2[snode] = nodes2[scount2] + nodes2[scount2] = tmp + else: + tmp = nodes1[bestmatch] + nodes1[bestmatch] = nodes1[scount1] + nodes1[scount1] = tmp + tmp = nodes2[snode] + nodes2[snode] = nodes2[scount2] + nodes2[scount2] = tmp + + if (treeOrder): + self._xlut.add(nodes1[scount1], nodes2[scount2], dist) + else: + self._xlut.add(nodes2[scount2], nodes1[scount1], dist) + matching1[scount1] = scount2 + matching2[scount2] = scount1 + + i += 1 + scount1 += 1 + if (matchingThreshold < dist): + matchingThreshold = dist + + while scount2 < count2: + dist = XTree.NO_CONNECTION + bestmatch = XTree.NO_MATCH + for i in range(scount1,count1): + if treeOrder: + d = self.distance(nodes1[i], nodes2[scount2], False, dist) + else: + d = self.distance(nodes2[scount2], nodes1[i], False, dist) + if (d <= matchingThreshold): + dist = d + bestmatch = i + break + elif (d < dist): + dist = d + bestmatch = i + + if (bestmatch != XTree.NO_MATCH): + tmp = nodes1[bestmatch] + nodes1[bestmatch] = nodes1[scount1] + nodes1[scount1] = tmp + + if (treeOrder): + self._xlut.add(nodes1[scount1], nodes2[scount2], dist) + else: + self._xlut.add(nodes2[scount2], nodes1[scount1], dist) + matching1[scount1] = scount2 + matching2[scount2] = scount1 + scount1 += 1 + scount2 += 1 + + # Record matching + for i in range(count1): + if (matching1[i] == XTree.NO_MATCH): + self._matchp[0] = XTree.NO_MATCH + else: + self._matchp[0] = XTree.CHANGE + self._matchp[1] = nodes2[matching1[i]] + if (treeOrder): + self._xtree1.addMatching(nodes1[i], self._matchp) + else: + self._xtree2.addMatching(nodes1[i], self._matchp) + + for i in range(count2): + if (matching2[i] == XTree.NO_MATCH): + self._matchp[0] = XTree.NO_MATCH + else: + self._matchp[0] = XTree.CHANGE + self._matchp[1] = nodes1[matching2[i]] + if (treeOrder): + self._xtree2.addMatching(nodes2[i], self._matchp) + else: + self._xtree1.addMatching(nodes2[i], self._matchp) + + for i in range(count1): + if (matching1[i] != XTree.NO_MATCH): + todo1 = nodes1[i] + todo2 = nodes2[matching1[i]] + if (treeOrder): + if (self._xtree1.isElement(todo1) and self._xtree2.isElement(todo2)): + self.xdiff(todo1, todo2, True) + else: + if (self._xtree1.isElement(todo2) and self._xtree2.isElement(todo1)): + self.xdiff(todo2, todo1, True) + + + # Compute (minimal-editing) distance between two nodes. + # @param eid1 element id #1 + # @param eid2 element id #2 + # @param toRecord whether or not to keep the result + # @param threshold No need to return a distance higher + # than this threshold + # @return the distance + + def distance(self, eid1, eid2, toRecord, threshold): + isE1 = self._xtree1.isElement(eid1) + isE2 = self._xtree2.isElement(eid2) + if (isE1 and isE2): + if (self._xtree1.getTag(eid1).compareTo(self._xtree2.getTag(eid2)) != 0): + return XTree.NO_CONNECTION + else: + dist = self._xdiff(eid1, eid2, threshold) + if (toRecord and (dist < XTree.NO_CONNECTION)): + self._xlut.add(eid1, eid2, dist) + return dist + elif (not isE1 and not isE2): + return 1 + else: + return XTree.NO_CONNECTION + + + # To compute the editing distance between two nodes + # @param pid1 parent id #1 + # @param pid2 parent id #2 + # @param threshold No need to return a distance higher + # than this threshold + # @return the distance + + def _xdiff(self, pid1, pid2, threshold): + dist = 0 + + # diff attributes. + attrCount1 = 0 + attrCount2 = 0 + attr1 = self._xtree1.getFirstAttribute(pid1) + while (attr1 != XTree.NULL_NODE): + self._attrList1[attrCount1] = attr1 + attrCount1 += 1 + attr1 = self._xtree1.getNextAttribute(attr1) + attr2 = self._xtree2.getFirstAttribute(pid2) + while (attr2 != XTree.NULL_NODE): + self._attrList2[attrCount2] = attr2 + attrCount2 += 1 + attr2 = self._xtree2.getNextAttribute(attr2) + + if (attrCount1 == 0): + dist = attrCount2 * 2 + elif (attrCount2 == 0): + dist = attrCount1 * 2 + else: + dist = self._diffAttributes(attrCount1, attrCount2) + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + + # Match second level nodes first. + count1 = self._xtree1.getChildrenCount(pid1) - attrCount1 + count2 = self._xtree2.getChildrenCount(pid2) - attrCount2 + + if (count1 == 0): + node2 = self._xtree2.getFirstChild(pid2) + while (node2 != XTree.NULL_NODE): + dist += self._xtree2.getDecendentsCount(node2) + 1 + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + node2 = self._xtree2.getNextSibling(node2) + elif (count2 == 0): + node1 = self._xtree1.getFirstChild(pid1) + while (node1 != XTree.NULL_NODE): + dist += self._xtree1.getDecendentsCount(node1) + 1 + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + node1 = self._xtree1.getNextSibling(node1) + elif ((count1 == 1) and (count2 == 1)): + node1 = self._xtree1.getFirstChild(pid1) + node2 = self._xtree2.getFirstChild(pid2) + + if (self._xtree1.getHashValue(node1) == self._xtree2.getHashValue(node2)): + return dist + + isE1 = self._xtree1.isElement(node1) + isE2 = self._xtree2.isElement(node2) + + if (isE1 and isE2): + tag1 = self._xtree1.getTag(node1) + tag2 = self._xtree2.getTag(node2) + if (tag1.compareTo(tag2) == 0): + dist += self._xdiff(node1, node2, threshold - dist) + else: + dist += self._xtree1.getDecendentsCount(node1) + self._xtree2.getDecendentsCount(node2) + 2 + elif (not isE1 and not isE2): + dist += 1 + else: + dist += self._xtree1.getDecendentsCount(node1) + self._xtree2.getDecendentsCount(node2) + 2 + else: + elements1 = int[count1] + elements2 = int[count2] + elementCount1 = 0 + textCount1 = 0 + elementCount2 = 0 + textCount2 = 0 + + child1 = self._xtree1.getFirstChild(pid1) + if (self._xtree1.isElement(child1)): + elements1[elementCount1] = child1 + elementCount1 += 1 + else: + self._textList1[textCount1] = child1 + textCount1 += 1 + for i in range(1,count1): + child1 = self._xtree1.getNextSibling(child1) + if (self._xtree1.isElement(child1)): + elements1[elementCount1] = child1 + elementCount1 += 1 + else: + self._textList1[textCount1] = child1 + textCount1 += 1 + + child2 = self._xtree2.getFirstChild(pid2) + if (self._xtree2.isElement(child2)): + elements2[elementCount2] = child2 + elementCount2 += 1 + else: + self._textList2[textCount2] = child2 + textCount2 += 1 + for i in range(1,count2): + child2 = self._xtree2.getNextSibling(child2) + if (self._xtree2.isElement(child2)): + elements2[elementCount2] = child2 + elementCount2 += 1 + else: + self._textList2[textCount2] = child2 + textCount2 += 1 + + # Match text nodes. + if (textCount1 == 0): + dist += textCount2 + elif (textCount2 == 0): + dist += textCount1 + else: + dist += self._diffText(textCount1, textCount2) + + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + + matched1 = [] + matched2 = [] + mcount = self._matchFilter(elements1, elementCount1, + elements2, elementCount2, + matched1, matched2) + + if ((elementCount1 == mcount) and (elementCount2 == mcount)): + return dist + if (elementCount1 == mcount): + for i in range(elementCount2): + if (not matched2[i]): + dist += self._xtree2.getDecendentsCount(elements2[i]) + 1 + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + return dist + if (elementCount2 == mcount): + for i in range(elementCount1): + if (not matched1[i]): + dist += self._xtree1.getDecendentsCount(elements1[i]) + 1 + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + return dist + + # Write the list of unmatched nodes. + ucount1 = elementCount1 - mcount + ucount2 = elementCount2 - mcount + unmatched1 = [] + unmatched2 = [] + muc1 = 0 + muc2 = 0 + start = 0 + + while ((muc1 < ucount1) and (muc2 < ucount2)): + while (start < elementCount1) and matched1[start]: + start += 1 + startTag = self._xtree1.getTag(elements1[start]) + uele1 = 0 + uele2 = 0 + muc1 += 1 + unmatched1[uele1] = elements1[start] + uele1 += 1 + matched1[start] = True + start += 1 + + i = start + while (i < elementCount1) and (muc1 < ucount1): + if (not matched1[i] and startTag.equals(self._xtree1.getTag(elements1[i]))): + matched1[i] = True + muc1 += 1 + unmatched1[uele1] = elements1[i] + uele1 += 1 + i += 1 + + i = 0 + while (i < elementCount2) and (muc2 < ucount2): + if (not matched2[i] and startTag.equals(self._xtree2.getTag(elements2[i]))): + matched2[i] = True + muc2 += 1 + unmatched2[uele2] = elements2[i] + uele2 += 1 + i += 1 + + if (uele2 == 0): + for i in range(uele1): + dist += self._xtree1.getDecendentsCount(unmatched1[i]) + else: +# if ((uele1 == 1) and (uele2 == 1)): +# dist += self._xdiff(unmatched1[0], +# unmatched2[0], +# threshold-dist) +# elif (uele1 >= uele2): + # To find minimal-cost matching between those unmatched. + if (uele1 >= uele2): + if ((uele2 <= self._sampleCount) or not self._gFlag): + dist += self._matchListO(unmatched1, unmatched2, uele1, uele2, True) + else: + dist += self._matchList(unmatched1, unmatched2, uele1, uele2, True, threshold - dist) + else: + if ((uele1 <= self._sampleCount) or not self._gFlag): + dist += self._matchListO(unmatched2, unmatched1, uele2, uele1, False) + else: + dist += self._matchList(unmatched2, unmatched1, uele2, uele1, False, threshold - dist) + + if (self._gFlag and (dist >= threshold)): + return XTree.NO_CONNECTION + + if (muc1 < ucount1): + for i in range (start,elementCount1): + if (not matched1[i]): + dist += self._xtree1.getDecendentsCount(elements1[i]) + elif (muc2 < ucount2): + for i in range(elementCount2): + if (not matched2[i]): + dist += self._xtree2.getDecendentsCount(elements2[i]) + + if (not self._gFlag or (dist < threshold)): + return dist + else: + return XTree.NO_CONNECTION + + + # Diff two lists of attributes + # @param attrCount1 number of attributes in the 1st list + # @param attrCount2 number of attributes in the 2nd list + # @return the distance + + def _diffAttributes(self, attrCount1, attrCount2): + if ((attrCount1 == 1) and (attrCount2 == 1)): + ah1 = self._xtree1.getHashValue(self._attrList1[0]) + ah2 = self._xtree2.getHashValue(self._attrList2[0]) + if (ah1 == ah2): + return 0 + + tag1 = self._xtree1.getTag(self._attrList1[0]) + tag2 = self._xtree2.getTag(self._attrList2[0]) + if (tag1.compareTo(tag2) == 0): + return 1 + else: + return 2 + + dist = 0 + for i in range(attrCount2): + self._attrHash[i] = self._xtree2.getHashValue(self._attrList2[i]) + self._attrTag[i] = self._xtree2.getTag(self._attrList2[i]) + self._attrMatch[i] = False + + matchCount = 0 + for i in range(attrCount1): + ah1 = self._xtree1.getHashValue(self._attrList1[i]) + tag1 = self._xtree1.getTag(self._attrList1[i]) + found = False + + for j in range(attrCount2): + if (self._attrMatch[j]): + continue + elif (ah1 == self._attrHash[j]): + self._attrMatch[j] = True + found = True + matchCount += 1 + break + elif (tag1.compareTo(self._attrTag[j]) == 0): + self._attrMatch[j] = True + dist += 1 + found = True + matchCount += 1 + break + + if (not found): + dist += 2 + + dist += (attrCount2 - matchCount) * 2 + return dist + + + # Diff and match two lists of text nodes. + # XXX This is just a hack that treats text nodes as unordered, to + # be consistent with the entire algorithm. + # @param textCount1 number of text nodes in the 1st list + # @param textCount2 number of text nodes in the 2nd list + # @return the "distance" between these two lists. + + def _diffText(self, textCount1, textCount2): + for i in range(textCount2): + self._textMatch2[i] = False + self._textHash[i] = self._xtree2.getHashValue(self._textList2[i]) + + mcount = 0 + for i in range(textCount1): + hash1 = self._xtree1.getHashValue(self._textList1[i]) + for j in range(textCount2): + if (not self._textMatch2[j] and (hash1 == self._textHash[j])): + self._textMatch2[j] = True + mcount += 1 + break + + if (mcount == textCount2): + break + + if (textCount1 >= textCount2): + return textCount1 - mcount + else: + return textCount2 - mcount + + + # Find minimal cost matching between two node lists + # Using the original algorithm + # @param nodes1 node list #1 + # @param nodes2 node list #2 + # @param count1 # of nodes in node list #1 + # @param count2 # of nodes in node list #2 + # @param treeOrder True for original, False for inverse + + def _matchListO(self, nodes1, nodes2, count1, count2, treeOrder): + distance = [] + matching1 = [] + matching2 = [] + + # insert cost. + distance[count1] = int[count2+1] + for i in range(count2): + if treeOrder: + distance[count1][i] = self._xtree2.getDecendentsCount(nodes2[i]) + 1 + else: + distance[count1][i] = self._xtree1.getDecendentsCount(nodes2[i]) + 1 + + for i in range(count1): + distance[i] = int[count2+1] + if treeOrder: + deleteCost = self._xtree1.getDecendentsCount(nodes1[i]) + 1 + else: + deleteCost = self._xtree2.getDecendentsCount(nodes1[i]) + 1 + for j in range(count2): + if treeOrder: + dist = distance(nodes1[i], nodes2[j], True, XTree.NO_CONNECTION) + else: + dist = distance(nodes2[j], nodes1[i], True, XTree.NO_CONNECTION) + # the default mode. + if (not self._oFlag and (dist > 1) and (dist < XTree.NO_CONNECTION) and \ + (dist >= self._NO_MATCH_THRESHOLD * \ + (deleteCost + distance[count1][j]))): + dist = XTree.NO_CONNECTION + + if (dist < XTree.NO_CONNECTION): + if (treeOrder): + self._xlut.add(nodes1[i], nodes2[j], dist) + else: + self._xlut.add(nodes2[j], nodes1[i], dist) + distance[i][j] = dist + # delete cost. + distance[i][count2] = deleteCost + + # compute the minimal cost matching. + return self.findMatching(count1, count2, distance, matching1, + matching2) + + + # Find minimal cost matching between two node lists + # Do sampling + # @param nodes1 node list #1 + # @param nodes2 node list #2 + # @param count1 # of nodes in node list #1 + # @param count2 # of nodes in node list #2 + # @param treeOrder True for original, False for inverse + # @param threshold No need to return a distance higher + # than this threshold + def _matchList(self, nodes1, nodes2, count1, count2, treeOrder, threshold): + matching1 = [] + matching2 = [] + for i in range(count1): + matching1[i] = XTree.NO_MATCH + for i in range(count2): + matching2[i] = XTree.NO_MATCH + + distance = 0 + r = Random(time.time()) + scount1 = 0 + scount2 = 0 + matchingThreshold = 0 + + i = 0 + while (i < self._sampleCount) and (scount2 < count2): + snode = r.nextInt(count2 - scount2) + scount2 + dist = XTree.NO_CONNECTION + bestmatch = XTree.NO_MATCH + for j in range(scount1,count1): + if treeOrder: + d = distance(nodes1[j], nodes2[snode], False, threshold - distance) + else: + d = distance(nodes2[snode], nodes1[j], False, threshold - distance) + if (d < dist): + dist = d + bestmatch = j + if (d == 1): + break + + if treeOrder: + deleteCost = self._xtree2.getDecendentsCount(nodes2[snode]) + 1 + deleteCost = self._xtree1.getDecendentsCount(nodes2[snode]) + 1 + + if ((dist > 1) and (dist > (self._NO_MATCH_THRESHOLD * deleteCost))): + tmp = nodes2[snode] + nodes2[snode] = nodes2[scount2] + nodes2[scount2] = tmp + distance += deleteCost + else: + tmp = nodes1[bestmatch] + nodes1[bestmatch] = nodes1[scount1] + nodes1[scount1] = tmp + tmp = nodes2[snode] + nodes2[snode] = nodes2[scount2] + nodes2[scount2] = tmp + + if (treeOrder): + self._xlut.add(nodes1[scount1], nodes2[scount2], dist) + else: + self._xlut.add(nodes2[scount2], nodes1[scount1], dist) + matching1[scount1] = scount2 + matching2[scount2] = scount1 + + i += 1 + scount1 += 1 + if (matchingThreshold < dist): + matchingThreshold = dist + distance += dist + + if (distance >= threshold): + return XTree.NO_CONNECTION + scount2 += 1 + + while (scount2 < count2): + if treeOrder: + deleteCost = self._xtree2.getDecendentsCount(nodes2[scount2]) + 1 + else: + deleteCost = self._xtree1.getDecendentsCount(nodes2[scount2]) + 1 + dist = XTree.NO_CONNECTION + bestmatch = XTree.NO_MATCH + for i in range(scount1,count1): + if treeOrder: + d = distance(nodes1[i], nodes2[scount2], False, threshold - distance) + else: + d = distance(nodes2[scount2], nodes1[i], False, threshold - distance) + if (d <= matchingThreshold): + dist = d + bestmatch = i + break + elif ((d == 1) or ( d < (self._NO_MATCH_THRESHOLD * dist))): + dist = d + bestmatch = i + + if (bestmatch == XTree.NO_MATCH): + distance += deleteCost + else: + tmp = nodes1[bestmatch] + nodes1[bestmatch] = nodes1[scount1] + nodes1[scount1] = tmp + + if (treeOrder): + self._xlut.add(nodes1[scount1], nodes2[scount2], dist) + else: + self._xlut.add(nodes2[scount2], nodes1[scount1], dist) + + matching1[scount1] = scount2 + matching2[scount2] = scount1 + scount1 += 1 + distance += dist + + if (distance >= threshold): + return XTree.NO_CONNECTION + scount2 += 1 + + for i in range(count1): + if (matching1[i] == XTree.NO_MATCH): + if treeOrder: + distance += self._xtree1.getDecendentsCount(nodes1[i]) + 1 + else: + distance += self._xtree2.getDecendentsCount(nodes1[i]) + 1 + if (distance >= threshold): + return XTree.NO_CONNECTION + + return distance + + + # Perform minimal-cost matching between two node lists #1 + # Trivial part. + # @param count1 length of node list #1 + # @param count2 length of node list #2 + # @param dist distance matrix + # @param matching1 matching list (for node list #1) + # @param matching2 matching list (for node list #2) + # @return distance + def findMatching(self, count1, count2, dist, matching1, matching2): + if (count1 == 1): + # count2 == 1 + if (dist[0][0] < XTree.NO_CONNECTION): + matching1[0] = 0 + matching2[0] = 0 + else: + matching1[0] = XTree.DELETE + matching2[0] = XTree.DELETE + + return dist[0][0] + elif (count2 == 1): + distance = 0 + mate = 0 + mindist = XTree.NO_CONNECTION + matching2[0] = XTree.DELETE + + for i in range(count1): + matching1[i] = XTree.DELETE + if (mindist > dist[i][0]): + mindist = dist[i][0] + mate = i + + # Suppose we delete every node on list1. + distance += dist[i][1] + + if (mindist < XTree.NO_CONNECTION): + matching1[mate] = 0 + matching2[0] = mate + distance += mindist - dist[mate][1] + else: + # Add the delete cost of the single node + # on list2. + distance += dist[count1][0] + + return distance + elif ((count1 == 2) and (count2 == 2)): + distance1 = dist[0][0] + dist[1][1] + distance2 = dist[0][1] + dist[1][0] + if (distance1 < distance2): + if (dist[0][0] < XTree.NO_CONNECTION): + matching1[0] = 0 + matching2[0] = 0 + distance1 = dist[0][0] + else: + matching1[0] = XTree.DELETE + matching2[0] = XTree.DELETE + distance1 = dist[0][2] + dist[2][0] + + if (dist[1][1] < XTree.NO_CONNECTION): + matching1[1] = 1 + matching2[1] = 1 + distance1 += dist[1][1] + else: + matching1[1] = XTree.DELETE + matching2[1] = XTree.DELETE + distance1 += dist[1][2] + dist[2][1] + + return distance1 + else: + if (dist[0][1] < XTree.NO_CONNECTION): + matching1[0] = 1 + matching2[1] = 0 + distance2 = dist[0][1] + else: + matching1[0] = XTree.DELETE + matching2[1] = XTree.DELETE + distance2 = dist[0][2] + dist[2][1] + + if (dist[1][0] < XTree.NO_CONNECTION): + matching1[1] = 0 + matching2[0] = 1 + distance2 += dist[1][0] + else: + matching1[1] = XTree.DELETE + matching2[0] = XTree.DELETE + distance2 += dist[1][2] + dist[2][0] + + return distance2 + else: + return self.optimalMatching(count1, count2, dist, + matching1, matching2) + + + # Perform minimal-cost matching between two node lists + # @param count1 length of node list #1 + # @param count2 length of node list #2 + # @param dist distance matrix + # @param matching1 matching list (for node list #1) + # @param matching2 matching list (for node list #2) + # @return distance + + def optimalMatching(self, count1, count2, dist, matching1, matching2): + # Initialize matching. + # Initial guess will be pair-matching between two lists. + # Others will be insertion or deletion + for i in range(count2): + matching1[i] = i + for i in range(count2, count1): + matching1[i] = XTree.DELETE + + # Three artificial nodes: "start", "end" and "delete". + count = count1 + count2 + 3 + + # Initialize least cost matrix and path matrix. + # Both have been initialized at the very beginning. + + # Start algorithm. + while (True): + # Construct least cost matrix. + self.constructLCM(dist, matching1, count1, count2) + + # Initialize path matrix. + for i in range(count): + for j in range(count): + self._pathMatrix[i][j] = i + + # Search negative cost circuit. + clen = self.searchNCC(count) + if (clen > 0): + # Modify matching. + i = 0 + next = 0 + while (i < clen - 1): + n1 = self._circuit[next] + next = self._circuit[next+1] + # Node in node list 1. + if ((n1 > 0) and (n1 <= count1)): + nid1 = n1 - 1 + nid2 = self._circuit[next] - count1 - 1 + if (nid2 == count2): + nid2 = XTree.DELETE + + matching1[nid1] = nid2 + i += 1 + else: # Stop. + break + + distance = 0 + # Suppose all insertion on list2 + for i in range(count2): + matching2[i] = XTree.INSERT + distance += dist[count1][i] + + # update distance by looking at matching pairs. + for i in range(count1): + mmm = matching1[i] + if (mmm == XTree.DELETE): + distance += dist[i][count2] + else: + matching2[mmm] = i + distance += dist[i][mmm] - dist[count1][mmm] + + return distance + + + # Construct a least cost matrix (of the flow network) based on + # the cost matrix + # @param costMatrix cost matrix + # @param matching matching information + # @param nodeCount1 # of nodes in node list 1 + # @param nodeCount2 # of nodes in node list 2 + + def constructLCM(self, costMatrix, matching, nodeCount1, nodeCount2): + # Three artificial nodes: "start", "end" and "delete". + nodeCount = nodeCount1 + nodeCount2 + 3 + + # Initialize. + for i in range(nodeCount): + for j in range(nodeCount): + self._leastCostMatrix[i][j] = XTree.NO_CONNECTION + + # self. + self._leastCostMatrix[i][i] = 0 + + # Between start node and nodes in list 1. + # Start -> node1 = Infinity; node1 -> Start = -0. + for i in range(nodeCount1): + self._leastCostMatrix[i+1][0] = 0 + + # Between nodes in list2 and the end node. + # Unless matched (later), node2 -> end = 0 + # end -> node2 = Infinity. + for i in range(nodeCount2): + self._leastCostMatrix[i+nodeCount1+1][nodeCount-1] = 0 + + deleteCount = 0 + + # Between nodes in list1 and nodes in list2. + # For matched, node1 -> node2 = Infinity + # node2 -> node1 = -1 * distance + # For unmatched, node1 -> node2 = distance + # node2 -> node1 = Infinity + for i in range(nodeCount1): + node1 = i + 1 + + # According to cost matrix. + for j in range(nodeCount2): + node2 = j + nodeCount1 + 1 + self._leastCostMatrix[node1][node2] = costMatrix[i][j] + + # According to matching. + if (matching[i] == XTree.DELETE): + deleteCount += 1 + + # node1 -> Delete = Infinity + # Delete -> node1 = -1 * DELETE_COST + self._leastCostMatrix[nodeCount-2][node1] = -1 * costMatrix[i][nodeCount2] + else: + node2 = matching[i] + nodeCount1 + 1 + + # Between node1 and node2. + self._leastCostMatrix[node1][node2] = XTree.NO_CONNECTION + self._leastCostMatrix[node2][node1] = costMatrix[i][matching[i]] * -1 + + # Between node1 and delete. + self._leastCostMatrix[node1][nodeCount-2] = costMatrix[i][nodeCount2] + + # Between node2 and end. + self._leastCostMatrix[node2][nodeCount-1] = XTree.NO_CONNECTION + self._leastCostMatrix[nodeCount-1][node2] = costMatrix[nodeCount1][matching[i]] + + # Between the "Delete" and the "End". + # If delete all, delete -> end = Infinity; end -> delete = 0. + if (deleteCount == nodeCount1): + self._leastCostMatrix[nodeCount-1][nodeCount-2] = 0 + # if no delete, delete -> end = 0; end -> delete = Infinity. + elif (deleteCount == 0): + self._leastCostMatrix[nodeCount-2][nodeCount-1] = 0 + # else, both 0 + else: + self._leastCostMatrix[nodeCount-2][nodeCount-1] = 0 + self._leastCostMatrix[nodeCount-1][nodeCount-2] = 0 + + + # Search for negative cost circuit in the least cost matrix. + # @param nodeCount node count + # @return the length of the path if found; otherwise 0 + def searchNCC(self, nodeCount): + for k in range(nodeCount): + for i in range(nodeCount): + if ((i != k) and (self._leastCostMatrix[i][k] != XTree.NO_CONNECTION)): + for j in range(nodeCount): + if ((j != k) and (self._leastCostMatrix[k][j] != XTree.NO_CONNECTION)): + less = self._leastCostMatrix[i][k] + self._leastCostMatrix[k][j] + if (less < self._leastCostMatrix[i][j]): + self._leastCostMatrix[i][j] = less + self._pathMatrix[i][j] = k + + # Found! + if ((i == j) and (less < 0)): + clen = 0; # the length of the circuit. + + # Locate the circuit. + #circuit.addElement( Integer(i)) + self._circuit[0] = i + self._circuit[1] = 2 + + #circuit.addElement( Integer(pathMatrix[i][i])) + self._circuit[2] = self._pathMatrix[i][i] + self._circuit[3] = 4 + + #circuit.addElement( Integer(i)) + self._circuit[4] = i + self._circuit[5] = -1 + + clen = 3 + + finish = False + while (not finish): + finish = True + cit = 0 + n = 0 + while (cit < clen - 1): + left = self._circuit[n] + next = self._circuit[n + 1] + if next == -1: + right = -1 + else: + right = self._circuit[next] + + #int middle = pathMatrix[circuit[n-1]][circuit[n]] + middle = self._pathMatrix[left][right] + + if (middle != left): + #circuit.insert( cit, middle ) + self._circuit[clen * 2] = middle + self._circuit[clen * 2 + 1] = next + self._circuit[n + 1] = clen * 2 + clen += 1 + + finish = False + break + n = next + cit += 1 + + return clen + + return 0 + + + # For testing purpose -- print out matrixes + def printMatrix(self, nodeCount): + print "Cost Matrix:" + for i in range(nodeCount): + for j in range(nodeCount): + if (self._leastCostMatrix[i][j] < XTree.NO_CONNECTION): + sys.stdout.write(self._leastCostMatrix[i][j] + "\t") + else: + sys.stdout.write("\t") + print + + print "\nPath Matrix:" + for i in range(nodeCount): + for j in range(nodeCount - 1): + sys.stdout.write(self._pathMatrix[i][j] + "\t") + print self._pathMatrix[i][nodeCount-1] + + + # Write out the diff result -- how doc1 is changed to doc2 + # @param input the first/old xml document + # @param output output file name + # FIXME this is probably completely wrong ... IO is Java-specific!!! + def writeDiff(self, input, output): + try: + out = codecs.open(output, self._encoding) + br = open(input) + + root1 = self._xtree1.getRoot() + root2 = self._xtree2.getRoot() + + # XXX <root > is as valid as <root>, + # but < root> is NOT! + rootTag = "<" + self._xtree1.getTag(root1) + line = br.readLine() + while (line != None): + if (line.indexOf(rootTag) >= 0): + break + out.write(line + "\n") + line = br.readLine() + + self._xtree1.getMatching(root1, self._matchp) + if (self._matchp[0] == XTree.DELETE): + self.writeDeleteNode(out, root1) + self.writeInsertNode(out, root2) + else: + self.writeDiffNode(out, root1, root2) + + out.close() + except IOError as (errno, strerror): + print >>sys.stderr, strerror + + + # Write an element that has been deleted from the old document. + # @param out output file writer + # @param node element id + + def writeDeleteNode(self, out, node): + if (self._xtree1.isElement(node)): + tag = self._xtree1.getTag(node) + out.write("<" + tag) + + # Attributes. + attr = self._xtree1.getFirstAttribute(node) + while (attr > 0): + atag = self._xtree1.getTag(attr) + value = self._xtree1.getAttributeValue(attr) + out.write(" " + atag + "=\"" + value + "\"") + attr = self._xtree1.getNextAttribute(attr) + + # Child nodes. + child = self._xtree1.getFirstChild(node) + + if (child < 0): + out.write("/><?DELETE " + tag + "?>\n") + self._needNewLine = False + return + + out.write("><?DELETE " + tag + "?>\n") + self._needNewLine = False + + while (child > 0): + self.writeMatchNode(out, self._xtree1, child) + child = self._xtree1.getNextSibling(child) + + if (self._needNewLine): + out.write("\n") + self._needNewLine = False + + out.write("</" + tag + ">\n") + else: + out.write("<?DELETE \"" + self.constructText(self._xtree1, node) + + "\"?>\n") + self._needNewLine = False + + + # Write an element that has been inserted from the document. + # @param out output file writer + # @param node element id + + def writeInsertNode(self, out, node): + if (self._xtree2.isElement(node)): + tag = self._xtree2.getTag(node) + out.write("<" + tag) + + # Attributes. + attr = self._xtree2.getFirstAttribute(node) + while (attr > 0): + atag = self._xtree2.getTag(attr) + value = self._xtree2.getAttributeValue(attr) + out.write(" " + atag + "=\"" + value + "\"") + attr = self._xtree2.getNextAttribute(attr) + + # Child nodes. + child = self._xtree2.getFirstChild(node) + if (child < 0): + out.write("/><?INSERT " + tag + "?>\n") + self._needNewLine = False + return + + out.write("><?INSERT " + tag + "?>\n") + self._needNewLine = False + + while (child > 0): + self.writeMatchNode(out, self._xtree2, child) + child = self._xtree2.getNextSibling(child) + + if (self._needNewLine): + out.write("\n") + self._needNewLine = False + + out.write("</" + tag + ">\n") + else: + out.write(self.constructText(self._xtree2, node) + + "<?INSERT?>\n") + self._needNewLine = False + + + # Write an element that is unchanged or in a deleted node or in + # an inserted node. + # @param out output file writer + # @param xtree the document tree + # @param node element id + + def writeMatchNode(self, out, xtree, node): + if (xtree.isElement(node)): + tag = xtree.getTag(node) + if (self._needNewLine): + out.write("\n") + + out.write("<" + tag) + + # Attributes. + attr = xtree.getFirstAttribute(node) + while (attr > 0): + atag = xtree.getTag(attr) + value = xtree.getAttributeValue(attr) + out.write(" " + atag + "=\"" + value + "\"") + attr = xtree.getNextAttribute(attr) + + # Child nodes. + child = xtree.getFirstChild(node) + if (child < 0): + out.write("/>\n") + self._needNewLine = False + return + + out.write(">") + self._needNewLine = True + + while (child > 0): + self.writeMatchNode(out, xtree, child) + child = xtree.getNextSibling(child) + + if (self._needNewLine): + out.write("\n") + self._needNewLine = False + + out.write("</" + tag + ">\n") + else: + out.write(self.constructText(xtree, node)) + self._needNewLine = False + + + # Write one node in the diff result. + # @param out output file writer + # @param node1 the node in the first tree + # @param node2 node1's conterpart in the second tree + + def writeDiffNode(self, out, node1, node2): + if (self._xtree1.isElement(node1)): + tag = self._xtree1.getTag(node1) + if (self._needNewLine): + out.write("\n") + out.write("<" + tag) + + # Attributes. + attr1 = self._xtree1.getFirstAttribute(node1) + diffff = "" + while (attr1 > 0): + atag = self._xtree1.getTag(attr1) + value = self._xtree1.getAttributeValue(attr1) + self._xtree1.getMatching(attr1, self._matchp) + if (self._matchp[0] == XTree.MATCH): + out.write(" " + atag + "=\"" + + value + "\"") + elif (self._matchp[0] == XTree.DELETE): + out.write(" " + atag + "=\"" + + value + "\"") + diffff += "<?DELETE " + atag + "?>" + else: + value2 = self._xtree2.getAttributeValue(self._matchp[1]) + out.write(" " + atag + "=\"" + + value2 + "\"") + diffff += "<?UPDATE " + atag + \ + " FROM \"" + value + "\"?>" + + attr1 = self._xtree1.getNextAttribute(attr1) + + attr2 = self._xtree2.getFirstAttribute(node2) + while (attr2 > 0): + self._xtree2.getMatching(attr2, self._matchp) + if (self._matchp[0] == XTree.INSERT): + atag = self._xtree2.getTag(attr2) + value = self._xtree2.getAttributeValue(attr2) + out.write(" " + atag + "=\"" + + value + "\"") + diffff += "<?INSERT " + atag + "?>" + + attr2 = self._xtree2.getNextAttribute(attr2) + + # Child nodes. + child1 = self._xtree1.getFirstChild(node1) + if (child1 < 0): + out.write("/>" + diffff + "\n") + self._needNewLine = False + return + + out.write(">" + diffff) + self._needNewLine = True + + while (child1 > 0): + self._xtree1.getMatching(child1, self._matchp) + if (self._matchp[0] == XTree.MATCH): + self.writeMatchNode(out, self._xtree1, child1) + elif (self._matchp[0] == XTree.DELETE): + self.writeDeleteNode(out, child1) + else: + self.writeDiffNode(out, child1, self._matchp[1]) + + child1 = self._xtree1.getNextSibling(child1) + + child2 = self._xtree2.getFirstChild(node2) + while (child2 > 0): + self._xtree2.getMatching(child2, self._matchp) + if (self._matchp[0] == XTree.INSERT): + self.writeInsertNode(out, child2) + + child2 = self._xtree2.getNextSibling(child2) + + if (self._needNewLine): + out.write("\n") + self._needNewLine = False + + out.write("</" + tag + ">\n") + else: + out.write(self.constructText(self._xtree2, node2) + + "<?UPDATE FROM \"" + + self.constructText(self._xtree1, node1) + "\"?>") + self._needNewLine = False + + + # Construct the text node -- to handle the possible CDATA sections. + + def constructText(self, xtree, eid): + text = xtree.getText(eid) + cdatalist = xtree.getCDATA(eid) + if (cdatalist == None): + return text + + buf = StringBuffer() + count = cdatalist.size() + lastEnd = 0 + + for i in range(0,count,2): + cdataStart = int(self.cdatalist[i]) + cdataEnd = int(self.cdatalist[i+1]) + + if (cdataStart > lastEnd): + buf.append(text.substring(lastEnd, cdataStart)) + buf.append("<![CDATA[" + + text.substring(cdataStart, cdataEnd) + + "]]>") + lastEnd = cdataEnd + if (lastEnd < text.length()): + buf.append(text.substring(lastEnd)) + + return buf.toString() + +def readParameters(args, parameters): + opid = 0 + if (args.length < 3): + return False + # we are not in the object, so how can we get to these values? + # FIXME global module variables? + elif (args[0].equals("-o")): + _oFlag = True + opid += 1 + elif (args[0].equals("-g")): + _gFlag = True + opid += 1 + + if (args[opid].equals("-p")): + opid += 1 + p = 0 +# try: + p = float(args[opid]) + opid += 1 +# FIXME ... most likely FloatingPointError +# except NumberFormatException: +# return False + + if ((p <= 0) or (p > 1)): + return False + XDiff._NO_MATCH_THRESHOLD = p + + if (args[opid].equals("-e")): + opid += 1 + _encoding = args[opid] + opid += 1 + + if ((args.length - opid) != 3): + return False + parameters.add(args[opid]) + opid += 1 + parameters.add(args[opid]) + opid += 1 + parameters.add(args[opid]) + + return True + +if __name__ == "__main__": + parameters = [] + if (not readParameters(sys.argv, parameters)): + print >>sys.stderr, __doc__ + return + + mydiff = XDiff(parameters[0], parameters[1], parameters[2])
\ No newline at end of file diff --git a/XHash.py b/XHash.py new file mode 100644 index 0000000..f0c1554 --- /dev/null +++ b/XHash.py @@ -0,0 +1,310 @@ +# Copyright (c) 2001 - 2005 +# Yuan Wang. All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Redistributions in any form must be accompanied by information on +# how to obtain complete source code for the X-Diff software and any +# accompanying software that uses the X-Diff software. The source code +# must either be included in the distribution or be available for no +# more than the cost of distribution plus a nominal fee, and must be +# freely redistributable under reasonable conditions. For an executable +# file, complete source code means the source code for all modules it +# contains. It does not include source code for modules or files that +# typically accompany the major components of the operating system on +# which the executable file runs. + +# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, +# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# <code>XHash</code> is an implementaion of DES +class XHash: + private static final int _initialPermutation[] = + 57, 49, 41, 33 , 25, 17, 9, 1, 59, 51, 43, 35, 27, 19, 11, 3, + 61, 53, 45, 37, 29, 21, 13, 5, 63, 55, 47, 39, 31, 23, 15, 7, + 56, 48, 40, 32 ,24, 16, 8, 0, 58, 50, 42, 34, 26, 18, 10, 2, + 60, 52, 44, 36, 28, 20, 12, 4, 62, 54, 46, 38, 30, 22, 14, 6 + }; + + private static final int _finalPermutation[] = + 39, 7, 47, 15, 55, 23, 63, 31, 38, 6, 46, 14, 54, 22, 62, 30, + 37, 5, 45, 13, 53, 21, 61, 29, 36, 4, 44, 12, 52, 20, 60, 28, + 35, 3, 43, 11, 51, 19, 59, 27, 34, 2, 42, 10, 50, 18, 58, 26, + 33, 1, 41, 9, 49, 17, 57, 25, 32, 0, 40, 8, 48, 16, 56, 24 + }; + + private static final int _keyReducePermutation[] = + 60, 52, 44, 36, 59, 51, 43, 35, 27, 19, 11, 3, 58, 50, + 42, 34, 26, 18, 10, 2, 57, 49, 41, 33, 25, 17, 9, 1, + 28, 20, 12, 4, 61, 53, 45, 37, 29, 21, 13, 5, 62, 54, + 46, 38, 30, 22, 14, 6, 63, 55, 47, 39, 31, 23, 15, 7 + }; + + private static final int _keyCompressPermutation[] = + 24, 27, 20, 6, 14, 10, 3, 22, 0, 17, 7, 12, + 8, 23, 11, 5, 16, 26, 1, 9, 19, 25, 4, 15, + 54, 43 ,36, 29, 49, 40, 48, 30, 52, 44, 37, 33, + 46, 35, 50, 41, 28, 53, 51, 55, 32, 45, 39, 42 + }; + + private static final int _keyRot[] = + 1, 2, 4, 6, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25, 27, 28 + }; + + private static final int _sBoxP[][] = + 0x00808200, 0x00000000, 0x00008000, 0x00808202, + 0x00808002, 0x00008202, 0x00000002, 0x00008000, + 0x00000200, 0x00808200, 0x00808202, 0x00000200, + 0x00800202, 0x00808002, 0x00800000, 0x00000002, + 0x00000202, 0x00800200, 0x00800200, 0x00008200, + 0x00008200, 0x00808000, 0x00808000, 0x00800202, + 0x00008002, 0x00800002, 0x00800002, 0x00008002, + 0x00000000, 0x00000202, 0x00008202, 0x00800000, + 0x00008000, 0x00808202, 0x00000002, 0x00808000, + 0x00808200, 0x00800000, 0x00800000, 0x00000200, + 0x00808002, 0x00008000, 0x00008200, 0x00800002, + 0x00000200, 0x00000002, 0x00800202, 0x00008202, + 0x00808202, 0x00008002, 0x00808000, 0x00800202, + 0x00800002, 0x00000202, 0x00008202, 0x00808200, + 0x00000202, 0x00800200, 0x00800200, 0x00000000, + 0x00008002, 0x00008200, 0x00000000, 0x00808002 + }, + 0x40084010, 0x40004000, 0x00004000, 0x00084010, + 0x00080000, 0x00000010, 0x40080010, 0x40004010, + 0x40000010, 0x40084010, 0x40084000, 0x40000000, + 0x40004000, 0x00080000, 0x00000010, 0x40080010, + 0x00084000, 0x00080010, 0x40004010, 0x00000000, + 0x40000000, 0x00004000, 0x00084010, 0x40080000, + 0x00080010, 0x40000010, 0x00000000, 0x00084000, + 0x00004010, 0x40084000, 0x40080000, 0x00004010, + 0x00000000, 0x00084010, 0x40080010, 0x00080000, + 0x40004010, 0x40080000, 0x40084000, 0x00004000, + 0x40080000, 0x40004000, 0x00000010, 0x40084010, + 0x00084010, 0x00000010, 0x00004000, 0x40000000, + 0x00004010, 0x40084000, 0x00080000, 0x40000010, + 0x00080010, 0x40004010, 0x40000010, 0x00080010, + 0x00084000, 0x00000000, 0x40004000, 0x00004010, + 0x40000000, 0x40080010, 0x40084010, 0x00084000 + }, + 0x00000104, 0x04010100, 0x00000000, 0x04010004, + 0x04000100, 0x00000000, 0x00010104, 0x04000100, + 0x00010004, 0x04000004, 0x04000004, 0x00010000, + 0x04010104, 0x00010004, 0x04010000, 0x00000104, + 0x04000000, 0x00000004, 0x04010100, 0x00000100, + 0x00010100, 0x04010000, 0x04010004, 0x00010104, + 0x04000104, 0x00010100, 0x00010000, 0x04000104, + 0x00000004, 0x04010104, 0x00000100, 0x04000000, + 0x04010100, 0x04000000, 0x00010004, 0x00000104, + 0x00010000, 0x04010100, 0x04000100, 0x00000000, + 0x00000100, 0x00010004, 0x04010104, 0x04000100, + 0x04000004, 0x00000100, 0x00000000, 0x04010004, + 0x04000104, 0x00010000, 0x04000000, 0x04010104, + 0x00000004, 0x00010104, 0x00010100, 0x04000004, + 0x04010000, 0x04000104, 0x00000104, 0x04010000, + 0x00010104, 0x00000004, 0x04010004, 0x00010100 + }, + 0x80401000, 0x80001040, 0x80001040, 0x00000040, + 0x00401040, 0x80400040, 0x80400000, 0x80001000, + 0x00000000, 0x00401000, 0x00401000, 0x80401040, + 0x80000040, 0x00000000, 0x00400040, 0x80400000, + 0x80000000, 0x00001000, 0x00400000, 0x80401000, + 0x00000040, 0x00400000, 0x80001000, 0x00001040, + 0x80400040, 0x80000000, 0x00001040, 0x00400040, + 0x00001000, 0x00401040, 0x80401040, 0x80000040, + 0x00400040, 0x80400000, 0x00401000, 0x80401040, + 0x80000040, 0x00000000, 0x00000000, 0x00401000, + 0x00001040, 0x00400040, 0x80400040, 0x80000000, + 0x80401000, 0x80001040, 0x80001040, 0x00000040, + 0x80401040, 0x80000040, 0x80000000, 0x00001000, + 0x80400000, 0x80001000, 0x00401040, 0x80400040, + 0x80001000, 0x00001040, 0x00400000, 0x80401000, + 0x00000040, 0x00400000, 0x00001000, 0x00401040 + }, + 0x00000080, 0x01040080, 0x01040000, 0x21000080, + 0x00040000, 0x00000080, 0x20000000, 0x01040000, + 0x20040080, 0x00040000, 0x01000080, 0x20040080, + 0x21000080, 0x21040000, 0x00040080, 0x20000000, + 0x01000000, 0x20040000, 0x20040000, 0x00000000, + 0x20000080, 0x21040080, 0x21040080, 0x01000080, + 0x21040000, 0x20000080, 0x00000000, 0x21000000, + 0x01040080, 0x01000000, 0x21000000, 0x00040080, + 0x00040000, 0x21000080, 0x00000080, 0x01000000, + 0x20000000, 0x01040000, 0x21000080, 0x20040080, + 0x01000080, 0x20000000, 0x21040000, 0x01040080, + 0x20040080, 0x00000080, 0x01000000, 0x21040000, + 0x21040080, 0x00040080, 0x21000000, 0x21040080, + 0x01040000, 0x00000000, 0x20040000, 0x21000000, + 0x00040080, 0x01000080, 0x20000080, 0x00040000, + 0x00000000, 0x20040000, 0x01040080, 0x20000080 + }, + 0x10000008, 0x10200000, 0x00002000, 0x10202008, + 0x10200000, 0x00000008, 0x10202008, 0x00200000, + 0x10002000, 0x00202008, 0x00200000, 0x10000008, + 0x00200008, 0x10002000, 0x10000000, 0x00002008, + 0x00000000, 0x00200008, 0x10002008, 0x00002000, + 0x00202000, 0x10002008, 0x00000008, 0x10200008, + 0x10200008, 0x00000000, 0x00202008, 0x10202000, + 0x00002008, 0x00202000, 0x10202000, 0x10000000, + 0x10002000, 0x00000008, 0x10200008, 0x00202000, + 0x10202008, 0x00200000, 0x00002008, 0x10000008, + 0x00200000, 0x10002000, 0x10000000, 0x00002008, + 0x10000008, 0x10202008, 0x00202000, 0x10200000, + 0x00202008, 0x10202000, 0x00000000, 0x10200008, + 0x00000008, 0x00002000, 0x10200000, 0x00202008, + 0x00002000, 0x00200008, 0x10002008, 0x00000000, + 0x10202000, 0x10000000, 0x00200008, 0x10002008 + }, + 0x00100000, 0x02100001, 0x02000401, 0x00000000, + 0x00000400, 0x02000401, 0x00100401, 0x02100400, + 0x02100401, 0x00100000, 0x00000000, 0x02000001, + 0x00000001, 0x02000000, 0x02100001, 0x00000401, + 0x02000400, 0x00100401, 0x00100001, 0x02000400, + 0x02000001, 0x02100000, 0x02100400, 0x00100001, + 0x02100000, 0x00000400, 0x00000401, 0x02100401, + 0x00100400, 0x00000001, 0x02000000, 0x00100400, + 0x02000000, 0x00100400, 0x00100000, 0x02000401, + 0x02000401, 0x02100001, 0x02100001, 0x00000001, + 0x00100001, 0x02000000, 0x02000400, 0x00100000, + 0x02100400, 0x00000401, 0x00100401, 0x02100400, + 0x00000401, 0x02000001, 0x02100401, 0x02100000, + 0x00100400, 0x00000000, 0x00000001, 0x02100401, + 0x00000000, 0x00100401, 0x02100000, 0x00000400, + 0x02000001, 0x02000400, 0x00000400, 0x00100001 + }, + 0x08000820, 0x00000800, 0x00020000, 0x08020820, + 0x08000000, 0x08000820, 0x00000020, 0x08000000, + 0x00020020, 0x08020000, 0x08020820, 0x00020800, + 0x08020800, 0x00020820, 0x00000800, 0x00000020, + 0x08020000, 0x08000020, 0x08000800, 0x00000820, + 0x00020800, 0x00020020, 0x08020020, 0x08020800, + 0x00000820, 0x00000000, 0x00000000, 0x08020020, + 0x08000020, 0x08000800, 0x00020820, 0x00020000, + 0x00020820, 0x00020000, 0x08020800, 0x00000800, + 0x00000020, 0x08020020, 0x00000800, 0x00020820, + 0x08000800, 0x00000020, 0x08000020, 0x08020000, + 0x08020020, 0x08000000, 0x00020000, 0x08000820, + 0x00000000, 0x08020820, 0x00020020, 0x08000020, + 0x08020000, 0x08000800, 0x08000820, 0x00000000, + 0x08020820, 0x00020800, 0x00020800, 0x00000820, + 0x00000820, 0x00020020, 0x08000000, 0x08020800 + }; + + private static long _keys[]; + private static char _word[]; + + private static long _initialKey = 1007360890380L; + + /** +# Initialization #1. + public static void initialize() + makeKeys(_initialKey); + _word = new char[64]; + + /** +# Initialization #2. + public static void initialize(long key) + makeKeys(key); + _word = new char[64]; + + public static long hash(String word) + int len = word.length(); + long value = 0L; + for (int start = 0; start < len; start += 64) + if (len - start > 64) + value += (_hash(word.substring(start, start + 64)) ^ 0xffffffffL); + else + value += (_hash(word.substring(start)) ^ 0xffffffffL); + break; + return value; + + /** +# The actual hash function. + private static long _hash(String word) + int len = word.length(); + for (int i = 0; i < len; i++) + _word[i] = word.charAt(i); + int round = len / 8; + int rest = len % 8; + if (rest > 0) + for (int i = 0; i < 8 - rest; i++) + _word[len+i] = 0; + round++; + + int value = 0; + for (int i = 0, pos = 0; i < round; i++, pos += 8) + long todo = 0L + (byte)_word[pos]; + for (int j = 1; j < 8; j++) + todo = (todo << 8) + (byte)_word[pos+j]; + value += des(todo); + + return value; + + private static void makeKeys(long key) + long reduced = permutate(key, _keyReducePermutation); + int l = (int)(reduced >> 28); + int r = (int)(reduced & 0xfffffff); + _keys = new long[16]; + for (int i = 0; i < 16; i++) + _keys[i] = permutate(rotate(l, r, _keyRot[i]), + _keyCompressPermutation); + + private static long des(long w) + long x = permutate(w, _initialPermutation); + int l = (int)(x >>> 32); + int r = (int)x; + for (int i = 0; i < 16; i++) + int tmp = desFunc(r, _keys[i]) ^ l; + l = r; + r = tmp; + long y = ((long)r << 32) | ((long)l & 0xffffffffL); + return permutate(y, _finalPermutation); + + private static long permutate(long k, int p[]) + long s = 0; + for (int i = 0; i < p.length; i++) + if ((k & (1L << p[i])) != 0) + s |= 1L << i; + + return s; + + private static long rotate(int l, int r, int s) + return ((long)(((l<<s) & 0xfffffff) | (l>>>(28 - s))) << 28) | + ((r<<s) & 0xfffffff) | (r>> (28 - s)); + + private static int desFunc(int x, long k) + int p = x >>> 27; + int q = (p & 3) << 4; + int r = x << 5; + p |= r; + r = _sBoxP[0][(int)((k >> 42) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[7][(int)((k >> 0) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[6][(int)((k >> 6) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[5][(int)((k >> 12) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[4][(int)((k >> 18) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[3][(int)((k >> 24) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[2][(int)((k >> 30) ^ p) & 0x3f]; + p >>>= 4; + r |= _sBoxP[1][(int)((k >> 36) ^ (p | q)) & 0x3f]; + return r; @@ -0,0 +1,67 @@ +# Copyright (c) 2001 - 2005 +# Yuan Wang. All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Redistributions in any form must be accompanied by information on +# how to obtain complete source code for the X-Diff software and any +# accompanying software that uses the X-Diff software. The source code +# must either be included in the distribution or be available for no +# more than the cost of distribution plus a nominal fee, and must be +# freely redistributable under reasonable conditions. For an executable +# file, complete source code means the source code for all modules it +# contains. It does not include source code for modules or files that +# typically accompany the major components of the operating system on +# which the executable file runs. + +# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, +# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +import XTree + +# <code>XLut</code> is the hash lookup table for node distance. +class XLut: + +# Constructor. + def __init__(self): + self._xTable = {} + +# Add a node pair and their distance to this table. +# @param eid1 element id #1 +# @param eid2 element id #2 +# @param dist distance + def add(self, eid1, eid2, dist): + key = eid1 + key = key << 32 + key += eid2 + + self._xTable[key] = int(dist) + +# Get the distance of a node pair. +# @param eid1 element id #1 +# @param eid2 element id #2 +# @return distance or -1 if not found + def get(self, eid1, eid2): + key = eid1 + key = key << 32 + key += eid2 + + value = self._xTable[key] + if value == None: + return XTree.NO_CONNECTION + else: + return int(value) diff --git a/XParser.py b/XParser.py new file mode 100644 index 0000000..dcfedc4 --- /dev/null +++ b/XParser.py @@ -0,0 +1,269 @@ +# Copyright (c) 2001 - 2005 +# Yuan Wang. All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Redistributions in any form must be accompanied by information on +# how to obtain complete source code for the X-Diff software and any +# accompanying software that uses the X-Diff software. The source code +# must either be included in the distribution or be available for no +# more than the cost of distribution plus a nominal fee, and must be +# freely redistributable under reasonable conditions. For an executable +# file, complete source code means the source code for all modules it +# contains. It does not include source code for modules or files that +# typically accompany the major components of the operating system on +# which the executable file runs. + +# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, +# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import xml.sax +_PARSER_NAME = "org.apache.xerces.parsers.SAXParser"; + +# FIXME +# This is interesting +# http://www.virtuousprogrammer.com/?page_id=183 +# http://docs.python.org/library/xml.sax.reader.html +# <code>XParser</code> parses an input XML document and constructs an +# <code>XTree</code> +# class XParser extends DefaultHandler implements LexicalHandler +class XParser(xml.sax.handler.ContentHandler): + _setValidation = False + _setNameSpaces = True + _setSchemaSupport = True + _setSchemaFullSupport = False + _setNameSpacePrefixes = True + + _STACK_SIZE = 100 + + private XMLReader _parser; + private XTree _xtree; + private int _idStack[], _lsidStack[]; // id and left sibling + private long _valueStack[]; + private int _stackTop, _currentNodeID; + private boolean _readElement; + private StringBuffer _elementBuffer; + +# Constructor. + def __init__(self): + { + XHash.initialize(); + try + { + _parser = (XMLReader)Class.forName(_PARSER_NAME).newInstance(); + _parser.setFeature("http://xml.org/sax/features/validation", _setValidation); + _parser.setFeature("http://xml.org/sax/features/namespaces", _setNameSpaces); + _parser.setFeature("http://apache.org/xml/features/validation/schema", _setSchemaSupport); + _parser.setFeature("http://apache.org/xml/features/validation/schema-full-checking", _setSchemaFullSupport); + _parser.setFeature("http://xml.org/sax/features/namespace-prefixes", _setNameSpacePrefixes); + + _parser.setContentHandler(this); + _parser.setErrorHandler(this); + _parser.setProperty("http://xml.org/sax/properties/lexical-handler", this); + } + catch (Exception e) + { + System.err.println(e.getMessage()); + System.exit(1); + } + + _idStack = new int[_STACK_SIZE]; + _lsidStack = new int[_STACK_SIZE]; + _valueStack = new long[_STACK_SIZE]; + _stackTop = 0; + _currentNodeID = XTree.NULL_NODE; + _elementBuffer = new StringBuffer(); + } + +# Parse an XML document +# @param uri input XML document +# @return the created XTree + def parse(String uri): + { + _xtree = new XTree(); + _idStack[_stackTop] = XTree.NULL_NODE; + _lsidStack[_stackTop] = XTree.NULL_NODE; + + try + { + _parser.parse(uri); + } + catch (Exception e) + { + System.err.println(e.getMessage()); + System.exit(1); + } + + return _xtree; + } + + // Document handler methods + + public void startElement(String uri, String local, String raw, + Attributes attrs) + { + // if text is mixed with elements + if (_elementBuffer.length() > 0) + { + String text = _elementBuffer.toString().trim(); + if (text.length() > 0) + { + long value = XHash.hash(text); + int tid = _xtree.addText(_idStack[_stackTop], _lsidStack[_stackTop], text, value); + _lsidStack[_stackTop] = tid; + _currentNodeID = tid; + _valueStack[_stackTop] += value; + } + } + + int eid = _xtree.addElement(_idStack[_stackTop], + _lsidStack[_stackTop], local); + + // Update last sibling info. + _lsidStack[_stackTop] = eid; + + // Push + _stackTop++; + _idStack[_stackTop] = eid; + _currentNodeID = eid; + _lsidStack[_stackTop] = XTree.NULL_NODE; + _valueStack[_stackTop] = XHash.hash(local); + + // Take care of attributes + if ((attrs != null) && (attrs.getLength() > 0)) + { + for (int i = 0; i < attrs.getLength(); i++) + { + String name = attrs.getQName(i); + String value = attrs.getValue(i); + long namehash = XHash.hash(name); + long valuehash = XHash.hash(value); + long attrhash = namehash * namehash + + valuehash * valuehash; + int aid = _xtree.addAttribute(eid, _lsidStack[_stackTop], name, value, namehash, attrhash); + + _lsidStack[_stackTop] = aid; + _currentNodeID = aid + 1; + _valueStack[_stackTop] += attrhash * attrhash; + } + } + + _readElement = True; + _elementBuffer = new StringBuffer(); + } + + def characters(char ch[], int start, int length): + { + _elementBuffer.append(ch, start, length); + } + + def endElement(String uri, String local, String raw): + { + if (_readElement) + { + if (_elementBuffer.length() > 0) + { + String text = _elementBuffer.toString(); + long value = XHash.hash(text); + _currentNodeID = + _xtree.addText(_idStack[_stackTop], + _lsidStack[_stackTop], + text, value); + _valueStack[_stackTop] += value; + } + else // an empty element + { + _currentNodeID = + _xtree.addText(_idStack[_stackTop], + _lsidStack[_stackTop], + "", 0); + } + _readElement = False; + } + else + { + if (_elementBuffer.length() > 0) + { + String text = _elementBuffer.toString().trim(); + // More text nodes before end of the element. + if (text.length() > 0) + { + long value = XHash.hash(text); + _currentNodeID = + _xtree.addText(_idStack[_stackTop], + _lsidStack[_stackTop], + text, value); + _valueStack[_stackTop] += value; + } + } + } + + _elementBuffer = new StringBuffer(); + _xtree.addHashValue(_idStack[_stackTop], + _valueStack[_stackTop]); + _valueStack[_stackTop-1] += _valueStack[_stackTop] * + _valueStack[_stackTop]; + _lsidStack[_stackTop-1] = _idStack[_stackTop]; + + // Pop + _stackTop--; + } + + // End of document handler methods + + // Lexical handler methods. + + def startCDATA(): + { + // The text node id should be the one next to the current + // node id. + int textid = _currentNodeID + 1; + String text = _elementBuffer.toString(); + _xtree.addCDATA(textid, text.length()); + } + + def endCDATA(): + { + int textid = _currentNodeID + 1; + String text = _elementBuffer.toString(); + _xtree.addCDATA(textid, text.length()); + } + + // Following functions are not implemented. + def comment(char[] ch, int start, int length): + { + } + + def startDTD(String name, String publicId, String systemId): + { + } + + def endDTD(): + { + } + + def startEntity(String name): + { + } + + def endEntity(String name): + { + } + + // End of lexical handler methods. +} diff --git a/XTree.py b/XTree.py new file mode 100644 index 0000000..eef0af9 --- /dev/null +++ b/XTree.py @@ -0,0 +1,423 @@ +# Copyright (c) 2001 - 2005 +# Yuan Wang. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. Redistributions in any form must be accompanied by information on +# how to obtain complete source code for the X-Diff software and any +# accompanying software that uses the X-Diff software. The source code +# must either be included in the distribution or be available for no +# more than the cost of distribution plus a nominal fee, and must be +# freely redistributable under reasonable conditions. For an executable +# file, complete source code means the source code for all modules it +# contains. It does not include source code for modules or files that +# typically accompany the major components of the operating system on +# which the executable file runs. +# +# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, +# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +MATCH = 0 +CHANGE = 1 +NO_MATCH = -1 +INSERT = -1 +DELETE = -1 +NULL_NODE = -1 +NO_CONNECTION = 1048576 + +_TOP_LEVEL_CAPACITY = 16384 +_BOT_LEVEL_CAPACITY = 4096 + + +# <code>XTree</code> provides a DOM-like interface but somehow simplified +# Ideally, it can be replaced by any other DOM parser output tree structures. +class XTree: +# private _topCap, _botCap +# private _elementIndex, _tagIndex, self._valueCount +# private self._firstChild[][], self._nextSibling[][] +# private self._childrenCount[][], _valueIndex[][] +# private boolean self._isAttribute[][] +# private self._matching[][] +# private long self._hashValue[][] +# private String _value[][] +# private Hashtable self._tagNames, _cdataTable + + def __init__(self, topcap=None, botcap=None): + self._topCap = _TOP_LEVEL_CAPACITY + self._botCap = _BOT_LEVEL_CAPACITY + if topcap: + self._topCap = topcap + if botcap: + self._botCap = botcap + self._initialize() + + # Initialization. + def _initialize(self): + self._root = 0 + self._firstChild = [] + self._nextSibling = [] + self._isAttribute = [] + self._valueIndex = [] + self._matching = [] + self._childrenCount = [] + self._hashValue = [] + self._value = [] + + self._value[0] = [] + self._tagNames = [] + + # This hashtable is used to record CDATA section info. + # The key is the text node id, the value is the list of + # (start,end) position pair of each CDATA section. + self._cdataTable = {} + + self._elementIndex = -1 + self._tagIndex = -1 + self._valueCount = self._botCap - 1 + + # ID Expansion + def _expand(self, topid): + self._firstChild[topid] = [] + self._nextSibling[topid] = [] + self._childrenCount[topid] = [] + self._matching[topid] = [] + self._valueIndex[topid] = [] + self._hashValue[topid] = [] + self._isAttribute[topid] = [] + + for i in range(self._botCap): + self._firstChild[topid][i] = NULL_NODE + self._nextSibling[topid][i] = NULL_NODE + self._childrenCount[topid][i]= 0 + self._matching[topid][i] = MATCH + self._valueIndex[topid][i] = -1 + self._isAttribute[topid][i] = False + + # Start -- methods for constructing a tree. + # Add a new element to the tree. + # @param pid parent id + # @param lsid left-side sibling id + # @param tagName element name + # @return the element id in the tree. + def addElement(self, pid, lsid, tagName): + self._elementIndex += 1 + + topid = self._elementIndex / self._botCap + botid = self._elementIndex % self._botCap + if (botid == 0): + self._expand(topid) + + # Check if we've already had the tag + tagID = self._tagNames[tagName] + if (tagID != None): + self._valueIndex[topid][botid] = tagID.intValue() + else: + self._tagIndex += 1 + tagID = int(self._tagIndex) + self._value[0][self._tagIndex] = tagName + self._tagNames.append(tagName, tagID) + self._valueIndex[topid][botid] = self._tagIndex + + if (pid == NULL_NODE): + return self._elementIndex + + ptopid = pid / self._botCap + pbotid = pid % self._botCap + # parent-child relation or sibling-sibling relation + if (lsid == NULL_NODE): + self._firstChild[ptopid][pbotid] = self._elementIndex + else: + self._nextSibling[lsid/self._botCap][lsid%self._botCap] = self._elementIndex + + # update children count + self._childrenCount[ptopid][pbotid] += 1 + + return self._elementIndex + + # Add a text node. + # @param eid element id + # @param lsid the sibling id on the left + # @param text text value + # @param value hash value + def addText(self, eid, lsid, text, value): + self._elementIndex += 1 + topid = self._elementIndex / self._botCap + botid = self._elementIndex % self._botCap + if (botid == 0): + self._expand(topid) + + etopid = eid / self._botCap + ebotid = eid % self._botCap + if (lsid == NULL_NODE): + self._firstChild[etopid][ebotid] = self._elementIndex + else: + self._nextSibling[lsid/self._botCap][lsid%self._botCap] = self._elementIndex + + self._childrenCount[etopid][ebotid] += 1 + self._hashValue[topid][botid] = value + + self._valueCount += 1 + vtopid = self._valueCount / self._botCap + vbotid = self._valueCount % self._botCap + if (vbotid == 0): + self._value[vtopid] = str[self._botCap] + + self._value[vtopid][vbotid] = text + self._valueIndex[topid][botid] = self._valueCount + + return self._elementIndex + + # Add an attribute. + # @param eid element id + # @param lsid the sibling id on the left + # @param name attribute name + # @param value attribute value + # @param valuehash hash value of the value + # @param attrhash hash value of the entire attribute + # @return the element id of the attribute + def addAttribute(self, eid, lsid, name, value, valuehash, attrhash): + # attribute name first. + aid = self.addElement(eid, lsid, name) + + # attribute value second. + self.addText(aid, NULL_NODE, value, valuehash) + + # hash value third + atopid = aid / self._botCap + abotid = aid % self._botCap + self._isAttribute[atopid][abotid] = True + self._hashValue[atopid][abotid] = attrhash + + return aid + + # Add more information (hash value) to an element node. + # @param eid element id + # @param value extra hash value + def addHashValue(self, eid, value): + self._hashValue[eid/self._botCap][eid%self._botCap] = value + + # Add a CDATA section (either a start or an end) to the CDATA + # hashtable, in which each entry should have an even number of + # position slots. + # @param eid The text node id + # @param position the section tag position + def addCDATA(self, eid, position): + key = int(eid) + value = self._cdataTable[key] + if (value == None): + elem_list = [] + elem_list.append(position) + self._cdataTable[key] = elem_list + else: + elem_list = value + elem_list.append(position) + self._cdataTable[key] = elem_list + + # Add matching information. + # @param eid element id + # @param match ?match and matched element id + def addMatching(self, eid, match): + if (match[0] == NO_MATCH): + self._matching[eid/self._botCap][eid%self._botCap] = NO_MATCH + elif (match[0] == MATCH): + self._matching[eid/self._botCap][eid%self._botCap] = MATCH + else: + self._matching[eid/self._botCap][eid%self._botCap] = match[1] + 1 + + # End -- methods for constructing a tree. + + # Start -- methods for accessing a tree. + + # Get matching information. + # @param eid element id + # @param match ?change and matched element id + def getMatching(self, eid, match): + mid = self._matching[eid/self._botCap][eid%self._botCap] + if (mid == NO_MATCH): + match[0] = NO_MATCH + elif (mid == MATCH): + match[0] = MATCH + else: + match[0] = CHANGE + match[1] = mid - 1 + + # Get the root element id. + def getRoot(self): + return self._root + + # Get the first child of a node. + # @param eid element id + def getFirstChild(self, eid): + cid = self._firstChild[eid/self._botCap][eid%self._botCap] + while (cid > self._root): + ctopid = cid / self._botCap + cbotid = cid % self._botCap + if (self._isAttribute[ctopid][cbotid]): + cid = self._nextSibling[ctopid][cbotid] + else: + return cid + + return NULL_NODE + + # Get the next sibling of a node. + # @param eid element id + def getNextSibling(self, eid): + return self._nextSibling[eid/self._botCap][eid%self._botCap] + + # Get the first attribute of a node. + # @param eid element id + def getFirstAttribute(self, eid): + aid = self._firstChild[eid/self._botCap][eid%self._botCap] + if ((aid > self._root) and (self._isAttribute[aid/self._botCap][aid%self._botCap])): + return aid + else: + return NULL_NODE + + # Get the next attribute of a node. + # @param aid attribute id + def getNextAttribute(self, aid): + aid1 = self._nextSibling[aid/self._botCap][aid%self._botCap] + if ((aid1 > self._root) and (self._isAttribute[aid1/self._botCap][aid1%self._botCap])): + return aid1 + else: + return NULL_NODE + + # Get the attribute value. + # @param aid attribute id + def getAttributeValue(self, aid): + cid = self._firstChild[aid/self._botCap][aid%self._botCap] + index = self._valueIndex[cid/self._botCap][cid%self._botCap] + if (index > 0): + return self._value[index/self._botCap][index%self._botCap] + else: + return "" + + # Get the hash value of a node. + # @param eid element id + def getHashValue(self, eid): + return self._hashValue[eid/self._botCap][eid%self._botCap] + + # Get the CDATA section position list of a text node. + # @param eid element id + # @return position list which is a vector or None if no CDATA + def getCDATA(self, eid): + return self._cdataTable[eid] + + # Get the childern count of a node. + # @param eid element id + def getChildrenCount(self, eid): + return self._childrenCount[eid/self._botCap][eid%self._botCap] + + # Get the # of all decendents of a node. + # @param eid element id + def getDecendentsCount(self, eid): + topid = eid / self._botCap + botid = eid % self._botCap + count = self._childrenCount[topid][botid] + if (count == 0): + return 0 + + cid = self._firstChild[topid][botid] + while (cid > NULL_NODE): + count += self.getDecendentsCount(cid) + cid = self._nextSibling[cid/self._botCap][cid%self._botCap] + + return count + + # Get the value index of a node + # @param eid element id + def getValueIndex(self, eid): + return self._valueIndex[eid/self._botCap][eid%self._botCap] + + # Get the value of a leaf node + # @param index value index + def getValue(self, index): + return self._value[index/self._botCap][index%self._botCap] + + # Get the tag of an element node + # @param eid element id + def getTag(self, eid): + index = self._valueIndex[eid/self._botCap][eid%self._botCap] + return self._value[0][index] + + # Get the text value of a leaf node + # @param eid element id + def getText(self, eid): + index = self._valueIndex[eid/self._botCap][eid%self._botCap] + if (index >= self._botCap): + return self._value[index/self._botCap][index%self._botCap] + else: + return "" + + # Check if a node an element node. + # @param eid element id + def isElement(self, eid): + vindex = self._valueIndex[eid/self._botCap][eid%self._botCap] + if (vindex < self._botCap): + return True + else: + return False + + # Check if a node is an attribute node. + # @param eid element id + def isAttribute(self, eid): + return self._isAttribute[eid/self._botCap][eid%self._botCap] + + # Check if a node an leaf text node. + # @param edi element id + def isLeaf(self, eid): + index = self._valueIndex[eid/self._botCap][eid%self._botCap] + if (index < self._botCap): + return False + else: + return True + + # End -- methods for accessing a tree. + + # For testing purpose. + def dump(self, eid = None): + if eid: + topid = eid / self._botCap + botid = eid % self._botCap + vid = self._valueIndex[topid][botid] + vtopid = vid / self._botCap + vbotid = vid % self._botCap + print eid + "\t" + \ + self._firstChild[topid][botid] + "\t" + \ + self._nextSibling[topid][botid] + "\t" + \ + self._isAttribute[topid][botid] + "\t" + \ + self._childrenCount[topid][botid] + "\t" + \ + self._hashValue[topid][botid] + "\t" + \ + self._matching[topid][botid] + "\t" + \ + self._value[vtopid][vbotid] + else: + print "eid\tfirstC\tnextS\tattr?\tcCount\thash\tmatch\tvalue" + for i in range(self._root,self._elementIndex+1): + topid = i / self._botCap + botid = i % self._botCap + vid = self._valueIndex[topid][botid] + vtopid = vid / self._botCap + vbotid = vid % self._botCap + print i + "\t" + \ + self._firstChild[topid][botid] + "\t" + \ + self._nextSibling[topid][botid] + "\t" + \ + self._isAttribute[topid][botid] + "\t" + \ + self._childrenCount[topid][botid] + "\t" + \ + self._hashValue[topid][botid] + "\t" + \ + self._matching[topid][botid] + "\t" + \ + self._value[vtopid][vbotid] |