aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Matěj Cepl <mcepl@redhat.com> 2011-10-23 04:24:45 +0200
committer: Matěj Cepl <mcepl@redhat.com> 2011-10-24 23:26:31 +0200
commit: 7f7b2e533dbdfaef16f41902e750588f58e2cdaf (patch)
tree: f401c7d61287d3e6484363459eefbf7591b76f36
parent: 81f605fb4885efbfccf615848a3f5ee6a1452025 (diff)
download: json_diff-7f7b2e533dbdfaef16f41902e750588f58e2cdaf.tar.gz
Some biggest classes are rewritten to be syntactically correct.
-rw-r--r--  XDiff.py    1996
-rw-r--r--  XHash.py     310
-rw-r--r--  XLut.py       67
-rw-r--r--  XParser.py   269
-rw-r--r--  XTree.py     423
5 files changed, 3065 insertions, 0 deletions
diff --git a/XDiff.py b/XDiff.py
new file mode 100644
index 0000000..ad0e93c
--- /dev/null
+++ b/XDiff.py
@@ -0,0 +1,1996 @@
+#!/usr/bin/env python
+"""
+java XDiff [-o|-g] [-p percent] [-e encoding] xml_file1 xml_file2 diff_result
+Options:
+ The default setting is "-o -p 0.3 -e UTF8"
+ -o The optimal mode, to get the minimum editing distance.
+ -g The greedy mode, to find a difference quickly.
+ -p The maximum change percentage allowed.
+ Default value: 1.0 for -o mode; 0.3 for -g mode.
+ -e The encoding of the output file.
+ Default value: UTF8.
+"""
+# Copyright (c) 2001 - 2005
+# Yuan Wang. All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Redistributions in any form must be accompanied by information on
+# how to obtain complete source code for the X-Diff software and any
+# accompanying software that uses the X-Diff software. The source code
+# must either be included in the distribution or be available for no
+# more than the cost of distribution plus a nominal fee, and must be
+# freely redistributable under reasonable conditions. For an executable
+# file, complete source code means the source code for all modules it
+# contains. It does not include source code for modules or files that
+# typically accompany the major components of the operating system on
+# which the executable file runs.
+
+# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+#import java.io.BufferedReader
+#import java.io.FileReader
+#import java.io.FileOutputStream
+#import java.io.OutputStreamWriter
+#import java.io.IOException
+#import java.util.Random
+#import java.util.Vector
+import sys, time, codecs
+import XTree, XLut
+from XParser import XParser
+
+# <code>XDiff</code> computes the difference of two input XML documents.
+
+_CIRCUIT_SIZE = 2048  # NOTE(review): not referenced in the visible code; presumably the capacity of XDiff._circuit -- confirm
+_MATRIX_SIZE = 1024  # row count and row length used by XDiff.__init__ for _leastCostMatrix and _pathMatrix
+_ATTRIBUTE_SIZE = 1024  # NOTE(review): not referenced in the visible code; presumably the capacity of the attribute scratch lists -- confirm
+_TEXT_SIZE = 1024  # NOTE(review): not referenced in the visible code; presumably the capacity of the text-node scratch lists -- confirm
+
+
+class XDiff:
+ # Optimal mode ("-o" in the module usage text).  Read by matchListO:
+ # when False, pairs whose distance exceeds the no-match threshold are
+ # discarded.
+ _oFlag = False
+ # Greedy mode ("-g" in the module usage text).  xdiff and _xdiff read
+ # self._gFlag, so it needs a class-level default just like _oFlag;
+ # without one every diff that reaches those branches raises
+ # AttributeError.
+ _gFlag = False
+ # Fraction of the insert/delete cost above which two nodes are treated
+ # as unrelated (used by matchListO and matchList).
+ _NO_MATCH_THRESHOLD = 0.3
+ # Number of samples matchList takes; node lists no longer than this
+ # are always handled by the exhaustive matchListO.
+ _sampleCount = 3
+ _DEBUG = False
+ # NOTE(review): presumably the output encoding ("-e" option) -- it is
+ # not read anywhere in the visible code; confirm against writeDiff.
+ _encoding = "UTF8"
+
+# Instance attributes (most are created in __init__):
+#   _xtree1, _xtree2               parsed trees of the two input files
+#   _xlut                          XLut instance caching node distances
+#   _leastCostMatrix, _pathMatrix  two-dimensional integer work matrices
+#   _circuit                       integer work list
+#
+#   _attrList1, _attrList2, _textList1, _textList2   node-id scratch lists
+#   _attrMatch, _textMatch1, _textMatch2             boolean scratch lists
+#   _attrHash, _textHash                             hash-value scratch lists
+#   _attrTag                                         tag-name scratch list
+#
+#   _matchp       two-slot list [match kind, partner node id]
+#   _needNewLine  boolean
+
+
+
+ # Constructor
+ # @param input1 input file #1
+ # @param input2 input file #2
+ # @param output output file
+
+ def __init__(self, input1, input2, output):
+     """Parse both inputs, diff them and write the result.
+
+     When the root hash values are equal only a timing report is
+     printed.  Otherwise the trees are matched (via xdiff unless the
+     root tags differ), the result is written with writeDiff and a
+     timing report is printed.
+
+     @param input1 input file #1
+     @param input2 input file #2
+     @param output output file
+     """
+     def elapsed_ms(start, end):
+         # time.time() returns seconds as a float; the report is in
+         # milliseconds and must be an int to format with %d.
+         return int(round((end - start) * 1000))
+
+     # Parse input files, timing each parse separately.
+     t0 = time.time()
+     self._xtree1 = XParser().parse(input1)
+     t1 = time.time()
+     self._xtree2 = XParser().parse(input2)
+     t2 = time.time()
+
+     # check both root nodes.
+     root1 = self._xtree1.getRoot()
+     root2 = self._xtree2.getRoot()
+     if self._xtree1.getHashValue(root1) == self._xtree2.getHashValue(root2):
+         print("No difference!")
+         print("Execution time: %d ms" % elapsed_ms(t0, t2))
+         print("Parsing %s: %d ms" % (input1, elapsed_ms(t0, t1)))
+         print("Parsing %s: %d ms" % (input2, elapsed_ms(t1, t2)))
+         return
+
+     self._xlut = XLut.XLut()
+     # Shared two-slot record [match kind, partner node id] handed to
+     # addMatching throughout the class.
+     self._matchp = [0, 0]
+
+     if self._xtree1.getTag(root1) != self._xtree2.getTag(root2):
+         print("The root is changed!")
+         self._matchp[0] = XTree.NO_MATCH
+         self._xtree1.addMatching(root1, self._matchp)
+         self._xtree2.addMatching(root2, self._matchp)
+     else:
+         # initialize data structures.  The diff methods store into
+         # these lists by index, so they must be allocated up front.
+         self._attrList1 = [0] * _ATTRIBUTE_SIZE
+         self._attrList2 = [0] * _ATTRIBUTE_SIZE
+         self._attrMatch = [False] * _ATTRIBUTE_SIZE
+         self._attrHash = [0] * _ATTRIBUTE_SIZE
+         self._attrTag = [None] * _ATTRIBUTE_SIZE
+
+         self._textList1 = [0] * _TEXT_SIZE
+         self._textList2 = [0] * _TEXT_SIZE
+         self._textMatch1 = [False] * _TEXT_SIZE
+         self._textMatch2 = [False] * _TEXT_SIZE
+         self._textHash = [0] * _TEXT_SIZE
+
+         # One independent row per index; [[0] * n] * n would alias
+         # the same row object n times.
+         self._leastCostMatrix = [[0] * _MATRIX_SIZE
+                                  for i in range(_MATRIX_SIZE)]
+         self._pathMatrix = [[0] * _MATRIX_SIZE
+                             for i in range(_MATRIX_SIZE)]
+         self._circuit = [0] * _CIRCUIT_SIZE
+
+         # The roots match each other; diff everything below them.
+         self._matchp[0] = XTree.CHANGE
+         self._matchp[1] = root2
+         self._xtree1.addMatching(root1, self._matchp)
+         self._matchp[1] = root1
+         self._xtree2.addMatching(root2, self._matchp)
+         self.xdiff(root1, root2, False)
+
+     t3 = time.time()
+     self.writeDiff(input1, output)
+     t4 = time.time()
+
+     print("Difference detected!")
+     print("Execution time: %d ms" % elapsed_ms(t0, t4))
+     print("Parsing %s: %d ms" % (input1, elapsed_ms(t0, t1)))
+     print("Parsing %s: %d ms" % (input2, elapsed_ms(t1, t2)))
+     print("Diffing: %d ms" % elapsed_ms(t2, t3))
+     print("Writing result: %d ms" % elapsed_ms(t3, t4))
+
+
+ # Diff two element lists
+ # This is the official one that records matching top-down
+ # @param pid1 parent id #1
+ # @param pid2 parent id #2
+ # @param matchFlag indicates if distance computation needed
+ def xdiff(self, pid1, pid2, matchFlag):
+     """Diff the children of two matched elements and record the matching.
+
+     Attributes are handled first, then text children, then element
+     children.  Element children with equal hash values are paired by
+     _matchFilter; the rest are grouped by tag and passed to matchListO
+     or matchList, which call back into this method for matched pairs.
+
+     @param pid1      parent id #1 (a node of self._xtree1)
+     @param pid2      parent id #2 (a node of self._xtree2)
+     @param matchFlag passed unchanged to matchListO/matchList, where
+                      True means distances are read from self._xlut
+                      instead of being computed
+     """
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+
+     # diff attributes.
+     attrs1 = []
+     attr = tree1.getFirstAttribute(pid1)
+     while attr != XTree.NULL_NODE:
+         attrs1.append(attr)
+         attr = tree1.getNextAttribute(attr)
+     attrs2 = []
+     attr = tree2.getFirstAttribute(pid2)
+     while attr != XTree.NULL_NODE:
+         attrs2.append(attr)
+         attr = tree2.getNextAttribute(attr)
+     attrCount1 = len(attrs1)
+     attrCount2 = len(attrs2)
+     # diffAttributes reads the shared scratch lists.  Slice assignment
+     # fills them correctly whether they are preallocated or empty.
+     self._attrList1[:attrCount1] = attrs1
+     self._attrList2[:attrCount2] = attrs2
+
+     if attrCount1 > 0 and attrCount2 > 0:
+         self.diffAttributes(attrCount1, attrCount2)
+     elif attrCount1 > 0:
+         self._matchp[0] = XTree.NO_MATCH
+         for attr in attrs1:
+             tree1.addMatching(attr, self._matchp)
+     elif attrCount2 > 0:
+         self._matchp[0] = XTree.NO_MATCH
+         for attr in attrs2:
+             tree2.addMatching(attr, self._matchp)
+
+     # Match element nodes.
+     count1 = tree1.getChildrenCount(pid1) - attrCount1
+     count2 = tree2.getChildrenCount(pid2) - attrCount2
+
+     if count1 == 0:
+         # Everything under pid2 is unmatched.  Iterating count2 times
+         # also covers count2 == 0, where there is no first child.
+         self._matchp[0] = XTree.NO_MATCH
+         for node2 in self._iterChildren(tree2, pid2, count2):
+             tree2.addMatching(node2, self._matchp)
+         return
+     if count2 == 0:
+         self._matchp[0] = XTree.NO_MATCH
+         for node1 in self._iterChildren(tree1, pid1, count1):
+             tree1.addMatching(node1, self._matchp)
+         return
+
+     if count1 == 1 and count2 == 1:
+         node1 = tree1.getFirstChild(pid1)
+         node2 = tree2.getFirstChild(pid2)
+         if tree1.getHashValue(node1) == tree2.getHashValue(node2):
+             return
+
+         isE1 = tree1.isElement(node1)
+         isE2 = tree2.isElement(node2)
+         if isE1 and isE2:
+             paired = tree1.getTag(node1) == tree2.getTag(node2)
+         else:
+             # Two text nodes pair up; element vs. text never does.
+             paired = not isE1 and not isE2
+
+         if paired:
+             self._markChanged(node1, node2)
+             if isE1:
+                 self.xdiff(node1, node2, matchFlag)
+         else:
+             self._matchp[0] = XTree.NO_MATCH
+             tree1.addMatching(node1, self._matchp)
+             tree2.addMatching(node2, self._matchp)
+         return
+
+     # General case: split the children into elements and text nodes.
+     elements1 = []
+     texts1 = []
+     for child in self._iterChildren(tree1, pid1, count1):
+         if tree1.isElement(child):
+             elements1.append(child)
+         else:
+             texts1.append(child)
+     elements2 = []
+     texts2 = []
+     for child in self._iterChildren(tree2, pid2, count2):
+         if tree2.isElement(child):
+             elements2.append(child)
+         else:
+             texts2.append(child)
+     elementCount1 = len(elements1)
+     elementCount2 = len(elements2)
+     textCount1 = len(texts1)
+     textCount2 = len(texts2)
+     self._textList1[:textCount1] = texts1
+     self._textList2[:textCount2] = texts2
+
+     # Match text nodes.
+     if textCount1 > 0 and textCount2 > 0:
+         self.diffText(textCount1, textCount2)
+     elif textCount1 > 0:
+         self._matchp[0] = XTree.NO_MATCH
+         for text in texts1:
+             tree1.addMatching(text, self._matchp)
+     elif textCount2 > 0:
+         self._matchp[0] = XTree.NO_MATCH
+         for text in texts2:
+             tree2.addMatching(text, self._matchp)
+
+     # _matchFilter stores into these by index, so size them first.
+     matched1 = [False] * elementCount1
+     matched2 = [False] * elementCount2
+     mcount = self._matchFilter(elements1, elementCount1,
+                                elements2, elementCount2,
+                                matched1, matched2)
+
+     if elementCount1 == mcount and elementCount2 == mcount:
+         return
+     if elementCount1 == mcount:
+         self._matchp[0] = XTree.NO_MATCH
+         for i in range(elementCount2):
+             if not matched2[i]:
+                 tree2.addMatching(elements2[i], self._matchp)
+         return
+     if elementCount2 == mcount:
+         self._matchp[0] = XTree.NO_MATCH
+         for i in range(elementCount1):
+             if not matched1[i]:
+                 tree1.addMatching(elements1[i], self._matchp)
+         return
+
+     # Write the list of unmatched nodes, one tag group at a time.
+     ucount1 = elementCount1 - mcount
+     ucount2 = elementCount2 - mcount
+     unmatched1 = [0] * ucount1
+     unmatched2 = [0] * ucount2
+     muc1 = 0
+     muc2 = 0
+     start = 0
+
+     while muc1 < ucount1 and muc2 < ucount2:
+         # The next still-unmatched element of list 1 names the group.
+         while start < elementCount1 and matched1[start]:
+             start += 1
+         startTag = tree1.getTag(elements1[start])
+         uele1 = 0
+         uele2 = 0
+         muc1 += 1
+         unmatched1[uele1] = elements1[start]
+         uele1 += 1
+         matched1[start] = True
+         start += 1
+
+         i = start
+         while i < elementCount1 and muc1 < ucount1:
+             if not matched1[i] and startTag == tree1.getTag(elements1[i]):
+                 matched1[i] = True
+                 muc1 += 1
+                 unmatched1[uele1] = elements1[i]
+                 uele1 += 1
+             i += 1
+
+         i = 0
+         while i < elementCount2 and muc2 < ucount2:
+             if not matched2[i] and startTag == tree2.getTag(elements2[i]):
+                 matched2[i] = True
+                 muc2 += 1
+                 unmatched2[uele2] = elements2[i]
+                 uele2 += 1
+             i += 1
+
+         if uele2 == 0:
+             self._matchp[0] = XTree.NO_MATCH
+             for i in range(uele1):
+                 tree1.addMatching(unmatched1[i], self._matchp)
+         elif uele1 == 1 and uele2 == 1:
+             self._markChanged(unmatched1[0], unmatched2[0])
+             self.xdiff(unmatched1[0], unmatched2[0], matchFlag)
+         # To find minimal-cost matching between those unmatched.
+         elif uele1 >= uele2:
+             if uele2 <= self._sampleCount or not self._gFlag:
+                 self.matchListO(unmatched1, unmatched2, uele1, uele2,
+                                 True, matchFlag)
+             else:
+                 self.matchList(unmatched1, unmatched2, uele1, uele2,
+                                True, matchFlag)
+         else:
+             # The longer list always goes first; treeOrder=False tells
+             # the matcher that the two lists were swapped.
+             if uele1 <= self._sampleCount or not self._gFlag:
+                 self.matchListO(unmatched2, unmatched1, uele2, uele1,
+                                 False, matchFlag)
+             else:
+                 self.matchList(unmatched2, unmatched1, uele2, uele1,
+                                False, matchFlag)
+
+     # Whatever one side still has left has no partner.
+     if muc1 < ucount1:
+         self._matchp[0] = XTree.NO_MATCH
+         for i in range(start, elementCount1):
+             if not matched1[i]:
+                 tree1.addMatching(elements1[i], self._matchp)
+     elif muc2 < ucount2:
+         self._matchp[0] = XTree.NO_MATCH
+         for i in range(elementCount2):
+             if not matched2[i]:
+                 tree2.addMatching(elements2[i], self._matchp)
+
+ def _markChanged(self, node1, node2):
+     """Record node1 (tree 1) and node2 (tree 2) as a changed pair."""
+     self._matchp[0] = XTree.CHANGE
+     self._matchp[1] = node2
+     self._xtree1.addMatching(node1, self._matchp)
+     self._matchp[1] = node1
+     self._xtree2.addMatching(node2, self._matchp)
+
+ @staticmethod
+ def _iterChildren(tree, pid, count):
+     """Yield count children of pid: getFirstChild, then getNextSibling.
+
+     A generator keeps each caller's per-node work interleaved with the
+     sibling lookups.  Nothing is yielded, and the tree is not touched,
+     when count is not positive.
+     """
+     if count <= 0:
+         return
+     node = tree.getFirstChild(pid)
+     yield node
+     for i in range(1, count):
+         node = tree.getNextSibling(node)
+         yield node
+
+
+ # Diff and match two lists of attributes
+ # @param attrCount1 number of attributes in the 1st list
+ # @param attrCount2 number of attributes in the 2nd list
+ def diffAttributes(self, attrCount1, attrCount2):
+     """Match the attributes held in self._attrList1 and self._attrList2.
+
+     An attribute of list 1 is paired with the first still-unpaired
+     attribute of list 2 that has the same hash value (identical, so
+     nothing is recorded) or the same tag (recorded as CHANGE together
+     with the two first children).  Everything left over is recorded as
+     NO_MATCH.
+
+     @param attrCount1 number of attributes in the 1st list
+     @param attrCount2 number of attributes in the 2nd list
+     """
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+     list1 = self._attrList1
+     list2 = self._attrList2
+
+     if attrCount1 == 1 and attrCount2 == 1:
+         attr1 = list1[0]
+         attr2 = list2[0]
+         if tree1.getHashValue(attr1) == tree2.getHashValue(attr2):
+             return
+         if tree1.getTag(attr1) == tree2.getTag(attr2):
+             self._markAttrChanged(attr1, attr2)
+         else:
+             self._matchp[0] = XTree.NO_MATCH
+             tree1.addMatching(attr1, self._matchp)
+             tree2.addMatching(attr2, self._matchp)
+         return
+
+     # Fill the shared scratch lists.  Slice assignment works whether
+     # they are preallocated or still empty.
+     self._attrHash[:attrCount2] = [tree2.getHashValue(list2[i])
+                                    for i in range(attrCount2)]
+     self._attrTag[:attrCount2] = [tree2.getTag(list2[i])
+                                   for i in range(attrCount2)]
+     self._attrMatch[:attrCount2] = [False] * attrCount2
+     hashes2 = self._attrHash
+     tags2 = self._attrTag
+     taken2 = self._attrMatch
+
+     matchCount = 0
+     for i in range(attrCount1):
+         attr1 = list1[i]
+         ah1 = tree1.getHashValue(attr1)
+         tag1 = tree1.getTag(attr1)
+
+         found = False
+         for j in range(attrCount2):
+             if taken2[j]:
+                 continue
+             identical = (ah1 == hashes2[j])
+             if identical or tag1 == tags2[j]:
+                 taken2[j] = True
+                 matchCount += 1
+                 if not identical:
+                     # Same name, different value.
+                     self._markAttrChanged(attr1, list2[j])
+                 found = True
+                 break
+
+         if not found:
+             self._matchp[0] = XTree.NO_MATCH
+             tree1.addMatching(attr1, self._matchp)
+
+     if matchCount != attrCount2:
+         self._matchp[0] = XTree.NO_MATCH
+         for i in range(attrCount2):
+             if not taken2[i]:
+                 tree2.addMatching(list2[i], self._matchp)
+
+ def _markAttrChanged(self, attr1, attr2):
+     """Record two attributes, and their first children, as CHANGE pairs."""
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+     self._matchp[0] = XTree.CHANGE
+     self._matchp[1] = attr2
+     tree1.addMatching(attr1, self._matchp)
+     self._matchp[1] = attr1
+     tree2.addMatching(attr2, self._matchp)
+
+     # NOTE(review): the first child is presumably the attribute's
+     # value node -- confirm against XTree.
+     tid1 = tree1.getFirstChild(attr1)
+     tid2 = tree2.getFirstChild(attr2)
+     self._matchp[1] = tid2
+     tree1.addMatching(tid1, self._matchp)
+     self._matchp[1] = tid1
+     tree2.addMatching(tid2, self._matchp)
+
+ # Diff and match two lists of text nodes.
+ # XXX This is just a hack that treats text nodes as unordered, to
+ # be consistent with the entire algorithm.
+ # @param textCount1 number of text nodes in the 1st list
+ # @param textCount2 number of text nodes in the 2nd list
+
+ def diffText(self, textCount1, textCount2):
+     """Match the text nodes in self._textList1 and self._textList2.
+
+     Nodes with equal hash values are paired silently.  If every node
+     of the shorter list can still get a partner, the remaining nodes
+     are paired in list order as CHANGE.  Anything left is NO_MATCH.
+
+     @param textCount1 number of text nodes in the 1st list
+     @param textCount2 number of text nodes in the 2nd list
+     """
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+     texts1 = self._textList1
+     texts2 = self._textList2
+
+     # Reset the shared scratch lists.  Slice assignment works whether
+     # they are preallocated or still empty.
+     self._textMatch1[:textCount1] = [False] * textCount1
+     self._textMatch2[:textCount2] = [False] * textCount2
+     self._textHash[:textCount2] = [tree2.getHashValue(texts2[i])
+                                    for i in range(textCount2)]
+     done1 = self._textMatch1
+     done2 = self._textMatch2
+     hashes2 = self._textHash
+
+     mcount = 0
+     for i in range(textCount1):
+         hash1 = tree1.getHashValue(texts1[i])
+         for j in range(textCount2):
+             if not done2[j] and hash1 == hashes2[j]:
+                 done1[i] = True
+                 done2[j] = True
+                 mcount += 1
+                 break
+         if mcount == textCount2:
+             break
+
+     # Pair the leftovers of the shorter list with the leftovers of
+     # the longer one.  These are for loops on purpose: as while loops
+     # with "continue", the index never advanced past an entry that
+     # was already matched and the loop never ended.
+     if mcount < textCount1 and textCount1 <= textCount2:
+         self._matchp[0] = XTree.CHANGE
+         j = 0
+         for i in range(textCount1):
+             if mcount >= textCount1:
+                 break
+             if done1[i]:
+                 continue
+             while done2[j]:
+                 j += 1
+             self._matchp[1] = texts2[j]
+             tree1.addMatching(texts1[i], self._matchp)
+             done1[i] = True
+             self._matchp[1] = texts1[i]
+             tree2.addMatching(texts2[j], self._matchp)
+             done2[j] = True
+             mcount += 1
+     elif mcount < textCount2 and textCount2 < textCount1:
+         self._matchp[0] = XTree.CHANGE
+         j = 0
+         for i in range(textCount2):
+             if mcount >= textCount2:
+                 break
+             if done2[i]:
+                 continue
+             while done1[j]:
+                 j += 1
+             self._matchp[1] = texts1[j]
+             tree2.addMatching(texts2[i], self._matchp)
+             done2[i] = True
+             self._matchp[1] = texts2[i]
+             tree1.addMatching(texts1[j], self._matchp)
+             done1[j] = True
+             mcount += 1
+
+     self._matchp[0] = XTree.NO_MATCH
+     if mcount < textCount1:
+         for i in range(textCount1):
+             if not done1[i]:
+                 tree1.addMatching(texts1[i], self._matchp)
+     elif mcount < textCount2:
+         for i in range(textCount2):
+             if not done2[i]:
+                 tree2.addMatching(texts2[i], self._matchp)
+
+
+ # Filter out matched nodepairs.
+ # @param elements1 node list #1
+ # @param elements2 node list #2
+ # @param matched1 match list #1
+ # @param matched2 match list #2
+ # @return how many matched pairs found
+
+ def _matchFilter(self, elements1, count1, elements2, count2, matched1, matched2):
+     """Pair up nodes of the two lists that have equal hash values.
+
+     matched1 and matched2 are output parameters: their first count1 /
+     count2 entries are reset to False and then set to True for every
+     node that found a partner.  Both lists are updated in place and
+     may be passed in empty.
+
+     @param elements1 node list #1 (nodes of self._xtree1)
+     @param count1    number of nodes used from list #1
+     @param elements2 node list #2 (nodes of self._xtree2)
+     @param count2    number of nodes used from list #2
+     @param matched1  match list #1
+     @param matched2  match list #2
+     @return how many matched pairs found
+     """
+     value1 = [self._xtree1.getHashValue(elements1[i])
+               for i in range(count1)]
+     value2 = [self._xtree2.getHashValue(elements2[i])
+               for i in range(count2)]
+     # In-place slice assignment keeps the caller's list objects and
+     # also works when they are shorter than the counts.
+     matched1[:count1] = [False] * count1
+     matched2[:count2] = [False] * count2
+
+     mcount = 0
+     for i in range(count2):
+         for j in range(count1):
+             # matched2[i] is always False here: it is only set just
+             # before the break below.
+             if not matched1[j] and value1[j] == value2[i]:
+                 matched1[j] = True
+                 matched2[i] = True
+                 mcount += 1
+                 break
+
+     return mcount
+
+
+ # Find minimal cost matching between two node lists
+ # Record the matching info back to the trees
+ # Using the original algorithm
+ # @param nodes1 node list #1
+ # @param nodes2 node list #2
+ # @param count1 # of nodes in node list #1
+ # @param count2 # of nodes in node list #2
+ # @param treeOrder True for original, False for inverse
+ # @param matchFlag indicates if distance computation needed
+
+ def matchListO(self, nodes1, nodes2, count1, count2, treeOrder, matchFlag):
+     """Find a minimal-cost matching between two node lists (exhaustive).
+
+     Builds a (count1 + 1) x (count2 + 1) cost table: entry [i][j] is
+     the distance between nodes1[i] and nodes2[j], the last column is
+     the cost of deleting nodes1[i] and the last row the cost of
+     inserting nodes2[j].  findMatching picks the matching, which is
+     then recorded in both trees; matched element pairs are diffed
+     recursively with xdiff.
+
+     @param nodes1    node list #1
+     @param nodes2    node list #2
+     @param count1    # of nodes in node list #1
+     @param count2    # of nodes in node list #2
+     @param treeOrder True if nodes1 belongs to self._xtree1 and nodes2
+                      to self._xtree2, False for the inverse
+     @param matchFlag True to read distances from self._xlut instead
+                      of computing them
+     """
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+     # owner1/owner2 are the trees that nodes1/nodes2 belong to.
+     if treeOrder:
+         owner1, owner2 = tree1, tree2
+         # NOTE(review): as in the code being replaced, the subtree
+         # cost gets "+ 1" only in the inverse order, while matchList
+         # adds 1 in both orders.  Kept unchanged -- confirm whether
+         # the asymmetry is intended.
+         extra = 0
+     else:
+         owner1, owner2 = tree2, tree1
+         extra = 1
+
+     # The table is named "costs" so it cannot be mistaken for the
+     # distance() method, which must be called through self.
+     costs = [[0] * (count2 + 1) for i in range(count1 + 1)]
+     # findMatching receives these as output lists.
+     # NOTE(review): assumes findMatching stores results by index.
+     matching1 = [XTree.NO_MATCH] * count1
+     matching2 = [XTree.NO_MATCH] * count2
+
+     # insert cost.
+     for j in range(count2):
+         costs[count1][j] = owner2.getDecendentsCount(nodes2[j]) + extra
+
+     for i in range(count1):
+         deleteCost = owner1.getDecendentsCount(nodes1[i]) + extra
+         for j in range(count2):
+             # self._xlut and self.distance take the tree-1 node first.
+             if treeOrder:
+                 first, second = nodes1[i], nodes2[j]
+             else:
+                 first, second = nodes2[j], nodes1[i]
+
+             if matchFlag:
+                 dist = self._xlut.get(first, second)
+             else:
+                 dist = self.distance(first, second, True,
+                                      XTree.NO_CONNECTION)
+                 # the default mode: too expensive means unrelated.
+                 if (not self._oFlag and dist > 1 and
+                         dist >= self._NO_MATCH_THRESHOLD *
+                         (deleteCost + costs[count1][j])):
+                     dist = XTree.NO_CONNECTION
+                 if dist < XTree.NO_CONNECTION:
+                     self._xlut.add(first, second, dist)
+             costs[i][j] = dist
+         # delete cost.
+         costs[i][count2] = deleteCost
+
+     # compute the minimal cost matching.
+     self.findMatching(count1, count2, costs, matching1, matching2)
+
+     # Record the matching in both trees.
+     for i in range(count1):
+         if matching1[i] == XTree.NO_MATCH:
+             self._matchp[0] = XTree.NO_MATCH
+         else:
+             self._matchp[0] = XTree.CHANGE
+             self._matchp[1] = nodes2[matching1[i]]
+         owner1.addMatching(nodes1[i], self._matchp)
+
+     for i in range(count2):
+         if matching2[i] == XTree.NO_MATCH:
+             self._matchp[0] = XTree.NO_MATCH
+         else:
+             self._matchp[0] = XTree.CHANGE
+             self._matchp[1] = nodes1[matching2[i]]
+         owner2.addMatching(nodes2[i], self._matchp)
+
+     # Descend into every matched pair of elements.
+     for i in range(count1):
+         if matching1[i] == XTree.NO_MATCH:
+             continue
+         if treeOrder:
+             todo1, todo2 = nodes1[i], nodes2[matching1[i]]
+         else:
+             todo1, todo2 = nodes2[matching1[i]], nodes1[i]
+         if tree1.isElement(todo1) and tree2.isElement(todo2):
+             self.xdiff(todo1, todo2, True)
+
+
+ # Find minimal cost matching between two node lists
+ # Record the matching info back to the trees
+ # Do sampling.
+ # @param nodes1 node list #1
+ # @param nodes2 node list #2
+ # @param count1 # of nodes in node list #1
+ # @param count2 # of nodes in node list #2
+ # @param treeOrder True for original, False for inverse
+ # @param matchFlag indicates if distance computation needed
+
+ def matchList(self, nodes1, nodes2, count1, count2, treeOrder, matchFlag):
+     """Find a low-cost matching between two node lists by sampling.
+
+     With matchFlag set, the pairs already stored in self._xlut are
+     reused.  Otherwise up to self._sampleCount random nodes of nodes2
+     are matched against their best partner in nodes1; the largest
+     distance accepted there becomes the threshold at which the
+     remaining nodes take the first good-enough partner.  Both lists
+     are reordered in place so that the processed nodes form a prefix.
+     The matching is recorded in both trees and matched element pairs
+     are diffed recursively with xdiff.
+
+     @param nodes1    node list #1
+     @param nodes2    node list #2
+     @param count1    # of nodes in node list #1
+     @param count2    # of nodes in node list #2
+     @param treeOrder True if nodes1 belongs to self._xtree1 and nodes2
+                      to self._xtree2, False for the inverse
+     @param matchFlag True to read distances from self._xlut instead
+                      of computing them
+     """
+     import random
+
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+     # owner1/owner2 are the trees that nodes1/nodes2 belong to.
+     if treeOrder:
+         owner1, owner2 = tree1, tree2
+     else:
+         owner1, owner2 = tree2, tree1
+
+     def dist_between(node1, node2, limit):
+         # self.distance expects the tree-1 node first.
+         if treeOrder:
+             return self.distance(node1, node2, False, limit)
+         return self.distance(node2, node1, False, limit)
+
+     def remember(index1, index2, dist):
+         # Store the distance and pair the two list positions.
+         if treeOrder:
+             self._xlut.add(nodes1[index1], nodes2[index2], dist)
+         else:
+             self._xlut.add(nodes2[index2], nodes1[index1], dist)
+         matching1[index1] = index2
+         matching2[index2] = index1
+
+     matching1 = [XTree.NO_MATCH] * count1
+     matching2 = [XTree.NO_MATCH] * count2
+
+     if matchFlag:
+         for i in range(count1):
+             for j in range(count2):
+                 if treeOrder:
+                     d = self._xlut.get(nodes1[i], nodes2[j])
+                 else:
+                     d = self._xlut.get(nodes2[j], nodes1[i])
+                 if d != XTree.NO_CONNECTION:
+                     matching1[i] = j
+                     matching2[j] = i
+                     break
+     else:
+         r = random.Random(time.time())
+         scount1 = 0
+         scount2 = 0
+         matchingThreshold = 0
+         sampled = 0
+         while sampled < self._sampleCount and scount2 < count2:
+             # Pick a random node from the unprocessed tail of nodes2.
+             snode = r.randrange(count2 - scount2) + scount2
+             dist = XTree.NO_CONNECTION
+             bestmatch = XTree.NO_MATCH
+             for j in range(scount1, count1):
+                 d = dist_between(nodes1[j], nodes2[snode], dist)
+                 if d < dist:
+                     dist = d
+                     bestmatch = j
+                     if d == 1:
+                         break
+
+             deleteCost = owner2.getDecendentsCount(nodes2[snode]) + 1
+             # Move the sampled node to the end of the processed
+             # prefix.  scount2 must keep its old value until the
+             # bottom of the loop: it is the swap target and the index
+             # recorded in the matching.
+             nodes2[snode], nodes2[scount2] = nodes2[scount2], nodes2[snode]
+
+             tooFar = (dist > 1 and
+                       dist > self._NO_MATCH_THRESHOLD * deleteCost)
+             if bestmatch != XTree.NO_MATCH and not tooFar:
+                 nodes1[bestmatch], nodes1[scount1] = (nodes1[scount1],
+                                                       nodes1[bestmatch])
+                 remember(scount1, scount2, dist)
+                 sampled += 1
+                 scount1 += 1
+                 if matchingThreshold < dist:
+                     matchingThreshold = dist
+             scount2 += 1
+
+         while scount2 < count2:
+             dist = XTree.NO_CONNECTION
+             bestmatch = XTree.NO_MATCH
+             for i in range(scount1, count1):
+                 d = dist_between(nodes1[i], nodes2[scount2], dist)
+                 if d <= matchingThreshold:
+                     # Good enough: stop searching.
+                     dist = d
+                     bestmatch = i
+                     break
+                 elif d < dist:
+                     dist = d
+                     bestmatch = i
+
+             if bestmatch != XTree.NO_MATCH:
+                 nodes1[bestmatch], nodes1[scount1] = (nodes1[scount1],
+                                                       nodes1[bestmatch])
+                 remember(scount1, scount2, dist)
+                 scount1 += 1
+             scount2 += 1
+
+     # Record matching
+     for i in range(count1):
+         if matching1[i] == XTree.NO_MATCH:
+             self._matchp[0] = XTree.NO_MATCH
+         else:
+             self._matchp[0] = XTree.CHANGE
+             self._matchp[1] = nodes2[matching1[i]]
+         owner1.addMatching(nodes1[i], self._matchp)
+
+     for i in range(count2):
+         if matching2[i] == XTree.NO_MATCH:
+             self._matchp[0] = XTree.NO_MATCH
+         else:
+             self._matchp[0] = XTree.CHANGE
+             self._matchp[1] = nodes1[matching2[i]]
+         owner2.addMatching(nodes2[i], self._matchp)
+
+     # Descend into every matched pair of elements.
+     for i in range(count1):
+         if matching1[i] == XTree.NO_MATCH:
+             continue
+         if treeOrder:
+             todo1, todo2 = nodes1[i], nodes2[matching1[i]]
+         else:
+             todo1, todo2 = nodes2[matching1[i]], nodes1[i]
+         if tree1.isElement(todo1) and tree2.isElement(todo2):
+             self.xdiff(todo1, todo2, True)
+
+
+ # Compute (minimal-editing) distance between two nodes.
+ # @param eid1 element id #1
+ # @param eid2 element id #2
+ # @param toRecord whether or not to keep the result
+ # @param threshold No need to return a distance higher
+ # than this threshold
+ # @return the distance
+
+ def distance(self, eid1, eid2, toRecord, threshold):
+     """Compute the (minimal-editing) distance between two nodes.
+
+     Two elements with the same tag are compared with _xdiff.  Two
+     non-element nodes always have distance 1.  Every other combination
+     yields XTree.NO_CONNECTION.
+
+     @param eid1      node id in self._xtree1
+     @param eid2      node id in self._xtree2
+     @param toRecord  whether to store the result in self._xlut
+                      (only done for element pairs with a distance
+                      below XTree.NO_CONNECTION)
+     @param threshold no need to return a distance higher than this;
+                      passed on to _xdiff
+     @return the distance
+     """
+     isE1 = self._xtree1.isElement(eid1)
+     isE2 = self._xtree2.isElement(eid2)
+
+     if isE1 != isE2:
+         # An element can never be matched with a non-element.
+         return XTree.NO_CONNECTION
+     if not isE1:
+         return 1
+
+     # Tags are compared with != (the Java compareTo does not exist on
+     # Python strings).
+     if self._xtree1.getTag(eid1) != self._xtree2.getTag(eid2):
+         return XTree.NO_CONNECTION
+
+     dist = self._xdiff(eid1, eid2, threshold)
+     if toRecord and dist < XTree.NO_CONNECTION:
+         self._xlut.add(eid1, eid2, dist)
+     return dist
+
+
+ # To compute the editing distance between two nodes
+ # @param pid1 parent id #1
+ # @param pid2 parent id #2
+ # @param threshold No need to return a distance higher
+ # than this threshold
+ # @return the distance
+
+ def _xdiff(self, pid1, pid2, threshold):
+     """Compute the editing distance between two elements.
+
+     Follows the same steps as xdiff (attributes, text children,
+     element children grouped by tag) but adds up costs instead of
+     recording matchings.  In greedy mode (self._gFlag) the computation
+     stops with XTree.NO_CONNECTION as soon as the running total
+     reaches threshold.
+
+     @param pid1      parent id #1 (a node of self._xtree1)
+     @param pid2      parent id #2 (a node of self._xtree2)
+     @param threshold no need to return a distance higher than this
+     @return the distance
+     """
+     tree1 = self._xtree1
+     tree2 = self._xtree2
+     greedy = self._gFlag
+
+     def split_children(tree, pid, count):
+         # Walk count children of pid and separate elements from the
+         # rest.  Only called with count >= 1.
+         elements = []
+         texts = []
+         child = tree.getFirstChild(pid)
+         for remaining in range(count, 0, -1):
+             if tree.isElement(child):
+                 elements.append(child)
+             else:
+                 texts.append(child)
+             if remaining > 1:
+                 child = tree.getNextSibling(child)
+         return elements, texts
+
+     # diff attributes.
+     attrs1 = []
+     attr = tree1.getFirstAttribute(pid1)
+     while attr != XTree.NULL_NODE:
+         attrs1.append(attr)
+         attr = tree1.getNextAttribute(attr)
+     attrs2 = []
+     attr = tree2.getFirstAttribute(pid2)
+     while attr != XTree.NULL_NODE:
+         attrs2.append(attr)
+         attr = tree2.getNextAttribute(attr)
+     attrCount1 = len(attrs1)
+     attrCount2 = len(attrs2)
+     # _diffAttributes reads the shared scratch lists.  Slice
+     # assignment fills them whether they are preallocated or empty.
+     self._attrList1[:attrCount1] = attrs1
+     self._attrList2[:attrCount2] = attrs2
+
+     if attrCount1 == 0:
+         dist = attrCount2 * 2
+     elif attrCount2 == 0:
+         dist = attrCount1 * 2
+     else:
+         dist = self._diffAttributes(attrCount1, attrCount2)
+     if greedy and dist >= threshold:
+         return XTree.NO_CONNECTION
+
+     # Match second level nodes first.
+     count1 = tree1.getChildrenCount(pid1) - attrCount1
+     count2 = tree2.getChildrenCount(pid2) - attrCount2
+
+     if count1 == 0:
+         # Every child of pid2 is an insertion.
+         node2 = tree2.getFirstChild(pid2)
+         while node2 != XTree.NULL_NODE:
+             dist += tree2.getDecendentsCount(node2) + 1
+             if greedy and dist >= threshold:
+                 return XTree.NO_CONNECTION
+             node2 = tree2.getNextSibling(node2)
+     elif count2 == 0:
+         # Every child of pid1 is a deletion.
+         node1 = tree1.getFirstChild(pid1)
+         while node1 != XTree.NULL_NODE:
+             dist += tree1.getDecendentsCount(node1) + 1
+             if greedy and dist >= threshold:
+                 return XTree.NO_CONNECTION
+             node1 = tree1.getNextSibling(node1)
+     elif count1 == 1 and count2 == 1:
+         node1 = tree1.getFirstChild(pid1)
+         node2 = tree2.getFirstChild(pid2)
+
+         if tree1.getHashValue(node1) == tree2.getHashValue(node2):
+             return dist
+
+         isE1 = tree1.isElement(node1)
+         isE2 = tree2.isElement(node2)
+
+         if (isE1 and isE2 and
+                 tree1.getTag(node1) == tree2.getTag(node2)):
+             dist += self._xdiff(node1, node2, threshold - dist)
+         elif not isE1 and not isE2:
+             dist += 1
+         else:
+             # Unrelated nodes: delete one subtree, insert the other.
+             dist += (tree1.getDecendentsCount(node1) +
+                      tree2.getDecendentsCount(node2) + 2)
+     else:
+         elements1, texts1 = split_children(tree1, pid1, count1)
+         elements2, texts2 = split_children(tree2, pid2, count2)
+         elementCount1 = len(elements1)
+         elementCount2 = len(elements2)
+         textCount1 = len(texts1)
+         textCount2 = len(texts2)
+         self._textList1[:textCount1] = texts1
+         self._textList2[:textCount2] = texts2
+
+         # Match text nodes.
+         if textCount1 == 0:
+             dist += textCount2
+         elif textCount2 == 0:
+             dist += textCount1
+         else:
+             dist += self._diffText(textCount1, textCount2)
+
+         if greedy and dist >= threshold:
+             return XTree.NO_CONNECTION
+
+         # _matchFilter stores into these by index, so size them first.
+         matched1 = [False] * elementCount1
+         matched2 = [False] * elementCount2
+         mcount = self._matchFilter(elements1, elementCount1,
+                                    elements2, elementCount2,
+                                    matched1, matched2)
+
+         if elementCount1 == mcount and elementCount2 == mcount:
+             return dist
+         if elementCount1 == mcount:
+             for i in range(elementCount2):
+                 if not matched2[i]:
+                     dist += tree2.getDecendentsCount(elements2[i]) + 1
+                     if greedy and dist >= threshold:
+                         return XTree.NO_CONNECTION
+             return dist
+         if elementCount2 == mcount:
+             for i in range(elementCount1):
+                 if not matched1[i]:
+                     dist += tree1.getDecendentsCount(elements1[i]) + 1
+                     if greedy and dist >= threshold:
+                         return XTree.NO_CONNECTION
+             return dist
+
+         # Write the list of unmatched nodes, one tag group at a time.
+         ucount1 = elementCount1 - mcount
+         ucount2 = elementCount2 - mcount
+         unmatched1 = [0] * ucount1
+         unmatched2 = [0] * ucount2
+         muc1 = 0
+         muc2 = 0
+         start = 0
+
+         while muc1 < ucount1 and muc2 < ucount2:
+             # The next still-unmatched element of list 1 names the
+             # group.
+             while start < elementCount1 and matched1[start]:
+                 start += 1
+             startTag = tree1.getTag(elements1[start])
+             uele1 = 0
+             uele2 = 0
+             muc1 += 1
+             unmatched1[uele1] = elements1[start]
+             uele1 += 1
+             matched1[start] = True
+             start += 1
+
+             i = start
+             while i < elementCount1 and muc1 < ucount1:
+                 if (not matched1[i] and
+                         startTag == tree1.getTag(elements1[i])):
+                     matched1[i] = True
+                     muc1 += 1
+                     unmatched1[uele1] = elements1[i]
+                     uele1 += 1
+                 i += 1
+
+             i = 0
+             while i < elementCount2 and muc2 < ucount2:
+                 if (not matched2[i] and
+                         startTag == tree2.getTag(elements2[i])):
+                     matched2[i] = True
+                     muc2 += 1
+                     unmatched2[uele2] = elements2[i]
+                     uele2 += 1
+                 i += 1
+
+             if uele2 == 0:
+                 for i in range(uele1):
+                     dist += tree1.getDecendentsCount(unmatched1[i])
+             # To find minimal-cost matching between those unmatched.
+             elif uele1 >= uele2:
+                 if uele2 <= self._sampleCount or not greedy:
+                     dist += self._matchListO(unmatched1, unmatched2,
+                                              uele1, uele2, True)
+                 else:
+                     dist += self._matchList(unmatched1, unmatched2,
+                                             uele1, uele2, True,
+                                             threshold - dist)
+             else:
+                 # The longer list always goes first; the False flag
+                 # tells the matcher that the lists were swapped.
+                 if uele1 <= self._sampleCount or not greedy:
+                     dist += self._matchListO(unmatched2, unmatched1,
+                                              uele2, uele1, False)
+                 else:
+                     dist += self._matchList(unmatched2, unmatched1,
+                                             uele2, uele1, False,
+                                             threshold - dist)
+
+             if greedy and dist >= threshold:
+                 return XTree.NO_CONNECTION
+
+         # Whatever one side still has left is inserted or deleted.
+         if muc1 < ucount1:
+             for i in range(start, elementCount1):
+                 if not matched1[i]:
+                     dist += tree1.getDecendentsCount(elements1[i])
+         elif muc2 < ucount2:
+             for i in range(elementCount2):
+                 if not matched2[i]:
+                     dist += tree2.getDecendentsCount(elements2[i])
+
+     if not greedy or dist < threshold:
+         return dist
+     return XTree.NO_CONNECTION
+
+
+ # Diff two lists of attributes
+ # @param attrCount1 number of attributes in the 1st list
+ # @param attrCount2 number of attributes in the 2nd list
+ # @return the distance
+
+    def _diffAttributes(self, attrCount1, attrCount2):
+        """Return the edit distance between the two current attribute lists.
+
+        Compares the first ``attrCount1`` entries of ``self._attrList1``
+        (nodes of ``self._xtree1``) with the first ``attrCount2`` entries of
+        ``self._attrList2`` (nodes of ``self._xtree2``).
+
+        Costs in the general case: a pair with equal hash values costs 0,
+        a pair with equal tags but different hashes costs 1, an attribute
+        of list 1 with no partner costs 2 and every attribute of list 2
+        left unpaired costs 2.
+
+        ``self._attrHash``, ``self._attrTag`` and ``self._attrMatch`` are
+        used as scratch buffers and must already have at least
+        ``attrCount2`` slots.
+        """
+        # Fast path: exactly one attribute on each side.
+        if (attrCount1 == 1) and (attrCount2 == 1):
+            ah1 = self._xtree1.getHashValue(self._attrList1[0])
+            ah2 = self._xtree2.getHashValue(self._attrList2[0])
+            if ah1 == ah2:
+                return 0
+
+            tag1 = self._xtree1.getTag(self._attrList1[0])
+            tag2 = self._xtree2.getTag(self._attrList2[0])
+            # Plain equality replaces the Java-only ``compareTo``.
+            if tag1 == tag2:
+                return 1
+            return 2
+
+        # Cache hash and tag of every attribute in list 2.
+        for i in range(attrCount2):
+            self._attrHash[i] = self._xtree2.getHashValue(self._attrList2[i])
+            self._attrTag[i] = self._xtree2.getTag(self._attrList2[i])
+            self._attrMatch[i] = False
+
+        dist = 0
+        matchCount = 0
+        for i in range(attrCount1):
+            ah1 = self._xtree1.getHashValue(self._attrList1[i])
+            tag1 = self._xtree1.getTag(self._attrList1[i])
+            found = False
+
+            for j in range(attrCount2):
+                if self._attrMatch[j]:
+                    continue
+                if ah1 == self._attrHash[j]:
+                    # Identical attribute: no cost.
+                    self._attrMatch[j] = True
+                    found = True
+                    matchCount += 1
+                    break
+                if tag1 == self._attrTag[j]:
+                    # Same attribute, different content: one update.
+                    self._attrMatch[j] = True
+                    dist += 1
+                    found = True
+                    matchCount += 1
+                    break
+
+            if not found:
+                dist += 2
+
+        # Attributes of list 2 that nothing was paired with.
+        dist += (attrCount2 - matchCount) * 2
+        return dist
+
+
+ # Diff and match two lists of text nodes.
+ # XXX This is just a hack that treats text nodes as unordered, to
+ # be consistent with the entire algorithm.
+ # @param textCount1 number of text nodes in the 1st list
+ # @param textCount2 number of text nodes in the 2nd list
+ # @return the "distance" between these two lists.
+
+    def _diffText(self, textCount1, textCount2):
+        """Return the "distance" between the two current text-node lists.
+
+        Text nodes are treated as unordered: each of the first
+        ``textCount1`` nodes in ``self._textList1`` is paired with the first
+        still-unpaired node of ``self._textList2`` that has the same hash
+        value.  The result is the larger of the two counts minus the
+        number of pairs found.
+
+        ``self._textMatch2`` and ``self._textHash`` are scratch buffers and
+        must already have at least ``textCount2`` slots.
+        """
+        tree1 = self._xtree1
+
+        # Reset the pairing flags and cache the hashes of list 2.
+        for idx in range(textCount2):
+            self._textMatch2[idx] = False
+            self._textHash[idx] = self._xtree2.getHashValue(self._textList2[idx])
+
+        paired = 0
+        for idx1 in range(textCount1):
+            wanted = tree1.getHashValue(self._textList1[idx1])
+            for idx2 in range(textCount2):
+                if self._textMatch2[idx2] or (wanted != self._textHash[idx2]):
+                    continue
+                self._textMatch2[idx2] = True
+                paired += 1
+                break
+
+            # Everything in list 2 is taken; nothing left to pair with.
+            if paired == textCount2:
+                break
+
+        return max(textCount1, textCount2) - paired
+
+
+ # Find minimal cost matching between two node lists
+ # Using the original algorithm
+ # @param nodes1 node list #1
+ # @param nodes2 node list #2
+ # @param count1 # of nodes in node list #1
+ # @param count2 # of nodes in node list #2
+ # @param treeOrder True for original, False for inverse
+
+    def _matchListO(self, nodes1, nodes2, count1, count2, treeOrder):
+        """Find the minimal-cost matching between two node lists.
+
+        Builds the full (count1 + 1) x (count2 + 1) cost matrix and hands it
+        to ``findMatching``.
+
+        nodes1, nodes2 -- the two node lists
+        count1, count2 -- number of nodes used from each list
+        treeOrder      -- True when nodes1 belongs to ``self._xtree1`` and
+                          nodes2 to ``self._xtree2``; False when swapped
+
+        Returns the value returned by ``findMatching``.
+        """
+        # The tree each list belongs to.
+        if treeOrder:
+            owner1, owner2 = self._xtree1, self._xtree2
+        else:
+            owner1, owner2 = self._xtree2, self._xtree1
+
+        # Row ``count1`` holds the insert cost of every node in nodes2 and
+        # column ``count2`` holds the delete cost of every node in nodes1.
+        # The matrix is named ``costs`` so it does not shadow the distance
+        # routine called below.
+        costs = [[0] * (count2 + 1) for _ in range(count1 + 1)]
+        matching1 = [0] * count1
+        matching2 = [0] * count2
+
+        # insert cost.
+        for j in range(count2):
+            costs[count1][j] = owner2.getDecendentsCount(nodes2[j]) + 1
+
+        for i in range(count1):
+            deleteCost = owner1.getDecendentsCount(nodes1[i]) + 1
+            for j in range(count2):
+                # NOTE(review): the original called a bare ``distance(...)``,
+                # which resolved to the local list.  The per-pair distance
+                # routine is defined outside this block; ``self.distance``
+                # is assumed here -- confirm the actual method name.
+                if treeOrder:
+                    dist = self.distance(nodes1[i], nodes2[j], True,
+                                         XTree.NO_CONNECTION)
+                else:
+                    dist = self.distance(nodes2[j], nodes1[i], True,
+                                         XTree.NO_CONNECTION)
+
+                # the default mode: a pair that is too expensive compared
+                # with delete + insert is treated as not connected.
+                if (not self._oFlag and (dist > 1) and
+                        (dist < XTree.NO_CONNECTION) and
+                        (dist >= self._NO_MATCH_THRESHOLD *
+                         (deleteCost + costs[count1][j]))):
+                    dist = XTree.NO_CONNECTION
+
+                if dist < XTree.NO_CONNECTION:
+                    if treeOrder:
+                        self._xlut.add(nodes1[i], nodes2[j], dist)
+                    else:
+                        self._xlut.add(nodes2[j], nodes1[i], dist)
+                costs[i][j] = dist
+
+            # delete cost.
+            costs[i][count2] = deleteCost
+
+        # compute the minimal cost matching.
+        return self.findMatching(count1, count2, costs, matching1,
+                                 matching2)
+
+
+ # Find minimal cost matching between two node lists
+ # Do sampling
+ # @param nodes1 node list #1
+ # @param nodes2 node list #2
+ # @param count1 # of nodes in node list #1
+ # @param count2 # of nodes in node list #2
+ # @param treeOrder True for original, False for inverse
+ # @param threshold No need to return a distance higher
+ # than this threshold
+    def _matchList(self, nodes1, nodes2, count1, count2, treeOrder, threshold):
+        """Find a low-cost matching between two node lists by sampling.
+
+        Phase 1 picks up to ``self._sampleCount`` random nodes from nodes2
+        and pairs each with its cheapest partner in nodes1.  Phase 2 walks
+        the remaining nodes of nodes2 and accepts the first partner whose
+        distance does not exceed the largest distance seen in phase 1.
+        Finally every node of nodes1 left without a partner is charged its
+        delete cost.
+
+        nodes1, nodes2 -- the two node lists; both are reordered in place
+        count1, count2 -- number of nodes used from each list
+        treeOrder      -- True when nodes1 belongs to ``self._xtree1`` and
+                          nodes2 to ``self._xtree2``; False when swapped
+        threshold      -- give up once the running total reaches this value
+
+        Returns the accumulated distance, or ``XTree.NO_CONNECTION`` as soon
+        as the total reaches ``threshold``.
+        """
+        import random  # only needed for the sampling phase
+
+        # The tree each list belongs to.
+        if treeOrder:
+            owner1, owner2 = self._xtree1, self._xtree2
+        else:
+            owner1, owner2 = self._xtree2, self._xtree1
+
+        def pairDistance(n1, n2, limit):
+            # NOTE(review): the original called a bare ``distance(...)``,
+            # which resolved to the local integer.  The per-pair distance
+            # routine is defined outside this block; ``self.distance`` is
+            # assumed here -- confirm the actual method name.
+            if treeOrder:
+                return self.distance(n1, n2, False, limit)
+            return self.distance(n2, n1, False, limit)
+
+        def record(n1, n2, cost):
+            # The lookup table always takes the tree-1 node first.
+            if treeOrder:
+                self._xlut.add(n1, n2, cost)
+            else:
+                self._xlut.add(n2, n1, cost)
+
+        # Pre-sized so positions can be assigned by index.
+        matching1 = [XTree.NO_MATCH] * count1
+
+        total = 0
+        scount1 = 0          # nodes1[:scount1] are already paired
+        scount2 = 0          # nodes2[:scount2] are already processed
+        matchingThreshold = 0
+
+        # Phase 1: sampling.
+        sampled = 0
+        while (sampled < self._sampleCount) and (scount2 < count2):
+            snode = random.randrange(count2 - scount2) + scount2
+            dist = XTree.NO_CONNECTION
+            bestmatch = XTree.NO_MATCH
+            for j in range(scount1, count1):
+                d = pairDistance(nodes1[j], nodes2[snode], threshold - total)
+                if d < dist:
+                    dist = d
+                    bestmatch = j
+                    if d == 1:
+                        break
+
+            # The original overwrote the treeOrder result unconditionally
+            # (missing ``else``); the owning tree is now chosen once above.
+            deleteCost = owner2.getDecendentsCount(nodes2[snode]) + 1
+
+            if (dist > 1) and (dist > (self._NO_MATCH_THRESHOLD * deleteCost)):
+                # Too expensive to pair: move the sample to the processed
+                # part of nodes2 and charge its delete cost.
+                nodes2[snode], nodes2[scount2] = nodes2[scount2], nodes2[snode]
+                total += deleteCost
+            else:
+                nodes1[bestmatch], nodes1[scount1] = \
+                    nodes1[scount1], nodes1[bestmatch]
+                nodes2[snode], nodes2[scount2] = nodes2[scount2], nodes2[snode]
+
+                record(nodes1[scount1], nodes2[scount2], dist)
+                matching1[scount1] = scount2
+
+                sampled += 1
+                scount1 += 1
+                if matchingThreshold < dist:
+                    matchingThreshold = dist
+                total += dist
+
+            if total >= threshold:
+                return XTree.NO_CONNECTION
+            scount2 += 1
+
+        # Phase 2: remaining nodes of nodes2.
+        while scount2 < count2:
+            deleteCost = owner2.getDecendentsCount(nodes2[scount2]) + 1
+            dist = XTree.NO_CONNECTION
+            bestmatch = XTree.NO_MATCH
+            for i in range(scount1, count1):
+                d = pairDistance(nodes1[i], nodes2[scount2], threshold - total)
+                if d <= matchingThreshold:
+                    dist = d
+                    bestmatch = i
+                    break
+                elif (d == 1) or (d < (self._NO_MATCH_THRESHOLD * dist)):
+                    dist = d
+                    bestmatch = i
+
+            if bestmatch == XTree.NO_MATCH:
+                total += deleteCost
+            else:
+                nodes1[bestmatch], nodes1[scount1] = \
+                    nodes1[scount1], nodes1[bestmatch]
+
+                record(nodes1[scount1], nodes2[scount2], dist)
+                matching1[scount1] = scount2
+                scount1 += 1
+                total += dist
+
+            if total >= threshold:
+                return XTree.NO_CONNECTION
+            scount2 += 1
+
+        # Nodes of nodes1 that never found a partner.
+        for i in range(count1):
+            if matching1[i] == XTree.NO_MATCH:
+                total += owner1.getDecendentsCount(nodes1[i]) + 1
+                if total >= threshold:
+                    return XTree.NO_CONNECTION
+
+        return total
+
+
+ # Perform minimal-cost matching between two node lists #1
+ # Trivial part.
+ # @param count1 length of node list #1
+ # @param count2 length of node list #2
+ # @param dist distance matrix
+ # @param matching1 matching list (for node list #1)
+ # @param matching2 matching list (for node list #2)
+ # @return distance
+    def findMatching(self, count1, count2, dist, matching1, matching2):
+        """Solve the small matching cases directly; delegate the rest.
+
+        count1, count2 -- lengths of node list #1 and #2
+        dist           -- cost matrix; row ``count1`` and column ``count2``
+                          are read as the extra insert/delete costs
+        matching1      -- filled in with the partner of each list-1 node
+        matching2      -- filled in with the partner of each list-2 node
+
+        Handles 1 x n, n x 1 and 2 x 2 inline and calls
+        ``optimalMatching`` for everything else.  Returns the distance.
+        """
+        unreachable = XTree.NO_CONNECTION
+        deleted = XTree.DELETE
+
+        if count1 == 1:
+            # count2 == 1
+            paired = dist[0][0] < unreachable
+            matching1[0] = 0 if paired else deleted
+            matching2[0] = 0 if paired else deleted
+            return dist[0][0]
+
+        if count2 == 1:
+            total = 0
+            mate = 0
+            best = unreachable
+            matching2[0] = deleted
+
+            for row in range(count1):
+                matching1[row] = deleted
+                if best > dist[row][0]:
+                    best = dist[row][0]
+                    mate = row
+
+                # Suppose we delete every node on list1.
+                total += dist[row][1]
+
+            if best < unreachable:
+                # Pair the cheapest node instead of deleting it.
+                matching1[mate] = 0
+                matching2[0] = mate
+                return total + best - dist[mate][1]
+
+            # Add the delete cost of the single node on list2.
+            return total + dist[count1][0]
+
+        if (count1 == 2) and (count2 == 2):
+            # Pick the cheaper of the straight (0-0, 1-1) and the crossed
+            # (0-1, 1-0) pairing; ties go to the crossed one.
+            if dist[0][0] + dist[1][1] < dist[0][1] + dist[1][0]:
+                partners = (0, 1)
+            else:
+                partners = (1, 0)
+
+            total = 0
+            for row in (0, 1):
+                col = partners[row]
+                if dist[row][col] < unreachable:
+                    matching1[row] = col
+                    matching2[col] = row
+                    total += dist[row][col]
+                else:
+                    matching1[row] = deleted
+                    matching2[col] = deleted
+                    total += dist[row][2] + dist[2][col]
+            return total
+
+        return self.optimalMatching(count1, count2, dist,
+                                    matching1, matching2)
+
+
+ # Perform minimal-cost matching between two node lists
+ # @param count1 length of node list #1
+ # @param count2 length of node list #2
+ # @param dist distance matrix
+ # @param matching1 matching list (for node list #1)
+ # @param matching2 matching list (for node list #2)
+ # @return distance
+
+    def optimalMatching(self, count1, count2, dist, matching1, matching2):
+        """Iteratively improve a matching until no cheaper one is found.
+
+        Starts from the pairing i <-> i and repeatedly rebuilds the least
+        cost matrix, resets the path matrix and asks ``searchNCC`` for a
+        negative cost circuit; every circuit found is applied to
+        ``matching1``.  When none is left, ``matching2`` and the total
+        distance are derived from ``matching1``.
+
+        count1, count2 -- lengths of node list #1 and #2
+        dist           -- cost matrix (row ``count1`` = insert costs,
+                          column ``count2`` = delete costs)
+        matching1      -- output: partner of each list-1 node
+        matching2      -- output: partner of each list-2 node
+
+        Returns the distance of the final matching.
+        """
+        # Initial guess: pair-matching between the two lists, the surplus
+        # of list 1 is deleted.
+        for col in range(count2):
+            matching1[col] = col
+        for row in range(count2, count1):
+            matching1[row] = XTree.DELETE
+
+        # Three artificial nodes: "start", "end" and "delete".
+        networkSize = count1 + count2 + 3
+
+        while True:
+            # Construct least cost matrix.
+            self.constructLCM(dist, matching1, count1, count2)
+
+            # Initialize path matrix.
+            for src in range(networkSize):
+                for dst in range(networkSize):
+                    self._pathMatrix[src][dst] = src
+
+            # Search negative cost circuit.
+            clen = self.searchNCC(networkSize)
+            if clen <= 0:
+                # Stop.
+                break
+
+            # Modify matching: walk the linked circuit stored in
+            # self._circuit as (node, index-of-next) pairs.
+            cursor = 0
+            for _ in range(clen - 1):
+                n1 = self._circuit[cursor]
+                cursor = self._circuit[cursor + 1]
+                # Node in node list 1.
+                if 0 < n1 <= count1:
+                    nid2 = self._circuit[cursor] - count1 - 1
+                    if nid2 == count2:
+                        nid2 = XTree.DELETE
+                    matching1[n1 - 1] = nid2
+
+        total = 0
+        # Suppose all insertion on list2
+        for col in range(count2):
+            matching2[col] = XTree.INSERT
+            total += dist[count1][col]
+
+        # update distance by looking at matching pairs.
+        for row in range(count1):
+            mate = matching1[row]
+            if mate == XTree.DELETE:
+                total += dist[row][count2]
+            else:
+                matching2[mate] = row
+                total += dist[row][mate] - dist[count1][mate]
+
+        return total
+
+
+ # Construct a least cost matrix (of the flow network) based on
+ # the cost matrix
+ # @param costMatrix cost matrix
+ # @param matching matching information
+ # @param nodeCount1 # of nodes in node list 1
+ # @param nodeCount2 # of nodes in node list 2
+
+    def constructLCM(self, costMatrix, matching, nodeCount1, nodeCount2):
+        """Fill ``self._leastCostMatrix`` for the current matching.
+
+        Node numbering in the matrix: 0 is "start", 1..nodeCount1 are the
+        list-1 nodes, the next nodeCount2 entries are the list-2 nodes,
+        followed by "delete" and finally "end".
+
+        costMatrix -- cost matrix (row ``nodeCount1`` = insert costs,
+                      column ``nodeCount2`` = delete costs)
+        matching   -- partner of each list-1 node, or ``XTree.DELETE``
+        nodeCount1 -- number of nodes in node list 1
+        nodeCount2 -- number of nodes in node list 2
+        """
+        lcm = self._leastCostMatrix
+        unreachable = XTree.NO_CONNECTION
+
+        # Three artificial nodes: "start", "end" and "delete".
+        size = nodeCount1 + nodeCount2 + 3
+        deleteNode = size - 2
+        endNode = size - 1
+
+        # Initialize: nothing connected, zero cost to stay in place.
+        for row in range(size):
+            for col in range(size):
+                lcm[row][col] = unreachable
+            lcm[row][row] = 0
+
+        # Start -> node1 = Infinity; node1 -> Start = -0.
+        for node1 in range(1, nodeCount1 + 1):
+            lcm[node1][0] = 0
+
+        # Unless matched (later), node2 -> end = 0; end -> node2 = Infinity.
+        for node2 in range(nodeCount1 + 1, nodeCount1 + nodeCount2 + 1):
+            lcm[node2][endNode] = 0
+
+        deleteCount = 0
+        for i in range(nodeCount1):
+            node1 = i + 1
+
+            # According to cost matrix: node1 -> node2 = distance.
+            for j in range(nodeCount2):
+                lcm[node1][j + nodeCount1 + 1] = costMatrix[i][j]
+
+            # According to matching.
+            mate = matching[i]
+            if mate == XTree.DELETE:
+                deleteCount += 1
+                # node1 -> Delete = Infinity; Delete -> node1 = -DELETE_COST
+                lcm[deleteNode][node1] = -1 * costMatrix[i][nodeCount2]
+                continue
+
+            node2 = mate + nodeCount1 + 1
+
+            # Matched pair: only the reverse edge exists, with negated cost.
+            lcm[node1][node2] = unreachable
+            lcm[node2][node1] = costMatrix[i][mate] * -1
+
+            # Between node1 and delete.
+            lcm[node1][deleteNode] = costMatrix[i][nodeCount2]
+
+            # Between node2 and end.
+            lcm[node2][endNode] = unreachable
+            lcm[endNode][node2] = costMatrix[nodeCount1][mate]
+
+        # Between the "Delete" and the "End".
+        #   all deleted  : end -> delete = 0 only
+        #   none deleted : delete -> end = 0 only
+        #   otherwise    : both 0
+        allDeleted = (deleteCount == nodeCount1)
+        if allDeleted or (deleteCount != 0):
+            lcm[endNode][deleteNode] = 0
+        if not allDeleted:
+            lcm[deleteNode][endNode] = 0
+
+
+ # Search for negative cost circuit in the least cost matrix.
+ # @param nodeCount node count
+ # @return the length of the path if found; otherwise 0
+    def searchNCC(self, nodeCount):
+        """Search the least cost matrix for a negative cost circuit.
+
+        Relaxes ``self._leastCostMatrix`` through every intermediate node
+        and records the intermediate node in ``self._pathMatrix``.  As soon
+        as a node reaches itself with negative cost, the circuit is
+        expanded into ``self._circuit`` as a linked list of
+        (node, index-of-next) pairs, with -1 marking the end.
+
+        nodeCount -- number of nodes in the matrix
+
+        Returns the number of entries in the circuit if one was found,
+        otherwise 0.
+        """
+        lcm = self._leastCostMatrix
+        paths = self._pathMatrix
+        circuit = self._circuit
+        unreachable = XTree.NO_CONNECTION
+
+        for via in range(nodeCount):
+            for src in range(nodeCount):
+                if (src == via) or (lcm[src][via] == unreachable):
+                    continue
+
+                for dst in range(nodeCount):
+                    if (dst == via) or (lcm[via][dst] == unreachable):
+                        continue
+
+                    less = lcm[src][via] + lcm[via][dst]
+                    if less >= lcm[src][dst]:
+                        continue
+
+                    lcm[src][dst] = less
+                    paths[src][dst] = via
+
+                    if (src != dst) or (less >= 0):
+                        continue
+
+                    # Found!  Seed the circuit with src -> middle -> src.
+                    circuit[0] = src
+                    circuit[1] = 2
+                    circuit[2] = paths[src][src]
+                    circuit[3] = 4
+                    circuit[4] = src
+                    circuit[5] = -1
+                    clen = 3    # the length of the circuit.
+
+                    # Keep splicing in intermediate nodes until every hop
+                    # of the circuit is a direct one.
+                    expanded = True
+                    while expanded:
+                        expanded = False
+                        n = 0
+                        for _ in range(clen - 1):
+                            left = circuit[n]
+                            following = circuit[n + 1]
+                            if following == -1:
+                                right = -1
+                            else:
+                                right = circuit[following]
+
+                            middle = paths[left][right]
+                            if middle != left:
+                                # Insert ``middle`` between left and right.
+                                circuit[clen * 2] = middle
+                                circuit[clen * 2 + 1] = following
+                                circuit[n + 1] = clen * 2
+                                clen += 1
+
+                                expanded = True
+                                break
+                            n = following
+
+                    return clen
+
+        return 0
+
+
+ # For testing purpose -- print out matrixes
+    def printMatrix(self, nodeCount):
+        """Print the least cost matrix and the path matrix to stdout.
+
+        For testing purposes.  Only the top-left ``nodeCount`` x
+        ``nodeCount`` part of each matrix is printed, one row per line with
+        tab-separated cells; cost cells that are not below
+        ``XTree.NO_CONNECTION`` are left blank.
+        """
+        write = sys.stdout.write
+
+        write("Cost Matrix:\n")
+        for i in range(nodeCount):
+            for j in range(nodeCount):
+                cost = self._leastCostMatrix[i][j]
+                if cost < XTree.NO_CONNECTION:
+                    # The cells are numbers: convert before joining with "\t".
+                    write(str(cost) + "\t")
+                else:
+                    write("\t")
+            write("\n")
+
+        write("\nPath Matrix:\n")
+        for i in range(nodeCount):
+            row = self._pathMatrix[i][:nodeCount]
+            write("\t".join([str(cell) for cell in row]) + "\n")
+
+
+ # Write out the diff result -- how doc1 is changed to doc2
+ # @param input the first/old xml document
+ # @param output output file name
+ # FIXME this is probably completely wrong ... IO is Java-specific!!!
+    def writeDiff(self, input, output):
+        """Write out the diff result -- how doc1 is changed to doc2.
+
+        Copies every line of ``input`` that precedes the root element to
+        ``output`` and then writes the annotated tree.
+
+        input  -- path of the first/old xml document
+        output -- path of the result file, written using ``self._encoding``
+
+        An IOError is reported on stderr instead of being propagated.
+        """
+        try:
+            # The encoding is the third argument of codecs.open; the second
+            # one is the file mode.
+            out = codecs.open(output, "w", self._encoding)
+            try:
+                root1 = self._xtree1.getRoot()
+                root2 = self._xtree2.getRoot()
+
+                # XXX <root > is as valid as <root>,
+                # but < root> is NOT!
+                rootTag = "<" + self._xtree1.getTag(root1)
+
+                # Copy the prolog (everything before the root element).
+                br = open(input)
+                try:
+                    for line in br:
+                        if rootTag in line:
+                            break
+                        out.write(line.rstrip("\r\n") + "\n")
+                finally:
+                    br.close()
+
+                self._xtree1.getMatching(root1, self._matchp)
+                if self._matchp[0] == XTree.DELETE:
+                    # The roots do not correspond: old tree out, new tree in.
+                    self.writeDeleteNode(out, root1)
+                    self.writeInsertNode(out, root2)
+                else:
+                    self.writeDiffNode(out, root1, root2)
+            finally:
+                out.close()
+        except IOError as err:
+            sys.stderr.write((err.strerror or str(err)) + "\n")
+
+
+ # Write an element that has been deleted from the old document.
+ # @param out output file writer
+ # @param node element id
+
+    def writeDeleteNode(self, out, node):
+        """Write a node of the old document that has been deleted.
+
+        An element is written with its attributes and all of its children
+        and tagged with a ``<?DELETE tag?>`` instruction; any other node is
+        written as a ``<?DELETE "text"?>`` instruction.
+
+        out  -- output file writer
+        node -- node id in ``self._xtree1``
+        """
+        tree = self._xtree1
+
+        if not tree.isElement(node):
+            out.write("<?DELETE \"" + self.constructText(tree, node) +
+                      "\"?>\n")
+            self._needNewLine = False
+            return
+
+        tag = tree.getTag(node)
+        out.write("<" + tag)
+
+        # Attributes.
+        attr = tree.getFirstAttribute(node)
+        while attr > 0:
+            out.write(" " + tree.getTag(attr) + "=\"" +
+                      tree.getAttributeValue(attr) + "\"")
+            attr = tree.getNextAttribute(attr)
+
+        # Child nodes.
+        child = tree.getFirstChild(node)
+        if child < 0:
+            # No children: self-closing element.
+            out.write("/><?DELETE " + tag + "?>\n")
+            self._needNewLine = False
+            return
+
+        out.write("><?DELETE " + tag + "?>\n")
+        self._needNewLine = False
+
+        while child > 0:
+            self.writeMatchNode(out, tree, child)
+            child = tree.getNextSibling(child)
+
+        if self._needNewLine:
+            out.write("\n")
+            self._needNewLine = False
+
+        out.write("</" + tag + ">\n")
+
+
+ # Write an element that has been inserted into the new document.
+ # @param out output file writer
+ # @param node element id
+
+    def writeInsertNode(self, out, node):
+        """Write a node of the new document that has been inserted.
+
+        An element is written with its attributes and all of its children
+        and tagged with an ``<?INSERT tag?>`` instruction; any other node
+        is written as its text followed by ``<?INSERT?>``.
+
+        out  -- output file writer
+        node -- node id in ``self._xtree2``
+        """
+        tree = self._xtree2
+
+        if not tree.isElement(node):
+            out.write(self.constructText(tree, node) +
+                      "<?INSERT?>\n")
+            self._needNewLine = False
+            return
+
+        tag = tree.getTag(node)
+        out.write("<" + tag)
+
+        # Attributes.
+        attr = tree.getFirstAttribute(node)
+        while attr > 0:
+            out.write(" " + tree.getTag(attr) + "=\"" +
+                      tree.getAttributeValue(attr) + "\"")
+            attr = tree.getNextAttribute(attr)
+
+        # Child nodes.
+        child = tree.getFirstChild(node)
+        if child < 0:
+            # No children: self-closing element.
+            out.write("/><?INSERT " + tag + "?>\n")
+            self._needNewLine = False
+            return
+
+        out.write("><?INSERT " + tag + "?>\n")
+        self._needNewLine = False
+
+        while child > 0:
+            self.writeMatchNode(out, tree, child)
+            child = tree.getNextSibling(child)
+
+        if self._needNewLine:
+            out.write("\n")
+            self._needNewLine = False
+
+        out.write("</" + tag + ">\n")
+
+
+ # Write an element that is unchanged or in a deleted node or in
+ # an inserted node.
+ # @param out output file writer
+ # @param xtree the document tree
+ # @param node element id
+
+    def writeMatchNode(self, out, xtree, node):
+        """Write a node without any diff annotation.
+
+        Used for nodes that are unchanged and for the content of deleted
+        or inserted nodes.  Elements are written recursively with their
+        attributes and children; any other node is written as its text.
+
+        out   -- output file writer
+        xtree -- the document tree the node belongs to
+        node  -- node id
+        """
+        if not xtree.isElement(node):
+            out.write(self.constructText(xtree, node))
+            self._needNewLine = False
+            return
+
+        tag = xtree.getTag(node)
+        # Finish the line left open by the previous opening tag.
+        if self._needNewLine:
+            out.write("\n")
+
+        out.write("<" + tag)
+
+        # Attributes.
+        attr = xtree.getFirstAttribute(node)
+        while attr > 0:
+            out.write(" " + xtree.getTag(attr) + "=\"" +
+                      xtree.getAttributeValue(attr) + "\"")
+            attr = xtree.getNextAttribute(attr)
+
+        # Child nodes.
+        child = xtree.getFirstChild(node)
+        if child < 0:
+            # No children: self-closing element.
+            out.write("/>\n")
+            self._needNewLine = False
+            return
+
+        out.write(">")
+        self._needNewLine = True
+
+        while child > 0:
+            self.writeMatchNode(out, xtree, child)
+            child = xtree.getNextSibling(child)
+
+        if self._needNewLine:
+            out.write("\n")
+            self._needNewLine = False
+
+        out.write("</" + tag + ">\n")
+
+
+ # Write one node in the diff result.
+ # @param out output file writer
+ # @param node1 the node in the first tree
+ # @param node2 node1's counterpart in the second tree
+
+    def writeDiffNode(self, out, node1, node2):
+        """Write one pair of corresponding nodes of the diff result.
+
+        For an element, the attributes and children of node1 are written
+        according to their matching status (unchanged, deleted or
+        updated), followed by the attributes/children of node2 that are
+        marked as inserted.  For any other node the new text is written
+        followed by an ``<?UPDATE FROM "old text"?>`` instruction.
+
+        out   -- output file writer
+        node1 -- the node in the first tree
+        node2 -- node1's counterpart in the second tree
+        """
+        tree1 = self._xtree1
+        tree2 = self._xtree2
+
+        if not tree1.isElement(node1):
+            out.write(self.constructText(tree2, node2) +
+                      "<?UPDATE FROM \"" +
+                      self.constructText(tree1, node1) + "\"?>")
+            self._needNewLine = False
+            return
+
+        tag = tree1.getTag(node1)
+        if self._needNewLine:
+            out.write("\n")
+        out.write("<" + tag)
+
+        # Attributes of the old element.  The processing instructions are
+        # collected and written after the opening tag is closed.
+        notes = ""
+        attr1 = tree1.getFirstAttribute(node1)
+        while attr1 > 0:
+            atag = tree1.getTag(attr1)
+            value = tree1.getAttributeValue(attr1)
+            tree1.getMatching(attr1, self._matchp)
+            status = self._matchp[0]
+            if status == XTree.MATCH:
+                out.write(" " + atag + "=\"" + value + "\"")
+            elif status == XTree.DELETE:
+                out.write(" " + atag + "=\"" + value + "\"")
+                notes += "<?DELETE " + atag + "?>"
+            else:
+                # Updated: show the new value, remember the old one.
+                value2 = tree2.getAttributeValue(self._matchp[1])
+                out.write(" " + atag + "=\"" + value2 + "\"")
+                notes += "<?UPDATE " + atag + \
+                    " FROM \"" + value + "\"?>"
+
+            attr1 = tree1.getNextAttribute(attr1)
+
+        # Attributes that only exist in the new element.
+        attr2 = tree2.getFirstAttribute(node2)
+        while attr2 > 0:
+            tree2.getMatching(attr2, self._matchp)
+            if self._matchp[0] == XTree.INSERT:
+                atag = tree2.getTag(attr2)
+                value = tree2.getAttributeValue(attr2)
+                out.write(" " + atag + "=\"" + value + "\"")
+                notes += "<?INSERT " + atag + "?>"
+
+            attr2 = tree2.getNextAttribute(attr2)
+
+        # Child nodes.
+        child1 = tree1.getFirstChild(node1)
+        if child1 < 0:
+            out.write("/>" + notes + "\n")
+            self._needNewLine = False
+            return
+
+        out.write(">" + notes)
+        self._needNewLine = True
+
+        while child1 > 0:
+            tree1.getMatching(child1, self._matchp)
+            status = self._matchp[0]
+            if status == XTree.MATCH:
+                self.writeMatchNode(out, tree1, child1)
+            elif status == XTree.DELETE:
+                self.writeDeleteNode(out, child1)
+            else:
+                self.writeDiffNode(out, child1, self._matchp[1])
+
+            child1 = tree1.getNextSibling(child1)
+
+        # Children that only exist in the new element.
+        child2 = tree2.getFirstChild(node2)
+        while child2 > 0:
+            tree2.getMatching(child2, self._matchp)
+            if self._matchp[0] == XTree.INSERT:
+                self.writeInsertNode(out, child2)
+
+            child2 = tree2.getNextSibling(child2)
+
+        if self._needNewLine:
+            out.write("\n")
+            self._needNewLine = False
+
+        out.write("</" + tag + ">\n")
+
+
+ # Construct the text node -- to handle the possible CDATA sections.
+
+    def constructText(self, xtree, eid):
+        """Return the text of a node with its CDATA sections restored.
+
+        xtree -- the document tree the node belongs to
+        eid   -- node id
+
+        ``xtree.getCDATA(eid)`` is read as a flat sequence of
+        (start, end) offsets into the text; every such range is wrapped in
+        ``<![CDATA[ ... ]]>``.  When it returns None the plain text is
+        returned unchanged.
+        """
+        text = xtree.getText(eid)
+        cdatalist = xtree.getCDATA(eid)
+        if cdatalist is None:
+            return text
+
+        pieces = []
+        lastEnd = 0
+
+        # Offsets come in pairs: start at even, end at odd positions.
+        for i in range(0, len(cdatalist), 2):
+            cdataStart = int(cdatalist[i])
+            cdataEnd = int(cdatalist[i + 1])
+
+            # Plain text between the previous section and this one.
+            if cdataStart > lastEnd:
+                pieces.append(text[lastEnd:cdataStart])
+            pieces.append("<![CDATA[" +
+                          text[cdataStart:cdataEnd] +
+                          "]]>")
+            lastEnd = cdataEnd
+
+        # Plain text after the last section.
+        if lastEnd < len(text):
+            pieces.append(text[lastEnd:])
+
+        return "".join(pieces)
+
+def readParameters(args, parameters):
+    """Parse the command-line options and collect the three file names.
+
+    args       -- the arguments, without the program name:
+                  [-o|-g] [-p percent] [-e encoding] file1 file2 result
+    parameters -- list that receives the three file names, in order
+
+    Returns True when the arguments are valid, False otherwise.
+    """
+    argCount = len(args)
+    if argCount < 3:
+        return False
+
+    opid = 0
+    # NOTE(review): the flags are stored on the XDiff class, like
+    # _NO_MATCH_THRESHOLD below; this assumes XDiff reads _oFlag, _gFlag
+    # and _encoding as class-level attributes -- confirm.
+    if args[0] == "-o":
+        XDiff._oFlag = True
+        opid += 1
+    elif args[0] == "-g":
+        XDiff._gFlag = True
+        opid += 1
+
+    if args[opid] == "-p":
+        opid += 1
+        try:
+            p = float(args[opid])
+        except ValueError:
+            return False
+        opid += 1
+
+        # Written this way so that NaN is rejected as well.
+        if not (0 < p <= 1):
+            return False
+        XDiff._NO_MATCH_THRESHOLD = p
+
+    if (opid < argCount) and (args[opid] == "-e"):
+        opid += 1
+        if opid >= argCount:
+            return False
+        XDiff._encoding = args[opid]
+        opid += 1
+
+    # Exactly the three file names must be left.
+    if (argCount - opid) != 3:
+        return False
+    parameters.extend(args[opid:opid + 3])
+
+    return True
+
+if __name__ == "__main__":
+    parameters = []
+    # sys.argv[0] is the script name; only the real arguments are parsed.
+    if not readParameters(sys.argv[1:], parameters):
+        # Show the usage text and stop; ``return`` is not valid at module
+        # level.
+        sys.stderr.write(__doc__ + "\n")
+        sys.exit(1)
+
+    mydiff = XDiff(parameters[0], parameters[1], parameters[2])
\ No newline at end of file
diff --git a/XHash.py b/XHash.py
new file mode 100644
index 0000000..f0c1554
--- /dev/null
+++ b/XHash.py
@@ -0,0 +1,310 @@
+# Copyright (c) 2001 - 2005
+# Yuan Wang. All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Redistributions in any form must be accompanied by information on
+# how to obtain complete source code for the X-Diff software and any
+# accompanying software that uses the X-Diff software. The source code
+# must either be included in the distribution or be available for no
+# more than the cost of distribution plus a nominal fee, and must be
+# freely redistributable under reasonable conditions. For an executable
+# file, complete source code means the source code for all modules it
+# contains. It does not include source code for modules or files that
+# typically accompany the major components of the operating system on
+# which the executable file runs.
+
+# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+# <code>XHash</code> is an implementation of DES
+class XHash:
+    """String hash built on a DES-style block cipher.
+
+    This is a Python port of the Java original, which relied on the
+    wrap-around of 32-bit ``int`` and 64-bit ``long`` values.  Python
+    integers do not wrap, so the code masks explicitly wherever the Java
+    code truncated.  Call ``initialize()`` to (re)build the key schedule
+    and ``hash(word)`` to hash a string; ``hash`` builds the default key
+    schedule on first use if ``initialize`` was never called.
+    """
+
+    _MASK28 = 0xfffffff
+    _MASK32 = 0xffffffff
+    _MASK64 = 0xffffffffffffffff
+
+    _initialPermutation = (
+        57, 49, 41, 33, 25, 17, 9, 1, 59, 51, 43, 35, 27, 19, 11, 3,
+        61, 53, 45, 37, 29, 21, 13, 5, 63, 55, 47, 39, 31, 23, 15, 7,
+        56, 48, 40, 32, 24, 16, 8, 0, 58, 50, 42, 34, 26, 18, 10, 2,
+        60, 52, 44, 36, 28, 20, 12, 4, 62, 54, 46, 38, 30, 22, 14, 6,
+    )
+
+    _finalPermutation = (
+        39, 7, 47, 15, 55, 23, 63, 31, 38, 6, 46, 14, 54, 22, 62, 30,
+        37, 5, 45, 13, 53, 21, 61, 29, 36, 4, 44, 12, 52, 20, 60, 28,
+        35, 3, 43, 11, 51, 19, 59, 27, 34, 2, 42, 10, 50, 18, 58, 26,
+        33, 1, 41, 9, 49, 17, 57, 25, 32, 0, 40, 8, 48, 16, 56, 24,
+    )
+
+    _keyReducePermutation = (
+        60, 52, 44, 36, 59, 51, 43, 35, 27, 19, 11, 3, 58, 50,
+        42, 34, 26, 18, 10, 2, 57, 49, 41, 33, 25, 17, 9, 1,
+        28, 20, 12, 4, 61, 53, 45, 37, 29, 21, 13, 5, 62, 54,
+        46, 38, 30, 22, 14, 6, 63, 55, 47, 39, 31, 23, 15, 7,
+    )
+
+    _keyCompressPermutation = (
+        24, 27, 20, 6, 14, 10, 3, 22, 0, 17, 7, 12,
+        8, 23, 11, 5, 16, 26, 1, 9, 19, 25, 4, 15,
+        54, 43, 36, 29, 49, 40, 48, 30, 52, 44, 37, 33,
+        46, 35, 50, 41, 28, 53, 51, 55, 32, 45, 39, 42,
+    )
+
+    _keyRot = (
+        1, 2, 4, 6, 8, 10, 12, 14, 15, 17, 19, 21, 23, 25, 27, 28,
+    )
+
+    # Eight S-boxes of 64 entries each, stored as unsigned 32-bit values.
+    _sBoxP = (
+        (
+            0x00808200, 0x00000000, 0x00008000, 0x00808202,
+            0x00808002, 0x00008202, 0x00000002, 0x00008000,
+            0x00000200, 0x00808200, 0x00808202, 0x00000200,
+            0x00800202, 0x00808002, 0x00800000, 0x00000002,
+            0x00000202, 0x00800200, 0x00800200, 0x00008200,
+            0x00008200, 0x00808000, 0x00808000, 0x00800202,
+            0x00008002, 0x00800002, 0x00800002, 0x00008002,
+            0x00000000, 0x00000202, 0x00008202, 0x00800000,
+            0x00008000, 0x00808202, 0x00000002, 0x00808000,
+            0x00808200, 0x00800000, 0x00800000, 0x00000200,
+            0x00808002, 0x00008000, 0x00008200, 0x00800002,
+            0x00000200, 0x00000002, 0x00800202, 0x00008202,
+            0x00808202, 0x00008002, 0x00808000, 0x00800202,
+            0x00800002, 0x00000202, 0x00008202, 0x00808200,
+            0x00000202, 0x00800200, 0x00800200, 0x00000000,
+            0x00008002, 0x00008200, 0x00000000, 0x00808002,
+        ),
+        (
+            0x40084010, 0x40004000, 0x00004000, 0x00084010,
+            0x00080000, 0x00000010, 0x40080010, 0x40004010,
+            0x40000010, 0x40084010, 0x40084000, 0x40000000,
+            0x40004000, 0x00080000, 0x00000010, 0x40080010,
+            0x00084000, 0x00080010, 0x40004010, 0x00000000,
+            0x40000000, 0x00004000, 0x00084010, 0x40080000,
+            0x00080010, 0x40000010, 0x00000000, 0x00084000,
+            0x00004010, 0x40084000, 0x40080000, 0x00004010,
+            0x00000000, 0x00084010, 0x40080010, 0x00080000,
+            0x40004010, 0x40080000, 0x40084000, 0x00004000,
+            0x40080000, 0x40004000, 0x00000010, 0x40084010,
+            0x00084010, 0x00000010, 0x00004000, 0x40000000,
+            0x00004010, 0x40084000, 0x00080000, 0x40000010,
+            0x00080010, 0x40004010, 0x40000010, 0x00080010,
+            0x00084000, 0x00000000, 0x40004000, 0x00004010,
+            0x40000000, 0x40080010, 0x40084010, 0x00084000,
+        ),
+        (
+            0x00000104, 0x04010100, 0x00000000, 0x04010004,
+            0x04000100, 0x00000000, 0x00010104, 0x04000100,
+            0x00010004, 0x04000004, 0x04000004, 0x00010000,
+            0x04010104, 0x00010004, 0x04010000, 0x00000104,
+            0x04000000, 0x00000004, 0x04010100, 0x00000100,
+            0x00010100, 0x04010000, 0x04010004, 0x00010104,
+            0x04000104, 0x00010100, 0x00010000, 0x04000104,
+            0x00000004, 0x04010104, 0x00000100, 0x04000000,
+            0x04010100, 0x04000000, 0x00010004, 0x00000104,
+            0x00010000, 0x04010100, 0x04000100, 0x00000000,
+            0x00000100, 0x00010004, 0x04010104, 0x04000100,
+            0x04000004, 0x00000100, 0x00000000, 0x04010004,
+            0x04000104, 0x00010000, 0x04000000, 0x04010104,
+            0x00000004, 0x00010104, 0x00010100, 0x04000004,
+            0x04010000, 0x04000104, 0x00000104, 0x04010000,
+            0x00010104, 0x00000004, 0x04010004, 0x00010100,
+        ),
+        (
+            0x80401000, 0x80001040, 0x80001040, 0x00000040,
+            0x00401040, 0x80400040, 0x80400000, 0x80001000,
+            0x00000000, 0x00401000, 0x00401000, 0x80401040,
+            0x80000040, 0x00000000, 0x00400040, 0x80400000,
+            0x80000000, 0x00001000, 0x00400000, 0x80401000,
+            0x00000040, 0x00400000, 0x80001000, 0x00001040,
+            0x80400040, 0x80000000, 0x00001040, 0x00400040,
+            0x00001000, 0x00401040, 0x80401040, 0x80000040,
+            0x00400040, 0x80400000, 0x00401000, 0x80401040,
+            0x80000040, 0x00000000, 0x00000000, 0x00401000,
+            0x00001040, 0x00400040, 0x80400040, 0x80000000,
+            0x80401000, 0x80001040, 0x80001040, 0x00000040,
+            0x80401040, 0x80000040, 0x80000000, 0x00001000,
+            0x80400000, 0x80001000, 0x00401040, 0x80400040,
+            0x80001000, 0x00001040, 0x00400000, 0x80401000,
+            0x00000040, 0x00400000, 0x00001000, 0x00401040,
+        ),
+        (
+            0x00000080, 0x01040080, 0x01040000, 0x21000080,
+            0x00040000, 0x00000080, 0x20000000, 0x01040000,
+            0x20040080, 0x00040000, 0x01000080, 0x20040080,
+            0x21000080, 0x21040000, 0x00040080, 0x20000000,
+            0x01000000, 0x20040000, 0x20040000, 0x00000000,
+            0x20000080, 0x21040080, 0x21040080, 0x01000080,
+            0x21040000, 0x20000080, 0x00000000, 0x21000000,
+            0x01040080, 0x01000000, 0x21000000, 0x00040080,
+            0x00040000, 0x21000080, 0x00000080, 0x01000000,
+            0x20000000, 0x01040000, 0x21000080, 0x20040080,
+            0x01000080, 0x20000000, 0x21040000, 0x01040080,
+            0x20040080, 0x00000080, 0x01000000, 0x21040000,
+            0x21040080, 0x00040080, 0x21000000, 0x21040080,
+            0x01040000, 0x00000000, 0x20040000, 0x21000000,
+            0x00040080, 0x01000080, 0x20000080, 0x00040000,
+            0x00000000, 0x20040000, 0x01040080, 0x20000080,
+        ),
+        (
+            0x10000008, 0x10200000, 0x00002000, 0x10202008,
+            0x10200000, 0x00000008, 0x10202008, 0x00200000,
+            0x10002000, 0x00202008, 0x00200000, 0x10000008,
+            0x00200008, 0x10002000, 0x10000000, 0x00002008,
+            0x00000000, 0x00200008, 0x10002008, 0x00002000,
+            0x00202000, 0x10002008, 0x00000008, 0x10200008,
+            0x10200008, 0x00000000, 0x00202008, 0x10202000,
+            0x00002008, 0x00202000, 0x10202000, 0x10000000,
+            0x10002000, 0x00000008, 0x10200008, 0x00202000,
+            0x10202008, 0x00200000, 0x00002008, 0x10000008,
+            0x00200000, 0x10002000, 0x10000000, 0x00002008,
+            0x10000008, 0x10202008, 0x00202000, 0x10200000,
+            0x00202008, 0x10202000, 0x00000000, 0x10200008,
+            0x00000008, 0x00002000, 0x10200000, 0x00202008,
+            0x00002000, 0x00200008, 0x10002008, 0x00000000,
+            0x10202000, 0x10000000, 0x00200008, 0x10002008,
+        ),
+        (
+            0x00100000, 0x02100001, 0x02000401, 0x00000000,
+            0x00000400, 0x02000401, 0x00100401, 0x02100400,
+            0x02100401, 0x00100000, 0x00000000, 0x02000001,
+            0x00000001, 0x02000000, 0x02100001, 0x00000401,
+            0x02000400, 0x00100401, 0x00100001, 0x02000400,
+            0x02000001, 0x02100000, 0x02100400, 0x00100001,
+            0x02100000, 0x00000400, 0x00000401, 0x02100401,
+            0x00100400, 0x00000001, 0x02000000, 0x00100400,
+            0x02000000, 0x00100400, 0x00100000, 0x02000401,
+            0x02000401, 0x02100001, 0x02100001, 0x00000001,
+            0x00100001, 0x02000000, 0x02000400, 0x00100000,
+            0x02100400, 0x00000401, 0x00100401, 0x02100400,
+            0x00000401, 0x02000001, 0x02100401, 0x02100000,
+            0x00100400, 0x00000000, 0x00000001, 0x02100401,
+            0x00000000, 0x00100401, 0x02100000, 0x00000400,
+            0x02000001, 0x02000400, 0x00000400, 0x00100001,
+        ),
+        (
+            0x08000820, 0x00000800, 0x00020000, 0x08020820,
+            0x08000000, 0x08000820, 0x00000020, 0x08000000,
+            0x00020020, 0x08020000, 0x08020820, 0x00020800,
+            0x08020800, 0x00020820, 0x00000800, 0x00000020,
+            0x08020000, 0x08000020, 0x08000800, 0x00000820,
+            0x00020800, 0x00020020, 0x08020020, 0x08020800,
+            0x00000820, 0x00000000, 0x00000000, 0x08020020,
+            0x08000020, 0x08000800, 0x00020820, 0x00020000,
+            0x00020820, 0x00020000, 0x08020800, 0x00000800,
+            0x00000020, 0x08020020, 0x00000800, 0x00020820,
+            0x08000800, 0x00000020, 0x08000020, 0x08020000,
+            0x08020020, 0x08000000, 0x00020000, 0x08000820,
+            0x00000000, 0x08020820, 0x00020020, 0x08000020,
+            0x08020000, 0x08000800, 0x08000820, 0x00000000,
+            0x08020820, 0x00020800, 0x00020800, 0x00000820,
+            0x00000820, 0x00020020, 0x08000000, 0x08020800,
+        ),
+    )
+
+    # Key schedule (16 round keys); None until initialize() has run.
+    _keys = None
+
+    _initialKey = 1007360890380
+
+    @staticmethod
+    def _signed(value, bits):
+        """Reinterpret the low ``bits`` bits of ``value`` as a signed integer."""
+        value &= (1 << bits) - 1
+        if value >> (bits - 1):
+            value -= 1 << bits
+        return value
+
+    @classmethod
+    def initialize(cls, key=None):
+        """Build the key schedule.
+
+        @param key 64-bit key; the built-in default key is used when omitted
+        """
+        if key is None:
+            key = cls._initialKey
+        cls.makeKeys(key)
+
+    @classmethod
+    def hash(cls, word):
+        """Return the hash of ``word`` as a signed 64-bit integer.
+
+        The string is processed in chunks of 64 characters; the empty string
+        hashes to 0.
+
+        @param word string to hash
+        """
+        if cls._keys is None:
+            cls.initialize()
+        value = 0
+        for start in range(0, len(word), 64):
+            value += cls._hash(word[start:start + 64]) ^ cls._MASK32
+        return cls._signed(value, 64)
+
+    @classmethod
+    def _hash(cls, word):
+        """Hash one chunk (at most 64 characters) to a signed 32-bit value."""
+        codes = [ord(ch) for ch in word]
+        rest = len(codes) % 8
+        if rest > 0:
+            # Pad with zeros up to a multiple of eight characters.
+            codes.extend([0] * (8 - rest))
+
+        value = 0
+        for pos in range(0, len(codes), 8):
+            todo = 0
+            for code in codes[pos:pos + 8]:
+                # Java cast each char to a (signed) byte before adding it.
+                todo = (todo << 8) + cls._signed(code, 8)
+            value += cls.des(todo & cls._MASK64)
+        # The Java accumulator was a 32-bit int.
+        return cls._signed(value, 32)
+
+    @classmethod
+    def makeKeys(cls, key):
+        """Derive the 16 round keys from ``key`` and store them in ``_keys``."""
+        reduced = cls.permutate(key, cls._keyReducePermutation)
+        l = reduced >> 28
+        r = reduced & cls._MASK28
+        cls._keys = [
+            cls.permutate(cls.rotate(l, r, s), cls._keyCompressPermutation)
+            for s in cls._keyRot
+        ]
+
+    @classmethod
+    def des(cls, w):
+        """Encrypt the 64-bit block ``w``; returns an unsigned 64-bit value."""
+        x = cls.permutate(w, cls._initialPermutation)
+        l = (x >> 32) & cls._MASK32
+        r = x & cls._MASK32
+        for key in cls._keys:
+            l, r = r, cls.desFunc(r, key) ^ l
+        y = (r << 32) | l
+        return cls.permutate(y, cls._finalPermutation)
+
+    @staticmethod
+    def permutate(k, p):
+        """Return the value whose bit ``i`` is bit ``p[i]`` of ``k``."""
+        s = 0
+        for i, source in enumerate(p):
+            if k & (1 << source):
+                s |= 1 << i
+        return s
+
+    @classmethod
+    def rotate(cls, l, r, s):
+        """Rotate both 28-bit key halves left by ``s`` and join them."""
+        mask = cls._MASK28
+        left = ((l << s) & mask) | (l >> (28 - s))
+        return (left << 28) | ((r << s) & mask) | (r >> (28 - s))
+
+    @classmethod
+    def desFunc(cls, x, k):
+        """Round function: mix the 32-bit half ``x`` with round key ``k``."""
+        box = cls._sBoxP
+        p = x >> 27
+        q = (p & 3) << 4
+        r = (x << 5) & cls._MASK32
+        p |= r
+        r = box[0][((k >> 42) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[7][((k >> 0) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[6][((k >> 6) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[5][((k >> 12) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[4][((k >> 18) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[3][((k >> 24) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[2][((k >> 30) ^ p) & 0x3f]
+        p >>= 4
+        r |= box[1][((k >> 36) ^ (p | q)) & 0x3f]
+        return r
diff --git a/XLut.py b/XLut.py
new file mode 100644
index 0000000..98dc5b6
--- /dev/null
+++ b/XLut.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2001 - 2005
+# Yuan Wang. All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Redistributions in any form must be accompanied by information on
+# how to obtain complete source code for the X-Diff software and any
+# accompanying software that uses the X-Diff software. The source code
+# must either be included in the distribution or be available for no
+# more than the cost of distribution plus a nominal fee, and must be
+# freely redistributable under reasonable conditions. For an executable
+# file, complete source code means the source code for all modules it
+# contains. It does not include source code for modules or files that
+# typically accompany the major components of the operating system on
+# which the executable file runs.
+
+# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+import XTree
+
+# <code>XLut</code> is the hash lookup table for node distance.
+class XLut:
+    """Lookup table that remembers the distance between pairs of nodes."""
+
+    def __init__(self):
+        """Create an empty table."""
+        # Maps (eid1, eid2) -> distance.
+        self._xTable = {}
+
+    def add(self, eid1, eid2, dist):
+        """Record the distance of a node pair.
+
+        @param eid1 element id #1
+        @param eid2 element id #2
+        @param dist distance
+        """
+        self._xTable[(eid1, eid2)] = int(dist)
+
+    def get(self, eid1, eid2):
+        """Return the distance of a node pair.
+
+        @param eid1 element id #1
+        @param eid2 element id #2
+        @return the recorded distance, or XTree.NO_CONNECTION when the pair
+                was never added
+        """
+        # dict.get() avoids the KeyError a plain subscript raises on a miss.
+        return self._xTable.get((eid1, eid2), XTree.NO_CONNECTION)
diff --git a/XParser.py b/XParser.py
new file mode 100644
index 0000000..dcfedc4
--- /dev/null
+++ b/XParser.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2001 - 2005
+# Yuan Wang. All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Redistributions in any form must be accompanied by information on
+# how to obtain complete source code for the X-Diff software and any
+# accompanying software that uses the X-Diff software. The source code
+# must either be included in the distribution or be available for no
+# more than the cost of distribution plus a nominal fee, and must be
+# freely redistributable under reasonable conditions. For an executable
+# file, complete source code means the source code for all modules it
+# contains. It does not include source code for modules or files that
+# typically accompany the major components of the operating system on
+# which the executable file runs.
+
+# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+import xml.sax
+import xml.sax.handler
+
+import XTree
+from XHash import XHash
+# NOTE(review): class name of the Java (Xerces) SAX parser, carried over from
+# the Java original; presumably not usable with Python's xml.sax -- confirm.
+_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";
+
+# FIXME
+# This is interesting
+# http://www.virtuousprogrammer.com/?page_id=183
+# http://docs.python.org/library/xml.sax.reader.html
+# <code>XParser</code> parses an input XML document and constructs an
+# <code>XTree</code>
+# class XParser extends DefaultHandler implements LexicalHandler
+class XParser(xml.sax.handler.ContentHandler):
+    """SAX handler that turns an XML document into an ``XTree``.
+
+    While the document is read, a stack mirrors the chain of open elements.
+    Every stack level keeps the element id, the id of the last child added
+    so far (the "left sibling" of the next child) and the running hash of
+    the element.  Hash arithmetic wraps at 64 bits, as it did in the Java
+    original.
+    """
+
+    # Features requested from the SAX driver.
+    _setValidation = False
+    _setNameSpaces = True
+    # NOTE(review): the next three switches were Xerces-specific; they are
+    # kept for reference only and are not passed to the Python SAX driver.
+    _setSchemaSupport = True
+    _setSchemaFullSupport = False
+    _setNameSpacePrefixes = True
+
+    # Kept from the Java original; the Python stacks grow on demand.
+    _STACK_SIZE = 100
+
+    # Java's String.trim() strips every character up to and including U+0020.
+    _TRIM_CHARS = "".join(chr(c) for c in range(33))
+
+    def __init__(self):
+        """Create the SAX parser and register this object as its handler.
+
+        Like the Java original, a parser that cannot be configured is
+        reported on stderr and terminates the program.
+        """
+        xml.sax.handler.ContentHandler.__init__(self)
+        XHash.initialize()
+        try:
+            self._parser = xml.sax.make_parser()
+            self._parser.setFeature(xml.sax.handler.feature_validation,
+                                    self._setValidation)
+            self._parser.setFeature(xml.sax.handler.feature_namespaces,
+                                    self._setNameSpaces)
+            self._parser.setContentHandler(self)
+            # The lexical handler receives the CDATA/comment/DTD events.
+            self._parser.setProperty(
+                xml.sax.handler.property_lexical_handler, self)
+        except xml.sax.SAXException as e:
+            sys.stderr.write("%s\n" % e)
+            sys.exit(1)
+
+        self._xtree = None
+        self._resetStacks()
+
+    def _resetStacks(self):
+        """Put the element stack and the text buffer in their start state."""
+        self._idStack = [XTree.NULL_NODE]
+        self._lsidStack = [XTree.NULL_NODE]
+        self._valueStack = [0]
+        self._currentNodeID = XTree.NULL_NODE
+        self._readElement = False
+        self._elementBuffer = []
+
+    @staticmethod
+    def _wrap64(value):
+        """Reduce ``value`` to a signed 64-bit integer (Java ``long``)."""
+        value &= 0xffffffffffffffff
+        if value >> 63:
+            value -= 1 << 64
+        return value
+
+    def parse(self, uri):
+        """Parse an XML document.
+
+        A document that cannot be read or parsed is reported on stderr and
+        terminates the program, as in the Java original.
+
+        @param uri input XML document
+        @return the created XTree
+        """
+        self._xtree = XTree.XTree()
+        self._resetStacks()
+
+        try:
+            self._parser.parse(uri)
+        except (IOError, xml.sax.SAXException) as e:
+            sys.stderr.write("%s\n" % e)
+            sys.exit(1)
+
+        return self._xtree
+
+    # Helpers shared by the handler methods
+
+    def _addTextNode(self, text):
+        """Add ``text`` as a child of the innermost open element.
+
+        The text hash is folded into the running hash of that element and
+        the new node becomes both the current node and the left sibling of
+        whatever is added next.
+
+        @return id of the new text node
+        """
+        value = 0
+        if text:
+            value = XHash.hash(text)
+        tid = self._xtree.addText(self._idStack[-1], self._lsidStack[-1],
+                                  text, value)
+        self._lsidStack[-1] = tid
+        self._currentNodeID = tid
+        self._valueStack[-1] = self._wrap64(self._valueStack[-1] + value)
+        return tid
+
+    def _openElement(self, local, attributes):
+        """Handle a start tag.
+
+        @param local      element name
+        @param attributes list of (name, value) pairs
+        """
+        # Text mixed with elements: flush what was collected before the tag.
+        pending = "".join(self._elementBuffer).strip(self._TRIM_CHARS)
+        if pending:
+            self._addTextNode(pending)
+
+        eid = self._xtree.addElement(self._idStack[-1], self._lsidStack[-1],
+                                     local)
+        # Update last sibling info of the parent.
+        self._lsidStack[-1] = eid
+
+        # Push
+        self._idStack.append(eid)
+        self._lsidStack.append(XTree.NULL_NODE)
+        self._valueStack.append(XHash.hash(local))
+        self._currentNodeID = eid
+
+        # Take care of attributes
+        for name, value in attributes:
+            namehash = XHash.hash(name)
+            valuehash = XHash.hash(value)
+            attrhash = self._wrap64(namehash * namehash +
+                                    valuehash * valuehash)
+            # NOTE(review): the original passes the NAME hash in the
+            # "valuehash" position of addAttribute(); kept unchanged --
+            # confirm whether valuehash was intended.
+            aid = self._xtree.addAttribute(eid, self._lsidStack[-1], name,
+                                           value, namehash, attrhash)
+            self._lsidStack[-1] = aid
+            # aid + 1 is the text node that holds the attribute value.
+            self._currentNodeID = aid + 1
+            self._valueStack[-1] = self._wrap64(self._valueStack[-1] +
+                                                attrhash * attrhash)
+
+        self._readElement = True
+        self._elementBuffer = []
+
+    def _closeElement(self):
+        """Handle an end tag: flush pending text and pop the stack."""
+        text = "".join(self._elementBuffer)
+        if self._readElement:
+            # No child element was seen: the text is kept verbatim, and an
+            # empty element still gets an (empty) text node.
+            self._addTextNode(text)
+            self._readElement = False
+        else:
+            # More text nodes before the end of the element.
+            text = text.strip(self._TRIM_CHARS)
+            if text:
+                self._addTextNode(text)
+        self._elementBuffer = []
+
+        # Pop
+        eid = self._idStack.pop()
+        self._lsidStack.pop()
+        value = self._valueStack.pop()
+
+        self._xtree.addHashValue(eid, value)
+        self._valueStack[-1] = self._wrap64(self._valueStack[-1] +
+                                            value * value)
+        self._lsidStack[-1] = eid
+
+    # Document handler methods
+
+    def startElement(self, name, attrs):
+        """Start tag, reported when namespace processing is off."""
+        self._openElement(name, list(attrs.items()))
+
+    def startElementNS(self, name, qname, attrs):
+        """Start tag, reported when namespace processing is on.
+
+        The element is recorded under its local name, the attributes under
+        their qualified names.
+        """
+        pairs = [(attrs.getQNameByName(key), attrs.getValue(key))
+                 for key in attrs.getNames()]
+        self._openElement(name[1], pairs)
+
+    def characters(self, content):
+        """Collect character data until the next tag."""
+        self._elementBuffer.append(content)
+
+    def endElement(self, name):
+        """End tag, reported when namespace processing is off."""
+        self._closeElement()
+
+    def endElementNS(self, name, qname):
+        """End tag, reported when namespace processing is on."""
+        self._closeElement()
+
+    # End of document handler methods
+
+    # Lexical handler methods.
+
+    def startCDATA(self):
+        """Record where a CDATA section starts in the pending text."""
+        # The text node id should be the one next to the current node id.
+        textid = self._currentNodeID + 1
+        self._xtree.addCDATA(textid, len("".join(self._elementBuffer)))
+
+    def endCDATA(self):
+        """Record where a CDATA section ends in the pending text."""
+        textid = self._currentNodeID + 1
+        self._xtree.addCDATA(textid, len("".join(self._elementBuffer)))
+
+    # Following functions are not implemented.
+
+    def comment(self, content):
+        """Comments are ignored."""
+
+    def startDTD(self, name, publicId, systemId):
+        """The document type declaration is ignored."""
+
+    def endDTD(self):
+        """The document type declaration is ignored."""
+
+    def startEntity(self, name):
+        """Entity boundaries are ignored."""
+
+    def endEntity(self, name):
+        """Entity boundaries are ignored."""
+
+    # End of lexical handler methods.
diff --git a/XTree.py b/XTree.py
new file mode 100644
index 0000000..eef0af9
--- /dev/null
+++ b/XTree.py
@@ -0,0 +1,423 @@
+# Copyright (c) 2001 - 2005
+# Yuan Wang. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Redistributions in any form must be accompanied by information on
+# how to obtain complete source code for the X-Diff software and any
+# accompanying software that uses the X-Diff software. The source code
+# must either be included in the distribution or be available for no
+# more than the cost of distribution plus a nominal fee, and must be
+# freely redistributable under reasonable conditions. For an executable
+# file, complete source code means the source code for all modules it
+# contains. It does not include source code for modules or files that
+# typically accompany the major components of the operating system on
+# which the executable file runs.
+#
+# THIS SOFTWARE IS PROVIDED BY YUAN WANG "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT,
+# ARE DISCLAIMED. IN NO EVENT SHALL YUAN WANG BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# Matching codes stored per node (see XTree.addMatching / XTree.getMatching).
+MATCH = 0
+CHANGE = 1
+# NOTE: NO_MATCH, INSERT, DELETE and NULL_NODE all share the value -1.
+NO_MATCH = -1
+INSERT = -1
+DELETE = -1
+# Id that stands for "no node".
+NULL_NODE = -1
+# Distance reported by XLut.get() for a pair that has no recorded distance.
+NO_CONNECTION = 1048576
+
+# Defaults for XTree._topCap and XTree._botCap; node ids are split into a
+# (row, slot) pair by dividing by the bottom-level capacity.
+_TOP_LEVEL_CAPACITY = 16384
+_BOT_LEVEL_CAPACITY = 4096
+
+
+# <code>XTree</code> provides a DOM-like interface but somehow simplified
+# Ideally, it can be replaced by any other DOM parser output tree structures.
+class XTree:
+# private _topCap, _botCap
+# private _elementIndex, _tagIndex, self._valueCount
+# private self._firstChild[][], self._nextSibling[][]
+# private self._childrenCount[][], _valueIndex[][]
+# private boolean self._isAttribute[][]
+# private self._matching[][]
+# private long self._hashValue[][]
+# private String _value[][]
+# private Hashtable self._tagNames, _cdataTable
+
+    def __init__(self, topcap=None, botcap=None):
+        """Create an empty tree.
+
+        @param topcap top-level capacity; the module default is used when
+                      omitted (or 0)
+        @param botcap bottom-level capacity; the module default is used when
+                      omitted (or 0)
+        """
+        self._topCap = topcap if topcap else _TOP_LEVEL_CAPACITY
+        self._botCap = botcap if botcap else _BOT_LEVEL_CAPACITY
+        self._initialize()
+
+ # Initialization.
+    def _initialize(self):
+        """Reset all tables to the state of an empty tree.
+
+        The per-node tables are two-level: ``_topCap`` rows that start out
+        unallocated (None) and are filled in by ``_expand``.  Row 0 of
+        ``_value`` holds the tag names, so it is allocated right away.
+        """
+        self._root = 0
+        top = self._topCap
+        self._firstChild = [None] * top
+        self._nextSibling = [None] * top
+        self._isAttribute = [None] * top
+        self._valueIndex = [None] * top
+        self._matching = [None] * top
+        self._childrenCount = [None] * top
+        self._hashValue = [None] * top
+        self._value = [None] * top
+
+        self._value[0] = [None] * self._botCap
+        # Maps a tag name to its index in self._value[0].
+        self._tagNames = {}
+
+        # This hashtable is used to record CDATA section info.
+        # The key is the text node id, the value is the list of
+        # (start,end) position pair of each CDATA section.
+        self._cdataTable = {}
+
+        self._elementIndex = -1
+        self._tagIndex = -1
+        # Text values are numbered from _botCap on; row 0 is for tag names.
+        self._valueCount = self._botCap - 1
+
+ # ID Expansion
+    def _expand(self, topid):
+        """Allocate row ``topid`` of every per-node table.
+
+        Each row gets ``_botCap`` slots, preset to the "empty" value of its
+        table.  The top level of a table is grown when it is too short to
+        hold the row.
+
+        @param topid index of the row to allocate
+        """
+        rows = (
+            (self._firstChild, NULL_NODE),
+            (self._nextSibling, NULL_NODE),
+            (self._childrenCount, 0),
+            (self._matching, MATCH),
+            (self._valueIndex, -1),
+            (self._hashValue, 0),
+            (self._isAttribute, False),
+        )
+        for table, fill in rows:
+            if len(table) <= topid:
+                table.extend([None] * (topid + 1 - len(table)))
+            table[topid] = [fill] * self._botCap
+
+ # Start -- methods for constructing a tree.
+ # Add a new element to the tree.
+ # @param pid parent id
+ # @param lsid left-side sibling id
+ # @param tagName element name
+ # @return the element id in the tree.
+    def addElement(self, pid, lsid, tagName):
+        """Add a new element to the tree.
+
+        @param pid     parent id (NULL_NODE for the root)
+        @param lsid    left-side sibling id (NULL_NODE for a first child)
+        @param tagName element name
+        @return the element id in the tree
+        """
+        self._elementIndex += 1
+        eid = self._elementIndex
+
+        topid, botid = divmod(eid, self._botCap)
+        if botid == 0:
+            self._expand(topid)
+
+        # Reuse the index of a tag name that was seen before.
+        tagID = self._tagNames.get(tagName)
+        if tagID is None:
+            self._tagIndex += 1
+            tagID = self._tagIndex
+            self._value[0][tagID] = tagName
+            self._tagNames[tagName] = tagID
+        self._valueIndex[topid][botid] = tagID
+
+        if pid == NULL_NODE:
+            return eid
+
+        ptopid, pbotid = divmod(pid, self._botCap)
+        # parent-child relation or sibling-sibling relation
+        if lsid == NULL_NODE:
+            self._firstChild[ptopid][pbotid] = eid
+        else:
+            ltopid, lbotid = divmod(lsid, self._botCap)
+            self._nextSibling[ltopid][lbotid] = eid
+
+        # update children count
+        self._childrenCount[ptopid][pbotid] += 1
+
+        return eid
+
+ # Add a text node.
+ # @param eid element id
+ # @param lsid the sibling id on the left
+ # @param text text value
+ # @param value hash value
+    def addText(self, eid, lsid, text, value):
+        """Add a text node.
+
+        @param eid   element id of the parent
+        @param lsid  the sibling id on the left (NULL_NODE for a first child)
+        @param text  text value
+        @param value hash value
+        @return the id of the new text node
+        """
+        self._elementIndex += 1
+        tid = self._elementIndex
+        topid, botid = divmod(tid, self._botCap)
+        if botid == 0:
+            self._expand(topid)
+
+        etopid, ebotid = divmod(eid, self._botCap)
+        if lsid == NULL_NODE:
+            self._firstChild[etopid][ebotid] = tid
+        else:
+            ltopid, lbotid = divmod(lsid, self._botCap)
+            self._nextSibling[ltopid][lbotid] = tid
+
+        self._childrenCount[etopid][ebotid] += 1
+        self._hashValue[topid][botid] = value
+
+        self._valueCount += 1
+        vtopid, vbotid = divmod(self._valueCount, self._botCap)
+        if vbotid == 0:
+            # First value of a new row: allocate the row.
+            if len(self._value) <= vtopid:
+                self._value.extend([None] * (vtopid + 1 - len(self._value)))
+            self._value[vtopid] = [None] * self._botCap
+
+        self._value[vtopid][vbotid] = text
+        self._valueIndex[topid][botid] = self._valueCount
+
+        return tid
+
+ # Add an attribute.
+ # @param eid element id
+ # @param lsid the sibling id on the left
+ # @param name attribute name
+ # @param value attribute value
+ # @param valuehash hash value of the value
+ # @param attrhash hash value of the entire attribute
+ # @return the element id of the attribute
+    def addAttribute(self, eid, lsid, name, value, valuehash, attrhash):
+        """Add an attribute as an element node with one text child.
+
+        @param eid       element id
+        @param lsid      the sibling id on the left
+        @param name      attribute name
+        @param value     attribute value
+        @param valuehash hash value of the value
+        @param attrhash  hash value of the entire attribute
+        @return the element id of the attribute
+        """
+        # attribute name first.
+        aid = self.addElement(eid, lsid, name)
+
+        # attribute value second.
+        self.addText(aid, NULL_NODE, value, valuehash)
+
+        # hash value third
+        atopid, abotid = divmod(aid, self._botCap)
+        self._isAttribute[atopid][abotid] = True
+        self._hashValue[atopid][abotid] = attrhash
+
+        return aid
+
+ # Add more information (hash value) to an element node.
+ # @param eid element id
+ # @param value extra hash value
+    def addHashValue(self, eid, value):
+        """Store the hash value of an element node.
+
+        @param eid   element id
+        @param value hash value
+        """
+        # Floor division: "/" yields a float index under true division.
+        self._hashValue[eid // self._botCap][eid % self._botCap] = value
+
+ # Add a CDATA section (either a start or an end) to the CDATA
+ # hashtable, in which each entry should have an even number of
+ # position slots.
+ # @param eid The text node id
+ # @param position the section tag position
+    def addCDATA(self, eid, position):
+        """Record one boundary (start or end) of a CDATA section.
+
+        Positions are appended in call order, so a complete entry holds an
+        even number of positions: start, end, start, end, ...
+
+        @param eid      the text node id
+        @param position the section tag position
+        """
+        # setdefault() creates the list on first use; a plain subscript
+        # would raise KeyError for a node that has no entry yet.
+        self._cdataTable.setdefault(int(eid), []).append(position)
+
+ # Add matching information.
+ # @param eid element id
+ # @param match ?match and matched element id
+    def addMatching(self, eid, match):
+        """Add matching information.
+
+        NO_MATCH and MATCH are stored as they are; any other status stores
+        the matched element id plus one.
+
+        @param eid   element id
+        @param match sequence: match[0] is the status, match[1] the matched
+                     element id
+        """
+        topid, botid = divmod(eid, self._botCap)
+        if match[0] == NO_MATCH:
+            self._matching[topid][botid] = NO_MATCH
+        elif match[0] == MATCH:
+            self._matching[topid][botid] = MATCH
+        else:
+            self._matching[topid][botid] = match[1] + 1
+
+ # End -- methods for constructing a tree.
+
+ # Start -- methods for accessing a tree.
+
+ # Get matching information.
+ # @param eid element id
+ # @param match ?change and matched element id
+    def getMatching(self, eid, match):
+        """Get matching information.
+
+        The result is written into ``match``: match[0] receives NO_MATCH,
+        MATCH or CHANGE; for CHANGE, match[1] receives the matched element id.
+
+        @param eid   element id
+        @param match mutable sequence with at least two slots
+        """
+        mid = self._matching[eid // self._botCap][eid % self._botCap]
+        if mid == NO_MATCH:
+            match[0] = NO_MATCH
+        elif mid == MATCH:
+            match[0] = MATCH
+        else:
+            match[0] = CHANGE
+            # Stored as id + 1 by addMatching().
+            match[1] = mid - 1
+
+ # Get the root element id.
+    def getRoot(self):
+        """Return the id of the root element."""
+        root = self._root
+        return root
+
+ # Get the first child of a node.
+ # @param eid element id
+    def getFirstChild(self, eid):
+        """Get the first child of a node, skipping attribute nodes.
+
+        @param eid element id
+        @return id of the first non-attribute child, or NULL_NODE
+        """
+        cid = self._firstChild[eid // self._botCap][eid % self._botCap]
+        while cid > self._root:
+            ctopid, cbotid = divmod(cid, self._botCap)
+            if not self._isAttribute[ctopid][cbotid]:
+                return cid
+            cid = self._nextSibling[ctopid][cbotid]
+
+        return NULL_NODE
+
+ # Get the next sibling of a node.
+ # @param eid element id
def getNextSibling(self, eid):
    """Return the next-sibling entry stored for the node with id ``eid``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    return self._nextSibling[eid // self._botCap][eid % self._botCap]
+
+ # Get the first attribute of a node.
+ # @param eid element id
def getFirstAttribute(self, eid):
    """Return the first attribute of the node with id ``eid``.

    Only the node's first-child entry is inspected: it is returned when it
    is greater than ``self._root`` and flagged in ``self._isAttribute``.

    @param eid  element id
    @return the attribute id, or NULL_NODE otherwise
    """
    cap = self._botCap
    # Floor division keeps the bucket index an int under true division.
    aid = self._firstChild[eid // cap][eid % cap]
    if aid > self._root and self._isAttribute[aid // cap][aid % cap]:
        return aid
    return NULL_NODE
+
+ # Get the next attribute of a node.
+ # @param aid attribute id
def getNextAttribute(self, aid):
    """Return the attribute that follows attribute ``aid``.

    The next-sibling entry of ``aid`` is returned when it is greater than
    ``self._root`` and flagged in ``self._isAttribute``.

    @param aid  attribute id
    @return the next attribute id, or NULL_NODE otherwise
    """
    cap = self._botCap
    # Floor division keeps the bucket index an int under true division.
    aid1 = self._nextSibling[aid // cap][aid % cap]
    if aid1 > self._root and self._isAttribute[aid1 // cap][aid1 % cap]:
        return aid1
    return NULL_NODE
+
+ # Get the attribute value.
+ # @param aid attribute id
def getAttributeValue(self, aid):
    """Return the value of the attribute with id ``aid``.

    The value is looked up through the attribute's first-child entry: that
    child's value index selects the string in ``self._value``.

    @param aid  attribute id
    @return the stored value, or "" when the value index is not positive
    """
    cap = self._botCap
    # Floor division keeps the bucket index an int under true division.
    cid = self._firstChild[aid // cap][aid % cap]
    index = self._valueIndex[cid // cap][cid % cap]
    if index > 0:
        return self._value[index // cap][index % cap]
    return ""
+
+ # Get the hash value of a node.
+ # @param eid element id
def getHashValue(self, eid):
    """Return the hash value stored for the node with id ``eid``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    return self._hashValue[eid // self._botCap][eid % self._botCap]
+
+ # Get the CDATA section position list of a text node.
+ # @param eid element id
+ # @return position list which is a vector or None if no CDATA
def getCDATA(self, eid):
    """Return the CDATA section position list of a text node.

    @param eid  element id
    @return the position list stored under ``eid``, or None if the node
            has no CDATA entry
    """
    # ``.get`` yields None for an absent key, matching the documented
    # "None if no CDATA" contract; plain indexing would raise KeyError.
    # NOTE(review): assumes _cdataTable is a dict-like mapping -- confirm.
    return self._cdataTable.get(eid)
+
+ # Get the children count of a node.
+ # @param eid element id
def getChildrenCount(self, eid):
    """Return the children count stored for the node with id ``eid``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    return self._childrenCount[eid // self._botCap][eid % self._botCap]
+
+ # Get the number of all descendants of a node.
+ # @param eid element id
def getDecendentsCount(self, eid):
    """Return the number of all descendants of the node with id ``eid``.

    The node's own children count is added to the recursively computed
    counts of every node on its first-child / next-sibling chain.

    @param eid  element id
    @return 0 when the node has no children, otherwise the total count
    """
    cap = self._botCap
    # Floor division keeps the bucket index an int under true division.
    topid = eid // cap
    botid = eid % cap
    count = self._childrenCount[topid][botid]
    if count == 0:
        return 0

    cid = self._firstChild[topid][botid]
    while cid > NULL_NODE:
        count += self.getDecendentsCount(cid)
        cid = self._nextSibling[cid // cap][cid % cap]

    return count
+
+ # Get the value index of a node
+ # @param eid element id
def getValueIndex(self, eid):
    """Return the value index stored for the node with id ``eid``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    return self._valueIndex[eid // self._botCap][eid % self._botCap]
+
+ # Get the value of a leaf node
+ # @param index value index
def getValue(self, index):
    """Return the entry of ``self._value`` selected by ``index``.

    @param index  value index (split into top/bottom parts by _botCap)
    """
    # Floor division keeps the bucket index an int under true division.
    return self._value[index // self._botCap][index % self._botCap]
+
+ # Get the tag of an element node
+ # @param eid element id
def getTag(self, eid):
    """Return the tag of the element node with id ``eid``.

    The node's value index is used directly as an offset into the first
    bucket of ``self._value``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    index = self._valueIndex[eid // self._botCap][eid % self._botCap]
    return self._value[0][index]
+
+ # Get the text value of a leaf node
+ # @param eid element id
def getText(self, eid):
    """Return the text value of the leaf node with id ``eid``.

    @param eid  element id
    @return the stored text when the node's value index is at least
            ``self._botCap``, otherwise ""
    """
    cap = self._botCap
    # Floor division keeps the bucket index an int under true division.
    index = self._valueIndex[eid // cap][eid % cap]
    if index >= cap:
        return self._value[index // cap][index % cap]
    return ""
+
+ # Check if a node is an element node.
+ # @param eid element id
def isElement(self, eid):
    """Return True when the node with id ``eid`` is an element node.

    A node counts as an element when its value index is below
    ``self._botCap``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    vindex = self._valueIndex[eid // self._botCap][eid % self._botCap]
    return vindex < self._botCap
+
+ # Check if a node is an attribute node.
+ # @param eid element id
def isAttribute(self, eid):
    """Return the attribute flag stored for the node with id ``eid``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    return self._isAttribute[eid // self._botCap][eid % self._botCap]
+
+ # Check if a node is a leaf text node.
+ # @param eid element id
def isLeaf(self, eid):
    """Return True when the node with id ``eid`` is a leaf text node.

    A node counts as a leaf when its value index is at least
    ``self._botCap``.

    @param eid  element id
    """
    # Floor division keeps the bucket index an int under true division.
    index = self._valueIndex[eid // self._botCap][eid % self._botCap]
    return index >= self._botCap
+
+ # End -- methods for accessing a tree.
+
+ # For testing purpose.
def dump(self, eid=None):
    """Print the internal table rows of the tree; for testing purposes.

    Each row holds, tab-separated: node id, first child, next sibling,
    attribute flag, children count, hash value, matching entry and the
    string selected by the node's value index.

    @param eid  element id to print a single row for; when None, a header
                line is printed followed by one row for every id from
                ``self._root`` through ``self._elementIndex`` inclusive
    """
    # Compare against None explicitly: a plain truth test would treat the
    # valid node id 0 as "no argument" and dump the whole table instead.
    if eid is None:
        print("eid\tfirstC\tnextS\tattr?\tcCount\thash\tmatch\tvalue")
        node_ids = range(self._root, self._elementIndex + 1)
    else:
        node_ids = (eid,)

    cap = self._botCap
    for node_id in node_ids:
        # Floor division keeps the bucket index an int under true division.
        topid = node_id // cap
        botid = node_id % cap
        vid = self._valueIndex[topid][botid]
        fields = (
            node_id,
            self._firstChild[topid][botid],
            self._nextSibling[topid][botid],
            self._isAttribute[topid][botid],
            self._childrenCount[topid][botid],
            self._hashValue[topid][botid],
            self._matching[topid][botid],
            self._value[vid // cap][vid % cap],
        )
        # Convert every field explicitly: ints and bools cannot be
        # concatenated to str with "+".
        print("\t".join(str(field) for field in fields))