aboutsummaryrefslogtreecommitdiffstats
path: root/plumbing/object/rename_test.go
diff options
context:
space:
mode:
authorMiguel Molina <miguel@erizocosmi.co>2020-04-23 16:51:56 +0200
committerMiguel Molina <miguel@erizocosmi.co>2020-04-23 17:22:39 +0200
commit90696f07f3a64243c6f33963f5267ab98622db9f (patch)
tree29124a96a4a34927fc48038e2961c4461129c8db /plumbing/object/rename_test.go
parentbb3a1bfcc62224001d577b933355661259b80854 (diff)
downloadgo-git-90696f07f3a64243c6f33963f5267ab98622db9f.tar.gz
plumbing: detect renames by hash and similar content in diff tree
This commit implements the rename detection algorithms used in the JGit implementation. Given a list of changes, additions and deletions are extracted and matched in two ways: - By exact hash content: all additions and deletions are grouped by the content hash and paired with the best match based on the file mode and file path. All the files that cannot be paired are kept as regular deletions and additions. - By similar content: a matrix of addition and deletion pairs with all possible combinations is created and scored by how similar the content is between both files as well as how similar the file path is. The pairs with the best score and whose score is equal or greater than a threshold are paired and turned into a rename. All the files that cannot be paired are kept as regular deletions and additions. DiffTree and DiffTreeContext will not return the changes with renames detected for compatibility reasons, although this will change in v6 so that detecting renames is the default behaviour. A new function DiffTreeWithOptions has been added to configure the parameters for the rename detection to control the score threshold, the limit of renames and whether to use similar content detection in the detection. More information: - https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java - https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java - https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java Signed-off-by: Miguel Molina <miguel@erizocosmi.co>
Diffstat (limited to 'plumbing/object/rename_test.go')
-rw-r--r--plumbing/object/rename_test.go490
1 files changed, 490 insertions, 0 deletions
diff --git a/plumbing/object/rename_test.go b/plumbing/object/rename_test.go
new file mode 100644
index 0000000..bf5044d
--- /dev/null
+++ b/plumbing/object/rename_test.go
@@ -0,0 +1,490 @@
+package object
+
+import (
+ "path/filepath"
+ "strings"
+
+ "github.com/go-git/go-git/v5/plumbing"
+ "github.com/go-git/go-git/v5/plumbing/filemode"
+ "github.com/go-git/go-git/v5/storage/memory"
+ . "gopkg.in/check.v1"
+)
+
+type RenameSuite struct {
+ BaseObjectsSuite
+}
+
+var _ = Suite(&RenameSuite{})
+
+func (s *RenameSuite) TestNameSimilarityScore(c *C) {
+ testCases := []struct {
+ a, b string
+ score int
+ }{
+ {"foo/bar.c", "foo/baz.c", 70},
+ {"src/utils/Foo.java", "tests/utils/Foo.java", 64},
+ {"foo/bar/baz.py", "README.md", 0},
+ {"src/utils/something/foo.py", "src/utils/something/other/foo.py", 69},
+ {"src/utils/something/foo.py", "src/utils/yada/foo.py", 63},
+ {"src/utils/something/foo.py", "src/utils/something/other/bar.py", 44},
+ {"src/utils/something/foo.py", "src/utils/something/foo.py", 100},
+ }
+
+ for _, tt := range testCases {
+ c.Assert(nameSimilarityScore(tt.a, tt.b), Equals, tt.score)
+ }
+}
+
+const (
+ pathA = "src/A"
+ pathB = "src/B"
+ pathH = "src/H"
+ pathQ = "src/Q"
+)
+
+func (s *RenameSuite) TestExactRename_OneRename(c *C) {
+ a := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+ b := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+
+ result := detectRenames(c, Changes{a, b}, nil, 1)
+ assertRename(c, b, a, result[0])
+}
+
+func (s *RenameSuite) TestExactRename_DifferentObjects(c *C) {
+ a := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+ h := makeAdd(c, makeFile(c, pathH, filemode.Regular, "foo"))
+ q := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "bar"))
+
+ result := detectRenames(c, Changes{a, h, q}, nil, 3)
+ c.Assert(result[0], DeepEquals, a)
+ c.Assert(result[1], DeepEquals, h)
+ c.Assert(result[2], DeepEquals, q)
+}
+
+func (s *RenameSuite) TestExactRename_OneRenameOneModify(c *C) {
+ c1 := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+ c2 := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+ c3 := makeChange(c,
+ makeFile(c, pathH, filemode.Regular, "bar"),
+ makeFile(c, pathH, filemode.Regular, "bar"),
+ )
+
+ result := detectRenames(c, Changes{c1, c2, c3}, nil, 2)
+ c.Assert(result[0], DeepEquals, c3)
+ assertRename(c, c2, c1, result[1])
+}
+
+func (s *RenameSuite) TestExactRename_ManyRenames(c *C) {
+ c1 := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+ c2 := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+ c3 := makeAdd(c, makeFile(c, pathH, filemode.Regular, "bar"))
+ c4 := makeDelete(c, makeFile(c, pathB, filemode.Regular, "bar"))
+
+ result := detectRenames(c, Changes{c1, c2, c3, c4}, nil, 2)
+ assertRename(c, c4, c3, result[0])
+ assertRename(c, c2, c1, result[1])
+}
+
+func (s *RenameSuite) TestExactRename_MultipleIdenticalDeletes(c *C) {
+ changes := Changes{
+ makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo")),
+ makeDelete(c, makeFile(c, pathB, filemode.Regular, "foo")),
+ makeDelete(c, makeFile(c, pathH, filemode.Regular, "foo")),
+ makeAdd(c, makeFile(c, pathQ, filemode.Regular, "foo")),
+ }
+
+ result := detectRenames(c, changes, nil, 3)
+ assertRename(c, changes[0], changes[3], result[0])
+ c.Assert(result[1], DeepEquals, changes[1])
+ c.Assert(result[2], DeepEquals, changes[2])
+}
+
+func (s *RenameSuite) TestRenameExact_PathBreaksTie(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, "src/com/foo/a.java", filemode.Regular, "foo")),
+ makeDelete(c, makeFile(c, "src/com/foo/b.java", filemode.Regular, "foo")),
+ makeAdd(c, makeFile(c, "c.txt", filemode.Regular, "foo")),
+ makeDelete(c, makeFile(c, "d.txt", filemode.Regular, "foo")),
+ makeAdd(c, makeFile(c, "the_e_file.txt", filemode.Regular, "foo")),
+ }
+
+ // Add out of order to avoid first-match succeeding
+ result := detectRenames(c, Changes{
+ changes[0],
+ changes[3],
+ changes[4],
+ changes[1],
+ changes[2],
+ }, nil, 3)
+ assertRename(c, changes[3], changes[2], result[0])
+ assertRename(c, changes[1], changes[0], result[1])
+ c.Assert(result[2], DeepEquals, changes[4])
+}
+
+func (s *RenameSuite) TestExactRename_OneDeleteManyAdds(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, "src/com/foo/a.java", filemode.Regular, "foo")),
+ makeAdd(c, makeFile(c, "src/com/foo/b.java", filemode.Regular, "foo")),
+ makeAdd(c, makeFile(c, "c.txt", filemode.Regular, "foo")),
+ makeDelete(c, makeFile(c, "d.txt", filemode.Regular, "foo")),
+ }
+
+ result := detectRenames(c, changes, nil, 3)
+ assertRename(c, changes[3], changes[2], result[0])
+ c.Assert(result[1], DeepEquals, changes[0])
+ c.Assert(result[2], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestExactRename_UnstagedFile(c *C) {
+ changes := Changes{
+ makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo")),
+ makeAdd(c, makeFile(c, pathB, filemode.Regular, "foo")),
+ }
+ result := detectRenames(c, changes, nil, 1)
+ assertRename(c, changes[0], changes[1], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_OnePair(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+ makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+ }
+
+ result := detectRenames(c, changes, nil, 1)
+ assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_OneRenameTwoUnrelatedFiles(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+ makeAdd(c, makeFile(c, pathB, filemode.Regular, "some\nsort\nof\ntext\n")),
+ makeDelete(c, makeFile(c, pathH, filemode.Regular, "completely\nunrelated\ntext\n")),
+ }
+
+ result := detectRenames(c, changes, nil, 3)
+ c.Assert(result[0], DeepEquals, changes[2])
+ c.Assert(result[1], DeepEquals, changes[3])
+ assertRename(c, changes[1], changes[0], result[2])
+}
+
+func (s *RenameSuite) TestContentRename_LastByteDifferent(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\na")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo\nbar\nb")),
+ }
+
+ result := detectRenames(c, changes, nil, 1)
+ assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_NewlinesOnly(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, strings.Repeat("\n", 3))),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, strings.Repeat("\n", 4))),
+ }
+
+ result := detectRenames(c, changes, nil, 1)
+ assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_SameContentMultipleTimes(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "a\na\na\na\n")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, "a\na\na\n")),
+ }
+
+ result := detectRenames(c, changes, nil, 1)
+ assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_OnePairRenameScore50(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "ab\nab\nab\nac\nad\nae\n")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, "ac\nab\nab\nab\naa\na0\na1\n")),
+ }
+
+ result := detectRenames(c, changes, &DiffTreeOptions{RenameScore: 50}, 1)
+ assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestNoRenames_SingleByteFiles(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "a")),
+ makeAdd(c, makeFile(c, pathQ, filemode.Regular, "b")),
+ }
+
+ result := detectRenames(c, changes, nil, 2)
+ c.Assert(result[0], DeepEquals, changes[0])
+ c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestNoRenames_EmptyFile(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "")),
+ }
+ result := detectRenames(c, changes, nil, 1)
+ c.Assert(result[0], DeepEquals, changes[0])
+}
+
+func (s *RenameSuite) TestNoRenames_EmptyFile2(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, "blah")),
+ }
+ result := detectRenames(c, changes, nil, 2)
+ c.Assert(result[0], DeepEquals, changes[0])
+ c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestNoRenames_SymlinkAndFile(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "src/dest")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Symlink, "src/dest")),
+ }
+ result := detectRenames(c, changes, nil, 2)
+ c.Assert(result[0], DeepEquals, changes[0])
+ c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestNoRenames_SymlinkAndFileSamePath(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "src/dest")),
+ makeDelete(c, makeFile(c, pathA, filemode.Symlink, "src/dest")),
+ }
+ result := detectRenames(c, changes, nil, 2)
+ c.Assert(result[0], DeepEquals, changes[0])
+ c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestRenameLimit(c *C) {
+ changes := Changes{
+ makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+ makeDelete(c, makeFile(c, pathB, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+ makeAdd(c, makeFile(c, pathH, filemode.Regular, "a\nb\nc\nd\n")),
+ makeDelete(c, makeFile(c, pathQ, filemode.Regular, "a\nb\nc\n")),
+ }
+
+ result := detectRenames(c, changes, &DiffTreeOptions{RenameLimit: 1}, 4)
+ for i, res := range result {
+ c.Assert(res, DeepEquals, changes[i])
+ }
+}
+
+func detectRenames(c *C, changes Changes, opts *DiffTreeOptions, expectedResults int) Changes {
+ result, err := DetectRenames(changes, opts)
+ c.Assert(err, IsNil)
+ c.Assert(result, HasLen, expectedResults)
+ return result
+}
+
+func assertRename(c *C, from, to *Change, rename *Change) {
+ c.Assert(&Change{From: from.From, To: to.To}, DeepEquals, rename)
+}
+
+type SimilarityIndexSuite struct {
+ BaseObjectsSuite
+}
+
+var _ = Suite(&SimilarityIndexSuite{})
+
+func (s *SimilarityIndexSuite) TestScoreFiles(c *C) {
+ tree := s.tree(c, plumbing.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"))
+ binary, err := tree.File("binary.jpg")
+ c.Assert(err, IsNil)
+ binIndex, err := fileSimilarityIndex(binary)
+ c.Assert(err, IsNil)
+
+ long, err := tree.File("json/long.json")
+ c.Assert(err, IsNil)
+ longIndex, err := fileSimilarityIndex(long)
+ c.Assert(err, IsNil)
+
+ short, err := tree.File("json/short.json")
+ c.Assert(err, IsNil)
+ shortIndex, err := fileSimilarityIndex(short)
+ c.Assert(err, IsNil)
+
+ php, err := tree.File("php/crappy.php")
+ c.Assert(err, IsNil)
+ phpIndex, err := fileSimilarityIndex(php)
+ c.Assert(err, IsNil)
+
+ testCases := []struct {
+ src, dst *similarityIndex
+ expectedScore int
+ }{
+ {binIndex, binIndex, 10000}, // same file
+ {shortIndex, longIndex, 32}, // slightly similar files
+ {longIndex, shortIndex, 32}, // same as previous, diff order
+ {shortIndex, phpIndex, 1}, // different files
+ {longIndex, binIndex, 0}, // code vs binary file
+ }
+
+ for _, tt := range testCases {
+ score := tt.src.score(tt.dst, 10000)
+ c.Assert(score, Equals, tt.expectedScore)
+ }
+}
+
+func (s *SimilarityIndexSuite) TestHashContent(c *C) {
+ idx := textIndex(c, "A\n"+
+ "B\n"+
+ "D\n"+
+ "B\n")
+
+ keyA := keyFor(c, "A\n")
+ keyB := keyFor(c, "B\n")
+ keyD := keyFor(c, "D\n")
+
+ c.Assert(keyA, Not(Equals), keyB)
+ c.Assert(keyA, Not(Equals), keyD)
+ c.Assert(keyD, Not(Equals), keyB)
+
+ c.Assert(idx.numHashes, Equals, 3)
+ c.Assert(idx.hashes[findIndex(idx, keyA)].count(), Equals, uint64(2))
+ c.Assert(idx.hashes[findIndex(idx, keyB)].count(), Equals, uint64(4))
+ c.Assert(idx.hashes[findIndex(idx, keyD)].count(), Equals, uint64(2))
+}
+
+func (s *SimilarityIndexSuite) TestCommonSameFiles(c *C) {
+ content := "A\n" +
+ "B\n" +
+ "D\n" +
+ "B\n"
+
+ src := textIndex(c, content)
+ dst := textIndex(c, content)
+
+ c.Assert(src.common(dst), Equals, uint64(8))
+ c.Assert(dst.common(src), Equals, uint64(8))
+
+ c.Assert(src.score(dst, 100), Equals, 100)
+ c.Assert(dst.score(src, 100), Equals, 100)
+}
+
+func (s *SimilarityIndexSuite) TestCommonSameFilesCR(c *C) {
+ content := "A\r\n" +
+ "B\r\n" +
+ "D\r\n" +
+ "B\r\n"
+
+ src := textIndex(c, content)
+ dst := textIndex(c, strings.ReplaceAll(content, "\r", ""))
+
+ c.Assert(src.common(dst), Equals, uint64(8))
+ c.Assert(dst.common(src), Equals, uint64(8))
+
+ c.Assert(src.score(dst, 100), Equals, 100)
+ c.Assert(dst.score(src, 100), Equals, 100)
+}
+
+func (s *SimilarityIndexSuite) TestCommonEmptyFiles(c *C) {
+ src := textIndex(c, "")
+ dst := textIndex(c, "")
+
+ c.Assert(src.common(dst), Equals, uint64(0))
+ c.Assert(dst.common(src), Equals, uint64(0))
+}
+
+func (s *SimilarityIndexSuite) TestCommonTotallyDifferentFiles(c *C) {
+ src := textIndex(c, "A\n")
+ dst := textIndex(c, "D\n")
+
+ c.Assert(src.common(dst), Equals, uint64(0))
+ c.Assert(dst.common(src), Equals, uint64(0))
+}
+
+func (s *SimilarityIndexSuite) TestSimilarity75(c *C) {
+ src := textIndex(c, "A\nB\nC\nD\n")
+ dst := textIndex(c, "A\nB\nC\nQ\n")
+
+ c.Assert(src.common(dst), Equals, uint64(6))
+ c.Assert(dst.common(src), Equals, uint64(6))
+
+ c.Assert(src.score(dst, 100), Equals, 75)
+ c.Assert(dst.score(src, 100), Equals, 75)
+}
+
+func keyFor(c *C, line string) int {
+ idx := newSimilarityIndex()
+ err := idx.hashContent(strings.NewReader(line), int64(len(line)), false)
+ c.Assert(err, IsNil)
+
+ c.Assert(idx.numHashes, Equals, 1)
+ for _, h := range idx.hashes {
+ if h != 0 {
+ return h.key()
+ }
+ }
+
+ return -1
+}
+
+func textIndex(c *C, content string) *similarityIndex {
+ idx := newSimilarityIndex()
+ err := idx.hashContent(strings.NewReader(content), int64(len(content)), false)
+ c.Assert(err, IsNil)
+ return idx
+}
+
+func findIndex(idx *similarityIndex, key int) int {
+ for i, h := range idx.hashes {
+ if h.key() == key {
+ return i
+ }
+ }
+ return -1
+}
+
+func makeFile(c *C, name string, mode filemode.FileMode, content string) *File {
+ obj := new(plumbing.MemoryObject)
+ obj.SetType(plumbing.BlobObject)
+ _, err := obj.Write([]byte(content))
+ c.Assert(err, IsNil)
+ return &File{
+ Name: name,
+ Mode: mode,
+ Blob: Blob{Hash: obj.Hash(), Size: obj.Size(), obj: obj},
+ }
+}
+
+func makeChangeEntry(f *File) ChangeEntry {
+ sto := memory.NewStorage()
+ sto.SetEncodedObject(f.obj)
+ tree := &Tree{s: sto}
+
+ return ChangeEntry{
+ Name: f.Name,
+ Tree: tree,
+ TreeEntry: TreeEntry{
+ Name: filepath.Base(f.Name),
+ Mode: f.Mode,
+ Hash: f.Hash,
+ },
+ }
+}
+
+func makeAdd(c *C, f *File) *Change {
+ return makeChange(c, nil, f)
+}
+
+func makeDelete(c *C, f *File) *Change {
+ return makeChange(c, f, nil)
+}
+
+func makeChange(c *C, from *File, to *File) *Change {
+ if from == nil {
+ return &Change{To: makeChangeEntry(to)}
+ }
+
+ if to == nil {
+ return &Change{From: makeChangeEntry(from)}
+ }
+
+ if from == nil && to == nil {
+ c.Error("cannot make change without from or to")
+ }
+
+ return &Change{From: makeChangeEntry(from), To: makeChangeEntry(to)}
+}