Merge pull request #38 from go-git/feature/difftree-renames

plumbing: detect renames by hash and similar content in diff tree
author: Máximo Cuadros <mcuadros@gmail.com> 2020-04-23 18:36:50 +0200
committer: GitHub <noreply@github.com> 2020-04-23 18:36:50 +0200
commit: 218a744b6995a89f5c322aa58e79138d65392ea6 (patch)
tree: 385575e9f68787661fcd04840f6061b40658ab40 /plumbing/object
parent: 53f87846a196c856e00fe825bc5f29551b2ea524 (diff)
parent: 90696f07f3a64243c6f33963f5267ab98622db9f (diff)
download: go-git-218a744b6995a89f5c322aa58e79138d65392ea6.tar.gz
4 files changed, 1374 insertions, 4 deletions
diff --git a/plumbing/object/change.go b/plumbing/object/change.go
index 4474905..c9d1615 100644
--- a/plumbing/object/change.go
+++ b/plumbing/object/change.go
@@ -18,7 +18,7 @@ type Change struct {
 	To   ChangeEntry
 }
 
-var empty = ChangeEntry{}
+var empty ChangeEntry
 
 // Action returns the kind of action represented by the change, an
 // insertion, a deletion or a modification.
@@ -27,9 +27,11 @@ func (c *Change) Action() (merkletrie.Action, error) {
 		return merkletrie.Action(0),
 			fmt.Errorf("malformed change: empty from and to")
 	}
+
 	if c.From == empty {
 		return merkletrie.Insert, nil
 	}
+
 	if c.To == empty {
 		return merkletrie.Delete, nil
 	}
diff --git a/plumbing/object/difftree.go b/plumbing/object/difftree.go
index 72411a5..7c22227 100644
--- a/plumbing/object/difftree.go
+++ b/plumbing/object/difftree.go
@@ -10,14 +10,62 @@ import (
 
 // DiffTree compares the content and mode of the blobs found via two
 // tree objects.
+// DiffTree does not perform rename detection, use DiffTreeWithOptions
+// instead to detect renames.
 func DiffTree(a, b *Tree) (Changes, error) {
 	return DiffTreeContext(context.Background(), a, b)
 }
 
-// DiffTree compares the content and mode of the blobs found via two
+// DiffTreeContext compares the content and mode of the blobs found via two
 // tree objects. Provided context must be non-nil.
-// An error will be return if context expires
+// An error will be returned if context expires.
 func DiffTreeContext(ctx context.Context, a, b *Tree) (Changes, error) {
+	return DiffTreeWithOptions(ctx, a, b, nil)
+}
+
+// DiffTreeOptions are the configurable options when performing a diff tree.
+type DiffTreeOptions struct {
+	// DetectRenames is whether the diff tree will use rename detection.
+	DetectRenames bool
+	// RenameScore is the threshold to of similarity between files to consider
+	// that a pair of delete and insert are a rename. The number must be
+	// exactly between 0 and 100.
+	RenameScore uint
+	// RenameLimit is the maximum amount of files that can be compared when
+	// detecting renames. The number of comparisons that have to be performed
+	// is equal to the number of deleted files * the number of added files.
+	// That means, that if 100 files were deleted and 50 files were added, 5000
+	// file comparisons may be needed. So, if the rename limit is 50, the number
+	// of both deleted and added needs to be equal or less than 50.
+	// A value of 0 means no limit.
+	RenameLimit uint
+	// OnlyExactRenames performs only detection of exact renames and will not perform
+	// any detection of renames based on file similarity.
+	OnlyExactRenames bool
+}
+
+// DefaultDiffTreeOptions are the default and recommended options for the
+// diff tree.
+var DefaultDiffTreeOptions = &DiffTreeOptions{
+	DetectRenames:    true,
+	RenameScore:      60,
+	RenameLimit:      0,
+	OnlyExactRenames: false,
+}
+
+// DiffTreeWithOptions compares the content and mode of the blobs found
+// via two tree objects with the given options. The provided context
+// must be non-nil.
+// If no options are passed, no rename detection will be performed. The
+// recommended options are DefaultDiffTreeOptions.
+// An error will be returned if the context expires.
+// This function will be deprecated and removed in v6 so the default
+// behaviour of DiffTree is to detect renames.
+func DiffTreeWithOptions(
+	ctx context.Context,
+	a, b *Tree,
+	opts *DiffTreeOptions,
+) (Changes, error) {
 	from := NewTreeRootNode(a)
 	to := NewTreeRootNode(b)
 
@@ -33,5 +81,18 @@ func DiffTreeContext(ctx context.Context, a, b *Tree) (Changes, error) {
 		return nil, err
 	}
 
-	return newChanges(merkletrieChanges)
+	changes, err := newChanges(merkletrieChanges)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts == nil {
+		opts = new(DiffTreeOptions)
+	}
+
+	if opts.DetectRenames {
+		return DetectRenames(changes, opts)
+	}
+
+	return changes, nil
 }
diff --git a/plumbing/object/rename.go b/plumbing/object/rename.go
new file mode 100644
index 0000000..48665eb
--- /dev/null
+++ b/plumbing/object/rename.go
@@ -0,0 +1,817 @@
+package object
+
+import (
+	"errors"
+	"io"
+	"sort"
+	"strings"
+
+	"github.com/go-git/go-git/v5/plumbing"
+	"github.com/go-git/go-git/v5/plumbing/filemode"
+	"github.com/go-git/go-git/v5/utils/ioutil"
+	"github.com/go-git/go-git/v5/utils/merkletrie"
+)
+
+// DetectRenames detects the renames in the given changes on two trees with
+// the given options. It will return the given changes grouping additions and
+// deletions into modifications when possible.
+// If options is nil, the default diff tree options will be used.
+func DetectRenames(
+	changes Changes,
+	opts *DiffTreeOptions,
+) (Changes, error) {
+	if opts == nil {
+		opts = DefaultDiffTreeOptions
+	}
+
+	detector := &renameDetector{
+		renameScore: int(opts.RenameScore),
+		renameLimit: int(opts.RenameLimit),
+		onlyExact:   opts.OnlyExactRenames,
+	}
+
+	for _, c := range changes {
+		action, err := c.Action()
+		if err != nil {
+			return nil, err
+		}
+
+		switch action {
+		case merkletrie.Insert:
+			detector.added = append(detector.added, c)
+		case merkletrie.Delete:
+			detector.deleted = append(detector.deleted, c)
+		default:
+			detector.modified = append(detector.modified, c)
+		}
+	}
+
+	return detector.detect()
+}
+
+// renameDetector will detect and resolve renames in a set of changes.
+// see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+type renameDetector struct {
+	added    []*Change
+	deleted  []*Change
+	modified []*Change
+
+	renameScore int
+	renameLimit int
+	onlyExact   bool
+}
+
+// detectExactRenames detects matches files that were deleted with files that
+// were added where the hash is the same on both. If there are multiple targets
+// the one with the most similar path will be chosen as the rename and the
+// rest as either deletions or additions.
+func (d *renameDetector) detectExactRenames() {
+	added := groupChangesByHash(d.added)
+	deletes := groupChangesByHash(d.deleted)
+	var uniqueAdds []*Change
+	var nonUniqueAdds [][]*Change
+	var addedLeft []*Change
+
+	for _, cs := range added {
+		if len(cs) == 1 {
+			uniqueAdds = append(uniqueAdds, cs[0])
+		} else {
+			nonUniqueAdds = append(nonUniqueAdds, cs)
+		}
+	}
+
+	for _, c := range uniqueAdds {
+		hash := changeHash(c)
+		deleted := deletes[hash]
+
+		if len(deleted) == 1 {
+			if sameMode(c, deleted[0]) {
+				d.modified = append(d.modified, &Change{From: deleted[0].From, To: c.To})
+				delete(deletes, hash)
+			} else {
+				addedLeft = append(addedLeft, c)
+			}
+		} else if len(deleted) > 1 {
+			bestMatch := bestNameMatch(c, deleted)
+			if bestMatch != nil && sameMode(c, bestMatch) {
+				d.modified = append(d.modified, &Change{From: bestMatch.From, To: c.To})
+				delete(deletes, hash)
+
+				var newDeletes = make([]*Change, 0, len(deleted)-1)
+				for _, d := range deleted {
+					if d != bestMatch {
+						newDeletes = append(newDeletes, d)
+					}
+				}
+				deletes[hash] = newDeletes
+			}
+		} else {
+			addedLeft = append(addedLeft, c)
+		}
+	}
+
+	for _, added := range nonUniqueAdds {
+		hash := changeHash(added[0])
+		deleted := deletes[hash]
+
+		if len(deleted) == 1 {
+			deleted := deleted[0]
+			bestMatch := bestNameMatch(deleted, added)
+			if bestMatch != nil && sameMode(deleted, bestMatch) {
+				d.modified = append(d.modified, &Change{From: deleted.From, To: bestMatch.To})
+				delete(deletes, hash)
+
+				for _, c := range added {
+					if c != bestMatch {
+						addedLeft = append(addedLeft, c)
+					}
+				}
+			} else {
+				addedLeft = append(addedLeft, added...)
+			}
+		} else if len(deleted) > 1 {
+			maxSize := len(deleted) * len(added)
+			if d.renameLimit > 0 && d.renameLimit < maxSize {
+				maxSize = d.renameLimit
+			}
+
+			matrix := make(similarityMatrix, 0, maxSize)
+
+			for delIdx, del := range deleted {
+				deletedName := changeName(del)
+
+				for addIdx, add := range added {
+					addedName := changeName(add)
+
+					score := nameSimilarityScore(addedName, deletedName)
+					matrix = append(matrix, similarityPair{added: addIdx, deleted: delIdx, score: score})
+
+					if len(matrix) >= maxSize {
+						break
+					}
+				}
+
+				if len(matrix) >= maxSize {
+					break
+				}
+			}
+
+			sort.Stable(matrix)
+
+			usedAdds := make(map[*Change]struct{})
+			usedDeletes := make(map[*Change]struct{})
+			for i := len(matrix) - 1; i >= 0; i-- {
+				del := deleted[matrix[i].deleted]
+				add := added[matrix[i].added]
+
+				if add == nil || del == nil {
+					// it was already matched
+					continue
+				}
+
+				usedAdds[add] = struct{}{}
+				usedDeletes[del] = struct{}{}
+				d.modified = append(d.modified, &Change{From: del.From, To: add.To})
+				added[matrix[i].added] = nil
+				deleted[matrix[i].deleted] = nil
+			}
+
+			for _, c := range added {
+				if _, ok := usedAdds[c]; !ok {
+					addedLeft = append(addedLeft, c)
+				}
+			}
+
+			var newDeletes = make([]*Change, 0, len(deleted)-len(usedDeletes))
+			for _, c := range deleted {
+				if _, ok := usedDeletes[c]; !ok {
+					newDeletes = append(newDeletes, c)
+				}
+			}
+			deletes[hash] = newDeletes
+		} else {
+			addedLeft = append(addedLeft, added...)
+		}
+	}
+
+	d.added = addedLeft
+	d.deleted = nil
+	for _, dels := range deletes {
+		for _, del := range dels {
+			if del != nil {
+				d.deleted = append(d.deleted, del)
+			}
+		}
+	}
+}
+
+// detectContentRenames detects renames based on the similarity of the content
+// in the files by building a matrix of pairs between sources and destinations
+// and matching by the highest score.
+// see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+func (d *renameDetector) detectContentRenames() error {
+	cnt := max(len(d.added), len(d.deleted))
+	if d.renameLimit > 0 && cnt > d.renameLimit {
+		return nil
+	}
+
+	srcs, dsts := d.deleted, d.added
+	matrix, err := buildSimilarityMatrix(srcs, dsts, d.renameScore)
+	if err != nil {
+		return err
+	}
+	renames := make([]*Change, 0, min(len(matrix), len(dsts)))
+
+	// Match rename pairs on a first come, first serve basis until
+	// we have looked at everything that is above the minimum score.
+	for i := len(matrix) - 1; i >= 0; i-- {
+		pair := matrix[i]
+		src := srcs[pair.deleted]
+		dst := dsts[pair.added]
+
+		if dst == nil || src == nil {
+			// It was already matched before
+			continue
+		}
+
+		renames = append(renames, &Change{From: src.From, To: dst.To})
+
+		// Claim destination and source as matched
+		dsts[pair.added] = nil
+		srcs[pair.deleted] = nil
+	}
+
+	d.modified = append(d.modified, renames...)
+	d.added = compactChanges(dsts)
+	d.deleted = compactChanges(srcs)
+
+	return nil
+}
+
+func (d *renameDetector) detect() (Changes, error) {
+	if len(d.added) > 0 && len(d.deleted) > 0 {
+		d.detectExactRenames()
+
+		if !d.onlyExact {
+			if err := d.detectContentRenames(); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	result := make(Changes, 0, len(d.added)+len(d.deleted)+len(d.modified))
+	result = append(result, d.added...)
+	result = append(result, d.deleted...)
+	result = append(result, d.modified...)
+
+	sort.Stable(result)
+
+	return result, nil
+}
+
+func bestNameMatch(change *Change, changes []*Change) *Change {
+	var best *Change
+	var bestScore int
+
+	cname := changeName(change)
+
+	for _, c := range changes {
+		score := nameSimilarityScore(cname, changeName(c))
+		if score > bestScore {
+			bestScore = score
+			best = c
+		}
+	}
+
+	return best
+}
+
+func nameSimilarityScore(a, b string) int {
+	aDirLen := strings.LastIndexByte(a, '/') + 1
+	bDirLen := strings.LastIndexByte(b, '/') + 1
+
+	dirMin := min(aDirLen, bDirLen)
+	dirMax := max(aDirLen, bDirLen)
+
+	var dirScoreLtr, dirScoreRtl int
+	if dirMax == 0 {
+		dirScoreLtr = 100
+		dirScoreRtl = 100
+	} else {
+		var dirSim int
+
+		for ; dirSim < dirMin; dirSim++ {
+			if a[dirSim] != b[dirSim] {
+				break
+			}
+		}
+
+		dirScoreLtr = dirSim * 100 / dirMax
+
+		if dirScoreLtr == 100 {
+			dirScoreRtl = 100
+		} else {
+			for dirSim = 0; dirSim < dirMin; dirSim++ {
+				if a[aDirLen-1-dirSim] != b[bDirLen-1-dirSim] {
+					break
+				}
+			}
+			dirScoreRtl = dirSim * 100 / dirMax
+		}
+	}
+
+	fileMin := min(len(a)-aDirLen, len(b)-bDirLen)
+	fileMax := max(len(a)-aDirLen, len(b)-bDirLen)
+
+	fileSim := 0
+	for ; fileSim < fileMin; fileSim++ {
+		if a[len(a)-1-fileSim] != b[len(b)-1-fileSim] {
+			break
+		}
+	}
+	fileScore := fileSim * 100 / fileMax
+
+	return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100
+}
+
+func changeName(c *Change) string {
+	if c.To != empty {
+		return c.To.Name
+	}
+	return c.From.Name
+}
+
+func changeHash(c *Change) plumbing.Hash {
+	if c.To != empty {
+		return c.To.TreeEntry.Hash
+	}
+
+	return c.From.TreeEntry.Hash
+}
+
+func changeMode(c *Change) filemode.FileMode {
+	if c.To != empty {
+		return c.To.TreeEntry.Mode
+	}
+
+	return c.From.TreeEntry.Mode
+}
+
+func sameMode(a, b *Change) bool {
+	return changeMode(a) == changeMode(b)
+}
+
+func groupChangesByHash(changes []*Change) map[plumbing.Hash][]*Change {
+	var result = make(map[plumbing.Hash][]*Change)
+	for _, c := range changes {
+		hash := changeHash(c)
+		result[hash] = append(result[hash], c)
+	}
+	return result
+}
+
+type similarityMatrix []similarityPair
+
+func (m similarityMatrix) Len() int      { return len(m) }
+func (m similarityMatrix) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
+func (m similarityMatrix) Less(i, j int) bool {
+	if m[i].score == m[j].score {
+		if m[i].added == m[j].added {
+			return m[i].deleted < m[j].deleted
+		}
+		return m[i].added < m[j].added
+	}
+	return m[i].score < m[j].score
+}
+
+type similarityPair struct {
+	// index of the added file
+	added int
+	// index of the deleted file
+	deleted int
+	// similarity score
+	score int
+}
+
+func max(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
+
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+func buildSimilarityMatrix(srcs, dsts []*Change, renameScore int) (similarityMatrix, error) {
+	// Allocate for the worst-case scenario where every pair has a score
+	// that we need to consider. We might not need that many.
+	matrix := make(similarityMatrix, 0, len(srcs)*len(dsts))
+	srcSizes := make([]int64, len(srcs))
+	dstSizes := make([]int64, len(dsts))
+	dstTooLarge := make(map[int]bool)
+
+	// Consider each pair of files, if the score is above the minimum
+	// threshold we need to record that scoring in the matrix so we can
+	// later find the best matches.
+outerLoop:
+	for srcIdx, src := range srcs {
+		if changeMode(src) != filemode.Regular {
+			continue
+		}
+
+		// Declare the from file and the similarity index here to be able to
+		// reuse it inside the inner loop. The reason to not initialize them
+		// here is so we can skip the initialization in case they happen to
+		// not be needed later. They will be initialized inside the inner
+		// loop if and only if they're needed and reused in subsequent passes.
+		var from *File
+		var s *similarityIndex
+		var err error
+		for dstIdx, dst := range dsts {
+			if changeMode(dst) != filemode.Regular {
+				continue
+			}
+
+			if dstTooLarge[dstIdx] {
+				continue
+			}
+
+			var to *File
+			srcSize := srcSizes[srcIdx]
+			if srcSize == 0 {
+				from, _, err = src.Files()
+				if err != nil {
+					return nil, err
+				}
+				srcSize = from.Size + 1
+				srcSizes[srcIdx] = srcSize
+			}
+
+			dstSize := dstSizes[dstIdx]
+			if dstSize == 0 {
+				_, to, err = dst.Files()
+				if err != nil {
+					return nil, err
+				}
+				dstSize = to.Size + 1
+				dstSizes[dstIdx] = dstSize
+			}
+
+			min, max := srcSize, dstSize
+			if dstSize < srcSize {
+				min = dstSize
+				max = srcSize
+			}
+
+			if int(min*100/max) < renameScore {
+				// File sizes are too different to be a match
+				continue
+			}
+
+			if s == nil {
+				s, err = fileSimilarityIndex(from)
+				if err != nil {
+					if err == errIndexFull {
+						continue outerLoop
+					}
+					return nil, err
+				}
+			}
+
+			if to == nil {
+				_, to, err = dst.Files()
+				if err != nil {
+					return nil, err
+				}
+			}
+
+			di, err := fileSimilarityIndex(to)
+			if err != nil {
+				if err == errIndexFull {
+					dstTooLarge[dstIdx] = true
+				}
+
+				return nil, err
+			}
+
+			contentScore := s.score(di, 10000)
+			// The name score returns a value between 0 and 100, so we need to
+			// convert it to the same range as the content score.
+			nameScore := nameSimilarityScore(src.From.Name, dst.To.Name) * 100
+			score := (contentScore*99 + nameScore*1) / 10000
+
+			if score < renameScore {
+				continue
+			}
+
+			matrix = append(matrix, similarityPair{added: dstIdx, deleted: srcIdx, score: score})
+		}
+	}
+
+	sort.Stable(matrix)
+
+	return matrix, nil
+}
+
+func compactChanges(changes []*Change) []*Change {
+	var result []*Change
+	for _, c := range changes {
+		if c != nil {
+			result = append(result, c)
+		}
+	}
+	return result
+}
+
+const (
+	keyShift      = 32
+	maxCountValue = (1 << keyShift) - 1
+)
+
+var errIndexFull = errors.New("index is full")
+
+// similarityIndex is an index structure of lines/blocks in one file.
+// This structure can be used to compute an approximation of the similarity
+// between two files.
+// To save space in memory, this index uses a space efficient encoding which
+// will not exceed 1MiB per instance. The index starts out at a smaller size
+// (closer to 2KiB), but may grow as more distinct blocks withing the scanned
+// file are discovered.
+// see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+type similarityIndex struct {
+	hashed uint64
+	// number of non-zero entries in hashes
+	numHashes int
+	growAt    int
+	hashes    []keyCountPair
+	hashBits  int
+}
+
+func fileSimilarityIndex(f *File) (*similarityIndex, error) {
+	idx := newSimilarityIndex()
+	if err := idx.hash(f); err != nil {
+		return nil, err
+	}
+
+	sort.Stable(keyCountPairs(idx.hashes))
+
+	return idx, nil
+}
+
+func newSimilarityIndex() *similarityIndex {
+	return &similarityIndex{
+		hashBits: 8,
+		hashes:   make([]keyCountPair, 1<<8),
+		growAt:   shouldGrowAt(8),
+	}
+}
+
+func (i *similarityIndex) hash(f *File) error {
+	isBin, err := f.IsBinary()
+	if err != nil {
+		return err
+	}
+
+	r, err := f.Reader()
+	if err != nil {
+		return err
+	}
+
+	defer ioutil.CheckClose(r, &err)
+
+	return i.hashContent(r, f.Size, isBin)
+}
+
+func (i *similarityIndex) hashContent(r io.Reader, size int64, isBin bool) error {
+	var buf = make([]byte, 4096)
+	var ptr, cnt int
+	remaining := size
+
+	for 0 < remaining {
+		hash := 5381
+		var blockHashedCnt uint64
+
+		// Hash one line or block, whatever happens first
+		n := int64(0)
+		for {
+			if ptr == cnt {
+				ptr = 0
+				var err error
+				cnt, err = io.ReadFull(r, buf)
+				if err != nil && err != io.ErrUnexpectedEOF {
+					return err
+				}
+
+				if cnt == 0 {
+					return io.EOF
+				}
+			}
+			n++
+			c := buf[ptr] & 0xff
+			ptr++
+
+			// Ignore CR in CRLF sequence if it's text
+			if !isBin && c == '\r' && ptr < cnt && buf[ptr] == '\n' {
+				continue
+			}
+			blockHashedCnt++
+
+			if c == '\n' {
+				break
+			}
+
+			hash = (hash << 5) + hash + int(c)
+
+			if n >= 64 || n >= remaining {
+				break
+			}
+		}
+		i.hashed += blockHashedCnt
+		if err := i.add(hash, blockHashedCnt); err != nil {
+			return err
+		}
+		remaining -= n
+	}
+
+	return nil
+}
+
+// score computes the similarity score between this index and another one.
+// A region of a file is defined as a line in a text file or a fixed-size
+// block in a binary file. To prepare an index, each region in the file is
+// hashed; the values and counts of hashes are retained in a sorted table.
+// Define the similarity fraction F as the count of matching regions between
+// the two files divided between the maximum count of regions in either file.
+// The similarity score is F multiplied by the maxScore constant, yielding a
+// range [0, maxScore]. It is defined as maxScore for the degenerate case of
+// two empty files.
+// The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
+func (i *similarityIndex) score(other *similarityIndex, maxScore int) int {
+	var maxHashed = i.hashed
+	if maxHashed < other.hashed {
+		maxHashed = other.hashed
+	}
+	if maxHashed == 0 {
+		return maxScore
+	}
+
+	return int(i.common(other) * uint64(maxScore) / maxHashed)
+}
+
+func (i *similarityIndex) common(dst *similarityIndex) uint64 {
+	srcIdx, dstIdx := 0, 0
+	if i.numHashes == 0 || dst.numHashes == 0 {
+		return 0
+	}
+
+	var common uint64
+	srcKey, dstKey := i.hashes[srcIdx].key(), dst.hashes[dstIdx].key()
+
+	for {
+		if srcKey == dstKey {
+			srcCnt, dstCnt := i.hashes[srcIdx].count(), dst.hashes[dstIdx].count()
+			if srcCnt < dstCnt {
+				common += srcCnt
+			} else {
+				common += dstCnt
+			}
+
+			srcIdx++
+			if srcIdx == len(i.hashes) {
+				break
+			}
+			srcKey = i.hashes[srcIdx].key()
+
+			dstIdx++
+			if dstIdx == len(dst.hashes) {
+				break
+			}
+			dstKey = dst.hashes[dstIdx].key()
+		} else if srcKey < dstKey {
+			// Region of src that is not in dst
+			srcIdx++
+			if srcIdx == len(i.hashes) {
+				break
+			}
+			srcKey = i.hashes[srcIdx].key()
+		} else {
+			// Region of dst that is not in src
+			dstIdx++
+			if dstIdx == len(dst.hashes) {
+				break
+			}
+			dstKey = dst.hashes[dstIdx].key()
+		}
+	}
+
+	return common
+}
+
+func (i *similarityIndex) add(key int, cnt uint64) error {
+	key = int(uint32(key*0x9e370001) >> 1)
+
+	j := i.slot(key)
+	for {
+		v := i.hashes[j]
+		if v == 0 {
+			// It's an empty slot, so we can store it here.
+			if i.growAt <= i.numHashes {
+				if err := i.grow(); err != nil {
+					return err
+				}
+				j = i.slot(key)
+				continue
+			}
+
+			var err error
+			i.hashes[j], err = newKeyCountPair(key, cnt)
+			if err != nil {
+				return err
+			}
+			i.numHashes++
+			return nil
+		} else if v.key() == key {
+			// It's the same key, so increment the counter.
+			var err error
+			i.hashes[j], err = newKeyCountPair(key, v.count()+cnt)
+			if err != nil {
+				return err
+			}
+			return nil
+		} else if j+1 >= len(i.hashes) {
+			j = 0
+		} else {
+			j++
+		}
+	}
+}
+
+type keyCountPair uint64
+
+func newKeyCountPair(key int, cnt uint64) (keyCountPair, error) {
+	if cnt > maxCountValue {
+		return 0, errIndexFull
+	}
+
+	return keyCountPair((uint64(key) << keyShift) | cnt), nil
+}
+
+func (p keyCountPair) key() int {
+	return int(p >> keyShift)
+}
+
+func (p keyCountPair) count() uint64 {
+	return uint64(p) & maxCountValue
+}
+
+func (i *similarityIndex) slot(key int) int {
+	// We use 31 - hashBits because the upper bit was already forced
+	// to be 0 and we want the remaining high bits to be used as the
+	// table slot.
+	return int(uint32(key) >> (31 - i.hashBits))
+}
+
+func shouldGrowAt(hashBits int) int {
+	return (1 << hashBits) * (hashBits - 3) / hashBits
+}
+
+func (i *similarityIndex) grow() error {
+	if i.hashBits == 30 {
+		return errIndexFull
+	}
+
+	old := i.hashes
+
+	i.hashBits++
+	i.growAt = shouldGrowAt(i.hashBits)
+
+	// TODO(erizocosmico): find a way to check if it will OOM and return
+	// errIndexFull instead.
+	i.hashes = make([]keyCountPair, 1<<i.hashBits)
+
+	for _, v := range old {
+		if v != 0 {
+			j := i.slot(v.key())
+			for i.hashes[j] != 0 {
+				j++
+				if j >= len(i.hashes) {
+					j = 0
+				}
+			}
+			i.hashes[j] = v
+		}
+	}
+
+	return nil
+}
+
+type keyCountPairs []keyCountPair
+
+func (p keyCountPairs) Len() int           { return len(p) }
+func (p keyCountPairs) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
+func (p keyCountPairs) Less(i, j int) bool { return p[i] < p[j] }
diff --git a/plumbing/object/rename_test.go b/plumbing/object/rename_test.go
new file mode 100644
index 0000000..bf5044d
--- /dev/null
+++ b/plumbing/object/rename_test.go
@@ -0,0 +1,490 @@
+package object
+
+import (
+	"path/filepath"
+	"strings"
+
+	"github.com/go-git/go-git/v5/plumbing"
+	"github.com/go-git/go-git/v5/plumbing/filemode"
+	"github.com/go-git/go-git/v5/storage/memory"
+	. "gopkg.in/check.v1"
+)
+
+type RenameSuite struct {
+	BaseObjectsSuite
+}
+
+var _ = Suite(&RenameSuite{})
+
+func (s *RenameSuite) TestNameSimilarityScore(c *C) {
+	testCases := []struct {
+		a, b  string
+		score int
+	}{
+		{"foo/bar.c", "foo/baz.c", 70},
+		{"src/utils/Foo.java", "tests/utils/Foo.java", 64},
+		{"foo/bar/baz.py", "README.md", 0},
+		{"src/utils/something/foo.py", "src/utils/something/other/foo.py", 69},
+		{"src/utils/something/foo.py", "src/utils/yada/foo.py", 63},
+		{"src/utils/something/foo.py", "src/utils/something/other/bar.py", 44},
+		{"src/utils/something/foo.py", "src/utils/something/foo.py", 100},
+	}
+
+	for _, tt := range testCases {
+		c.Assert(nameSimilarityScore(tt.a, tt.b), Equals, tt.score)
+	}
+}
+
+const (
+	pathA = "src/A"
+	pathB = "src/B"
+	pathH = "src/H"
+	pathQ = "src/Q"
+)
+
+func (s *RenameSuite) TestExactRename_OneRename(c *C) {
+	a := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	b := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+
+	result := detectRenames(c, Changes{a, b}, nil, 1)
+	assertRename(c, b, a, result[0])
+}
+
+func (s *RenameSuite) TestExactRename_DifferentObjects(c *C) {
+	a := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	h := makeAdd(c, makeFile(c, pathH, filemode.Regular, "foo"))
+	q := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "bar"))
+
+	result := detectRenames(c, Changes{a, h, q}, nil, 3)
+	c.Assert(result[0], DeepEquals, a)
+	c.Assert(result[1], DeepEquals, h)
+	c.Assert(result[2], DeepEquals, q)
+}
+
+func (s *RenameSuite) TestExactRename_OneRenameOneModify(c *C) {
+	c1 := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	c2 := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+	c3 := makeChange(c,
+		makeFile(c, pathH, filemode.Regular, "bar"),
+		makeFile(c, pathH, filemode.Regular, "bar"),
+	)
+
+	result := detectRenames(c, Changes{c1, c2, c3}, nil, 2)
+	c.Assert(result[0], DeepEquals, c3)
+	assertRename(c, c2, c1, result[1])
+}
+
+func (s *RenameSuite) TestExactRename_ManyRenames(c *C) {
+	c1 := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	c2 := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+	c3 := makeAdd(c, makeFile(c, pathH, filemode.Regular, "bar"))
+	c4 := makeDelete(c, makeFile(c, pathB, filemode.Regular, "bar"))
+
+	result := detectRenames(c, Changes{c1, c2, c3, c4}, nil, 2)
+	assertRename(c, c4, c3, result[0])
+	assertRename(c, c2, c1, result[1])
+}
+
+func (s *RenameSuite) TestExactRename_MultipleIdenticalDeletes(c *C) {
+	changes := Changes{
+		makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, pathB, filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, pathH, filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, pathQ, filemode.Regular, "foo")),
+	}
+
+	result := detectRenames(c, changes, nil, 3)
+	assertRename(c, changes[0], changes[3], result[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+	c.Assert(result[2], DeepEquals, changes[2])
+}
+
+func (s *RenameSuite) TestRenameExact_PathBreaksTie(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, "src/com/foo/a.java", filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, "src/com/foo/b.java", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "c.txt", filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, "d.txt", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "the_e_file.txt", filemode.Regular, "foo")),
+	}
+
+	// Add out of order to avoid first-match succeeding
+	result := detectRenames(c, Changes{
+		changes[0],
+		changes[3],
+		changes[4],
+		changes[1],
+		changes[2],
+	}, nil, 3)
+	assertRename(c, changes[3], changes[2], result[0])
+	assertRename(c, changes[1], changes[0], result[1])
+	c.Assert(result[2], DeepEquals, changes[4])
+}
+
+func (s *RenameSuite) TestExactRename_OneDeleteManyAdds(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, "src/com/foo/a.java", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "src/com/foo/b.java", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "c.txt", filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, "d.txt", filemode.Regular, "foo")),
+	}
+
+	result := detectRenames(c, changes, nil, 3)
+	assertRename(c, changes[3], changes[2], result[0])
+	c.Assert(result[1], DeepEquals, changes[0])
+	c.Assert(result[2], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestExactRename_UnstagedFile(c *C) {
+	changes := Changes{
+		makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, pathB, filemode.Regular, "foo")),
+	}
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[0], changes[1], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_OnePair(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+		makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_OneRenameTwoUnrelatedFiles(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+		makeAdd(c, makeFile(c, pathB, filemode.Regular, "some\nsort\nof\ntext\n")),
+		makeDelete(c, makeFile(c, pathH, filemode.Regular, "completely\nunrelated\ntext\n")),
+	}
+
+	result := detectRenames(c, changes, nil, 3)
+	c.Assert(result[0], DeepEquals, changes[2])
+	c.Assert(result[1], DeepEquals, changes[3])
+	assertRename(c, changes[1], changes[0], result[2])
+}
+
+func (s *RenameSuite) TestContentRename_LastByteDifferent(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\na")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo\nbar\nb")),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_NewlinesOnly(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, strings.Repeat("\n", 3))),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, strings.Repeat("\n", 4))),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_SameContentMultipleTimes(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "a\na\na\na\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "a\na\na\n")),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestContentRename_OnePairRenameScore50(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "ab\nab\nab\nac\nad\nae\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "ac\nab\nab\nab\naa\na0\na1\n")),
+	}
+
+	result := detectRenames(c, changes, &DiffTreeOptions{RenameScore: 50}, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+func (s *RenameSuite) TestNoRenames_SingleByteFiles(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "a")),
+		makeAdd(c, makeFile(c, pathQ, filemode.Regular, "b")),
+	}
+
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestNoRenames_EmptyFile(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "")),
+	}
+	result := detectRenames(c, changes, nil, 1)
+	c.Assert(result[0], DeepEquals, changes[0])
+}
+
+func (s *RenameSuite) TestNoRenames_EmptyFile2(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "blah")),
+	}
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestNoRenames_SymlinkAndFile(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "src/dest")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Symlink, "src/dest")),
+	}
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestNoRenames_SymlinkAndFileSamePath(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "src/dest")),
+		makeDelete(c, makeFile(c, pathA, filemode.Symlink, "src/dest")),
+	}
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+func (s *RenameSuite) TestRenameLimit(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+		makeDelete(c, makeFile(c, pathB, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+		makeAdd(c, makeFile(c, pathH, filemode.Regular, "a\nb\nc\nd\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "a\nb\nc\n")),
+	}
+
+	result := detectRenames(c, changes, &DiffTreeOptions{RenameLimit: 1}, 4)
+	for i, res := range result {
+		c.Assert(res, DeepEquals, changes[i])
+	}
+}
+
+func detectRenames(c *C, changes Changes, opts *DiffTreeOptions, expectedResults int) Changes {
+	result, err := DetectRenames(changes, opts)
+	c.Assert(err, IsNil)
+	c.Assert(result, HasLen, expectedResults)
+	return result
+}
+
+func assertRename(c *C, from, to *Change, rename *Change) {
+	c.Assert(&Change{From: from.From, To: to.To}, DeepEquals, rename)
+}
+
+type SimilarityIndexSuite struct {
+	BaseObjectsSuite
+}
+
+var _ = Suite(&SimilarityIndexSuite{})
+
+func (s *SimilarityIndexSuite) TestScoreFiles(c *C) {
+	tree := s.tree(c, plumbing.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"))
+	binary, err := tree.File("binary.jpg")
+	c.Assert(err, IsNil)
+	binIndex, err := fileSimilarityIndex(binary)
+	c.Assert(err, IsNil)
+
+	long, err := tree.File("json/long.json")
+	c.Assert(err, IsNil)
+	longIndex, err := fileSimilarityIndex(long)
+	c.Assert(err, IsNil)
+
+	short, err := tree.File("json/short.json")
+	c.Assert(err, IsNil)
+	shortIndex, err := fileSimilarityIndex(short)
+	c.Assert(err, IsNil)
+
+	php, err := tree.File("php/crappy.php")
+	c.Assert(err, IsNil)
+	phpIndex, err := fileSimilarityIndex(php)
+	c.Assert(err, IsNil)
+
+	testCases := []struct {
+		src, dst      *similarityIndex
+		expectedScore int
+	}{
+		{binIndex, binIndex, 10000}, // same file
+		{shortIndex, longIndex, 32}, // slightly similar files
+		{longIndex, shortIndex, 32}, // same as previous, diff order
+		{shortIndex, phpIndex, 1},   // different files
+		{longIndex, binIndex, 0},    // code vs binary file
+	}
+
+	for _, tt := range testCases {
+		score := tt.src.score(tt.dst, 10000)
+		c.Assert(score, Equals, tt.expectedScore)
+	}
+}
+
+func (s *SimilarityIndexSuite) TestHashContent(c *C) {
+	idx := textIndex(c, "A\n"+
+		"B\n"+
+		"D\n"+
+		"B\n")
+
+	keyA := keyFor(c, "A\n")
+	keyB := keyFor(c, "B\n")
+	keyD := keyFor(c, "D\n")
+
+	c.Assert(keyA, Not(Equals), keyB)
+	c.Assert(keyA, Not(Equals), keyD)
+	c.Assert(keyD, Not(Equals), keyB)
+
+	c.Assert(idx.numHashes, Equals, 3)
+	c.Assert(idx.hashes[findIndex(idx, keyA)].count(), Equals, uint64(2))
+	c.Assert(idx.hashes[findIndex(idx, keyB)].count(), Equals, uint64(4))
+	c.Assert(idx.hashes[findIndex(idx, keyD)].count(), Equals, uint64(2))
+}
+
+func (s *SimilarityIndexSuite) TestCommonSameFiles(c *C) {
+	content := "A\n" +
+		"B\n" +
+		"D\n" +
+		"B\n"
+
+	src := textIndex(c, content)
+	dst := textIndex(c, content)
+
+	c.Assert(src.common(dst), Equals, uint64(8))
+	c.Assert(dst.common(src), Equals, uint64(8))
+
+	c.Assert(src.score(dst, 100), Equals, 100)
+	c.Assert(dst.score(src, 100), Equals, 100)
+}
+
+func (s *SimilarityIndexSuite) TestCommonSameFilesCR(c *C) {
+	content := "A\r\n" +
+		"B\r\n" +
+		"D\r\n" +
+		"B\r\n"
+
+	src := textIndex(c, content)
+	dst := textIndex(c, strings.ReplaceAll(content, "\r", ""))
+
+	c.Assert(src.common(dst), Equals, uint64(8))
+	c.Assert(dst.common(src), Equals, uint64(8))
+
+	c.Assert(src.score(dst, 100), Equals, 100)
+	c.Assert(dst.score(src, 100), Equals, 100)
+}
+
+func (s *SimilarityIndexSuite) TestCommonEmptyFiles(c *C) {
+	src := textIndex(c, "")
+	dst := textIndex(c, "")
+
+	c.Assert(src.common(dst), Equals, uint64(0))
+	c.Assert(dst.common(src), Equals, uint64(0))
+}
+
+func (s *SimilarityIndexSuite) TestCommonTotallyDifferentFiles(c *C) {
+	src := textIndex(c, "A\n")
+	dst := textIndex(c, "D\n")
+
+	c.Assert(src.common(dst), Equals, uint64(0))
+	c.Assert(dst.common(src), Equals, uint64(0))
+}
+
+func (s *SimilarityIndexSuite) TestSimilarity75(c *C) {
+	src := textIndex(c, "A\nB\nC\nD\n")
+	dst := textIndex(c, "A\nB\nC\nQ\n")
+
+	c.Assert(src.common(dst), Equals, uint64(6))
+	c.Assert(dst.common(src), Equals, uint64(6))
+
+	c.Assert(src.score(dst, 100), Equals, 75)
+	c.Assert(dst.score(src, 100), Equals, 75)
+}
+
+func keyFor(c *C, line string) int {
+	idx := newSimilarityIndex()
+	err := idx.hashContent(strings.NewReader(line), int64(len(line)), false)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.numHashes, Equals, 1)
+	for _, h := range idx.hashes {
+		if h != 0 {
+			return h.key()
+		}
+	}
+
+	return -1
+}
+
+func textIndex(c *C, content string) *similarityIndex {
+	idx := newSimilarityIndex()
+	err := idx.hashContent(strings.NewReader(content), int64(len(content)), false)
+	c.Assert(err, IsNil)
+	return idx
+}
+
+func findIndex(idx *similarityIndex, key int) int {
+	for i, h := range idx.hashes {
+		if h.key() == key {
+			return i
+		}
+	}
+	return -1
+}
+
+func makeFile(c *C, name string, mode filemode.FileMode, content string) *File {
+	obj := new(plumbing.MemoryObject)
+	obj.SetType(plumbing.BlobObject)
+	_, err := obj.Write([]byte(content))
+	c.Assert(err, IsNil)
+	return &File{
+		Name: name,
+		Mode: mode,
+		Blob: Blob{Hash: obj.Hash(), Size: obj.Size(), obj: obj},
+	}
+}
+
+func makeChangeEntry(f *File) ChangeEntry {
+	sto := memory.NewStorage()
+	sto.SetEncodedObject(f.obj)
+	tree := &Tree{s: sto}
+
+	return ChangeEntry{
+		Name: f.Name,
+		Tree: tree,
+		TreeEntry: TreeEntry{
+			Name: filepath.Base(f.Name),
+			Mode: f.Mode,
+			Hash: f.Hash,
+		},
+	}
+}
+
+func makeAdd(c *C, f *File) *Change {
+	return makeChange(c, nil, f)
+}
+
+func makeDelete(c *C, f *File) *Change {
+	return makeChange(c, f, nil)
+}
+
+func makeChange(c *C, from *File, to *File) *Change {
+	if from == nil {
+		return &Change{To: makeChangeEntry(to)}
+	}
+
+	if to == nil {
+		return &Change{From: makeChangeEntry(from)}
+	}
+
+	if from == nil && to == nil {
+		c.Error("cannot make change without from or to")
+	}
+
+	return &Change{From: makeChangeEntry(from), To: makeChangeEntry(to)}
+}
author	Máximo Cuadros <mcuadros@gmail.com>	2020-04-23 18:36:50 +0200
committer	GitHub <noreply@github.com>	2020-04-23 18:36:50 +0200
commit	218a744b6995a89f5c322aa58e79138d65392ea6 (patch)
tree	385575e9f68787661fcd04840f6061b40658ab40 /plumbing/object
parent	53f87846a196c856e00fe825bc5f29551b2ea524 (diff)
parent	90696f07f3a64243c6f33963f5267ab98622db9f (diff)
download	go-git-218a744b6995a89f5c322aa58e79138d65392ea6.tar.gz