aboutsummaryrefslogtreecommitdiffstats
path: root/plumbing/object
diff options
context:
space:
mode:
authorMáximo Cuadros <mcuadros@gmail.com>2020-04-23 18:36:50 +0200
committerGitHub <noreply@github.com>2020-04-23 18:36:50 +0200
commit218a744b6995a89f5c322aa58e79138d65392ea6 (patch)
tree385575e9f68787661fcd04840f6061b40658ab40 /plumbing/object
parent53f87846a196c856e00fe825bc5f29551b2ea524 (diff)
parent90696f07f3a64243c6f33963f5267ab98622db9f (diff)
downloadgo-git-218a744b6995a89f5c322aa58e79138d65392ea6.tar.gz
Merge pull request #38 from go-git/feature/difftree-renames
plumbing: detect renames by hash and similar content in diff tree
Diffstat (limited to 'plumbing/object')
-rw-r--r--plumbing/object/change.go4
-rw-r--r--plumbing/object/difftree.go67
-rw-r--r--plumbing/object/rename.go817
-rw-r--r--plumbing/object/rename_test.go490
4 files changed, 1374 insertions, 4 deletions
diff --git a/plumbing/object/change.go b/plumbing/object/change.go
index 4474905..c9d1615 100644
--- a/plumbing/object/change.go
+++ b/plumbing/object/change.go
@@ -18,7 +18,7 @@ type Change struct {
To ChangeEntry
}
-var empty = ChangeEntry{}
+var empty ChangeEntry
// Action returns the kind of action represented by the change, an
// insertion, a deletion or a modification.
@@ -27,9 +27,11 @@ func (c *Change) Action() (merkletrie.Action, error) {
return merkletrie.Action(0),
fmt.Errorf("malformed change: empty from and to")
}
+
if c.From == empty {
return merkletrie.Insert, nil
}
+
if c.To == empty {
return merkletrie.Delete, nil
}
diff --git a/plumbing/object/difftree.go b/plumbing/object/difftree.go
index 72411a5..7c22227 100644
--- a/plumbing/object/difftree.go
+++ b/plumbing/object/difftree.go
@@ -10,14 +10,62 @@ import (
// DiffTree compares the content and mode of the blobs found via two
// tree objects.
+// DiffTree does not perform rename detection, use DiffTreeWithOptions
+// instead to detect renames.
func DiffTree(a, b *Tree) (Changes, error) {
return DiffTreeContext(context.Background(), a, b)
}
-// DiffTree compares the content and mode of the blobs found via two
+// DiffTreeContext compares the content and mode of the blobs found via two
// tree objects. Provided context must be non-nil.
-// An error will be return if context expires
+// An error will be returned if context expires.
func DiffTreeContext(ctx context.Context, a, b *Tree) (Changes, error) {
+ return DiffTreeWithOptions(ctx, a, b, nil)
+}
+
+// DiffTreeOptions are the configurable options when performing a diff tree.
+type DiffTreeOptions struct {
+ // DetectRenames is whether the diff tree will use rename detection.
+ DetectRenames bool
+ // RenameScore is the threshold to of similarity between files to consider
+ // that a pair of delete and insert are a rename. The number must be
+ // exactly between 0 and 100.
+ RenameScore uint
+ // RenameLimit is the maximum amount of files that can be compared when
+ // detecting renames. The number of comparisons that have to be performed
+ // is equal to the number of deleted files * the number of added files.
+ // That means, that if 100 files were deleted and 50 files were added, 5000
+ // file comparisons may be needed. So, if the rename limit is 50, the number
+ // of both deleted and added needs to be equal or less than 50.
+ // A value of 0 means no limit.
+ RenameLimit uint
+ // OnlyExactRenames performs only detection of exact renames and will not perform
+ // any detection of renames based on file similarity.
+ OnlyExactRenames bool
+}
+
+// DefaultDiffTreeOptions are the default and recommended options for the
+// diff tree.
+var DefaultDiffTreeOptions = &DiffTreeOptions{
+ DetectRenames: true,
+ RenameScore: 60,
+ RenameLimit: 0,
+ OnlyExactRenames: false,
+}
+
+// DiffTreeWithOptions compares the content and mode of the blobs found
+// via two tree objects with the given options. The provided context
+// must be non-nil.
+// If no options are passed, no rename detection will be performed. The
+// recommended options are DefaultDiffTreeOptions.
+// An error will be returned if the context expires.
+// This function will be deprecated and removed in v6 so the default
+// behaviour of DiffTree is to detect renames.
+func DiffTreeWithOptions(
+ ctx context.Context,
+ a, b *Tree,
+ opts *DiffTreeOptions,
+) (Changes, error) {
from := NewTreeRootNode(a)
to := NewTreeRootNode(b)
@@ -33,5 +81,18 @@ func DiffTreeContext(ctx context.Context, a, b *Tree) (Changes, error) {
return nil, err
}
- return newChanges(merkletrieChanges)
+ changes, err := newChanges(merkletrieChanges)
+ if err != nil {
+ return nil, err
+ }
+
+ if opts == nil {
+ opts = new(DiffTreeOptions)
+ }
+
+ if opts.DetectRenames {
+ return DetectRenames(changes, opts)
+ }
+
+ return changes, nil
}
diff --git a/plumbing/object/rename.go b/plumbing/object/rename.go
new file mode 100644
index 0000000..48665eb
--- /dev/null
+++ b/plumbing/object/rename.go
@@ -0,0 +1,817 @@
+package object
+
+import (
+ "errors"
+ "io"
+ "sort"
+ "strings"
+
+ "github.com/go-git/go-git/v5/plumbing"
+ "github.com/go-git/go-git/v5/plumbing/filemode"
+ "github.com/go-git/go-git/v5/utils/ioutil"
+ "github.com/go-git/go-git/v5/utils/merkletrie"
+)
+
+// DetectRenames detects the renames in the given changes on two trees with
+// the given options. It will return the given changes grouping additions and
+// deletions into modifications when possible.
+// If options is nil, the default diff tree options will be used.
+func DetectRenames(
+ changes Changes,
+ opts *DiffTreeOptions,
+) (Changes, error) {
+ if opts == nil {
+ opts = DefaultDiffTreeOptions
+ }
+
+ detector := &renameDetector{
+ renameScore: int(opts.RenameScore),
+ renameLimit: int(opts.RenameLimit),
+ onlyExact: opts.OnlyExactRenames,
+ }
+
+ for _, c := range changes {
+ action, err := c.Action()
+ if err != nil {
+ return nil, err
+ }
+
+ switch action {
+ case merkletrie.Insert:
+ detector.added = append(detector.added, c)
+ case merkletrie.Delete:
+ detector.deleted = append(detector.deleted, c)
+ default:
+ detector.modified = append(detector.modified, c)
+ }
+ }
+
+ return detector.detect()
+}
+
+// renameDetector will detect and resolve renames in a set of changes.
+// see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+type renameDetector struct {
+ added []*Change
+ deleted []*Change
+ modified []*Change
+
+ renameScore int
+ renameLimit int
+ onlyExact bool
+}
+
+// detectExactRenames detects matches files that were deleted with files that
+// were added where the hash is the same on both. If there are multiple targets
+// the one with the most similar path will be chosen as the rename and the
+// rest as either deletions or additions.
+func (d *renameDetector) detectExactRenames() {
+ added := groupChangesByHash(d.added)
+ deletes := groupChangesByHash(d.deleted)
+ var uniqueAdds []*Change
+ var nonUniqueAdds [][]*Change
+ var addedLeft []*Change
+
+ for _, cs := range added {
+ if len(cs) == 1 {
+ uniqueAdds = append(uniqueAdds, cs[0])
+ } else {
+ nonUniqueAdds = append(nonUniqueAdds, cs)
+ }
+ }
+
+ for _, c := range uniqueAdds {
+ hash := changeHash(c)
+ deleted := deletes[hash]
+
+ if len(deleted) == 1 {
+ if sameMode(c, deleted[0]) {
+ d.modified = append(d.modified, &Change{From: deleted[0].From, To: c.To})
+ delete(deletes, hash)
+ } else {
+ addedLeft = append(addedLeft, c)
+ }
+ } else if len(deleted) > 1 {
+ bestMatch := bestNameMatch(c, deleted)
+ if bestMatch != nil && sameMode(c, bestMatch) {
+ d.modified = append(d.modified, &Change{From: bestMatch.From, To: c.To})
+ delete(deletes, hash)
+
+ var newDeletes = make([]*Change, 0, len(deleted)-1)
+ for _, d := range deleted {
+ if d != bestMatch {
+ newDeletes = append(newDeletes, d)
+ }
+ }
+ deletes[hash] = newDeletes
+ }
+ } else {
+ addedLeft = append(addedLeft, c)
+ }
+ }
+
+ for _, added := range nonUniqueAdds {
+ hash := changeHash(added[0])
+ deleted := deletes[hash]
+
+ if len(deleted) == 1 {
+ deleted := deleted[0]
+ bestMatch := bestNameMatch(deleted, added)
+ if bestMatch != nil && sameMode(deleted, bestMatch) {
+ d.modified = append(d.modified, &Change{From: deleted.From, To: bestMatch.To})
+ delete(deletes, hash)
+
+ for _, c := range added {
+ if c != bestMatch {
+ addedLeft = append(addedLeft, c)
+ }
+ }
+ } else {
+ addedLeft = append(addedLeft, added...)
+ }
+ } else if len(deleted) > 1 {
+ maxSize := len(deleted) * len(added)
+ if d.renameLimit > 0 && d.renameLimit < maxSize {
+ maxSize = d.renameLimit
+ }
+
+ matrix := make(similarityMatrix, 0, maxSize)
+
+ for delIdx, del := range deleted {
+ deletedName := changeName(del)
+
+ for addIdx, add := range added {
+ addedName := changeName(add)
+
+ score := nameSimilarityScore(addedName, deletedName)
+ matrix = append(matrix, similarityPair{added: addIdx, deleted: delIdx, score: score})
+
+ if len(matrix) >= maxSize {
+ break
+ }
+ }
+
+ if len(matrix) >= maxSize {
+ break
+ }
+ }
+
+ sort.Stable(matrix)
+
+ usedAdds := make(map[*Change]struct{})
+ usedDeletes := make(map[*Change]struct{})
+ for i := len(matrix) - 1; i >= 0; i-- {
+ del := deleted[matrix[i].deleted]
+ add := added[matrix[i].added]
+
+ if add == nil || del == nil {
+ // it was already matched
+ continue
+ }
+
+ usedAdds[add] = struct{}{}
+ usedDeletes[del] = struct{}{}
+ d.modified = append(d.modified, &Change{From: del.From, To: add.To})
+ added[matrix[i].added] = nil
+ deleted[matrix[i].deleted] = nil
+ }
+
+ for _, c := range added {
+ if _, ok := usedAdds[c]; !ok {
+ addedLeft = append(addedLeft, c)
+ }
+ }
+
+ var newDeletes = make([]*Change, 0, len(deleted)-len(usedDeletes))
+ for _, c := range deleted {
+ if _, ok := usedDeletes[c]; !ok {
+ newDeletes = append(newDeletes, c)
+ }
+ }
+ deletes[hash] = newDeletes
+ } else {
+ addedLeft = append(addedLeft, added...)
+ }
+ }
+
+ d.added = addedLeft
+ d.deleted = nil
+ for _, dels := range deletes {
+ for _, del := range dels {
+ if del != nil {
+ d.deleted = append(d.deleted, del)
+ }
+ }
+ }
+}
+
+// detectContentRenames detects renames based on the similarity of the content
+// in the files by building a matrix of pairs between sources and destinations
+// and matching by the highest score.
+// see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+func (d *renameDetector) detectContentRenames() error {
+ cnt := max(len(d.added), len(d.deleted))
+ if d.renameLimit > 0 && cnt > d.renameLimit {
+ return nil
+ }
+
+ srcs, dsts := d.deleted, d.added
+ matrix, err := buildSimilarityMatrix(srcs, dsts, d.renameScore)
+ if err != nil {
+ return err
+ }
+ renames := make([]*Change, 0, min(len(matrix), len(dsts)))
+
+ // Match rename pairs on a first come, first serve basis until
+ // we have looked at everything that is above the minimum score.
+ for i := len(matrix) - 1; i >= 0; i-- {
+ pair := matrix[i]
+ src := srcs[pair.deleted]
+ dst := dsts[pair.added]
+
+ if dst == nil || src == nil {
+ // It was already matched before
+ continue
+ }
+
+ renames = append(renames, &Change{From: src.From, To: dst.To})
+
+ // Claim destination and source as matched
+ dsts[pair.added] = nil
+ srcs[pair.deleted] = nil
+ }
+
+ d.modified = append(d.modified, renames...)
+ d.added = compactChanges(dsts)
+ d.deleted = compactChanges(srcs)
+
+ return nil
+}
+
+func (d *renameDetector) detect() (Changes, error) {
+ if len(d.added) > 0 && len(d.deleted) > 0 {
+ d.detectExactRenames()
+
+ if !d.onlyExact {
+ if err := d.detectContentRenames(); err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ result := make(Changes, 0, len(d.added)+len(d.deleted)+len(d.modified))
+ result = append(result, d.added...)
+ result = append(result, d.deleted...)
+ result = append(result, d.modified...)
+
+ sort.Stable(result)
+
+ return result, nil
+}
+
+func bestNameMatch(change *Change, changes []*Change) *Change {
+ var best *Change
+ var bestScore int
+
+ cname := changeName(change)
+
+ for _, c := range changes {
+ score := nameSimilarityScore(cname, changeName(c))
+ if score > bestScore {
+ bestScore = score
+ best = c
+ }
+ }
+
+ return best
+}
+
+func nameSimilarityScore(a, b string) int {
+ aDirLen := strings.LastIndexByte(a, '/') + 1
+ bDirLen := strings.LastIndexByte(b, '/') + 1
+
+ dirMin := min(aDirLen, bDirLen)
+ dirMax := max(aDirLen, bDirLen)
+
+ var dirScoreLtr, dirScoreRtl int
+ if dirMax == 0 {
+ dirScoreLtr = 100
+ dirScoreRtl = 100
+ } else {
+ var dirSim int
+
+ for ; dirSim < dirMin; dirSim++ {
+ if a[dirSim] != b[dirSim] {
+ break
+ }
+ }
+
+ dirScoreLtr = dirSim * 100 / dirMax
+
+ if dirScoreLtr == 100 {
+ dirScoreRtl = 100
+ } else {
+ for dirSim = 0; dirSim < dirMin; dirSim++ {
+ if a[aDirLen-1-dirSim] != b[bDirLen-1-dirSim] {
+ break
+ }
+ }
+ dirScoreRtl = dirSim * 100 / dirMax
+ }
+ }
+
+ fileMin := min(len(a)-aDirLen, len(b)-bDirLen)
+ fileMax := max(len(a)-aDirLen, len(b)-bDirLen)
+
+ fileSim := 0
+ for ; fileSim < fileMin; fileSim++ {
+ if a[len(a)-1-fileSim] != b[len(b)-1-fileSim] {
+ break
+ }
+ }
+ fileScore := fileSim * 100 / fileMax
+
+ return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100
+}
+
+func changeName(c *Change) string {
+ if c.To != empty {
+ return c.To.Name
+ }
+ return c.From.Name
+}
+
+func changeHash(c *Change) plumbing.Hash {
+ if c.To != empty {
+ return c.To.TreeEntry.Hash
+ }
+
+ return c.From.TreeEntry.Hash
+}
+
+func changeMode(c *Change) filemode.FileMode {
+ if c.To != empty {
+ return c.To.TreeEntry.Mode
+ }
+
+ return c.From.TreeEntry.Mode
+}
+
+func sameMode(a, b *Change) bool {
+ return changeMode(a) == changeMode(b)
+}
+
+func groupChangesByHash(changes []*Change) map[plumbing.Hash][]*Change {
+ var result = make(map[plumbing.Hash][]*Change)
+ for _, c := range changes {
+ hash := changeHash(c)
+ result[hash] = append(result[hash], c)
+ }
+ return result
+}
+
+type similarityMatrix []similarityPair
+
+func (m similarityMatrix) Len() int { return len(m) }
+func (m similarityMatrix) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
+func (m similarityMatrix) Less(i, j int) bool {
+ if m[i].score == m[j].score {
+ if m[i].added == m[j].added {
+ return m[i].deleted < m[j].deleted
+ }
+ return m[i].added < m[j].added
+ }
+ return m[i].score < m[j].score
+}
+
+type similarityPair struct {
+ // index of the added file
+ added int
+ // index of the deleted file
+ deleted int
+ // similarity score
+ score int
+}
+
+func max(a, b int) int {
+ if a > b {
+ return a
+ }
+ return b
+}
+
+func min(a, b int) int {
+ if a < b {
+ return a
+ }
+ return b
+}
+
+func buildSimilarityMatrix(srcs, dsts []*Change, renameScore int) (similarityMatrix, error) {
+ // Allocate for the worst-case scenario where every pair has a score
+ // that we need to consider. We might not need that many.
+ matrix := make(similarityMatrix, 0, len(srcs)*len(dsts))
+ srcSizes := make([]int64, len(srcs))
+ dstSizes := make([]int64, len(dsts))
+ dstTooLarge := make(map[int]bool)
+
+ // Consider each pair of files, if the score is above the minimum
+ // threshold we need to record that scoring in the matrix so we can
+ // later find the best matches.
+outerLoop:
+ for srcIdx, src := range srcs {
+ if changeMode(src) != filemode.Regular {
+ continue
+ }
+
+ // Declare the from file and the similarity index here to be able to
+ // reuse it inside the inner loop. The reason to not initialize them
+ // here is so we can skip the initialization in case they happen to
+ // not be needed later. They will be initialized inside the inner
+ // loop if and only if they're needed and reused in subsequent passes.
+ var from *File
+ var s *similarityIndex
+ var err error
+ for dstIdx, dst := range dsts {
+ if changeMode(dst) != filemode.Regular {
+ continue
+ }
+
+ if dstTooLarge[dstIdx] {
+ continue
+ }
+
+ var to *File
+ srcSize := srcSizes[srcIdx]
+ if srcSize == 0 {
+ from, _, err = src.Files()
+ if err != nil {
+ return nil, err
+ }
+ srcSize = from.Size + 1
+ srcSizes[srcIdx] = srcSize
+ }
+
+ dstSize := dstSizes[dstIdx]
+ if dstSize == 0 {
+ _, to, err = dst.Files()
+ if err != nil {
+ return nil, err
+ }
+ dstSize = to.Size + 1
+ dstSizes[dstIdx] = dstSize
+ }
+
+ min, max := srcSize, dstSize
+ if dstSize < srcSize {
+ min = dstSize
+ max = srcSize
+ }
+
+ if int(min*100/max) < renameScore {
+ // File sizes are too different to be a match
+ continue
+ }
+
+ if s == nil {
+ s, err = fileSimilarityIndex(from)
+ if err != nil {
+ if err == errIndexFull {
+ continue outerLoop
+ }
+ return nil, err
+ }
+ }
+
+ if to == nil {
+ _, to, err = dst.Files()
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ di, err := fileSimilarityIndex(to)
+ if err != nil {
+ if err == errIndexFull {
+ dstTooLarge[dstIdx] = true
+ }
+
+ return nil, err
+ }
+
+ contentScore := s.score(di, 10000)
+ // The name score returns a value between 0 and 100, so we need to
+ // convert it to the same range as the content score.
+ nameScore := nameSimilarityScore(src.From.Name, dst.To.Name) * 100
+ score := (contentScore*99 + nameScore*1) / 10000
+
+ if score < renameScore {
+ continue
+ }
+
+ matrix = append(matrix, similarityPair{added: dstIdx, deleted: srcIdx, score: score})
+ }
+ }
+
+ sort.Stable(matrix)
+
+ return matrix, nil
+}
+
+func compactChanges(changes []*Change) []*Change {
+ var result []*Change
+ for _, c := range changes {
+ if c != nil {
+ result = append(result, c)
+ }
+ }
+ return result
+}
+
+const (
+ keyShift = 32
+ maxCountValue = (1 << keyShift) - 1
+)
+
+var errIndexFull = errors.New("index is full")
+
+// similarityIndex is an index structure of lines/blocks in one file.
+// This structure can be used to compute an approximation of the similarity
+// between two files.
+// To save space in memory, this index uses a space efficient encoding which
+// will not exceed 1MiB per instance. The index starts out at a smaller size
+// (closer to 2KiB), but may grow as more distinct blocks withing the scanned
+// file are discovered.
+// see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+type similarityIndex struct {
+ hashed uint64
+ // number of non-zero entries in hashes
+ numHashes int
+ growAt int
+ hashes []keyCountPair
+ hashBits int
+}
+
+func fileSimilarityIndex(f *File) (*similarityIndex, error) {
+ idx := newSimilarityIndex()
+ if err := idx.hash(f); err != nil {
+ return nil, err
+ }
+
+ sort.Stable(keyCountPairs(idx.hashes))
+
+ return idx, nil
+}
+
+func newSimilarityIndex() *similarityIndex {
+ return &similarityIndex{
+ hashBits: 8,
+ hashes: make([]keyCountPair, 1<<8),
+ growAt: shouldGrowAt(8),
+ }
+}
+
+func (i *similarityIndex) hash(f *File) error {
+ isBin, err := f.IsBinary()
+ if err != nil {
+ return err
+ }
+
+ r, err := f.Reader()
+ if err != nil {
+ return err
+ }
+
+ defer ioutil.CheckClose(r, &err)
+
+ return i.hashContent(r, f.Size, isBin)
+}
+
+func (i *similarityIndex) hashContent(r io.Reader, size int64, isBin bool) error {
+ var buf = make([]byte, 4096)
+ var ptr, cnt int
+ remaining := size
+
+ for 0 < remaining {
+ hash := 5381
+ var blockHashedCnt uint64
+
+ // Hash one line or block, whatever happens first
+ n := int64(0)
+ for {
+ if ptr == cnt {
+ ptr = 0
+ var err error
+ cnt, err = io.ReadFull(r, buf)
+ if err != nil && err != io.ErrUnexpectedEOF {
+ return err
+ }
+
+ if cnt == 0 {
+ return io.EOF
+ }
+ }
+ n++
+ c := buf[ptr] & 0xff
+ ptr++
+
+ // Ignore CR in CRLF sequence if it's text
+ if !isBin && c == '\r' && ptr < cnt && buf[ptr] == '\n' {
+ continue
+ }
+ blockHashedCnt++
+
+ if c == '\n' {
+ break
+ }
+
+ hash = (hash << 5) + hash + int(c)
+
+ if n >= 64 || n >= remaining {
+ break
+ }
+ }
+ i.hashed += blockHashedCnt
+ if err := i.add(hash, blockHashedCnt); err != nil {
+ return err
+ }
+ remaining -= n
+ }
+
+ return nil
+}
+
+// score computes the similarity score between this index and another one.
+// A region of a file is defined as a line in a text file or a fixed-size
+// block in a binary file. To prepare an index, each region in the file is
+// hashed; the values and counts of hashes are retained in a sorted table.
+// Define the similarity fraction F as the count of matching regions between
+// the two files divided between the maximum count of regions in either file.
+// The similarity score is F multiplied by the maxScore constant, yielding a
+// range [0, maxScore]. It is defined as maxScore for the degenerate case of
+// two empty files.
+// The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
+func (i *similarityIndex) score(other *similarityIndex, maxScore int) int {
+ var maxHashed = i.hashed
+ if maxHashed < other.hashed {
+ maxHashed = other.hashed
+ }
+ if maxHashed == 0 {
+ return maxScore
+ }
+
+ return int(i.common(other) * uint64(maxScore) / maxHashed)
+}
+
+func (i *similarityIndex) common(dst *similarityIndex) uint64 {
+ srcIdx, dstIdx := 0, 0
+ if i.numHashes == 0 || dst.numHashes == 0 {
+ return 0
+ }
+
+ var common uint64
+ srcKey, dstKey := i.hashes[srcIdx].key(), dst.hashes[dstIdx].key()
+
+ for {
+ if srcKey == dstKey {
+ srcCnt, dstCnt := i.hashes[srcIdx].count(), dst.hashes[dstIdx].count()
+ if srcCnt < dstCnt {
+ common += srcCnt
+ } else {
+ common += dstCnt
+ }
+
+ srcIdx++
+ if srcIdx == len(i.hashes) {
+ break
+ }
+ srcKey = i.hashes[srcIdx].key()
+
+ dstIdx++
+ if dstIdx == len(dst.hashes) {
+ break
+ }
+ dstKey = dst.hashes[dstIdx].key()
+ } else if srcKey < dstKey {
+ // Region of src that is not in dst
+ srcIdx++
+ if srcIdx == len(i.hashes) {
+ break
+ }
+ srcKey = i.hashes[srcIdx].key()
+ } else {
+ // Region of dst that is not in src
+ dstIdx++
+ if dstIdx == len(dst.hashes) {
+ break
+ }
+ dstKey = dst.hashes[dstIdx].key()
+ }
+ }
+
+ return common
+}
+
+func (i *similarityIndex) add(key int, cnt uint64) error {
+ key = int(uint32(key*0x9e370001) >> 1)
+
+ j := i.slot(key)
+ for {
+ v := i.hashes[j]
+ if v == 0 {
+ // It's an empty slot, so we can store it here.
+ if i.growAt <= i.numHashes {
+ if err := i.grow(); err != nil {
+ return err
+ }
+ j = i.slot(key)
+ continue
+ }
+
+ var err error
+ i.hashes[j], err = newKeyCountPair(key, cnt)
+ if err != nil {
+ return err
+ }
+ i.numHashes++
+ return nil
+ } else if v.key() == key {
+ // It's the same key, so increment the counter.
+ var err error
+ i.hashes[j], err = newKeyCountPair(key, v.count()+cnt)
+ if err != nil {
+ return err
+ }
+ return nil
+ } else if j+1 >= len(i.hashes) {
+ j = 0
+ } else {
+ j++
+ }
+ }
+}
+
+type keyCountPair uint64
+
+func newKeyCountPair(key int, cnt uint64) (keyCountPair, error) {
+ if cnt > maxCountValue {
+ return 0, errIndexFull
+ }
+
+ return keyCountPair((uint64(key) << keyShift) | cnt), nil
+}
+
+func (p keyCountPair) key() int {
+ return int(p >> keyShift)
+}
+
+func (p keyCountPair) count() uint64 {
+ return uint64(p) & maxCountValue
+}
+
+func (i *similarityIndex) slot(key int) int {
+ // We use 31 - hashBits because the upper bit was already forced
+ // to be 0 and we want the remaining high bits to be used as the
+ // table slot.
+ return int(uint32(key) >> (31 - i.hashBits))
+}
+
+func shouldGrowAt(hashBits int) int {
+ return (1 << hashBits) * (hashBits - 3) / hashBits
+}
+
+func (i *similarityIndex) grow() error {
+ if i.hashBits == 30 {
+ return errIndexFull
+ }
+
+ old := i.hashes
+
+ i.hashBits++
+ i.growAt = shouldGrowAt(i.hashBits)
+
+ // TODO(erizocosmico): find a way to check if it will OOM and return
+ // errIndexFull instead.
+ i.hashes = make([]keyCountPair, 1<<i.hashBits)
+
+ for _, v := range old {
+ if v != 0 {
+ j := i.slot(v.key())
+ for i.hashes[j] != 0 {
+ j++
+ if j >= len(i.hashes) {
+ j = 0
+ }
+ }
+ i.hashes[j] = v
+ }
+ }
+
+ return nil
+}
+
+type keyCountPairs []keyCountPair
+
+func (p keyCountPairs) Len() int { return len(p) }
+func (p keyCountPairs) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
+func (p keyCountPairs) Less(i, j int) bool { return p[i] < p[j] }
diff --git a/plumbing/object/rename_test.go b/plumbing/object/rename_test.go
new file mode 100644
index 0000000..bf5044d
--- /dev/null
+++ b/plumbing/object/rename_test.go
@@ -0,0 +1,490 @@
+package object
+
+import (
+ "path/filepath"
+ "strings"
+
+ "github.com/go-git/go-git/v5/plumbing"
+ "github.com/go-git/go-git/v5/plumbing/filemode"
+ "github.com/go-git/go-git/v5/storage/memory"
+ . "gopkg.in/check.v1"
+)
+
+// RenameSuite exercises rename detection (DetectRenames) through gocheck.
+type RenameSuite struct {
+	BaseObjectsSuite
+}
+
+var _ = Suite(&RenameSuite{})
+
+// TestNameSimilarityScore pins the path-similarity heuristic used to
+// break ties between rename candidates (100 = identical paths, 0 = no
+// meaningful overlap).
+func (s *RenameSuite) TestNameSimilarityScore(c *C) {
+	testCases := []struct {
+		a, b string
+		score int
+	}{
+		{"foo/bar.c", "foo/baz.c", 70},
+		{"src/utils/Foo.java", "tests/utils/Foo.java", 64},
+		{"foo/bar/baz.py", "README.md", 0},
+		{"src/utils/something/foo.py", "src/utils/something/other/foo.py", 69},
+		{"src/utils/something/foo.py", "src/utils/yada/foo.py", 63},
+		{"src/utils/something/foo.py", "src/utils/something/other/bar.py", 44},
+		{"src/utils/something/foo.py", "src/utils/something/foo.py", 100},
+	}
+
+	for _, tt := range testCases {
+		c.Assert(nameSimilarityScore(tt.a, tt.b), Equals, tt.score)
+	}
+}
+
+// Short fixture paths reused across the rename tests below.
+const (
+	pathA = "src/A"
+	pathB = "src/B"
+	pathH = "src/H"
+	pathQ = "src/Q"
+)
+
+// TestExactRename_OneRename: an add and a delete with identical blobs
+// collapse into a single rename change.
+func (s *RenameSuite) TestExactRename_OneRename(c *C) {
+	a := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	b := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+
+	result := detectRenames(c, Changes{a, b}, nil, 1)
+	assertRename(c, b, a, result[0])
+}
+
+// TestExactRename_DifferentObjects: two adds plus a delete with a
+// different blob stay as three independent changes.
+func (s *RenameSuite) TestExactRename_DifferentObjects(c *C) {
+	a := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	h := makeAdd(c, makeFile(c, pathH, filemode.Regular, "foo"))
+	q := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "bar"))
+
+	result := detectRenames(c, Changes{a, h, q}, nil, 3)
+	c.Assert(result[0], DeepEquals, a)
+	c.Assert(result[1], DeepEquals, h)
+	c.Assert(result[2], DeepEquals, q)
+}
+
+// TestExactRename_OneRenameOneModify: a modification is passed through
+// untouched while the exact add/delete pair becomes a rename.
+func (s *RenameSuite) TestExactRename_OneRenameOneModify(c *C) {
+	c1 := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	c2 := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+	c3 := makeChange(c,
+		makeFile(c, pathH, filemode.Regular, "bar"),
+		makeFile(c, pathH, filemode.Regular, "bar"),
+	)
+
+	result := detectRenames(c, Changes{c1, c2, c3}, nil, 2)
+	c.Assert(result[0], DeepEquals, c3)
+	assertRename(c, c2, c1, result[1])
+}
+
+// TestExactRename_ManyRenames: two independent exact pairs are both
+// detected as renames.
+func (s *RenameSuite) TestExactRename_ManyRenames(c *C) {
+	c1 := makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo"))
+	c2 := makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo"))
+	c3 := makeAdd(c, makeFile(c, pathH, filemode.Regular, "bar"))
+	c4 := makeDelete(c, makeFile(c, pathB, filemode.Regular, "bar"))
+
+	result := detectRenames(c, Changes{c1, c2, c3, c4}, nil, 2)
+	assertRename(c, c4, c3, result[0])
+	assertRename(c, c2, c1, result[1])
+}
+
+// TestExactRename_MultipleIdenticalDeletes: when several identical
+// deletes compete for one add, the first delete wins the rename and the
+// remaining deletes pass through unchanged.
+func (s *RenameSuite) TestExactRename_MultipleIdenticalDeletes(c *C) {
+	changes := Changes{
+		makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, pathB, filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, pathH, filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, pathQ, filemode.Regular, "foo")),
+	}
+
+	result := detectRenames(c, changes, nil, 3)
+	assertRename(c, changes[0], changes[3], result[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+	c.Assert(result[2], DeepEquals, changes[2])
+}
+
+// TestRenameExact_PathBreaksTie: among blobs with identical content, the
+// most similar path decides which add pairs with which delete.
+func (s *RenameSuite) TestRenameExact_PathBreaksTie(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, "src/com/foo/a.java", filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, "src/com/foo/b.java", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "c.txt", filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, "d.txt", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "the_e_file.txt", filemode.Regular, "foo")),
+	}
+
+	// Add out of order to avoid first-match succeeding
+	result := detectRenames(c, Changes{
+		changes[0],
+		changes[3],
+		changes[4],
+		changes[1],
+		changes[2],
+	}, nil, 3)
+	assertRename(c, changes[3], changes[2], result[0])
+	assertRename(c, changes[1], changes[0], result[1])
+	c.Assert(result[2], DeepEquals, changes[4])
+}
+
+// TestExactRename_OneDeleteManyAdds: a single delete pairs with the
+// best-matching of several identical adds; the rest pass through.
+func (s *RenameSuite) TestExactRename_OneDeleteManyAdds(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, "src/com/foo/a.java", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "src/com/foo/b.java", filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, "c.txt", filemode.Regular, "foo")),
+		makeDelete(c, makeFile(c, "d.txt", filemode.Regular, "foo")),
+	}
+
+	result := detectRenames(c, changes, nil, 3)
+	assertRename(c, changes[3], changes[2], result[0])
+	c.Assert(result[1], DeepEquals, changes[0])
+	c.Assert(result[2], DeepEquals, changes[1])
+}
+
+// TestExactRename_UnstagedFile: the minimal delete+add pair with equal
+// content is reported as a rename.
+func (s *RenameSuite) TestExactRename_UnstagedFile(c *C) {
+	changes := Changes{
+		makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo")),
+		makeAdd(c, makeFile(c, pathB, filemode.Regular, "foo")),
+	}
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[0], changes[1], result[0])
+}
+
+// TestContentRename_OnePair: near-identical content (one line differs)
+// is matched by content similarity rather than exact hash.
+func (s *RenameSuite) TestContentRename_OnePair(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+		makeDelete(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+// TestContentRename_OneRenameTwoUnrelatedFiles: dissimilar add/delete
+// pairs are left alone while the similar pair becomes a rename.
+func (s *RenameSuite) TestContentRename_OneRenameTwoUnrelatedFiles(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+		makeAdd(c, makeFile(c, pathB, filemode.Regular, "some\nsort\nof\ntext\n")),
+		makeDelete(c, makeFile(c, pathH, filemode.Regular, "completely\nunrelated\ntext\n")),
+	}
+
+	result := detectRenames(c, changes, nil, 3)
+	c.Assert(result[0], DeepEquals, changes[2])
+	c.Assert(result[1], DeepEquals, changes[3])
+	assertRename(c, changes[1], changes[0], result[2])
+}
+
+// TestContentRename_LastByteDifferent: files that differ only in the
+// final byte still score high enough to pair as a rename.
+func (s *RenameSuite) TestContentRename_LastByteDifferent(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\na")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "foo\nbar\nb")),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+// TestContentRename_NewlinesOnly: three vs four bare newlines are still
+// similar enough to pair.
+func (s *RenameSuite) TestContentRename_NewlinesOnly(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, strings.Repeat("\n", 3))),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, strings.Repeat("\n", 4))),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+// TestContentRename_SameContentMultipleTimes: the same line repeated a
+// different number of times on each side still matches.
+func (s *RenameSuite) TestContentRename_SameContentMultipleTimes(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "a\na\na\na\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "a\na\na\n")),
+	}
+
+	result := detectRenames(c, changes, nil, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+// TestContentRename_OnePairRenameScore50: lowering RenameScore to 50
+// lets a weaker content match qualify as a rename.
+func (s *RenameSuite) TestContentRename_OnePairRenameScore50(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "ab\nab\nab\nac\nad\nae\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "ac\nab\nab\nab\naa\na0\na1\n")),
+	}
+
+	result := detectRenames(c, changes, &DiffTreeOptions{RenameScore: 50}, 1)
+	assertRename(c, changes[1], changes[0], result[0])
+}
+
+// TestNoRenames_SingleByteFiles: two adds and no delete — nothing can
+// possibly pair, so both changes pass through.
+func (s *RenameSuite) TestNoRenames_SingleByteFiles(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "a")),
+		makeAdd(c, makeFile(c, pathQ, filemode.Regular, "b")),
+	}
+
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+// TestNoRenames_EmptyFile: a lone empty add is passed through untouched.
+func (s *RenameSuite) TestNoRenames_EmptyFile(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "")),
+	}
+	result := detectRenames(c, changes, nil, 1)
+	c.Assert(result[0], DeepEquals, changes[0])
+}
+
+// TestNoRenames_EmptyFile2: an empty add never matches a non-empty
+// delete.
+func (s *RenameSuite) TestNoRenames_EmptyFile2(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "blah")),
+	}
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+// TestNoRenames_SymlinkAndFile: equal content but mismatched file modes
+// (regular vs symlink) prevents the pair from becoming a rename.
+func (s *RenameSuite) TestNoRenames_SymlinkAndFile(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "src/dest")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Symlink, "src/dest")),
+	}
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+// TestNoRenames_SymlinkAndFileSamePath: even with the same path, a mode
+// mismatch still blocks rename detection.
+func (s *RenameSuite) TestNoRenames_SymlinkAndFileSamePath(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "src/dest")),
+		makeDelete(c, makeFile(c, pathA, filemode.Symlink, "src/dest")),
+	}
+	result := detectRenames(c, changes, nil, 2)
+	c.Assert(result[0], DeepEquals, changes[0])
+	c.Assert(result[1], DeepEquals, changes[1])
+}
+
+// TestRenameLimit: with RenameLimit set below the number of candidate
+// add/delete pairs, all four changes pass through with no renames.
+func (s *RenameSuite) TestRenameLimit(c *C) {
+	changes := Changes{
+		makeAdd(c, makeFile(c, pathA, filemode.Regular, "foo\nbar\nbaz\nblarg\n")),
+		makeDelete(c, makeFile(c, pathB, filemode.Regular, "foo\nbar\nbaz\nblah\n")),
+		makeAdd(c, makeFile(c, pathH, filemode.Regular, "a\nb\nc\nd\n")),
+		makeDelete(c, makeFile(c, pathQ, filemode.Regular, "a\nb\nc\n")),
+	}
+
+	result := detectRenames(c, changes, &DiffTreeOptions{RenameLimit: 1}, 4)
+	for i, res := range result {
+		c.Assert(res, DeepEquals, changes[i])
+	}
+}
+
+// detectRenames runs DetectRenames over changes with opts (nil means
+// default options) and asserts the resulting change count before
+// returning the result for further assertions.
+func detectRenames(c *C, changes Changes, opts *DiffTreeOptions, expectedResults int) Changes {
+	result, err := DetectRenames(changes, opts)
+	c.Assert(err, IsNil)
+	c.Assert(result, HasLen, expectedResults)
+	return result
+}
+
+// assertRename checks that rename is exactly the combination of the
+// delete's From side and the add's To side.
+func assertRename(c *C, from, to *Change, rename *Change) {
+	c.Assert(&Change{From: from.From, To: to.To}, DeepEquals, rename)
+}
+
+// SimilarityIndexSuite exercises the similarityIndex used for
+// content-based rename scoring.
+type SimilarityIndexSuite struct {
+	BaseObjectsSuite
+}
+
+var _ = Suite(&SimilarityIndexSuite{})
+
+// TestScoreFiles builds similarity indexes from repository fixture files
+// and checks pairwise scores on a 0-10000 scale (10000 = identical).
+func (s *SimilarityIndexSuite) TestScoreFiles(c *C) {
+	tree := s.tree(c, plumbing.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"))
+	binary, err := tree.File("binary.jpg")
+	c.Assert(err, IsNil)
+	binIndex, err := fileSimilarityIndex(binary)
+	c.Assert(err, IsNil)
+
+	long, err := tree.File("json/long.json")
+	c.Assert(err, IsNil)
+	longIndex, err := fileSimilarityIndex(long)
+	c.Assert(err, IsNil)
+
+	short, err := tree.File("json/short.json")
+	c.Assert(err, IsNil)
+	shortIndex, err := fileSimilarityIndex(short)
+	c.Assert(err, IsNil)
+
+	php, err := tree.File("php/crappy.php")
+	c.Assert(err, IsNil)
+	phpIndex, err := fileSimilarityIndex(php)
+	c.Assert(err, IsNil)
+
+	testCases := []struct {
+		src, dst *similarityIndex
+		expectedScore int
+	}{
+		{binIndex, binIndex, 10000}, // same file
+		{shortIndex, longIndex, 32}, // slightly similar files
+		{longIndex, shortIndex, 32}, // same as previous, diff order
+		{shortIndex, phpIndex, 1}, // different files
+		{longIndex, binIndex, 0}, // code vs binary file
+	}
+
+	for _, tt := range testCases {
+		score := tt.src.score(tt.dst, 10000)
+		c.Assert(score, Equals, tt.expectedScore)
+	}
+}
+
+// TestHashContent: three distinct lines produce three distinct keys, and
+// each key's count reflects the total bytes of the lines hashing to it
+// ("B\n" appears twice, hence 4; the others once, hence 2).
+func (s *SimilarityIndexSuite) TestHashContent(c *C) {
+	idx := textIndex(c, "A\n"+
+		"B\n"+
+		"D\n"+
+		"B\n")
+
+	keyA := keyFor(c, "A\n")
+	keyB := keyFor(c, "B\n")
+	keyD := keyFor(c, "D\n")
+
+	c.Assert(keyA, Not(Equals), keyB)
+	c.Assert(keyA, Not(Equals), keyD)
+	c.Assert(keyD, Not(Equals), keyB)
+
+	c.Assert(idx.numHashes, Equals, 3)
+	c.Assert(idx.hashes[findIndex(idx, keyA)].count(), Equals, uint64(2))
+	c.Assert(idx.hashes[findIndex(idx, keyB)].count(), Equals, uint64(4))
+	c.Assert(idx.hashes[findIndex(idx, keyD)].count(), Equals, uint64(2))
+}
+
+// TestCommonSameFiles: identical content has all 8 bytes in common and a
+// perfect score in both directions.
+func (s *SimilarityIndexSuite) TestCommonSameFiles(c *C) {
+	content := "A\n" +
+		"B\n" +
+		"D\n" +
+		"B\n"
+
+	src := textIndex(c, content)
+	dst := textIndex(c, content)
+
+	c.Assert(src.common(dst), Equals, uint64(8))
+	c.Assert(dst.common(src), Equals, uint64(8))
+
+	c.Assert(src.score(dst, 100), Equals, 100)
+	c.Assert(dst.score(src, 100), Equals, 100)
+}
+
+// TestCommonSameFilesCR: carriage returns are ignored while hashing, so
+// CRLF and LF versions of the same content score as identical.
+func (s *SimilarityIndexSuite) TestCommonSameFilesCR(c *C) {
+	content := "A\r\n" +
+		"B\r\n" +
+		"D\r\n" +
+		"B\r\n"
+
+	src := textIndex(c, content)
+	dst := textIndex(c, strings.ReplaceAll(content, "\r", ""))
+
+	c.Assert(src.common(dst), Equals, uint64(8))
+	c.Assert(dst.common(src), Equals, uint64(8))
+
+	c.Assert(src.score(dst, 100), Equals, 100)
+	c.Assert(dst.score(src, 100), Equals, 100)
+}
+
+// TestCommonEmptyFiles: two empty indexes share nothing.
+func (s *SimilarityIndexSuite) TestCommonEmptyFiles(c *C) {
+	src := textIndex(c, "")
+	dst := textIndex(c, "")
+
+	c.Assert(src.common(dst), Equals, uint64(0))
+	c.Assert(dst.common(src), Equals, uint64(0))
+}
+
+// TestCommonTotallyDifferentFiles: disjoint single lines share nothing.
+func (s *SimilarityIndexSuite) TestCommonTotallyDifferentFiles(c *C) {
+	src := textIndex(c, "A\n")
+	dst := textIndex(c, "D\n")
+
+	c.Assert(src.common(dst), Equals, uint64(0))
+	c.Assert(dst.common(src), Equals, uint64(0))
+}
+
+// TestSimilarity75: three of four 2-byte lines match (6 of 8 bytes), so
+// the similarity score is 75 out of 100, symmetrically.
+func (s *SimilarityIndexSuite) TestSimilarity75(c *C) {
+	src := textIndex(c, "A\nB\nC\nD\n")
+	dst := textIndex(c, "A\nB\nC\nQ\n")
+
+	c.Assert(src.common(dst), Equals, uint64(6))
+	c.Assert(dst.common(src), Equals, uint64(6))
+
+	c.Assert(src.score(dst, 100), Equals, 75)
+	c.Assert(dst.score(src, 100), Equals, 75)
+}
+
+// keyFor hashes a single line in isolation and returns the key it maps
+// to, or -1 if the fresh index unexpectedly holds no entry.
+func keyFor(c *C, line string) int {
+	idx := newSimilarityIndex()
+	err := idx.hashContent(strings.NewReader(line), int64(len(line)), false)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.numHashes, Equals, 1)
+	for _, h := range idx.hashes {
+		if h != 0 {
+			return h.key()
+		}
+	}
+
+	return -1
+}
+
+// textIndex builds a similarityIndex over content, treating it as text
+// (the false argument marks it as non-binary).
+func textIndex(c *C, content string) *similarityIndex {
+	idx := newSimilarityIndex()
+	err := idx.hashContent(strings.NewReader(content), int64(len(content)), false)
+	c.Assert(err, IsNil)
+	return idx
+}
+
+// findIndex linear-scans the index's table for the slot holding key and
+// returns its position, or -1 when the key is absent.
+func findIndex(idx *similarityIndex, key int) int {
+	for i, h := range idx.hashes {
+		if h.key() == key {
+			return i
+		}
+	}
+	return -1
+}
+
+// makeFile builds an in-memory File whose blob holds content; the object
+// is hashed locally and is not persisted to any storer here.
+func makeFile(c *C, name string, mode filemode.FileMode, content string) *File {
+	obj := new(plumbing.MemoryObject)
+	obj.SetType(plumbing.BlobObject)
+	_, err := obj.Write([]byte(content))
+	c.Assert(err, IsNil)
+	return &File{
+		Name: name,
+		Mode: mode,
+		Blob: Blob{Hash: obj.Hash(), Size: obj.Size(), obj: obj},
+	}
+}
+
+// makeChangeEntry wraps f in a ChangeEntry backed by a throwaway memory
+// storer so the blob can be resolved during rename detection. Note the
+// entry's Name keeps the full path while TreeEntry.Name is the base name.
+func makeChangeEntry(f *File) ChangeEntry {
+	sto := memory.NewStorage()
+	// NOTE(review): SetEncodedObject's results are discarded here —
+	// presumably memory storage cannot fail; confirm if this helper is
+	// ever reused outside tests.
+	sto.SetEncodedObject(f.obj)
+	tree := &Tree{s: sto}
+
+	return ChangeEntry{
+		Name: f.Name,
+		Tree: tree,
+		TreeEntry: TreeEntry{
+			Name: filepath.Base(f.Name),
+			Mode: f.Mode,
+			Hash: f.Hash,
+		},
+	}
+}
+
+// makeAdd builds a Change with only a To side (an insertion).
+func makeAdd(c *C, f *File) *Change {
+	return makeChange(c, nil, f)
+}
+
+// makeDelete builds a Change with only a From side (a deletion).
+func makeDelete(c *C, f *File) *Change {
+	return makeChange(c, f, nil)
+}
+
+// makeChange builds a modification Change, or an add/delete when one
+// side is nil.
+func makeChange(c *C, from *File, to *File) *Change {
+	if from == nil {
+		return &Change{To: makeChangeEntry(to)}
+	}
+
+	if to == nil {
+		return &Change{From: makeChangeEntry(from)}
+	}
+
+	// NOTE(review): this guard is unreachable — when both arguments are
+	// nil, the "from == nil" branch above has already returned. It must
+	// be moved to the top of the function to ever fire.
+	if from == nil && to == nil {
+		c.Error("cannot make change without from or to")
+	}
+
+	return &Change{From: makeChangeEntry(from), To: makeChangeEntry(to)}
+}