aboutsummaryrefslogtreecommitdiffstats
path: root/blame.go
diff options
context:
space:
mode:
authorArieh Schneier <15041913+AriehSchneier@users.noreply.github.com>2023-07-06 08:45:49 +1000
committerArieh Schneier <15041913+AriehSchneier@users.noreply.github.com>2023-07-06 08:45:49 +1000
commite0fc8a8f00c238f7f7cdf0e0c59e743550587bdf (patch)
tree8b6d76278f0a5020fc7fc675908abc29e2a8025d /blame.go
parentdd4e2b7f4b01e2aaafcf182d2884a186a6f37d21 (diff)
downloadgo-git-e0fc8a8f00c238f7f7cdf0e0c59e743550587bdf.tar.gz
plumbing: blame, Complete rewrite. Fixes #603
Signed-off-by: Arieh Schneier <15041913+AriehSchneier@users.noreply.github.com>
Diffstat (limited to 'blame.go')
-rw-r--r--blame.go612
1 files changed, 450 insertions, 162 deletions
diff --git a/blame.go b/blame.go
index 43634b3..2a877dc 100644
--- a/blame.go
+++ b/blame.go
@@ -2,16 +2,18 @@ package git
import (
"bytes"
+ "container/heap"
"errors"
"fmt"
+ "io"
"strconv"
- "strings"
"time"
"unicode/utf8"
"github.com/go-git/go-git/v5/plumbing"
"github.com/go-git/go-git/v5/plumbing/object"
"github.com/go-git/go-git/v5/utils/diff"
+ "github.com/sergi/go-diff/diffmatchpatch"
)
// BlameResult represents the result of a Blame operation.
@@ -29,67 +31,86 @@ type BlameResult struct {
func Blame(c *object.Commit, path string) (*BlameResult, error) {
// The file to blame is identified by the input arguments:
// commit and path. commit is a Commit object obtained from a Repository. Path
- // represents a path to a specific file contained into the repository.
+ // represents a path to a specific file contained in the repository.
//
- // Blaming a file is a two step process:
+ // Blaming a file is done by walking the tree in reverse order trying to find where each line was last modified.
//
- // 1. Create a linear history of the commits affecting a file. We use
- // revlist.New for that.
+ // When a diff is found it cannot immediately assume it came from that commit, as it may have come from 1 of its
+ // parents, so it will first try to resolve those diffs from its parents, if it couldn't find the change in its
+ // parents then it will assign the change to itself.
//
- // 2. Then build a graph with a node for every line in every file in
- // the history of the file.
+ // When encountering 2 parents that have made the same change to a file it will choose the parent that was merged
+ // into the current branch first (this is determined by the order of the parents inside the commit).
//
- // Each node is assigned a commit: Start by the nodes in the first
- // commit. Assign that commit as the creator of all its lines.
- //
- // Then jump to the nodes in the next commit, and calculate the diff
- // between the two files. Newly created lines get
- // assigned the new commit as its origin. Modified lines also get
- // this new commit. Untouched lines retain the old commit.
- //
- // All this work is done in the assignOrigin function which holds all
- // the internal relevant data in a "blame" struct, that is not
- // exported.
- //
- // TODO: ways to improve the efficiency of this function:
- // 1. Improve revlist
- // 2. Improve how to traverse the history (example a backward traversal will
- // be much more efficient)
- //
- // TODO: ways to improve the function in general:
- // 1. Add memoization between revlist and assign.
- // 2. It is using much more memory than needed, see the TODOs below.
+ // This currently works on a line by line basis, if performance becomes an issue it could be changed to work with
+ // hunks rather than lines. Then when encountering diff hunks it would need to split them where necessary.
b := new(blame)
b.fRev = c
b.path = path
+ b.q = new(priorityQueue)
- // get all the file revisions
- if err := b.fillRevs(); err != nil {
+ file, err := b.fRev.File(path)
+ if err != nil {
return nil, err
}
-
- // calculate the line tracking graph and fill in
- // file contents in data.
- if err := b.fillGraphAndData(); err != nil {
+ finalLines, err := file.Lines()
+ if err != nil {
return nil, err
}
+ finalLength := len(finalLines)
- file, err := b.fRev.File(b.path)
+ needsMap := make([]lineMap, finalLength)
+ for i := range needsMap {
+ needsMap[i] = lineMap{i, i, nil, -1}
+ }
+ contents, err := file.Contents()
if err != nil {
return nil, err
}
- finalLines, err := file.Lines()
+ b.q.Push(&queueItem{
+ nil,
+ nil,
+ c,
+ path,
+ contents,
+ needsMap,
+ 0,
+ false,
+ 0,
+ })
+ items := make([]*queueItem, 0)
+ for {
+ items = items[:0]
+ for {
+ if b.q.Len() == 0 {
+ return nil, errors.New("invalid state: no items left on the blame queue")
+ }
+ item := b.q.Pop()
+ items = append(items, item)
+ next := b.q.Peek()
+ if next == nil || next.Hash != item.Commit.Hash {
+ break
+ }
+ }
+ finished, err := b.addBlames(items)
+ if err != nil {
+ return nil, err
+ }
+ if finished == true {
+ break
+ }
+ }
if err != nil {
return nil, err
}
- // Each node (line) holds the commit where it was introduced or
- // last modified. To achieve that we use the FORWARD algorithm
- // described in Zimmermann, et al. "Mining Version Archives for
- // Co-changed Lines", in proceedings of the Mining Software
- // Repositories workshop, Shanghai, May 22-23, 2006.
- lines, err := newLines(finalLines, b.sliceGraph(len(b.graph)-1))
+ b.lineToCommit = make([]*object.Commit, finalLength)
+ for i := range needsMap {
+ b.lineToCommit[i] = needsMap[i].Commit
+ }
+
+ lines, err := newLines(finalLines, b.lineToCommit)
if err != nil {
return nil, err
}
@@ -105,6 +126,8 @@ func Blame(c *object.Commit, path string) (*BlameResult, error) {
type Line struct {
// Author is the email address of the last author that modified the line.
Author string
+ // AuthorName is the name of the last author that modified the line.
+ AuthorName string
// Text is the original text of the line.
Text string
// Date is when the original text of the line was introduced
@@ -113,31 +136,21 @@ type Line struct {
Hash plumbing.Hash
}
-func newLine(author, text string, date time.Time, hash plumbing.Hash) *Line {
+func newLine(author, authorName, text string, date time.Time, hash plumbing.Hash) *Line {
return &Line{
- Author: author,
- Text: text,
- Hash: hash,
- Date: date,
+ Author: author,
+ AuthorName: authorName,
+ Text: text,
+ Hash: hash,
+ Date: date,
}
}
func newLines(contents []string, commits []*object.Commit) ([]*Line, error) {
- lcontents := len(contents)
- lcommits := len(commits)
-
- if lcontents != lcommits {
- if lcontents == lcommits-1 && contents[lcontents-1] != "\n" {
- contents = append(contents, "\n")
- } else {
- return nil, errors.New("contents and commits have different length")
- }
- }
-
- result := make([]*Line, 0, lcontents)
+ result := make([]*Line, 0, len(contents))
for i := range contents {
result = append(result, newLine(
- commits[i].Author.Email, contents[i],
+ commits[i].Author.Email, commits[i].Author.Name, contents[i],
commits[i].Author.When, commits[i].Hash,
))
}
@@ -152,151 +165,426 @@ type blame struct {
path string
// the commit of the final revision of the file to blame
fRev *object.Commit
- // the chain of revisions affecting the the file to blame
- revs []*object.Commit
- // the contents of the file across all its revisions
- data []string
- // the graph of the lines in the file across all the revisions
- graph [][]*object.Commit
+ // resolved lines
+ lineToCommit []*object.Commit
+ // queue of commits that need resolving
+ q *priorityQueue
}
-// calculate the history of a file "path", starting from commit "from", sorted by commit date.
-func (b *blame) fillRevs() error {
- var err error
-
- b.revs, err = references(b.fRev, b.path)
- return err
+type lineMap struct {
+ Orig, Cur int
+ Commit *object.Commit
+ FromParentNo int
}
-// build graph of a file from its revision history
-func (b *blame) fillGraphAndData() error {
- //TODO: not all commits are needed, only the current rev and the prev
- b.graph = make([][]*object.Commit, len(b.revs))
- b.data = make([]string, len(b.revs)) // file contents in all the revisions
- // for every revision of the file, starting with the first
- // one...
- for i, rev := range b.revs {
+func (b *blame) addBlames(curItems []*queueItem) (bool, error) {
+ curItem := curItems[0]
+
+ // Simple optimisation to merge paths, there is potential to go a bit further here and check for any duplicates
+ // not only if they are all the same.
+ if len(curItems) == 1 {
+ curItems = nil
+ } else if curItem.IdenticalToChild {
+ allSame := true
+ lenCurItems := len(curItems)
+ lowestParentNo := curItem.ParentNo
+ for i := 1; i < lenCurItems; i++ {
+ if !curItems[i].IdenticalToChild || curItem.Child != curItems[i].Child {
+ allSame = false
+ break
+ }
+ lowestParentNo = min(lowestParentNo, curItems[i].ParentNo)
+ }
+ if allSame {
+ curItem.Child.numParentsNeedResolving = curItem.Child.numParentsNeedResolving - lenCurItems + 1
+ curItems = nil // free the memory
+ curItem.ParentNo = lowestParentNo
+
+ // Now check if we can remove the parent completely
+ for curItem.Child.IdenticalToChild && curItem.Child.MergedChildren == nil && curItem.Child.numParentsNeedResolving == 1 {
+ oldChild := curItem.Child
+ curItem.Child = oldChild.Child
+ curItem.ParentNo = oldChild.ParentNo
+ }
+ }
+ }
+
+ // if we have more than 1 item for this commit, create a single needsMap
+ if len(curItems) > 1 {
+ curItem.MergedChildren = make([]childToNeedsMap, len(curItems))
+ for i, c := range curItems {
+ curItem.MergedChildren[i] = childToNeedsMap{c.Child, c.NeedsMap, c.IdenticalToChild, c.ParentNo}
+ }
+ newNeedsMap := make([]lineMap, 0, len(curItem.NeedsMap))
+ newNeedsMap = append(newNeedsMap, curItems[0].NeedsMap...)
+
+ for i := 1; i < len(curItems); i++ {
+ cur := curItems[i].NeedsMap
+ n := 0 // position in newNeedsMap
+ c := 0 // position in current list
+ for c < len(cur) {
+ if n == len(newNeedsMap) {
+ newNeedsMap = append(newNeedsMap, cur[c:]...)
+ break
+ } else if newNeedsMap[n].Cur == cur[c].Cur {
+ n++
+ c++
+ } else if newNeedsMap[n].Cur < cur[c].Cur {
+ n++
+ } else {
+ newNeedsMap = append(newNeedsMap, cur[c])
+ newPos := len(newNeedsMap) - 1
+ for newPos > n {
+ newNeedsMap[newPos-1], newNeedsMap[newPos] = newNeedsMap[newPos], newNeedsMap[newPos-1]
+ newPos--
+ }
+ }
+ }
+ }
+ curItem.NeedsMap = newNeedsMap
+ curItem.IdenticalToChild = false
+ curItem.Child = nil
+ curItems = nil // free the memory
+ }
+
+ parents, err := parentsContainingPath(curItem.path, curItem.Commit)
+ if err != nil {
+ return false, err
+ }
+
+ anyPushed := false
+ for parnetNo, prev := range parents {
+ currentHash, err := blobHash(curItem.path, curItem.Commit)
+ if err != nil {
+ return false, err
+ }
+ prevHash, err := blobHash(prev.Path, prev.Commit)
+ if err != nil {
+ return false, err
+ }
+ if currentHash == prevHash {
+ if len(parents) == 1 && curItem.MergedChildren == nil && curItem.IdenticalToChild {
+ // commit that has 1 parent and 1 child and is the same as both, bypass it completely
+ b.q.Push(&queueItem{
+ Child: curItem.Child,
+ Commit: prev.Commit,
+ path: prev.Path,
+ Contents: curItem.Contents,
+ NeedsMap: curItem.NeedsMap, // reuse the NeedsMap as we are throwing away this item
+ IdenticalToChild: true,
+ ParentNo: curItem.ParentNo,
+ })
+ } else {
+ b.q.Push(&queueItem{
+ Child: curItem,
+ Commit: prev.Commit,
+ path: prev.Path,
+ Contents: curItem.Contents,
+ NeedsMap: append([]lineMap(nil), curItem.NeedsMap...), // create new slice and copy
+ IdenticalToChild: true,
+ ParentNo: parnetNo,
+ })
+ curItem.numParentsNeedResolving++
+ }
+ anyPushed = true
+ continue
+ }
+
// get the contents of the file
- file, err := rev.File(b.path)
+ file, err := prev.Commit.File(prev.Path)
if err != nil {
- return nil
+ return false, err
}
- b.data[i], err = file.Contents()
+ prevContents, err := file.Contents()
if err != nil {
- return err
+ return false, err
}
- nLines := countLines(b.data[i])
- // create a node for each line
- b.graph[i] = make([]*object.Commit, nLines)
- // assign a commit to each node
- // if this is the first revision, then the node is assigned to
- // this first commit.
- if i == 0 {
- for j := 0; j < nLines; j++ {
- b.graph[i][j] = b.revs[i]
+
+ hunks := diff.Do(prevContents, curItem.Contents)
+ prevl := -1
+ curl := -1
+ need := 0
+ getFromParent := make([]lineMap, 0)
+ out:
+ for h := range hunks {
+ hLines := countLines(hunks[h].Text)
+ for hl := 0; hl < hLines; hl++ {
+ switch {
+ case hunks[h].Type == diffmatchpatch.DiffEqual:
+ prevl++
+ curl++
+ if curl == curItem.NeedsMap[need].Cur {
+ // add to needs
+ getFromParent = append(getFromParent, lineMap{curl, prevl, nil, -1})
+ // move to next need
+ need++
+ if need >= len(curItem.NeedsMap) {
+ break out
+ }
+ }
+ case hunks[h].Type == diffmatchpatch.DiffInsert:
+ curl++
+ if curl == curItem.NeedsMap[need].Cur {
+ // the line we want is added, it may have been added here (or by another parent), skip it for now
+ need++
+ if need >= len(curItem.NeedsMap) {
+ break out
+ }
+ }
+ case hunks[h].Type == diffmatchpatch.DiffDelete:
+ prevl += hLines
+ continue out
+ default:
+ return false, errors.New("invalid state: invalid hunk Type")
+ }
}
- } else {
- // if this is not the first commit, then assign to the old
- // commit or to the new one, depending on what the diff
- // says.
- b.assignOrigin(i, i-1)
+ }
+
+ if len(getFromParent) > 0 {
+ b.q.Push(&queueItem{
+ curItem,
+ nil,
+ prev.Commit,
+ prev.Path,
+ prevContents,
+ getFromParent,
+ 0,
+ false,
+ parnetNo,
+ })
+ curItem.numParentsNeedResolving++
+ anyPushed = true
}
}
- return nil
-}
-// sliceGraph returns a slice of commits (one per line) for a particular
-// revision of a file (0=first revision).
-func (b *blame) sliceGraph(i int) []*object.Commit {
- fVs := b.graph[i]
- result := make([]*object.Commit, 0, len(fVs))
- for _, v := range fVs {
- c := *v
- result = append(result, &c)
+ curItem.Contents = "" // no longer need, free the memory
+
+ if !anyPushed {
+ return finishNeeds(curItem)
}
- return result
+
+ return false, nil
}
-// Assigns origin to vertexes in current (c) rev from data in its previous (p)
-// revision
-func (b *blame) assignOrigin(c, p int) {
- // assign origin based on diff info
- hunks := diff.Do(b.data[p], b.data[c])
- sl := -1 // source line
- dl := -1 // destination line
- for h := range hunks {
- hLines := countLines(hunks[h].Text)
- for hl := 0; hl < hLines; hl++ {
- switch {
- case hunks[h].Type == 0:
- sl++
- dl++
- b.graph[c][dl] = b.graph[p][sl]
- case hunks[h].Type == 1:
- dl++
- b.graph[c][dl] = b.revs[c]
- case hunks[h].Type == -1:
- sl++
- default:
- panic("unreachable")
+func finishNeeds(curItem *queueItem) (bool, error) {
+ // any needs left in the needsMap must have come from this revision
+ for i := range curItem.NeedsMap {
+ if curItem.NeedsMap[i].Commit == nil {
+ curItem.NeedsMap[i].Commit = curItem.Commit
+ curItem.NeedsMap[i].FromParentNo = -1
+ }
+ }
+
+ if curItem.Child == nil && curItem.MergedChildren == nil {
+ return true, nil
+ }
+
+ if curItem.MergedChildren == nil {
+ return applyNeeds(curItem.Child, curItem.NeedsMap, curItem.IdenticalToChild, curItem.ParentNo)
+ }
+
+ for _, ctn := range curItem.MergedChildren {
+ m := 0 // position in merged needs map
+ p := 0 // position in parent needs map
+ for p < len(ctn.NeedsMap) {
+ if ctn.NeedsMap[p].Cur == curItem.NeedsMap[m].Cur {
+ ctn.NeedsMap[p].Commit = curItem.NeedsMap[m].Commit
+ m++
+ p++
+ } else if ctn.NeedsMap[p].Cur < curItem.NeedsMap[m].Cur {
+ p++
+ } else {
+ m++
}
}
+ finished, err := applyNeeds(ctn.Child, ctn.NeedsMap, ctn.IdenticalToChild, ctn.ParentNo)
+ if finished || err != nil {
+ return finished, err
+ }
}
-}
-// GoString prints the results of a Blame using git-blame's style.
-func (b *blame) GoString() string {
- var buf bytes.Buffer
+ return false, nil
+}
- file, err := b.fRev.File(b.path)
- if err != nil {
- panic("PrettyPrint: internal error in repo.Data")
+func applyNeeds(child *queueItem, needsMap []lineMap, identicalToChild bool, parentNo int) (bool, error) {
+ if identicalToChild {
+ for i := range child.NeedsMap {
+ l := &child.NeedsMap[i]
+ if l.Cur != needsMap[i].Cur || l.Orig != needsMap[i].Orig {
+ return false, errors.New("needsMap isn't the same? Why not??")
+ }
+ if l.Commit == nil || parentNo < l.FromParentNo {
+ l.Commit = needsMap[i].Commit
+ l.FromParentNo = parentNo
+ }
+ }
+ } else {
+ i := 0
+ out:
+ for j := range child.NeedsMap {
+ l := &child.NeedsMap[j]
+ for needsMap[i].Orig < l.Cur {
+ i++
+ if i == len(needsMap) {
+ break out
+ }
+ }
+ if l.Cur == needsMap[i].Orig {
+ if l.Commit == nil || parentNo < l.FromParentNo {
+ l.Commit = needsMap[i].Commit
+ l.FromParentNo = parentNo
+ }
+ }
+ }
}
- contents, err := file.Contents()
- if err != nil {
- panic("PrettyPrint: internal error in repo.Data")
+ child.numParentsNeedResolving--
+ if child.numParentsNeedResolving == 0 {
+ finished, err := finishNeeds(child)
+ if finished || err != nil {
+ return finished, err
+ }
}
- lines := strings.Split(contents, "\n")
+ return false, nil
+}
+
+// String prints the results of a Blame using git-blame's style.
+func (b BlameResult) String() string {
+ var buf bytes.Buffer
+
// max line number length
- mlnl := len(strconv.Itoa(len(lines)))
+ mlnl := len(strconv.Itoa(len(b.Lines)))
// max author length
mal := b.maxAuthorLength()
- format := fmt.Sprintf("%%s (%%-%ds %%%dd) %%s\n",
- mal, mlnl)
+ format := fmt.Sprintf("%%s (%%-%ds %%s %%%dd) %%s\n", mal, mlnl)
- fVs := b.graph[len(b.graph)-1]
- for ln, v := range fVs {
- fmt.Fprintf(&buf, format, v.Hash.String()[:8],
- prettyPrintAuthor(fVs[ln]), ln+1, lines[ln])
+ for ln := range b.Lines {
+ _, _ = fmt.Fprintf(&buf, format, b.Lines[ln].Hash.String()[:8],
+ b.Lines[ln].AuthorName, b.Lines[ln].Date.Format("2006-01-02 15:04:05 -0700"), ln+1, b.Lines[ln].Text)
}
return buf.String()
}
-// utility function to pretty print the author.
-func prettyPrintAuthor(c *object.Commit) string {
- return fmt.Sprintf("%s %s", c.Author.Name, c.Author.When.Format("2006-01-02"))
-}
-
// utility function to calculate the number of runes needed
// to print the longest author name in the blame of a file.
-func (b *blame) maxAuthorLength() int {
- memo := make(map[plumbing.Hash]struct{}, len(b.graph)-1)
- fVs := b.graph[len(b.graph)-1]
+func (b BlameResult) maxAuthorLength() int {
m := 0
- for ln := range fVs {
- if _, ok := memo[fVs[ln].Hash]; ok {
- continue
- }
- memo[fVs[ln].Hash] = struct{}{}
- m = max(m, utf8.RuneCountInString(prettyPrintAuthor(fVs[ln])))
+ for ln := range b.Lines {
+ m = max(m, utf8.RuneCountInString(b.Lines[ln].AuthorName))
}
return m
}
+func min(a, b int) int {
+ if a < b {
+ return a
+ }
+ return b
+}
+
func max(a, b int) int {
if a > b {
return a
}
return b
}
+
+type childToNeedsMap struct {
+ Child *queueItem
+ NeedsMap []lineMap
+ IdenticalToChild bool
+ ParentNo int
+}
+
+type queueItem struct {
+ Child *queueItem
+ MergedChildren []childToNeedsMap
+ Commit *object.Commit
+ path string
+ Contents string
+ NeedsMap []lineMap
+ numParentsNeedResolving int
+ IdenticalToChild bool
+ ParentNo int
+}
+
+type priorityQueueImp []*queueItem
+
+func (pq *priorityQueueImp) Len() int { return len(*pq) }
+func (pq *priorityQueueImp) Less(i, j int) bool {
+ return !(*pq)[i].Commit.Less((*pq)[j].Commit)
+}
+func (pq *priorityQueueImp) Swap(i, j int) { (*pq)[i], (*pq)[j] = (*pq)[j], (*pq)[i] }
+func (pq *priorityQueueImp) Push(x any) { *pq = append(*pq, x.(*queueItem)) }
+func (pq *priorityQueueImp) Pop() any {
+ n := len(*pq)
+ ret := (*pq)[n-1]
+ (*pq)[n-1] = nil // ovoid memory leak
+ *pq = (*pq)[0 : n-1]
+
+ return ret
+}
+func (pq *priorityQueueImp) Peek() *object.Commit {
+ if len(*pq) == 0 {
+ return nil
+ }
+ return (*pq)[0].Commit
+}
+
+type priorityQueue priorityQueueImp
+
+func (pq *priorityQueue) Init() { heap.Init((*priorityQueueImp)(pq)) }
+func (pq *priorityQueue) Len() int { return (*priorityQueueImp)(pq).Len() }
+func (pq *priorityQueue) Push(c *queueItem) {
+ heap.Push((*priorityQueueImp)(pq), c)
+}
+func (pq *priorityQueue) Pop() *queueItem {
+ return heap.Pop((*priorityQueueImp)(pq)).(*queueItem)
+}
+func (pq *priorityQueue) Peek() *object.Commit { return (*priorityQueueImp)(pq).Peek() }
+
+type parentCommit struct {
+ Commit *object.Commit
+ Path string
+}
+
+func parentsContainingPath(path string, c *object.Commit) ([]parentCommit, error) {
+ // TODO: benchmark this method making git.object.Commit.parent public instead of using
+ // an iterator
+ var result []parentCommit
+ iter := c.Parents()
+ for {
+ parent, err := iter.Next()
+ if err == io.EOF {
+ return result, nil
+ }
+ if err != nil {
+ return nil, err
+ }
+ if _, err := parent.File(path); err == nil {
+ result = append(result, parentCommit{parent, path})
+ } else {
+ // look for renames
+ patch, err := parent.Patch(c)
+ if err != nil {
+ return nil, err
+ } else if patch != nil {
+ for _, fp := range patch.FilePatches() {
+ from, to := fp.Files()
+ if from != nil && to != nil && to.Path() == path {
+ result = append(result, parentCommit{parent, from.Path()})
+ break
+ }
+ }
+ }
+ }
+ }
+}
+
+func blobHash(path string, commit *object.Commit) (plumbing.Hash, error) {
+ file, err := commit.File(path)
+ if err != nil {
+ return plumbing.ZeroHash, err
+ }
+ return file.Hash, nil
+}