From e0fc8a8f00c238f7f7cdf0e0c59e743550587bdf Mon Sep 17 00:00:00 2001 From: Arieh Schneier <15041913+AriehSchneier@users.noreply.github.com> Date: Thu, 6 Jul 2023 08:45:49 +1000 Subject: plumbing: blame, Complete rewrite. Fixes #603 Signed-off-by: Arieh Schneier <15041913+AriehSchneier@users.noreply.github.com> --- blame.go | 612 ++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 450 insertions(+), 162 deletions(-) (limited to 'blame.go') diff --git a/blame.go b/blame.go index 43634b3..2a877dc 100644 --- a/blame.go +++ b/blame.go @@ -2,16 +2,18 @@ package git import ( "bytes" + "container/heap" "errors" "fmt" + "io" "strconv" - "strings" "time" "unicode/utf8" "github.com/go-git/go-git/v5/plumbing" "github.com/go-git/go-git/v5/plumbing/object" "github.com/go-git/go-git/v5/utils/diff" + "github.com/sergi/go-diff/diffmatchpatch" ) // BlameResult represents the result of a Blame operation. @@ -29,67 +31,86 @@ type BlameResult struct { func Blame(c *object.Commit, path string) (*BlameResult, error) { // The file to blame is identified by the input arguments: // commit and path. commit is a Commit object obtained from a Repository. Path - // represents a path to a specific file contained into the repository. + // represents a path to a specific file contained in the repository. // - // Blaming a file is a two step process: + // Blaming a file is done by walking the tree in reverse order trying to find where each line was last modified. // - // 1. Create a linear history of the commits affecting a file. We use - // revlist.New for that. + // When a diff is found it cannot immediately assume it came from that commit, as it may have come from 1 of its + // parents, so it will first try to resolve those diffs from its parents, if it couldn't find the change in its + // parents then it will assign the change to itself. // - // 2. Then build a graph with a node for every line in every file in - // the history of the file. + // When encountering 2 parents that have made the same change to a file it will choose the parent that was merged + // into the current branch first (this is determined by the order of the parents inside the commit). // - // Each node is assigned a commit: Start by the nodes in the first - // commit. Assign that commit as the creator of all its lines. - // - // Then jump to the nodes in the next commit, and calculate the diff - // between the two files. Newly created lines get - // assigned the new commit as its origin. Modified lines also get - // this new commit. Untouched lines retain the old commit. - // - // All this work is done in the assignOrigin function which holds all - // the internal relevant data in a "blame" struct, that is not - // exported. - // - // TODO: ways to improve the efficiency of this function: - // 1. Improve revlist - // 2. Improve how to traverse the history (example a backward traversal will - // be much more efficient) - // - // TODO: ways to improve the function in general: - // 1. Add memoization between revlist and assign. - // 2. It is using much more memory than needed, see the TODOs below. + // This currently works on a line by line basis, if performance becomes an issue it could be changed to work with + // hunks rather than lines. Then when encountering diff hunks it would need to split them where necessary. b := new(blame) b.fRev = c b.path = path + b.q = new(priorityQueue) - // get all the file revisions - if err := b.fillRevs(); err != nil { + file, err := b.fRev.File(path) + if err != nil { return nil, err } - - // calculate the line tracking graph and fill in - // file contents in data. - if err := b.fillGraphAndData(); err != nil { + finalLines, err := file.Lines() + if err != nil { return nil, err } + finalLength := len(finalLines) - file, err := b.fRev.File(b.path) + needsMap := make([]lineMap, finalLength) + for i := range needsMap { + needsMap[i] = lineMap{i, i, nil, -1} + } + contents, err := file.Contents() if err != nil { return nil, err } - finalLines, err := file.Lines() + b.q.Push(&queueItem{ + nil, + nil, + c, + path, + contents, + needsMap, + 0, + false, + 0, + }) + items := make([]*queueItem, 0) + for { + items = items[:0] + for { + if b.q.Len() == 0 { + return nil, errors.New("invalid state: no items left on the blame queue") + } + item := b.q.Pop() + items = append(items, item) + next := b.q.Peek() + if next == nil || next.Hash != item.Commit.Hash { + break + } + } + finished, err := b.addBlames(items) + if err != nil { + return nil, err + } + if finished == true { + break + } + } if err != nil { return nil, err } - // Each node (line) holds the commit where it was introduced or - // last modified. To achieve that we use the FORWARD algorithm - // described in Zimmermann, et al. "Mining Version Archives for - // Co-changed Lines", in proceedings of the Mining Software - // Repositories workshop, Shanghai, May 22-23, 2006. - lines, err := newLines(finalLines, b.sliceGraph(len(b.graph)-1)) + b.lineToCommit = make([]*object.Commit, finalLength) + for i := range needsMap { + b.lineToCommit[i] = needsMap[i].Commit + } + + lines, err := newLines(finalLines, b.lineToCommit) if err != nil { return nil, err } @@ -105,6 +126,8 @@ func Blame(c *object.Commit, path string) (*BlameResult, error) { type Line struct { // Author is the email address of the last author that modified the line. Author string + // AuthorName is the name of the last author that modified the line. + AuthorName string // Text is the original text of the line. Text string // Date is when the original text of the line was introduced @@ -113,31 +136,21 @@ type Line struct { Hash plumbing.Hash } -func newLine(author, text string, date time.Time, hash plumbing.Hash) *Line { +func newLine(author, authorName, text string, date time.Time, hash plumbing.Hash) *Line { return &Line{ - Author: author, - Text: text, - Hash: hash, - Date: date, + Author: author, + AuthorName: authorName, + Text: text, + Hash: hash, + Date: date, } } func newLines(contents []string, commits []*object.Commit) ([]*Line, error) { - lcontents := len(contents) - lcommits := len(commits) - - if lcontents != lcommits { - if lcontents == lcommits-1 && contents[lcontents-1] != "\n" { - contents = append(contents, "\n") - } else { - return nil, errors.New("contents and commits have different length") - } - } - - result := make([]*Line, 0, lcontents) + result := make([]*Line, 0, len(contents)) for i := range contents { result = append(result, newLine( - commits[i].Author.Email, contents[i], + commits[i].Author.Email, commits[i].Author.Name, contents[i], commits[i].Author.When, commits[i].Hash, )) } @@ -152,151 +165,426 @@ type blame struct { path string // the commit of the final revision of the file to blame fRev *object.Commit - // the chain of revisions affecting the the file to blame - revs []*object.Commit - // the contents of the file across all its revisions - data []string - // the graph of the lines in the file across all the revisions - graph [][]*object.Commit + // resolved lines + lineToCommit []*object.Commit + // queue of commits that need resolving + q *priorityQueue } -// calculate the history of a file "path", starting from commit "from", sorted by commit date. -func (b *blame) fillRevs() error { - var err error - - b.revs, err = references(b.fRev, b.path) - return err +type lineMap struct { + Orig, Cur int + Commit *object.Commit + FromParentNo int } -// build graph of a file from its revision history -func (b *blame) fillGraphAndData() error { - //TODO: not all commits are needed, only the current rev and the prev - b.graph = make([][]*object.Commit, len(b.revs)) - b.data = make([]string, len(b.revs)) // file contents in all the revisions - // for every revision of the file, starting with the first - // one... - for i, rev := range b.revs { +func (b *blame) addBlames(curItems []*queueItem) (bool, error) { + curItem := curItems[0] + + // Simple optimisation to merge paths, there is potential to go a bit further here and check for any duplicates + // not only if they are all the same. + if len(curItems) == 1 { + curItems = nil + } else if curItem.IdenticalToChild { + allSame := true + lenCurItems := len(curItems) + lowestParentNo := curItem.ParentNo + for i := 1; i < lenCurItems; i++ { + if !curItems[i].IdenticalToChild || curItem.Child != curItems[i].Child { + allSame = false + break + } + lowestParentNo = min(lowestParentNo, curItems[i].ParentNo) + } + if allSame { + curItem.Child.numParentsNeedResolving = curItem.Child.numParentsNeedResolving - lenCurItems + 1 + curItems = nil // free the memory + curItem.ParentNo = lowestParentNo + + // Now check if we can remove the parent completely + for curItem.Child.IdenticalToChild && curItem.Child.MergedChildren == nil && curItem.Child.numParentsNeedResolving == 1 { + oldChild := curItem.Child + curItem.Child = oldChild.Child + curItem.ParentNo = oldChild.ParentNo + } + } + } + + // if we have more than 1 item for this commit, create a single needsMap + if len(curItems) > 1 { + curItem.MergedChildren = make([]childToNeedsMap, len(curItems)) + for i, c := range curItems { + curItem.MergedChildren[i] = childToNeedsMap{c.Child, c.NeedsMap, c.IdenticalToChild, c.ParentNo} + } + newNeedsMap := make([]lineMap, 0, len(curItem.NeedsMap)) + newNeedsMap = append(newNeedsMap, curItems[0].NeedsMap...) + + for i := 1; i < len(curItems); i++ { + cur := curItems[i].NeedsMap + n := 0 // position in newNeedsMap + c := 0 // position in current list + for c < len(cur) { + if n == len(newNeedsMap) { + newNeedsMap = append(newNeedsMap, cur[c:]...) + break + } else if newNeedsMap[n].Cur == cur[c].Cur { + n++ + c++ + } else if newNeedsMap[n].Cur < cur[c].Cur { + n++ + } else { + newNeedsMap = append(newNeedsMap, cur[c]) + newPos := len(newNeedsMap) - 1 + for newPos > n { + newNeedsMap[newPos-1], newNeedsMap[newPos] = newNeedsMap[newPos], newNeedsMap[newPos-1] + newPos-- + } + } + } + } + curItem.NeedsMap = newNeedsMap + curItem.IdenticalToChild = false + curItem.Child = nil + curItems = nil // free the memory + } + + parents, err := parentsContainingPath(curItem.path, curItem.Commit) + if err != nil { + return false, err + } + + anyPushed := false + for parnetNo, prev := range parents { + currentHash, err := blobHash(curItem.path, curItem.Commit) + if err != nil { + return false, err + } + prevHash, err := blobHash(prev.Path, prev.Commit) + if err != nil { + return false, err + } + if currentHash == prevHash { + if len(parents) == 1 && curItem.MergedChildren == nil && curItem.IdenticalToChild { + // commit that has 1 parent and 1 child and is the same as both, bypass it completely + b.q.Push(&queueItem{ + Child: curItem.Child, + Commit: prev.Commit, + path: prev.Path, + Contents: curItem.Contents, + NeedsMap: curItem.NeedsMap, // reuse the NeedsMap as we are throwing away this item + IdenticalToChild: true, + ParentNo: curItem.ParentNo, + }) + } else { + b.q.Push(&queueItem{ + Child: curItem, + Commit: prev.Commit, + path: prev.Path, + Contents: curItem.Contents, + NeedsMap: append([]lineMap(nil), curItem.NeedsMap...), // create new slice and copy + IdenticalToChild: true, + ParentNo: parnetNo, + }) + curItem.numParentsNeedResolving++ + } + anyPushed = true + continue + } + // get the contents of the file - file, err := rev.File(b.path) + file, err := prev.Commit.File(prev.Path) if err != nil { - return nil + return false, err } - b.data[i], err = file.Contents() + prevContents, err := file.Contents() if err != nil { - return err + return false, err } - nLines := countLines(b.data[i]) - // create a node for each line - b.graph[i] = make([]*object.Commit, nLines) - // assign a commit to each node - // if this is the first revision, then the node is assigned to - // this first commit. - if i == 0 { - for j := 0; j < nLines; j++ { - b.graph[i][j] = b.revs[i] + + hunks := diff.Do(prevContents, curItem.Contents) + prevl := -1 + curl := -1 + need := 0 + getFromParent := make([]lineMap, 0) + out: + for h := range hunks { + hLines := countLines(hunks[h].Text) + for hl := 0; hl < hLines; hl++ { + switch { + case hunks[h].Type == diffmatchpatch.DiffEqual: + prevl++ + curl++ + if curl == curItem.NeedsMap[need].Cur { + // add to needs + getFromParent = append(getFromParent, lineMap{curl, prevl, nil, -1}) + // move to next need + need++ + if need >= len(curItem.NeedsMap) { + break out + } + } + case hunks[h].Type == diffmatchpatch.DiffInsert: + curl++ + if curl == curItem.NeedsMap[need].Cur { + // the line we want is added, it may have been added here (or by another parent), skip it for now + need++ + if need >= len(curItem.NeedsMap) { + break out + } + } + case hunks[h].Type == diffmatchpatch.DiffDelete: + prevl += hLines + continue out + default: + return false, errors.New("invalid state: invalid hunk Type") + } } - } else { - // if this is not the first commit, then assign to the old - // commit or to the new one, depending on what the diff - // says. - b.assignOrigin(i, i-1) + } + + if len(getFromParent) > 0 { + b.q.Push(&queueItem{ + curItem, + nil, + prev.Commit, + prev.Path, + prevContents, + getFromParent, + 0, + false, + parnetNo, + }) + curItem.numParentsNeedResolving++ + anyPushed = true } } - return nil -} -// sliceGraph returns a slice of commits (one per line) for a particular -// revision of a file (0=first revision). -func (b *blame) sliceGraph(i int) []*object.Commit { - fVs := b.graph[i] - result := make([]*object.Commit, 0, len(fVs)) - for _, v := range fVs { - c := *v - result = append(result, &c) + curItem.Contents = "" // no longer need, free the memory + + if !anyPushed { + return finishNeeds(curItem) } - return result + + return false, nil } -// Assigns origin to vertexes in current (c) rev from data in its previous (p) -// revision -func (b *blame) assignOrigin(c, p int) { - // assign origin based on diff info - hunks := diff.Do(b.data[p], b.data[c]) - sl := -1 // source line - dl := -1 // destination line - for h := range hunks { - hLines := countLines(hunks[h].Text) - for hl := 0; hl < hLines; hl++ { - switch { - case hunks[h].Type == 0: - sl++ - dl++ - b.graph[c][dl] = b.graph[p][sl] - case hunks[h].Type == 1: - dl++ - b.graph[c][dl] = b.revs[c] - case hunks[h].Type == -1: - sl++ - default: - panic("unreachable") +func finishNeeds(curItem *queueItem) (bool, error) { + // any needs left in the needsMap must have come from this revision + for i := range curItem.NeedsMap { + if curItem.NeedsMap[i].Commit == nil { + curItem.NeedsMap[i].Commit = curItem.Commit + curItem.NeedsMap[i].FromParentNo = -1 + } + } + + if curItem.Child == nil && curItem.MergedChildren == nil { + return true, nil + } + + if curItem.MergedChildren == nil { + return applyNeeds(curItem.Child, curItem.NeedsMap, curItem.IdenticalToChild, curItem.ParentNo) + } + + for _, ctn := range curItem.MergedChildren { + m := 0 // position in merged needs map + p := 0 // position in parent needs map + for p < len(ctn.NeedsMap) { + if ctn.NeedsMap[p].Cur == curItem.NeedsMap[m].Cur { + ctn.NeedsMap[p].Commit = curItem.NeedsMap[m].Commit + m++ + p++ + } else if ctn.NeedsMap[p].Cur < curItem.NeedsMap[m].Cur { + p++ + } else { + m++ } } + finished, err := applyNeeds(ctn.Child, ctn.NeedsMap, ctn.IdenticalToChild, ctn.ParentNo) + if finished || err != nil { + return finished, err + } } -} -// GoString prints the results of a Blame using git-blame's style. -func (b *blame) GoString() string { - var buf bytes.Buffer + return false, nil +} - file, err := b.fRev.File(b.path) - if err != nil { - panic("PrettyPrint: internal error in repo.Data") +func applyNeeds(child *queueItem, needsMap []lineMap, identicalToChild bool, parentNo int) (bool, error) { + if identicalToChild { + for i := range child.NeedsMap { + l := &child.NeedsMap[i] + if l.Cur != needsMap[i].Cur || l.Orig != needsMap[i].Orig { + return false, errors.New("needsMap isn't the same? Why not??") + } + if l.Commit == nil || parentNo < l.FromParentNo { + l.Commit = needsMap[i].Commit + l.FromParentNo = parentNo + } + } + } else { + i := 0 + out: + for j := range child.NeedsMap { + l := &child.NeedsMap[j] + for needsMap[i].Orig < l.Cur { + i++ + if i == len(needsMap) { + break out + } + } + if l.Cur == needsMap[i].Orig { + if l.Commit == nil || parentNo < l.FromParentNo { + l.Commit = needsMap[i].Commit + l.FromParentNo = parentNo + } + } + } } - contents, err := file.Contents() - if err != nil { - panic("PrettyPrint: internal error in repo.Data") + child.numParentsNeedResolving-- + if child.numParentsNeedResolving == 0 { + finished, err := finishNeeds(child) + if finished || err != nil { + return finished, err + } } - lines := strings.Split(contents, "\n") + return false, nil +} + +// String prints the results of a Blame using git-blame's style. +func (b BlameResult) String() string { + var buf bytes.Buffer + // max line number length - mlnl := len(strconv.Itoa(len(lines))) + mlnl := len(strconv.Itoa(len(b.Lines))) // max author length mal := b.maxAuthorLength() - format := fmt.Sprintf("%%s (%%-%ds %%%dd) %%s\n", - mal, mlnl) + format := fmt.Sprintf("%%s (%%-%ds %%s %%%dd) %%s\n", mal, mlnl) - fVs := b.graph[len(b.graph)-1] - for ln, v := range fVs { - fmt.Fprintf(&buf, format, v.Hash.String()[:8], - prettyPrintAuthor(fVs[ln]), ln+1, lines[ln]) + for ln := range b.Lines { + _, _ = fmt.Fprintf(&buf, format, b.Lines[ln].Hash.String()[:8], + b.Lines[ln].AuthorName, b.Lines[ln].Date.Format("2006-01-02 15:04:05 -0700"), ln+1, b.Lines[ln].Text) } return buf.String() } -// utility function to pretty print the author. -func prettyPrintAuthor(c *object.Commit) string { - return fmt.Sprintf("%s %s", c.Author.Name, c.Author.When.Format("2006-01-02")) -} - // utility function to calculate the number of runes needed // to print the longest author name in the blame of a file. -func (b *blame) maxAuthorLength() int { - memo := make(map[plumbing.Hash]struct{}, len(b.graph)-1) - fVs := b.graph[len(b.graph)-1] +func (b BlameResult) maxAuthorLength() int { m := 0 - for ln := range fVs { - if _, ok := memo[fVs[ln].Hash]; ok { - continue - } - memo[fVs[ln].Hash] = struct{}{} - m = max(m, utf8.RuneCountInString(prettyPrintAuthor(fVs[ln]))) + for ln := range b.Lines { + m = max(m, utf8.RuneCountInString(b.Lines[ln].AuthorName)) } return m } +func min(a, b int) int { + if a < b { + return a + } + return b +} + func max(a, b int) int { if a > b { return a } return b } + +type childToNeedsMap struct { + Child *queueItem + NeedsMap []lineMap + IdenticalToChild bool + ParentNo int +} + +type queueItem struct { + Child *queueItem + MergedChildren []childToNeedsMap + Commit *object.Commit + path string + Contents string + NeedsMap []lineMap + numParentsNeedResolving int + IdenticalToChild bool + ParentNo int +} + +type priorityQueueImp []*queueItem + +func (pq *priorityQueueImp) Len() int { return len(*pq) } +func (pq *priorityQueueImp) Less(i, j int) bool { + return !(*pq)[i].Commit.Less((*pq)[j].Commit) +} +func (pq *priorityQueueImp) Swap(i, j int) { (*pq)[i], (*pq)[j] = (*pq)[j], (*pq)[i] } +func (pq *priorityQueueImp) Push(x any) { *pq = append(*pq, x.(*queueItem)) } +func (pq *priorityQueueImp) Pop() any { + n := len(*pq) + ret := (*pq)[n-1] + (*pq)[n-1] = nil // ovoid memory leak + *pq = (*pq)[0 : n-1] + + return ret +} +func (pq *priorityQueueImp) Peek() *object.Commit { + if len(*pq) == 0 { + return nil + } + return (*pq)[0].Commit +} + +type priorityQueue priorityQueueImp + +func (pq *priorityQueue) Init() { heap.Init((*priorityQueueImp)(pq)) } +func (pq *priorityQueue) Len() int { return (*priorityQueueImp)(pq).Len() } +func (pq *priorityQueue) Push(c *queueItem) { + heap.Push((*priorityQueueImp)(pq), c) +} +func (pq *priorityQueue) Pop() *queueItem { + return heap.Pop((*priorityQueueImp)(pq)).(*queueItem) +} +func (pq *priorityQueue) Peek() *object.Commit { return (*priorityQueueImp)(pq).Peek() } + +type parentCommit struct { + Commit *object.Commit + Path string +} + +func parentsContainingPath(path string, c *object.Commit) ([]parentCommit, error) { + // TODO: benchmark this method making git.object.Commit.parent public instead of using + // an iterator + var result []parentCommit + iter := c.Parents() + for { + parent, err := iter.Next() + if err == io.EOF { + return result, nil + } + if err != nil { + return nil, err + } + if _, err := parent.File(path); err == nil { + result = append(result, parentCommit{parent, path}) + } else { + // look for renames + patch, err := parent.Patch(c) + if err != nil { + return nil, err + } else if patch != nil { + for _, fp := range patch.FilePatches() { + from, to := fp.Files() + if from != nil && to != nil && to.Path() == path { + result = append(result, parentCommit{parent, from.Path()}) + break + } + } + } + } + } +} + +func blobHash(path string, commit *object.Commit) (plumbing.Hash, error) { + file, err := commit.File(path) + if err != nil { + return plumbing.ZeroHash, err + } + return file.Hash, nil +} -- cgit