diff options
Diffstat (limited to 'blame/blame.go')
-rw-r--r-- | blame/blame.go | 223 |
1 files changed, 223 insertions, 0 deletions
diff --git a/blame/blame.go b/blame/blame.go new file mode 100644 index 0000000..774362b --- /dev/null +++ b/blame/blame.go @@ -0,0 +1,223 @@ +// Package blame contains blaming functionality for files in the repo. +// +// Blaming a file is finding what commit was the last to modify each of +// the lines in the file, therefore the output of a blaming operation is +// usualy a slice of commits, one commit per line in the file. +// +// This package also provides a pretty print function to output the +// results of a blame in a similar format to the git-blame command. +package blame + +import ( + "bytes" + "fmt" + "sort" + "strconv" + "strings" + "unicode/utf8" + + "gopkg.in/src-d/go-git.v2" + "gopkg.in/src-d/go-git.v2/core" + "gopkg.in/src-d/go-git.v2/diff" + "gopkg.in/src-d/go-git.v2/revlist" +) + +// Blame returns the last commit that modified each line of a file in +// a repository. +// +// The file to blame is identified by the input arguments: repo, commit and path. +// The output is a slice of commits, one for each line in the file. +// +// Blaming a file is a two step process: +// +// 1. Create a linear history of the commits affecting a file. We use +// revlist.New for that. +// +// 2. Then build a graph with a node for every line in every file in +// the history of the file. +// +// Each node (line) holds the commit where it was introduced or +// last modified. To achieve that we use the FORWARD algorithm +// described in Zimmermann, et al. "Mining Version Archives for +// Co-changed Lines", in proceedings of the Mining Software +// Repositories workshop, Shanghai, May 22-23, 2006. +// +// Each node is asigned a commit: Start by the nodes in the first +// commit. Assign that commit as the creator of all its lines. +// +// Then jump to the nodes in the next commit, and calculate the diff +// between the two files. Newly created lines get +// assigned the new commit as its origin. Modified lines also get +// this new commit. Untouched lines retain the old commit. +// +// All this work is done in the assignOrigin function. +// +// This function holds all the internal relevant data in a blame +// struct, that is not exported. +// +// TODO: ways to improve the efficiency of this function: +// +// 1. Improve revlist +// +// 2. Improve how to traverse the history (example a backward +// traversal will be much more efficient) +// +// TODO: ways to improve the functrion in general +// +// 1. Add memoization betweenn revlist and assign. +// +// 2. It is using much more memmory than needed, see the TODOs below. +func Blame(repo *git.Repository, commit *git.Commit, path string) ([]*git.Commit, error) { + // init the internal blame struct + b := new(blame) + b.repo = repo + b.fRev = commit + b.path = path + + // calculte the history of the file and store it in the + // internal blame struct. + var err error + b.revs, err = revlist.New(b.repo, b.fRev, b.path) + if err != nil { + return nil, err + } + sort.Sort(b.revs) // for forward blame, we need the history sorted by commit date + + // allocate space for the data in all the revisions of the file + b.data = make([]string, len(b.revs)) + + // init the graph + b.graph = make([][]vertex, len(b.revs)) + + // for all every revision of the file, starting with the first + // one... + var found bool + for i, rev := range b.revs { + // get the contents of the file + b.data[i], found = git.Data(b.path, rev) + if !found { + continue + } + // count its lines + nLines := git.CountLines(b.data[i]) + // create a node for each line + b.graph[i] = make([]vertex, nLines) + // assign a commit to each node + // if this is the first revision, then the node is assigned to + // this first commit. + if i == 0 { + for j := 0; j < nLines; j++ { + b.graph[i][j] = vertex(b.revs[i]) + } + } else { + // if this is not the first commit, then assign to the old + // commit or to the new one, depending on what the diff + // says. + b.assignOrigin(i, i-1) + } + } + + // fill in the output results: copy the nodes of the last revision + // into the result. + fVs := b.graph[len(b.graph)-1] + result := make([]*git.Commit, 0, len(fVs)) + for _, v := range fVs { + c := git.Commit(*v) + result = append(result, &c) + } + return result, nil +} + +// this struct is internally used by the blame function to hold its +// intputs, outputs and state. +type blame struct { + repo *git.Repository // the repo holding the history of the file to blame + path string // the path of the file to blame + fRev *git.Commit // the commit of the final revision of the file to blame + revs revlist.Revs // the chain of revisions affecting the the file to blame + data []string // the contents on the file in all the revisions TODO: not all data is needed, only the current rev and the prev + graph [][]vertex // the graph of the lines in the file across all the revisions TODO: not all vertexes are needed, only the current rev and the prev +} + +type vertex *git.Commit // a vertex only needs to store the original commit it came from + +// Assigns origin to vertexes in current (c) rev from data in its previous (p) +// revision +func (b *blame) assignOrigin(c, p int) { + // assign origin based on diff info + hunks := diff.Do(b.data[p], b.data[c]) + sl := -1 // source line + dl := -1 // destination line + for h := range hunks { + hLines := git.CountLines(hunks[h].Text) + for hl := 0; hl < hLines; hl++ { + // fmt.Printf("file=%q, rev=%d, r=%d, h=%d, hunk=%v, hunkLine=%d\n", file, rev, r, h, hunks[h], hl) + switch { + case hunks[h].Type == 0: + sl++ + dl++ + b.graph[c][dl] = b.graph[p][sl] + case hunks[h].Type == 1: + dl++ + b.graph[c][dl] = vertex(b.revs[c]) + case hunks[h].Type == -1: + sl++ + default: + panic("unreachable") + } + } + } +} + +// This will print the results of a Blame as in git-blame. +func (b *blame) PrettyPrint() string { + var buf bytes.Buffer + + contents, found := git.Data(b.path, b.fRev) + if !found { + panic("PrettyPrint: internal error in repo.Data") + } + + lines := strings.Split(contents, "\n") + // max line number length + mlnl := len(fmt.Sprintf("%s", strconv.Itoa(len(lines)))) + // max author length + mal := b.maxAuthorLength() + format := fmt.Sprintf("%%s (%%-%ds %%%dd) %%s\n", + mal, mlnl) + + fVs := b.graph[len(b.graph)-1] + for ln, v := range fVs { + fmt.Fprintf(&buf, format, v.Hash.String()[:8], + prettyPrintAuthor(fVs[ln]), ln+1, lines[ln]) + } + return buf.String() +} + +// utility function to pretty print the author. +func prettyPrintAuthor(c *git.Commit) string { + return fmt.Sprintf("%s %s", c.Author.Name, c.Author.When.Format("2006-01-02")) +} + +// utility function to calculate the number of runes needed +// to print the longest author name in the blame of a file. +func (b *blame) maxAuthorLength() int { + memo := make(map[core.Hash]struct{}, len(b.graph)-1) + fVs := b.graph[len(b.graph)-1] + m := 0 + for ln := range fVs { + if _, ok := memo[fVs[ln].Hash]; ok { + continue + } + memo[fVs[ln].Hash] = struct{}{} + m = max(m, utf8.RuneCountInString(prettyPrintAuthor(fVs[ln]))) + } + return m +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} |