aboutsummaryrefslogtreecommitdiffstats
path: root/blame/blame.go
diff options
context:
space:
mode:
authorAlberto Cortés <alberto@sourced.tech>2015-11-16 03:20:01 +0100
committerAlberto Cortés <alberto@sourced.tech>2015-11-25 11:09:51 +0100
commitd643cea1e8a6d618b2eddfdbed086c7bdf208658 (patch)
treeb862c72ccb674910d24eac2ab424a7fb5ea3f0fb /blame/blame.go
parentcaab43e7f4ee10a15b2af826485b688473b34356 (diff)
downloadgo-git-d643cea1e8a6d618b2eddfdbed086c7bdf208658.tar.gz
Blame support for files
This also includes a diff package and revlist package (needed by blame) Some extra packfiles (<1MB) are also included, to be used as fixtures in the tests.
Diffstat (limited to 'blame/blame.go')
-rw-r--r--blame/blame.go223
1 files changed, 223 insertions, 0 deletions
diff --git a/blame/blame.go b/blame/blame.go
new file mode 100644
index 0000000..774362b
--- /dev/null
+++ b/blame/blame.go
@@ -0,0 +1,223 @@
+// Package blame contains blaming functionality for files in the repo.
+//
+// Blaming a file is finding what commit was the last to modify each of
+// the lines in the file, therefore the output of a blaming operation is
+// usualy a slice of commits, one commit per line in the file.
+//
+// This package also provides a pretty print function to output the
+// results of a blame in a similar format to the git-blame command.
+package blame
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "strconv"
+ "strings"
+ "unicode/utf8"
+
+ "gopkg.in/src-d/go-git.v2"
+ "gopkg.in/src-d/go-git.v2/core"
+ "gopkg.in/src-d/go-git.v2/diff"
+ "gopkg.in/src-d/go-git.v2/revlist"
+)
+
+// Blame returns the last commit that modified each line of a file in
+// a repository.
+//
+// The file to blame is identified by the input arguments: repo, commit and path.
+// The output is a slice of commits, one for each line in the file.
+//
+// Blaming a file is a two step process:
+//
+// 1. Create a linear history of the commits affecting a file. We use
+// revlist.New for that.
+//
+// 2. Then build a graph with a node for every line in every file in
+// the history of the file.
+//
+// Each node (line) holds the commit where it was introduced or
+// last modified. To achieve that we use the FORWARD algorithm
+// described in Zimmermann, et al. "Mining Version Archives for
+// Co-changed Lines", in proceedings of the Mining Software
+// Repositories workshop, Shanghai, May 22-23, 2006.
+//
+// Each node is asigned a commit: Start by the nodes in the first
+// commit. Assign that commit as the creator of all its lines.
+//
+// Then jump to the nodes in the next commit, and calculate the diff
+// between the two files. Newly created lines get
+// assigned the new commit as its origin. Modified lines also get
+// this new commit. Untouched lines retain the old commit.
+//
+// All this work is done in the assignOrigin function.
+//
+// This function holds all the internal relevant data in a blame
+// struct, that is not exported.
+//
+// TODO: ways to improve the efficiency of this function:
+//
+// 1. Improve revlist
+//
+// 2. Improve how to traverse the history (example a backward
+// traversal will be much more efficient)
+//
+// TODO: ways to improve the functrion in general
+//
+// 1. Add memoization betweenn revlist and assign.
+//
+// 2. It is using much more memmory than needed, see the TODOs below.
+func Blame(repo *git.Repository, commit *git.Commit, path string) ([]*git.Commit, error) {
+ // init the internal blame struct
+ b := new(blame)
+ b.repo = repo
+ b.fRev = commit
+ b.path = path
+
+ // calculte the history of the file and store it in the
+ // internal blame struct.
+ var err error
+ b.revs, err = revlist.New(b.repo, b.fRev, b.path)
+ if err != nil {
+ return nil, err
+ }
+ sort.Sort(b.revs) // for forward blame, we need the history sorted by commit date
+
+ // allocate space for the data in all the revisions of the file
+ b.data = make([]string, len(b.revs))
+
+ // init the graph
+ b.graph = make([][]vertex, len(b.revs))
+
+ // for all every revision of the file, starting with the first
+ // one...
+ var found bool
+ for i, rev := range b.revs {
+ // get the contents of the file
+ b.data[i], found = git.Data(b.path, rev)
+ if !found {
+ continue
+ }
+ // count its lines
+ nLines := git.CountLines(b.data[i])
+ // create a node for each line
+ b.graph[i] = make([]vertex, nLines)
+ // assign a commit to each node
+ // if this is the first revision, then the node is assigned to
+ // this first commit.
+ if i == 0 {
+ for j := 0; j < nLines; j++ {
+ b.graph[i][j] = vertex(b.revs[i])
+ }
+ } else {
+ // if this is not the first commit, then assign to the old
+ // commit or to the new one, depending on what the diff
+ // says.
+ b.assignOrigin(i, i-1)
+ }
+ }
+
+ // fill in the output results: copy the nodes of the last revision
+ // into the result.
+ fVs := b.graph[len(b.graph)-1]
+ result := make([]*git.Commit, 0, len(fVs))
+ for _, v := range fVs {
+ c := git.Commit(*v)
+ result = append(result, &c)
+ }
+ return result, nil
+}
+
+// this struct is internally used by the blame function to hold its
+// intputs, outputs and state.
+type blame struct {
+ repo *git.Repository // the repo holding the history of the file to blame
+ path string // the path of the file to blame
+ fRev *git.Commit // the commit of the final revision of the file to blame
+ revs revlist.Revs // the chain of revisions affecting the the file to blame
+ data []string // the contents on the file in all the revisions TODO: not all data is needed, only the current rev and the prev
+ graph [][]vertex // the graph of the lines in the file across all the revisions TODO: not all vertexes are needed, only the current rev and the prev
+}
+
+type vertex *git.Commit // a vertex only needs to store the original commit it came from
+
+// Assigns origin to vertexes in current (c) rev from data in its previous (p)
+// revision
+func (b *blame) assignOrigin(c, p int) {
+ // assign origin based on diff info
+ hunks := diff.Do(b.data[p], b.data[c])
+ sl := -1 // source line
+ dl := -1 // destination line
+ for h := range hunks {
+ hLines := git.CountLines(hunks[h].Text)
+ for hl := 0; hl < hLines; hl++ {
+ // fmt.Printf("file=%q, rev=%d, r=%d, h=%d, hunk=%v, hunkLine=%d\n", file, rev, r, h, hunks[h], hl)
+ switch {
+ case hunks[h].Type == 0:
+ sl++
+ dl++
+ b.graph[c][dl] = b.graph[p][sl]
+ case hunks[h].Type == 1:
+ dl++
+ b.graph[c][dl] = vertex(b.revs[c])
+ case hunks[h].Type == -1:
+ sl++
+ default:
+ panic("unreachable")
+ }
+ }
+ }
+}
+
+// This will print the results of a Blame as in git-blame.
+func (b *blame) PrettyPrint() string {
+ var buf bytes.Buffer
+
+ contents, found := git.Data(b.path, b.fRev)
+ if !found {
+ panic("PrettyPrint: internal error in repo.Data")
+ }
+
+ lines := strings.Split(contents, "\n")
+ // max line number length
+ mlnl := len(fmt.Sprintf("%s", strconv.Itoa(len(lines))))
+ // max author length
+ mal := b.maxAuthorLength()
+ format := fmt.Sprintf("%%s (%%-%ds %%%dd) %%s\n",
+ mal, mlnl)
+
+ fVs := b.graph[len(b.graph)-1]
+ for ln, v := range fVs {
+ fmt.Fprintf(&buf, format, v.Hash.String()[:8],
+ prettyPrintAuthor(fVs[ln]), ln+1, lines[ln])
+ }
+ return buf.String()
+}
+
+// utility function to pretty print the author.
+func prettyPrintAuthor(c *git.Commit) string {
+ return fmt.Sprintf("%s %s", c.Author.Name, c.Author.When.Format("2006-01-02"))
+}
+
+// utility function to calculate the number of runes needed
+// to print the longest author name in the blame of a file.
+func (b *blame) maxAuthorLength() int {
+ memo := make(map[core.Hash]struct{}, len(b.graph)-1)
+ fVs := b.graph[len(b.graph)-1]
+ m := 0
+ for ln := range fVs {
+ if _, ok := memo[fVs[ln].Hash]; ok {
+ continue
+ }
+ memo[fVs[ln].Hash] = struct{}{}
+ m = max(m, utf8.RuneCountInString(prettyPrintAuthor(fVs[ln])))
+ }
+ return m
+}
+
+func max(a, b int) int {
+ if a > b {
+ return a
+ }
+ return b
+}