packfile: improve Index memory representation to be more compact

Instead of using a map for offset indexing, use a sorted slice. Binary searching is fast, and a slice is much more compact. This has a negligible hit on speed, but significantly reduces memory usage, especially for larger repos.

benchmark                         old ns/op     new ns/op     delta
BenchmarkIndexConstruction-12     15506506      14056098      -9.35%

benchmark                         old allocs     new allocs     delta
BenchmarkIndexConstruction-12     60764          60385          -0.62%

benchmark                         old bytes     new bytes     delta
BenchmarkIndexConstruction-12     4318145       3913169       -9.38%

Signed-off-by: David Symonds <dsymonds@golang.org>
author: David Symonds <dsymonds@golang.org> 2018-05-30 11:06:44 +1000
committer: David Symonds <dsymonds@golang.org> 2018-05-30 11:34:26 +1000
commit: cf532f99e3e7632bc1d813245a4c79ae38b4d320 (patch)
tree: 702b13e6e462ca39fcfff119c1be949963f32705
parent: 57570e84f8c5739f0f4a59387493e590e709dde9 (diff)
download: go-git-cf532f99e3e7632bc1d813245a4c79ae38b4d320.tar.gz
2 files changed, 67 insertions, 23 deletions
diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go
index 2c5f98f..7d8f2ad 100644
--- a/plumbing/format/packfile/index.go
+++ b/plumbing/format/packfile/index.go
@@ -1,6 +1,8 @@
 package packfile
 
 import (
+	"sort"
+
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/format/idxfile"
 )
@@ -10,7 +12,7 @@ import (
 // or to store them.
 type Index struct {
 	byHash   map[plumbing.Hash]*idxfile.Entry
-	byOffset map[uint64]*idxfile.Entry
+	byOffset []*idxfile.Entry // sorted by their offset
 }
 
 // NewIndex creates a new empty index with the given size. Size is a hint and
@@ -19,7 +21,7 @@ type Index struct {
 func NewIndex(size int) *Index {
 	return &Index{
 		byHash:   make(map[plumbing.Hash]*idxfile.Entry, size),
-		byOffset: make(map[uint64]*idxfile.Entry, size),
+		byOffset: make([]*idxfile.Entry, 0, size),
 	}
 }
 
@@ -27,28 +29,54 @@ func NewIndex(size int) *Index {
 func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index {
 	idx := &Index{
 		byHash:   make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount),
-		byOffset: make(map[uint64]*idxfile.Entry, idxf.ObjectCount),
+		byOffset: make([]*idxfile.Entry, 0, idxf.ObjectCount),
 	}
 	for _, e := range idxf.Entries {
-		idx.add(e)
+		idx.addUnsorted(e)
 	}
+	sort.Sort(orderByOffset(idx.byOffset))
 
 	return idx
 }
 
+// orderByOffset is a sort.Interface adapter that arranges
+// a slice of entries by their offset.
+type orderByOffset []*idxfile.Entry
+
+func (o orderByOffset) Len() int           { return len(o) }
+func (o orderByOffset) Less(i, j int) bool { return o[i].Offset < o[j].Offset }
+func (o orderByOffset) Swap(i, j int)      { o[i], o[j] = o[j], o[i] }
+
 // Add adds a new Entry with the given values to the index.
 func (idx *Index) Add(h plumbing.Hash, offset uint64, crc32 uint32) {
-	e := idxfile.Entry{
+	e := &idxfile.Entry{
 		Hash:   h,
 		Offset: offset,
 		CRC32:  crc32,
 	}
-	idx.add(&e)
+	idx.byHash[e.Hash] = e
+
+	// Find the right position in byOffset.
+	// Look for the first position whose offset is *greater* than e.Offset.
+	i := sort.Search(len(idx.byOffset), func(i int) bool {
+		return idx.byOffset[i].Offset > offset
+	})
+	if i == len(idx.byOffset) {
+		// Simple case: add it to the end.
+		idx.byOffset = append(idx.byOffset, e)
+		return
+	}
+	// Harder case: shift existing entries down by one to make room.
+	// Append a nil entry first so we can use existing capacity in case
+	// the index was carefully preallocated.
+	idx.byOffset = append(idx.byOffset, nil)
+	copy(idx.byOffset[i+1:], idx.byOffset[i:len(idx.byOffset)-1])
+	idx.byOffset[i] = e
 }
 
-func (idx *Index) add(e *idxfile.Entry) {
+func (idx *Index) addUnsorted(e *idxfile.Entry) {
 	idx.byHash[e.Hash] = e
-	idx.byOffset[e.Offset] = e
+	idx.byOffset = append(idx.byOffset, e)
 }
 
 // LookupHash looks an entry up by its hash. An idxfile.Entry is returned and
@@ -61,8 +89,13 @@ func (idx *Index) LookupHash(h plumbing.Hash) (*idxfile.Entry, bool) {
 // LookupHash looks an entry up by its offset in the packfile. An idxfile.Entry
 // is returned and a bool, which is true if it was found or false if it wasn't.
 func (idx *Index) LookupOffset(offset uint64) (*idxfile.Entry, bool) {
-	e, ok := idx.byOffset[offset]
-	return e, ok
+	i := sort.Search(len(idx.byOffset), func(i int) bool {
+		return idx.byOffset[i].Offset >= offset
+	})
+	if i >= len(idx.byOffset) || idx.byOffset[i].Offset != offset {
+		return nil, false // not present
+	}
+	return idx.byOffset[i], true
 }
 
 // Size returns the number of entries in the index.
diff --git a/plumbing/format/packfile/index_test.go b/plumbing/format/packfile/index_test.go
index 6714704..8de886d 100644
--- a/plumbing/format/packfile/index_test.go
+++ b/plumbing/format/packfile/index_test.go
@@ -3,6 +3,7 @@ package packfile
 import (
 	"strconv"
 	"strings"
+	"testing"
 
 	"gopkg.in/src-d/go-git.v4/plumbing"
 
@@ -26,12 +27,12 @@ func (s *IndexSuite) TestLookupOffset(c *C) {
 				e, ok := idx.LookupOffset(uint64(o2))
 				c.Assert(ok, Equals, true)
 				c.Assert(e, NotNil)
-				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Hash, Equals, toHash(o2))
 				c.Assert(e.Offset, Equals, uint64(o2))
 			}
 		}
 
-		h1 := s.toHash(o1)
+		h1 := toHash(o1)
 		idx.Add(h1, uint64(o1), 0)
 
 		for o2 := 0; o2 < 10000; o2 += 100 {
@@ -43,7 +44,7 @@ func (s *IndexSuite) TestLookupOffset(c *C) {
 				e, ok := idx.LookupOffset(uint64(o2))
 				c.Assert(ok, Equals, true)
 				c.Assert(e, NotNil)
-				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Hash, Equals, toHash(o2))
 				c.Assert(e.Offset, Equals, uint64(o2))
 			}
 		}
@@ -56,31 +57,31 @@ func (s *IndexSuite) TestLookupHash(c *C) {
 	for o1 := 0; o1 < 10000; o1 += 100 {
 		for o2 := 0; o2 < 10000; o2 += 100 {
 			if o2 >= o1 {
-				e, ok := idx.LookupHash(s.toHash(o2))
+				e, ok := idx.LookupHash(toHash(o2))
 				c.Assert(ok, Equals, false)
 				c.Assert(e, IsNil)
 			} else {
-				e, ok := idx.LookupHash(s.toHash(o2))
+				e, ok := idx.LookupHash(toHash(o2))
 				c.Assert(ok, Equals, true)
 				c.Assert(e, NotNil)
-				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Hash, Equals, toHash(o2))
 				c.Assert(e.Offset, Equals, uint64(o2))
 			}
 		}
 
-		h1 := s.toHash(o1)
+		h1 := toHash(o1)
 		idx.Add(h1, uint64(o1), 0)
 
 		for o2 := 0; o2 < 10000; o2 += 100 {
 			if o2 > o1 {
-				e, ok := idx.LookupHash(s.toHash(o2))
+				e, ok := idx.LookupHash(toHash(o2))
 				c.Assert(ok, Equals, false)
 				c.Assert(e, IsNil)
 			} else {
-				e, ok := idx.LookupHash(s.toHash(o2))
+				e, ok := idx.LookupHash(toHash(o2))
 				c.Assert(ok, Equals, true)
 				c.Assert(e, NotNil)
-				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Hash, Equals, toHash(o2))
 				c.Assert(e.Offset, Equals, uint64(o2))
 			}
 		}
@@ -92,7 +93,7 @@ func (s *IndexSuite) TestSize(c *C) {
 
 	for o1 := 0; o1 < 1000; o1++ {
 		c.Assert(idx.Size(), Equals, o1)
-		h1 := s.toHash(o1)
+		h1 := toHash(o1)
 		idx.Add(h1, uint64(o1), 0)
 	}
 }
@@ -107,7 +108,7 @@ func (s *IndexSuite) TestIdxFileEmpty(c *C) {
 func (s *IndexSuite) TestIdxFile(c *C) {
 	idx := NewIndex(0)
 	for o1 := 0; o1 < 1000; o1++ {
-		h1 := s.toHash(o1)
+		h1 := toHash(o1)
 		idx.Add(h1, uint64(o1), 0)
 	}
 
@@ -115,8 +116,18 @@ func (s *IndexSuite) TestIdxFile(c *C) {
 	c.Assert(idx, DeepEquals, idx2)
 }
 
-func (s *IndexSuite) toHash(i int) plumbing.Hash {
+func toHash(i int) plumbing.Hash {
 	is := strconv.Itoa(i)
 	padding := strings.Repeat("a", 40-len(is))
 	return plumbing.NewHash(padding + is)
 }
+
+func BenchmarkIndexConstruction(b *testing.B) {
+	b.ReportAllocs()
+
+	idx := NewIndex(0)
+	for o := 0; o < 1e6*b.N; o += 100 {
+		h1 := toHash(o)
+		idx.Add(h1, uint64(o), 0)
+	}
+}
author	David Symonds <dsymonds@golang.org>	2018-05-30 11:06:44 +1000
committer	David Symonds <dsymonds@golang.org>	2018-05-30 11:34:26 +1000
commit	cf532f99e3e7632bc1d813245a4c79ae38b4d320 (patch)
tree	702b13e6e462ca39fcfff119c1be949963f32705
parent	57570e84f8c5739f0f4a59387493e590e709dde9 (diff)
download	go-git-cf532f99e3e7632bc1d813245a4c79ae38b4d320.tar.gz