From cf532f99e3e7632bc1d813245a4c79ae38b4d320 Mon Sep 17 00:00:00 2001 From: David Symonds Date: Wed, 30 May 2018 11:06:44 +1000 Subject: packfile: improve Index memory representation to be more compact Instead of using a map for offset indexing, use a sorted slice. Binary searching is fast, and a slice is much more compact. This has a negligible hit on speed, but has a significant impact on memory usage, especially for larger repos. benchmark old ns/op new ns/op delta BenchmarkIndexConstruction-12 15506506 14056098 -9.35% benchmark old allocs new allocs delta BenchmarkIndexConstruction-12 60764 60385 -0.62% benchmark old bytes new bytes delta BenchmarkIndexConstruction-12 4318145 3913169 -9.38% Signed-off-by: David Symonds --- plumbing/format/packfile/index.go | 53 +++++++++++++++++++++++++++------- plumbing/format/packfile/index_test.go | 37 +++++++++++++++--------- 2 files changed, 67 insertions(+), 23 deletions(-) (limited to 'plumbing/format') diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go index 2c5f98f..7d8f2ad 100644 --- a/plumbing/format/packfile/index.go +++ b/plumbing/format/packfile/index.go @@ -1,6 +1,8 @@ package packfile import ( + "sort" + "gopkg.in/src-d/go-git.v4/plumbing" "gopkg.in/src-d/go-git.v4/plumbing/format/idxfile" ) @@ -10,7 +12,7 @@ import ( // or to store them. type Index struct { byHash map[plumbing.Hash]*idxfile.Entry - byOffset map[uint64]*idxfile.Entry + byOffset []*idxfile.Entry // sorted by their offset } // NewIndex creates a new empty index with the given size. Size is a hint and @@ -19,7 +21,7 @@ type Index struct { func NewIndex(size int) *Index { return &Index{ byHash: make(map[plumbing.Hash]*idxfile.Entry, size), - byOffset: make(map[uint64]*idxfile.Entry, size), + byOffset: make([]*idxfile.Entry, 0, size), } } @@ -27,28 +29,54 @@ func NewIndex(size int) *Index { func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index { idx := &Index{ byHash: make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount), - byOffset: make(map[uint64]*idxfile.Entry, idxf.ObjectCount), + byOffset: make([]*idxfile.Entry, 0, idxf.ObjectCount), } for _, e := range idxf.Entries { - idx.add(e) + idx.addUnsorted(e) } + sort.Sort(orderByOffset(idx.byOffset)) return idx } +// orderByOffset is a sort.Interface adapter that arranges +// a slice of entries by their offset. +type orderByOffset []*idxfile.Entry + +func (o orderByOffset) Len() int { return len(o) } +func (o orderByOffset) Less(i, j int) bool { return o[i].Offset < o[j].Offset } +func (o orderByOffset) Swap(i, j int) { o[i], o[j] = o[j], o[i] } + // Add adds a new Entry with the given values to the index. func (idx *Index) Add(h plumbing.Hash, offset uint64, crc32 uint32) { - e := idxfile.Entry{ + e := &idxfile.Entry{ Hash: h, Offset: offset, CRC32: crc32, } - idx.add(&e) + idx.byHash[e.Hash] = e + + // Find the right position in byOffset. + // Look for the first position whose offset is *greater* than e.Offset. + i := sort.Search(len(idx.byOffset), func(i int) bool { + return idx.byOffset[i].Offset > offset + }) + if i == len(idx.byOffset) { + // Simple case: add it to the end. + idx.byOffset = append(idx.byOffset, e) + return + } + // Harder case: shift existing entries down by one to make room. + // Append a nil entry first so we can use existing capacity in case + // the index was carefully preallocated. + idx.byOffset = append(idx.byOffset, nil) + copy(idx.byOffset[i+1:], idx.byOffset[i:len(idx.byOffset)-1]) + idx.byOffset[i] = e } -func (idx *Index) add(e *idxfile.Entry) { +func (idx *Index) addUnsorted(e *idxfile.Entry) { idx.byHash[e.Hash] = e - idx.byOffset[e.Offset] = e + idx.byOffset = append(idx.byOffset, e) } // LookupHash looks an entry up by its hash. An idxfile.Entry is returned and @@ -61,8 +89,13 @@ func (idx *Index) LookupHash(h plumbing.Hash) (*idxfile.Entry, bool) { // LookupHash looks an entry up by its offset in the packfile. An idxfile.Entry // is returned and a bool, which is true if it was found or false if it wasn't. func (idx *Index) LookupOffset(offset uint64) (*idxfile.Entry, bool) { - e, ok := idx.byOffset[offset] - return e, ok + i := sort.Search(len(idx.byOffset), func(i int) bool { + return idx.byOffset[i].Offset >= offset + }) + if i >= len(idx.byOffset) || idx.byOffset[i].Offset != offset { + return nil, false // not present + } + return idx.byOffset[i], true } // Size returns the number of entries in the index. diff --git a/plumbing/format/packfile/index_test.go b/plumbing/format/packfile/index_test.go index 6714704..8de886d 100644 --- a/plumbing/format/packfile/index_test.go +++ b/plumbing/format/packfile/index_test.go @@ -3,6 +3,7 @@ package packfile import ( "strconv" "strings" + "testing" "gopkg.in/src-d/go-git.v4/plumbing" @@ -26,12 +27,12 @@ func (s *IndexSuite) TestLookupOffset(c *C) { e, ok := idx.LookupOffset(uint64(o2)) c.Assert(ok, Equals, true) c.Assert(e, NotNil) - c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Hash, Equals, toHash(o2)) c.Assert(e.Offset, Equals, uint64(o2)) } } - h1 := s.toHash(o1) + h1 := toHash(o1) idx.Add(h1, uint64(o1), 0) for o2 := 0; o2 < 10000; o2 += 100 { @@ -43,7 +44,7 @@ func (s *IndexSuite) TestLookupOffset(c *C) { e, ok := idx.LookupOffset(uint64(o2)) c.Assert(ok, Equals, true) c.Assert(e, NotNil) - c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Hash, Equals, toHash(o2)) c.Assert(e.Offset, Equals, uint64(o2)) } } @@ -56,31 +57,31 @@ func (s *IndexSuite) TestLookupHash(c *C) { for o1 := 0; o1 < 10000; o1 += 100 { for o2 := 0; o2 < 10000; o2 += 100 { if o2 >= o1 { - e, ok := idx.LookupHash(s.toHash(o2)) + e, ok := idx.LookupHash(toHash(o2)) c.Assert(ok, Equals, false) c.Assert(e, IsNil) } else { - e, ok := idx.LookupHash(s.toHash(o2)) + e, ok := idx.LookupHash(toHash(o2)) c.Assert(ok, Equals, true) c.Assert(e, NotNil) - c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Hash, Equals, toHash(o2)) c.Assert(e.Offset, Equals, uint64(o2)) } } - h1 := s.toHash(o1) + h1 := toHash(o1) idx.Add(h1, uint64(o1), 0) for o2 := 0; o2 < 10000; o2 += 100 { if o2 > o1 { - e, ok := idx.LookupHash(s.toHash(o2)) + e, ok := idx.LookupHash(toHash(o2)) c.Assert(ok, Equals, false) c.Assert(e, IsNil) } else { - e, ok := idx.LookupHash(s.toHash(o2)) + e, ok := idx.LookupHash(toHash(o2)) c.Assert(ok, Equals, true) c.Assert(e, NotNil) - c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Hash, Equals, toHash(o2)) c.Assert(e.Offset, Equals, uint64(o2)) } } @@ -92,7 +93,7 @@ func (s *IndexSuite) TestSize(c *C) { for o1 := 0; o1 < 1000; o1++ { c.Assert(idx.Size(), Equals, o1) - h1 := s.toHash(o1) + h1 := toHash(o1) idx.Add(h1, uint64(o1), 0) } } @@ -107,7 +108,7 @@ func (s *IndexSuite) TestIdxFileEmpty(c *C) { func (s *IndexSuite) TestIdxFile(c *C) { idx := NewIndex(0) for o1 := 0; o1 < 1000; o1++ { - h1 := s.toHash(o1) + h1 := toHash(o1) idx.Add(h1, uint64(o1), 0) } @@ -115,8 +116,18 @@ func (s *IndexSuite) TestIdxFile(c *C) { c.Assert(idx, DeepEquals, idx2) } -func (s *IndexSuite) toHash(i int) plumbing.Hash { +func toHash(i int) plumbing.Hash { is := strconv.Itoa(i) padding := strings.Repeat("a", 40-len(is)) return plumbing.NewHash(padding + is) } + +func BenchmarkIndexConstruction(b *testing.B) { + b.ReportAllocs() + + idx := NewIndex(0) + for o := 0; o < 1e6*b.N; o += 100 { + h1 := toHash(o) + idx.Add(h1, uint64(o), 0) + } +} -- cgit