From cf532f99e3e7632bc1d813245a4c79ae38b4d320 Mon Sep 17 00:00:00 2001 From: David Symonds Date: Wed, 30 May 2018 11:06:44 +1000 Subject: packfile: improve Index memory representation to be more compact Instead of using a map for offset indexing, use a sorted slice. Binary searching is fast, and a slice is much more compact. This has a negligible hit on speed, but has a significant impact on memory usage, especially for larger repos. benchmark old ns/op new ns/op delta BenchmarkIndexConstruction-12 15506506 14056098 -9.35% benchmark old allocs new allocs delta BenchmarkIndexConstruction-12 60764 60385 -0.62% benchmark old bytes new bytes delta BenchmarkIndexConstruction-12 4318145 3913169 -9.38% Signed-off-by: David Symonds --- plumbing/format/packfile/index.go | 53 +++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'plumbing/format/packfile/index.go') diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go index 2c5f98f..7d8f2ad 100644 --- a/plumbing/format/packfile/index.go +++ b/plumbing/format/packfile/index.go @@ -1,6 +1,8 @@ package packfile import ( + "sort" + "gopkg.in/src-d/go-git.v4/plumbing" "gopkg.in/src-d/go-git.v4/plumbing/format/idxfile" ) @@ -10,7 +12,7 @@ import ( // or to store them. type Index struct { byHash map[plumbing.Hash]*idxfile.Entry - byOffset map[uint64]*idxfile.Entry + byOffset []*idxfile.Entry // sorted by their offset } // NewIndex creates a new empty index with the given size. Size is a hint and @@ -19,7 +21,7 @@ type Index struct { func NewIndex(size int) *Index { return &Index{ byHash: make(map[plumbing.Hash]*idxfile.Entry, size), - byOffset: make(map[uint64]*idxfile.Entry, size), + byOffset: make([]*idxfile.Entry, 0, size), } } @@ -27,28 +29,54 @@ func NewIndex(size int) *Index { func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index { idx := &Index{ byHash: make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount), - byOffset: make(map[uint64]*idxfile.Entry, idxf.ObjectCount), + byOffset: make([]*idxfile.Entry, 0, idxf.ObjectCount), } for _, e := range idxf.Entries { - idx.add(e) + idx.addUnsorted(e) } + sort.Sort(orderByOffset(idx.byOffset)) return idx } +// orderByOffset is a sort.Interface adapter that arranges +// a slice of entries by their offset. +type orderByOffset []*idxfile.Entry + +func (o orderByOffset) Len() int { return len(o) } +func (o orderByOffset) Less(i, j int) bool { return o[i].Offset < o[j].Offset } +func (o orderByOffset) Swap(i, j int) { o[i], o[j] = o[j], o[i] } + // Add adds a new Entry with the given values to the index. func (idx *Index) Add(h plumbing.Hash, offset uint64, crc32 uint32) { - e := idxfile.Entry{ + e := &idxfile.Entry{ Hash: h, Offset: offset, CRC32: crc32, } - idx.add(&e) + idx.byHash[e.Hash] = e + + // Find the right position in byOffset. + // Look for the first position whose offset is *greater* than e.Offset. + i := sort.Search(len(idx.byOffset), func(i int) bool { + return idx.byOffset[i].Offset > offset + }) + if i == len(idx.byOffset) { + // Simple case: add it to the end. + idx.byOffset = append(idx.byOffset, e) + return + } + // Harder case: shift existing entries down by one to make room. + // Append a nil entry first so we can use existing capacity in case + // the index was carefully preallocated. + idx.byOffset = append(idx.byOffset, nil) + copy(idx.byOffset[i+1:], idx.byOffset[i:len(idx.byOffset)-1]) + idx.byOffset[i] = e } -func (idx *Index) add(e *idxfile.Entry) { +func (idx *Index) addUnsorted(e *idxfile.Entry) { idx.byHash[e.Hash] = e - idx.byOffset[e.Offset] = e + idx.byOffset = append(idx.byOffset, e) } // LookupHash looks an entry up by its hash. An idxfile.Entry is returned and @@ -61,8 +89,13 @@ func (idx *Index) LookupHash(h plumbing.Hash) (*idxfile.Entry, bool) { // LookupHash looks an entry up by its offset in the packfile. An idxfile.Entry // is returned and a bool, which is true if it was found or false if it wasn't. func (idx *Index) LookupOffset(offset uint64) (*idxfile.Entry, bool) { - e, ok := idx.byOffset[offset] - return e, ok + i := sort.Search(len(idx.byOffset), func(i int) bool { + return idx.byOffset[i].Offset >= offset + }) + if i >= len(idx.byOffset) || idx.byOffset[i].Offset != offset { + return nil, false // not present + } + return idx.byOffset[i], true } // Size returns the number of entries in the index. -- cgit From 2d9816a5e7daea58a1419fef70bfc8d220ffd6a2 Mon Sep 17 00:00:00 2001 From: David Symonds Date: Thu, 21 Jun 2018 13:24:03 +1000 Subject: packfile: optimise NewIndexFromIdxFile for a very common case Loading from an on-disk idxfile will usually already have the idxfile entries in order, so check that before wasting time on sorting. Signed-off-by: David Symonds --- plumbing/format/packfile/index.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'plumbing/format/packfile/index.go') diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go index 7d8f2ad..021b2d1 100644 --- a/plumbing/format/packfile/index.go +++ b/plumbing/format/packfile/index.go @@ -31,10 +31,20 @@ func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index { byHash: make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount), byOffset: make([]*idxfile.Entry, 0, idxf.ObjectCount), } - for _, e := range idxf.Entries { + sorted := true + for i, e := range idxf.Entries { idx.addUnsorted(e) + if i > 0 && idx.byOffset[i-1].Offset >= e.Offset { + sorted = false + } + } + + // If the idxfile was loaded from a regular packfile index + // then it will already be in offset order, in which case we + // can avoid doing a relatively expensive idempotent sort. + if !sorted { + sort.Sort(orderByOffset(idx.byOffset)) } - sort.Sort(orderByOffset(idx.byOffset)) return idx } -- cgit From 009f1069a1248c1e9189a9e4c342f6d017156ec4 Mon Sep 17 00:00:00 2001 From: Miguel Molina Date: Thu, 19 Jul 2018 15:20:10 +0200 Subject: plumbing/format/idxfile: add new Index and MemoryIndex Signed-off-by: Miguel Molina --- plumbing/format/packfile/index.go | 125 -------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 plumbing/format/packfile/index.go (limited to 'plumbing/format/packfile/index.go') diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go deleted file mode 100644 index 021b2d1..0000000 --- a/plumbing/format/packfile/index.go +++ /dev/null @@ -1,125 +0,0 @@ -package packfile - -import ( - "sort" - - "gopkg.in/src-d/go-git.v4/plumbing" - "gopkg.in/src-d/go-git.v4/plumbing/format/idxfile" -) - -// Index is an in-memory representation of a packfile index. -// This uses idxfile.Idxfile under the hood to obtain indexes from .idx files -// or to store them. -type Index struct { - byHash map[plumbing.Hash]*idxfile.Entry - byOffset []*idxfile.Entry // sorted by their offset -} - -// NewIndex creates a new empty index with the given size. Size is a hint and -// can be 0. It is recommended to set it to the number of objects to be indexed -// if it is known beforehand (e.g. reading from a packfile). -func NewIndex(size int) *Index { - return &Index{ - byHash: make(map[plumbing.Hash]*idxfile.Entry, size), - byOffset: make([]*idxfile.Entry, 0, size), - } -} - -// NewIndexFromIdxFile creates a new Index from an idxfile.IdxFile. -func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index { - idx := &Index{ - byHash: make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount), - byOffset: make([]*idxfile.Entry, 0, idxf.ObjectCount), - } - sorted := true - for i, e := range idxf.Entries { - idx.addUnsorted(e) - if i > 0 && idx.byOffset[i-1].Offset >= e.Offset { - sorted = false - } - } - - // If the idxfile was loaded from a regular packfile index - // then it will already be in offset order, in which case we - // can avoid doing a relatively expensive idempotent sort. - if !sorted { - sort.Sort(orderByOffset(idx.byOffset)) - } - - return idx -} - -// orderByOffset is a sort.Interface adapter that arranges -// a slice of entries by their offset. -type orderByOffset []*idxfile.Entry - -func (o orderByOffset) Len() int { return len(o) } -func (o orderByOffset) Less(i, j int) bool { return o[i].Offset < o[j].Offset } -func (o orderByOffset) Swap(i, j int) { o[i], o[j] = o[j], o[i] } - -// Add adds a new Entry with the given values to the index. -func (idx *Index) Add(h plumbing.Hash, offset uint64, crc32 uint32) { - e := &idxfile.Entry{ - Hash: h, - Offset: offset, - CRC32: crc32, - } - idx.byHash[e.Hash] = e - - // Find the right position in byOffset. - // Look for the first position whose offset is *greater* than e.Offset. - i := sort.Search(len(idx.byOffset), func(i int) bool { - return idx.byOffset[i].Offset > offset - }) - if i == len(idx.byOffset) { - // Simple case: add it to the end. - idx.byOffset = append(idx.byOffset, e) - return - } - // Harder case: shift existing entries down by one to make room. - // Append a nil entry first so we can use existing capacity in case - // the index was carefully preallocated. - idx.byOffset = append(idx.byOffset, nil) - copy(idx.byOffset[i+1:], idx.byOffset[i:len(idx.byOffset)-1]) - idx.byOffset[i] = e -} - -func (idx *Index) addUnsorted(e *idxfile.Entry) { - idx.byHash[e.Hash] = e - idx.byOffset = append(idx.byOffset, e) -} - -// LookupHash looks an entry up by its hash. An idxfile.Entry is returned and -// a bool, which is true if it was found or false if it wasn't. -func (idx *Index) LookupHash(h plumbing.Hash) (*idxfile.Entry, bool) { - e, ok := idx.byHash[h] - return e, ok -} - -// LookupHash looks an entry up by its offset in the packfile. An idxfile.Entry -// is returned and a bool, which is true if it was found or false if it wasn't. -func (idx *Index) LookupOffset(offset uint64) (*idxfile.Entry, bool) { - i := sort.Search(len(idx.byOffset), func(i int) bool { - return idx.byOffset[i].Offset >= offset - }) - if i >= len(idx.byOffset) || idx.byOffset[i].Offset != offset { - return nil, false // not present - } - return idx.byOffset[i], true -} - -// Size returns the number of entries in the index. -func (idx *Index) Size() int { - return len(idx.byHash) -} - -// ToIdxFile converts the index to an idxfile.Idxfile, which can then be used -// to serialize. -func (idx *Index) ToIdxFile() *idxfile.Idxfile { - idxf := idxfile.NewIdxfile() - for _, e := range idx.byHash { - idxf.Entries = append(idxf.Entries, e) - } - - return idxf -} -- cgit