From c64eb817d5e5cbaec10dea1342e1ec95721e886b Mon Sep 17 00:00:00 2001 From: "Santiago M. Mola" Date: Tue, 25 Jul 2017 10:08:36 +0200 Subject: packfile: create packfile.Index and reuse it There was an internal type (i.e. storage/filesystem.idx) to use as in-memory index for packfiles. This was not convenient to reuse in the packfile. This commit creates a new representation (format/packfile.Index) that can be converted to and from idxfile.Idxfile. A packfile.Index now contains the functionality that was scattered on storage/filesystem.idx and packfile.Decoder's internals. storage/filesystem now reuses packfile.Index instances and this also results in higher cache hit ratios when resolving deltas. --- plumbing/format/packfile/decoder.go | 104 +++++++++++++------------- plumbing/format/packfile/decoder_test.go | 53 ++++++-------- plumbing/format/packfile/index.go | 82 +++++++++++++++++++++ plumbing/format/packfile/index_test.go | 122 +++++++++++++++++++++++++++++++ 4 files changed, 279 insertions(+), 82 deletions(-) create mode 100644 plumbing/format/packfile/index.go create mode 100644 plumbing/format/packfile/index_test.go (limited to 'plumbing/format/packfile') diff --git a/plumbing/format/packfile/decoder.go b/plumbing/format/packfile/decoder.go index 21ddbbf..39680a3 100644 --- a/plumbing/format/packfile/decoder.go +++ b/plumbing/format/packfile/decoder.go @@ -56,10 +56,12 @@ type Decoder struct { o storer.EncodedObjectStorer tx storer.Transaction - isDecoded bool - offsetToHash map[int64]plumbing.Hash - hashToOffset map[plumbing.Hash]int64 - crcs map[plumbing.Hash]uint32 + isDecoded bool + + // hasBuiltIndex indicates if the index is fully built or not. If it is not, + // will be built incrementally while decoding. + hasBuiltIndex bool + idx *Index offsetToType map[int64]plumbing.ObjectType decoderType plumbing.ObjectType @@ -102,10 +104,7 @@ func NewDecoderForType(s *Scanner, o storer.EncodedObjectStorer, s: s, o: o, - offsetToHash: make(map[int64]plumbing.Hash, 0), - hashToOffset: make(map[plumbing.Hash]int64, 0), - crcs: make(map[plumbing.Hash]uint32, 0), - + idx: NewIndex(0), offsetToType: make(map[int64]plumbing.ObjectType, 0), decoderType: t, @@ -139,6 +138,11 @@ func (d *Decoder) doDecode() error { return err } + if !d.hasBuiltIndex { + d.idx = NewIndex(int(count)) + } + defer func() { d.hasBuiltIndex = true }() + _, isTxStorer := d.o.(storer.Transactioner) switch { case d.o == nil: @@ -218,13 +222,22 @@ func (d *Decoder) DecodeObject() (plumbing.EncodedObject, error) { } func (d *Decoder) decodeIfSpecificType(h *ObjectHeader) (plumbing.EncodedObject, error) { - var realType plumbing.ObjectType - var err error + var ( + obj plumbing.EncodedObject + realType plumbing.ObjectType + err error + ) switch h.Type { case plumbing.OFSDeltaObject: realType, err = d.ofsDeltaType(h.OffsetReference) case plumbing.REFDeltaObject: realType, err = d.refDeltaType(h.Reference) + if err == plumbing.ErrObjectNotFound { + obj, err = d.decodeByHeader(h) + if err != nil { + realType = obj.Type() + } + } default: realType = h.Type } @@ -236,6 +249,10 @@ func (d *Decoder) decodeIfSpecificType(h *ObjectHeader) (plumbing.EncodedObject, d.offsetToType[h.Offset] = realType if d.decoderType == realType { + if obj != nil { + return obj, nil + } + return d.decodeByHeader(h) } @@ -252,16 +269,12 @@ func (d *Decoder) ofsDeltaType(offset int64) (plumbing.ObjectType, error) { } func (d *Decoder) refDeltaType(ref plumbing.Hash) (plumbing.ObjectType, error) { - if o, ok := d.hashToOffset[ref]; ok { - return d.ofsDeltaType(o) - } - - obj, err := d.o.EncodedObject(plumbing.AnyObject, ref) - if err != nil { - return plumbing.InvalidObject, err + e, ok := d.idx.LookupHash(ref) + if !ok { + return plumbing.InvalidObject, plumbing.ErrObjectNotFound } - return obj.Type(), nil + return d.ofsDeltaType(int64(e.Offset)) } func (d *Decoder) decodeByHeader(h *ObjectHeader) (plumbing.EncodedObject, error) { @@ -285,9 +298,9 @@ func (d *Decoder) decodeByHeader(h *ObjectHeader) (plumbing.EncodedObject, error return obj, err } - hash := obj.Hash() - d.setOffset(hash, h.Offset) - d.setCRC(hash, crc) + if !d.hasBuiltIndex { + d.idx.Add(obj.Hash(), uint64(h.Offset), crc) + } return obj, nil } @@ -365,10 +378,10 @@ func (d *Decoder) fillOFSDeltaObjectContent(obj plumbing.EncodedObject, offset i return 0, err } - h := d.offsetToHash[offset] + e, ok := d.idx.LookupOffset(uint64(offset)) var base plumbing.EncodedObject - if h != plumbing.ZeroHash { - base = d.cache.Get(h) + if ok { + base = d.cache.Get(e.Hash) } if base == nil { @@ -385,22 +398,13 @@ func (d *Decoder) fillOFSDeltaObjectContent(obj plumbing.EncodedObject, offset i return crc, err } -func (d *Decoder) setOffset(h plumbing.Hash, offset int64) { - d.offsetToHash[offset] = h - d.hashToOffset[h] = offset -} - -func (d *Decoder) setCRC(h plumbing.Hash, crc uint32) { - d.crcs[h] = crc -} - func (d *Decoder) recallByOffset(o int64) (plumbing.EncodedObject, error) { if d.s.IsSeekable { return d.DecodeObjectAt(o) } - if h, ok := d.offsetToHash[o]; ok { - return d.recallByHashNonSeekable(h) + if e, ok := d.idx.LookupOffset(uint64(o)); ok { + return d.recallByHashNonSeekable(e.Hash) } return nil, plumbing.ErrObjectNotFound @@ -408,8 +412,8 @@ func (d *Decoder) recallByOffset(o int64) (plumbing.EncodedObject, error) { func (d *Decoder) recallByHash(h plumbing.Hash) (plumbing.EncodedObject, error) { if d.s.IsSeekable { - if o, ok := d.hashToOffset[h]; ok { - return d.DecodeObjectAt(o) + if e, ok := d.idx.LookupHash(h); ok { + return d.DecodeObjectAt(int64(e.Offset)) } } @@ -432,22 +436,20 @@ func (d *Decoder) recallByHashNonSeekable(h plumbing.Hash) (obj plumbing.Encoded return nil, plumbing.ErrObjectNotFound } -// SetOffsets sets the offsets, required when using the method DecodeObjectAt, -// without decoding the full packfile -func (d *Decoder) SetOffsets(offsets map[plumbing.Hash]int64) { - d.hashToOffset = offsets -} - -// Offsets returns the objects read offset, Decode method should be called -// before to calculate the Offsets -func (d *Decoder) Offsets() map[plumbing.Hash]int64 { - return d.hashToOffset +// SetIndex sets an index for the packfile. It is recommended to set this. +// The index might be read from a file or reused from a previous Decoder usage +// (see Index function). +func (d *Decoder) SetIndex(idx *Index) { + d.hasBuiltIndex = true + d.idx = idx } -// CRCs returns the CRC-32 for each read object. Decode method should be called -// before to calculate the CRCs -func (d *Decoder) CRCs() map[plumbing.Hash]uint32 { - return d.crcs +// Index returns the index for the packfile. If index was set with SetIndex, +// Index will return it. Otherwise, it will return an index that is built while +// decoding. If neither SetIndex was called with a full index or Decode called +// for the whole packfile, then the returned index will be incomplete. +func (d *Decoder) Index() *Index { + return d.idx } // Close closes the Scanner. usually this mean that the whole reader is read and diff --git a/plumbing/format/packfile/decoder_test.go b/plumbing/format/packfile/decoder_test.go index f1e2ed7..ecf7c81 100644 --- a/plumbing/format/packfile/decoder_test.go +++ b/plumbing/format/packfile/decoder_test.go @@ -55,14 +55,8 @@ func (s *ReaderSuite) TestDecodeByTypeRefDelta(c *C) { d, err := packfile.NewDecoderForType(scanner, storage, plumbing.CommitObject) c.Assert(err, IsNil) - // Specific offset elements needed to decode correctly the ref-delta - offsets := map[plumbing.Hash]int64{ - plumbing.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"): 84880, - plumbing.NewHash("fb72698cab7617ac416264415f13224dfd7a165e"): 85141, - plumbing.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021"): 85300, - } - - d.SetOffsets(offsets) + // Index required to decode by ref-delta. + d.SetIndex(getIndexFromIdxFile(f.Idx())) defer d.Close() @@ -123,7 +117,7 @@ func (s *ReaderSuite) TestDecodeByType(c *C) { // when the packfile is ref-delta based, the offsets are required if f.Is("ref-delta") { - d.SetOffsets(getOffsetsFromIdx(f.Idx())) + d.SetIndex(getIndexFromIdxFile(f.Idx())) } defer d.Close() @@ -291,8 +285,9 @@ func (s *ReaderSuite) TestDecodeCRCs(c *C) { c.Assert(err, IsNil) var sum uint64 - for _, crc := range d.CRCs() { - sum += uint64(crc) + idx := d.Index().ToIdxFile() + for _, e := range idx.Entries { + sum += uint64(e.CRC32) } c.Assert(int(sum), Equals, 78022211966) @@ -306,8 +301,7 @@ func (s *ReaderSuite) TestReadObjectAt(c *C) { // when the packfile is ref-delta based, the offsets are required if f.Is("ref-delta") { - offsets := getOffsetsFromIdx(f.Idx()) - d.SetOffsets(offsets) + d.SetIndex(getIndexFromIdxFile(f.Idx())) } // the objects at reference 186, is a delta, so should be recall, @@ -317,32 +311,34 @@ func (s *ReaderSuite) TestReadObjectAt(c *C) { c.Assert(obj.Hash().String(), Equals, "6ecf0ef2c2dffb796033e5a02219af86ec6584e5") } -func (s *ReaderSuite) TestOffsets(c *C) { +func (s *ReaderSuite) TestIndex(c *C) { f := fixtures.Basic().One() scanner := packfile.NewScanner(f.Packfile()) d, err := packfile.NewDecoder(scanner, nil) c.Assert(err, IsNil) - c.Assert(d.Offsets(), HasLen, 0) + c.Assert(d.Index().ToIdxFile().Entries, HasLen, 0) _, err = d.Decode() c.Assert(err, IsNil) - c.Assert(d.Offsets(), HasLen, 31) + c.Assert(len(d.Index().ToIdxFile().Entries), Equals, 31) } -func (s *ReaderSuite) TestSetOffsets(c *C) { +func (s *ReaderSuite) TestSetIndex(c *C) { f := fixtures.Basic().One() scanner := packfile.NewScanner(f.Packfile()) d, err := packfile.NewDecoder(scanner, nil) c.Assert(err, IsNil) + idx := packfile.NewIndex(1) h := plumbing.NewHash("6ecf0ef2c2dffb796033e5a02219af86ec6584e5") - d.SetOffsets(map[plumbing.Hash]int64{h: 42}) + idx.Add(h, uint64(42), 0) + d.SetIndex(idx) - o := d.Offsets() - c.Assert(o, HasLen, 1) - c.Assert(o[h], Equals, int64(42)) + idxf := d.Index().ToIdxFile() + c.Assert(idxf.Entries, HasLen, 1) + c.Assert(idxf.Entries[0].Offset, Equals, uint64(42)) } func assertObjects(c *C, s storer.EncodedObjectStorer, expects []string) { @@ -362,17 +358,12 @@ func assertObjects(c *C, s storer.EncodedObjectStorer, expects []string) { } } -func getOffsetsFromIdx(r io.Reader) map[plumbing.Hash]int64 { - idx := &idxfile.Idxfile{} - err := idxfile.NewDecoder(r).Decode(idx) - if err != nil { +func getIndexFromIdxFile(r io.Reader) *packfile.Index { + idxf := idxfile.NewIdxfile() + d := idxfile.NewDecoder(r) + if err := d.Decode(idxf); err != nil { panic(err) } - offsets := make(map[plumbing.Hash]int64) - for _, e := range idx.Entries { - offsets[e.Hash] = int64(e.Offset) - } - - return offsets + return packfile.NewIndexFromIdxFile(idxf) } diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go new file mode 100644 index 0000000..2c5f98f --- /dev/null +++ b/plumbing/format/packfile/index.go @@ -0,0 +1,82 @@ +package packfile + +import ( + "gopkg.in/src-d/go-git.v4/plumbing" + "gopkg.in/src-d/go-git.v4/plumbing/format/idxfile" +) + +// Index is an in-memory representation of a packfile index. +// This uses idxfile.Idxfile under the hood to obtain indexes from .idx files +// or to store them. +type Index struct { + byHash map[plumbing.Hash]*idxfile.Entry + byOffset map[uint64]*idxfile.Entry +} + +// NewIndex creates a new empty index with the given size. Size is a hint and +// can be 0. It is recommended to set it to the number of objects to be indexed +// if it is known beforehand (e.g. reading from a packfile). +func NewIndex(size int) *Index { + return &Index{ + byHash: make(map[plumbing.Hash]*idxfile.Entry, size), + byOffset: make(map[uint64]*idxfile.Entry, size), + } +} + +// NewIndexFromIdxFile creates a new Index from an idxfile.IdxFile. +func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index { + idx := &Index{ + byHash: make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount), + byOffset: make(map[uint64]*idxfile.Entry, idxf.ObjectCount), + } + for _, e := range idxf.Entries { + idx.add(e) + } + + return idx +} + +// Add adds a new Entry with the given values to the index. +func (idx *Index) Add(h plumbing.Hash, offset uint64, crc32 uint32) { + e := idxfile.Entry{ + Hash: h, + Offset: offset, + CRC32: crc32, + } + idx.add(&e) +} + +func (idx *Index) add(e *idxfile.Entry) { + idx.byHash[e.Hash] = e + idx.byOffset[e.Offset] = e +} + +// LookupHash looks an entry up by its hash. An idxfile.Entry is returned and +// a bool, which is true if it was found or false if it wasn't. +func (idx *Index) LookupHash(h plumbing.Hash) (*idxfile.Entry, bool) { + e, ok := idx.byHash[h] + return e, ok +} + +// LookupHash looks an entry up by its offset in the packfile. An idxfile.Entry +// is returned and a bool, which is true if it was found or false if it wasn't. +func (idx *Index) LookupOffset(offset uint64) (*idxfile.Entry, bool) { + e, ok := idx.byOffset[offset] + return e, ok +} + +// Size returns the number of entries in the index. +func (idx *Index) Size() int { + return len(idx.byHash) +} + +// ToIdxFile converts the index to an idxfile.Idxfile, which can then be used +// to serialize. +func (idx *Index) ToIdxFile() *idxfile.Idxfile { + idxf := idxfile.NewIdxfile() + for _, e := range idx.byHash { + idxf.Entries = append(idxf.Entries, e) + } + + return idxf +} diff --git a/plumbing/format/packfile/index_test.go b/plumbing/format/packfile/index_test.go new file mode 100644 index 0000000..6714704 --- /dev/null +++ b/plumbing/format/packfile/index_test.go @@ -0,0 +1,122 @@ +package packfile + +import ( + "strconv" + "strings" + + "gopkg.in/src-d/go-git.v4/plumbing" + + . "gopkg.in/check.v1" +) + +type IndexSuite struct{} + +var _ = Suite(&IndexSuite{}) + +func (s *IndexSuite) TestLookupOffset(c *C) { + idx := NewIndex(0) + + for o1 := 0; o1 < 10000; o1 += 100 { + for o2 := 0; o2 < 10000; o2 += 100 { + if o2 >= o1 { + e, ok := idx.LookupOffset(uint64(o2)) + c.Assert(ok, Equals, false) + c.Assert(e, IsNil) + } else { + e, ok := idx.LookupOffset(uint64(o2)) + c.Assert(ok, Equals, true) + c.Assert(e, NotNil) + c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Offset, Equals, uint64(o2)) + } + } + + h1 := s.toHash(o1) + idx.Add(h1, uint64(o1), 0) + + for o2 := 0; o2 < 10000; o2 += 100 { + if o2 > o1 { + e, ok := idx.LookupOffset(uint64(o2)) + c.Assert(ok, Equals, false) + c.Assert(e, IsNil) + } else { + e, ok := idx.LookupOffset(uint64(o2)) + c.Assert(ok, Equals, true) + c.Assert(e, NotNil) + c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Offset, Equals, uint64(o2)) + } + } + } +} + +func (s *IndexSuite) TestLookupHash(c *C) { + idx := NewIndex(0) + + for o1 := 0; o1 < 10000; o1 += 100 { + for o2 := 0; o2 < 10000; o2 += 100 { + if o2 >= o1 { + e, ok := idx.LookupHash(s.toHash(o2)) + c.Assert(ok, Equals, false) + c.Assert(e, IsNil) + } else { + e, ok := idx.LookupHash(s.toHash(o2)) + c.Assert(ok, Equals, true) + c.Assert(e, NotNil) + c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Offset, Equals, uint64(o2)) + } + } + + h1 := s.toHash(o1) + idx.Add(h1, uint64(o1), 0) + + for o2 := 0; o2 < 10000; o2 += 100 { + if o2 > o1 { + e, ok := idx.LookupHash(s.toHash(o2)) + c.Assert(ok, Equals, false) + c.Assert(e, IsNil) + } else { + e, ok := idx.LookupHash(s.toHash(o2)) + c.Assert(ok, Equals, true) + c.Assert(e, NotNil) + c.Assert(e.Hash, Equals, s.toHash(o2)) + c.Assert(e.Offset, Equals, uint64(o2)) + } + } + } +} + +func (s *IndexSuite) TestSize(c *C) { + idx := NewIndex(0) + + for o1 := 0; o1 < 1000; o1++ { + c.Assert(idx.Size(), Equals, o1) + h1 := s.toHash(o1) + idx.Add(h1, uint64(o1), 0) + } +} + +func (s *IndexSuite) TestIdxFileEmpty(c *C) { + idx := NewIndex(0) + idxf := idx.ToIdxFile() + idx2 := NewIndexFromIdxFile(idxf) + c.Assert(idx, DeepEquals, idx2) +} + +func (s *IndexSuite) TestIdxFile(c *C) { + idx := NewIndex(0) + for o1 := 0; o1 < 1000; o1++ { + h1 := s.toHash(o1) + idx.Add(h1, uint64(o1), 0) + } + + idx2 := NewIndexFromIdxFile(idx.ToIdxFile()) + c.Assert(idx, DeepEquals, idx2) +} + +func (s *IndexSuite) toHash(i int) plumbing.Hash { + is := strconv.Itoa(i) + padding := strings.Repeat("a", 40-len(is)) + return plumbing.NewHash(padding + is) +} -- cgit