From c64eb817d5e5cbaec10dea1342e1ec95721e886b Mon Sep 17 00:00:00 2001
From: "Santiago M. Mola" <santi@mola.io>
Date: Tue, 25 Jul 2017 10:08:36 +0200
Subject: packfile: create packfile.Index and reuse it

There was an internal type (i.e. storage/filesystem.idx) to
use as in-memory index for packfiles. This was not convenient
to reuse in the packfile.

This commit creates a new representation (format/packfile.Index)
that can be converted to and from idxfile.Idxfile.

A packfile.Index now contains the functionality that was scattered
on storage/filesystem.idx and packfile.Decoder's internals.

storage/filesystem now reuses packfile.Index instances and this
also results in higher cache hit ratios when resolving deltas.
---
 plumbing/format/packfile/decoder.go      | 104 +++++++++++++-------------
 plumbing/format/packfile/decoder_test.go |  53 ++++++--------
 plumbing/format/packfile/index.go        |  82 +++++++++++++++++++++
 plumbing/format/packfile/index_test.go   | 122 +++++++++++++++++++++++++++++++
 4 files changed, 279 insertions(+), 82 deletions(-)
 create mode 100644 plumbing/format/packfile/index.go
 create mode 100644 plumbing/format/packfile/index_test.go

(limited to 'plumbing/format/packfile')

diff --git a/plumbing/format/packfile/decoder.go b/plumbing/format/packfile/decoder.go
index 21ddbbf..39680a3 100644
--- a/plumbing/format/packfile/decoder.go
+++ b/plumbing/format/packfile/decoder.go
@@ -56,10 +56,12 @@ type Decoder struct {
 	o  storer.EncodedObjectStorer
 	tx storer.Transaction
 
-	isDecoded    bool
-	offsetToHash map[int64]plumbing.Hash
-	hashToOffset map[plumbing.Hash]int64
-	crcs         map[plumbing.Hash]uint32
+	isDecoded bool
+
+	// hasBuiltIndex indicates if the index is fully built or not. If it is not,
+	// will be built incrementally while decoding.
+	hasBuiltIndex bool
+	idx           *Index
 
 	offsetToType map[int64]plumbing.ObjectType
 	decoderType  plumbing.ObjectType
@@ -102,10 +104,7 @@ func NewDecoderForType(s *Scanner, o storer.EncodedObjectStorer,
 		s: s,
 		o: o,
 
-		offsetToHash: make(map[int64]plumbing.Hash, 0),
-		hashToOffset: make(map[plumbing.Hash]int64, 0),
-		crcs:         make(map[plumbing.Hash]uint32, 0),
-
+		idx:          NewIndex(0),
 		offsetToType: make(map[int64]plumbing.ObjectType, 0),
 		decoderType:  t,
 
@@ -139,6 +138,11 @@ func (d *Decoder) doDecode() error {
 		return err
 	}
 
+	if !d.hasBuiltIndex {
+		d.idx = NewIndex(int(count))
+	}
+	defer func() { d.hasBuiltIndex = true }()
+
 	_, isTxStorer := d.o.(storer.Transactioner)
 	switch {
 	case d.o == nil:
@@ -218,13 +222,22 @@ func (d *Decoder) DecodeObject() (plumbing.EncodedObject, error) {
 }
 
 func (d *Decoder) decodeIfSpecificType(h *ObjectHeader) (plumbing.EncodedObject, error) {
-	var realType plumbing.ObjectType
-	var err error
+	var (
+		obj      plumbing.EncodedObject
+		realType plumbing.ObjectType
+		err      error
+	)
 	switch h.Type {
 	case plumbing.OFSDeltaObject:
 		realType, err = d.ofsDeltaType(h.OffsetReference)
 	case plumbing.REFDeltaObject:
 		realType, err = d.refDeltaType(h.Reference)
+		if err == plumbing.ErrObjectNotFound {
+			obj, err = d.decodeByHeader(h)
+			if err != nil {
+				realType = obj.Type()
+			}
+		}
 	default:
 		realType = h.Type
 	}
@@ -236,6 +249,10 @@ func (d *Decoder) decodeIfSpecificType(h *ObjectHeader) (plumbing.EncodedObject,
 	d.offsetToType[h.Offset] = realType
 
 	if d.decoderType == realType {
+		if obj != nil {
+			return obj, nil
+		}
+
 		return d.decodeByHeader(h)
 	}
 
@@ -252,16 +269,12 @@ func (d *Decoder) ofsDeltaType(offset int64) (plumbing.ObjectType, error) {
 }
 
 func (d *Decoder) refDeltaType(ref plumbing.Hash) (plumbing.ObjectType, error) {
-	if o, ok := d.hashToOffset[ref]; ok {
-		return d.ofsDeltaType(o)
-	}
-
-	obj, err := d.o.EncodedObject(plumbing.AnyObject, ref)
-	if err != nil {
-		return plumbing.InvalidObject, err
+	e, ok := d.idx.LookupHash(ref)
+	if !ok {
+		return plumbing.InvalidObject, plumbing.ErrObjectNotFound
 	}
 
-	return obj.Type(), nil
+	return d.ofsDeltaType(int64(e.Offset))
 }
 
 func (d *Decoder) decodeByHeader(h *ObjectHeader) (plumbing.EncodedObject, error) {
@@ -285,9 +298,9 @@ func (d *Decoder) decodeByHeader(h *ObjectHeader) (plumbing.EncodedObject, error
 		return obj, err
 	}
 
-	hash := obj.Hash()
-	d.setOffset(hash, h.Offset)
-	d.setCRC(hash, crc)
+	if !d.hasBuiltIndex {
+		d.idx.Add(obj.Hash(), uint64(h.Offset), crc)
+	}
 
 	return obj, nil
 }
@@ -365,10 +378,10 @@ func (d *Decoder) fillOFSDeltaObjectContent(obj plumbing.EncodedObject, offset i
 		return 0, err
 	}
 
-	h := d.offsetToHash[offset]
+	e, ok := d.idx.LookupOffset(uint64(offset))
 	var base plumbing.EncodedObject
-	if h != plumbing.ZeroHash {
-		base = d.cache.Get(h)
+	if ok {
+		base = d.cache.Get(e.Hash)
 	}
 
 	if base == nil {
@@ -385,22 +398,13 @@ func (d *Decoder) fillOFSDeltaObjectContent(obj plumbing.EncodedObject, offset i
 	return crc, err
 }
 
-func (d *Decoder) setOffset(h plumbing.Hash, offset int64) {
-	d.offsetToHash[offset] = h
-	d.hashToOffset[h] = offset
-}
-
-func (d *Decoder) setCRC(h plumbing.Hash, crc uint32) {
-	d.crcs[h] = crc
-}
-
 func (d *Decoder) recallByOffset(o int64) (plumbing.EncodedObject, error) {
 	if d.s.IsSeekable {
 		return d.DecodeObjectAt(o)
 	}
 
-	if h, ok := d.offsetToHash[o]; ok {
-		return d.recallByHashNonSeekable(h)
+	if e, ok := d.idx.LookupOffset(uint64(o)); ok {
+		return d.recallByHashNonSeekable(e.Hash)
 	}
 
 	return nil, plumbing.ErrObjectNotFound
@@ -408,8 +412,8 @@ func (d *Decoder) recallByOffset(o int64) (plumbing.EncodedObject, error) {
 
 func (d *Decoder) recallByHash(h plumbing.Hash) (plumbing.EncodedObject, error) {
 	if d.s.IsSeekable {
-		if o, ok := d.hashToOffset[h]; ok {
-			return d.DecodeObjectAt(o)
+		if e, ok := d.idx.LookupHash(h); ok {
+			return d.DecodeObjectAt(int64(e.Offset))
 		}
 	}
 
@@ -432,22 +436,20 @@ func (d *Decoder) recallByHashNonSeekable(h plumbing.Hash) (obj plumbing.Encoded
 	return nil, plumbing.ErrObjectNotFound
 }
 
-// SetOffsets sets the offsets, required when using the method DecodeObjectAt,
-// without decoding the full packfile
-func (d *Decoder) SetOffsets(offsets map[plumbing.Hash]int64) {
-	d.hashToOffset = offsets
-}
-
-// Offsets returns the objects read offset, Decode method should be called
-// before to calculate the Offsets
-func (d *Decoder) Offsets() map[plumbing.Hash]int64 {
-	return d.hashToOffset
+// SetIndex sets an index for the packfile. It is recommended to set this.
+// The index might be read from a file or reused from a previous Decoder usage
+// (see Index function).
+func (d *Decoder) SetIndex(idx *Index) {
+	d.hasBuiltIndex = true
+	d.idx = idx
 }
 
-// CRCs returns the CRC-32 for each read object. Decode method should be called
-// before to calculate the CRCs
-func (d *Decoder) CRCs() map[plumbing.Hash]uint32 {
-	return d.crcs
+// Index returns the index for the packfile. If index was set with SetIndex,
+// Index will return it. Otherwise, it will return an index that is built while
+// decoding. If neither SetIndex was called with a full index or Decode called
+// for the whole packfile, then the returned index will be incomplete.
+func (d *Decoder) Index() *Index {
+	return d.idx
 }
 
 // Close closes the Scanner. usually this mean that the whole reader is read and
diff --git a/plumbing/format/packfile/decoder_test.go b/plumbing/format/packfile/decoder_test.go
index f1e2ed7..ecf7c81 100644
--- a/plumbing/format/packfile/decoder_test.go
+++ b/plumbing/format/packfile/decoder_test.go
@@ -55,14 +55,8 @@ func (s *ReaderSuite) TestDecodeByTypeRefDelta(c *C) {
 	d, err := packfile.NewDecoderForType(scanner, storage, plumbing.CommitObject)
 	c.Assert(err, IsNil)
 
-	// Specific offset elements needed to decode correctly the ref-delta
-	offsets := map[plumbing.Hash]int64{
-		plumbing.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"): 84880,
-		plumbing.NewHash("fb72698cab7617ac416264415f13224dfd7a165e"): 85141,
-		plumbing.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021"): 85300,
-	}
-
-	d.SetOffsets(offsets)
+	// Index required to decode by ref-delta.
+	d.SetIndex(getIndexFromIdxFile(f.Idx()))
 
 	defer d.Close()
 
@@ -123,7 +117,7 @@ func (s *ReaderSuite) TestDecodeByType(c *C) {
 
 			// when the packfile is ref-delta based, the offsets are required
 			if f.Is("ref-delta") {
-				d.SetOffsets(getOffsetsFromIdx(f.Idx()))
+				d.SetIndex(getIndexFromIdxFile(f.Idx()))
 			}
 
 			defer d.Close()
@@ -291,8 +285,9 @@ func (s *ReaderSuite) TestDecodeCRCs(c *C) {
 	c.Assert(err, IsNil)
 
 	var sum uint64
-	for _, crc := range d.CRCs() {
-		sum += uint64(crc)
+	idx := d.Index().ToIdxFile()
+	for _, e := range idx.Entries {
+		sum += uint64(e.CRC32)
 	}
 
 	c.Assert(int(sum), Equals, 78022211966)
@@ -306,8 +301,7 @@ func (s *ReaderSuite) TestReadObjectAt(c *C) {
 
 	// when the packfile is ref-delta based, the offsets are required
 	if f.Is("ref-delta") {
-		offsets := getOffsetsFromIdx(f.Idx())
-		d.SetOffsets(offsets)
+		d.SetIndex(getIndexFromIdxFile(f.Idx()))
 	}
 
 	// the objects at reference 186, is a delta, so should be recall,
@@ -317,32 +311,34 @@ func (s *ReaderSuite) TestReadObjectAt(c *C) {
 	c.Assert(obj.Hash().String(), Equals, "6ecf0ef2c2dffb796033e5a02219af86ec6584e5")
 }
 
-func (s *ReaderSuite) TestOffsets(c *C) {
+func (s *ReaderSuite) TestIndex(c *C) {
 	f := fixtures.Basic().One()
 	scanner := packfile.NewScanner(f.Packfile())
 	d, err := packfile.NewDecoder(scanner, nil)
 	c.Assert(err, IsNil)
 
-	c.Assert(d.Offsets(), HasLen, 0)
+	c.Assert(d.Index().ToIdxFile().Entries, HasLen, 0)
 
 	_, err = d.Decode()
 	c.Assert(err, IsNil)
 
-	c.Assert(d.Offsets(), HasLen, 31)
+	c.Assert(len(d.Index().ToIdxFile().Entries), Equals, 31)
 }
 
-func (s *ReaderSuite) TestSetOffsets(c *C) {
+func (s *ReaderSuite) TestSetIndex(c *C) {
 	f := fixtures.Basic().One()
 	scanner := packfile.NewScanner(f.Packfile())
 	d, err := packfile.NewDecoder(scanner, nil)
 	c.Assert(err, IsNil)
 
+	idx := packfile.NewIndex(1)
 	h := plumbing.NewHash("6ecf0ef2c2dffb796033e5a02219af86ec6584e5")
-	d.SetOffsets(map[plumbing.Hash]int64{h: 42})
+	idx.Add(h, uint64(42), 0)
+	d.SetIndex(idx)
 
-	o := d.Offsets()
-	c.Assert(o, HasLen, 1)
-	c.Assert(o[h], Equals, int64(42))
+	idxf := d.Index().ToIdxFile()
+	c.Assert(idxf.Entries, HasLen, 1)
+	c.Assert(idxf.Entries[0].Offset, Equals, uint64(42))
 }
 
 func assertObjects(c *C, s storer.EncodedObjectStorer, expects []string) {
@@ -362,17 +358,12 @@ func assertObjects(c *C, s storer.EncodedObjectStorer, expects []string) {
 	}
 }
 
-func getOffsetsFromIdx(r io.Reader) map[plumbing.Hash]int64 {
-	idx := &idxfile.Idxfile{}
-	err := idxfile.NewDecoder(r).Decode(idx)
-	if err != nil {
+func getIndexFromIdxFile(r io.Reader) *packfile.Index {
+	idxf := idxfile.NewIdxfile()
+	d := idxfile.NewDecoder(r)
+	if err := d.Decode(idxf); err != nil {
 		panic(err)
 	}
 
-	offsets := make(map[plumbing.Hash]int64)
-	for _, e := range idx.Entries {
-		offsets[e.Hash] = int64(e.Offset)
-	}
-
-	return offsets
+	return packfile.NewIndexFromIdxFile(idxf)
 }
diff --git a/plumbing/format/packfile/index.go b/plumbing/format/packfile/index.go
new file mode 100644
index 0000000..2c5f98f
--- /dev/null
+++ b/plumbing/format/packfile/index.go
@@ -0,0 +1,82 @@
+package packfile
+
+import (
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/format/idxfile"
+)
+
+// Index is an in-memory representation of a packfile index.
+// This uses idxfile.Idxfile under the hood to obtain indexes from .idx files
+// or to store them.
+type Index struct {
+	byHash   map[plumbing.Hash]*idxfile.Entry
+	byOffset map[uint64]*idxfile.Entry
+}
+
+// NewIndex creates a new empty index with the given size. Size is a hint and
+// can be 0. It is recommended to set it to the number of objects to be indexed
+// if it is known beforehand (e.g. reading from a packfile).
+func NewIndex(size int) *Index {
+	return &Index{
+		byHash:   make(map[plumbing.Hash]*idxfile.Entry, size),
+		byOffset: make(map[uint64]*idxfile.Entry, size),
+	}
+}
+
+// NewIndexFromIdxFile creates a new Index from an idxfile.IdxFile.
+func NewIndexFromIdxFile(idxf *idxfile.Idxfile) *Index {
+	idx := &Index{
+		byHash:   make(map[plumbing.Hash]*idxfile.Entry, idxf.ObjectCount),
+		byOffset: make(map[uint64]*idxfile.Entry, idxf.ObjectCount),
+	}
+	for _, e := range idxf.Entries {
+		idx.add(e)
+	}
+
+	return idx
+}
+
+// Add adds a new Entry with the given values to the index.
+func (idx *Index) Add(h plumbing.Hash, offset uint64, crc32 uint32) {
+	e := idxfile.Entry{
+		Hash:   h,
+		Offset: offset,
+		CRC32:  crc32,
+	}
+	idx.add(&e)
+}
+
+func (idx *Index) add(e *idxfile.Entry) {
+	idx.byHash[e.Hash] = e
+	idx.byOffset[e.Offset] = e
+}
+
+// LookupHash looks an entry up by its hash. An idxfile.Entry is returned and
+// a bool, which is true if it was found or false if it wasn't.
+func (idx *Index) LookupHash(h plumbing.Hash) (*idxfile.Entry, bool) {
+	e, ok := idx.byHash[h]
+	return e, ok
+}
+
+// LookupHash looks an entry up by its offset in the packfile. An idxfile.Entry
+// is returned and a bool, which is true if it was found or false if it wasn't.
+func (idx *Index) LookupOffset(offset uint64) (*idxfile.Entry, bool) {
+	e, ok := idx.byOffset[offset]
+	return e, ok
+}
+
+// Size returns the number of entries in the index.
+func (idx *Index) Size() int {
+	return len(idx.byHash)
+}
+
+// ToIdxFile converts the index to an idxfile.Idxfile, which can then be used
+// to serialize.
+func (idx *Index) ToIdxFile() *idxfile.Idxfile {
+	idxf := idxfile.NewIdxfile()
+	for _, e := range idx.byHash {
+		idxf.Entries = append(idxf.Entries, e)
+	}
+
+	return idxf
+}
diff --git a/plumbing/format/packfile/index_test.go b/plumbing/format/packfile/index_test.go
new file mode 100644
index 0000000..6714704
--- /dev/null
+++ b/plumbing/format/packfile/index_test.go
@@ -0,0 +1,122 @@
+package packfile
+
+import (
+	"strconv"
+	"strings"
+
+	"gopkg.in/src-d/go-git.v4/plumbing"
+
+	. "gopkg.in/check.v1"
+)
+
+type IndexSuite struct{}
+
+var _ = Suite(&IndexSuite{})
+
+func (s *IndexSuite) TestLookupOffset(c *C) {
+	idx := NewIndex(0)
+
+	for o1 := 0; o1 < 10000; o1 += 100 {
+		for o2 := 0; o2 < 10000; o2 += 100 {
+			if o2 >= o1 {
+				e, ok := idx.LookupOffset(uint64(o2))
+				c.Assert(ok, Equals, false)
+				c.Assert(e, IsNil)
+			} else {
+				e, ok := idx.LookupOffset(uint64(o2))
+				c.Assert(ok, Equals, true)
+				c.Assert(e, NotNil)
+				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Offset, Equals, uint64(o2))
+			}
+		}
+
+		h1 := s.toHash(o1)
+		idx.Add(h1, uint64(o1), 0)
+
+		for o2 := 0; o2 < 10000; o2 += 100 {
+			if o2 > o1 {
+				e, ok := idx.LookupOffset(uint64(o2))
+				c.Assert(ok, Equals, false)
+				c.Assert(e, IsNil)
+			} else {
+				e, ok := idx.LookupOffset(uint64(o2))
+				c.Assert(ok, Equals, true)
+				c.Assert(e, NotNil)
+				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Offset, Equals, uint64(o2))
+			}
+		}
+	}
+}
+
+func (s *IndexSuite) TestLookupHash(c *C) {
+	idx := NewIndex(0)
+
+	for o1 := 0; o1 < 10000; o1 += 100 {
+		for o2 := 0; o2 < 10000; o2 += 100 {
+			if o2 >= o1 {
+				e, ok := idx.LookupHash(s.toHash(o2))
+				c.Assert(ok, Equals, false)
+				c.Assert(e, IsNil)
+			} else {
+				e, ok := idx.LookupHash(s.toHash(o2))
+				c.Assert(ok, Equals, true)
+				c.Assert(e, NotNil)
+				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Offset, Equals, uint64(o2))
+			}
+		}
+
+		h1 := s.toHash(o1)
+		idx.Add(h1, uint64(o1), 0)
+
+		for o2 := 0; o2 < 10000; o2 += 100 {
+			if o2 > o1 {
+				e, ok := idx.LookupHash(s.toHash(o2))
+				c.Assert(ok, Equals, false)
+				c.Assert(e, IsNil)
+			} else {
+				e, ok := idx.LookupHash(s.toHash(o2))
+				c.Assert(ok, Equals, true)
+				c.Assert(e, NotNil)
+				c.Assert(e.Hash, Equals, s.toHash(o2))
+				c.Assert(e.Offset, Equals, uint64(o2))
+			}
+		}
+	}
+}
+
+func (s *IndexSuite) TestSize(c *C) {
+	idx := NewIndex(0)
+
+	for o1 := 0; o1 < 1000; o1++ {
+		c.Assert(idx.Size(), Equals, o1)
+		h1 := s.toHash(o1)
+		idx.Add(h1, uint64(o1), 0)
+	}
+}
+
+func (s *IndexSuite) TestIdxFileEmpty(c *C) {
+	idx := NewIndex(0)
+	idxf := idx.ToIdxFile()
+	idx2 := NewIndexFromIdxFile(idxf)
+	c.Assert(idx, DeepEquals, idx2)
+}
+
+func (s *IndexSuite) TestIdxFile(c *C) {
+	idx := NewIndex(0)
+	for o1 := 0; o1 < 1000; o1++ {
+		h1 := s.toHash(o1)
+		idx.Add(h1, uint64(o1), 0)
+	}
+
+	idx2 := NewIndexFromIdxFile(idx.ToIdxFile())
+	c.Assert(idx, DeepEquals, idx2)
+}
+
+func (s *IndexSuite) toHash(i int) plumbing.Hash {
+	is := strconv.Itoa(i)
+	padding := strings.Repeat("a", 40-len(is))
+	return plumbing.NewHash(padding + is)
+}
-- 
cgit