From 98a22e72a808aa0d5dd62339817404fd9e1c4db6 Mon Sep 17 00:00:00 2001 From: Máximo Cuadros Date: Tue, 6 Sep 2016 19:59:44 +0200 Subject: format: packfile new interface (wip) --- formats/packfile/decoder.go | 176 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 152 insertions(+), 24 deletions(-) (limited to 'formats/packfile/decoder.go') diff --git a/formats/packfile/decoder.go b/formats/packfile/decoder.go index 5b5763c..3da927d 100644 --- a/formats/packfile/decoder.go +++ b/formats/packfile/decoder.go @@ -1,7 +1,9 @@ package packfile import ( + "bytes" "io" + "os" "gopkg.in/src-d/go-git.v4/core" ) @@ -21,37 +23,46 @@ var ( // of objects in the packfile is higher than // Decoder.MaxObjectsLimit. ErrMaxObjectsLimitReached = NewError("max. objects limit reached") - // ErrInvalidObject is returned by Decode when an invalid object is // found in the packfile. ErrInvalidObject = NewError("invalid git object") - // ErrPackEntryNotFound is returned by Decode when a reference in // the packfile references and unknown object. ErrPackEntryNotFound = NewError("can't find a pack entry") - // ErrZLib is returned by Decode when there was an error unzipping // the packfile contents. ErrZLib = NewError("zlib reading error") + // ErrDuplicatedObject is returned by Remember if an object appears several + // times in a packfile. + ErrDuplicatedObject = NewError("duplicated object") + // ErrCannotRecall is returned by RecallByOffset or RecallByHash if the object + // to recall cannot be returned. + ErrCannotRecall = NewError("cannot recall object") ) // Decoder reads and decodes packfiles from an input stream. type Decoder struct { - p *Parser - s core.ObjectStorage + p *Parser + s core.ObjectStorage + seeker io.Seeker + offsetToObject map[int64]core.Object + hashToOffset map[core.Hash]int64 } // NewDecoder returns a new Decoder that reads from r. -func NewDecoder(r ReadRecaller, s core.ObjectStorage) *Decoder { +func NewDecoder(s core.ObjectStorage, p *Parser, seeker io.Seeker) *Decoder { return &Decoder{ - p: NewParser(r), - s: s, + p: p, + s: s, + seeker: seeker, + offsetToObject: make(map[int64]core.Object, 0), + hashToOffset: make(map[core.Hash]int64, 0), } } // Decode reads a packfile and stores it in the value pointed to by s. func (d *Decoder) Decode() error { - count, err := d.p.ReadHeader() + _, count, err := d.p.Header() if err != nil { return err } @@ -74,21 +85,7 @@ func (d *Decoder) readObjects(tx core.TxObjectStorage, count uint32) error { // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB, // of which 12-20 % is _not_ zlib inflation (ie. is our code). for i := 0; i < int(count); i++ { - start, err := d.p.Offset() - if err != nil { - return err - } - - obj := d.s.NewObject() - if err := d.p.FillObject(obj); err != nil { - if err == io.EOF { - break - } - - return err - } - - err = d.p.Remember(start, obj) + obj, err := d.readObject() if err != nil { return err } @@ -101,3 +98,134 @@ func (d *Decoder) readObjects(tx core.TxObjectStorage, count uint32) error { return nil } + +func (d *Decoder) readObject() (core.Object, error) { + h, err := d.p.NextObjectHeader() + if err != nil { + return nil, err + } + + obj := d.s.NewObject() + obj.SetSize(h.Length) + obj.SetType(h.Type) + + switch h.Type { + case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject: + err = d.fillRegularObjectContent(obj) + case core.REFDeltaObject: + err = d.fillREFDeltaObjectContent(obj, h.Reference) + case core.OFSDeltaObject: + err = d.fillOFSDeltaObjectContent(obj, h.OffsetReference) + default: + err = ErrInvalidObject.AddDetails("type %q", h.Type) + } + + return obj, d.remember(h.Offset, obj) +} + +func (d *Decoder) fillRegularObjectContent(obj core.Object) error { + w, err := obj.Writer() + if err != nil { + return err + } + + _, err = d.p.NextObject(w) + return err +} + +func (d *Decoder) fillREFDeltaObjectContent(obj core.Object, ref core.Hash) error { + base, err := d.recallByHash(ref) + if err != nil { + return err + } + obj.SetType(base.Type()) + if err := d.readAndApplyDelta(obj, base); err != nil { + return err + } + + return nil +} + +func (d *Decoder) fillOFSDeltaObjectContent(obj core.Object, offset int64) error { + base, err := d.recallByOffset(offset) + if err != nil { + return err + } + + obj.SetType(base.Type()) + if err := d.readAndApplyDelta(obj, base); err != nil { + return err + } + + return nil +} + +// ReadAndApplyDelta reads and apply the base patched with the contents +// of a zlib compressed diff data in the delta portion of an object +// entry in the packfile. +func (d *Decoder) readAndApplyDelta(target, base core.Object) error { + buf := bytes.NewBuffer(nil) + if _, err := d.p.NextObject(buf); err != nil { + return err + } + + return ApplyDelta(target, base, buf.Bytes()) +} + +// Remember stores the offset of the object and its hash, but not the +// object itself. This implementation does not check for already stored +// offsets, as it is too expensive to build this information from an +// index every time a get operation is performed on the SeekableReadRecaller. +func (r *Decoder) remember(o int64, obj core.Object) error { + h := obj.Hash() + r.hashToOffset[h] = o + r.offsetToObject[o] = obj + return nil +} + +// RecallByHash returns the object for a given hash by looking for it again in +// the io.ReadeSeerker. +func (r *Decoder) recallByHash(h core.Hash) (core.Object, error) { + o, ok := r.hashToOffset[h] + if !ok { + return nil, ErrCannotRecall.AddDetails("hash not found: %s", h) + } + + return r.recallByOffset(o) +} + +// RecallByOffset returns the object for a given offset by looking for it again in +// the io.ReadeSeerker. For efficiency reasons, this method always find objects by +// offset, even if they have not been remembered or if they have been forgetted. +func (r *Decoder) recallByOffset(o int64) (obj core.Object, err error) { + obj, ok := r.offsetToObject[o] + if ok { + return obj, nil + } + + if !ok && r.seeker == nil { + return nil, ErrCannotRecall.AddDetails("no object found at offset %d", o) + } + + // remember current offset + beforeJump, err := r.seeker.Seek(0, os.SEEK_CUR) + if err != nil { + return nil, err + } + + defer func() { + // jump back + _, seekErr := r.seeker.Seek(beforeJump, os.SEEK_SET) + if err == nil { + err = seekErr + } + }() + + // jump to requested offset + _, err = r.seeker.Seek(o, os.SEEK_SET) + if err != nil { + return nil, err + } + + return r.readObject() +} -- cgit