From 98a22e72a808aa0d5dd62339817404fd9e1c4db6 Mon Sep 17 00:00:00 2001 From: Máximo Cuadros Date: Tue, 6 Sep 2016 19:59:44 +0200 Subject: format: packfile new interface (wip) --- formats/packfile/decoder.go | 176 +++++++++++++-- formats/packfile/decoder_test.go | 79 +------ formats/packfile/parser.go | 339 ++++++++++++++-------------- formats/packfile/parser_test.go | 67 +++++- formats/packfile/read_recaller.go | 39 ---- formats/packfile/read_recaller_impl_test.go | 293 ------------------------ formats/packfile/seekable.go | 109 --------- formats/packfile/stream.go | 95 -------- 8 files changed, 392 insertions(+), 805 deletions(-) delete mode 100644 formats/packfile/read_recaller.go delete mode 100644 formats/packfile/read_recaller_impl_test.go delete mode 100644 formats/packfile/seekable.go delete mode 100644 formats/packfile/stream.go (limited to 'formats/packfile') diff --git a/formats/packfile/decoder.go b/formats/packfile/decoder.go index 5b5763c..3da927d 100644 --- a/formats/packfile/decoder.go +++ b/formats/packfile/decoder.go @@ -1,7 +1,9 @@ package packfile import ( + "bytes" "io" + "os" "gopkg.in/src-d/go-git.v4/core" ) @@ -21,37 +23,46 @@ var ( // of objects in the packfile is higher than // Decoder.MaxObjectsLimit. ErrMaxObjectsLimitReached = NewError("max. objects limit reached") - // ErrInvalidObject is returned by Decode when an invalid object is // found in the packfile. ErrInvalidObject = NewError("invalid git object") - // ErrPackEntryNotFound is returned by Decode when a reference in // the packfile references and unknown object. ErrPackEntryNotFound = NewError("can't find a pack entry") - // ErrZLib is returned by Decode when there was an error unzipping // the packfile contents. ErrZLib = NewError("zlib reading error") + // ErrDuplicatedObject is returned by Remember if an object appears several + // times in a packfile. + ErrDuplicatedObject = NewError("duplicated object") + // ErrCannotRecall is returned by RecallByOffset or RecallByHash if the object + // to recall cannot be returned. + ErrCannotRecall = NewError("cannot recall object") ) // Decoder reads and decodes packfiles from an input stream. type Decoder struct { - p *Parser - s core.ObjectStorage + p *Parser + s core.ObjectStorage + seeker io.Seeker + offsetToObject map[int64]core.Object + hashToOffset map[core.Hash]int64 } // NewDecoder returns a new Decoder that reads from r. -func NewDecoder(r ReadRecaller, s core.ObjectStorage) *Decoder { +func NewDecoder(s core.ObjectStorage, p *Parser, seeker io.Seeker) *Decoder { return &Decoder{ - p: NewParser(r), - s: s, + p: p, + s: s, + seeker: seeker, + offsetToObject: make(map[int64]core.Object, 0), + hashToOffset: make(map[core.Hash]int64, 0), } } // Decode reads a packfile and stores it in the value pointed to by s. func (d *Decoder) Decode() error { - count, err := d.p.ReadHeader() + _, count, err := d.p.Header() if err != nil { return err } @@ -74,21 +85,7 @@ func (d *Decoder) readObjects(tx core.TxObjectStorage, count uint32) error { // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB, // of which 12-20 % is _not_ zlib inflation (ie. is our code). for i := 0; i < int(count); i++ { - start, err := d.p.Offset() - if err != nil { - return err - } - - obj := d.s.NewObject() - if err := d.p.FillObject(obj); err != nil { - if err == io.EOF { - break - } - - return err - } - - err = d.p.Remember(start, obj) + obj, err := d.readObject() if err != nil { return err } @@ -101,3 +98,134 @@ func (d *Decoder) readObjects(tx core.TxObjectStorage, count uint32) error { return nil } + +func (d *Decoder) readObject() (core.Object, error) { + h, err := d.p.NextObjectHeader() + if err != nil { + return nil, err + } + + obj := d.s.NewObject() + obj.SetSize(h.Length) + obj.SetType(h.Type) + + switch h.Type { + case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject: + err = d.fillRegularObjectContent(obj) + case core.REFDeltaObject: + err = d.fillREFDeltaObjectContent(obj, h.Reference) + case core.OFSDeltaObject: + err = d.fillOFSDeltaObjectContent(obj, h.OffsetReference) + default: + err = ErrInvalidObject.AddDetails("type %q", h.Type) + } + + return obj, d.remember(h.Offset, obj) +} + +func (d *Decoder) fillRegularObjectContent(obj core.Object) error { + w, err := obj.Writer() + if err != nil { + return err + } + + _, err = d.p.NextObject(w) + return err +} + +func (d *Decoder) fillREFDeltaObjectContent(obj core.Object, ref core.Hash) error { + base, err := d.recallByHash(ref) + if err != nil { + return err + } + obj.SetType(base.Type()) + if err := d.readAndApplyDelta(obj, base); err != nil { + return err + } + + return nil +} + +func (d *Decoder) fillOFSDeltaObjectContent(obj core.Object, offset int64) error { + base, err := d.recallByOffset(offset) + if err != nil { + return err + } + + obj.SetType(base.Type()) + if err := d.readAndApplyDelta(obj, base); err != nil { + return err + } + + return nil +} + +// ReadAndApplyDelta reads and apply the base patched with the contents +// of a zlib compressed diff data in the delta portion of an object +// entry in the packfile. +func (d *Decoder) readAndApplyDelta(target, base core.Object) error { + buf := bytes.NewBuffer(nil) + if _, err := d.p.NextObject(buf); err != nil { + return err + } + + return ApplyDelta(target, base, buf.Bytes()) +} + +// Remember stores the offset of the object and its hash, but not the +// object itself. This implementation does not check for already stored +// offsets, as it is too expensive to build this information from an +// index every time a get operation is performed on the SeekableReadRecaller. +func (r *Decoder) remember(o int64, obj core.Object) error { + h := obj.Hash() + r.hashToOffset[h] = o + r.offsetToObject[o] = obj + return nil +} + +// RecallByHash returns the object for a given hash by looking for it again in +// the io.ReadeSeerker. +func (r *Decoder) recallByHash(h core.Hash) (core.Object, error) { + o, ok := r.hashToOffset[h] + if !ok { + return nil, ErrCannotRecall.AddDetails("hash not found: %s", h) + } + + return r.recallByOffset(o) +} + +// RecallByOffset returns the object for a given offset by looking for it again in +// the io.ReadeSeerker. For efficiency reasons, this method always find objects by +// offset, even if they have not been remembered or if they have been forgetted. +func (r *Decoder) recallByOffset(o int64) (obj core.Object, err error) { + obj, ok := r.offsetToObject[o] + if ok { + return obj, nil + } + + if !ok && r.seeker == nil { + return nil, ErrCannotRecall.AddDetails("no object found at offset %d", o) + } + + // remember current offset + beforeJump, err := r.seeker.Seek(0, os.SEEK_CUR) + if err != nil { + return nil, err + } + + defer func() { + // jump back + _, seekErr := r.seeker.Seek(beforeJump, os.SEEK_SET) + if err == nil { + err = seekErr + } + }() + + // jump to requested offset + _, err = r.seeker.Seek(o, os.SEEK_SET) + if err != nil { + return nil, err + } + + return r.readObject() +} diff --git a/formats/packfile/decoder_test.go b/formats/packfile/decoder_test.go index 8c73b4e..5a95af1 100644 --- a/formats/packfile/decoder_test.go +++ b/formats/packfile/decoder_test.go @@ -3,16 +3,12 @@ package packfile import ( "bytes" "encoding/base64" - "fmt" "os" - "runtime" "testing" - "time" "gopkg.in/src-d/go-git.v4/core" "gopkg.in/src-d/go-git.v4/storage/memory" - "github.com/dustin/go-humanize" . "gopkg.in/check.v1" ) @@ -27,9 +23,8 @@ var packFileWithEmptyObjects = "UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FS func (s *ReaderSuite) TestReadPackfile(c *C) { data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) f := bytes.NewReader(data) - r := NewStream(f) sto := memory.NewStorage() - d := NewDecoder(r, sto.ObjectStorage()) + d := NewDecoder(sto.ObjectStorage(), NewParser(f), nil) err := d.Decode() c.Assert(err, IsNil) @@ -60,9 +55,8 @@ func (s *ReaderSuite) TestReadPackfileREFDelta(c *C) { func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, format Format) { f, err := os.Open(file) c.Assert(err, IsNil) - r := NewSeekable(f) sto := memory.NewStorage() - d := NewDecoder(r, sto.ObjectStorage()) + d := NewDecoder(sto.ObjectStorage(), NewParser(f), f) err = d.Decode() c.Assert(err, IsNil) @@ -109,72 +103,3 @@ func AssertObjects(c *C, s *memory.Storage, expects []string) { c.Assert(obt.Hash().String(), Equals, exp) } } - -func (s *ReaderSuite) BenchmarkFixtureRef(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkFixtureOfs(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkCandyJS(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "/tmp/go-candyjs", REFDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkSymfony(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "/tmp/symonfy", REFDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkGit(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "/tmp/git", REFDeltaFormat) - } -} - -func (s *ReaderSuite) _testMemory(c *C, format Format) { - var b, a runtime.MemStats - - start := time.Now() - runtime.ReadMemStats(&b) - p := readFromFile(c, "/tmp/symfony.ofs-delta", format) - runtime.ReadMemStats(&a) - - fmt.Println("OFS--->") - fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc)) - fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc)) - fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, humanize.Bytes(a.HeapAlloc-b.HeapAlloc)) - fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys)) - - fmt.Println("objects", len(p.Objects)) - fmt.Println("time", time.Since(start)) -} - -func (s *ReaderSuite) _TestMemoryOFS(c *C) { - s._testMemory(c, OFSDeltaFormat) -} - -func (s *ReaderSuite) _TestMemoryREF(c *C) { - s._testMemory(c, REFDeltaFormat) -} - -func readFromFile(c *C, file string, format Format) *memory.ObjectStorage { - f, err := os.Open(file) - c.Assert(err, IsNil) - r := NewSeekable(f) - sto := memory.NewStorage() - d := NewDecoder(r, sto.ObjectStorage()) - - err = d.Decode() - c.Assert(err, IsNil) - - return sto.ObjectStorage().(*memory.ObjectStorage) -} diff --git a/formats/packfile/parser.go b/formats/packfile/parser.go index 2930dcb..c1653c8 100644 --- a/formats/packfile/parser.go +++ b/formats/packfile/parser.go @@ -6,6 +6,7 @@ import ( "encoding/binary" "fmt" "io" + "io/ioutil" "gopkg.in/src-d/go-git.v4/core" ) @@ -22,98 +23,167 @@ var ( const ( // VersionSupported is the packfile version supported by this parser. - VersionSupported = 2 + VersionSupported uint32 = 2 ) +type ObjectHeader struct { + Type core.ObjectType + Offset int64 + Length int64 + Reference core.Hash + OffsetReference int64 +} + // A Parser is a collection of functions to read and process data form a packfile. // Values from this type are not zero-value safe. See the NewParser function bellow. type Parser struct { - ReadRecaller - ObjectFactory func() core.Object + r *trackableReader + + // pendingObject is used to detect if an object has been read, or still + // is waiting to be read + pendingObject *ObjectHeader } // NewParser returns a new Parser that reads from the packfile represented by r. -func NewParser(r ReadRecaller) *Parser { - return &Parser{ReadRecaller: r} +func NewParser(r io.Reader) *Parser { + return &Parser{r: &trackableReader{Reader: r}} } -// ReadInt32 reads 4 bytes and returns them as a Big Endian int32. -func (p Parser) readInt32() (uint32, error) { - var v uint32 - if err := binary.Read(p, binary.BigEndian, &v); err != nil { - return 0, err +// Header reads the whole packfile header (signature, version and object count). +// It returns the version and the object count and performs checks on the +// validity of the signature and the version fields. +func (p *Parser) Header() (version, objects uint32, err error) { + sig, err := p.readSignature() + if err != nil { + if err == io.EOF { + err = ErrEmptyPackfile + } + + return } - return v, nil + if !p.isValidSignature(sig) { + err = ErrBadSignature + return + } + + version, err = p.readVersion() + if err != nil { + return + } + + if !p.isSupportedVersion(version) { + err = ErrUnsupportedVersion.AddDetails("%d", version) + return + } + + objects, err = p.readCount() + return } -// ReadSignature reads an returns the signature field in the packfile. -func (p *Parser) ReadSignature() ([]byte, error) { +// readSignature reads an returns the signature field in the packfile. +func (p *Parser) readSignature() ([]byte, error) { var sig = make([]byte, 4) - if _, err := io.ReadFull(p, sig); err != nil { + if _, err := io.ReadFull(p.r, sig); err != nil { return []byte{}, err } return sig, nil } -// IsValidSignature returns if sig is a valid packfile signature. -func (p Parser) IsValidSignature(sig []byte) bool { +// isValidSignature returns if sig is a valid packfile signature. +func (p *Parser) isValidSignature(sig []byte) bool { return bytes.Equal(sig, []byte{'P', 'A', 'C', 'K'}) } -// ReadVersion reads and returns the version field of a packfile. -func (p *Parser) ReadVersion() (uint32, error) { +// readVersion reads and returns the version field of a packfile. +func (p *Parser) readVersion() (uint32, error) { return p.readInt32() } -// IsSupportedVersion returns whether version v is supported by the parser. +// isSupportedVersion returns whether version v is supported by the parser. // The current supported version is VersionSupported, defined above. -func (p *Parser) IsSupportedVersion(v uint32) bool { +func (p *Parser) isSupportedVersion(v uint32) bool { return v == VersionSupported } -// ReadCount reads and returns the count of objects field of a packfile. -func (p *Parser) ReadCount() (uint32, error) { +// readCount reads and returns the count of objects field of a packfile. +func (p *Parser) readCount() (uint32, error) { return p.readInt32() } -// ReadHeader reads the whole packfile header (signature, version and -// object count). It returns the object count and performs checks on the -// validity of the signature and the version fields. -func (p Parser) ReadHeader() (uint32, error) { - sig, err := p.ReadSignature() - if err != nil { - if err == io.EOF { - return 0, ErrEmptyPackfile - } +// ReadInt32 reads 4 bytes and returns them as a Big Endian int32. +func (p *Parser) readInt32() (uint32, error) { + var v uint32 + if err := binary.Read(p.r, binary.BigEndian, &v); err != nil { return 0, err } - if !p.IsValidSignature(sig) { - return 0, ErrBadSignature + return v, nil +} + +func (p *Parser) NextObjectHeader() (*ObjectHeader, error) { + if err := p.discardObjectIfNeeded(); err != nil { + return nil, err } - ver, err := p.ReadVersion() + h := &ObjectHeader{} + p.pendingObject = h + + var err error + h.Offset, err = p.r.Offset() if err != nil { - return 0, err + return nil, err } - if !p.IsSupportedVersion(ver) { - return 0, ErrUnsupportedVersion.AddDetails("%d", ver) + h.Type, h.Length, err = p.readObjectTypeAndLength() + if err != nil { + return nil, err } - count, err := p.ReadCount() + switch h.Type { + case core.OFSDeltaObject: + no, err := p.readNegativeOffset() + if err != nil { + return nil, err + } + + h.OffsetReference = h.Offset + no + case core.REFDeltaObject: + var err error + h.Reference, err = p.readHash() + if err != nil { + return nil, err + } + } + + return h, nil +} + +func (s *Parser) discardObjectIfNeeded() error { + if s.pendingObject == nil { + return nil + } + + h := s.pendingObject + n, err := s.NextObject(ioutil.Discard) if err != nil { - return 0, err + return err } - return count, nil + if n != h.Length { + return fmt.Errorf( + "error discarding object, discarded %d, expected %d", + n, h.Length, + ) + } + + return nil } // ReadObjectTypeAndLength reads and returns the object type and the // length field from an object entry in a packfile. -func (p Parser) ReadObjectTypeAndLength() (core.ObjectType, int64, error) { +func (p Parser) readObjectTypeAndLength() (core.ObjectType, int64, error) { t, c, err := p.readType() if err != nil { return t, 0, err @@ -127,37 +197,25 @@ func (p Parser) ReadObjectTypeAndLength() (core.ObjectType, int64, error) { func (p Parser) readType() (core.ObjectType, byte, error) { var c byte var err error - if c, err = p.ReadByte(); err != nil { + if c, err = p.r.ReadByte(); err != nil { return core.ObjectType(0), 0, err } + typ := parseType(c) return typ, c, nil } -var ( - maskContinue = uint8(128) // 1000 0000 - maskType = uint8(112) // 0111 0000 - maskFirstLength = uint8(15) // 0000 1111 - firstLengthBits = uint8(4) // the first byte has 4 bits to store the length - maskLength = uint8(127) // 0111 1111 - lengthBits = uint8(7) // subsequent bytes has 7 bits to store the length -) - -func parseType(b byte) core.ObjectType { - return core.ObjectType((b & maskType) >> firstLengthBits) -} - // the length is codified in the last 4 bits of the first byte and in // the last 7 bits of subsequent bytes. Last byte has a 0 MSB. -func (p Parser) readLength(first byte) (int64, error) { +func (p *Parser) readLength(first byte) (int64, error) { length := int64(first & maskFirstLength) c := first shift := firstLengthBits var err error for moreBytesInLength(c) { - if c, err = p.ReadByte(); err != nil { + if c, err = p.r.ReadByte(); err != nil { return 0, err } @@ -168,56 +226,18 @@ func (p Parser) readLength(first byte) (int64, error) { return length, nil } -func moreBytesInLength(c byte) bool { - return c&maskContinue > 0 -} - -// FillObject fills the given object from an object entry in the packfile. -// Non-deltified and deltified objects are supported. -func (p Parser) FillObject(obj core.Object) error { - start, err := p.Offset() - if err != nil { - return err - } - - t, l, err := p.ReadObjectTypeAndLength() - if err != nil { - return err - } - - obj.SetSize(l) - - switch t { - case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject: - obj.SetType(t) - err = p.FillFromNonDeltaContent(obj) - case core.REFDeltaObject: - err = p.FillREFDeltaObjectContent(obj) - case core.OFSDeltaObject: - err = p.FillOFSDeltaObjectContent(obj, start) - default: - err = ErrInvalidObject.AddDetails("tag %q", t) - } - - return err +func (p *Parser) NextObject(w io.Writer) (written int64, err error) { + p.pendingObject = nil + return p.copyObject(w) } -// FillFromNonDeltaContent reads and fill a non-deltified object +// ReadRegularObject reads and write a non-deltified object // from it zlib stream in an object entry in the packfile. -func (p Parser) FillFromNonDeltaContent(obj core.Object) error { - w, err := obj.Writer() - if err != nil { - return err - } - - return p.inflate(w) -} - -func (p Parser) inflate(w io.Writer) (err error) { - zr, err := zlib.NewReader(p) +func (p *Parser) copyObject(w io.Writer) (int64, error) { + zr, err := zlib.NewReader(p.r) if err != nil { if err != zlib.ErrHeader { - return fmt.Errorf("zlib reading error: %s", err) + return -1, fmt.Errorf("zlib reading error: %s", err) } } @@ -228,78 +248,23 @@ func (p Parser) inflate(w io.Writer) (err error) { } }() - _, err = io.Copy(w, zr) - - return err + return io.Copy(w, zr) } -// FillREFDeltaObjectContent reads and returns an object specified by a -// REF-Delta entry in the packfile, form the hash onwards. -func (p Parser) FillREFDeltaObjectContent(obj core.Object) error { - refHash, err := p.ReadHash() - if err != nil { - return err - } - - base, err := p.RecallByHash(refHash) - if err != nil { - return err - } - - obj.SetType(base.Type()) - if err := p.ReadAndApplyDelta(obj, base); err != nil { - return err - } - - return nil +func (p *Parser) Checksum() (core.Hash, error) { + return p.readHash() } // ReadHash reads a hash. -func (p Parser) ReadHash() (core.Hash, error) { +func (p *Parser) readHash() (core.Hash, error) { var h core.Hash - if _, err := io.ReadFull(p, h[:]); err != nil { + if _, err := io.ReadFull(p.r, h[:]); err != nil { return core.ZeroHash, err } return h, nil } -// ReadAndApplyDelta reads and apply the base patched with the contents -// of a zlib compressed diff data in the delta portion of an object -// entry in the packfile. -func (p Parser) ReadAndApplyDelta(target, base core.Object) error { - buf := bytes.NewBuffer(nil) - if err := p.inflate(buf); err != nil { - return err - } - - return ApplyDelta(target, base, buf.Bytes()) -} - -// FillOFSDeltaObjectContent reads an fill an object specified by an -// OFS-delta entry in the packfile from it negative offset onwards. The -// start parameter is the offset of this particular object entry (the -// current offset minus the already processed type and length). -func (p Parser) FillOFSDeltaObjectContent(obj core.Object, start int64) error { - - jump, err := p.ReadNegativeOffset() - if err != nil { - return err - } - - base, err := p.RecallByOffset(start + jump) - if err != nil { - return err - } - - obj.SetType(base.Type()) - if err := p.ReadAndApplyDelta(obj, base); err != nil { - return err - } - - return nil -} - // ReadNegativeOffset reads and returns an offset from a OFS DELTA // object entry in a packfile. OFS DELTA offsets are specified in Git // VLQ special format: @@ -327,18 +292,18 @@ func (p Parser) FillOFSDeltaObjectContent(obj core.Object, start int64) error { // while (ofs >>= 7) // dheader[--pos] = 128 | (--ofs & 127); // -func (p Parser) ReadNegativeOffset() (int64, error) { +func (p *Parser) readNegativeOffset() (int64, error) { var c byte var err error - if c, err = p.ReadByte(); err != nil { + if c, err = p.r.ReadByte(); err != nil { return 0, err } var offset = int64(c & maskLength) for moreBytesInLength(c) { offset++ - if c, err = p.ReadByte(); err != nil { + if c, err = p.r.ReadByte(); err != nil { return 0, err } offset = (offset << lengthBits) + int64(c&maskLength) @@ -346,3 +311,47 @@ func (p Parser) ReadNegativeOffset() (int64, error) { return -offset, nil } + +func moreBytesInLength(c byte) bool { + return c&maskContinue > 0 +} + +var ( + maskContinue = uint8(128) // 1000 0000 + maskType = uint8(112) // 0111 0000 + maskFirstLength = uint8(15) // 0000 1111 + firstLengthBits = uint8(4) // the first byte has 4 bits to store the length + maskLength = uint8(127) // 0111 1111 + lengthBits = uint8(7) // subsequent bytes has 7 bits to store the length +) + +func parseType(b byte) core.ObjectType { + return core.ObjectType((b & maskType) >> firstLengthBits) +} + +type trackableReader struct { + io.Reader + count int64 +} + +// Read reads up to len(p) bytes into p. +func (r *trackableReader) Read(p []byte) (n int, err error) { + n, err = r.Reader.Read(p) + r.count += int64(n) + + return +} + +// ReadByte reads a byte. +func (r *trackableReader) ReadByte() (byte, error) { + var p [1]byte + _, err := r.Reader.Read(p[:]) + r.count++ + + return p[0], err +} + +// Offset returns the number of bytes read. +func (r *trackableReader) Offset() (int64, error) { + return r.count, nil +} diff --git a/formats/packfile/parser_test.go b/formats/packfile/parser_test.go index f4aff83..a7959a0 100644 --- a/formats/packfile/parser_test.go +++ b/formats/packfile/parser_test.go @@ -2,14 +2,74 @@ package packfile import ( "bytes" - "io" - "io/ioutil" - "os" + "encoding/base64" . "gopkg.in/check.v1" "gopkg.in/src-d/go-git.v4/core" ) +type ScannerSuite struct{} + +var _ = Suite(&ScannerSuite{}) + +func (s *ScannerSuite) TestHeader(c *C) { + data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) + + p := NewParser(bytes.NewReader(data)) + version, objects, err := p.Header() + c.Assert(err, IsNil) + c.Assert(version, Equals, VersionSupported) + c.Assert(objects, Equals, uint32(11)) +} + +func (s *ScannerSuite) TestNextObjectHeader(c *C) { + data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) + + p := NewParser(bytes.NewReader(data)) + _, objects, err := p.Header() + c.Assert(err, IsNil) + + for i := 0; i < int(objects); i++ { + h, err := p.NextObjectHeader() + c.Assert(err, IsNil) + c.Assert(*h, DeepEquals, expectedHeaders[i]) + + buf := bytes.NewBuffer(nil) + n, err := p.NextObject(buf) + c.Assert(err, IsNil) + c.Assert(n, Equals, h.Length) + } +} + +func (s *ScannerSuite) TestNextObjectHeaderWithOutReadObject(c *C) { + data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) + + p := NewParser(bytes.NewReader(data)) + _, objects, err := p.Header() + c.Assert(err, IsNil) + + for i := 0; i < int(objects); i++ { + h, err := p.NextObjectHeader() + c.Assert(err, IsNil) + c.Assert(*h, DeepEquals, expectedHeaders[i]) + } +} + +var expectedHeaders = []ObjectHeader{ + {Type: core.CommitObject, Offset: 12, Length: 239}, + {Type: core.CommitObject, Offset: 177, Length: 244}, + {Type: core.CommitObject, Offset: 345, Length: 239}, + {Type: core.CommitObject, Offset: 507, Length: 191}, + {Type: core.TreeObject, Offset: 639, Length: 91}, + {Type: core.BlobObject, Offset: 714, Length: 0}, + {Type: core.BlobObject, Offset: 723, Length: 14}, + {Type: core.OFSDeltaObject, Offset: 740, Length: 4, OffsetReference: 639}, + {Type: core.TreeObject, Offset: 754, Length: 58}, + {Type: core.BlobObject, Offset: 820, Length: 7}, + {Type: core.TreeObject, Offset: 833, Length: 29}, +} + +/* const ( sigOffset = 0 verOffset = 4 @@ -426,3 +486,4 @@ func (s *ParserSuite) TestReadHeader(c *C) { c.Assert(err, IsNil) c.Assert(count, Equals, uint32(0x50)) } +*/ diff --git a/formats/packfile/read_recaller.go b/formats/packfile/read_recaller.go deleted file mode 100644 index a4157d1..0000000 --- a/formats/packfile/read_recaller.go +++ /dev/null @@ -1,39 +0,0 @@ -package packfile - -import "gopkg.in/src-d/go-git.v4/core" - -var ( - // ErrDuplicatedObject is returned by Remember if an object appears several - // times in a packfile. - ErrDuplicatedObject = NewError("duplicated object") - // ErrCannotRecall is returned by RecallByOffset or RecallByHash if the object - // to recall cannot be returned. - ErrCannotRecall = NewError("cannot recall object") -) - -// The ReadRecaller interface has all the functions needed by a packfile -// Parser to operate. We provide two very different implementations: -// Seekable and Stream. -type ReadRecaller interface { - // Read reads up to len(p) bytes into p. - Read(p []byte) (int, error) - // ReadByte is needed because of these: - // - https://github.com/golang/go/commit/7ba54d45732219af86bde9a5b73c145db82b70c6 - // - https://groups.google.com/forum/#!topic/golang-nuts/fWTRdHpt0QI - // - https://gowalker.org/compress/zlib#NewReader - ReadByte() (byte, error) - // Offset returns the number of bytes parsed so far from the - // packfile. - Offset() (int64, error) - // Remember ask the ReadRecaller to remember the offset and hash for - // an object, so you can later call RecallByOffset and RecallByHash. - Remember(int64, core.Object) error - // ForgetAll forgets all previously remembered objects. - ForgetAll() - // RecallByOffset returns the previously processed object found at a - // given offset. - RecallByOffset(int64) (core.Object, error) - // RecallByHash returns the previously processed object with the - // given hash. - RecallByHash(core.Hash) (core.Object, error) -} diff --git a/formats/packfile/read_recaller_impl_test.go b/formats/packfile/read_recaller_impl_test.go deleted file mode 100644 index f89171d..0000000 --- a/formats/packfile/read_recaller_impl_test.go +++ /dev/null @@ -1,293 +0,0 @@ -package packfile - -import ( - "bytes" - "fmt" - "io/ioutil" - "os" - - "gopkg.in/src-d/go-git.v4/core" - - . "gopkg.in/check.v1" -) - -type ReadRecallerImplSuite struct{} - -var _ = Suite(&ReadRecallerImplSuite{}) - -type implFn func([]byte) ReadRecaller - -func newStream(data []byte) ReadRecaller { - buf := bytes.NewBuffer(data) - return NewStream(buf) -} - -func newSeekable(data []byte) ReadRecaller { - buf := bytes.NewReader(data) - return NewSeekable(buf) -} - -func (s *ReadRecallerImplSuite) TestRead(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - com := Commentf("implementation %s", impl.id) - data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} - sr := impl.newFn(data) - all := make([]byte, 0, len(data)) - - for len(all) < len(data) { - tmp := make([]byte, 3) - nr, err := sr.Read(tmp) - c.Assert(err, IsNil, com) - all = append(all, tmp[:nr]...) - } - c.Assert(data, DeepEquals, all, com) - } -} - -func (s *ReadRecallerImplSuite) TestReadbyte(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - com := Commentf("implementation %s", impl.id) - data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} - sr := impl.newFn(data) - all := make([]byte, 0, len(data)) - - for len(all) < len(data) { - b, err := sr.ReadByte() - c.Assert(err, IsNil, com) - all = append(all, b) - } - c.Assert(data, DeepEquals, all, com) - } -} - -func (s *ReadRecallerImplSuite) TestOffsetWithRead(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - com := Commentf("implementation %s", impl.id) - data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} - sr := impl.newFn(data) - all := make([]byte, 0, len(data)) - - for len(all) < len(data) { - tmp := make([]byte, 3) - nr, err := sr.Read(tmp) - c.Assert(err, IsNil, com) - all = append(all, tmp[:nr]...) - - off, err := sr.Offset() - c.Assert(err, IsNil, com) - c.Assert(off, Equals, int64(len(all)), com) - } - } -} - -func (s *ReadRecallerImplSuite) TestOffsetWithReadByte(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - com := Commentf("implementation %s", impl.id) - data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} - sr := impl.newFn(data) - all := make([]byte, 0, len(data)) - - for len(all) < len(data) { - b, err := sr.ReadByte() - c.Assert(err, IsNil, com) - all = append(all, b) - - off, err := sr.Offset() - c.Assert(err, IsNil, com) - c.Assert(off, Equals, int64(len(all)), com) - } - } -} - -func (s *ReadRecallerImplSuite) TestRememberRecall(c *C) { - packfile := "fixtures/spinnaker-spinnaker.pack" - f, err := os.Open(packfile) - c.Assert(err, IsNil) - defer func() { - err = f.Close() - c.Assert(err, IsNil) - }() - - data, err := ioutil.ReadAll(f) - c.Assert(err, IsNil) - - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - sr := impl.newFn(data) - for i, test := range [...]struct { - off int64 - obj core.Object - err string // error regexp - ignore string // ignore this test for this implementation - }{ - { - off: 12, - obj: newObject(core.CommitObject, []byte("tree 44a1cdf21c791867c51caad8f1b77e6baee6f462\nparent 87fe6e7c6b1b89519fe3a03a8961c5aa14d4cc68\nparent 9244ee648182b91a63d8cc4cbe4b9ac2a27c0492\nauthor Matt Duftler 1448290941 -0500\ncommitter Matt Duftler 1448290941 -0500\n\nMerge pull request #615 from ewiseblatt/create_dev\n\nPreserve original credentials of spinnaker-local.yml when transforming it.")), - }, { - off: 3037, - obj: newObject(core.TagObject, []byte("object e0005f50e22140def60260960b21667f1fdfff80\ntype commit\ntag v0.10.0\ntagger cfieber 1447687536 -0800\n\nRelease of 0.10.0\n\n- e0005f50e22140def60260960b21667f1fdfff80: Merge pull request #553 from ewiseblatt/rendezvous\n- e1a2b26b784179e6903a7ae967c037c721899eba: Wait for cassandra before starting spinnaker\n- c756e09461d071e98b8660818cf42d90c90f2854: Merge pull request #552 from duftler/google-c2d-tweaks\n- 0777fadf4ca6f458d7071de414f9bd5417911037: Fix incorrect config prop names: s/SPINNAKER_GOOGLE_PROJECT_DEFAULT_REGION/SPINNAKER_GOOGLE_DEFAULT_REGION s/SPINNAKER_GOOGLE_PROJECT_DEFAULT_ZONE/SPINNAKER_GOOGLE_DEFAULT_ZONE Hardcode profile name in generated ~/.aws/credentials to [default]. Restart all of spinnaker after updating cassandra and reconfiguring spinnaker, instead of just restarting clouddriver.\n- d8d031c1ac45801074418c43424a6f2c0dff642c: Merge pull request #551 from kenzanmedia/fixGroup\n- 626d23075f9e92aad19015f2964c95d45f41fa3a: Put in correct block for public image. Delineate cloud provider.\n")), - }, { - off: 157625, - obj: newObject(core.BlobObject, []byte(".gradle\nbuild/\n*.iml\n.idea\n*.pyc\n*~\n#*\nconfig/spinnaker-local.yml\n.DS_Store\npacker/ami_table.md\npacker/ami_table.json\npacker/example_output.txt")), - }, { - off: 1234, - obj: newObject(core.BlobObject, []byte(".gradle\nbuild/\n*.iml\n.idea\n*.pyc\n*~\n#*\nconfig/spinnaker-local.yml\n.DS_Store\npacker/ami_table.md\npacker/ami_table.json\npacker/example_output.txt")), - err: "duplicated object: with hash .*", - }, { - off: 3037, - obj: newObject(core.BlobObject, []byte("")), - err: "duplicated object: with offset 3037", - ignore: "seekable", - // seekable can not check if the offset has already been added - // for performance reasons. - }, - } { - if test.ignore == impl.id { - continue - } - com := Commentf("subtest %d) implementation %s", i, impl.id) - - err := sr.Remember(test.off, test.obj) - if test.err != "" { - c.Assert(err, ErrorMatches, test.err, com) - continue - } - c.Assert(err, IsNil, com) - - result, err := sr.RecallByHash(test.obj.Hash()) - c.Assert(err, IsNil, com) - c.Assert(result.Hash(), Equals, test.obj.Hash()) - c.Assert(result, DeepEquals, test.obj, com) - - result, err = sr.RecallByOffset(test.off) - c.Assert(err, IsNil, com) - c.Assert(result.Hash(), Equals, test.obj.Hash()) - c.Assert(result, DeepEquals, test.obj, com) - } - } -} - -func (s *ReadRecallerImplSuite) TestRecallByHashErrors(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - com := Commentf("implementation %s", impl.id) - sr := impl.newFn([]byte{}) - obj := newObject(core.CommitObject, []byte{}) - - _, err := sr.RecallByHash(obj.Hash()) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - - err = rememberSomeObjects(sr) - c.Assert(err, IsNil) - - _, err = sr.RecallByHash(obj.Hash()) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - } -} - -func (s *ReadRecallerImplSuite) TestRecallByOffsetErrors(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - // seekalbe allways recall every object in the packfile - } { - com := Commentf("implementation %s", impl.id) - sr := impl.newFn([]byte{}) - - _, err := sr.RecallByOffset(15) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - - err = rememberSomeObjects(sr) - c.Assert(err, IsNil) - - _, err = sr.RecallByOffset(15) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - } -} - -func rememberSomeObjects(sr ReadRecaller) error { - for i, init := range [...]struct { - off int64 - obj core.Object - }{ - {off: 0, obj: newObject(core.CommitObject, []byte{'a'})}, // 93114cce67ec23976d15199514399203f69cc676 - {off: 10, obj: newObject(core.CommitObject, []byte{'b'})}, // 2bb767097e479f668f0ebdabe88df11337bd8f19 - {off: 20, obj: newObject(core.CommitObject, []byte{'c'})}, // 2f8096005677370e6446541a50e074299d43d468 - } { - err := sr.Remember(init.off, init.obj) - if err != nil { - return fmt.Errorf("cannot ask StreamReader to Remember item %d", i) - } - } - - return nil -} - -func (s *ReadRecallerImplSuite) TestForgetAll(c *C) { - for _, impl := range []struct { - id string - newFn implFn - }{ - {id: "stream", newFn: newStream}, - {id: "seekable", newFn: newSeekable}, - } { - com := Commentf("implementation %s", impl.id) - sr := impl.newFn([]byte{}) - - err := rememberSomeObjects(sr) - c.Assert(err, IsNil) - - sr.ForgetAll() - - if impl.id != "seekable" { // for efficiency, seekable always finds objects by offset - _, err = sr.RecallByOffset(0) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - _, err = sr.RecallByOffset(10) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - _, err = sr.RecallByOffset(20) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - } - _, err = sr.RecallByHash(core.NewHash("93114cce67ec23976d15199514399203f69cc676")) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - _, err = sr.RecallByHash(core.NewHash("2bb767097e479f668f0ebdabe88df11337bd8f19")) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - _, err = sr.RecallByHash(core.NewHash("2f8096005677370e6446541a50e074299d43d468")) - c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) - } -} diff --git a/formats/packfile/seekable.go b/formats/packfile/seekable.go deleted file mode 100644 index 65c8a69..0000000 --- a/formats/packfile/seekable.go +++ /dev/null @@ -1,109 +0,0 @@ -package packfile - -import ( - "io" - "os" - - "gopkg.in/src-d/go-git.v4/core" -) - -// Seekable implements ReadRecaller for the io.ReadSeeker of a packfile. -// Remembering does not actually stores any reference to the remembered -// objects; the object offset is remebered instead and the packfile is -// read again everytime a recall operation is requested. This saves -// memory buy can be very slow if the associated io.ReadSeeker is slow -// (like a hard disk). -type Seekable struct { - io.ReadSeeker - HashToOffset map[core.Hash]int64 -} - -// NewSeekable returns a new Seekable that reads form r. -func NewSeekable(r io.ReadSeeker) *Seekable { - return &Seekable{ - r, - make(map[core.Hash]int64), - } -} - -// Read reads up to len(p) bytes into p. -func (r *Seekable) Read(p []byte) (int, error) { - return r.ReadSeeker.Read(p) -} - -// ReadByte reads a byte. -func (r *Seekable) ReadByte() (byte, error) { - var p [1]byte - _, err := r.ReadSeeker.Read(p[:]) - if err != nil { - return 0, err - } - - return p[0], nil -} - -// Offset returns the offset for the next Read or ReadByte. -func (r *Seekable) Offset() (int64, error) { - return r.Seek(0, os.SEEK_CUR) -} - -// Remember stores the offset of the object and its hash, but not the -// object itself. This implementation does not check for already stored -// offsets, as it is too expensive to build this information from an -// index every time a get operation is performed on the SeekableReadRecaller. -func (r *Seekable) Remember(o int64, obj core.Object) error { - h := obj.Hash() - if _, ok := r.HashToOffset[h]; ok { - return ErrDuplicatedObject.AddDetails("with hash %s", h) - } - - r.HashToOffset[h] = o - - return nil -} - -// ForgetAll forgets all previously remembered objects. For efficiency -// reasons RecallByOffset always find objects, even if they have been -// forgetted or were never remembered. -func (r *Seekable) ForgetAll() { - r.HashToOffset = make(map[core.Hash]int64) -} - -// RecallByHash returns the object for a given hash by looking for it again in -// the io.ReadeSeerker. -func (r *Seekable) RecallByHash(h core.Hash) (core.Object, error) { - o, ok := r.HashToOffset[h] - if !ok { - return nil, ErrCannotRecall.AddDetails("hash not found: %s", h) - } - - return r.RecallByOffset(o) -} - -// RecallByOffset returns the object for a given offset by looking for it again in -// the io.ReadeSeerker. For efficiency reasons, this method always find objects by -// offset, even if they have not been remembered or if they have been forgetted. -func (r *Seekable) RecallByOffset(o int64) (obj core.Object, err error) { - // remember current offset - beforeJump, err := r.Offset() - if err != nil { - return nil, err - } - - defer func() { - // jump back - _, seekErr := r.Seek(beforeJump, os.SEEK_SET) - if err == nil { - err = seekErr - } - }() - - // jump to requested offset - _, err = r.Seek(o, os.SEEK_SET) - if err != nil { - return nil, err - } - - obj = &core.MemoryObject{} - return obj, NewParser(r).FillObject(obj) -} diff --git a/formats/packfile/stream.go b/formats/packfile/stream.go deleted file mode 100644 index 34ffd2f..0000000 --- a/formats/packfile/stream.go +++ /dev/null @@ -1,95 +0,0 @@ -package packfile - -import ( - "io" - - "gopkg.in/src-d/go-git.v4/core" -) - -// Stream implements ReadRecaller for the io.Reader of a packfile. This -// implementation keeps all remembered objects referenced in maps for -// quick access. -type Stream struct { - io.Reader - count int64 - offsetToObject map[int64]core.Object - hashToObject map[core.Hash]core.Object -} - -// NewStream returns a new Stream that reads form r. -func NewStream(r io.Reader) *Stream { - return &Stream{ - Reader: r, - count: 0, - hashToObject: make(map[core.Hash]core.Object, 0), - offsetToObject: make(map[int64]core.Object, 0), - } -} - -// Read reads up to len(p) bytes into p. -func (r *Stream) Read(p []byte) (n int, err error) { - n, err = r.Reader.Read(p) - r.count += int64(n) - - return -} - -// ReadByte reads a byte. -func (r *Stream) ReadByte() (byte, error) { - var p [1]byte - _, err := r.Reader.Read(p[:]) - r.count++ - - return p[0], err -} - -// Offset returns the number of bytes read. -func (r *Stream) Offset() (int64, error) { - return r.count, nil -} - -// Remember stores references to the passed object to be used later by -// RecalByHash and RecallByOffset. It receives the object and the offset -// of its object entry in the packfile. -func (r *Stream) Remember(o int64, obj core.Object) error { - h := obj.Hash() - if _, ok := r.hashToObject[h]; ok { - return ErrDuplicatedObject.AddDetails("with hash %s", h) - } - r.hashToObject[h] = obj - - if _, ok := r.offsetToObject[o]; ok { - return ErrDuplicatedObject.AddDetails("with offset %d", o) - } - r.offsetToObject[o] = obj - - return nil -} - -// ForgetAll forgets all previously remembered objects. -func (r *Stream) ForgetAll() { - r.hashToObject = make(map[core.Hash]core.Object) - r.offsetToObject = make(map[int64]core.Object) -} - -// RecallByHash returns an object that has been previously Remember-ed by -// its hash. -func (r *Stream) RecallByHash(h core.Hash) (core.Object, error) { - obj, ok := r.hashToObject[h] - if !ok { - return nil, ErrCannotRecall.AddDetails("by hash %s", h) - } - - return obj, nil -} - -// RecallByOffset returns an object that has been previously Remember-ed by -// the offset of its object entry in the packfile. -func (r *Stream) RecallByOffset(o int64) (core.Object, error) { - obj, ok := r.offsetToObject[o] - if !ok { - return nil, ErrCannotRecall.AddDetails("no object found at offset %d", o) - } - - return obj, nil -} -- cgit