From 58fe211f1b0e4863b425542d2fad15803276fd66 Mon Sep 17 00:00:00 2001 From: Máximo Cuadros Date: Sun, 11 Sep 2016 20:35:06 +0200 Subject: format: packfile fix ReadObjectAt without decode --- formats/packfile/decoder.go | 57 +++-- formats/packfile/decoder_test.go | 43 +++- formats/packfile/parser.go | 487 -------------------------------------- formats/packfile/parser_test.go | 168 ------------- formats/packfile/scanner.go | 493 +++++++++++++++++++++++++++++++++++++++ formats/packfile/scanner_test.go | 168 +++++++++++++ 6 files changed, 728 insertions(+), 688 deletions(-) delete mode 100644 formats/packfile/parser.go delete mode 100644 formats/packfile/parser_test.go create mode 100644 formats/packfile/scanner.go create mode 100644 formats/packfile/scanner_test.go (limited to 'formats/packfile') diff --git a/formats/packfile/decoder.go b/formats/packfile/decoder.go index 23a8e1a..c4b9182 100644 --- a/formats/packfile/decoder.go +++ b/formats/packfile/decoder.go @@ -43,8 +43,9 @@ type Decoder struct { o core.ObjectStorage tx core.TxObjectStorage - offsets map[int64]core.Hash - crcs map[core.Hash]uint32 + offsetToHash map[int64]core.Hash + hashToOffset map[core.Hash]int64 + crcs map[core.Hash]uint32 } // NewDecoder returns a new Decoder that reads from r. 
@@ -54,8 +55,9 @@ func NewDecoder(s *Scanner, o core.ObjectStorage) *Decoder { o: o, tx: o.Begin(), - offsets: make(map[int64]core.Hash, 0), - crcs: make(map[core.Hash]uint32, 0), + offsetToHash: make(map[int64]core.Hash, 0), + hashToOffset: make(map[core.Hash]int64, 0), + crcs: make(map[core.Hash]uint32, 0), } } @@ -82,11 +84,7 @@ func (d *Decoder) doDecode() error { return err } - if err := d.tx.Commit(); err != nil { - return err - } - - return nil + return d.tx.Commit() } func (d *Decoder) readObjects(count uint32) error { @@ -126,7 +124,9 @@ func (d *Decoder) ReadObject() (core.Object, error) { return obj, err } - d.remember(obj, h.Offset, crc) + hash := obj.Hash() + d.setOffset(hash, h.Offset) + d.setCRC(hash, crc) if _, err := d.tx.Set(obj); err != nil { return nil, err @@ -194,34 +194,45 @@ func (d *Decoder) fillOFSDeltaObjectContent(obj core.Object, offset int64) (uint return crc, ApplyDelta(obj, base, buf.Bytes()) } -func (d *Decoder) remember(obj core.Object, offset int64, crc uint32) { - h := obj.Hash() +func (d *Decoder) setOffset(h core.Hash, offset int64) { + d.offsetToHash[offset] = h + d.hashToOffset[h] = offset +} - d.offsets[offset] = h +func (d *Decoder) setCRC(h core.Hash, crc uint32) { d.crcs[h] = crc } func (d *Decoder) recallByOffset(o int64) (core.Object, error) { - h, ok := d.offsets[o] - if ok { - return d.recallByHash(h) + if h, ok := d.offsetToHash[o]; ok { + return d.tx.Get(core.AnyObject, h) } return d.ReadObjectAt(o) } func (d *Decoder) recallByHash(h core.Hash) (core.Object, error) { - return d.tx.Get(core.AnyObject, h) + obj, err := d.tx.Get(core.AnyObject, h) + if err != core.ErrObjectNotFound { + return obj, err + } + + if o, ok := d.hashToOffset[h]; ok { + return d.ReadObjectAt(o) + } + + return nil, core.ErrObjectNotFound +} + +// SetOffsets sets the offsets, required when using the method ReadObjectAt, +// without decoding the full packfile +func (d *Decoder) SetOffsets(offsets map[core.Hash]int64) { + d.hashToOffset = offsets 
} // Offsets returns the objects read offset func (d *Decoder) Offsets() map[core.Hash]int64 { - i := make(map[core.Hash]int64, len(d.offsets)) - for o, h := range d.offsets { - i[h] = o - } - - return i + return d.hashToOffset } // CRCs returns the CRC-32 for each objected read diff --git a/formats/packfile/decoder_test.go b/formats/packfile/decoder_test.go index 7baab44..d85f3bf 100644 --- a/formats/packfile/decoder_test.go +++ b/formats/packfile/decoder_test.go @@ -1,10 +1,12 @@ package packfile import ( + "io" "testing" "gopkg.in/src-d/go-git.v4/core" "gopkg.in/src-d/go-git.v4/fixtures" + "gopkg.in/src-d/go-git.v4/formats/idxfile" "gopkg.in/src-d/go-git.v4/storage/memory" . "gopkg.in/check.v1" @@ -68,7 +70,7 @@ func (s *ReaderSuite) TestDecode(c *C) { }) } func (s *ReaderSuite) TestDecodeCRCs(c *C) { - f := fixtures.Basic().ByTag("ofs-delta") + f := fixtures.Basic().ByTag("ofs-delta").One() scanner := NewScanner(f.Packfile()) storage := memory.NewStorage() @@ -86,18 +88,24 @@ func (s *ReaderSuite) TestDecodeCRCs(c *C) { } func (s *ReaderSuite) TestReadObjectAt(c *C) { - f := fixtures.Basic().One() + fixtures.Basic().Test(c, func(f *fixtures.Fixture) { + scanner := NewScanner(f.Packfile()) + storage := memory.NewStorage() - scanner := NewScanner(f.Packfile()) - storage := memory.NewStorage() + d := NewDecoder(scanner, storage.ObjectStorage()) - d := NewDecoder(scanner, storage.ObjectStorage()) + // when the packfile is ref-delta based, the offsets are required + if f.Is("ref-delta") { + offsets := getOffsetsFromIdx(f.Idx()) + d.SetOffsets(offsets) + } - // the objects at reference 186, is a delta, so should be recall, without - // being read before. - obj, err := d.ReadObjectAt(186) - c.Assert(err, IsNil) - c.Assert(obj.Hash().String(), Equals, "6ecf0ef2c2dffb796033e5a02219af86ec6584e5") + // the objects at reference 186, is a delta, so should be recall, + // without being read before. 
+ obj, err := d.ReadObjectAt(186) + c.Assert(err, IsNil) + c.Assert(obj.Hash().String(), Equals, "6ecf0ef2c2dffb796033e5a02219af86ec6584e5") + }) } func AssertObjects(c *C, s *memory.Storage, expects []string) { @@ -110,3 +118,18 @@ func AssertObjects(c *C, s *memory.Storage, expects []string) { c.Assert(obt.Hash().String(), Equals, exp) } } + +func getOffsetsFromIdx(r io.Reader) map[core.Hash]int64 { + idx := &idxfile.Idxfile{} + err := idxfile.NewDecoder(r).Decode(idx) + if err != nil { + panic(err) + } + + offsets := make(map[core.Hash]int64) + for _, e := range idx.Entries { + offsets[e.Hash] = int64(e.Offset) + } + + return offsets +} diff --git a/formats/packfile/parser.go b/formats/packfile/parser.go deleted file mode 100644 index 6fa2f42..0000000 --- a/formats/packfile/parser.go +++ /dev/null @@ -1,487 +0,0 @@ -package packfile - -import ( - "bufio" - "bytes" - "compress/zlib" - "encoding/binary" - "fmt" - "hash" - "hash/crc32" - "io" - "io/ioutil" - - "gopkg.in/src-d/go-git.v4/core" -) - -var ( - // ErrEmptyPackfile is returned by ReadHeader when no data is found in the packfile - ErrEmptyPackfile = NewError("empty packfile") - // ErrBadSignature is returned by ReadHeader when the signature in the packfile is incorrect. - ErrBadSignature = NewError("malformed pack file signature") - // ErrUnsupportedVersion is returned by ReadHeader when the packfile version is - // different than VersionSupported. - ErrUnsupportedVersion = NewError("unsupported packfile version") - // ErrSeekNotSupported returned if seek is not support - ErrSeekNotSupported = NewError("not seek support") -) - -const ( - // VersionSupported is the packfile version supported by this parser. - VersionSupported uint32 = 2 -) - -// ObjectHeader contains the information related to the object, this information -// is collected from the previous bytes to the content of the object. 
-type ObjectHeader struct { - Type core.ObjectType - Offset int64 - Length int64 - Reference core.Hash - OffsetReference int64 -} - -// A Parser is a collection of functions to read and process data form a packfile. -// Values from this type are not zero-value safe. See the NewParser function bellow. -type Scanner struct { - r reader - crc hash.Hash32 - - // pendingObject is used to detect if an object has been read, or still - // is waiting to be read - pendingObject *ObjectHeader - version, objects uint32 -} - -// NewParser returns a new Parser that reads from the packfile represented by r. -func NewScannerFromReader(r io.Reader) *Scanner { - s := &trackableReader{Reader: r} - return NewScanner(s) -} - -func NewScanner(r io.ReadSeeker) *Scanner { - crc := crc32.NewIEEE() - seeker := newByteReadSeeker(r) - tee := &teeReader{seeker, crc} - - return &Scanner{r: tee, crc: crc} -} - -// Header reads the whole packfile header (signature, version and object count). -// It returns the version and the object count and performs checks on the -// validity of the signature and the version fields. -func (s *Scanner) Header() (version, objects uint32, err error) { - if s.version != 0 { - return s.version, s.objects, nil - } - - sig, err := s.readSignature() - if err != nil { - if err == io.EOF { - err = ErrEmptyPackfile - } - - return - } - - if !s.isValidSignature(sig) { - err = ErrBadSignature - return - } - - version, err = s.readVersion() - s.version = version - if err != nil { - return - } - - if !s.isSupportedVersion(version) { - err = ErrUnsupportedVersion.AddDetails("%d", version) - return - } - - objects, err = s.readCount() - s.objects = objects - return -} - -// readSignature reads an returns the signature field in the packfile. 
-func (s *Scanner) readSignature() ([]byte, error) { - var sig = make([]byte, 4) - if _, err := io.ReadFull(s.r, sig); err != nil { - return []byte{}, err - } - - return sig, nil -} - -// isValidSignature returns if sig is a valid packfile signature. -func (s *Scanner) isValidSignature(sig []byte) bool { - return bytes.Equal(sig, []byte{'P', 'A', 'C', 'K'}) -} - -// readVersion reads and returns the version field of a packfile. -func (s *Scanner) readVersion() (uint32, error) { - return s.readInt32() -} - -// isSupportedVersion returns whether version v is supported by the parser. -// The current supported version is VersionSupported, defined above. -func (s *Scanner) isSupportedVersion(v uint32) bool { - return v == VersionSupported -} - -// readCount reads and returns the count of objects field of a packfile. -func (s *Scanner) readCount() (uint32, error) { - return s.readInt32() -} - -// ReadInt32 reads 4 bytes and returns them as a Big Endian int32. -func (s *Scanner) readInt32() (uint32, error) { - var v uint32 - if err := binary.Read(s.r, binary.BigEndian, &v); err != nil { - return 0, err - } - - return v, nil -} - -// NextObjectHeader returns the ObjectHeader for the next object in the reader -func (s *Scanner) NextObjectHeader() (*ObjectHeader, error) { - if err := s.doPending(); err != nil { - return nil, err - } - - s.crc.Reset() - - h := &ObjectHeader{} - s.pendingObject = h - - var err error - h.Offset, err = s.r.Seek(0, io.SeekCurrent) - if err != nil { - return nil, err - } - - h.Type, h.Length, err = s.readObjectTypeAndLength() - if err != nil { - return nil, err - } - - switch h.Type { - case core.OFSDeltaObject: - no, err := s.readNegativeOffset() - if err != nil { - return nil, err - } - - h.OffsetReference = h.Offset + no - case core.REFDeltaObject: - var err error - h.Reference, err = s.readHash() - if err != nil { - return nil, err - } - } - - return h, nil -} - -func (s *Scanner) doPending() error { - if s.version == 0 { - var err error - 
s.version, s.objects, err = s.Header() - if err != nil { - return err - } - } - - return s.discardObjectIfNeeded() -} - -func (s *Scanner) discardObjectIfNeeded() error { - if s.pendingObject == nil { - return nil - } - - h := s.pendingObject - n, _, err := s.NextObject(ioutil.Discard) - if err != nil { - return err - } - - if n != h.Length { - return fmt.Errorf( - "error discarding object, discarded %d, expected %d", - n, h.Length, - ) - } - - return nil -} - -// ReadObjectTypeAndLength reads and returns the object type and the -// length field from an object entry in a packfile. -func (s *Scanner) readObjectTypeAndLength() (core.ObjectType, int64, error) { - t, c, err := s.readType() - if err != nil { - return t, 0, err - } - - l, err := s.readLength(c) - - return t, l, err -} - -func (s *Scanner) readType() (core.ObjectType, byte, error) { - var c byte - var err error - if c, err = s.readByte(); err != nil { - return core.ObjectType(0), 0, err - } - - typ := parseType(c) - - return typ, c, nil -} - -// the length is codified in the last 4 bits of the first byte and in -// the last 7 bits of subsequent bytes. Last byte has a 0 MSB. -func (s *Scanner) readLength(first byte) (int64, error) { - length := int64(first & maskFirstLength) - - c := first - shift := firstLengthBits - var err error - for moreBytesInLength(c) { - if c, err = s.readByte(); err != nil { - return 0, err - } - - length += int64(c&maskLength) << shift - shift += lengthBits - } - - return length, nil -} - -func (s *Scanner) NextObject(w io.Writer) (written int64, crc32 uint32, err error) { - defer s.crc.Reset() - - s.pendingObject = nil - written, err = s.copyObject(w) - crc32 = s.crc.Sum32() - return -} - -// ReadRegularObject reads and write a non-deltified object -// from it zlib stream in an object entry in the packfile. 
-func (s *Scanner) copyObject(w io.Writer) (int64, error) { - zr, err := zlib.NewReader(s.r) - if err != nil { - return -1, fmt.Errorf("zlib reading error: %s", err) - } - - defer func() { - closeErr := zr.Close() - if err == nil { - err = closeErr - } - }() - - return io.Copy(w, zr) -} - -// Seek sets a new offset from start, returns the old position before the change -func (s *Scanner) Seek(offset int64) (previous int64, err error) { - // if seeking we asume that you are not interested on the header - if s.version == 0 { - s.version = VersionSupported - } - - previous, err = s.r.Seek(0, io.SeekCurrent) - if err != nil { - return -1, err - } - - _, err = s.r.Seek(offset, io.SeekStart) - return previous, err -} - -func (s *Scanner) Checksum() (core.Hash, error) { - err := s.discardObjectIfNeeded() - if err != nil { - return core.ZeroHash, err - } - - return s.readHash() -} - -// ReadHash reads a hash. -func (s *Scanner) readHash() (core.Hash, error) { - var h core.Hash - if _, err := io.ReadFull(s.r, h[:]); err != nil { - return core.ZeroHash, err - } - - return h, nil -} - -// ReadNegativeOffset reads and returns an offset from a OFS DELTA -// object entry in a packfile. OFS DELTA offsets are specified in Git -// VLQ special format: -// -// Ordinary VLQ has some redundancies, example: the number 358 can be -// encoded as the 2-octet VLQ 0x8166 or the 3-octet VLQ 0x808166 or the -// 4-octet VLQ 0x80808166 and so forth. -// -// To avoid these redundancies, the VLQ format used in Git removes this -// prepending redundancy and extends the representable range of shorter -// VLQs by adding an offset to VLQs of 2 or more octets in such a way -// that the lowest possible value for such an (N+1)-octet VLQ becomes -// exactly one more than the maximum possible value for an N-octet VLQ. -// In particular, since a 1-octet VLQ can store a maximum value of 127, -// the minimum 2-octet VLQ (0x8000) is assigned the value 128 instead of -// 0. 
Conversely, the maximum value of such a 2-octet VLQ (0xff7f) is -// 16511 instead of just 16383. Similarly, the minimum 3-octet VLQ -// (0x808000) has a value of 16512 instead of zero, which means -// that the maximum 3-octet VLQ (0xffff7f) is 2113663 instead of -// just 2097151. And so forth. -// -// This is how the offset is saved in C: -// -// dheader[pos] = ofs & 127; -// while (ofs >>= 7) -// dheader[--pos] = 128 | (--ofs & 127); -// -func (s *Scanner) readNegativeOffset() (int64, error) { - var c byte - var err error - - if c, err = s.readByte(); err != nil { - return 0, err - } - - var offset = int64(c & maskLength) - for moreBytesInLength(c) { - offset++ - if c, err = s.readByte(); err != nil { - return 0, err - } - offset = (offset << lengthBits) + int64(c&maskLength) - } - - return -offset, nil -} - -func (s *Scanner) readByte() (byte, error) { - b, err := s.r.ReadByte() - if err != nil { - return 0, err - } - - return b, err -} - -func (s *Scanner) Close() error { - _, err := io.Copy(ioutil.Discard, s.r) - return err -} - -func moreBytesInLength(c byte) bool { - return c&maskContinue > 0 -} - -var ( - maskContinue = uint8(128) // 1000 0000 - maskType = uint8(112) // 0111 0000 - maskFirstLength = uint8(15) // 0000 1111 - firstLengthBits = uint8(4) // the first byte has 4 bits to store the length - maskLength = uint8(127) // 0111 1111 - lengthBits = uint8(7) // subsequent bytes has 7 bits to store the length -) - -func parseType(b byte) core.ObjectType { - return core.ObjectType((b & maskType) >> firstLengthBits) -} - -type trackableReader struct { - count int64 - io.Reader -} - -// Read reads up to len(p) bytes into p. 
-func (r *trackableReader) Read(p []byte) (n int, err error) { - n, err = r.Reader.Read(p) - r.count += int64(n) - - return -} - -// Seek only supports io.SeekCurrent, any other operation fails -func (r *trackableReader) Seek(offset int64, whence int) (int64, error) { - if whence != io.SeekCurrent { - return -1, ErrSeekNotSupported - } - - return r.count, nil -} - -func newByteReadSeeker(r io.ReadSeeker) *bufferedSeeker { - return &bufferedSeeker{ - r: r, - Reader: *bufio.NewReader(r), - } -} - -type bufferedSeeker struct { - r io.ReadSeeker - bufio.Reader -} - -func (r *bufferedSeeker) Seek(offset int64, whence int) (int64, error) { - if whence == io.SeekCurrent { - current, err := r.r.Seek(offset, whence) - if err != nil { - return current, err - } - - return current - int64(r.Buffered()), nil - } - - defer r.Reader.Reset(r.r) - return r.r.Seek(offset, whence) -} - -type reader interface { - io.Reader - io.ByteReader - io.Seeker -} - -type teeReader struct { - reader - w hash.Hash32 -} - -func (r *teeReader) Read(p []byte) (n int, err error) { - n, err = r.reader.Read(p) - if n > 0 { - if n, err := r.w.Write(p[:n]); err != nil { - return n, err - } - } - return -} - -func (r *teeReader) ReadByte() (b byte, err error) { - b, err = r.reader.ReadByte() - if err == nil { - _, err := r.w.Write([]byte{b}) - if err != nil { - return 0, err - } - } - - return -} diff --git a/formats/packfile/parser_test.go b/formats/packfile/parser_test.go deleted file mode 100644 index 2ff2887..0000000 --- a/formats/packfile/parser_test.go +++ /dev/null @@ -1,168 +0,0 @@ -package packfile - -import ( - "bytes" - - . 
"gopkg.in/check.v1" - "gopkg.in/src-d/go-git.v4/core" - "gopkg.in/src-d/go-git.v4/fixtures" -) - -type ScannerSuite struct{} - -var _ = Suite(&ScannerSuite{}) - -func (s *ScannerSuite) SetUpSuite(c *C) { - fixtures.RootFolder = "../../fixtures" -} - -func (s *ScannerSuite) TestHeader(c *C) { - r := fixtures.Basic().One().Packfile() - p := NewScanner(r) - - version, objects, err := p.Header() - c.Assert(err, IsNil) - c.Assert(version, Equals, VersionSupported) - c.Assert(objects, Equals, uint32(31)) -} - -func (s *ScannerSuite) TestNextObjectHeaderWithoutHeader(c *C) { - r := fixtures.Basic().One().Packfile() - p := NewScanner(r) - - h, err := p.NextObjectHeader() - c.Assert(err, IsNil) - c.Assert(h, DeepEquals, &expectedHeadersOFS[0]) - - version, objects, err := p.Header() - c.Assert(err, IsNil) - c.Assert(version, Equals, VersionSupported) - c.Assert(objects, Equals, uint32(31)) -} - -func (s *ScannerSuite) TestNextObjectHeaderREFDelta(c *C) { - s.testNextObjectHeader(c, "ref-delta", expectedHeadersREF) -} - -func (s *ScannerSuite) TestNextObjectHeaderOFSDelta(c *C) { - s.testNextObjectHeader(c, "ofs-delta", expectedHeadersOFS) -} - -func (s *ScannerSuite) testNextObjectHeader(c *C, tag string, expected []ObjectHeader) { - r := fixtures.Basic().ByTag(tag).Packfile() - p := NewScanner(r) - - _, objects, err := p.Header() - c.Assert(err, IsNil) - - for i := 0; i < int(objects); i++ { - h, err := p.NextObjectHeader() - c.Assert(err, IsNil) - c.Assert(*h, DeepEquals, expected[i]) - - buf := bytes.NewBuffer(nil) - n, _, err := p.NextObject(buf) - c.Assert(err, IsNil) - c.Assert(n, Equals, h.Length) - } - - n, err := p.Checksum() - c.Assert(err, IsNil) - c.Assert(n, HasLen, 20) -} - -func (s *ScannerSuite) TestNextObjectHeaderWithOutReadObject(c *C) { - f := fixtures.Basic().ByTag("ref-delta") - r := f.Packfile() - p := NewScanner(r) - - _, objects, err := p.Header() - c.Assert(err, IsNil) - - for i := 0; i < int(objects); i++ { - h, _ := p.NextObjectHeader() - 
c.Assert(err, IsNil) - c.Assert(*h, DeepEquals, expectedHeadersREF[i]) - } - - err = p.discardObjectIfNeeded() - c.Assert(err, IsNil) - - n, err := p.Checksum() - c.Assert(err, IsNil) - c.Assert(n, Equals, f.PackfileHash) -} - -var expectedHeadersOFS = []ObjectHeader{ - {Type: core.CommitObject, Offset: 12, Length: 254}, - {Type: core.OFSDeltaObject, Offset: 186, Length: 93, OffsetReference: 12}, - {Type: core.CommitObject, Offset: 286, Length: 242}, - {Type: core.CommitObject, Offset: 449, Length: 242}, - {Type: core.CommitObject, Offset: 615, Length: 333}, - {Type: core.CommitObject, Offset: 838, Length: 332}, - {Type: core.CommitObject, Offset: 1063, Length: 244}, - {Type: core.CommitObject, Offset: 1230, Length: 243}, - {Type: core.CommitObject, Offset: 1392, Length: 187}, - {Type: core.BlobObject, Offset: 1524, Length: 189}, - {Type: core.BlobObject, Offset: 1685, Length: 18}, - {Type: core.BlobObject, Offset: 1713, Length: 1072}, - {Type: core.BlobObject, Offset: 2351, Length: 76110}, - {Type: core.BlobObject, Offset: 78050, Length: 2780}, - {Type: core.BlobObject, Offset: 78882, Length: 217848}, - {Type: core.BlobObject, Offset: 80725, Length: 706}, - {Type: core.BlobObject, Offset: 80998, Length: 11488}, - {Type: core.BlobObject, Offset: 84032, Length: 78}, - {Type: core.TreeObject, Offset: 84115, Length: 272}, - {Type: core.OFSDeltaObject, Offset: 84375, Length: 43, OffsetReference: 84115}, - {Type: core.TreeObject, Offset: 84430, Length: 38}, - {Type: core.TreeObject, Offset: 84479, Length: 75}, - {Type: core.TreeObject, Offset: 84559, Length: 38}, - {Type: core.TreeObject, Offset: 84608, Length: 34}, - {Type: core.BlobObject, Offset: 84653, Length: 9}, - {Type: core.OFSDeltaObject, Offset: 84671, Length: 6, OffsetReference: 84375}, - {Type: core.OFSDeltaObject, Offset: 84688, Length: 9, OffsetReference: 84375}, - {Type: core.OFSDeltaObject, Offset: 84708, Length: 6, OffsetReference: 84375}, - {Type: core.OFSDeltaObject, Offset: 84725, Length: 5, 
OffsetReference: 84115}, - {Type: core.OFSDeltaObject, Offset: 84741, Length: 8, OffsetReference: 84375}, - {Type: core.OFSDeltaObject, Offset: 84760, Length: 4, OffsetReference: 84741}, -} - -var expectedHeadersREF = []ObjectHeader{ - {Type: core.CommitObject, Offset: 12, Length: 254}, - {Type: core.REFDeltaObject, Offset: 186, Length: 93, - Reference: core.NewHash("e8d3ffab552895c19b9fcf7aa264d277cde33881")}, - {Type: core.CommitObject, Offset: 304, Length: 242}, - {Type: core.CommitObject, Offset: 467, Length: 242}, - {Type: core.CommitObject, Offset: 633, Length: 333}, - {Type: core.CommitObject, Offset: 856, Length: 332}, - {Type: core.CommitObject, Offset: 1081, Length: 243}, - {Type: core.CommitObject, Offset: 1243, Length: 244}, - {Type: core.CommitObject, Offset: 1410, Length: 187}, - {Type: core.BlobObject, Offset: 1542, Length: 189}, - {Type: core.BlobObject, Offset: 1703, Length: 18}, - {Type: core.BlobObject, Offset: 1731, Length: 1072}, - {Type: core.BlobObject, Offset: 2369, Length: 76110}, - {Type: core.TreeObject, Offset: 78068, Length: 38}, - {Type: core.BlobObject, Offset: 78117, Length: 2780}, - {Type: core.TreeObject, Offset: 79049, Length: 75}, - {Type: core.BlobObject, Offset: 79129, Length: 217848}, - {Type: core.BlobObject, Offset: 80972, Length: 706}, - {Type: core.TreeObject, Offset: 81265, Length: 38}, - {Type: core.BlobObject, Offset: 81314, Length: 11488}, - {Type: core.TreeObject, Offset: 84752, Length: 34}, - {Type: core.BlobObject, Offset: 84797, Length: 78}, - {Type: core.TreeObject, Offset: 84880, Length: 271}, - {Type: core.REFDeltaObject, Offset: 85141, Length: 6, - Reference: core.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c")}, - {Type: core.REFDeltaObject, Offset: 85176, Length: 37, - Reference: core.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")}, - {Type: core.BlobObject, Offset: 85244, Length: 9}, - {Type: core.REFDeltaObject, Offset: 85262, Length: 9, - Reference: 
core.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")}, - {Type: core.REFDeltaObject, Offset: 85300, Length: 6, - Reference: core.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")}, - {Type: core.TreeObject, Offset: 85335, Length: 110}, - {Type: core.REFDeltaObject, Offset: 85448, Length: 8, - Reference: core.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021")}, - {Type: core.TreeObject, Offset: 85485, Length: 73}, -} diff --git a/formats/packfile/scanner.go b/formats/packfile/scanner.go new file mode 100644 index 0000000..86092a1 --- /dev/null +++ b/formats/packfile/scanner.go @@ -0,0 +1,493 @@ +package packfile + +import ( + "bufio" + "bytes" + "compress/zlib" + "encoding/binary" + "fmt" + "hash" + "hash/crc32" + "io" + "io/ioutil" + + "gopkg.in/src-d/go-git.v4/core" +) + +var ( + // ErrEmptyPackfile is returned by ReadHeader when no data is found in the packfile + ErrEmptyPackfile = NewError("empty packfile") + // ErrBadSignature is returned by ReadHeader when the signature in the packfile is incorrect. + ErrBadSignature = NewError("malformed pack file signature") + // ErrUnsupportedVersion is returned by ReadHeader when the packfile version is + // different than VersionSupported. + ErrUnsupportedVersion = NewError("unsupported packfile version") + // ErrSeekNotSupported returned if seek is not support + ErrSeekNotSupported = NewError("not seek support") +) + +const ( + // VersionSupported is the packfile version supported by this parser. + VersionSupported uint32 = 2 +) + +// ObjectHeader contains the information related to the object, this information +// is collected from the previous bytes to the content of the object. 
+type ObjectHeader struct { + Type core.ObjectType + Offset int64 + Length int64 + Reference core.Hash + OffsetReference int64 +} + +type Scanner struct { + r reader + crc hash.Hash32 + + // pendingObject is used to detect if an object has been read, or still + // is waiting to be read + pendingObject *ObjectHeader + version, objects uint32 +} + +// NewScanner returns a new Scanner based on a reader, if the given reader +// implements io.ReadSeeker the Scanner will be also Seekable +func NewScanner(r io.Reader) *Scanner { + seeker, ok := r.(io.ReadSeeker) + if !ok { + seeker = &trackableReader{Reader: r} + } + + crc := crc32.NewIEEE() + return &Scanner{ + r: &teeReader{ + newByteReadSeeker(seeker), + crc, + }, + crc: crc, + } +} + +// Header reads the whole packfile header (signature, version and object count). +// It returns the version and the object count and performs checks on the +// validity of the signature and the version fields. +func (s *Scanner) Header() (version, objects uint32, err error) { + if s.version != 0 { + return s.version, s.objects, nil + } + + sig, err := s.readSignature() + if err != nil { + if err == io.EOF { + err = ErrEmptyPackfile + } + + return + } + + if !s.isValidSignature(sig) { + err = ErrBadSignature + return + } + + version, err = s.readVersion() + s.version = version + if err != nil { + return + } + + if !s.isSupportedVersion(version) { + err = ErrUnsupportedVersion.AddDetails("%d", version) + return + } + + objects, err = s.readCount() + s.objects = objects + return +} + +// readSignature reads an returns the signature field in the packfile. +func (s *Scanner) readSignature() ([]byte, error) { + var sig = make([]byte, 4) + if _, err := io.ReadFull(s.r, sig); err != nil { + return []byte{}, err + } + + return sig, nil +} + +// isValidSignature returns if sig is a valid packfile signature. 
+func (s *Scanner) isValidSignature(sig []byte) bool { + return bytes.Equal(sig, []byte{'P', 'A', 'C', 'K'}) +} + +// readVersion reads and returns the version field of a packfile. +func (s *Scanner) readVersion() (uint32, error) { + return s.readInt32() +} + +// isSupportedVersion returns whether version v is supported by the parser. +// The current supported version is VersionSupported, defined above. +func (s *Scanner) isSupportedVersion(v uint32) bool { + return v == VersionSupported +} + +// readCount reads and returns the count of objects field of a packfile. +func (s *Scanner) readCount() (uint32, error) { + return s.readInt32() +} + +// ReadInt32 reads 4 bytes and returns them as a Big Endian int32. +func (s *Scanner) readInt32() (uint32, error) { + var v uint32 + if err := binary.Read(s.r, binary.BigEndian, &v); err != nil { + return 0, err + } + + return v, nil +} + +// NextObjectHeader returns the ObjectHeader for the next object in the reader +func (s *Scanner) NextObjectHeader() (*ObjectHeader, error) { + if err := s.doPending(); err != nil { + return nil, err + } + + s.crc.Reset() + + h := &ObjectHeader{} + s.pendingObject = h + + var err error + h.Offset, err = s.r.Seek(0, io.SeekCurrent) + if err != nil { + return nil, err + } + + h.Type, h.Length, err = s.readObjectTypeAndLength() + if err != nil { + return nil, err + } + + switch h.Type { + case core.OFSDeltaObject: + no, err := s.readNegativeOffset() + if err != nil { + return nil, err + } + + h.OffsetReference = h.Offset + no + case core.REFDeltaObject: + var err error + h.Reference, err = s.readHash() + if err != nil { + return nil, err + } + } + + return h, nil +} + +func (s *Scanner) doPending() error { + if s.version == 0 { + var err error + s.version, s.objects, err = s.Header() + if err != nil { + return err + } + } + + return s.discardObjectIfNeeded() +} + +func (s *Scanner) discardObjectIfNeeded() error { + if s.pendingObject == nil { + return nil + } + + h := s.pendingObject + n, _, err 
:= s.NextObject(ioutil.Discard) + if err != nil { + return err + } + + if n != h.Length { + return fmt.Errorf( + "error discarding object, discarded %d, expected %d", + n, h.Length, + ) + } + + return nil +} + +// ReadObjectTypeAndLength reads and returns the object type and the +// length field from an object entry in a packfile. +func (s *Scanner) readObjectTypeAndLength() (core.ObjectType, int64, error) { + t, c, err := s.readType() + if err != nil { + return t, 0, err + } + + l, err := s.readLength(c) + + return t, l, err +} + +func (s *Scanner) readType() (core.ObjectType, byte, error) { + var c byte + var err error + if c, err = s.readByte(); err != nil { + return core.ObjectType(0), 0, err + } + + typ := parseType(c) + + return typ, c, nil +} + +// the length is codified in the last 4 bits of the first byte and in +// the last 7 bits of subsequent bytes. Last byte has a 0 MSB. +func (s *Scanner) readLength(first byte) (int64, error) { + length := int64(first & maskFirstLength) + + c := first + shift := firstLengthBits + var err error + for moreBytesInLength(c) { + if c, err = s.readByte(); err != nil { + return 0, err + } + + length += int64(c&maskLength) << shift + shift += lengthBits + } + + return length, nil +} + +// NextObject writes the content of the next object into the reader, returns +// the number of bytes written, the CRC32 of the content and an error, if any +func (s *Scanner) NextObject(w io.Writer) (written int64, crc32 uint32, err error) { + defer s.crc.Reset() + + s.pendingObject = nil + written, err = s.copyObject(w) + crc32 = s.crc.Sum32() + return +} + +// ReadRegularObject reads and write a non-deltified object +// from it zlib stream in an object entry in the packfile. 
+func (s *Scanner) copyObject(w io.Writer) (int64, error) { + zr, err := zlib.NewReader(s.r) + if err != nil { + return -1, fmt.Errorf("zlib reading error: %s", err) + } + + defer func() { + closeErr := zr.Close() + if err == nil { + err = closeErr + } + }() + + return io.Copy(w, zr) +} + +// Seek sets a new offset from start, returns the old position before the change +func (s *Scanner) Seek(offset int64) (previous int64, err error) { + // if seeking we asume that you are not interested on the header + if s.version == 0 { + s.version = VersionSupported + } + + previous, err = s.r.Seek(0, io.SeekCurrent) + if err != nil { + return -1, err + } + + _, err = s.r.Seek(offset, io.SeekStart) + return previous, err +} + +// Checksum returns the checksum of the packfile +func (s *Scanner) Checksum() (core.Hash, error) { + err := s.discardObjectIfNeeded() + if err != nil { + return core.ZeroHash, err + } + + return s.readHash() +} + +// ReadHash reads a hash. +func (s *Scanner) readHash() (core.Hash, error) { + var h core.Hash + if _, err := io.ReadFull(s.r, h[:]); err != nil { + return core.ZeroHash, err + } + + return h, nil +} + +// ReadNegativeOffset reads and returns an offset from a OFS DELTA +// object entry in a packfile. OFS DELTA offsets are specified in Git +// VLQ special format: +// +// Ordinary VLQ has some redundancies, example: the number 358 can be +// encoded as the 2-octet VLQ 0x8166 or the 3-octet VLQ 0x808166 or the +// 4-octet VLQ 0x80808166 and so forth. +// +// To avoid these redundancies, the VLQ format used in Git removes this +// prepending redundancy and extends the representable range of shorter +// VLQs by adding an offset to VLQs of 2 or more octets in such a way +// that the lowest possible value for such an (N+1)-octet VLQ becomes +// exactly one more than the maximum possible value for an N-octet VLQ. 
+// In particular, since a 1-octet VLQ can store a maximum value of 127, +// the minimum 2-octet VLQ (0x8000) is assigned the value 128 instead of +// 0. Conversely, the maximum value of such a 2-octet VLQ (0xff7f) is +// 16511 instead of just 16383. Similarly, the minimum 3-octet VLQ +// (0x808000) has a value of 16512 instead of zero, which means +// that the maximum 3-octet VLQ (0xffff7f) is 2113663 instead of +// just 2097151. And so forth. +// +// This is how the offset is saved in C: +// +// dheader[pos] = ofs & 127; +// while (ofs >>= 7) +// dheader[--pos] = 128 | (--ofs & 127); +// +func (s *Scanner) readNegativeOffset() (int64, error) { + var c byte + var err error + + if c, err = s.readByte(); err != nil { + return 0, err + } + + var offset = int64(c & maskLength) + for moreBytesInLength(c) { + offset++ + if c, err = s.readByte(); err != nil { + return 0, err + } + offset = (offset << lengthBits) + int64(c&maskLength) + } + + return -offset, nil +} + +func (s *Scanner) readByte() (byte, error) { + b, err := s.r.ReadByte() + if err != nil { + return 0, err + } + + return b, err +} + +// Close reads the reader until io.EOF +func (s *Scanner) Close() error { + _, err := io.Copy(ioutil.Discard, s.r) + return err +} + +func moreBytesInLength(c byte) bool { + return c&maskContinue > 0 +} + +var ( + maskContinue = uint8(128) // 1000 0000 + maskType = uint8(112) // 0111 0000 + maskFirstLength = uint8(15) // 0000 1111 + firstLengthBits = uint8(4) // the first byte has 4 bits to store the length + maskLength = uint8(127) // 0111 1111 + lengthBits = uint8(7) // subsequent bytes has 7 bits to store the length +) + +func parseType(b byte) core.ObjectType { + return core.ObjectType((b & maskType) >> firstLengthBits) +} + +type trackableReader struct { + count int64 + io.Reader +} + +// Read reads up to len(p) bytes into p. 
+func (r *trackableReader) Read(p []byte) (n int, err error) { + n, err = r.Reader.Read(p) + r.count += int64(n) + + return +} + +// Seek only supports io.SeekCurrent, any other operation fails +func (r *trackableReader) Seek(offset int64, whence int) (int64, error) { + if whence != io.SeekCurrent { + return -1, ErrSeekNotSupported + } + + return r.count, nil +} + +func newByteReadSeeker(r io.ReadSeeker) *bufferedSeeker { + return &bufferedSeeker{ + r: r, + Reader: *bufio.NewReader(r), + } +} + +type bufferedSeeker struct { + r io.ReadSeeker + bufio.Reader +} + +func (r *bufferedSeeker) Seek(offset int64, whence int) (int64, error) { + if whence == io.SeekCurrent { + current, err := r.r.Seek(offset, whence) + if err != nil { + return current, err + } + + return current - int64(r.Buffered()), nil + } + + defer r.Reader.Reset(r.r) + return r.r.Seek(offset, whence) +} + +type reader interface { + io.Reader + io.ByteReader + io.Seeker +} + +type teeReader struct { + reader + w hash.Hash32 +} + +func (r *teeReader) Read(p []byte) (n int, err error) { + n, err = r.reader.Read(p) + if n > 0 { + if n, err := r.w.Write(p[:n]); err != nil { + return n, err + } + } + return +} + +func (r *teeReader) ReadByte() (b byte, err error) { + b, err = r.reader.ReadByte() + if err == nil { + _, err := r.w.Write([]byte{b}) + if err != nil { + return 0, err + } + } + + return +} diff --git a/formats/packfile/scanner_test.go b/formats/packfile/scanner_test.go new file mode 100644 index 0000000..6161fdb --- /dev/null +++ b/formats/packfile/scanner_test.go @@ -0,0 +1,168 @@ +package packfile + +import ( + "bytes" + + . 
"gopkg.in/check.v1" + "gopkg.in/src-d/go-git.v4/core" + "gopkg.in/src-d/go-git.v4/fixtures" +)

// ScannerSuite groups the tests of the packfile Scanner.
type ScannerSuite struct{}

var _ = Suite(&ScannerSuite{})

// SetUpSuite points the fixtures package to the repository fixtures folder.
func (s *ScannerSuite) SetUpSuite(c *C) {
	fixtures.RootFolder = "../../fixtures"
}

// TestHeader checks the version and object count read from the header.
func (s *ScannerSuite) TestHeader(c *C) {
	r := fixtures.Basic().One().Packfile()
	p := NewScanner(r)

	version, objects, err := p.Header()
	c.Assert(err, IsNil)
	c.Assert(version, Equals, VersionSupported)
	c.Assert(objects, Equals, uint32(31))
}

// TestNextObjectHeaderWithoutHeader checks that the first object header can
// be requested before Header is called explicitly, and that Header still
// reports the expected values afterwards.
func (s *ScannerSuite) TestNextObjectHeaderWithoutHeader(c *C) {
	r := fixtures.Basic().One().Packfile()
	p := NewScanner(r)

	h, err := p.NextObjectHeader()
	c.Assert(err, IsNil)
	c.Assert(h, DeepEquals, &expectedHeadersOFS[0])

	version, objects, err := p.Header()
	c.Assert(err, IsNil)
	c.Assert(version, Equals, VersionSupported)
	c.Assert(objects, Equals, uint32(31))
}

func (s *ScannerSuite) TestNextObjectHeaderREFDelta(c *C) {
	s.testNextObjectHeader(c, "ref-delta", expectedHeadersREF)
}

func (s *ScannerSuite) TestNextObjectHeaderOFSDelta(c *C) {
	s.testNextObjectHeader(c, "ofs-delta", expectedHeadersOFS)
}

// testNextObjectHeader walks the packfile fixture selected by tag, reading
// each object header and its content, and compares every header against
// expected. It finishes by reading the packfile checksum.
func (s *ScannerSuite) testNextObjectHeader(c *C, tag string, expected []ObjectHeader) {
	r := fixtures.Basic().ByTag(tag).One().Packfile()
	p := NewScanner(r)

	_, objects, err := p.Header()
	c.Assert(err, IsNil)

	for i := 0; i < int(objects); i++ {
		h, err := p.NextObjectHeader()
		c.Assert(err, IsNil)
		c.Assert(*h, DeepEquals, expected[i])

		buf := bytes.NewBuffer(nil)
		n, _, err := p.NextObject(buf)
		c.Assert(err, IsNil)
		c.Assert(n, Equals, h.Length)
	}

	n, err := p.Checksum()
	c.Assert(err, IsNil)
	c.Assert(n, HasLen, 20)
}

// TestNextObjectHeaderWithOutReadObject reads every object header without
// reading the object contents in between, then checks the packfile checksum.
func (s *ScannerSuite) TestNextObjectHeaderWithOutReadObject(c *C) {
	f := fixtures.Basic().ByTag("ref-delta").One()
	r := f.Packfile()
	p := NewScanner(r)

	_, objects, err := p.Header()
	c.Assert(err, IsNil)

	for i := 0; i < int(objects); i++ {
		// The error used to be discarded here ("h, _ :="), so the assertion
		// below was checking the stale error returned by Header.
		h, err := p.NextObjectHeader()
		c.Assert(err, IsNil)
		c.Assert(*h, DeepEquals, expectedHeadersREF[i])
	}

	err = p.discardObjectIfNeeded()
	c.Assert(err, IsNil)

	n, err := p.Checksum()
	c.Assert(err, IsNil)
	c.Assert(n, Equals, f.PackfileHash)
}

// expectedHeadersOFS holds the object headers expected, in order, from the
// fixture packfile tagged "ofs-delta".
var expectedHeadersOFS = []ObjectHeader{
	{Type: core.CommitObject, Offset: 12, Length: 254},
	{Type: core.OFSDeltaObject, Offset: 186, Length: 93, OffsetReference: 12},
	{Type: core.CommitObject, Offset: 286, Length: 242},
	{Type: core.CommitObject, Offset: 449, Length: 242},
	{Type: core.CommitObject, Offset: 615, Length: 333},
	{Type: core.CommitObject, Offset: 838, Length: 332},
	{Type: core.CommitObject, Offset: 1063, Length: 244},
	{Type: core.CommitObject, Offset: 1230, Length: 243},
	{Type: core.CommitObject, Offset: 1392, Length: 187},
	{Type: core.BlobObject, Offset: 1524, Length: 189},
	{Type: core.BlobObject, Offset: 1685, Length: 18},
	{Type: core.BlobObject, Offset: 1713, Length: 1072},
	{Type: core.BlobObject, Offset: 2351, Length: 76110},
	{Type: core.BlobObject, Offset: 78050, Length: 2780},
	{Type: core.BlobObject, Offset: 78882, Length: 217848},
	{Type: core.BlobObject, Offset: 80725, Length: 706},
	{Type: core.BlobObject, Offset: 80998, Length: 11488},
	{Type: core.BlobObject, Offset: 84032, Length: 78},
	{Type: core.TreeObject, Offset: 84115, Length: 272},
	{Type: core.OFSDeltaObject, Offset: 84375, Length: 43, OffsetReference: 84115},
	{Type: core.TreeObject, Offset: 84430, Length: 38},
	{Type: core.TreeObject, Offset: 84479, Length: 75},
	{Type: core.TreeObject, Offset: 84559, Length: 38},
	{Type: core.TreeObject, Offset: 84608, Length: 34},
	{Type: core.BlobObject, Offset: 84653, Length: 9},
	{Type: core.OFSDeltaObject, Offset: 84671, Length: 6, OffsetReference: 84375},
	{Type: core.OFSDeltaObject, Offset: 84688, Length: 9, OffsetReference: 84375},
	{Type: core.OFSDeltaObject, Offset: 84708, Length: 6, OffsetReference: 84375},
	{Type: core.OFSDeltaObject, Offset: 84725, Length: 5, OffsetReference: 84115},
	{Type: core.OFSDeltaObject, Offset: 84741, Length: 8, OffsetReference: 84375},
	{Type: core.OFSDeltaObject, Offset: 84760, Length: 4, OffsetReference: 84741},
}

// expectedHeadersREF holds the object headers expected, in order, from the
// fixture packfile tagged "ref-delta".
var expectedHeadersREF = []ObjectHeader{
	{Type: core.CommitObject, Offset: 12, Length: 254},
	{Type: core.REFDeltaObject, Offset: 186, Length: 93,
		Reference: core.NewHash("e8d3ffab552895c19b9fcf7aa264d277cde33881")},
	{Type: core.CommitObject, Offset: 304, Length: 242},
	{Type: core.CommitObject, Offset: 467, Length: 242},
	{Type: core.CommitObject, Offset: 633, Length: 333},
	{Type: core.CommitObject, Offset: 856, Length: 332},
	{Type: core.CommitObject, Offset: 1081, Length: 243},
	{Type: core.CommitObject, Offset: 1243, Length: 244},
	{Type: core.CommitObject, Offset: 1410, Length: 187},
	{Type: core.BlobObject, Offset: 1542, Length: 189},
	{Type: core.BlobObject, Offset: 1703, Length: 18},
	{Type: core.BlobObject, Offset: 1731, Length: 1072},
	{Type: core.BlobObject, Offset: 2369, Length: 76110},
	{Type: core.TreeObject, Offset: 78068, Length: 38},
	{Type: core.BlobObject, Offset: 78117, Length: 2780},
	{Type: core.TreeObject, Offset: 79049, Length: 75},
	{Type: core.BlobObject, Offset: 79129, Length: 217848},
	{Type: core.BlobObject, Offset: 80972, Length: 706},
	{Type: core.TreeObject, Offset: 81265, Length: 38},
	{Type: core.BlobObject, Offset: 81314, Length: 11488},
	{Type: core.TreeObject, Offset: 84752, Length: 34},
	{Type: core.BlobObject, Offset: 84797, Length: 78},
	{Type: core.TreeObject, Offset: 84880, Length: 271},
	{Type: core.REFDeltaObject, Offset: 85141, Length: 6,
		Reference: core.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c")},
	{Type: core.REFDeltaObject, Offset: 85176, Length: 37,
		Reference: core.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")},
	{Type: core.BlobObject, Offset: 85244, Length: 9},
	{Type: core.REFDeltaObject, Offset: 85262, Length: 9,
		Reference: core.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")},
	{Type: core.REFDeltaObject, Offset: 85300, Length: 6,
		Reference: core.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")},
	{Type: core.TreeObject, Offset: 85335, Length: 110},
	{Type: core.REFDeltaObject, Offset: 85448, Length: 8,
		Reference: core.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021")},
	{Type: core.TreeObject, Offset: 85485, Length: 73},
}
-- cgit