author     Máximo Cuadros <mcuadros@gmail.com>    2016-11-08 23:46:38 +0100
committer  GitHub <noreply@github.com>            2016-11-08 23:46:38 +0100
commit     ac095bb12c4d29722b60ba9f20590fa7cfa6bc7d (patch)
tree       223f36f336ba3414b1e45cac8af6c4744a5d7ef6 /plumbing/format/packfile
parent     e523701393598f4fa241dd407af9ff8925507a1a (diff)
download   go-git-ac095bb12c4d29722b60ba9f20590fa7cfa6bc7d.tar.gz
new plumbing package (#118)
* plumbing: core was renamed to plumbing, and formats and clients moved inside
Diffstat (limited to 'plumbing/format/packfile')
-rw-r--r--   plumbing/format/packfile/decoder.go        307
-rw-r--r--   plumbing/format/packfile/decoder_test.go   182
-rw-r--r--   plumbing/format/packfile/delta.go          181
-rw-r--r--   plumbing/format/packfile/doc.go            168
-rw-r--r--   plumbing/format/packfile/error.go           30
-rw-r--r--   plumbing/format/packfile/scanner.go        418
-rw-r--r--   plumbing/format/packfile/scanner_test.go   189
7 files changed, 1475 insertions, 0 deletions
diff --git a/plumbing/format/packfile/decoder.go b/plumbing/format/packfile/decoder.go
new file mode 100644
index 0000000..470e59b
--- /dev/null
+++ b/plumbing/format/packfile/decoder.go
@@ -0,0 +1,307 @@
+package packfile
+
+import (
+	"bytes"
+
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/storer"
+)
+
+// Format specifies if the packfile uses ref-deltas or ofs-deltas.
+type Format int
+
+// Possible values of the Format type.
+const (
+	UnknownFormat Format = iota
+	OFSDeltaFormat
+	REFDeltaFormat
+)
+
+var (
+	// ErrMaxObjectsLimitReached is returned by Decode when the number
+	// of objects in the packfile is higher than
+	// Decoder.MaxObjectsLimit.
+	ErrMaxObjectsLimitReached = NewError("max. objects limit reached")
+	// ErrInvalidObject is returned by Decode when an invalid object is
+	// found in the packfile.
+	ErrInvalidObject = NewError("invalid git object")
+	// ErrPackEntryNotFound is returned by Decode when a reference in
+	// the packfile references an unknown object.
+	ErrPackEntryNotFound = NewError("can't find a pack entry")
+	// ErrZLib is returned by Decode when there was an error unzipping
+	// the packfile contents.
+	ErrZLib = NewError("zlib reading error")
+	// ErrCannotRecall is returned by RecallByOffset or RecallByHash if the
+	// object to recall cannot be returned.
+	ErrCannotRecall = NewError("cannot recall object")
+	// ErrNonSeekable is returned if NewDecoder is called with a
+	// non-seekable scanner and no storer.ObjectStorer, or if ReadObjectAt
+	// is called without a seekable scanner.
+	ErrNonSeekable = NewError("non-seekable scanner")
+	// ErrRollback is returned when rolling back a transaction, after a
+	// failed SetObject, also fails.
+	ErrRollback = NewError("rollback error, during set error")
+)
+
+// Decoder reads and decodes packfiles from an input stream.
+type Decoder struct {
+	s  *Scanner
+	o  storer.ObjectStorer
+	tx storer.Transaction
+
+	offsetToHash map[int64]plumbing.Hash
+	hashToOffset map[plumbing.Hash]int64
+	crcs         map[plumbing.Hash]uint32
+}
+
+// NewDecoder returns a new Decoder that reads from s and stores the decoded
+// objects in o. A nil o is only allowed when s is seekable.
+func NewDecoder(s *Scanner, o storer.ObjectStorer) (*Decoder, error) {
+	if !s.IsSeekable && o == nil {
+		return nil, ErrNonSeekable
+	}
+
+	return &Decoder{
+		s: s,
+		o: o,
+
+		offsetToHash: make(map[int64]plumbing.Hash),
+		hashToOffset: make(map[plumbing.Hash]int64),
+		crcs:         make(map[plumbing.Hash]uint32),
+	}, nil
+}
+
+// Decode reads the whole packfile, stores the decoded objects in the
+// Decoder's ObjectStorer (if any) and returns the packfile checksum.
+func (d *Decoder) Decode() (checksum plumbing.Hash, err error) {
+	if err := d.doDecode(); err != nil {
+		return plumbing.ZeroHash, err
+	}
+
+	return d.s.Checksum()
+}
+
+func (d *Decoder) doDecode() error {
+	_, count, err := d.s.Header()
+	if err != nil {
+		return err
+	}
+
+	_, isTxStorer := d.o.(storer.Transactioner)
+	switch {
+	case d.o == nil:
+		return d.readObjects(int(count))
+	case isTxStorer:
+		return d.readObjectsWithObjectStorerTx(int(count))
+	default:
+		return d.readObjectsWithObjectStorer(int(count))
+	}
+}
+
+func (d *Decoder) readObjects(count int) error {
+	for i := 0; i < count; i++ {
+		if _, err := d.ReadObject(); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *Decoder) readObjectsWithObjectStorer(count int) error {
+	for i := 0; i < count; i++ {
+		obj, err := d.ReadObject()
+		if err != nil {
+			return err
+		}
+
+		if _, err := d.o.SetObject(obj); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (d *Decoder) readObjectsWithObjectStorerTx(count int) error {
+	d.tx = d.o.(storer.Transactioner).Begin()
+
+	for i := 0; i < count; i++ {
+		obj, err := d.ReadObject()
+		if err != nil {
+			return err
+		}
+
+		if _, err := d.tx.SetObject(obj); err != nil {
+			if rerr := d.tx.Rollback(); rerr != nil {
+				return ErrRollback.AddDetails(
+					"error: %s, during tx.Set error: %s", rerr, err,
+				)
+			}
+
+			return err
+		}
+	}
+
+	return d.tx.Commit()
+}
+
+// ReadObject reads an object from the stream and returns it.
+func (d *Decoder) ReadObject() (plumbing.Object, error) {
+	h, err := d.s.NextObjectHeader()
+	if err != nil {
+		return nil, err
+	}
+
+	obj := d.newObject()
+	obj.SetSize(h.Length)
+	obj.SetType(h.Type)
+
+	var crc uint32
+	switch h.Type {
+	case plumbing.CommitObject, plumbing.TreeObject, plumbing.BlobObject, plumbing.TagObject:
+		crc, err = d.fillRegularObjectContent(obj)
+	case plumbing.REFDeltaObject:
+		crc, err = d.fillREFDeltaObjectContent(obj, h.Reference)
+	case plumbing.OFSDeltaObject:
+		crc, err = d.fillOFSDeltaObjectContent(obj, h.OffsetReference)
+	default:
+		err = ErrInvalidObject.AddDetails("type %q", h.Type)
+	}
+
+	if err != nil {
+		return obj, err
+	}
+
+	hash := obj.Hash()
+	d.setOffset(hash, h.Offset)
+	d.setCRC(hash, crc)
+
+	return obj, nil
+}
+
+func (d *Decoder) newObject() plumbing.Object {
+	if d.o == nil {
+		return &plumbing.MemoryObject{}
+	}
+
+	return d.o.NewObject()
+}
+
+// ReadObjectAt reads an object at the given offset, restoring the previous
+// read position afterwards. It requires a seekable scanner.
+func (d *Decoder) ReadObjectAt(offset int64) (obj plumbing.Object, err error) {
+	if !d.s.IsSeekable {
+		return nil, ErrNonSeekable
+	}
+
+	beforeJump, err := d.s.Seek(offset)
+	if err != nil {
+		return nil, err
+	}
+
+	defer func() {
+		_, seekErr := d.s.Seek(beforeJump)
+		if err == nil {
+			err = seekErr
+		}
+	}()
+
+	return d.ReadObject()
+}
+
+func (d *Decoder) fillRegularObjectContent(obj plumbing.Object) (uint32, error) {
+	w, err := obj.Writer()
+	if err != nil {
+		return 0, err
+	}
+
+	_, crc, err := d.s.NextObject(w)
+	return crc, err
+}
+
+func (d *Decoder) fillREFDeltaObjectContent(obj plumbing.Object, ref plumbing.Hash) (uint32, error) {
+	buf := bytes.NewBuffer(nil)
+	_, crc, err := d.s.NextObject(buf)
+	if err != nil {
+		return 0, err
+	}
+
+	base, err := d.recallByHash(ref)
+	if err != nil {
+		return 0, err
+	}
+
+	obj.SetType(base.Type())
+	return crc, ApplyDelta(obj, base, buf.Bytes())
+}
+
+func (d *Decoder) fillOFSDeltaObjectContent(obj plumbing.Object, offset int64) (uint32, error) {
+	buf := bytes.NewBuffer(nil)
+	_, crc, err := d.s.NextObject(buf)
+	if err != nil {
+		return 0, err
+	}
+
+	base, err := d.recallByOffset(offset)
+	if err != nil {
+		return 0, err
+	}
+
+	obj.SetType(base.Type())
+	return crc, ApplyDelta(obj, base, buf.Bytes())
+}
+
+func (d *Decoder) setOffset(h plumbing.Hash, offset int64) {
+	d.offsetToHash[offset] = h
+	d.hashToOffset[h] = offset
+}
+
+func (d *Decoder) setCRC(h plumbing.Hash, crc uint32) {
+	d.crcs[h] = crc
+}
+
+func (d *Decoder) recallByOffset(o int64) (plumbing.Object, error) {
+	if d.s.IsSeekable {
+		return d.ReadObjectAt(o)
+	}
+
+	if h, ok := d.offsetToHash[o]; ok {
+		return d.tx.Object(plumbing.AnyObject, h)
+	}
+
+	return nil, plumbing.ErrObjectNotFound
+}
+
+func (d *Decoder) recallByHash(h plumbing.Hash) (plumbing.Object, error) {
+	if d.s.IsSeekable {
+		if o, ok := d.hashToOffset[h]; ok {
+			return d.ReadObjectAt(o)
+		}
+	}
+
+	obj, err := d.tx.Object(plumbing.AnyObject, h)
+	if err != plumbing.ErrObjectNotFound {
+		return obj, err
+	}
+
+	return nil, plumbing.ErrObjectNotFound
+}
+
+// SetOffsets sets the hash-to-offset map, required to use ReadObjectAt
+// without decoding the whole packfile first.
+func (d *Decoder) SetOffsets(offsets map[plumbing.Hash]int64) {
+	d.hashToOffset = offsets
+}
+
+// Offsets returns the offset of each object read.
+func (d *Decoder) Offsets() map[plumbing.Hash]int64 {
+	return d.hashToOffset
+}
+
+// CRCs returns the CRC-32 of each object read.
+func (d *Decoder) CRCs() map[plumbing.Hash]uint32 {
+	return d.crcs
+}
+
+// Close closes the Scanner; usually this means the rest of the reader is
+// read and discarded.
+func (d *Decoder) Close() error {
+	return d.s.Close()
+}
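A minimal sketch of how Scanner and Decoder fit together, decoding a whole packfile from disk into the in-memory storer that the tests below use. This is not part of the commit, and the pack path is hypothetical:

	package main

	import (
		"fmt"
		"os"

		"gopkg.in/src-d/go-git.v4/plumbing/format/packfile"
		"gopkg.in/src-d/go-git.v4/storage/memory"
	)

	func main() {
		f, err := os.Open("objects/pack/pack-example.pack") // hypothetical path
		if err != nil {
			panic(err)
		}
		defer f.Close()

		// Decode every object into an in-memory storer; with a nil storer
		// the scanner would need to be seekable instead.
		storage := memory.NewStorage()
		d, err := packfile.NewDecoder(packfile.NewScanner(f), storage)
		if err != nil {
			panic(err)
		}
		defer d.Close()

		checksum, err := d.Decode()
		if err != nil {
			panic(err)
		}

		fmt.Println("packfile checksum:", checksum)
		fmt.Println("objects decoded:", len(storage.Objects))
	}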
diff --git a/plumbing/format/packfile/decoder_test.go b/plumbing/format/packfile/decoder_test.go
new file mode 100644
index 0000000..e510cf2
--- /dev/null
+++ b/plumbing/format/packfile/decoder_test.go
@@ -0,0 +1,182 @@
+package packfile
+
+import (
+	"io"
+	"testing"
+
+	"gopkg.in/src-d/go-git.v4/fixtures"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/format/idxfile"
+	"gopkg.in/src-d/go-git.v4/storage/memory"
+
+	. "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) { TestingT(t) }
+
+type ReaderSuite struct {
+	fixtures.Suite
+}
+
+var _ = Suite(&ReaderSuite{})
+
+func (s *ReaderSuite) TestNewDecodeNonSeekable(c *C) {
+	scanner := NewScanner(nil)
+	d, err := NewDecoder(scanner, nil)
+
+	c.Assert(d, IsNil)
+	c.Assert(err, NotNil)
+}
+
+func (s *ReaderSuite) TestDecode(c *C) {
+	fixtures.Basic().ByTag("packfile").Test(c, func(f *fixtures.Fixture) {
+		scanner := NewScanner(f.Packfile())
+		storage := memory.NewStorage()
+
+		d, err := NewDecoder(scanner, storage)
+		c.Assert(err, IsNil)
+		defer d.Close()
+
+		ch, err := d.Decode()
+		c.Assert(err, IsNil)
+		c.Assert(ch, Equals, f.PackfileHash)
+
+		assertObjects(c, storage, expectedHashes)
+	})
+}
+
+func (s *ReaderSuite) TestDecodeInMemory(c *C) {
+	fixtures.Basic().ByTag("packfile").Test(c, func(f *fixtures.Fixture) {
+		scanner := NewScanner(f.Packfile())
+		d, err := NewDecoder(scanner, nil)
+		c.Assert(err, IsNil)
+
+		ch, err := d.Decode()
+		c.Assert(err, IsNil)
+		c.Assert(ch, Equals, f.PackfileHash)
+	})
+}
+
+var expectedHashes = []string{
+	"918c48b83bd081e863dbe1b80f8998f058cd8294",
+	"af2d6a6954d532f8ffb47615169c8fdf9d383a1a",
+	"1669dce138d9b841a518c64b10914d88f5e488ea",
+	"a5b8b09e2f8fcb0bb99d3ccb0958157b40890d69",
+	"b8e471f58bcbca63b07bda20e428190409c2db47",
+	"35e85108805c84807bc66a02d91535e1e24b38b9",
+	"b029517f6300c2da0f4b651b8642506cd6aaf45d",
+	"32858aad3c383ed1ff0a0f9bdf231d54a00c9e88",
+	"d3ff53e0564a9f87d8e84b6e28e5060e517008aa",
+	"c192bd6a24ea1ab01d78686e417c8bdc7c3d197f",
+	"d5c0f4ab811897cadf03aec358ae60d21f91c50d",
+	"49c6bb89b17060d7b4deacb7b338fcc6ea2352a9",
+	"cf4aa3b38974fb7d81f367c0830f7d78d65ab86b",
+	"9dea2395f5403188298c1dabe8bdafe562c491e3",
+	"586af567d0bb5e771e49bdd9434f5e0fb76d25fa",
+	"9a48f23120e880dfbe41f7c9b7b708e9ee62a492",
+	"5a877e6a906a2743ad6e45d99c1793642aaf8eda",
+	"c8f1d8c61f9da76f4cb49fd86322b6e685dba956",
+	"a8d315b2b1c615d43042c3a62402b8a54288cf5c",
+	"a39771a7651f97faf5c72e08224d857fc35133db",
+	"880cd14280f4b9b6ed3986d6671f907d7cc2a198",
+	"fb72698cab7617ac416264415f13224dfd7a165e",
+	"4d081c50e250fa32ea8b1313cf8bb7c2ad7627fd",
+	"eba74343e2f15d62adedfd8c883ee0262b5c8021",
+	"c2d30fa8ef288618f65f6eed6e168e0d514886f4",
+	"8dcef98b1d52143e1e2dbc458ffe38f925786bf2",
+	"aa9b383c260e1d05fbbf6b30a02914555e20c725",
+	"6ecf0ef2c2dffb796033e5a02219af86ec6584e5",
+	"dbd3641b371024f44d0e469a9c8f5457b0660de1",
+	"e8d3ffab552895c19b9fcf7aa264d277cde33881",
+	"7e59600739c96546163833214c36459e324bad0a",
+}
+
+func (s *ReaderSuite) TestDecodeCRCs(c *C) {
+	f := fixtures.Basic().ByTag("ofs-delta").One()
+
+	scanner := NewScanner(f.Packfile())
+	storage := memory.NewStorage()
+
+	d, err := NewDecoder(scanner, storage)
+	c.Assert(err, IsNil)
+	_, err = d.Decode()
+	c.Assert(err, IsNil)
+
+	var sum uint64
+	for _, crc := range d.CRCs() {
+		sum += uint64(crc)
+	}
+
+	c.Assert(int(sum), Equals, 78022211966)
+}
+
+func (s *ReaderSuite) TestReadObjectAt(c *C) {
+	f := fixtures.Basic().One()
+	scanner := NewScanner(f.Packfile())
+	d, err := NewDecoder(scanner, nil)
+	c.Assert(err, IsNil)
+
+	// when the packfile is ref-delta based, the offsets are required
+	if f.Is("ref-delta") {
+		offsets := getOffsetsFromIdx(f.Idx())
+		d.SetOffsets(offsets)
+	}
+
+	// the object at offset 186 is a delta, so its base has to be
+	// recalled without having been read before
+	obj, err := d.ReadObjectAt(186)
+	c.Assert(err, IsNil)
+	c.Assert(obj.Hash().String(), Equals, "6ecf0ef2c2dffb796033e5a02219af86ec6584e5")
+}
+
+func (s *ReaderSuite) TestOffsets(c *C) {
+	f := fixtures.Basic().One()
+	scanner := NewScanner(f.Packfile())
+	d, err := NewDecoder(scanner, nil)
+	c.Assert(err, IsNil)
+
+	c.Assert(d.Offsets(), HasLen, 0)
+
+	_, err = d.Decode()
+	c.Assert(err, IsNil)
+
+	c.Assert(d.Offsets(), HasLen, 31)
+}
+
+func (s *ReaderSuite) TestSetOffsets(c *C) {
+	f := fixtures.Basic().One()
+	scanner := NewScanner(f.Packfile())
+	d, err := NewDecoder(scanner, nil)
+	c.Assert(err, IsNil)
+
+	h := plumbing.NewHash("6ecf0ef2c2dffb796033e5a02219af86ec6584e5")
+	d.SetOffsets(map[plumbing.Hash]int64{h: 42})
+
+	o := d.Offsets()
+	c.Assert(o, HasLen, 1)
+	c.Assert(o[h], Equals, int64(42))
+}
+
+func assertObjects(c *C, s *memory.Storage, expects []string) {
+	c.Assert(len(expects), Equals, len(s.Objects))
+	for _, exp := range expects {
+		obt, err := s.Object(plumbing.AnyObject, plumbing.NewHash(exp))
+		c.Assert(err, IsNil)
+		c.Assert(obt.Hash().String(), Equals, exp)
+	}
+}
+
+func getOffsetsFromIdx(r io.Reader) map[plumbing.Hash]int64 {
+	idx := &idxfile.Idxfile{}
+	err := idxfile.NewDecoder(r).Decode(idx)
+	if err != nil {
+		panic(err)
+	}
+
+	offsets := make(map[plumbing.Hash]int64)
+	for _, e := range idx.Entries {
+		offsets[e.Hash] = int64(e.Offset)
+	}
+
+	return offsets
+}
diff --git a/plumbing/format/packfile/delta.go b/plumbing/format/packfile/delta.go
new file mode 100644
index 0000000..2493a39
--- /dev/null
+++ b/plumbing/format/packfile/delta.go
@@ -0,0 +1,181 @@
+package packfile
+
+import (
+	"io/ioutil"
+
+	"gopkg.in/src-d/go-git.v4/plumbing"
+)
+
+// See https://github.com/git/git/blob/49fa3dc76179e04b0833542fa52d0f287a4955ac/delta.h,
+// https://github.com/git/git/blob/c2c5f6b1e479f2c38e0e01345350620944e3527f/patch-delta.c,
+// and https://github.com/tarruda/node-git-core/blob/master/src/js/delta.js
+// for details about the delta format.
+
+const deltaSizeMin = 4
+
+// ApplyDelta writes to target the result of applying the modification deltas in delta to base.
+func ApplyDelta(target, base plumbing.Object, delta []byte) error {
+	r, err := base.Reader()
+	if err != nil {
+		return err
+	}
+
+	w, err := target.Writer()
+	if err != nil {
+		return err
+	}
+
+	src, err := ioutil.ReadAll(r)
+	if err != nil {
+		return err
+	}
+
+	dst := PatchDelta(src, delta)
+	target.SetSize(int64(len(dst)))
+
+	if _, err := w.Write(dst); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// PatchDelta returns the result of applying the modification deltas in delta to src.
+func PatchDelta(src, delta []byte) []byte {
+	if len(delta) < deltaSizeMin {
+		return nil
+	}
+
+	srcSz, delta := decodeLEB128(delta)
+	if srcSz != uint(len(src)) {
+		return nil
+	}
+
+	targetSz, delta := decodeLEB128(delta)
+	remainingTargetSz := targetSz
+
+	var dest []byte
+	var cmd byte
+	for {
+		cmd = delta[0]
+		delta = delta[1:]
+		if isCopyFromSrc(cmd) {
+			var offset, sz uint
+			offset, delta = decodeOffset(cmd, delta)
+			sz, delta = decodeSize(cmd, delta)
+			if invalidSize(sz, targetSz) ||
+				invalidOffsetSize(offset, sz, srcSz) {
+				break
+			}
+			dest = append(dest, src[offset:offset+sz]...)
+			remainingTargetSz -= sz
+		} else if isCopyFromDelta(cmd) {
+			sz := uint(cmd) // cmd is the size itself
+			if invalidSize(sz, targetSz) {
+				break
+			}
+			dest = append(dest, delta[0:sz]...)
+			remainingTargetSz -= sz
+			delta = delta[sz:]
+		} else {
+			return nil
+		}
+
+		if remainingTargetSz <= 0 {
+			break
+		}
+	}
+
+	return dest
+}
+
+// decodeLEB128 decodes a number encoded as an unsigned LEB128 at the start
+// of some binary data and returns the decoded number and the rest of the
+// stream.
+//
+// This must be called twice on the delta data buffer, first to get the
+// expected source buffer size, and again to get the target buffer size.
+func decodeLEB128(input []byte) (uint, []byte) {
+	var num, sz uint
+	var b byte
+	for {
+		b = input[sz]
+		num |= (uint(b) & payload) << (sz * 7) // concatenates 7-bit chunks
+		sz++
+
+		if uint(b)&continuation == 0 || sz == uint(len(input)) {
+			break
+		}
+	}
+
+	return num, input[sz:]
+}
+
+const (
+	payload      = 0x7f // 0111 1111
+	continuation = 0x80 // 1000 0000
+)
+
+func isCopyFromSrc(cmd byte) bool {
+	return (cmd & 0x80) != 0
+}
+
+func isCopyFromDelta(cmd byte) bool {
+	return (cmd&0x80) == 0 && cmd != 0
+}
+
+func decodeOffset(cmd byte, delta []byte) (uint, []byte) {
+	var offset uint
+	if (cmd & 0x01) != 0 {
+		offset = uint(delta[0])
+		delta = delta[1:]
+	}
+	if (cmd & 0x02) != 0 {
+		offset |= uint(delta[0]) << 8
+		delta = delta[1:]
+	}
+	if (cmd & 0x04) != 0 {
+		offset |= uint(delta[0]) << 16
+		delta = delta[1:]
+	}
+	if (cmd & 0x08) != 0 {
+		offset |= uint(delta[0]) << 24
+		delta = delta[1:]
+	}
+
+	return offset, delta
+}
+
+func decodeSize(cmd byte, delta []byte) (uint, []byte) {
+	var sz uint
+	if (cmd & 0x10) != 0 {
+		sz = uint(delta[0])
+		delta = delta[1:]
+	}
+	if (cmd & 0x20) != 0 {
+		sz |= uint(delta[0]) << 8
+		delta = delta[1:]
+	}
+	if (cmd & 0x40) != 0 {
+		sz |= uint(delta[0]) << 16
+		delta = delta[1:]
+	}
+	if sz == 0 {
+		sz = 0x10000
+	}
+
+	return sz, delta
+}
+
+func invalidSize(sz, targetSz uint) bool {
+	return sz > targetSz
+}
+
+func invalidOffsetSize(offset, sz, srcSz uint) bool {
+	return sumOverflows(offset, sz) ||
+		offset+sz > srcSz
+}
+
+func sumOverflows(a, b uint) bool {
+	return a+b < a
+}
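To make the copy/insert encoding concrete, here is a small hand-built delta applied with PatchDelta. The byte values are an illustration only, constructed to match decodeLEB128, decodeOffset and decodeSize above:

	package main

	import (
		"fmt"

		"gopkg.in/src-d/go-git.v4/plumbing/format/packfile"
	)

	func main() {
		src := []byte("hello world")

		delta := []byte{
			0x0b,       // LEB128 source size: 11, must equal len(src)
			0x09,       // LEB128 target size: 9
			0x90, 0x05, // copy-from-src: MSB set, 0x10 flag -> one size byte (5), offset 0
			0x04, ' ', 'g', 'i', 't', // copy-from-delta: insert the next 4 bytes literally
		}

		fmt.Printf("%s\n", packfile.PatchDelta(src, delta)) // prints "hello git"
	}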
diff --git a/plumbing/format/packfile/doc.go b/plumbing/format/packfile/doc.go
new file mode 100644
index 0000000..0b173ca
--- /dev/null
+++ b/plumbing/format/packfile/doc.go
@@ -0,0 +1,168 @@
+// Package packfile implements an encoder/decoder of the packfile format
+package packfile
+
+/*
+GIT pack format
+===============
+
+== pack-*.pack files have the following format:
+
+   - A header appears at the beginning and consists of the following:
+
+     4-byte signature:
+         The signature is: {'P', 'A', 'C', 'K'}
+
+     4-byte version number (network byte order):
+         GIT currently accepts version number 2 or 3 but
+         generates version 2 only.
+
+     4-byte number of objects contained in the pack (network byte order)
+
+     Observation: we cannot have more than 4G versions ;-) and
+     more than 4G objects in a pack.
+
+   - The header is followed by number of object entries, each of
+     which looks like this:
+
+     (undeltified representation)
+     n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+     compressed data
+
+     (deltified representation)
+     n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+     20-byte base object name
+     compressed delta data
+
+     Observation: length of each object is encoded in a variable
+     length format and is not constrained to 32-bit or anything.
+
+   - The trailer records 20-byte SHA1 checksum of all of the above.
+
+== Original (version 1) pack-*.idx files have the following format:
+
+   - The header consists of 256 4-byte network byte order
+     integers.  N-th entry of this table records the number of
+     objects in the corresponding pack, the first byte of whose
+     object name is less than or equal to N.  This is called the
+     'first-level fan-out' table.
+
+   - The header is followed by sorted 24-byte entries, one entry
+     per object in the pack.  Each entry is:
+
+     4-byte network byte order integer, recording where the
+     object is stored in the packfile as the offset from the
+     beginning.
+
+     20-byte object name.
+
+   - The file is concluded with a trailer:
+
+     A copy of the 20-byte SHA1 checksum at the end of
+     corresponding packfile.
+
+     20-byte SHA1-checksum of all of the above.
+
+Pack Idx file:
+
+        --  +--------------------------------+
+fanout      | fanout[0] = 2 (for example)    |-.
+table       +--------------------------------+ |
+            | fanout[1]                      | |
+            +--------------------------------+ |
+            | fanout[2]                      | |
+            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+            | fanout[255] = total objects    |---.
+        --  +--------------------------------+ | |
+main        | offset                         | | |
+index       | object name 00XXXXXXXXXXXXXXXX | | |
+table       +--------------------------------+ | |
+            | offset                         | | |
+            | object name 00XXXXXXXXXXXXXXXX | | |
+            +--------------------------------+<+ |
+          .-| offset                         |   |
+          | | object name 01XXXXXXXXXXXXXXXX |   |
+          | +--------------------------------+   |
+          | | offset                         |   |
+          | | object name 01XXXXXXXXXXXXXXXX |   |
+          | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~   |
+          | | offset                         |   |
+          | | object name FFXXXXXXXXXXXXXXXX |   |
+        --| +--------------------------------+<--+
+trailer   | | packfile checksum              |
+          | +--------------------------------+
+          | | idxfile checksum               |
+          | +--------------------------------+
+          .-------.
+                  |
+Pack file entry: <+
+
+     packed object header:
+        1-byte size extension bit (MSB)
+               type (next 3 bit)
+               size0 (lower 4-bit)
+        n-byte sizeN (as long as MSB is set, each 7-bit)
+                size0..sizeN form 4+7+7+..+7 bit integer, size0
+                is the least significant part, and sizeN is the
+                most significant part.
+     packed object data:
+        If it is not DELTA, then deflated bytes (the size above
+                is the size before compression).
+        If it is REF_DELTA, then
+          20-byte base object name SHA1 (the size above is the
+                size of the delta data that follows).
+          delta data, deflated.
+        If it is OFS_DELTA, then
+          n-byte offset (see below) interpreted as a negative
+                offset from the type-byte of the header of the
+                ofs-delta entry (the size above is the size of
+                the delta data that follows).
+          delta data, deflated.
+
+     offset encoding:
+          n bytes with MSB set in all but the last one.
+          The offset is then the number constructed by
+          concatenating the lower 7 bit of each byte, and
+          for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
+          to the result.
+
+== Version 2 pack-*.idx files support packs larger than 4 GiB, and
+   have some other reorganizations.  They have the format:
+
+   - A 4-byte magic number '\377tOc' which is an unreasonable
+     fanout[0] value.
+
+   - A 4-byte version number (= 2)
+
+   - A 256-entry fan-out table just like v1.
+
+   - A table of sorted 20-byte SHA1 object names.  These are
+     packed together without offset values to reduce the cache
+     footprint of the binary search for a specific object name.
+
+   - A table of 4-byte CRC32 values of the packed object data.
+     This is new in v2 so compressed data can be copied directly
+     from pack to pack during repacking without undetected
+     data corruption.
+
+   - A table of 4-byte offset values (in network byte order).
+     These are usually 31-bit pack file offsets, but large
+     offsets are encoded as an index into the next table with
+     the msbit set.
+
+   - A table of 8-byte offset entries (empty for pack files less
+     than 2 GiB).  Pack files are organized with heavily used
+     objects toward the front, so most object references should
+     not need to refer to this table.
+
+   - The same trailer as a v1 pack file:
+
+     A copy of the 20-byte SHA1 checksum at the end of
+     corresponding packfile.
+
+     20-byte SHA1-checksum of all of the above.
+
+From:
+https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt
+*/
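The "packed object header" encoding above can be decoded in a few lines. A stand-alone sketch follows; decodeEntryHeader is a hypothetical helper written for illustration, and the scanner's readObjectTypeAndLength below implements the same logic:

	package main

	import "fmt"

	// decodeEntryHeader decodes a pack entry header: the first byte holds a
	// continuation bit (MSB), 3 type bits and the low 4 size bits; each
	// continuation byte contributes 7 more size bits.
	func decodeEntryHeader(b []byte) (typ byte, size uint64, n int) {
		c := b[n]
		n++
		typ = (c >> 4) & 0x07
		size = uint64(c & 0x0f)
		shift := uint(4)
		for c&0x80 != 0 {
			c = b[n]
			n++
			size |= uint64(c&0x7f) << shift
			shift += 7
		}
		return typ, size, n
	}

	func main() {
		// 0x95 = 1001 0101: MSB set, type 1 (commit), size0 = 5.
		// 0x0b = 0000 1011: MSB clear, adds 11<<4 = 176; total size 181.
		typ, size, n := decodeEntryHeader([]byte{0x95, 0x0b})
		fmt.Println(typ, size, n) // 1 181 2
	}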
diff --git a/plumbing/format/packfile/error.go b/plumbing/format/packfile/error.go
new file mode 100644
index 0000000..c0b9163
--- /dev/null
+++ b/plumbing/format/packfile/error.go
@@ -0,0 +1,30 @@
+package packfile
+
+import "fmt"
+
+// Error specifies errors returned during packfile parsing.
+type Error struct {
+	reason, details string
+}
+
+// NewError returns a new error.
+func NewError(reason string) *Error {
+	return &Error{reason: reason}
+}
+
+// Error returns a text representation of the error.
+func (e *Error) Error() string {
+	if e.details == "" {
+		return e.reason
+	}
+
+	return fmt.Sprintf("%s: %s", e.reason, e.details)
+}
+
+// AddDetails returns a copy of the error with the given details added,
+// formatted according to a format specifier.
+func (e *Error) AddDetails(format string, args ...interface{}) *Error {
+	return &Error{
+		reason:  e.reason,
+		details: fmt.Sprintf(format, args...),
+	}
+}
diff --git a/plumbing/format/packfile/scanner.go b/plumbing/format/packfile/scanner.go
new file mode 100644
index 0000000..130bb94
--- /dev/null
+++ b/plumbing/format/packfile/scanner.go
@@ -0,0 +1,418 @@
+package packfile
+
+import (
+	"bufio"
+	"bytes"
+	"compress/zlib"
+	"fmt"
+	"hash"
+	"hash/crc32"
+	"io"
+	"io/ioutil"
+
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/utils/binary"
+)
+
+var (
+	// ErrEmptyPackfile is returned by ReadHeader when no data is found in the packfile.
+	ErrEmptyPackfile = NewError("empty packfile")
+	// ErrBadSignature is returned by ReadHeader when the signature in the packfile is incorrect.
+	ErrBadSignature = NewError("malformed pack file signature")
+	// ErrUnsupportedVersion is returned by ReadHeader when the packfile version is
+	// different than VersionSupported.
+	ErrUnsupportedVersion = NewError("unsupported packfile version")
+	// ErrSeekNotSupported is returned if the underlying reader does not support seeking.
+	ErrSeekNotSupported = NewError("not seek support")
+)
+
+const (
+	// VersionSupported is the packfile version supported by this parser.
+	VersionSupported uint32 = 2
+)
+
+// ObjectHeader contains information about an object entry; it is collected
+// from the bytes preceding the object's content.
+type ObjectHeader struct {
+	Type            plumbing.ObjectType
+	Offset          int64
+	Length          int64
+	Reference       plumbing.Hash
+	OffsetReference int64
+}
+
+// Scanner reads a packfile sequentially, object by object.
+type Scanner struct {
+	r   reader
+	crc hash.Hash32
+
+	// pendingObject is used to detect if an object has been read, or still
+	// is waiting to be read
+	pendingObject    *ObjectHeader
+	version, objects uint32
+
+	// IsSeekable says if this scanner can do Seek or not; to have a seekable
+	// Scanner an r implementing io.Seeker is required
+	IsSeekable bool
+}
+
+// NewScanner returns a new Scanner based on a reader; if the given reader
+// implements io.ReadSeeker the Scanner will also be seekable.
+func NewScanner(r io.Reader) *Scanner {
+	seeker, ok := r.(io.ReadSeeker)
+	if !ok {
+		seeker = &trackableReader{Reader: r}
+	}
+
+	crc := crc32.NewIEEE()
+	return &Scanner{
+		r: &teeReader{
+			newByteReadSeeker(seeker),
+			crc,
+		},
+		crc:        crc,
+		IsSeekable: ok,
+	}
+}
+
+// Header reads the whole packfile header (signature, version and object count).
+// It returns the version and the object count and performs checks on the
+// validity of the signature and the version fields.
+func (s *Scanner) Header() (version, objects uint32, err error) {
+	if s.version != 0 {
+		return s.version, s.objects, nil
+	}
+
+	sig, err := s.readSignature()
+	if err != nil {
+		if err == io.EOF {
+			err = ErrEmptyPackfile
+		}
+
+		return
+	}
+
+	if !s.isValidSignature(sig) {
+		err = ErrBadSignature
+		return
+	}
+
+	version, err = s.readVersion()
+	s.version = version
+	if err != nil {
+		return
+	}
+
+	if !s.isSupportedVersion(version) {
+		err = ErrUnsupportedVersion.AddDetails("%d", version)
+		return
+	}
+
+	objects, err = s.readCount()
+	s.objects = objects
+	return
+}
+
+// readSignature reads and returns the signature field in the packfile.
+func (s *Scanner) readSignature() ([]byte, error) {
+	var sig = make([]byte, 4)
+	if _, err := io.ReadFull(s.r, sig); err != nil {
+		return []byte{}, err
+	}
+
+	return sig, nil
+}
+
+// isValidSignature reports whether sig is a valid packfile signature.
+func (s *Scanner) isValidSignature(sig []byte) bool {
+	return bytes.Equal(sig, []byte{'P', 'A', 'C', 'K'})
+}
+
+// readVersion reads and returns the version field of a packfile.
+func (s *Scanner) readVersion() (uint32, error) {
+	return binary.ReadUint32(s.r)
+}
+
+// isSupportedVersion returns whether version v is supported by the parser.
+// The current supported version is VersionSupported, defined above.
+func (s *Scanner) isSupportedVersion(v uint32) bool {
+	return v == VersionSupported
+}
+
+// readCount reads and returns the count of objects field of a packfile.
+func (s *Scanner) readCount() (uint32, error) {
+	return binary.ReadUint32(s.r)
+}
+// NextObjectHeader returns the ObjectHeader for the next object in the reader.
+func (s *Scanner) NextObjectHeader() (*ObjectHeader, error) {
+	if err := s.doPending(); err != nil {
+		return nil, err
+	}
+
+	s.crc.Reset()
+
+	h := &ObjectHeader{}
+	s.pendingObject = h
+
+	var err error
+	h.Offset, err = s.r.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return nil, err
+	}
+
+	h.Type, h.Length, err = s.readObjectTypeAndLength()
+	if err != nil {
+		return nil, err
+	}
+
+	switch h.Type {
+	case plumbing.OFSDeltaObject:
+		no, err := binary.ReadVariableWidthInt(s.r)
+		if err != nil {
+			return nil, err
+		}
+
+		h.OffsetReference = h.Offset - no
+	case plumbing.REFDeltaObject:
+		var err error
+		h.Reference, err = binary.ReadHash(s.r)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return h, nil
+}
+
+func (s *Scanner) doPending() error {
+	if s.version == 0 {
+		var err error
+		s.version, s.objects, err = s.Header()
+		if err != nil {
+			return err
+		}
+	}
+
+	return s.discardObjectIfNeeded()
+}
+
+func (s *Scanner) discardObjectIfNeeded() error {
+	if s.pendingObject == nil {
+		return nil
+	}
+
+	h := s.pendingObject
+	n, _, err := s.NextObject(ioutil.Discard)
+	if err != nil {
+		return err
+	}
+
+	if n != h.Length {
+		return fmt.Errorf(
+			"error discarding object, discarded %d, expected %d",
+			n, h.Length,
+		)
+	}
+
+	return nil
+}
+
+// readObjectTypeAndLength reads and returns the object type and the
+// length field from an object entry in a packfile.
+func (s *Scanner) readObjectTypeAndLength() (plumbing.ObjectType, int64, error) {
+	t, c, err := s.readType()
+	if err != nil {
+		return t, 0, err
+	}
+
+	l, err := s.readLength(c)
+
+	return t, l, err
+}
+
+const (
+	maskType        = uint8(112) // 0111 0000
+	maskFirstLength = uint8(15)  // 0000 1111
+	maskContinue    = uint8(128) // 1000 0000
+	firstLengthBits = uint8(4)   // the first byte has 4 bits to store the length
+	maskLength      = uint8(127) // 0111 1111
+	lengthBits      = uint8(7)   // subsequent bytes have 7 bits to store the length
+)
+
+func (s *Scanner) readType() (plumbing.ObjectType, byte, error) {
+	var c byte
+	var err error
+	if c, err = s.r.ReadByte(); err != nil {
+		return plumbing.ObjectType(0), 0, err
+	}
+
+	typ := parseType(c)
+
+	return typ, c, nil
+}
+
+func parseType(b byte) plumbing.ObjectType {
+	return plumbing.ObjectType((b & maskType) >> firstLengthBits)
+}
+
+// the length is codified in the last 4 bits of the first byte and in
+// the last 7 bits of subsequent bytes; the last byte has a 0 MSB.
+func (s *Scanner) readLength(first byte) (int64, error) {
+	length := int64(first & maskFirstLength)
+
+	c := first
+	shift := firstLengthBits
+	var err error
+	for c&maskContinue > 0 {
+		if c, err = s.r.ReadByte(); err != nil {
+			return 0, err
+		}
+
+		length += int64(c&maskLength) << shift
+		shift += lengthBits
+	}
+
+	return length, nil
+}
+
+// NextObject writes the content of the next object into the given writer,
+// returning the number of bytes written, the CRC32 of the content and an
+// error, if any.
+func (s *Scanner) NextObject(w io.Writer) (written int64, crc32 uint32, err error) {
+	defer s.crc.Reset()
+
+	s.pendingObject = nil
+	written, err = s.copyObject(w)
+	crc32 = s.crc.Sum32()
+	return
+}
+// copyObject copies a non-deltified object from its zlib stream in an
+// object entry in the packfile into w.
+func (s *Scanner) copyObject(w io.Writer) (n int64, err error) {
+	zr, err := zlib.NewReader(s.r)
+	if err != nil {
+		return -1, fmt.Errorf("zlib reading error: %s", err)
+	}
+
+	defer func() {
+		closeErr := zr.Close()
+		if err == nil {
+			err = closeErr
+		}
+	}()
+
+	return io.Copy(w, zr)
+}
+
+// Seek sets a new offset from start; it returns the old position before the change.
+func (s *Scanner) Seek(offset int64) (previous int64, err error) {
+	// when seeking we assume the caller is not interested in the header
+	if s.version == 0 {
+		s.version = VersionSupported
+	}
+
+	previous, err = s.r.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return -1, err
+	}
+
+	_, err = s.r.Seek(offset, io.SeekStart)
+	return previous, err
+}
+
+// Checksum returns the checksum of the packfile.
+func (s *Scanner) Checksum() (plumbing.Hash, error) {
+	err := s.discardObjectIfNeeded()
+	if err != nil {
+		return plumbing.ZeroHash, err
+	}
+
+	return binary.ReadHash(s.r)
+}
+
+// Close reads the reader until io.EOF.
+func (s *Scanner) Close() error {
+	_, err := io.Copy(ioutil.Discard, s.r)
+	return err
+}
+
+type trackableReader struct {
+	count int64
+	io.Reader
+}
+
+// Read reads up to len(p) bytes into p.
+func (r *trackableReader) Read(p []byte) (n int, err error) {
+	n, err = r.Reader.Read(p)
+	r.count += int64(n)
+
+	return
+}
+
+// Seek only supports io.SeekCurrent; any other operation fails.
+func (r *trackableReader) Seek(offset int64, whence int) (int64, error) {
+	if whence != io.SeekCurrent {
+		return -1, ErrSeekNotSupported
+	}
+
+	return r.count, nil
+}
+
+func newByteReadSeeker(r io.ReadSeeker) *bufferedSeeker {
+	return &bufferedSeeker{
+		r:      r,
+		Reader: *bufio.NewReader(r),
+	}
+}
+
+type bufferedSeeker struct {
+	r io.ReadSeeker
+	bufio.Reader
+}
+
+func (r *bufferedSeeker) Seek(offset int64, whence int) (int64, error) {
+	if whence == io.SeekCurrent {
+		current, err := r.r.Seek(offset, whence)
+		if err != nil {
+			return current, err
+		}
+
+		return current - int64(r.Buffered()), nil
+	}
+
+	defer r.Reader.Reset(r.r)
+	return r.r.Seek(offset, whence)
+}
+
+type reader interface {
+	io.Reader
+	io.ByteReader
+	io.Seeker
+}
+
+type teeReader struct {
+	reader
+	w hash.Hash32
+}
+
+func (r *teeReader) Read(p []byte) (n int, err error) {
+	n, err = r.reader.Read(p)
+	if n > 0 {
+		if _, err := r.w.Write(p[:n]); err != nil {
+			return n, err
+		}
+	}
+	return
+}
+
+func (r *teeReader) ReadByte() (b byte, err error) {
+	b, err = r.reader.ReadByte()
+	if err == nil {
+		if _, err := r.w.Write([]byte{b}); err != nil {
+			return 0, err
+		}
+	}
+
+	return
+}
diff --git a/plumbing/format/packfile/scanner_test.go b/plumbing/format/packfile/scanner_test.go
new file mode 100644
index 0000000..8e9a593
--- /dev/null
+++ b/plumbing/format/packfile/scanner_test.go
@@ -0,0 +1,189 @@
+package packfile
+
+import (
+	"bytes"
+	"io"
+
+	. "gopkg.in/check.v1"
+	"gopkg.in/src-d/go-git.v4/fixtures"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+)
+
+type ScannerSuite struct {
+	fixtures.Suite
+}
+
+var _ = Suite(&ScannerSuite{})
+
+func (s *ScannerSuite) TestHeader(c *C) {
+	r := fixtures.Basic().One().Packfile()
+	p := NewScanner(r)
+
+	version, objects, err := p.Header()
+	c.Assert(err, IsNil)
+	c.Assert(version, Equals, VersionSupported)
+	c.Assert(objects, Equals, uint32(31))
+}
+
+func (s *ScannerSuite) TestNextObjectHeaderWithoutHeader(c *C) {
+	r := fixtures.Basic().One().Packfile()
+	p := NewScanner(r)
+
+	h, err := p.NextObjectHeader()
+	c.Assert(err, IsNil)
+	c.Assert(h, DeepEquals, &expectedHeadersOFS[0])
+
+	version, objects, err := p.Header()
+	c.Assert(err, IsNil)
+	c.Assert(version, Equals, VersionSupported)
+	c.Assert(objects, Equals, uint32(31))
+}
+
+func (s *ScannerSuite) TestNextObjectHeaderREFDelta(c *C) {
+	s.testNextObjectHeader(c, "ref-delta", expectedHeadersREF)
+}
+
+func (s *ScannerSuite) TestNextObjectHeaderOFSDelta(c *C) {
+	s.testNextObjectHeader(c, "ofs-delta", expectedHeadersOFS)
+}
+
+func (s *ScannerSuite) testNextObjectHeader(c *C, tag string, expected []ObjectHeader) {
+	r := fixtures.Basic().ByTag(tag).One().Packfile()
+	p := NewScanner(r)
+
+	_, objects, err := p.Header()
+	c.Assert(err, IsNil)
+
+	for i := 0; i < int(objects); i++ {
+		h, err := p.NextObjectHeader()
+		c.Assert(err, IsNil)
+		c.Assert(*h, DeepEquals, expected[i])
+
+		buf := bytes.NewBuffer(nil)
+		n, _, err := p.NextObject(buf)
+		c.Assert(err, IsNil)
+		c.Assert(n, Equals, h.Length)
+	}
+
+	n, err := p.Checksum()
+	c.Assert(err, IsNil)
+	c.Assert(n, HasLen, 20)
+}
+
+func (s *ScannerSuite) TestNextObjectHeaderWithOutReadObject(c *C) {
+	f := fixtures.Basic().ByTag("ref-delta").One()
+	r := f.Packfile()
+	p := NewScanner(r)
+
+	_, objects, err := p.Header()
+	c.Assert(err, IsNil)
+
+	for i := 0; i < int(objects); i++ {
+		h, err := p.NextObjectHeader()
+		c.Assert(err, IsNil)
+		c.Assert(*h, DeepEquals, expectedHeadersREF[i])
+	}
+
+	err = p.discardObjectIfNeeded()
+	c.Assert(err, IsNil)
+
+	n, err := p.Checksum()
+	c.Assert(err, IsNil)
+	c.Assert(n, Equals, f.PackfileHash)
+}
+
+func (s *ScannerSuite) TestNextObjectHeaderWithOutReadObjectNonSeekable(c *C) {
+	f := fixtures.Basic().ByTag("ref-delta").One()
+	r := io.MultiReader(f.Packfile())
+	p := NewScanner(r)
+
+	_, objects, err := p.Header()
+	c.Assert(err, IsNil)
+
+	for i := 0; i < int(objects); i++ {
+		h, err := p.NextObjectHeader()
+		c.Assert(err, IsNil)
+		c.Assert(*h, DeepEquals, expectedHeadersREF[i])
+	}
+
+	err = p.discardObjectIfNeeded()
+	c.Assert(err, IsNil)
+
+	n, err := p.Checksum()
+	c.Assert(err, IsNil)
+	c.Assert(n, Equals, f.PackfileHash)
+}
+
+var expectedHeadersOFS = []ObjectHeader{
+	{Type: plumbing.CommitObject, Offset: 12, Length: 254},
+	{Type: plumbing.OFSDeltaObject, Offset: 186, Length: 93, OffsetReference: 12},
+	{Type: plumbing.CommitObject, Offset: 286, Length: 242},
+	{Type: plumbing.CommitObject, Offset: 449, Length: 242},
+	{Type: plumbing.CommitObject, Offset: 615, Length: 333},
+	{Type: plumbing.CommitObject, Offset: 838, Length: 332},
+	{Type: plumbing.CommitObject, Offset: 1063, Length: 244},
+	{Type: plumbing.CommitObject, Offset: 1230, Length: 243},
+	{Type: plumbing.CommitObject, Offset: 1392, Length: 187},
+	{Type: plumbing.BlobObject, Offset: 1524, Length: 189},
+	{Type: plumbing.BlobObject, Offset: 1685, Length: 18},
+	{Type: plumbing.BlobObject, Offset: 1713, Length: 1072},
+	{Type: plumbing.BlobObject, Offset: 2351, Length: 76110},
+	{Type: plumbing.BlobObject, Offset: 78050, Length: 2780},
+	{Type: plumbing.BlobObject, Offset: 78882, Length: 217848},
+	{Type: plumbing.BlobObject, Offset: 80725, Length: 706},
+	{Type: plumbing.BlobObject, Offset: 80998, Length: 11488},
+	{Type: plumbing.BlobObject, Offset: 84032, Length: 78},
+	{Type: plumbing.TreeObject, Offset: 84115, Length: 272},
+	{Type: plumbing.OFSDeltaObject, Offset: 84375, Length: 43, OffsetReference: 84115},
+	{Type: plumbing.TreeObject, Offset: 84430, Length: 38},
+	{Type: plumbing.TreeObject, Offset: 84479, Length: 75},
+	{Type: plumbing.TreeObject, Offset: 84559, Length: 38},
+	{Type: plumbing.TreeObject, Offset: 84608, Length: 34},
+	{Type: plumbing.BlobObject, Offset: 84653, Length: 9},
+	{Type: plumbing.OFSDeltaObject, Offset: 84671, Length: 6, OffsetReference: 84375},
+	{Type: plumbing.OFSDeltaObject, Offset: 84688, Length: 9, OffsetReference: 84375},
+	{Type: plumbing.OFSDeltaObject, Offset: 84708, Length: 6, OffsetReference: 84375},
+	{Type: plumbing.OFSDeltaObject, Offset: 84725, Length: 5, OffsetReference: 84115},
+	{Type: plumbing.OFSDeltaObject, Offset: 84741, Length: 8, OffsetReference: 84375},
+	{Type: plumbing.OFSDeltaObject, Offset: 84760, Length: 4, OffsetReference: 84741},
+}
+
+var expectedHeadersREF = []ObjectHeader{
+	{Type: plumbing.CommitObject, Offset: 12, Length: 254},
+	{Type: plumbing.REFDeltaObject, Offset: 186, Length: 93,
+		Reference: plumbing.NewHash("e8d3ffab552895c19b9fcf7aa264d277cde33881")},
+	{Type: plumbing.CommitObject, Offset: 304, Length: 242},
+	{Type: plumbing.CommitObject, Offset: 467, Length: 242},
+	{Type: plumbing.CommitObject, Offset: 633, Length: 333},
+	{Type: plumbing.CommitObject, Offset: 856, Length: 332},
+	{Type: plumbing.CommitObject, Offset: 1081, Length: 243},
+	{Type: plumbing.CommitObject, Offset: 1243, Length: 244},
+	{Type: plumbing.CommitObject, Offset: 1410, Length: 187},
+	{Type: plumbing.BlobObject, Offset: 1542, Length: 189},
+	{Type: plumbing.BlobObject, Offset: 1703, Length: 18},
+	{Type: plumbing.BlobObject, Offset: 1731, Length: 1072},
+	{Type: plumbing.BlobObject, Offset: 2369, Length: 76110},
+	{Type: plumbing.TreeObject, Offset: 78068, Length: 38},
+	{Type: plumbing.BlobObject, Offset: 78117, Length: 2780},
+	{Type: plumbing.TreeObject, Offset: 79049, Length: 75},
+	{Type: plumbing.BlobObject, Offset: 79129, Length: 217848},
+	{Type: plumbing.BlobObject, Offset: 80972, Length: 706},
+	{Type: plumbing.TreeObject, Offset: 81265, Length: 38},
+	{Type: plumbing.BlobObject, Offset: 81314, Length: 11488},
+	{Type: plumbing.TreeObject, Offset: 84752, Length: 34},
+	{Type: plumbing.BlobObject, Offset: 84797, Length: 78},
+	{Type: plumbing.TreeObject, Offset: 84880, Length: 271},
+	{Type: plumbing.REFDeltaObject, Offset: 85141, Length: 6,
+		Reference: plumbing.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c")},
+	{Type: plumbing.REFDeltaObject, Offset: 85176, Length: 37,
+		Reference: plumbing.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")},
+	{Type: plumbing.BlobObject, Offset: 85244, Length: 9},
+	{Type: plumbing.REFDeltaObject, Offset: 85262, Length: 9,
+		Reference: plumbing.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")},
+	{Type: plumbing.REFDeltaObject, Offset: 85300, Length: 6,
+		Reference: plumbing.NewHash("fb72698cab7617ac416264415f13224dfd7a165e")},
+	{Type: plumbing.TreeObject, Offset: 85335, Length: 110},
+	{Type: plumbing.REFDeltaObject, Offset: 85448, Length: 8,
+		Reference: plumbing.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021")},
+	{Type: plumbing.TreeObject, Offset: 85485, Length: 73},
+}
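A minimal sketch of driving the Scanner directly, as these tests do: read the header, then alternate NextObjectHeader and NextObject for each entry, and finish with the trailing checksum. This is not part of the commit, and the pack path is hypothetical:

	package main

	import (
		"fmt"
		"io/ioutil"
		"os"

		"gopkg.in/src-d/go-git.v4/plumbing/format/packfile"
	)

	func main() {
		f, err := os.Open("pack-example.pack") // hypothetical path
		if err != nil {
			panic(err)
		}
		defer f.Close()

		s := packfile.NewScanner(f)
		_, count, err := s.Header()
		if err != nil {
			panic(err)
		}

		for i := uint32(0); i < count; i++ {
			h, err := s.NextObjectHeader()
			if err != nil {
				panic(err)
			}

			// Discard the content here; a real consumer would write it
			// into a plumbing.Object instead.
			n, crc, err := s.NextObject(ioutil.Discard)
			if err != nil {
				panic(err)
			}
			fmt.Printf("%s at %d: %d bytes (crc32 %08x)\n", h.Type, h.Offset, n, crc)
		}

		checksum, err := s.Checksum()
		if err != nil {
			panic(err)
		}
		fmt.Println("checksum:", checksum)
	}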