Diffstat (limited to 'formats')
-rw-r--r--  formats/idxfile/decoder.go                    160
-rw-r--r--  formats/idxfile/decoder_test.go                40
-rw-r--r--  formats/idxfile/doc.go                        130
-rw-r--r--  formats/idxfile/encoder.go                    124
-rw-r--r--  formats/idxfile/encoder_test.go                47
-rw-r--r--  formats/idxfile/idxfile.go                     61
-rw-r--r--  formats/objfile/reader_test.go                  6
-rw-r--r--  formats/objfile/writer_test.go                  8
-rw-r--r--  formats/packfile/common.go                     63
-rw-r--r--  formats/packfile/decoder.go                   116
-rw-r--r--  formats/packfile/decoder_test.go (renamed from formats/packfile/reader_test.go)  72
-rw-r--r--  formats/packfile/delta.go                     195
-rw-r--r--  formats/packfile/doc.go                       331
-rw-r--r--  formats/packfile/error.go                      30
-rw-r--r--  formats/packfile/parser.go                    353
-rw-r--r--  formats/packfile/parser_test.go               412
-rw-r--r--  formats/packfile/read_recaller.go              39
-rw-r--r--  formats/packfile/read_recaller_impl_test.go   296
-rw-r--r--  formats/packfile/reader.go                    338
-rw-r--r--  formats/packfile/seekable.go                  108
-rw-r--r--  formats/packfile/stream.go                     95
21 files changed, 2328 insertions(+), 696 deletions(-)
diff --git a/formats/idxfile/decoder.go b/formats/idxfile/decoder.go
new file mode 100644
index 0000000..72a9338
--- /dev/null
+++ b/formats/idxfile/decoder.go
@@ -0,0 +1,160 @@
+package idxfile
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "io"
+
+ "gopkg.in/src-d/go-git.v3/core"
+)
+
+var (
+ // ErrUnsupportedVersion is returned by Decode when the idx file version
+ // is not supported.
+ ErrUnsupportedVersion = errors.New("Unsupported version")
+ // ErrMalformedIdxFile is returned by Decode when the idx file is corrupted.
+ ErrMalformedIdxFile = errors.New("Malformed IDX file")
+)
+
+// A Decoder reads and decodes idx files from an input stream.
+type Decoder struct {
+ io.Reader
+}
+
+// NewDecoder returns a new decoder that reads from r.
+func NewDecoder(r io.Reader) *Decoder {
+ return &Decoder{r}
+}
+
+// Decode reads the whole idx object from its input and stores it in the
+// value pointed to by idx.
+func (d *Decoder) Decode(idx *Idxfile) error {
+ if err := validateHeader(d); err != nil {
+ return err
+ }
+
+ flow := []func(*Idxfile, io.Reader) error{
+ readVersion,
+ readFanout,
+ readObjectNames,
+ readCRC32,
+ readOffsets,
+ readChecksums,
+ }
+
+ for _, f := range flow {
+ if err := f(idx, d); err != nil {
+ return err
+ }
+ }
+
+ if !idx.isValid() {
+ return ErrMalformedIdxFile
+ }
+
+ return nil
+}
+
+func validateHeader(r io.Reader) error {
+ var h = make([]byte, 4)
+ if _, err := r.Read(h); err != nil {
+ return err
+ }
+
+ if !bytes.Equal(h, idxHeader) {
+ return ErrMalformedIdxFile
+ }
+
+ return nil
+}
+
+func readVersion(idx *Idxfile, r io.Reader) error {
+ v, err := readInt32(r)
+ if err != nil {
+ return err
+ }
+
+ if v > VersionSupported {
+ return ErrUnsupportedVersion
+ }
+
+ idx.Version = v
+
+ return nil
+}
+
+func readFanout(idx *Idxfile, r io.Reader) error {
+ var err error
+
+ for i := 0; i < 255; i++ {
+ idx.Fanout[i], err = readInt32(r)
+ if err != nil {
+ return err
+ }
+ }
+
+ idx.ObjectCount, err = readInt32(r)
+
+ return err
+}
+
+func readObjectNames(idx *Idxfile, r io.Reader) error {
+ c := int(idx.ObjectCount)
+ for i := 0; i < c; i++ {
+ var ref core.Hash
+ if _, err := r.Read(ref[:]); err != nil {
+ return err
+ }
+
+ idx.Entries = append(idx.Entries, Entry{Hash: ref})
+ }
+
+ return nil
+}
+
+func readCRC32(idx *Idxfile, r io.Reader) error {
+ c := int(idx.ObjectCount)
+ for i := 0; i < c; i++ {
+ if _, err := r.Read(idx.Entries[i].CRC32[:]); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+func readOffsets(idx *Idxfile, r io.Reader) error {
+ c := int(idx.ObjectCount)
+ for i := 0; i < c; i++ {
+ o, err := readInt32(r)
+ if err != nil {
+ return err
+ }
+
+ idx.Entries[i].Offset = uint64(o)
+ }
+
+ return nil
+}
+
+func readChecksums(idx *Idxfile, r io.Reader) error {
+ if _, err := r.Read(idx.PackfileChecksum[:]); err != nil {
+ return err
+ }
+
+ if _, err := r.Read(idx.IdxChecksum[:]); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func readInt32(r io.Reader) (uint32, error) {
+ var v uint32
+ if err := binary.Read(r, binary.BigEndian, &v); err != nil {
+ return 0, err
+ }
+
+ return v, nil
+}
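
A minimal sketch of how the new decoder is driven from client code (the import path is the one used in this change; the idx file path is hypothetical):

    package main

    import (
        "fmt"
        "os"

        "gopkg.in/src-d/go-git.v3/formats/idxfile"
    )

    func main() {
        f, err := os.Open("pack-0123abcd.idx") // hypothetical path
        if err != nil {
            panic(err)
        }
        defer f.Close()

        idx := &idxfile.Idxfile{}
        if err := idxfile.NewDecoder(f).Decode(idx); err != nil {
            panic(err)
        }

        fmt.Println("objects:", idx.ObjectCount)
        fmt.Println("first hash:", idx.Entries[0].Hash.String())
    }
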
diff --git a/formats/idxfile/decoder_test.go b/formats/idxfile/decoder_test.go
new file mode 100644
index 0000000..597a002
--- /dev/null
+++ b/formats/idxfile/decoder_test.go
@@ -0,0 +1,40 @@
+package idxfile
+
+import (
+ "fmt"
+ "os"
+ "testing"
+
+ . "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) { TestingT(t) }
+
+type IdxfileSuite struct{}
+
+var _ = Suite(&IdxfileSuite{})
+
+func (s *IdxfileSuite) TestDecode(c *C) {
+ f, err := os.Open("fixtures/git-fixture.idx")
+ c.Assert(err, IsNil)
+
+ d := NewDecoder(f)
+ idx := &Idxfile{}
+ err = d.Decode(idx)
+ c.Assert(err, IsNil)
+
+ err = f.Close()
+ c.Assert(err, IsNil)
+
+ c.Assert(int(idx.ObjectCount), Equals, 31)
+ c.Assert(idx.Entries, HasLen, 31)
+ c.Assert(idx.Entries[0].Hash.String(), Equals,
+ "1669dce138d9b841a518c64b10914d88f5e488ea")
+ c.Assert(idx.Entries[0].Offset, Equals, uint64(615))
+
+ c.Assert(fmt.Sprintf("%x", idx.IdxChecksum), Equals,
+ "bba9b7a9895724819225a044c857d391bb9d61d9")
+ c.Assert(fmt.Sprintf("%x", idx.PackfileChecksum), Equals,
+ "54bb61360ab2dad1a3e344a8cd3f82b848518cba")
+
+}
diff --git a/formats/idxfile/doc.go b/formats/idxfile/doc.go
new file mode 100644
index 0000000..74149ab
--- /dev/null
+++ b/formats/idxfile/doc.go
@@ -0,0 +1,130 @@
+/*
+== Original (version 1) pack-*.idx files have the following format:
+
+ - The header consists of 256 4-byte network byte order
+ integers. N-th entry of this table records the number of
+ objects in the corresponding pack, the first byte of whose
+ object name is less than or equal to N. This is called the
+ 'first-level fan-out' table.
+
+ - The header is followed by sorted 24-byte entries, one entry
+ per object in the pack. Each entry is:
+
+ 4-byte network byte order integer, recording where the
+ object is stored in the packfile as the offset from the
+ beginning.
+
+ 20-byte object name.
+
+ - The file is concluded with a trailer:
+
+ A copy of the 20-byte SHA1 checksum at the end of
+ corresponding packfile.
+
+ 20-byte SHA1-checksum of all of the above.
+
+Pack Idx file:
+
+ -- +--------------------------------+
+fanout | fanout[0] = 2 (for example) |-.
+table +--------------------------------+ |
+ | fanout[1] | |
+ +--------------------------------+ |
+ | fanout[2] | |
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+ | fanout[255] = total objects |---.
+ -- +--------------------------------+ | |
+main | offset | | |
+index | object name 00XXXXXXXXXXXXXXXX | | |
+table +--------------------------------+ | |
+ | offset | | |
+ | object name 00XXXXXXXXXXXXXXXX | | |
+ +--------------------------------+<+ |
+ .-| offset | |
+ | | object name 01XXXXXXXXXXXXXXXX | |
+ | +--------------------------------+ |
+ | | offset | |
+ | | object name 01XXXXXXXXXXXXXXXX | |
+ | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+ | | offset | |
+ | | object name FFXXXXXXXXXXXXXXXX | |
+ --| +--------------------------------+<--+
+trailer | | packfile checksum |
+ | +--------------------------------+
+ | | idxfile checksum |
+ | +--------------------------------+
+ .-------.
+ |
+Pack file entry: <+
+
+ packed object header:
+ 1-byte size extension bit (MSB)
+ type (next 3 bit)
+ size0 (lower 4-bit)
+ n-byte sizeN (as long as MSB is set, each 7-bit)
+ size0..sizeN form 4+7+7+..+7 bit integer, size0
+ is the least significant part, and sizeN is the
+ most significant part.
+ packed object data:
+ If it is not DELTA, then deflated bytes (the size above
+ is the size before compression).
+ If it is REF_DELTA, then
+ 20-byte base object name SHA1 (the size above is the
+ size of the delta data that follows).
+ delta data, deflated.
+ If it is OFS_DELTA, then
+ n-byte offset (see below) interpreted as a negative
+ offset from the type-byte of the header of the
+ ofs-delta entry (the size above is the size of
+ the delta data that follows).
+ delta data, deflated.
+
+ offset encoding:
+ n bytes with MSB set in all but the last one.
+ The offset is then the number constructed by
+ concatenating the lower 7 bit of each byte, and
+ for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
+ to the result.
+
+
+
+== Version 2 pack-*.idx files support packs larger than 4 GiB, and
+ have some other reorganizations. They have the format:
+
+ - A 4-byte magic number '\377tOc' which is an unreasonable
+ fanout[0] value.
+
+ - A 4-byte version number (= 2)
+
+ - A 256-entry fan-out table just like v1.
+
+ - A table of sorted 20-byte SHA1 object names. These are
+ packed together without offset values to reduce the cache
+ footprint of the binary search for a specific object name.
+
+ - A table of 4-byte CRC32 values of the packed object data.
+ This is new in v2 so compressed data can be copied directly
+ from pack to pack during repacking without undetected
+ data corruption.
+
+ - A table of 4-byte offset values (in network byte order).
+ These are usually 31-bit pack file offsets, but large
+ offsets are encoded as an index into the next table with
+ the msbit set.
+
+ - A table of 8-byte offset entries (empty for pack files less
+ than 2 GiB). Pack files are organized with heavily used
+ objects toward the front, so most object references should
+ not need to refer to this table.
+
+ - The same trailer as a v1 pack file:
+
+ A copy of the 20-byte SHA1 checksum at the end of
+ corresponding packfile.
+
+ 20-byte SHA1-checksum of all of the above.
+
+From:
+https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt
+*/
+package idxfile
diff --git a/formats/idxfile/encoder.go b/formats/idxfile/encoder.go
new file mode 100644
index 0000000..f85ff84
--- /dev/null
+++ b/formats/idxfile/encoder.go
@@ -0,0 +1,124 @@
+package idxfile
+
+import (
+ "crypto/sha1"
+ "encoding/binary"
+ "hash"
+ "io"
+)
+
+// An Encoder writes idx files to an output stream.
+type Encoder struct {
+ io.Writer
+ hash hash.Hash
+}
+
+// NewEncoder returns a new encoder that writes to w.
+func NewEncoder(w io.Writer) *Encoder {
+ h := sha1.New()
+ mw := io.MultiWriter(w, h)
+ return &Encoder{mw, h}
+}
+
+// Encode writes the idx in idx file format to the encoder's stream.
+func (e *Encoder) Encode(idx *Idxfile) (int, error) {
+ flow := []func(*Idxfile) (int, error){
+ e.encodeHeader,
+ e.encodeFanout,
+ e.encodeHashes,
+ e.encodeCRC32,
+ e.encodeOffsets,
+ e.encodeChecksums,
+ }
+
+ sz := 0
+ for _, f := range flow {
+ i, err := f(idx)
+ sz += i
+
+ if err != nil {
+ return sz, err
+ }
+ }
+
+ return sz, nil
+}
+
+func (e *Encoder) encodeHeader(idx *Idxfile) (int, error) {
+ c, err := e.Write(idxHeader)
+ if err != nil {
+ return c, err
+ }
+
+ return c + 4, e.writeInt32(idx.Version)
+}
+
+func (e *Encoder) encodeFanout(idx *Idxfile) (int, error) {
+ fanout := idx.calculateFanout()
+ for _, c := range fanout {
+ if err := e.writeInt32(c); err != nil {
+ return 0, err
+ }
+ }
+
+ return 1024, nil
+}
+
+func (e *Encoder) encodeHashes(idx *Idxfile) (int, error) {
+ return e.encodeEntryField(idx, true)
+}
+
+func (e *Encoder) encodeCRC32(idx *Idxfile) (int, error) {
+ return e.encodeEntryField(idx, false)
+}
+
+func (e *Encoder) encodeEntryField(idx *Idxfile, isHash bool) (int, error) {
+ sz := 0
+ for _, ent := range idx.Entries {
+ var data []byte
+ if isHash {
+ data = ent.Hash[:]
+ } else {
+ data = ent.CRC32[:]
+ }
+ i, err := e.Write(data)
+ sz += i
+
+ if err != nil {
+ return sz, err
+ }
+ }
+
+ return sz, nil
+}
+
+func (e *Encoder) encodeOffsets(idx *Idxfile) (int, error) {
+ sz := 0
+ for _, ent := range idx.Entries {
+ if err := e.writeInt32(uint32(ent.Offset)); err != nil {
+ return sz, err
+ }
+
+ sz += 4
+
+ }
+
+ return sz, nil
+}
+
+func (e *Encoder) encodeChecksums(idx *Idxfile) (int, error) {
+ if _, err := e.Write(idx.PackfileChecksum[:]); err != nil {
+ return 0, err
+ }
+
+ copy(idx.IdxChecksum[:], e.hash.Sum(nil)[:20])
+ if _, err := e.Write(idx.IdxChecksum[:]); err != nil {
+ return 0, err
+ }
+
+ return 40, nil
+}
+
+func (e *Encoder) writeInt32(value uint32) error {
+ return binary.Write(e, binary.BigEndian, value)
+}
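
The tests below check that Encode is the exact inverse of Decode. A sketch of that round-trip property, as it would look inside package idxfile (with "bytes" imported):

    // roundTrip decodes an idx file and re-encodes it, reporting
    // whether the output is byte-for-byte identical to the input.
    func roundTrip(in []byte) (bool, error) {
        idx := &Idxfile{}
        if err := NewDecoder(bytes.NewReader(in)).Decode(idx); err != nil {
            return false, err
        }

        var out bytes.Buffer
        if _, err := NewEncoder(&out).Encode(idx); err != nil {
            return false, err
        }

        return bytes.Equal(in, out.Bytes()), nil
    }
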
diff --git a/formats/idxfile/encoder_test.go b/formats/idxfile/encoder_test.go
new file mode 100644
index 0000000..bfb9f91
--- /dev/null
+++ b/formats/idxfile/encoder_test.go
@@ -0,0 +1,47 @@
+package idxfile
+
+import (
+ "bytes"
+ "io"
+ "os"
+
+ . "gopkg.in/check.v1"
+)
+
+func (s *IdxfileSuite) TestEncode(c *C) {
+ for i, path := range [...]string{
+ "fixtures/git-fixture.idx",
+ "../packfile/fixtures/spinnaker-spinnaker.idx",
+ } {
+ com := Commentf("subtest %d: path = %s", i, path)
+
+ exp, idx, err := decode(path)
+ c.Assert(err, IsNil, com)
+
+ obt := new(bytes.Buffer)
+ e := NewEncoder(obt)
+ size, err := e.Encode(idx)
+ c.Assert(err, IsNil, com)
+
+ c.Assert(size, Equals, exp.Len(), com)
+ c.Assert(obt, DeepEquals, exp, com)
+ }
+}
+
+func decode(path string) (*bytes.Buffer, *Idxfile, error) {
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ cont := new(bytes.Buffer)
+ tee := io.TeeReader(f, cont)
+
+ d := NewDecoder(tee)
+ idx := &Idxfile{}
+ if err = d.Decode(idx); err != nil {
+ return nil, nil, err
+ }
+
+ return cont, idx, f.Close()
+}
diff --git a/formats/idxfile/idxfile.go b/formats/idxfile/idxfile.go
new file mode 100644
index 0000000..5f12dad
--- /dev/null
+++ b/formats/idxfile/idxfile.go
@@ -0,0 +1,61 @@
+package idxfile
+
+import "gopkg.in/src-d/go-git.v3/core"
+
+const (
+ // VersionSupported is the only idx version supported.
+ VersionSupported = 2
+)
+
+var (
+ idxHeader = []byte{255, 't', 'O', 'c'}
+)
+
+// An Idxfile represents an idx file in memory.
+type Idxfile struct {
+ Version uint32
+ Fanout [255]uint32
+ ObjectCount uint32
+ Entries []Entry
+ PackfileChecksum [20]byte
+ IdxChecksum [20]byte
+}
+
+// An Entry represents data about an object in the packfile: its hash,
+// offset and CRC32 checksum.
+type Entry struct {
+ Hash core.Hash
+ CRC32 [4]byte
+ Offset uint64
+}
+
+func (idx *Idxfile) isValid() bool {
+ fanout := idx.calculateFanout()
+ for k, c := range idx.Fanout {
+ if fanout[k] != c {
+ return false
+ }
+ }
+
+ return true
+}
+
+func (idx *Idxfile) calculateFanout() [256]uint32 {
+ fanout := [256]uint32{}
+ var c uint32
+ for _, e := range idx.Entries {
+ c++
+ fanout[e.Hash[0]] = c
+ }
+
+ var i uint32
+ for k, c := range fanout {
+ if c != 0 {
+ i = c
+ }
+
+ fanout[k] = i
+ }
+
+ return fanout
+}
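
calculateFanout builds the cumulative first-level fan-out table: after the second pass, fanout[k] holds the number of entries whose first hash byte is less than or equal to k. Entries are assumed sorted by hash, as idx files require. A toy trace inside package idxfile:

    // With first hash bytes 0x00, 0x00 and 0x02, the cumulative
    // table reads 2, 2, 3, 3, ..., 3.
    func fanoutExample() [256]uint32 {
        idx := &Idxfile{Entries: []Entry{
            {Hash: core.Hash{0x00}},
            {Hash: core.Hash{0x00}},
            {Hash: core.Hash{0x02}},
        }}

        return idx.calculateFanout() // [0]=2, [1]=2, [2]=3, ..., [255]=3
    }
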
diff --git a/formats/objfile/reader_test.go b/formats/objfile/reader_test.go
index caebb60..b2c3d0c 100644
--- a/formats/objfile/reader_test.go
+++ b/formats/objfile/reader_test.go
@@ -19,16 +19,16 @@ var _ = Suite(&SuiteReader{})
func (s *SuiteReader) TestReadObjfile(c *C) {
for k, fixture := range objfileFixtures {
- comment := fmt.Sprintf("test %d: ", k)
+ com := fmt.Sprintf("test %d: ", k)
hash := core.NewHash(fixture.hash)
content, _ := base64.StdEncoding.DecodeString(fixture.content)
data, _ := base64.StdEncoding.DecodeString(fixture.data)
- testReader(c, bytes.NewReader(data), hash, fixture.t, content, comment)
+ testReader(c, bytes.NewReader(data), hash, fixture.t, content, com)
}
}
-func testReader(c *C, source io.Reader, hash core.Hash, typ core.ObjectType, content []byte, comment string) {
+func testReader(c *C, source io.Reader, hash core.Hash, typ core.ObjectType, content []byte, com string) {
r, err := NewReader(source)
c.Assert(err, IsNil)
c.Assert(r.Type(), Equals, typ)
diff --git a/formats/objfile/writer_test.go b/formats/objfile/writer_test.go
index 0061f3f..160491c 100644
--- a/formats/objfile/writer_test.go
+++ b/formats/objfile/writer_test.go
@@ -16,20 +16,20 @@ var _ = Suite(&SuiteWriter{})
func (s *SuiteWriter) TestWriteObjfile(c *C) {
for k, fixture := range objfileFixtures {
- comment := fmt.Sprintf("test %d: ", k)
+ com := fmt.Sprintf("test %d: ", k)
hash := core.NewHash(fixture.hash)
content, _ := base64.StdEncoding.DecodeString(fixture.content)
buffer := new(bytes.Buffer)
// Write the data out to the buffer
- testWriter(c, buffer, hash, fixture.t, content, comment)
+ testWriter(c, buffer, hash, fixture.t, content, com)
// Read the data back in from the buffer to be sure it matches
- testReader(c, buffer, hash, fixture.t, content, comment)
+ testReader(c, buffer, hash, fixture.t, content, com)
}
}
-func testWriter(c *C, dest io.Writer, hash core.Hash, typ core.ObjectType, content []byte, comment string) {
+func testWriter(c *C, dest io.Writer, hash core.Hash, typ core.ObjectType, content []byte, com string) {
length := int64(len(content))
w, err := NewWriter(dest, typ, length)
c.Assert(err, IsNil)
diff --git a/formats/packfile/common.go b/formats/packfile/common.go
deleted file mode 100644
index b5f8de2..0000000
--- a/formats/packfile/common.go
+++ /dev/null
@@ -1,63 +0,0 @@
-package packfile
-
-import (
- "bufio"
- "fmt"
- "io"
-)
-
-type trackingReader struct {
- r io.Reader
- position int64
-}
-
-func NewTrackingReader(r io.Reader) *trackingReader {
- return &trackingReader{
- r: bufio.NewReader(r),
- }
-}
-
-func (t *trackingReader) Read(p []byte) (n int, err error) {
- n, err = t.r.Read(p)
- if err != nil {
- return 0, err
- }
-
- t.position += int64(n)
- return n, err
-}
-
-func (t *trackingReader) ReadByte() (c byte, err error) {
- var p [1]byte
- n, err := t.r.Read(p[:])
- if err != nil {
- return 0, err
- }
-
- if n > 1 {
- return 0, fmt.Errorf("read %d bytes, should have read just 1", n)
- }
-
- t.position++
- return p[0], nil
-}
-
-// checkClose is used with defer to close the given io.Closer and check its
-// returned error value. If Close returns an error and the given *error
-// is not nil, *error is set to the error returned by Close.
-//
-// checkClose is typically used with named return values like so:
-//
-// func do(obj *Object) (err error) {
-// w, err := obj.Writer()
-// if err != nil {
-// return nil
-// }
-// defer checkClose(w, &err)
-// // work with w
-// }
-func checkClose(c io.Closer, err *error) {
- if cerr := c.Close(); cerr != nil && *err == nil {
- *err = cerr
- }
-}
diff --git a/formats/packfile/decoder.go b/formats/packfile/decoder.go
new file mode 100644
index 0000000..e8c5c6a
--- /dev/null
+++ b/formats/packfile/decoder.go
@@ -0,0 +1,116 @@
+package packfile
+
+import (
+ "io"
+
+ "gopkg.in/src-d/go-git.v3/core"
+)
+
+// Format specifies whether the packfile uses ref-deltas or ofs-deltas.
+type Format int
+
+// Possible values of the Format type.
+const (
+ UnknownFormat Format = iota
+ OFSDeltaFormat
+ REFDeltaFormat
+)
+
+var (
+ // ErrMaxObjectsLimitReached is returned by Decode when the number
+ // of objects in the packfile is higher than
+ // Decoder.MaxObjectsLimit.
+ ErrMaxObjectsLimitReached = NewError("max. objects limit reached")
+
+ // ErrInvalidObject is returned by Decode when an invalid object is
+ // found in the packfile.
+ ErrInvalidObject = NewError("invalid git object")
+
+ // ErrPackEntryNotFound is returned by Decode when a reference in
+ // the packfile references and unknown object.
+ ErrPackEntryNotFound = NewError("can't find a pack entry")
+
+ // ErrZLib is returned by Decode when there was an error unzipping
+ // the packfile contents.
+ ErrZLib = NewError("zlib reading error")
+)
+
+const (
+ // DefaultMaxObjectsLimit is the maximum amount of objects the
+ // decoder will decode before returning ErrMaxObjectsLimitReached.
+ DefaultMaxObjectsLimit = 1 << 20
+)
+
+// Decoder reads and decodes packfiles from an input stream.
+type Decoder struct {
+ // MaxObjectsLimit is the maximum number of objects to decode from the
+ // packfile; if the packfile contains more, Decode returns
+ // ErrMaxObjectsLimitReached. The default, DefaultMaxObjectsLimit, is
+ // usually more than enough for any repository; with much higher limits
+ // and huge repositories you can run out of memory.
+ MaxObjectsLimit uint32
+
+ p *Parser
+ s core.ObjectStorage
+}
+
+// NewDecoder returns a new Decoder that reads from r.
+func NewDecoder(r ReadRecaller) *Decoder {
+ return &Decoder{
+ MaxObjectsLimit: DefaultMaxObjectsLimit,
+
+ p: NewParser(r),
+ }
+}
+
+// Decode reads a packfile and stores it in the value pointed to by s.
+func (d *Decoder) Decode(s core.ObjectStorage) error {
+ d.s = s
+
+ count, err := d.p.ReadHeader()
+ if err != nil {
+ return err
+ }
+
+ if count > d.MaxObjectsLimit {
+ return ErrMaxObjectsLimitReached.AddDetails("%d", count)
+ }
+
+ err = d.readObjects(count)
+
+ return err
+}
+
+func (d *Decoder) readObjects(count uint32) error {
+ // This code has 50-80 µs of overhead per object not counting zlib inflation.
+ // Together with zlib inflation, it's 400-410 µs for small objects.
+ // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB,
+ // of which 12-20% is _not_ zlib inflation (i.e. our code).
+ for i := 0; i < int(count); i++ {
+ start, err := d.p.Offset()
+ if err != nil {
+ return err
+ }
+
+ obj, err := d.p.ReadObject()
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+
+ return err
+ }
+
+ err = d.p.Remember(start, obj)
+ if err != nil {
+ return err
+ }
+
+ _, err = d.s.Set(obj)
+ if err == io.EOF {
+ break
+ }
+ }
+
+ return nil
+}
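
A sketch of wiring the decoder to a file on disk, mirroring the tests below (with "os" and the storage/memory package imported). NewSeekable (seekable.go, further down) is the natural ReadRecaller for files, since it can re-read delta bases by offset; NewStream (stream.go) presumably suits non-seekable inputs at the cost of remembering objects in memory. The path is hypothetical:

    func decodePack(path string) (*memory.ObjectStorage, error) {
        f, err := os.Open(path)
        if err != nil {
            return nil, err
        }
        defer f.Close()

        d := NewDecoder(NewSeekable(f))
        sto := memory.NewObjectStorage()
        if err := d.Decode(sto); err != nil {
            return nil, err
        }

        return sto, nil
    }
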
diff --git a/formats/packfile/reader_test.go b/formats/packfile/decoder_test.go
index 9ae569d..0c471a2 100644
--- a/formats/packfile/reader_test.go
+++ b/formats/packfile/decoder_test.go
@@ -26,15 +26,15 @@ var packFileWithEmptyObjects = "UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FS
func (s *ReaderSuite) TestReadPackfile(c *C) {
data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects)
- d := bytes.NewReader(data)
+ f := bytes.NewReader(data)
+ r := NewStream(f)
+ d := NewDecoder(r)
- r := NewReader(d)
-
- storage := memory.NewObjectStorage()
- _, err := r.Read(storage)
+ sto := memory.NewObjectStorage()
+ err := d.Decode(sto)
c.Assert(err, IsNil)
- AssertObjects(c, storage, []string{
+ AssertObjects(c, sto, []string{
"778c85ff95b5514fea0ba4c7b6a029d32e2c3b96",
"db4002e880a08bf6cc7217512ad937f1ac8824a2",
"551fe11a9ef992763b7e0be4500cf7169f2f8575",
@@ -57,18 +57,17 @@ func (s *ReaderSuite) TestReadPackfileREFDelta(c *C) {
s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat)
}
-func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, f Format) {
- d, err := os.Open(file)
+func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, format Format) {
+ f, err := os.Open(file)
c.Assert(err, IsNil)
+ r := NewSeekable(f)
+ d := NewDecoder(r)
- r := NewReader(d)
- r.Format = f
-
- storage := memory.NewObjectStorage()
- _, err = r.Read(storage)
+ sto := memory.NewObjectStorage()
+ err = d.Decode(sto)
c.Assert(err, IsNil)
- AssertObjects(c, storage, []string{
+ AssertObjects(c, sto, []string{
"918c48b83bd081e863dbe1b80f8998f058cd8294",
"af2d6a6954d532f8ffb47615169c8fdf9d383a1a",
"1669dce138d9b841a518c64b10914d88f5e488ea",
@@ -102,10 +101,10 @@ func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, f Format) {
func AssertObjects(c *C, s *memory.ObjectStorage, expects []string) {
c.Assert(len(expects), Equals, len(s.Objects))
- for _, expected := range expects {
- obtained, err := s.Get(core.NewHash(expected))
+ for _, exp := range expects {
+ obt, err := s.Get(core.NewHash(exp))
c.Assert(err, IsNil)
- c.Assert(obtained.Hash().String(), Equals, expected)
+ c.Assert(obt.Hash().String(), Equals, exp)
}
}
@@ -139,12 +138,12 @@ func (s *ReaderSuite) BenchmarkGit(c *C) {
}
}
-func (s *ReaderSuite) _TestMemoryOFS(c *C) {
+func (s *ReaderSuite) _testMemory(c *C, format Format) {
var b, a runtime.MemStats
start := time.Now()
runtime.ReadMemStats(&b)
- p := readFromFile(c, "/tmp/symfony.ofs-delta", OFSDeltaFormat)
+ p := readFromFile(c, "/tmp/symfony.ofs-delta", format)
runtime.ReadMemStats(&a)
fmt.Println("OFS--->")
@@ -157,34 +156,23 @@ func (s *ReaderSuite) _TestMemoryOFS(c *C) {
fmt.Println("time", time.Since(start))
}
-func (s *ReaderSuite) _TestMemoryREF(c *C) {
- var b, a runtime.MemStats
-
- start := time.Now()
- runtime.ReadMemStats(&b)
- p := readFromFile(c, "/tmp/symonfy", REFDeltaFormat)
- runtime.ReadMemStats(&a)
-
- fmt.Println("REF--->")
- fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc))
- fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc))
- fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, humanize.Bytes(a.HeapAlloc-b.HeapAlloc))
- fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys))
+func (s *ReaderSuite) _TestMemoryOFS(c *C) {
+ s._testMemory(c, OFSDeltaFormat)
+}
- fmt.Println("objects", len(p.Objects))
- fmt.Println("time", time.Since(start))
+func (s *ReaderSuite) _TestMemoryREF(c *C) {
+ s._testMemory(c, REFDeltaFormat)
}
-func readFromFile(c *C, file string, f Format) *memory.ObjectStorage {
- d, err := os.Open(file)
+func readFromFile(c *C, file string, format Format) *memory.ObjectStorage {
+ f, err := os.Open(file)
c.Assert(err, IsNil)
+ r := NewSeekable(f)
+ d := NewDecoder(r)
- r := NewReader(d)
- r.Format = f
-
- storage := memory.NewObjectStorage()
- _, err = r.Read(storage)
+ sto := memory.NewObjectStorage()
+ err = d.Decode(sto)
c.Assert(err, IsNil)
- return storage
+ return sto
}
diff --git a/formats/packfile/delta.go b/formats/packfile/delta.go
index 571ccf8..e0bbb65 100644
--- a/formats/packfile/delta.go
+++ b/formats/packfile/delta.go
@@ -1,117 +1,148 @@
package packfile
-import "io"
+// See https://github.com/git/git/blob/49fa3dc76179e04b0833542fa52d0f287a4955ac/delta.h
+// https://github.com/git/git/blob/c2c5f6b1e479f2c38e0e01345350620944e3527f/patch-delta.c,
+// and https://github.com/tarruda/node-git-core/blob/master/src/js/delta.js
+// for details about the delta format.
const deltaSizeMin = 4
-func deltaHeaderSize(b []byte) (uint, []byte) {
- var size, j uint
- var cmd byte
- for {
- cmd = b[j]
- size |= (uint(cmd) & 0x7f) << (j * 7)
- j++
- if uint(cmd)&0xb80 == 0 || j == uint(len(b)) {
- break
- }
- }
- return size, b[j:]
-}
-
-func patchDelta(src, delta []byte) []byte {
+// PatchDelta returns the result of applying the modification deltas in delta to src.
+func PatchDelta(src, delta []byte) []byte {
if len(delta) < deltaSizeMin {
return nil
}
- size, delta := deltaHeaderSize(delta)
- if size != uint(len(src)) {
+
+ srcSz, delta := decodeLEB128(delta)
+ if srcSz != uint(len(src)) {
return nil
}
- size, delta = deltaHeaderSize(delta)
- origSize := size
- dest := make([]byte, 0)
+ targetSz, delta := decodeLEB128(delta)
+ remainingTargetSz := targetSz
- // var offset uint
+ var dest []byte
var cmd byte
for {
cmd = delta[0]
delta = delta[1:]
- if (cmd & 0x80) != 0 {
- var cp_off, cp_size uint
- if (cmd & 0x01) != 0 {
- cp_off = uint(delta[0])
- delta = delta[1:]
- }
- if (cmd & 0x02) != 0 {
- cp_off |= uint(delta[0]) << 8
- delta = delta[1:]
- }
- if (cmd & 0x04) != 0 {
- cp_off |= uint(delta[0]) << 16
- delta = delta[1:]
- }
- if (cmd & 0x08) != 0 {
- cp_off |= uint(delta[0]) << 24
- delta = delta[1:]
- }
-
- if (cmd & 0x10) != 0 {
- cp_size = uint(delta[0])
- delta = delta[1:]
- }
- if (cmd & 0x20) != 0 {
- cp_size |= uint(delta[0]) << 8
- delta = delta[1:]
- }
- if (cmd & 0x40) != 0 {
- cp_size |= uint(delta[0]) << 16
- delta = delta[1:]
- }
- if cp_size == 0 {
- cp_size = 0x10000
- }
- if cp_off+cp_size < cp_off ||
- cp_off+cp_size > uint(len(src)) ||
- cp_size > origSize {
+ if isCopyFromSrc(cmd) {
+ var offset, sz uint
+ offset, delta = decodeOffset(cmd, delta)
+ sz, delta = decodeSize(cmd, delta)
+ if invalidSize(sz, targetSz) ||
+ invalidOffsetSize(offset, sz, srcSz) {
break
}
- dest = append(dest, src[cp_off:cp_off+cp_size]...)
- size -= cp_size
- } else if cmd != 0 {
- if uint(cmd) > origSize {
+ dest = append(dest, src[offset:offset+sz]...)
+ remainingTargetSz -= sz
+ } else if isCopyFromDelta(cmd) {
+ sz := uint(cmd) // cmd is the size itself
+ if invalidSize(sz, targetSz) {
break
}
- dest = append(dest, delta[0:uint(cmd)]...)
- size -= uint(cmd)
- delta = delta[uint(cmd):]
+ dest = append(dest, delta[0:sz]...)
+ remainingTargetSz -= sz
+ delta = delta[sz:]
} else {
return nil
}
- if size <= 0 {
+
+ if remainingTargetSz <= 0 {
break
}
}
+
return dest
}
-func decodeOffset(src io.ByteReader, steps int64) (int64, error) {
- b, err := src.ReadByte()
- if err != nil {
- return 0, err
- }
+// decodeLEB128 decodes an unsigned LEB128 number at the start of some
+// binary data and returns the decoded number and the rest of the
+// stream.
+//
+// This must be called twice on the delta data buffer, first to get the
+// expected source buffer size, and again to get the target buffer size.
+func decodeLEB128(input []byte) (uint, []byte) {
+ var num, sz uint
+ var b byte
+ for {
+ b = input[sz]
+ num |= (uint(b) & payload) << (sz * 7) // concatenates 7-bit chunks
+ sz++
- var offset = int64(b & 0x7f)
- for (b & 0x80) != 0 {
- offset++ // WHY?
- b, err = src.ReadByte()
- if err != nil {
- return 0, err
+ if uint(b)&continuation == 0 || sz == uint(len(input)) {
+ break
}
+ }
+
+ return num, input[sz:]
+}
+
+const (
+ payload = 0x7f // 0111 1111
+ continuation = 0x80 // 1000 0000
+)
- offset = (offset << 7) + int64(b&0x7f)
+func isCopyFromSrc(cmd byte) bool {
+ return (cmd & 0x80) != 0
+}
+
+func isCopyFromDelta(cmd byte) bool {
+ return (cmd&0x80) == 0 && cmd != 0
+}
+
+func decodeOffset(cmd byte, delta []byte) (uint, []byte) {
+ var offset uint
+ if (cmd & 0x01) != 0 {
+ offset = uint(delta[0])
+ delta = delta[1:]
+ }
+ if (cmd & 0x02) != 0 {
+ offset |= uint(delta[0]) << 8
+ delta = delta[1:]
+ }
+ if (cmd & 0x04) != 0 {
+ offset |= uint(delta[0]) << 16
+ delta = delta[1:]
+ }
+ if (cmd & 0x08) != 0 {
+ offset |= uint(delta[0]) << 24
+ delta = delta[1:]
+ }
+
+ return offset, delta
+}
+
+func decodeSize(cmd byte, delta []byte) (uint, []byte) {
+ var sz uint
+ if (cmd & 0x10) != 0 {
+ sz = uint(delta[0])
+ delta = delta[1:]
+ }
+ if (cmd & 0x20) != 0 {
+ sz |= uint(delta[0]) << 8
+ delta = delta[1:]
+ }
+ if (cmd & 0x40) != 0 {
+ sz |= uint(delta[0]) << 16
+ delta = delta[1:]
+ }
+ if sz == 0 {
+ sz = 0x10000
}
- // offset needs to be aware of the bytes we read for `o.typ` and `o.size`
- offset += steps
- return -offset, nil
+ return sz, delta
+}
+
+func invalidSize(sz, targetSz uint) bool {
+ return sz > targetSz
+}
+
+func invalidOffsetSize(offset, sz, srcSz uint) bool {
+ return sumOverflows(offset, sz) ||
+ offset+sz > srcSz
+}
+
+func sumOverflows(a, b uint) bool {
+ return a+b < a
}
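
The copy and insert opcodes above can be exercised with a hand-built delta (a sketch, not a fixture from this change). The delta starts with the two LEB128 sizes, followed by one copy-from-src command and one literal insert:

    func patchDeltaExample() []byte {
        src := []byte("hello world") // 11 bytes
        delta := []byte{
            0x0b,       // source size = 11 (LEB128)
            0x06,       // target size = 6 (LEB128)
            0x90, 0x05, // copy from src: cmd 0x80|0x10, offset 0, size 5
            0x01, '!',  // insert 1 literal byte taken from the delta
        }

        return PatchDelta(src, delta) // []byte("hello!")
    }
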
diff --git a/formats/packfile/doc.go b/formats/packfile/doc.go
index cb3f542..c79c180 100644
--- a/formats/packfile/doc.go
+++ b/formats/packfile/doc.go
@@ -1,165 +1,168 @@
-package packfile
+// Package packfile documentation:
+/*
+
+GIT pack format
+===============
+
+== pack-*.pack files have the following format:
+
+ - A header appears at the beginning and consists of the following:
+
+ 4-byte signature:
+ The signature is: {'P', 'A', 'C', 'K'}
+
+ 4-byte version number (network byte order):
+ GIT currently accepts version number 2 or 3 but
+ generates version 2 only.
+
+ 4-byte number of objects contained in the pack (network byte order)
+
+ Observation: we cannot have more than 4G versions ;-) and
+ more than 4G objects in a pack.
+
+ - The header is followed by number of object entries, each of
+ which looks like this:
+
+ (undeltified representation)
+ n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+ compressed data
+
+ (deltified representation)
+ n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+ 20-byte base object name
+ compressed delta data
+
+ Observation: length of each object is encoded in a variable
+ length format and is not constrained to 32-bit or anything.
+
+ - The trailer records 20-byte SHA1 checksum of all of the above.
+
+== Original (version 1) pack-*.idx files have the following format:
+
+ - The header consists of 256 4-byte network byte order
+ integers. N-th entry of this table records the number of
+ objects in the corresponding pack, the first byte of whose
+ object name is less than or equal to N. This is called the
+ 'first-level fan-out' table.
+
+ - The header is followed by sorted 24-byte entries, one entry
+ per object in the pack. Each entry is:
+
+ 4-byte network byte order integer, recording where the
+ object is stored in the packfile as the offset from the
+ beginning.
+
+ 20-byte object name.
+
+ - The file is concluded with a trailer:
+
+ A copy of the 20-byte SHA1 checksum at the end of
+ corresponding packfile.
-// GIT pack format
-// ===============
-//
-// == pack-*.pack files have the following format:
-//
-// - A header appears at the beginning and consists of the following:
-//
-// 4-byte signature:
-// The signature is: {'P', 'A', 'C', 'K'}
-//
-// 4-byte version number (network byte order):
-// GIT currently accepts version number 2 or 3 but
-// generates version 2 only.
-//
-// 4-byte number of objects contained in the pack (network byte order)
-//
-// Observation: we cannot have more than 4G versions ;-) and
-// more than 4G objects in a pack.
-//
-// - The header is followed by number of object entries, each of
-// which looks like this:
-//
-// (undeltified representation)
-// n-byte type and length (3-bit type, (n-1)*7+4-bit length)
-// compressed data
-//
-// (deltified representation)
-// n-byte type and length (3-bit type, (n-1)*7+4-bit length)
-// 20-byte base object name
-// compressed delta data
-//
-// Observation: length of each object is encoded in a variable
-// length format and is not constrained to 32-bit or anything.
-//
-// - The trailer records 20-byte SHA1 checksum of all of the above.
-//
-// == Original (version 1) pack-*.idx files have the following format:
-//
-// - The header consists of 256 4-byte network byte order
-// integers. N-th entry of this table records the number of
-// objects in the corresponding pack, the first byte of whose
-// object name is less than or equal to N. This is called the
-// 'first-level fan-out' table.
-//
-// - The header is followed by sorted 24-byte entries, one entry
-// per object in the pack. Each entry is:
-//
-// 4-byte network byte order integer, recording where the
-// object is stored in the packfile as the offset from the
-// beginning.
-//
-// 20-byte object name.
-//
-// - The file is concluded with a trailer:
-//
-// A copy of the 20-byte SHA1 checksum at the end of
-// corresponding packfile.
-//
-// 20-byte SHA1-checksum of all of the above.
-//
-// Pack Idx file:
-//
-// -- +--------------------------------+
-// fanout | fanout[0] = 2 (for example) |-.
-// table +--------------------------------+ |
-// | fanout[1] | |
-// +--------------------------------+ |
-// | fanout[2] | |
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
-// | fanout[255] = total objects |---.
-// -- +--------------------------------+ | |
-// main | offset | | |
-// index | object name 00XXXXXXXXXXXXXXXX | | |
-// table +--------------------------------+ | |
-// | offset | | |
-// | object name 00XXXXXXXXXXXXXXXX | | |
-// +--------------------------------+<+ |
-// .-| offset | |
-// | | object name 01XXXXXXXXXXXXXXXX | |
-// | +--------------------------------+ |
-// | | offset | |
-// | | object name 01XXXXXXXXXXXXXXXX | |
-// | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
-// | | offset | |
-// | | object name FFXXXXXXXXXXXXXXXX | |
-// --| +--------------------------------+<--+
-// trailer | | packfile checksum |
-// | +--------------------------------+
-// | | idxfile checksum |
-// | +--------------------------------+
-// .-------.
-// |
-// Pack file entry: <+
-//
-// packed object header:
-// 1-byte size extension bit (MSB)
-// type (next 3 bit)
-// size0 (lower 4-bit)
-// n-byte sizeN (as long as MSB is set, each 7-bit)
-// size0..sizeN form 4+7+7+..+7 bit integer, size0
-// is the least significant part, and sizeN is the
-// most significant part.
-// packed object data:
-// If it is not DELTA, then deflated bytes (the size above
-// is the size before compression).
-// If it is REF_DELTA, then
-// 20-byte base object name SHA1 (the size above is the
-// size of the delta data that follows).
-// delta data, deflated.
-// If it is OFS_DELTA, then
-// n-byte offset (see below) interpreted as a negative
-// offset from the type-byte of the header of the
-// ofs-delta entry (the size above is the size of
-// the delta data that follows).
-// delta data, deflated.
-//
-// offset encoding:
-// n bytes with MSB set in all but the last one.
-// The offset is then the number constructed by
-// concatenating the lower 7 bit of each byte, and
-// for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
-// to the result.
-//
-//
-//
-// == Version 2 pack-*.idx files support packs larger than 4 GiB, and
-// have some other reorganizations. They have the format:
-//
-// - A 4-byte magic number '\377tOc' which is an unreasonable
-// fanout[0] value.
-//
-// - A 4-byte version number (= 2)
-//
-// - A 256-entry fan-out table just like v1.
-//
-// - A table of sorted 20-byte SHA1 object names. These are
-// packed together without offset values to reduce the cache
-// footprint of the binary search for a specific object name.
-//
-// - A table of 4-byte CRC32 values of the packed object data.
-// This is new in v2 so compressed data can be copied directly
-// from pack to pack during repacking without undetected
-// data corruption.
-//
-// - A table of 4-byte offset values (in network byte order).
-// These are usually 31-bit pack file offsets, but large
-// offsets are encoded as an index into the next table with
-// the msbit set.
-//
-// - A table of 8-byte offset entries (empty for pack files less
-// than 2 GiB). Pack files are organized with heavily used
-// objects toward the front, so most object references should
-// not need to refer to this table.
-//
-// - The same trailer as a v1 pack file:
-//
-// A copy of the 20-byte SHA1 checksum at the end of
-// corresponding packfile.
-//
-// 20-byte SHA1-checksum of all of the above.
-//
-// From:
-// https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt
+ 20-byte SHA1-checksum of all of the above.
+
+Pack Idx file:
+
+ -- +--------------------------------+
+fanout | fanout[0] = 2 (for example) |-.
+table +--------------------------------+ |
+ | fanout[1] | |
+ +--------------------------------+ |
+ | fanout[2] | |
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+ | fanout[255] = total objects |---.
+ -- +--------------------------------+ | |
+main | offset | | |
+index | object name 00XXXXXXXXXXXXXXXX | | |
+table +--------------------------------+ | |
+ | offset | | |
+ | object name 00XXXXXXXXXXXXXXXX | | |
+ +--------------------------------+<+ |
+ .-| offset | |
+ | | object name 01XXXXXXXXXXXXXXXX | |
+ | +--------------------------------+ |
+ | | offset | |
+ | | object name 01XXXXXXXXXXXXXXXX | |
+ | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+ | | offset | |
+ | | object name FFXXXXXXXXXXXXXXXX | |
+ --| +--------------------------------+<--+
+trailer | | packfile checksum |
+ | +--------------------------------+
+ | | idxfile checksum |
+ | +--------------------------------+
+ .-------.
+ |
+Pack file entry: <+
+
+ packed object header:
+ 1-byte size extension bit (MSB)
+ type (next 3 bit)
+ size0 (lower 4-bit)
+ n-byte sizeN (as long as MSB is set, each 7-bit)
+ size0..sizeN form 4+7+7+..+7 bit integer, size0
+ is the least significant part, and sizeN is the
+ most significant part.
+ packed object data:
+ If it is not DELTA, then deflated bytes (the size above
+ is the size before compression).
+ If it is REF_DELTA, then
+ 20-byte base object name SHA1 (the size above is the
+ size of the delta data that follows).
+ delta data, deflated.
+ If it is OFS_DELTA, then
+ n-byte offset (see below) interpreted as a negative
+ offset from the type-byte of the header of the
+ ofs-delta entry (the size above is the size of
+ the delta data that follows).
+ delta data, deflated.
+
+ offset encoding:
+ n bytes with MSB set in all but the last one.
+ The offset is then the number constructed by
+ concatenating the lower 7 bit of each byte, and
+ for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
+ to the result.
+
+
+
+== Version 2 pack-*.idx files support packs larger than 4 GiB, and
+ have some other reorganizations. They have the format:
+
+ - A 4-byte magic number '\377tOc' which is an unreasonable
+ fanout[0] value.
+
+ - A 4-byte version number (= 2)
+
+ - A 256-entry fan-out table just like v1.
+
+ - A table of sorted 20-byte SHA1 object names. These are
+ packed together without offset values to reduce the cache
+ footprint of the binary search for a specific object name.
+
+ - A table of 4-byte CRC32 values of the packed object data.
+ This is new in v2 so compressed data can be copied directly
+ from pack to pack during repacking without undetected
+ data corruption.
+
+ - A table of 4-byte offset values (in network byte order).
+ These are usually 31-bit pack file offsets, but large
+ offsets are encoded as an index into the next table with
+ the msbit set.
+
+ - A table of 8-byte offset entries (empty for pack files less
+ than 2 GiB). Pack files are organized with heavily used
+ objects toward the front, so most object references should
+ not need to refer to this table.
+
+ - The same trailer as a v1 pack file:
+
+ A copy of the 20-byte SHA1 checksum at the end of
+ corresponding packfile.
+
+ 20-byte SHA1-checksum of all of the above.
+
+From:
+https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt
+*/
+package packfile
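
A worked decoding of one object entry header under the scheme above: the byte 0x95 is 1001 0101, so the MSB says another size byte follows, the type bits 001 mean commit, and size0 is 0101 = 5. If the next byte is 0x0a (MSB clear), the total size is 5 + (10 << 4) = 165.
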
diff --git a/formats/packfile/error.go b/formats/packfile/error.go
new file mode 100644
index 0000000..c0b9163
--- /dev/null
+++ b/formats/packfile/error.go
@@ -0,0 +1,30 @@
+package packfile
+
+import "fmt"
+
+// Error specifies errors returned during packfile parsing.
+type Error struct {
+ reason, details string
+}
+
+// NewError returns a new error.
+func NewError(reason string) *Error {
+ return &Error{reason: reason}
+}
+
+// Error returns a text representation of the error.
+func (e *Error) Error() string {
+ if e.details == "" {
+ return e.reason
+ }
+
+ return fmt.Sprintf("%s: %s", e.reason, e.details)
+}
+
+// AddDetails returns a copy of the error with details added, built from
+// format and args.
+func (e *Error) AddDetails(format string, args ...interface{}) *Error {
+ return &Error{
+ reason: e.reason,
+ details: fmt.Sprintf(format, args...),
+ }
+}
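
A usage note: AddDetails never mutates the receiver, so package-level sentinels like ErrMaxObjectsLimitReached stay pristine. The flip side is that a detailed error no longer compares equal to its sentinel with ==, as this sketch (with a hypothetical sentinel) shows:

    var ErrExample = NewError("example failure") // hypothetical sentinel

    func demo() {
        err := ErrExample.AddDetails("object %d", 42)
        fmt.Println(err)               // "example failure: object 42"
        fmt.Println(err == ErrExample) // false: AddDetails returned a copy
    }
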
diff --git a/formats/packfile/parser.go b/formats/packfile/parser.go
new file mode 100644
index 0000000..d3463bd
--- /dev/null
+++ b/formats/packfile/parser.go
@@ -0,0 +1,353 @@
+package packfile
+
+import (
+ "bytes"
+ "compress/zlib"
+ "encoding/binary"
+ "fmt"
+ "io"
+
+ "gopkg.in/src-d/go-git.v3/core"
+ "gopkg.in/src-d/go-git.v3/storage/memory"
+)
+
+var (
+ // ErrEmptyPackfile is returned by ReadHeader when no data is found in the packfile.
+ ErrEmptyPackfile = NewError("empty packfile")
+ // ErrBadSignature is returned by ReadHeader when the signature in the packfile is incorrect.
+ ErrBadSignature = NewError("malformed pack file signature")
+ // ErrUnsupportedVersion is returned by ReadHeader when the packfile version is
+ // different than VersionSupported.
+ ErrUnsupportedVersion = NewError("unsupported packfile version")
+)
+
+const (
+ // VersionSupported is the packfile version supported by this parser.
+ VersionSupported = 2
+)
+
+// A Parser is a collection of functions to read and process data from a packfile.
+// Values of this type are not zero-value safe. See the NewParser function below.
+type Parser struct {
+ ReadRecaller
+}
+
+// NewParser returns a new Parser that reads from the packfile represented by r.
+func NewParser(r ReadRecaller) *Parser {
+ return &Parser{ReadRecaller: r}
+}
+
+// readInt32 reads 4 bytes and returns them as a big-endian uint32.
+func (p Parser) readInt32() (uint32, error) {
+ var v uint32
+ if err := binary.Read(p, binary.BigEndian, &v); err != nil {
+ return 0, err
+ }
+
+ return v, nil
+}
+
+// ReadSignature reads and returns the signature field in the packfile.
+func (p *Parser) ReadSignature() ([]byte, error) {
+ var sig = make([]byte, 4)
+ if _, err := io.ReadFull(p, sig); err != nil {
+ return []byte{}, err
+ }
+
+ return sig, nil
+}
+
+// IsValidSignature returns whether sig is a valid packfile signature.
+func (p Parser) IsValidSignature(sig []byte) bool {
+ return bytes.Equal(sig, []byte{'P', 'A', 'C', 'K'})
+}
+
+// ReadVersion reads and returns the version field of a packfile.
+func (p *Parser) ReadVersion() (uint32, error) {
+ return p.readInt32()
+}
+
+// IsSupportedVersion returns whether version v is supported by the parser.
+// The current supported version is VersionSupported, defined above.
+func (p *Parser) IsSupportedVersion(v uint32) bool {
+ return v == VersionSupported
+}
+
+// ReadCount reads and returns the count of objects field of a packfile.
+func (p *Parser) ReadCount() (uint32, error) {
+ return p.readInt32()
+}
+
+// ReadHeader reads the whole packfile header (signature, version and
+// object count). It returns the object count and performs checks on the
+// validity of the signature and the version fields.
+func (p Parser) ReadHeader() (uint32, error) {
+ sig, err := p.ReadSignature()
+ if err != nil {
+ if err == io.EOF {
+ return 0, ErrEmptyPackfile
+ }
+ return 0, err
+ }
+
+ if !p.IsValidSignature(sig) {
+ return 0, ErrBadSignature
+ }
+
+ ver, err := p.ReadVersion()
+ if err != nil {
+ return 0, err
+ }
+
+ if !p.IsSupportedVersion(ver) {
+ return 0, ErrUnsupportedVersion.AddDetails("%d", ver)
+ }
+
+ count, err := p.ReadCount()
+ if err != nil {
+ return 0, err
+ }
+
+ return count, nil
+}
+
+// ReadObjectTypeAndLength reads and returns the object type and the
+// length field from an object entry in a packfile.
+func (p Parser) ReadObjectTypeAndLength() (core.ObjectType, int64, error) {
+ t, c, err := p.readType()
+ if err != nil {
+ return t, 0, err
+ }
+
+ l, err := p.readLength(c)
+
+ return t, l, err
+}
+
+func (p Parser) readType() (core.ObjectType, byte, error) {
+ var c byte
+ var err error
+ if c, err = p.ReadByte(); err != nil {
+ return core.ObjectType(0), 0, err
+ }
+ typ := parseType(c)
+
+ return typ, c, nil
+}
+
+var (
+ maskContinue = uint8(128) // 1000 0000
+ maskType = uint8(112) // 0111 0000
+ maskFirstLength = uint8(15) // 0000 1111
+ firstLengthBits = uint8(4) // the first byte has 4 bits to store the length
+ maskLength = uint8(127) // 0111 1111
+ lengthBits = uint8(7) // subsequent bytes have 7 bits to store the length
+)
+
+func parseType(b byte) core.ObjectType {
+ return core.ObjectType((b & maskType) >> firstLengthBits)
+}
+
+// The length is encoded in the last 4 bits of the first byte and in
+// the last 7 bits of subsequent bytes. The last byte has a 0 MSB.
+func (p Parser) readLength(first byte) (int64, error) {
+ length := int64(first & maskFirstLength)
+
+ c := first
+ shift := firstLengthBits
+ var err error
+ for moreBytesInLength(c) {
+ if c, err = p.ReadByte(); err != nil {
+ return 0, err
+ }
+
+ length += int64(c&maskLength) << shift
+ shift += lengthBits
+ }
+
+ return length, nil
+}
+
+func moreBytesInLength(c byte) bool {
+ return c&maskContinue > 0
+}
+
+// ReadObject reads and returns a git object from an object entry in the packfile.
+// Non-deltified and deltified objects are supported.
+func (p Parser) ReadObject() (core.Object, error) {
+ start, err := p.Offset()
+ if err != nil {
+ return nil, err
+ }
+
+ var typ core.ObjectType
+ typ, _, err = p.ReadObjectTypeAndLength()
+ if err != nil {
+ return nil, err
+ }
+
+ var cont []byte
+ switch typ {
+ case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject:
+ cont, err = p.ReadNonDeltaObjectContent()
+ case core.REFDeltaObject:
+ cont, typ, err = p.ReadREFDeltaObjectContent()
+ case core.OFSDeltaObject:
+ cont, typ, err = p.ReadOFSDeltaObjectContent(start)
+ default:
+ err = ErrInvalidObject.AddDetails("tag %q", typ)
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ return memory.NewObject(typ, int64(len(cont)), cont), nil
+}
+
+// ReadNonDeltaObjectContent reads and returns a non-deltified object
+// from its zlib stream in an object entry in the packfile.
+func (p Parser) ReadNonDeltaObjectContent() ([]byte, error) {
+ return p.readZip()
+}
+
+func (p Parser) readZip() ([]byte, error) {
+ buf := bytes.NewBuffer(nil)
+ err := p.inflate(buf)
+
+ return buf.Bytes(), err
+}
+
+func (p Parser) inflate(w io.Writer) (err error) {
+ zr, err := zlib.NewReader(p)
+ if err != nil {
+ if err != zlib.ErrHeader {
+ return fmt.Errorf("zlib reading error: %s", err)
+ }
+ }
+
+ defer func() {
+ closeErr := zr.Close()
+ if err == nil {
+ err = closeErr
+ }
+ }()
+
+ _, err = io.Copy(w, zr)
+
+ return err
+}
+
+// ReadREFDeltaObjectContent reads and returns an object specified by a
+// REF-Delta entry in the packfile, from the hash onwards.
+func (p Parser) ReadREFDeltaObjectContent() ([]byte, core.ObjectType, error) {
+ refHash, err := p.ReadHash()
+ if err != nil {
+ return nil, core.ObjectType(0), err
+ }
+
+ refObj, err := p.RecallByHash(refHash)
+ if err != nil {
+ return nil, core.ObjectType(0), err
+ }
+
+ content, err := p.ReadSolveDelta(refObj.Content())
+ if err != nil {
+ return nil, refObj.Type(), err
+ }
+
+ return content, refObj.Type(), nil
+}
+
+// ReadHash reads a hash.
+func (p Parser) ReadHash() (core.Hash, error) {
+ var h core.Hash
+ if _, err := io.ReadFull(p, h[:]); err != nil {
+ return core.ZeroHash, err
+ }
+
+ return h, nil
+}
+
+// ReadSolveDelta reads and returns the base patched with the contents
+// of a zlib compressed diff data in the delta portion of an object
+// entry in the packfile.
+func (p Parser) ReadSolveDelta(base []byte) ([]byte, error) {
+ diff, err := p.readZip()
+ if err != nil {
+ return nil, err
+ }
+
+ return PatchDelta(base, diff), nil
+}
+
+// ReadOFSDeltaObjectContent reads and returns an object specified by an
+// OFS-delta entry in the packfile from its negative offset onwards. The
+// start parameter is the offset of this particular object entry (the
+// current offset minus the already processed type and length).
+func (p Parser) ReadOFSDeltaObjectContent(start int64) (
+ []byte, core.ObjectType, error) {
+
+ jump, err := p.ReadNegativeOffset()
+ if err != nil {
+ return nil, core.ObjectType(0), err
+ }
+
+ ref, err := p.RecallByOffset(start + jump)
+ if err != nil {
+ return nil, core.ObjectType(0), err
+ }
+
+ content, err := p.ReadSolveDelta(ref.Content())
+ if err != nil {
+ return nil, ref.Type(), err
+ }
+
+ return content, ref.Type(), nil
+}
+
+// ReadNegativeOffset reads and returns an offset from an OFS DELTA
+// object entry in a packfile. OFS DELTA offsets are specified in Git
+// VLQ special format:
+//
+// Ordinary VLQ has some redundancies, example: the number 358 can be
+// encoded as the 2-octet VLQ 0x8166 or the 3-octet VLQ 0x808166 or the
+// 4-octet VLQ 0x80808166 and so forth.
+//
+// To avoid these redundancies, the VLQ format used in Git removes this
+// prepending redundancy and extends the representable range of shorter
+// VLQs by adding an offset to VLQs of 2 or more octets in such a way
+// that the lowest possible value for such an (N+1)-octet VLQ becomes
+// exactly one more than the maximum possible value for an N-octet VLQ.
+// In particular, since a 1-octet VLQ can store a maximum value of 127,
+// the minimum 2-octet VLQ (0x8000) is assigned the value 128 instead of
+// 0. Conversely, the maximum value of such a 2-octet VLQ (0xff7f) is
+// 16511 instead of just 16383. Similarly, the minimum 3-octet VLQ
+// (0x808000) has a value of 16512 instead of zero, which means
+// that the maximum 3-octet VLQ (0xffff7f) is 2113663 instead of
+// just 2097151. And so forth.
+//
+// This is how the offset is saved in C:
+//
+// dheader[pos] = ofs & 127;
+// while (ofs >>= 7)
+// dheader[--pos] = 128 | (--ofs & 127);
+//
+func (p Parser) ReadNegativeOffset() (int64, error) {
+ var c byte
+ var err error
+
+ if c, err = p.ReadByte(); err != nil {
+ return 0, err
+ }
+
+ var offset = int64(c & maskLength)
+ for moreBytesInLength(c) {
+ offset++
+ if c, err = p.ReadByte(); err != nil {
+ return 0, err
+ }
+ offset = (offset << lengthBits) + int64(c&maskLength)
+ }
+
+ return -offset, nil
+}
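
The 358 example from the comment above can be traced through ReadNegativeOffset by hand. For the 2-octet VLQ 0x81 0x66:

    c = 0x81: offset = int64(c & 0x7f)      = 1
    MSB set:  offset++                      = 2
    c = 0x66: offset = (offset << 7) + 0x66 = 256 + 102 = 358

ReadNegativeOffset returns -358: the delta's base object starts 358 bytes before the current object entry.
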
diff --git a/formats/packfile/parser_test.go b/formats/packfile/parser_test.go
new file mode 100644
index 0000000..12d5f0d
--- /dev/null
+++ b/formats/packfile/parser_test.go
@@ -0,0 +1,412 @@
+package packfile
+
+import (
+ "bytes"
+ "io"
+ "io/ioutil"
+ "os"
+
+ . "gopkg.in/check.v1"
+ "gopkg.in/src-d/go-git.v3/core"
+ "gopkg.in/src-d/go-git.v3/storage/memory"
+)
+
+const (
+ sigOffset = 0
+ verOffset = 4
+ countOffset = 8
+)
+
+type ParserSuite struct {
+ fixtures map[string]*fix
+}
+
+type fix struct {
+ path string
+ parser *Parser
+ seekable io.Seeker
+}
+
+func newFix(path string) (*fix, error) {
+ fix := new(fix)
+ fix.path = path
+
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, err
+ }
+
+ data, err := ioutil.ReadAll(f)
+ if err != nil {
+ return nil, err
+ }
+
+ if err = f.Close(); err != nil {
+ return nil, err
+ }
+
+ seekable := NewSeekable(bytes.NewReader(data))
+ fix.seekable = seekable
+ fix.parser = NewParser(seekable)
+
+ return fix, nil
+}
+
+func (f *fix) seek(o int64) error {
+ _, err := f.seekable.Seek(o, os.SEEK_SET)
+ return err
+}
+
+var _ = Suite(&ParserSuite{})
+
+func (s *ParserSuite) SetUpSuite(c *C) {
+ s.fixtures = make(map[string]*fix)
+ for _, fixData := range []struct {
+ id string
+ path string
+ }{
+ {"ofs-deltas", "fixtures/alcortesm-binary-relations.pack"},
+ {"ref-deltas", "fixtures/git-fixture.ref-delta"},
+ } {
+ fix, err := newFix(fixData.path)
+ c.Assert(err, IsNil,
+ Commentf("setting up fixture id %s: %s", fixData.id, err))
+
+ _, ok := s.fixtures[fixData.id]
+ c.Assert(ok, Equals, false,
+ Commentf("duplicated fixture id: %s", fixData.id))
+
+ s.fixtures[fixData.id] = fix
+ }
+}
+
+func (s *ParserSuite) TestSignature(c *C) {
+ for id, fix := range s.fixtures {
+ com := Commentf("fixture id = %s", id)
+ err := fix.seek(sigOffset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+
+ sig, err := p.ReadSignature()
+ c.Assert(err, IsNil, com)
+ c.Assert(p.IsValidSignature(sig), Equals, true, com)
+ }
+}
+
+func (s *ParserSuite) TestVersion(c *C) {
+ for i, test := range [...]struct {
+ fixID string
+ expected uint32
+ }{
+ {
+ fixID: "ofs-deltas",
+ expected: uint32(2),
+ }, {
+ fixID: "ref-deltas",
+ expected: uint32(2),
+ },
+ } {
+ com := Commentf("test %d) fixture id = %s", i, test.fixID)
+ fix, ok := s.fixtures[test.fixID]
+ c.Assert(ok, Equals, true, com)
+
+ err := fix.seek(verOffset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+
+ v, err := p.ReadVersion()
+ c.Assert(err, IsNil, com)
+ c.Assert(v, Equals, test.expected, com)
+ c.Assert(p.IsSupportedVersion(v), Equals, true, com)
+ }
+}
+
+func (s *ParserSuite) TestCount(c *C) {
+ for i, test := range [...]struct {
+ fixID string
+ expected uint32
+ }{
+ {
+ fixID: "ofs-deltas",
+ expected: uint32(0x50),
+ }, {
+ fixID: "ref-deltas",
+ expected: uint32(0x1c),
+ },
+ } {
+ com := Commentf("test %d) fixture id = %s", i, test.fixID)
+ fix, ok := s.fixtures[test.fixID]
+ c.Assert(ok, Equals, true, com)
+
+ err := fix.seek(countOffset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+
+ count, err := p.ReadCount()
+ c.Assert(err, IsNil, com)
+ c.Assert(count, Equals, test.expected, com)
+ }
+}
+
+func (s *ParserSuite) TestReadObjectTypeAndLength(c *C) {
+ for i, test := range [...]struct {
+ fixID string
+ offset int64
+ expType core.ObjectType
+ expLength int64
+ }{
+ {
+ fixID: "ofs-deltas",
+ offset: 12,
+ expType: core.CommitObject,
+ expLength: 342,
+ }, {
+ fixID: "ofs-deltas",
+ offset: 1212,
+ expType: core.OFSDeltaObject,
+ expLength: 104,
+ }, {
+ fixID: "ofs-deltas",
+ offset: 3193,
+ expType: core.TreeObject,
+ expLength: 226,
+ }, {
+ fixID: "ofs-deltas",
+ offset: 3639,
+ expType: core.BlobObject,
+ expLength: 90,
+ }, {
+ fixID: "ofs-deltas",
+ offset: 4504,
+ expType: core.BlobObject,
+ expLength: 7107,
+ }, {
+ fixID: "ref-deltas",
+ offset: 84849,
+ expType: core.REFDeltaObject,
+ expLength: 6,
+ }, {
+ fixID: "ref-deltas",
+ offset: 85070,
+ expType: core.REFDeltaObject,
+ expLength: 8,
+ },
+ } {
+ com := Commentf("test %d) fixture id = %s", i, test.fixID)
+ fix, ok := s.fixtures[test.fixID]
+ c.Assert(ok, Equals, true, com)
+
+ err := fix.seek(test.offset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+
+ typ, length, err := p.ReadObjectTypeAndLength()
+ c.Assert(err, IsNil, com)
+ c.Assert(typ, Equals, test.expType, com)
+ c.Assert(length, Equals, test.expLength, com)
+ }
+}
+
+func (s *ParserSuite) TestReadNonDeltaObjectContent(c *C) {
+ for i, test := range [...]struct {
+ fixID string
+ offset int64
+ expected []byte
+ }{
+ {
+ fixID: "ofs-deltas",
+ offset: 12,
+ expected: []byte("tree 87c87d16e815a43e4e574dd8edd72c5450ac3a8e\nparent a87d72684d1cf68099ce6e9f68689e25e645a14c\nauthor Gorka Guardiola <Gorka Guardiola Múzquiz> 1450265632 +0100\ncommitter Gorka Guardiola <Gorka Guardiola Múzquiz> 1450265632 +0100\n\nChanged example to use dot.\nI did not remove the original files outside of the\ntex, I leave that to alcortes.\n"),
+ }, {
+ fixID: "ofs-deltas",
+ offset: 1610,
+ expected: []byte("tree 4b4f0d9a07109ef0b8a3051138cc20cdb47fa513\nparent b373f85fa2594d7dcd9989f4a5858a81647fb8ea\nauthor Alberto Cortés <alberto@sourced.tech> 1448017995 +0100\ncommitter Alberto Cortés <alberto@sourced.tech> 1448018112 +0100\n\nMove generated images to it own dir (img/)\n\nFixes #1.\n"),
+ }, {
+ fixID: "ofs-deltas",
+ offset: 10566,
+ expected: []byte("40000 map-slice\x00\x00\xce\xfb\x8ew\xf7\xa8\xc6\x1b\x99\xdd$\x91\xffH\xa3\xb0\xb1fy40000 simple-arrays\x00\x9a7\x81\xb7\xfd\x9d(Q\xe2\xa4H\x8c\x03^٬\x90Z\xecy"),
+ },
+ } {
+ com := Commentf("test %d) fixture id = %s", i, test.fixID)
+ fix, ok := s.fixtures[test.fixID]
+ c.Assert(ok, Equals, true, com)
+
+ err := fix.seek(test.offset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+
+ _, _, err = p.ReadObjectTypeAndLength()
+ c.Assert(err, IsNil, com)
+
+ cont, err := p.ReadNonDeltaObjectContent()
+ c.Assert(err, IsNil, com)
+ c.Assert(cont, DeepEquals, test.expected, com)
+ }
+}
+
+func (s *ParserSuite) TestReadOFSDeltaObjectContent(c *C) {
+ for i, test := range [...]struct {
+ fixID string
+ offset int64
+ expOffset int64
+ expType core.ObjectType
+ expContent []byte
+ }{
+ {
+ fixID: "ofs-deltas",
+ offset: 1212,
+ expOffset: -212,
+ expType: core.CommitObject,
+ expContent: []byte("tree c4573589ce78ac63769c20742b9a970f6e274a38\nparent 4571a24948494ebe1cb3dc18ca5a9286e79705ae\nauthor Alberto Cortés <alberto@sourced.tech> 1448139640 +0100\ncommitter Alberto Cortés <alberto@sourced.tech> 1448139640 +0100\n\nUpdate reference to binrels module\n"),
+ }, {
+ fixID: "ofs-deltas",
+ offset: 3514,
+ expOffset: -102,
+ expType: core.TreeObject,
+ expContent: []byte("100644 .gitignore\x00\u007fA\x90[Mw\xabJ\x9a-3O\xcd\x0f\xb5\xdbn\x8e!\x83100644 .gitmodules\x00\xd4`\xa8>\x15\xcfd\x05\x81B7_\xc4\v\x04\xa7\xa9A\x85\n100644 Makefile\x00-ҭ\x8c\x14\xdef\x12\xed\x15\x816y\xa6UK\xad\x993\v100644 binary-relations.tex\x00\x802\x05@\x11'^ \xf5<\xf7\xfd\x81%3\xd1o\xa9_$40000 graphs\x00\xdehu\x16\xc6\x0e\\H\x8e\xe9\xa1JIXE\xbaڽg\xc540000 imgs-gen\x00\xeb\"\xddhzg\xa3\x1f\xc8j\xc5\xfc豢\xe9\x96\xce\xce^40000 src\x00\x895\x11t\xff\x86\xa7\xea\xa6\xc0v%\x11E\x10f,ݒ\x1a"),
+ }, {
+ fixID: "ofs-deltas",
+ offset: 9806,
+ expOffset: -6613,
+ expType: core.TreeObject,
+ expContent: []byte("100644 .gitignore\x00\u007fA\x90[Mw\xabJ\x9a-3O\xcd\x0f\xb5\xdbn\x8e!\x83100644 .gitmodules\x00\xd4`\xa8>\x15\xcfd\x05\x81B7_\xc4\v\x04\xa7\xa9A\x85\n100644 Makefile\x00-ҭ\x8c\x14\xdef\x12\xed\x15\x816y\xa6UK\xad\x993\v100644 binary-relations.tex\x00I\x13~\xb8کEU\x9f\x99#\xc4E.\x9d>\uef1e\xad40000 graphs\x00\xb9\x00\xf34\xde\xff\xce@+\xbd\xf8 9\xb8=\xc1\xb9\x00\x84]40000 imgs-gen\x00\xeb\"\xddhzg\xa3\x1f\xc8j\xc5\xfc豢\xe9\x96\xce\xce^40000 src\x00\x895\x11t\xff\x86\xa7\xea\xa6\xc0v%\x11E\x10f,ݒ\x1a"),
+ },
+ } {
+ com := Commentf("test %d) fixture id = %s", i, test.fixID)
+ fix, ok := s.fixtures[test.fixID]
+ c.Assert(ok, Equals, true, com)
+
+ err := fix.seek(test.offset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+
+ _, _, err = p.ReadObjectTypeAndLength()
+ c.Assert(err, IsNil, com)
+
+ beforeJumpSize, err := p.Offset()
+ c.Assert(err, IsNil, com)
+
+ jump, err := p.ReadNegativeOffset()
+ c.Assert(err, IsNil, com)
+ c.Assert(jump, Equals, test.expOffset, com)
+
+ err = fix.seek(beforeJumpSize)
+ c.Assert(err, IsNil, com)
+
+ cont, typ, err := p.ReadOFSDeltaObjectContent(test.offset)
+ c.Assert(err, IsNil, com)
+ c.Assert(typ, Equals, test.expType, com)
+ c.Assert(cont, DeepEquals, test.expContent, com)
+ }
+}
+
+func (s *ParserSuite) TestReadREFDeltaObjectContent(c *C) {
+ for i, test := range [...]struct {
+ fixID string
+ offset int64
+ deps map[int64]core.Object
+ expHash core.Hash
+ expType core.ObjectType
+ expContent []byte
+ }{
+ {
+ fixID: "ref-deltas",
+ offset: 84849,
+ deps: map[int64]core.Object{
+ 83607: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa40000 vendor\x00\xcfJ\xa3\xb3\x89t\xfb}\x81\xf3g\xc0\x83\x0f}x\xd6Z\xb8k")),
+ },
+ expHash: core.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"),
+ expType: core.TreeObject,
+ expContent: []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa"),
+ }, {
+ fixID: "ref-deltas",
+ offset: 85070,
+ deps: map[int64]core.Object{
+ 84922: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r")),
+ 84849: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa")),
+ 83607: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa40000 vendor\x00\xcfJ\xa3\xb3\x89t\xfb}\x81\xf3g\xc0\x83\x0f}x\xd6Z\xb8k")),
+ },
+ expHash: core.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021"),
+ expType: core.TreeObject,
+ expContent: []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r"),
+ },
+ } {
+ com := Commentf("test %d) fixture id = %s", i, test.fixID)
+ fix, ok := s.fixtures[test.fixID]
+ c.Assert(ok, Equals, true, com)
+
+ err := fix.seek(test.offset)
+ c.Assert(err, IsNil, com)
+ p := fix.parser
+ for k, v := range test.deps {
+ err = p.Remember(k, v)
+ c.Assert(err, IsNil, com)
+ }
+
+ _, _, err = p.ReadObjectTypeAndLength()
+ c.Assert(err, IsNil, com)
+
+ beforeHash, err := p.Offset()
+ c.Assert(err, IsNil, com)
+
+ hash, err := p.ReadHash()
+ c.Assert(err, IsNil, com)
+ c.Assert(hash, Equals, test.expHash, com)
+
+ err = fix.seek(beforeHash)
+ c.Assert(err, IsNil, com)
+
+ cont, typ, err := p.ReadREFDeltaObjectContent()
+ c.Assert(err, IsNil, com)
+ c.Assert(typ, Equals, test.expType, com)
+ c.Assert(cont, DeepEquals, test.expContent, com)
+
+ p.ForgetAll()
+ }
+}
+
+func newObject(t core.ObjectType, c []byte) *memory.Object {
+ return memory.NewObject(t, int64(len(c)), c)
+}
+
+func (s *ParserSuite) TestReadHeaderBadSignatureError(c *C) {
+ data := []byte{
+ 0x50, 0x42, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x50,
+ }
+ p := NewParser(NewSeekable(bytes.NewReader(data)))
+
+ _, err := p.ReadHeader()
+ c.Assert(err, ErrorMatches, ErrBadSignature.Error())
+}
+
+func (s *ParserSuite) TestReadHeaderEmptyPackfileError(c *C) {
+ data := []byte{}
+ p := NewParser(NewSeekable(bytes.NewReader(data)))
+
+ _, err := p.ReadHeader()
+ c.Assert(err, ErrorMatches, ErrEmptyPackfile.Error())
+}
+
+func (s *ParserSuite) TestReadHeaderUnsupportedVersionError(c *C) {
+ data := []byte{
+ 0x50, 0x41, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x50,
+ }
+ p := NewParser(NewSeekable(bytes.NewReader(data)))
+
+ _, err := p.ReadHeader()
+ c.Assert(err, ErrorMatches, ErrUnsupportedVersion.Error()+".*")
+}
+
+func (s *ParserSuite) TestReadHeader(c *C) {
+ data := []byte{
+ 0x50, 0x41, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x50,
+ }
+ p := NewParser(NewSeekable(bytes.NewReader(data)))
+
+ count, err := p.ReadHeader()
+ c.Assert(err, IsNil)
+ c.Assert(count, Equals, uint32(0x50))
+}
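
For reference, the 12 bytes these header fixtures encode follow the standard packfile preamble: a 4-byte "PACK" signature and two big-endian uint32 fields, version and object count. A minimal, self-contained sketch of that layout, separate from the Parser API:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

func main() {
	data := []byte{
		0x50, 0x41, 0x43, 0x4b, // "PACK" signature
		0x00, 0x00, 0x00, 0x02, // version 2, big-endian
		0x00, 0x00, 0x00, 0x50, // 0x50 (80) objects
	}

	if !bytes.Equal(data[:4], []byte("PACK")) {
		panic("bad signature")
	}

	version := binary.BigEndian.Uint32(data[4:8])
	count := binary.BigEndian.Uint32(data[8:12])
	fmt.Println(version, count) // 2 80
}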
diff --git a/formats/packfile/read_recaller.go b/formats/packfile/read_recaller.go
new file mode 100644
index 0000000..92ab1b2
--- /dev/null
+++ b/formats/packfile/read_recaller.go
@@ -0,0 +1,39 @@
+package packfile
+
+import "gopkg.in/src-d/go-git.v3/core"
+
+var (
+ // ErrDuplicatedObject is returned by Remember if an object appears several
+ // times in a packfile.
+ ErrDuplicatedObject = NewError("duplicated object")
+ // ErrCannotRecall is returned by RecallByOffset or RecallByHash if the object
+ // to recall cannot be returned.
+ ErrCannotRecall = NewError("cannot recall object")
+)
+
+// The ReadRecaller interface has all the functions needed by a packfile
+// Parser to operate. We provide two very different implementations:
+// Seekable and Stream.
+type ReadRecaller interface {
+ // Read reads up to len(p) bytes into p.
+ Read(p []byte) (int, error)
+ // ReadByte is needed because of these:
+ // - https://github.com/golang/go/commit/7ba54d45732219af86bde9a5b73c145db82b70c6
+ // - https://groups.google.com/forum/#!topic/golang-nuts/fWTRdHpt0QI
+ // - https://gowalker.org/compress/zlib#NewReader
+ ReadByte() (byte, error)
+ // Offset returns the number of bytes parsed so far from the
+ // packfile.
+ Offset() (int64, error)
+	// Remember asks the ReadRecaller to remember the offset and hash for
+ // an object, so you can later call RecallByOffset and RecallByHash.
+ Remember(int64, core.Object) error
+ // ForgetAll forgets all previously remembered objects.
+ ForgetAll()
+ // RecallByOffset returns the previously processed object found at a
+ // given offset.
+ RecallByOffset(int64) (core.Object, error)
+ // RecallByHash returns the previously processed object with the
+ // given hash.
+ RecallByHash(core.Hash) (core.Object, error)
+}
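
A hedged sketch of the intended call pattern: the caller remembers each object at the offset where its entry started, so later delta entries can recall their base by offset (ofs-delta) or by hash (ref-delta). Here readOneObject is a hypothetical stand-in for the Parser's per-entry decoding, not part of this patch:

func decodeEntries(rr ReadRecaller, count uint32) error {
	for i := uint32(0); i < count; i++ {
		start, err := rr.Offset() // offset where this entry begins
		if err != nil {
			return err
		}

		obj, err := readOneObject(rr) // hypothetical decode step
		if err != nil {
			return err
		}

		// remember offset and hash so later OFS and REF deltas
		// can recall this object as their base
		if err := rr.Remember(start, obj); err != nil {
			return err
		}
	}

	return nil
}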
diff --git a/formats/packfile/read_recaller_impl_test.go b/formats/packfile/read_recaller_impl_test.go
new file mode 100644
index 0000000..438439d
--- /dev/null
+++ b/formats/packfile/read_recaller_impl_test.go
@@ -0,0 +1,296 @@
+package packfile
+
+import (
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "os"
+
+ "gopkg.in/src-d/go-git.v3/core"
+ "gopkg.in/src-d/go-git.v3/storage/memory"
+
+ . "gopkg.in/check.v1"
+)
+
+type ReadRecallerImplSuite struct{}
+
+var _ = Suite(&ReadRecallerImplSuite{})
+
+type implFn func([]byte) ReadRecaller
+
+func newStream(data []byte) ReadRecaller {
+ buf := bytes.NewBuffer(data)
+ return NewStream(buf)
+}
+
+func newSeekable(data []byte) ReadRecaller {
+ buf := bytes.NewReader(data)
+ return NewSeekable(buf)
+}
+
+func (s *ReadRecallerImplSuite) TestRead(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ com := Commentf("implementation %s", impl.id)
+ data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10}
+ sr := impl.newFn(data)
+ all := make([]byte, 0, len(data))
+
+ for len(all) < len(data) {
+ tmp := make([]byte, 3)
+ nr, err := sr.Read(tmp)
+ c.Assert(err, IsNil, com)
+ all = append(all, tmp[:nr]...)
+ }
+ c.Assert(data, DeepEquals, all, com)
+ }
+}
+
+func (s *ReadRecallerImplSuite) TestReadByte(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ com := Commentf("implementation %s", impl.id)
+ data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10}
+ sr := impl.newFn(data)
+ all := make([]byte, 0, len(data))
+
+ for len(all) < len(data) {
+ b, err := sr.ReadByte()
+ c.Assert(err, IsNil, com)
+ all = append(all, b)
+ }
+ c.Assert(data, DeepEquals, all, com)
+ }
+}
+
+func (s *ReadRecallerImplSuite) TestOffsetWithRead(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ com := Commentf("implementation %s", impl.id)
+ data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10}
+ sr := impl.newFn(data)
+ all := make([]byte, 0, len(data))
+
+ for len(all) < len(data) {
+ tmp := make([]byte, 3)
+ nr, err := sr.Read(tmp)
+ c.Assert(err, IsNil, com)
+ all = append(all, tmp[:nr]...)
+
+ off, err := sr.Offset()
+ c.Assert(err, IsNil, com)
+ c.Assert(off, Equals, int64(len(all)), com)
+ }
+ }
+}
+
+func (s *ReadRecallerImplSuite) TestOffsetWithReadByte(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ com := Commentf("implementation %s", impl.id)
+ data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10}
+ sr := impl.newFn(data)
+ all := make([]byte, 0, len(data))
+
+ for len(all) < len(data) {
+ b, err := sr.ReadByte()
+ c.Assert(err, IsNil, com)
+ all = append(all, b)
+
+ off, err := sr.Offset()
+ c.Assert(err, IsNil, com)
+ c.Assert(off, Equals, int64(len(all)), com)
+ }
+ }
+}
+
+func (s *ReadRecallerImplSuite) TestRememberRecall(c *C) {
+ packfile := "fixtures/spinnaker-spinnaker.pack"
+ f, err := os.Open(packfile)
+ c.Assert(err, IsNil)
+ defer func() {
+ err = f.Close()
+ c.Assert(err, IsNil)
+ }()
+
+ data, err := ioutil.ReadAll(f)
+ c.Assert(err, IsNil)
+
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ sr := impl.newFn(data)
+ for i, test := range [...]struct {
+ off int64
+ obj core.Object
+ err string // error regexp
+ ignore string // ignore this test for this implementation
+ }{
+ {
+ off: 12,
+ obj: newObj(core.CommitObject, []byte("tree 44a1cdf21c791867c51caad8f1b77e6baee6f462\nparent 87fe6e7c6b1b89519fe3a03a8961c5aa14d4cc68\nparent 9244ee648182b91a63d8cc4cbe4b9ac2a27c0492\nauthor Matt Duftler <duftler@google.com> 1448290941 -0500\ncommitter Matt Duftler <duftler@google.com> 1448290941 -0500\n\nMerge pull request #615 from ewiseblatt/create_dev\n\nPreserve original credentials of spinnaker-local.yml when transforming it.")),
+ }, {
+ off: 3037,
+ obj: newObj(core.TagObject, []byte("object e0005f50e22140def60260960b21667f1fdfff80\ntype commit\ntag v0.10.0\ntagger cfieber <cfieber@netflix.com> 1447687536 -0800\n\nRelease of 0.10.0\n\n- e0005f50e22140def60260960b21667f1fdfff80: Merge pull request #553 from ewiseblatt/rendezvous\n- e1a2b26b784179e6903a7ae967c037c721899eba: Wait for cassandra before starting spinnaker\n- c756e09461d071e98b8660818cf42d90c90f2854: Merge pull request #552 from duftler/google-c2d-tweaks\n- 0777fadf4ca6f458d7071de414f9bd5417911037: Fix incorrect config prop names: s/SPINNAKER_GOOGLE_PROJECT_DEFAULT_REGION/SPINNAKER_GOOGLE_DEFAULT_REGION s/SPINNAKER_GOOGLE_PROJECT_DEFAULT_ZONE/SPINNAKER_GOOGLE_DEFAULT_ZONE Hardcode profile name in generated ~/.aws/credentials to [default]. Restart all of spinnaker after updating cassandra and reconfiguring spinnaker, instead of just restarting clouddriver.\n- d8d031c1ac45801074418c43424a6f2c0dff642c: Merge pull request #551 from kenzanmedia/fixGroup\n- 626d23075f9e92aad19015f2964c95d45f41fa3a: Put in correct block for public image. Delineate cloud provider.\n")),
+ }, {
+ off: 157625,
+ obj: newObj(core.BlobObject, []byte(".gradle\nbuild/\n*.iml\n.idea\n*.pyc\n*~\n#*\nconfig/spinnaker-local.yml\n.DS_Store\npacker/ami_table.md\npacker/ami_table.json\npacker/example_output.txt")),
+ }, {
+ off: 1234,
+ obj: newObj(core.BlobObject, []byte(".gradle\nbuild/\n*.iml\n.idea\n*.pyc\n*~\n#*\nconfig/spinnaker-local.yml\n.DS_Store\npacker/ami_table.md\npacker/ami_table.json\npacker/example_output.txt")),
+ err: "duplicated object: with hash .*",
+ }, {
+ off: 3037,
+ obj: newObj(core.BlobObject, []byte("")),
+ err: "duplicated object: with offset 3037",
+ ignore: "seekable",
+				// seekable cannot check if the offset has already been added
+				// for performance reasons.
+ },
+ } {
+ if test.ignore == impl.id {
+ continue
+ }
+ com := Commentf("subtest %d) implementation %s", i, impl.id)
+
+ err := sr.Remember(test.off, test.obj)
+ if test.err != "" {
+ c.Assert(err, ErrorMatches, test.err, com)
+ continue
+ }
+ c.Assert(err, IsNil, com)
+
+ result, err := sr.RecallByHash(test.obj.Hash())
+ c.Assert(err, IsNil, com)
+ c.Assert(result, DeepEquals, test.obj, com)
+
+ result, err = sr.RecallByOffset(test.off)
+ c.Assert(err, IsNil, com)
+ c.Assert(result, DeepEquals, test.obj, com)
+ }
+ }
+}
+
+func newObj(typ core.ObjectType, cont []byte) core.Object {
+ return memory.NewObject(typ, int64(len(cont)), cont)
+}
+
+func (s *ReadRecallerImplSuite) TestRecallByHashErrors(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ com := Commentf("implementation %s", impl.id)
+ sr := impl.newFn([]byte{})
+ obj := newObj(core.CommitObject, []byte{})
+
+ _, err := sr.RecallByHash(obj.Hash())
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+
+ err = rememberSomeObjects(sr)
+ c.Assert(err, IsNil)
+
+ _, err = sr.RecallByHash(obj.Hash())
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ }
+}
+
+func (s *ReadRecallerImplSuite) TestRecallByOffsetErrors(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+		// seekable always recalls every object in the packfile
+ } {
+ com := Commentf("implementation %s", impl.id)
+ sr := impl.newFn([]byte{})
+
+ _, err := sr.RecallByOffset(15)
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+
+ err = rememberSomeObjects(sr)
+ c.Assert(err, IsNil)
+
+ _, err = sr.RecallByOffset(15)
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ }
+}
+
+func rememberSomeObjects(sr ReadRecaller) error {
+ for i, init := range [...]struct {
+ off int64
+ obj core.Object
+ }{
+ {off: 0, obj: newObj(core.CommitObject, []byte{'a'})}, // 93114cce67ec23976d15199514399203f69cc676
+ {off: 10, obj: newObj(core.CommitObject, []byte{'b'})}, // 2bb767097e479f668f0ebdabe88df11337bd8f19
+ {off: 20, obj: newObj(core.CommitObject, []byte{'c'})}, // 2f8096005677370e6446541a50e074299d43d468
+ } {
+ err := sr.Remember(init.off, init.obj)
+ if err != nil {
+ return fmt.Errorf("cannot ask StreamReader to Remember item %d", i)
+ }
+ }
+
+ return nil
+}
+
+func (s *ReadRecallerImplSuite) TestForgetAll(c *C) {
+ for _, impl := range []struct {
+ id string
+ newFn implFn
+ }{
+ {id: "stream", newFn: newStream},
+ {id: "seekable", newFn: newSeekable},
+ } {
+ com := Commentf("implementation %s", impl.id)
+ sr := impl.newFn([]byte{})
+
+ err := rememberSomeObjects(sr)
+ c.Assert(err, IsNil)
+
+ sr.ForgetAll()
+
+ if impl.id != "seekable" { // for efficiency, seekable always finds objects by offset
+ _, err = sr.RecallByOffset(0)
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ _, err = sr.RecallByOffset(10)
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ _, err = sr.RecallByOffset(20)
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ }
+ _, err = sr.RecallByHash(core.NewHash("93114cce67ec23976d15199514399203f69cc676"))
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ _, err = sr.RecallByHash(core.NewHash("2bb767097e479f668f0ebdabe88df11337bd8f19"))
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ _, err = sr.RecallByHash(core.NewHash("2f8096005677370e6446541a50e074299d43d468"))
+ c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com)
+ }
+}
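
Both implementations provide ReadByte for the reason cited in read_recaller.go: compress/zlib only avoids reading past the end of a compressed stream when its source is an io.ByteReader, and any over-read would make the Offset these types report drift from the true parse position. A small illustration of that property (assuming the documented flate behavior):

package main

import (
	"bytes"
	"compress/zlib"
	"fmt"
	"io"
	"io/ioutil"
)

func main() {
	var buf bytes.Buffer
	zw := zlib.NewWriter(&buf)
	zw.Write([]byte("hello"))
	zw.Close()
	buf.WriteString("NEXT") // trailing bytes, like the next pack entry

	br := bytes.NewReader(buf.Bytes()) // *bytes.Reader is an io.ByteReader
	zr, err := zlib.NewReader(br)
	if err != nil {
		panic(err)
	}
	io.Copy(ioutil.Discard, zr)
	zr.Close()

	rest, _ := ioutil.ReadAll(br)
	fmt.Printf("%q\n", rest) // "NEXT": nothing was read past the stream
}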
diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go
deleted file mode 100644
index 3f7081b..0000000
--- a/formats/packfile/reader.go
+++ /dev/null
@@ -1,338 +0,0 @@
-package packfile
-
-import (
- "bytes"
- "encoding/binary"
- "fmt"
- "io"
- "io/ioutil"
-
- "gopkg.in/src-d/go-git.v3/core"
-
- "github.com/klauspost/compress/zlib"
-)
-
-type Format int
-
-var (
- EmptyRepositoryErr = newError("empty repository")
- UnsupportedVersionErr = newError("unsupported packfile version")
- MaxObjectsLimitReachedErr = newError("max. objects limit reached")
- MalformedPackfileErr = newError("malformed pack file, does not start with 'PACK'")
- InvalidObjectErr = newError("invalid git object")
- PatchingErr = newError("patching error")
- PackEntryNotFoundErr = newError("can't find a pack entry")
- ErrObjectNotFound = newError("can't find a object")
- ZLibErr = newError("zlib reading error")
-)
-
-const (
- DefaultMaxObjectsLimit = 1 << 20
-
- VersionSupported = 2
- UnknownFormat Format = 0
- OFSDeltaFormat Format = 1
- REFDeltaFormat Format = 2
-)
-
-// Reader reads a packfile from a binary string splitting it on objects
-type Reader struct {
- // MaxObjectsLimit is the limit of objects to be load in the packfile, if
- // a packfile excess this number an error is throw, the default value
- // is defined by DefaultMaxObjectsLimit, usually the default limit is more
- // than enough to work with any repository, working extremely big repositories
- // where the number of object is bigger the memory can be exhausted.
- MaxObjectsLimit uint32
-
- // Format specifies if we are using ref-delta's or ofs-delta's, choosing the
- // correct format the memory usage is optimized
- // https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/Documentation/technical/protocol-capabilities.txt#L154
- Format Format
-
- r *trackingReader
- s core.ObjectStorage
- offsets map[int64]core.Hash
-}
-
-// NewReader returns a new Reader that reads from a io.Reader
-func NewReader(r io.Reader) *Reader {
- return &Reader{
- MaxObjectsLimit: DefaultMaxObjectsLimit,
-
- r: NewTrackingReader(r),
- offsets: make(map[int64]core.Hash, 0),
- }
-}
-
-// Read reads the objects and stores it at the ObjectStorage
-func (r *Reader) Read(s core.ObjectStorage) (int64, error) {
- r.s = s
- if err := r.validateHeader(); err != nil {
- if err == io.EOF {
- return -1, EmptyRepositoryErr
- }
-
- return -1, err
- }
-
- version, err := r.readInt32()
- if err != nil {
- return -1, err
- }
-
- if version > VersionSupported {
- return -1, UnsupportedVersionErr
- }
-
- count, err := r.readInt32()
- if err != nil {
- return -1, err
- }
-
- if count > r.MaxObjectsLimit {
- return -1, MaxObjectsLimitReachedErr
- }
-
- return r.r.position, r.readObjects(count)
-}
-
-func (r *Reader) validateHeader() error {
- var header = make([]byte, 4)
- if _, err := io.ReadFull(r.r, header); err != nil {
- return err
- }
-
- if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) {
- return MalformedPackfileErr
- }
-
- return nil
-}
-
-func (r *Reader) readInt32() (uint32, error) {
- var value uint32
- if err := binary.Read(r.r, binary.BigEndian, &value); err != nil {
- return 0, err
- }
-
- return value, nil
-}
-
-func (r *Reader) readObjects(count uint32) error {
- // This code has 50-80 µs of overhead per object not counting zlib inflation.
- // Together with zlib inflation, it's 400-410 µs for small objects.
- // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB,
- // of which 12-20 % is _not_ zlib inflation (ie. is our code).
- for i := 0; i < int(count); i++ {
- start := r.r.position
- obj, err := r.newObject()
- if err != nil && err != io.EOF {
- return err
- }
-
- if r.Format == UnknownFormat || r.Format == OFSDeltaFormat {
- r.offsets[start] = obj.Hash()
- }
-
- r.s.Set(obj)
- if err == io.EOF {
- break
- }
- }
-
- return nil
-}
-
-func (r *Reader) newObject() (core.Object, error) {
- raw, err := r.s.New()
- if err != nil {
- return nil, err
- }
- var steps int64
-
- var buf [1]byte
- if _, err := r.r.Read(buf[:]); err != nil {
- return nil, err
- }
-
- typ := core.ObjectType((buf[0] >> 4) & 7)
- size := int64(buf[0] & 15)
- steps++ // byte we just read to get `o.typ` and `o.size`
-
- var shift uint = 4
- for buf[0]&0x80 == 0x80 {
- if _, err := r.r.Read(buf[:]); err != nil {
- return nil, err
- }
-
- size += int64(buf[0]&0x7f) << shift
- steps++ // byte we just read to update `o.size`
- shift += 7
- }
-
- raw.SetType(typ)
- raw.SetSize(size)
-
- switch raw.Type() {
- case core.REFDeltaObject:
- err = r.readREFDelta(raw)
- case core.OFSDeltaObject:
- err = r.readOFSDelta(raw, steps)
- case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject:
- err = r.readObject(raw)
- default:
- err = InvalidObjectErr.n("tag %q", raw.Type)
- }
-
- return raw, err
-}
-
-func (r *Reader) readREFDelta(raw core.Object) (err error) {
- var ref core.Hash
- if _, err := io.ReadFull(r.r, ref[:]); err != nil {
- return err
- }
-
- buf := bytes.NewBuffer(nil)
- if err := r.inflate(buf); err != nil {
- return err
- }
-
- referenced, err := r.s.Get(ref)
- if err != nil {
- if err == core.ErrObjectNotFound {
- return ErrObjectNotFound.n("%s", ref)
- }
- return err
- }
-
- reader, err := referenced.Reader()
- if err != nil {
- return err
- }
- defer checkClose(reader, &err)
-
- d, err := ioutil.ReadAll(reader)
- if err != nil {
- return err
- }
-
- patched := patchDelta(d, buf.Bytes())
- if patched == nil {
- return PatchingErr.n("hash %q", ref)
- }
-
- raw.SetType(referenced.Type())
- raw.SetSize(int64(len(patched)))
-
- writer, err := raw.Writer()
- if err != nil {
- return err
- }
- defer checkClose(writer, &err)
-
- writer.Write(patched)
-
- return nil
-}
-
-func (r *Reader) readOFSDelta(raw core.Object, steps int64) (err error) {
- start := r.r.position
- offset, err := decodeOffset(r.r, steps)
- if err != nil {
- return err
- }
-
- buf := bytes.NewBuffer(nil)
- if err = r.inflate(buf); err != nil {
- return err
- }
-
- ref, ok := r.offsets[start+offset]
- if !ok {
- return PackEntryNotFoundErr.n("offset %d", start+offset)
- }
-
- referenced, err := r.s.Get(ref)
- if err != nil {
- return err
- }
-
- reader, err := referenced.Reader()
- if err != nil {
- return err
- }
- defer checkClose(reader, &err)
-
- d, err := ioutil.ReadAll(reader)
- if err != nil {
- return err
- }
-
- patched := patchDelta(d, buf.Bytes())
- if patched == nil {
- return PatchingErr.n("hash %q", ref)
- }
-
- raw.SetType(referenced.Type())
- raw.SetSize(int64(len(patched)))
-
- writer, err := raw.Writer()
- if err != nil {
- return err
- }
- defer checkClose(writer, &err)
-
- writer.Write(patched)
-
- return nil
-}
-
-func (r *Reader) readObject(raw core.Object) (err error) {
- writer, err := raw.Writer()
- if err != nil {
- return err
- }
- defer checkClose(writer, &err)
-
- return r.inflate(writer)
-}
-
-func (r *Reader) inflate(w io.Writer) error {
- zr, err := zlib.NewReader(r.r)
- if err != nil {
- if err == zlib.ErrHeader {
- return zlib.ErrHeader
- }
-
- return ZLibErr.n("%s", err)
- }
-
- defer zr.Close()
-
- _, err = io.Copy(w, zr)
- return err
-}
-
-type ReaderError struct {
- reason, additional string
-}
-
-func newError(reason string) *ReaderError {
- return &ReaderError{reason: reason}
-}
-
-func (e *ReaderError) Error() string {
- if e.additional == "" {
- return e.reason
- }
-
- return fmt.Sprintf("%s: %s", e.reason, e.additional)
-}
-
-func (e *ReaderError) n(format string, args ...interface{}) *ReaderError {
- return &ReaderError{
- reason: e.reason,
- additional: fmt.Sprintf(format, args...),
- }
-}
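
The entry-header decoding the deleted newObject performed inline (and which the new Parser exposes as ReadObjectTypeAndLength, exercised by the tests above) follows git's variable-length encoding: the first byte packs a continuation bit, a 3-bit object type and the low 4 bits of the size; each continuation byte contributes 7 more size bits. A compact sketch of just that step, mirroring the removed code:

func typeAndLength(r io.ByteReader) (core.ObjectType, int64, error) {
	b, err := r.ReadByte()
	if err != nil {
		return 0, 0, err
	}

	typ := core.ObjectType((b >> 4) & 7) // 3 type bits
	size := int64(b & 15)                // low 4 size bits

	for shift := uint(4); b&0x80 != 0; shift += 7 {
		if b, err = r.ReadByte(); err != nil {
			return 0, 0, err
		}
		size += int64(b&0x7f) << shift // 7 more size bits per byte
	}

	return typ, size, nil
}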
diff --git a/formats/packfile/seekable.go b/formats/packfile/seekable.go
new file mode 100644
index 0000000..ea1c501
--- /dev/null
+++ b/formats/packfile/seekable.go
@@ -0,0 +1,108 @@
+package packfile
+
+import (
+ "io"
+ "os"
+
+ "gopkg.in/src-d/go-git.v3/core"
+)
+
+// Seekable implements ReadRecaller for the io.ReadSeeker of a packfile.
+// Remembering does not actually store any reference to the remembered
+// objects; only the object offset is remembered, and the packfile is
+// read again every time a recall operation is requested. This saves
+// memory but can be very slow if the associated io.ReadSeeker is slow
+// (like a hard disk).
+type Seekable struct {
+ io.ReadSeeker
+ HashToOffset map[core.Hash]int64
+}
+
+// NewSeekable returns a new Seekable that reads from r.
+func NewSeekable(r io.ReadSeeker) *Seekable {
+ return &Seekable{
+ r,
+ make(map[core.Hash]int64),
+ }
+}
+
+// Read reads up to len(p) bytes into p.
+func (r *Seekable) Read(p []byte) (int, error) {
+ return r.ReadSeeker.Read(p)
+}
+
+// ReadByte reads a byte.
+func (r *Seekable) ReadByte() (byte, error) {
+ var p [1]byte
+ _, err := r.ReadSeeker.Read(p[:])
+ if err != nil {
+ return 0, err
+ }
+
+ return p[0], nil
+}
+
+// Offset returns the offset for the next Read or ReadByte.
+func (r *Seekable) Offset() (int64, error) {
+ return r.Seek(0, os.SEEK_CUR)
+}
+
+// Remember stores the offset of the object and its hash, but not the
+// object itself. This implementation does not check for already stored
+// offsets, as it is too expensive to build this information from an
+// index every time a get operation is performed on the Seekable.
+func (r *Seekable) Remember(o int64, obj core.Object) error {
+ h := obj.Hash()
+ if _, ok := r.HashToOffset[h]; ok {
+ return ErrDuplicatedObject.AddDetails("with hash %s", h)
+ }
+
+ r.HashToOffset[h] = o
+
+ return nil
+}
+
+// ForgetAll forgets all previously remembered objects. For efficiency
+// reasons, RecallByOffset always finds objects, even if they have been
+// forgotten or were never remembered.
+func (r *Seekable) ForgetAll() {
+ r.HashToOffset = make(map[core.Hash]int64)
+}
+
+// RecallByHash returns the object for a given hash by looking for it again
+// in the io.ReadSeeker.
+func (r *Seekable) RecallByHash(h core.Hash) (core.Object, error) {
+ o, ok := r.HashToOffset[h]
+ if !ok {
+ return nil, ErrCannotRecall.AddDetails("hash not found: %s", h)
+ }
+
+ return r.RecallByOffset(o)
+}
+
+// RecallByOffset returns the object for a given offset by looking for it
+// again in the io.ReadSeeker. For efficiency reasons, this method always
+// finds objects by offset, even if they have not been remembered or have
+// been forgotten.
+func (r *Seekable) RecallByOffset(o int64) (obj core.Object, err error) {
+ // remember current offset
+ beforeJump, err := r.Offset()
+ if err != nil {
+ return nil, err
+ }
+
+ defer func() {
+ // jump back
+ _, seekErr := r.Seek(beforeJump, os.SEEK_SET)
+ if err == nil {
+ err = seekErr
+ }
+ }()
+
+ // jump to requested offset
+ _, err = r.Seek(o, os.SEEK_SET)
+ if err != nil {
+ return nil, err
+ }
+
+ return NewParser(r).ReadObject()
+}
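
A hedged usage sketch for this implementation: backing a Seekable with a file on disk keeps memory flat, at the cost of re-reading and re-parsing an entry on every recall. The fixture path is the one used by the tests above; the caller is responsible for closing the file:

func openPackfile() (*Seekable, error) {
	f, err := os.Open("fixtures/spinnaker-spinnaker.pack")
	if err != nil {
		return nil, err
	}

	// *os.File is an io.ReadSeeker, so recalls can re-read the file
	return NewSeekable(f), nil
}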
diff --git a/formats/packfile/stream.go b/formats/packfile/stream.go
new file mode 100644
index 0000000..41266b1
--- /dev/null
+++ b/formats/packfile/stream.go
@@ -0,0 +1,95 @@
+package packfile
+
+import (
+ "io"
+
+ "gopkg.in/src-d/go-git.v3/core"
+)
+
+// Stream implements ReadRecaller for the io.Reader of a packfile. This
+// implementation keeps all remembered objects referenced in maps for
+// quick access.
+type Stream struct {
+ io.Reader
+ count int64
+ offsetToObject map[int64]core.Object
+ hashToObject map[core.Hash]core.Object
+}
+
+// NewStream returns a new Stream that reads from r.
+func NewStream(r io.Reader) *Stream {
+ return &Stream{
+ Reader: r,
+ count: 0,
+		hashToObject:   make(map[core.Hash]core.Object),
+		offsetToObject: make(map[int64]core.Object),
+ }
+}
+
+// Read reads up to len(p) bytes into p.
+func (r *Stream) Read(p []byte) (n int, err error) {
+ n, err = r.Reader.Read(p)
+ r.count += int64(n)
+
+ return
+}
+
+// ReadByte reads a byte.
+func (r *Stream) ReadByte() (byte, error) {
+ var p [1]byte
+ _, err := r.Reader.Read(p[:])
+ r.count++
+
+ return p[0], err
+}
+
+// Offset returns the number of bytes read.
+func (r *Stream) Offset() (int64, error) {
+ return r.count, nil
+}
+
+// Remember stores references to the passed object to be used later by
+// RecallByHash and RecallByOffset. It receives the object and the offset
+// of its object entry in the packfile.
+func (r *Stream) Remember(o int64, obj core.Object) error {
+ h := obj.Hash()
+ if _, ok := r.hashToObject[h]; ok {
+ return ErrDuplicatedObject.AddDetails("with hash %s", h)
+ }
+ r.hashToObject[h] = obj
+
+ if _, ok := r.offsetToObject[o]; ok {
+ return ErrDuplicatedObject.AddDetails("with offset %d", o)
+ }
+ r.offsetToObject[o] = obj
+
+ return nil
+}
+
+// ForgetAll forgets all previously remembered objects.
+func (r *Stream) ForgetAll() {
+ r.hashToObject = make(map[core.Hash]core.Object)
+ r.offsetToObject = make(map[int64]core.Object)
+}
+
+// RecallByHash returns an object that has been previously Remember-ed by
+// its hash.
+func (r *Stream) RecallByHash(h core.Hash) (core.Object, error) {
+ obj, ok := r.hashToObject[h]
+ if !ok {
+ return nil, ErrCannotRecall.AddDetails("by hash %s", h)
+ }
+
+ return obj, nil
+}
+
+// RecallByOffset returns an object that has been previously Remember-ed by
+// the offset of its object entry in the packfile.
+func (r *Stream) RecallByOffset(o int64) (core.Object, error) {
+ obj, ok := r.offsetToObject[o]
+ if !ok {
+ return nil, ErrCannotRecall.AddDetails("no object found at offset %d", o)
+ }
+
+ return obj, nil
+}
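
A closing sketch of the trade-off between the two ReadRecaller implementations: Stream works on any one-pass io.Reader but pins every remembered object in memory, while Seekable needs an io.ReadSeeker and keeps only a hash-to-offset map. A hypothetical helper (not part of this patch) choosing between them could look like:

func newReadRecaller(r io.Reader) ReadRecaller {
	if rs, ok := r.(io.ReadSeeker); ok {
		return NewSeekable(rs) // low memory, slower recalls
	}

	return NewStream(r) // fast recalls, objects kept in memory
}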