Diffstat (limited to 'formats/packfile')
-rw-r--r--  formats/packfile/common.go        |  35
-rw-r--r--  formats/packfile/common_test.go   |   9
-rw-r--r--  formats/packfile/delta.go         |  92
-rw-r--r--  formats/packfile/doc.go           | 168
-rw-r--r--  formats/packfile/objects.go       | 244
-rw-r--r--  formats/packfile/objects_test.go  | 116
-rw-r--r--  formats/packfile/packfile.go      |  80
-rw-r--r--  formats/packfile/reader.go        | 387
-rw-r--r--  formats/packfile/reader_test.go   |  35
9 files changed, 1166 insertions(+), 0 deletions(-)
diff --git a/formats/packfile/common.go b/formats/packfile/common.go
new file mode 100644
index 0000000..4a97dc7
--- /dev/null
+++ b/formats/packfile/common.go
@@ -0,0 +1,35 @@
+package packfile
+
+import (
+ "fmt"
+ "io"
+)
+
+type trackingReader struct {
+ r io.Reader
+ n int
+}
+
+func (t *trackingReader) Pos() int { return t.n }
+
+func (t *trackingReader) Read(p []byte) (n int, err error) {
+	n, err = t.r.Read(p)
+	t.n += n // count the bytes even on a partial read followed by an error
+
+	return n, err
+}
+
+func (t *trackingReader) ReadByte() (c byte, err error) {
+ var p [1]byte
+ n, err := t.r.Read(p[:])
+ if err != nil {
+ return 0, err
+ }
+
+	if n != 1 {
+		return 0, fmt.Errorf("read %d bytes, should have read exactly 1", n)
+	}
+
+	t.n++
+	return p[0], nil
+}
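
For illustration, a minimal package-internal sketch of driving this tracking
reader (the type is unexported, so the snippet lives in the package; the
function name is hypothetical):

    package packfile

    import (
    	"bytes"
    	"fmt"
    )

    func exampleTrackingReader() {
    	r := &trackingReader{r: bytes.NewReader([]byte("PACKdata"))}
    	var sig [4]byte
    	r.Read(sig[:])       // consume the 4-byte signature
    	b, _ := r.ReadByte() // consume one more byte
    	fmt.Println(r.Pos(), string(sig[:]), string(b)) // prints: 5 PACK d
    }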
diff --git a/formats/packfile/common_test.go b/formats/packfile/common_test.go
new file mode 100644
index 0000000..104a5d2
--- /dev/null
+++ b/formats/packfile/common_test.go
@@ -0,0 +1,9 @@
+package packfile
+
+import (
+ "testing"
+
+ . "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) { TestingT(t) }
diff --git a/formats/packfile/delta.go b/formats/packfile/delta.go
new file mode 100644
index 0000000..86b556f
--- /dev/null
+++ b/formats/packfile/delta.go
@@ -0,0 +1,92 @@
+package packfile
+
+const deltaSizeMin = 4
+
+func deltaHeaderSize(b []byte) (uint, []byte) {
+ var size, j uint
+ var cmd byte
+ for {
+ cmd = b[j]
+ size |= (uint(cmd) & 0x7f) << (j * 7)
+ j++
+		if uint(cmd)&0x80 == 0 || j == uint(len(b)) {
+ break
+ }
+ }
+ return size, b[j:]
+}
+
+func PatchDelta(src, delta []byte) []byte {
+	if len(delta) < deltaSizeMin {
+ return nil
+ }
+ size, delta := deltaHeaderSize(delta)
+ if size != uint(len(src)) {
+ return nil
+ }
+ size, delta = deltaHeaderSize(delta)
+ origSize := size
+
+	dest := make([]byte, 0, size) // preallocate the target size
+
+	var cmd byte
+ for {
+ cmd = delta[0]
+ delta = delta[1:]
+ if (cmd & 0x80) != 0 {
+ var cp_off, cp_size uint
+ if (cmd & 0x01) != 0 {
+ cp_off = uint(delta[0])
+ delta = delta[1:]
+ }
+ if (cmd & 0x02) != 0 {
+ cp_off |= uint(delta[0]) << 8
+ delta = delta[1:]
+ }
+ if (cmd & 0x04) != 0 {
+ cp_off |= uint(delta[0]) << 16
+ delta = delta[1:]
+ }
+ if (cmd & 0x08) != 0 {
+ cp_off |= uint(delta[0]) << 24
+ delta = delta[1:]
+ }
+
+ if (cmd & 0x10) != 0 {
+ cp_size = uint(delta[0])
+ delta = delta[1:]
+ }
+ if (cmd & 0x20) != 0 {
+ cp_size |= uint(delta[0]) << 8
+ delta = delta[1:]
+ }
+ if (cmd & 0x40) != 0 {
+ cp_size |= uint(delta[0]) << 16
+ delta = delta[1:]
+ }
+ if cp_size == 0 {
+ cp_size = 0x10000
+ }
+ if cp_off+cp_size < cp_off ||
+ cp_off+cp_size > uint(len(src)) ||
+ cp_size > origSize {
+ break
+ }
+ dest = append(dest, src[cp_off:cp_off+cp_size]...)
+ size -= cp_size
+ } else if cmd != 0 {
+ if uint(cmd) > origSize {
+ break
+ }
+ dest = append(dest, delta[0:uint(cmd)]...)
+ size -= uint(cmd)
+ delta = delta[uint(cmd):]
+ } else {
+ return nil
+ }
+		if size == 0 { // delta fully applied; size is unsigned, so test equality
+ break
+ }
+ }
+ return dest
+}
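
To make the copy/insert commands concrete, a hand-assembled delta against the
base "hello world"; the byte values follow the command layout described in
doc.go, and the function name is hypothetical:

    package packfile

    import "fmt"

    func examplePatchDelta() {
    	src := []byte("hello world")
    	delta := []byte{
    		0x0b,           // source size = 11
    		0x08,           // target size = 8
    		0x90, 0x06,     // copy 6 bytes from offset 0 ("hello ")
    		0x02, 'G', 'o', // insert the 2 literal bytes "Go"
    	}
    	fmt.Printf("%s\n", PatchDelta(src, delta)) // prints: hello Go
    }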
diff --git a/formats/packfile/doc.go b/formats/packfile/doc.go
new file mode 100644
index 0000000..1fc28da
--- /dev/null
+++ b/formats/packfile/doc.go
@@ -0,0 +1,168 @@
+package packfile
+
+// Code from:
+// https://github.com/gitchain/gitchain/tree/master/git @ 4c2fabdf9
+//
+// GIT pack format
+// ===============
+//
+// == pack-*.pack files have the following format:
+//
+// - A header appears at the beginning and consists of the following:
+//
+// 4-byte signature:
+// The signature is: {'P', 'A', 'C', 'K'}
+//
+// 4-byte version number (network byte order):
+// GIT currently accepts version number 2 or 3 but
+// generates version 2 only.
+//
+// 4-byte number of objects contained in the pack (network byte order)
+//
+// Observation: we cannot have more than 4G versions ;-) and
+// more than 4G objects in a pack.
+//
+// - The header is followed by number of object entries, each of
+// which looks like this:
+//
+// (undeltified representation)
+// n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+// compressed data
+//
+// (deltified representation)
+// n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+// 20-byte base object name
+// compressed delta data
+//
+// Observation: length of each object is encoded in a variable
+// length format and is not constrained to 32-bit or anything.
+//
+// - The trailer records 20-byte SHA1 checksum of all of the above.
+//
+// == Original (version 1) pack-*.idx files have the following format:
+//
+// - The header consists of 256 4-byte network byte order
+// integers. N-th entry of this table records the number of
+// objects in the corresponding pack, the first byte of whose
+// object name is less than or equal to N. This is called the
+// 'first-level fan-out' table.
+//
+// - The header is followed by sorted 24-byte entries, one entry
+// per object in the pack. Each entry is:
+//
+// 4-byte network byte order integer, recording where the
+// object is stored in the packfile as the offset from the
+// beginning.
+//
+// 20-byte object name.
+//
+// - The file is concluded with a trailer:
+//
+// A copy of the 20-byte SHA1 checksum at the end of
+// corresponding packfile.
+//
+// 20-byte SHA1-checksum of all of the above.
+//
+// Pack Idx file:
+//
+// -- +--------------------------------+
+// fanout | fanout[0] = 2 (for example) |-.
+// table +--------------------------------+ |
+// | fanout[1] | |
+// +--------------------------------+ |
+// | fanout[2] | |
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+// | fanout[255] = total objects |---.
+// -- +--------------------------------+ | |
+// main | offset | | |
+// index | object name 00XXXXXXXXXXXXXXXX | | |
+// table +--------------------------------+ | |
+// | offset | | |
+// | object name 00XXXXXXXXXXXXXXXX | | |
+// +--------------------------------+<+ |
+// .-| offset | |
+// | | object name 01XXXXXXXXXXXXXXXX | |
+// | +--------------------------------+ |
+// | | offset | |
+// | | object name 01XXXXXXXXXXXXXXXX | |
+// | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+// | | offset | |
+// | | object name FFXXXXXXXXXXXXXXXX | |
+// --| +--------------------------------+<--+
+// trailer | | packfile checksum |
+// | +--------------------------------+
+// | | idxfile checksum |
+// | +--------------------------------+
+// .-------.
+// |
+// Pack file entry: <+
+//
+// packed object header:
+// 1-byte size extension bit (MSB)
+// type (next 3 bit)
+// size0 (lower 4-bit)
+// n-byte sizeN (as long as MSB is set, each 7-bit)
+// size0..sizeN form 4+7+7+..+7 bit integer, size0
+// is the least significant part, and sizeN is the
+// most significant part.
+// packed object data:
+// If it is not DELTA, then deflated bytes (the size above
+// is the size before compression).
+// If it is REF_DELTA, then
+// 20-byte base object name SHA1 (the size above is the
+// size of the delta data that follows).
+// delta data, deflated.
+// If it is OFS_DELTA, then
+// n-byte offset (see below) interpreted as a negative
+// offset from the type-byte of the header of the
+// ofs-delta entry (the size above is the size of
+// the delta data that follows).
+// delta data, deflated.
+//
+// offset encoding:
+// n bytes with MSB set in all but the last one.
+// The offset is then the number constructed by
+// concatenating the lower 7 bit of each byte, and
+// for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
+// to the result.
+//
+//
+//
+// == Version 2 pack-*.idx files support packs larger than 4 GiB, and
+// have some other reorganizations. They have the format:
+//
+// - A 4-byte magic number '\377tOc' which is an unreasonable
+// fanout[0] value.
+//
+// - A 4-byte version number (= 2)
+//
+// - A 256-entry fan-out table just like v1.
+//
+// - A table of sorted 20-byte SHA1 object names. These are
+// packed together without offset values to reduce the cache
+// footprint of the binary search for a specific object name.
+//
+// - A table of 4-byte CRC32 values of the packed object data.
+// This is new in v2 so compressed data can be copied directly
+// from pack to pack during repacking without undetected
+// data corruption.
+//
+// - A table of 4-byte offset values (in network byte order).
+// These are usually 31-bit pack file offsets, but large
+// offsets are encoded as an index into the next table with
+// the msbit set.
+//
+// - A table of 8-byte offset entries (empty for pack files less
+// than 2 GiB). Pack files are organized with heavily used
+// objects toward the front, so most object references should
+// not need to refer to this table.
+//
+// - The same trailer as a v1 pack file:
+//
+// A copy of the 20-byte SHA1 checksum at the end of
+// corresponding packfile.
+//
+// 20-byte SHA1-checksum of all of the above.
+//
+// From:
+// https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt
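
A worked example of the offset encoding described above, with arbitrarily
chosen bytes: 0x91 0x2e carries the 7-bit groups 17 and 46, i.e. 17<<7|46 =
2222, plus 2^7 for the second byte, giving 2350. In this illustrative
re-implementation (names hypothetical), the +1 before each shift is what
contributes the 2^7 + 2^14 + ... terms:

    package packfile

    import "fmt"

    func exampleDecodeOFSOffset() {
    	b := []byte{0x91, 0x2e}
    	offset := int(b[0] & 0x7f)
    	for i := 1; b[i-1]&0x80 != 0; i++ {
    		// +1 before shifting adds 2^7 + 2^14 + ... + 2^(7*(n-1))
    		offset = ((offset + 1) << 7) | int(b[i]&0x7f)
    	}
    	fmt.Println(offset) // prints: 2350
    }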
diff --git a/formats/packfile/objects.go b/formats/packfile/objects.go
new file mode 100644
index 0000000..1077b5f
--- /dev/null
+++ b/formats/packfile/objects.go
@@ -0,0 +1,244 @@
+package packfile
+
+import (
+ "bytes"
+ "crypto/sha1"
+ "encoding/hex"
+ "fmt"
+ "strconv"
+ "time"
+)
+
+type ObjectType string
+
+const (
+ CommitObject ObjectType = "commit"
+ TreeObject ObjectType = "tree"
+ BlobObject ObjectType = "blob"
+)
+
+// Object generic object interface
+type Object interface {
+ Type() ObjectType
+ Hash() Hash
+}
+
+// Hash is the SHA-1 hash of object content
+type Hash [20]byte
+
+// ComputeHash computes the hash for a given object type and content
+func ComputeHash(t ObjectType, content []byte) Hash {
+ h := []byte(t)
+ h = append(h, ' ')
+ h = strconv.AppendInt(h, int64(len(content)), 10)
+ h = append(h, 0)
+ h = append(h, content...)
+
+ return Hash(sha1.Sum(h))
+}
+
+func (h Hash) String() string {
+ return hex.EncodeToString(h[:])
+}
+
+// Commit points to a single tree, marking it as what the project looked like
+// at a certain point in time. It contains meta-information about that point
+// in time, such as a timestamp, the author of the changes since the last
+// commit, a pointer to the previous commit(s), etc.
+// http://schacon.github.io/gitbook/1_the_git_object_model.html
+type Commit struct {
+ Tree Hash
+ Parents []Hash
+ Author Signature
+ Committer Signature
+ Message string
+ hash Hash
+}
+
+// ParseCommit transforms a byte slice into a Commit struct
+func ParseCommit(b []byte) (*Commit, error) {
+ o := &Commit{hash: ComputeHash(CommitObject, b)}
+
+ lines := bytes.Split(b, []byte{'\n'})
+ for i := range lines {
+ if len(lines[i]) > 0 {
+ var err error
+
+ split := bytes.SplitN(lines[i], []byte{' '}, 2)
+ switch string(split[0]) {
+ case "tree":
+ _, err = hex.Decode(o.Tree[:], split[1])
+ case "parent":
+ var h Hash
+ _, err = hex.Decode(h[:], split[1])
+ if err == nil {
+ o.Parents = append(o.Parents, h)
+ }
+ case "author":
+ o.Author = ParseSignature(split[1])
+ case "committer":
+ o.Committer = ParseSignature(split[1])
+ }
+
+ if err != nil {
+ return nil, err
+ }
+ } else {
+			o.Message = string(bytes.Join(lines[i+1:], []byte{'\n'}))
+ break
+ }
+
+ }
+
+ return o, nil
+}
+
+// Type returns the object type
+func (o *Commit) Type() ObjectType {
+ return CommitObject
+}
+
+// Hash returns the computed hash of the commit
+func (o *Commit) Hash() Hash {
+ return o.hash
+}
+
+// Tree is basically like a directory - it references a bunch of other trees
+// and/or blobs (i.e. files and sub-directories)
+type Tree struct {
+ Entries []TreeEntry
+ hash Hash
+}
+
+// TreeEntry represents a file
+type TreeEntry struct {
+ Name string
+ Hash Hash
+}
+
+// ParseTree transforms a byte slice into a Tree struct
+func ParseTree(b []byte) (*Tree, error) {
+ o := &Tree{hash: ComputeHash(TreeObject, b)}
+
+ if len(b) == 0 {
+ return o, nil
+ }
+
+ for {
+ split := bytes.SplitN(b, []byte{0}, 2)
+ split1 := bytes.SplitN(split[0], []byte{' '}, 2)
+
+ entry := TreeEntry{}
+ entry.Name = string(split1[1])
+ copy(entry.Hash[:], split[1][0:20])
+
+ o.Entries = append(o.Entries, entry)
+
+ b = split[1][20:]
+ if len(split[1]) == 20 {
+ break
+ }
+ }
+
+ return o, nil
+}
+
+// Type returns the object type
+func (o *Tree) Type() ObjectType {
+ return TreeObject
+}
+
+// Hash returns the computed hash of the tree
+func (o *Tree) Hash() Hash {
+ return o.hash
+}
+
+// Blob is used to store file data - it is generally a file.
+type Blob struct {
+ Len int
+ hash Hash
+}
+
+// ParseBlob transforms a byte slice into a Blob struct
+func ParseBlob(b []byte) (*Blob, error) {
+ return &Blob{
+ Len: len(b),
+ hash: ComputeHash(BlobObject, b),
+ }, nil
+}
+
+// Type returns the object type
+func (o *Blob) Type() ObjectType {
+ return BlobObject
+}
+
+// Hash returns the computed hash of the blob
+func (o *Blob) Hash() Hash {
+ return o.hash
+}
+
+type ContentCallback func(hash Hash, content []byte)
+
+// Signature represents an action signed by a person
+type Signature struct {
+ Name string
+ Email string
+ When time.Time
+}
+
+// ParseSignature parses a byte slice, returning a new action signature.
+func ParseSignature(signature []byte) Signature {
+ ret := Signature{}
+ if len(signature) == 0 {
+ return ret
+ }
+
+ from := 0
+ state := 'n' // n: name, e: email, t: timestamp, z: timezone
+ for i := 0; ; i++ {
+ var c byte
+ var end bool
+ if i < len(signature) {
+ c = signature[i]
+ } else {
+ end = true
+ }
+
+ switch state {
+ case 'n':
+ if c == '<' || end {
+ if i == 0 {
+ break
+ }
+ ret.Name = string(signature[from : i-1])
+ state = 'e'
+ from = i + 1
+ }
+ case 'e':
+ if c == '>' || end {
+ ret.Email = string(signature[from:i])
+ i++
+ state = 't'
+ from = i + 1
+ }
+ case 't':
+ if c == ' ' || end {
+ t, err := strconv.ParseInt(string(signature[from:i]), 10, 64)
+ if err == nil {
+ ret.When = time.Unix(t, 0)
+ }
+ end = true
+ }
+ }
+
+ if end {
+ break
+ }
+ }
+
+ return ret
+}
+
+func (s *Signature) String() string {
+ return fmt.Sprintf("%q <%s> @ %s", s.Name, s.Email, s.When)
+}
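
A short sketch tying these parsers together: ComputeHash frames the content
with the same "type SP length NUL" header that git itself hashes, so the empty
blob yields git's well-known object name (the function name is hypothetical):

    package packfile

    import "fmt"

    func exampleObjects() {
    	// the empty blob
    	fmt.Println(ComputeHash(BlobObject, nil))
    	// prints: e69de29bb2d1d6434b8b29ae775ad8c2e48c5391

    	sig := ParseSignature([]byte("Foo Bar <foo@bar.com> 1257894000 +0100"))
    	fmt.Println(sig.Name, sig.Email, sig.When.Unix())
    	// prints: Foo Bar foo@bar.com 1257894000
    }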
diff --git a/formats/packfile/objects_test.go b/formats/packfile/objects_test.go
new file mode 100644
index 0000000..5952432
--- /dev/null
+++ b/formats/packfile/objects_test.go
@@ -0,0 +1,116 @@
+package packfile
+
+import (
+ "encoding/base64"
+ "time"
+
+ . "gopkg.in/check.v1"
+)
+
+type ObjectsSuite struct{}
+
+var _ = Suite(&ObjectsSuite{})
+
+func (s *ObjectsSuite) TestComputeHash(c *C) {
+ hash := ComputeHash("blob", []byte(""))
+ c.Assert(hash.String(), Equals, "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391")
+
+ hash = ComputeHash("blob", []byte("Hello, World!\n"))
+ c.Assert(hash.String(), Equals, "8ab686eafeb1f44702738c8b0f24f2567c36da6d")
+}
+
+var CommitFixture = "dHJlZSBjMmQzMGZhOGVmMjg4NjE4ZjY1ZjZlZWQ2ZTE2OGUwZDUxNDg4NmY0CnBhcmVudCBiMDI5NTE3ZjYzMDBjMmRhMGY0YjY1MWI4NjQyNTA2Y2Q2YWFmNDVkCnBhcmVudCBiOGU0NzFmNThiY2JjYTYzYjA3YmRhMjBlNDI4MTkwNDA5YzJkYjQ3CmF1dGhvciBNw6F4aW1vIEN1YWRyb3MgPG1jdWFkcm9zQGdtYWlsLmNvbT4gMTQyNzgwMjQzNCArMDIwMApjb21taXR0ZXIgTcOheGltbyBDdWFkcm9zIDxtY3VhZHJvc0BnbWFpbC5jb20+IDE0Mjc4MDI0MzQgKzAyMDAKCk1lcmdlIHB1bGwgcmVxdWVzdCAjMSBmcm9tIGRyaXBvbGxlcy9mZWF0dXJlCgpDcmVhdGluZyBjaGFuZ2Vsb2c="
+
+func (s *ObjectsSuite) TestParseCommit(c *C) {
+ data, _ := base64.StdEncoding.DecodeString(CommitFixture)
+ commit, err := ParseCommit(data)
+ c.Assert(err, IsNil)
+
+ c.Assert(commit.Tree.String(), Equals, "c2d30fa8ef288618f65f6eed6e168e0d514886f4")
+ c.Assert(commit.Parents, HasLen, 2)
+ c.Assert(commit.Parents[0].String(), Equals, "b029517f6300c2da0f4b651b8642506cd6aaf45d")
+ c.Assert(commit.Parents[1].String(), Equals, "b8e471f58bcbca63b07bda20e428190409c2db47")
+ c.Assert(commit.Author.Email, Equals, "mcuadros@gmail.com")
+ c.Assert(commit.Author.Name, Equals, "Máximo Cuadros")
+ c.Assert(commit.Author.When.Unix(), Equals, int64(1427802434))
+ c.Assert(commit.Committer.Email, Equals, "mcuadros@gmail.com")
+ c.Assert(commit.Message, Equals, "Merge pull request #1 from dripolles/feature\n\nCreating changelog")
+}
+
+func (s *ObjectsSuite) TestCommitHash(c *C) {
+ data, _ := base64.StdEncoding.DecodeString(CommitFixture)
+ commit, err := ParseCommit(data)
+
+ c.Assert(err, IsNil)
+ c.Assert(commit.Hash().String(), Equals, "a5b8b09e2f8fcb0bb99d3ccb0958157b40890d69")
+}
+
+var TreeFixture = "MTAwNjQ0IC5naXRpZ25vcmUAMoWKrTw4PtH/Cg+b3yMdVKAMnogxMDA2NDQgQ0hBTkdFTE9HANP/U+BWSp+H2OhLbijlBg5RcAiqMTAwNjQ0IExJQ0VOU0UAwZK9aiTqGrAdeGhuQXyL3Hw9GX8xMDA2NDQgYmluYXJ5LmpwZwDVwPSrgRiXyt8DrsNYrmDSH5HFDTQwMDAwIGdvAKOXcadlH5f69ccuCCJNhX/DUTPbNDAwMDAganNvbgBah35qkGonQ61uRdmcF5NkKq+O2jQwMDAwIHBocABYavVn0Ltedx5JvdlDT14Pt20l+jQwMDAwIHZlbmRvcgDPSqOziXT7fYHzZ8CDD3141lq4aw=="
+
+func (s *ObjectsSuite) TestParseTree(c *C) {
+ data, _ := base64.StdEncoding.DecodeString(TreeFixture)
+ tree, err := ParseTree(data)
+ c.Assert(err, IsNil)
+
+ c.Assert(tree.Entries, HasLen, 8)
+ c.Assert(tree.Entries[0].Name, Equals, ".gitignore")
+ c.Assert(tree.Entries[0].Hash.String(), Equals, "32858aad3c383ed1ff0a0f9bdf231d54a00c9e88")
+}
+
+func (s *ObjectsSuite) TestTreeHash(c *C) {
+ data, _ := base64.StdEncoding.DecodeString(TreeFixture)
+ tree, err := ParseTree(data)
+
+ c.Assert(err, IsNil)
+ c.Assert(tree.Hash().String(), Equals, "a8d315b2b1c615d43042c3a62402b8a54288cf5c")
+}
+
+func (s *ObjectsSuite) TestBlobHash(c *C) {
+ blob, err := ParseBlob([]byte{'F', 'O', 'O'})
+ c.Assert(err, IsNil)
+
+ c.Assert(blob.Len, Equals, 3)
+ c.Assert(blob.Hash().String(), Equals, "d96c7efbfec2814ae0301ad054dc8d9fc416c9b5")
+}
+
+func (s *ObjectsSuite) TestParseSignature(c *C) {
+ cases := map[string]Signature{
+ `Foo Bar <foo@bar.com> 1257894000 +0100`: {
+ Name: "Foo Bar",
+ Email: "foo@bar.com",
+ When: time.Unix(1257894000, 0),
+ },
+ `Foo Bar <> 1257894000 +0100`: {
+ Name: "Foo Bar",
+ Email: "",
+ When: time.Unix(1257894000, 0),
+ },
+ ` <> 1257894000`: {
+ Name: "",
+ Email: "",
+ When: time.Unix(1257894000, 0),
+ },
+ `Foo Bar <foo@bar.com>`: {
+ Name: "Foo Bar",
+ Email: "foo@bar.com",
+ When: time.Time{},
+ },
+ ``: {
+ Name: "",
+ Email: "",
+ When: time.Time{},
+ },
+ `<`: {
+ Name: "",
+ Email: "",
+ When: time.Time{},
+ },
+ }
+
+ for raw, exp := range cases {
+ got := ParseSignature([]byte(raw))
+ c.Assert(got.Name, Equals, exp.Name)
+ c.Assert(got.Email, Equals, exp.Email)
+ c.Assert(got.When.Unix(), Equals, exp.When.Unix())
+ }
+}
diff --git a/formats/packfile/packfile.go b/formats/packfile/packfile.go
new file mode 100644
index 0000000..d70f396
--- /dev/null
+++ b/formats/packfile/packfile.go
@@ -0,0 +1,80 @@
+package packfile
+
+import "fmt"
+
+type Packfile struct {
+ Version uint32
+ Size int64
+ ObjectCount int
+ Checksum []byte
+ Commits map[Hash]*Commit
+ Trees map[Hash]*Tree
+ Blobs map[Hash]*Blob
+}
+
+func NewPackfile() *Packfile {
+ return &Packfile{
+		Commits: make(map[Hash]*Commit),
+		Trees:   make(map[Hash]*Tree),
+		Blobs:   make(map[Hash]*Blob),
+ }
+}
+
+type BlobEntry struct {
+ path string
+ *Blob
+}
+
+type SubtreeEntry struct {
+ path string
+ *Tree
+ TreeCh
+}
+
+type treeEntry interface {
+ isTreeEntry()
+ Path() string
+}
+
+func (b BlobEntry) isTreeEntry() {}
+func (b BlobEntry) Path() string { return b.path }
+func (b SubtreeEntry) isTreeEntry() {}
+func (b SubtreeEntry) Path() string { return b.path }
+
+type TreeCh <-chan treeEntry
+
+func (p *Packfile) WalkCommit(commitHash Hash) (TreeCh, error) {
+ commit, ok := p.Commits[commitHash]
+ if !ok {
+ return nil, fmt.Errorf("Unable to find %q commit", commitHash)
+ }
+
+ return p.WalkTree(p.Trees[commit.Tree]), nil
+}
+
+func (p *Packfile) WalkTree(tree *Tree) TreeCh {
+ return p.walkTree(tree, "")
+}
+
+func (p *Packfile) walkTree(tree *Tree, pathPrefix string) TreeCh {
+ ch := make(chan treeEntry)
+
+ if tree == nil {
+ close(ch)
+ return ch
+ }
+
+ go func() {
+		defer close(ch)
+ for _, e := range tree.Entries {
+ path := pathPrefix + e.Name
+ if blob, ok := p.Blobs[e.Hash]; ok {
+ ch <- BlobEntry{path, blob}
+ } else if subtree, ok := p.Trees[e.Hash]; ok {
+ ch <- SubtreeEntry{path, subtree, p.walkTree(subtree, path+"/")}
+ }
+ }
+ }()
+
+ return ch
+}
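
A sketch of consuming this channel-based walk; the helper name is hypothetical
and it assumes a *Packfile already populated by the reader:

    package packfile

    import "fmt"

    func printTree(entries TreeCh) {
    	for e := range entries {
    		switch v := e.(type) {
    		case BlobEntry:
    			fmt.Println("blob:", v.Path(), v.Hash())
    		case SubtreeEntry:
    			fmt.Println("tree:", v.Path())
    			printTree(v.TreeCh) // recurse into the subtree's own channel
    		}
    	}
    }

    // usage: printTree(pf.WalkTree(tree))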
diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go
new file mode 100644
index 0000000..d5f40b9
--- /dev/null
+++ b/formats/packfile/reader.go
@@ -0,0 +1,387 @@
+package packfile
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "io"
+
+ "github.com/klauspost/compress/zlib"
+)
+
+const (
+ DefaultMaxObjectsLimit = 1 << 20
+ DefaultMaxObjectSize = 1 << 32 // 4GB
+
+ rawCommit = 1
+ rawTree = 2
+ rawBlob = 3
+ rawTag = 4
+ rawOFSDelta = 6
+ rawREFDelta = 7
+)
+
+type PackfileReader struct {
+	// MaxObjectsLimit is the limit of objects to be loaded in the packfile.
+	// If a packfile exceeds this number, an error is returned. The default
+	// value, DefaultMaxObjectsLimit, is usually more than enough to work
+	// with any repository; with extremely big repositories, where the number
+	// of objects is larger, memory can be exhausted.
+ MaxObjectsLimit int
+
+	// MaxObjectSize is the maximum size in bytes; reading objects with a
+	// bigger size causes an error. The default value is DefaultMaxObjectSize.
+ MaxObjectSize int
+
+ r *trackingReader
+ objects map[Hash]packfileObject
+ offsets map[int]Hash
+ deltas []packfileDelta
+ contentCallback ContentCallback
+}
+
+type packfileObject struct {
+ bytes []byte
+ typ int8
+}
+
+type packfileDelta struct {
+ hash Hash
+ delta []byte
+}
+
+func NewPackfileReader(r io.Reader, fn ContentCallback) (*PackfileReader, error) {
+ return &PackfileReader{
+ MaxObjectsLimit: DefaultMaxObjectsLimit,
+ MaxObjectSize: DefaultMaxObjectSize,
+ r: &trackingReader{r: r},
+		objects:         make(map[Hash]packfileObject),
+		offsets:         make(map[int]Hash),
+ contentCallback: fn,
+ }, nil
+}
+
+func (pr *PackfileReader) Read() (*Packfile, error) {
+ packfile := NewPackfile()
+
+ if err := pr.validateHeader(); err != nil {
+ if err == io.EOF {
+ // This is an empty repo. It's OK.
+ return packfile, nil
+ }
+ return nil, err
+ }
+
+	ver, err := pr.readUint32()
+ if err != nil {
+ return nil, err
+ }
+
+	count, err := pr.readUint32()
+ if err != nil {
+ return nil, err
+ }
+
+ packfile.Version = uint32(ver)
+ packfile.ObjectCount = int(count)
+
+ if packfile.ObjectCount > pr.MaxObjectsLimit {
+ return nil, NewError("too many objects %d, limit is %d",
+ packfile.ObjectCount, pr.MaxObjectsLimit)
+ }
+
+ if err := pr.readObjects(packfile); err != nil {
+ return nil, err
+ }
+
+ packfile.Size = int64(pr.r.Pos())
+
+ return packfile, nil
+}
+
+func (pr *PackfileReader) validateHeader() error {
+	var header = make([]byte, 4)
+	if _, err := io.ReadFull(pr.r, header); err != nil {
+		return err
+ }
+
+ if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) {
+ return NewError("Pack file does not start with 'PACK'")
+ }
+
+ return nil
+}
+
+func (pr *PackfileReader) readUint32() (uint32, error) {
+ var value uint32
+ if err := binary.Read(pr.r, binary.BigEndian, &value); err != nil {
+ return 0, err
+ }
+
+ return value, nil
+}
+
+func (pr *PackfileReader) readObjects(packfile *Packfile) error {
+ // This code has 50-80 µs of overhead per object not counting zlib inflation.
+ // Together with zlib inflation, it's 400-410 µs for small objects.
+ // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB,
+ // of which 12-20 % is _not_ zlib inflation (ie. is our code).
+
+ for i := 0; i < packfile.ObjectCount; i++ {
+		var pos = pr.Pos()
+		obj, err := pr.readObject(packfile)
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+
+			return err
+		}
+
+		pr.offsets[pos] = obj.hash
+ }
+
+ return nil
+}
+
+func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) {
+ o, err := newObjectReader(pr, packfile, pr.MaxObjectSize)
+ if err != nil {
+ return nil, err
+ }
+
+ switch o.typ {
+ case rawREFDelta:
+ err = o.readREFDelta()
+ case rawOFSDelta:
+ err = o.readOFSDelta()
+ case rawCommit, rawTree, rawBlob, rawTag:
+ err = o.readObject()
+ default:
+ err = NewError("Invalid git object tag %q", o.typ)
+ }
+
+ if err != nil {
+ return nil, err
+ }
+
+ return o, err
+}
+
+func (pr *PackfileReader) Pos() int { return pr.r.Pos() }
+
+type objectReader struct {
+ pr *PackfileReader
+ pf *Packfile
+ maxSize uint64
+
+ hash Hash
+ steps int
+ typ int8
+ size uint64
+}
+
+func newObjectReader(pr *PackfileReader, pf *Packfile, maxSize int) (*objectReader, error) {
+ o := &objectReader{pr: pr, pf: pf, maxSize: uint64(maxSize)}
+
+ var buf [1]byte
+ if _, err := o.Read(buf[:]); err != nil {
+ return nil, err
+ }
+
+ o.typ = int8((buf[0] >> 4) & 7)
+ o.size = uint64(buf[0] & 15)
+ o.steps++ // byte we just read to get `o.typ` and `o.size`
+
+ var shift uint = 4
+ for buf[0]&0x80 == 0x80 {
+ if _, err := o.Read(buf[:]); err != nil {
+ return nil, err
+ }
+
+ o.size += uint64(buf[0]&0x7f) << shift
+ o.steps++ // byte we just read to update `o.size`
+ shift += 7
+ }
+
+ return o, nil
+}
+
+func (o *objectReader) readREFDelta() error {
+ var ref Hash
+ if _, err := o.Read(ref[:]); err != nil {
+ return err
+ }
+
+ buf, err := o.inflate()
+ if err != nil {
+ return err
+ }
+
+ referenced, ok := o.pr.objects[ref]
+ if !ok {
+		o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: ref, delta: buf})
+	} else {
+		patched := PatchDelta(referenced.bytes, buf)
+ if patched == nil {
+ return NewError("error while patching %x", ref)
+ }
+
+ o.typ = referenced.typ
+ err = o.addObject(patched)
+ if err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
+func decodeOffset(src io.ByteReader, steps int) (int, error) {
+ b, err := src.ReadByte()
+ if err != nil {
+ return 0, err
+ }
+ var offset = int(b & 0x7f)
+ for (b & 0x80) != 0 {
+		offset++ // adds the 2^7 + 2^14 + ... + 2^(7*(n-1)) terms of the offset encoding (see doc.go)
+ b, err = src.ReadByte()
+ if err != nil {
+ return 0, err
+ }
+
+ offset = (offset << 7) + int(b&0x7f)
+ }
+
+ // offset needs to be aware of the bytes we read for `o.typ` and `o.size`
+ offset += steps
+ return -offset, nil
+}
+
+func (o *objectReader) readOFSDelta() error {
+ var pos = o.pr.Pos()
+
+ // read negative offset
+ offset, err := decodeOffset(o.pr.r, o.steps)
+ if err != nil {
+ return err
+ }
+
+ buf, err := o.inflate()
+ if err != nil {
+ return err
+ }
+
+ ref := o.pr.offsets[pos+offset]
+ referenced, ok := o.pr.objects[ref]
+ if !ok {
+ return NewError("can't find a pack entry at %d", pos+offset)
+ }
+
+ patched := PatchDelta(referenced.bytes, buf)
+ if patched == nil {
+ return NewError("error while patching %q", ref)
+ }
+
+ o.typ = referenced.typ
+ err = o.addObject(patched)
+ if err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (o *objectReader) readObject() error {
+ buf, err := o.inflate()
+ if err != nil {
+ return err
+ }
+
+ return o.addObject(buf)
+}
+
+func (o *objectReader) addObject(bytes []byte) error {
+ var hash Hash
+
+ switch o.typ {
+ case rawCommit:
+ c, err := ParseCommit(bytes)
+ if err != nil {
+ return err
+ }
+ o.pf.Commits[c.Hash()] = c
+ hash = c.Hash()
+ case rawTree:
+ c, err := ParseTree(bytes)
+ if err != nil {
+ return err
+ }
+ o.pf.Trees[c.Hash()] = c
+ hash = c.Hash()
+ case rawBlob:
+ c, err := ParseBlob(bytes)
+ if err != nil {
+ return err
+ }
+ o.pf.Blobs[c.Hash()] = c
+ hash = c.Hash()
+
+ if o.pr.contentCallback != nil {
+ o.pr.contentCallback(hash, bytes)
+ }
+ }
+
+ o.pr.objects[hash] = packfileObject{bytes: bytes, typ: o.typ}
+ o.hash = hash
+
+ return nil
+}
+
+func (o *objectReader) inflate() ([]byte, error) {
+ zr, err := zlib.NewReader(o.pr.r)
+ if err != nil {
+ if err == zlib.ErrHeader {
+ return nil, zlib.ErrHeader
+ }
+
+ return nil, NewError("error opening packfile's object zlib: %v", err)
+ }
+
+ defer zr.Close()
+
+ if o.size > o.maxSize {
+ return nil, NewError("the object size %q exceeed the allowed limit: %q",
+ o.size, o.maxSize)
+ }
+
+ var buf bytes.Buffer
+	if _, err := io.Copy(&buf, zr); err != nil { // also: io.CopyN(&buf, zr, int64(o.size))
+		return nil, NewError("error inflating packfile's object: %v", err)
+	}
+
+ var bufLen = buf.Len()
+ if bufLen != int(o.size) {
+ return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, bufLen)
+ }
+
+ return buf.Bytes(), nil
+}
+
+func (o *objectReader) Read(p []byte) (int, error) {
+ return o.pr.r.Read(p)
+}
+
+func (o *objectReader) ReadByte() (byte, error) {
+ return o.pr.r.ReadByte()
+}
+
+type ReaderError struct {
+ Msg string // description of error
+}
+
+func NewError(format string, args ...interface{}) error {
+ return &ReaderError{Msg: fmt.Sprintf(format, args...)}
+}
+
+func (e *ReaderError) Error() string { return e.Msg }
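
End to end, a minimal sketch of driving the reader over a pack file on disk
(the import path and file name are placeholders):

    package main

    import (
    	"fmt"
    	"os"

    	"example.com/git/formats/packfile" // hypothetical import path
    )

    func main() {
    	f, err := os.Open("objects/pack/pack-xxxx.pack") // placeholder name
    	if err != nil {
    		panic(err)
    	}
    	defer f.Close()

    	r, err := packfile.NewPackfileReader(f, nil)
    	if err != nil {
    		panic(err)
    	}

    	p, err := r.Read()
    	if err != nil {
    		panic(err)
    	}

    	fmt.Println(p.Version, p.ObjectCount, len(p.Commits), len(p.Trees), len(p.Blobs))
    }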
diff --git a/formats/packfile/reader_test.go b/formats/packfile/reader_test.go
new file mode 100644
index 0000000..e52cbc3
--- /dev/null
+++ b/formats/packfile/reader_test.go
@@ -0,0 +1,35 @@
+package packfile
+
+import (
+ "bytes"
+ "encoding/base64"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+var packFileWithEmptyObjects = "UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FSAlmXnEG2NYlhXAfHgdLb5Cy9WAM5Qpb/Lf7oZqArUpakyYtQjCoxZ5lmWXwwyuzJbHqAuYt2+x6QoyCyhYCKIa67lGameSLWvPh5JU0hsCg7vY1z6/D1d/8ptcHhprm3Kxz7KL/wUdOz96eqZXtPrX4CCeOOPU8Eb0iI7qG1jGGvXdxaNoPs/gHeNkp8lA94nKXMQUpDMRCA4X1OMXtBZpI3L3kiRXAtPcMkmWjgxZSYQultPEsv1oJHcPl/i38OVRC0IXF0lshrJorZEcpKmTEJYbA+B3aFzEmGfk9gpqJEsmnZNutXF71i1IURU/G0bsWWwJ6NnOdXH/Bx+73U1uH9LHn0HziOWa/w2tJfv302qftz6u0AtFh0wQdmeEJCNA9tdU7938WUuivEF5CczR11ZEsNnw54nKWMUQoCIRRF/13F+w/ijY6jQkTQd7SGpz5LyAxzINpNa2ljTbSEPu/hnNsbM4TJTzqyt561GdUUmJKT6K2MeiCVgnZWoY/iRo2vHVS0URrUS+e+dkqIEp11HMhh9IaUkRM6QXM/1waH9+uRS4X9TLHVOxxbz0/YlPDbu1OhfFmHWrYwjBKVNVaNsMIBUSy05N75vxeR8oXBiw8GoErCnwt4nKXMzQkCMRBA4XuqmLsgM2M2ZkAWwbNYQ341sCEQsyB2Yy02pmAJHt93eKOnBFpMNJqtl5CFxVIMomViomQSEWP2JrN3yq3j1jqc369HqQ1Oq4u93eHSR3nCoYZfH6/VlWUbWp2BNOPO7i1OsEFCVF+tZYz030XlsiRw6gPZ0jxaqwV4nDM0MDAzMVFIZHg299HsTRevOXt3a64rj7px6ElP8ERDiGQSQ2uoXe8RrcodS5on+J4/u8HjD4NDKFQyRS8tPx+rbgDt3yiEMHicAwAAAAABPnicS0wEAa4kMOACACTjBKdkZXici7aaYAUAA3gBYKoDeJwzNDAwMzFRSGR4NvfR7E0Xrzl7d2uuK4+6cehJT/BEQ4hkEsOELYFJvS2eX47UJdVttFQrenrmzQwA13MaiDd4nEtMBAEuAApMAlGtAXicMzQwMDMxUUhkeDb30exNF685e3drriuPunHoSU/wRACvkA258N/i8hVXx9CiAZzvFXNIhCuSFmE="
+
+func TestReadPackfile(t *testing.T) {
+ data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects)
+ d := bytes.NewReader(data)
+
+ r, err := NewPackfileReader(d, nil)
+ assert.Nil(t, err)
+
+ p, err := r.Read()
+ assert.Nil(t, err)
+
+ assert.Equal(t, 11, p.ObjectCount)
+ assert.Equal(t, 4, len(p.Commits))
+ assert.Equal(t, 4, len(p.Trees))
+}
+
+func TestReadPackfileInvalid(t *testing.T) {
+ r, err := NewPackfileReader(bytes.NewReader([]byte("dasdsadasas")), nil)
+ assert.Nil(t, err)
+
+ _, err = r.Read()
+ _, ok := err.(*ReaderError)
+ assert.True(t, ok)
+}