From d0a18ccd8eea3bdabc76d6dc5420af1ea30aae9f Mon Sep 17 00:00:00 2001 From: Máximo Cuadros Date: Fri, 23 Oct 2015 14:28:49 +0200 Subject: formats/packfile: type Hash instead of strings --- commons/hash.go | 17 -- commons/hash_test.go | 12 -- formats/packfile/delta.go | 93 +++++++++ formats/packfile/doc.go | 168 ++++++++++++++++ formats/packfile/objects.go | 237 +++++++++++++++++++++++ formats/packfile/objects_test.go | 88 +++++++++ formats/packfile/packfile.go | 82 ++++++++ formats/packfile/reader.go | 401 ++++++++++++++++++++++++++++++++++++++ formats/packfile/reader_test.go | 35 ++++ packfile/delta.go | 93 --------- packfile/doc.go | 168 ---------------- packfile/objects.go | 200 ------------------- packfile/objects_test.go | 50 ----- packfile/packfile.go | 83 -------- packfile/reader.go | 402 --------------------------------------- packfile/reader_test.go | 35 ---- remote_test.go | 3 +- 17 files changed, 1106 insertions(+), 1061 deletions(-) delete mode 100644 commons/hash.go delete mode 100644 commons/hash_test.go create mode 100644 formats/packfile/delta.go create mode 100644 formats/packfile/doc.go create mode 100644 formats/packfile/objects.go create mode 100644 formats/packfile/objects_test.go create mode 100644 formats/packfile/packfile.go create mode 100644 formats/packfile/reader.go create mode 100644 formats/packfile/reader_test.go delete mode 100644 packfile/delta.go delete mode 100644 packfile/doc.go delete mode 100644 packfile/objects.go delete mode 100644 packfile/objects_test.go delete mode 100644 packfile/packfile.go delete mode 100644 packfile/reader.go delete mode 100644 packfile/reader_test.go diff --git a/commons/hash.go b/commons/hash.go deleted file mode 100644 index 03fa53f..0000000 --- a/commons/hash.go +++ /dev/null @@ -1,17 +0,0 @@ -package commons - -import ( - "crypto/sha1" - "fmt" - "strconv" -) - -func GitHash(t string, b []byte) string { - h := []byte(t) - h = append(h, ' ') - h = strconv.AppendInt(h, int64(len(b)), 10) - h = append(h, 0) - h = append(h, b...) 
- - return fmt.Sprintf("%x", sha1.Sum(h)) -} diff --git a/commons/hash_test.go b/commons/hash_test.go deleted file mode 100644 index 79a22ea..0000000 --- a/commons/hash_test.go +++ /dev/null @@ -1,12 +0,0 @@ -package commons - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestCalculateHash(t *testing.T) { - assert.Equal(t, "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391", GitHash("blob", []byte(""))) - assert.Equal(t, "8ab686eafeb1f44702738c8b0f24f2567c36da6d", GitHash("blob", []byte("Hello, World!\n"))) -} diff --git a/formats/packfile/delta.go b/formats/packfile/delta.go new file mode 100644 index 0000000..86b556f --- /dev/null +++ b/formats/packfile/delta.go @@ -0,0 +1,93 @@ +package packfile + +const delta_size_min = 4 + +func deltaHeaderSize(b []byte) (uint, []byte) { + var size, j uint + var cmd byte + for { + cmd = b[j] + size |= (uint(cmd) & 0x7f) << (j * 7) + j++ + if uint(cmd)&0xb80 == 0 || j == uint(len(b)) { + break + } + } + return size, b[j:] +} + +func PatchDelta(src, delta []byte) []byte { + if len(delta) < delta_size_min { + return nil + } + size, delta := deltaHeaderSize(delta) + if size != uint(len(src)) { + return nil + } + size, delta = deltaHeaderSize(delta) + origSize := size + + dest := make([]byte, 0) + + // var offset uint + var cmd byte + for { + cmd = delta[0] + delta = delta[1:] + if (cmd & 0x80) != 0 { + var cp_off, cp_size uint + if (cmd & 0x01) != 0 { + cp_off = uint(delta[0]) + delta = delta[1:] + } + if (cmd & 0x02) != 0 { + cp_off |= uint(delta[0]) << 8 + delta = delta[1:] + } + if (cmd & 0x04) != 0 { + cp_off |= uint(delta[0]) << 16 + delta = delta[1:] + } + if (cmd & 0x08) != 0 { + cp_off |= uint(delta[0]) << 24 + delta = delta[1:] + } + + if (cmd & 0x10) != 0 { + cp_size = uint(delta[0]) + delta = delta[1:] + } + if (cmd & 0x20) != 0 { + cp_size |= uint(delta[0]) << 8 + delta = delta[1:] + } + if (cmd & 0x40) != 0 { + cp_size |= uint(delta[0]) << 16 + delta = delta[1:] + } + if cp_size == 0 { + cp_size = 0x10000 + } + if cp_off+cp_size < cp_off || + cp_off+cp_size > uint(len(src)) || + cp_size > origSize { + break + } + dest = append(dest, src[cp_off:cp_off+cp_size]...) + size -= cp_size + } else if cmd != 0 { + if uint(cmd) > origSize { + break + } + dest = append(dest, delta[0:uint(cmd)]...) + size -= uint(cmd) + delta = delta[uint(cmd):] + } else { + return nil + } + if size <= 0 { + break + } + } + return dest +} diff --git a/formats/packfile/doc.go b/formats/packfile/doc.go new file mode 100644 index 0000000..1fc28da --- /dev/null +++ b/formats/packfile/doc.go @@ -0,0 +1,168 @@ +package packfile + +// Code from: +// https://github.com/gitchain/gitchain/tree/master/git @ 4c2fabdf9 +// +// GIT pack format +// =============== +// +// == pack-*.pack files have the following format: +// +// - A header appears at the beginning and consists of the following: +// +// 4-byte signature: +// The signature is: {'P', 'A', 'C', 'K'} +// +// 4-byte version number (network byte order): +// GIT currently accepts version number 2 or 3 but +// generates version 2 only. +// +// 4-byte number of objects contained in the pack (network byte order) +// +// Observation: we cannot have more than 4G versions ;-) and +// more than 4G objects in a pack. 
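+//
+// Editor's sketch (an addition to this quoted spec, not part of the
+// original gitchain text): the header above is twelve bytes and can be
+// read with nothing but the standard library, which is essentially what
+// validateHeader and readInt32 in reader.go below boil down to:
+//
+//     var sig [4]byte
+//     if _, err := io.ReadFull(r, sig[:]); err != nil || string(sig[:]) != "PACK" {
+//         // not a packfile
+//     }
+//     var version, count uint32
+//     binary.Read(r, binary.BigEndian, &version) // network byte order
+//     binary.Read(r, binary.BigEndian, &count)
+//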
+// +// - The header is followed by number of object entries, each of +// which looks like this: +// +// (undeltified representation) +// n-byte type and length (3-bit type, (n-1)*7+4-bit length) +// compressed data +// +// (deltified representation) +// n-byte type and length (3-bit type, (n-1)*7+4-bit length) +// 20-byte base object name +// compressed delta data +// +// Observation: length of each object is encoded in a variable +// length format and is not constrained to 32-bit or anything. +// +// - The trailer records 20-byte SHA1 checksum of all of the above. +// +// == Original (version 1) pack-*.idx files have the following format: +// +// - The header consists of 256 4-byte network byte order +// integers. N-th entry of this table records the number of +// objects in the corresponding pack, the first byte of whose +// object name is less than or equal to N. This is called the +// 'first-level fan-out' table. +// +// - The header is followed by sorted 24-byte entries, one entry +// per object in the pack. Each entry is: +// +// 4-byte network byte order integer, recording where the +// object is stored in the packfile as the offset from the +// beginning. +// +// 20-byte object name. +// +// - The file is concluded with a trailer: +// +// A copy of the 20-byte SHA1 checksum at the end of +// corresponding packfile. +// +// 20-byte SHA1-checksum of all of the above. +// +// Pack Idx file: +// +// -- +--------------------------------+ +// fanout | fanout[0] = 2 (for example) |-. +// table +--------------------------------+ | +// | fanout[1] | | +// +--------------------------------+ | +// | fanout[2] | | +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | +// | fanout[255] = total objects |---. +// -- +--------------------------------+ | | +// main | offset | | | +// index | object name 00XXXXXXXXXXXXXXXX | | | +// table +--------------------------------+ | | +// | offset | | | +// | object name 00XXXXXXXXXXXXXXXX | | | +// +--------------------------------+<+ | +// .-| offset | | +// | | object name 01XXXXXXXXXXXXXXXX | | +// | +--------------------------------+ | +// | | offset | | +// | | object name 01XXXXXXXXXXXXXXXX | | +// | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | +// | | offset | | +// | | object name FFXXXXXXXXXXXXXXXX | | +// --| +--------------------------------+<--+ +// trailer | | packfile checksum | +// | +--------------------------------+ +// | | idxfile checksum | +// | +--------------------------------+ +// .-------. +// | +// Pack file entry: <+ +// +// packed object header: +// 1-byte size extension bit (MSB) +// type (next 3 bit) +// size0 (lower 4-bit) +// n-byte sizeN (as long as MSB is set, each 7-bit) +// size0..sizeN form 4+7+7+..+7 bit integer, size0 +// is the least significant part, and sizeN is the +// most significant part. +// packed object data: +// If it is not DELTA, then deflated bytes (the size above +// is the size before compression). +// If it is REF_DELTA, then +// 20-byte base object name SHA1 (the size above is the +// size of the delta data that follows). +// delta data, deflated. +// If it is OFS_DELTA, then +// n-byte offset (see below) interpreted as a negative +// offset from the type-byte of the header of the +// ofs-delta entry (the size above is the size of +// the delta data that follows). +// delta data, deflated. +// +// offset encoding: +// n bytes with MSB set in all but the last one. +// The offset is then the number constructed by +// concatenating the lower 7 bit of each byte, and +// for n >= 2 adding 2^7 + 2^14 + ... 
+ 2^(7*(n-1)) +// to the result. +// +// +// +// == Version 2 pack-*.idx files support packs larger than 4 GiB, and +// have some other reorganizations. They have the format: +// +// - A 4-byte magic number '\377tOc' which is an unreasonable +// fanout[0] value. +// +// - A 4-byte version number (= 2) +// +// - A 256-entry fan-out table just like v1. +// +// - A table of sorted 20-byte SHA1 object names. These are +// packed together without offset values to reduce the cache +// footprint of the binary search for a specific object name. +// +// - A table of 4-byte CRC32 values of the packed object data. +// This is new in v2 so compressed data can be copied directly +// from pack to pack during repacking without undetected +// data corruption. +// +// - A table of 4-byte offset values (in network byte order). +// These are usually 31-bit pack file offsets, but large +// offsets are encoded as an index into the next table with +// the msbit set. +// +// - A table of 8-byte offset entries (empty for pack files less +// than 2 GiB). Pack files are organized with heavily used +// objects toward the front, so most object references should +// not need to refer to this table. +// +// - The same trailer as a v1 pack file: +// +// A copy of the 20-byte SHA1 checksum at the end of +// corresponding packfile. +// +// 20-byte SHA1-checksum of all of the above. +// +// From: +// https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt diff --git a/formats/packfile/objects.go b/formats/packfile/objects.go new file mode 100644 index 0000000..4c7ee75 --- /dev/null +++ b/formats/packfile/objects.go @@ -0,0 +1,237 @@ +package packfile + +import ( + "bytes" + "crypto/sha1" + "encoding/hex" + "fmt" + "strconv" + "time" +) + +type ObjectType string + +const ( + CommitObject ObjectType = "commit" + TreeObject ObjectType = "tree" + BlobObject ObjectType = "blob" +) + +// Object generic object interface +type Object interface { + Type() ObjectType + Hash() Hash +} + +// Hash SHA1 hased content +type Hash [20]byte + +// ComputeHash compute the hash for a given objType and content +func ComputeHash(t ObjectType, content []byte) Hash { + h := []byte(t) + h = append(h, ' ') + h = strconv.AppendInt(h, int64(len(content)), 10) + h = append(h, 0) + h = append(h, content...) + + return Hash(sha1.Sum(h)) +} + +func (h Hash) String() string { + return hex.EncodeToString(h[:]) +} + +// Commit points to a single tree, marking it as what the project looked like +// at a certain point in time. It contains meta-information about that point +// in time, such as a timestamp, the author of the changes since the last +// commit, a pointer to the previous commit(s), etc. 
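+//
+// Editor's note, not part of the patch: ComputeHash above reproduces the
+// `git hash-object` rule of SHA-1 hashing "<type> <decimal length>\x00"
+// followed by the content, so for example
+//
+//     ComputeHash(BlobObject, []byte(""))
+//
+// yields e69de29bb2d1d6434b8b29ae775ad8c2e48c5391, the well-known empty
+// blob name asserted in TestComputeHash below.
+//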
+// http://schacon.github.io/gitbook/1_the_git_object_model.html +type Commit struct { + Tree Hash + Parents []Hash + Author Signature + Committer Signature + Message string + hash Hash +} + +// ParseCommit transform a byte slice into a Commit struct +func ParseCommit(b []byte) (*Commit, error) { + // b64 := base64.StdEncoding.EncodeToString(b) + //fmt.Printf("%q\n", b64) + + o := &Commit{hash: ComputeHash(CommitObject, b)} + + lines := bytes.Split(b, []byte{'\n'}) + for i := range lines { + if len(lines[i]) > 0 { + var err error + + split := bytes.SplitN(lines[i], []byte{' '}, 2) + switch string(split[0]) { + case "tree": + _, err = hex.Decode(o.Tree[:], split[1]) + case "parent": + var h Hash + _, err = hex.Decode(h[:], split[1]) + if err == nil { + o.Parents = append(o.Parents, h) + } + case "author": + o.Author = ParseSignature(split[1]) + case "committer": + o.Committer = ParseSignature(split[1]) + } + + if err != nil { + return nil, err + } + } else { + o.Message = string(bytes.Join(append(lines[i+1:]), []byte{'\n'})) + break + } + + } + + return o, nil +} + +// Type returns the object type +func (o *Commit) Type() ObjectType { + return CommitObject +} + +// Hash returns the computed hash of the commit +func (o *Commit) Hash() Hash { + return o.hash +} + +type Tree struct { + Entries []TreeEntry + hash Hash +} + +type TreeEntry struct { + Name string + Hash Hash +} + +func NewTree(body []byte) (*Tree, error) { + o := &Tree{hash: ComputeHash(TreeObject, body)} + + if len(body) == 0 { + return o, nil + } + + for { + split := bytes.SplitN(body, []byte{0}, 2) + split1 := bytes.SplitN(split[0], []byte{' '}, 2) + + entry := TreeEntry{} + entry.Name = string(split1[1]) + copy(entry.Hash[:], split[1][0:20]) + + o.Entries = append(o.Entries, entry) + + body = split[1][20:] + if len(split[1]) == 20 { + break + } + } + + return o, nil +} + +func (o *Tree) Type() ObjectType { + return TreeObject +} + +func (o *Tree) Hash() Hash { + return o.hash +} + +type Blob struct { + Len int + hash Hash +} + +func NewBlob(b []byte) (*Blob, error) { + return &Blob{ + Len: len(b), + hash: ComputeHash(BlobObject, b), + }, nil +} + +func (o *Blob) Type() ObjectType { + return BlobObject +} + +func (o *Blob) Hash() Hash { + return o.hash +} + +type ContentCallback func(hash Hash, content []byte) + +// Signature represents an action signed by a person +type Signature struct { + Name string + Email string + When time.Time +} + +// ParseSignature parse a byte slice returning a new action signature. 
+func ParseSignature(signature []byte) Signature { + ret := Signature{} + if len(signature) == 0 { + return ret + } + + from := 0 + state := 'n' // n: name, e: email, t: timestamp, z: timezone + for i := 0; ; i++ { + var c byte + var end bool + if i < len(signature) { + c = signature[i] + } else { + end = true + } + + switch state { + case 'n': + if c == '<' || end { + if i == 0 { + break + } + ret.Name = string(signature[from : i-1]) + state = 'e' + from = i + 1 + } + case 'e': + if c == '>' || end { + ret.Email = string(signature[from:i]) + i++ + state = 't' + from = i + 1 + } + case 't': + if c == ' ' || end { + t, err := strconv.ParseInt(string(signature[from:i]), 10, 64) + if err == nil { + ret.When = time.Unix(t, 0) + } + end = true + } + } + + if end { + break + } + } + + return ret +} + +func (s *Signature) String() string { + return fmt.Sprintf("%q <%s> @ %s", s.Name, s.Email, s.When) +} diff --git a/formats/packfile/objects_test.go b/formats/packfile/objects_test.go new file mode 100644 index 0000000..70f4ae6 --- /dev/null +++ b/formats/packfile/objects_test.go @@ -0,0 +1,88 @@ +package packfile + +import ( + "encoding/base64" + "time" + + . "gopkg.in/check.v1" +) + +type ObjectsSuite struct{} + +var _ = Suite(&ObjectsSuite{}) + +func (s *ObjectsSuite) TestComputeHash(c *C) { + hash := ComputeHash("blob", []byte("")) + c.Assert(hash.String(), Equals, "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + + hash = ComputeHash("blob", []byte("Hello, World!\n")) + c.Assert(hash.String(), Equals, "8ab686eafeb1f44702738c8b0f24f2567c36da6d") +} + +var CommitFixture = "dHJlZSBjMmQzMGZhOGVmMjg4NjE4ZjY1ZjZlZWQ2ZTE2OGUwZDUxNDg4NmY0CnBhcmVudCBiMDI5NTE3ZjYzMDBjMmRhMGY0YjY1MWI4NjQyNTA2Y2Q2YWFmNDVkCnBhcmVudCBiOGU0NzFmNThiY2JjYTYzYjA3YmRhMjBlNDI4MTkwNDA5YzJkYjQ3CmF1dGhvciBNw6F4aW1vIEN1YWRyb3MgPG1jdWFkcm9zQGdtYWlsLmNvbT4gMTQyNzgwMjQzNCArMDIwMApjb21taXR0ZXIgTcOheGltbyBDdWFkcm9zIDxtY3VhZHJvc0BnbWFpbC5jb20+IDE0Mjc4MDI0MzQgKzAyMDAKCk1lcmdlIHB1bGwgcmVxdWVzdCAjMSBmcm9tIGRyaXBvbGxlcy9mZWF0dXJlCgpDcmVhdGluZyBjaGFuZ2Vsb2c=" + +func (s *ObjectsSuite) TestParseCommit(c *C) { + data, _ := base64.StdEncoding.DecodeString(CommitFixture) + commit, err := ParseCommit(data) + c.Assert(err, IsNil) + + c.Assert(commit.Tree.String(), Equals, "c2d30fa8ef288618f65f6eed6e168e0d514886f4") + c.Assert(commit.Parents, HasLen, 2) + c.Assert(commit.Parents[0].String(), Equals, "b029517f6300c2da0f4b651b8642506cd6aaf45d") + c.Assert(commit.Parents[1].String(), Equals, "b8e471f58bcbca63b07bda20e428190409c2db47") + c.Assert(commit.Author.Email, Equals, "mcuadros@gmail.com") + c.Assert(commit.Author.Name, Equals, "Máximo Cuadros") + c.Assert(commit.Author.When.Unix(), Equals, int64(1427802434)) + c.Assert(commit.Committer.Email, Equals, "mcuadros@gmail.com") + c.Assert(commit.Message, Equals, "Merge pull request #1 from dripolles/feature\n\nCreating changelog") +} + +func (s *ObjectsSuite) TestCommitHash(c *C) { + data, _ := base64.StdEncoding.DecodeString(CommitFixture) + commit, err := ParseCommit(data) + + c.Assert(err, IsNil) + c.Assert(commit.Hash().String(), Equals, "a5b8b09e2f8fcb0bb99d3ccb0958157b40890d69") +} + +func (s *ObjectsSuite) TestParseSignature(c *C) { + cases := map[string]Signature{ + `Foo Bar 1257894000 +0100`: { + Name: "Foo Bar", + Email: "foo@bar.com", + When: time.Unix(1257894000, 0), + }, + `Foo Bar <> 1257894000 +0100`: { + Name: "Foo Bar", + Email: "", + When: time.Unix(1257894000, 0), + }, + ` <> 1257894000`: { + Name: "", + Email: "", + When: time.Unix(1257894000, 0), + }, + `Foo Bar `: { + Name: "Foo Bar", 
+ Email: "foo@bar.com", + When: time.Time{}, + }, + ``: { + Name: "", + Email: "", + When: time.Time{}, + }, + `<`: { + Name: "", + Email: "", + When: time.Time{}, + }, + } + + for raw, exp := range cases { + got := ParseSignature([]byte(raw)) + c.Assert(got.Name, Equals, exp.Name) + c.Assert(got.Email, Equals, exp.Email) + c.Assert(got.When.Unix(), Equals, exp.When.Unix()) + } +} diff --git a/formats/packfile/packfile.go b/formats/packfile/packfile.go new file mode 100644 index 0000000..d70f396 --- /dev/null +++ b/formats/packfile/packfile.go @@ -0,0 +1,82 @@ +package packfile + +import "fmt" + +type Packfile struct { + Version uint32 + Size int64 + ObjectCount int + Checksum []byte + Commits map[Hash]*Commit + Trees map[Hash]*Tree + Blobs map[Hash]*Blob +} + +func NewPackfile() *Packfile { + return &Packfile{ + Commits: make(map[Hash]*Commit, 0), + Trees: make(map[Hash]*Tree, 0), + Blobs: make(map[Hash]*Blob, 0), + } +} + +type BlobEntry struct { + path string + *Blob +} + +type SubtreeEntry struct { + path string + *Tree + TreeCh +} + +type treeEntry interface { + isTreeEntry() + Path() string +} + +func (b BlobEntry) isTreeEntry() {} +func (b BlobEntry) Path() string { return b.path } +func (b SubtreeEntry) isTreeEntry() {} +func (b SubtreeEntry) Path() string { return b.path } + +type TreeCh <-chan treeEntry + +func (p *Packfile) WalkCommit(commitHash Hash) (TreeCh, error) { + commit, ok := p.Commits[commitHash] + if !ok { + return nil, fmt.Errorf("Unable to find %q commit", commitHash) + } + + return p.WalkTree(p.Trees[commit.Tree]), nil +} + +func (p *Packfile) WalkTree(tree *Tree) TreeCh { + return p.walkTree(tree, "") +} + +func (p *Packfile) walkTree(tree *Tree, pathPrefix string) TreeCh { + ch := make(chan treeEntry) + + if tree == nil { + close(ch) + return ch + } + + go func() { + defer func() { + close(ch) + }() + for _, e := range tree.Entries { + path := pathPrefix + e.Name + if blob, ok := p.Blobs[e.Hash]; ok { + ch <- BlobEntry{path, blob} + } else if subtree, ok := p.Trees[e.Hash]; ok { + ch <- SubtreeEntry{path, subtree, p.walkTree(subtree, path+"/")} + } + } + }() + + return ch +} diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go new file mode 100644 index 0000000..ccf4822 --- /dev/null +++ b/formats/packfile/reader.go @@ -0,0 +1,401 @@ +package packfile + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + + "github.com/klauspost/compress/zlib" +) + +const MaxObjectsLimit = 1000000 + +var ErrMaxSize = fmt.Errorf("Max size exceeded for in-memory client") + +type TrackingByteReader struct { + r io.Reader + n, l int +} + +func (t *TrackingByteReader) Pos() int { return t.n } + +func (t *TrackingByteReader) Read(p []byte) (n int, err error) { + n, err = t.r.Read(p) + if err != nil { + return 0, err + } + t.n += n + if t.n >= t.l { + return n, ErrMaxSize + } + return n, err +} + +func (t *TrackingByteReader) ReadByte() (c byte, err error) { + var p [1]byte + n, err := t.r.Read(p[:]) + if err != nil { + return 0, err + } + if n > 1 { + return 0, fmt.Errorf("read %d bytes, should have read just 1", n) + } + t.n += n // n is 1 + return p[0], nil +} + +type PackfileReader struct { + r *TrackingByteReader + + objects map[Hash]packfileObject + offsets map[int]Hash + deltas []packfileDelta + + contentCallback ContentCallback +} + +type packfileObject struct { + bytes []byte + typ int8 +} + +type packfileDelta struct { + hash Hash + delta []byte +} + +func NewPackfileReader(r io.Reader, l int, fn ContentCallback) (*PackfileReader, error) { + return 
&PackfileReader{ + r: &TrackingByteReader{r: r, n: 0, l: l}, + objects: make(map[Hash]packfileObject, 0), + offsets: make(map[int]Hash, 0), + contentCallback: fn, + }, nil +} + +func (pr *PackfileReader) Pos() int { return pr.r.Pos() } + +func (pr *PackfileReader) Read() (*Packfile, error) { + packfile := NewPackfile() + + if err := pr.validateHeader(); err != nil { + if err == io.EOF { + // This is an empty repo. It's OK. + return packfile, nil + } + return nil, err + } + + ver, err := pr.readInt32() + if err != nil { + return nil, err + } + + count, err := pr.readInt32() + if err != nil { + return nil, err + } + + packfile.Version = uint32(ver) + packfile.ObjectCount = int(count) + + if packfile.ObjectCount > MaxObjectsLimit { + return nil, NewError("too many objects (%d)", packfile.ObjectCount) + } + + if err := pr.readObjects(packfile); err != nil { + return nil, err + } + + packfile.Size = int64(pr.r.Pos()) + + return packfile, nil +} + +func (pr *PackfileReader) validateHeader() error { + var header = make([]byte, 4) + if _, err := pr.r.Read(header); err != nil { + return err + } + + if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) { + return NewError("Pack file does not start with 'PACK'") + } + + return nil +} + +func (pr *PackfileReader) readInt32() (uint32, error) { + var value uint32 + if err := binary.Read(pr.r, binary.BigEndian, &value); err != nil { + return 0, err + } + + return value, nil +} + +func (pr *PackfileReader) readObjects(packfile *Packfile) error { + // This code has 50-80 µs of overhead per object not counting zlib inflation. + // Together with zlib inflation, it's 400-410 µs for small objects. + // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB, + // of which 12-20 % is _not_ zlib inflation (ie. is our code). 
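+
+	// Editor's note (added commentary, not in the original commit): each
+	// loop iteration below starts at an object entry header as described
+	// in doc.go. Worked example: a first byte 0x95 (1001 0101) has the
+	// MSB set, type (buf[0]>>4)&7 = 1 (OBJ_COMMIT) and size0 = 5; a
+	// second byte 0x03 (MSB clear) adds 3<<4, ending the varint with
+	// size = 53.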
+ + for i := 0; i < packfile.ObjectCount; i++ { + var pos = pr.Pos() + obj, err := pr.readObject(packfile) + if err != nil && err != io.EOF { + return err + } + + pr.offsets[pos] = obj.hash + + if err == io.EOF { + break + } + } + + return nil +} + +func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) { + o, err := newObjectReader(pr, packfile) + if err != nil { + return nil, err + } + + switch o.typ { + case OBJ_REF_DELTA: + err = o.readREFDelta() + case OBJ_OFS_DELTA: + err = o.readOFSDelta() + case OBJ_COMMIT, OBJ_TREE, OBJ_BLOB, OBJ_TAG: + err = o.readObject() + default: + err = NewError("Invalid git object tag %q", o.typ) + } + + if err != nil { + return nil, err + } + + return o, err +} + +const ( + OBJ_COMMIT = 1 + OBJ_TREE = 2 + OBJ_BLOB = 3 + OBJ_TAG = 4 + OBJ_OFS_DELTA = 6 + OBJ_REF_DELTA = 7 +) + +const SIZE_LIMIT uint64 = 1 << 32 // 4GB + +type objectReader struct { + pr *PackfileReader + pf *Packfile + hash Hash + steps int + + typ int8 + size uint64 +} + +func newObjectReader(pr *PackfileReader, pf *Packfile) (*objectReader, error) { + o := &objectReader{pr: pr, pf: pf} + + var buf [1]byte + if _, err := o.Read(buf[:]); err != nil { + return nil, err + } + + o.typ = int8((buf[0] >> 4) & 7) + o.size = uint64(buf[0] & 15) + o.steps++ // byte we just read to get `o.typ` and `o.size` + + var shift uint = 4 + for buf[0]&0x80 == 0x80 { + if _, err := o.Read(buf[:]); err != nil { + return nil, err + } + + o.size += uint64(buf[0]&0x7f) << shift + o.steps++ // byte we just read to update `o.size` + shift += 7 + } + + return o, nil +} + +func (o *objectReader) readREFDelta() error { + var ref Hash + if _, err := o.Read(ref[:]); err != nil { + return err + } + + buf, err := o.inflate() + if err != nil { + return err + } + + referenced, ok := o.pr.objects[ref] + if !ok { + o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: ref, delta: buf[:]}) + } else { + patched := PatchDelta(referenced.bytes, buf[:]) + if patched == nil { + return NewError("error while patching %x", ref) + } + o.typ = referenced.typ + err = o.addObject(patched) + if err != nil { + return err + } + } + + return nil +} + +func decodeOffset(src io.ByteReader, steps int) (int, error) { + b, err := src.ReadByte() + if err != nil { + return 0, err + } + var offset = int(b & 0x7f) + for (b & 0x80) != 0 { + offset += 1 // WHY? 
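+		// Editor's note answering the "WHY?" above (not in the original
+		// commit): per the offset encoding described in doc.go, the n-byte
+		// value is the concatenation of the 7-bit groups plus
+		// 2^7 + 2^14 + ... + 2^(7*(n-1)), so that shorter and longer
+		// encodings never denote the same offset. Incrementing before each
+		// shift supplies exactly those powers: ((offset+1)<<7) + low7 is
+		// the plain concatenation plus 2^7 at that step, and earlier
+		// increments are shifted again on each pass, yielding 2^14, 2^21,
+		// and so on.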
+ b, err = src.ReadByte() + if err != nil { + return 0, err + } + offset = (offset << 7) + int(b&0x7f) + } + // offset needs to be aware of the bytes we read for `o.typ` and `o.size` + offset += steps + return -offset, nil +} + +func (o *objectReader) readOFSDelta() error { + var pos = o.pr.Pos() + + // read negative offset + offset, err := decodeOffset(o.pr.r, o.steps) + if err != nil { + return err + } + + buf, err := o.inflate() + if err != nil { + return err + } + + ref := o.pr.offsets[pos+offset] + referenced, ok := o.pr.objects[ref] + if !ok { + return NewError("can't find a pack entry at %d", pos+offset) + } else { + patched := PatchDelta(referenced.bytes, buf) + if patched == nil { + return NewError("error while patching %q", ref) + } + o.typ = referenced.typ + err = o.addObject(patched) + if err != nil { + return err + } + } + + return nil +} + +func (o *objectReader) readObject() error { + buf, err := o.inflate() + if err != nil { + return err + } + + return o.addObject(buf) +} + +func (o *objectReader) addObject(bytes []byte) error { + var hash Hash + + switch o.typ { + case OBJ_COMMIT: + c, err := ParseCommit(bytes) + if err != nil { + return err + } + o.pf.Commits[c.Hash()] = c + hash = c.Hash() + case OBJ_TREE: + c, err := NewTree(bytes) + if err != nil { + return err + } + o.pf.Trees[c.Hash()] = c + hash = c.Hash() + case OBJ_BLOB: + c, err := NewBlob(bytes) + if err != nil { + return err + } + o.pf.Blobs[c.Hash()] = c + hash = c.Hash() + + if o.pr.contentCallback != nil { + o.pr.contentCallback(hash, bytes) + } + } + + o.pr.objects[hash] = packfileObject{bytes: bytes, typ: o.typ} + o.hash = hash + + return nil +} + +func (o *objectReader) inflate() ([]byte, error) { + zr, err := zlib.NewReader(o.pr.r) + if err != nil { + if err == zlib.ErrHeader { + return nil, zlib.ErrHeader + } else { + return nil, NewError("error opening packfile's object zlib: %v", err) + } + } + defer zr.Close() + + if o.size > SIZE_LIMIT { + return nil, NewError("the object size exceeed the allowed limit: %d", o.size) + } + + var buf bytes.Buffer + io.Copy(&buf, zr) // also: io.CopyN(&buf, zr, int64(o.size)) + + var bufLen = buf.Len() + if bufLen != int(o.size) { + return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, bufLen) + } + + return buf.Bytes(), nil +} + +func (o *objectReader) Read(p []byte) (int, error) { + return o.pr.r.Read(p) +} + +func (o *objectReader) ReadByte() (byte, error) { + return o.pr.r.ReadByte() +} + +type ReaderError struct { + Msg string // description of error +} + +func NewError(format string, args ...interface{}) error { + return &ReaderError{Msg: fmt.Sprintf(format, args...)} +} + +func (e *ReaderError) Error() string { return e.Msg } diff --git a/formats/packfile/reader_test.go b/formats/packfile/reader_test.go new file mode 100644 index 0000000..04f2948 --- /dev/null +++ b/formats/packfile/reader_test.go @@ -0,0 +1,35 @@ +package packfile + +import ( + "bytes" + "encoding/base64" + "testing" + + "github.com/stretchr/testify/assert" +) + +var packFileWithEmptyObjects = 
"UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FSAlmXnEG2NYlhXAfHgdLb5Cy9WAM5Qpb/Lf7oZqArUpakyYtQjCoxZ5lmWXwwyuzJbHqAuYt2+x6QoyCyhYCKIa67lGameSLWvPh5JU0hsCg7vY1z6/D1d/8ptcHhprm3Kxz7KL/wUdOz96eqZXtPrX4CCeOOPU8Eb0iI7qG1jGGvXdxaNoPs/gHeNkp8lA94nKXMQUpDMRCA4X1OMXtBZpI3L3kiRXAtPcMkmWjgxZSYQultPEsv1oJHcPl/i38OVRC0IXF0lshrJorZEcpKmTEJYbA+B3aFzEmGfk9gpqJEsmnZNutXF71i1IURU/G0bsWWwJ6NnOdXH/Bx+73U1uH9LHn0HziOWa/w2tJfv302qftz6u0AtFh0wQdmeEJCNA9tdU7938WUuivEF5CczR11ZEsNnw54nKWMUQoCIRRF/13F+w/ijY6jQkTQd7SGpz5LyAxzINpNa2ljTbSEPu/hnNsbM4TJTzqyt561GdUUmJKT6K2MeiCVgnZWoY/iRo2vHVS0URrUS+e+dkqIEp11HMhh9IaUkRM6QXM/1waH9+uRS4X9TLHVOxxbz0/YlPDbu1OhfFmHWrYwjBKVNVaNsMIBUSy05N75vxeR8oXBiw8GoErCnwt4nKXMzQkCMRBA4XuqmLsgM2M2ZkAWwbNYQ341sCEQsyB2Yy02pmAJHt93eKOnBFpMNJqtl5CFxVIMomViomQSEWP2JrN3yq3j1jqc369HqQ1Oq4u93eHSR3nCoYZfH6/VlWUbWp2BNOPO7i1OsEFCVF+tZYz030XlsiRw6gPZ0jxaqwV4nDM0MDAzMVFIZHg299HsTRevOXt3a64rj7px6ElP8ERDiGQSQ2uoXe8RrcodS5on+J4/u8HjD4NDKFQyRS8tPx+rbgDt3yiEMHicAwAAAAABPnicS0wEAa4kMOACACTjBKdkZXici7aaYAUAA3gBYKoDeJwzNDAwMzFRSGR4NvfR7E0Xrzl7d2uuK4+6cehJT/BEQ4hkEsOELYFJvS2eX47UJdVttFQrenrmzQwA13MaiDd4nEtMBAEuAApMAlGtAXicMzQwMDMxUUhkeDb30exNF685e3drriuPunHoSU/wRACvkA258N/i8hVXx9CiAZzvFXNIhCuSFmE=" + +func TestReadPackfile(t *testing.T) { + data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) + d := bytes.NewReader(data) + + r, err := NewPackfileReader(d, 8<<20, nil) + assert.Nil(t, err) + + p, err := r.Read() + assert.Nil(t, err) + + assert.Equal(t, 11, p.ObjectCount) + assert.Equal(t, 4, len(p.Commits)) + assert.Equal(t, 4, len(p.Trees)) +} + +func TestReadPackfileInvalid(t *testing.T) { + r, err := NewPackfileReader(bytes.NewReader([]byte("dasdsadasas")), 8<<20, nil) + assert.Nil(t, err) + + _, err = r.Read() + _, ok := err.(*ReaderError) + assert.True(t, ok) +} diff --git a/packfile/delta.go b/packfile/delta.go deleted file mode 100644 index 86b556f..0000000 --- a/packfile/delta.go +++ /dev/null @@ -1,93 +0,0 @@ -package packfile - -const delta_size_min = 4 - -func deltaHeaderSize(b []byte) (uint, []byte) { - var size, j uint - var cmd byte - for { - cmd = b[j] - size |= (uint(cmd) & 0x7f) << (j * 7) - j++ - if uint(cmd)&0xb80 == 0 || j == uint(len(b)) { - break - } - } - return size, b[j:] -} - -func PatchDelta(src, delta []byte) []byte { - if len(delta) < delta_size_min { - return nil - } - size, delta := deltaHeaderSize(delta) - if size != uint(len(src)) { - return nil - } - size, delta = deltaHeaderSize(delta) - origSize := size - - dest := make([]byte, 0) - - // var offset uint - var cmd byte - for { - cmd = delta[0] - delta = delta[1:] - if (cmd & 0x80) != 0 { - var cp_off, cp_size uint - if (cmd & 0x01) != 0 { - cp_off = uint(delta[0]) - delta = delta[1:] - } - if (cmd & 0x02) != 0 { - cp_off |= uint(delta[0]) << 8 - delta = delta[1:] - } - if (cmd & 0x04) != 0 { - cp_off |= uint(delta[0]) << 16 - delta = delta[1:] - } - if (cmd & 0x08) != 0 { - cp_off |= uint(delta[0]) << 24 - delta = delta[1:] - } - - if (cmd & 0x10) != 0 { - cp_size = uint(delta[0]) - delta = delta[1:] - } - if (cmd & 0x20) != 0 { - cp_size |= uint(delta[0]) << 8 - delta = delta[1:] - } - if (cmd & 0x40) != 0 { - cp_size |= uint(delta[0]) << 16 - delta = delta[1:] - } - if cp_size == 0 { - cp_size = 0x10000 - } - if cp_off+cp_size < cp_off || - cp_off+cp_size > uint(len(src)) || - cp_size > origSize { - break - } - dest = append(dest, src[cp_off:cp_off+cp_size]...) - size -= cp_size - } else if cmd != 0 { - if uint(cmd) > origSize { - break - } - dest = append(dest, delta[0:uint(cmd)]...) 
- size -= uint(cmd) - delta = delta[uint(cmd):] - } else { - return nil - } - if size <= 0 { - break - } - } - return dest -} diff --git a/packfile/doc.go b/packfile/doc.go deleted file mode 100644 index 1fc28da..0000000 --- a/packfile/doc.go +++ /dev/null @@ -1,168 +0,0 @@ -package packfile - -// Code from: -// https://github.com/gitchain/gitchain/tree/master/git @ 4c2fabdf9 -// -// GIT pack format -// =============== -// -// == pack-*.pack files have the following format: -// -// - A header appears at the beginning and consists of the following: -// -// 4-byte signature: -// The signature is: {'P', 'A', 'C', 'K'} -// -// 4-byte version number (network byte order): -// GIT currently accepts version number 2 or 3 but -// generates version 2 only. -// -// 4-byte number of objects contained in the pack (network byte order) -// -// Observation: we cannot have more than 4G versions ;-) and -// more than 4G objects in a pack. -// -// - The header is followed by number of object entries, each of -// which looks like this: -// -// (undeltified representation) -// n-byte type and length (3-bit type, (n-1)*7+4-bit length) -// compressed data -// -// (deltified representation) -// n-byte type and length (3-bit type, (n-1)*7+4-bit length) -// 20-byte base object name -// compressed delta data -// -// Observation: length of each object is encoded in a variable -// length format and is not constrained to 32-bit or anything. -// -// - The trailer records 20-byte SHA1 checksum of all of the above. -// -// == Original (version 1) pack-*.idx files have the following format: -// -// - The header consists of 256 4-byte network byte order -// integers. N-th entry of this table records the number of -// objects in the corresponding pack, the first byte of whose -// object name is less than or equal to N. This is called the -// 'first-level fan-out' table. -// -// - The header is followed by sorted 24-byte entries, one entry -// per object in the pack. Each entry is: -// -// 4-byte network byte order integer, recording where the -// object is stored in the packfile as the offset from the -// beginning. -// -// 20-byte object name. -// -// - The file is concluded with a trailer: -// -// A copy of the 20-byte SHA1 checksum at the end of -// corresponding packfile. -// -// 20-byte SHA1-checksum of all of the above. -// -// Pack Idx file: -// -// -- +--------------------------------+ -// fanout | fanout[0] = 2 (for example) |-. -// table +--------------------------------+ | -// | fanout[1] | | -// +--------------------------------+ | -// | fanout[2] | | -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | -// | fanout[255] = total objects |---. -// -- +--------------------------------+ | | -// main | offset | | | -// index | object name 00XXXXXXXXXXXXXXXX | | | -// table +--------------------------------+ | | -// | offset | | | -// | object name 00XXXXXXXXXXXXXXXX | | | -// +--------------------------------+<+ | -// .-| offset | | -// | | object name 01XXXXXXXXXXXXXXXX | | -// | +--------------------------------+ | -// | | offset | | -// | | object name 01XXXXXXXXXXXXXXXX | | -// | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | -// | | offset | | -// | | object name FFXXXXXXXXXXXXXXXX | | -// --| +--------------------------------+<--+ -// trailer | | packfile checksum | -// | +--------------------------------+ -// | | idxfile checksum | -// | +--------------------------------+ -// .-------. 
-// | -// Pack file entry: <+ -// -// packed object header: -// 1-byte size extension bit (MSB) -// type (next 3 bit) -// size0 (lower 4-bit) -// n-byte sizeN (as long as MSB is set, each 7-bit) -// size0..sizeN form 4+7+7+..+7 bit integer, size0 -// is the least significant part, and sizeN is the -// most significant part. -// packed object data: -// If it is not DELTA, then deflated bytes (the size above -// is the size before compression). -// If it is REF_DELTA, then -// 20-byte base object name SHA1 (the size above is the -// size of the delta data that follows). -// delta data, deflated. -// If it is OFS_DELTA, then -// n-byte offset (see below) interpreted as a negative -// offset from the type-byte of the header of the -// ofs-delta entry (the size above is the size of -// the delta data that follows). -// delta data, deflated. -// -// offset encoding: -// n bytes with MSB set in all but the last one. -// The offset is then the number constructed by -// concatenating the lower 7 bit of each byte, and -// for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) -// to the result. -// -// -// -// == Version 2 pack-*.idx files support packs larger than 4 GiB, and -// have some other reorganizations. They have the format: -// -// - A 4-byte magic number '\377tOc' which is an unreasonable -// fanout[0] value. -// -// - A 4-byte version number (= 2) -// -// - A 256-entry fan-out table just like v1. -// -// - A table of sorted 20-byte SHA1 object names. These are -// packed together without offset values to reduce the cache -// footprint of the binary search for a specific object name. -// -// - A table of 4-byte CRC32 values of the packed object data. -// This is new in v2 so compressed data can be copied directly -// from pack to pack during repacking without undetected -// data corruption. -// -// - A table of 4-byte offset values (in network byte order). -// These are usually 31-bit pack file offsets, but large -// offsets are encoded as an index into the next table with -// the msbit set. -// -// - A table of 8-byte offset entries (empty for pack files less -// than 2 GiB). Pack files are organized with heavily used -// objects toward the front, so most object references should -// not need to refer to this table. -// -// - The same trailer as a v1 pack file: -// -// A copy of the 20-byte SHA1 checksum at the end of -// corresponding packfile. -// -// 20-byte SHA1-checksum of all of the above. 
-// -// From: -// https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt diff --git a/packfile/objects.go b/packfile/objects.go deleted file mode 100644 index 6449808..0000000 --- a/packfile/objects.go +++ /dev/null @@ -1,200 +0,0 @@ -package packfile - -import ( - "bytes" - "encoding/hex" - "fmt" - "strconv" - "time" - - "gopkg.in/src-d/go-git.v2/commons" -) - -type Object interface { - Type() string - Hash() string -} - -type Hash []byte - -func (h Hash) String() string { - return hex.EncodeToString(h) -} - -type Commit struct { - Tree Hash - Parents []Hash - Author Signature - Committer Signature - Message string - hash string -} - -func NewCommit(b []byte) (*Commit, error) { - o := &Commit{hash: commons.GitHash("commit", b)} - - lines := bytes.Split(b, []byte{'\n'}) - for i := range lines { - if len(lines[i]) > 0 { - var err error - - split := bytes.SplitN(lines[i], []byte{' '}, 2) - switch string(split[0]) { - case "tree": - o.Tree = make([]byte, 20) - _, err = hex.Decode(o.Tree, split[1]) - case "parent": - h := make([]byte, 20) - _, err = hex.Decode(h, split[1]) - if err == nil { - o.Parents = append(o.Parents, h) - } - case "author": - o.Author = NewSignature(split[1]) - case "committer": - o.Committer = NewSignature(split[1]) - } - - if err != nil { - return nil, err - } - } else { - o.Message = string(bytes.Join(append(lines[i+1:]), []byte{'\n'})) - break - } - } - - return o, nil -} - -func (o *Commit) Type() string { - return "commit" -} - -func (o *Commit) Hash() string { - return o.hash -} - -type Signature struct { - Name string - Email string - When time.Time -} - -func NewSignature(signature []byte) Signature { - ret := Signature{} - if len(signature) == 0 { - return ret - } - - from := 0 - state := 'n' // n: name, e: email, t: timestamp, z: timezone - for i := 0; ; i++ { - var c byte - var end bool - if i < len(signature) { - c = signature[i] - } else { - end = true - } - - switch state { - case 'n': - if c == '<' || end { - if i == 0 { - break - } - ret.Name = string(signature[from : i-1]) - state = 'e' - from = i + 1 - } - case 'e': - if c == '>' || end { - ret.Email = string(signature[from:i]) - i++ - state = 't' - from = i + 1 - } - case 't': - if c == ' ' || end { - t, err := strconv.ParseInt(string(signature[from:i]), 10, 64) - if err == nil { - ret.When = time.Unix(t, 0) - } - end = true - } - } - - if end { - break - } - } - - return ret -} - -func (s *Signature) String() string { - return fmt.Sprintf("%q <%s> @ %s", s.Name, s.Email, s.When) -} - -type Tree struct { - Entries []TreeEntry - hash string -} - -type TreeEntry struct { - Name string - Hash string -} - -func NewTree(body []byte) (*Tree, error) { - o := &Tree{hash: commons.GitHash("tree", body)} - - if len(body) == 0 { - return o, nil - } - - for { - split := bytes.SplitN(body, []byte{0}, 2) - split1 := bytes.SplitN(split[0], []byte{' '}, 2) - - o.Entries = append(o.Entries, TreeEntry{ - Name: string(split1[1]), - Hash: fmt.Sprintf("%x", split[1][0:20]), - }) - - body = split[1][20:] - if len(split[1]) == 20 { - break - } - } - - return o, nil -} - -func (o *Tree) Type() string { - return "tree" -} - -func (o *Tree) Hash() string { - return o.hash -} - -type Blob struct { - Len int - hash string -} - -func NewBlob(b []byte) (*Blob, error) { - return &Blob{Len: len(b), hash: commons.GitHash("blob", b)}, nil -} - -func (o *Blob) Type() string { - return "blob" -} - -func (o *Blob) Hash() string { - return o.hash -} - -type ContentCallback func(hash string, content []byte) diff 
--git a/packfile/objects_test.go b/packfile/objects_test.go deleted file mode 100644 index 93c348b..0000000 --- a/packfile/objects_test.go +++ /dev/null @@ -1,50 +0,0 @@ -package packfile - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -func TestSignature(t *testing.T) { - cases := map[string]Signature{ - `Foo Bar 1257894000 +0100`: { - Name: "Foo Bar", - Email: "foo@bar.com", - When: time.Unix(1257894000, 0), - }, - `Foo Bar <> 1257894000 +0100`: { - Name: "Foo Bar", - Email: "", - When: time.Unix(1257894000, 0), - }, - ` <> 1257894000`: { - Name: "", - Email: "", - When: time.Unix(1257894000, 0), - }, - `Foo Bar `: { - Name: "Foo Bar", - Email: "foo@bar.com", - When: time.Time{}, - }, - ``: { - Name: "", - Email: "", - When: time.Time{}, - }, - `<`: { - Name: "", - Email: "", - When: time.Time{}, - }, - } - - for raw, exp := range cases { - got := NewSignature([]byte(raw)) - assert.Equal(t, exp.Name, got.Name) - assert.Equal(t, exp.Email, got.Email) - assert.Equal(t, exp.When.Unix(), got.When.Unix()) - } -} diff --git a/packfile/packfile.go b/packfile/packfile.go deleted file mode 100644 index 11ef969..0000000 --- a/packfile/packfile.go +++ /dev/null @@ -1,83 +0,0 @@ -package packfile - -import "fmt" - -type Packfile struct { - Version uint32 - Size int64 - ObjectCount int - Checksum []byte - Commits map[string]*Commit - Trees map[string]*Tree - Blobs map[string]*Blob -} - -func NewPackfile() *Packfile { - return &Packfile{ - Commits: make(map[string]*Commit, 0), - Trees: make(map[string]*Tree, 0), - Blobs: make(map[string]*Blob, 0), - } -} - -type BlobEntry struct { - path string - *Blob -} - -type SubtreeEntry struct { - path string - *Tree - TreeCh -} - -type treeEntry interface { - isTreeEntry() - Path() string -} - -func (b BlobEntry) isTreeEntry() {} -func (b BlobEntry) Path() string { return b.path } -func (b SubtreeEntry) isTreeEntry() {} -func (b SubtreeEntry) Path() string { return b.path } - -type TreeCh <-chan treeEntry - -func (p *Packfile) WalkCommit(commitHash string) (TreeCh, error) { - commit, ok := p.Commits[commitHash] - if !ok { - return nil, fmt.Errorf("Unable to find %q commit", commitHash) - } - - treeHash := fmt.Sprintf("%x", string(commit.Tree)) - return p.WalkTree(p.Trees[treeHash]), nil -} - -func (p *Packfile) WalkTree(tree *Tree) TreeCh { - return p.walkTree(tree, "") -} - -func (p *Packfile) walkTree(tree *Tree, pathPrefix string) TreeCh { - ch := make(chan treeEntry) - - if tree == nil { - close(ch) - return ch - } - - go func() { - defer func() { - close(ch) - }() - for _, e := range tree.Entries { - path := pathPrefix + e.Name - if blob, ok := p.Blobs[e.Hash]; ok { - ch <- BlobEntry{path, blob} - } else if subtree, ok := p.Trees[e.Hash]; ok { - ch <- SubtreeEntry{path, subtree, p.walkTree(subtree, path+"/")} - } - } - }() - - return ch -} diff --git a/packfile/reader.go b/packfile/reader.go deleted file mode 100644 index e761654..0000000 --- a/packfile/reader.go +++ /dev/null @@ -1,402 +0,0 @@ -package packfile - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - - "github.com/klauspost/compress/zlib" -) - -const MaxObjectsLimit = 1000000 - -var ErrMaxSize = fmt.Errorf("Max size exceeded for in-memory client") - -type TrackingByteReader struct { - r io.Reader - n, l int -} - -func (t *TrackingByteReader) Pos() int { return t.n } - -func (t *TrackingByteReader) Read(p []byte) (n int, err error) { - n, err = t.r.Read(p) - if err != nil { - return 0, err - } - t.n += n - if t.n >= t.l { - return n, ErrMaxSize - } - return 
n, err -} - -func (t *TrackingByteReader) ReadByte() (c byte, err error) { - var p [1]byte - n, err := t.r.Read(p[:]) - if err != nil { - return 0, err - } - if n > 1 { - return 0, fmt.Errorf("read %d bytes, should have read just 1", n) - } - t.n += n // n is 1 - return p[0], nil -} - -type PackfileReader struct { - r *TrackingByteReader - - objects map[string]packfileObject - offsets map[int]string - deltas []packfileDelta - - contentCallback ContentCallback -} - -type packfileObject struct { - bytes []byte - typ int8 -} - -type packfileDelta struct { - hash string - delta []byte -} - -func NewPackfileReader(r io.Reader, l int, fn ContentCallback) (*PackfileReader, error) { - return &PackfileReader{ - r: &TrackingByteReader{r: r, n: 0, l: l}, - objects: make(map[string]packfileObject, 0), - offsets: make(map[int]string, 0), - contentCallback: fn, - }, nil -} - -func (pr *PackfileReader) Pos() int { return pr.r.Pos() } - -func (pr *PackfileReader) Read() (*Packfile, error) { - packfile := NewPackfile() - - if err := pr.validateHeader(); err != nil { - if err == io.EOF { - // This is an empty repo. It's OK. - return packfile, nil - } - return nil, err - } - - ver, err := pr.readInt32() - if err != nil { - return nil, err - } - - count, err := pr.readInt32() - if err != nil { - return nil, err - } - - packfile.Version = uint32(ver) - packfile.ObjectCount = int(count) - - if packfile.ObjectCount > MaxObjectsLimit { - return nil, NewError("too many objects (%d)", packfile.ObjectCount) - } - - if err := pr.readObjects(packfile); err != nil { - return nil, err - } - - packfile.Size = int64(pr.r.Pos()) - - return packfile, nil -} - -func (pr *PackfileReader) validateHeader() error { - var header = make([]byte, 4) - if _, err := pr.r.Read(header); err != nil { - return err - } - - if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) { - return NewError("Pack file does not start with 'PACK'") - } - - return nil -} - -func (pr *PackfileReader) readInt32() (uint32, error) { - var value uint32 - if err := binary.Read(pr.r, binary.BigEndian, &value); err != nil { - return 0, err - } - - return value, nil -} - -func (pr *PackfileReader) readObjects(packfile *Packfile) error { - // This code has 50-80 µs of overhead per object not counting zlib inflation. - // Together with zlib inflation, it's 400-410 µs for small objects. - // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB, - // of which 12-20 % is _not_ zlib inflation (ie. is our code). 
- - for i := 0; i < packfile.ObjectCount; i++ { - var pos = pr.Pos() - obj, err := pr.readObject(packfile) - if err != nil && err != io.EOF { - return err - } - - pr.offsets[pos] = obj.hash - - if err == io.EOF { - break - } - } - - return nil -} - -func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) { - o, err := newObjectReader(pr, packfile) - if err != nil { - return nil, err - } - - switch o.typ { - case OBJ_REF_DELTA: - err = o.readREFDelta() - case OBJ_OFS_DELTA: - err = o.readOFSDelta() - case OBJ_COMMIT, OBJ_TREE, OBJ_BLOB, OBJ_TAG: - err = o.readObject() - default: - err = NewError("Invalid git object tag %q", o.typ) - } - - if err != nil { - return nil, err - } - - return o, err -} - -const ( - OBJ_COMMIT = 1 - OBJ_TREE = 2 - OBJ_BLOB = 3 - OBJ_TAG = 4 - OBJ_OFS_DELTA = 6 - OBJ_REF_DELTA = 7 -) - -const SIZE_LIMIT uint64 = 1 << 32 // 4GB - -type objectReader struct { - pr *PackfileReader - pf *Packfile - hash string - steps int - - typ int8 - size uint64 -} - -func newObjectReader(pr *PackfileReader, pf *Packfile) (*objectReader, error) { - o := &objectReader{pr: pr, pf: pf} - - var buf [1]byte - if _, err := o.Read(buf[:]); err != nil { - return nil, err - } - - o.typ = int8((buf[0] >> 4) & 7) - o.size = uint64(buf[0] & 15) - o.steps++ // byte we just read to get `o.typ` and `o.size` - - var shift uint = 4 - for buf[0]&0x80 == 0x80 { - if _, err := o.Read(buf[:]); err != nil { - return nil, err - } - - o.size += uint64(buf[0]&0x7f) << shift - o.steps++ // byte we just read to update `o.size` - shift += 7 - } - - return o, nil -} - -func (o *objectReader) readREFDelta() error { - var ref [20]byte - if _, err := o.Read(ref[:]); err != nil { - return err - } - - buf, err := o.inflate() - if err != nil { - return err - } - - refhash := fmt.Sprintf("%x", ref) - referenced, ok := o.pr.objects[refhash] - if !ok { - o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: refhash, delta: buf[:]}) - } else { - patched := PatchDelta(referenced.bytes, buf[:]) - if patched == nil { - return NewError("error while patching %x", ref) - } - o.typ = referenced.typ - err = o.addObject(patched) - if err != nil { - return err - } - } - - return nil -} - -func decodeOffset(src io.ByteReader, steps int) (int, error) { - b, err := src.ReadByte() - if err != nil { - return 0, err - } - var offset = int(b & 0x7f) - for (b & 0x80) != 0 { - offset += 1 // WHY? 
- b, err = src.ReadByte() - if err != nil { - return 0, err - } - offset = (offset << 7) + int(b&0x7f) - } - // offset needs to be aware of the bytes we read for `o.typ` and `o.size` - offset += steps - return -offset, nil -} - -func (o *objectReader) readOFSDelta() error { - var pos = o.pr.Pos() - - // read negative offset - offset, err := decodeOffset(o.pr.r, o.steps) - if err != nil { - return err - } - - buf, err := o.inflate() - if err != nil { - return err - } - - refhash := o.pr.offsets[pos+offset] - referenced, ok := o.pr.objects[refhash] - if !ok { - return NewError("can't find a pack entry at %d", pos+offset) - } else { - patched := PatchDelta(referenced.bytes, buf) - if patched == nil { - return NewError("error while patching %x", refhash) - } - o.typ = referenced.typ - err = o.addObject(patched) - if err != nil { - return err - } - } - - return nil -} - -func (o *objectReader) readObject() error { - buf, err := o.inflate() - if err != nil { - return err - } - - return o.addObject(buf) -} - -func (o *objectReader) addObject(bytes []byte) error { - var hash string - - switch o.typ { - case OBJ_COMMIT: - c, err := NewCommit(bytes) - if err != nil { - return err - } - o.pf.Commits[c.Hash()] = c - hash = c.Hash() - case OBJ_TREE: - c, err := NewTree(bytes) - if err != nil { - return err - } - o.pf.Trees[c.Hash()] = c - hash = c.Hash() - case OBJ_BLOB: - c, err := NewBlob(bytes) - if err != nil { - return err - } - o.pf.Blobs[c.Hash()] = c - hash = c.Hash() - - if o.pr.contentCallback != nil { - o.pr.contentCallback(hash, bytes) - } - } - - o.pr.objects[hash] = packfileObject{bytes: bytes, typ: o.typ} - o.hash = hash - - return nil -} - -func (o *objectReader) inflate() ([]byte, error) { - zr, err := zlib.NewReader(o.pr.r) - if err != nil { - if err == zlib.ErrHeader { - return nil, zlib.ErrHeader - } else { - return nil, NewError("error opening packfile's object zlib: %v", err) - } - } - defer zr.Close() - - if o.size > SIZE_LIMIT { - return nil, NewError("the object size exceeed the allowed limit: %d", o.size) - } - - var buf bytes.Buffer - io.Copy(&buf, zr) // also: io.CopyN(&buf, zr, int64(o.size)) - - var bufLen = buf.Len() - if bufLen != int(o.size) { - return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, bufLen) - } - - return buf.Bytes(), nil -} - -func (o *objectReader) Read(p []byte) (int, error) { - return o.pr.r.Read(p) -} - -func (o *objectReader) ReadByte() (byte, error) { - return o.pr.r.ReadByte() -} - -type ReaderError struct { - Msg string // description of error -} - -func NewError(format string, args ...interface{}) error { - return &ReaderError{Msg: fmt.Sprintf(format, args...)} -} - -func (e *ReaderError) Error() string { return e.Msg } diff --git a/packfile/reader_test.go b/packfile/reader_test.go deleted file mode 100644 index 04f2948..0000000 --- a/packfile/reader_test.go +++ /dev/null @@ -1,35 +0,0 @@ -package packfile - -import ( - "bytes" - "encoding/base64" - "testing" - - "github.com/stretchr/testify/assert" -) - -var packFileWithEmptyObjects = 
"UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FSAlmXnEG2NYlhXAfHgdLb5Cy9WAM5Qpb/Lf7oZqArUpakyYtQjCoxZ5lmWXwwyuzJbHqAuYt2+x6QoyCyhYCKIa67lGameSLWvPh5JU0hsCg7vY1z6/D1d/8ptcHhprm3Kxz7KL/wUdOz96eqZXtPrX4CCeOOPU8Eb0iI7qG1jGGvXdxaNoPs/gHeNkp8lA94nKXMQUpDMRCA4X1OMXtBZpI3L3kiRXAtPcMkmWjgxZSYQultPEsv1oJHcPl/i38OVRC0IXF0lshrJorZEcpKmTEJYbA+B3aFzEmGfk9gpqJEsmnZNutXF71i1IURU/G0bsWWwJ6NnOdXH/Bx+73U1uH9LHn0HziOWa/w2tJfv302qftz6u0AtFh0wQdmeEJCNA9tdU7938WUuivEF5CczR11ZEsNnw54nKWMUQoCIRRF/13F+w/ijY6jQkTQd7SGpz5LyAxzINpNa2ljTbSEPu/hnNsbM4TJTzqyt561GdUUmJKT6K2MeiCVgnZWoY/iRo2vHVS0URrUS+e+dkqIEp11HMhh9IaUkRM6QXM/1waH9+uRS4X9TLHVOxxbz0/YlPDbu1OhfFmHWrYwjBKVNVaNsMIBUSy05N75vxeR8oXBiw8GoErCnwt4nKXMzQkCMRBA4XuqmLsgM2M2ZkAWwbNYQ341sCEQsyB2Yy02pmAJHt93eKOnBFpMNJqtl5CFxVIMomViomQSEWP2JrN3yq3j1jqc369HqQ1Oq4u93eHSR3nCoYZfH6/VlWUbWp2BNOPO7i1OsEFCVF+tZYz030XlsiRw6gPZ0jxaqwV4nDM0MDAzMVFIZHg299HsTRevOXt3a64rj7px6ElP8ERDiGQSQ2uoXe8RrcodS5on+J4/u8HjD4NDKFQyRS8tPx+rbgDt3yiEMHicAwAAAAABPnicS0wEAa4kMOACACTjBKdkZXici7aaYAUAA3gBYKoDeJwzNDAwMzFRSGR4NvfR7E0Xrzl7d2uuK4+6cehJT/BEQ4hkEsOELYFJvS2eX47UJdVttFQrenrmzQwA13MaiDd4nEtMBAEuAApMAlGtAXicMzQwMDMxUUhkeDb30exNF685e3drriuPunHoSU/wRACvkA258N/i8hVXx9CiAZzvFXNIhCuSFmE=" - -func TestReadPackfile(t *testing.T) { - data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) - d := bytes.NewReader(data) - - r, err := NewPackfileReader(d, 8<<20, nil) - assert.Nil(t, err) - - p, err := r.Read() - assert.Nil(t, err) - - assert.Equal(t, 11, p.ObjectCount) - assert.Equal(t, 4, len(p.Commits)) - assert.Equal(t, 4, len(p.Trees)) -} - -func TestReadPackfileInvalid(t *testing.T) { - r, err := NewPackfileReader(bytes.NewReader([]byte("dasdsadasas")), 8<<20, nil) - assert.Nil(t, err) - - _, err = r.Read() - _, ok := err.(*ReaderError) - assert.True(t, ok) -} diff --git a/remote_test.go b/remote_test.go index 3426fce..a3c602d 100644 --- a/remote_test.go +++ b/remote_test.go @@ -1,8 +1,9 @@ package git import ( + "gopkg.in/src-d/go-git.v2/formats/packfile" + . 
"gopkg.in/check.v1" - "gopkg.in/src-d/go-git.v2/packfile" ) type SuiteRemote struct{} -- cgit From 27aa8cdd2431068606741a589383c02c149ea625 Mon Sep 17 00:00:00 2001 From: Máximo Cuadros Date: Fri, 23 Oct 2015 16:27:08 +0200 Subject: formats/packfile: cleanup --- .travis.yml | 2 +- formats/packfile/common.go | 39 ++++++++++ formats/packfile/common_test.go | 9 +++ formats/packfile/objects.go | 25 +++--- formats/packfile/objects_test.go | 28 +++++++ formats/packfile/reader.go | 160 ++++++++++++++++++--------------------- formats/packfile/reader_test.go | 4 +- remote_test.go | 2 +- 8 files changed, 168 insertions(+), 101 deletions(-) create mode 100644 formats/packfile/common.go create mode 100644 formats/packfile/common_test.go diff --git a/.travis.yml b/.travis.yml index 235ad5d..8bd2ecb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ script: - tail -n +2 tmp.out >> coverage.out - go test -v gopkg.in/src-d/go-git.v2/clients/http -covermode=count -coverprofile=tmp.out - tail -n +2 tmp.out >> coverage.out - - go test -v gopkg.in/src-d/go-git.v2/packfile -covermode=count -coverprofile=tmp.out + - go test -v gopkg.in/src-d/go-git.v2/formats/packfile -covermode=count -coverprofile=tmp.out - tail -n +2 tmp.out >> coverage.out - go test -v gopkg.in/src-d/go-git.v2/formats/pktline -covermode=count -coverprofile=tmp.out - tail -n +2 tmp.out >> coverage.out diff --git a/formats/packfile/common.go b/formats/packfile/common.go new file mode 100644 index 0000000..4a97dc7 --- /dev/null +++ b/formats/packfile/common.go @@ -0,0 +1,39 @@ +package packfile + +import ( + "fmt" + "io" +) + +type trackingReader struct { + r io.Reader + n int +} + +func (t *trackingReader) Pos() int { return t.n } + +func (t *trackingReader) Read(p []byte) (n int, err error) { + n, err = t.r.Read(p) + if err != nil { + return 0, err + } + + t.n += n + + return n, err +} + +func (t *trackingReader) ReadByte() (c byte, err error) { + var p [1]byte + n, err := t.r.Read(p[:]) + if err != nil { + return 0, err + } + + if n > 1 { + return 0, fmt.Errorf("read %d bytes, should have read just 1", n) + } + + t.n += n // n is 1 + return p[0], nil +} diff --git a/formats/packfile/common_test.go b/formats/packfile/common_test.go new file mode 100644 index 0000000..104a5d2 --- /dev/null +++ b/formats/packfile/common_test.go @@ -0,0 +1,9 @@ +package packfile + +import ( + "testing" + + . "gopkg.in/check.v1" +) + +func Test(t *testing.T) { TestingT(t) } diff --git a/formats/packfile/objects.go b/formats/packfile/objects.go index 4c7ee75..1077b5f 100644 --- a/formats/packfile/objects.go +++ b/formats/packfile/objects.go @@ -57,9 +57,6 @@ type Commit struct { // ParseCommit transform a byte slice into a Commit struct func ParseCommit(b []byte) (*Commit, error) { - // b64 := base64.StdEncoding.EncodeToString(b) - //fmt.Printf("%q\n", b64) - o := &Commit{hash: ComputeHash(CommitObject, b)} lines := bytes.Split(b, []byte{'\n'}) @@ -106,25 +103,29 @@ func (o *Commit) Hash() Hash { return o.hash } +// Tree is basically like a directory - it references a bunch of other trees +// and/or blobs (i.e. 
files and sub-directories) type Tree struct { Entries []TreeEntry hash Hash } +// TreeEntry represents an entry in a Tree: a file (blob) or a sub-directory (tree) type TreeEntry struct { Name string Hash Hash } -func NewTree(body []byte) (*Tree, error) { - o := &Tree{hash: ComputeHash(TreeObject, body)} +// ParseTree transforms a byte slice into a Tree struct +func ParseTree(b []byte) (*Tree, error) { + o := &Tree{hash: ComputeHash(TreeObject, b)} - if len(body) == 0 { + if len(b) == 0 { return o, nil } for { - split := bytes.SplitN(body, []byte{0}, 2) + split := bytes.SplitN(b, []byte{0}, 2) split1 := bytes.SplitN(split[0], []byte{' '}, 2) entry := TreeEntry{} @@ -133,7 +134,7 @@ func NewTree(body []byte) (*Tree, error) { o.Entries = append(o.Entries, entry) - body = split[1][20:] + b = split[1][20:] if len(split[1]) == 20 { break } @@ -142,30 +143,36 @@ func NewTree(body []byte) (*Tree, error) { return o, nil } +// Type returns the object type func (o *Tree) Type() ObjectType { return TreeObject } +// Hash returns the computed hash of the tree func (o *Tree) Hash() Hash { return o.hash } +// Blob is used to store file data - it is generally a file. type Blob struct { Len int hash Hash } -func NewBlob(b []byte) (*Blob, error) { +// ParseBlob transforms a byte slice into a Blob struct +func ParseBlob(b []byte) (*Blob, error) { return &Blob{ Len: len(b), hash: ComputeHash(BlobObject, b), }, nil } +// Type returns the object type func (o *Blob) Type() ObjectType { return BlobObject } +// Hash returns the computed hash of the blob func (o *Blob) Hash() Hash { return o.hash } diff --git a/formats/packfile/objects_test.go b/formats/packfile/objects_test.go index 70f4ae6..5952432 100644 --- a/formats/packfile/objects_test.go +++ b/formats/packfile/objects_test.go @@ -45,6 +45,34 @@ func (s *ObjectsSuite) TestCommitHash(c *C) { c.Assert(commit.Hash().String(), Equals, "a5b8b09e2f8fcb0bb99d3ccb0958157b40890d69") } +var TreeFixture = "MTAwNjQ0IC5naXRpZ25vcmUAMoWKrTw4PtH/Cg+b3yMdVKAMnogxMDA2NDQgQ0hBTkdFTE9HANP/U+BWSp+H2OhLbijlBg5RcAiqMTAwNjQ0IExJQ0VOU0UAwZK9aiTqGrAdeGhuQXyL3Hw9GX8xMDA2NDQgYmluYXJ5LmpwZwDVwPSrgRiXyt8DrsNYrmDSH5HFDTQwMDAwIGdvAKOXcadlH5f69ccuCCJNhX/DUTPbNDAwMDAganNvbgBah35qkGonQ61uRdmcF5NkKq+O2jQwMDAwIHBocABYavVn0Ltedx5JvdlDT14Pt20l+jQwMDAwIHZlbmRvcgDPSqOziXT7fYHzZ8CDD3141lq4aw==" + +func (s *ObjectsSuite) TestParseTree(c *C) { + data, _ := base64.StdEncoding.DecodeString(TreeFixture) + tree, err := ParseTree(data) + c.Assert(err, IsNil) + + c.Assert(tree.Entries, HasLen, 8) + c.Assert(tree.Entries[0].Name, Equals, ".gitignore") + c.Assert(tree.Entries[0].Hash.String(), Equals, "32858aad3c383ed1ff0a0f9bdf231d54a00c9e88") +} + +func (s *ObjectsSuite) TestTreeHash(c *C) { + data, _ := base64.StdEncoding.DecodeString(TreeFixture) + tree, err := ParseTree(data) + + c.Assert(err, IsNil) + c.Assert(tree.Hash().String(), Equals, "a8d315b2b1c615d43042c3a62402b8a54288cf5c") +} + +func (s *ObjectsSuite) TestBlobHash(c *C) { + blob, err := ParseBlob([]byte{'F', 'O', 'O'}) + c.Assert(err, IsNil) + + c.Assert(blob.Len, Equals, 3) + c.Assert(blob.Hash().String(), Equals, "d96c7efbfec2814ae0301ad054dc8d9fc416c9b5") +} + func (s *ObjectsSuite) TestParseSignature(c *C) { cases := map[string]Signature{ `Foo Bar 1257894000 +0100`: { diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go index ccf4822..d5f40b9 100644 --- a/formats/packfile/reader.go +++ b/formats/packfile/reader.go @@ -9,49 +9,34 @@ import ( "github.com/klauspost/compress/zlib" ) -const MaxObjectsLimit = 1000000 - -var ErrMaxSize = fmt.Errorf("Max size exceeded for in-memory
client") - -type TrackingByteReader struct { - r io.Reader - n, l int -} - -func (t *TrackingByteReader) Pos() int { return t.n } - -func (t *TrackingByteReader) Read(p []byte) (n int, err error) { - n, err = t.r.Read(p) - if err != nil { - return 0, err - } - t.n += n - if t.n >= t.l { - return n, ErrMaxSize - } - return n, err -} - -func (t *TrackingByteReader) ReadByte() (c byte, err error) { - var p [1]byte - n, err := t.r.Read(p[:]) - if err != nil { - return 0, err - } - if n > 1 { - return 0, fmt.Errorf("read %d bytes, should have read just 1", n) - } - t.n += n // n is 1 - return p[0], nil -} +const ( + DefaultMaxObjectsLimit = 1 << 20 + DefaultMaxObjectSize = 1 << 32 // 4GB + + rawCommit = 1 + rawTree = 2 + rawBlob = 3 + rawTag = 4 + rawOFSDelta = 6 + rawREFDelta = 7 +) type PackfileReader struct { - r *TrackingByteReader - - objects map[Hash]packfileObject - offsets map[int]Hash - deltas []packfileDelta - + // MaxObjectsLimit is the limit of objects to be load in the packfile, if + // a packfile excess this number an error is throw, the default value + // is defined by DefaultMaxObjectsLimit, usually the default limit is more + // than enough to work with any repository, working extremly big repositories + // where the number of object is bigger the memory can be exhausted. + MaxObjectsLimit int + + // MaxObjectSize is the maximum size in bytes, reading objects with a bigger + // size cause a error. The default value is defined by DefaultMaxObjectSize + MaxObjectSize int + + r *trackingReader + objects map[Hash]packfileObject + offsets map[int]Hash + deltas []packfileDelta contentCallback ContentCallback } @@ -65,17 +50,17 @@ type packfileDelta struct { delta []byte } -func NewPackfileReader(r io.Reader, l int, fn ContentCallback) (*PackfileReader, error) { +func NewPackfileReader(r io.Reader, fn ContentCallback) (*PackfileReader, error) { return &PackfileReader{ - r: &TrackingByteReader{r: r, n: 0, l: l}, + MaxObjectsLimit: DefaultMaxObjectsLimit, + MaxObjectSize: DefaultMaxObjectSize, + r: &trackingReader{r: r}, objects: make(map[Hash]packfileObject, 0), offsets: make(map[int]Hash, 0), contentCallback: fn, }, nil } -func (pr *PackfileReader) Pos() int { return pr.r.Pos() } - func (pr *PackfileReader) Read() (*Packfile, error) { packfile := NewPackfile() @@ -100,8 +85,9 @@ func (pr *PackfileReader) Read() (*Packfile, error) { packfile.Version = uint32(ver) packfile.ObjectCount = int(count) - if packfile.ObjectCount > MaxObjectsLimit { - return nil, NewError("too many objects (%d)", packfile.ObjectCount) + if packfile.ObjectCount > pr.MaxObjectsLimit { + return nil, NewError("too many objects %d, limit is %d", + packfile.ObjectCount, pr.MaxObjectsLimit) } if err := pr.readObjects(packfile); err != nil { @@ -159,17 +145,17 @@ func (pr *PackfileReader) readObjects(packfile *Packfile) error { } func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) { - o, err := newObjectReader(pr, packfile) + o, err := newObjectReader(pr, packfile, pr.MaxObjectSize) if err != nil { return nil, err } switch o.typ { - case OBJ_REF_DELTA: + case rawREFDelta: err = o.readREFDelta() - case OBJ_OFS_DELTA: + case rawOFSDelta: err = o.readOFSDelta() - case OBJ_COMMIT, OBJ_TREE, OBJ_BLOB, OBJ_TAG: + case rawCommit, rawTree, rawBlob, rawTag: err = o.readObject() default: err = NewError("Invalid git object tag %q", o.typ) @@ -182,29 +168,21 @@ func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) return o, err } -const ( - OBJ_COMMIT = 1 - OBJ_TREE = 2 - 
OBJ_BLOB = 3 - OBJ_TAG = 4 - OBJ_OFS_DELTA = 6 - OBJ_REF_DELTA = 7 -) - -const SIZE_LIMIT uint64 = 1 << 32 // 4GB +func (pr *PackfileReader) Pos() int { return pr.r.Pos() } type objectReader struct { - pr *PackfileReader - pf *Packfile + pr *PackfileReader + pf *Packfile + maxSize uint64 + hash Hash steps int - - typ int8 - size uint64 + typ int8 + size uint64 } -func newObjectReader(pr *PackfileReader, pf *Packfile) (*objectReader, error) { - o := &objectReader{pr: pr, pf: pf} +func newObjectReader(pr *PackfileReader, pf *Packfile, maxSize int) (*objectReader, error) { + o := &objectReader{pr: pr, pf: pf, maxSize: uint64(maxSize)} var buf [1]byte if _, err := o.Read(buf[:]); err != nil { @@ -248,6 +226,7 @@ func (o *objectReader) readREFDelta() error { if patched == nil { return NewError("error while patching %x", ref) } + o.typ = referenced.typ err = o.addObject(patched) if err != nil { @@ -265,13 +244,15 @@ func decodeOffset(src io.ByteReader, steps int) (int, error) { } var offset = int(b & 0x7f) for (b & 0x80) != 0 { - offset += 1 // WHY? + offset++ // WHY? b, err = src.ReadByte() if err != nil { return 0, err } + offset = (offset << 7) + int(b&0x7f) } + // offset needs to be aware of the bytes we read for `o.typ` and `o.size` offset += steps return -offset, nil @@ -295,16 +276,17 @@ func (o *objectReader) readOFSDelta() error { referenced, ok := o.pr.objects[ref] if !ok { return NewError("can't find a pack entry at %d", pos+offset) - } else { - patched := PatchDelta(referenced.bytes, buf) - if patched == nil { - return NewError("error while patching %q", ref) - } - o.typ = referenced.typ - err = o.addObject(patched) - if err != nil { - return err - } + } + + patched := PatchDelta(referenced.bytes, buf) + if patched == nil { + return NewError("error while patching %q", ref) + } + + o.typ = referenced.typ + err = o.addObject(patched) + if err != nil { + return err } return nil @@ -323,22 +305,22 @@ func (o *objectReader) addObject(bytes []byte) error { var hash Hash switch o.typ { - case OBJ_COMMIT: + case rawCommit: c, err := ParseCommit(bytes) if err != nil { return err } o.pf.Commits[c.Hash()] = c hash = c.Hash() - case OBJ_TREE: - c, err := NewTree(bytes) + case rawTree: + c, err := ParseTree(bytes) if err != nil { return err } o.pf.Trees[c.Hash()] = c hash = c.Hash() - case OBJ_BLOB: - c, err := NewBlob(bytes) + case rawBlob: + c, err := ParseBlob(bytes) if err != nil { return err } @@ -361,14 +343,16 @@ func (o *objectReader) inflate() ([]byte, error) { if err != nil { if err == zlib.ErrHeader { return nil, zlib.ErrHeader - } else { - return nil, NewError("error opening packfile's object zlib: %v", err) } + + return nil, NewError("error opening packfile's object zlib: %v", err) } + defer zr.Close() - if o.size > SIZE_LIMIT { - return nil, NewError("the object size exceeed the allowed limit: %d", o.size) + if o.size > o.maxSize { + return nil, NewError("the object size %d exceeds the allowed limit: %d", + o.size, o.maxSize) } var buf bytes.Buffer diff --git a/formats/packfile/reader_test.go b/formats/packfile/reader_test.go index 04f2948..e52cbc3 100644 --- a/formats/packfile/reader_test.go +++ b/formats/packfile/reader_test.go @@ -14,7 +14,7 @@ func TestReadPackfile(t *testing.T) { data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) d := bytes.NewReader(data) - r, err := NewPackfileReader(d, 8<<20, nil) + r, err := NewPackfileReader(d, nil) assert.Nil(t, err) p, err := r.Read() @@ -26,7 +26,7 @@ func TestReadPackfile(t *testing.T) { } func
TestReadPackfileInvalid(t *testing.T) { - r, err := NewPackfileReader(bytes.NewReader([]byte("dasdsadasas")), 8<<20, nil) + r, err := NewPackfileReader(bytes.NewReader([]byte("dasdsadasas")), nil) assert.Nil(t, err) _, err = r.Read() diff --git a/remote_test.go b/remote_test.go index a3c602d..881f629 100644 --- a/remote_test.go +++ b/remote_test.go @@ -40,7 +40,7 @@ func (s *SuiteRemote) TestFetchDefaultBranch(c *C) { reader, err := r.FetchDefaultBranch() c.Assert(err, IsNil) - pr, err := packfile.NewPackfileReader(reader, 8<<20, nil) + pr, err := packfile.NewPackfileReader(reader, nil) c.Assert(err, IsNil) pf, err := pr.Read() -- cgit
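For callers updating to the API above, a minimal usage sketch of the reader after this change; the pack file name and the error handling are illustrative only, not part of the patch:

package main

import (
	"fmt"
	"os"

	"gopkg.in/src-d/go-git.v2/formats/packfile"
)

func main() {
	// Any io.Reader works; a packfile on disk is used here (hypothetical path).
	f, err := os.Open("pack-example.pack")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer f.Close()

	// The old in-memory size argument is gone; limits are now exported
	// fields, preset to DefaultMaxObjectsLimit and DefaultMaxObjectSize.
	r, err := packfile.NewPackfileReader(f, nil)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Optionally tighten the defaults before reading.
	r.MaxObjectsLimit = 1 << 16

	p, err := r.Read()
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println(p.Version, p.ObjectCount, len(p.Commits), len(p.Trees))
}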
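On the "WHY?" in decodeOffset: the loop implements git's OFS-delta offset encoding, where each continuation byte contributes 7 bits and the accumulator is bumped by one per continuation byte so that shorter encodings never overlap the ranges of longer ones. A standalone sketch mirroring the patched loop (without the io.ByteReader plumbing, the steps adjustment, and the final negation; the byte sequence is made up for illustration):

package main

import "fmt"

// decode accumulates 7 bits per byte, using the MSB as a continuation flag,
// and adds one per continuation byte -- the offset++ the patch keeps.
func decode(b []byte) int {
	i := 0
	offset := int(b[i] & 0x7f)
	for b[i]&0x80 != 0 {
		offset++
		i++
		offset = (offset << 7) + int(b[i]&0x7f)
	}
	return offset
}

func main() {
	// ((0x11+1)<<7)+0x2e = 2350: the delta base starts 2350 bytes before
	// this object (the real decodeOffset then adds steps and negates).
	fmt.Println(decode([]byte{0x91, 0x2e}))
}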