author    | Máximo Cuadros Ortiz <mcuadros@gmail.com> | 2015-04-05 23:34:43 +0200
committer | Máximo Cuadros Ortiz <mcuadros@gmail.com> | 2015-04-06 04:12:04 +0200
commit    | 5d7303c49ac984a9fec60523f2d5297682e16646 (patch)
tree      | 53ac3a7eae7e271e58cc37ab1b7d2c27f3f2a9e5
download  | go-git-5d7303c49ac984a9fec60523f2d5297682e16646.tar.gz
some refactor in folders and crawler
-rw-r--r-- | client.go                | 155
-rw-r--r-- | packfile/delta.go        |  93
-rw-r--r-- | packfile/doc.go          | 168
-rw-r--r-- | packfile/objects.go      | 222
-rw-r--r-- | packfile/objects_test.go |  55
-rw-r--r-- | packfile/packfile.go     |  82
-rw-r--r-- | packfile/reader.go       | 412
-rw-r--r-- | packfile/reader_test.go  |  35
-rw-r--r-- | pktline/decoder.go       | 104
-rw-r--r-- | pktline/decoder_test.go  |  70
-rw-r--r-- | pktline/doc.go           |  56
-rw-r--r-- | pktline/encoder.go       |  56
-rw-r--r-- | pktline/encoder_test.go  |  34
13 files changed, 1542 insertions(+), 0 deletions(-)
diff --git a/client.go b/client.go
new file mode 100644
index 0000000..6a2c4e0
--- /dev/null
+++ b/client.go
@@ -0,0 +1,155 @@
+package git
+
+import (
+    "fmt"
+    "io"
+    "net/http"
+    "net/url"
+    "strings"
+
+    "github.com/tyba/opensource-search/sources/vcs/clients/git/pktline"
+
+    "github.com/sourcegraph/go-vcsurl"
+)
+
+type Client struct {
+    url    string
+    client *http.Client
+}
+
+func NewClient(url string) *Client {
+    vcs, _ := vcsurl.Parse(url)
+    return &Client{url: vcs.Link(), client: &http.Client{}}
+}
+
+func (c *Client) GetLastCommit() (string, error) {
+    req, _ := c.buildRequest(
+        "GET",
+        fmt.Sprintf("%s/info/refs?service=git-upload-pack", c.url),
+        nil,
+    )
+
+    res, err := c.client.Do(req)
+    if err != nil {
+        return "", err
+    }
+    defer res.Body.Close()
+
+    if res.StatusCode >= 400 {
+        return "", &NotFoundError{c.url}
+    }
+
+    d := pktline.NewDecoder(res.Body)
+
+    content, err := d.ReadAll()
+    if err != nil {
+        return "", err
+    }
+
+    var head string
+    for _, line := range content {
+        if line[0] == '#' {
+            continue
+        }
+
+        if head == "" {
+            head = c.getHEADFromLine(line)
+        } else {
+            commit, branch := c.getCommitAndBranch(line)
+            if branch == head {
+                return commit, nil
+            }
+        }
+    }
+
+    return "", nil
+}
+
+func (c *Client) getHEADFromLine(line string) string {
+    args, _ := url.ParseQuery(strings.Replace(line, " ", "&", -1))
+
+    link, ok := args["symref"]
+    if !ok {
+        return ""
+    }
+
+    parts := strings.Split(link[0], ":")
+    if len(parts) != 2 || parts[0] != "HEAD" {
+        return ""
+    }
+
+    return parts[1]
+}
+
+func (c *Client) getCommitAndBranch(line string) (string, string) {
+    parts := strings.Split(strings.Trim(line, " \n"), " ")
+    if len(parts) != 2 {
+        return "", ""
+    }
+
+    return parts[0], parts[1]
+}
+
+func (c *Client) GetPackFile(want string) (io.ReadCloser, error) {
+    e := pktline.NewEncoder()
+    e.AddLine(fmt.Sprintf("want %s", want))
+    e.AddFlush()
+    e.AddLine("done")
+
+    req, err := c.buildRequest(
+        "POST",
+        fmt.Sprintf("%s/git-upload-pack", c.url),
+        e.GetReader(),
+    )
+    if err != nil {
+        return nil, err
+    }
+
+    res, err := c.client.Do(req)
+    if err != nil {
+        return nil, err
+    }
+
+    // skip the leading "0008NAK\n" pkt-line before handing the pack stream
+    // to the caller
+    h := make([]byte, 8)
+    if _, err := res.Body.Read(h); err != nil {
+        return nil, err
+    }
+
+    return res.Body, nil
+}
+
+func (c *Client) buildRequest(method, url string, content *strings.Reader) (*http.Request, error) {
+    var req *http.Request
+    var err error
+    if content == nil {
+        req, err = http.NewRequest(method, url, nil)
+    } else {
+        req, err = http.NewRequest(method, url, content)
+    }
+
+    if err != nil {
+        return nil, err
+    }
+
+    req.Header.Add("User-Agent", "git/1.0")
+    req.Header.Add("Host", "github.com")
+
+    if content == nil {
+        req.Header.Add("Accept", "*/*")
+    } else {
+        req.Header.Add("Accept", "application/x-git-upload-pack-result")
+        req.Header.Add("Content-Type", "application/x-git-upload-pack-request")
+        req.Header.Add("Content-Length", fmt.Sprintf("%d", content.Len()))
+    }
+
+    return req, nil
+}
+
+type NotFoundError struct {
+    url string
+}
+
+func (e NotFoundError) Error() string {
+    return e.url
+}
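The two halves of Client compose into a complete smart-HTTP fetch: GetLastCommit discovers the commit HEAD points at via `info/refs`, and GetPackFile negotiates and streams the pack. A minimal sketch of how a caller might wire them into the packfile reader added below — the repository URL is hypothetical, error handling is abbreviated, and the import paths assume the layout used in this commit:

```go
package main

import (
	"fmt"

	"github.com/tyba/opensource-search/sources/vcs/clients/git"
	"github.com/tyba/opensource-search/sources/vcs/clients/git/packfile"
)

func main() {
	c := git.NewClient("https://github.com/tyba/some-repo") // hypothetical repo
	head, err := c.GetLastCommit()                          // hash HEAD points at
	if err != nil {
		panic(err)
	}

	body, err := c.GetPackFile(head) // negotiate and stream the packfile
	if err != nil {
		panic(err)
	}
	defer body.Close()

	pr, _ := packfile.NewPackfileReader(body, nil)
	pack, err := pr.Read() // parse commits, trees and blobs out of the pack
	if err != nil {
		panic(err)
	}

	fmt.Println("objects in pack:", pack.ObjectCount)
}
```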
diff --git a/packfile/delta.go b/packfile/delta.go
new file mode 100644
index 0000000..86b556f
--- /dev/null
+++ b/packfile/delta.go
@@ -0,0 +1,93 @@
+package packfile
+
+const delta_size_min = 4
+
+func deltaHeaderSize(b []byte) (uint, []byte) {
+    var size, j uint
+    var cmd byte
+    for {
+        cmd = b[j]
+        size |= (uint(cmd) & 0x7f) << (j * 7)
+        j++
+        if uint(cmd)&0x80 == 0 || j == uint(len(b)) {
+            break
+        }
+    }
+    return size, b[j:]
+}
+
+func PatchDelta(src, delta []byte) []byte {
+    if len(delta) < delta_size_min {
+        return nil
+    }
+    size, delta := deltaHeaderSize(delta)
+    if size != uint(len(src)) {
+        return nil
+    }
+    size, delta = deltaHeaderSize(delta)
+    origSize := size
+
+    dest := make([]byte, 0)
+
+    var cmd byte
+    for {
+        cmd = delta[0]
+        delta = delta[1:]
+        if (cmd & 0x80) != 0 {
+            var cp_off, cp_size uint
+            if (cmd & 0x01) != 0 {
+                cp_off = uint(delta[0])
+                delta = delta[1:]
+            }
+            if (cmd & 0x02) != 0 {
+                cp_off |= uint(delta[0]) << 8
+                delta = delta[1:]
+            }
+            if (cmd & 0x04) != 0 {
+                cp_off |= uint(delta[0]) << 16
+                delta = delta[1:]
+            }
+            if (cmd & 0x08) != 0 {
+                cp_off |= uint(delta[0]) << 24
+                delta = delta[1:]
+            }
+
+            if (cmd & 0x10) != 0 {
+                cp_size = uint(delta[0])
+                delta = delta[1:]
+            }
+            if (cmd & 0x20) != 0 {
+                cp_size |= uint(delta[0]) << 8
+                delta = delta[1:]
+            }
+            if (cmd & 0x40) != 0 {
+                cp_size |= uint(delta[0]) << 16
+                delta = delta[1:]
+            }
+            if cp_size == 0 {
+                cp_size = 0x10000
+            }
+            if cp_off+cp_size < cp_off ||
+                cp_off+cp_size > uint(len(src)) ||
+                cp_size > origSize {
+                break
+            }
+            dest = append(dest, src[cp_off:cp_off+cp_size]...)
+            size -= cp_size
+        } else if cmd != 0 {
+            if uint(cmd) > origSize {
+                break
+            }
+            dest = append(dest, delta[0:uint(cmd)]...)
+            size -= uint(cmd)
+            delta = delta[uint(cmd):]
+        } else {
+            return nil
+        }
+        if size <= 0 {
+            break
+        }
+    }
+    return dest
+}
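deltaHeaderSize reads the delta's source and target sizes as little-endian base-128 varints: each byte contributes 7 bits, and a clear MSB ends the sequence. A small in-package test sketches the arithmetic on made-up bytes (illustration only, not part of the commit):

```go
package packfile

import "testing"

// 0x91 = 1001_0001: MSB set, decoding continues; low bits contribute 0x11.
// 0x2e = 0010_1110: MSB clear, decoding stops; contributes 0x2e << 7.
// Expected size: 0x11 | 0x2e<<7 = 0x1711 = 5905.
func TestDeltaHeaderSizeExample(t *testing.T) {
	size, rest := deltaHeaderSize([]byte{0x91, 0x2e, 0xff})
	if size != 5905 {
		t.Fatalf("size = %d, want 5905", size)
	}
	if len(rest) != 1 || rest[0] != 0xff {
		t.Fatalf("rest = %v, want [0xff]", rest)
	}
}
```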
diff --git a/packfile/doc.go b/packfile/doc.go
new file mode 100644
index 0000000..1fc28da
--- /dev/null
+++ b/packfile/doc.go
@@ -0,0 +1,168 @@
+package packfile
+
+// Code from:
+// https://github.com/gitchain/gitchain/tree/master/git @ 4c2fabdf9
+//
+// GIT pack format
+// ===============
+//
+// == pack-*.pack files have the following format:
+//
+//   - A header appears at the beginning and consists of the following:
+//
+//     4-byte signature:
+//         The signature is: {'P', 'A', 'C', 'K'}
+//
+//     4-byte version number (network byte order):
+//         GIT currently accepts version number 2 or 3 but
+//         generates version 2 only.
+//
+//     4-byte number of objects contained in the pack (network byte order)
+//
+//     Observation: we cannot have more than 4G versions ;-) and
+//     more than 4G objects in a pack.
+//
+//   - The header is followed by number of object entries, each of
+//     which looks like this:
+//
+//     (undeltified representation)
+//     n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+//     compressed data
+//
+//     (deltified representation)
+//     n-byte type and length (3-bit type, (n-1)*7+4-bit length)
+//     20-byte base object name
+//     compressed delta data
+//
+//     Observation: length of each object is encoded in a variable
+//     length format and is not constrained to 32-bit or anything.
+//
+//   - The trailer records 20-byte SHA1 checksum of all of the above.
+//
+// == Original (version 1) pack-*.idx files have the following format:
+//
+//   - The header consists of 256 4-byte network byte order
+//     integers. N-th entry of this table records the number of
+//     objects in the corresponding pack, the first byte of whose
+//     object name is less than or equal to N. This is called the
+//     'first-level fan-out' table.
+//
+//   - The header is followed by sorted 24-byte entries, one entry
+//     per object in the pack. Each entry is:
+//
+//     4-byte network byte order integer, recording where the
+//     object is stored in the packfile as the offset from the
+//     beginning.
+//
+//     20-byte object name.
+//
+//   - The file is concluded with a trailer:
+//
+//     A copy of the 20-byte SHA1 checksum at the end of the
+//     corresponding packfile.
+//
+//     20-byte SHA1-checksum of all of the above.
+//
+// Pack Idx file:
+//
+//      --  +--------------------------------+
+// fanout   | fanout[0] = 2 (for example)    |-.
+// table    +--------------------------------+ |
+//          | fanout[1]                      | |
+//          +--------------------------------+ |
+//          | fanout[2]                      | |
+//          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
+//          | fanout[255] = total objects    |---.
+//      --  +--------------------------------+ | |
+// main     | offset                         | | |
+// index    | object name 00XXXXXXXXXXXXXXXX | | |
+// table    +--------------------------------+ | |
+//          | offset                         | | |
+//          | object name 00XXXXXXXXXXXXXXXX | | |
+//          +--------------------------------+<+ |
+//        .-| offset                         |   |
+//        | | object name 01XXXXXXXXXXXXXXXX |   |
+//        | +--------------------------------+   |
+//        | | offset                         |   |
+//        | | object name 01XXXXXXXXXXXXXXXX |   |
+//        | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~   |
+//        | | offset                         |   |
+//        | | object name FFXXXXXXXXXXXXXXXX |   |
+//     ---| +--------------------------------+<--+
+// trailer| | packfile checksum              |
+//        | +--------------------------------+
+//        | | idxfile checksum               |
+//        | +--------------------------------+
+//        .---------.
+//                  |
+// Pack file entry: <+
+//
+//     packed object header:
+//     1-byte size extension bit (MSB)
+//            type (next 3 bit)
+//            size0 (lower 4-bit)
+//     n-byte sizeN (as long as MSB is set, each 7-bit)
+//            size0..sizeN form 4+7+7+..+7 bit integer, size0
+//            is the least significant part, and sizeN is the
+//            most significant part.
+//     packed object data:
+//         If it is not DELTA, then deflated bytes (the size above
+//         is the size before compression).
+//         If it is REF_DELTA, then
+//           20-byte base object name SHA1 (the size above is the
+//           size of the delta data that follows).
+//           delta data, deflated.
+//         If it is OFS_DELTA, then
+//           n-byte offset (see below) interpreted as a negative
+//           offset from the type-byte of the header of the
+//           ofs-delta entry (the size above is the size of
+//           the delta data that follows).
+//           delta data, deflated.
+//
+//     offset encoding:
+//       n bytes with MSB set in all but the last one.
+//       The offset is then the number constructed by
+//       concatenating the lower 7 bit of each byte, and
+//       for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
+//       to the result.
+//
+// == Version 2 pack-*.idx files support packs larger than 4 GiB, and
+//    have some other reorganizations. They have the format:
+//
+//   - A 4-byte magic number '\377tOc' which is an unreasonable
+//     fanout[0] value.
+//
+//   - A 4-byte version number (= 2)
+//
+//   - A 256-entry fan-out table just like v1.
+//
+//   - A table of sorted 20-byte SHA1 object names. These are
+//     packed together without offset values to reduce the cache
+//     footprint of the binary search for a specific object name.
+//
+//   - A table of 4-byte CRC32 values of the packed object data.
+//     This is new in v2 so compressed data can be copied directly
+//     from pack to pack during repacking without undetected
+//     data corruption.
+//
+//   - A table of 4-byte offset values (in network byte order).
+//     These are usually 31-bit pack file offsets, but large
+//     offsets are encoded as an index into the next table with
+//     the msbit set.
+//
+//   - A table of 8-byte offset entries (empty for pack files less
+//     than 2 GiB). Pack files are organized with heavily used
+//     objects toward the front, so most object references should
+//     not need to refer to this table.
+//
+//   - The same trailer as a v1 pack file:
+//
+//     A copy of the 20-byte SHA1 checksum at the end of the
+//     corresponding packfile.
+//
+//     20-byte SHA1-checksum of all of the above.
+//
+// From:
+// https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt
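The packed object header documented above is what newObjectReader in reader.go decodes byte by byte from the stream. The same layout as a standalone sketch over a byte slice (hypothetical helper, not part of the commit):

```go
package packfile

// parseObjectHeader decodes the n-byte packed object header described
// above: a 3-bit type and 4-bit size0 in the first byte, then 7 more
// bits of size per continuation byte while the MSB is set. It returns
// the type, the inflated size and the number of header bytes consumed.
//
// Example: parseObjectHeader([]byte{0x95, 0x0a}) returns
// (1 /* OBJ_COMMIT */, 165, 2), since 0x95 carries type 1 and size0=5,
// and 0x0a contributes 10<<4 = 160.
func parseObjectHeader(b []byte) (typ int8, size uint64, n int) {
	typ = int8((b[0] >> 4) & 7)
	size = uint64(b[0] & 15)
	n = 1

	var shift uint = 4
	for b[n-1]&0x80 == 0x80 { // MSB set: another size byte follows
		size += uint64(b[n]&0x7f) << shift
		shift += 7
		n++
	}

	return
}
```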
diff --git a/packfile/objects.go b/packfile/objects.go
new file mode 100644
index 0000000..3c77a33
--- /dev/null
+++ b/packfile/objects.go
@@ -0,0 +1,222 @@
+package packfile
+
+import (
+    "bytes"
+    "compress/zlib"
+    "crypto/sha1"
+    "encoding/hex"
+    "fmt"
+    "io/ioutil"
+    "strconv"
+    "time"
+)
+
+type Object interface {
+    Type() string
+    Hash() string
+}
+
+type Hash []byte
+
+func (h Hash) String() string {
+    return hex.EncodeToString(h)
+}
+
+type Commit struct {
+    Tree      Hash
+    Parents   []Hash
+    Author    Signature
+    Committer Signature
+    Message   string
+    hash      string
+}
+
+func NewCommit(b []byte) (*Commit, error) {
+    o := &Commit{hash: calculateHash("commit", b)}
+
+    lines := bytes.Split(b, []byte{'\n'})
+    for i := range lines {
+        if len(lines[i]) > 0 {
+            var err error
+
+            split := bytes.SplitN(lines[i], []byte{' '}, 2)
+            switch string(split[0]) {
+            case "tree":
+                o.Tree = make([]byte, 20)
+                _, err = hex.Decode(o.Tree, split[1])
+            case "parent":
+                h := make([]byte, 20)
+                _, err = hex.Decode(h, split[1])
+                if err == nil {
+                    o.Parents = append(o.Parents, h)
+                }
+            case "author":
+                o.Author = NewSignature(split[1])
+            case "committer":
+                o.Committer = NewSignature(split[1])
+            }
+
+            if err != nil {
+                return nil, err
+            }
+        } else {
+            o.Message = string(bytes.Join(lines[i+1:], []byte{'\n'}))
+            break
+        }
+    }
+
+    return o, nil
+}
+
+func (o *Commit) Type() string {
+    return "commit"
+}
+
+func (o *Commit) Hash() string {
+    return o.hash
+}
+
+type Signature struct {
+    Name  string
+    Email string
+    When  time.Time
+}
+
+func NewSignature(signature []byte) Signature {
+    ret := Signature{}
+    if len(signature) == 0 {
+        return ret
+    }
+
+    from := 0
+    state := 'n' // n: name, e: email, t: timestamp, z: timezone
+    for i := 0; ; i++ {
+        var c byte
+        var end bool
+        if i < len(signature) {
+            c = signature[i]
+        } else {
+            end = true
+        }
+
+        switch state {
+        case 'n':
+            if c == '<' || end {
+                if i == 0 {
+                    break
+                }
+                ret.Name = string(signature[from : i-1])
+                state = 'e'
+                from = i + 1
+            }
+        case 'e':
+            if c == '>' || end {
+                ret.Email = string(signature[from:i])
+                i++
+                state = 't'
+                from = i + 1
+            }
+        case 't':
+            if c == ' ' || end {
+                t, err := strconv.ParseInt(string(signature[from:i]), 10, 64)
+                if err == nil {
+                    ret.When = time.Unix(t, 0)
+                }
+                end = true
+            }
+        }
+
+        if end {
+            break
+        }
+    }
+
+    return ret
+}
+
+func (s *Signature) String() string {
+    return fmt.Sprintf("%q <%s> @ %s", s.Name, s.Email, s.When)
+}
+
+type Tree struct {
+    Entries []TreeEntry
+    hash    string
+}
+
+type TreeEntry struct {
+    Name string
+    Hash string
+}
+
+func NewTree(b []byte) (*Tree, error) {
+    o := &Tree{hash: calculateHash("tree", b)}
+
+    if len(b) == 0 {
+        return o, nil
+    }
+
+    zr, e := zlib.NewReader(bytes.NewBuffer(b))
+    if e == nil {
+        defer zr.Close()
+        var err error
+        b, err = ioutil.ReadAll(zr)
+        if err != nil {
+            return nil, err
+        }
+    }
+
+    body := b
+    for {
+        split := bytes.SplitN(body, []byte{0}, 2)
+        split1 := bytes.SplitN(split[0], []byte{' '}, 2)
+
+        o.Entries = append(o.Entries, TreeEntry{
+            Name: string(split1[1]),
+            Hash: fmt.Sprintf("%x", split[1][0:20]),
+        })
+
+        body = split[1][20:]
+        if len(split[1]) == 20 {
+            break
+        }
+    }
+
+    return o, nil
+}
+
+func (o *Tree) Type() string {
+    return "tree"
+}
+
+func (o *Tree) Hash() string {
+    return o.hash
+}
+
+type Blob struct {
+    Len  int
+    hash string
+}
+
+func NewBlob(b []byte) (*Blob, error) {
+    return &Blob{Len: len(b), hash: calculateHash("blob", b)}, nil
+}
+
+func (o *Blob) Type() string {
+    return "blob"
+}
+
+func (o *Blob) Hash() string {
+    return o.hash
+}
+
+func calculateHash(objType string, content []byte) string {
+    header := []byte(objType)
+    header = append(header, ' ')
+    header = strconv.AppendInt(header, int64(len(content)), 10)
+    header = append(header, 0)
+    header = append(header, content...)
+
+    return fmt.Sprintf("%x", sha1.Sum(header))
+}
+
+type ContentCallback func(hash string, content []byte)
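calculateHash prepends the canonical git object header — type, a space, the decimal content length, and a NUL — before taking the SHA1, so its output matches `git hash-object`. For instance (the expected value below also appears in objects_test.go):

```go
package packfile

import "fmt"

func Example_calculateHash() {
	// Same digest `git hash-object` produces for this blob content.
	fmt.Println(calculateHash("blob", []byte("Hello, World!\n")))
	// Output: 8ab686eafeb1f44702738c8b0f24f2567c36da6d
}
```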
diff --git a/packfile/objects_test.go b/packfile/objects_test.go
new file mode 100644
index 0000000..07609e3
--- /dev/null
+++ b/packfile/objects_test.go
@@ -0,0 +1,55 @@
+package packfile
+
+import (
+    "testing"
+    "time"
+
+    "github.com/stretchr/testify/assert"
+)
+
+func TestCalculateHash(t *testing.T) {
+    assert.Equal(t, "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391", calculateHash("blob", []byte("")))
+    assert.Equal(t, "8ab686eafeb1f44702738c8b0f24f2567c36da6d", calculateHash("blob", []byte("Hello, World!\n")))
+}
+
+func TestSignature(t *testing.T) {
+    cases := map[string]Signature{
+        `Foo Bar <foo@bar.com> 1257894000 +0100`: {
+            Name:  "Foo Bar",
+            Email: "foo@bar.com",
+            When:  time.Unix(1257894000, 0),
+        },
+        `Foo Bar <> 1257894000 +0100`: {
+            Name:  "Foo Bar",
+            Email: "",
+            When:  time.Unix(1257894000, 0),
+        },
+        ` <> 1257894000`: {
+            Name:  "",
+            Email: "",
+            When:  time.Unix(1257894000, 0),
+        },
+        `Foo Bar <foo@bar.com>`: {
+            Name:  "Foo Bar",
+            Email: "foo@bar.com",
+            When:  time.Time{},
+        },
+        ``: {
+            Name:  "",
+            Email: "",
+            When:  time.Time{},
+        },
+        `<`: {
+            Name:  "",
+            Email: "",
+            When:  time.Time{},
+        },
+    }
+
+    for raw, exp := range cases {
+        got := NewSignature([]byte(raw))
+        assert.Equal(t, exp.Name, got.Name)
+        assert.Equal(t, exp.Email, got.Email)
+        assert.Equal(t, exp.When.Unix(), got.When.Unix())
+    }
+}
diff --git a/packfile/packfile.go b/packfile/packfile.go
new file mode 100644
index 0000000..e670cd0
--- /dev/null
+++ b/packfile/packfile.go
@@ -0,0 +1,82 @@
+package packfile
+
+import "fmt"
+
+type Packfile struct {
+    Version     uint32
+    ObjectCount int
+    Checksum    []byte
+    Commits     map[string]*Commit
+    Trees       map[string]*Tree
+    Blobs       map[string]*Blob
+}
+
+func NewPackfile() *Packfile {
+    return &Packfile{
+        Commits: make(map[string]*Commit, 0),
+        Trees:   make(map[string]*Tree, 0),
+        Blobs:   make(map[string]*Blob, 0),
+    }
+}
+
+type BlobEntry struct {
+    path string
+    *Blob
+}
+
+type SubtreeEntry struct {
+    path string
+    *Tree
+    TreeCh
+}
+
+type treeEntry interface {
+    isTreeEntry()
+    Path() string
+}
+
+func (b BlobEntry) isTreeEntry()    {}
+func (b BlobEntry) Path() string    { return b.path }
+func (b SubtreeEntry) isTreeEntry() {}
+func (b SubtreeEntry) Path() string { return b.path }
+
+type TreeCh <-chan treeEntry
+
+func (p *Packfile) WalkCommit(commitHash string) (TreeCh, error) {
+    commit, ok := p.Commits[commitHash]
+    if !ok {
+        return nil, fmt.Errorf("Unable to find %q commit", commitHash)
+    }
+
+    treeHash := fmt.Sprintf("%x", string(commit.Tree))
+    return p.WalkTree(p.Trees[treeHash]), nil
+}
+
+func (p *Packfile) WalkTree(tree *Tree) TreeCh {
+    return p.walkTree(tree, "")
+}
+
+func (p *Packfile) walkTree(tree *Tree, pathPrefix string) TreeCh {
+    ch := make(chan treeEntry)
+
+    if tree == nil {
+        close(ch)
+        return ch
+    }
+
+    go func() {
+        defer close(ch)
+
+        for _, e := range tree.Entries {
+            path := pathPrefix + e.Name
+            if blob, ok := p.Blobs[e.Hash]; ok {
+                ch <- BlobEntry{path, blob}
+            } else if subtree, ok := p.Trees[e.Hash]; ok {
+                ch <- SubtreeEntry{path, subtree, p.walkTree(subtree, path+"/")}
+            }
+        }
+    }()
+
+    return ch
+}
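WalkCommit flattens a commit's tree into a channel of entries; nested trees arrive as SubtreeEntry values carrying their own channel, so a consumer recurses by draining them. A sketch of such a consumer (hypothetical, not part of the commit):

```go
package packfile

import "fmt"

// printTree drains a TreeCh, printing blob paths and recursing into
// subtree channels as they arrive. Paths already carry the prefix that
// walkTree accumulates, so no bookkeeping is needed here.
func printTree(ch TreeCh) {
	for e := range ch {
		switch entry := e.(type) {
		case BlobEntry:
			fmt.Println(entry.Path(), entry.Hash())
		case SubtreeEntry:
			printTree(entry.TreeCh)
		}
	}
}
```

A caller would combine it with WalkCommit: `ch, err := pack.WalkCommit(hash)` followed by `printTree(ch)` when `err` is nil.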
diff --git a/packfile/reader.go b/packfile/reader.go
new file mode 100644
index 0000000..3725822
--- /dev/null
+++ b/packfile/reader.go
@@ -0,0 +1,412 @@
+package packfile
+
+import (
+    "bytes"
+    "compress/zlib"
+    "encoding/binary"
+    "errors"
+    "fmt"
+    "io"
+    "io/ioutil"
+)
+
+const MaxObjectsLimit = 1000000
+
+type PackfileReader struct {
+    r io.Reader
+
+    objects map[string]packfileObject
+    offsets map[int]string
+    deltas  []packfileDelta
+
+    // The give-back logic is explained in the giveBack method.
+    startedGivingBack bool
+    givebackBuffer    []byte
+    givenBack         io.Reader
+    contentCallback   ContentCallback
+}
+
+// Sometimes, after reading an object from a packfile, there will be
+// a few bytes with garbage data before the next object comes by.
+// There is no way of reliably noticing this until we try to read the
+// next object and fail because zlib parses an invalid header. We can't
+// notice earlier, because parsing the object's header (size, type, etc.)
+// doesn't fail.
+//
+// At that point, we want to give back to the reader the bytes we've read
+// since the last object, shift the input by one byte, and try again. That's
+// why we save the bytes we read on each object and, if it fails in the
+// middle of parsing it, those bytes will be read the next time you call
+// Read() on an objectReader derived from PackfileReader.readObject, until
+// they run out.
+func (pr *PackfileReader) giveBack() {
+    pr.givenBack = bytes.NewReader(pr.givebackBuffer)
+    pr.givebackBuffer = nil
+}
+
+type packfileObject struct {
+    bytes []byte
+    typ   int8
+}
+
+type packfileDelta struct {
+    hash  string
+    delta []byte
+}
+
+func NewPackfileReader(r io.Reader, contentCallback ContentCallback) (*PackfileReader, error) {
+    return &PackfileReader{
+        r:               r,
+        objects:         map[string]packfileObject{},
+        offsets:         map[int]string{},
+        contentCallback: contentCallback,
+    }, nil
+}
+
+func (p *PackfileReader) Read() (*Packfile, error) {
+    packfile := NewPackfile()
+
+    if err := p.validateSignature(); err != nil {
+        if err == io.EOF {
+            // This is an empty repo. It's OK.
+            return packfile, nil
+        }
+        return nil, err
+    }
+
+    var err error
+    ver, err := p.readInt32()
+    if err != nil {
+        return nil, err
+    }
+
+    count, err := p.readInt32()
+    if err != nil {
+        return nil, err
+    }
+
+    packfile.Version = uint32(ver)
+    packfile.ObjectCount = int(count)
+
+    if packfile.ObjectCount > MaxObjectsLimit {
+        return nil, NewError("too many objects (%d)", packfile.ObjectCount)
+    }
+
+    if err := p.readObjects(packfile); err != nil {
+        return nil, err
+    }
+
+    return packfile, nil
+}
+
+func (p *PackfileReader) validateSignature() error {
+    var signature = make([]byte, 4)
+    if _, err := p.r.Read(signature); err != nil {
+        return err
+    }
+
+    if !bytes.Equal(signature, []byte{'P', 'A', 'C', 'K'}) {
+        return NewError("Pack file does not start with 'PACK'")
+    }
+
+    return nil
+}
+
+func (p *PackfileReader) readInt32() (uint32, error) {
+    var value uint32
+    if err := binary.Read(p.r, binary.BigEndian, &value); err != nil {
+        return 0, err
+    }
+
+    return value, nil
+}
+
+func (p *PackfileReader) readObjects(packfile *Packfile) error {
+    p.startedGivingBack = true
+
+    offset := 12
+    for i := 0; i < packfile.ObjectCount; i++ {
+        r, err := p.readObject(packfile, offset)
+        if err != nil && err != io.EOF {
+            return err
+        }
+
+        p.offsets[offset] = r.hash
+        offset += r.counter + 4
+
+        unknownForBytes := make([]byte, 4)
+        p.r.Read(unknownForBytes)
+
+        if err == io.EOF {
+            break
+        }
+    }
+
+    return nil
+}
+
+const (
+    OBJ_COMMIT    = 1
+    OBJ_TREE      = 2
+    OBJ_BLOB      = 3
+    OBJ_TAG       = 4
+    OBJ_OFS_DELTA = 6
+    OBJ_REF_DELTA = 7
+)
+
+const SIZE_LIMIT uint64 = 1 << 32 // 4GB
+
+type objectReader struct {
+    pr     *PackfileReader
+    pf     *Packfile
+    offset int
+    hash   string
+
+    typ     int8
+    size    uint64
+    counter int
+}
+
+func (p *PackfileReader) readObject(packfile *Packfile, offset int) (*objectReader, error) {
+    o, err := newObjectReader(p, packfile, offset)
+    if err != nil {
+        return nil, err
+    }
+
+    switch o.typ {
+    case OBJ_REF_DELTA:
+        err = o.readREFDelta()
+    case OBJ_OFS_DELTA:
+        err = o.readOFSDelta()
+    case OBJ_COMMIT, OBJ_TREE, OBJ_BLOB, OBJ_TAG:
+        err = o.readObject()
+    default:
+        err = NewError("Invalid git object tag %q", o.typ)
+    }
+    if err == ErrZlibHeader {
+        p.giveBack()
+        io.CopyN(ioutil.Discard, p.r, 1)
+        return p.readObject(packfile, offset)
+    }
+
+    return o, err
+}
+
+func newObjectReader(pr *PackfileReader, pf *Packfile, offset int) (*objectReader, error) {
+    o := &objectReader{pr: pr, pf: pf, offset: offset}
+
+    buf := make([]byte, 1)
+    if _, err := o.Read(buf); err != nil {
+        return nil, err
+    }
+
+    o.typ = int8((buf[0] >> 4) & 7)
+    o.size = uint64(buf[0] & 15)
+
+    var shift uint = 4
+    for buf[0]&0x80 == 0x80 {
+        if _, err := o.Read(buf); err != nil {
+            return nil, err
+        }
+
+        o.size += uint64(buf[0]&0x7f) << shift
+        shift += 7
+    }
+
+    return o, nil
+}
+
+func (o *objectReader) readREFDelta() error {
+    ref := make([]byte, 20)
+    o.Read(ref)
+
+    buf, err := o.inflate()
+    if err != nil {
+        return err
+    }
+
+    refhash := fmt.Sprintf("%x", ref)
+    referenced, ok := o.pr.objects[refhash]
+    if !ok {
+        o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: refhash, delta: buf})
+    } else {
+        patched := PatchDelta(referenced.bytes, buf)
+        if patched == nil {
+            return NewError("error while patching %x", ref)
+        }
+
+        o.typ = referenced.typ
+        err = o.addObject(patched)
+        if err != nil {
+            return err
+        }
+    }
+
+    return nil
+}
+
+func (o *objectReader) readOFSDelta() error {
+    // read the negative offset
+    var b uint8
+    binary.Read(o, binary.BigEndian, &b)
+
+    var noffset int = int(b & 0x7f)
+    for (b & 0x80) != 0 {
+        noffset += 1
+        binary.Read(o, binary.BigEndian, &b)
+        noffset = (noffset << 7) + int(b&0x7f)
+    }
+
+    buf, err := o.inflate()
+    if err != nil {
+        return err
+    }
+
+    refhash := o.pr.offsets[o.offset-noffset]
+    referenced, ok := o.pr.objects[refhash]
+    if !ok {
+        return NewError("can't find a pack entry at %d", o.offset-noffset)
+    }
+
+    patched := PatchDelta(referenced.bytes, buf)
+    if patched == nil {
+        return NewError("error while patching %x", refhash)
+    }
+
+    o.typ = referenced.typ
+    err = o.addObject(patched)
+    if err != nil {
+        return err
+    }
+
+    return nil
+}
+
+func (o *objectReader) readObject() error {
+    buf, err := o.inflate()
+    if err != nil {
+        return err
+    }
+
+    return o.addObject(buf)
+}
+
+func (o *objectReader) addObject(bytes []byte) error {
+    var hash string
+
+    switch o.typ {
+    case OBJ_COMMIT:
+        c, err := NewCommit(bytes)
+        if err != nil {
+            return err
+        }
+        o.pf.Commits[c.Hash()] = c
+        hash = c.Hash()
+    case OBJ_TREE:
+        c, err := NewTree(bytes)
+        if err != nil {
+            return err
+        }
+        o.pf.Trees[c.Hash()] = c
+        hash = c.Hash()
+    case OBJ_BLOB:
+        c, err := NewBlob(bytes)
+        if err != nil {
+            return err
+        }
+        o.pf.Blobs[c.Hash()] = c
+        hash = c.Hash()
+
+        if o.pr.contentCallback != nil {
+            o.pr.contentCallback(hash, bytes)
+        }
+    }
+
+    o.pr.objects[hash] = packfileObject{bytes: bytes, typ: o.typ}
+    o.hash = hash
+
+    return nil
+}
+
+func (o *objectReader) inflate() ([]byte, error) {
+    // Quick fix for "Invalid git object tag '\x00'" when the length of an
+    // object is 0.
+    if o.size == 0 {
+        buf := make([]byte, 4)
+        if _, err := o.Read(buf); err != nil {
+            return nil, err
+        }
+
+        return nil, nil
+    }
+
+    zr, err := zlib.NewReader(o)
+    if err != nil {
+        if err.Error() == "zlib: invalid header" {
+            return nil, ErrZlibHeader
+        }
+
+        return nil, NewError("error opening packfile's object zlib: %v", err)
+    }
+    defer zr.Close()
+
+    if o.size > SIZE_LIMIT {
+        return nil, NewError("the object size exceeds the allowed limit: %d", o.size)
+    }
+
+    buf := make([]byte, o.size)
+    read := 0
+    for read < int(o.size) {
+        n, err := zr.Read(buf[read:])
+        if err != nil {
+            return nil, err
+        }
+
+        read += n
+    }
+
+    if read != int(o.size) {
+        return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, read)
+    }
+
+    return buf, nil
+}
+
+func (o *objectReader) Read(p []byte) (int, error) {
+    i := 0
+    if o.pr.givenBack != nil {
+        i1, err := o.pr.givenBack.Read(p)
+        if err == nil {
+            i += i1
+        } else {
+            o.pr.givenBack = nil
+        }
+    }
+
+    i2, err := o.pr.r.Read(p[i:])
+    i += i2
+    o.counter += i
+    if err == nil && o.pr.startedGivingBack {
+        o.pr.givebackBuffer = append(o.pr.givebackBuffer, p[:i]...)
+    }
+
+    return i, err
+}
+
+func (o *objectReader) ReadByte() (byte, error) {
+    var c byte
+    if err := binary.Read(o, binary.BigEndian, &c); err != nil {
+        return 0, err
+    }
+
+    return c, nil
+}
+
+type ReaderError struct {
+    Msg string // description of error
+}
+
+func NewError(format string, args ...interface{}) error {
+    return &ReaderError{Msg: fmt.Sprintf(format, args...)}
+}
+
+func (e *ReaderError) Error() string { return e.Msg }
+
+var ErrZlibHeader = errors.New("zlib: invalid header")
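readOFSDelta decodes the OFS_DELTA negative offset incrementally from the stream, following the encoding documented in packfile/doc.go: 7 bits per byte, MSB as continuation flag, plus a 2^7 + 2^14 + ... correction for multi-byte offsets. The same arithmetic over a byte slice, for illustration (hypothetical helper, not part of the commit):

```go
package packfile

// decodeOffset mirrors readOFSDelta's offset arithmetic on a slice.
// Example: decodeOffset([]byte{0x91, 0x2e}) == 2350, i.e.
// ((0x11+1) << 7) + 0x2e, where the +1 folds in the 2^7 correction.
func decodeOffset(b []byte) int {
	offset := int(b[0] & 0x7f)
	for i := 1; b[i-1]&0x80 != 0; i++ { // previous byte's MSB: continue
		offset++ // correction term 2^(7*i), applied before the shift
		offset = (offset << 7) + int(b[i]&0x7f)
	}
	return offset
}
```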
diff --git a/packfile/reader_test.go b/packfile/reader_test.go
new file mode 100644
index 0000000..e52cbc3
--- /dev/null
+++ b/packfile/reader_test.go
@@ -0,0 +1,35 @@
+package packfile
+
+import (
+    "bytes"
+    "encoding/base64"
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+)
+
+var packFileWithEmptyObjects = "UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FSAlmXnEG2NYlhXAfHgdLb5Cy9WAM5Qpb/Lf7oZqArUpakyYtQjCoxZ5lmWXwwyuzJbHqAuYt2+x6QoyCyhYCKIa67lGameSLWvPh5JU0hsCg7vY1z6/D1d/8ptcHhprm3Kxz7KL/wUdOz96eqZXtPrX4CCeOOPU8Eb0iI7qG1jGGvXdxaNoPs/gHeNkp8lA94nKXMQUpDMRCA4X1OMXtBZpI3L3kiRXAtPcMkmWjgxZSYQultPEsv1oJHcPl/i38OVRC0IXF0lshrJorZEcpKmTEJYbA+B3aFzEmGfk9gpqJEsmnZNutXF71i1IURU/G0bsWWwJ6NnOdXH/Bx+73U1uH9LHn0HziOWa/w2tJfv302qftz6u0AtFh0wQdmeEJCNA9tdU7938WUuivEF5CczR11ZEsNnw54nKWMUQoCIRRF/13F+w/ijY6jQkTQd7SGpz5LyAxzINpNa2ljTbSEPu/hnNsbM4TJTzqyt561GdUUmJKT6K2MeiCVgnZWoY/iRo2vHVS0URrUS+e+dkqIEp11HMhh9IaUkRM6QXM/1waH9+uRS4X9TLHVOxxbz0/YlPDbu1OhfFmHWrYwjBKVNVaNsMIBUSy05N75vxeR8oXBiw8GoErCnwt4nKXMzQkCMRBA4XuqmLsgM2M2ZkAWwbNYQ341sCEQsyB2Yy02pmAJHt93eKOnBFpMNJqtl5CFxVIMomViomQSEWP2JrN3yq3j1jqc369HqQ1Oq4u93eHSR3nCoYZfH6/VlWUbWp2BNOPO7i1OsEFCVF+tZYz030XlsiRw6gPZ0jxaqwV4nDM0MDAzMVFIZHg299HsTRevOXt3a64rj7px6ElP8ERDiGQSQ2uoXe8RrcodS5on+J4/u8HjD4NDKFQyRS8tPx+rbgDt3yiEMHicAwAAAAABPnicS0wEAa4kMOACACTjBKdkZXici7aaYAUAA3gBYKoDeJwzNDAwMzFRSGR4NvfR7E0Xrzl7d2uuK4+6cehJT/BEQ4hkEsOELYFJvS2eX47UJdVttFQrenrmzQwA13MaiDd4nEtMBAEuAApMAlGtAXicMzQwMDMxUUhkeDb30exNF685e3drriuPunHoSU/wRACvkA258N/i8hVXx9CiAZzvFXNIhCuSFmE="
+
+func TestReadPackfile(t *testing.T) {
+    data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects)
+    d := bytes.NewReader(data)
+
+    r, err := NewPackfileReader(d, nil)
+    assert.Nil(t, err)
+
+    p, err := r.Read()
+    assert.Nil(t, err)
+
+    assert.Equal(t, 11, p.ObjectCount)
+    assert.Equal(t, 4, len(p.Commits))
+    assert.Equal(t, 4, len(p.Trees))
+}
+
+func TestReadPackfileInvalid(t *testing.T) {
+    r, err := NewPackfileReader(bytes.NewReader([]byte("dasdsadasas")), nil)
+    assert.Nil(t, err)
+
+    _, err = r.Read()
+    _, ok := err.(*ReaderError)
+    assert.True(t, ok)
+}
diff --git a/pktline/decoder.go b/pktline/decoder.go
new file mode 100644
index 0000000..194a3e6
--- /dev/null
+++ b/pktline/decoder.go
@@ -0,0 +1,104 @@
+package pktline
+
+import (
+    "errors"
+    "io"
+    "strconv"
+)
+
+var (
+    ErrUnderflow     = errors.New("unexpected string length")
+    ErrInvalidHeader = errors.New("invalid header")
+    ErrInvalidLen    = errors.New("invalid length")
+)
+
+type Decoder struct {
+    r io.Reader
+}
+
+func NewDecoder(r io.Reader) *Decoder {
+    return &Decoder{r}
+}
+
+func (d *Decoder) readLine() (string, error) {
+    raw := make([]byte, HEADER_LENGTH)
+    if _, err := d.r.Read(raw); err != nil {
+        return "", err
+    }
+
+    header, err := strconv.ParseInt(string(raw), 16, 16)
+    if err != nil {
+        return "", ErrInvalidHeader
+    }
+
+    // a pkt-len of 0 is a flush-pkt
+    if header == 0 {
+        return "", nil
+    }
+
+    exp := int(header - HEADER_LENGTH)
+    if exp < 0 {
+        return "", ErrInvalidLen
+    }
+
+    line := make([]byte, exp)
+    if read, err := d.r.Read(line); err != nil {
+        return "", err
+    } else if read != exp {
+        return "", ErrUnderflow
+    }
+
+    return string(line), nil
+}
+
+func (d *Decoder) ReadLine() (string, error) {
+    return d.readLine()
+}
+
+func (d *Decoder) ReadBlock() ([]string, error) {
+    o := make([]string, 0)
+
+    for {
+        line, err := d.readLine()
+        if err == io.EOF {
+            return o, nil
+        }
+
+        if err != nil {
+            return o, err
+        }
+
+        if line == "" {
+            return o, nil
+        }
+
+        o = append(o, line)
+    }
+}
+
+func (d *Decoder) ReadAll() ([]string, error) {
+    result, err := d.ReadBlock()
+    if err != nil {
+        return result, err
+    }
+
+    for {
+        lines, err := d.ReadBlock()
+        if err != nil {
+            return result, err
+        }
+
+        if len(lines) == 0 {
+            return result, nil
+        }
+
+        result = append(result, lines...)
+    }
+}
diff --git a/pktline/decoder_test.go b/pktline/decoder_test.go
new file mode 100644
index 0000000..a0f85ce
--- /dev/null
+++ b/pktline/decoder_test.go
@@ -0,0 +1,70 @@
+package pktline
+
+import (
+    "strings"
+    "testing"
+
+    . "gopkg.in/check.v1"
+)
+
+func Test(t *testing.T) { TestingT(t) }
+
+type DecoderSuite struct{}
+
+var _ = Suite(&DecoderSuite{})
+
+func (s *DecoderSuite) TestReadLine(c *C) {
+    j := &Decoder{strings.NewReader("0006a\n")}
+
+    line, err := j.ReadLine()
+    c.Assert(err, IsNil)
+    c.Assert(line, Equals, "a\n")
+}
+
+func (s *DecoderSuite) TestReadLineBufferUnderflow(c *C) {
+    j := &Decoder{strings.NewReader("00e7a\n")}
+
+    line, err := j.ReadLine()
+    c.Assert(err, ErrorMatches, "unexpected string length")
+    c.Assert(line, Equals, "")
+}
+
+func (s *DecoderSuite) TestReadLineBufferInvalidLen(c *C) {
+    j := &Decoder{strings.NewReader("0001foo\n")}
+
+    line, err := j.ReadLine()
+    c.Assert(err, ErrorMatches, "invalid length")
+    c.Assert(line, Equals, "")
+}
+
+func (s *DecoderSuite) TestReadBlock(c *C) {
+    j := &Decoder{strings.NewReader("0006a\n")}
+
+    lines, err := j.ReadBlock()
+    c.Assert(err, IsNil)
+    c.Assert(lines, HasLen, 1)
+    c.Assert(lines[0], Equals, "a\n")
+}
+
+func (s *DecoderSuite) TestReadBlockWithFlush(c *C) {
+    j := &Decoder{strings.NewReader("0006a\n0006b\n00000006c\n")}
+
+    lines, err := j.ReadBlock()
+    c.Assert(err, IsNil)
+    c.Assert(lines, HasLen, 2)
+    c.Assert(lines[0], Equals, "a\n")
+    c.Assert(lines[1], Equals, "b\n")
+}
+
+func (s *DecoderSuite) TestReadAll(c *C) {
+    j := &Decoder{strings.NewReader("0006a\n0006b\n00000006c\n0006d\n0006e\n")}
+
+    lines, err := j.ReadAll()
+    c.Assert(err, IsNil)
+    c.Assert(lines, HasLen, 5)
+    c.Assert(lines[0], Equals, "a\n")
+    c.Assert(lines[1], Equals, "b\n")
+    c.Assert(lines[2], Equals, "c\n")
+    c.Assert(lines[3], Equals, "d\n")
+    c.Assert(lines[4], Equals, "e\n")
+}
"", err + } else if read != exp { + return "", ErrUnderflow + } + + return string(line), nil +} + +func (d *Decoder) ReadLine() (string, error) { + return d.readLine() +} + +func (d *Decoder) ReadBlock() ([]string, error) { + o := make([]string, 0) + + for { + line, err := d.readLine() + if err == io.EOF { + return o, nil + } + + if err != nil { + return o, err + } + + if err == nil && line == "" { + return o, nil + } + + o = append(o, line) + } + + return o, nil +} + +func (d *Decoder) ReadAll() ([]string, error) { + result, err := d.ReadBlock() + if err != nil { + return result, err + } + + for { + lines, err := d.ReadBlock() + if err == io.EOF { + return result, nil + } + + if err != nil { + return result, err + } + + if err == nil && len(lines) == 0 { + return result, nil + } + + result = append(result, lines...) + } + + return result, nil +} diff --git a/pktline/decoder_test.go b/pktline/decoder_test.go new file mode 100644 index 0000000..a0f85ce --- /dev/null +++ b/pktline/decoder_test.go @@ -0,0 +1,70 @@ +package pktline + +import ( + "strings" + "testing" + + . "gopkg.in/check.v1" +) + +func Test(t *testing.T) { TestingT(t) } + +type DecoderSuite struct{} + +var _ = Suite(&DecoderSuite{}) + +func (s *DecoderSuite) TestReadLine(c *C) { + j := &Decoder{strings.NewReader("0006a\n")} + + line, err := j.ReadLine() + c.Assert(err, IsNil) + c.Assert(line, Equals, "a\n") +} + +func (s *DecoderSuite) TestReadLineBufferUnderflow(c *C) { + j := &Decoder{strings.NewReader("00e7a\n")} + + line, err := j.ReadLine() + c.Assert(err, ErrorMatches, "unexepected string length") + c.Assert(line, Equals, "") +} + +func (s *DecoderSuite) TestReadLineBufferInvalidLen(c *C) { + j := &Decoder{strings.NewReader("0001foo\n")} + + line, err := j.ReadLine() + c.Assert(err, ErrorMatches, "invalid length") + c.Assert(line, Equals, "") +} + +func (s *DecoderSuite) TestReadBlock(c *C) { + j := &Decoder{strings.NewReader("0006a\n")} + + lines, err := j.ReadBlock() + c.Assert(err, IsNil) + c.Assert(lines, HasLen, 1) + c.Assert(lines[0], Equals, "a\n") +} + +func (s *DecoderSuite) TestReadBlockWithFlush(c *C) { + j := &Decoder{strings.NewReader("0006a\n0006b\n00000006c\n")} + + lines, err := j.ReadBlock() + c.Assert(err, IsNil) + c.Assert(lines, HasLen, 2) + c.Assert(lines[0], Equals, "a\n") + c.Assert(lines[1], Equals, "b\n") +} + +func (s *DecoderSuite) TestReadAll(c *C) { + j := &Decoder{strings.NewReader("0006a\n0006b\n00000006c\n0006d\n0006e\n")} + + lines, err := j.ReadAll() + c.Assert(err, IsNil) + c.Assert(lines, HasLen, 5) + c.Assert(lines[0], Equals, "a\n") + c.Assert(lines[1], Equals, "b\n") + c.Assert(lines[2], Equals, "c\n") + c.Assert(lines[3], Equals, "d\n") + c.Assert(lines[4], Equals, "e\n") +} diff --git a/pktline/doc.go b/pktline/doc.go new file mode 100644 index 0000000..a976b54 --- /dev/null +++ b/pktline/doc.go @@ -0,0 +1,56 @@ +package pktline + +// pkt-line Format +// --------------- +// +// Much (but not all) of the payload is described around pkt-lines. +// +// A pkt-line is a variable length binary string. The first four bytes +// of the line, the pkt-len, indicates the total length of the line, +// in hexadecimal. The pkt-len includes the 4 bytes used to contain +// the length's hexadecimal representation. +// +// A pkt-line MAY contain binary data, so implementors MUST ensure +// pkt-line parsing/formatting routines are 8-bit clean. +// +// A non-binary line SHOULD BE terminated by an LF, which if present +// MUST be included in the total length. 
diff --git a/pktline/encoder.go b/pktline/encoder.go
new file mode 100644
index 0000000..b439c54
--- /dev/null
+++ b/pktline/encoder.go
@@ -0,0 +1,56 @@
+package pktline
+
+import (
+    "errors"
+    "fmt"
+    "strings"
+)
+
+var (
+    ErrOverflow = errors.New("unexpected string length")
+)
+
+type Encoder struct {
+    lines []string
+}
+
+func NewEncoder() *Encoder {
+    return &Encoder{make([]string, 0)}
+}
+
+func (e *Encoder) AddLine(line string) error {
+    le, err := EncodeFromString(line + "\n")
+    if err != nil {
+        return err
+    }
+
+    e.lines = append(e.lines, le)
+    return nil
+}
+
+func (e *Encoder) AddFlush() {
+    e.lines = append(e.lines, "0000")
+}
+
+func (e *Encoder) GetReader() *strings.Reader {
+    data := strings.Join(e.lines, "")
+
+    return strings.NewReader(data)
+}
+
+func EncodeFromString(line string) (string, error) {
+    return Encode([]byte(line))
+}
+
+func Encode(line []byte) (string, error) {
+    if line == nil {
+        return "0000", nil
+    }
+
+    l := len(line) + HEADER_LENGTH
+    if l > MAX_LENGTH {
+        return "", ErrOverflow
+    }
+
+    return fmt.Sprintf("%04x%s", l, line), nil
+}
diff --git a/pktline/encoder_test.go b/pktline/encoder_test.go
new file mode 100644
index 0000000..091ad1c
--- /dev/null
+++ b/pktline/encoder_test.go
@@ -0,0 +1,34 @@
+package pktline
+
+import (
+    "io/ioutil"
+
+    . "gopkg.in/check.v1"
+)
+
+type EncoderSuite struct{}
+
+var _ = Suite(&EncoderSuite{})
+
+func (s *EncoderSuite) TestEncode(c *C) {
+    line, err := Encode([]byte("a\n"))
+    c.Assert(err, IsNil)
+    c.Assert(string(line), Equals, "0006a\n")
+}
+
+func (s *EncoderSuite) TestEncodeFromString(c *C) {
+    line, err := EncodeFromString("a\n")
+    c.Assert(err, IsNil)
+    c.Assert(string(line), Equals, "0006a\n")
+}
+
+func (s *EncoderSuite) TestEncoder(c *C) {
+    e := NewEncoder()
+    e.AddLine("a")
+    e.AddFlush()
+    e.AddLine("b")
+
+    r := e.GetReader()
+    a, _ := ioutil.ReadAll(r)
+    c.Assert(string(a), Equals, "0006a\n00000006b\n")
+}
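Encoder.GetReader returns a *strings.Reader, so its output can feed a Decoder directly — the same framing Client uses for the want/done request. A round trip through both ends of the package, as an in-package sketch (the want hash is this commit's own id, chosen for illustration):

```go
package pktline

import "fmt"

func Example_roundTrip() {
	e := NewEncoder()
	e.AddLine("want 5d7303c49ac984a9fec60523f2d5297682e16646")
	e.AddFlush() // flush-pkt ends the first block
	e.AddLine("done")

	d := NewDecoder(e.GetReader())
	lines, _ := d.ReadAll() // flush-pkts are swallowed, payloads kept
	fmt.Println(len(lines))
	// Output: 2
}
```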