diff options
author | Máximo Cuadros <mcuadros@gmail.com> | 2015-10-23 17:45:01 +0200 |
---|---|---|
committer | Máximo Cuadros <mcuadros@gmail.com> | 2015-10-23 17:45:01 +0200 |
commit | d82f291cde9987322c8a0c81a325e1ba6159684c (patch) | |
tree | d423447ee374fbfa802f7ff354651fd34afe0fb2 /formats/packfile/reader.go | |
parent | 6c629843a1750a27c9af01ed2985f362f619c47a (diff) | |
parent | 27aa8cdd2431068606741a589383c02c149ea625 (diff) | |
download | go-git-d82f291cde9987322c8a0c81a325e1ba6159684c.tar.gz |
Merge pull request #2 from mcuadros/v2.0.0
formats/packfile: cleanup and hash type
Diffstat (limited to 'formats/packfile/reader.go')
-rw-r--r-- | formats/packfile/reader.go | 385 |
1 files changed, 385 insertions, 0 deletions
diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go new file mode 100644 index 0000000..d5f40b9 --- /dev/null +++ b/formats/packfile/reader.go @@ -0,0 +1,385 @@ +package packfile + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + + "github.com/klauspost/compress/zlib" +) + +const ( + DefaultMaxObjectsLimit = 1 << 20 + DefaultMaxObjectSize = 1 << 32 // 4GB + + rawCommit = 1 + rawTree = 2 + rawBlob = 3 + rawTag = 4 + rawOFSDelta = 6 + rawREFDelta = 7 +) + +type PackfileReader struct { + // MaxObjectsLimit is the limit of objects to be load in the packfile, if + // a packfile excess this number an error is throw, the default value + // is defined by DefaultMaxObjectsLimit, usually the default limit is more + // than enough to work with any repository, working extremly big repositories + // where the number of object is bigger the memory can be exhausted. + MaxObjectsLimit int + + // MaxObjectSize is the maximum size in bytes, reading objects with a bigger + // size cause a error. The default value is defined by DefaultMaxObjectSize + MaxObjectSize int + + r *trackingReader + objects map[Hash]packfileObject + offsets map[int]Hash + deltas []packfileDelta + contentCallback ContentCallback +} + +type packfileObject struct { + bytes []byte + typ int8 +} + +type packfileDelta struct { + hash Hash + delta []byte +} + +func NewPackfileReader(r io.Reader, fn ContentCallback) (*PackfileReader, error) { + return &PackfileReader{ + MaxObjectsLimit: DefaultMaxObjectsLimit, + MaxObjectSize: DefaultMaxObjectSize, + r: &trackingReader{r: r}, + objects: make(map[Hash]packfileObject, 0), + offsets: make(map[int]Hash, 0), + contentCallback: fn, + }, nil +} + +func (pr *PackfileReader) Read() (*Packfile, error) { + packfile := NewPackfile() + + if err := pr.validateHeader(); err != nil { + if err == io.EOF { + // This is an empty repo. It's OK. + return packfile, nil + } + return nil, err + } + + ver, err := pr.readInt32() + if err != nil { + return nil, err + } + + count, err := pr.readInt32() + if err != nil { + return nil, err + } + + packfile.Version = uint32(ver) + packfile.ObjectCount = int(count) + + if packfile.ObjectCount > pr.MaxObjectsLimit { + return nil, NewError("too many objects %d, limit is %d", + packfile.ObjectCount, pr.MaxObjectsLimit) + } + + if err := pr.readObjects(packfile); err != nil { + return nil, err + } + + packfile.Size = int64(pr.r.Pos()) + + return packfile, nil +} + +func (pr *PackfileReader) validateHeader() error { + var header = make([]byte, 4) + if _, err := pr.r.Read(header); err != nil { + return err + } + + if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) { + return NewError("Pack file does not start with 'PACK'") + } + + return nil +} + +func (pr *PackfileReader) readInt32() (uint32, error) { + var value uint32 + if err := binary.Read(pr.r, binary.BigEndian, &value); err != nil { + return 0, err + } + + return value, nil +} + +func (pr *PackfileReader) readObjects(packfile *Packfile) error { + // This code has 50-80 µs of overhead per object not counting zlib inflation. + // Together with zlib inflation, it's 400-410 µs for small objects. + // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB, + // of which 12-20 % is _not_ zlib inflation (ie. is our code). + + for i := 0; i < packfile.ObjectCount; i++ { + var pos = pr.Pos() + obj, err := pr.readObject(packfile) + if err != nil && err != io.EOF { + return err + } + + pr.offsets[pos] = obj.hash + + if err == io.EOF { + break + } + } + + return nil +} + +func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) { + o, err := newObjectReader(pr, packfile, pr.MaxObjectSize) + if err != nil { + return nil, err + } + + switch o.typ { + case rawREFDelta: + err = o.readREFDelta() + case rawOFSDelta: + err = o.readOFSDelta() + case rawCommit, rawTree, rawBlob, rawTag: + err = o.readObject() + default: + err = NewError("Invalid git object tag %q", o.typ) + } + + if err != nil { + return nil, err + } + + return o, err +} + +func (pr *PackfileReader) Pos() int { return pr.r.Pos() } + +type objectReader struct { + pr *PackfileReader + pf *Packfile + maxSize uint64 + + hash Hash + steps int + typ int8 + size uint64 +} + +func newObjectReader(pr *PackfileReader, pf *Packfile, maxSize int) (*objectReader, error) { + o := &objectReader{pr: pr, pf: pf, maxSize: uint64(maxSize)} + + var buf [1]byte + if _, err := o.Read(buf[:]); err != nil { + return nil, err + } + + o.typ = int8((buf[0] >> 4) & 7) + o.size = uint64(buf[0] & 15) + o.steps++ // byte we just read to get `o.typ` and `o.size` + + var shift uint = 4 + for buf[0]&0x80 == 0x80 { + if _, err := o.Read(buf[:]); err != nil { + return nil, err + } + + o.size += uint64(buf[0]&0x7f) << shift + o.steps++ // byte we just read to update `o.size` + shift += 7 + } + + return o, nil +} + +func (o *objectReader) readREFDelta() error { + var ref Hash + if _, err := o.Read(ref[:]); err != nil { + return err + } + + buf, err := o.inflate() + if err != nil { + return err + } + + referenced, ok := o.pr.objects[ref] + if !ok { + o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: ref, delta: buf[:]}) + } else { + patched := PatchDelta(referenced.bytes, buf[:]) + if patched == nil { + return NewError("error while patching %x", ref) + } + + o.typ = referenced.typ + err = o.addObject(patched) + if err != nil { + return err + } + } + + return nil +} + +func decodeOffset(src io.ByteReader, steps int) (int, error) { + b, err := src.ReadByte() + if err != nil { + return 0, err + } + var offset = int(b & 0x7f) + for (b & 0x80) != 0 { + offset++ // WHY? + b, err = src.ReadByte() + if err != nil { + return 0, err + } + + offset = (offset << 7) + int(b&0x7f) + } + + // offset needs to be aware of the bytes we read for `o.typ` and `o.size` + offset += steps + return -offset, nil +} + +func (o *objectReader) readOFSDelta() error { + var pos = o.pr.Pos() + + // read negative offset + offset, err := decodeOffset(o.pr.r, o.steps) + if err != nil { + return err + } + + buf, err := o.inflate() + if err != nil { + return err + } + + ref := o.pr.offsets[pos+offset] + referenced, ok := o.pr.objects[ref] + if !ok { + return NewError("can't find a pack entry at %d", pos+offset) + } + + patched := PatchDelta(referenced.bytes, buf) + if patched == nil { + return NewError("error while patching %q", ref) + } + + o.typ = referenced.typ + err = o.addObject(patched) + if err != nil { + return err + } + + return nil +} + +func (o *objectReader) readObject() error { + buf, err := o.inflate() + if err != nil { + return err + } + + return o.addObject(buf) +} + +func (o *objectReader) addObject(bytes []byte) error { + var hash Hash + + switch o.typ { + case rawCommit: + c, err := ParseCommit(bytes) + if err != nil { + return err + } + o.pf.Commits[c.Hash()] = c + hash = c.Hash() + case rawTree: + c, err := ParseTree(bytes) + if err != nil { + return err + } + o.pf.Trees[c.Hash()] = c + hash = c.Hash() + case rawBlob: + c, err := ParseBlob(bytes) + if err != nil { + return err + } + o.pf.Blobs[c.Hash()] = c + hash = c.Hash() + + if o.pr.contentCallback != nil { + o.pr.contentCallback(hash, bytes) + } + } + + o.pr.objects[hash] = packfileObject{bytes: bytes, typ: o.typ} + o.hash = hash + + return nil +} + +func (o *objectReader) inflate() ([]byte, error) { + zr, err := zlib.NewReader(o.pr.r) + if err != nil { + if err == zlib.ErrHeader { + return nil, zlib.ErrHeader + } + + return nil, NewError("error opening packfile's object zlib: %v", err) + } + + defer zr.Close() + + if o.size > o.maxSize { + return nil, NewError("the object size %q exceeed the allowed limit: %q", + o.size, o.maxSize) + } + + var buf bytes.Buffer + io.Copy(&buf, zr) // also: io.CopyN(&buf, zr, int64(o.size)) + + var bufLen = buf.Len() + if bufLen != int(o.size) { + return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, bufLen) + } + + return buf.Bytes(), nil +} + +func (o *objectReader) Read(p []byte) (int, error) { + return o.pr.r.Read(p) +} + +func (o *objectReader) ReadByte() (byte, error) { + return o.pr.r.ReadByte() +} + +type ReaderError struct { + Msg string // description of error +} + +func NewError(format string, args ...interface{}) error { + return &ReaderError{Msg: fmt.Sprintf(format, args...)} +} + +func (e *ReaderError) Error() string { return e.Msg } |