package packfile
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"github.com/klauspost/compress/zlib"
)
const (
DefaultMaxObjectsLimit = 1 << 20
DefaultMaxObjectSize = 1 << 32 // 4GB
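	// Object type codes as stored in the header of each packfile object;
	// type 5 is reserved, hence the gap before rawOFSDelta.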
rawCommit = 1
rawTree = 2
rawBlob = 3
rawTag = 4
rawOFSDelta = 6
rawREFDelta = 7
)
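// PackfileReader reads a packfile from an io.Reader, decoding the
// commits, trees, blobs and tags it contains.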
type PackfileReader struct {
	// MaxObjectsLimit is the limit of objects to be loaded from the
	// packfile; if a packfile exceeds this number, an error is returned.
	// The default value is defined by DefaultMaxObjectsLimit and is usually
	// more than enough to work with any repository; with extremely big
	// repositories, where the number of objects is higher, memory can be
	// exhausted.
MaxObjectsLimit int
	// MaxObjectSize is the maximum object size in bytes; reading an object
	// bigger than this causes an error. The default value is defined by
	// DefaultMaxObjectSize.
MaxObjectSize int
r *trackingReader
objects map[Hash]packfileObject
offsets map[int]Hash
deltas []packfileDelta
contentCallback ContentCallback
}
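// packfileObject keeps the inflated bytes and type of a decoded object
// so it can serve as the base of later deltas.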
type packfileObject struct {
bytes []byte
typ int8
}
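// packfileDelta is a REF delta whose base object has not been read yet;
// it is queued under the hash of the missing base for later resolution.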
type packfileDelta struct {
hash Hash
delta []byte
}
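// NewPackfileReader returns a PackfileReader that reads from r and calls
// fn with the raw content of every blob it decodes. The object limits
// are initialized to their defaults and may be adjusted before Read.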
func NewPackfileReader(r io.Reader, fn ContentCallback) (*PackfileReader, error) {
return &PackfileReader{
MaxObjectsLimit: DefaultMaxObjectsLimit,
MaxObjectSize: DefaultMaxObjectSize,
r: &trackingReader{r: r},
		objects:         make(map[Hash]packfileObject),
		offsets:         make(map[int]Hash),
contentCallback: fn,
}, nil
}
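// Read reads the whole packfile: it validates the "PACK" signature,
// decodes the version and object count from the header, and then reads
// every object in the stream. An empty stream is accepted and yields an
// empty Packfile.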
func (pr *PackfileReader) Read() (*Packfile, error) {
packfile := NewPackfile()
if err := pr.validateHeader(); err != nil {
if err == io.EOF {
// This is an empty repo. It's OK.
return packfile, nil
}
return nil, err
}
ver, err := pr.readInt32()
if err != nil {
return nil, err
}
count, err := pr.readInt32()
if err != nil {
return nil, err
}
packfile.Version = uint32(ver)
packfile.ObjectCount = int(count)
if packfile.ObjectCount > pr.MaxObjectsLimit {
return nil, NewError("too many objects %d, limit is %d",
packfile.ObjectCount, pr.MaxObjectsLimit)
}
if err := pr.readObjects(packfile); err != nil {
return nil, err
}
packfile.Size = int64(pr.r.Pos())
return packfile, nil
}
func (pr *PackfileReader) validateHeader() error {
	var header = make([]byte, 4)
	// io.ReadFull guards against short reads; it returns io.EOF only when
	// nothing at all was read, which Read uses to detect an empty repo.
	if _, err := io.ReadFull(pr.r, header); err != nil {
		return err
	}
if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) {
return NewError("Pack file does not start with 'PACK'")
}
return nil
}
func (pr *PackfileReader) readInt32() (uint32, error) {
var value uint32
if err := binary.Read(pr.r, binary.BigEndian, &value); err != nil {
return 0, err
}
return value, nil
}
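// readObjects reads packfile.ObjectCount objects from the stream,
// remembering the offset at which each object starts so that OFS deltas
// can locate their base objects.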
func (pr *PackfileReader) readObjects(packfile *Packfile) error {
	// This code has 50-80 µs of overhead per object not counting zlib inflation.
	// Together with zlib inflation, it's 400-410 µs for small objects.
	// That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB,
	// of which 12-20 % is _not_ zlib inflation (i.e. is our code).
for i := 0; i < packfile.ObjectCount; i++ {
		var pos = pr.Pos()
		obj, err := pr.readObject(packfile)
		if err == io.EOF {
			// Stop early: the stream ended before the declared object
			// count was reached. obj is nil here, so break before using it.
			break
		}
		if err != nil {
			return err
		}
		pr.offsets[pos] = obj.hash
}
return nil
}
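// readObject reads a single object header and dispatches on its type:
// deltas are resolved against their base object; commits, trees, blobs
// and tags are inflated and stored directly.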
func (pr *PackfileReader) readObject(packfile *Packfile) (*objectReader, error) {
o, err := newObjectReader(pr, packfile, pr.MaxObjectSize)
if err != nil {
return nil, err
}
switch o.typ {
case rawREFDelta:
err = o.readREFDelta()
case rawOFSDelta:
err = o.readOFSDelta()
case rawCommit, rawTree, rawBlob, rawTag:
err = o.readObject()
default:
err = NewError("Invalid git object tag %q", o.typ)
}
if err != nil {
return nil, err
}
	return o, nil
}
func (pr *PackfileReader) Pos() int { return pr.r.Pos() }
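// objectReader decodes one object from the packfile, tracking in steps
// how many header bytes were consumed so that OFS delta offsets can be
// adjusted accordingly.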
type objectReader struct {
pr *PackfileReader
pf *Packfile
maxSize uint64
hash Hash
steps int
typ int8
size uint64
}
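// newObjectReader parses the object header: bits 4-6 of the first byte
// carry the object type and its low 4 bits the least significant bits
// of the inflated size; while the high bit is set, each following byte
// contributes 7 more bits to the size.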
func newObjectReader(pr *PackfileReader, pf *Packfile, maxSize int) (*objectReader, error) {
o := &objectReader{pr: pr, pf: pf, maxSize: uint64(maxSize)}
var buf [1]byte
if _, err := o.Read(buf[:]); err != nil {
return nil, err
}
o.typ = int8((buf[0] >> 4) & 7)
o.size = uint64(buf[0] & 15)
o.steps++ // byte we just read to get `o.typ` and `o.size`
var shift uint = 4
for buf[0]&0x80 == 0x80 {
if _, err := o.Read(buf[:]); err != nil {
return nil, err
}
o.size += uint64(buf[0]&0x7f) << shift
o.steps++ // byte we just read to update `o.size`
shift += 7
}
return o, nil
}
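// readREFDelta reads a REF delta: the hash of the base object followed
// by the zlib-compressed delta. If the base object has already been
// read, the delta is applied right away; otherwise it is queued in
// pr.deltas so it can be resolved later.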
func (o *objectReader) readREFDelta() error {
var ref Hash
	if _, err := io.ReadFull(o, ref[:]); err != nil {
		return err
	}
buf, err := o.inflate()
if err != nil {
return err
}
referenced, ok := o.pr.objects[ref]
if !ok {
		o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: ref, delta: buf})
} else {
		patched := PatchDelta(referenced.bytes, buf)
if patched == nil {
return NewError("error while patching %x", ref)
}
o.typ = referenced.typ
err = o.addObject(patched)
if err != nil {
return err
}
}
return nil
}
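// decodeOffset reads the variable-length offset that precedes an OFS
// delta: 7 bits per byte, most significant group first, with the high
// bit of each byte signalling that another byte follows. It returns the
// offset as a negative number, already adjusted by the `steps` header
// bytes consumed before the offset itself.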
func decodeOffset(src io.ByteReader, steps int) (int, error) {
b, err := src.ReadByte()
if err != nil {
return 0, err
}
var offset = int(b & 0x7f)
for (b & 0x80) != 0 {
		offset++ // the encoding adds 1 on every continuation, so offsets of different byte lengths never overlap
b, err = src.ReadByte()
if err != nil {
return 0, err
}
offset = (offset << 7) + int(b&0x7f)
}
	// The encoded offset is relative to the start of this object's header,
	// but the current position was taken after the `steps` header bytes
	// (type and size) had already been read, so compensate for them.
	offset += steps
return -offset, nil
}
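// readOFSDelta reads an OFS delta: a variable-length negative offset
// that locates the base object earlier in this same packfile, followed
// by the zlib-compressed delta.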
func (o *objectReader) readOFSDelta() error {
var pos = o.pr.Pos()
// read negative offset
offset, err := decodeOffset(o.pr.r, o.steps)
if err != nil {
return err
}
buf, err := o.inflate()
if err != nil {
return err
}
ref := o.pr.offsets[pos+offset]
referenced, ok := o.pr.objects[ref]
if !ok {
return NewError("can't find a pack entry at %d", pos+offset)
}
patched := PatchDelta(referenced.bytes, buf)
if patched == nil {
return NewError("error while patching %q", ref)
}
o.typ = referenced.typ
err = o.addObject(patched)
if err != nil {
return err
}
return nil
}
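// readObject inflates a plain (non-delta) object and stores it.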
func (o *objectReader) readObject() error {
buf, err := o.inflate()
if err != nil {
return err
}
return o.addObject(buf)
}
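// addObject parses the inflated bytes according to the object type,
// stores the parsed object in the Packfile indexed by its hash, and
// keeps the raw bytes in pr.objects so they can serve as a delta base.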
func (o *objectReader) addObject(bytes []byte) error {
var hash Hash
switch o.typ {
case rawCommit:
c, err := ParseCommit(bytes)
if err != nil {
return err
}
o.pf.Commits[c.Hash()] = c
hash = c.Hash()
case rawTree:
c, err := ParseTree(bytes)
if err != nil {
return err
}
o.pf.Trees[c.Hash()] = c
hash = c.Hash()
case rawBlob:
c, err := ParseBlob(bytes)
if err != nil {
return err
}
o.pf.Blobs[c.Hash()] = c
hash = c.Hash()
if o.pr.contentCallback != nil {
o.pr.contentCallback(hash, bytes)
}
}
o.pr.objects[hash] = packfileObject{bytes: bytes, typ: o.typ}
o.hash = hash
return nil
}
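// inflate decompresses the zlib stream of the current object, enforcing
// MaxObjectSize and verifying that the inflated length matches the size
// announced in the object header.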
func (o *objectReader) inflate() ([]byte, error) {
zr, err := zlib.NewReader(o.pr.r)
	if err != nil {
		if err == zlib.ErrHeader {
			// Pass the sentinel error through unchanged so callers can match it.
			return nil, zlib.ErrHeader
		}
		return nil, NewError("error opening packfile's object zlib: %v", err)
	}
defer zr.Close()
	if o.size > o.maxSize {
		return nil, NewError("the object size %d exceeds the allowed limit: %d",
			o.size, o.maxSize)
	}
	var buf bytes.Buffer
	if _, err := io.Copy(&buf, zr); err != nil { // also: io.CopyN(&buf, zr, int64(o.size))
		return nil, NewError("error inflating packfile object: %v", err)
	}
	var bufLen = buf.Len()
	if bufLen != int(o.size) {
		return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, bufLen)
	}
return buf.Bytes(), nil
}
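// Read and ReadByte delegate to the packfile's tracking reader so every
// byte consumed is reflected in the reader's position.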
func (o *objectReader) Read(p []byte) (int, error) {
return o.pr.r.Read(p)
}
func (o *objectReader) ReadByte() (byte, error) {
return o.pr.r.ReadByte()
}
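// ReaderError is the error returned by PackfileReader when the packfile
// is malformed or exceeds the configured limits.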
type ReaderError struct {
Msg string // description of error
}
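// NewError returns a ReaderError with a message built from format and args.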
func NewError(format string, args ...interface{}) error {
return &ReaderError{Msg: fmt.Sprintf(format, args...)}
}
func (e *ReaderError) Error() string { return e.Msg }