package packfile

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"

	"github.com/klauspost/compress/zlib"
)

type Format int

const (
	DefaultMaxObjectsLimit = 1 << 20
	DefaultMaxObjectSize   = 1 << 32 // 4GB

	VersionSupported = 2

	UnknownFormat  Format = 0
	OFSDeltaFormat Format = 1
	REFDeltaFormat Format = 2
)

type PackfileReader struct {
	// MaxObjectsLimit is the maximum number of objects allowed in a packfile;
	// if a packfile exceeds this number an error is returned. The default
	// value, DefaultMaxObjectsLimit, is usually more than enough to work with
	// any repository, but with extremely big repositories, where the number of
	// objects is larger, memory can be exhausted.
	MaxObjectsLimit uint32

	// MaxObjectSize is the maximum object size in bytes; reading an object
	// bigger than this causes an error. The default value is defined by
	// DefaultMaxObjectSize.
	MaxObjectSize uint64

	// Format specifies whether we are using ref-deltas or ofs-deltas; choosing
	// the correct format optimizes memory usage.
	// https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/Documentation/technical/protocol-capabilities.txt#L154
	Format Format

	r       *trackingReader
	objects map[Hash]*RAWObject
	offsets map[int]*RAWObject
}

func NewPackfileReader(r io.Reader, fn ContentCallback) (*PackfileReader, error) {
	return &PackfileReader{
		MaxObjectsLimit: DefaultMaxObjectsLimit,
		MaxObjectSize:   DefaultMaxObjectSize,
		r:               &trackingReader{r: r},
		objects:         make(map[Hash]*RAWObject),
		offsets:         make(map[int]*RAWObject),
	}, nil
}

func (pr *PackfileReader) Read() (chan *RAWObject, error) {
	if err := pr.validateHeader(); err != nil {
		if err == io.EOF {
			// This is an empty repo. It's OK.
			return nil, nil
		}

		return nil, err
	}

	version, err := pr.readInt32()
	if err != nil {
		return nil, err
	}

	if version > VersionSupported {
		return nil, NewError("unsupported packfile version %d", version)
	}

	count, err := pr.readInt32()
	if err != nil {
		return nil, err
	}

	if count > pr.MaxObjectsLimit {
		return nil, NewError("too many objects %d, limit is %d", count, pr.MaxObjectsLimit)
	}

	ch := make(chan *RAWObject, 1)
	go pr.readObjects(ch, count)
	// packfile.Size = int64(pr.r.Pos())

	return ch, nil
}

func (pr *PackfileReader) validateHeader() error {
	var header = make([]byte, 4)
	if _, err := pr.r.Read(header); err != nil {
		return err
	}

	if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) {
		return NewError("Pack file does not start with 'PACK'")
	}

	return nil
}

func (pr *PackfileReader) readInt32() (uint32, error) {
	var value uint32
	if err := binary.Read(pr.r, binary.BigEndian, &value); err != nil {
		return 0, err
	}

	return value, nil
}
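// readAllObjects is an illustrative usage sketch, not part of the original
// API: it shows how a caller might drain the channel returned by Read to
// collect every decoded object, keyed by hash. The function name and error
// handling are assumptions; it also assumes a nil ContentCallback is
// acceptable. Note that errors hit while streaming objects are not surfaced
// here: readObjects simply closes the channel when it stops.
func readAllObjects(r io.Reader) (map[Hash]*RAWObject, error) {
	pr, err := NewPackfileReader(r, nil)
	if err != nil {
		return nil, err
	}

	ch, err := pr.Read()
	if err != nil {
		return nil, err
	}

	if ch == nil {
		// Empty repository: Read returns a nil channel and no error.
		return map[Hash]*RAWObject{}, nil
	}

	objects := make(map[Hash]*RAWObject)
	for obj := range ch {
		objects[obj.Hash] = obj
	}

	return objects, nil
}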
func (pr *PackfileReader) readObjects(ch chan *RAWObject, count uint32) error {
	// This code has 50-80 µs of overhead per object not counting zlib inflation.
	// Together with zlib inflation, it's 400-410 µs for small objects.
	// That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB,
	// of which 12-20 % is _not_ zlib inflation (ie. is our code).
	defer close(ch)

	for i := 0; i < int(count); i++ {
		var pos = pr.Pos()
		obj, err := pr.newRAWObject()
		if err != nil && err != io.EOF {
			fmt.Println(err)
			return err
		}

		if obj == nil {
			// newRAWObject returns a nil object only alongside io.EOF: the
			// packfile ended before `count` objects were read.
			break
		}

		if pr.Format == UnknownFormat || pr.Format == OFSDeltaFormat {
			pr.offsets[pos] = obj
		}

		if pr.Format == UnknownFormat || pr.Format == REFDeltaFormat {
			pr.objects[obj.Hash] = obj
		}

		ch <- obj
		if err == io.EOF {
			break
		}
	}

	return nil
}

func (pr *PackfileReader) Pos() int { return pr.r.Pos() }

func (pr *PackfileReader) newRAWObject() (*RAWObject, error) {
	raw := &RAWObject{}
	steps := 0

	var buf [1]byte
	if _, err := pr.r.Read(buf[:]); err != nil {
		return nil, err
	}

	raw.Type = ObjectType((buf[0] >> 4) & 7)
	raw.Size = uint64(buf[0] & 15)
	steps++ // byte we just read to get raw.Type and raw.Size

	var shift uint = 4
	for buf[0]&0x80 == 0x80 {
		if _, err := pr.r.Read(buf[:]); err != nil {
			return nil, err
		}

		raw.Size += uint64(buf[0]&0x7f) << shift
		steps++ // byte we just read to update raw.Size
		shift += 7
	}

	var err error
	switch raw.Type {
	case REFDeltaObject:
		err = pr.readREFDelta(raw)
	case OFSDeltaObject:
		err = pr.readOFSDelta(raw, steps)
	case CommitObject, TreeObject, BlobObject, TagObject:
		err = pr.readObject(raw)
	default:
		err = NewError("Invalid git object tag %q", raw.Type)
	}

	return raw, err
}

func (pr *PackfileReader) readREFDelta(raw *RAWObject) error {
	var ref Hash
	if _, err := pr.r.Read(ref[:]); err != nil {
		return err
	}

	buf, err := pr.inflate(raw.Size)
	if err != nil {
		return err
	}

	referenced, ok := pr.objects[ref]
	if !ok {
		fmt.Println("not found", ref)
	} else {
		patched := PatchDelta(referenced.Bytes, buf)
		if patched == nil {
			return NewError("error while patching %x", ref)
		}

		raw.Type = referenced.Type
		raw.Bytes = patched
		raw.Size = uint64(len(patched))
		raw.Hash = ComputeHash(raw.Type, raw.Bytes)
	}

	return nil
}

func (pr *PackfileReader) readOFSDelta(raw *RAWObject, steps int) error {
	var pos = pr.Pos()

	// read the negative offset to the base object
	offset, err := decodeOffset(pr.r, steps)
	if err != nil {
		return err
	}

	buf, err := pr.inflate(raw.Size)
	if err != nil {
		return err
	}

	ref, ok := pr.offsets[pos+offset]
	if !ok {
		return NewError("can't find a pack entry at %d", pos+offset)
	}

	patched := PatchDelta(ref.Bytes, buf)
	if patched == nil {
		return NewError("error while patching %q", ref)
	}

	raw.Type = ref.Type
	raw.Bytes = patched
	raw.Size = uint64(len(patched))
	raw.Hash = ComputeHash(raw.Type, raw.Bytes)

	return nil
}

func (pr *PackfileReader) readObject(raw *RAWObject) error {
	buf, err := pr.inflate(raw.Size)
	if err != nil {
		return err
	}

	raw.Bytes = buf
	raw.Hash = ComputeHash(raw.Type, raw.Bytes)

	return nil
}

func (pr *PackfileReader) inflate(size uint64) ([]byte, error) {
	zr, err := zlib.NewReader(pr.r)
	if err != nil {
		if err == zlib.ErrHeader {
			return nil, zlib.ErrHeader
		}

		return nil, NewError("error opening packfile's object zlib: %v", err)
	}
	defer zr.Close()

	if size > pr.MaxObjectSize {
		return nil, NewError("the object size %d exceeds the allowed limit: %d",
			size, pr.MaxObjectSize)
	}

	var buf bytes.Buffer
	// alternative: io.CopyN(&buf, zr, int64(size))
	if _, err := io.Copy(&buf, zr); err != nil {
		return nil, err
	}

	if buf.Len() != int(size) {
		return nil, NewError(
			"inflated size mismatch, expected %d, got %d", size, buf.Len())
	}

	return buf.Bytes(), nil
}

type ReaderError struct {
	Msg string // description of error
}

func NewError(format string, args ...interface{}) error {
	return &ReaderError{Msg: fmt.Sprintf(format, args...)}
}

func (e *ReaderError) Error() string {
	return e.Msg
}
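// encodeObjectHeader is an illustrative sketch, not part of the original
// package: it writes the variable-length type/size header that newRAWObject
// parses above. Bits 4-6 of the first byte carry the object type, the low
// four bits carry the least-significant size bits, and each following byte
// contributes seven more size bits as long as the previous byte's MSB is set.
// For example, any size of 16 or more needs at least two header bytes.
func encodeObjectHeader(t ObjectType, size uint64) []byte {
	b := byte(t&7)<<4 | byte(size&15)
	size >>= 4

	var out []byte
	for size > 0 {
		out = append(out, b|0x80) // more size bits follow, so set the MSB
		b = byte(size & 0x7f)
		size >>= 7
	}

	return append(out, b)
}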