aboutsummaryrefslogtreecommitdiffstats
path: root/plumbing/format/packfile/parser.go
diff options
context:
space:
mode:
authorMáximo Cuadros <mcuadros@gmail.com>2018-08-14 09:57:46 +0200
committerGitHub <noreply@github.com>2018-08-14 09:57:46 +0200
commita28c2ce44695f13ddf28748958f236afd8e0b544 (patch)
tree107dd441cd96b44b4f3994d26faf5f0bfae933fc /plumbing/format/packfile/parser.go
parentc3740924da0d1929cb523c85ae9da3b456b901ea (diff)
parent8d75d239e93474e4287870e4e5143da14e2c360d (diff)
downloadgo-git-a28c2ce44695f13ddf28748958f236afd8e0b544.tar.gz
Merge pull request #906 from src-d/perf/packfile-reads
Improve packfile reading performance
Diffstat (limited to 'plumbing/format/packfile/parser.go')
-rw-r--r--plumbing/format/packfile/parser.go505
1 files changed, 505 insertions, 0 deletions
diff --git a/plumbing/format/packfile/parser.go b/plumbing/format/packfile/parser.go
new file mode 100644
index 0000000..581c334
--- /dev/null
+++ b/plumbing/format/packfile/parser.go
@@ -0,0 +1,505 @@
+package packfile
+
+import (
+ "bytes"
+ "errors"
+ "io"
+
+ "gopkg.in/src-d/go-git.v4/plumbing"
+ "gopkg.in/src-d/go-git.v4/plumbing/cache"
+ "gopkg.in/src-d/go-git.v4/plumbing/storer"
+)
+
+var (
+ // ErrReferenceDeltaNotFound is returned when the reference delta is not
+ // found.
+ ErrReferenceDeltaNotFound = errors.New("reference delta not found")
+
+ // ErrNotSeekableSource is returned when the source for the parser is not
+ // seekable and a storage was not provided, so it can't be parsed.
+ ErrNotSeekableSource = errors.New("parser source is not seekable and storage was not provided")
+
+ // ErrDeltaNotCached is returned when the delta could not be found in cache.
+ ErrDeltaNotCached = errors.New("delta could not be found in cache")
+)
+
+// Observer interface is implemented by index encoders.
+type Observer interface {
+ // OnHeader is called when a new packfile is opened.
+ OnHeader(count uint32) error
+ // OnInflatedObjectHeader is called for each object header read.
+ OnInflatedObjectHeader(t plumbing.ObjectType, objSize int64, pos int64) error
+ // OnInflatedObjectContent is called for each decoded object.
+ OnInflatedObjectContent(h plumbing.Hash, pos int64, crc uint32, content []byte) error
+ // OnFooter is called when decoding is done.
+ OnFooter(h plumbing.Hash) error
+}
+
+// Parser decodes a packfile and calls any observer associated to it. Is used
+// to generate indexes.
+type Parser struct {
+ storage storer.EncodedObjectStorer
+ scanner *Scanner
+ count uint32
+ oi []*objectInfo
+ oiByHash map[plumbing.Hash]*objectInfo
+ oiByOffset map[int64]*objectInfo
+ hashOffset map[plumbing.Hash]int64
+ pendingRefDeltas map[plumbing.Hash][]*objectInfo
+ checksum plumbing.Hash
+
+ cache *cache.ObjectLRU
+ // delta content by offset, only used if source is not seekable
+ deltas map[int64][]byte
+
+ ob []Observer
+}
+
+// NewParser creates a new Parser. The Scanner source must be seekable.
+// If it's not, NewParserWithStorage should be used instead.
+func NewParser(scanner *Scanner, ob ...Observer) (*Parser, error) {
+ return NewParserWithStorage(scanner, nil, ob...)
+}
+
+// NewParserWithStorage creates a new Parser. The scanner source must either
+// be seekable or a storage must be provided.
+func NewParserWithStorage(
+ scanner *Scanner,
+ storage storer.EncodedObjectStorer,
+ ob ...Observer,
+) (*Parser, error) {
+ if !scanner.IsSeekable && storage == nil {
+ return nil, ErrNotSeekableSource
+ }
+
+ var deltas map[int64][]byte
+ if !scanner.IsSeekable {
+ deltas = make(map[int64][]byte)
+ }
+
+ return &Parser{
+ storage: storage,
+ scanner: scanner,
+ ob: ob,
+ count: 0,
+ cache: cache.NewObjectLRUDefault(),
+ pendingRefDeltas: make(map[plumbing.Hash][]*objectInfo),
+ deltas: deltas,
+ }, nil
+}
+
+func (p *Parser) forEachObserver(f func(o Observer) error) error {
+ for _, o := range p.ob {
+ if err := f(o); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (p *Parser) onHeader(count uint32) error {
+ return p.forEachObserver(func(o Observer) error {
+ return o.OnHeader(count)
+ })
+}
+
+func (p *Parser) onInflatedObjectHeader(
+ t plumbing.ObjectType,
+ objSize int64,
+ pos int64,
+) error {
+ return p.forEachObserver(func(o Observer) error {
+ return o.OnInflatedObjectHeader(t, objSize, pos)
+ })
+}
+
+func (p *Parser) onInflatedObjectContent(
+ h plumbing.Hash,
+ pos int64,
+ crc uint32,
+ content []byte,
+) error {
+ return p.forEachObserver(func(o Observer) error {
+ return o.OnInflatedObjectContent(h, pos, crc, content)
+ })
+}
+
+func (p *Parser) onFooter(h plumbing.Hash) error {
+ return p.forEachObserver(func(o Observer) error {
+ return o.OnFooter(h)
+ })
+}
+
+// Parse start decoding phase of the packfile.
+func (p *Parser) Parse() (plumbing.Hash, error) {
+ if err := p.init(); err != nil {
+ return plumbing.ZeroHash, err
+ }
+
+ if err := p.indexObjects(); err != nil {
+ return plumbing.ZeroHash, err
+ }
+
+ var err error
+ p.checksum, err = p.scanner.Checksum()
+ if err != nil && err != io.EOF {
+ return plumbing.ZeroHash, err
+ }
+
+ if err := p.resolveDeltas(); err != nil {
+ return plumbing.ZeroHash, err
+ }
+
+ if len(p.pendingRefDeltas) > 0 {
+ return plumbing.ZeroHash, ErrReferenceDeltaNotFound
+ }
+
+ if err := p.onFooter(p.checksum); err != nil {
+ return plumbing.ZeroHash, err
+ }
+
+ return p.checksum, nil
+}
+
+func (p *Parser) init() error {
+ _, c, err := p.scanner.Header()
+ if err != nil {
+ return err
+ }
+
+ if err := p.onHeader(c); err != nil {
+ return err
+ }
+
+ p.count = c
+ p.oiByHash = make(map[plumbing.Hash]*objectInfo, p.count)
+ p.oiByOffset = make(map[int64]*objectInfo, p.count)
+ p.oi = make([]*objectInfo, p.count)
+
+ return nil
+}
+
+func (p *Parser) indexObjects() error {
+ buf := new(bytes.Buffer)
+
+ for i := uint32(0); i < p.count; i++ {
+ buf.Reset()
+
+ oh, err := p.scanner.NextObjectHeader()
+ if err != nil {
+ return err
+ }
+
+ delta := false
+ var ota *objectInfo
+ switch t := oh.Type; t {
+ case plumbing.OFSDeltaObject:
+ delta = true
+
+ parent, ok := p.oiByOffset[oh.OffsetReference]
+ if !ok {
+ return plumbing.ErrObjectNotFound
+ }
+
+ ota = newDeltaObject(oh.Offset, oh.Length, t, parent)
+ parent.Children = append(parent.Children, ota)
+ case plumbing.REFDeltaObject:
+ delta = true
+
+ parent, ok := p.oiByHash[oh.Reference]
+ if ok {
+ ota = newDeltaObject(oh.Offset, oh.Length, t, parent)
+ parent.Children = append(parent.Children, ota)
+ } else {
+ ota = newBaseObject(oh.Offset, oh.Length, t)
+ p.pendingRefDeltas[oh.Reference] = append(
+ p.pendingRefDeltas[oh.Reference],
+ ota,
+ )
+ }
+ default:
+ ota = newBaseObject(oh.Offset, oh.Length, t)
+ }
+
+ size, crc, err := p.scanner.NextObject(buf)
+ if err != nil {
+ return err
+ }
+
+ ota.Crc32 = crc
+ ota.PackSize = size
+ ota.Length = oh.Length
+
+ data := buf.Bytes()
+ if !delta {
+ if _, err := ota.Write(data); err != nil {
+ return err
+ }
+ ota.SHA1 = ota.Sum()
+ p.oiByHash[ota.SHA1] = ota
+ }
+
+ if p.storage != nil && !delta {
+ obj := new(plumbing.MemoryObject)
+ obj.SetSize(oh.Length)
+ obj.SetType(oh.Type)
+ if _, err := obj.Write(data); err != nil {
+ return err
+ }
+
+ if _, err := p.storage.SetEncodedObject(obj); err != nil {
+ return err
+ }
+ }
+
+ if delta && !p.scanner.IsSeekable {
+ p.deltas[oh.Offset] = make([]byte, len(data))
+ copy(p.deltas[oh.Offset], data)
+ }
+
+ p.oiByOffset[oh.Offset] = ota
+ p.oi[i] = ota
+ }
+
+ return nil
+}
+
+func (p *Parser) resolveDeltas() error {
+ for _, obj := range p.oi {
+ content, err := p.get(obj)
+ if err != nil {
+ return err
+ }
+
+ if err := p.onInflatedObjectHeader(obj.Type, obj.Length, obj.Offset); err != nil {
+ return err
+ }
+
+ if err := p.onInflatedObjectContent(obj.SHA1, obj.Offset, obj.Crc32, content); err != nil {
+ return err
+ }
+
+ if !obj.IsDelta() && len(obj.Children) > 0 {
+ for _, child := range obj.Children {
+ if _, err := p.resolveObject(child, content); err != nil {
+ return err
+ }
+ }
+
+ // Remove the delta from the cache.
+ if obj.DiskType.IsDelta() && !p.scanner.IsSeekable {
+ delete(p.deltas, obj.Offset)
+ }
+ }
+
+ obj.Content = nil
+ }
+
+ return nil
+}
+
+func (p *Parser) get(o *objectInfo) ([]byte, error) {
+ if len(o.Content) > 0 {
+ return o.Content, nil
+ }
+
+ e, ok := p.cache.Get(o.SHA1)
+ // If it's not on the cache and is not a delta we can try to find it in the
+ // storage, if there's one.
+ if !ok && p.storage != nil && !o.Type.IsDelta() {
+ var err error
+ e, err = p.storage.EncodedObject(plumbing.AnyObject, o.SHA1)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ if e != nil {
+ r, err := e.Reader()
+ if err != nil {
+ return nil, err
+ }
+
+ buf := make([]byte, e.Size())
+ if _, err = r.Read(buf); err != nil {
+ return nil, err
+ }
+
+ return buf, nil
+ }
+
+ var data []byte
+ if o.DiskType.IsDelta() {
+ base, err := p.get(o.Parent)
+ if err != nil {
+ return nil, err
+ }
+
+ data, err = p.resolveObject(o, base)
+ if err != nil {
+ return nil, err
+ }
+ } else {
+ var err error
+ data, err = p.readData(o)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ if len(o.Children) > 0 {
+ m := &plumbing.MemoryObject{}
+ m.Write(data)
+ m.SetType(o.Type)
+ m.SetSize(o.Size())
+ p.cache.Put(m)
+ }
+
+ return data, nil
+}
+
+func (p *Parser) resolveObject(
+ o *objectInfo,
+ base []byte,
+) ([]byte, error) {
+ if !o.DiskType.IsDelta() {
+ return nil, nil
+ }
+
+ data, err := p.readData(o)
+ if err != nil {
+ return nil, err
+ }
+
+ data, err = applyPatchBase(o, data, base)
+ if err != nil {
+ return nil, err
+ }
+
+ if pending, ok := p.pendingRefDeltas[o.SHA1]; ok {
+ for _, po := range pending {
+ po.Parent = o
+ o.Children = append(o.Children, po)
+ }
+ delete(p.pendingRefDeltas, o.SHA1)
+ }
+
+ if p.storage != nil {
+ obj := new(plumbing.MemoryObject)
+ obj.SetSize(o.Size())
+ obj.SetType(o.Type)
+ if _, err := obj.Write(data); err != nil {
+ return nil, err
+ }
+
+ if _, err := p.storage.SetEncodedObject(obj); err != nil {
+ return nil, err
+ }
+ }
+
+ return data, nil
+}
+
+func (p *Parser) readData(o *objectInfo) ([]byte, error) {
+ if !p.scanner.IsSeekable && o.DiskType.IsDelta() {
+ data, ok := p.deltas[o.Offset]
+ if !ok {
+ return nil, ErrDeltaNotCached
+ }
+
+ return data, nil
+ }
+
+ // TODO: skip header. Header size can be calculated with the offset of the
+ // next offset in the first pass.
+ if _, err := p.scanner.SeekFromStart(o.Offset); err != nil {
+ return nil, err
+ }
+
+ if _, err := p.scanner.NextObjectHeader(); err != nil {
+ return nil, err
+ }
+
+ buf := new(bytes.Buffer)
+ if _, _, err := p.scanner.NextObject(buf); err != nil {
+ return nil, err
+ }
+
+ return buf.Bytes(), nil
+}
+
+func applyPatchBase(ota *objectInfo, data, base []byte) ([]byte, error) {
+ patched, err := PatchDelta(base, data)
+ if err != nil {
+ return nil, err
+ }
+
+ ota.Type = ota.Parent.Type
+ ota.Hasher = plumbing.NewHasher(ota.Type, int64(len(patched)))
+ if _, err := ota.Write(patched); err != nil {
+ return nil, err
+ }
+ ota.SHA1 = ota.Sum()
+ ota.Length = int64(len(patched))
+
+ return patched, nil
+}
+
+type objectInfo struct {
+ plumbing.Hasher
+
+ Offset int64
+ Length int64
+ HeaderLength int64
+ PackSize int64
+ Type plumbing.ObjectType
+ DiskType plumbing.ObjectType
+
+ Crc32 uint32
+
+ Parent *objectInfo
+ Children []*objectInfo
+ SHA1 plumbing.Hash
+
+ Content []byte
+}
+
+func newBaseObject(offset, length int64, t plumbing.ObjectType) *objectInfo {
+ return newDeltaObject(offset, length, t, nil)
+}
+
+func newDeltaObject(
+ offset, length int64,
+ t plumbing.ObjectType,
+ parent *objectInfo,
+) *objectInfo {
+ children := make([]*objectInfo, 0)
+
+ obj := &objectInfo{
+ Hasher: plumbing.NewHasher(t, length),
+ Offset: offset,
+ Length: length,
+ PackSize: 0,
+ Type: t,
+ DiskType: t,
+ Crc32: 0,
+ Parent: parent,
+ Children: children,
+ }
+
+ return obj
+}
+
+func (o *objectInfo) Write(b []byte) (int, error) {
+ o.Content = make([]byte, len(b))
+ copy(o.Content, b)
+ return o.Hasher.Write(b)
+}
+
+func (o *objectInfo) IsDelta() bool {
+ return o.Type.IsDelta()
+}
+
+func (o *objectInfo) Size() int64 {
+ return o.Length
+}