From 8f52c5099e7fe4a2519920a7bbf5a9bb52ff9cec Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 28 Nov 2018 01:29:41 +0100 Subject: plumbing: format/packfile, performance optimizations for reading large commit histories (#963) Signed-off-by: Filip Navara --- plumbing/format/packfile/fsobject.go | 2 +- plumbing/format/packfile/packfile.go | 51 ++++++++++++++------------- plumbing/format/packfile/parser.go | 6 +--- plumbing/format/packfile/scanner.go | 46 ++++++++++++++++++++++--- plumbing/format/packfile/scanner_test.go | 17 +++++++++ storage/filesystem/object.go | 59 +++++++++++++++++++------------- storage/filesystem/storage.go | 6 +--- 7 files changed, 126 insertions(+), 61 deletions(-) diff --git a/plumbing/format/packfile/fsobject.go b/plumbing/format/packfile/fsobject.go index 330cb73..a268bce 100644 --- a/plumbing/format/packfile/fsobject.go +++ b/plumbing/format/packfile/fsobject.go @@ -48,7 +48,7 @@ func NewFSObject( // Reader implements the plumbing.EncodedObject interface. func (o *FSObject) Reader() (io.ReadCloser, error) { obj, ok := o.cache.Get(o.hash) - if ok { + if ok && obj != o { reader, err := obj.Reader() if err != nil { return nil, err diff --git a/plumbing/format/packfile/packfile.go b/plumbing/format/packfile/packfile.go index 2166e0a..1e7ef26 100644 --- a/plumbing/format/packfile/packfile.go +++ b/plumbing/format/packfile/packfile.go @@ -21,6 +21,16 @@ var ( ErrZLib = NewError("zlib reading error") ) +// When reading small objects from packfile it is beneficial to do so at +// once to exploit the buffered I/O. In many cases the objects are so small +// that they were already loaded to memory when the object header was +// loaded from the packfile. Wrapping in FSObject would cause this buffered +// data to be thrown away and then re-read later, with the additional +// seeking causing reloads from disk. Objects smaller than this threshold +// are now always read into memory and stored in cache instead of being +// wrapped in FSObject. +const smallObjectThreshold = 16 * 1024 + // Packfile allows retrieving information from inside a packfile. type Packfile struct { idxfile.Index @@ -79,15 +89,7 @@ func (p *Packfile) GetByOffset(o int64) (plumbing.EncodedObject, error) { } } - if _, err := p.s.SeekFromStart(o); err != nil { - if err == io.EOF || isInvalid(err) { - return nil, plumbing.ErrObjectNotFound - } - - return nil, err - } - - return p.nextObject() + return p.objectAtOffset(o) } // GetSizeByOffset retrieves the size of the encoded object from the @@ -108,6 +110,12 @@ func (p *Packfile) GetSizeByOffset(o int64) (size int64, err error) { return h.Length, nil } +func (p *Packfile) objectHeaderAtOffset(offset int64) (*ObjectHeader, error) { + h, err := p.s.SeekObjectHeader(offset) + p.s.pendingObject = nil + return h, err +} + func (p *Packfile) nextObjectHeader() (*ObjectHeader, error) { h, err := p.s.NextObjectHeader() p.s.pendingObject = nil @@ -154,11 +162,7 @@ func (p *Packfile) getObjectType(h *ObjectHeader) (typ plumbing.ObjectType, err if baseType, ok := p.offsetToType[offset]; ok { typ = baseType } else { - if _, err = p.s.SeekFromStart(offset); err != nil { - return - } - - h, err = p.nextObjectHeader() + h, err = p.objectHeaderAtOffset(offset) if err != nil { return } @@ -175,8 +179,8 @@ func (p *Packfile) getObjectType(h *ObjectHeader) (typ plumbing.ObjectType, err return } -func (p *Packfile) nextObject() (plumbing.EncodedObject, error) { - h, err := p.nextObjectHeader() +func (p *Packfile) objectAtOffset(offset int64) (plumbing.EncodedObject, error) { + h, err := p.objectHeaderAtOffset(offset) if err != nil { if err == io.EOF || isInvalid(err) { return nil, plumbing.ErrObjectNotFound @@ -190,6 +194,13 @@ func (p *Packfile) nextObject() (plumbing.EncodedObject, error) { return p.getNextObject(h) } + // If the object is not a delta and it's small enough then read it + // completely into memory now since it is already read from disk + // into buffer anyway. + if h.Length <= smallObjectThreshold && h.Type != plumbing.OFSDeltaObject && h.Type != plumbing.REFDeltaObject { + return p.getNextObject(h) + } + hash, err := p.FindHash(h.Offset) if err != nil { return nil, err @@ -233,11 +244,7 @@ func (p *Packfile) getObjectContent(offset int64) (io.ReadCloser, error) { } } - if _, err := p.s.SeekFromStart(offset); err != nil { - return nil, err - } - - h, err := p.nextObjectHeader() + h, err := p.objectHeaderAtOffset(offset) if err != nil { return nil, err } @@ -329,8 +336,6 @@ func (p *Packfile) fillOFSDeltaObjectContent(obj plumbing.EncodedObject, offset if err != nil { return err } - - p.cachePut(base) } obj.SetType(base.Type()) diff --git a/plumbing/format/packfile/parser.go b/plumbing/format/packfile/parser.go index 5a62d63..71cbba9 100644 --- a/plumbing/format/packfile/parser.go +++ b/plumbing/format/packfile/parser.go @@ -398,11 +398,7 @@ func (p *Parser) readData(o *objectInfo) ([]byte, error) { return data, nil } - if _, err := p.scanner.SeekFromStart(o.Offset); err != nil { - return nil, err - } - - if _, err := p.scanner.NextObjectHeader(); err != nil { + if _, err := p.scanner.SeekObjectHeader(o.Offset); err != nil { return nil, err } diff --git a/plumbing/format/packfile/scanner.go b/plumbing/format/packfile/scanner.go index 6fc183b..614b0d1 100644 --- a/plumbing/format/packfile/scanner.go +++ b/plumbing/format/packfile/scanner.go @@ -138,14 +138,52 @@ func (s *Scanner) readCount() (uint32, error) { return binary.ReadUint32(s.r) } +// SeekObjectHeader seeks to specified offset and returns the ObjectHeader +// for the next object in the reader +func (s *Scanner) SeekObjectHeader(offset int64) (*ObjectHeader, error) { + // if seeking we assume that you are not interested in the header + if s.version == 0 { + s.version = VersionSupported + } + + if _, err := s.r.Seek(offset, io.SeekStart); err != nil { + return nil, err + } + + h, err := s.nextObjectHeader() + if err != nil { + return nil, err + } + + h.Offset = offset + return h, nil +} + // NextObjectHeader returns the ObjectHeader for the next object in the reader func (s *Scanner) NextObjectHeader() (*ObjectHeader, error) { - defer s.Flush() - if err := s.doPending(); err != nil { return nil, err } + offset, err := s.r.Seek(0, io.SeekCurrent) + if err != nil { + return nil, err + } + + h, err := s.nextObjectHeader() + if err != nil { + return nil, err + } + + h.Offset = offset + return h, nil +} + +// nextObjectHeader returns the ObjectHeader for the next object in the reader +// without the Offset field +func (s *Scanner) nextObjectHeader() (*ObjectHeader, error) { + defer s.Flush() + s.crc.Reset() h := &ObjectHeader{} @@ -308,7 +346,7 @@ var byteSlicePool = sync.Pool{ // SeekFromStart sets a new offset from start, returns the old position before // the change. func (s *Scanner) SeekFromStart(offset int64) (previous int64, err error) { - // if seeking we assume that you are not interested on the header + // if seeking we assume that you are not interested in the header if s.version == 0 { s.version = VersionSupported } @@ -385,7 +423,7 @@ type bufferedSeeker struct { } func (r *bufferedSeeker) Seek(offset int64, whence int) (int64, error) { - if whence == io.SeekCurrent { + if whence == io.SeekCurrent && offset == 0 { current, err := r.r.Seek(offset, whence) if err != nil { return current, err diff --git a/plumbing/format/packfile/scanner_test.go b/plumbing/format/packfile/scanner_test.go index 644d0eb..091b457 100644 --- a/plumbing/format/packfile/scanner_test.go +++ b/plumbing/format/packfile/scanner_test.go @@ -118,6 +118,23 @@ func (s *ScannerSuite) TestNextObjectHeaderWithOutReadObjectNonSeekable(c *C) { c.Assert(n, Equals, f.PackfileHash) } +func (s *ScannerSuite) TestSeekObjectHeader(c *C) { + r := fixtures.Basic().One().Packfile() + p := NewScanner(r) + + h, err := p.SeekObjectHeader(expectedHeadersOFS[4].Offset) + c.Assert(err, IsNil) + c.Assert(h, DeepEquals, &expectedHeadersOFS[4]) +} + +func (s *ScannerSuite) TestSeekObjectHeaderNonSeekable(c *C) { + r := io.MultiReader(fixtures.Basic().One().Packfile()) + p := NewScanner(r) + + _, err := p.SeekObjectHeader(expectedHeadersOFS[4].Offset) + c.Assert(err, Equals, ErrSeekNotSupported) +} + var expectedHeadersOFS = []ObjectHeader{ {Type: plumbing.CommitObject, Offset: 12, Length: 254}, {Type: plumbing.OFSDeltaObject, Offset: 186, Length: 93, OffsetReference: 12}, diff --git a/storage/filesystem/object.go b/storage/filesystem/object.go index 57dcbb4..85bb798 100644 --- a/storage/filesystem/object.go +++ b/storage/filesystem/object.go @@ -20,24 +20,25 @@ import ( type ObjectStorage struct { options Options - // deltaBaseCache is an object cache uses to cache delta's bases when - deltaBaseCache cache.Object + // objectCache is an object cache uses to cache delta's bases and also recently + // loaded loose objects + objectCache cache.Object dir *dotgit.DotGit index map[plumbing.Hash]idxfile.Index } // NewObjectStorage creates a new ObjectStorage with the given .git directory and cache. -func NewObjectStorage(dir *dotgit.DotGit, cache cache.Object) *ObjectStorage { - return NewObjectStorageWithOptions(dir, cache, Options{}) +func NewObjectStorage(dir *dotgit.DotGit, objectCache cache.Object) *ObjectStorage { + return NewObjectStorageWithOptions(dir, objectCache, Options{}) } // NewObjectStorageWithOptions creates a new ObjectStorage with the given .git directory, cache and extra options -func NewObjectStorageWithOptions(dir *dotgit.DotGit, cache cache.Object, ops Options) *ObjectStorage { +func NewObjectStorageWithOptions(dir *dotgit.DotGit, objectCache cache.Object, ops Options) *ObjectStorage { return &ObjectStorage{ - options: ops, - deltaBaseCache: cache, - dir: dir, + options: ops, + objectCache: objectCache, + dir: dir, } } @@ -206,7 +207,7 @@ func (s *ObjectStorage) encodedObjectSizeFromPackfile(h plumbing.Hash) ( idx := s.index[pack] hash, err := idx.FindHash(offset) if err == nil { - obj, ok := s.deltaBaseCache.Get(hash) + obj, ok := s.objectCache.Get(hash) if ok { return obj.Size(), nil } @@ -215,8 +216,8 @@ func (s *ObjectStorage) encodedObjectSizeFromPackfile(h plumbing.Hash) ( } var p *packfile.Packfile - if s.deltaBaseCache != nil { - p = packfile.NewPackfileWithCache(idx, s.dir.Fs(), f, s.deltaBaseCache) + if s.objectCache != nil { + p = packfile.NewPackfileWithCache(idx, s.dir.Fs(), f, s.objectCache) } else { p = packfile.NewPackfile(idx, s.dir.Fs(), f) } @@ -241,9 +242,19 @@ func (s *ObjectStorage) EncodedObjectSize(h plumbing.Hash) ( // EncodedObject returns the object with the given hash, by searching for it in // the packfile and the git object directories. func (s *ObjectStorage) EncodedObject(t plumbing.ObjectType, h plumbing.Hash) (plumbing.EncodedObject, error) { - obj, err := s.getFromUnpacked(h) - if err == plumbing.ErrObjectNotFound { + var obj plumbing.EncodedObject + var err error + + if s.index != nil { obj, err = s.getFromPackfile(h, false) + if err == plumbing.ErrObjectNotFound { + obj, err = s.getFromUnpacked(h) + } + } else { + obj, err = s.getFromUnpacked(h) + if err == plumbing.ErrObjectNotFound { + obj, err = s.getFromPackfile(h, false) + } } // If the error is still object not found, check if it's a shared object @@ -254,7 +265,7 @@ func (s *ObjectStorage) EncodedObject(t plumbing.ObjectType, h plumbing.Hash) (p // Create a new object storage with the DotGit(s) and check for the // required hash object. Skip when not found. for _, dg := range dotgits { - o := NewObjectStorage(dg, s.deltaBaseCache) + o := NewObjectStorage(dg, s.objectCache) enobj, enerr := o.EncodedObject(t, h) if enerr != nil { continue @@ -296,6 +307,10 @@ func (s *ObjectStorage) DeltaObject(t plumbing.ObjectType, } func (s *ObjectStorage) getFromUnpacked(h plumbing.Hash) (obj plumbing.EncodedObject, err error) { + if cacheObj, found := s.objectCache.Get(h); found { + return cacheObj, nil + } + f, err := s.dir.Object(h) if err != nil { if os.IsNotExist(err) { @@ -327,6 +342,8 @@ func (s *ObjectStorage) getFromUnpacked(h plumbing.Hash) (obj plumbing.EncodedOb return nil, err } + s.objectCache.Put(obj) + _, err = io.Copy(w, r) return obj, err } @@ -369,7 +386,7 @@ func (s *ObjectStorage) decodeObjectAt( ) (plumbing.EncodedObject, error) { hash, err := idx.FindHash(offset) if err == nil { - obj, ok := s.deltaBaseCache.Get(hash) + obj, ok := s.objectCache.Get(hash) if ok { return obj, nil } @@ -380,8 +397,8 @@ func (s *ObjectStorage) decodeObjectAt( } var p *packfile.Packfile - if s.deltaBaseCache != nil { - p = packfile.NewPackfileWithCache(idx, s.dir.Fs(), f, s.deltaBaseCache) + if s.objectCache != nil { + p = packfile.NewPackfileWithCache(idx, s.dir.Fs(), f, s.objectCache) } else { p = packfile.NewPackfile(idx, s.dir.Fs(), f) } @@ -400,11 +417,7 @@ func (s *ObjectStorage) decodeDeltaObjectAt( } p := packfile.NewScanner(f) - if _, err := p.SeekFromStart(offset); err != nil { - return nil, err - } - - header, err := p.NextObjectHeader() + header, err := p.SeekObjectHeader(offset) if err != nil { return nil, err } @@ -495,7 +508,7 @@ func (s *ObjectStorage) buildPackfileIters( } return newPackfileIter( s.dir.Fs(), pack, t, seen, s.index[h], - s.deltaBaseCache, s.options.KeepDescriptors, + s.objectCache, s.options.KeepDescriptors, ) }, }, nil diff --git a/storage/filesystem/storage.go b/storage/filesystem/storage.go index 14a772a..370f7bd 100644 --- a/storage/filesystem/storage.go +++ b/storage/filesystem/storage.go @@ -51,11 +51,7 @@ func NewStorageWithOptions(fs billy.Filesystem, cache cache.Object, ops Options) fs: fs, dir: dir, - ObjectStorage: ObjectStorage{ - options: ops, - deltaBaseCache: cache, - dir: dir, - }, + ObjectStorage: *NewObjectStorageWithOptions(dir, cache, ops), ReferenceStorage: ReferenceStorage{dir: dir}, IndexStorage: IndexStorage{dir: dir}, ShallowStorage: ShallowStorage{dir: dir}, -- cgit