diff options
Diffstat (limited to 'formats')
-rw-r--r-- | formats/packfile/delta.go | 23 | ||||
-rw-r--r-- | formats/packfile/objects.go | 1 | ||||
-rw-r--r-- | formats/packfile/reader.go | 215 | ||||
-rw-r--r-- | formats/packfile/reader_test.go | 110 |
4 files changed, 207 insertions, 142 deletions
diff --git a/formats/packfile/delta.go b/formats/packfile/delta.go index 86b556f..30703eb 100644 --- a/formats/packfile/delta.go +++ b/formats/packfile/delta.go @@ -1,5 +1,7 @@ package packfile +import "io" + const delta_size_min = 4 func deltaHeaderSize(b []byte) (uint, []byte) { @@ -91,3 +93,24 @@ func PatchDelta(src, delta []byte) []byte { } return dest } + +func decodeOffset(src io.ByteReader, steps int) (int, error) { + b, err := src.ReadByte() + if err != nil { + return 0, err + } + var offset = int(b & 0x7f) + for (b & 0x80) != 0 { + offset++ // WHY? + b, err = src.ReadByte() + if err != nil { + return 0, err + } + + offset = (offset << 7) + int(b&0x7f) + } + + // offset needs to be aware of the bytes we read for `o.typ` and `o.size` + offset += steps + return -offset, nil +} diff --git a/formats/packfile/objects.go b/formats/packfile/objects.go index bd76896..9286090 100644 --- a/formats/packfile/objects.go +++ b/formats/packfile/objects.go @@ -36,6 +36,7 @@ func (t ObjectType) String() string { type RAWObject struct { Hash Hash Type ObjectType + Size uint64 Bytes []byte } diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go index f79f2ab..c355e12 100644 --- a/formats/packfile/reader.go +++ b/formats/packfile/reader.go @@ -9,10 +9,16 @@ import ( "github.com/klauspost/compress/zlib" ) +type Format int + const ( DefaultMaxObjectsLimit = 1 << 20 DefaultMaxObjectSize = 1 << 32 // 4GB + VersionSupported = 2 + UnknownFormat Format = 0 + OFSDeltaFormat Format = 1 + REFDeltaFormat Format = 2 ) type PackfileReader struct { @@ -21,43 +27,34 @@ type PackfileReader struct { // is defined by DefaultMaxObjectsLimit, usually the default limit is more // than enough to work with any repository, working extremly big repositories // where the number of object is bigger the memory can be exhausted. - MaxObjectsLimit int + MaxObjectsLimit uint32 // MaxObjectSize is the maximum size in bytes, reading objects with a bigger // size cause a error. The default value is defined by DefaultMaxObjectSize - MaxObjectSize int - - r *trackingReader - objects map[Hash]*RAWObject - offsets map[int]Hash - deltas []packfileDelta - contentCallback ContentCallback -} + MaxObjectSize uint64 -type packfileObject struct { - bytes []byte - typ ObjectType -} + // Format specifies if we are using ref-delta's or ofs-delta's, choosing the + // correct format the memory usage is optimized + // https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/Documentation/technical/protocol-capabilities.txt#L154 + Format Format -type packfileDelta struct { - hash Hash - delta []byte + r *trackingReader + objects map[Hash]*RAWObject + offsets map[int]*RAWObject } func NewPackfileReader(r io.Reader, fn ContentCallback) (*PackfileReader, error) { return &PackfileReader{ MaxObjectsLimit: DefaultMaxObjectsLimit, MaxObjectSize: DefaultMaxObjectSize, - r: &trackingReader{r: r}, - objects: make(map[Hash]*RAWObject, 0), - offsets: make(map[int]Hash, 0), - contentCallback: fn, + + r: &trackingReader{r: r}, + objects: make(map[Hash]*RAWObject, 0), + offsets: make(map[int]*RAWObject, 0), }, nil } func (pr *PackfileReader) Read() (chan *RAWObject, error) { - packfile := NewPackfile() - if err := pr.validateHeader(); err != nil { if err == io.EOF { // This is an empty repo. It's OK. @@ -67,29 +64,28 @@ func (pr *PackfileReader) Read() (chan *RAWObject, error) { return nil, err } - ver, err := pr.readInt32() + version, err := pr.readInt32() if err != nil { return nil, err } + if version > VersionSupported { + return nil, NewError("unsupported packfile version %d", version) + } + count, err := pr.readInt32() if err != nil { return nil, err } - packfile.Version = uint32(ver) - packfile.ObjectCount = int(count) - - if packfile.ObjectCount > pr.MaxObjectsLimit { - return nil, NewError("too many objects %d, limit is %d", - packfile.ObjectCount, pr.MaxObjectsLimit) + if count > pr.MaxObjectsLimit { + return nil, NewError("too many objects %d, limit is %d", count, pr.MaxObjectsLimit) } ch := make(chan *RAWObject, 1) - go pr.readObjects(ch, count) - packfile.Size = int64(pr.r.Pos()) + // packfile.Size = int64(pr.r.Pos()) return ch, nil } @@ -127,14 +123,20 @@ func (pr *PackfileReader) readObjects(ch chan *RAWObject, count uint32) error { for i := 0; i < int(count); i++ { var pos = pr.Pos() - obj, err := pr.readObject() + obj, err := pr.newRAWObject() if err != nil && err != io.EOF { fmt.Println(err) return err } - pr.offsets[pos] = obj.Hash - pr.objects[obj.Hash] = obj + if pr.Format == UnknownFormat || pr.Format == OFSDeltaFormat { + pr.offsets[pos] = obj + } + + if pr.Format == UnknownFormat || pr.Format == REFDeltaFormat { + pr.objects[obj.Hash] = obj + } + ch <- obj if err == io.EOF { @@ -145,86 +147,61 @@ func (pr *PackfileReader) readObjects(ch chan *RAWObject, count uint32) error { return nil } -func (pr *PackfileReader) readObject() (*RAWObject, error) { - - o, err := newObjectReader(pr, pr.MaxObjectSize) - if err != nil { - return nil, err - } - - raw := &RAWObject{Type: o.typ} - - switch o.typ { - case REFDeltaObject: - err = o.readREFDelta(raw) - case OFSDeltaObject: - err = o.readOFSDelta(raw) - case CommitObject, TreeObject, BlobObject, TagObject: - err = o.readObject(raw) - default: - err = NewError("Invalid git object tag %q", o.typ) - } - - if err != nil { - return nil, err - } - - return raw, err -} - func (pr *PackfileReader) Pos() int { return pr.r.Pos() } -type objectReader struct { - pr *PackfileReader - pf *Packfile - maxSize uint64 - - hash Hash - steps int - typ ObjectType - size uint64 -} - -func newObjectReader(pr *PackfileReader, maxSize int) (*objectReader, error) { - o := &objectReader{pr: pr, maxSize: uint64(maxSize)} +func (pr *PackfileReader) newRAWObject() (*RAWObject, error) { + raw := &RAWObject{} + steps := 0 var buf [1]byte - if _, err := o.Read(buf[:]); err != nil { + if _, err := pr.r.Read(buf[:]); err != nil { return nil, err } - o.typ = ObjectType((buf[0] >> 4) & 7) - o.size = uint64(buf[0] & 15) - o.steps++ // byte we just read to get `o.typ` and `o.size` + raw.Type = ObjectType((buf[0] >> 4) & 7) + raw.Size = uint64(buf[0] & 15) + steps++ // byte we just read to get `o.typ` and `o.size` var shift uint = 4 for buf[0]&0x80 == 0x80 { - if _, err := o.Read(buf[:]); err != nil { + if _, err := pr.r.Read(buf[:]); err != nil { return nil, err } - o.size += uint64(buf[0]&0x7f) << shift - o.steps++ // byte we just read to update `o.size` + raw.Size += uint64(buf[0]&0x7f) << shift + steps++ // byte we just read to update `o.size` shift += 7 } - return o, nil + var err error + switch raw.Type { + case REFDeltaObject: + err = pr.readREFDelta(raw) + case OFSDeltaObject: + err = pr.readOFSDelta(raw, steps) + case CommitObject, TreeObject, BlobObject, TagObject: + err = pr.readObject(raw) + default: + err = NewError("Invalid git object tag %q", raw.Type) + } + + return raw, err } -func (o *objectReader) readREFDelta(raw *RAWObject) error { +func (pr *PackfileReader) readREFDelta(raw *RAWObject) error { var ref Hash - if _, err := o.Read(ref[:]); err != nil { + if _, err := pr.r.Read(ref[:]); err != nil { return err } - buf, err := o.inflate() + buf, err := pr.inflate(raw.Size) if err != nil { return err } - referenced, ok := o.pr.objects[ref] + referenced, ok := pr.objects[ref] if !ok { - o.pr.deltas = append(o.pr.deltas, packfileDelta{hash: ref, delta: buf[:]}) + fmt.Println("not found", ref) } else { patched := PatchDelta(referenced.Bytes, buf[:]) if patched == nil { @@ -233,67 +210,47 @@ func (o *objectReader) readREFDelta(raw *RAWObject) error { raw.Type = referenced.Type raw.Bytes = patched + raw.Size = uint64(len(patched)) raw.Hash = ComputeHash(raw.Type, raw.Bytes) } return nil } -func decodeOffset(src io.ByteReader, steps int) (int, error) { - b, err := src.ReadByte() - if err != nil { - return 0, err - } - var offset = int(b & 0x7f) - for (b & 0x80) != 0 { - offset++ // WHY? - b, err = src.ReadByte() - if err != nil { - return 0, err - } - - offset = (offset << 7) + int(b&0x7f) - } - - // offset needs to be aware of the bytes we read for `o.typ` and `o.size` - offset += steps - return -offset, nil -} - -func (o *objectReader) readOFSDelta(raw *RAWObject) error { - var pos = o.pr.Pos() +func (pr *PackfileReader) readOFSDelta(raw *RAWObject, steps int) error { + var pos = pr.Pos() // read negative offset - offset, err := decodeOffset(o.pr.r, o.steps) + offset, err := decodeOffset(pr.r, steps) if err != nil { return err } - buf, err := o.inflate() + buf, err := pr.inflate(raw.Size) if err != nil { return err } - ref := o.pr.offsets[pos+offset] - referenced, ok := o.pr.objects[ref] + ref, ok := pr.offsets[pos+offset] if !ok { return NewError("can't find a pack entry at %d", pos+offset) } - patched := PatchDelta(referenced.Bytes, buf) + patched := PatchDelta(ref.Bytes, buf) if patched == nil { return NewError("error while patching %q", ref) } - raw.Type = referenced.Type + raw.Type = ref.Type raw.Bytes = patched + raw.Size = uint64(len(patched)) raw.Hash = ComputeHash(raw.Type, raw.Bytes) return nil } -func (o *objectReader) readObject(raw *RAWObject) error { - buf, err := o.inflate() +func (pr *PackfileReader) readObject(raw *RAWObject) error { + buf, err := pr.inflate(raw.Size) if err != nil { return err } @@ -304,8 +261,8 @@ func (o *objectReader) readObject(raw *RAWObject) error { return nil } -func (o *objectReader) inflate() ([]byte, error) { - zr, err := zlib.NewReader(o.pr.r) +func (pr *PackfileReader) inflate(size uint64) ([]byte, error) { + zr, err := zlib.NewReader(pr.r) if err != nil { if err == zlib.ErrHeader { return nil, zlib.ErrHeader @@ -316,30 +273,22 @@ func (o *objectReader) inflate() ([]byte, error) { defer zr.Close() - if o.size > o.maxSize { + if size > pr.MaxObjectSize { return nil, NewError("the object size %q exceeed the allowed limit: %q", - o.size, o.maxSize) + size, pr.MaxObjectSize) } var buf bytes.Buffer io.Copy(&buf, zr) // also: io.CopyN(&buf, zr, int64(o.size)) - var bufLen = buf.Len() - if bufLen != int(o.size) { - return nil, NewError("inflated size mismatch, expected %d, got %d", o.size, bufLen) + if buf.Len() != int(size) { + return nil, NewError( + "inflated size mismatch, expected %d, got %d", size, buf.Len()) } return buf.Bytes(), nil } -func (o *objectReader) Read(p []byte) (int, error) { - return o.pr.r.Read(p) -} - -func (o *objectReader) ReadByte() (byte, error) { - return o.pr.r.ReadByte() -} - type ReaderError struct { Msg string // description of error } diff --git a/formats/packfile/reader_test.go b/formats/packfile/reader_test.go index e49a976..917eee1 100644 --- a/formats/packfile/reader_test.go +++ b/formats/packfile/reader_test.go @@ -3,7 +3,12 @@ package packfile import ( "bytes" "encoding/base64" + "fmt" "os" + "runtime" + "time" + + "github.com/dustin/go-humanize" . "gopkg.in/check.v1" ) @@ -40,20 +45,21 @@ func (s *ReaderSuite) TestReadPackfile(c *C) { } func (s *ReaderSuite) TestReadPackfileOFSDelta(c *C) { - s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ofs-delta") + s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) } func (s *ReaderSuite) TestReadPackfileREFDelta(c *C) { - s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ref-delta") + s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) } -func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string) { +func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, f Format) { d, err := os.Open(file) c.Assert(err, IsNil) r, err := NewPackfileReader(d, nil) c.Assert(err, IsNil) + r.Format = f ch, err := r.Read() c.Assert(err, IsNil) @@ -89,13 +95,99 @@ func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string) { }) } -func AssertObjects(c *C, ch chan *RAWObject, expected []string) { - i := 0 - for obtained := range ch { - c.Assert(obtained.Hash.String(), Equals, expected[i]) +func AssertObjects(c *C, ch chan *RAWObject, expects []string) { + for _, expected := range expects { + obtained := <-ch + c.Assert(obtained.Hash.String(), Equals, expected) + computed := ComputeHash(obtained.Type, obtained.Bytes) - c.Assert(computed.String(), Equals, expected[i]) + c.Assert(computed.String(), Equals, expected) + c.Assert(obtained.Bytes, HasLen, int(obtained.Size)) + } +} + +func (s *ReaderSuite) BenchmarkFixtureRef(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) + } +} + +func (s *ReaderSuite) BenchmarkFixtureOfs(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) + } +} - i++ +func (s *ReaderSuite) BenchmarkCandyJS(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "/tmp/go-candyjs", REFDeltaFormat) } } + +func (s *ReaderSuite) BenchmarkSymfony(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "/tmp/symonfy", REFDeltaFormat) + } +} + +func (s *ReaderSuite) BenchmarkGit(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "/tmp/git", REFDeltaFormat) + } +} + +func (s *ReaderSuite) _TestMemoryOFS(c *C) { + var b, a runtime.MemStats + + start := time.Now() + runtime.ReadMemStats(&b) + p := readFromFile(c, "/tmp/symfony.ofs-delta", OFSDeltaFormat) + runtime.ReadMemStats(&a) + + fmt.Println("OFS--->") + fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc)) + fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc)) + fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, humanize.Bytes(a.HeapAlloc-b.HeapAlloc)) + fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys)) + + fmt.Println("objects", len(p)) + fmt.Println("time", time.Since(start)) +} + +func (s *ReaderSuite) _TestMemoryREF(c *C) { + var b, a runtime.MemStats + + start := time.Now() + runtime.ReadMemStats(&b) + p := readFromFile(c, "/tmp/symonfy", REFDeltaFormat) + runtime.ReadMemStats(&a) + + fmt.Println("REF--->") + fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc)) + fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc)) + fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, humanize.Bytes(a.HeapAlloc-b.HeapAlloc)) + fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys)) + + fmt.Println("objects", len(p)) + fmt.Println("time", time.Since(start)) +} + +func readFromFile(c *C, file string, f Format) []*RAWObject { + d, err := os.Open(file) + c.Assert(err, IsNil) + + r, err := NewPackfileReader(d, nil) + c.Assert(err, IsNil) + + r.Format = f + ch, err := r.Read() + c.Assert(err, IsNil) + c.Assert(ch, NotNil) + + var objs []*RAWObject + for o := range ch { + objs = append(objs, o) + } + + return objs +} |