From 5e73f01cb2e027a8f02801635b79d3a9bc866914 Mon Sep 17 00:00:00 2001
From: Alberto Cortés
Subject: Adds support to open local repositories and to use file-based object storage (#55)

* remove some comments
* idx writer/reader
* Shut up ssh tests, they are annoying
* Add file scheme test to clients
* Add dummy file client
* Add test for file client
* Make tests use fixture endpoint
* add parser for packed-refs format
* add parser for packed-refs format
* WIP adding dir.Refs() tests
* Add test for fixture refs
* refs parser for the refs directory
* Documentation
* Add Capabilities to file client
* tgz.Extract now accepts a path instead of a Reader
* fix bug in idxfile fanout calculation
* remove dead code
* packfile documentation
* clean packfile parser code
* add core.Object.Content() and returns errors for core.ObjectStorage.Iter()
* add seekable storage
* add dir repos to NewRepository
* clean prints
* Add dir client documentation to README
* Organize the README
* README
* Clean tgz package
* Clean temp dirs after tgz tests
* Gometalinter on gitdir
* Clean pattern function
* metalinter tgz
* metalinter gitdir
* gitdir coverage and remove seekable packfile filedescriptor leak
* gitdir Idxfile tests and remove file descriptor leak
* gitdir Idxfile tests when no idx is found
* clean storage/seekable/internal/index and some formats/idxfile API issues
* clean storage/seekable
* clean formats/idx
* turn packfile/doc.go into packfile/doc.txt
* move formats/packfile/reader to decoder
* fix packfile decoder error names
* improve documentation
* comment packfile decoder errors
* comment public API (format/packfile)
* remove duplicated code in packfile decoder test
* move tracking_reader into an internal package and clean it
* use iota for packfile format
* rename packfile parse.go to packfile object_at.go
* clean packfile deltas
* fix delta header size bug
* improve delta documentation
* clean packfile deltas
* clean packfiles deltas
* clean repository.go
* Remove go 1.5 from Travis CI
  Because go 1.5 does not support internal packages.
* change local repo scheme to local://
* change "local://" to "file://" as the local scheme
* fix broken indentation
* shorten names of variables in short scopes
* more shortening of variable names
* more shortening of variable names
* Rename git dir client to "file", as the scheme used for it
* Fix file format ctor name, now that the package name has changed
* Shortcut local repo constructor to not use remotes
  The object storage is built directly in the repository ctor, instead of
  creating a remote and waiting for the user to pull it.
* update README and fix some errors in it
* remove file scheme client
* Local repositories now have a new ctor
  That is, they are no longer identified by the scheme of the URL, but
  are created differently from inception.
* remove unused URL field from Repository
* move all git dir logic to seekable storage ctor
* fix documentation
* Make formats/file/dir an internal package to storage/seekable
* change package storage/seekable to storage/fs
* clean storage/fs
* overall storage/fs clean
* more cleaning
* some metalinter fixes
* upgrade cshared to last changes
* remove dead code
* fix test error info
* remove file scheme check from clients
* fix test error message
* fix test error message
* fix error messages
* style changes
* fix comments everywhere
* style changes
* style changes
* scaffolding and tests for local packfiles without idx files
* outsource index building from packfile to the packfile decoder
* refactor packfile header reading into a new function
* move code to generate index from packfile back to index package
* add header parsing
* fix documentation errata
* add undeltified and OFS delta support for index building from the packfile
* add tests for packfile with ref-deltas
* support for packfiles with ref-deltas and no idx
* refactor packfile format parser to reuse code
* refactor packfile format parser to reuse code
* refactor packfile format parser to reuse code
* refactor packfile format parser to reuse code
* refactor packfile format parser to reuse code
* WIP refactor packfile format parser to reuse code
* refactor packfile format parser to reuse code
* remove prints from tests
* remove prints from tests
* refactor packfile.core into packfile.parser
* rename packfile reader to something that shows it is a recaller
* rename cannot recall error
* rename packfile.Reader to packfile.ReadRecaller and document
* speed up test by using StreamReader instead of SeekableReader when possible
* clean packfile StreamReader
* stream_reader tests
* refactor packfile.StreamReader into packfile.StreamReadRecaller
* refactor packfile.SeekableReader into packfile.SeekableReadRecaller and document it
* generalize packfile.StreamReadRecaller test to all packfile.ReadRecaller implementations
* speed up storage/fs tests
* speed up tests in . by loading packfiles in memory
* speed up repository tests by using a smaller fixture
* restore doc.go files
* rename packfile.ReadRecaller implementations to shorter names
* update comments to type changes
* packfile.Parser test (WIP)
* packfile.Parser tests and add ForgetAll() to packfile.ReadRecaller
* add test for packfile.ReadRecaller.ForgetAll()
* clarify seekable being able to recallByOffset forgotten objects
* use better names for internal maps
* metalinter packfile package
* speed up some tests
* documentation fixes
* change storage.fs package name to storage.proxy to avoid confusion with new filesystem support
* New fs package and os transparent implementation
  Now NewRepositoryFromFS receives a fs and a path and tests are modified
  accordingly, but it is still not used for anything.
* add fs to gitdir and proxy.store
* reduce fs interface for easier implementation
* remove garbage dirs from tgz tests
* change file name gitdir/dir.go to gitdir/gitdir.go
* fs.OS tests
* metalinter utils/fs
* add NewRepositoryFromFS documentation to README
* Readability fixes to README
* move tgz to an external dependency
* move filesystem impl.
example to example dir
* rename proxy/store.go to proxy/storage.go for coherence with memory/storage.go
* rename proxy package to seekable
---
 formats/packfile/common.go                  |  63 -----
 formats/packfile/decoder.go                 | 116 ++++++++
 formats/packfile/decoder_test.go            | 178 ++++++++++++
 formats/packfile/delta.go                   | 195 +++++++------
 formats/packfile/doc.go                     | 331 +++++++++++-----------
 formats/packfile/error.go                   |  30 ++
 formats/packfile/parser.go                  | 353 ++++++++++++++++++++++++
 formats/packfile/parser_test.go             | 412 ++++++++++++++++++++++++++++
 formats/packfile/read_recaller.go           |  39 +++
 formats/packfile/read_recaller_impl_test.go | 296 ++++++++++++++++++++
 formats/packfile/reader.go                  | 338 -----------------------
 formats/packfile/reader_test.go             | 190 -------------
 formats/packfile/seekable.go                | 108 ++++++++
 formats/packfile/stream.go                  |  95 +++++++
 14 files changed, 1907 insertions(+), 837 deletions(-)
 delete mode 100644 formats/packfile/common.go
 create mode 100644 formats/packfile/decoder.go
 create mode 100644 formats/packfile/decoder_test.go
 create mode 100644 formats/packfile/error.go
 create mode 100644 formats/packfile/parser.go
 create mode 100644 formats/packfile/parser_test.go
 create mode 100644 formats/packfile/read_recaller.go
 create mode 100644 formats/packfile/read_recaller_impl_test.go
 delete mode 100644 formats/packfile/reader.go
 delete mode 100644 formats/packfile/reader_test.go
 create mode 100644 formats/packfile/seekable.go
 create mode 100644 formats/packfile/stream.go

diff --git a/formats/packfile/common.go b/formats/packfile/common.go
deleted file mode 100644
index b5f8de2..0000000
--- a/formats/packfile/common.go
+++ /dev/null
@@ -1,63 +0,0 @@
-package packfile
-
-import (
-	"bufio"
-	"fmt"
-	"io"
-)
-
-type trackingReader struct {
-	r        io.Reader
-	position int64
-}
-
-func NewTrackingReader(r io.Reader) *trackingReader {
-	return &trackingReader{
-		r: bufio.NewReader(r),
-	}
-}
-
-func (t *trackingReader) Read(p []byte) (n int, err error) {
-	n, err = t.r.Read(p)
-	if err != nil {
-		return 0, err
-	}
-
-	t.position += int64(n)
-	return n, err
-}
-
-func (t *trackingReader) ReadByte() (c byte, err error) {
-	var p [1]byte
-	n, err := t.r.Read(p[:])
-	if err != nil {
-		return 0, err
-	}
-
-	if n > 1 {
-		return 0, fmt.Errorf("read %d bytes, should have read just 1", n)
-	}
-
-	t.position++
-	return p[0], nil
-}
-
-// checkClose is used with defer to close the given io.Closer and check its
-// returned error value. If Close returns an error and the given *error
-// is not nil, *error is set to the error returned by Close.
-//
-// checkClose is typically used with named return values like so:
-//
-// func do(obj *Object) (err error) {
-//   w, err := obj.Writer()
-//   if err != nil {
-//     return nil
-//   }
-//   defer checkClose(w, &err)
-//   // work with w
-// }
-func checkClose(c io.Closer, err *error) {
-	if cerr := c.Close(); cerr != nil && *err == nil {
-		*err = cerr
-	}
-}
diff --git a/formats/packfile/decoder.go b/formats/packfile/decoder.go
new file mode 100644
index 0000000..e8c5c6a
--- /dev/null
+++ b/formats/packfile/decoder.go
@@ -0,0 +1,116 @@
+package packfile
+
+import (
+	"io"
+
+	"gopkg.in/src-d/go-git.v3/core"
+)
+
+// Format specifies if the packfile uses ref-deltas or ofs-deltas.
+type Format int
+
+// Possible values of the Format type.
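+//
+// An illustrative sketch (not part of the original patch) of how a
+// caller might branch on the detected format:
+//
+//	switch format {
+//	case OFSDeltaFormat:
+//		// deltas reference their base object by negative offset
+//	case REFDeltaFormat:
+//		// deltas reference their base object by SHA1 hash
+//	}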
+const (
+	UnknownFormat Format = iota
+	OFSDeltaFormat
+	REFDeltaFormat
+)
+
+var (
+	// ErrMaxObjectsLimitReached is returned by Decode when the number
+	// of objects in the packfile is higher than
+	// Decoder.MaxObjectsLimit.
+	ErrMaxObjectsLimitReached = NewError("max. objects limit reached")
+
+	// ErrInvalidObject is returned by Decode when an invalid object is
+	// found in the packfile.
+	ErrInvalidObject = NewError("invalid git object")
+
+	// ErrPackEntryNotFound is returned by Decode when a reference in
+	// the packfile references an unknown object.
+	ErrPackEntryNotFound = NewError("can't find a pack entry")
+
+	// ErrZLib is returned by Decode when there was an error unzipping
+	// the packfile contents.
+	ErrZLib = NewError("zlib reading error")
+)
+
+const (
+	// DefaultMaxObjectsLimit is the maximum amount of objects the
+	// decoder will decode before returning ErrMaxObjectsLimitReached.
+	DefaultMaxObjectsLimit = 1 << 20
+)
+
+// Decoder reads and decodes packfiles from an input stream.
+type Decoder struct {
+	// MaxObjectsLimit is the limit on the number of objects to load from
+	// the packfile; if a packfile exceeds this number, Decode returns
+	// ErrMaxObjectsLimitReached. The default value, DefaultMaxObjectsLimit,
+	// is usually more than enough to work with any repository; with higher
+	// values and huge repositories you can run out of memory.
+	MaxObjectsLimit uint32
+
+	p *Parser
+	s core.ObjectStorage
+}
+
+// NewDecoder returns a new Decoder that reads from r.
+func NewDecoder(r ReadRecaller) *Decoder {
+	return &Decoder{
+		MaxObjectsLimit: DefaultMaxObjectsLimit,
+
+		p: NewParser(r),
+	}
+}
+
+// Decode reads a packfile and stores its objects in s.
+func (d *Decoder) Decode(s core.ObjectStorage) error {
+	d.s = s
+
+	count, err := d.p.ReadHeader()
+	if err != nil {
+		return err
+	}
+
+	if count > d.MaxObjectsLimit {
+		return ErrMaxObjectsLimitReached.AddDetails("%d", count)
+	}
+
+	err = d.readObjects(count)
+
+	return err
+}
+
+func (d *Decoder) readObjects(count uint32) error {
+	// This code has 50-80 µs of overhead per object not counting zlib inflation.
+	// Together with zlib inflation, it's 400-410 µs for small objects.
+	// That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB,
+	// of which 12-20 % is _not_ zlib inflation (ie. is our code).
+	for i := 0; i < int(count); i++ {
+		start, err := d.p.Offset()
+		if err != nil {
+			return err
+		}
+
+		obj, err := d.p.ReadObject()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+
+			return err
+		}
+
+		err = d.p.Remember(start, obj)
+		if err != nil {
+			return err
+		}
+
+		_, err = d.s.Set(obj)
+		if err == io.EOF {
+			break
+		}
+	}
+
+	return nil
+}
diff --git a/formats/packfile/decoder_test.go b/formats/packfile/decoder_test.go
new file mode 100644
index 0000000..0c471a2
--- /dev/null
+++ b/formats/packfile/decoder_test.go
@@ -0,0 +1,178 @@
+package packfile
+
+import (
+	"bytes"
+	"encoding/base64"
+	"fmt"
+	"os"
+	"runtime"
+	"testing"
+	"time"
+
+	"gopkg.in/src-d/go-git.v3/core"
+	"gopkg.in/src-d/go-git.v3/storage/memory"
+
+	"github.com/dustin/go-humanize"
+	.
"gopkg.in/check.v1" +) + +func Test(t *testing.T) { TestingT(t) } + +type ReaderSuite struct{} + +var _ = Suite(&ReaderSuite{}) + +var packFileWithEmptyObjects = "UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FSAlmXnEG2NYlhXAfHgdLb5Cy9WAM5Qpb/Lf7oZqArUpakyYtQjCoxZ5lmWXwwyuzJbHqAuYt2+x6QoyCyhYCKIa67lGameSLWvPh5JU0hsCg7vY1z6/D1d/8ptcHhprm3Kxz7KL/wUdOz96eqZXtPrX4CCeOOPU8Eb0iI7qG1jGGvXdxaNoPs/gHeNkp8lA94nKXMQUpDMRCA4X1OMXtBZpI3L3kiRXAtPcMkmWjgxZSYQultPEsv1oJHcPl/i38OVRC0IXF0lshrJorZEcpKmTEJYbA+B3aFzEmGfk9gpqJEsmnZNutXF71i1IURU/G0bsWWwJ6NnOdXH/Bx+73U1uH9LHn0HziOWa/w2tJfv302qftz6u0AtFh0wQdmeEJCNA9tdU7938WUuivEF5CczR11ZEsNnw54nKWMUQoCIRRF/13F+w/ijY6jQkTQd7SGpz5LyAxzINpNa2ljTbSEPu/hnNsbM4TJTzqyt561GdUUmJKT6K2MeiCVgnZWoY/iRo2vHVS0URrUS+e+dkqIEp11HMhh9IaUkRM6QXM/1waH9+uRS4X9TLHVOxxbz0/YlPDbu1OhfFmHWrYwjBKVNVaNsMIBUSy05N75vxeR8oXBiw8GoErCnwt4nKXMzQkCMRBA4XuqmLsgM2M2ZkAWwbNYQ341sCEQsyB2Yy02pmAJHt93eKOnBFpMNJqtl5CFxVIMomViomQSEWP2JrN3yq3j1jqc369HqQ1Oq4u93eHSR3nCoYZfH6/VlWUbWp2BNOPO7i1OsEFCVF+tZYz030XlsiRw6gPZ0jxaqwV4nDM0MDAzMVFIZHg299HsTRevOXt3a64rj7px6ElP8ERDiGQSQ2uoXe8RrcodS5on+J4/u8HjD4NDKFQyRS8tPx+rbgDt3yiEMHicAwAAAAABPnicS0wEAa4kMOACACTjBKdkZXici7aaYAUAA3gBYKoDeJwzNDAwMzFRSGR4NvfR7E0Xrzl7d2uuK4+6cehJT/BEQ4hkEsOELYFJvS2eX47UJdVttFQrenrmzQwA13MaiDd4nEtMBAEuAApMAlGtAXicMzQwMDMxUUhkeDb30exNF685e3drriuPunHoSU/wRACvkA258N/i8hVXx9CiAZzvFXNIhCuSFmE=" + +func (s *ReaderSuite) TestReadPackfile(c *C) { + data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) + f := bytes.NewReader(data) + r := NewStream(f) + d := NewDecoder(r) + + sto := memory.NewObjectStorage() + err := d.Decode(sto) + c.Assert(err, IsNil) + + AssertObjects(c, sto, []string{ + "778c85ff95b5514fea0ba4c7b6a029d32e2c3b96", + "db4002e880a08bf6cc7217512ad937f1ac8824a2", + "551fe11a9ef992763b7e0be4500cf7169f2f8575", + "3d8d2705c6b936ceff0020989eca90db7a372609", + "af01d4cac3441bba4bdd4574938e1d231ee5d45e", + "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391", + "85553e8dc42a79b8a483904dcfcdb048fc004055", + "a028c5b32117ed11bd310a61d50ca10827d853f1", + "c6b65deb8be57436ceaf920b82d51a3fc59830bd", + "90b451628d8449f4c47e627eb1392672e5ccec98", + "496d6428b9cf92981dc9495211e6e1120fb6f2ba", + }) +} + +func (s *ReaderSuite) TestReadPackfileOFSDelta(c *C) { + s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) + +} +func (s *ReaderSuite) TestReadPackfileREFDelta(c *C) { + s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) +} + +func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, format Format) { + f, err := os.Open(file) + c.Assert(err, IsNil) + r := NewSeekable(f) + d := NewDecoder(r) + + sto := memory.NewObjectStorage() + err = d.Decode(sto) + c.Assert(err, IsNil) + + AssertObjects(c, sto, []string{ + "918c48b83bd081e863dbe1b80f8998f058cd8294", + "af2d6a6954d532f8ffb47615169c8fdf9d383a1a", + "1669dce138d9b841a518c64b10914d88f5e488ea", + "a5b8b09e2f8fcb0bb99d3ccb0958157b40890d69", + "b8e471f58bcbca63b07bda20e428190409c2db47", + "35e85108805c84807bc66a02d91535e1e24b38b9", + "b029517f6300c2da0f4b651b8642506cd6aaf45d", + "32858aad3c383ed1ff0a0f9bdf231d54a00c9e88", + "d3ff53e0564a9f87d8e84b6e28e5060e517008aa", + "c192bd6a24ea1ab01d78686e417c8bdc7c3d197f", + "d5c0f4ab811897cadf03aec358ae60d21f91c50d", + "49c6bb89b17060d7b4deacb7b338fcc6ea2352a9", + "cf4aa3b38974fb7d81f367c0830f7d78d65ab86b", + "9dea2395f5403188298c1dabe8bdafe562c491e3", + "586af567d0bb5e771e49bdd9434f5e0fb76d25fa", + "9a48f23120e880dfbe41f7c9b7b708e9ee62a492", + "5a877e6a906a2743ad6e45d99c1793642aaf8eda", + 
"c8f1d8c61f9da76f4cb49fd86322b6e685dba956", + "a8d315b2b1c615d43042c3a62402b8a54288cf5c", + "a39771a7651f97faf5c72e08224d857fc35133db", + "880cd14280f4b9b6ed3986d6671f907d7cc2a198", + "fb72698cab7617ac416264415f13224dfd7a165e", + "4d081c50e250fa32ea8b1313cf8bb7c2ad7627fd", + "eba74343e2f15d62adedfd8c883ee0262b5c8021", + "c2d30fa8ef288618f65f6eed6e168e0d514886f4", + "8dcef98b1d52143e1e2dbc458ffe38f925786bf2", + "aa9b383c260e1d05fbbf6b30a02914555e20c725", + "6ecf0ef2c2dffb796033e5a02219af86ec6584e5", + }) +} + +func AssertObjects(c *C, s *memory.ObjectStorage, expects []string) { + c.Assert(len(expects), Equals, len(s.Objects)) + for _, exp := range expects { + obt, err := s.Get(core.NewHash(exp)) + c.Assert(err, IsNil) + c.Assert(obt.Hash().String(), Equals, exp) + } +} + +func (s *ReaderSuite) BenchmarkFixtureRef(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) + } +} + +func (s *ReaderSuite) BenchmarkFixtureOfs(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) + } +} + +func (s *ReaderSuite) BenchmarkCandyJS(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "/tmp/go-candyjs", REFDeltaFormat) + } +} + +func (s *ReaderSuite) BenchmarkSymfony(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "/tmp/symonfy", REFDeltaFormat) + } +} + +func (s *ReaderSuite) BenchmarkGit(c *C) { + for i := 0; i < c.N; i++ { + readFromFile(c, "/tmp/git", REFDeltaFormat) + } +} + +func (s *ReaderSuite) _testMemory(c *C, format Format) { + var b, a runtime.MemStats + + start := time.Now() + runtime.ReadMemStats(&b) + p := readFromFile(c, "/tmp/symfony.ofs-delta", format) + runtime.ReadMemStats(&a) + + fmt.Println("OFS--->") + fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc)) + fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc)) + fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, humanize.Bytes(a.HeapAlloc-b.HeapAlloc)) + fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys)) + + fmt.Println("objects", len(p.Objects)) + fmt.Println("time", time.Since(start)) +} + +func (s *ReaderSuite) _TestMemoryOFS(c *C) { + s._testMemory(c, OFSDeltaFormat) +} + +func (s *ReaderSuite) _TestMemoryREF(c *C) { + s._testMemory(c, REFDeltaFormat) +} + +func readFromFile(c *C, file string, format Format) *memory.ObjectStorage { + f, err := os.Open(file) + c.Assert(err, IsNil) + r := NewSeekable(f) + d := NewDecoder(r) + + sto := memory.NewObjectStorage() + err = d.Decode(sto) + c.Assert(err, IsNil) + + return sto +} diff --git a/formats/packfile/delta.go b/formats/packfile/delta.go index 571ccf8..e0bbb65 100644 --- a/formats/packfile/delta.go +++ b/formats/packfile/delta.go @@ -1,117 +1,148 @@ package packfile -import "io" +// See https://github.com/git/git/blob/49fa3dc76179e04b0833542fa52d0f287a4955ac/delta.h +// https://github.com/git/git/blob/c2c5f6b1e479f2c38e0e01345350620944e3527f/patch-delta.c, +// and https://github.com/tarruda/node-git-core/blob/master/src/js/delta.js +// for details about the delta format. const deltaSizeMin = 4 -func deltaHeaderSize(b []byte) (uint, []byte) { - var size, j uint - var cmd byte - for { - cmd = b[j] - size |= (uint(cmd) & 0x7f) << (j * 7) - j++ - if uint(cmd)&0xb80 == 0 || j == uint(len(b)) { - break - } - } - return size, b[j:] -} - -func patchDelta(src, delta []byte) []byte { +// PatchDelta returns the result of applying the modification deltas in delta to src. 
+func PatchDelta(src, delta []byte) []byte {
 	if len(delta) < deltaSizeMin {
 		return nil
 	}
-	size, delta := deltaHeaderSize(delta)
-	if size != uint(len(src)) {
+
+	srcSz, delta := decodeLEB128(delta)
+	if srcSz != uint(len(src)) {
 		return nil
 	}
-	size, delta = deltaHeaderSize(delta)
-	origSize := size
-	dest := make([]byte, 0)
+	targetSz, delta := decodeLEB128(delta)
+	remainingTargetSz := targetSz
-	// var offset uint
+	var dest []byte
 	var cmd byte
 	for {
 		cmd = delta[0]
 		delta = delta[1:]
-		if (cmd & 0x80) != 0 {
-			var cp_off, cp_size uint
-			if (cmd & 0x01) != 0 {
-				cp_off = uint(delta[0])
-				delta = delta[1:]
-			}
-			if (cmd & 0x02) != 0 {
-				cp_off |= uint(delta[0]) << 8
-				delta = delta[1:]
-			}
-			if (cmd & 0x04) != 0 {
-				cp_off |= uint(delta[0]) << 16
-				delta = delta[1:]
-			}
-			if (cmd & 0x08) != 0 {
-				cp_off |= uint(delta[0]) << 24
-				delta = delta[1:]
-			}
-
-			if (cmd & 0x10) != 0 {
-				cp_size = uint(delta[0])
-				delta = delta[1:]
-			}
-			if (cmd & 0x20) != 0 {
-				cp_size |= uint(delta[0]) << 8
-				delta = delta[1:]
-			}
-			if (cmd & 0x40) != 0 {
-				cp_size |= uint(delta[0]) << 16
-				delta = delta[1:]
-			}
-			if cp_size == 0 {
-				cp_size = 0x10000
-			}
-			if cp_off+cp_size < cp_off ||
-				cp_off+cp_size > uint(len(src)) ||
-				cp_size > origSize {
+		if isCopyFromSrc(cmd) {
+			var offset, sz uint
+			offset, delta = decodeOffset(cmd, delta)
+			sz, delta = decodeSize(cmd, delta)
+			if invalidSize(sz, targetSz) ||
+				invalidOffsetSize(offset, sz, srcSz) {
 				break
 			}
-			dest = append(dest, src[cp_off:cp_off+cp_size]...)
-			size -= cp_size
-		} else if cmd != 0 {
-			if uint(cmd) > origSize {
+			dest = append(dest, src[offset:offset+sz]...)
+			remainingTargetSz -= sz
+		} else if isCopyFromDelta(cmd) {
+			sz := uint(cmd) // cmd is the size itself
+			if invalidSize(sz, targetSz) {
 				break
 			}
-			dest = append(dest, delta[0:uint(cmd)]...)
-			size -= uint(cmd)
-			delta = delta[uint(cmd):]
+			dest = append(dest, delta[0:sz]...)
+			remainingTargetSz -= sz
+			delta = delta[sz:]
 		} else {
 			return nil
 		}
-		if size <= 0 {
+
+		if remainingTargetSz <= 0 {
 			break
 		}
 	}
+
 	return dest
 }
 
-func decodeOffset(src io.ByteReader, steps int64) (int64, error) {
-	b, err := src.ReadByte()
-	if err != nil {
-		return 0, err
-	}
+
+// Decodes a number encoded as an unsigned LEB128 at the start of some
+// binary data and returns the decoded number and the rest of the
+// stream.
+//
+// This must be called twice on the delta data buffer, first to get the
+// expected source buffer size, and again to get the target buffer size.
+func decodeLEB128(input []byte) (uint, []byte) {
+	var num, sz uint
+	var b byte
+	for {
+		b = input[sz]
+		num |= (uint(b) & payload) << (sz * 7) // concatenate 7-bit chunks
+		sz++
-	var offset = int64(b & 0x7f)
-	for (b & 0x80) != 0 {
-		offset++ // WHY?
- b, err = src.ReadByte() - if err != nil { - return 0, err + if uint(b)&continuation == 0 || sz == uint(len(input)) { + break } + } + + return num, input[sz:] +} + +const ( + payload = 0x7f // 0111 1111 + continuation = 0x80 // 1000 0000 +) - offset = (offset << 7) + int64(b&0x7f) +func isCopyFromSrc(cmd byte) bool { + return (cmd & 0x80) != 0 +} + +func isCopyFromDelta(cmd byte) bool { + return (cmd&0x80) == 0 && cmd != 0 +} + +func decodeOffset(cmd byte, delta []byte) (uint, []byte) { + var offset uint + if (cmd & 0x01) != 0 { + offset = uint(delta[0]) + delta = delta[1:] + } + if (cmd & 0x02) != 0 { + offset |= uint(delta[0]) << 8 + delta = delta[1:] + } + if (cmd & 0x04) != 0 { + offset |= uint(delta[0]) << 16 + delta = delta[1:] + } + if (cmd & 0x08) != 0 { + offset |= uint(delta[0]) << 24 + delta = delta[1:] + } + + return offset, delta +} + +func decodeSize(cmd byte, delta []byte) (uint, []byte) { + var sz uint + if (cmd & 0x10) != 0 { + sz = uint(delta[0]) + delta = delta[1:] + } + if (cmd & 0x20) != 0 { + sz |= uint(delta[0]) << 8 + delta = delta[1:] + } + if (cmd & 0x40) != 0 { + sz |= uint(delta[0]) << 16 + delta = delta[1:] + } + if sz == 0 { + sz = 0x10000 } - // offset needs to be aware of the bytes we read for `o.typ` and `o.size` - offset += steps - return -offset, nil + return sz, delta +} + +func invalidSize(sz, targetSz uint) bool { + return sz > targetSz +} + +func invalidOffsetSize(offset, sz, srcSz uint) bool { + return sumOverflows(offset, sz) || + offset+sz > srcSz +} + +func sumOverflows(a, b uint) bool { + return a+b < a } diff --git a/formats/packfile/doc.go b/formats/packfile/doc.go index cb3f542..c79c180 100644 --- a/formats/packfile/doc.go +++ b/formats/packfile/doc.go @@ -1,165 +1,168 @@ -package packfile +// Package packfile documentation: +/* + +GIT pack format +=============== + +== pack-*.pack files have the following format: + + - A header appears at the beginning and consists of the following: + + 4-byte signature: + The signature is: {'P', 'A', 'C', 'K'} + + 4-byte version number (network byte order): + GIT currently accepts version number 2 or 3 but + generates version 2 only. + + 4-byte number of objects contained in the pack (network byte order) + + Observation: we cannot have more than 4G versions ;-) and + more than 4G objects in a pack. + + - The header is followed by number of object entries, each of + which looks like this: + + (undeltified representation) + n-byte type and length (3-bit type, (n-1)*7+4-bit length) + compressed data + + (deltified representation) + n-byte type and length (3-bit type, (n-1)*7+4-bit length) + 20-byte base object name + compressed delta data + + Observation: length of each object is encoded in a variable + length format and is not constrained to 32-bit or anything. + + - The trailer records 20-byte SHA1 checksum of all of the above. + +== Original (version 1) pack-*.idx files have the following format: + + - The header consists of 256 4-byte network byte order + integers. N-th entry of this table records the number of + objects in the corresponding pack, the first byte of whose + object name is less than or equal to N. This is called the + 'first-level fan-out' table. + + - The header is followed by sorted 24-byte entries, one entry + per object in the pack. Each entry is: + + 4-byte network byte order integer, recording where the + object is stored in the packfile as the offset from the + beginning. + + 20-byte object name. 
+ + - The file is concluded with a trailer: + + A copy of the 20-byte SHA1 checksum at the end of + corresponding packfile. -// GIT pack format -// =============== -// -// == pack-*.pack files have the following format: -// -// - A header appears at the beginning and consists of the following: -// -// 4-byte signature: -// The signature is: {'P', 'A', 'C', 'K'} -// -// 4-byte version number (network byte order): -// GIT currently accepts version number 2 or 3 but -// generates version 2 only. -// -// 4-byte number of objects contained in the pack (network byte order) -// -// Observation: we cannot have more than 4G versions ;-) and -// more than 4G objects in a pack. -// -// - The header is followed by number of object entries, each of -// which looks like this: -// -// (undeltified representation) -// n-byte type and length (3-bit type, (n-1)*7+4-bit length) -// compressed data -// -// (deltified representation) -// n-byte type and length (3-bit type, (n-1)*7+4-bit length) -// 20-byte base object name -// compressed delta data -// -// Observation: length of each object is encoded in a variable -// length format and is not constrained to 32-bit or anything. -// -// - The trailer records 20-byte SHA1 checksum of all of the above. -// -// == Original (version 1) pack-*.idx files have the following format: -// -// - The header consists of 256 4-byte network byte order -// integers. N-th entry of this table records the number of -// objects in the corresponding pack, the first byte of whose -// object name is less than or equal to N. This is called the -// 'first-level fan-out' table. -// -// - The header is followed by sorted 24-byte entries, one entry -// per object in the pack. Each entry is: -// -// 4-byte network byte order integer, recording where the -// object is stored in the packfile as the offset from the -// beginning. -// -// 20-byte object name. -// -// - The file is concluded with a trailer: -// -// A copy of the 20-byte SHA1 checksum at the end of -// corresponding packfile. -// -// 20-byte SHA1-checksum of all of the above. -// -// Pack Idx file: -// -// -- +--------------------------------+ -// fanout | fanout[0] = 2 (for example) |-. -// table +--------------------------------+ | -// | fanout[1] | | -// +--------------------------------+ | -// | fanout[2] | | -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | -// | fanout[255] = total objects |---. -// -- +--------------------------------+ | | -// main | offset | | | -// index | object name 00XXXXXXXXXXXXXXXX | | | -// table +--------------------------------+ | | -// | offset | | | -// | object name 00XXXXXXXXXXXXXXXX | | | -// +--------------------------------+<+ | -// .-| offset | | -// | | object name 01XXXXXXXXXXXXXXXX | | -// | +--------------------------------+ | -// | | offset | | -// | | object name 01XXXXXXXXXXXXXXXX | | -// | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | -// | | offset | | -// | | object name FFXXXXXXXXXXXXXXXX | | -// --| +--------------------------------+<--+ -// trailer | | packfile checksum | -// | +--------------------------------+ -// | | idxfile checksum | -// | +--------------------------------+ -// .-------. -// | -// Pack file entry: <+ -// -// packed object header: -// 1-byte size extension bit (MSB) -// type (next 3 bit) -// size0 (lower 4-bit) -// n-byte sizeN (as long as MSB is set, each 7-bit) -// size0..sizeN form 4+7+7+..+7 bit integer, size0 -// is the least significant part, and sizeN is the -// most significant part. 
-// packed object data: -// If it is not DELTA, then deflated bytes (the size above -// is the size before compression). -// If it is REF_DELTA, then -// 20-byte base object name SHA1 (the size above is the -// size of the delta data that follows). -// delta data, deflated. -// If it is OFS_DELTA, then -// n-byte offset (see below) interpreted as a negative -// offset from the type-byte of the header of the -// ofs-delta entry (the size above is the size of -// the delta data that follows). -// delta data, deflated. -// -// offset encoding: -// n bytes with MSB set in all but the last one. -// The offset is then the number constructed by -// concatenating the lower 7 bit of each byte, and -// for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) -// to the result. -// -// -// -// == Version 2 pack-*.idx files support packs larger than 4 GiB, and -// have some other reorganizations. They have the format: -// -// - A 4-byte magic number '\377tOc' which is an unreasonable -// fanout[0] value. -// -// - A 4-byte version number (= 2) -// -// - A 256-entry fan-out table just like v1. -// -// - A table of sorted 20-byte SHA1 object names. These are -// packed together without offset values to reduce the cache -// footprint of the binary search for a specific object name. -// -// - A table of 4-byte CRC32 values of the packed object data. -// This is new in v2 so compressed data can be copied directly -// from pack to pack during repacking without undetected -// data corruption. -// -// - A table of 4-byte offset values (in network byte order). -// These are usually 31-bit pack file offsets, but large -// offsets are encoded as an index into the next table with -// the msbit set. -// -// - A table of 8-byte offset entries (empty for pack files less -// than 2 GiB). Pack files are organized with heavily used -// objects toward the front, so most object references should -// not need to refer to this table. -// -// - The same trailer as a v1 pack file: -// -// A copy of the 20-byte SHA1 checksum at the end of -// corresponding packfile. -// -// 20-byte SHA1-checksum of all of the above. -// -// From: -// https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt + 20-byte SHA1-checksum of all of the above. + +Pack Idx file: + + -- +--------------------------------+ +fanout | fanout[0] = 2 (for example) |-. +table +--------------------------------+ | + | fanout[1] | | + +--------------------------------+ | + | fanout[2] | | + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | + | fanout[255] = total objects |---. + -- +--------------------------------+ | | +main | offset | | | +index | object name 00XXXXXXXXXXXXXXXX | | | +table +--------------------------------+ | | + | offset | | | + | object name 00XXXXXXXXXXXXXXXX | | | + +--------------------------------+<+ | + .-| offset | | + | | object name 01XXXXXXXXXXXXXXXX | | + | +--------------------------------+ | + | | offset | | + | | object name 01XXXXXXXXXXXXXXXX | | + | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | + | | offset | | + | | object name FFXXXXXXXXXXXXXXXX | | + --| +--------------------------------+<--+ +trailer | | packfile checksum | + | +--------------------------------+ + | | idxfile checksum | + | +--------------------------------+ + .-------. 
+ | +Pack file entry: <+ + + packed object header: + 1-byte size extension bit (MSB) + type (next 3 bit) + size0 (lower 4-bit) + n-byte sizeN (as long as MSB is set, each 7-bit) + size0..sizeN form 4+7+7+..+7 bit integer, size0 + is the least significant part, and sizeN is the + most significant part. + packed object data: + If it is not DELTA, then deflated bytes (the size above + is the size before compression). + If it is REF_DELTA, then + 20-byte base object name SHA1 (the size above is the + size of the delta data that follows). + delta data, deflated. + If it is OFS_DELTA, then + n-byte offset (see below) interpreted as a negative + offset from the type-byte of the header of the + ofs-delta entry (the size above is the size of + the delta data that follows). + delta data, deflated. + + offset encoding: + n bytes with MSB set in all but the last one. + The offset is then the number constructed by + concatenating the lower 7 bit of each byte, and + for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) + to the result. + + + +== Version 2 pack-*.idx files support packs larger than 4 GiB, and + have some other reorganizations. They have the format: + + - A 4-byte magic number '\377tOc' which is an unreasonable + fanout[0] value. + + - A 4-byte version number (= 2) + + - A 256-entry fan-out table just like v1. + + - A table of sorted 20-byte SHA1 object names. These are + packed together without offset values to reduce the cache + footprint of the binary search for a specific object name. + + - A table of 4-byte CRC32 values of the packed object data. + This is new in v2 so compressed data can be copied directly + from pack to pack during repacking without undetected + data corruption. + + - A table of 4-byte offset values (in network byte order). + These are usually 31-bit pack file offsets, but large + offsets are encoded as an index into the next table with + the msbit set. + + - A table of 8-byte offset entries (empty for pack files less + than 2 GiB). Pack files are organized with heavily used + objects toward the front, so most object references should + not need to refer to this table. + + - The same trailer as a v1 pack file: + + A copy of the 20-byte SHA1 checksum at the end of + corresponding packfile. + + 20-byte SHA1-checksum of all of the above. + +From: +https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt +*/ +package packfile diff --git a/formats/packfile/error.go b/formats/packfile/error.go new file mode 100644 index 0000000..c0b9163 --- /dev/null +++ b/formats/packfile/error.go @@ -0,0 +1,30 @@ +package packfile + +import "fmt" + +// Error specifies errors returned during packfile parsing. +type Error struct { + reason, details string +} + +// NewError returns a new error. +func NewError(reason string) *Error { + return &Error{reason: reason} +} + +// Error returns a text representation of the error. +func (e *Error) Error() string { + if e.details == "" { + return e.reason + } + + return fmt.Sprintf("%s: %s", e.reason, e.details) +} + +// AddDetails adds details to an error, with additional text. 
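+//
+// A usage sketch (illustrative, not part of the original patch):
+//
+//	err := ErrUnsupportedVersion.AddDetails("%d", 4)
+//	err.Error() // -> "unsupported packfile version: 4"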
+func (e *Error) AddDetails(format string, args ...interface{}) *Error {
+	return &Error{
+		reason:  e.reason,
+		details: fmt.Sprintf(format, args...),
+	}
+}
diff --git a/formats/packfile/parser.go b/formats/packfile/parser.go
new file mode 100644
index 0000000..d3463bd
--- /dev/null
+++ b/formats/packfile/parser.go
@@ -0,0 +1,353 @@
+package packfile
+
+import (
+	"bytes"
+	"compress/zlib"
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"gopkg.in/src-d/go-git.v3/core"
+	"gopkg.in/src-d/go-git.v3/storage/memory"
+)
+
+var (
+	// ErrEmptyPackfile is returned by ReadHeader when no data is found in the packfile.
+	ErrEmptyPackfile = NewError("empty packfile")
+	// ErrBadSignature is returned by ReadHeader when the signature in the packfile is incorrect.
+	ErrBadSignature = NewError("malformed pack file signature")
+	// ErrUnsupportedVersion is returned by ReadHeader when the packfile version is
+	// different than VersionSupported.
+	ErrUnsupportedVersion = NewError("unsupported packfile version")
+)
+
+const (
+	// VersionSupported is the packfile version supported by this parser.
+	VersionSupported = 2
+)
+
+// A Parser is a collection of functions to read and process data from a packfile.
+// Values of this type are not zero-value safe. See the NewParser function below.
+type Parser struct {
+	ReadRecaller
+}
+
+// NewParser returns a new Parser that reads from the packfile represented by r.
+func NewParser(r ReadRecaller) *Parser {
+	return &Parser{ReadRecaller: r}
+}
+
+// readInt32 reads 4 bytes and returns them as a big-endian uint32.
+func (p Parser) readInt32() (uint32, error) {
+	var v uint32
+	if err := binary.Read(p, binary.BigEndian, &v); err != nil {
+		return 0, err
+	}
+
+	return v, nil
+}
+
+// ReadSignature reads and returns the signature field in the packfile.
+func (p *Parser) ReadSignature() ([]byte, error) {
+	var sig = make([]byte, 4)
+	if _, err := io.ReadFull(p, sig); err != nil {
+		return []byte{}, err
+	}
+
+	return sig, nil
+}
+
+// IsValidSignature returns whether sig is a valid packfile signature.
+func (p Parser) IsValidSignature(sig []byte) bool {
+	return bytes.Equal(sig, []byte{'P', 'A', 'C', 'K'})
+}
+
+// ReadVersion reads and returns the version field of a packfile.
+func (p *Parser) ReadVersion() (uint32, error) {
+	return p.readInt32()
+}
+
+// IsSupportedVersion returns whether version v is supported by the parser.
+// The current supported version is VersionSupported, defined above.
+func (p *Parser) IsSupportedVersion(v uint32) bool {
+	return v == VersionSupported
+}
+
+// ReadCount reads and returns the count of objects field of a packfile.
+func (p *Parser) ReadCount() (uint32, error) {
+	return p.readInt32()
+}
+
+// ReadHeader reads the whole packfile header (signature, version and
+// object count). It returns the object count and performs checks on the
+// validity of the signature and the version fields.
+func (p Parser) ReadHeader() (uint32, error) {
+	sig, err := p.ReadSignature()
+	if err != nil {
+		if err == io.EOF {
+			return 0, ErrEmptyPackfile
+		}
+		return 0, err
+	}
+
+	if !p.IsValidSignature(sig) {
+		return 0, ErrBadSignature
+	}
+
+	ver, err := p.ReadVersion()
+	if err != nil {
+		return 0, err
+	}
+
+	if !p.IsSupportedVersion(ver) {
+		return 0, ErrUnsupportedVersion.AddDetails("%d", ver)
+	}
+
+	count, err := p.ReadCount()
+	if err != nil {
+		return 0, err
+	}
+
+	return count, nil
+}
+
+// ReadObjectTypeAndLength reads and returns the object type and the
+// length field from an object entry in a packfile.
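+//
+// For example (an illustrative sketch, not part of the original patch),
+// the two bytes {0x95, 0x0a} decode as follows: the first byte, 1001 0101,
+// has its MSB set (more length bytes follow), type bits 001
+// (core.CommitObject) and low length bits 0101; the second byte
+// contributes 0x0a << 4:
+//
+//	typ, length, err := p.ReadObjectTypeAndLength()
+//	// typ == core.CommitObject, length == 5 + (10 << 4) == 165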
+func (p Parser) ReadObjectTypeAndLength() (core.ObjectType, int64, error) {
+	t, c, err := p.readType()
+	if err != nil {
+		return t, 0, err
+	}
+
+	l, err := p.readLength(c)
+
+	return t, l, err
+}
+
+func (p Parser) readType() (core.ObjectType, byte, error) {
+	var c byte
+	var err error
+	if c, err = p.ReadByte(); err != nil {
+		return core.ObjectType(0), 0, err
+	}
+	typ := parseType(c)
+
+	return typ, c, nil
+}
+
+var (
+	maskContinue    = uint8(128) // 1000 0000
+	maskType        = uint8(112) // 0111 0000
+	maskFirstLength = uint8(15)  // 0000 1111
+	firstLengthBits = uint8(4)   // the first byte has 4 bits to store the length
+	maskLength      = uint8(127) // 0111 1111
+	lengthBits      = uint8(7)   // subsequent bytes have 7 bits to store the length
+)
+
+func parseType(b byte) core.ObjectType {
+	return core.ObjectType((b & maskType) >> firstLengthBits)
+}
+
+// the length is encoded in the last 4 bits of the first byte and in
+// the last 7 bits of subsequent bytes. The last byte has a 0 MSB.
+func (p Parser) readLength(first byte) (int64, error) {
+	length := int64(first & maskFirstLength)
+
+	c := first
+	shift := firstLengthBits
+	var err error
+	for moreBytesInLength(c) {
+		if c, err = p.ReadByte(); err != nil {
+			return 0, err
+		}
+
+		length += int64(c&maskLength) << shift
+		shift += lengthBits
+	}
+
+	return length, nil
+}
+
+func moreBytesInLength(c byte) bool {
+	return c&maskContinue > 0
+}
+
+// ReadObject reads and returns a git object from an object entry in the packfile.
+// Non-deltified and deltified objects are supported.
+func (p Parser) ReadObject() (core.Object, error) {
+	start, err := p.Offset()
+	if err != nil {
+		return nil, err
+	}
+
+	var typ core.ObjectType
+	typ, _, err = p.ReadObjectTypeAndLength()
+	if err != nil {
+		return nil, err
+	}
+
+	var cont []byte
+	switch typ {
+	case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject:
+		cont, err = p.ReadNonDeltaObjectContent()
+	case core.REFDeltaObject:
+		cont, typ, err = p.ReadREFDeltaObjectContent()
+	case core.OFSDeltaObject:
+		cont, typ, err = p.ReadOFSDeltaObjectContent(start)
+	default:
+		err = ErrInvalidObject.AddDetails("tag %q", typ)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return memory.NewObject(typ, int64(len(cont)), cont), nil
+}
+
+// ReadNonDeltaObjectContent reads and returns a non-deltified object
+// from its zlib stream in an object entry in the packfile.
+func (p Parser) ReadNonDeltaObjectContent() ([]byte, error) {
+	return p.readZip()
+}
+
+func (p Parser) readZip() ([]byte, error) {
+	buf := bytes.NewBuffer(nil)
+	err := p.inflate(buf)
+
+	return buf.Bytes(), err
+}
+
+func (p Parser) inflate(w io.Writer) (err error) {
+	zr, err := zlib.NewReader(p)
+	if err != nil {
+		if err != zlib.ErrHeader {
+			return fmt.Errorf("zlib reading error: %s", err)
+		}
+	}
+
+	defer func() {
+		closeErr := zr.Close()
+		if err == nil {
+			err = closeErr
+		}
+	}()
+
+	_, err = io.Copy(w, zr)
+
+	return err
+}
+
+// ReadREFDeltaObjectContent reads and returns an object specified by a
+// REF-delta entry in the packfile, from the hash onwards.
+func (p Parser) ReadREFDeltaObjectContent() ([]byte, core.ObjectType, error) {
+	refHash, err := p.ReadHash()
+	if err != nil {
+		return nil, core.ObjectType(0), err
+	}
+
+	refObj, err := p.RecallByHash(refHash)
+	if err != nil {
+		return nil, core.ObjectType(0), err
+	}
+
+	content, err := p.ReadSolveDelta(refObj.Content())
+	if err != nil {
+		return nil, refObj.Type(), err
+	}
+
+	return content, refObj.Type(), nil
+}
+
+// ReadHash reads and returns a 20-byte object hash.
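+//
+// A usage sketch (illustrative, not part of the original patch): in a
+// REF-delta entry the hash read here names the base object, which can
+// then be recovered through the ReadRecaller:
+//
+//	hash, _ := p.ReadHash()
+//	base, _ := p.RecallByHash(hash) // the previously parsed base object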
+func (p Parser) ReadHash() (core.Hash, error) {
+	var h core.Hash
+	if _, err := io.ReadFull(p, h[:]); err != nil {
+		return core.ZeroHash, err
+	}
+
+	return h, nil
+}
+
+// ReadSolveDelta reads and returns the base patched with the contents
+// of the zlib-compressed diff data in the delta portion of an object
+// entry in the packfile.
+func (p Parser) ReadSolveDelta(base []byte) ([]byte, error) {
+	diff, err := p.readZip()
+	if err != nil {
+		return nil, err
+	}
+
+	return PatchDelta(base, diff), nil
+}
+
+// ReadOFSDeltaObjectContent reads and returns an object specified by an
+// OFS-delta entry in the packfile, from its negative offset onwards. The
+// start parameter is the offset of this particular object entry (the
+// current offset minus the already processed type and length).
+func (p Parser) ReadOFSDeltaObjectContent(start int64) (
+	[]byte, core.ObjectType, error) {
+
+	jump, err := p.ReadNegativeOffset()
+	if err != nil {
+		return nil, core.ObjectType(0), err
+	}
+
+	ref, err := p.RecallByOffset(start + jump)
+	if err != nil {
+		return nil, core.ObjectType(0), err
+	}
+
+	content, err := p.ReadSolveDelta(ref.Content())
+	if err != nil {
+		return nil, ref.Type(), err
+	}
+
+	return content, ref.Type(), nil
+}
+
+// ReadNegativeOffset reads and returns an offset from an OFS-delta
+// object entry in a packfile. OFS-delta offsets are specified in Git
+// VLQ special format:
+//
+// Ordinary VLQ has some redundancies, for example: the number 358 can be
+// encoded as the 2-octet VLQ 0x8166 or the 3-octet VLQ 0x808166 or the
+// 4-octet VLQ 0x80808166 and so forth.
+//
+// To avoid these redundancies, the VLQ format used in Git removes this
+// prepending redundancy and extends the representable range of shorter
+// VLQs by adding an offset to VLQs of 2 or more octets in such a way
+// that the lowest possible value for such an (N+1)-octet VLQ becomes
+// exactly one more than the maximum possible value for an N-octet VLQ.
+// In particular, since a 1-octet VLQ can store a maximum value of 127,
+// the minimum 2-octet VLQ (0x8000) is assigned the value 128 instead of
+// 0. Conversely, the maximum value of such a 2-octet VLQ (0xff7f) is
+// 16511 instead of just 16383. Similarly, the minimum 3-octet VLQ
+// (0x808000) has a value of 16512 instead of zero, which means
+// that the maximum 3-octet VLQ (0xffff7f) is 2113663 instead of
+// just 2097151. And so forth.
+//
+// This is how the offset is saved in C:
+//
+//	dheader[pos] = ofs & 127;
+//	while (ofs >>= 7)
+//		dheader[--pos] = 128 | (--ofs & 127);
+//
+func (p Parser) ReadNegativeOffset() (int64, error) {
+	var c byte
+	var err error
+
+	if c, err = p.ReadByte(); err != nil {
+		return 0, err
+	}
+
+	var offset = int64(c & maskLength)
+	for moreBytesInLength(c) {
+		offset++
+		if c, err = p.ReadByte(); err != nil {
+			return 0, err
+		}
+		offset = (offset << lengthBits) + int64(c&maskLength)
+	}
+
+	return -offset, nil
+}
diff --git a/formats/packfile/parser_test.go b/formats/packfile/parser_test.go
new file mode 100644
index 0000000..12d5f0d
--- /dev/null
+++ b/formats/packfile/parser_test.go
@@ -0,0 +1,412 @@
+package packfile
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"os"
+
+	.
"gopkg.in/check.v1" + "gopkg.in/src-d/go-git.v3/core" + "gopkg.in/src-d/go-git.v3/storage/memory" +) + +const ( + sigOffset = 0 + verOffset = 4 + countOffset = 8 +) + +type ParserSuite struct { + fixtures map[string]*fix +} + +type fix struct { + path string + parser *Parser + seekable io.Seeker +} + +func newFix(path string) (*fix, error) { + fix := new(fix) + fix.path = path + + f, err := os.Open(path) + if err != nil { + return nil, err + } + + data, err := ioutil.ReadAll(f) + if err != nil { + return nil, err + } + + if err = f.Close(); err != nil { + return nil, err + } + + seekable := NewSeekable(bytes.NewReader(data)) + fix.seekable = seekable + fix.parser = NewParser(seekable) + + return fix, nil +} + +func (f *fix) seek(o int64) error { + _, err := f.seekable.Seek(o, os.SEEK_SET) + return err +} + +var _ = Suite(&ParserSuite{}) + +func (s *ParserSuite) SetUpSuite(c *C) { + s.fixtures = make(map[string]*fix) + for _, fixData := range []struct { + id string + path string + }{ + {"ofs-deltas", "fixtures/alcortesm-binary-relations.pack"}, + {"ref-deltas", "fixtures/git-fixture.ref-delta"}, + } { + fix, err := newFix(fixData.path) + c.Assert(err, IsNil, + Commentf("setting up fixture id %s: %s", fixData.id, err)) + + _, ok := s.fixtures[fixData.id] + c.Assert(ok, Equals, false, + Commentf("duplicated fixture id: %s", fixData.id)) + + s.fixtures[fixData.id] = fix + } +} + +func (s *ParserSuite) TestSignature(c *C) { + for id, fix := range s.fixtures { + com := Commentf("fixture id = %s", id) + err := fix.seek(sigOffset) + c.Assert(err, IsNil, com) + p := fix.parser + + sig, err := p.ReadSignature() + c.Assert(err, IsNil, com) + c.Assert(p.IsValidSignature(sig), Equals, true, com) + } +} + +func (s *ParserSuite) TestVersion(c *C) { + for i, test := range [...]struct { + fixID string + expected uint32 + }{ + { + fixID: "ofs-deltas", + expected: uint32(2), + }, { + fixID: "ref-deltas", + expected: uint32(2), + }, + } { + com := Commentf("test %d) fixture id = %s", i, test.fixID) + fix, ok := s.fixtures[test.fixID] + c.Assert(ok, Equals, true, com) + + err := fix.seek(verOffset) + c.Assert(err, IsNil, com) + p := fix.parser + + v, err := p.ReadVersion() + c.Assert(err, IsNil, com) + c.Assert(v, Equals, test.expected, com) + c.Assert(p.IsSupportedVersion(v), Equals, true, com) + } +} + +func (s *ParserSuite) TestCount(c *C) { + for i, test := range [...]struct { + fixID string + expected uint32 + }{ + { + fixID: "ofs-deltas", + expected: uint32(0x50), + }, { + fixID: "ref-deltas", + expected: uint32(0x1c), + }, + } { + com := Commentf("test %d) fixture id = %s", i, test.fixID) + fix, ok := s.fixtures[test.fixID] + c.Assert(ok, Equals, true, com) + + err := fix.seek(countOffset) + c.Assert(err, IsNil, com) + p := fix.parser + + count, err := p.ReadCount() + c.Assert(err, IsNil, com) + c.Assert(count, Equals, test.expected, com) + } +} + +func (s *ParserSuite) TestReadObjectTypeAndLength(c *C) { + for i, test := range [...]struct { + fixID string + offset int64 + expType core.ObjectType + expLength int64 + }{ + { + fixID: "ofs-deltas", + offset: 12, + expType: core.CommitObject, + expLength: 342, + }, { + fixID: "ofs-deltas", + offset: 1212, + expType: core.OFSDeltaObject, + expLength: 104, + }, { + fixID: "ofs-deltas", + offset: 3193, + expType: core.TreeObject, + expLength: 226, + }, { + fixID: "ofs-deltas", + offset: 3639, + expType: core.BlobObject, + expLength: 90, + }, { + fixID: "ofs-deltas", + offset: 4504, + expType: core.BlobObject, + expLength: 7107, + }, { + fixID: "ref-deltas", + 
offset: 84849, + expType: core.REFDeltaObject, + expLength: 6, + }, { + fixID: "ref-deltas", + offset: 85070, + expType: core.REFDeltaObject, + expLength: 8, + }, + } { + com := Commentf("test %d) fixture id = %s", i, test.fixID) + fix, ok := s.fixtures[test.fixID] + c.Assert(ok, Equals, true, com) + + err := fix.seek(test.offset) + c.Assert(err, IsNil, com) + p := fix.parser + + typ, length, err := p.ReadObjectTypeAndLength() + c.Assert(err, IsNil, com) + c.Assert(typ, Equals, test.expType, com) + c.Assert(length, Equals, test.expLength, com) + } +} + +func (s *ParserSuite) TestReadNonDeltaObjectContent(c *C) { + for i, test := range [...]struct { + fixID string + offset int64 + expected []byte + }{ + { + fixID: "ofs-deltas", + offset: 12, + expected: []byte("tree 87c87d16e815a43e4e574dd8edd72c5450ac3a8e\nparent a87d72684d1cf68099ce6e9f68689e25e645a14c\nauthor Gorka Guardiola 1450265632 +0100\ncommitter Gorka Guardiola 1450265632 +0100\n\nChanged example to use dot.\nI did not remove the original files outside of the\ntex, I leave that to alcortes.\n"), + }, { + fixID: "ofs-deltas", + offset: 1610, + expected: []byte("tree 4b4f0d9a07109ef0b8a3051138cc20cdb47fa513\nparent b373f85fa2594d7dcd9989f4a5858a81647fb8ea\nauthor Alberto Cortés 1448017995 +0100\ncommitter Alberto Cortés 1448018112 +0100\n\nMove generated images to it own dir (img/)\n\nFixes #1.\n"), + }, { + fixID: "ofs-deltas", + offset: 10566, + expected: []byte("40000 map-slice\x00\x00\xce\xfb\x8ew\xf7\xa8\xc6\x1b\x99\xdd$\x91\xffH\xa3\xb0\xb1fy40000 simple-arrays\x00\x9a7\x81\xb7\xfd\x9d(Q\xe2\xa4H\x8c\x03^٬\x90Z\xecy"), + }, + } { + com := Commentf("test %d) fixture id = %s", i, test.fixID) + fix, ok := s.fixtures[test.fixID] + c.Assert(ok, Equals, true, com) + + err := fix.seek(test.offset) + c.Assert(err, IsNil, com) + p := fix.parser + + _, _, err = p.ReadObjectTypeAndLength() + c.Assert(err, IsNil, com) + + cont, err := p.ReadNonDeltaObjectContent() + c.Assert(err, IsNil, com) + c.Assert(cont, DeepEquals, test.expected, com) + } +} + +func (s *ParserSuite) TestReadOFSDeltaObjectContent(c *C) { + for i, test := range [...]struct { + fixID string + offset int64 + expOffset int64 + expType core.ObjectType + expContent []byte + }{ + { + fixID: "ofs-deltas", + offset: 1212, + expOffset: -212, + expType: core.CommitObject, + expContent: []byte("tree c4573589ce78ac63769c20742b9a970f6e274a38\nparent 4571a24948494ebe1cb3dc18ca5a9286e79705ae\nauthor Alberto Cortés 1448139640 +0100\ncommitter Alberto Cortés 1448139640 +0100\n\nUpdate reference to binrels module\n"), + }, { + fixID: "ofs-deltas", + offset: 3514, + expOffset: -102, + expType: core.TreeObject, + expContent: []byte("100644 .gitignore\x00\u007fA\x90[Mw\xabJ\x9a-3O\xcd\x0f\xb5\xdbn\x8e!\x83100644 .gitmodules\x00\xd4`\xa8>\x15\xcfd\x05\x81B7_\xc4\v\x04\xa7\xa9A\x85\n100644 Makefile\x00-ҭ\x8c\x14\xdef\x12\xed\x15\x816y\xa6UK\xad\x993\v100644 binary-relations.tex\x00\x802\x05@\x11'^ \xf5<\xf7\xfd\x81%3\xd1o\xa9_$40000 graphs\x00\xdehu\x16\xc6\x0e\\H\x8e\xe9\xa1JIXE\xbaڽg\xc540000 imgs-gen\x00\xeb\"\xddhzg\xa3\x1f\xc8j\xc5\xfc豢\xe9\x96\xce\xce^40000 src\x00\x895\x11t\xff\x86\xa7\xea\xa6\xc0v%\x11E\x10f,ݒ\x1a"), + }, { + fixID: "ofs-deltas", + offset: 9806, + expOffset: -6613, + expType: core.TreeObject, + expContent: []byte("100644 .gitignore\x00\u007fA\x90[Mw\xabJ\x9a-3O\xcd\x0f\xb5\xdbn\x8e!\x83100644 .gitmodules\x00\xd4`\xa8>\x15\xcfd\x05\x81B7_\xc4\v\x04\xa7\xa9A\x85\n100644 Makefile\x00-ҭ\x8c\x14\xdef\x12\xed\x15\x816y\xa6UK\xad\x993\v100644 
binary-relations.tex\x00I\x13~\xb8کEU\x9f\x99#\xc4E.\x9d>\uef1e\xad40000 graphs\x00\xb9\x00\xf34\xde\xff\xce@+\xbd\xf8 9\xb8=\xc1\xb9\x00\x84]40000 imgs-gen\x00\xeb\"\xddhzg\xa3\x1f\xc8j\xc5\xfc豢\xe9\x96\xce\xce^40000 src\x00\x895\x11t\xff\x86\xa7\xea\xa6\xc0v%\x11E\x10f,ݒ\x1a"), + }, + } { + com := Commentf("test %d) fixture id = %s", i, test.fixID) + fix, ok := s.fixtures[test.fixID] + c.Assert(ok, Equals, true, com) + + err := fix.seek(test.offset) + c.Assert(err, IsNil, com) + p := fix.parser + + _, _, err = p.ReadObjectTypeAndLength() + c.Assert(err, IsNil, com) + + beforeJumpSize, err := p.Offset() + c.Assert(err, IsNil, com) + + jump, err := p.ReadNegativeOffset() + c.Assert(err, IsNil, com) + c.Assert(jump, Equals, test.expOffset, com) + + err = fix.seek(beforeJumpSize) + c.Assert(err, IsNil, com) + + cont, typ, err := p.ReadOFSDeltaObjectContent(test.offset) + c.Assert(err, IsNil, com) + c.Assert(typ, Equals, test.expType, com) + c.Assert(cont, DeepEquals, test.expContent, com) + } +} + +func (s *ParserSuite) TestReadREFDeltaObjectContent(c *C) { + for i, test := range [...]struct { + fixID string + offset int64 + deps map[int64]core.Object + expHash core.Hash + expType core.ObjectType + expContent []byte + }{ + { + fixID: "ref-deltas", + offset: 84849, + deps: map[int64]core.Object{ + 83607: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa40000 vendor\x00\xcfJ\xa3\xb3\x89t\xfb}\x81\xf3g\xc0\x83\x0f}x\xd6Z\xb8k")), + }, + expHash: core.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c"), + expType: core.TreeObject, + expContent: []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa"), + }, { + fixID: "ref-deltas", + offset: 85070, + deps: map[int64]core.Object{ + 84922: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r")), + 84849: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 
php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa")), + 83607: newObject(core.TreeObject, []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 CHANGELOG\x00\xd3\xffS\xe0VJ\x9f\x87\xd8\xe8Kn(\xe5\x06\x0eQp\b\xaa100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r40000 go\x00\xa3\x97q\xa7e\x1f\x97\xfa\xf5\xc7.\b\"M\x85\u007f\xc3Q3\xdb40000 json\x00Z\x87~j\x90j'C\xadnEٜ\x17\x93d*\xaf\x8e\xda40000 php\x00Xj\xf5gл^w\x1eI\xbd\xd9CO^\x0f\xb7m%\xfa40000 vendor\x00\xcfJ\xa3\xb3\x89t\xfb}\x81\xf3g\xc0\x83\x0f}x\xd6Z\xb8k")), + }, + expHash: core.NewHash("eba74343e2f15d62adedfd8c883ee0262b5c8021"), + expType: core.TreeObject, + expContent: []byte("100644 .gitignore\x002\x85\x8a\xad<8>\xd1\xff\n\x0f\x9b\xdf#\x1dT\xa0\f\x9e\x88100644 LICENSE\x00\xc1\x92\xbdj$\xea\x1a\xb0\x1dxhnA|\x8b\xdc|=\x19\u007f100644 binary.jpg\x00\xd5\xc0\xf4\xab\x81\x18\x97\xca\xdf\x03\xae\xc3X\xae`\xd2\x1f\x91\xc5\r"), + }, + } { + com := Commentf("test %d) fixture id = %s", i, test.fixID) + fix, ok := s.fixtures[test.fixID] + c.Assert(ok, Equals, true, com) + + err := fix.seek(test.offset) + c.Assert(err, IsNil, com) + p := fix.parser + for k, v := range test.deps { + err = p.Remember(k, v) + c.Assert(err, IsNil, com) + } + + _, _, err = p.ReadObjectTypeAndLength() + c.Assert(err, IsNil, com) + + beforeHash, err := p.Offset() + c.Assert(err, IsNil, com) + + hash, err := p.ReadHash() + c.Assert(err, IsNil, com) + c.Assert(hash, Equals, test.expHash, com) + + err = fix.seek(beforeHash) + c.Assert(err, IsNil, com) + + cont, typ, err := p.ReadREFDeltaObjectContent() + c.Assert(err, IsNil, com) + c.Assert(typ, Equals, test.expType, com) + c.Assert(cont, DeepEquals, test.expContent, com) + + p.ForgetAll() + } +} + +func newObject(t core.ObjectType, c []byte) *memory.Object { + return memory.NewObject(t, int64(len(c)), c) +} + +func (s *ParserSuite) TestReadHeaderBadSignatureError(c *C) { + data := []byte{ + 0x50, 0x42, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x50, + } + p := NewParser(NewSeekable(bytes.NewReader(data))) + + _, err := p.ReadHeader() + c.Assert(err, ErrorMatches, ErrBadSignature.Error()) +} + +func (s *ParserSuite) TestReadHeaderEmptyPackfileError(c *C) { + data := []byte{} + p := NewParser(NewSeekable(bytes.NewReader(data))) + + _, err := p.ReadHeader() + c.Assert(err, ErrorMatches, ErrEmptyPackfile.Error()) +} + +func (s *ParserSuite) TestReadHeaderUnsupportedVersionError(c *C) { + data := []byte{ + 0x50, 0x41, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x50, + } + p := NewParser(NewSeekable(bytes.NewReader(data))) + + _, err := p.ReadHeader() + c.Assert(err, ErrorMatches, ErrUnsupportedVersion.Error()+".*") +} + +func (s *ParserSuite) TestReadHeader(c *C) { + data := []byte{ + 0x50, 0x41, 0x43, 0x4b, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x50, + } + p := NewParser(NewSeekable(bytes.NewReader(data))) + + count, err := p.ReadHeader() + c.Assert(err, IsNil) + c.Assert(count, Equals, uint32(0x50)) +} diff --git a/formats/packfile/read_recaller.go b/formats/packfile/read_recaller.go new file mode 100644 index 0000000..92ab1b2 --- /dev/null +++ b/formats/packfile/read_recaller.go @@ -0,0 +1,39 @@ +package packfile + +import "gopkg.in/src-d/go-git.v3/core" + +var ( + // ErrDuplicatedObject is returned by Remember if an object appears several + // times in a packfile. 
+ ErrDuplicatedObject = NewError("duplicated object") + // ErrCannotRecall is returned by RecallByOffset or RecallByHash if the object + // to recall cannot be returned. + ErrCannotRecall = NewError("cannot recall object") +) + +// The ReadRecaller interface has all the functions needed by a packfile +// Parser to operate. We provide two very different implementations: +// Seekable and Stream. +type ReadRecaller interface { + // Read reads up to len(p) bytes into p. + Read(p []byte) (int, error) + // ReadByte is needed because of these: + // - https://github.com/golang/go/commit/7ba54d45732219af86bde9a5b73c145db82b70c6 + // - https://groups.google.com/forum/#!topic/golang-nuts/fWTRdHpt0QI + // - https://gowalker.org/compress/zlib#NewReader + ReadByte() (byte, error) + // Offset returns the number of bytes parsed so far from the + // packfile. + Offset() (int64, error) + // Remember asks the ReadRecaller to remember the offset and hash for + // an object, so you can later call RecallByOffset and RecallByHash. + Remember(int64, core.Object) error + // ForgetAll forgets all previously remembered objects. + ForgetAll() + // RecallByOffset returns the previously processed object found at a + // given offset. + RecallByOffset(int64) (core.Object, error) + // RecallByHash returns the previously processed object with the + // given hash. + RecallByHash(core.Hash) (core.Object, error) +} diff --git a/formats/packfile/read_recaller_impl_test.go b/formats/packfile/read_recaller_impl_test.go new file mode 100644 index 0000000..438439d --- /dev/null +++ b/formats/packfile/read_recaller_impl_test.go @@ -0,0 +1,296 @@ +package packfile + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + + "gopkg.in/src-d/go-git.v3/core" + "gopkg.in/src-d/go-git.v3/storage/memory" + + . "gopkg.in/check.v1" +) + +type ReadRecallerImplSuite struct{} + +var _ = Suite(&ReadRecallerImplSuite{}) + +type implFn func([]byte) ReadRecaller + +func newStream(data []byte) ReadRecaller { + buf := bytes.NewBuffer(data) + return NewStream(buf) +} + +func newSeekable(data []byte) ReadRecaller { + buf := bytes.NewReader(data) + return NewSeekable(buf) +} + +func (s *ReadRecallerImplSuite) TestRead(c *C) { + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + com := Commentf("implementation %s", impl.id) + data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} + sr := impl.newFn(data) + all := make([]byte, 0, len(data)) + + for len(all) < len(data) { + tmp := make([]byte, 3) + nr, err := sr.Read(tmp) + c.Assert(err, IsNil, com) + all = append(all, tmp[:nr]...)
+ } + c.Assert(data, DeepEquals, all, com) + } +} + +func (s *ReadRecallerImplSuite) TestReadbyte(c *C) { + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + com := Commentf("implementation %s", impl.id) + data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} + sr := impl.newFn(data) + all := make([]byte, 0, len(data)) + + for len(all) < len(data) { + b, err := sr.ReadByte() + c.Assert(err, IsNil, com) + all = append(all, b) + } + c.Assert(data, DeepEquals, all, com) + } +} + +func (s *ReadRecallerImplSuite) TestOffsetWithRead(c *C) { + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + com := Commentf("implementation %s", impl.id) + data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} + sr := impl.newFn(data) + all := make([]byte, 0, len(data)) + + for len(all) < len(data) { + tmp := make([]byte, 3) + nr, err := sr.Read(tmp) + c.Assert(err, IsNil, com) + all = append(all, tmp[:nr]...) + + off, err := sr.Offset() + c.Assert(err, IsNil, com) + c.Assert(off, Equals, int64(len(all)), com) + } + } +} + +func (s *ReadRecallerImplSuite) TestOffsetWithReadByte(c *C) { + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + com := Commentf("implementation %s", impl.id) + data := []byte{0, 1, 2, 3, 4, 5, 7, 8, 9, 10} + sr := impl.newFn(data) + all := make([]byte, 0, len(data)) + + for len(all) < len(data) { + b, err := sr.ReadByte() + c.Assert(err, IsNil, com) + all = append(all, b) + + off, err := sr.Offset() + c.Assert(err, IsNil, com) + c.Assert(off, Equals, int64(len(all)), com) + } + } +} + +func (s *ReadRecallerImplSuite) TestRememberRecall(c *C) { + packfile := "fixtures/spinnaker-spinnaker.pack" + f, err := os.Open(packfile) + c.Assert(err, IsNil) + defer func() { + err = f.Close() + c.Assert(err, IsNil) + }() + + data, err := ioutil.ReadAll(f) + c.Assert(err, IsNil) + + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + sr := impl.newFn(data) + for i, test := range [...]struct { + off int64 + obj core.Object + err string // error regexp + ignore string // ignore this test for this implementation + }{ + { + off: 12, + obj: newObj(core.CommitObject, []byte("tree 44a1cdf21c791867c51caad8f1b77e6baee6f462\nparent 87fe6e7c6b1b89519fe3a03a8961c5aa14d4cc68\nparent 9244ee648182b91a63d8cc4cbe4b9ac2a27c0492\nauthor Matt Duftler 1448290941 -0500\ncommitter Matt Duftler 1448290941 -0500\n\nMerge pull request #615 from ewiseblatt/create_dev\n\nPreserve original credentials of spinnaker-local.yml when transforming it.")), + }, { + off: 3037, + obj: newObj(core.TagObject, []byte("object e0005f50e22140def60260960b21667f1fdfff80\ntype commit\ntag v0.10.0\ntagger cfieber 1447687536 -0800\n\nRelease of 0.10.0\n\n- e0005f50e22140def60260960b21667f1fdfff80: Merge pull request #553 from ewiseblatt/rendezvous\n- e1a2b26b784179e6903a7ae967c037c721899eba: Wait for cassandra before starting spinnaker\n- c756e09461d071e98b8660818cf42d90c90f2854: Merge pull request #552 from duftler/google-c2d-tweaks\n- 0777fadf4ca6f458d7071de414f9bd5417911037: Fix incorrect config prop names: s/SPINNAKER_GOOGLE_PROJECT_DEFAULT_REGION/SPINNAKER_GOOGLE_DEFAULT_REGION s/SPINNAKER_GOOGLE_PROJECT_DEFAULT_ZONE/SPINNAKER_GOOGLE_DEFAULT_ZONE Hardcode profile name in generated 
~/.aws/credentials to [default]. Restart all of spinnaker after updating cassandra and reconfiguring spinnaker, instead of just restarting clouddriver.\n- d8d031c1ac45801074418c43424a6f2c0dff642c: Merge pull request #551 from kenzanmedia/fixGroup\n- 626d23075f9e92aad19015f2964c95d45f41fa3a: Put in correct block for public image. Delineate cloud provider.\n")), + }, { + off: 157625, + obj: newObj(core.BlobObject, []byte(".gradle\nbuild/\n*.iml\n.idea\n*.pyc\n*~\n#*\nconfig/spinnaker-local.yml\n.DS_Store\npacker/ami_table.md\npacker/ami_table.json\npacker/example_output.txt")), + }, { + off: 1234, + obj: newObj(core.BlobObject, []byte(".gradle\nbuild/\n*.iml\n.idea\n*.pyc\n*~\n#*\nconfig/spinnaker-local.yml\n.DS_Store\npacker/ami_table.md\npacker/ami_table.json\npacker/example_output.txt")), + err: "duplicated object: with hash .*", + }, { + off: 3037, + obj: newObj(core.BlobObject, []byte("")), + err: "duplicated object: with offset 3037", + ignore: "seekable", + // seekable cannot check if the offset has already been added + // for performance reasons. + }, + } { + if test.ignore == impl.id { + continue + } + com := Commentf("subtest %d) implementation %s", i, impl.id) + + err := sr.Remember(test.off, test.obj) + if test.err != "" { + c.Assert(err, ErrorMatches, test.err, com) + continue + } + c.Assert(err, IsNil, com) + + result, err := sr.RecallByHash(test.obj.Hash()) + c.Assert(err, IsNil, com) + c.Assert(result, DeepEquals, test.obj, com) + + result, err = sr.RecallByOffset(test.off) + c.Assert(err, IsNil, com) + c.Assert(result, DeepEquals, test.obj, com) + } + } +} + +func newObj(typ core.ObjectType, cont []byte) core.Object { + return memory.NewObject(typ, int64(len(cont)), cont) +} + +func (s *ReadRecallerImplSuite) TestRecallByHashErrors(c *C) { + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + com := Commentf("implementation %s", impl.id) + sr := impl.newFn([]byte{}) + obj := newObj(core.CommitObject, []byte{}) + + _, err := sr.RecallByHash(obj.Hash()) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + + err = rememberSomeObjects(sr) + c.Assert(err, IsNil) + + _, err = sr.RecallByHash(obj.Hash()) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + } +} + +func (s *ReadRecallerImplSuite) TestRecallByOffsetErrors(c *C) { + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + // seekable always recalls every object in the packfile + } { + com := Commentf("implementation %s", impl.id) + sr := impl.newFn([]byte{}) + + _, err := sr.RecallByOffset(15) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + + err = rememberSomeObjects(sr) + c.Assert(err, IsNil) + + _, err = sr.RecallByOffset(15) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + } +} + +func rememberSomeObjects(sr ReadRecaller) error { + for i, init := range [...]struct { + off int64 + obj core.Object + }{ + {off: 0, obj: newObj(core.CommitObject, []byte{'a'})}, // 93114cce67ec23976d15199514399203f69cc676 + {off: 10, obj: newObj(core.CommitObject, []byte{'b'})}, // 2bb767097e479f668f0ebdabe88df11337bd8f19 + {off: 20, obj: newObj(core.CommitObject, []byte{'c'})}, // 2f8096005677370e6446541a50e074299d43d468 + } { + err := sr.Remember(init.off, init.obj) + if err != nil { + return fmt.Errorf("cannot ask ReadRecaller to Remember item %d", i) + } + } + + return nil +} + +func (s *ReadRecallerImplSuite) TestForgetAll(c *C)
{ + for _, impl := range []struct { + id string + newFn implFn + }{ + {id: "stream", newFn: newStream}, + {id: "seekable", newFn: newSeekable}, + } { + com := Commentf("implementation %s", impl.id) + sr := impl.newFn([]byte{}) + + err := rememberSomeObjects(sr) + c.Assert(err, IsNil) + + sr.ForgetAll() + + if impl.id != "seekable" { // for efficiency, seekable always finds objects by offset + _, err = sr.RecallByOffset(0) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + _, err = sr.RecallByOffset(10) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + _, err = sr.RecallByOffset(20) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + } + _, err = sr.RecallByHash(core.NewHash("93114cce67ec23976d15199514399203f69cc676")) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + _, err = sr.RecallByHash(core.NewHash("2bb767097e479f668f0ebdabe88df11337bd8f19")) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + _, err = sr.RecallByHash(core.NewHash("2f8096005677370e6446541a50e074299d43d468")) + c.Assert(err, ErrorMatches, ErrCannotRecall.Error()+".*", com) + } +} diff --git a/formats/packfile/reader.go b/formats/packfile/reader.go deleted file mode 100644 index 3f7081b..0000000 --- a/formats/packfile/reader.go +++ /dev/null @@ -1,338 +0,0 @@ -package packfile - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - "io/ioutil" - - "gopkg.in/src-d/go-git.v3/core" - - "github.com/klauspost/compress/zlib" -) - -type Format int - -var ( - EmptyRepositoryErr = newError("empty repository") - UnsupportedVersionErr = newError("unsupported packfile version") - MaxObjectsLimitReachedErr = newError("max. objects limit reached") - MalformedPackfileErr = newError("malformed pack file, does not start with 'PACK'") - InvalidObjectErr = newError("invalid git object") - PatchingErr = newError("patching error") - PackEntryNotFoundErr = newError("can't find a pack entry") - ErrObjectNotFound = newError("can't find a object") - ZLibErr = newError("zlib reading error") -) - -const ( - DefaultMaxObjectsLimit = 1 << 20 - - VersionSupported = 2 - UnknownFormat Format = 0 - OFSDeltaFormat Format = 1 - REFDeltaFormat Format = 2 -) - -// Reader reads a packfile from a binary string splitting it on objects -type Reader struct { - // MaxObjectsLimit is the limit of objects to be load in the packfile, if - // a packfile excess this number an error is throw, the default value - // is defined by DefaultMaxObjectsLimit, usually the default limit is more - // than enough to work with any repository, working extremely big repositories - // where the number of object is bigger the memory can be exhausted. 
- MaxObjectsLimit uint32 - - // Format specifies if we are using ref-delta's or ofs-delta's, choosing the - // correct format the memory usage is optimized - // https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/Documentation/technical/protocol-capabilities.txt#L154 - Format Format - - r *trackingReader - s core.ObjectStorage - offsets map[int64]core.Hash -} - -// NewReader returns a new Reader that reads from a io.Reader -func NewReader(r io.Reader) *Reader { - return &Reader{ - MaxObjectsLimit: DefaultMaxObjectsLimit, - - r: NewTrackingReader(r), - offsets: make(map[int64]core.Hash, 0), - } -} - -// Read reads the objects and stores it at the ObjectStorage -func (r *Reader) Read(s core.ObjectStorage) (int64, error) { - r.s = s - if err := r.validateHeader(); err != nil { - if err == io.EOF { - return -1, EmptyRepositoryErr - } - - return -1, err - } - - version, err := r.readInt32() - if err != nil { - return -1, err - } - - if version > VersionSupported { - return -1, UnsupportedVersionErr - } - - count, err := r.readInt32() - if err != nil { - return -1, err - } - - if count > r.MaxObjectsLimit { - return -1, MaxObjectsLimitReachedErr - } - - return r.r.position, r.readObjects(count) -} - -func (r *Reader) validateHeader() error { - var header = make([]byte, 4) - if _, err := io.ReadFull(r.r, header); err != nil { - return err - } - - if !bytes.Equal(header, []byte{'P', 'A', 'C', 'K'}) { - return MalformedPackfileErr - } - - return nil -} - -func (r *Reader) readInt32() (uint32, error) { - var value uint32 - if err := binary.Read(r.r, binary.BigEndian, &value); err != nil { - return 0, err - } - - return value, nil -} - -func (r *Reader) readObjects(count uint32) error { - // This code has 50-80 µs of overhead per object not counting zlib inflation. - // Together with zlib inflation, it's 400-410 µs for small objects. - // That's 1 sec for ~2450 objects, ~4.20 MB, or ~250 ms per MB, - // of which 12-20 % is _not_ zlib inflation (ie. is our code). 
- for i := 0; i < int(count); i++ { - start := r.r.position - obj, err := r.newObject() - if err != nil && err != io.EOF { - return err - } - - if r.Format == UnknownFormat || r.Format == OFSDeltaFormat { - r.offsets[start] = obj.Hash() - } - - r.s.Set(obj) - if err == io.EOF { - break - } - } - - return nil -} - -func (r *Reader) newObject() (core.Object, error) { - raw, err := r.s.New() - if err != nil { - return nil, err - } - var steps int64 - - var buf [1]byte - if _, err := r.r.Read(buf[:]); err != nil { - return nil, err - } - - typ := core.ObjectType((buf[0] >> 4) & 7) - size := int64(buf[0] & 15) - steps++ // byte we just read to get `o.typ` and `o.size` - - var shift uint = 4 - for buf[0]&0x80 == 0x80 { - if _, err := r.r.Read(buf[:]); err != nil { - return nil, err - } - - size += int64(buf[0]&0x7f) << shift - steps++ // byte we just read to update `o.size` - shift += 7 - } - - raw.SetType(typ) - raw.SetSize(size) - - switch raw.Type() { - case core.REFDeltaObject: - err = r.readREFDelta(raw) - case core.OFSDeltaObject: - err = r.readOFSDelta(raw, steps) - case core.CommitObject, core.TreeObject, core.BlobObject, core.TagObject: - err = r.readObject(raw) - default: - err = InvalidObjectErr.n("tag %q", raw.Type) - } - - return raw, err -} - -func (r *Reader) readREFDelta(raw core.Object) (err error) { - var ref core.Hash - if _, err := io.ReadFull(r.r, ref[:]); err != nil { - return err - } - - buf := bytes.NewBuffer(nil) - if err := r.inflate(buf); err != nil { - return err - } - - referenced, err := r.s.Get(ref) - if err != nil { - if err == core.ErrObjectNotFound { - return ErrObjectNotFound.n("%s", ref) - } - return err - } - - reader, err := referenced.Reader() - if err != nil { - return err - } - defer checkClose(reader, &err) - - d, err := ioutil.ReadAll(reader) - if err != nil { - return err - } - - patched := patchDelta(d, buf.Bytes()) - if patched == nil { - return PatchingErr.n("hash %q", ref) - } - - raw.SetType(referenced.Type()) - raw.SetSize(int64(len(patched))) - - writer, err := raw.Writer() - if err != nil { - return err - } - defer checkClose(writer, &err) - - writer.Write(patched) - - return nil -} - -func (r *Reader) readOFSDelta(raw core.Object, steps int64) (err error) { - start := r.r.position - offset, err := decodeOffset(r.r, steps) - if err != nil { - return err - } - - buf := bytes.NewBuffer(nil) - if err = r.inflate(buf); err != nil { - return err - } - - ref, ok := r.offsets[start+offset] - if !ok { - return PackEntryNotFoundErr.n("offset %d", start+offset) - } - - referenced, err := r.s.Get(ref) - if err != nil { - return err - } - - reader, err := referenced.Reader() - if err != nil { - return err - } - defer checkClose(reader, &err) - - d, err := ioutil.ReadAll(reader) - if err != nil { - return err - } - - patched := patchDelta(d, buf.Bytes()) - if patched == nil { - return PatchingErr.n("hash %q", ref) - } - - raw.SetType(referenced.Type()) - raw.SetSize(int64(len(patched))) - - writer, err := raw.Writer() - if err != nil { - return err - } - defer checkClose(writer, &err) - - writer.Write(patched) - - return nil -} - -func (r *Reader) readObject(raw core.Object) (err error) { - writer, err := raw.Writer() - if err != nil { - return err - } - defer checkClose(writer, &err) - - return r.inflate(writer) -} - -func (r *Reader) inflate(w io.Writer) error { - zr, err := zlib.NewReader(r.r) - if err != nil { - if err == zlib.ErrHeader { - return zlib.ErrHeader - } - - return ZLibErr.n("%s", err) - } - - defer zr.Close() - - _, err = io.Copy(w, zr) - 
return err -} - -type ReaderError struct { - reason, additional string -} - -func newError(reason string) *ReaderError { - return &ReaderError{reason: reason} -} - -func (e *ReaderError) Error() string { - if e.additional == "" { - return e.reason - } - - return fmt.Sprintf("%s: %s", e.reason, e.additional) -} - -func (e *ReaderError) n(format string, args ...interface{}) *ReaderError { - return &ReaderError{ - reason: e.reason, - additional: fmt.Sprintf(format, args...), - } -} diff --git a/formats/packfile/reader_test.go b/formats/packfile/reader_test.go deleted file mode 100644 index 9ae569d..0000000 --- a/formats/packfile/reader_test.go +++ /dev/null @@ -1,190 +0,0 @@ -package packfile - -import ( - "bytes" - "encoding/base64" - "fmt" - "os" - "runtime" - "testing" - "time" - - "gopkg.in/src-d/go-git.v3/core" - "gopkg.in/src-d/go-git.v3/storage/memory" - - "github.com/dustin/go-humanize" - . "gopkg.in/check.v1" -) - -func Test(t *testing.T) { TestingT(t) } - -type ReaderSuite struct{} - -var _ = Suite(&ReaderSuite{}) - -var packFileWithEmptyObjects = "UEFDSwAAAAIAAAALnw54nKXMQWoDMQxA0b1PoX2hSLIm44FSAlmXnEG2NYlhXAfHgdLb5Cy9WAM5Qpb/Lf7oZqArUpakyYtQjCoxZ5lmWXwwyuzJbHqAuYt2+x6QoyCyhYCKIa67lGameSLWvPh5JU0hsCg7vY1z6/D1d/8ptcHhprm3Kxz7KL/wUdOz96eqZXtPrX4CCeOOPU8Eb0iI7qG1jGGvXdxaNoPs/gHeNkp8lA94nKXMQUpDMRCA4X1OMXtBZpI3L3kiRXAtPcMkmWjgxZSYQultPEsv1oJHcPl/i38OVRC0IXF0lshrJorZEcpKmTEJYbA+B3aFzEmGfk9gpqJEsmnZNutXF71i1IURU/G0bsWWwJ6NnOdXH/Bx+73U1uH9LHn0HziOWa/w2tJfv302qftz6u0AtFh0wQdmeEJCNA9tdU7938WUuivEF5CczR11ZEsNnw54nKWMUQoCIRRF/13F+w/ijY6jQkTQd7SGpz5LyAxzINpNa2ljTbSEPu/hnNsbM4TJTzqyt561GdUUmJKT6K2MeiCVgnZWoY/iRo2vHVS0URrUS+e+dkqIEp11HMhh9IaUkRM6QXM/1waH9+uRS4X9TLHVOxxbz0/YlPDbu1OhfFmHWrYwjBKVNVaNsMIBUSy05N75vxeR8oXBiw8GoErCnwt4nKXMzQkCMRBA4XuqmLsgM2M2ZkAWwbNYQ341sCEQsyB2Yy02pmAJHt93eKOnBFpMNJqtl5CFxVIMomViomQSEWP2JrN3yq3j1jqc369HqQ1Oq4u93eHSR3nCoYZfH6/VlWUbWp2BNOPO7i1OsEFCVF+tZYz030XlsiRw6gPZ0jxaqwV4nDM0MDAzMVFIZHg299HsTRevOXt3a64rj7px6ElP8ERDiGQSQ2uoXe8RrcodS5on+J4/u8HjD4NDKFQyRS8tPx+rbgDt3yiEMHicAwAAAAABPnicS0wEAa4kMOACACTjBKdkZXici7aaYAUAA3gBYKoDeJwzNDAwMzFRSGR4NvfR7E0Xrzl7d2uuK4+6cehJT/BEQ4hkEsOELYFJvS2eX47UJdVttFQrenrmzQwA13MaiDd4nEtMBAEuAApMAlGtAXicMzQwMDMxUUhkeDb30exNF685e3drriuPunHoSU/wRACvkA258N/i8hVXx9CiAZzvFXNIhCuSFmE=" - -func (s *ReaderSuite) TestReadPackfile(c *C) { - data, _ := base64.StdEncoding.DecodeString(packFileWithEmptyObjects) - d := bytes.NewReader(data) - - r := NewReader(d) - - storage := memory.NewObjectStorage() - _, err := r.Read(storage) - c.Assert(err, IsNil) - - AssertObjects(c, storage, []string{ - "778c85ff95b5514fea0ba4c7b6a029d32e2c3b96", - "db4002e880a08bf6cc7217512ad937f1ac8824a2", - "551fe11a9ef992763b7e0be4500cf7169f2f8575", - "3d8d2705c6b936ceff0020989eca90db7a372609", - "af01d4cac3441bba4bdd4574938e1d231ee5d45e", - "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391", - "85553e8dc42a79b8a483904dcfcdb048fc004055", - "a028c5b32117ed11bd310a61d50ca10827d853f1", - "c6b65deb8be57436ceaf920b82d51a3fc59830bd", - "90b451628d8449f4c47e627eb1392672e5ccec98", - "496d6428b9cf92981dc9495211e6e1120fb6f2ba", - }) -} - -func (s *ReaderSuite) TestReadPackfileOFSDelta(c *C) { - s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) - -} -func (s *ReaderSuite) TestReadPackfileREFDelta(c *C) { - s.testReadPackfileGitFixture(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) -} - -func (s *ReaderSuite) testReadPackfileGitFixture(c *C, file string, f Format) { - d, err := os.Open(file) - c.Assert(err, IsNil) - - r := NewReader(d) - r.Format = f - - storage 
:= memory.NewObjectStorage() - _, err = r.Read(storage) - c.Assert(err, IsNil) - - AssertObjects(c, storage, []string{ - "918c48b83bd081e863dbe1b80f8998f058cd8294", - "af2d6a6954d532f8ffb47615169c8fdf9d383a1a", - "1669dce138d9b841a518c64b10914d88f5e488ea", - "a5b8b09e2f8fcb0bb99d3ccb0958157b40890d69", - "b8e471f58bcbca63b07bda20e428190409c2db47", - "35e85108805c84807bc66a02d91535e1e24b38b9", - "b029517f6300c2da0f4b651b8642506cd6aaf45d", - "32858aad3c383ed1ff0a0f9bdf231d54a00c9e88", - "d3ff53e0564a9f87d8e84b6e28e5060e517008aa", - "c192bd6a24ea1ab01d78686e417c8bdc7c3d197f", - "d5c0f4ab811897cadf03aec358ae60d21f91c50d", - "49c6bb89b17060d7b4deacb7b338fcc6ea2352a9", - "cf4aa3b38974fb7d81f367c0830f7d78d65ab86b", - "9dea2395f5403188298c1dabe8bdafe562c491e3", - "586af567d0bb5e771e49bdd9434f5e0fb76d25fa", - "9a48f23120e880dfbe41f7c9b7b708e9ee62a492", - "5a877e6a906a2743ad6e45d99c1793642aaf8eda", - "c8f1d8c61f9da76f4cb49fd86322b6e685dba956", - "a8d315b2b1c615d43042c3a62402b8a54288cf5c", - "a39771a7651f97faf5c72e08224d857fc35133db", - "880cd14280f4b9b6ed3986d6671f907d7cc2a198", - "fb72698cab7617ac416264415f13224dfd7a165e", - "4d081c50e250fa32ea8b1313cf8bb7c2ad7627fd", - "eba74343e2f15d62adedfd8c883ee0262b5c8021", - "c2d30fa8ef288618f65f6eed6e168e0d514886f4", - "8dcef98b1d52143e1e2dbc458ffe38f925786bf2", - "aa9b383c260e1d05fbbf6b30a02914555e20c725", - "6ecf0ef2c2dffb796033e5a02219af86ec6584e5", - }) -} - -func AssertObjects(c *C, s *memory.ObjectStorage, expects []string) { - c.Assert(len(expects), Equals, len(s.Objects)) - for _, expected := range expects { - obtained, err := s.Get(core.NewHash(expected)) - c.Assert(err, IsNil) - c.Assert(obtained.Hash().String(), Equals, expected) - } -} - -func (s *ReaderSuite) BenchmarkFixtureRef(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "fixtures/git-fixture.ref-delta", REFDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkFixtureOfs(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "fixtures/git-fixture.ofs-delta", OFSDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkCandyJS(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "/tmp/go-candyjs", REFDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkSymfony(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "/tmp/symonfy", REFDeltaFormat) - } -} - -func (s *ReaderSuite) BenchmarkGit(c *C) { - for i := 0; i < c.N; i++ { - readFromFile(c, "/tmp/git", REFDeltaFormat) - } -} - -func (s *ReaderSuite) _TestMemoryOFS(c *C) { - var b, a runtime.MemStats - - start := time.Now() - runtime.ReadMemStats(&b) - p := readFromFile(c, "/tmp/symfony.ofs-delta", OFSDeltaFormat) - runtime.ReadMemStats(&a) - - fmt.Println("OFS--->") - fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc)) - fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc)) - fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, humanize.Bytes(a.HeapAlloc-b.HeapAlloc)) - fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys)) - - fmt.Println("objects", len(p.Objects)) - fmt.Println("time", time.Since(start)) -} - -func (s *ReaderSuite) _TestMemoryREF(c *C) { - var b, a runtime.MemStats - - start := time.Now() - runtime.ReadMemStats(&b) - p := readFromFile(c, "/tmp/symonfy", REFDeltaFormat) - runtime.ReadMemStats(&a) - - fmt.Println("REF--->") - fmt.Println("Alloc", a.Alloc-b.Alloc, humanize.Bytes(a.Alloc-b.Alloc)) - fmt.Println("TotalAlloc", a.TotalAlloc-b.TotalAlloc, humanize.Bytes(a.TotalAlloc-b.TotalAlloc)) - fmt.Println("HeapAlloc", a.HeapAlloc-b.HeapAlloc, 
humanize.Bytes(a.HeapAlloc-b.HeapAlloc)) - fmt.Println("HeapSys", a.HeapSys, humanize.Bytes(a.HeapSys-b.HeapSys)) - - fmt.Println("objects", len(p.Objects)) - fmt.Println("time", time.Since(start)) -} - -func readFromFile(c *C, file string, f Format) *memory.ObjectStorage { - d, err := os.Open(file) - c.Assert(err, IsNil) - - r := NewReader(d) - r.Format = f - - storage := memory.NewObjectStorage() - _, err = r.Read(storage) - c.Assert(err, IsNil) - - return storage -} diff --git a/formats/packfile/seekable.go b/formats/packfile/seekable.go new file mode 100644 index 0000000..ea1c501 --- /dev/null +++ b/formats/packfile/seekable.go @@ -0,0 +1,108 @@ +package packfile + +import ( + "io" + "os" + + "gopkg.in/src-d/go-git.v3/core" +) + +// Seekable implements ReadRecaller for the io.ReadSeeker of a packfile. +// Remembering does not actually store any reference to the remembered +// objects; the object offset is remembered instead and the packfile is +// read again every time a recall operation is requested. This saves +// memory but can be very slow if the associated io.ReadSeeker is slow +// (like a hard disk). +type Seekable struct { + io.ReadSeeker + HashToOffset map[core.Hash]int64 +} + +// NewSeekable returns a new Seekable that reads from r. +func NewSeekable(r io.ReadSeeker) *Seekable { + return &Seekable{ + r, + make(map[core.Hash]int64), + } +} + +// Read reads up to len(p) bytes into p. +func (r *Seekable) Read(p []byte) (int, error) { + return r.ReadSeeker.Read(p) +} + +// ReadByte reads a byte. +func (r *Seekable) ReadByte() (byte, error) { + var p [1]byte + _, err := r.ReadSeeker.Read(p[:]) + if err != nil { + return 0, err + } + + return p[0], nil +} + +// Offset returns the offset for the next Read or ReadByte. +func (r *Seekable) Offset() (int64, error) { + return r.Seek(0, os.SEEK_CUR) +} + +// Remember stores the offset of the object and its hash, but not the +// object itself. This implementation does not check for already stored +// offsets, as it is too expensive to build this information from an +// index every time a get operation is performed on the Seekable. +func (r *Seekable) Remember(o int64, obj core.Object) error { + h := obj.Hash() + if _, ok := r.HashToOffset[h]; ok { + return ErrDuplicatedObject.AddDetails("with hash %s", h) + } + + r.HashToOffset[h] = o + + return nil +} + +// ForgetAll forgets all previously remembered objects. For efficiency +// reasons, RecallByOffset always finds objects, even if they have been +// forgotten or were never remembered. +func (r *Seekable) ForgetAll() { + r.HashToOffset = make(map[core.Hash]int64) +} + +// RecallByHash returns the object for a given hash by looking for it again in +// the io.ReadSeeker. +func (r *Seekable) RecallByHash(h core.Hash) (core.Object, error) { + o, ok := r.HashToOffset[h] + if !ok { + return nil, ErrCannotRecall.AddDetails("hash not found: %s", h) + } + + return r.RecallByOffset(o) +} + +// RecallByOffset returns the object for a given offset by looking for it again in +// the io.ReadSeeker. For efficiency reasons, this method always finds objects by +// offset, even if they have not been remembered or if they have been forgotten.
+func (r *Seekable) RecallByOffset(o int64) (obj core.Object, err error) { + // remember current offset + beforeJump, err := r.Offset() + if err != nil { + return nil, err + } + + defer func() { + // jump back + _, seekErr := r.Seek(beforeJump, os.SEEK_SET) + if err == nil { + err = seekErr + } + }() + + // jump to requested offset + _, err = r.Seek(o, os.SEEK_SET) + if err != nil { + return nil, err + } + + return NewParser(r).ReadObject() +} diff --git a/formats/packfile/stream.go b/formats/packfile/stream.go new file mode 100644 index 0000000..41266b1 --- /dev/null +++ b/formats/packfile/stream.go @@ -0,0 +1,95 @@ +package packfile + +import ( + "io" + + "gopkg.in/src-d/go-git.v3/core" +) + +// Stream implements ReadRecaller for the io.Reader of a packfile. This +// implementation keeps all remembered objects referenced in maps for +// quick access. +type Stream struct { + io.Reader + count int64 + offsetToObject map[int64]core.Object + hashToObject map[core.Hash]core.Object +} + +// NewStream returns a new Stream that reads from r. +func NewStream(r io.Reader) *Stream { + return &Stream{ + Reader: r, + count: 0, + hashToObject: make(map[core.Hash]core.Object), + offsetToObject: make(map[int64]core.Object), + } +} + +// Read reads up to len(p) bytes into p. +func (r *Stream) Read(p []byte) (n int, err error) { + n, err = r.Reader.Read(p) + r.count += int64(n) + + return +} + +// ReadByte reads a byte. +func (r *Stream) ReadByte() (byte, error) { + var p [1]byte + _, err := r.Reader.Read(p[:]) + r.count++ + + return p[0], err +} + +// Offset returns the number of bytes read. +func (r *Stream) Offset() (int64, error) { + return r.count, nil +} + +// Remember stores references to the passed object to be used later by +// RecallByHash and RecallByOffset. It receives the object and the offset +// of its object entry in the packfile. +func (r *Stream) Remember(o int64, obj core.Object) error { + h := obj.Hash() + if _, ok := r.hashToObject[h]; ok { + return ErrDuplicatedObject.AddDetails("with hash %s", h) + } + r.hashToObject[h] = obj + + if _, ok := r.offsetToObject[o]; ok { + return ErrDuplicatedObject.AddDetails("with offset %d", o) + } + r.offsetToObject[o] = obj + + return nil +} + +// ForgetAll forgets all previously remembered objects. +func (r *Stream) ForgetAll() { + r.hashToObject = make(map[core.Hash]core.Object) + r.offsetToObject = make(map[int64]core.Object) +} + +// RecallByHash returns an object that has been previously Remember-ed by +// its hash. +func (r *Stream) RecallByHash(h core.Hash) (core.Object, error) { + obj, ok := r.hashToObject[h] + if !ok { + return nil, ErrCannotRecall.AddDetails("by hash %s", h) + } + + return obj, nil +} + +// RecallByOffset returns an object that has been previously Remember-ed by +// the offset of its object entry in the packfile. +func (r *Stream) RecallByOffset(o int64) (core.Object, error) { + obj, ok := r.offsetToObject[o] + if !ok { + return nil, ErrCannotRecall.AddDetails("no object found at offset %d", o) + } + + return obj, nil +} -- cgit
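
Usage note (illustration, not part of the patch): the ReadHeader tests above pin down the 12-byte packfile header layout: a 4-byte "PACK" signature, a 4-byte big-endian version (only version 2 is supported), and a 4-byte big-endian object count (0x50, i.e. 80, in the happy-path test). Below is a minimal standalone sketch of that decoding, independent of the Parser; decodeHeader and its error strings are hypothetical names chosen for the example.

package main

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
)

// decodeHeader splits a 12-byte packfile header into its fields:
// signature, version and number of objects.
func decodeHeader(h [12]byte) (version, count uint32, err error) {
	if !bytes.Equal(h[:4], []byte("PACK")) {
		// e.g. the "PBCK" signature used by TestReadHeaderBadSignatureError
		return 0, 0, errors.New("bad signature")
	}

	version = binary.BigEndian.Uint32(h[4:8])
	if version != 2 {
		return 0, 0, errors.New("unsupported packfile version")
	}

	count = binary.BigEndian.Uint32(h[8:12])

	return version, count, nil
}

func main() {
	h := [12]byte{'P', 'A', 'C', 'K', 0, 0, 0, 2, 0, 0, 0, 0x50}
	v, n, err := decodeHeader(h)
	fmt.Println(v, n, err) // prints: 2 80 <nil>
}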
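
The two ReadRecaller implementations trade memory for speed: Stream keeps every remembered object in maps, while Seekable stores offsets only and re-reads the packfile on each recall. The sketch below shows how a recaller might plug into the Parser, using only constructors and methods introduced above (NewSeekable, NewParser, ReadHeader, ReadObject, Offset, Remember); the fixture path is hypothetical and the explicit Remember call is an assumption, since the patch does not show whether ReadObject registers objects by itself.

package main

import (
	"fmt"
	"os"

	"gopkg.in/src-d/go-git.v3/formats/packfile"
)

func main() {
	// Hypothetical packfile path; any packfile on disk would do.
	f, err := os.Open("fixtures/spinnaker-spinnaker.pack")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Seekable: cheap on memory, slower recalls. packfile.NewStream(f)
	// would keep every remembered object in memory instead.
	sr := packfile.NewSeekable(f)
	p := packfile.NewParser(sr)

	count, err := p.ReadHeader()
	if err != nil {
		panic(err)
	}

	for i := uint32(0); i < count; i++ {
		off, err := sr.Offset() // the next object entry starts here
		if err != nil {
			panic(err)
		}

		obj, err := p.ReadObject()
		if err != nil {
			panic(err)
		}

		// Assumption: remember each object so that later OFS and REF
		// deltas can recall it by offset or by hash.
		if err := sr.Remember(off, obj); err != nil {
			panic(err)
		}

		fmt.Println(off, obj.Type(), obj.Hash())
	}
}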