From 5e73f01cb2e027a8f02801635b79d3a9bc866914 Mon Sep 17 00:00:00 2001 From: Alberto Cortés Date: Mon, 4 Jul 2016 17:09:22 +0200 Subject: Adds support to open local repositories and to use file-based object storage (#55) * remove some comments * idx writer/reader * Shut up ssh tests, they are annoying * Add file scheme test to clients * Add dummy file client * Add test fot file client * Make tests use fixture endpoint * add parser for packed-refs format * add parser for packed-refs format * WIP adding dir.Refs() tests * Add test for fixture refs * refs parser for the refs directory * Documentation * Add Capabilities to file client * tgz.Exatract now accpets a path instead of a Reader * fix bug in idxfile fanout calculation * remove dead code * packfile documentation * clean packfile parser code * add core.Object.Content() and returns errors for core.ObjectStorage.Iter() * add seekable storage * add dir repos to NewRepository * clean prints * Add dir client documentation to README * Organize the README * README * Clean tgz package * Clean temp dirs after tgz tests * Gometalinter on gitdir * Clean pattern function * metalinter tgz * metalinter gitdir * gitdir coverage and remove seekable packfile filedescriptor leak * gitdir Idxfile tests and remove file descriptor leak * gitdir Idxfile tests when no idx is found * clean storage/seekable/internal/index and some formats/idxfile API issues * clean storage/seekable * clean formats/idx * turn packfile/doc.go into packfile/doc.txt * move formats/packfile/reader to decoder * fix packfile decoder error names * improve documentation * comment packfile decoder errors * comment public API (format/packfile) * remve duplicated code in packfile decoder test * move tracking_reader into an internal package and clean it * use iota for packfile format * rename packfile parse.go to packfile object_at.go * clean packfile deltas * fix delta header size bug * improve delta documentation * clean packfile deltas * clean packfiles deltas * clean repository.go * Remove go 1.5 from Travis CI Because go 1.5 does not suport internal packages. * change local repo scheme to local:// * change "local://" to "file://" as the local scheme * fix broken indentation * shortens names of variables in short scopes * more shortening of variable names * more shortening of variable names * Rename git dir client to "file", as the scheme used for it * Fix file format ctor name, now that the package name has change * Sortcut local repo constructor to not use remotes The object storage is build directly in the repository ctor, instead of creating a remote and waiting for the user to pull it. * update README and fix some errors in it * remove file scheme client * Local respositories has now a new ctor This is, they are no longer identified by the scheme of the URL, but are created different from inception. * remove unused URL field form Repository * move all git dir logic to seekable sotrage ctor * fix documentation * Make formats/file/dir an internal package to storage/seekable * change package storage/seekable to storage/fs * clean storage/fs * overall storage/fs clean * more cleaning * some metalinter fixes * upgrade cshared to last changes * remove dead code * fix test error info * remove file scheme check from clients * fix test error message * fix test error message * fix error messages * style changes * fix comments everywhere * style changes * style changes * scaffolding and tests for local packfiles without ifx files * outsource index building from packfile to the packfile decoder * refactor packfile header reading into a new function * move code to generate index from packfile back to index package * add header parsing * fix documentation errata * add undeltified and OFS delta support for index building from the packfile * add tests for packfile with ref-deltas * support for packfiles with ref-deltas and no idx * refactor packfile format parser to reuse code * refactor packfile format parser to reuse code * refactor packfile format parser to reuse code * refactor packfile format parser to reuse code * refactor packfile format parser to reuse code * WIP refactor packfile format parser to reuse code * refactor packfile format parser to reuse code * remove prints from tests * remove prints from tests * refactor packfile.core into packfile.parser * rename packfile reader to something that shows it is a recaller * rename cannot recall error * rename packfile.Reader to packfile.ReadRecaller and document * speed up test by using StreamReader instead of SeekableReader when possible * clean packfile StreamReader * stream_reader tests * refactor packfile.StreamReader into packfile.StreamReadRecaller * refactor packfile.SeekableReader into packfile.SeekableReadRecaller and document it * generalize packfile.StreamReadRecaller test to all packfile.ReadRecaller implementations * speed up storage/fs tests * speed up tests in . by loading packfiles in memory * speed up repository tests by using and smaller fixture * restore doc.go files * rename packfile.ReadRecaller implementations to shorter names * update comments to type changes * packfile.Parser test (WIP) * packfile.Parser tests and add ForgetAll() to packfile.ReadRecaller * add test for packfile.ReadRecaller.ForgetAll() * clarify seekable being able to recallByOffset forgetted objects * use better names for internal maps * metalinter packfile package * speed up some tests * documentation fixes * change storage.fs package name to storage.proxy to avoid confusion with new filesystem support * New fs package and os transparent implementation Now NewRepositoryFromFS receives a fs and a path and tests are modified accordingly, but it is still not using for anything. * add fs to gitdir and proxy.store * reduce fs interface for easier implementation * remove garbage dirs from tgz tests * change file name gitdir/dir.go to gitdir/gitdir.go * fs.OS tests * metalinter utils/fs * add NewRepositoryFromFS documentation to README * Readability fixes to README * move tgz to an external dependency * move filesystem impl. example to example dir * rename proxy/store.go to proxy/storage.go for coherence with memory/storage.go * rename proxy package to seekable --- formats/idxfile/decoder.go | 160 ++++++++++++++++++++++++++++++++++++++++ formats/idxfile/decoder_test.go | 40 ++++++++++ formats/idxfile/doc.go | 130 ++++++++++++++++++++++++++++++++ formats/idxfile/encoder.go | 124 +++++++++++++++++++++++++++++++ formats/idxfile/encoder_test.go | 47 ++++++++++++ formats/idxfile/idxfile.go | 61 +++++++++++++++ 6 files changed, 562 insertions(+) create mode 100644 formats/idxfile/decoder.go create mode 100644 formats/idxfile/decoder_test.go create mode 100644 formats/idxfile/doc.go create mode 100644 formats/idxfile/encoder.go create mode 100644 formats/idxfile/encoder_test.go create mode 100644 formats/idxfile/idxfile.go (limited to 'formats/idxfile') diff --git a/formats/idxfile/decoder.go b/formats/idxfile/decoder.go new file mode 100644 index 0000000..72a9338 --- /dev/null +++ b/formats/idxfile/decoder.go @@ -0,0 +1,160 @@ +package idxfile + +import ( + "bytes" + "encoding/binary" + "errors" + "io" + + "gopkg.in/src-d/go-git.v3/core" +) + +var ( + // ErrUnsupportedVersion is returned by Decode when the idx file version + // is not supported. + ErrUnsupportedVersion = errors.New("Unsuported version") + // ErrMalformedIdxFile is returned by Decode when the idx file is corrupted. + ErrMalformedIdxFile = errors.New("Malformed IDX file") +) + +// A Decoder reads and decodes idx files from an input stream. +type Decoder struct { + io.Reader +} + +// NewDecoder returns a new decoder that reads from r. +func NewDecoder(r io.Reader) *Decoder { + return &Decoder{r} +} + +// Decode reads the whole idx object from its input and stores it in the +// value pointed to by idx. +func (d *Decoder) Decode(idx *Idxfile) error { + if err := validateHeader(d); err != nil { + return err + } + + flow := []func(*Idxfile, io.Reader) error{ + readVersion, + readFanout, + readObjectNames, + readCRC32, + readOffsets, + readChecksums, + } + + for _, f := range flow { + if err := f(idx, d); err != nil { + return err + } + } + + if !idx.isValid() { + return ErrMalformedIdxFile + } + + return nil +} + +func validateHeader(r io.Reader) error { + var h = make([]byte, 4) + if _, err := r.Read(h); err != nil { + return err + } + + if !bytes.Equal(h, idxHeader) { + return ErrMalformedIdxFile + } + + return nil +} + +func readVersion(idx *Idxfile, r io.Reader) error { + v, err := readInt32(r) + if err != nil { + return err + } + + if v > VersionSupported { + return ErrUnsupportedVersion + } + + idx.Version = v + + return nil +} + +func readFanout(idx *Idxfile, r io.Reader) error { + var err error + + for i := 0; i < 255; i++ { + idx.Fanout[i], err = readInt32(r) + if err != nil { + return err + } + } + + idx.ObjectCount, err = readInt32(r) + + return err +} + +func readObjectNames(idx *Idxfile, r io.Reader) error { + c := int(idx.ObjectCount) + for i := 0; i < c; i++ { + var ref core.Hash + if _, err := r.Read(ref[:]); err != nil { + return err + } + + idx.Entries = append(idx.Entries, Entry{Hash: ref}) + } + + return nil +} + +func readCRC32(idx *Idxfile, r io.Reader) error { + c := int(idx.ObjectCount) + for i := 0; i < c; i++ { + if _, err := r.Read(idx.Entries[i].CRC32[:]); err != nil { + return err + } + } + + return nil +} + +func readOffsets(idx *Idxfile, r io.Reader) error { + c := int(idx.ObjectCount) + for i := 0; i < c; i++ { + o, err := readInt32(r) + if err != nil { + return err + } + + idx.Entries[i].Offset = uint64(o) + } + + return nil +} + +func readChecksums(idx *Idxfile, r io.Reader) error { + if _, err := r.Read(idx.PackfileChecksum[:]); err != nil { + return err + } + + if _, err := r.Read(idx.IdxChecksum[:]); err != nil { + return err + } + + return nil +} + +func readInt32(r io.Reader) (uint32, error) { + var v uint32 + if err := binary.Read(r, binary.BigEndian, &v); err != nil { + return 0, err + } + + return v, nil +} diff --git a/formats/idxfile/decoder_test.go b/formats/idxfile/decoder_test.go new file mode 100644 index 0000000..597a002 --- /dev/null +++ b/formats/idxfile/decoder_test.go @@ -0,0 +1,40 @@ +package idxfile + +import ( + "fmt" + "os" + "testing" + + . "gopkg.in/check.v1" +) + +func Test(t *testing.T) { TestingT(t) } + +type IdxfileSuite struct{} + +var _ = Suite(&IdxfileSuite{}) + +func (s *IdxfileSuite) TestDecode(c *C) { + f, err := os.Open("fixtures/git-fixture.idx") + c.Assert(err, IsNil) + + d := NewDecoder(f) + idx := &Idxfile{} + err = d.Decode(idx) + c.Assert(err, IsNil) + + err = f.Close() + c.Assert(err, IsNil) + + c.Assert(int(idx.ObjectCount), Equals, 31) + c.Assert(idx.Entries, HasLen, 31) + c.Assert(idx.Entries[0].Hash.String(), Equals, + "1669dce138d9b841a518c64b10914d88f5e488ea") + c.Assert(idx.Entries[0].Offset, Equals, uint64(615)) + + c.Assert(fmt.Sprintf("%x", idx.IdxChecksum), Equals, + "bba9b7a9895724819225a044c857d391bb9d61d9") + c.Assert(fmt.Sprintf("%x", idx.PackfileChecksum), Equals, + "54bb61360ab2dad1a3e344a8cd3f82b848518cba") + +} diff --git a/formats/idxfile/doc.go b/formats/idxfile/doc.go new file mode 100644 index 0000000..74149ab --- /dev/null +++ b/formats/idxfile/doc.go @@ -0,0 +1,130 @@ +/* +== Original (version 1) pack-*.idx files have the following format: + + - The header consists of 256 4-byte network byte order + integers. N-th entry of this table records the number of + objects in the corresponding pack, the first byte of whose + object name is less than or equal to N. This is called the + 'first-level fan-out' table. + + - The header is followed by sorted 24-byte entries, one entry + per object in the pack. Each entry is: + + 4-byte network byte order integer, recording where the + object is stored in the packfile as the offset from the + beginning. + + 20-byte object name. + + - The file is concluded with a trailer: + + A copy of the 20-byte SHA1 checksum at the end of + corresponding packfile. + + 20-byte SHA1-checksum of all of the above. + +Pack Idx file: + + -- +--------------------------------+ +fanout | fanout[0] = 2 (for example) |-. +table +--------------------------------+ | + | fanout[1] | | + +--------------------------------+ | + | fanout[2] | | + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | + | fanout[255] = total objects |---. + -- +--------------------------------+ | | +main | offset | | | +index | object name 00XXXXXXXXXXXXXXXX | | | +table +--------------------------------+ | | + | offset | | | + | object name 00XXXXXXXXXXXXXXXX | | | + +--------------------------------+<+ | + .-| offset | | + | | object name 01XXXXXXXXXXXXXXXX | | + | +--------------------------------+ | + | | offset | | + | | object name 01XXXXXXXXXXXXXXXX | | + | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | + | | offset | | + | | object name FFXXXXXXXXXXXXXXXX | | + --| +--------------------------------+<--+ +trailer | | packfile checksum | + | +--------------------------------+ + | | idxfile checksum | + | +--------------------------------+ + .-------. + | +Pack file entry: <+ + + packed object header: + 1-byte size extension bit (MSB) + type (next 3 bit) + size0 (lower 4-bit) + n-byte sizeN (as long as MSB is set, each 7-bit) + size0..sizeN form 4+7+7+..+7 bit integer, size0 + is the least significant part, and sizeN is the + most significant part. + packed object data: + If it is not DELTA, then deflated bytes (the size above + is the size before compression). + If it is REF_DELTA, then + 20-byte base object name SHA1 (the size above is the + size of the delta data that follows). + delta data, deflated. + If it is OFS_DELTA, then + n-byte offset (see below) interpreted as a negative + offset from the type-byte of the header of the + ofs-delta entry (the size above is the size of + the delta data that follows). + delta data, deflated. + + offset encoding: + n bytes with MSB set in all but the last one. + The offset is then the number constructed by + concatenating the lower 7 bit of each byte, and + for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) + to the result. + + + +== Version 2 pack-*.idx files support packs larger than 4 GiB, and + have some other reorganizations. They have the format: + + - A 4-byte magic number '\377tOc' which is an unreasonable + fanout[0] value. + + - A 4-byte version number (= 2) + + - A 256-entry fan-out table just like v1. + + - A table of sorted 20-byte SHA1 object names. These are + packed together without offset values to reduce the cache + footprint of the binary search for a specific object name. + + - A table of 4-byte CRC32 values of the packed object data. + This is new in v2 so compressed data can be copied directly + from pack to pack during repacking without undetected + data corruption. + + - A table of 4-byte offset values (in network byte order). + These are usually 31-bit pack file offsets, but large + offsets are encoded as an index into the next table with + the msbit set. + + - A table of 8-byte offset entries (empty for pack files less + than 2 GiB). Pack files are organized with heavily used + objects toward the front, so most object references should + not need to refer to this table. + + - The same trailer as a v1 pack file: + + A copy of the 20-byte SHA1 checksum at the end of + corresponding packfile. + + 20-byte SHA1-checksum of all of the above. + +From: +https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-protocol.txt +*/ +package idxfile diff --git a/formats/idxfile/encoder.go b/formats/idxfile/encoder.go new file mode 100644 index 0000000..f85ff84 --- /dev/null +++ b/formats/idxfile/encoder.go @@ -0,0 +1,124 @@ +package idxfile + +import ( + "crypto/sha1" + "encoding/binary" + "hash" + "io" +) + +// An Encoder writes idx files to an output stream. +type Encoder struct { + io.Writer + hash hash.Hash +} + +// NewEncoder returns a new encoder that writes to w. +func NewEncoder(w io.Writer) *Encoder { + h := sha1.New() + mw := io.MultiWriter(w, h) + return &Encoder{mw, h} +} + +// Encode writes the idx in an idx file format to the stream of the encoder. +func (e *Encoder) Encode(idx *Idxfile) (int, error) { + flow := []func(*Idxfile) (int, error){ + e.encodeHeader, + e.encodeFanout, + e.encodeHashes, + e.encodeCRC32, + e.encodeOffsets, + e.encodeChecksums, + } + + sz := 0 + for _, f := range flow { + i, err := f(idx) + sz += i + + if err != nil { + return sz, err + } + } + + return sz, nil +} + +func (e *Encoder) encodeHeader(idx *Idxfile) (int, error) { + c, err := e.Write(idxHeader) + if err != nil { + return c, err + } + + return c + 4, e.writeInt32(idx.Version) +} + +func (e *Encoder) encodeFanout(idx *Idxfile) (int, error) { + fanout := idx.calculateFanout() + for _, c := range fanout { + if err := e.writeInt32(c); err != nil { + return 0, err + } + } + + return 1024, nil +} + +func (e *Encoder) encodeHashes(idx *Idxfile) (int, error) { + return e.encodeEntryField(idx, true) +} + +func (e *Encoder) encodeCRC32(idx *Idxfile) (int, error) { + return e.encodeEntryField(idx, false) +} + +func (e *Encoder) encodeEntryField(idx *Idxfile, isHash bool) (int, error) { + sz := 0 + for _, ent := range idx.Entries { + var data []byte + if isHash { + data = ent.Hash[:] + } else { + data = ent.CRC32[:] + } + i, err := e.Write(data) + sz += i + + if err != nil { + return sz, err + } + } + + return sz, nil +} + +func (e *Encoder) encodeOffsets(idx *Idxfile) (int, error) { + sz := 0 + for _, ent := range idx.Entries { + if err := e.writeInt32(uint32(ent.Offset)); err != nil { + return sz, err + } + + sz += 4 + + } + + return sz, nil +} + +func (e *Encoder) encodeChecksums(idx *Idxfile) (int, error) { + if _, err := e.Write(idx.PackfileChecksum[:]); err != nil { + return 0, err + } + + copy(idx.IdxChecksum[:], e.hash.Sum(nil)[:20]) + if _, err := e.Write(idx.IdxChecksum[:]); err != nil { + return 0, err + } + + return 40, nil +} + +func (e *Encoder) writeInt32(value uint32) error { + return binary.Write(e, binary.BigEndian, value) +} diff --git a/formats/idxfile/encoder_test.go b/formats/idxfile/encoder_test.go new file mode 100644 index 0000000..bfb9f91 --- /dev/null +++ b/formats/idxfile/encoder_test.go @@ -0,0 +1,47 @@ +package idxfile + +import ( + "bytes" + "io" + "os" + + . "gopkg.in/check.v1" +) + +func (s *IdxfileSuite) TestEncode(c *C) { + for i, path := range [...]string{ + "fixtures/git-fixture.idx", + "../packfile/fixtures/spinnaker-spinnaker.idx", + } { + com := Commentf("subtest %d: path = %s", i, path) + + exp, idx, err := decode(path) + c.Assert(err, IsNil, com) + + obt := new(bytes.Buffer) + e := NewEncoder(obt) + size, err := e.Encode(idx) + c.Assert(err, IsNil, com) + + c.Assert(size, Equals, exp.Len(), com) + c.Assert(obt, DeepEquals, exp, com) + } +} + +func decode(path string) (*bytes.Buffer, *Idxfile, error) { + f, err := os.Open(path) + if err != nil { + return nil, nil, err + } + + cont := new(bytes.Buffer) + tee := io.TeeReader(f, cont) + + d := NewDecoder(tee) + idx := &Idxfile{} + if err = d.Decode(idx); err != nil { + return nil, nil, err + } + + return cont, idx, f.Close() +} diff --git a/formats/idxfile/idxfile.go b/formats/idxfile/idxfile.go new file mode 100644 index 0000000..5f12dad --- /dev/null +++ b/formats/idxfile/idxfile.go @@ -0,0 +1,61 @@ +package idxfile + +import "gopkg.in/src-d/go-git.v3/core" + +const ( + // VersionSupported is the only idx version supported. + VersionSupported = 2 +) + +var ( + idxHeader = []byte{255, 't', 'O', 'c'} +) + +// An Idxfile represents an idx file in memory. +type Idxfile struct { + Version uint32 + Fanout [255]uint32 + ObjectCount uint32 + Entries []Entry + PackfileChecksum [20]byte + IdxChecksum [20]byte +} + +// An Entry represents data about an object in the packfile: its hash, +// offset and CRC32 checksum. +type Entry struct { + Hash core.Hash + CRC32 [4]byte + Offset uint64 +} + +func (idx *Idxfile) isValid() bool { + fanout := idx.calculateFanout() + for k, c := range idx.Fanout { + if fanout[k] != c { + return false + } + } + + return true +} + +func (idx *Idxfile) calculateFanout() [256]uint32 { + fanout := [256]uint32{} + var c uint32 + for _, e := range idx.Entries { + c++ + fanout[e.Hash[0]] = c + } + + var i uint32 + for k, c := range fanout { + if c != 0 { + i = c + } + + fanout[k] = i + } + + return fanout +} -- cgit