diff options
Diffstat (limited to 'formats/idxfile')
-rw-r--r-- | formats/idxfile/decoder.go | 160 | ||||
-rw-r--r-- | formats/idxfile/decoder_test.go | 40 | ||||
-rw-r--r-- | formats/idxfile/doc.go | 130 | ||||
-rw-r--r-- | formats/idxfile/encoder.go | 124 | ||||
-rw-r--r-- | formats/idxfile/encoder_test.go | 47 | ||||
-rw-r--r-- | formats/idxfile/idxfile.go | 61 |
6 files changed, 562 insertions, 0 deletions
diff --git a/formats/idxfile/decoder.go b/formats/idxfile/decoder.go new file mode 100644 index 0000000..72a9338 --- /dev/null +++ b/formats/idxfile/decoder.go @@ -0,0 +1,160 @@ +package idxfile + +import ( + "bytes" + "encoding/binary" + "errors" + "io" + + "gopkg.in/src-d/go-git.v3/core" +) + +var ( + // ErrUnsupportedVersion is returned by Decode when the idx file version + // is not supported. + ErrUnsupportedVersion = errors.New("Unsuported version") + // ErrMalformedIdxFile is returned by Decode when the idx file is corrupted. + ErrMalformedIdxFile = errors.New("Malformed IDX file") +) + +// A Decoder reads and decodes idx files from an input stream. +type Decoder struct { + io.Reader +} + +// NewDecoder returns a new decoder that reads from r. +func NewDecoder(r io.Reader) *Decoder { + return &Decoder{r} +} + +// Decode reads the whole idx object from its input and stores it in the +// value pointed to by idx. +func (d *Decoder) Decode(idx *Idxfile) error { + if err := validateHeader(d); err != nil { + return err + } + + flow := []func(*Idxfile, io.Reader) error{ + readVersion, + readFanout, + readObjectNames, + readCRC32, + readOffsets, + readChecksums, + } + + for _, f := range flow { + if err := f(idx, d); err != nil { + return err + } + } + + if !idx.isValid() { + return ErrMalformedIdxFile + } + + return nil +} + +func validateHeader(r io.Reader) error { + var h = make([]byte, 4) + if _, err := r.Read(h); err != nil { + return err + } + + if !bytes.Equal(h, idxHeader) { + return ErrMalformedIdxFile + } + + return nil +} + +func readVersion(idx *Idxfile, r io.Reader) error { + v, err := readInt32(r) + if err != nil { + return err + } + + if v > VersionSupported { + return ErrUnsupportedVersion + } + + idx.Version = v + + return nil +} + +func readFanout(idx *Idxfile, r io.Reader) error { + var err error + + for i := 0; i < 255; i++ { + idx.Fanout[i], err = readInt32(r) + if err != nil { + return err + } + } + + idx.ObjectCount, err = readInt32(r) + + 
return err +} + +func readObjectNames(idx *Idxfile, r io.Reader) error { + c := int(idx.ObjectCount) + for i := 0; i < c; i++ { + var ref core.Hash + if _, err := r.Read(ref[:]); err != nil { + return err + } + + idx.Entries = append(idx.Entries, Entry{Hash: ref}) + } + + return nil +} + +func readCRC32(idx *Idxfile, r io.Reader) error { + c := int(idx.ObjectCount) + for i := 0; i < c; i++ { + if _, err := r.Read(idx.Entries[i].CRC32[:]); err != nil { + return err + } + } + + return nil +} + +func readOffsets(idx *Idxfile, r io.Reader) error { + c := int(idx.ObjectCount) + for i := 0; i < c; i++ { + o, err := readInt32(r) + if err != nil { + return err + } + + idx.Entries[i].Offset = uint64(o) + } + + return nil +} + +func readChecksums(idx *Idxfile, r io.Reader) error { + if _, err := r.Read(idx.PackfileChecksum[:]); err != nil { + return err + } + + if _, err := r.Read(idx.IdxChecksum[:]); err != nil { + return err + } + + return nil +} + +func readInt32(r io.Reader) (uint32, error) { + var v uint32 + if err := binary.Read(r, binary.BigEndian, &v); err != nil { + return 0, err + } + + return v, nil +} diff --git a/formats/idxfile/decoder_test.go b/formats/idxfile/decoder_test.go new file mode 100644 index 0000000..597a002 --- /dev/null +++ b/formats/idxfile/decoder_test.go @@ -0,0 +1,40 @@ +package idxfile + +import ( + "fmt" + "os" + "testing" + + . 
"gopkg.in/check.v1" +) + +func Test(t *testing.T) { TestingT(t) } + +type IdxfileSuite struct{} + +var _ = Suite(&IdxfileSuite{}) + +func (s *IdxfileSuite) TestDecode(c *C) { + f, err := os.Open("fixtures/git-fixture.idx") + c.Assert(err, IsNil) + + d := NewDecoder(f) + idx := &Idxfile{} + err = d.Decode(idx) + c.Assert(err, IsNil) + + err = f.Close() + c.Assert(err, IsNil) + + c.Assert(int(idx.ObjectCount), Equals, 31) + c.Assert(idx.Entries, HasLen, 31) + c.Assert(idx.Entries[0].Hash.String(), Equals, + "1669dce138d9b841a518c64b10914d88f5e488ea") + c.Assert(idx.Entries[0].Offset, Equals, uint64(615)) + + c.Assert(fmt.Sprintf("%x", idx.IdxChecksum), Equals, + "bba9b7a9895724819225a044c857d391bb9d61d9") + c.Assert(fmt.Sprintf("%x", idx.PackfileChecksum), Equals, + "54bb61360ab2dad1a3e344a8cd3f82b848518cba") + +} diff --git a/formats/idxfile/doc.go b/formats/idxfile/doc.go new file mode 100644 index 0000000..74149ab --- /dev/null +++ b/formats/idxfile/doc.go @@ -0,0 +1,130 @@ +/* +== Original (version 1) pack-*.idx files have the following format: + + - The header consists of 256 4-byte network byte order + integers. N-th entry of this table records the number of + objects in the corresponding pack, the first byte of whose + object name is less than or equal to N. This is called the + 'first-level fan-out' table. + + - The header is followed by sorted 24-byte entries, one entry + per object in the pack. Each entry is: + + 4-byte network byte order integer, recording where the + object is stored in the packfile as the offset from the + beginning. + + 20-byte object name. + + - The file is concluded with a trailer: + + A copy of the 20-byte SHA1 checksum at the end of + corresponding packfile. + + 20-byte SHA1-checksum of all of the above. + +Pack Idx file: + + -- +--------------------------------+ +fanout | fanout[0] = 2 (for example) |-. 
+table +--------------------------------+ | + | fanout[1] | | + +--------------------------------+ | + | fanout[2] | | + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | + | fanout[255] = total objects |---. + -- +--------------------------------+ | | +main | offset | | | +index | object name 00XXXXXXXXXXXXXXXX | | | +table +--------------------------------+ | | + | offset | | | + | object name 00XXXXXXXXXXXXXXXX | | | + +--------------------------------+<+ | + .-| offset | | + | | object name 01XXXXXXXXXXXXXXXX | | + | +--------------------------------+ | + | | offset | | + | | object name 01XXXXXXXXXXXXXXXX | | + | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | + | | offset | | + | | object name FFXXXXXXXXXXXXXXXX | | + --| +--------------------------------+<--+ +trailer | | packfile checksum | + | +--------------------------------+ + | | idxfile checksum | + | +--------------------------------+ + .-------. + | +Pack file entry: <+ + + packed object header: + 1-byte size extension bit (MSB) + type (next 3 bit) + size0 (lower 4-bit) + n-byte sizeN (as long as MSB is set, each 7-bit) + size0..sizeN form 4+7+7+..+7 bit integer, size0 + is the least significant part, and sizeN is the + most significant part. + packed object data: + If it is not DELTA, then deflated bytes (the size above + is the size before compression). + If it is REF_DELTA, then + 20-byte base object name SHA1 (the size above is the + size of the delta data that follows). + delta data, deflated. + If it is OFS_DELTA, then + n-byte offset (see below) interpreted as a negative + offset from the type-byte of the header of the + ofs-delta entry (the size above is the size of + the delta data that follows). + delta data, deflated. + + offset encoding: + n bytes with MSB set in all but the last one. + The offset is then the number constructed by + concatenating the lower 7 bit of each byte, and + for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1)) + to the result. 
+ + + +== Version 2 pack-*.idx files support packs larger than 4 GiB, and + have some other reorganizations. They have the format: + + - A 4-byte magic number '\377tOc' which is an unreasonable + fanout[0] value. + + - A 4-byte version number (= 2) + + - A 256-entry fan-out table just like v1. + + - A table of sorted 20-byte SHA1 object names. These are + packed together without offset values to reduce the cache + footprint of the binary search for a specific object name. + + - A table of 4-byte CRC32 values of the packed object data. + This is new in v2 so compressed data can be copied directly + from pack to pack during repacking without undetected + data corruption. + + - A table of 4-byte offset values (in network byte order). + These are usually 31-bit pack file offsets, but large + offsets are encoded as an index into the next table with + the msbit set. + + - A table of 8-byte offset entries (empty for pack files less + than 2 GiB). Pack files are organized with heavily used + objects toward the front, so most object references should + not need to refer to this table. + + - The same trailer as a v1 pack file: + + A copy of the 20-byte SHA1 checksum at the end of + corresponding packfile. + + 20-byte SHA1-checksum of all of the above. + +From: +https://www.kernel.org/pub/software/scm/git/docs/v1.7.5/technical/pack-format.txt +*/ +package idxfile diff --git a/formats/idxfile/encoder.go b/formats/idxfile/encoder.go new file mode 100644 index 0000000..f85ff84 --- /dev/null +++ b/formats/idxfile/encoder.go @@ -0,0 +1,124 @@ +package idxfile + +import ( + "crypto/sha1" + "encoding/binary" + "hash" + "io" +) + +// An Encoder writes idx files to an output stream. +type Encoder struct { + io.Writer + hash hash.Hash +} + +// NewEncoder returns a new encoder that writes to w.
+func NewEncoder(w io.Writer) *Encoder { + h := sha1.New() + mw := io.MultiWriter(w, h) + return &Encoder{mw, h} +} + +// Encode writes the idx in an idx file format to the stream of the encoder. +func (e *Encoder) Encode(idx *Idxfile) (int, error) { + flow := []func(*Idxfile) (int, error){ + e.encodeHeader, + e.encodeFanout, + e.encodeHashes, + e.encodeCRC32, + e.encodeOffsets, + e.encodeChecksums, + } + + sz := 0 + for _, f := range flow { + i, err := f(idx) + sz += i + + if err != nil { + return sz, err + } + } + + return sz, nil +} + +func (e *Encoder) encodeHeader(idx *Idxfile) (int, error) { + c, err := e.Write(idxHeader) + if err != nil { + return c, err + } + + return c + 4, e.writeInt32(idx.Version) +} + +func (e *Encoder) encodeFanout(idx *Idxfile) (int, error) { + fanout := idx.calculateFanout() + for _, c := range fanout { + if err := e.writeInt32(c); err != nil { + return 0, err + } + } + + return 1024, nil +} + +func (e *Encoder) encodeHashes(idx *Idxfile) (int, error) { + return e.encodeEntryField(idx, true) +} + +func (e *Encoder) encodeCRC32(idx *Idxfile) (int, error) { + return e.encodeEntryField(idx, false) +} + +func (e *Encoder) encodeEntryField(idx *Idxfile, isHash bool) (int, error) { + sz := 0 + for _, ent := range idx.Entries { + var data []byte + if isHash { + data = ent.Hash[:] + } else { + data = ent.CRC32[:] + } + i, err := e.Write(data) + sz += i + + if err != nil { + return sz, err + } + } + + return sz, nil +} + +func (e *Encoder) encodeOffsets(idx *Idxfile) (int, error) { + sz := 0 + for _, ent := range idx.Entries { + if err := e.writeInt32(uint32(ent.Offset)); err != nil { + return sz, err + } + + sz += 4 + + } + + return sz, nil +} + +func (e *Encoder) encodeChecksums(idx *Idxfile) (int, error) { + if _, err := e.Write(idx.PackfileChecksum[:]); err != nil { + return 0, err + } + + copy(idx.IdxChecksum[:], e.hash.Sum(nil)[:20]) + if _, err := e.Write(idx.IdxChecksum[:]); err != nil { + return 0, err + } + + return 40, nil +} + 
+func (e *Encoder) writeInt32(value uint32) error { + return binary.Write(e, binary.BigEndian, value) +} diff --git a/formats/idxfile/encoder_test.go b/formats/idxfile/encoder_test.go new file mode 100644 index 0000000..bfb9f91 --- /dev/null +++ b/formats/idxfile/encoder_test.go @@ -0,0 +1,47 @@ +package idxfile + +import ( + "bytes" + "io" + "os" + + . "gopkg.in/check.v1" +) + +func (s *IdxfileSuite) TestEncode(c *C) { + for i, path := range [...]string{ + "fixtures/git-fixture.idx", + "../packfile/fixtures/spinnaker-spinnaker.idx", + } { + com := Commentf("subtest %d: path = %s", i, path) + + exp, idx, err := decode(path) + c.Assert(err, IsNil, com) + + obt := new(bytes.Buffer) + e := NewEncoder(obt) + size, err := e.Encode(idx) + c.Assert(err, IsNil, com) + + c.Assert(size, Equals, exp.Len(), com) + c.Assert(obt, DeepEquals, exp, com) + } +} + +func decode(path string) (*bytes.Buffer, *Idxfile, error) { + f, err := os.Open(path) + if err != nil { + return nil, nil, err + } + + cont := new(bytes.Buffer) + tee := io.TeeReader(f, cont) + + d := NewDecoder(tee) + idx := &Idxfile{} + if err = d.Decode(idx); err != nil { + return nil, nil, err + } + + return cont, idx, f.Close() +} diff --git a/formats/idxfile/idxfile.go b/formats/idxfile/idxfile.go new file mode 100644 index 0000000..5f12dad --- /dev/null +++ b/formats/idxfile/idxfile.go @@ -0,0 +1,61 @@ +package idxfile + +import "gopkg.in/src-d/go-git.v3/core" + +const ( + // VersionSupported is the only idx version supported. + VersionSupported = 2 +) + +var ( + idxHeader = []byte{255, 't', 'O', 'c'} +) + +// An Idxfile represents an idx file in memory. +type Idxfile struct { + Version uint32 + Fanout [255]uint32 + ObjectCount uint32 + Entries []Entry + PackfileChecksum [20]byte + IdxChecksum [20]byte +} + +// An Entry represents data about an object in the packfile: its hash, +// offset and CRC32 checksum. 
+type Entry struct { + Hash core.Hash + CRC32 [4]byte + Offset uint64 +} + +func (idx *Idxfile) isValid() bool { + fanout := idx.calculateFanout() + for k, c := range idx.Fanout { + if fanout[k] != c { + return false + } + } + + return true +} + +func (idx *Idxfile) calculateFanout() [256]uint32 { + fanout := [256]uint32{} + var c uint32 + for _, e := range idx.Entries { + c++ + fanout[e.Hash[0]] = c + } + + var i uint32 + for k, c := range fanout { + if c != 0 { + i = c + } + + fanout[k] = i + } + + return fanout +} |