From f5b199f725c4695bbab7b3e202b6fca2a66f6ca3 Mon Sep 17 00:00:00 2001
From: Máximo Cuadros <mcuadros@gmail.com>
Date: Wed, 26 Oct 2016 15:52:16 +0000
Subject: formats: Index read support (#91)

* utils: fs generic TestSuite

* fs: fs.TempFile

* utils: fs small changes requested

* utils: fs, test fs.Create overwriting files

* formats: index, basic v2 reader

* formats: index, tree extension support

* formats: index, stage decoding

* formats: index, extended flags, v3 support

* formats: index, v4 support

* formats: index, Resolve undo support

* formats: index, fix error when decoding invalidated entries

* formats: index, fix style issues
---
 formats/index/decoder.go      | 466 ++++++++++++++++++++++++++++++++++++++++++
 formats/index/decoder_test.go | 196 ++++++++++++++++++
 formats/index/doc.go          | 302 +++++++++++++++++++++++++++
 formats/index/index.go        |  86 ++++++++
 4 files changed, 1050 insertions(+)
 create mode 100644 formats/index/decoder.go
 create mode 100644 formats/index/decoder_test.go
 create mode 100644 formats/index/doc.go
 create mode 100644 formats/index/index.go

(limited to 'formats/index')

diff --git a/formats/index/decoder.go b/formats/index/decoder.go
new file mode 100644
index 0000000..9bb25dd
--- /dev/null
+++ b/formats/index/decoder.go
@@ -0,0 +1,466 @@
+package index
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"io"
+	"io/ioutil"
+	"strconv"
+	"time"
+
+	"gopkg.in/src-d/go-git.v4/core"
+)
+
+var (
+	// IndexVersionSupported is the inclusive range of index versions accepted
+	IndexVersionSupported = struct{ Min, Max uint32 }{Min: 2, Max: 4}
+
+	// ErrUnsupportedVersion is returned by Decode when the index file
+	// version is not supported.
+	ErrUnsupportedVersion = errors.New("Unsupported version")
+	// ErrMalformedSignature is returned by Decode when the index file does not
+	// start with the expected signature
+	ErrMalformedSignature = errors.New("Malformed index signature file")
+
+	indexSignature          = []byte{'D', 'I', 'R', 'C'}
+	treeExtSignature        = []byte{'T', 'R', 'E', 'E'}
+	resolveUndoExtSignature = []byte{'R', 'E', 'U', 'C'}
+)
+
+const (
+	EntryExtended = 0x4000
+	EntryValid    = 0x8000
+
+	nameMask         = 0xfff
+	intentToAddMask  = 1 << 13
+	skipWorkTreeMask = 1 << 14
+)
+
+type Decoder struct {
+	r         io.Reader
+	lastEntry *Entry
+}
+
+// NewDecoder returns a new decoder that reads from r.
+func NewDecoder(r io.Reader) *Decoder {
+	return &Decoder{r: r}
+}
+
+// Decode reads the whole index object from its input and stores it in the
+// value pointed to by idx.
+func (d *Decoder) Decode(idx *Index) error {
+	version, err := validateHeader(d.r)
+	if err != nil {
+		return err
+	}
+
+	idx.Version = version
+
+	if err := binary.Read(d.r, binary.BigEndian, &idx.EntryCount); err != nil {
+		return err
+	}
+
+	if err := d.readEntries(idx); err != nil {
+		return err
+	}
+
+	return d.readExtensions(idx)
+}
+
+func (d *Decoder) readEntries(idx *Index) error {
+	for i := 0; i < int(idx.EntryCount); i++ {
+		e, err := d.readEntry(idx)
+		if err != nil {
+			return err
+		}
+
+		d.lastEntry = e
+		idx.Entries = append(idx.Entries, *e)
+	}
+
+	return nil
+}
+
+func (d *Decoder) readEntry(idx *Index) (*Entry, error) {
+	e := &Entry{}
+
+	var msec, mnsec, sec, nsec uint32
+
+	flowSize := 62
+	flow := []interface{}{
+		&msec, &mnsec,
+		&sec, &nsec,
+		&e.Dev,
+		&e.Inode,
+		&e.Mode,
+		&e.UID,
+		&e.GID,
+		&e.Size,
+		&e.Hash,
+		&e.Flags,
+	}
+
+	if err := readBinary(d.r, flow...); err != nil {
+		return nil, err
+	}
+
+	read := flowSize
+	e.CreatedAt = time.Unix(int64(msec), int64(mnsec))
+	e.ModifiedAt = time.Unix(int64(sec), int64(nsec))
+	e.Stage = Stage(e.Flags>>12) & 0x3
+
+	if e.Flags&EntryExtended != 0 {
+		var extended uint16
+		if err := readBinary(d.r, &extended); err != nil {
+			return nil, err
+		}
+
+		read += 2
+		e.IntentToAdd = extended&intentToAddMask != 0
+		e.SkipWorktree = extended&skipWorkTreeMask != 0
+	}
+
+	if err := d.readEntryName(idx, e); err != nil {
+		return nil, err
+	}
+
+	return e, d.padEntry(idx, e, read)
+}
+
+func (d *Decoder) readEntryName(idx *Index, e *Entry) error {
+	var name string
+	var err error
+
+	switch idx.Version {
+	case 2, 3:
+		name, err = d.doReadEntryName(e)
+	case 4:
+		name, err = d.doReadEntryNameV4()
+	default:
+		return ErrUnsupportedVersion
+	}
+
+	if err != nil {
+		return err
+	}
+
+	e.Name = name
+	return nil
+}
+
+func (d *Decoder) doReadEntryNameV4() (string, error) {
+	l, err := readVariableWidthInt(d.r)
+	if err != nil {
+		return "", err
+	}
+
+	var base string
+	if d.lastEntry != nil {
+		base = d.lastEntry.Name[:len(d.lastEntry.Name)-int(l)]
+	}
+
+	name, err := readUntil(d.r, '\x00')
+	if err != nil {
+		return "", err
+	}
+
+	return base + string(name), nil
+}
+
+func (d *Decoder) doReadEntryName(e *Entry) (string, error) {
+	pLen := e.Flags & nameMask
+
+	name := make([]byte, int64(pLen))
+	if err := binary.Read(d.r, binary.BigEndian, &name); err != nil {
+		return "", err
+	}
+
+	return string(name), nil
+}
+
+// Index entries are padded out to the next 8 byte alignment
+// for historical reasons related to how C Git read the files.
+func (d *Decoder) padEntry(idx *Index, e *Entry, read int) error {
+	if idx.Version == 4 {
+		return nil
+	}
+
+	entrySize := read + len(e.Name)
+	padLen := 8 - entrySize%8
+	if _, err := io.CopyN(ioutil.Discard, d.r, int64(padLen)); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (d *Decoder) readExtensions(idx *Index) error {
+	var err error
+	for {
+		err = d.readExtension(idx)
+		if err != nil {
+			break
+		}
+	}
+
+	if err == io.EOF {
+		return nil
+	}
+
+	return err
+}
+
+// readExtension decodes one extension: 4-byte signature, 32-bit payload size,
+// payload. Payloads of extensions it does not understand are skipped.
+func (d *Decoder) readExtension(idx *Index) error {
+	s := make([]byte, 4)
+	if _, err := io.ReadFull(d.r, s); err != nil {
+		return err
+	}
+	var size uint32
+	if err := binary.Read(d.r, binary.BigEndian, &size); err != nil {
+		return err
+	}
+
+	payload := &io.LimitedReader{R: d.r, N: int64(size)}
+	switch {
+	case bytes.Equal(s, treeExtSignature):
+		t := &Tree{}
+		if err := (&treeExtensionDecoder{payload}).Decode(t); err != nil {
+			return err
+		}
+		idx.Cache = t
+	case bytes.Equal(s, resolveUndoExtSignature):
+		ru := &ResolveUndo{}
+		if err := (&resolveUndoDecoder{payload}).Decode(ru); err != nil {
+			return err
+		}
+		idx.ResolveUndo = ru
+	}
+
+	// Drain what is left so the next call starts on an extension boundary.
+	_, err := io.Copy(ioutil.Discard, payload)
+	return err
+}
+
+func validateHeader(r io.Reader) (version uint32, err error) {
+	var s = make([]byte, 4)
+	if _, err := io.ReadFull(r, s); err != nil {
+		return 0, err
+	}
+
+	if !bytes.Equal(s, indexSignature) {
+		return 0, ErrMalformedSignature
+	}
+
+	if err := binary.Read(r, binary.BigEndian, &version); err != nil {
+		return 0, err
+	}
+
+	if version < IndexVersionSupported.Min || version > IndexVersionSupported.Max {
+		return 0, ErrUnsupportedVersion
+	}
+
+	return
+}
+
+type treeExtensionDecoder struct {
+	r io.Reader
+}
+
+func (d *treeExtensionDecoder) Decode(t *Tree) error {
+	for {
+		e, err := d.readEntry()
+		if err != nil {
+			if err == io.EOF {
+				return nil
+			}
+
+			return err
+		}
+
+		if e == nil {
+			continue
+		}
+
+		t.Entries = append(t.Entries, *e)
+	}
+}
+
+// readEntry decodes one cached-tree record: NUL-terminated path, ASCII entry
+// count, space, ASCII subtree count, newline and, for valid records, the
+// tree hash. It returns (nil, nil) for an invalidated record, i.e. one whose
+// entry count is negative.
+func (d *treeExtensionDecoder) readEntry() (*TreeEntry, error) {
+	path, err := readUntil(d.r, '\x00')
+	if err != nil {
+		return nil, err
+	}
+
+	count, err := readUntil(d.r, ' ')
+	if err != nil {
+		return nil, err
+	}
+
+	entries, err := strconv.Atoi(string(count))
+	if err != nil {
+		return nil, err
+	}
+
+	// The subtree count and its newline are written for invalidated records
+	// too, so consume them before deciding whether to drop the record.
+	count, err = readUntil(d.r, '\n')
+	if err != nil {
+		return nil, err
+	}
+
+	trees, err := strconv.Atoi(string(count))
+	if err != nil {
+		return nil, err
+	}
+
+	// An invalidated record carries no hash: the next record starts
+	// immediately after the newline.
+	if entries < 0 {
+		return nil, nil
+	}
+
+	e := &TreeEntry{Path: string(path), Entries: entries, Trees: trees}
+	if err := binary.Read(d.r, binary.BigEndian, &e.Hash); err != nil {
+		return nil, err
+	}
+
+	return e, nil
+}
+
+type resolveUndoDecoder struct {
+	r io.Reader
+}
+
+func (d *resolveUndoDecoder) Decode(ru *ResolveUndo) error {
+	for {
+		e, err := d.readEntry()
+		if err != nil {
+			if err == io.EOF {
+				return nil
+			}
+
+			return err
+		}
+
+		ru.Entries = append(ru.Entries, *e)
+	}
+}
+
+// readEntry decodes one resolve-undo record: a NUL-terminated path, three
+// NUL-terminated octal modes (stages 1 to 3) and one hash per present stage.
+func (d *resolveUndoDecoder) readEntry() (*ResolveUndoEntry, error) {
+	e := &ResolveUndoEntry{Stages: make(map[Stage]core.Hash)}
+	path, err := readUntil(d.r, '\x00')
+	if err != nil {
+		return nil, err
+	}
+	e.Path = string(path)
+
+	for s := AncestorMode; s <= TheirMode; s++ {
+		if err := d.readStage(e, s); err != nil {
+			return nil, err
+		}
+	}
+
+	// Hashes follow in ascending stage order; a map range would shuffle them.
+	for s := AncestorMode; s <= TheirMode; s++ {
+		if _, ok := e.Stages[s]; !ok {
+			continue
+		}
+		var hash core.Hash
+		if err := binary.Read(d.r, binary.BigEndian, hash[:]); err != nil {
+			return nil, err
+		}
+		e.Stages[s] = hash
+	}
+	return e, nil
+}
+
+func (d *resolveUndoDecoder) readStage(e *ResolveUndoEntry, s Stage) error {
+	ascii, err := readUntil(d.r, '\x00')
+	if err != nil {
+		return err
+	}
+
+	stage, err := strconv.ParseInt(string(ascii), 8, 64)
+	if err != nil {
+		return err
+	}
+
+	if stage != 0 {
+		e.Stages[s] = core.ZeroHash
+	}
+
+	return nil
+}
+
+func readBinary(r io.Reader, data ...interface{}) error {
+	for _, v := range data {
+		err := binary.Read(r, binary.BigEndian, v)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// readUntil reads r byte by byte up to and including delim and returns the
+// bytes that precede it. On any read error, io.EOF included, it returns a
+// nil slice together with that error.
+func readUntil(r io.Reader, delim byte) ([]byte, error) {
+	var buf [1]byte
+	value := make([]byte, 0, 16)
+	for {
+		// ReadFull retries readers that return (0, nil), which r.Read alone
+		// would mistake for a fresh byte.
+		if _, err := io.ReadFull(r, buf[:]); err != nil {
+			return nil, err
+		}
+		if buf[0] == delim {
+			return value, nil
+		}
+
+		value = append(value, buf[0])
+	}
+}
+
+// readVariableWidthInt reads an integer in the variable width encoding used
+// for the v4 entry name prefix length. Each byte carries 7 value bits, most
+// significant group first; a set high bit means another byte follows, and the
+// accumulated value is incremented before each following byte is shifted in.
+func readVariableWidthInt(r io.Reader) (int64, error) {
+	var c byte
+	if err := readBinary(r, &c); err != nil {
+		return 0, err
+	}
+
+	var v = int64(c & maskLength)
+	for moreBytesInLength(c) {
+		v++
+		if err := readBinary(r, &c); err != nil {
+			return 0, err
+		}
+
+		v = (v << lengthBits) + int64(c&maskLength)
+	}
+
+	return v, nil
+}
+
+const (
+	maskContinue = uint8(128) // 1000 0000
+	maskLength   = uint8(127) // 0111 1111
+	lengthBits   = uint8(7)   // each byte carries 7 bits of the length
+)
+
+func moreBytesInLength(c byte) bool {
+	return c&maskContinue > 0
+}
diff --git a/formats/index/decoder_test.go b/formats/index/decoder_test.go
new file mode 100644
index 0000000..cf4c872
--- /dev/null
+++ b/formats/index/decoder_test.go
@@ -0,0 +1,196 @@
+package index
+
+import (
+	"testing"
+
+	. "gopkg.in/check.v1"
+	"gopkg.in/src-d/go-git.v4/core"
+	"gopkg.in/src-d/go-git.v4/fixtures"
+)
+
+func Test(t *testing.T) { TestingT(t) }
+
+type IdxfileSuite struct {
+	fixtures.Suite
+}
+
+var _ = Suite(&IdxfileSuite{})
+
+func (s *IdxfileSuite) TestDecode(c *C) {
+	f, err := fixtures.Basic().One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Version, Equals, uint32(2))
+	c.Assert(idx.EntryCount, Equals, uint32(9))
+}
+
+func (s *IdxfileSuite) TestDecodeEntries(c *C) {
+	f, err := fixtures.Basic().One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Entries, HasLen, 9)
+
+	e := idx.Entries[0]
+	c.Assert(e.CreatedAt.Unix(), Equals, int64(1473350251))
+	c.Assert(e.CreatedAt.Nanosecond(), Equals, 12059307)
+	c.Assert(e.ModifiedAt.Unix(), Equals, int64(1473350251))
+	c.Assert(e.ModifiedAt.Nanosecond(), Equals, 12059307)
+	c.Assert(e.Dev, Equals, uint32(38))
+	c.Assert(e.Inode, Equals, uint32(1715795))
+	c.Assert(e.UID, Equals, uint32(1000))
+	c.Assert(e.GID, Equals, uint32(100))
+	c.Assert(e.Size, Equals, uint32(189))
+	c.Assert(e.Hash.String(), Equals, "32858aad3c383ed1ff0a0f9bdf231d54a00c9e88")
+	c.Assert(e.Name, Equals, ".gitignore")
+	c.Assert(e.Mode.String(), Equals, "-rw-r--r--")
+
+	e = idx.Entries[1]
+	c.Assert(e.Name, Equals, "CHANGELOG")
+}
+
+func (s *IdxfileSuite) TestDecodeCacheTree(c *C) {
+	f, err := fixtures.Basic().One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Entries, HasLen, 9)
+	c.Assert(idx.Cache.Entries, HasLen, 5)
+
+	for i, expected := range expectedEntries {
+		c.Assert(idx.Cache.Entries[i].Path, Equals, expected.Path)
+		c.Assert(idx.Cache.Entries[i].Entries, Equals, expected.Entries)
+		c.Assert(idx.Cache.Entries[i].Trees, Equals, expected.Trees)
+		c.Assert(idx.Cache.Entries[i].Hash.String(), Equals, expected.Hash.String())
+	}
+
+}
+
+var expectedEntries = []TreeEntry{
+	{Path: "", Entries: 9, Trees: 4, Hash: core.NewHash("a8d315b2b1c615d43042c3a62402b8a54288cf5c")},
+	{Path: "go", Entries: 1, Trees: 0, Hash: core.NewHash("a39771a7651f97faf5c72e08224d857fc35133db")},
+	{Path: "php", Entries: 1, Trees: 0, Hash: core.NewHash("586af567d0bb5e771e49bdd9434f5e0fb76d25fa")},
+	{Path: "json", Entries: 2, Trees: 0, Hash: core.NewHash("5a877e6a906a2743ad6e45d99c1793642aaf8eda")},
+	{Path: "vendor", Entries: 1, Trees: 0, Hash: core.NewHash("cf4aa3b38974fb7d81f367c0830f7d78d65ab86b")},
+}
+
+func (s *IdxfileSuite) TestDecodeMergeConflict(c *C) {
+	f, err := fixtures.Basic().ByTag("merge-conflict").One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Version, Equals, uint32(2))
+	c.Assert(idx.EntryCount, Equals, uint32(13))
+
+	expected := []struct {
+		Stage Stage
+		Hash  string
+	}{
+		{AncestorMode, "880cd14280f4b9b6ed3986d6671f907d7cc2a198"},
+		{OurMode, "d499a1a0b79b7d87a35155afd0c1cce78b37a91c"},
+		{TheirMode, "14f8e368114f561c38e134f6e68ea6fea12d77ed"},
+	}
+
+	// staged files
+	for i, e := range idx.Entries[4:7] {
+		c.Assert(e.Stage, Equals, expected[i].Stage)
+		c.Assert(e.CreatedAt.Unix(), Equals, int64(0))
+		c.Assert(e.CreatedAt.Nanosecond(), Equals, 0)
+		c.Assert(e.ModifiedAt.Unix(), Equals, int64(0))
+		c.Assert(e.ModifiedAt.Nanosecond(), Equals, 0)
+		c.Assert(e.Dev, Equals, uint32(0))
+		c.Assert(e.Inode, Equals, uint32(0))
+		c.Assert(e.UID, Equals, uint32(0))
+		c.Assert(e.GID, Equals, uint32(0))
+		c.Assert(e.Size, Equals, uint32(0))
+		c.Assert(e.Hash.String(), Equals, expected[i].Hash)
+		c.Assert(e.Name, Equals, "go/example.go")
+	}
+
+}
+
+func (s *IdxfileSuite) TestDecodeExtendedV3(c *C) {
+	f, err := fixtures.Basic().ByTag("intent-to-add").One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Version, Equals, uint32(3))
+	c.Assert(idx.EntryCount, Equals, uint32(11))
+
+	c.Assert(idx.Entries[6].Name, Equals, "intent-to-add")
+	c.Assert(idx.Entries[6].IntentToAdd, Equals, true)
+	c.Assert(idx.Entries[6].SkipWorktree, Equals, false)
+}
+
+func (s *IdxfileSuite) TestDecodeResolveUndo(c *C) {
+	f, err := fixtures.Basic().ByTag("resolve-undo").One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Version, Equals, uint32(2))
+	c.Assert(idx.EntryCount, Equals, uint32(8))
+
+	ru := idx.ResolveUndo
+	c.Assert(ru.Entries, HasLen, 2)
+	c.Assert(ru.Entries[0].Path, Equals, "go/example.go")
+	c.Assert(ru.Entries[0].Stages, HasLen, 3)
+	c.Assert(ru.Entries[0].Stages[AncestorMode], Not(Equals), core.ZeroHash)
+	c.Assert(ru.Entries[0].Stages[OurMode], Not(Equals), core.ZeroHash)
+	c.Assert(ru.Entries[0].Stages[TheirMode], Not(Equals), core.ZeroHash)
+	c.Assert(ru.Entries[1].Path, Equals, "haskal/haskal.hs")
+	c.Assert(ru.Entries[1].Stages, HasLen, 2)
+	c.Assert(ru.Entries[1].Stages[OurMode], Not(Equals), core.ZeroHash)
+	c.Assert(ru.Entries[1].Stages[TheirMode], Not(Equals), core.ZeroHash)
+}
+
+func (s *IdxfileSuite) TestDecodeV4(c *C) {
+	f, err := fixtures.Basic().ByTag("index-v4").One().DotGit().Open("index")
+	c.Assert(err, IsNil)
+
+	idx := &Index{}
+	d := NewDecoder(f)
+	err = d.Decode(idx)
+	c.Assert(err, IsNil)
+
+	c.Assert(idx.Version, Equals, uint32(4))
+	c.Assert(idx.EntryCount, Equals, uint32(11))
+
+	names := []string{
+		".gitignore", "CHANGELOG", "LICENSE", "binary.jpg", "go/example.go",
+		"haskal/haskal.hs", "intent-to-add", "json/long.json",
+		"json/short.json", "php/crappy.php", "vendor/foo.go",
+	}
+
+	for i, e := range idx.Entries {
+		c.Assert(e.Name, Equals, names[i])
+	}
+
+	c.Assert(idx.Entries[6].Name, Equals, "intent-to-add")
+	c.Assert(idx.Entries[6].IntentToAdd, Equals, true)
+	c.Assert(idx.Entries[6].SkipWorktree, Equals, false)
+}
diff --git a/formats/index/doc.go b/formats/index/doc.go
new file mode 100644
index 0000000..285eade
--- /dev/null
+++ b/formats/index/doc.go
@@ -0,0 +1,302 @@
+// Package index implements an encoder/decoder of index format files
+package index
+
+/*
+Git index format
+================
+
+== The Git index file has the following format
+
+  All binary numbers are in network byte order. Version 2 is described
+  here unless stated otherwise.
+
+   - A 12-byte header consisting of
+
+     4-byte signature:
+       The signature is { 'D', 'I', 'R', 'C' } (stands for "dircache")
+
+     4-byte version number:
+       The current supported versions are 2, 3 and 4.
+
+     32-bit number of index entries.
+
+   - A number of sorted index entries (see below).
+
+   - Extensions
+
+     Extensions are identified by signature. Optional extensions can
+     be ignored if Git does not understand them.
+
+     Git currently supports cached tree and resolve undo extensions.
+
+     4-byte extension signature. If the first byte is 'A'..'Z' the
+     extension is optional and can be ignored.
+
+     32-bit size of the extension
+
+     Extension data
+
+   - 160-bit SHA-1 over the content of the index file before this
+     checksum.
+
+== Index entry
+
+  Index entries are sorted in ascending order on the name field,
+  interpreted as a string of unsigned bytes (i.e. memcmp() order, no
+  localization, no special casing of directory separator '/'). Entries
+  with the same name are sorted by their stage field.
+
+  32-bit ctime seconds, the last time a file's metadata changed
+    this is stat(2) data
+
+  32-bit ctime nanosecond fractions
+    this is stat(2) data
+
+  32-bit mtime seconds, the last time a file's data changed
+    this is stat(2) data
+
+  32-bit mtime nanosecond fractions
+    this is stat(2) data
+
+  32-bit dev
+    this is stat(2) data
+
+  32-bit ino
+    this is stat(2) data
+
+  32-bit mode, split into (high to low bits)
+
+    4-bit object type
+      valid values in binary are 1000 (regular file), 1010 (symbolic link)
+      and 1110 (gitlink)
+
+    3-bit unused
+
+    9-bit unix permission. Only 0755 and 0644 are valid for regular files.
+    Symbolic links and gitlinks have value 0 in this field.
+
+  32-bit uid
+    this is stat(2) data
+
+  32-bit gid
+    this is stat(2) data
+
+  32-bit file size
+    This is the on-disk size from stat(2), truncated to 32-bit.
+
+  160-bit SHA-1 for the represented object
+
+  A 16-bit 'flags' field split into (high to low bits)
+
+    1-bit assume-valid flag
+
+    1-bit extended flag (must be zero in version 2)
+
+    2-bit stage (during merge)
+
+    12-bit name length if the length is less than 0xFFF; otherwise 0xFFF
+    is stored in this field.
+
+  (Version 3 or later) A 16-bit field, only applicable if the
+  "extended flag" above is 1, split into (high to low bits).
+
+    1-bit reserved for future
+
+    1-bit skip-worktree flag (used by sparse checkout)
+
+    1-bit intent-to-add flag (used by "git add -N")
+
+    13-bit unused, must be zero
+
+  Entry path name (variable length) relative to top level directory
+    (without leading slash). '/' is used as path separator. The special
+    path components ".", ".." and ".git" (without quotes) are disallowed.
+    Trailing slash is also disallowed.
+
+    The exact encoding is undefined, but the '.' and '/' characters
+    are encoded in 7-bit ASCII and the encoding cannot contain a NUL
+    byte (iow, this is a UNIX pathname).
+
+  (Version 4) In version 4, the entry path name is prefix-compressed
+    relative to the path name for the previous entry (the very first
+    entry is encoded as if the path name for the previous entry is an
+    empty string).  At the beginning of an entry, an integer N in the
+    variable width encoding (the same encoding as the offset is encoded
+    for OFS_DELTA pack entries; see pack-format.txt) is stored, followed
+    by a NUL-terminated string S.  Removing N bytes from the end of the
+    path name for the previous entry, and replacing it with the string S
+    yields the path name for this entry.
+
+  1-8 nul bytes as necessary to pad the entry to a multiple of eight bytes
+  while keeping the name NUL-terminated.
+
+  (Version 4) In version 4, the padding after the pathname does not
+  exist.
+
+  Interpretation of index entries in split index mode is completely
+  different. See below for details.
+
+== Extensions
+
+=== Cached tree
+
+  Cached tree extension contains pre-computed hashes for trees that can
+  be derived from the index. It helps speed up tree object generation
+  from index for a new commit.
+
+  When a path is updated in index, the path must be invalidated and
+  removed from tree cache.
+
+  The signature for this extension is { 'T', 'R', 'E', 'E' }.
+
+  A series of entries fill the entire extension; each of which
+  consists of:
+
+  - NUL-terminated path component (relative to its parent directory);
+
+  - ASCII decimal number of entries in the index that is covered by the
+    tree this entry represents (entry_count);
+
+  - A space (ASCII 32);
+
+  - ASCII decimal number that represents the number of subtrees this
+    tree has;
+
+  - A newline (ASCII 10); and
+
+  - 160-bit object name for the object that would result from writing
+    this span of index as a tree.
+
+  An entry can be in an invalidated state and is represented by having
+  a negative number in the entry_count field. In this case, there is no
+  object name and the next entry starts immediately after the newline.
+  When writing an invalid entry, -1 should always be used as entry_count.
+
+  The entries are written out in the top-down, depth-first order.  The
+  first entry represents the root level of the repository, followed by the
+  first subtree--let's call this A--of the root level (with its name
+  relative to the root level), followed by the first subtree of A (with
+  its name relative to A), ...
+
+=== Resolve undo
+
+  A conflict is represented in the index as a set of higher stage entries.
+  When a conflict is resolved (e.g. with "git add path"), these higher
+  stage entries will be removed and a stage-0 entry with proper resolution
+  is added.
+
+  When these higher stage entries are removed, they are saved in the
+  resolve undo extension, so that conflicts can be recreated (e.g. with
+  "git checkout -m"), in case users want to redo a conflict resolution
+  from scratch.
+
+  The signature for this extension is { 'R', 'E', 'U', 'C' }.
+
+  A series of entries fill the entire extension; each of which
+  consists of:
+
+  - NUL-terminated pathname the entry describes (relative to the root of
+    the repository, i.e. full pathname);
+
+  - Three NUL-terminated ASCII octal numbers, entry mode of entries in
+    stage 1 to 3 (a missing stage is represented by "0" in this field);
+    and
+
+  - At most three 160-bit object names of the entry in stages from 1 to 3
+    (nothing is written for a missing stage).
+
+=== Split index
+
+  In split index mode, the majority of index entries could be stored
+  in a separate file. This extension records the changes to be made on
+  top of that to produce the final index.
+
+  The signature for this extension is { 'l', 'i', 'n', 'k' }.
+
+  The extension consists of:
+
+  - 160-bit SHA-1 of the shared index file. The shared index file path
+    is $GIT_DIR/sharedindex.<SHA-1>. If all 160 bits are zero, the
+    index does not require a shared index file.
+
+  - An ewah-encoded delete bitmap, each bit represents an entry in the
+    shared index. If a bit is set, its corresponding entry in the
+    shared index will be removed from the final index.  Note, because
+    a delete operation changes index entry positions, but we do need
+    original positions in replace phase, it's best to just mark
+    entries for removal, then do a mass deletion after replacement.
+
+  - An ewah-encoded replace bitmap, each bit represents an entry in
+    the shared index. If a bit is set, its corresponding entry in the
+    shared index will be replaced with an entry in this index
+    file. All replaced entries are stored in sorted order in this
+    index. The first "1" bit in the replace bitmap corresponds to the
+    first index entry, the second "1" bit to the second entry and so
+    on. Replaced entries may have empty path names to save space.
+
+  The remaining index entries after replaced ones will be added to the
+  final index. These added entries are also sorted by entry name then
+  stage.
+
+== Untracked cache
+
+  Untracked cache saves the untracked file list and necessary data to
+  verify the cache. The signature for this extension is { 'U', 'N',
+  'T', 'R' }.
+
+  The extension starts with
+
+  - A sequence of NUL-terminated strings, preceded by the size of the
+    sequence in variable width encoding. Each string describes the
+    environment where the cache can be used.
+
+  - Stat data of $GIT_DIR/info/exclude. See "Index entry" section from
+    ctime field until "file size".
+
+  - Stat data of core.excludesfile
+
+  - 32-bit dir_flags (see struct dir_struct)
+
+  - 160-bit SHA-1 of $GIT_DIR/info/exclude. Null SHA-1 means the file
+    does not exist.
+
+  - 160-bit SHA-1 of core.excludesfile. Null SHA-1 means the file does
+    not exist.
+
+  - NUL-terminated string of per-dir exclude file name. This usually
+    is ".gitignore".
+
+  - The number of following directory blocks, variable width
+    encoding. If this number is zero, the extension ends here with a
+    following NUL.
+
+  - A number of directory blocks in depth-first-search order, each
+    consists of
+
+    - The number of untracked entries, variable width encoding.
+
+    - The number of sub-directory blocks, variable width encoding.
+
+    - The directory name terminated by NUL.
+
+    - A number of untracked file/dir names terminated by NUL.
+
+The remaining data of each directory block is grouped by type:
+
+  - An ewah bitmap, the n-th bit marks whether the n-th directory has
+    valid untracked cache entries.
+
+  - An ewah bitmap, the n-th bit records "check-only" bit of
+    read_directory_recursive() for the n-th directory.
+
+  - An ewah bitmap, the n-th bit indicates whether SHA-1 and stat data
+    is valid for the n-th directory and exists in the next data.
+
+  - An array of stat data. The n-th data corresponds with the n-th
+    "one" bit in the previous ewah bitmap.
+
+  - An array of SHA-1. The n-th SHA-1 corresponds with the n-th "one" bit
+    in the previous ewah bitmap.
+
+  - One NUL.
+*/
diff --git a/formats/index/index.go b/formats/index/index.go
new file mode 100644
index 0000000..bea199e
--- /dev/null
+++ b/formats/index/index.go
@@ -0,0 +1,86 @@
+package index
+
+import (
+	"os"
+	"time"
+
+	"gopkg.in/src-d/go-git.v4/core"
+)
+
+type Stage int
+
+const (
+	// Merged is the default stage (0), fully merged
+	Merged Stage = 0
+	// AncestorMode is the base revision (stage 1)
+	AncestorMode Stage = 1
+	// OurMode is the first tree revision, ours (stage 2)
+	OurMode Stage = 2
+	// TheirMode is the second tree revision, theirs (stage 3)
+	TheirMode Stage = 3
+)
+
+// Index contains the information about which objects are currently checked out
+// in the worktree, having information about the working files. Changes in
+// worktree are detected using this Index. The Index is also used during merges
+type Index struct {
+	Version     uint32
+	EntryCount  uint32
+	Entries     []Entry
+	Cache       *Tree
+	ResolveUndo *ResolveUndo
+}
+
+// Entry represents a single file (or stage of a file) in the cache. An entry
+// represents exactly one stage of a file. If a file path is unmerged then
+// multiple Entry instances may appear for the same path name.
+type Entry struct {
+	CreatedAt  time.Time
+	ModifiedAt time.Time
+	Dev, Inode uint32
+	Mode       os.FileMode
+	UID, GID   uint32
+	Size       uint32
+	Flags      uint16
+	Stage      Stage
+
+	SkipWorktree bool
+	IntentToAdd  bool
+
+	Hash core.Hash
+	Name string
+}
+
+// Tree contains pre-computed hashes for trees that can be derived from the
+// index. It helps speed up tree object generation from index for a new commit.
+type Tree struct {
+	Entries []TreeEntry
+}
+
+// TreeEntry entry of a cached Tree
+type TreeEntry struct {
+	// Path component (relative to its parent directory)
+	Path string
+	// Entries is the number of entries in the index that is covered by the tree
+	// this entry represents
+	Entries int
+	// Trees is the number that represents the number of subtrees this tree has
+	Trees int
+	// Hash object name for the object that would result from writing this span
+	// of index as a tree.
+	Hash core.Hash
+}
+
+// ResolveUndo when a conflict is resolved (e.g. with "git add path"), these
+// higher stage entries will be removed and a stage-0 entry with proper
+// resolution is added. When these higher stage entries are removed, they are
+// saved in the resolve undo extension
+type ResolveUndo struct {
+	Entries []ResolveUndoEntry
+}
+
+// ResolveUndoEntry contains the information about a conflict when it is resolved
+type ResolveUndoEntry struct {
+	Path   string
+	Stages map[Stage]core.Hash
+}
-- 
cgit