From 0b68b0fd8e8a0f57419a72479ddeb8d267f15f72 Mon Sep 17 00:00:00 2001 From: Miguel Molina Date: Wed, 6 Sep 2017 12:13:41 +0200 Subject: packfile: reuse delta indexes when possible Signed-off-by: Miguel Molina --- plumbing/format/packfile/diff_delta.go | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'plumbing/format/packfile/diff_delta.go') diff --git a/plumbing/format/packfile/diff_delta.go b/plumbing/format/packfile/diff_delta.go index 7e9f822..d0db46b 100644 --- a/plumbing/format/packfile/diff_delta.go +++ b/plumbing/format/packfile/diff_delta.go @@ -26,6 +26,10 @@ const ( // To generate target again, you will need the obtained object and "base" one. // Error will be returned if base or target object cannot be read. func GetDelta(base, target plumbing.EncodedObject) (plumbing.EncodedObject, error) { + return getDelta(make(deltaIndex), base, target) +} + +func getDelta(index deltaIndex, base, target plumbing.EncodedObject) (plumbing.EncodedObject, error) { br, err := base.Reader() if err != nil { return nil, err @@ -45,7 +49,7 @@ func GetDelta(base, target plumbing.EncodedObject) (plumbing.EncodedObject, erro return nil, err } - db := DiffDelta(bb, tb) + db := DiffDelta(index, bb, tb) delta := &plumbing.MemoryObject{} _, err = delta.Write(db) if err != nil { @@ -59,13 +63,15 @@ func GetDelta(base, target plumbing.EncodedObject) (plumbing.EncodedObject, erro } // DiffDelta returns the delta that transforms src into tgt. -func DiffDelta(src []byte, tgt []byte) []byte { +func DiffDelta(sindex deltaIndex, src []byte, tgt []byte) []byte { buf := bufPool.Get().(*bytes.Buffer) buf.Reset() buf.Write(deltaEncodeSize(len(src))) buf.Write(deltaEncodeSize(len(tgt))) - sindex := initMatch(src) + if len(sindex) == 0 { + initMatch(sindex, src) + } ibuf := bufPool.Get().(*bytes.Buffer) ibuf.Reset() @@ -126,9 +132,8 @@ func encodeInsertOperation(ibuf, buf *bytes.Buffer) { ibuf.Reset() } -func initMatch(src []byte) map[uint32]int { +func initMatch(index map[uint32]int, src []byte) map[uint32]int { i := 0 - index := make(map[uint32]int) for { if i+s > len(src) { break @@ -213,3 +218,5 @@ func encodeCopyOperation(offset, length int) []byte { return append([]byte{byte(code)}, opcodes...) } + +type deltaIndex map[uint32]int -- cgit From b953011e07002189f614a5c42f7bb28079558fdb Mon Sep 17 00:00:00 2001 From: Miguel Molina Date: Wed, 6 Sep 2017 14:26:30 +0200 Subject: packfile: slightly haster hash function for chunk-offset index key Signed-off-by: Miguel Molina --- plumbing/format/packfile/diff_delta.go | 39 ++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'plumbing/format/packfile/diff_delta.go') diff --git a/plumbing/format/packfile/diff_delta.go b/plumbing/format/packfile/diff_delta.go index d0db46b..295b411 100644 --- a/plumbing/format/packfile/diff_delta.go +++ b/plumbing/format/packfile/diff_delta.go @@ -2,7 +2,6 @@ package packfile import ( "bytes" - "hash/adler32" "io/ioutil" "gopkg.in/src-d/go-git.v4/plumbing" @@ -49,7 +48,7 @@ func getDelta(index deltaIndex, base, target plumbing.EncodedObject) (plumbing.E return nil, err } - db := DiffDelta(index, bb, tb) + db := diffDelta(index, bb, tb) delta := &plumbing.MemoryObject{} _, err = delta.Write(db) if err != nil { @@ -63,7 +62,11 @@ func getDelta(index deltaIndex, base, target plumbing.EncodedObject) (plumbing.E } // DiffDelta returns the delta that transforms src into tgt. -func DiffDelta(sindex deltaIndex, src []byte, tgt []byte) []byte { +func DiffDelta(src, tgt []byte) []byte { + return diffDelta(make(deltaIndex), src, tgt) +} + +func diffDelta(sindex deltaIndex, src []byte, tgt []byte) []byte { buf := bufPool.Get().(*bytes.Buffer) buf.Reset() buf.Write(deltaEncodeSize(len(src))) @@ -132,24 +135,42 @@ func encodeInsertOperation(ibuf, buf *bytes.Buffer) { ibuf.Reset() } -func initMatch(index map[uint32]int, src []byte) map[uint32]int { +func initMatch(index deltaIndex, src []byte) { i := 0 for { if i+s > len(src) { break } - ch := adler32.Checksum(src[i : i+s]) + ch := hashBuf(src[i : i+s]) index[ch] = i i += s } +} + +// https://lemire.me/blog/2015/10/22/faster-hashing-without-effort/ +func hashBuf(buf []byte) int64 { + var h int64 + var i int + len := len(buf) + for ; i+3 < len; i += 4 { + h = 31*31*31*31*h + + 31*31*31*int64(buf[i]) + + 31*31*int64(buf[i+1]) + + 31*int64(buf[i+2]) + + int64(buf[i+3]) + } + + for ; i < len; i++ { + h = 31*h + int64(buf[i]) + } - return index + return h } -func findMatch(src, tgt []byte, sindex map[uint32]int, tgtOffset int) (srcOffset, l int) { +func findMatch(src, tgt []byte, sindex deltaIndex, tgtOffset int) (srcOffset, l int) { if len(tgt) >= tgtOffset+s { - ch := adler32.Checksum(tgt[tgtOffset : tgtOffset+s]) + ch := hashBuf(tgt[tgtOffset : tgtOffset+s]) var ok bool srcOffset, ok = sindex[ch] if !ok { @@ -219,4 +240,4 @@ func encodeCopyOperation(offset, length int) []byte { return append([]byte{byte(code)}, opcodes...) } -type deltaIndex map[uint32]int +type deltaIndex map[int64]int -- cgit From 508a89726c9287b3fb5daac23b7957d4297bc42a Mon Sep 17 00:00:00 2001 From: Miguel Molina Date: Wed, 6 Sep 2017 18:14:55 +0200 Subject: packfile: use a modified version of JGit DeltaIndex and DeltaIndexScanner Signed-off-by: Miguel Molina --- plumbing/format/packfile/diff_delta.go | 44 ++++++---------------------------- 1 file changed, 7 insertions(+), 37 deletions(-) (limited to 'plumbing/format/packfile/diff_delta.go') diff --git a/plumbing/format/packfile/diff_delta.go b/plumbing/format/packfile/diff_delta.go index 295b411..7c9360e 100644 --- a/plumbing/format/packfile/diff_delta.go +++ b/plumbing/format/packfile/diff_delta.go @@ -25,10 +25,10 @@ const ( // To generate target again, you will need the obtained object and "base" one. // Error will be returned if base or target object cannot be read. func GetDelta(base, target plumbing.EncodedObject) (plumbing.EncodedObject, error) { - return getDelta(make(deltaIndex), base, target) + return getDelta(new(deltaIndex), base, target) } -func getDelta(index deltaIndex, base, target plumbing.EncodedObject) (plumbing.EncodedObject, error) { +func getDelta(index *deltaIndex, base, target plumbing.EncodedObject) (plumbing.EncodedObject, error) { br, err := base.Reader() if err != nil { return nil, err @@ -63,23 +63,23 @@ func getDelta(index deltaIndex, base, target plumbing.EncodedObject) (plumbing.E // DiffDelta returns the delta that transforms src into tgt. func DiffDelta(src, tgt []byte) []byte { - return diffDelta(make(deltaIndex), src, tgt) + return diffDelta(new(deltaIndex), src, tgt) } -func diffDelta(sindex deltaIndex, src []byte, tgt []byte) []byte { +func diffDelta(index *deltaIndex, src []byte, tgt []byte) []byte { buf := bufPool.Get().(*bytes.Buffer) buf.Reset() buf.Write(deltaEncodeSize(len(src))) buf.Write(deltaEncodeSize(len(tgt))) - if len(sindex) == 0 { - initMatch(sindex, src) + if len(index.entries) == 0 { + index.init(src) } ibuf := bufPool.Get().(*bytes.Buffer) ibuf.Reset() for i := 0; i < len(tgt); i++ { - offset, l := findMatch(src, tgt, sindex, i) + offset, l := index.findMatch(src, tgt, i) if l < s { ibuf.WriteByte(tgt[i]) @@ -135,19 +135,6 @@ func encodeInsertOperation(ibuf, buf *bytes.Buffer) { ibuf.Reset() } -func initMatch(index deltaIndex, src []byte) { - i := 0 - for { - if i+s > len(src) { - break - } - - ch := hashBuf(src[i : i+s]) - index[ch] = i - i += s - } -} - // https://lemire.me/blog/2015/10/22/faster-hashing-without-effort/ func hashBuf(buf []byte) int64 { var h int64 @@ -168,21 +155,6 @@ func hashBuf(buf []byte) int64 { return h } -func findMatch(src, tgt []byte, sindex deltaIndex, tgtOffset int) (srcOffset, l int) { - if len(tgt) >= tgtOffset+s { - ch := hashBuf(tgt[tgtOffset : tgtOffset+s]) - var ok bool - srcOffset, ok = sindex[ch] - if !ok { - return - } - - l = matchLength(src, tgt, tgtOffset, srcOffset) - } - - return -} - func matchLength(src, tgt []byte, otgt, osrc int) int { l := 0 for { @@ -239,5 +211,3 @@ func encodeCopyOperation(offset, length int) []byte { return append([]byte{byte(code)}, opcodes...) } - -type deltaIndex map[int64]int -- cgit From 90418e6ab682d1888dd7658a8ebd72a7be8cf502 Mon Sep 17 00:00:00 2001 From: Miguel Molina Date: Thu, 7 Sep 2017 10:02:29 +0200 Subject: packfile: parallelize deltification of objects in groups Signed-off-by: Miguel Molina --- plumbing/format/packfile/diff_delta.go | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'plumbing/format/packfile/diff_delta.go') diff --git a/plumbing/format/packfile/diff_delta.go b/plumbing/format/packfile/diff_delta.go index 7c9360e..2e58ec6 100644 --- a/plumbing/format/packfile/diff_delta.go +++ b/plumbing/format/packfile/diff_delta.go @@ -135,26 +135,6 @@ func encodeInsertOperation(ibuf, buf *bytes.Buffer) { ibuf.Reset() } -// https://lemire.me/blog/2015/10/22/faster-hashing-without-effort/ -func hashBuf(buf []byte) int64 { - var h int64 - var i int - len := len(buf) - for ; i+3 < len; i += 4 { - h = 31*31*31*31*h + - 31*31*31*int64(buf[i]) + - 31*31*int64(buf[i+1]) + - 31*int64(buf[i+2]) + - int64(buf[i+3]) - } - - for ; i < len; i++ { - h = 31*h + int64(buf[i]) - } - - return h -} - func matchLength(src, tgt []byte, otgt, osrc int) int { l := 0 for { -- cgit From 6d486b4f996c8abffd3206590303d38381a3dca8 Mon Sep 17 00:00:00 2001 From: Miguel Molina Date: Thu, 7 Sep 2017 12:02:52 +0200 Subject: packfile: small optimizations for findMatch and matchLength Signed-off-by: Miguel Molina --- plumbing/format/packfile/diff_delta.go | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'plumbing/format/packfile/diff_delta.go') diff --git a/plumbing/format/packfile/diff_delta.go b/plumbing/format/packfile/diff_delta.go index 2e58ec6..d4b207a 100644 --- a/plumbing/format/packfile/diff_delta.go +++ b/plumbing/format/packfile/diff_delta.go @@ -81,8 +81,22 @@ func diffDelta(index *deltaIndex, src []byte, tgt []byte) []byte { for i := 0; i < len(tgt); i++ { offset, l := index.findMatch(src, tgt, i) - if l < s { + if l == 0 { + // couldn't find a match, just write the current byte and continue ibuf.WriteByte(tgt[i]) + } else if l < 0 { + // src is less than blksz, copy the rest of the target to avoid + // calls to findMatch + for ; i < len(tgt); i++ { + ibuf.WriteByte(tgt[i]) + } + } else if l < s { + // remaining target is less than blksz, copy what's left of it + // and avoid calls to findMatch + for j := i; j < i+l; j++ { + ibuf.WriteByte(tgt[j]) + } + i += l - 1 } else { encodeInsertOperation(ibuf, buf) @@ -135,21 +149,6 @@ func encodeInsertOperation(ibuf, buf *bytes.Buffer) { ibuf.Reset() } -func matchLength(src, tgt []byte, otgt, osrc int) int { - l := 0 - for { - if (osrc >= len(src) || otgt >= len(tgt)) || src[osrc] != tgt[otgt] { - break - } - - l++ - osrc++ - otgt++ - } - - return l -} - func deltaEncodeSize(size int) []byte { var ret []byte c := size & 0x7f -- cgit