path: root/plumbing/format/packfile/patch_delta.go
author    Paulo Gomes <pjbgf@linux.com>  2023-07-20 21:19:28 +0100
committer Paulo Gomes <pjbgf@linux.com>  2023-10-28 03:51:35 +0100
commit    1c361adbc1f4b0e3d0743d11f187fb0b3ac4cb4d (patch)
tree      b7eecbf517a8e1a59047cb98d414bf00ba1f4653 /plumbing/format/packfile/patch_delta.go
parent    814abc098d033f77315d3bfb89ae5991aae10457 (diff)
plumbing: Optimise memory consumption for filesystem storage
Previously, as part of building the index representation, the resolveObject func would create an interim plumbing.MemoryObject, which would then be saved into storage via storage.SetEncodedObject. This meant that objects were unnecessarily loaded into memory, only to then be saved to disk.

The changes streamline this process by:

- Introducing the LazyObjectWriter interface, which enables the write operation to take place directly against the filesystem-based storage.
- Leveraging multi-writers to process the input data once, while targeting multiple writers (e.g. hasher and storage).

An additional change relates to the caching of object info children within Parser.get. The cache is now skipped when a seekable filesystem is being used.

The impact of the changes can be observed when using seekable filesystem storages, especially when cloning large repositories. The stats below were captured by adapting the BenchmarkPlainClone test to clone https://github.com/torvalds/linux.git:

pkg: github.com/go-git/go-git/v5
cpu: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz

              │  /tmp/old   │             /tmp/new              │
              │   sec/op    │   sec/op     vs base              │
PlainClone-16   41.68 ± 17%   48.04 ± 9%   +15.27% (p=0.015 n=6)

              │   /tmp/old    │              /tmp/new               │
              │     B/op      │     B/op       vs base              │
PlainClone-16   1127.8Mi ± 7%   256.7Mi ± 50%   -77.23% (p=0.002 n=6)

              │  /tmp/old   │             /tmp/new              │
              │  allocs/op  │  allocs/op   vs base              │
PlainClone-16   3.125M ± 0%   3.800M ± 0%   +21.60% (p=0.002 n=6)

Notice that on average the memory consumption per operation is over 75% smaller. The time per operation increased by 15%, which may in practice be lower for long-running applications, due to the decreased GC pressure and garbage collection costs.

Signed-off-by: Paulo Gomes <pjbgf@linux.com>
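To illustrate the two ideas above, here is a minimal, self-contained Go sketch. The LazyObjectWriter shape is an assumption based on the description (the real interface lives in go-git's storer package); the io.MultiWriter call mirrors the hasher-plus-storage pattern the commit message describes.

// Sketch only. LazyObjectWriter's exact definition is assumed here;
// see go-git's storer package for the real interface.
package main

import (
    "crypto/sha1"
    "fmt"
    "hash"
    "io"
    "strings"
)

// Assumed shape: storage hands out a direct writer plus a callback for
// writing the object header once the type and size are known, so the
// object never has to be buffered in memory first.
type LazyObjectWriter interface {
    LazyWriter() (w io.WriteCloser, writeHeader func(typ string, sz int64) error, err error)
}

// writeObject streams the payload exactly once: io.MultiWriter feeds
// the same bytes to both the storage writer and the hasher.
func writeObject(h hash.Hash, storage io.Writer, payload io.Reader) ([]byte, error) {
    mw := io.MultiWriter(storage, h)
    if _, err := io.Copy(mw, payload); err != nil {
        return nil, err
    }
    return h.Sum(nil), nil
}

func main() {
    var stored strings.Builder
    sum, err := writeObject(sha1.New(), &stored, strings.NewReader("blob content"))
    if err != nil {
        panic(err)
    }
    fmt.Printf("stored %d bytes, hash %x\n", stored.Len(), sum)
}

Because io.MultiWriter writes each chunk to every writer in turn, the input is read a single time however many consumers it feeds, which is what removes the need for the interim MemoryObject.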
Diffstat (limited to 'plumbing/format/packfile/patch_delta.go')
-rw-r--r--   plumbing/format/packfile/patch_delta.go   102
1 file changed, 102 insertions, 0 deletions
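For context on the command loop in the diff below: a git delta stream opens with two LEB128-encoded sizes (source and target), followed by a sequence of commands. Here is a hedged sketch of the standard dispatch rules; the helper names mirror those in the diff, but the bodies are reconstructed from the packfile delta format, not copied from go-git.

package main

import "fmt"

// MSB set: copy an (offset, size) range out of the base object; the
// low seven bits flag which offset/size bytes follow the command.
func isCopyFromSrc(cmd byte) bool { return cmd&0x80 != 0 }

// MSB clear and nonzero: insert the next cmd bytes of the delta stream
// verbatim into the target.
func isCopyFromDelta(cmd byte) bool { return cmd&0x80 == 0 && cmd != 0 }

func main() {
    for _, cmd := range []byte{0x91, 0x05, 0x00} {
        switch {
        case isCopyFromSrc(cmd):
            fmt.Printf("0x%02x: copy from base\n", cmd)
        case isCopyFromDelta(cmd):
            fmt.Printf("0x%02x: insert %d literal bytes\n", cmd, cmd)
        default:
            fmt.Printf("0x%02x: reserved byte, invalid delta\n", cmd)
        }
    }
}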
diff --git a/plumbing/format/packfile/patch_delta.go b/plumbing/format/packfile/patch_delta.go
index f00562d..67c20ff 100644
--- a/plumbing/format/packfile/patch_delta.go
+++ b/plumbing/format/packfile/patch_delta.go
@@ -4,6 +4,7 @@ import (
     "bufio"
     "bytes"
     "errors"
+    "fmt"
     "io"
     "math"
@@ -265,6 +266,107 @@ func patchDelta(dst *bytes.Buffer, src, delta []byte) error {
     return nil
 }
+func patchDeltaWriter(dst io.Writer, base io.ReaderAt, delta io.Reader,
+    typ plumbing.ObjectType, writeHeader objectHeaderWriter) (uint, plumbing.Hash, error) {
+    deltaBuf := bufio.NewReaderSize(delta, 1024)
+    srcSz, err := decodeLEB128ByteReader(deltaBuf)
+    if err != nil {
+        if err == io.EOF {
+            return 0, plumbing.ZeroHash, ErrInvalidDelta
+        }
+        return 0, plumbing.ZeroHash, err
+    }
+
+    if r, ok := base.(*bytes.Reader); ok && srcSz != uint(r.Size()) {
+        return 0, plumbing.ZeroHash, ErrInvalidDelta
+    }
+
+    targetSz, err := decodeLEB128ByteReader(deltaBuf)
+    if err != nil {
+        if err == io.EOF {
+            return 0, plumbing.ZeroHash, ErrInvalidDelta
+        }
+        return 0, plumbing.ZeroHash, err
+    }
+
+    // If the header still needs to be written, the caller will provide
+    // an objectHeaderWriter. This seems to be the case when dealing
+    // with thin-packs.
+    if writeHeader != nil {
+        err = writeHeader(typ, int64(targetSz))
+        if err != nil {
+            return 0, plumbing.ZeroHash, fmt.Errorf("could not lazy write header: %w", err)
+        }
+    }
+
+    remainingTargetSz := targetSz
+
+    hasher := plumbing.NewHasher(typ, int64(targetSz))
+    mw := io.MultiWriter(dst, hasher)
+
+    bufp := sync.GetByteSlice()
+    defer sync.PutByteSlice(bufp)
+
+    sr := io.NewSectionReader(base, int64(0), int64(srcSz))
+    // Keep both the io.LimitedReader types, so we can reset N.
+    baselr := io.LimitReader(sr, 0).(*io.LimitedReader)
+    deltalr := io.LimitReader(deltaBuf, 0).(*io.LimitedReader)
+
+    for {
+        buf := *bufp
+        cmd, err := deltaBuf.ReadByte()
+        if err == io.EOF {
+            return 0, plumbing.ZeroHash, ErrInvalidDelta
+        }
+        if err != nil {
+            return 0, plumbing.ZeroHash, err
+        }
+
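+        // Copy command (MSB set): copy sz bytes from the base object,
+        // starting at offset.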
+        if isCopyFromSrc(cmd) {
+            offset, err := decodeOffsetByteReader(cmd, deltaBuf)
+            if err != nil {
+                return 0, plumbing.ZeroHash, err
+            }
+            sz, err := decodeSizeByteReader(cmd, deltaBuf)
+            if err != nil {
+                return 0, plumbing.ZeroHash, err
+            }
+
+            if invalidSize(sz, targetSz) ||
+                invalidOffsetSize(offset, sz, srcSz) {
+                return 0, plumbing.ZeroHash, ErrInvalidDelta
+            }
+
+            if _, err := sr.Seek(int64(offset), io.SeekStart); err != nil {
+                return 0, plumbing.ZeroHash, err
+            }
+            baselr.N = int64(sz)
+            if _, err := io.CopyBuffer(mw, baselr, buf); err != nil {
+                return 0, plumbing.ZeroHash, err
+            }
+            remainingTargetSz -= sz
+        } else if isCopyFromDelta(cmd) {
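+            // Insert command (MSB clear, cmd != 0): the next cmd bytes
+            // of the delta stream are literal data for the target.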
+            sz := uint(cmd) // cmd is the size itself
+            if invalidSize(sz, targetSz) {
+                return 0, plumbing.ZeroHash, ErrInvalidDelta
+            }
+            deltalr.N = int64(sz)
+            if _, err := io.CopyBuffer(mw, deltalr, buf); err != nil {
+                return 0, plumbing.ZeroHash, err
+            }
+
+            remainingTargetSz -= sz
+        } else {
+            return 0, plumbing.ZeroHash, ErrInvalidDelta
+        }
+        if remainingTargetSz <= 0 {
+            break
+        }
+    }
+
+    return targetSz, hasher.Sum(), nil
+}
+
 // Decodes a number encoded as an unsigned LEB128 at the start of some
 // binary data and returns the decoded number and the rest of the
 // stream.
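For illustration, here is a minimal sketch of the unsigned LEB128 decoding that the two size headers rely on. It follows the standard encoding (seven payload bits per byte, MSB as continuation flag) and is not go-git's exact decodeLEB128ByteReader implementation.

package main

import (
    "bytes"
    "fmt"
    "io"
)

// decodeULEB128 reads bytes until one arrives with the MSB clear,
// accumulating seven payload bits per byte, least significant first.
func decodeULEB128(r io.ByteReader) (uint, error) {
    var num, shift uint
    for {
        b, err := r.ReadByte()
        if err != nil {
            return 0, err // io.EOF here means a truncated header
        }
        num |= uint(b&0x7f) << shift // low seven bits carry the payload
        if b&0x80 == 0 {             // MSB clear: this was the last byte
            return num, nil
        }
        shift += 7
    }
}

func main() {
    // 0xe5 0x8e 0x26 is the textbook encoding of 624485.
    n, err := decodeULEB128(bytes.NewReader([]byte{0xe5, 0x8e, 0x26}))
    if err != nil {
        panic(err)
    }
    fmt.Println(n) // 624485
}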