utils: filesystem, Calculate filesystem node's hash lazily.

The main motivation behind this change is to speed up the status operation. Currently it is very slow, especially for repositories with lots of ignored files (e.g. a Node.js repository with a node_modules directory). One of the reasons for this slowness is that traversing the filesystem involves calculating file hashes for all files, even if those hashes are not needed in the end because the files are listed in gitignore. On my machine, for a random repository with a sizable (ignored) node_modules directory, this change brings the execution time of Worktree.Status down from ~3.5s to ~1.4s. This is still very slow, but it is a significant improvement. A better fix (instead of or in addition to this one) would be to avoid traversing ignored files in the first place. However, such a change seems to be more intrusive and would require a much deeper understanding of the codebase.
author: Roman Bataev <roman@bataev.me> 2023-08-06 16:40:23 -0400
committer: Roman Bataev <roman@bataev.me> 2023-08-21 22:30:00 -0400
commit: c5b2e50b8907555538f52356fc04b2a5820b5c6f (patch)
tree: 41e9e9b9a0c231f2c86819804b0b6d9bc16823fb /utils
parent: cd6170c6f808453f58dcfd15c6c59345e3df402b (diff)
download: go-git-c5b2e50b8907555538f52356fc04b2a5820b5c6f.tar.gz
1 files changed, 37 insertions, 39 deletions
diff --git a/utils/merkletrie/filesystem/node.go b/utils/merkletrie/filesystem/node.go
index ad169ff..f9a54d7 100644
--- a/utils/merkletrie/filesystem/node.go
+++ b/utils/merkletrie/filesystem/node.go
@@ -29,6 +29,8 @@ type node struct {
 	hash     []byte
 	children []noder.Noder
 	isDir    bool
+	mode     os.FileMode
+	size     int64
 }
 
 // NewRootNode returns the root node based on a given billy.Filesystem.
@@ -50,6 +52,9 @@ func NewRootNode(
 //
 // The hash of a directory is always a 24-bytes slice of zero values
 func (n *node) Hash() []byte {
+	if n.hash == nil {
+		n.calculateHash()
+	}
 	return n.hash
 }
 
@@ -117,81 +122,74 @@ func (n *node) calculateChildren() error {
 func (n *node) newChildNode(file os.FileInfo) (*node, error) {
 	path := path.Join(n.path, file.Name())
 
-	hash, err := n.calculateHash(path, file)
-	if err != nil {
-		return nil, err
-	}
-
 	node := &node{
 		fs:         n.fs,
 		submodules: n.submodules,
 
 		path:  path,
-		hash:  hash,
 		isDir: file.IsDir(),
+		size:  file.Size(),
+		mode:  file.Mode(),
 	}
 
-	if hash, isSubmodule := n.submodules[path]; isSubmodule {
-		node.hash = append(hash[:], filemode.Submodule.Bytes()...)
+	if _, isSubmodule := n.submodules[path]; isSubmodule {
 		node.isDir = false
 	}
 
 	return node, nil
 }
 
-func (n *node) calculateHash(path string, file os.FileInfo) ([]byte, error) {
-	if file.IsDir() {
-		return make([]byte, 24), nil
-	}
-
-	var hash plumbing.Hash
-	var err error
-	if file.Mode()&os.ModeSymlink != 0 {
-		hash, err = n.doCalculateHashForSymlink(path, file)
-	} else {
-		hash, err = n.doCalculateHashForRegular(path, file)
+func (n *node) calculateHash() {
+	if n.isDir {
+		n.hash = make([]byte, 24)
+		return
 	}
-
+	mode, err := filemode.NewFromOSFileMode(n.mode)
 	if err != nil {
-		return nil, err
+		n.hash = plumbing.ZeroHash[:]
+		return
 	}
-
-	mode, err := filemode.NewFromOSFileMode(file.Mode())
-	if err != nil {
-		return nil, err
+	if submoduleHash, isSubmodule := n.submodules[n.path]; isSubmodule {
+		n.hash = append(submoduleHash[:], filemode.Submodule.Bytes()...)
+		return
 	}
-
-	return append(hash[:], mode.Bytes()...), nil
+	var hash plumbing.Hash
+	if n.mode&os.ModeSymlink != 0 {
+		hash = n.doCalculateHashForSymlink()
+	} else {
+		hash = n.doCalculateHashForRegular()
+	}
+	n.hash = append(hash[:], mode.Bytes()...)
 }
 
-func (n *node) doCalculateHashForRegular(path string, file os.FileInfo) (plumbing.Hash, error) {
-	f, err := n.fs.Open(path)
+func (n *node) doCalculateHashForRegular() plumbing.Hash {
+	f, err := n.fs.Open(n.path)
 	if err != nil {
-		return plumbing.ZeroHash, err
+		return plumbing.ZeroHash
 	}
 
 	defer f.Close()
 
-	h := plumbing.NewHasher(plumbing.BlobObject, file.Size())
+	h := plumbing.NewHasher(plumbing.BlobObject, n.size)
 	if _, err := io.Copy(h, f); err != nil {
-		return plumbing.ZeroHash, err
+		return plumbing.ZeroHash
 	}
 
-	return h.Sum(), nil
+	return h.Sum()
 }
 
-func (n *node) doCalculateHashForSymlink(path string, file os.FileInfo) (plumbing.Hash, error) {
-	target, err := n.fs.Readlink(path)
+func (n *node) doCalculateHashForSymlink() plumbing.Hash {
+	target, err := n.fs.Readlink(n.path)
 	if err != nil {
-		return plumbing.ZeroHash, err
+		return plumbing.ZeroHash
 	}
 
-	h := plumbing.NewHasher(plumbing.BlobObject, file.Size())
+	h := plumbing.NewHasher(plumbing.BlobObject, n.size)
 	if _, err := h.Write([]byte(target)); err != nil {
-		return plumbing.ZeroHash, err
+		return plumbing.ZeroHash
 	}
 
-	return h.Sum(), nil
+	return h.Sum()
 }
 
 func (n *node) String() string {
author	Roman Bataev <roman@bataev.me>	2023-08-06 16:40:23 -0400
committer	Roman Bataev <roman@bataev.me>	2023-08-21 22:30:00 -0400
commit	c5b2e50b8907555538f52356fc04b2a5820b5c6f (patch)
tree	41e9e9b9a0c231f2c86819804b0b6d9bc16823fb /utils
parent	cd6170c6f808453f58dcfd15c6c59345e3df402b (diff)
download	go-git-c5b2e50b8907555538f52356fc04b2a5820b5c6f.tar.gz