aboutsummaryrefslogtreecommitdiffstats
path: root/repository
diff options
context:
space:
mode:
authorMichael Muré <batolettre@gmail.com>2022-12-19 16:12:49 +0100
committerMichael Muré <batolettre@gmail.com>2022-12-19 18:09:37 +0100
commit60d40d60e9f91247b61f541888f1469bff44f573 (patch)
treea3cdfd90a06358e7648a4a85a36cee680e121642 /repository
parent3b62fe0a4c7b3f1ae4f9a163b6c483444b5a8d20 (diff)
downloadgit-bug-60d40d60e9f91247b61f541888f1469bff44f573.tar.gz
repo: proper reduced interface for full-text indexing
Additionally, remove and concentrate quite a lot of complexity from the cache layer into a "per app" single site where to configure how indexing is done.
Diffstat (limited to 'repository')
-rw-r--r--repository/gogit.go55
-rw-r--r--repository/gogit_test.go11
-rw-r--r--repository/index_bleve.go154
-rw-r--r--repository/mock_repo.go82
-rw-r--r--repository/repo.go35
-rw-r--r--repository/repo_testing.go47
6 files changed, 297 insertions, 87 deletions
diff --git a/repository/gogit.go b/repository/gogit.go
index c1f1fe37..746204df 100644
--- a/repository/gogit.go
+++ b/repository/gogit.go
@@ -12,7 +12,6 @@ import (
"time"
"github.com/ProtonMail/go-crypto/openpgp"
- "github.com/blevesearch/bleve"
"github.com/go-git/go-billy/v5"
"github.com/go-git/go-billy/v5/osfs"
gogit "github.com/go-git/go-git/v5"
@@ -45,7 +44,7 @@ type GoGitRepo struct {
clocks map[string]lamport.Clock
indexesMutex sync.Mutex
- indexes map[string]bleve.Index
+ indexes map[string]Index
keyring Keyring
localStorage billy.Filesystem
@@ -75,7 +74,7 @@ func OpenGoGitRepo(path, namespace string, clockLoaders []ClockLoader) (*GoGitRe
r: r,
path: path,
clocks: make(map[string]lamport.Clock),
- indexes: make(map[string]bleve.Index),
+ indexes: make(map[string]Index),
keyring: k,
localStorage: osfs.New(filepath.Join(path, namespace)),
}
@@ -129,7 +128,7 @@ func InitGoGitRepo(path, namespace string) (*GoGitRepo, error) {
r: r,
path: filepath.Join(path, ".git"),
clocks: make(map[string]lamport.Clock),
- indexes: make(map[string]bleve.Index),
+ indexes: make(map[string]Index),
keyring: k,
localStorage: osfs.New(filepath.Join(path, ".git", namespace)),
}, nil
@@ -154,7 +153,7 @@ func InitBareGoGitRepo(path, namespace string) (*GoGitRepo, error) {
r: r,
path: path,
clocks: make(map[string]lamport.Clock),
- indexes: make(map[string]bleve.Index),
+ indexes: make(map[string]Index),
keyring: k,
localStorage: osfs.New(filepath.Join(path, namespace)),
}, nil
@@ -323,8 +322,7 @@ func (repo *GoGitRepo) LocalStorage() billy.Filesystem {
return repo.localStorage
}
-// GetBleveIndex return a bleve.Index that can be used to index documents
-func (repo *GoGitRepo) GetBleveIndex(name string) (bleve.Index, error) {
+func (repo *GoGitRepo) GetIndex(name string) (Index, error) {
repo.indexesMutex.Lock()
defer repo.indexesMutex.Unlock()
@@ -334,50 +332,11 @@ func (repo *GoGitRepo) GetBleveIndex(name string) (bleve.Index, error) {
path := filepath.Join(repo.localStorage.Root(), indexPath, name)
- index, err := bleve.Open(path)
+ index, err := openBleveIndex(path)
if err == nil {
repo.indexes[name] = index
- return index, nil
- }
-
- err = os.MkdirAll(path, os.ModePerm)
- if err != nil {
- return nil, err
- }
-
- mapping := bleve.NewIndexMapping()
- mapping.DefaultAnalyzer = "en"
-
- index, err = bleve.New(path, mapping)
- if err != nil {
- return nil, err
- }
-
- repo.indexes[name] = index
-
- return index, nil
-}
-
-// ClearBleveIndex will wipe the given index
-func (repo *GoGitRepo) ClearBleveIndex(name string) error {
- repo.indexesMutex.Lock()
- defer repo.indexesMutex.Unlock()
-
- if index, ok := repo.indexes[name]; ok {
- err := index.Close()
- if err != nil {
- return err
- }
- delete(repo.indexes, name)
}
-
- path := filepath.Join(repo.localStorage.Root(), indexPath, name)
- err := os.RemoveAll(path)
- if err != nil {
- return err
- }
-
- return nil
+ return index, err
}
// FetchRefs fetch git refs matching a directory prefix to a remote
diff --git a/repository/gogit_test.go b/repository/gogit_test.go
index a3de0a03..02bd42fd 100644
--- a/repository/gogit_test.go
+++ b/repository/gogit_test.go
@@ -65,24 +65,19 @@ func TestGoGitRepo_Indexes(t *testing.T) {
plainRoot := goGitRepoDir(t, repo)
// Can create indices
- indexA, err := repo.GetBleveIndex("a")
+ indexA, err := repo.GetIndex("a")
require.NoError(t, err)
require.NotZero(t, indexA)
require.FileExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a", "index_meta.json"))
require.FileExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a", "store"))
- indexB, err := repo.GetBleveIndex("b")
+ indexB, err := repo.GetIndex("b")
require.NoError(t, err)
require.NotZero(t, indexB)
require.DirExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "b"))
// Can get an existing index
- indexA, err = repo.GetBleveIndex("a")
+ indexA, err = repo.GetIndex("a")
require.NoError(t, err)
require.NotZero(t, indexA)
-
- // Can delete an index
- err = repo.ClearBleveIndex("a")
- require.NoError(t, err)
- require.NoDirExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a"))
}
diff --git a/repository/index_bleve.go b/repository/index_bleve.go
new file mode 100644
index 00000000..aae41d5f
--- /dev/null
+++ b/repository/index_bleve.go
@@ -0,0 +1,154 @@
+package repository
+
+import (
+ "fmt"
+ "os"
+ "strings"
+ "sync"
+ "unicode/utf8"
+
+ "github.com/blevesearch/bleve"
+)
+
+var _ Index = &bleveIndex{}
+
+type bleveIndex struct {
+ path string
+
+ mu sync.RWMutex
+ index bleve.Index
+}
+
+func openBleveIndex(path string) (*bleveIndex, error) {
+ index, err := bleve.Open(path)
+ if err == nil {
+ return &bleveIndex{path: path, index: index}, nil
+ }
+
+ b := &bleveIndex{path: path}
+ err = b.makeIndex()
+ if err != nil {
+ return nil, err
+ }
+
+ return b, nil
+}
+
+func (b *bleveIndex) makeIndex() error {
+ err := os.MkdirAll(b.path, os.ModePerm)
+ if err != nil {
+ return err
+ }
+
+ // TODO: follow https://github.com/blevesearch/bleve/issues/1576 recommendations
+
+ mapping := bleve.NewIndexMapping()
+ mapping.DefaultAnalyzer = "en"
+
+ index, err := bleve.New(b.path, mapping)
+ if err != nil {
+ return err
+ }
+ b.index = index
+ return nil
+}
+
+func (b *bleveIndex) IndexOne(id string, texts []string) error {
+ b.mu.Lock()
+ defer b.mu.Unlock()
+ return b._index(b.index.Index, id, texts)
+}
+
+func (b *bleveIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) {
+ b.mu.Lock()
+ defer b.mu.Unlock()
+
+ batch := b.index.NewBatch()
+
+ indexer = func(id string, texts []string) error {
+ return b._index(batch.Index, id, texts)
+ }
+
+ closer = func() error {
+ return b.index.Batch(batch)
+ }
+
+ return indexer, closer
+}
+
+func (b *bleveIndex) _index(indexer func(string, interface{}) error, id string, texts []string) error {
+ searchable := struct{ Text []string }{Text: texts}
+
+ // See https://github.com/blevesearch/bleve/issues/1576
+ var sb strings.Builder
+ normalize := func(text string) string {
+ sb.Reset()
+ for _, field := range strings.Fields(text) {
+ if utf8.RuneCountInString(field) < 100 {
+ sb.WriteString(field)
+ sb.WriteRune(' ')
+ }
+ }
+ return sb.String()
+ }
+
+ for i, s := range searchable.Text {
+ searchable.Text[i] = normalize(s)
+ }
+
+ return indexer(id, searchable)
+}
+
+func (b *bleveIndex) Search(terms []string) ([]string, error) {
+ b.mu.RLock()
+ defer b.mu.RUnlock()
+
+ for i, term := range terms {
+ if strings.Contains(term, " ") {
+ terms[i] = fmt.Sprintf("\"%s\"", term)
+ }
+ }
+
+ query := bleve.NewQueryStringQuery(strings.Join(terms, " "))
+ search := bleve.NewSearchRequest(query)
+
+ res, err := b.index.Search(search)
+ if err != nil {
+ return nil, err
+ }
+
+ ids := make([]string, len(res.Hits))
+ for i, hit := range res.Hits {
+ ids[i] = hit.ID
+ }
+
+ return ids, nil
+}
+
+func (b *bleveIndex) DocCount() (uint64, error) {
+ return b.index.DocCount()
+}
+
+func (b *bleveIndex) Clear() error {
+ b.mu.Lock()
+ defer b.mu.Unlock()
+
+ err := b.index.Close()
+ if err != nil {
+ return err
+ }
+
+ err = os.RemoveAll(b.path)
+ if err != nil {
+ return err
+ }
+
+ return b.makeIndex()
+}
+
+func (b *bleveIndex) Close() error {
+ b.mu.Lock()
+ defer b.mu.Unlock()
+
+ return b.index.Close()
+}
diff --git a/repository/mock_repo.go b/repository/mock_repo.go
index 3d7f0e73..f869795d 100644
--- a/repository/mock_repo.go
+++ b/repository/mock_repo.go
@@ -9,7 +9,6 @@ import (
"github.com/99designs/keyring"
"github.com/ProtonMail/go-crypto/openpgp"
- "github.com/blevesearch/bleve"
"github.com/go-git/go-billy/v5"
"github.com/go-git/go-billy/v5/memfs"
@@ -25,7 +24,7 @@ type mockRepo struct {
*mockRepoKeyring
*mockRepoCommon
*mockRepoStorage
- *mockRepoBleve
+ *mockRepoIndex
*mockRepoData
*mockRepoClock
*mockRepoTest
@@ -39,7 +38,7 @@ func NewMockRepo() *mockRepo {
mockRepoKeyring: NewMockRepoKeyring(),
mockRepoCommon: NewMockRepoCommon(),
mockRepoStorage: NewMockRepoStorage(),
- mockRepoBleve: newMockRepoBleve(),
+ mockRepoIndex: newMockRepoIndex(),
mockRepoData: NewMockRepoData(),
mockRepoClock: NewMockRepoClock(),
mockRepoTest: NewMockRepoTest(),
@@ -135,20 +134,20 @@ func (m *mockRepoStorage) LocalStorage() billy.Filesystem {
return m.localFs
}
-var _ RepoBleve = &mockRepoBleve{}
+var _ RepoIndex = &mockRepoIndex{}
-type mockRepoBleve struct {
+type mockRepoIndex struct {
indexesMutex sync.Mutex
- indexes map[string]bleve.Index
+ indexes map[string]Index
}
-func newMockRepoBleve() *mockRepoBleve {
- return &mockRepoBleve{
- indexes: make(map[string]bleve.Index),
+func newMockRepoIndex() *mockRepoIndex {
+ return &mockRepoIndex{
+ indexes: make(map[string]Index),
}
}
-func (m *mockRepoBleve) GetBleveIndex(name string) (bleve.Index, error) {
+func (m *mockRepoIndex) GetIndex(name string) (Index, error) {
m.indexesMutex.Lock()
defer m.indexesMutex.Unlock()
@@ -156,24 +155,63 @@ func (m *mockRepoBleve) GetBleveIndex(name string) (bleve.Index, error) {
return index, nil
}
- mapping := bleve.NewIndexMapping()
- mapping.DefaultAnalyzer = "en"
+ index := newIndex()
+ m.indexes[name] = index
+ return index, nil
+}
- index, err := bleve.NewMemOnly(mapping)
- if err != nil {
- return nil, err
- }
+var _ Index = &mockIndex{}
- m.indexes[name] = index
+type mockIndex map[string][]string
- return index, nil
+func newIndex() *mockIndex {
+ m := make(map[string][]string)
+ return (*mockIndex)(&m)
}
-func (m *mockRepoBleve) ClearBleveIndex(name string) error {
- m.indexesMutex.Lock()
- defer m.indexesMutex.Unlock()
+func (m *mockIndex) IndexOne(id string, texts []string) error {
+ (*m)[id] = texts
+ return nil
+}
+
+func (m *mockIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) {
+ indexer = func(id string, texts []string) error {
+ (*m)[id] = texts
+ return nil
+ }
+ closer = func() error { return nil }
+ return indexer, closer
+}
+
+func (m *mockIndex) Search(terms []string) (ids []string, err error) {
+loop:
+ for id, texts := range *m {
+ for _, text := range texts {
+ for _, s := range strings.Fields(text) {
+ for _, term := range terms {
+ if s == term {
+ ids = append(ids, id)
+ continue loop
+ }
+ }
+ }
+ }
+ }
+ return ids, nil
+}
+
+func (m *mockIndex) DocCount() (uint64, error) {
+ return uint64(len(*m)), nil
+}
+
+func (m *mockIndex) Clear() error {
+ for k, _ := range *m {
+ delete(*m, k)
+ }
+ return nil
+}
- delete(m.indexes, name)
+func (m *mockIndex) Close() error {
return nil
}
diff --git a/repository/repo.go b/repository/repo.go
index 2f90b437..42ed194c 100644
--- a/repository/repo.go
+++ b/repository/repo.go
@@ -6,7 +6,6 @@ import (
"io"
"github.com/ProtonMail/go-crypto/openpgp"
- "github.com/blevesearch/bleve"
"github.com/go-git/go-billy/v5"
"github.com/MichaelMure/git-bug/util/lamport"
@@ -25,7 +24,7 @@ type Repo interface {
RepoKeyring
RepoCommon
RepoStorage
- RepoBleve
+ RepoIndex
RepoData
Close() error
@@ -81,13 +80,33 @@ type RepoStorage interface {
LocalStorage() billy.Filesystem
}
-// RepoBleve give access to Bleve to implement full-text search indexes.
-type RepoBleve interface {
- // GetBleveIndex return a bleve.Index that can be used to index documents
- GetBleveIndex(name string) (bleve.Index, error)
+// RepoIndex gives access to full-text search indexes
+type RepoIndex interface {
+ GetIndex(name string) (Index, error)
+}
+
+// Index is a full-text search index
+type Index interface {
+ // IndexOne indexes one document, for the given ID. If the document already exist,
+ // it replaces it.
+ IndexOne(id string, texts []string) error
+
+ // IndexBatch start a batch indexing. The returned indexer function is used the same
+ // way as IndexOne, and the closer function complete the batch insertion.
+ IndexBatch() (indexer func(id string, texts []string) error, closer func() error)
+
+ // Search returns the list of IDs matching the given terms.
+ Search(terms []string) (ids []string, err error)
- // ClearBleveIndex will wipe the given index
- ClearBleveIndex(name string) error
+ // DocCount returns the number of document in the index.
+ DocCount() (uint64, error)
+
+ // Clear empty the index.
+ Clear() error
+
+ // Close closes the index and make sure everything is safely written. After this call
+ // the index can't be used anymore.
+ Close() error
}
type Commit struct {
diff --git a/repository/repo_testing.go b/repository/repo_testing.go
index 5d51d23f..c4ed6d77 100644
--- a/repository/repo_testing.go
+++ b/repository/repo_testing.go
@@ -10,7 +10,6 @@ import (
"github.com/MichaelMure/git-bug/util/lamport"
)
-// TODO: add tests for RepoBleve
// TODO: add tests for RepoStorage
type RepoCreator func(t testing.TB, bare bool) TestedRepo
@@ -33,6 +32,10 @@ func RepoTest(t *testing.T, creator RepoCreator) {
RepoConfigTest(t, repo)
})
+ t.Run("Index", func(t *testing.T) {
+ RepoIndexTest(t, repo)
+ })
+
t.Run("Clocks", func(t *testing.T) {
RepoClockTest(t, repo)
})
@@ -234,6 +237,48 @@ func RepoDataSignatureTest(t *testing.T, repo RepoData) {
require.Error(t, err)
}
+func RepoIndexTest(t *testing.T, repo RepoIndex) {
+ idx, err := repo.GetIndex("a")
+ require.NoError(t, err)
+
+ // simple indexing
+ err = idx.IndexOne("id1", []string{"foo", "bar", "foobar barfoo"})
+ require.NoError(t, err)
+
+ // batched indexing
+ indexer, closer := idx.IndexBatch()
+ err = indexer("id2", []string{"hello", "foo bar"})
+ require.NoError(t, err)
+ err = indexer("id3", []string{"Hola", "Esta bien"})
+ require.NoError(t, err)
+ err = closer()
+ require.NoError(t, err)
+
+ // search
+ res, err := idx.Search([]string{"foobar"})
+ require.NoError(t, err)
+ require.ElementsMatch(t, []string{"id1"}, res)
+
+ res, err = idx.Search([]string{"foo"})
+ require.NoError(t, err)
+ require.ElementsMatch(t, []string{"id1", "id2"}, res)
+
+ // re-indexing an item replace previous versions
+ err = idx.IndexOne("id2", []string{"hello"})
+ require.NoError(t, err)
+
+ res, err = idx.Search([]string{"foo"})
+ require.NoError(t, err)
+ require.ElementsMatch(t, []string{"id1"}, res)
+
+ err = idx.Clear()
+ require.NoError(t, err)
+
+ res, err = idx.Search([]string{"foo"})
+ require.NoError(t, err)
+ require.Empty(t, res)
+}
+
// helper to test a RepoClock
func RepoClockTest(t *testing.T, repo RepoClock) {
allClocks, err := repo.AllClocks()