From 60d40d60e9f91247b61f541888f1469bff44f573 Mon Sep 17 00:00:00 2001 From: Michael Muré Date: Mon, 19 Dec 2022 16:12:49 +0100 Subject: repo: proper reduced interface for full-text indexing Additionally, remove and concentrate quite a lot of complexity from the cache layer into a "per app" single site where to configure how indexing is done. --- repository/gogit.go | 55 +++------------- repository/gogit_test.go | 11 +--- repository/index_bleve.go | 154 +++++++++++++++++++++++++++++++++++++++++++++ repository/mock_repo.go | 82 +++++++++++++++++------- repository/repo.go | 35 ++++++++--- repository/repo_testing.go | 47 +++++++++++++- 6 files changed, 297 insertions(+), 87 deletions(-) create mode 100644 repository/index_bleve.go (limited to 'repository') diff --git a/repository/gogit.go b/repository/gogit.go index c1f1fe37..746204df 100644 --- a/repository/gogit.go +++ b/repository/gogit.go @@ -12,7 +12,6 @@ import ( "time" "github.com/ProtonMail/go-crypto/openpgp" - "github.com/blevesearch/bleve" "github.com/go-git/go-billy/v5" "github.com/go-git/go-billy/v5/osfs" gogit "github.com/go-git/go-git/v5" @@ -45,7 +44,7 @@ type GoGitRepo struct { clocks map[string]lamport.Clock indexesMutex sync.Mutex - indexes map[string]bleve.Index + indexes map[string]Index keyring Keyring localStorage billy.Filesystem @@ -75,7 +74,7 @@ func OpenGoGitRepo(path, namespace string, clockLoaders []ClockLoader) (*GoGitRe r: r, path: path, clocks: make(map[string]lamport.Clock), - indexes: make(map[string]bleve.Index), + indexes: make(map[string]Index), keyring: k, localStorage: osfs.New(filepath.Join(path, namespace)), } @@ -129,7 +128,7 @@ func InitGoGitRepo(path, namespace string) (*GoGitRepo, error) { r: r, path: filepath.Join(path, ".git"), clocks: make(map[string]lamport.Clock), - indexes: make(map[string]bleve.Index), + indexes: make(map[string]Index), keyring: k, localStorage: osfs.New(filepath.Join(path, ".git", namespace)), }, nil @@ -154,7 +153,7 @@ func InitBareGoGitRepo(path, namespace string) (*GoGitRepo, error) { r: r, path: path, clocks: make(map[string]lamport.Clock), - indexes: make(map[string]bleve.Index), + indexes: make(map[string]Index), keyring: k, localStorage: osfs.New(filepath.Join(path, namespace)), }, nil @@ -323,8 +322,7 @@ func (repo *GoGitRepo) LocalStorage() billy.Filesystem { return repo.localStorage } -// GetBleveIndex return a bleve.Index that can be used to index documents -func (repo *GoGitRepo) GetBleveIndex(name string) (bleve.Index, error) { +func (repo *GoGitRepo) GetIndex(name string) (Index, error) { repo.indexesMutex.Lock() defer repo.indexesMutex.Unlock() @@ -334,50 +332,11 @@ func (repo *GoGitRepo) GetBleveIndex(name string) (bleve.Index, error) { path := filepath.Join(repo.localStorage.Root(), indexPath, name) - index, err := bleve.Open(path) + index, err := openBleveIndex(path) if err == nil { repo.indexes[name] = index - return index, nil - } - - err = os.MkdirAll(path, os.ModePerm) - if err != nil { - return nil, err - } - - mapping := bleve.NewIndexMapping() - mapping.DefaultAnalyzer = "en" - - index, err = bleve.New(path, mapping) - if err != nil { - return nil, err - } - - repo.indexes[name] = index - - return index, nil -} - -// ClearBleveIndex will wipe the given index -func (repo *GoGitRepo) ClearBleveIndex(name string) error { - repo.indexesMutex.Lock() - defer repo.indexesMutex.Unlock() - - if index, ok := repo.indexes[name]; ok { - err := index.Close() - if err != nil { - return err - } - delete(repo.indexes, name) } - - path := filepath.Join(repo.localStorage.Root(), indexPath, name) - err := os.RemoveAll(path) - if err != nil { - return err - } - - return nil + return index, err } // FetchRefs fetch git refs matching a directory prefix to a remote diff --git a/repository/gogit_test.go b/repository/gogit_test.go index a3de0a03..02bd42fd 100644 --- a/repository/gogit_test.go +++ b/repository/gogit_test.go @@ -65,24 +65,19 @@ func TestGoGitRepo_Indexes(t *testing.T) { plainRoot := goGitRepoDir(t, repo) // Can create indices - indexA, err := repo.GetBleveIndex("a") + indexA, err := repo.GetIndex("a") require.NoError(t, err) require.NotZero(t, indexA) require.FileExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a", "index_meta.json")) require.FileExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a", "store")) - indexB, err := repo.GetBleveIndex("b") + indexB, err := repo.GetIndex("b") require.NoError(t, err) require.NotZero(t, indexB) require.DirExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "b")) // Can get an existing index - indexA, err = repo.GetBleveIndex("a") + indexA, err = repo.GetIndex("a") require.NoError(t, err) require.NotZero(t, indexA) - - // Can delete an index - err = repo.ClearBleveIndex("a") - require.NoError(t, err) - require.NoDirExists(t, filepath.Join(plainRoot, ".git", namespace, "indexes", "a")) } diff --git a/repository/index_bleve.go b/repository/index_bleve.go new file mode 100644 index 00000000..aae41d5f --- /dev/null +++ b/repository/index_bleve.go @@ -0,0 +1,154 @@ +package repository + +import ( + "fmt" + "os" + "strings" + "sync" + "unicode/utf8" + + "github.com/blevesearch/bleve" +) + +var _ Index = &bleveIndex{} + +type bleveIndex struct { + path string + + mu sync.RWMutex + index bleve.Index +} + +func openBleveIndex(path string) (*bleveIndex, error) { + index, err := bleve.Open(path) + if err == nil { + return &bleveIndex{path: path, index: index}, nil + } + + b := &bleveIndex{path: path} + err = b.makeIndex() + if err != nil { + return nil, err + } + + return b, nil +} + +func (b *bleveIndex) makeIndex() error { + err := os.MkdirAll(b.path, os.ModePerm) + if err != nil { + return err + } + + // TODO: follow https://github.com/blevesearch/bleve/issues/1576 recommendations + + mapping := bleve.NewIndexMapping() + mapping.DefaultAnalyzer = "en" + + index, err := bleve.New(b.path, mapping) + if err != nil { + return err + } + b.index = index + return nil +} + +func (b *bleveIndex) IndexOne(id string, texts []string) error { + b.mu.Lock() + defer b.mu.Unlock() + return b._index(b.index.Index, id, texts) +} + +func (b *bleveIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) { + b.mu.Lock() + defer b.mu.Unlock() + + batch := b.index.NewBatch() + + indexer = func(id string, texts []string) error { + return b._index(batch.Index, id, texts) + } + + closer = func() error { + return b.index.Batch(batch) + } + + return indexer, closer +} + +func (b *bleveIndex) _index(indexer func(string, interface{}) error, id string, texts []string) error { + searchable := struct{ Text []string }{Text: texts} + + // See https://github.com/blevesearch/bleve/issues/1576 + var sb strings.Builder + normalize := func(text string) string { + sb.Reset() + for _, field := range strings.Fields(text) { + if utf8.RuneCountInString(field) < 100 { + sb.WriteString(field) + sb.WriteRune(' ') + } + } + return sb.String() + } + + for i, s := range searchable.Text { + searchable.Text[i] = normalize(s) + } + + return indexer(id, searchable) +} + +func (b *bleveIndex) Search(terms []string) ([]string, error) { + b.mu.RLock() + defer b.mu.RUnlock() + + for i, term := range terms { + if strings.Contains(term, " ") { + terms[i] = fmt.Sprintf("\"%s\"", term) + } + } + + query := bleve.NewQueryStringQuery(strings.Join(terms, " ")) + search := bleve.NewSearchRequest(query) + + res, err := b.index.Search(search) + if err != nil { + return nil, err + } + + ids := make([]string, len(res.Hits)) + for i, hit := range res.Hits { + ids[i] = hit.ID + } + + return ids, nil +} + +func (b *bleveIndex) DocCount() (uint64, error) { + return b.index.DocCount() +} + +func (b *bleveIndex) Clear() error { + b.mu.Lock() + defer b.mu.Unlock() + + err := b.index.Close() + if err != nil { + return err + } + + err = os.RemoveAll(b.path) + if err != nil { + return err + } + + return b.makeIndex() +} + +func (b *bleveIndex) Close() error { + b.mu.Lock() + defer b.mu.Unlock() + + return b.index.Close() +} diff --git a/repository/mock_repo.go b/repository/mock_repo.go index 3d7f0e73..f869795d 100644 --- a/repository/mock_repo.go +++ b/repository/mock_repo.go @@ -9,7 +9,6 @@ import ( "github.com/99designs/keyring" "github.com/ProtonMail/go-crypto/openpgp" - "github.com/blevesearch/bleve" "github.com/go-git/go-billy/v5" "github.com/go-git/go-billy/v5/memfs" @@ -25,7 +24,7 @@ type mockRepo struct { *mockRepoKeyring *mockRepoCommon *mockRepoStorage - *mockRepoBleve + *mockRepoIndex *mockRepoData *mockRepoClock *mockRepoTest @@ -39,7 +38,7 @@ func NewMockRepo() *mockRepo { mockRepoKeyring: NewMockRepoKeyring(), mockRepoCommon: NewMockRepoCommon(), mockRepoStorage: NewMockRepoStorage(), - mockRepoBleve: newMockRepoBleve(), + mockRepoIndex: newMockRepoIndex(), mockRepoData: NewMockRepoData(), mockRepoClock: NewMockRepoClock(), mockRepoTest: NewMockRepoTest(), @@ -135,20 +134,20 @@ func (m *mockRepoStorage) LocalStorage() billy.Filesystem { return m.localFs } -var _ RepoBleve = &mockRepoBleve{} +var _ RepoIndex = &mockRepoIndex{} -type mockRepoBleve struct { +type mockRepoIndex struct { indexesMutex sync.Mutex - indexes map[string]bleve.Index + indexes map[string]Index } -func newMockRepoBleve() *mockRepoBleve { - return &mockRepoBleve{ - indexes: make(map[string]bleve.Index), +func newMockRepoIndex() *mockRepoIndex { + return &mockRepoIndex{ + indexes: make(map[string]Index), } } -func (m *mockRepoBleve) GetBleveIndex(name string) (bleve.Index, error) { +func (m *mockRepoIndex) GetIndex(name string) (Index, error) { m.indexesMutex.Lock() defer m.indexesMutex.Unlock() @@ -156,24 +155,63 @@ func (m *mockRepoBleve) GetBleveIndex(name string) (bleve.Index, error) { return index, nil } - mapping := bleve.NewIndexMapping() - mapping.DefaultAnalyzer = "en" + index := newIndex() + m.indexes[name] = index + return index, nil +} - index, err := bleve.NewMemOnly(mapping) - if err != nil { - return nil, err - } +var _ Index = &mockIndex{} - m.indexes[name] = index +type mockIndex map[string][]string - return index, nil +func newIndex() *mockIndex { + m := make(map[string][]string) + return (*mockIndex)(&m) } -func (m *mockRepoBleve) ClearBleveIndex(name string) error { - m.indexesMutex.Lock() - defer m.indexesMutex.Unlock() +func (m *mockIndex) IndexOne(id string, texts []string) error { + (*m)[id] = texts + return nil +} + +func (m *mockIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) { + indexer = func(id string, texts []string) error { + (*m)[id] = texts + return nil + } + closer = func() error { return nil } + return indexer, closer +} + +func (m *mockIndex) Search(terms []string) (ids []string, err error) { +loop: + for id, texts := range *m { + for _, text := range texts { + for _, s := range strings.Fields(text) { + for _, term := range terms { + if s == term { + ids = append(ids, id) + continue loop + } + } + } + } + } + return ids, nil +} + +func (m *mockIndex) DocCount() (uint64, error) { + return uint64(len(*m)), nil +} + +func (m *mockIndex) Clear() error { + for k, _ := range *m { + delete(*m, k) + } + return nil +} - delete(m.indexes, name) +func (m *mockIndex) Close() error { return nil } diff --git a/repository/repo.go b/repository/repo.go index 2f90b437..42ed194c 100644 --- a/repository/repo.go +++ b/repository/repo.go @@ -6,7 +6,6 @@ import ( "io" "github.com/ProtonMail/go-crypto/openpgp" - "github.com/blevesearch/bleve" "github.com/go-git/go-billy/v5" "github.com/MichaelMure/git-bug/util/lamport" @@ -25,7 +24,7 @@ type Repo interface { RepoKeyring RepoCommon RepoStorage - RepoBleve + RepoIndex RepoData Close() error @@ -81,13 +80,33 @@ type RepoStorage interface { LocalStorage() billy.Filesystem } -// RepoBleve give access to Bleve to implement full-text search indexes. -type RepoBleve interface { - // GetBleveIndex return a bleve.Index that can be used to index documents - GetBleveIndex(name string) (bleve.Index, error) +// RepoIndex gives access to full-text search indexes +type RepoIndex interface { + GetIndex(name string) (Index, error) +} + +// Index is a full-text search index +type Index interface { + // IndexOne indexes one document, for the given ID. If the document already exist, + // it replaces it. + IndexOne(id string, texts []string) error + + // IndexBatch start a batch indexing. The returned indexer function is used the same + // way as IndexOne, and the closer function complete the batch insertion. + IndexBatch() (indexer func(id string, texts []string) error, closer func() error) + + // Search returns the list of IDs matching the given terms. + Search(terms []string) (ids []string, err error) - // ClearBleveIndex will wipe the given index - ClearBleveIndex(name string) error + // DocCount returns the number of document in the index. + DocCount() (uint64, error) + + // Clear empty the index. + Clear() error + + // Close closes the index and make sure everything is safely written. After this call + // the index can't be used anymore. + Close() error } type Commit struct { diff --git a/repository/repo_testing.go b/repository/repo_testing.go index 5d51d23f..c4ed6d77 100644 --- a/repository/repo_testing.go +++ b/repository/repo_testing.go @@ -10,7 +10,6 @@ import ( "github.com/MichaelMure/git-bug/util/lamport" ) -// TODO: add tests for RepoBleve // TODO: add tests for RepoStorage type RepoCreator func(t testing.TB, bare bool) TestedRepo @@ -33,6 +32,10 @@ func RepoTest(t *testing.T, creator RepoCreator) { RepoConfigTest(t, repo) }) + t.Run("Index", func(t *testing.T) { + RepoIndexTest(t, repo) + }) + t.Run("Clocks", func(t *testing.T) { RepoClockTest(t, repo) }) @@ -234,6 +237,48 @@ func RepoDataSignatureTest(t *testing.T, repo RepoData) { require.Error(t, err) } +func RepoIndexTest(t *testing.T, repo RepoIndex) { + idx, err := repo.GetIndex("a") + require.NoError(t, err) + + // simple indexing + err = idx.IndexOne("id1", []string{"foo", "bar", "foobar barfoo"}) + require.NoError(t, err) + + // batched indexing + indexer, closer := idx.IndexBatch() + err = indexer("id2", []string{"hello", "foo bar"}) + require.NoError(t, err) + err = indexer("id3", []string{"Hola", "Esta bien"}) + require.NoError(t, err) + err = closer() + require.NoError(t, err) + + // search + res, err := idx.Search([]string{"foobar"}) + require.NoError(t, err) + require.ElementsMatch(t, []string{"id1"}, res) + + res, err = idx.Search([]string{"foo"}) + require.NoError(t, err) + require.ElementsMatch(t, []string{"id1", "id2"}, res) + + // re-indexing an item replace previous versions + err = idx.IndexOne("id2", []string{"hello"}) + require.NoError(t, err) + + res, err = idx.Search([]string{"foo"}) + require.NoError(t, err) + require.ElementsMatch(t, []string{"id1"}, res) + + err = idx.Clear() + require.NoError(t, err) + + res, err = idx.Search([]string{"foo"}) + require.NoError(t, err) + require.Empty(t, res) +} + // helper to test a RepoClock func RepoClockTest(t *testing.T, repo RepoClock) { allClocks, err := repo.AllClocks() -- cgit