From 60d40d60e9f91247b61f541888f1469bff44f573 Mon Sep 17 00:00:00 2001 From: Michael Muré Date: Mon, 19 Dec 2022 16:12:49 +0100 Subject: repo: proper reduced interface for full-text indexing Additionally, remove and concentrate quite a lot of complexity from the cache layer into a "per app" single site where to configure how indexing is done. --- repository/index_bleve.go | 154 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 repository/index_bleve.go (limited to 'repository/index_bleve.go') diff --git a/repository/index_bleve.go b/repository/index_bleve.go new file mode 100644 index 00000000..aae41d5f --- /dev/null +++ b/repository/index_bleve.go @@ -0,0 +1,154 @@ +package repository + +import ( + "fmt" + "os" + "strings" + "sync" + "unicode/utf8" + + "github.com/blevesearch/bleve" +) + +var _ Index = &bleveIndex{} + +type bleveIndex struct { + path string + + mu sync.RWMutex + index bleve.Index +} + +func openBleveIndex(path string) (*bleveIndex, error) { + index, err := bleve.Open(path) + if err == nil { + return &bleveIndex{path: path, index: index}, nil + } + + b := &bleveIndex{path: path} + err = b.makeIndex() + if err != nil { + return nil, err + } + + return b, nil +} + +func (b *bleveIndex) makeIndex() error { + err := os.MkdirAll(b.path, os.ModePerm) + if err != nil { + return err + } + + // TODO: follow https://github.com/blevesearch/bleve/issues/1576 recommendations + + mapping := bleve.NewIndexMapping() + mapping.DefaultAnalyzer = "en" + + index, err := bleve.New(b.path, mapping) + if err != nil { + return err + } + b.index = index + return nil +} + +func (b *bleveIndex) IndexOne(id string, texts []string) error { + b.mu.Lock() + defer b.mu.Unlock() + return b._index(b.index.Index, id, texts) +} + +func (b *bleveIndex) IndexBatch() (indexer func(id string, texts []string) error, closer func() error) { + b.mu.Lock() + defer b.mu.Unlock() + + batch := b.index.NewBatch() + + indexer = func(id string, texts []string) error { + return b._index(batch.Index, id, texts) + } + + closer = func() error { + return b.index.Batch(batch) + } + + return indexer, closer +} + +func (b *bleveIndex) _index(indexer func(string, interface{}) error, id string, texts []string) error { + searchable := struct{ Text []string }{Text: texts} + + // See https://github.com/blevesearch/bleve/issues/1576 + var sb strings.Builder + normalize := func(text string) string { + sb.Reset() + for _, field := range strings.Fields(text) { + if utf8.RuneCountInString(field) < 100 { + sb.WriteString(field) + sb.WriteRune(' ') + } + } + return sb.String() + } + + for i, s := range searchable.Text { + searchable.Text[i] = normalize(s) + } + + return indexer(id, searchable) +} + +func (b *bleveIndex) Search(terms []string) ([]string, error) { + b.mu.RLock() + defer b.mu.RUnlock() + + for i, term := range terms { + if strings.Contains(term, " ") { + terms[i] = fmt.Sprintf("\"%s\"", term) + } + } + + query := bleve.NewQueryStringQuery(strings.Join(terms, " ")) + search := bleve.NewSearchRequest(query) + + res, err := b.index.Search(search) + if err != nil { + return nil, err + } + + ids := make([]string, len(res.Hits)) + for i, hit := range res.Hits { + ids[i] = hit.ID + } + + return ids, nil +} + +func (b *bleveIndex) DocCount() (uint64, error) { + return b.index.DocCount() +} + +func (b *bleveIndex) Clear() error { + b.mu.Lock() + defer b.mu.Unlock() + + err := b.index.Close() + if err != nil { + return err + } + + err = os.RemoveAll(b.path) + if err != nil { + return err + } + + return b.makeIndex() +} + +func (b *bleveIndex) Close() error { + b.mu.Lock() + defer b.mu.Unlock() + + return b.index.Close() +} -- cgit