diff options
author | Michael Muré <batolettre@gmail.com> | 2020-03-28 19:22:27 +0100 |
---|---|---|
committer | Michael Muré <batolettre@gmail.com> | 2020-03-28 19:22:27 +0100 |
commit | ecde909b0a87c329db3cd62562c7bf3902139320 (patch) | |
tree | 13b0f45e6484dc7393ccd9a63f0f1bc658f98c93 | |
parent | aec81b7039effb59ba81383da0682e0937340962 (diff) | |
download | git-bug-ecde909b0a87c329db3cd62562c7bf3902139320.tar.gz |
query: more robust tokenizer
-rw-r--r-- | query/lexer.go | 69 | ||||
-rw-r--r-- | query/lexer_test.go | 7 |
2 files changed, 58 insertions, 18 deletions
diff --git a/query/lexer.go b/query/lexer.go
index ec05f44e..ca67d641 100644
--- a/query/lexer.go
+++ b/query/lexer.go
@@ -11,13 +11,13 @@ type token struct {
 	value string
 }
 
-// TODO: this lexer implementation behave badly with unmatched quotes.
-// A hand written one would be better instead of relying on strings.FieldsFunc()
-
 // tokenize parse and break a input into tokens ready to be
 // interpreted later by a parser to get the semantic.
 func tokenize(query string) ([]token, error) {
-	fields := splitQuery(query)
+	fields, err := splitQuery(query)
+	if err != nil {
+		return nil, err
+	}
 
 	var tokens []token
 	for _, field := range fields {
@@ -41,30 +41,63 @@ func tokenize(query string) ([]token, error) {
 	return tokens, nil
 }
 
-func splitQuery(query string) []string {
+func splitQuery(query string) ([]string, error) {
 	lastQuote := rune(0)
-	f := func(c rune) bool {
+	inQuote := false
+
+	isToken := func(r rune) bool {
 		switch {
-		case c == lastQuote:
+		case !inQuote && isQuote(r):
+			lastQuote = r
+			inQuote = true
+			return true
+		case inQuote && r == lastQuote:
 			lastQuote = rune(0)
-			return false
-		case lastQuote != rune(0):
-			return false
-		case unicode.In(c, unicode.Quotation_Mark):
-			lastQuote = c
-			return false
+			inQuote = false
+			return true
+		case inQuote:
+			return true
 		default:
-			return unicode.IsSpace(c)
+			return !unicode.IsSpace(r)
 		}
 	}
 
-	return strings.FieldsFunc(query, f)
+	var result []string
+	var token strings.Builder
+	for _, r := range query {
+		if isToken(r) {
+			token.WriteRune(r)
+		} else {
+			if token.Len() > 0 {
+				result = append(result, token.String())
+				token.Reset()
+			}
+		}
+	}
+
+	if inQuote {
+		return nil, fmt.Errorf("unmatched quote")
+	}
+
+	if token.Len() > 0 {
+		result = append(result, token.String())
+	}
+
+	return result, nil
+}
+
+func isQuote(r rune) bool {
+	return r == '"' || r == '\''
 }
 
 func removeQuote(field string) string {
-	if len(field) >= 2 {
-		if field[0] == '"' && field[len(field)-1] == '"' {
-			return field[1 : len(field)-1]
+	runes := []rune(field)
+	if len(runes) >= 2 {
+		r1 := runes[0]
+		r2 := runes[len(runes)-1]
+
+		if r1 == r2 && isQuote(r1) {
+			return string(runes[1 : len(runes)-1])
 		}
 	}
 	return field
diff --git a/query/lexer_test.go b/query/lexer_test.go
index 922e3fc9..36b9ba10 100644
--- a/query/lexer_test.go
+++ b/query/lexer_test.go
@@ -30,6 +30,13 @@ func TestTokenize(t *testing.T) {
 				{"author", "René Descartes"},
 			},
 		},
+
+		// quotes
+		{`key:"value value"`, []token{{"key", "value value"}}},
+		{`key:'value value'`, []token{{"key", "value value"}}},
+		// unmatched quotes
+		{`key:'value value`, nil},
+		{`key:value value'`, nil},
 	}
 
 	for _, tc := range tests {