about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Michael Muré <batolettre@gmail.com> 2020-03-28 19:22:27 +0100
committer Michael Muré <batolettre@gmail.com> 2020-03-28 19:22:27 +0100
commit ecde909b0a87c329db3cd62562c7bf3902139320 (patch)
tree 13b0f45e6484dc7393ccd9a63f0f1bc658f98c93
parent aec81b7039effb59ba81383da0682e0937340962 (diff)
download git-bug-ecde909b0a87c329db3cd62562c7bf3902139320.tar.gz
query: more robust tokenizer
-rw-r--r-- query/lexer.go 69
-rw-r--r-- query/lexer_test.go 7
2 files changed, 58 insertions, 18 deletions
diff --git a/query/lexer.go b/query/lexer.go
index ec05f44e..ca67d641 100644
--- a/query/lexer.go
+++ b/query/lexer.go
@@ -11,13 +11,13 @@ type token struct {
value string
}
-// TODO: this lexer implementation behave badly with unmatched quotes.
-// A hand written one would be better instead of relying on strings.FieldsFunc()
-
// tokenize parse and break a input into tokens ready to be
// interpreted later by a parser to get the semantic.
func tokenize(query string) ([]token, error) {
- fields := splitQuery(query)
+ fields, err := splitQuery(query)
+ if err != nil {
+ return nil, err
+ }
var tokens []token
for _, field := range fields {
@@ -41,30 +41,63 @@ func tokenize(query string) ([]token, error) {
return tokens, nil
}
-func splitQuery(query string) []string {
+func splitQuery(query string) ([]string, error) {
lastQuote := rune(0)
- f := func(c rune) bool {
+ inQuote := false
+
+ isToken := func(r rune) bool {
switch {
- case c == lastQuote:
+ case !inQuote && isQuote(r):
+ lastQuote = r
+ inQuote = true
+ return true
+ case inQuote && r == lastQuote:
lastQuote = rune(0)
- return false
- case lastQuote != rune(0):
- return false
- case unicode.In(c, unicode.Quotation_Mark):
- lastQuote = c
- return false
+ inQuote = false
+ return true
+ case inQuote:
+ return true
default:
- return unicode.IsSpace(c)
+ return !unicode.IsSpace(r)
}
}
- return strings.FieldsFunc(query, f)
+ var result []string
+ var token strings.Builder
+ for _, r := range query {
+ if isToken(r) {
+ token.WriteRune(r)
+ } else {
+ if token.Len() > 0 {
+ result = append(result, token.String())
+ token.Reset()
+ }
+ }
+ }
+
+ if inQuote {
+ return nil, fmt.Errorf("unmatched quote")
+ }
+
+ if token.Len() > 0 {
+ result = append(result, token.String())
+ }
+
+ return result, nil
+}
+
+func isQuote(r rune) bool {
+ return r == '"' || r == '\''
}
func removeQuote(field string) string {
- if len(field) >= 2 {
- if field[0] == '"' && field[len(field)-1] == '"' {
- return field[1 : len(field)-1]
+ runes := []rune(field)
+ if len(runes) >= 2 {
+ r1 := runes[0]
+ r2 := runes[len(runes)-1]
+
+ if r1 == r2 && isQuote(r1) {
+ return string(runes[1 : len(runes)-1])
}
}
return field
diff --git a/query/lexer_test.go b/query/lexer_test.go
index 922e3fc9..36b9ba10 100644
--- a/query/lexer_test.go
+++ b/query/lexer_test.go
@@ -30,6 +30,13 @@ func TestTokenize(t *testing.T) {
{"author", "René Descartes"},
},
},
+
+ // quotes
+ {`key:"value value"`, []token{{"key", "value value"}}},
+ {`key:'value value'`, []token{{"key", "value value"}}},
+ // unmatched quotes
+ {`key:'value value`, nil},
+ {`key:value value'`, nil},
}
for _, tc := range tests {