diff options
author | Robin Jarry <robin@jarry.cc> | 2023-06-15 00:03:58 +0200 |
---|---|---|
committer | Robin Jarry <robin@jarry.cc> | 2023-06-25 22:53:19 +0200 |
commit | 9cbfb190f65eba73e42b034a39de4b4d35fd404c (patch) | |
tree | ed043239ed50991401ddb14001b6fddd534e1d67 | |
parent | 9d0fdffeef2f4ae1f8c1b57f558e68ba4bd9ad28 (diff) | |
download | aerc-9cbfb190f65eba73e42b034a39de4b4d35fd404c.tar.gz |
parse-links: be more strict with url parsing
Reuse the same logic as colorize.c to allow parsing markdown links.
Signed-off-by: Robin Jarry <robin@jarry.cc>
Acked-by: Moritz Poldrack <moritz@poldrack.dev>
-rw-r--r-- | lib/parse/hyperlinks.go | 112 | ||||
-rw-r--r-- | lib/parse/hyperlinks_test.go | 15 |
2 files changed, 108 insertions, 19 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go index af8c3006..dd334dd4 100644 --- a/lib/parse/hyperlinks.go +++ b/lib/parse/hyperlinks.go @@ -1,40 +1,114 @@ package parse import ( - "bufio" "bytes" "io" - "net/url" "regexp" - "strings" + "sort" ) -var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`) +// Partial regexp to match the beginning of URLs and email addresses. +// The remainder of the matched URLs/emails is parsed manually. +var urlRe = regexp.MustCompile( + `([a-z]{2,8})://` + // URL start + `|` + // or + `(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start +) // HttpLinks searches a reader for a http link and returns a copy of the // reader and a slice with links. func HttpLinks(r io.Reader) (io.Reader, []string) { - var buf bytes.Buffer - tr := io.TeeReader(r, &buf) - - scanner := bufio.NewScanner(tr) - linkMap := make(map[string]struct{}) - for scanner.Scan() { - line := scanner.Text() - for _, word := range strings.Fields(line) { - if links := urlRe.FindStringSubmatch(word); len(links) > 0 { - if _, err := url.Parse(links[0]); err != nil { - continue + buf, err := io.ReadAll(r) + if err != nil { + return r, nil + } + + links := make(map[string]bool) + b := buf + match := urlRe.FindSubmatchIndex(b) + for ; match != nil; match = urlRe.FindSubmatchIndex(b) { + // Regular expressions do not really cut it here and we + // need to detect opening/closing braces to handle + // markdown link syntax. 
+ var paren, bracket, ltgt, scheme int + var emitUrl bool + i, j := match[0], match[1] + b = b[i:] + scheme = j - i + j = scheme + + for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 { + switch b[j] { + case '[': + bracket++ + j++ + case '(': + paren++ + j++ + case '<': + ltgt++ + j++ + case ']': + bracket-- + if bracket < 0 { + emitUrl = true + } else { + j++ } - linkMap[strings.TrimSpace(links[0])] = struct{}{} + case ')': + paren-- + if paren < 0 { + emitUrl = true + } else { + j++ + } + case '>': + ltgt-- + if ltgt < 0 { + emitUrl = true + } else { + j++ + } + default: + j++ } } + + // Heuristic to remove trailing characters that are + // valid URL characters, but typically not at the end of + // the URL + for trim := true; trim && j > 0; { + switch b[j-1] { + case '.', ',', ':', ';', '?', '!', '"', '\'', '%': + j-- + default: + trim = false + } + } + if j == scheme { + // Only an URL scheme, ignore. + b = b[j:] + continue + } + url := string(b[:j]) + if match[2] == -1 && match[4] == -1 { + // Email address with missing mailto: scheme. Add it. 
+ url = "mailto:" + url + } + links[url] = true + b = b[j:] } - results := []string{} - for link := range linkMap { + results := make([]string, 0, len(links)) + for link := range links { results = append(results, link) } + sort.Strings(results) - return &buf, results + return bytes.NewReader(buf), results } + +var urichars = []byte( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + + "0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]", +) diff --git a/lib/parse/hyperlinks_test.go b/lib/parse/hyperlinks_test.go index cd0c85cb..cedad648 100644 --- a/lib/parse/hyperlinks_test.go +++ b/lib/parse/hyperlinks_test.go @@ -55,6 +55,11 @@ func TestHyperlinks(t *testing.T) { links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-markdown", + text: "text [https://aerc-mail.org](https://aerc-mail.org) more text", + links: []string{"https://aerc-mail.org"}, + }, + { name: "multiple-links", text: "text https://aerc-mail.org more text http://git.sr.ht/~rjarry/aerc more text", links: []string{"https://aerc-mail.org", "http://git.sr.ht/~rjarry/aerc"}, @@ -90,6 +95,16 @@ func TestHyperlinks(t *testing.T) { links: []string{"https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO"}, }, { + name: "email", + text: "You can reach me via the somewhat strange, but nonetheless valid, email foo@baz.com", + links: []string{"mailto:foo@baz.com"}, + }, + { + name: "mailto", + text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:bar@fooz.fr. Thank you", + links: []string{"mailto:bar@fooz.fr"}, + }, + { name: "mailto-ipv6", text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]", links: []string{"mailto:~mpldr/list@[2001:db8::7]"}, |