aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Robin Jarry <robin@jarry.cc> 2023-06-15 00:03:58 +0200
committer Robin Jarry <robin@jarry.cc> 2023-06-25 22:53:19 +0200
commit 9cbfb190f65eba73e42b034a39de4b4d35fd404c (patch)
tree ed043239ed50991401ddb14001b6fddd534e1d67
parent 9d0fdffeef2f4ae1f8c1b57f558e68ba4bd9ad28 (diff)
download aerc-9cbfb190f65eba73e42b034a39de4b4d35fd404c.tar.gz
parse-links: be more strict with url parsing
Reuse the same logic as colorize.c to allow parsing markdown links. Signed-off-by: Robin Jarry <robin@jarry.cc> Acked-by: Moritz Poldrack <moritz@poldrack.dev>
-rw-r--r-- lib/parse/hyperlinks.go 112
-rw-r--r-- lib/parse/hyperlinks_test.go 15
2 files changed, 108 insertions, 19 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go
index af8c3006..dd334dd4 100644
--- a/lib/parse/hyperlinks.go
+++ b/lib/parse/hyperlinks.go
@@ -1,40 +1,114 @@
package parse
import (
- "bufio"
"bytes"
"io"
- "net/url"
"regexp"
- "strings"
+ "sort"
)
-var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
+// Partial regexp to match the beginning of URLs and email addresses.
+// The remainder of the matched URLs/emails is parsed manually.
+var urlRe = regexp.MustCompile(
+ `([a-z]{2,8})://` + // URL start
+ `|` + // or
+ `(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
+)
// HttpLinks searches a reader for a http link and returns a copy of the
// reader and a slice with links.
func HttpLinks(r io.Reader) (io.Reader, []string) {
- var buf bytes.Buffer
- tr := io.TeeReader(r, &buf)
-
- scanner := bufio.NewScanner(tr)
- linkMap := make(map[string]struct{})
- for scanner.Scan() {
- line := scanner.Text()
- for _, word := range strings.Fields(line) {
- if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
- if _, err := url.Parse(links[0]); err != nil {
- continue
+ buf, err := io.ReadAll(r)
+ if err != nil {
+ return r, nil
+ }
+
+ links := make(map[string]bool)
+ b := buf
+ match := urlRe.FindSubmatchIndex(b)
+ for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
+ // Regular expressions do not really cut it here and we
+ // need to detect opening/closing braces to handle
+ // markdown link syntax.
+ var paren, bracket, ltgt, scheme int
+ var emitUrl bool
+ i, j := match[0], match[1]
+ b = b[i:]
+ scheme = j - i
+ j = scheme
+
+ for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
+ switch b[j] {
+ case '[':
+ bracket++
+ j++
+ case '(':
+ paren++
+ j++
+ case '<':
+ ltgt++
+ j++
+ case ']':
+ bracket--
+ if bracket < 0 {
+ emitUrl = true
+ } else {
+ j++
}
- linkMap[strings.TrimSpace(links[0])] = struct{}{}
+ case ')':
+ paren--
+ if paren < 0 {
+ emitUrl = true
+ } else {
+ j++
+ }
+ case '>':
+ ltgt--
+ if ltgt < 0 {
+ emitUrl = true
+ } else {
+ j++
+ }
+ default:
+ j++
}
}
+
+ // Heuristic to remove trailing characters that are
+ // valid URL characters, but typically not at the end of
+ // the URL
+ for trim := true; trim && j > 0; {
+ switch b[j-1] {
+ case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
+ j--
+ default:
+ trim = false
+ }
+ }
+ if j == scheme {
+ // Only an URL scheme, ignore.
+ b = b[j:]
+ continue
+ }
+ url := string(b[:j])
+ if match[2] == -1 && match[4] == -1 {
+ // Email address with missing mailto: scheme. Add it.
+ url = "mailto:" + url
+ }
+ links[url] = true
+ b = b[j:]
}
- results := []string{}
- for link := range linkMap {
+ results := make([]string, 0, len(links))
+ for link := range links {
results = append(results, link)
}
+ sort.Strings(results)
- return &buf, results
+ return bytes.NewReader(buf), results
}
+
+var urichars = []byte(
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+ "0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
+)
diff --git a/lib/parse/hyperlinks_test.go b/lib/parse/hyperlinks_test.go
index cd0c85cb..cedad648 100644
--- a/lib/parse/hyperlinks_test.go
+++ b/lib/parse/hyperlinks_test.go
@@ -55,6 +55,11 @@ func TestHyperlinks(t *testing.T) {
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-markdown",
+ text: "text [https://aerc-mail.org](https://aerc-mail.org) more text",
+ links: []string{"https://aerc-mail.org"},
+ },
+ {
name: "multiple-links",
text: "text https://aerc-mail.org more text http://git.sr.ht/~rjarry/aerc more text",
links: []string{"https://aerc-mail.org", "http://git.sr.ht/~rjarry/aerc"},
@@ -90,6 +95,16 @@ func TestHyperlinks(t *testing.T) {
links: []string{"https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO"},
},
{
+ name: "email",
+ text: "You can reach me via the somewhat strange, but nonetheless valid, email foo@baz.com",
+ links: []string{"mailto:foo@baz.com"},
+ },
+ {
+ name: "mailto",
+ text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:bar@fooz.fr. Thank you",
+ links: []string{"mailto:bar@fooz.fr"},
+ },
+ {
name: "mailto-ipv6",
text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]",
links: []string{"mailto:~mpldr/list@[2001:db8::7]"},