parse-links: be more strict with url parsing

Reuse the same logic as colorize.c to allow parsing markdown links. Signed-off-by: Robin Jarry <robin@jarry.cc> Acked-by: Moritz Poldrack <moritz@poldrack.dev>
author: Robin Jarry <robin@jarry.cc> 2023-06-15 00:03:58 +0200
committer: Robin Jarry <robin@jarry.cc> 2023-06-25 22:53:19 +0200
commit: 9cbfb190f65eba73e42b034a39de4b4d35fd404c (patch)
tree: ed043239ed50991401ddb14001b6fddd534e1d67 /lib/parse
parent: 9d0fdffeef2f4ae1f8c1b57f558e68ba4bd9ad28 (diff)
download: aerc-9cbfb190f65eba73e42b034a39de4b4d35fd404c.tar.gz
2 files changed, 108 insertions, 19 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go
index af8c3006..dd334dd4 100644
--- a/lib/parse/hyperlinks.go
+++ b/lib/parse/hyperlinks.go
@@ -1,40 +1,114 @@
 package parse
 
 import (
-	"bufio"
 	"bytes"
 	"io"
-	"net/url"
 	"regexp"
-	"strings"
+	"sort"
 )
 
-var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
+// Partial regexp to match the beginning of URLs and email addresses.
+// The remainder of the matched URLs/emails is parsed manually.
+var urlRe = regexp.MustCompile(
+	`([a-z]{2,8})://` + // URL start
+		`|` + // or
+		`(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
+)
 
 // HttpLinks searches a reader for a http link and returns a copy of the
 // reader and a slice with links.
 func HttpLinks(r io.Reader) (io.Reader, []string) {
-	var buf bytes.Buffer
-	tr := io.TeeReader(r, &buf)
-
-	scanner := bufio.NewScanner(tr)
-	linkMap := make(map[string]struct{})
-	for scanner.Scan() {
-		line := scanner.Text()
-		for _, word := range strings.Fields(line) {
-			if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
-				if _, err := url.Parse(links[0]); err != nil {
-					continue
+	buf, err := io.ReadAll(r)
+	if err != nil {
+		return r, nil
+	}
+
+	links := make(map[string]bool)
+	b := buf
+	match := urlRe.FindSubmatchIndex(b)
+	for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
+		// Regular expressions do not really cut it here and we
+		// need to detect opening/closing braces to handle
+		// markdown link syntax.
+		var paren, bracket, ltgt, scheme int
+		var emitUrl bool
+		i, j := match[0], match[1]
+		b = b[i:]
+		scheme = j - i
+		j = scheme
+
+		for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
+			switch b[j] {
+			case '[':
+				bracket++
+				j++
+			case '(':
+				paren++
+				j++
+			case '<':
+				ltgt++
+				j++
+			case ']':
+				bracket--
+				if bracket < 0 {
+					emitUrl = true
+				} else {
+					j++
 				}
-				linkMap[strings.TrimSpace(links[0])] = struct{}{}
+			case ')':
+				paren--
+				if paren < 0 {
+					emitUrl = true
+				} else {
+					j++
+				}
+			case '>':
+				ltgt--
+				if ltgt < 0 {
+					emitUrl = true
+				} else {
+					j++
+				}
+			default:
+				j++
 			}
 		}
+
+		// Heuristic to remove trailing characters that are
+		// valid URL characters, but typically not at the end of
+		// the URL
+		for trim := true; trim && j > 0; {
+			switch b[j-1] {
+			case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
+				j--
+			default:
+				trim = false
+			}
+		}
+		if j == scheme {
+			// Only an URL scheme, ignore.
+			b = b[j:]
+			continue
+		}
+		url := string(b[:j])
+		if match[2] == -1 && match[4] == -1 {
+			// Email address with missing mailto: scheme. Add it.
+			url = "mailto:" + url
+		}
+		links[url] = true
+		b = b[j:]
 	}
 
-	results := []string{}
-	for link := range linkMap {
+	results := make([]string, 0, len(links))
+	for link := range links {
 		results = append(results, link)
 	}
+	sort.Strings(results)
 
-	return &buf, results
+	return bytes.NewReader(buf), results
 }
+
+var urichars = []byte(
+	"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+		"0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
+)
diff --git a/lib/parse/hyperlinks_test.go b/lib/parse/hyperlinks_test.go
index cd0c85cb..cedad648 100644
--- a/lib/parse/hyperlinks_test.go
+++ b/lib/parse/hyperlinks_test.go
@@ -55,6 +55,11 @@ func TestHyperlinks(t *testing.T) {
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-markdown",
+			text:  "text [https://aerc-mail.org](https://aerc-mail.org) more text",
+			links: []string{"https://aerc-mail.org"},
+		},
+		{
 			name:  "multiple-links",
 			text:  "text https://aerc-mail.org more text http://git.sr.ht/~rjarry/aerc more text",
 			links: []string{"https://aerc-mail.org", "http://git.sr.ht/~rjarry/aerc"},
@@ -90,6 +95,16 @@ func TestHyperlinks(t *testing.T) {
 			links: []string{"https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO"},
 		},
 		{
+			name:  "email",
+			text:  "You can reach me via the somewhat strange, but nonetheless valid, email foo@baz.com",
+			links: []string{"mailto:foo@baz.com"},
+		},
+		{
+			name:  "mailto",
+			text:  "You can reach me via the somewhat strange, but nonetheless valid, email mailto:bar@fooz.fr. Thank you",
+			links: []string{"mailto:bar@fooz.fr"},
+		},
+		{
 			name:  "mailto-ipv6",
 			text:  "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]",
 			links: []string{"mailto:~mpldr/list@[2001:db8::7]"},
author	Robin Jarry <robin@jarry.cc>	2023-06-15 00:03:58 +0200
committer	Robin Jarry <robin@jarry.cc>	2023-06-25 22:53:19 +0200
commit	9cbfb190f65eba73e42b034a39de4b4d35fd404c (patch)
tree	ed043239ed50991401ddb14001b6fddd534e1d67 /lib/parse
parent	9d0fdffeef2f4ae1f8c1b57f558e68ba4bd9ad28 (diff)
download	aerc-9cbfb190f65eba73e42b034a39de4b4d35fd404c.tar.gz