path: root/lib/parse/hyperlinks.go
Diffstat (limited to 'lib/parse/hyperlinks.go')
-rw-r--r--  lib/parse/hyperlinks.go  112
1 file changed, 93 insertions, 19 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go
index af8c3006..dd334dd4 100644
--- a/lib/parse/hyperlinks.go
+++ b/lib/parse/hyperlinks.go
@@ -1,40 +1,114 @@
package parse
import (
- "bufio"
"bytes"
"io"
- "net/url"
"regexp"
- "strings"
+ "sort"
)
-var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
+// Partial regexp to match the beginning of URLs and email addresses.
+// The remainder of the matched URLs/emails is parsed manually.
+var urlRe = regexp.MustCompile(
+ `([a-z]{2,8})://` + // URL start
+ `|` + // or
+ `(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
+)
// HttpLinks searches a reader for a http link and returns a copy of the
// reader and a slice with links.
func HttpLinks(r io.Reader) (io.Reader, []string) {
- var buf bytes.Buffer
- tr := io.TeeReader(r, &buf)
-
- scanner := bufio.NewScanner(tr)
- linkMap := make(map[string]struct{})
- for scanner.Scan() {
- line := scanner.Text()
- for _, word := range strings.Fields(line) {
- if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
- if _, err := url.Parse(links[0]); err != nil {
- continue
+ buf, err := io.ReadAll(r)
+ if err != nil {
+ return r, nil
+ }
+
+ links := make(map[string]bool)
+ b := buf
+ match := urlRe.FindSubmatchIndex(b)
+ for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
+ // Regular expressions do not really cut it here and we
+ // need to detect opening/closing braces to handle
+ // markdown link syntax.
+ var paren, bracket, ltgt, scheme int
+ var emitUrl bool
+ i, j := match[0], match[1]
+ b = b[i:]
+ scheme = j - i
+ j = scheme
+
+ for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
+ switch b[j] {
+ case '[':
+ bracket++
+ j++
+ case '(':
+ paren++
+ j++
+ case '<':
+ ltgt++
+ j++
+ case ']':
+ bracket--
+ if bracket < 0 {
+ emitUrl = true
+ } else {
+ j++
}
- linkMap[strings.TrimSpace(links[0])] = struct{}{}
+ case ')':
+ paren--
+ if paren < 0 {
+ emitUrl = true
+ } else {
+ j++
+ }
+ case '>':
+ ltgt--
+ if ltgt < 0 {
+ emitUrl = true
+ } else {
+ j++
+ }
+ default:
+ j++
}
}
+
+ // Heuristic to remove trailing characters that are
+ // valid URL characters, but typically not at the end of
+ // the URL
+ for trim := true; trim && j > 0; {
+ switch b[j-1] {
+ case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
+ j--
+ default:
+ trim = false
+ }
+ }
+ if j == scheme {
+ // Only an URL scheme, ignore.
+ b = b[j:]
+ continue
+ }
+ url := string(b[:j])
+ if match[2] == -1 && match[4] == -1 {
+ // Email address with missing mailto: scheme. Add it.
+ url = "mailto:" + url
+ }
+ links[url] = true
+ b = b[j:]
}
- results := []string{}
- for link := range linkMap {
+ results := make([]string, 0, len(links))
+ for link := range links {
results = append(results, link)
}
+ sort.Strings(results)
- return &buf, results
+ return bytes.NewReader(buf), results
}
+
+var urichars = []byte(
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+ "0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
+)