path: root/lib/parse/hyperlinks.go
Diffstat (limited to 'lib/parse/hyperlinks.go')
-rw-r--r--  lib/parse/hyperlinks.go  112
1 file changed, 93 insertions, 19 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go
index af8c3006..dd334dd4 100644
--- a/lib/parse/hyperlinks.go
+++ b/lib/parse/hyperlinks.go
@@ -1,40 +1,114 @@
package parse
import (
- "bufio"
"bytes"
"io"
- "net/url"
"regexp"
- "strings"
+ "sort"
)
-var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
+// Partial regexp to match the beginning of URLs and email addresses.
+// The remainder of the matched URLs/emails is parsed manually.
+var urlRe = regexp.MustCompile(
+ `([a-z]{2,8})://` + // URL start
+ `|` + // or
+ `(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
+)
// HttpLinks searches a reader for a http link and returns a copy of the
// reader and a slice with links.
func HttpLinks(r io.Reader) (io.Reader, []string) {
- var buf bytes.Buffer
- tr := io.TeeReader(r, &buf)
-
- scanner := bufio.NewScanner(tr)
- linkMap := make(map[string]struct{})
- for scanner.Scan() {
- line := scanner.Text()
- for _, word := range strings.Fields(line) {
- if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
- if _, err := url.Parse(links[0]); err != nil {
- continue
+ buf, err := io.ReadAll(r)
+ if err != nil {
+ return r, nil
+ }
+
+ links := make(map[string]bool)
+ b := buf
+ match := urlRe.FindSubmatchIndex(b)
+ for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
+ // Regular expressions do not really cut it here and we
+ // need to detect opening/closing braces to handle
+ // markdown link syntax.
+ var paren, bracket, ltgt, scheme int
+ var emitUrl bool
+ i, j := match[0], match[1]
+ b = b[i:]
+ scheme = j - i
+ j = scheme
+
+ for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
+ switch b[j] {
+ case '[':
+ bracket++
+ j++
+ case '(':
+ paren++
+ j++
+ case '<':
+ ltgt++
+ j++
+ case ']':
+ bracket--
+ if bracket < 0 {
+ emitUrl = true
+ } else {
+ j++
}
- linkMap[strings.TrimSpace(links[0])] = struct{}{}
+ case ')':
+ paren--
+ if paren < 0 {
+ emitUrl = true
+ } else {
+ j++
+ }
+ case '>':
+ ltgt--
+ if ltgt < 0 {
+ emitUrl = true
+ } else {
+ j++
+ }
+ default:
+ j++
}
}
+
+ // Heuristic to remove trailing characters that are
+ // valid URL characters, but typically not at the end of
+ // the URL
+ for trim := true; trim && j > 0; {
+ switch b[j-1] {
+ case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
+ j--
+ default:
+ trim = false
+ }
+ }
+ if j == scheme {
+ // Only an URL scheme, ignore.
+ b = b[j:]
+ continue
+ }
+ url := string(b[:j])
+ if match[2] == -1 && match[4] == -1 {
+ // Email address with missing mailto: scheme. Add it.
+ url = "mailto:" + url
+ }
+ links[url] = true
+ b = b[j:]
}
- results := []string{}
- for link := range linkMap {
+ results := make([]string, 0, len(links))
+ for link := range links {
results = append(results, link)
}
+ sort.Strings(results)
- return &buf, results
+ return bytes.NewReader(buf), results
}
+
+var urichars = []byte(
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+ "0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
+)