lib/parse/hyperlinks.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

package parse

import (
	"bytes"
	"io"
	"regexp"
	"sort"
)

// urlRe finds where a link begins; it deliberately does not match a whole
// link. HttpLinks extends each match by hand, because the end of a link
// depends on bracket nesting that a regular expression cannot track.
//
// Submatch 1 is the scheme of a "scheme://" URL and submatch 2 is the optional
// "mailto:" prefix of an email address. When neither submatch participates,
// the match is a bare email address written without a scheme.
var urlRe = regexp.MustCompile(
	`([a-z]{2,8})://|` + // URL: 2-8 lowercase letters followed by "://"
		`(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email: everything up to and including "@"
)

// HttpLinks reads r to the end, extracts every URL and email address found in
// the data, and returns a new reader over the same data together with the
// links.
//
// The links are deduplicated and sorted. Email addresses that appear without
// a scheme are reported with a "mailto:" prefix added.
//
// If reading r fails, no links are returned. The returned reader then replays
// the bytes that were consumed before the failure and continues with r, so
// nothing that was already read is lost to the caller.
func HttpLinks(r io.Reader) (io.Reader, []string) {
	buf, err := io.ReadAll(r)
	if err != nil {
		// io.ReadAll returns the data it read before the error. Returning
		// r alone would silently drop those bytes, so put them back in
		// front of whatever r still has to say.
		return io.MultiReader(bytes.NewReader(buf), r), nil
	}

	links := make(map[string]struct{})
	b := buf
	for match := urlRe.FindSubmatchIndex(b); match != nil; match = urlRe.FindSubmatchIndex(b) {
		// Regular expressions do not really cut it here and we
		// need to detect opening/closing braces to handle
		// markdown link syntax.
		var paren, bracket, ltgt int
		var emitURL bool

		// Re-base b on the start of the match so that every index below
		// is relative to the beginning of the link.
		b = b[match[0]:]
		scheme := match[1] - match[0] // length of the prefix urlRe matched
		j := scheme                   // end of the link, exclusive

		// "inline" email without a mailto: prefix: neither the URL scheme
		// group nor the mailto: group took part in the match. Those get
		// some extra checks below.
		inlineEmail := len(match) > 4 && match[2] == -1 && match[4] == -1

		for !emitURL && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
			switch b[j] {
			case '[':
				bracket++
				j++
			case '(':
				paren++
				j++
			case '<':
				ltgt++
				j++
			case ']':
				// A closer without a matching opener inside the link
				// belongs to the surrounding text: stop before it.
				bracket--
				if bracket < 0 {
					emitURL = true
				} else {
					j++
				}
			case ')':
				paren--
				if paren < 0 {
					emitURL = true
				} else {
					j++
				}
			case '>':
				ltgt--
				if ltgt < 0 {
					emitURL = true
				} else {
					j++
				}
			case '&':
				if inlineEmail {
					emitURL = true
				} else {
					j++
				}
			default:
				j++
			}

			// we don't want those in inline emails: step back over the
			// opening bracket that was just consumed and stop.
			if inlineEmail && (paren > 0 || ltgt > 0 || bracket > 0) {
				j--
				emitURL = true
			}
		}

		// Heuristic to remove trailing characters that are
		// valid URL characters, but typically not at the end of
		// the URL
		j = len(bytes.TrimRight(b[:j], ".,:;?!\"'%"))

		if j == scheme {
			// Only an URL scheme, ignore.
			b = b[j:]
			continue
		}

		link := string(b[:j])
		if inlineEmail {
			// Email address with missing mailto: scheme. Add it.
			link = "mailto:" + link
		}
		links[link] = struct{}{}
		b = b[j:]
	}

	// Map iteration order is random; sort for a deterministic result.
	results := make([]string, 0, len(links))
	for link := range links {
		results = append(results, link)
	}
	sort.Strings(results)

	return bytes.NewReader(buf), results
}

var urichars = []byte(
	"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
)