1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
package parse
import (
"bytes"
"io"
"regexp"
"sort"
)
// Partial regexp to match the beginning of URLs and email addresses.
// The remainder of the matched URLs/emails is parsed manually.
//
// Capture group 1 is the URL scheme; capture group 2 is the optional
// "mailto:" prefix of an email address.
var urlRe = regexp.MustCompile(
	`([a-z]{2,8})://` + // URL start
		`|` + // or
		`(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
)

// HttpLinks reads r to the end and collects the links found in it: URLs
// (a lowercase scheme of 2 to 8 letters followed by "://") and email
// addresses. Email addresses that lack a "mailto:" prefix get one added.
//
// It returns a new reader over the bytes that were read from r, and the
// links with duplicates removed, sorted with sort.Strings.
//
// If reading r fails, no links are returned (nil). The returned reader
// then yields the bytes that were read before the failure, followed by
// whatever r itself still produces.
func HttpLinks(r io.Reader) (io.Reader, []string) {
	buf, err := io.ReadAll(r)
	if err != nil {
		// io.ReadAll hands back the data read before the error. Put
		// it in front of r again so that the part of the stream that
		// was already consumed is not silently lost for the caller.
		return io.MultiReader(bytes.NewReader(buf), r), nil
	}

	seen := make(map[string]struct{})
	b := buf
	for match := urlRe.FindSubmatchIndex(b); match != nil; match = urlRe.FindSubmatchIndex(b) {
		// Re-base b at the start of the match so that all offsets
		// below are relative to the beginning of the link.
		b = b[match[0]:]
		prefix := match[1] - match[0]

		end := linkEnd(b, prefix)
		if end == prefix {
			// Only a URL scheme (or a bare "user@"), ignore.
			b = b[end:]
			continue
		}

		link := string(b[:end])
		if match[2] == -1 && match[4] == -1 {
			// Neither capture group took part in the match:
			// email address with missing mailto: scheme. Add it.
			link = "mailto:" + link
		}
		seen[link] = struct{}{}
		b = b[end:]
	}

	results := make([]string, 0, len(seen))
	for link := range seen {
		results = append(results, link)
	}
	sort.Strings(results)

	return bytes.NewReader(buf), results
}

// linkEnd returns the offset in b at which the link whose matched prefix
// is b[:start] ends. The result is never smaller than start.
func linkEnd(b []byte, start int) int {
	// Regular expressions do not really cut it here and we need to
	// detect opening/closing braces to handle markdown link syntax.
	// A closing brace without a matching opening one inside the link
	// belongs to the surrounding text and terminates the link.
	var paren, bracket, ltgt int
	end := start
scan:
	for end < len(b) && bytes.IndexByte(urichars, b[end]) != -1 {
		switch b[end] {
		case '[':
			bracket++
		case '(':
			paren++
		case '<':
			ltgt++
		case ']':
			bracket--
			if bracket < 0 {
				break scan
			}
		case ')':
			paren--
			if paren < 0 {
				break scan
			}
		case '>':
			ltgt--
			if ltgt < 0 {
				break scan
			}
		}
		end++
	}

	// Heuristic to remove trailing characters that are valid URL
	// characters, but typically not at the end of the URL. Only the
	// part after the matched prefix is trimmed.
	return start + len(bytes.TrimRight(b[start:end], ".,:;?!\"'%"))
}

// urichars lists the bytes that are accepted as part of a link after its
// matched prefix; the first byte outside this set ends the link.
var urichars = []byte(
	"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
)
|