// path: root/lib/parse/hyperlinks.go
// blob: dd334dd409f9f0ba74aebbf34c6d5e58298bbbed

package parse

import (
	"bytes"
	"io"
	"regexp"
	"sort"
)

// urlRe recognises only the *start* of a link; whatever follows the match
// is scanned by hand in HttpLinks rather than by the regexp.
//
// Capture group 1 holds a URL scheme. Capture group 2 holds an explicit
// "mailto:" prefix in front of an email address. Neither group is set when
// the match is a bare email address.
var urlRe = regexp.MustCompile(
	// Alternative 1: a URL scheme of 2-8 lowercase letters and "://".
	`([a-z]{2,8})://` +
		// Alternative 2: an optional "mailto:" followed by the local
		// part of an email address, up to and including the "@".
		`|(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`,
)

// HttpLinks reads r to the end, collects the URLs and email addresses found
// in the data and returns a new reader over the same bytes together with
// the links, sorted and without duplicates.
//
// A link starts wherever urlRe matches: either "<scheme>://" or the local
// part of an email address up to its "@". The rest of the link is every
// following byte listed in urichars, with two heuristics applied:
//
//   - a closing ']', ')' or '>' that has no matching opener inside the link
//     ends it, so links wrapped in markdown or angle-bracket syntax do not
//     swallow the delimiter;
//   - trailing punctuation such as '.', ',' or '?' is dropped.
//
// Email addresses are always reported with a "mailto:" prefix; it is added
// when the text did not already carry one. A match with nothing after the
// scheme or "@" is ignored.
//
// If reading r fails, no links are reported. The returned reader then first
// replays the bytes that were consumed before the failure and afterwards
// continues with r, so the data already read is not lost to the caller.
func HttpLinks(r io.Reader) (io.Reader, []string) {
	buf, err := io.ReadAll(r)
	if err != nil {
		// ReadAll returns whatever it managed to read before failing.
		// Put those bytes back in front of r instead of handing the
		// caller a reader that has silently been drained.
		return io.MultiReader(bytes.NewReader(buf), r), nil
	}

	// Used as a set: the same link may appear many times in the input.
	links := make(map[string]struct{})
	b := buf
	for match := urlRe.FindSubmatchIndex(b); match != nil; match = urlRe.FindSubmatchIndex(b) {
		// Re-base b on the start of the match; scheme is then the
		// length of the matched prefix ("https://", "user@", ...).
		b = b[match[0]:]
		scheme := match[1] - match[0]
		j := scheme

		// Regular expressions do not really cut it here: nesting of
		// brackets has to be tracked to cope with markdown link
		// syntax. A closer without a matching opener inside the link
		// belongs to the surrounding text and terminates the link.
		var paren, bracket, ltgt int
	scan:
		for j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
			switch b[j] {
			case '[':
				bracket++
			case '(':
				paren++
			case '<':
				ltgt++
			case ']':
				bracket--
				if bracket < 0 {
					break scan
				}
			case ')':
				paren--
				if paren < 0 {
					break scan
				}
			case '>':
				ltgt--
				if ltgt < 0 {
					break scan
				}
			}
			j++
		}

		// Heuristic: strip characters that are valid inside a URL but
		// typically not its last character. The matched prefix ends
		// in '/' or '@', so trimming never reaches into it.
	trim:
		for j > scheme {
			switch b[j-1] {
			case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
				j--
			default:
				break trim
			}
		}

		if j == scheme {
			// Only a URL scheme or a dangling "@": ignore it and
			// resume the search right after it.
			b = b[j:]
			continue
		}

		url := string(b[:j])
		// match[2] and match[4] are the start offsets of capture
		// groups 1 (URL scheme) and 2 ("mailto:"). Both unset means a
		// bare email address, which needs the scheme added.
		if match[2] == -1 && match[4] == -1 {
			url = "mailto:" + url
		}
		links[url] = struct{}{}
		b = b[j:]
	}

	results := make([]string, 0, len(links))
	for link := range links {
		results = append(results, link)
	}
	// Map iteration order is random; sort for a stable result.
	sort.Strings(results)

	return bytes.NewReader(buf), results
}

// urichars is the set of bytes HttpLinks accepts after the start of a link;
// its scan stops at the first byte that is not listed here.
var urichars = []byte(
	"abcdefghijklmnopqrstuvwxyz" +
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"0123456789" +
		`-_.,~:;/?#@!$&%*+="'<>()[]`,
)