about | summary | refs | log | tree | commit | diff | stats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/parse/hyperlinks.go 18
-rw-r--r-- lib/parse/hyperlinks_test.go 72
2 files changed, 56 insertions, 34 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go
index 2087a55c..af8c3006 100644
--- a/lib/parse/hyperlinks.go
+++ b/lib/parse/hyperlinks.go
@@ -4,14 +4,12 @@ import (
"bufio"
"bytes"
"io"
+ "net/url"
"regexp"
"strings"
)
-var (
- submatch = `(https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*))`
- httpRe = regexp.MustCompile("\"" + submatch + "\"" + "|" + "\\(" + submatch + "\\)" + "|" + "<" + submatch + ">" + "|" + submatch)
-)
+var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
// HttpLinks searches a reader for a http link and returns a copy of the
// reader and a slice with links.
@@ -23,16 +21,12 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
linkMap := make(map[string]struct{})
for scanner.Scan() {
line := scanner.Text()
- if !strings.Contains(line, "http") {
- continue
- }
for _, word := range strings.Fields(line) {
- if links := httpRe.FindStringSubmatch(word); len(links) > 0 {
- for _, l := range links[1:] {
- if l != "" {
- linkMap[strings.TrimSpace(l)] = struct{}{}
- }
+ if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
+ if _, err := url.Parse(links[0]); err != nil {
+ continue
}
+ linkMap[strings.TrimSpace(links[0])] = struct{}{}
}
}
}
diff --git a/lib/parse/hyperlinks_test.go b/lib/parse/hyperlinks_test.go
index ba67664b..cd0c85cb 100644
--- a/lib/parse/hyperlinks_test.go
+++ b/lib/parse/hyperlinks_test.go
@@ -10,90 +10,118 @@ import (
func TestHyperlinks(t *testing.T) {
tests := []struct {
+ name string
text string
links []string
}{
{
+ name: "http-link",
text: "http://aerc-mail.org",
links: []string{"http://aerc-mail.org"},
},
{
+ name: "https-link",
text: "https://aerc-mail.org",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-in-text",
text: "text https://aerc-mail.org more text",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-in-parenthesis",
text: "text (https://aerc-mail.org) more text",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-in-quotes",
text: "text \"https://aerc-mail.org\" more text",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-in-angle-brackets",
text: "text <https://aerc-mail.org> more text",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-in-html",
text: "<a href=\"https://aerc-mail.org\">",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "https-link-twice",
text: "text https://aerc-mail.org more text https://aerc-mail.org more text",
links: []string{"https://aerc-mail.org"},
},
{
+ name: "multiple-links",
text: "text https://aerc-mail.org more text http://git.sr.ht/~rjarry/aerc more text",
links: []string{"https://aerc-mail.org", "http://git.sr.ht/~rjarry/aerc"},
},
{
+ name: "rfc",
text: "text http://www.ietf.org/rfc/rfc2396.txt more text",
links: []string{"http://www.ietf.org/rfc/rfc2396.txt"},
},
{
+ name: "http-with-query-and-fragment",
text: "text <http://example.com:8042/over/there?name=ferret#nose> more text",
links: []string{"http://example.com:8042/over/there?name=ferret#nose"},
},
{
+ name: "http-with-at",
text: "text http://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm more text",
links: []string{"http://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm"},
},
{
+ name: "https-with-fragment",
text: "text https://www.ics.uci.edu/pub/ietf/uri/#Related more text",
links: []string{"https://www.ics.uci.edu/pub/ietf/uri/#Related"},
},
{
+ name: "https-with-query",
text: "text https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1 more text",
links: []string{"https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1"},
},
+ {
+ name: "https-onedrive",
+ text: "I have a link like this in an email (I deleted a few characters here-and-there for privacy) https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO",
+ links: []string{"https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO"},
+ },
+ {
+ name: "mailto-ipv6",
+ text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]",
+ links: []string{"mailto:~mpldr/list@[2001:db8::7]"},
+ },
+ {
+ name: "mailto-ipv6-query",
+ text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F",
+ links: []string{"mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F"},
+ },
}
- for _, test := range tests {
-
- // make sure reader is exact copy of input reader
- reader, links := parse.HttpLinks(strings.NewReader(test.text))
- if data, err := io.ReadAll(reader); err != nil {
- t.Errorf("could not read text: %v", err)
- } else if string(data) != test.text {
- t.Errorf("did not copy input reader correctly")
- }
-
- // check correct parsed links
- if len(links) != len(test.links) {
- t.Errorf("different number of links: got %d but expected %d", len(links), len(test.links))
- }
- linkMap := make(map[string]struct{})
- for _, got := range links {
- linkMap[got] = struct{}{}
- }
- for _, expected := range test.links {
- if _, ok := linkMap[expected]; !ok {
- t.Errorf("link not parsed: %s", expected)
+ for i, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ // make sure reader is exact copy of input reader
+ reader, parsedLinks := parse.HttpLinks(strings.NewReader(test.text))
+ if data, err := io.ReadAll(reader); err != nil || string(data) != test.text {
+ t.Errorf("did not copy input reader correctly: got %q, err %v", data, err)
}
- }
+ // check correct parsed links
+ if len(parsedLinks) != len(test.links) {
+ t.Errorf("different number of links: got %d but expected %d", len(parsedLinks), len(test.links))
+ }
+ linkMap := make(map[string]struct{})
+ for _, got := range parsedLinks {
+ linkMap[got] = struct{}{}
+ }
+ for _, expected := range test.links {
+ if _, ok := linkMap[expected]; !ok {
+ t.Errorf("link[%d] not parsed: %s", i, expected)
+ }
+ }
+ })
}
}