diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/parse/hyperlinks.go | 18 | ||||
-rw-r--r-- | lib/parse/hyperlinks_test.go | 72 |
2 files changed, 56 insertions, 34 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go index 2087a55c..af8c3006 100644 --- a/lib/parse/hyperlinks.go +++ b/lib/parse/hyperlinks.go @@ -4,14 +4,12 @@ import ( "bufio" "bytes" "io" + "net/url" "regexp" "strings" ) -var ( - submatch = `(https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*))` - httpRe = regexp.MustCompile("\"" + submatch + "\"" + "|" + "\\(" + submatch + "\\)" + "|" + "<" + submatch + ">" + "|" + submatch) -) +var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`) // HttpLinks searches a reader for a http link and returns a copy of the // reader and a slice with links. @@ -23,16 +21,12 @@ func HttpLinks(r io.Reader) (io.Reader, []string) { linkMap := make(map[string]struct{}) for scanner.Scan() { line := scanner.Text() - if !strings.Contains(line, "http") { - continue - } for _, word := range strings.Fields(line) { - if links := httpRe.FindStringSubmatch(word); len(links) > 0 { - for _, l := range links[1:] { - if l != "" { - linkMap[strings.TrimSpace(l)] = struct{}{} - } + if links := urlRe.FindStringSubmatch(word); len(links) > 0 { + if _, err := url.Parse(links[0]); err != nil { + continue } + linkMap[strings.TrimSpace(links[0])] = struct{}{} } } } diff --git a/lib/parse/hyperlinks_test.go b/lib/parse/hyperlinks_test.go index ba67664b..cd0c85cb 100644 --- a/lib/parse/hyperlinks_test.go +++ b/lib/parse/hyperlinks_test.go @@ -10,90 +10,118 @@ import ( func TestHyperlinks(t *testing.T) { tests := []struct { + name string text string links []string }{ { + name: "http-link", text: "http://aerc-mail.org", links: []string{"http://aerc-mail.org"}, }, { + name: "https-link", text: "https://aerc-mail.org", links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-in-text", text: "text https://aerc-mail.org more text", links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-in-parenthesis", text: "text (https://aerc-mail.org) more text", links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-in-quotes", text: "text \"https://aerc-mail.org\" more text", links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-in-angle-brackets", text: "text <https://aerc-mail.org> more text", links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-in-html", text: "<a href=\"https://aerc-mail.org\">", links: []string{"https://aerc-mail.org"}, }, { + name: "https-link-twice", text: "text https://aerc-mail.org more text https://aerc-mail.org more text", links: []string{"https://aerc-mail.org"}, }, { + name: "multiple-links", text: "text https://aerc-mail.org more text http://git.sr.ht/~rjarry/aerc more text", links: []string{"https://aerc-mail.org", "http://git.sr.ht/~rjarry/aerc"}, }, { + name: "rfc", text: "text http://www.ietf.org/rfc/rfc2396.txt more text", links: []string{"http://www.ietf.org/rfc/rfc2396.txt"}, }, { + name: "http-with-query-and-fragment", text: "text <http://example.com:8042/over/there?name=ferret#nose> more text", links: []string{"http://example.com:8042/over/there?name=ferret#nose"}, }, { + name: "http-with-at", text: "text http://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm more text", links: []string{"http://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm"}, }, { + name: "https-with-fragment", text: "text https://www.ics.uci.edu/pub/ietf/uri/#Related more text", links: []string{"https://www.ics.uci.edu/pub/ietf/uri/#Related"}, }, { + name: "https-with-query", text: "text https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1 more text", links: []string{"https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1"}, }, + { + name: "https-onedrive", + text: "I have a link like this in an email (I deleted a few characters here-and-there for privacy) https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO", + links: []string{"https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO"}, + }, + { + name: "mailto-ipv6", + text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]", + links: []string{"mailto:~mpldr/list@[2001:db8::7]"}, + }, + { + name: "mailto-ipv6-query", + text: "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F", + links: []string{"mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F"}, + }, } - for _, test := range tests { - - // make sure reader is exact copy of input reader - reader, links := parse.HttpLinks(strings.NewReader(test.text)) - if data, err := io.ReadAll(reader); err != nil { - t.Errorf("could not read text: %v", err) - } else if string(data) != test.text { - t.Errorf("did not copy input reader correctly") - } - - // check correct parsed links - if len(links) != len(test.links) { - t.Errorf("different number of links: got %d but expected %d", len(links), len(test.links)) - } - linkMap := make(map[string]struct{}) - for _, got := range links { - linkMap[got] = struct{}{} - } - for _, expected := range test.links { - if _, ok := linkMap[expected]; !ok { - t.Errorf("link not parsed: %s", expected) + for i, test := range tests { + t.Run(test.name, func(t *testing.T) { + // make sure reader is exact copy of input reader + reader, parsedLinks := parse.HttpLinks(strings.NewReader(test.text)) + if _, err := io.ReadAll(reader); err != nil { + t.Skipf("could not read text: %v", err) } - } + // check correct parsed links + if len(parsedLinks) != len(test.links) { + t.Errorf("different number of links: got %d but expected %d", len(parsedLinks), len(test.links)) + } + linkMap := make(map[string]struct{}) + for _, got := range parsedLinks { + linkMap[got] = struct{}{} + } + for _, expected := range test.links { + if _, ok := linkMap[expected]; !ok { + t.Errorf("link[%d] not parsed: %s", i, expected) + } + } + }) } } |