open-link: make URL parsing more lenient

URLs are extremely loosely defined and can take many shapes which may not be parsed at all if unusual characters like the exclamation mark are present. To ensure lists and odd use of spaces are not parsed as links some sanity-checks are in place: - the URL's schema must be at least two characters long - the URL's authority, path, and fragment must have a combined length of 8 characters or longer - the URL must not contain a whitespace character, >, ), or " - the URL may only contain a ] when followed by a different allowed character or at the end of the line (necessary for IPv6 authorities) The tests for this function now include links with an exclamation point and IPv6 addresses. The tests are given names to be easier identifiable. Link: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml Reported-by: "Bence Ferdinandy" <bence@ferdinandy.com> Cc: "Koni Marti" <koni.marti@gmail.com> Fixes: e1d8bc4d17cb ("msgviewer: open http links from messages") Signed-off-by: Moritz Poldrack <git@moritz.sh> Acked-by: Tim Culverhouse <tim@timculverhouse.com>
author: Moritz Poldrack <git@moritz.sh> 2022-09-11 02:14:53 +0200
committer: Robin Jarry <robin@jarry.cc> 2022-09-13 23:58:53 +0200
commit: fad90c2956c6e57fd8da83da9c218b35cf2a2f93 (patch)
tree: 8ddc3a435a2d261c5b32bbb53ad8891f182c7661 /lib
parent: ba9d79fd2d6d17b8d2ec940697cab2348293c510 (diff)
download: aerc-fad90c2956c6e57fd8da83da9c218b35cf2a2f93.tar.gz
2 files changed, 56 insertions, 34 deletions
diff --git a/lib/parse/hyperlinks.go b/lib/parse/hyperlinks.go
index 2087a55c..af8c3006 100644
--- a/lib/parse/hyperlinks.go
+++ b/lib/parse/hyperlinks.go
@@ -4,14 +4,12 @@ import (
 	"bufio"
 	"bytes"
 	"io"
+	"net/url"
 	"regexp"
 	"strings"
 )
 
-var (
-	submatch = `(https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*))`
-	httpRe   = regexp.MustCompile("\"" + submatch + "\"" + "|" + "\\(" + submatch + "\\)" + "|" + "<" + submatch + ">" + "|" + submatch)
-)
+var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
 
 // HttpLinks searches a reader for a http link and returns a copy of the
 // reader and a slice with links.
@@ -23,16 +21,12 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
 	linkMap := make(map[string]struct{})
 	for scanner.Scan() {
 		line := scanner.Text()
-		if !strings.Contains(line, "http") {
-			continue
-		}
 		for _, word := range strings.Fields(line) {
-			if links := httpRe.FindStringSubmatch(word); len(links) > 0 {
-				for _, l := range links[1:] {
-					if l != "" {
-						linkMap[strings.TrimSpace(l)] = struct{}{}
-					}
+			if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
+				if _, err := url.Parse(links[0]); err != nil {
+					continue
 				}
+				linkMap[strings.TrimSpace(links[0])] = struct{}{}
 			}
 		}
 	}
diff --git a/lib/parse/hyperlinks_test.go b/lib/parse/hyperlinks_test.go
index ba67664b..cd0c85cb 100644
--- a/lib/parse/hyperlinks_test.go
+++ b/lib/parse/hyperlinks_test.go
@@ -10,90 +10,118 @@ import (
 
 func TestHyperlinks(t *testing.T) {
 	tests := []struct {
+		name  string
 		text  string
 		links []string
 	}{
 		{
+			name:  "http-link",
 			text:  "http://aerc-mail.org",
 			links: []string{"http://aerc-mail.org"},
 		},
 		{
+			name:  "https-link",
 			text:  "https://aerc-mail.org",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-in-text",
 			text:  "text https://aerc-mail.org more text",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-in-parenthesis",
 			text:  "text (https://aerc-mail.org) more text",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-in-quotes",
 			text:  "text \"https://aerc-mail.org\" more text",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-in-angle-brackets",
 			text:  "text <https://aerc-mail.org> more text",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-in-html",
 			text:  "<a href=\"https://aerc-mail.org\">",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "https-link-twice",
 			text:  "text https://aerc-mail.org more text https://aerc-mail.org more text",
 			links: []string{"https://aerc-mail.org"},
 		},
 		{
+			name:  "multiple-links",
 			text:  "text https://aerc-mail.org more text http://git.sr.ht/~rjarry/aerc more text",
 			links: []string{"https://aerc-mail.org", "http://git.sr.ht/~rjarry/aerc"},
 		},
 		{
+			name:  "rfc",
 			text:  "text http://www.ietf.org/rfc/rfc2396.txt more text",
 			links: []string{"http://www.ietf.org/rfc/rfc2396.txt"},
 		},
 		{
+			name:  "http-with-query-and-fragment",
 			text:  "text <http://example.com:8042/over/there?name=ferret#nose> more text",
 			links: []string{"http://example.com:8042/over/there?name=ferret#nose"},
 		},
 		{
+			name:  "http-with-at",
 			text:  "text http://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm more text",
 			links: []string{"http://cnn.example.com&story=breaking_news@10.0.0.1/top_story.htm"},
 		},
 		{
+			name:  "https-with-fragment",
 			text:  "text https://www.ics.uci.edu/pub/ietf/uri/#Related more text",
 			links: []string{"https://www.ics.uci.edu/pub/ietf/uri/#Related"},
 		},
 		{
+			name:  "https-with-query",
 			text:  "text https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1 more text",
 			links: []string{"https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1"},
 		},
+		{
+			name:  "https-onedrive",
+			text:  "I have a link like this in an email (I deleted a few characters here-and-there for privacy) https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO",
+			links: []string{"https://1drv.ms/w/s!Ap-KLfhNxS4fRt6tIvw?e=dW8WLO"},
+		},
+		{
+			name:  "mailto-ipv6",
+			text:  "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]",
+			links: []string{"mailto:~mpldr/list@[2001:db8::7]"},
+		},
+		{
+			name:  "mailto-ipv6-query",
+			text:  "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F",
+			links: []string{"mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F"},
+		},
 	}
 
-	for _, test := range tests {
-
-		// make sure reader is exact copy of input reader
-		reader, links := parse.HttpLinks(strings.NewReader(test.text))
-		if data, err := io.ReadAll(reader); err != nil {
-			t.Errorf("could not read text: %v", err)
-		} else if string(data) != test.text {
-			t.Errorf("did not copy input reader correctly")
-		}
-
-		// check correct parsed links
-		if len(links) != len(test.links) {
-			t.Errorf("different number of links: got %d but expected %d", len(links), len(test.links))
-		}
-		linkMap := make(map[string]struct{})
-		for _, got := range links {
-			linkMap[got] = struct{}{}
-		}
-		for _, expected := range test.links {
-			if _, ok := linkMap[expected]; !ok {
-				t.Errorf("link not parsed: %s", expected)
+	for i, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// make sure reader is exact copy of input reader
+			reader, parsedLinks := parse.HttpLinks(strings.NewReader(test.text))
+			if _, err := io.ReadAll(reader); err != nil {
+				t.Skipf("could not read text: %v", err)
 			}
-		}
 
+			// check correct parsed links
+			if len(parsedLinks) != len(test.links) {
+				t.Errorf("different number of links: got %d but expected %d", len(parsedLinks), len(test.links))
+			}
+			linkMap := make(map[string]struct{})
+			for _, got := range parsedLinks {
+				linkMap[got] = struct{}{}
+			}
+			for _, expected := range test.links {
+				if _, ok := linkMap[expected]; !ok {
+					t.Errorf("link[%d] not parsed: %s", i, expected)
+				}
+			}
+		})
 	}
 }
author	Moritz Poldrack <git@moritz.sh>	2022-09-11 02:14:53 +0200
committer	Robin Jarry <robin@jarry.cc>	2022-09-13 23:58:53 +0200
commit	fad90c2956c6e57fd8da83da9c218b35cf2a2f93 (patch)
tree	8ddc3a435a2d261c5b32bbb53ad8891f182c7661 /lib
parent	ba9d79fd2d6d17b8d2ec940697cab2348293c510 (diff)
download	aerc-fad90c2956c6e57fd8da83da9c218b35cf2a2f93.tar.gz