aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Robin Jarry <robin@jarry.cc> 2023-04-02 13:02:22 +0200
committer: Robin Jarry <robin@jarry.cc> 2023-04-02 20:59:11 +0200
commit: 1540d645dff3c610d7d25c47699ec1313601303b (patch)
tree: 8075b8f463f21784311657086f327f166337709d
parent: 476a94eef81658cb2f9b3435420401899ea33bf6 (diff)
download: aerc-1540d645dff3c610d7d25c47699ec1313601303b.tar.gz
colorize: make url parsing more robust
Reuse the URL parsing algorithm from foot. Basically, it involves recording the opening [, ( and < characters and taking into account their closing counterparts. If a closing character is encountered with no matching opening one, assume the URL ends there. This allows handling markdown link syntax such as: [http://foobaz.org/xxx](http://foobaz.org/xxx). Avoid coloring bare URL protocols such as http:// or https://. Update the test vector to handle more corner cases. Link: https://codeberg.org/dnkl/foot/src/tag/1.13.1/url-mode.c#L331-L471 Signed-off-by: Robin Jarry <robin@jarry.cc> Tested-by: Kirill Chibisov <contact@kchibisov.com>
-rw-r--r-- filters/colorize.c 87
-rw-r--r-- filters/vectors/colorize-quotes.expected 12
-rw-r--r-- filters/vectors/colorize-quotes.in 10
3 files changed, 84 insertions, 25 deletions
diff --git a/filters/colorize.c b/filters/colorize.c
index 26c25a8a..8e0cc3c7 100644
--- a/filters/colorize.c
+++ b/filters/colorize.c
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (c) 2023 Robin Jarry */
+#include <ctype.h>
#include <errno.h>
#include <fnmatch.h>
#include <getopt.h>
@@ -439,8 +440,19 @@ static void diff_chunk(const char *in)
print_notabs(in, BUFSIZ);
}
+/* Tell whether c may be part of a URL. NUL is rejected first since strchr()
+ * also matches the terminator; the unsigned char cast keeps isalnum() defined
+ * for bytes >= 0x80 when plain char is signed. */
+static inline bool isurichar(char c)
+{
+	if (c == '\0')
+		return false;
+	return isalnum((unsigned char)c)
+		|| strchr("-_.,~:;/?#@!$&%*+=\"'<>()[]", c) != NULL;
+}
+
#define URL_RE \
- "([a-z]{2,8}:)//[][:alnum:]._~:/?#[@!$&'()*+,;=%-]{4,}" \
+ "([a-z]{2,8})://" \
"|(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@[a-z][[:alnum:].-]*[a-z]"
static regex_t url_re;
@@ -457,22 +469,69 @@ static void urls(const char *in, struct style *ctx)
while (!regexec(&url_re, in, 3, groups, 0)) {
in += print_notabs(in, groups[0].rm_so);
- print(seq(&styles.url));
len = groups[0].rm_eo - groups[0].rm_so;
- /* Heuristic to remove trailing characters that are valid URL
- * characters, but typically not at the end of the URL */
- trim = true;
- while (trim && len > 0) {
- switch (in[len - 1]) {
- case '.': case ',': case ';': case ')':
- case '!': case '?': case '\'':
- len--;
- break;
- default:
- trim = false;
- break;
+
+ if (groups[1].rm_so != -1) {
+ /* Standard URL (i.e. not mailto: nor email address).
+ * Regular expressions do not really cut it here and
+ * we need to detect opening/closing braces to handle
+ * markdown link syntax. */
+ int paren = 0, bracket = 0, ltgt = 0;
+ bool emit_url = false;
+ size_t l = len;
+
+ while (!emit_url && isurichar(in[l])) {
+ switch (in[l]) {
+ case '[': bracket++; l++; break;
+ case '(': paren++; l++; break;
+ case '<': ltgt++; l++; break;
+ case ']':
+ if (--bracket < 0)
+ emit_url = true;
+ else
+ l++;
+ break;
+ case ')':
+ if (--paren < 0)
+ emit_url = true;
+ else
+ l++;
+ break;
+ case '>':
+ if (--ltgt < 0)
+ emit_url = true;
+ else
+ l++;
+ break;
+ default:
+ l++;
+ break;
+ }
+ }
+ /* Heuristic to remove trailing characters that are
+ * valid URL characters, but typically not at the end
+ * of the URL */
+ trim = true;
+ while (trim && l > len) {
+ switch (in[l - 1]) {
+ case '.': case ',': case ':':
+ case ';': case '?': case '!':
+ case '"': case '\'': case '%':
+ l--;
+ break;
+ default:
+ trim = false;
+ break;
+ }
+ }
+ if (l == len) {
+ /* only an URL protocol, do not colorize */
+ in += print_notabs(in, len);
+ continue;
}
+ len = l;
}
+ print(seq(&styles.url));
bool email = groups[2].rm_so == -1 && groups[1].rm_so == -1;
print_osc8(in, len, url_id, email);
in += print_notabs(in, len);
diff --git a/filters/vectors/colorize-quotes.expected b/filters/vectors/colorize-quotes.expected
index 46d3adba..c136fcb0 100644
--- a/filters/vectors/colorize-quotes.expected
+++ b/filters/vectors/colorize-quotes.expected
@@ -20,13 +20,13 @@ nibh petentium at! Sit docendi laboramus ei, animal insolens ad mea.
> detracto sententia <]8;id=colorize-0;https://foobar.com\https://foobar.com]8;id=colorize-0;\> && "]8;id=colorize-1;https://foobaz.org/\https://foobaz.org/]8;id=colorize-1;\".
>
> Error libris deleniti ea mei, vis at elit probo munere, his sint unum
-> albucius ex.
+> albucius ex. []8;id=colorize-2;https://pouet.com/oksuper\https://pouet.com/oksuper]8;id=colorize-2;\](]8;id=colorize-3;https://pouet.com/oksuper\https://pouet.com/oksuper]8;id=colorize-3;\).
Graece definiebas scripserit ne est? Nec nonumes explicari contentiones ne,
vocent iuvaret placerat no vix. Nec et partem salutandi deseruisse, his no
possim malorum pericula. Te quando reprehendunt nam, at consul sadipscing vel?
Velit possim aliquando ei per, ne simul quodsi antiopam sea, ullum choro
-facilisi et pri!
+facilisi et pri http:// or https://!
> Dico soleat partem ea pro, ad vix impetus splendide. Primis melius principes
> pri ad, tacimates pertinacia ei pro? Appareat atomorum oportere at nam, eu
@@ -47,17 +47,17 @@ facilisi et pri!
> if err != nil || err2 != nil {
Id vix referrentur philosophia, veri labores an nec. Noster denique no duo, sit
-ei diam inermis vocibus! Mutat principes ex pro, at pericula assueverit vel.
+ei diam inermis vocibus! Mutat principes ex pro, at ]8;id=colorize-4;mailto://~rjarry/aerc-devel@lists.sr.ht\~rjarry/aerc-devel@lists.sr.ht]8;id=colorize-4;\.
Has putent verterem constituto ex, tale electram duo at! Ei nulla lucilius
intellegat nam, pro quod epicuri dissentiet ut, omnis voluptatibus definitiones
-vim at.
+vim at []8;id=colorize-5;irc://foo.bar\irc://foo.bar]8;id=colorize-5;\] <]8;id=colorize-6;mailto://jeanpierre@foobaz.org\jeanpierre@foobaz.org]8;id=colorize-6;\>.
-]8;id=colorize-2;https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==\https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==]8;id=colorize-2;\
+]8;id=colorize-7;https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==\https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==]8;id=colorize-7;\
Eam mundi libris debitis ad, eam regione numquam at. Eum omnes bonorum eu,
oporteat assueverit disputationi nam ne, nonumes iracundia mea ad! Duo libris
recusabo id, ceteros salutatus inciderint vim ea. Et graeco reformidans vel? Ei
-has labore quidam?
+has labore quidam ]8;id=colorize-8;https://foobaz.com/ooo<uuuu>okf\https://foobaz.com/ooo<uuuu>okf]8;id=colorize-8;\?
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> sympa, non?
diff --git a/filters/vectors/colorize-quotes.in b/filters/vectors/colorize-quotes.in
index 31b04911..2e717868 100644
--- a/filters/vectors/colorize-quotes.in
+++ b/filters/vectors/colorize-quotes.in
@@ -20,13 +20,13 @@ nibh petentium at! Sit docendi laboramus ei, animal insolens ad mea.
> detracto sententia <https://foobar.com> && "https://foobaz.org/".
>
> Error libris deleniti ea mei, vis at elit probo munere, his sint unum
-> albucius ex.
+> albucius ex. [https://pouet.com/oksuper](https://pouet.com/oksuper).
Graece definiebas scripserit ne est? Nec nonumes explicari contentiones ne,
vocent iuvaret placerat no vix. Nec et partem salutandi deseruisse, his no
possim malorum pericula. Te quando reprehendunt nam, at consul sadipscing vel?
Velit possim aliquando ei per, ne simul quodsi antiopam sea, ullum choro
-facilisi et pri!
+facilisi et pri http:// or https://!
> Dico soleat partem ea pro, ad vix impetus splendide. Primis melius principes
> pri ad, tacimates pertinacia ei pro? Appareat atomorum oportere at nam, eu
@@ -47,17 +47,17 @@ facilisi et pri!
> if err != nil || err2 != nil {
Id vix referrentur philosophia, veri labores an nec. Noster denique no duo, sit
-ei diam inermis vocibus! Mutat principes ex pro, at pericula assueverit vel.
+ei diam inermis vocibus! Mutat principes ex pro, at ~rjarry/aerc-devel@lists.sr.ht.
Has putent verterem constituto ex, tale electram duo at! Ei nulla lucilius
intellegat nam, pro quod epicuri dissentiet ut, omnis voluptatibus definitiones
-vim at.
+vim at [irc://foo.bar] <jeanpierre@foobaz.org>.
https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==
Eam mundi libris debitis ad, eam regione numquam at. Eum omnes bonorum eu,
oporteat assueverit disputationi nam ne, nonumes iracundia mea ad! Duo libris
recusabo id, ceteros salutatus inciderint vim ea. Et graeco reformidans vel? Ei
-has labore quidam?
+has labore quidam https://foobaz.com/ooo<uuuu>okf?
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> sympa, non?