diff options
author | Robin Jarry <robin@jarry.cc> | 2023-04-02 13:02:22 +0200 |
---|---|---|
committer | Robin Jarry <robin@jarry.cc> | 2023-04-02 20:59:11 +0200 |
commit | 1540d645dff3c610d7d25c47699ec1313601303b (patch) | |
tree | 8075b8f463f21784311657086f327f166337709d | |
parent | 476a94eef81658cb2f9b3435420401899ea33bf6 (diff) | |
download | aerc-1540d645dff3c610d7d25c47699ec1313601303b.tar.gz |
colorize: make url parsing more robust
Reuse the URL parsing algorithm from foot. Basically, it involves
recording the opening [, ( and < and taking into account their closing
counterparts. If a closing character is encountered with no matching
opening one, assume the URL ends. This allows handling markdown link
syntax such as:
[http://foobaz.org/xxx](http://foobaz.org/xxx)
Avoid coloring bare URL protocols such as http:// or https://
Update test vector to handle more corner cases.
Link: https://codeberg.org/dnkl/foot/src/tag/1.13.1/url-mode.c#L331-L471
Signed-off-by: Robin Jarry <robin@jarry.cc>
Tested-by: Kirill Chibisov <contact@kchibisov.com>
-rw-r--r-- | filters/colorize.c | 87 | ||||
-rw-r--r-- | filters/vectors/colorize-quotes.expected | 12 | ||||
-rw-r--r-- | filters/vectors/colorize-quotes.in | 10 |
3 files changed, 84 insertions, 25 deletions
diff --git a/filters/colorize.c b/filters/colorize.c index 26c25a8a..8e0cc3c7 100644 --- a/filters/colorize.c +++ b/filters/colorize.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: MIT */ /* Copyright (c) 2023 Robin Jarry */ +#include <ctype.h> #include <errno.h> #include <fnmatch.h> #include <getopt.h> @@ -439,8 +440,19 @@ static void diff_chunk(const char *in) print_notabs(in, BUFSIZ); } +static inline bool isurichar(char c) +{ + if (c == '\0') + return false; + if (isalnum(c)) + return true; + if (strchr("-_.,~:;/?#@!$&%*+=\"'<>()[]", c) != NULL) + return true; + return false; +} + #define URL_RE \ - "([a-z]{2,8}:)//[][:alnum:]._~:/?#[@!$&'()*+,;=%-]{4,}" \ + "([a-z]{2,8})://" \ "|(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@[a-z][[:alnum:].-]*[a-z]" static regex_t url_re; @@ -457,22 +469,69 @@ static void urls(const char *in, struct style *ctx) while (!regexec(&url_re, in, 3, groups, 0)) { in += print_notabs(in, groups[0].rm_so); - print(seq(&styles.url)); len = groups[0].rm_eo - groups[0].rm_so; - /* Heuristic to remove trailing characters that are valid URL - * characters, but typically not at the end of the URL */ - trim = true; - while (trim && len > 0) { - switch (in[len - 1]) { - case '.': case ',': case ';': case ')': - case '!': case '?': case '\'': - len--; - break; - default: - trim = false; - break; + + if (groups[1].rm_so != -1) { + /* Standard URL (i.e. not mailto: nor email address). + * Regular expressions do not really cut it here and + * we need to detect opening/closing braces to handle + * markdown link syntax. 
*/ + int paren = 0, bracket = 0, ltgt = 0; + bool emit_url = false; + size_t l = len; + + while (!emit_url && isurichar(in[l])) { + switch (in[l]) { + case '[': bracket++; l++; break; + case '(': paren++; l++; break; + case '<': ltgt++; l++; break; + case ']': + if (--bracket < 0) + emit_url = true; + else + l++; + break; + case ')': + if (--paren < 0) + emit_url = true; + else + l++; + break; + case '>': + if (--ltgt < 0) + emit_url = true; + else + l++; + break; + default: + l++; + break; + } + } + /* Heuristic to remove trailing characters that are + * valid URL characters, but typically not at the end + * of the URL */ + trim = true; + while (trim && l > len) { + switch (in[l - 1]) { + case '.': case ',': case ':': + case ';': case '?': case '!': + case '"': case '\'': case '%': + l--; + break; + default: + trim = false; + break; + } + } + if (l == len) { + /* only an URL protocol, do not colorize */ + in += print_notabs(in, len); + continue; } + len = l; } + print(seq(&styles.url)); bool email = groups[2].rm_so == -1 && groups[1].rm_so == -1; print_osc8(in, len, url_id, email); in += print_notabs(in, len); diff --git a/filters/vectors/colorize-quotes.expected b/filters/vectors/colorize-quotes.expected index 46d3adba..c136fcb0 100644 --- a/filters/vectors/colorize-quotes.expected +++ b/filters/vectors/colorize-quotes.expected @@ -20,13 +20,13 @@ nibh petentium at! Sit docendi laboramus ei, animal insolens ad mea. [38;2;95;175;255m> detracto sententia <[4;38;2;255;255;175m]8;id=colorize-0;https://foobar.com\https://foobar.com]8;id=colorize-0;\[0m[38;2;95;175;255m> && "[4;38;2;255;255;175m]8;id=colorize-1;https://foobaz.org/\https://foobaz.org/]8;id=colorize-1;\[0m[38;2;95;175;255m".[0m [38;2;95;175;255m>[0m [38;2;95;175;255m> Error libris deleniti ea mei, vis at elit probo munere, his sint unum[0m -[38;2;95;175;255m> albucius ex.[0m +[38;2;95;175;255m> albucius ex. 
[[4;38;2;255;255;175m]8;id=colorize-2;https://pouet.com/oksuper\https://pouet.com/oksuper]8;id=colorize-2;\[0m[38;2;95;175;255m]([4;38;2;255;255;175m]8;id=colorize-3;https://pouet.com/oksuper\https://pouet.com/oksuper]8;id=colorize-3;\[0m[38;2;95;175;255m).[0m Graece definiebas scripserit ne est? Nec nonumes explicari contentiones ne, vocent iuvaret placerat no vix. Nec et partem salutandi deseruisse, his no possim malorum pericula. Te quando reprehendunt nam, at consul sadipscing vel? Velit possim aliquando ei per, ne simul quodsi antiopam sea, ullum choro -facilisi et pri! +facilisi et pri http:// or https://! [38;2;95;175;255m> Dico soleat partem ea pro, ad vix impetus splendide. Primis melius principes[0m [38;2;95;175;255m> pri ad, tacimates pertinacia ei pro? Appareat atomorum oportere at nam, eu[0m @@ -47,17 +47,17 @@ facilisi et pri! [38;2;95;175;255m> if err != nil || err2 != nil {[0m Id vix referrentur philosophia, veri labores an nec. Noster denique no duo, sit -ei diam inermis vocibus! Mutat principes ex pro, at pericula assueverit vel. +ei diam inermis vocibus! Mutat principes ex pro, at [4;38;2;255;255;175m]8;id=colorize-4;mailto://~rjarry/aerc-devel@lists.sr.ht\~rjarry/aerc-devel@lists.sr.ht]8;id=colorize-4;\[0m. Has putent verterem constituto ex, tale electram duo at! Ei nulla lucilius intellegat nam, pro quod epicuri dissentiet ut, omnis voluptatibus definitiones -vim at. +vim at [[4;38;2;255;255;175m]8;id=colorize-5;irc://foo.bar\irc://foo.bar]8;id=colorize-5;\[0m] <[4;38;2;255;255;175m]8;id=colorize-6;mailto://jeanpierre@foobaz.org\jeanpierre@foobaz.org]8;id=colorize-6;\[0m>. 
-[4;38;2;255;255;175m]8;id=colorize-2;https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==\https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==]8;id=colorize-2;\[0m +[4;38;2;255;255;175m]8;id=colorize-7;https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==\https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA==]8;id=colorize-7;\[0m Eam mundi libris debitis ad, eam regione numquam at. Eum omnes bonorum eu, oporteat assueverit disputationi nam ne, nonumes iracundia mea ad! Duo libris recusabo id, ceteros salutatus inciderint vim ea. Et graeco reformidans vel? Ei -has labore quidam? +has labore quidam [4;38;2;255;255;175m]8;id=colorize-8;https://foobaz.com/ooo<uuuu>okf\https://foobaz.com/ooo<uuuu>okf]8;id=colorize-8;\[0m? [38;2;128;128;128m>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> sympa, non?[0m diff --git a/filters/vectors/colorize-quotes.in b/filters/vectors/colorize-quotes.in index 31b04911..2e717868 100644 --- a/filters/vectors/colorize-quotes.in +++ b/filters/vectors/colorize-quotes.in @@ -20,13 +20,13 @@ nibh petentium at! Sit docendi laboramus ei, animal insolens ad mea. > detracto sententia <https://foobar.com> && "https://foobaz.org/". > > Error libris deleniti ea mei, vis at elit probo munere, his sint unum -> albucius ex. +> albucius ex. [https://pouet.com/oksuper](https://pouet.com/oksuper). Graece definiebas scripserit ne est? Nec nonumes explicari contentiones ne, vocent iuvaret placerat no vix. Nec et partem salutandi deseruisse, his no possim malorum pericula. Te quando reprehendunt nam, at consul sadipscing vel? Velit possim aliquando ei per, ne simul quodsi antiopam sea, ullum choro -facilisi et pri! +facilisi et pri http:// or https://! > Dico soleat partem ea pro, ad vix impetus splendide. 
Primis melius principes > pri ad, tacimates pertinacia ei pro? Appareat atomorum oportere at nam, eu @@ -47,17 +47,17 @@ facilisi et pri! > if err != nil || err2 != nil { Id vix referrentur philosophia, veri labores an nec. Noster denique no duo, sit -ei diam inermis vocibus! Mutat principes ex pro, at pericula assueverit vel. +ei diam inermis vocibus! Mutat principes ex pro, at ~rjarry/aerc-devel@lists.sr.ht. Has putent verterem constituto ex, tale electram duo at! Ei nulla lucilius intellegat nam, pro quod epicuri dissentiet ut, omnis voluptatibus definitiones -vim at. +vim at [irc://foo.bar] <jeanpierre@foobaz.org>. https://git-man-page-generator.lokaltog.net/#Y2xhcCQkY29tbWFuZA== Eam mundi libris debitis ad, eam regione numquam at. Eum omnes bonorum eu, oporteat assueverit disputationi nam ne, nonumes iracundia mea ad! Duo libris recusabo id, ceteros salutatus inciderint vim ea. Et graeco reformidans vel? Ei -has labore quidam? +has labore quidam https://foobaz.com/ooo<uuuu>okf? >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> sympa, non? |