diff options
Diffstat (limited to 'filters/wrap.c')
-rw-r--r-- | filters/wrap.c | 481 |
1 files changed, 481 insertions, 0 deletions
diff --git a/filters/wrap.c b/filters/wrap.c new file mode 100644 index 00000000..ba084c35 --- /dev/null +++ b/filters/wrap.c @@ -0,0 +1,481 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright (c) 2023 Robin Jarry */ + +#define _XOPEN_SOURCE +#include <errno.h> +#include <getopt.h> +#include <locale.h> +#include <regex.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <wchar.h> +#include <wctype.h> + +static void usage(void) +{ + puts("usage: wrap [-h] [-w INT] [-r] [-l INT] [-f FILE]"); + puts(""); + puts("Wrap text without messing up email quotes."); + puts(""); + puts("options:"); + puts(" -h show this help message"); + puts(" -w INT preferred wrap margin (default 80)"); + puts(" -r reflow all paragraphs even if no trailing space"); + puts(" -l INT minimum percentage of letters in a line to be"); + puts(" considered a paragaph"); + puts(" -f FILE read from filename (default stdin)"); +} + +static size_t margin = 80; +static long prose_ratio = 50; +static bool reflow; +static FILE *in_file; + +int parse_args(int argc, char **argv) +{ + const char *filename = NULL; + char c; + + while ((c = getopt(argc, argv, "hrw:l:f:")) != -1) { + errno = 0; + switch (c) { + case 'r': + reflow = true; + break; + case 'l': + prose_ratio = strtol(optarg, NULL, 10); + if (errno) { + perror("error: invalid ratio value"); + return 1; + } + if (prose_ratio <= 0 || prose_ratio >= 100) { + fprintf(stderr, "error: ratio must be ]0,100[\n"); + return 1; + } + break; + case 'w': + margin = strtol(optarg, NULL, 10); + if (errno) { + perror("error: invalid width value"); + return 1; + } + if (margin < 1) { + fprintf(stderr, "error: width must be positive\n"); + return 1; + } + break; + case 'f': + filename = optarg; + break; + default: + usage(); + return 1; + } + } + if (optind < argc) { + fprintf(stderr, "%s: unexpected argument -- '%s'\n", + argv[0], argv[optind]); + usage(); + return 1; + } + if (filename == NULL || !strcmp(filename, "-")) { + in_file = stdin; + } else { + in_file = fopen(filename, "r"); + if (!in_file) { + perror("error: cannot open file"); + return 1; + } + } + return 0; +} + +static bool is_empty(const wchar_t *s) +{ + while (*s != L'\0') { + if (!iswspace(*s++)) + return false; + } + return true; +} + +__attribute__((malloc,returns_nonnull)) +static void *xmalloc(size_t s) +{ + void *ptr = malloc(s); + if (ptr == NULL) { + perror("fatal: cannot allocate buffer"); + abort(); + } + return ptr; +} + +__attribute__((malloc,returns_nonnull)) +static void *xrealloc(void *ptr, size_t s) +{ + ptr = realloc(ptr, s); + if (ptr == NULL) { + perror("fatal: cannot reallocate buffer"); + abort(); + } + return ptr; +} + +struct paragraph { + /* email quote prefix, if any */ + wchar_t *quotes; + /* list item indent, if any */ + wchar_t *indent; + /* actual text of this paragraph */ + wchar_t *text; + /* percentage of letters in text */ + int prose_ratio; + /* text ends with a space */ + bool flowed; + /* paragraph is a list item */ + bool list_item; +}; + +static void free_paragraph(struct paragraph *p) +{ + if (!p) + return; + free(p->quotes); + free(p->indent); + free(p->text); + free(p); +} + +static wchar_t *read_part(const wchar_t *in, size_t len) +{ + wchar_t *out = xmalloc((len + 1) * sizeof(wchar_t)); + wcsncpy(out, in, len); + out[len] = L'\0'; + return out; +} + +static size_t list_item_offset(const wchar_t *buf) +{ + size_t i = 0; + wchar_t c; + + if (buf[i] == L'-' || buf[i] == '*' || buf[i] == '.') { + /* bullet list */ + i++; + } else if (iswdigit(buf[i])) { + /* numbered list */ + i++; + if (iswdigit(buf[i])) { + i++; + } + } else if (iswalpha(buf[i])) { + /* lettered list */ + c = towlower(buf[i]); + i++; + if (c == L'i' || c == L'v') { + /* roman i. ii. iii. iv. ... */ + c = towlower(buf[i]); + while (i < 4 && (c == L'i' || c == L'v')) { + c = towlower(buf[++i]); + } + } + } else { + return 0; + } + if (iswdigit(buf[0]) || iswalpha(buf[0])) { + if (buf[i] == L')' || buf[i] == L'/' || buf[i] == L'.') { + i++; + } else { + return 0; + } + } + if (buf[i] == L' ') { + i++; + } else { + return 0; + } + + return i; +} + +static struct paragraph *parse_line(const wchar_t *buf) +{ + size_t i, q, t, e, letters, indent_len, text_len; + bool list_item, flowed; + struct paragraph *p; + + /* + * Find relevant positions in the line: + * + * '> > > > 2) blah blah blah blah ' + * ^ ^ ^ ^ + * 0 q t e + * <------><-------------> + * quotes indent + * <--------------------------------> + * text + */ + + /* detect the end of quotes prefix if any */ + q = 0; + while (buf[q] == L'>') { + q++; + if (buf[q] == L' ') { + q++; + } + } + /* detect list item prefix & indent */ + t = q; + while (iswspace(buf[t])) { + t++; + } + i = list_item_offset(&buf[t]); + list_item = i != 0; + t += i; + while (iswspace(buf[t])) { + t++; + } + indent_len = t - q; + /* compute prose ratio */ + e = t; + letters = 0; + while (buf[e] != L'\0') { + if (iswalpha(buf[e++])) { + letters++; + } + } + /* strip trailing whitespace */ + flowed = false; + while (e > q && iswspace(buf[e - 1])) { + e--; + flowed = true; + } + text_len = e - q; + + p = xmalloc(sizeof(*p)); + memset(p, 0, sizeof(*p)); + p->quotes = read_part(buf, q); + p->indent = xmalloc((indent_len + 1) * sizeof(wchar_t)); + for (i = 0; i < indent_len; i++) + p->indent[i] = L' '; + p->indent[i] = L'\0'; + p->text = read_part(&buf[q], text_len); + p->flowed = flowed; + p->list_item = list_item; + p->prose_ratio = 100 * letters / (text_len ? text_len : 1); + + return p; +} + +static bool is_continuation( + const struct paragraph *p, const struct paragraph *next +) { + if (next->list_item) + /* new list items always start a new paragraph */ + return false; + if (next->prose_ratio < prose_ratio || p->prose_ratio < prose_ratio) + /* does not look like prose, maybe ascii art */ + return false; + if (wcscmp(next->quotes, p->quotes) != 0) + /* quote prefix has changed */ + return false; + if (wcscmp(next->indent, p->indent) != 0) + /* list item indent has changed */ + return false; + if (is_empty(next->text)) + /* empty or whitespace only line */ + return false; + if (wcscmp(p->text, L"--") == 0) + /* never join anything with signature start */ + return false; + if (p->flowed) + /* current paragraph has trailing space, indicating + * format=flowed */ + return true; + if (reflow) + /* user forced paragraph reflow on the command line */ + return true; + return false; +} + +static void join_paragraph( + struct paragraph *p, const struct paragraph *next +) { + const wchar_t *append = next->text; + const wchar_t *separator = L" "; + size_t len, extra_len; + wchar_t *text; + + /* trim leading whitespace of the next paragraph before joining */ + while (*append != L'\0' && iswspace(*append)) + append++; + + len = wcslen(p->text); + if (len == 0) { + separator = L""; + } + extra_len = wcslen(separator) + wcslen(append) + 1; + + text = xrealloc(p->text, (len + extra_len) * sizeof(wchar_t)); + swprintf(&text[len], extra_len, L"%ls%ls", separator, append); + + p->text = text; + p->prose_ratio = (p->prose_ratio + next->prose_ratio) / 2; + p->flowed = next->flowed; +} + +/* + * BUFSIZ has different values depending on the libc implementation. + * Use a self defined value to have consistent behaviour accross all platforms. + */ +#define BUFFER_SIZE 8192 + +/* + * Write a paragraph, wrapping at words boundaries. + * + * Only try to do word wrapping on things that look like prose. When the text + * contains too many non-letter characters, print it as-is. + */ +static void write_paragraph(struct paragraph *p) +{ + size_t quotes_width = wcswidth(p->quotes, wcslen(p->quotes)); + size_t remain = wcswidth(p->text, wcslen(p->text)); + const wchar_t *indent = L""; + wchar_t *text = p->text; + bool more = true; + wchar_t *line; + size_t width; + + while (more) { + width = quotes_width + wcswidth(indent, wcslen(indent)); + + if (width + remain <= margin || p->prose_ratio < prose_ratio) { + /* whole paragraph fits on a single line */ + line = text; + more = false; + } else { + /* find split point, preferably before margin */ + int split = -1; + int w = 0; + for (int i = 0; text[i] != L'\0'; i++) { + w += wcwidth(text[i]); + if (width + w > margin && split != -1) { + break; + } + if (iswspace(text[i])) { + split = i; + } + } + if (split == -1) { + /* no space found to split, print a long line */ + line = text; + more = false; + } else { + text[split] = L'\0'; + line = text; + split++; + /* find start of next word */ + while (iswspace(text[split])) { + split++; + } + if (text[split] != L'\0') { + text = &text[split]; + remain -= split; + } else { + /* only trailing whitespace, we're done */ + more = false; + } + } + } + wprintf(L"%ls%ls%ls\n", p->quotes, indent, line); + indent = p->indent; + } +} + +#define SPACES_PER_TAB 8 + +/* + * Trim LF CR CRLF LFCR and replace tabs with spaces. + */ +static void sanitize_line(const wchar_t *in, wchar_t *out) +{ + /* No bounds checking needed. This function is only used with + * 'buf' and 'line' buffers from main. 'out' is large enough no + * matter what is present in 'in'. */ + while (*in != L'\0' && *in != L'\n' && *in != L'\r') { + if (*in == L'\t') { + /* tabs cause indentation/alignment issues + * replace them with 8 spaces */ + in++; + for (int i = 0; i < SPACES_PER_TAB; i++) + *out++ = L' '; + } else { + *out++ = *in++; + } + } + *out = L'\0'; +} + +int main(int argc, char **argv) +{ + /* line needs to be 8 times larger than buf since every read character + * may be a tab (very unlikely, but it could happen). */ + static wchar_t buf[BUFFER_SIZE], line[BUFFER_SIZE * SPACES_PER_TAB]; + struct paragraph *cur = NULL, *next; + bool is_patch = false; + regmatch_t groups[2]; + char *subject; + regex_t re; + int err; + + err = parse_args(argc, argv); + if (err) + goto end; + + regcomp(&re, "\\<PATCH\\>", REG_EXTENDED); + subject = getenv("AERC_SUBJECT"); + if (subject && !regexec(&re, subject, 2, groups, 0)) + is_patch = true; + regfree(&re); + + /* aerc will always send UTF-8 text, force locale here */ + if (!setlocale(LC_CTYPE, "C.UTF-8")) { + err = 1; + perror("error: failed to set locale"); + goto end; + } + fwide(in_file, true); + fwide(stdout, true); + + while (fgetws(buf, BUFFER_SIZE, in_file)) { + if (is_patch) { + /* never reflow patches */ + fputws(buf, stdout); + continue; + } + sanitize_line(buf, line); + next = parse_line(line); + if (!cur) { + cur = next; + } else if (is_continuation(cur, next)) { + join_paragraph(cur, next); + free_paragraph(next); + } else { + write_paragraph(cur); + free_paragraph(cur); + cur = next; + } + } + if (cur) { + write_paragraph(cur); + } + +end: + free_paragraph(cur); + if (in_file) { + fclose(in_file); + } + return err; +} |