From c9524d265793775e4c3e326c7191471d982c1e66 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Thu, 1 Dec 2022 20:17:04 +0100 Subject: filters: add wrap utility written in go I had started writing this as an awk script but quickly got stuck with obscure code which did not even work properly. I jumped the gun and redid it in go. Bonus, we will not have MacOS's 1987 BSD awk issues. On the other hand, instead of a 20.0K awk script, we now have a 2.2M static go binary. If this makes people scream, I challenge them to do that with BSD awk :) Basically, this takes text from stdin or from a file and wraps long lines on word boundaries. It takes care not to break up email quotes or list items (numbered as well). Also, it is conservative by default and only wraps long lines and lines that end with a space (indicating a format=flowed message). If the AERC_SUBJECT environment variable is defined and contains the word PATCH, the text is not modified at all (i.e. wrap behaves as cat(1)). There are a few command line options to control behavior: Usage of ./wrap: -f string read from file instead of stdin -l int minimum percentage of letters in a line to be considered a paragaph (default 50) -r reflow all paragraphs even if no trailing space -w int preferred wrap margin (default 80) Update docs, makefile and default config file with examples. Add a torture test to ensure it works as expected. 
Signed-off-by: Robin Jarry Tested-by: Bence Ferdinandy --- .gitignore | 1 + CHANGELOG.md | 1 + Makefile | 11 ++- config/aerc.conf | 2 +- doc/aerc-config.5.scd | 13 +-- filters/wrap.go | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++ filters/wrap_test.go | 212 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 495 insertions(+), 12 deletions(-) create mode 100644 filters/wrap.go create mode 100644 filters/wrap_test.go diff --git a/.gitignore b/.gitignore index 7caf227c..2b1b7a4f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /aerc2 /aerc /aerc.debug +/wrap /.aerc.d race.log.* raw.log diff --git a/CHANGELOG.md b/CHANGELOG.md index 6931391c..5e29ea5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). `subject-re-pattern`. - Search/filter by absolute and relative date ranges with the `-d` flag. - LIST-STATUS support for imap +- built-in `wrap` filter that does not mess up nested quotes and lists. 
### Fixed diff --git a/Makefile b/Makefile index 8e1c89bd..f7a0b573 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ GO_LDFLAGS+=-X main.Flags=$(flags) GO_LDFLAGS+=-X git.sr.ht/~rjarry/aerc/config.shareDir=$(SHAREDIR) GO_LDFLAGS+=$(GO_EXTRA_LDFLAGS) -GOSRC!=find * -name '*.go' +GOSRC!=find * -name '*.go' | grep -v filters/wrap.go GOSRC+=go.mod go.sum DOCS := \ @@ -37,7 +37,7 @@ DOCS := \ aerc-templates.7 \ aerc-stylesets.7 -all: aerc $(DOCS) +all: aerc wrap $(DOCS) build_cmd:=$(GO) build $(BUILD_OPTS) $(GOFLAGS) -ldflags "$(GO_LDFLAGS)" -o aerc @@ -52,6 +52,10 @@ _!=grep -sqFx '$(build_cmd)' .aerc.d || rm -f .aerc.d aerc: $(GOSRC) .aerc.d $(build_cmd) +wrap: filters/wrap.go .aerc.d + $(GO) build $(BUILD_OPTS) $(GOFLAGS) -ldflags "$(GO_EXTRA_LDFLAGS)" \ + -o wrap filters/wrap.go + .PHONY: dev dev: $(MAKE) aerc BUILD_OPTS="-trimpath -race" @@ -102,7 +106,7 @@ RM?=rm -f clean: $(RM) $(DOCS) aerc -install: $(DOCS) aerc +install: $(DOCS) aerc wrap mkdir -m755 -p $(DESTDIR)$(BINDIR) $(DESTDIR)$(MANDIR)/man1 $(DESTDIR)$(MANDIR)/man5 $(DESTDIR)$(MANDIR)/man7 \ $(DESTDIR)$(SHAREDIR) $(DESTDIR)$(SHAREDIR)/filters $(DESTDIR)$(SHAREDIR)/templates $(DESTDIR)$(SHAREDIR)/stylesets \ $(DESTDIR)$(PREFIX)/share/applications @@ -130,6 +134,7 @@ install: $(DOCS) aerc install -m755 filters/html-unsafe $(DESTDIR)$(SHAREDIR)/filters/html-unsafe install -m755 filters/plaintext $(DESTDIR)$(SHAREDIR)/filters/plaintext install -m755 filters/show-ics-details.py $(DESTDIR)$(SHAREDIR)/filters/show-ics-details.py + install -m755 wrap $(DESTDIR)$(SHAREDIR)/filters/wrap install -m644 templates/new_message $(DESTDIR)$(SHAREDIR)/templates/new_message install -m644 templates/quoted_reply $(DESTDIR)$(SHAREDIR)/templates/quoted_reply install -m644 templates/forward_as_body $(DESTDIR)$(SHAREDIR)/templates/forward_as_body diff --git a/config/aerc.conf b/config/aerc.conf index ebd361ad..862b3be8 100644 --- a/config/aerc.conf +++ b/config/aerc.conf @@ -396,7 +396,7 @@ message/rfc822=colorize 
#application/x-sh=bat -fP -l sh #image/*=catimg -w $(tput cols) - #subject,~Git(hub|lab)=lolcat -f -#from,thatguywhodoesnothardwraphismessages=fmt -w 72 | colorize +#from,thatguywhodoesnothardwraphismessages=wrap -w 100 | colorize [openers] # diff --git a/doc/aerc-config.5.scd b/doc/aerc-config.5.scd index ac9a3a73..17a96f07 100644 --- a/doc/aerc-config.5.scd +++ b/doc/aerc-config.5.scd @@ -636,22 +636,19 @@ _text/plain_ ``` Wrap long lines at 100 characters, while not messing up nested quotes. - Not perfect, but works for most emails: + Handles format=flowed emails properly: ``` - text/plain=fmt -s -p ">>" -w 100 | fmt -s -p ">" -w 100 | fmt -s -w 100 | colorize + text/plain=wrap -w 100 | colorize ``` _from,_ Another example of hard wrapping lines of emails sent by a specific - person but using neovim which handles nested quotes without issues: + person. Explicitly reflow all paragraphs instead of only wrapping long + lines. This may break manual formatting in some messages: ``` - from,thatguywhoneverhardwrapshismessages=case "$AERC_SUBJECT" in \\ - \*PATCH\*) cat;; \\ - \*) nvim - -u NONE -es '+set ft=mail fo=tcqwn1j tw=80' \\ - '+:norm! 
gggqG' '+%print' '+:q!';; \\ - esac | colorize + from,thatguywhoneverhardwrapshismessages=wrap -r -w 72 | colorize ``` _subject,~_ diff --git a/filters/wrap.go b/filters/wrap.go new file mode 100644 index 00000000..f1b335cc --- /dev/null +++ b/filters/wrap.go @@ -0,0 +1,267 @@ +package main + +import ( + "bufio" + "errors" + "flag" + "fmt" + "io" + "os" + "regexp" + "strings" + + "github.com/mattn/go-runewidth" +) + +type paragraph struct { + // email quote prefix, if any + quotes string + // list item indent, if any + leader string + // actual text of this paragraph + text string + // percentage of letters in text + proseRatio int + // text ends with a space + flowed bool + // paragraph is a list item + listItem bool +} + +func main() { + var err error + var width int + var reflow bool + var file string + var proseRatio int + var input *os.File + + fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + fs.IntVar(&width, "w", 80, "preferred wrap margin") + fs.BoolVar(&reflow, "r", false, + "reflow all paragraphs even if no trailing space") + fs.IntVar(&proseRatio, "l", 50, + "minimum percentage of letters in a line to be considered a paragaph") + fs.StringVar(&file, "f", "", "read from file instead of stdin") + _ = fs.Parse(os.Args[1:]) + + if file != "" { + input, err = os.OpenFile(file, os.O_RDONLY, 0o644) + if err != nil { + goto end + } + } else { + input = os.Stdin + } + + err = wrap(input, os.Stdout, width, reflow, proseRatio) + +end: + if err != nil && !errors.Is(err, io.EOF) { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + os.Exit(1) + } +} + +func wrap( + in io.Reader, out io.Writer, width int, reflow bool, proseRatio int, +) error { + var para *paragraph = nil + var line string + var err error + + if patchSubjectRe.MatchString(os.Getenv("AERC_SUBJECT")) { + // never reflow patches + _, err = io.Copy(out, in) + } else { + reader := bufio.NewReader(in) + line, err = reader.ReadString('\n') + for ; err == nil; line, err = reader.ReadString('\n') { + next := 
parse(line) + switch { + case para == nil: + para = next + case para.isContinuation(next, reflow, proseRatio): + para.join(next) + default: + para.write(out, width, proseRatio) + para = next + } + } + if para != nil { + para.write(out, width, proseRatio) + } + } + + return err +} + +// Parse a line of text into a paragraph structure +func parse(line string) *paragraph { + p := new(paragraph) + q := 0 + t := 0 + line = strings.TrimRight(line, "\r\n") + // tabs cause a whole lot of troubles, replace them with 8 spaces + line = strings.ReplaceAll(line, "\t", " ") + + // Use integer offsets to find relevant positions in the line + // + // > > > 2) blah blah blah blah + // ^--------+-----^ + // q | t + // end of quotes | start of text + // | + // list item leader + + // detect the end of quotes prefix if any + for q < len(line) && line[q] == '>' { + q += 1 + if q < len(line) && line[q] == ' ' { + q += 1 + } + } + + // detect list item leader + loc := listItemRe.FindStringIndex(line[q:]) + if loc != nil { + // start of list item + p.listItem = true + } else { + // maybe list item continuation + loc = leadingSpaceRe.FindStringIndex(line[q:]) + } + if loc != nil { + t = q + loc[1] + } else { + // no list at all + t = q + } + + // check if there is trailing whitespace, indicating format=flowed + loc = trailingSpaceRe.FindStringIndex(line[t:]) + if loc != nil { + p.flowed = true + // trim whitespace + line = line[:t+loc[0]] + } + + p.quotes = line[:q] + p.leader = strings.Repeat(" ", runewidth.StringWidth(line[q:t])) + p.text = line[q:] + + // compute the ratio of letters in the actual text + onlyLetters := strings.TrimLeft(line[q:], " ") + totalLen := runewidth.StringWidth(onlyLetters) + if totalLen == 0 { + // to avoid division by zero + totalLen = 1 + } + onlyLetters = notLetterRe.ReplaceAllLiteralString(onlyLetters, "") + p.proseRatio = 100 * runewidth.StringWidth(onlyLetters) / totalLen + + return p +} + +// Return true if a paragraph is a continuation of the current 
one. +func (p *paragraph) isContinuation( + next *paragraph, reflow bool, proseRatio int, +) bool { + switch { + case next.listItem: + // new list items always start a new paragraph + return false + case next.proseRatio < proseRatio || p.proseRatio < proseRatio: + // does not look like prose, maybe ascii art + return false + case next.quotes != p.quotes || next.leader != p.leader: + // quote level and/or list item leader have changed + return false + case len(strings.Trim(next.text, " ")) == 0: + // empty line + return false + case p.flowed: + // current paragraph has trailing space, indicating + // format=flowed + return true + case reflow: + // user forced paragraph reflow on the command line + return true + default: + return false + } +} + +// Join next paragraph into current one. +func (p *paragraph) join(next *paragraph) { + if p.text == "" { + p.text = next.text + } else { + p.text = p.text + " " + strings.Trim(next.text, " ") + } + p.proseRatio = (p.proseRatio + next.proseRatio) / 2 + p.flowed = next.flowed +} + +// Write a paragraph, wrapping at words boundaries. +// +// Only try to do word wrapping on things that look like prose. When the text +// contains too many non-letter characters, print it as-is. 
+func (p *paragraph) write(out io.Writer, margin int, proseRatio int) { + leader := "" + more := true + quotesWidth := runewidth.StringWidth(p.quotes) + for more { + var line string + width := quotesWidth + runewidth.StringWidth(leader) + remain := runewidth.StringWidth(p.text) + if width+remain <= margin || p.proseRatio < proseRatio { + // whole paragraph fits on a single line + line = p.text + p.text = "" + more = false + } else { + // find split point, preferably before margin + split := -1 + w := 0 + for i, r := range p.text { + w += runewidth.RuneWidth(r) + if width+w > margin && split != -1 { + break + } + if r == ' ' { + split = i + } + } + if split == -1 { + // no space found to split, print a long line + line = p.text + p.text = "" + more = false + } else { + line = p.text[:split] + // find start of next word + for split < len(p.text) && p.text[split] == ' ' { + split++ + } + if split < len(p.text) { + p.text = p.text[split:] + } else { + // only trailing whitespace, we're done + p.text = "" + more = false + } + } + } + fmt.Fprintf(out, "%s%s%s\n", p.quotes, leader, line) + leader = p.leader + } +} + +var ( + patchSubjectRe = regexp.MustCompile(`\bPATCH\b`) + listItemRe = regexp.MustCompile(`^\s*([\-\*\.]|\d{1,2}[\)\]\.])\s+`) + leadingSpaceRe = regexp.MustCompile(`^\s+`) + trailingSpaceRe = regexp.MustCompile(`\s+$`) + notLetterRe = regexp.MustCompile(`[^\pL]`) +) diff --git a/filters/wrap_test.go b/filters/wrap_test.go new file mode 100644 index 00000000..f8d82cbe --- /dev/null +++ b/filters/wrap_test.go @@ -0,0 +1,212 @@ +package main + +import ( + "bytes" + "errors" + "io" + "testing" +) + +type vector struct { + name string + in string + out string + width int + reflow bool + ratio int +} + +var vectors = []vector{ + { + name: "simple", + in: `long line that exceeds margin by many words +`, + width: 30, + reflow: false, + ratio: 50, + out: `long line that exceeds margin +by many words +`, + }, + { + name: "two-paragraphs", + in: `this is one long 
paragraph +this is another long one +`, + width: 20, + reflow: false, + ratio: 50, + out: `this is one long +paragraph +this is another +long one +`, + }, + { + name: "reflow", + in: `this is one long paragraph +this is another long one +`, + width: 20, + reflow: true, + ratio: 50, + out: `this is one long +paragraph this is +another long one +`, + }, + { + name: "quotes", + in: `Let's play with quotes: + +>> Hi there how are you doing? +> Great thanks + +How rude. + +>> Fantastic. Let's go wrap some words. +`, + width: 20, + reflow: false, + ratio: 50, + out: `Let's play with +quotes: + +>> Hi there how are +>> you doing? +> Great thanks + +How rude. + +>> Fantastic. Let's +>> go wrap some +>> words. +`, + }, + { + name: "ascii-art", + in: `This is a nice drawing, isn't it? + ++-------------------+ +| foobaz | ++-------------------+ + | + | ++-------------------+ +| foobar | ++-------------------+ +`, + width: 15, + ratio: 50, + reflow: true, + out: `This is a nice +drawing, isn't +it? 
+ ++-------------------+ +| foobaz | ++-------------------+ + | + | ++-------------------+ +| foobar | ++-------------------+ +`, + }, + { + name: "list-items", + in: `Shopping list: + + - milk + - chocolate + - cookies (please, with nuts) +`, + width: 20, + reflow: false, + ratio: 50, + out: `Shopping list: + + - milk + - chocolate + - cookies + (please, with + nuts) +`, + }, + { + name: "list-items-reflow", + in: `Shopping list: + + * milk + * chocolate + * cookies + (please, + with nuts) +`, + width: 100, + reflow: true, + ratio: 30, + out: `Shopping list: + + * milk + * chocolate + * cookies (please, with nuts) +`, + }, + { + name: "long-url", + in: `Please follow this ugly link: +http://foobaz.org/xapapzolmkdmldfk-fldskjflsk-cisjoij/onoes.jsp?xxx=2&yyy=3 +`, + width: 20, + reflow: true, + ratio: 50, + out: `Please follow this +ugly link: +http://foobaz.org/xapapzolmkdmldfk-fldskjflsk-cisjoij/onoes.jsp?xxx=2&yyy=3 +`, + }, + { + name: "format=flowed", + in: "Oh, \nI'm \nso \nhip \nI \nuse \nformat=flowed.\n", + width: 30, + reflow: false, + ratio: 50, + out: "Oh, I'm so hip I use\nformat=flowed.\n", + }, + { + name: "non-ascii", + in: `Lorem ççççç ççççç ççç ççççç çç ççç ççççç çççççççç ççç çç ççççç ççççççççççç ççççç + +Lorem жжжжж жжжжж жжж жжжжж жж жжж жжжжж жжжжжжжж жжж жж жжжжж жжжжжжжжжжж жжжжж жжжжжжжж +`, + width: 40, + reflow: false, + ratio: 50, + out: `Lorem ççççç ççççç ççç ççççç çç ççç +ççççç çççççççç ççç çç ççççç ççççççççççç +ççççç + +Lorem жжжжж жжжжж жжж жжжжж жж жжж +жжжжж жжжжжжжж жжж жж жжжжж жжжжжжжжжжж +жжжжж жжжжжжжж +`, + }, +} + +func TestWrap(t *testing.T) { + for _, vec := range vectors { + t.Run(vec.name, func(t *testing.T) { + r := bytes.NewReader([]byte(vec.in)) + var buf bytes.Buffer + err := wrap(r, &buf, vec.width, vec.reflow, vec.ratio) + if err != nil && !errors.Is(err, io.EOF) { + t.Fatalf("[%s]: %v", vec.name, err) + } + if buf.String() != vec.out { + t.Errorf("[%s] invalid format:\n%q\nexpected\n%q", + vec.name, buf.String(), 
vec.out) + } + }) + } +} -- cgit