From c9524d265793775e4c3e326c7191471d982c1e66 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Thu, 1 Dec 2022 20:17:04 +0100 Subject: filters: add wrap utility written in go I had started writing this as an awk script but quickly got stuck with obscure code which did not even work properly. I jumped the gun and redid it in Go. Bonus, we will not have macOS's 1987 BSD awk issues. On the other hand, instead of a 20.0K awk script, we now have a 2.2M static Go binary. If this makes people scream, I challenge them to do that with BSD awk :) Basically, this takes text from stdin or from a file and wraps long lines on word boundaries. It takes care not to break up email quotes or list items (numbered ones as well). Also, it is conservative by default and only wraps long lines and lines that end with a space (indicating a format=flowed message). If the AERC_SUBJECT environment variable is defined and contains the word PATCH, the text is not modified at all (i.e. wrap behaves as cat(1)). There are a few command line options to control behavior: Usage of ./wrap: -f string read from file instead of stdin -l int minimum percentage of letters in a line to be considered a paragaph (default 50) -r reflow all paragraphs even if no trailing space -w int preferred wrap margin (default 80) Update docs, makefile and default config file with examples. Add a torture test to ensure it works as expected. 
Signed-off-by: Robin Jarry Tested-by: Bence Ferdinandy --- filters/wrap.go | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++ filters/wrap_test.go | 212 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 479 insertions(+) create mode 100644 filters/wrap.go create mode 100644 filters/wrap_test.go (limited to 'filters') diff --git a/filters/wrap.go b/filters/wrap.go new file mode 100644 index 00000000..f1b335cc --- /dev/null +++ b/filters/wrap.go @@ -0,0 +1,267 @@ +package main + +import ( + "bufio" + "errors" + "flag" + "fmt" + "io" + "os" + "regexp" + "strings" + + "github.com/mattn/go-runewidth" +) + +type paragraph struct { + // email quote prefix, if any + quotes string + // list item indent, if any + leader string + // actual text of this paragraph + text string + // percentage of letters in text + proseRatio int + // text ends with a space + flowed bool + // paragraph is a list item + listItem bool +} + +func main() { + var err error + var width int + var reflow bool + var file string + var proseRatio int + var input *os.File + + fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + fs.IntVar(&width, "w", 80, "preferred wrap margin") + fs.BoolVar(&reflow, "r", false, + "reflow all paragraphs even if no trailing space") + fs.IntVar(&proseRatio, "l", 50, + "minimum percentage of letters in a line to be considered a paragaph") + fs.StringVar(&file, "f", "", "read from file instead of stdin") + _ = fs.Parse(os.Args[1:]) + + if file != "" { + input, err = os.OpenFile(file, os.O_RDONLY, 0o644) + if err != nil { + goto end + } + } else { + input = os.Stdin + } + + err = wrap(input, os.Stdout, width, reflow, proseRatio) + +end: + if err != nil && !errors.Is(err, io.EOF) { + fmt.Fprintf(os.Stderr, "error: %s\n", err) + os.Exit(1) + } +} + +func wrap( + in io.Reader, out io.Writer, width int, reflow bool, proseRatio int, +) error { + var para *paragraph = nil + var line string + var err error + + if patchSubjectRe.MatchString(os.Getenv("AERC_SUBJECT")) { + 
// never reflow patches + _, err = io.Copy(out, in) + } else { + reader := bufio.NewReader(in) + line, err = reader.ReadString('\n') + for ; err == nil; line, err = reader.ReadString('\n') { + next := parse(line) + switch { + case para == nil: + para = next + case para.isContinuation(next, reflow, proseRatio): + para.join(next) + default: + para.write(out, width, proseRatio) + para = next + } + } + if para != nil { + para.write(out, width, proseRatio) + } + } + + return err +} + +// Parse a line of text into a paragraph structure +func parse(line string) *paragraph { + p := new(paragraph) + q := 0 + t := 0 + line = strings.TrimRight(line, "\r\n") + // tabs cause a whole lot of troubles, replace them with 8 spaces + line = strings.ReplaceAll(line, "\t", " ") + + // Use integer offsets to find relevant positions in the line + // + // > > > 2) blah blah blah blah + // ^--------+-----^ + // q | t + // end of quotes | start of text + // | + // list item leader + + // detect the end of quotes prefix if any + for q < len(line) && line[q] == '>' { + q += 1 + if q < len(line) && line[q] == ' ' { + q += 1 + } + } + + // detect list item leader + loc := listItemRe.FindStringIndex(line[q:]) + if loc != nil { + // start of list item + p.listItem = true + } else { + // maybe list item continuation + loc = leadingSpaceRe.FindStringIndex(line[q:]) + } + if loc != nil { + t = q + loc[1] + } else { + // no list at all + t = q + } + + // check if there is trailing whitespace, indicating format=flowed + loc = trailingSpaceRe.FindStringIndex(line[t:]) + if loc != nil { + p.flowed = true + // trim whitespace + line = line[:t+loc[0]] + } + + p.quotes = line[:q] + p.leader = strings.Repeat(" ", runewidth.StringWidth(line[q:t])) + p.text = line[q:] + + // compute the ratio of letters in the actual text + onlyLetters := strings.TrimLeft(line[q:], " ") + totalLen := runewidth.StringWidth(onlyLetters) + if totalLen == 0 { + // to avoid division by zero + totalLen = 1 + } + onlyLetters = 
notLetterRe.ReplaceAllLiteralString(onlyLetters, "") + p.proseRatio = 100 * runewidth.StringWidth(onlyLetters) / totalLen + + return p +} + +// Return true if a paragraph is a continuation of the current one. +func (p *paragraph) isContinuation( + next *paragraph, reflow bool, proseRatio int, +) bool { + switch { + case next.listItem: + // new list items always start a new paragraph + return false + case next.proseRatio < proseRatio || p.proseRatio < proseRatio: + // does not look like prose, maybe ascii art + return false + case next.quotes != p.quotes || next.leader != p.leader: + // quote level and/or list item leader have changed + return false + case len(strings.Trim(next.text, " ")) == 0: + // empty line + return false + case p.flowed: + // current paragraph has trailing space, indicating + // format=flowed + return true + case reflow: + // user forced paragraph reflow on the command line + return true + default: + return false + } +} + +// Join next paragraph into current one. +func (p *paragraph) join(next *paragraph) { + if p.text == "" { + p.text = next.text + } else { + p.text = p.text + " " + strings.Trim(next.text, " ") + } + p.proseRatio = (p.proseRatio + next.proseRatio) / 2 + p.flowed = next.flowed +} + +// Write a paragraph, wrapping at words boundaries. +// +// Only try to do word wrapping on things that look like prose. When the text +// contains too many non-letter characters, print it as-is. 
+func (p *paragraph) write(out io.Writer, margin int, proseRatio int) { + leader := "" + more := true + quotesWidth := runewidth.StringWidth(p.quotes) + for more { + var line string + width := quotesWidth + runewidth.StringWidth(leader) + remain := runewidth.StringWidth(p.text) + if width+remain <= margin || p.proseRatio < proseRatio { + // whole paragraph fits on a single line + line = p.text + p.text = "" + more = false + } else { + // find split point, preferably before margin + split := -1 + w := 0 + for i, r := range p.text { + w += runewidth.RuneWidth(r) + if width+w > margin && split != -1 { + break + } + if r == ' ' { + split = i + } + } + if split == -1 { + // no space found to split, print a long line + line = p.text + p.text = "" + more = false + } else { + line = p.text[:split] + // find start of next word + for split < len(p.text) && p.text[split] == ' ' { + split++ + } + if split < len(p.text) { + p.text = p.text[split:] + } else { + // only trailing whitespace, we're done + p.text = "" + more = false + } + } + } + fmt.Fprintf(out, "%s%s%s\n", p.quotes, leader, line) + leader = p.leader + } +} + +var ( + patchSubjectRe = regexp.MustCompile(`\bPATCH\b`) + listItemRe = regexp.MustCompile(`^\s*([\-\*\.]|\d{1,2}[\)\]\.])\s+`) + leadingSpaceRe = regexp.MustCompile(`^\s+`) + trailingSpaceRe = regexp.MustCompile(`\s+$`) + notLetterRe = regexp.MustCompile(`[^\pL]`) +) diff --git a/filters/wrap_test.go b/filters/wrap_test.go new file mode 100644 index 00000000..f8d82cbe --- /dev/null +++ b/filters/wrap_test.go @@ -0,0 +1,212 @@ +package main + +import ( + "bytes" + "errors" + "io" + "testing" +) + +type vector struct { + name string + in string + out string + width int + reflow bool + ratio int +} + +var vectors = []vector{ + { + name: "simple", + in: `long line that exceeds margin by many words +`, + width: 30, + reflow: false, + ratio: 50, + out: `long line that exceeds margin +by many words +`, + }, + { + name: "two-paragraphs", + in: `this is one long 
paragraph +this is another long one +`, + width: 20, + reflow: false, + ratio: 50, + out: `this is one long +paragraph +this is another +long one +`, + }, + { + name: "reflow", + in: `this is one long paragraph +this is another long one +`, + width: 20, + reflow: true, + ratio: 50, + out: `this is one long +paragraph this is +another long one +`, + }, + { + name: "quotes", + in: `Let's play with quotes: + +>> Hi there how are you doing? +> Great thanks + +How rude. + +>> Fantastic. Let's go wrap some words. +`, + width: 20, + reflow: false, + ratio: 50, + out: `Let's play with +quotes: + +>> Hi there how are +>> you doing? +> Great thanks + +How rude. + +>> Fantastic. Let's +>> go wrap some +>> words. +`, + }, + { + name: "ascii-art", + in: `This is a nice drawing, isn't it? + ++-------------------+ +| foobaz | ++-------------------+ + | + | ++-------------------+ +| foobar | ++-------------------+ +`, + width: 15, + ratio: 50, + reflow: true, + out: `This is a nice +drawing, isn't +it? 
+ ++-------------------+ +| foobaz | ++-------------------+ + | + | ++-------------------+ +| foobar | ++-------------------+ +`, + }, + { + name: "list-items", + in: `Shopping list: + + - milk + - chocolate + - cookies (please, with nuts) +`, + width: 20, + reflow: false, + ratio: 50, + out: `Shopping list: + + - milk + - chocolate + - cookies + (please, with + nuts) +`, + }, + { + name: "list-items-reflow", + in: `Shopping list: + + * milk + * chocolate + * cookies + (please, + with nuts) +`, + width: 100, + reflow: true, + ratio: 30, + out: `Shopping list: + + * milk + * chocolate + * cookies (please, with nuts) +`, + }, + { + name: "long-url", + in: `Please follow this ugly link: +http://foobaz.org/xapapzolmkdmldfk-fldskjflsk-cisjoij/onoes.jsp?xxx=2&yyy=3 +`, + width: 20, + reflow: true, + ratio: 50, + out: `Please follow this +ugly link: +http://foobaz.org/xapapzolmkdmldfk-fldskjflsk-cisjoij/onoes.jsp?xxx=2&yyy=3 +`, + }, + { + name: "format=flowed", + in: "Oh, \nI'm \nso \nhip \nI \nuse \nformat=flowed.\n", + width: 30, + reflow: false, + ratio: 50, + out: "Oh, I'm so hip I use\nformat=flowed.\n", + }, + { + name: "non-ascii", + in: `Lorem ççççç ççççç ççç ççççç çç ççç ççççç çççççççç ççç çç ççççç ççççççççççç ççççç + +Lorem жжжжж жжжжж жжж жжжжж жж жжж жжжжж жжжжжжжж жжж жж жжжжж жжжжжжжжжжж жжжжж жжжжжжжж +`, + width: 40, + reflow: false, + ratio: 50, + out: `Lorem ççççç ççççç ççç ççççç çç ççç +ççççç çççççççç ççç çç ççççç ççççççççççç +ççççç + +Lorem жжжжж жжжжж жжж жжжжж жж жжж +жжжжж жжжжжжжж жжж жж жжжжж жжжжжжжжжжж +жжжжж жжжжжжжж +`, + }, +} + +func TestWrap(t *testing.T) { + for _, vec := range vectors { + t.Run(vec.name, func(t *testing.T) { + r := bytes.NewReader([]byte(vec.in)) + var buf bytes.Buffer + err := wrap(r, &buf, vec.width, vec.reflow, vec.ratio) + if err != nil && !errors.Is(err, io.EOF) { + t.Fatalf("[%s]: %v", vec.name, err) + } + if buf.String() != vec.out { + t.Errorf("[%s] invalid format:\n%q\nexpected\n%q", + vec.name, buf.String(), 
vec.out) + } + }) + } +} -- cgit