From 7c5a1afbda60ec81e84161306c2cf71d974b341a Mon Sep 17 00:00:00 2001
From: Robin Jarry
Date: Mon, 24 Jun 2024 22:53:53 +0200
Subject: wrap: fix cjk prose ratio on macos

Depending on the locale and the libc implementation, iswalpha() may
return true or false for CJK symbols.

Reuse the same logic as in the split point detection introduced in
commit 99bc69918ea7 ("wrap: fix wide CJK characters support").

Include all missing Korean and Japanese specific Unicode blocks. Handle
syllabic symbols with a parameter to avoid wrapping in the middle of
syllables.

Signed-off-by: Robin Jarry
Tested-by: Gregory Anders
---
 filters/wrap.c | 61 +++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 15 deletions(-)

(limited to 'filters')

diff --git a/filters/wrap.c b/filters/wrap.c
index c3109747..1a8d5810 100644
--- a/filters/wrap.c
+++ b/filters/wrap.c
@@ -209,6 +209,49 @@ static size_t list_item_offset(const wchar_t *buf)
 	return i;
 }
 
+static bool is_cjk(wchar_t c, bool include_syllables) {
+	/* CJK Radicals Supplement */
+	if (c >= 0x2e80 && c <= 0x2fd5)
+		return true;
+	/* CJK Compatibility */
+	if (c >= 0x3300 && c <= 0x33ff)
+		return true;
+	/* CJK Unified Ideographs Extension A */
+	if (c >= 0x3400 && c <= 0x4db5)
+		return true;
+	/* CJK Unified Ideographs */
+	if (c >= 0x4e00 && c <= 0x9fcb)
+		return true;
+	/* CJK Compatibility Ideographs */
+	if (c >= 0xf900 && c <= 0xfa6a)
+		return true;
+	/* Hangul Jamo */
+	if (c >= 0x1100 && c <= 0x11ff)
+		return true;
+	/* Hangul Compatibility Jamo */
+	if (c >= 0x3130 && c <= 0x318f)
+		return true;
+	/* Hangul Jamo Extended-A */
+	if (c >= 0xa960 && c <= 0xa97f)
+		return true;
+	/* Hangul Jamo Extended-B */
+	if (c >= 0xd7b0 && c <= 0xd7ff)
+		return true;
+
+	if (include_syllables) {
+		/* Japanese Hiragana */
+		if (c >= 0x3040 && c <= 0x309f)
+			return true;
+		/* Japanese Katakana */
+		if (c >= 0x30a0 && c <= 0x30ff)
+			return true;
+		/* Hangul Syllables */
+		if (c >= 0xac00 && c <= 0xd7af)
+			return true;
+	}
+	return false;
+}
+
 static struct paragraph *parse_line(const wchar_t *buf)
 {
 	size_t i, q, t, e, letters, indent_len, text_len;
@@ -251,7 +294,8 @@ static struct paragraph *parse_line(const wchar_t *buf)
 	e = t;
 	letters = 0;
 	while (buf[e] != L'\0') {
-		if (iswalpha((wint_t)buf[e++])) {
+		wchar_t c = buf[e++];
+		if (iswalpha((wint_t)c) || is_cjk(c, true)) {
 			letters++;
 		}
 	}
@@ -351,20 +395,7 @@ static bool is_split_point(const wchar_t c)
 	if (iswspace((wint_t)c))
 		return true;
 
-	/* CJK Radicals Supplement */
-	if (c >= 0x2e80 && c <= 0x2fd5)
-		return true;
-	/* CJK Compatibility */
-	if (c >= 0x3300 && c <= 0x33ff)
-		return true;
-	/* CJK Unified Ideographs Extension A */
-	if (c >= 0x3400 && c <= 0x4db5)
-		return true;
-	/* CJK Unified Ideographs */
-	if (c >= 0x4e00 && c <= 0x9fcb)
-		return true;
-	/* CJK Compatibility Ideographs */
-	if (c >= 0xf900 && c <= 0xfa6a)
+	if (is_cjk(c, false))
 		return true;
 
 	return false;
-- 
cgit