diff options
Diffstat (limited to 'migratetags/matchers/gntmatcher.h')
-rw-r--r-- | migratetags/matchers/gntmatcher.h | 16 |
1 files changed, 15 insertions, 1 deletions
diff --git a/migratetags/matchers/gntmatcher.h b/migratetags/matchers/gntmatcher.h
index 8c8f3e4..d2fcbd8 100644
--- a/migratetags/matchers/gntmatcher.h
+++ b/migratetags/matchers/gntmatcher.h
@@ -104,12 +104,26 @@ virtual void matchWords(vector<int> &targetWordTags, const vector<SWBuf> &target
 		}
 	}
 
+const char *ignoreSeries = "[]\nʼ‾̷‾";
 virtual SWBuf sanitizeWord(const SWBuf &word) {
 	SWBuf t1 = word;
 	// remove greek accents
 	sanitizeGreekAccentFilter.processText(t1);
 	t1.toUpper();
-	t1.replaceBytes("[]", 0);
+
+	// remove ignoreSeries characters
+	SWBuf o = t1;
+	const unsigned char* from = (unsigned char*)o.c_str();
+	t1 = "";
+	while (*from) {
+		SW_u32 ch = getUniCharFromUTF8(&from, true);
+		// if ch is bad, then convert to replacement char
+		if (!ch) ch = 0xFFFD;
+		SWBuf checkChar;
+		getUTF8FromUniChar(ch, &checkChar);
+		if (checkChar != " " && strstr(ignoreSeries, checkChar.c_str())) continue;
+		t1.append(checkChar);
+	}
 	return t1;
 }
 