5#include <unicode/datefmt.h>
6#include <unicode/dtfmtsym.h>
7#include <unicode/gregocal.h>
8#include <unicode/timezone.h>
9#include <unicode/unistr.h>
10#include <unicode/ustring.h>
11#include <unicode/dtptngen.h>
12#include <unicode/dtitvfmt.h>
13#include <unicode/normalizer2.h>
14#include <unicode/unistr.h>
40 static wregex hyphen_pattern(L
"-");
44 while (regex_search(sentence_w, matches, word_pattern)) {
47 UErrorCode errc = U_ZERO_ERROR;
51 const icu::Normalizer2* normalizer = icu::Normalizer2::getNFKCInstance(errc);
56 icu::UnicodeString match_morph;
57 normalizer->normalize(match,match_morph,errc);
60 string normalizedMatch_temp;
61 match_morph.toUTF8String(normalizedMatch_temp);
66 sentence_w.replace(matches.position(), matches.length(), normalizedMatch);
69 sentence_w = regex_replace(sentence_w,hyphen_pattern,L
"-");
93 unordered_map<wchar_t, wchar_t> conversion_map = {
94 {u
'!', u
'!'}, {u
'"', u
'”'}, {u
'#', u
'#'}, {u
'$', u
'$'}, {u
'%', u
'%'},
95 {u
'&', u
'&'}, {u
'\'',u
'’'}, {u
'(', u
'('}, {u
')', u
')'}, {u
'*', u
'*'},
96 {u
'+', u
'+'}, {u
',', u
','}, {u
'-', u
'-'}, {u
'.', u
'.'}, {u
'/', u
'/'},
97 {u
':', u
':'}, {u
';', u
';'}, {u
'<', u
'<'}, {u
'=', u
'='}, {u
'>', u
'>'},
98 {u
'?', u
'?'}, {u
'@', u
'@'}, {u
'[', u
'['}, {u
']', u
']'}, {u
'^', u
'^'},
99 {u
'_', u
'_'}, {u
'`', u
'`'}, {u
'{', u
'{'}, {u
'|', u
'|'}, {u
'}', u
'}'},
104 for (
wchar_t word : sentence_w) {
105 if (conversion_map.find(word) != conversion_map.end()) {
106 output += conversion_map[word];
137 wstring result = regex_replace(sentence, wregex(L
"[ ]+"), L
" ");
139 wstring blocks = LR
"(\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\u3000-\u303F\uFF00-\uFFEF)";
140 wstring basicLatin = LR"(\u0000-\u007F)";
142 auto removeSpaceBetween = [](
const wstring& cls1,
const wstring& cls2, wstring str) {
143 wregex pattern(L
"([" + cls1 + L
"]) ([" + cls2 + L
"])");
144 while (regex_search(str, pattern)) {
145 str = regex_replace(str, pattern, L
"$1$2");
150 result = removeSpaceBetween(blocks, blocks, result);
151 result = removeSpaceBetween(blocks, basicLatin, result);
152 result = removeSpaceBetween(basicLatin, blocks, result);
176 static wregex word_pattern(L
"(([0-9A-Za-z。-゚]+))");
179 sentence =
Strip(sentence);
186 static wregex small_hyhpen_pattern(L
"([˗֊‐‑‒–⁃⁻₋−]+)");
187 sentence_w = regex_replace(sentence_w,small_hyhpen_pattern,L
"-");
188 static wregex large_hyhpen_pattern(L
"([﹣-—―─━ーー]+)");
189 sentence_w = regex_replace(sentence_w,large_hyhpen_pattern,L
"ー");
192 static wregex tilde_pattern(L
"[~∼∾〜〰~]");
193 sentence_w = regex_replace(sentence_w,tilde_pattern,L
"");
198 static wregex special_word_pattern(L
"(([!#$%&()*+,-./:;<>?@[¥]^_`{|}〜]+))");
201 static wregex quotation_pattern(L
"[’]");
202 sentence_w = regex_replace(sentence_w,quotation_pattern,L
"\'");
203 static wregex wquotation_pattern(L
"[”]");
204 sentence_w = regex_replace(sentence_w,wquotation_pattern,L
"\"");
205 static wregex equal_pattern(L
"[=]");
206 sentence_w = regex_replace(sentence_w,equal_pattern,L
"=");
209 sentence =
Strip(sentence);
string NormalizeNeologd(string sentence)
Normalize a sentence according to the NEologd normalization rules.
wstring TranslateToFullwidth(const wstring &sentence_w)
Replace specific half-width characters with their full-width equivalents.
wstring RemoveExtraSpaces(const wstring &sentence)
Remove half-width spaces that meet the conditions.
wstring UnicodeNormalize(wregex word_pattern, wstring sentence_w)
NFKC-normalize the sentence using icu::Normalizer2.
string ConvertWstringToUTF8(const wstring &sentence)
Convert a wstring to a UTF-8 encoded string.
wstring ConvertUTF8ToWstring(const string &sentence)
Convert a UTF-8 encoded string to a wstring.
string Strip(const string &sentence)
Remove leading and trailing whitespace.