Corpus Cleaner
normalizer.cpp
Go to the documentation of this file.
1#include "normalizer.hpp"
2#include "util.hpp"
3// #include "util.cpp"
4
5#include <unicode/datefmt.h>
6#include <unicode/dtfmtsym.h>
7#include <unicode/gregocal.h>
8#include <unicode/timezone.h>
9#include <unicode/unistr.h>
10#include <unicode/ustring.h>
11#include <unicode/dtptngen.h>
12#include <unicode/dtitvfmt.h>
13#include <unicode/normalizer2.h>
14#include <unicode/unistr.h>
15
16
17using namespace std;
18
19/**
20 * @brief nfkc normalize sentence by icu::Normalizer2
21 * @details
22 * Search for words that match the word_pattern regular expression in the sentence
23 * and perform NFKC normalization using icu::Normalizer2.
24 *
25 * Example:
26 * ```cpp
27 * wstring sentence = L"0123456789";
28 * static wregex word_pattern(L"(([0-9]+))");
29 * wstring normalized_sentence = UnicodeNormalize(word_pattern, sentence)
30 * // normalized_sentence == L"0123456789"
31 * ```
32 * @param wregex word_pattern: Regular expression for string to be normalized
33 * @param wstring: sentence
34 * @return wstring: normalized sentence
35 * @ref https://ja.wikipedia.org/wiki/Unicode%E4%B8%80%E8%A6%A7_0000-0FFF
36 * @note
37**/
38wstring UnicodeNormalize(wregex word_pattern,wstring sentence_w)
39{
40 static wregex hyphen_pattern(L"-");
41
42 //object for stock the part of matchinng string
43 wsmatch matches;
44 while (regex_search(sentence_w, matches, word_pattern)) {
45 // cout <<"matches.str():"<<ConvertWstringToUTF8(matches.str())<<endl;
46 //caution: must initialization of errc
47 UErrorCode errc = U_ZERO_ERROR;
48 errc = U_ZERO_ERROR;
49
50 // generate NFKC normalizer instance
51 const icu::Normalizer2* normalizer = icu::Normalizer2::getNFKCInstance(errc);
52
53 // convert matching part of sentence to UnicodeString
54 icu::UnicodeString match(ConvertWstringToUTF8(matches.str()).c_str(), "UTF-8");
55 // Normalize the matching part of sentence
56 icu::UnicodeString match_morph;
57 normalizer->normalize(match,match_morph,errc);
58
59 // convert normalized sentence to string
60 string normalizedMatch_temp;
61 match_morph.toUTF8String(normalizedMatch_temp);
62 // cout <<"normalizedMatch_temp:"<<normalizedMatch_temp<<endl;
63
64 wstring normalizedMatch = ConvertUTF8ToWstring(normalizedMatch_temp);
65 // replace original text to normalized text
66 sentence_w.replace(matches.position(), matches.length(), normalizedMatch);
67 }
68
69 sentence_w = regex_replace(sentence_w,hyphen_pattern,L"-");
70 return sentence_w;
71}
72
73
74
75
/**
 * @brief Replace specific half-width (ASCII) symbols with full-width ones
 * @details
 *  Each of the following ASCII symbols is replaced by its full-width
 *  counterpart; every other character is copied unchanged:
 *      ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ ] ^ _ ` { | } ~
 *  Three symbols do not map to the "Fullwidth Forms" block:
 *      "  -> U+201D RIGHT DOUBLE QUOTATION MARK
 *      '  -> U+2019 RIGHT SINGLE QUOTATION MARK
 *      ~  -> U+301C WAVE DASH
 *
 *  Example:
 * ```cpp
 *     std::wstring sentence = L"()";
 *     sentence = TranslateToFullwidth(sentence); // L"\uFF08\uFF09" (full-width parentheses)
 * ```
 * @param sentence_w: text sentence
 * @return wstring: sentence with the listed symbols converted to full-width
 * @note
 *  Code points are written as \u escapes so the table does not depend on the
 *  encoding of this source file.
**/
std::wstring TranslateToFullwidth(const std::wstring& sentence_w)
{
    // Built once; the table never changes between calls.
    static const std::unordered_map<wchar_t, wchar_t> conversion_map = {
        {L'!', L'\uFF01'}, {L'"', L'\u201D'}, {L'#', L'\uFF03'}, {L'$', L'\uFF04'}, {L'%', L'\uFF05'},
        {L'&', L'\uFF06'}, {L'\'', L'\u2019'}, {L'(', L'\uFF08'}, {L')', L'\uFF09'}, {L'*', L'\uFF0A'},
        {L'+', L'\uFF0B'}, {L',', L'\uFF0C'}, {L'-', L'\uFF0D'}, {L'.', L'\uFF0E'}, {L'/', L'\uFF0F'},
        {L':', L'\uFF1A'}, {L';', L'\uFF1B'}, {L'<', L'\uFF1C'}, {L'=', L'\uFF1D'}, {L'>', L'\uFF1E'},
        {L'?', L'\uFF1F'}, {L'@', L'\uFF20'}, {L'[', L'\uFF3B'}, {L']', L'\uFF3D'}, {L'^', L'\uFF3E'},
        {L'_', L'\uFF3F'}, {L'`', L'\uFF40'}, {L'{', L'\uFF5B'}, {L'|', L'\uFF5C'}, {L'}', L'\uFF5D'},
        {L'~', L'\u301C'}
    };

    std::wstring output;
    output.reserve(sentence_w.size());
    for (const wchar_t ch : sentence_w) {
        // Single lookup: use the mapped symbol if there is one, else keep ch.
        const auto found = conversion_map.find(ch);
        output += (found != conversion_map.end()) ? found->second : ch;
    }

    return output;
}
114
/**
 * @brief Remove half-width spaces that meet the conditions
 * @details
 *  First, every run of half-width spaces and/or ideographic spaces (U+3000)
 *  is replaced by one half-width space.
 *  Then a single half-width space is removed when it sits
 *  - between two characters of the "Japanese" blocks below, or
 *  - between a character of the "Japanese" blocks and a Basic Latin
 *    character (U+0000-U+007F), in either order.
 *
 *  "Japanese" blocks: CJK Unified Ideographs (U+4E00-U+9FFF),
 *  Hiragana (U+3040-U+309F), Katakana (U+30A0-U+30FF),
 *  CJK Symbols and Punctuation (U+3000-U+303F) and
 *  Halfwidth and Fullwidth Forms (U+FF00-U+FFEF, which contains the
 *  half-width katakana and the full-width symbols).
 *
 *  Example:
 * ```cpp
 *     RemoveExtraSpaces(L"お お");               // L"おお"
 *     RemoveExtraSpaces(L"アルゴリズム C");      // L"アルゴリズムC"
 *     RemoveExtraSpaces(L"Coding   the Matrix"); // L"Coding the Matrix"
 * ```
 * @param sentence: text sentence
 * @return wstring: sentence has been processed
 * @note
 *  Spaces between two Basic Latin characters are only collapsed, never removed.
**/
std::wstring RemoveExtraSpaces(const std::wstring& sentence)
{
    // Character-class bodies. They are raw strings, so the \uXXXX escapes are
    // passed through to the regex engine instead of being expanded here.
    static const std::wstring blocks = LR"(\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\u3000-\u303F\uFF00-\uFFEF)";
    static const std::wstring basicLatin = LR"(\u0000-\u007F)";

    // Compiled once: building a std::wregex is expensive and the patterns
    // never change between calls.
    static const std::wregex space_run(L"[ \u3000]+"); // half-width or ideographic space
    static const std::wregex between_blocks(L"([" + blocks + L"]) ([" + blocks + L"])");
    static const std::wregex blocks_then_latin(L"([" + blocks + L"]) ([" + basicLatin + L"])");
    static const std::wregex latin_then_blocks(L"([" + basicLatin + L"]) ([" + blocks + L"])");

    // One regex_replace pass cannot handle overlapping hits such as "A B C"
    // (B is consumed by the first match), so repeat until nothing matches.
    const auto removeSpaceBetween = [](const std::wregex& pattern, std::wstring str) {
        while (std::regex_search(str, pattern)) {
            str = std::regex_replace(str, pattern, L"$1$2");
        }
        return str;
    };

    std::wstring result = std::regex_replace(sentence, space_run, L" ");

    result = removeSpaceBetween(between_blocks, result);
    result = removeSpaceBetween(blocks_then_latin, result);
    result = removeSpaceBetween(latin_then_blocks, result);

    return result;
}
156
157/**
158 * @brief Neologd Normalized function
159 * @details
160 * Perform the normalization process described in the link below.
161 * https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
162 *
163 * Example:
164 * ```cpp
165 * string sentence= "検索 エンジン 自作 入門 を 買い ました!!!";
166 * sentence = NormalizeNeologd(sentence); //"検索エンジン自作入門を買いました"
167 * ```
168 * @param const string& sentence: text sentence
169 * @return wstring: sentence has been processed
170 * @attention
171 * This process is for Japanese text. Do not use English text or code in your corpus.
172 * For example, in English text, spaces between words will be removed.
173**/
174string NormalizeNeologd(string sentence)
175{
176 static wregex word_pattern(L"(([0-9A-Za-z。-゚]+))");
177
178 //Remove leadingand trailing white space
179 sentence = Strip(sentence);
180
181 // Normalize full-width alphanumeric characters
182 wstring sentence_w = ConvertUTF8ToWstring(sentence);
183 sentence_w = UnicodeNormalize(word_pattern,sentence_w);
184
185 //Normalize hyphens
186 static wregex small_hyhpen_pattern(L"([˗֊‐‑‒–⁃⁻₋−]+)");
187 sentence_w = regex_replace(sentence_w,small_hyhpen_pattern,L"-");
188 static wregex large_hyhpen_pattern(L"([﹣-—―─━ーー]+)");
189 sentence_w = regex_replace(sentence_w,large_hyhpen_pattern,L"ー");
190
191 // Remove tilde words
192 static wregex tilde_pattern(L"[~∼∾〜〰~]");
193 sentence_w = regex_replace(sentence_w,tilde_pattern,L"");
194
195 sentence_w =TranslateToFullwidth(sentence_w);
196 sentence_w = RemoveExtraSpaces(sentence_w);
197
198 static wregex special_word_pattern(L"(([!#$%&()*+,-./:;<>?@[¥]^_`{|}〜]+))");
199
200 sentence_w = UnicodeNormalize(special_word_pattern,sentence_w);
201 static wregex quotation_pattern(L"[’]");
202 sentence_w = regex_replace(sentence_w,quotation_pattern,L"\'");
203 static wregex wquotation_pattern(L"[”]");
204 sentence_w = regex_replace(sentence_w,wquotation_pattern,L"\"");
205 static wregex equal_pattern(L"[=]");
206 sentence_w = regex_replace(sentence_w,equal_pattern,L"=");
207
208 sentence = ConvertWstringToUTF8(sentence_w);
209 sentence = Strip(sentence);
210 return sentence;
211}
212
213
214
215/*
216int main(void)
217{
218 string input_path = "../data/wiki_test.txt";
219 string output_path = "../results/wiki_test_NormalizeNeologd.txt";
220
221 // Normalizer(input_path,output_path);
222
223 //original
224 assert("Hello,C++!" == NormalizeNeologd(" Hello, C++! "));// TODO: Document that this normalizer is not intended for English text, because spaces next to punctuation are removed.
225 assert("-" == NormalizeNeologd("˗֊‐‑‒–⁃⁻₋−"));
226 assert("-" == NormalizeNeologd("-"));
227 assert("ー" == NormalizeNeologd("﹣—―─━ーー"));
228 assert("=" == NormalizeNeologd("="));
229
230 assert("0123456789" == NormalizeNeologd("0123456789"));
231 assert("ABCDEFGHIJKLMNOPQRSTUVWXYZ" == NormalizeNeologd("ABCDEFGHIJKLMNOPQRSTUVWXYZ"));
232 assert("abcdefghijklmnopqrstuvwxyz" == NormalizeNeologd("abcdefghijklmnopqrstuvwxyz"));
233 assert("!\"#$%&\'()*+,-./:;<>?@[¥]^_`{|}" == NormalizeNeologd("!”#$%&’()*+,-./:;<>?@[¥]^_`{|}"));
234 assert("=。、・「」" == NormalizeNeologd("=。、・「」"));
235 assert("ハンカク" == NormalizeNeologd("ハンカク"));
236 assert("o-o" == NormalizeNeologd("o₋o"));
237 assert("majikaー" == NormalizeNeologd("majika━"));
238 assert("わい" == NormalizeNeologd("わ〰い"));
239 assert("スーパー" == NormalizeNeologd("スーパーーーー"));
240 assert("!#" == NormalizeNeologd("!#"));
241 assert("ゼンカクスペース" == NormalizeNeologd("ゼンカク スペース"));
242 assert("おお" == NormalizeNeologd("お お"));
243 assert("おお" == NormalizeNeologd(" おお"));
244 assert("おお" == NormalizeNeologd("おお "));
245 assert("検索エンジン自作入門を買いました!!!" ==NormalizeNeologd("検索 エンジン 自作 入門 を 買い ました!!!"));
246 assert("アルゴリズムC" == NormalizeNeologd("アルゴリズム C"));
247 assert("PRML副読本" == NormalizeNeologd("   PRML  副 読 本   "));
248 assert("Coding the Matrix" == NormalizeNeologd("Coding the Matrix"));
249 assert("南アルプスの天然水Sparking Lemonレモン一絞り" == NormalizeNeologd("南アルプスの 天然水 Sparking Lemon レモン一絞り"));
250 assert("南アルプスの天然水-Sparking*Lemon+レモン一絞り" == NormalizeNeologd("南アルプスの 天然水- Sparking* Lemon+ レモン一絞り"));
251 cout << "Normalizing Text is completed." << endl;
252 return 0;
253}
254*/
string NormalizeNeologd(string sentence)
Neologd Normalized function.
wstring TranslateToFullwidth(const wstring &sentence_w)
Replace a specific string from half-width to full-width.
wstring RemoveExtraSpaces(const wstring &sentence)
remove half-width spaces that meet the conditions
wstring UnicodeNormalize(wregex word_pattern, wstring sentence_w)
nfkc normalize sentence by icu::Normalizer2
string ConvertWstringToUTF8(const wstring &sentence)
Convert Wstring to string.
Definition util.cpp:286
wstring ConvertUTF8ToWstring(const string &sentence)
Convert string to Wstring.
Definition util.cpp:270
string Strip(const string &sentence)
Remove leading and trailing white space.
Definition util.cpp:391