Corpus Cleaner
normalizer.hpp
Go to the documentation of this file.
1#include <bits/stdc++.h>
2
3using namespace std;
4namespace fs = filesystem;
5
6wstring UnicodeNormalize(wregex word_pattern,wstring sentence_w);
7wstring TranslateToFullwidth(const wstring& sentence_w);
8wstring RemoveExtraSpaces(const wstring& sentence);
9string NormalizeNeologd(string sentence);
10int Normalizer(string input_path,string output_path);
string NormalizeNeologd(string sentence)
Normalizes a sentence following Neologd conventions.
int Normalizer(string input_path, string output_path)
wstring TranslateToFullwidth(const wstring &sentence_w)
Converts specific strings from half-width to full-width.
wstring RemoveExtraSpaces(const wstring &sentence)
Removes half-width spaces that meet the removal conditions.
wstring UnicodeNormalize(wregex word_pattern, wstring sentence_w)
Applies NFKC normalization to the sentence using icu::Normalizer2.