Corpus Cleaner
perplexity_filter.cc
Go to the documentation of this file.
2
3
4using namespace std;
5// using namespace ;
// KenLM n-gram model constructed at namespace scope from the hard-coded path
// "ja.arpa.bin". The KenLMFilter scoring methods in this file read it directly.
6lm::ngram::Model model("ja.arpa.bin"); // TODO: Refactor. Don't use a global object; make the model (and its path) owned by KenLMFilter.
7
8
9
/**
 * @brief Load the SentencePiece tokenizer model.
 * @details
 * Loads "ja.sp.model" into `processor`. When loading fails, the status
 * message is written to std::cerr and execution continues; no error is
 * propagated to the caller.
 *
 * @ref
 * https://github.com/google/sentencepiece/blob/master/doc/api.md
 * @note The model path is hard-coded.
**/
28{
29 const auto status = processor.Load("ja.sp.model");
30 if (!status.ok()) {
31 std::cerr << status.ToString() << std::endl;
32 // error
33 }
34}
35
36
37/**
38 * @brief Score sentence by KenLM.
39 * @details
40 * The step is...
41 * 1. Split sentence into single characters.
42 * 2.
43 *
44 * Example:
45 * ```cpp
46 * wstring sentence = L"吾輩は猫である.名前はまだない.";
47 * cout << KenLMScore(sentence) << endl;
48 * // -60.5849
49 * ```
50 * @param const wstring &sentence: text sentence
51 * @return double: score by KenLM
52 * @ref
53 * https://github.com/google/sentencepiece/blob/master/doc/api.md
54 * https://github.com/google/sentencepiece
55 * @note
56**/
57double KenLMFilter::Score(const wstring sentence)
58{
59 double total_score=0,score=0;
60
61 string word_w ="";
62
63 lm::ngram::State state(model.BeginSentenceState()), out_state;
64 const lm::ngram::Vocabulary &vocab = model.GetVocabulary();
65 for (int i=0;i<(int)sentence.size();i++) {
66 // Split sentence into single characters.
67 wstring word_w=sentence.substr(i,1);
68 string word=ConvertWstringToUTF8(word_w);
69 // cout << word << endl;
70 score=model.BaseScore(&state, vocab.Index(word), &out_state);
71 // cout << score << endl;
72 total_score += score;
73 state = out_state;
74 }
75 //eos
76 score=model.BaseScore(&state, vocab.EndSentence(), &out_state);
77 total_score += score;
78
79 return total_score;
80}
81
82/**
83 * @brief Score sentence by KenLM with SentencePiece Tokenizing.
84 * @details
85 * The step is...
86 * 1. Split sentence into single characters.
87 * 2.
88 *
89 * Example:
90 * ```cpp
91 * wstring sentence = L"吾輩は猫である.名前はまだない.";
92 * cout << KenLMScore(sentence) << endl;
93 * //
94 * ```
95 * @param const wstring &sentence: text sentence
96 * @return double: score by KenLM
97 * @ref
98 * https://github.com/google/sentencepiece/blob/master/doc/api.md
99 * https://github.com/google/sentencepiece
100 * @note
101**/
102double KenLMFilter::ScoreWithSentencePiece(const wstring sentence)
103{
104
105 double total_score=0,score=0;
106
107 string word_w ="";
108 vector<string> pieces;
109 processor.Encode(ConvertWstringToUTF8(sentence), &pieces);
110
111 lm::ngram::State state(model.BeginSentenceState()), out_state;
112 const lm::ngram::Vocabulary &vocab = model.GetVocabulary();
113 for (auto piece:pieces) {
114 //string word = pieces[i];
115 // cout << piece << endl;
116 score=model.BaseScore(&state, vocab.Index(piece), &out_state);
117 // cout << score << endl;
118 total_score += score;
119 state = out_state;
120 }
121 //eos
122 score=model.BaseScore(&state, vocab.EndSentence(), &out_state);
123 total_score += score;
124
125 return total_score;
126}
127
128/**
129 * @brief Perplexity sentence by KenLM.
130 * @details
131 * The step is...
132 * 1. Split sentence into single characters.
133 * 2.
134 *
135 * Example:
136 * wstring sentence = L"吾輩は猫である.名前はまだない.";
137 * cout << KenLMPerplexity(sentence) << endl;
138 * // 4117.1
139 * @param const string& src: text sentence
140 * @return double: score by KenLM
141 * @ref
142 * https://github.com/kpu/kenlm/blob/master/python/kenlm.pyx#L209
143 * https://zenn.dev/syoyo/articles/529ce949121ca4
144 * https://github.com/facebookresearch/cc_net
145 * @note
146**/
147double KenLMFilter::Perplexity(const wstring sentence)
148{
149 //words = len(as_str(sentence).split()) + 1 // For </s>
150 double words = (double)(sentence.size()+1);
151
152 //cout << "sentence.size:"<<words<<endl;
153 return pow(10.0,(-this->Score(sentence) / words));
154}
155
156/**
157 * @brief Perplexity sentence by KenLM with SentencePiece Tokenizing.
158 * @details
159 * The step is...
160 * 1. Split sentence into token by SentencePiece.
161 * 2. Calculate perplexity value.
162 *
163 * The usage is following.
164 *
165 * wstring sentence = L"吾輩は猫である.名前はまだない.";
166 * cout << PerplexityWithSentencePiece(sentence) << endl;
167 * // 677.5
168 * @param const string& src: text sentence
169 * @return double: score by KenLM
170 * @ref
171 * https://github.com/kpu/kenlm/blob/master/python/kenlm.pyx#L209
172 * https://zenn.dev/syoyo/articles/529ce949121ca4
173 * https://github.com/facebookresearch/cc_net
174 * @note
175**/
176double KenLMFilter::PerplexityWithSentencePiece(const wstring sentence)
177{
178 vector<string> pieces;
179 processor.Encode(ConvertWstringToUTF8(sentence), &pieces);
180 string sentence_tokenized = "";
181 //for (auto piece:pieces) sentence_tokenized += piece+" ";
182 //sentence_tokenized.pop_back();
183 //cout << sentence_tokenized << endl;
184
185 double words = (double)pieces.size()+1;
186 //cout << "sentence.size:"<<words<<endl;
187 return pow(10.0,(-this->ScoreWithSentencePiece(sentence) / words));
188}
189
190/*
191int main() {
192 vector<wstring> sentence_list;
193 sentence_list.push_back(L"東京はッ晴れ");
194 sentence_list.push_back(L"東京は元気です");
195 sentence_list.push_back(L"吾輩は猫である。名前はまだない。");
196 sentence_list.push_back(L"東京は晴れ");
197 sentence_list.push_back(L"東京 大阪 名古屋 秋田 千葉");
198 sentence_list.push_back(L"あああああああ");
199 chrono::system_clock::time_point start, end;
200 start = chrono::system_clock::now();
201
202 #pragma omp parallel
203 {
204 #pragma omp for nowait ordered
205 for (wstring sentence:sentence_list) {
206 #pragma omp ordered
207 {
208 // cout << ConvertWstringToUTF8(sentence) <<endl;
209 // cout << ConvertWstringToUTF8(sentence) << KenLMScore(sentence) <<endl;
210
211 cout << ConvertWstringToUTF8(sentence) <<" perplexity:"<<KenLMPerplexity(sentence)<<endl;}
212 }
213 }
214
215 end = chrono::system_clock::now();
216 double elapsed = chrono::duration_cast<chrono::duration<double>>(end - start).count();
217 cout << "passed time[s]"<< elapsed <<endl;
218
219 return 0;
220}
221*/
KenLMFilter()
Score sentence by KenLM.
double PerplexityWithSentencePiece(const wstring sentence)
Perplexity sentence by KenLM with SentencePiece Tokenizing.
double ScoreWithSentencePiece(const wstring sentence)
Score sentence by KenLM with SentencePiece Tokenizing.
sentencepiece::SentencePieceProcessor processor
double Score(const wstring sentence)
Score sentence by KenLM.
double Perplexity(const wstring sentence)
Perplexity sentence by KenLM.
lm::ngram::Model model("ja.arpa.bin")
string ConvertWstringToUTF8(const wstring &sentence)
Convert Wstring to string.
Definition util.cpp:286