Corpus Cleaner
corpus_cleaner.cpp
Go to the documentation of this file.
1#include "corpus_cleaner.hpp"
2#include "util.hpp"
3#include "normalizer.hpp"
4#include "simdjson.h"
5using namespace simdjson;
6
/**
 * @brief Read one Document from a single JSONL line.
 * @details
 * Example:
 * ```cpp
 * string input_file_path = "./input.jsonl";
 * ifstream ifs(input_file_path);
 * Document document;
 * string line = "";
 * while(getline(ifs,line)){
 *   ReadDocumentFromJsonlOneLine(document,line);
 *   // Write process for document.
 * }
 * ```
 * @param Document &document: document to be filled from the JSONL line
 * @param string input_jsonl_line: one line of a .jsonl file
 * @return void: None
 * @note
**/
27 string input_jsonl_line)
28{
29 try{
30 simdjson::ondemand::parser parser;
31 simdjson::ondemand::document jsonl_line = parser.iterate(input_jsonl_line);
32 string_view line_view;
33
34 jsonl_line["text"].get(line_view);
35 document.text = string(line_view);
36
37 jsonl_line["id"].get(line_view);
38 document.id = string(line_view);
39
40 jsonl_line["is_rejected"].get(line_view);
41 document.is_rejected = stoi(string(line_view));
42
43 //split metadata to ,
44 jsonl_line["metadata"].get(line_view);
45 string tmp = string(line_view);
46 stringstream ss(tmp);
47 string token;
48 while(getline(ss, token, ',')){
49 document.metadata.insert(token);
50 }
51
52 jsonl_line["language"].get(line_view);
53 document.language=string(line_view);
54
55 jsonl_line["language_score"].get(line_view);
56 document.language_score=stod(string(line_view));
57
58 jsonl_line["perplexity"].get(line_view);
59 document.perplexity=stod(string(line_view));
60 }
61 catch(...){
62 throw;
63 }
64}
65
/**
 * @brief Append a Document to output_file_path as one JSONL line.
 * @details
 * @param Document &document: document to be written (its text is escaped in place)
 * @param string output_file_path: path of the output .jsonl file
 * @return void: None
 * @note
**/
75 string output_file_path)
76{
77 document.text = EscapeWord(document.text);
78 ofstream output_file(output_file_path, ios::app);
79
80 try{
81 output_file << "{" ;
82 output_file << "\"text\":\"" <<document.text << "\",";
83 output_file << "\"id\":\"" << document.id << "\",";
84 output_file << "\"is_rejected\":\"" << document.is_rejected << "\",";
85 output_file << "\"metadata\":\"";
86 for (auto iter = document.metadata.begin(); iter != document.metadata.end(); ++iter) {
87 output_file << *iter << ",";
88 }
89 output_file << "\",";
90 output_file << "\"language\":\"" <<document.language << "\",";
91 output_file << "\"language_score\":\"" <<document.language_score << "\",";
92 output_file << "\"perplexity\":\"" <<document.perplexity << "\"";
93 output_file <<"}"<< endl;
94 }
95 catch(...){
96 output_file.close();
97 throw;
98 }
99 output_file.close();
100}
101
102
103/**
104 * @brief Convert input files to jsonl that has Document's element.
105 * @details
106 * @param string sentence: sentence
107 * @param string filename: filename without file extention
108 * @param string file_line_count: the line number of sentence in "filename"
109 * @param Document document: document converted
110 * @return void: None
111 * @note
112**/
113void ConvertTextToDocument(string sentence,
114 string filename,
115 string file_line_count,
116 Document &document)
117{
118 document.text = sentence;
119 document.id = filename+"_"+file_line_count;
120 document.is_rejected = false;
121 // document.metadata;
122 document.language="";
123 document.language_score=-1;
124 document.perplexity=-1;
125}
126
/**
 * @brief Convert input files (.txt) to .jsonl files carrying Document's elements.
 * @details
 * @param string input_folder_path: folder containing the input .txt files
 * @param string output_folder_path: folder where the .jsonl files are written
 * @return void: None
 * @note
**/
135void ConvertInputFilesToJsonl(const string input_folder_path,
136 const string output_folder_path)
137{
138
139 string target_line="",source_line="";
140 cout << "### Convert input file(.txt) to .jsonl. ###" << endl;
141
142 vector<string> filename_list;
143 vector<uint64_t> file_line_number_list;
144
145 // Get the list of files in this->intermediate_folder and set it to vector<string> file_list
146 GetFileNameListWithoutExtention(input_folder_path,&filename_list);
147 GetFileLineNumberList(input_folder_path,&filename_list,".txt",&file_line_number_list);
148
149 string line;
150 // Compare all lines of source_file and target_file
151 for(int i=0;i<(int)filename_list.size();i++){
152 string filename = filename_list[i];
153 uint64_t file_line_number = file_line_number_list[i];
154 ifstream input_file(input_folder_path+"/"+filename+".txt");
155 string output_file_path(output_folder_path+"/"+filename+".jsonl");
156 uint64_t line_count = 0;
157 chrono::system_clock::time_point start, end;
158 start = chrono::system_clock::now();
159
160 while(getline(input_file, line)){
161 Document document;
162 end = std::chrono::system_clock::now();
163 uint32_t elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
164
165 ProceedProgressBar(line_count+1,file_line_number,elapsed_time);
166 ConvertTextToDocument(line,filename,to_string(line_count),document);
167 WriteDocumentToJsonl(document,output_file_path);
168 line_count++;
169 }
170 input_file.close();
171 }
172}
173
174/**
175 * @brief Format statistics
176 * @details
177 * Example:
178 * ```cpp
179 * ```
180 * @param string process_name: Cleaning filter name.
181 * @param string output_path: Path of file for statistics.
182 * @param double elapsed_time: elapsed process time.
183 * @return Stats: statistics
184 * @note
185**/
186Stats MakeStats(string process_name,
187 string output_path,
188 double elapsed_time)
189{
190 Stats stats;
191
192 // int index_filename = output_path.find_last_of("/")+1;
193 stats.process_name=process_name;
194 // stats.file_name=output_path.substr(index_filename);
195 stats.file_name="";
196 stats.elapsed_time=elapsed_time;
197 // stats.result_file_size=filesystem::file_size(output_path);
198 stats.result_file_size=0;
199 return stats;
200}
201
202/**
203 * @brief Output statistics
204 * @details
205 * Example:
206 * @param Stats stats: Statistics to be output.
207 * @return None
208 * @note
209**/
211{
212 cout <<"######### "<<stats.process_name<<" #########"<<endl;
213 cout << "file_name:" << stats.file_name << endl;
214 cout << "elapsed_time[s]:"<< stats.elapsed_time << endl;
215 cout << "result file size[Byte]:"<<stats.result_file_size<<endl;
216 cout << endl;
217}
218
/**
 * @brief Save exception information to a file.
 * @details
 * @param string function_name: name of the function that caused the exception
 * @param string reference: reference information, for example the offending sentence
 * @return None
 * @note
**/
227void CorpusCleaner::StoreException(string function_name, string reference)
228{
229 string filename = this->exception_path+"/exception.txt";
230 ofstream output_file(filename, ios::app);
231
232 output_file << "Function Name: "<< function_name << " , ";
233 output_file << "Reference:" << reference << " , ";
234 output_file << endl;
235 output_file.close();
236}
237
238/***constructor***/
240 string output_path,
241 uint32_t min_length,
242 uint32_t max_length,
243 set<string> accept_language,
244 bool store_rejected,
245 bool sentence_segment,
246 float language_threshold,
247 double perplexity_threshold,
248 GenerateDedupLSH *generate_dedup_lsh,
249 LSHDeduplicator *deduplicator)
250{
251 this->input_path = input_path;
252 this->output_path = output_path+"/cleaned/";
253 this->intermediate_path = output_path+"/intermediate/";
254 this->rejected_path = output_path+"/rejected/";
255 this->exception_path = output_path+"/exception/";
256
257 this->min_length = min_length;
258 this->max_length = max_length;
259 this->accept_language = accept_language;
260 this->store_rejected = store_rejected;
261 this->sentence_segment = sentence_segment;
262 this->language_threshold = language_threshold;
263 this->perplexity_threshold = perplexity_threshold;
264 this->generate_dedup_lsh=generate_dedup_lsh;
265 this->deduplicator=deduplicator;
266
267 if(filesystem::exists(this->output_path) |
268 filesystem::exists(this->rejected_path)) {
269 cout << "ERROR: output_path or rejected_path folder already exists. ";
270 cout << "Please RENAME to delete the selection." << endl;
271 exit(EXIT_FAILURE);
272 }
273
274 RemoveFolder(this->intermediate_path);
275 RemoveFolder(this->exception_path);
276
277 mkdir(output_path.c_str(), 0777);
278 mkdir(this->intermediate_path.c_str(), 0777);
279 mkdir(this->output_path.c_str(), 0777);
280 mkdir(this->exception_path.c_str(), 0777);
281 mkdir(this->rejected_path.c_str(), 0777);
282
283 //Read from input_path's files, and write to output_path in jsonl format.
284 // TODO: uncommentout next line
285 // ConvertInputFilesToJsonl(this->input_path,this->output_path);
286 CopyFolder(this->input_path,this->intermediate_path);
287}
288
289/***destructor***/
291{
292 //remove intermediate folder
293 // TODO: uncommentout next line
294 // RemoveFolder(this->intermediate_path);
295}
296
297
298/***member function***/
299/**
300 * @brief Remove too long sentence and too short sentence.
301 * @details
302 * Remove too long sentence that is length is more thanand too short sentence.
303 * The length of too long sentence is more than "max_length".
304 * The length of too short sentence is lesser than "min_length".
305 * @param Document &document: single line text to clean be cleaned
306 * @return void: None
307 * @note
308**/
310{
311 uint32_t line_length = strlen_utf8(document.text);
312 if (line_length < this->min_length || this->max_length < line_length) {
313 document.is_rejected=true;
314 document.metadata.insert(__func__);
315 }
316}
317
318/**
319 * @brief KenLM's Perplexity Quality filtering
320 * @details
321 * Please Refer document of "TODO"
322 * 1.
323 * 2. If the perplexity is less than "threshold", the "document" is to be rejected.
324 *
325 * Example:
326 *
327 * @param Document &document: single line text to be cleaned
328 * @return void: None
329 * @note
330**/
332{
333 document.perplexity = this->kenlm_filter.PerplexityWithSentencePiece(ConvertUTF8ToWstring(document.text));
334
335 // If kenlm's perplexity is less than threshold, the text is to be rejected.
336 document.is_rejected=true;
337 if(document.perplexity<=this->perplexity_threshold){
338 document.is_rejected=false;
339 }
340
341 if(document.is_rejected) document.metadata.insert(__func__);
342}
343
344/**
345 * @brief Language filtering using fastText.
346 * @details
347 * ```cpp
348 * string in = "吾輩は猫である。名前はまだ無い。";
349 * FastTextEx language_filter;
350 * pair<float, string> score;
351 * score = language_filter.filter(in);
352 * // score.first ==1.00005, score.second ==__label__ja
353 *
354 * string in2 = "I am a cat. No name yet.";
355 * score = language_filter.filter(in2);
356 * // score.first ==0.75237, score.second ==__label__en
357 * ```
358 * @param Document &document: single line text to be cleaned
359 * @return void: None
360 * @ref
361 * https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast
362 * https://fasttext.cc/docs/en/supervised-tutorial.html
363 * @note
364**/
366{
367 vector<pair<float, string>> predictions;
368 int32_t k = 1;
369 float threshold = 0.0;
370
371 try{
372 this->language_filter.predictOneLine(document.text, predictions, k, threshold);
373 //return pair<float, string> : float: Language assessment score, string: Language determination results
374 pair<float,string> result;
375 if((int)predictions.size()==0) result=make_pair(0.0,"other"); //Remove. This case often applies to lines containing only spaces.
376 else result = make_pair((float)predictions[0].first,predictions[0].second);
377
378 document.language = result.second;
379 document.language_score = result.first;
380
381 document.is_rejected=true;
382 if(accept_language.find(document.language)!=accept_language.end()){
383 // If fasttext's score is less than threshold, the text to be rejected.
384 if(document.language_score>=this->language_threshold){
385 document.is_rejected=false;
386 }
387 }
388 }
389 catch(...){
390 cout << "Exception:LanguageFilter" << endl;
391 throw;
392 }
393
394 if(document.is_rejected) document.metadata.insert(__func__);
395}
396
397
398/**
399 * @brief Remove URLs matching regular expression.
400 * @details
401 * Remove URLs matching regular expression.
402 * The regular expression is "(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+)".
403 * @param Document &document: single line text to clean be cleaned
404 * @return void: None
405 * @note
406**/
408{
409 static regex url_pattern(R"((https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+))");
410 string sentence = regex_replace(document.text,url_pattern,"");
411
412 if(sentence!=document.text) document.metadata.insert(__func__);
413 document.text = sentence;
414}
415
416/**
417 * @brief Remove special character. For example, ☀, ♡, ☆, and so on.
418 * @details
419 * Remove emoji characters that is \U00002600(☀) to \U000027ff(⟿),
420 * \U00002190(←) to \U000021ff(⇿),\U00002300(⌀) to \U000023ff(⏿)
421 * \U00002900(⤀) to \U0000297f(⥿),\U00002b00(⬀) to \U00002bff(⯿),
422 * and \U0001f000(🀀) to \U0001f0ff(🃿).
423 * The C++ regex library does not support 4-byte characters.
424 * Therefore, characters like 🀀 cannot be matched using regular expressions.
425 * So, in a full search, those that completely match the pictogram are searched and removed.
426 *
427 * Example:
428 * TODO.
429 * @param string input_path: The path of filterd file.
430 * @param string output_path: The output path of results file.
431 * @return Stats: statics imformation of this function.
432 * @ref https://guppy.eng.kagawa-u.ac.jp/OpenCampus/unicode.html
433 * @note
434**/
436{
437 string special_character = "";
438 vector<string> start_character = {"☀","←","⌀","⤀","⬀","🀀"};
439 vector<int> character_range = {512,112,256,128,256,256};
440 string sentence=document.text;
441
442 for(int i=0;i<(int)start_character.size();i++){
443 special_character = start_character[i];
444 //remove special_character that is ,for example, "☀" to "⟿"
445 for(int j=0;j<character_range[i];j++){
446 ReplaceSubstring(sentence,special_character,"");
447 special_character = CalculateNextEmoji(special_character);
448 }
449 }
450
451 if(sentence!=document.text) document.metadata.insert(__func__);
452 document.text = sentence;
453}
454
455/**
456 * @brief Remove emoji. For example, 🤗, 🐉, 📊, and so on.
457 * @details
458 * Remove emoji characters that is \U0001F300(🌀) to \U0001F9FF(🧿).
459 * The C++ regex library does not support 4-byte characters.
460 * Therefore, characters like 🌀 cannot be matched using regular expressions.
461 * So, in a full search, those that completely match the pictogram are searched and removed.
462 * @param Document &document: single line text to be cleaned
463 * @return void: None
464 * @ref https://guppy.eng.kagawa-u.ac.jp/OpenCampus/unicode.html
465 * @note
466**/
468{
469 string sentence = document.text;
470 string emoji ="🌀";
471 //remove emoji that is "🌀" to "🧿"
472
473 for(int i=0;i<1792;i++){
474 ReplaceSubstring(sentence,emoji,"");
475 emoji = CalculateNextEmoji(emoji);
476 }
477 if(sentence!=document.text) document.metadata.insert(__func__);
478 document.text = sentence;
479}
480
481/**
482 * @brief Remove quotes. For example, [1], {245}, and so on.
483 * @details
484 * Remove remarks matching regular expression.
485 * The regular expression is "(\[([0-9]+)\]|\{([0-9]+)\})".
486 * @param Document &document: single line text to be cleaned
487 * @return void: None
488 * @attention Don't use this on corpus that contain formulas or programs.
489**/
491{
492 static regex remaks_pattern(R"((\[([0-9]+)\]|\{([0-9]+)\}))");
493 string sentence = regex_replace(document.text,remaks_pattern,"");
494
495 if(sentence!=document.text) document.metadata.insert(__func__);
496 document.text = sentence;
497}
498
499/**
500 * @brief Remove sentence without punctuation.
501 * @details
502 * Remove sentence without punctuation that is "、","、","。","。",".",".","?","?","!","!".
503 *
504 * Example:
505 *
506 * @param Document &document: single line text to be cleaned
507 * @return void: None
508 * @note
509 * This filter is heuristic.
510 * For example, a sentence that "https://github.com/" is not removed because it includes '.'.
511**/
513{
514 vector<string> punctures = {"、","、","。","。",".",".","?","?","!","!"};
515 string sentence = document.text;
516
517 document.is_rejected = true;
518 for(auto puncture: punctures){
519 // If there is no puncture in sentence, the return of .find() is string::npos
520 if(document.text.find(puncture)!=string::npos){
521 document.is_rejected = false;
522 break;
523 }
524 }
525
526 if(document.is_rejected) document.metadata.insert(__func__);
527}
528
529/**
530 * @brief Neologd Normalize sentence
531 * @details
532 * Please Refer document of "NormalizeNeologd()".
533 * @param string input_path: The path of filterd file.
534 * @param string output_path: The output path of results file.
535 * @return Stats: statics imformation of this function.
536 * @ref
537 * https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast
538 * @note
539**/
541{
542 string sentence = NormalizeNeologd(document.text);
543
544 if(sentence!=document.text) document.metadata.insert(__func__);
545 document.text = sentence;
546
547}
548
549
550/**
551 * @brief MinHashLSH Deduplication files in the this->intermediate folder
552 * @details
553 * Follow the steps below to remove duplication between all lines of all files in the this->intermediate folder.
554 * 1. Get the list of files in this->intermediate_folder and set it to vector<string> file_list
555 * 2. Compare all lines of source_file and target_file in file_list.
556 * 3. Check duplication between all lines of souce file and all lines of target_file.
557 * Therefore, characters like 🌀 cannot be matched using regular expressions.
558 * I considered deduplication using set or multiset,
559 * but I did not use this method because the file size could exceed the memory capacity.
560 *
561 * Example:
562 *
563 * @param string input_folder_path: input folder path
564 * @param string output_folder_path: output folder path
565 * @return Stats: statics imformation of this function.
566 * @ref
567 * @note TODO: fix return stats.
568**/
570{
571 // Read Document from jsonl
572 vector<string> lshs = this->generate_dedup_lsh->CalculateLSH(ConvertUTF8ToWstring(document.text));
573 try{
574 if(this->deduplicator->Apply(&lshs)){
575 document.is_rejected = true;
576 document.metadata.insert(__func__);
577 }
578
579 //If seen is greater than or equal to bucket_size, clear seen to 0
580 if(this->deduplicator->SizeOfSeen()>=this->deduplicator->GetTotalBucketSize()){
581 cout << "MinhashDeduplicator: The size of Seen is more than total_bucket_size." << endl;
582 cout << "Now, clear seen and blacklist." << endl;
583 this->deduplicator->InitializeSeen();
584 }
585
586 //If seen is greater than or equal to bucket_size, clear seen to 0
587 if(this->deduplicator->SizeOfBlacklist()>=this->deduplicator->GetTotalBucketSize()){
588 cout << "MinhashDeduplicator: The size of blacklist is more than total_bucket_size." << endl;
589 cout << "Now, clear blacklist." << endl;
590 this->deduplicator->InitializeBlacklist();
591 }
592
593 }
594 catch(...){
595 cout << "Exception:MinhashDeduplication" << endl;
596 throw;
597 }
598}
599
600/**
601 * @brief Simple sentence splitter for japanese text.
602 * @details
603 * I used Pragmatic Segmenter's Japanese rules as a reference for sentence separation rules.
604 * The C++ regex library does not support 4-byte characters.
605 * Therefore, characters like 🌀 cannot be matched using regular expressions.
606 * So, in a full search, those that completely match the pictogram are searched and removed.
607 *
608 * Example: TODO
609 * @param string input_path: The path of filterd file.
610 * @param string output_path: The output path of results file.
611 * @return Stats: statics imformation of this function.
612 * @ref
613 * https://github.com/wwwcojp/ja_sentence_segmenter/blob/main/ja_sentence_segmenter/split/simple_splitter.py
614 * https://github.com/diasks2/pragmatic_segmenter#golden-rules-japanese
615 * @note
616**/
617void CorpusCleaner::SentenceSegmenter(string input_folder_path, string output_folder_path)
618{
619 string target_line="",source_line="";
620 vector<string> file_list;
621 // Get the list of files in this->intermediate_folder and set it to vector<string> file_list
622 GetFileNameListWithoutExtention(input_folder_path,&file_list);
623 // Compare all lines of source_file and target_file
624 for(int i=0;i<(int)file_list.size();i++){
625 ifstream target_file(input_folder_path+"/"+file_list[i]+".txt");
626 string output_file_path(output_folder_path+"/"+file_list[i]+".jsonl");
627 int64_t line_count =-1;
628 while (getline(target_file, target_line)) {
629 vector<string> segments;
630 Document document;
631 line_count++;
632 try{ConvertTextToDocument(target_line,file_list[i],to_string(line_count),document);}
633 catch(...){
634 string exception_detail = "line: "+target_line;
635 cout << "Exeption(ConvertTextToDocument): "<< exception_detail << endl;
636 StoreException("ConvertTextToDocument","input{"+exception_detail+"}");
637 continue;
638 }
639
640 try{SegmentSentence(document.text, segments);}
641 catch(...){
642 StoreException(__func__, "target_line:{"+target_line+"}");
643 continue;
644 }
645
646 try{
647 uint64_t sentence_count=0;
648 if((int64_t)segments.size()!=1){
649 for(auto sentence:segments){
650 Document document_segmented = document;
651 document_segmented.text = sentence;
652 document_segmented.id = document.id+"_"+to_string(sentence_count);
653 document_segmented.metadata.insert(__func__);
654 WriteDocumentToJsonl(document_segmented,output_file_path);
655 }
656 }
657 else WriteDocumentToJsonl(document,output_file_path);
658 }
659 catch(...){continue;}
660 }
661 target_file.close();
662 }
663}
664
665/**
666 * @brief PipelineStep
667 * @details
668 * @param Document &document: document is to be filtered
669 * @param void (CorpusCleaner::*cleaner)(Document &): filter function list
670 * @return Stats: statics imformation of this function.
671 * @ref
672**/
674{
675 chrono::system_clock::time_point start, end;
676 start = chrono::system_clock::now();
677
678 // Execute filtering function
679 try{ (this->*cleaner)(document); }
680 catch(...){
681 cout << "Exeption(PipelineStep): "<<document.id <<" "<<document.text << endl;
682 StoreException(__func__, "document.text{"+document.text+"}");
683 throw;
684 return MakeStats(__func__,"",0);
685 }
686
687 end = chrono::system_clock::now();
688 double elapsed = chrono::duration_cast<chrono::duration<double>>(end - start).count();
689 // TODO: fix second parameters
690 return MakeStats(__func__,"",elapsed);
691}
692
693
694/**
695 * @brief Pipeline that sequentially executes the configured CorpusCleaner methods
696 * @details
697 * Perform the following steps in order.
698 * 1. Set CorpusCleaner process to pipeline_list that will be executed. (Please read attention.)
699 * 2. Loop processing as many times as pipeline_list
700 * 2-1. copy output folder to intermediate folder
701 * 2-2. Get list of files in intermediate folder.
702 * 2-3. Execute the each CorpusCleaner processing on all files in the intermediate folder.
703 *
704 * Example:
705 * ```cpp
706 * string input_folder_path = "../../results/dataset/input/";
707 * string output_folder_path = "../../results/dataset/output/";
708 * uint32_t min_length= 5;
709 * uint32_t max_length = 5000;
710 * set<string> accept_language{"__label__ja"};
711 * bool store_rejected = true;
712 * bool execute_sentence_segment = false; // TODO: switch true
713 * double language_threshold = 0.3;
714 * double perplexity_threshold = 40000;
715 *
716 * string blacklist_file_path = output_folder_path+"/blacklist.txt";
717 * GenerateDedupLSH generate_dedup_lsh(4,200,20,10);
718 * LSHDeduplicator deduplicator(true,blacklist_file_path,true,1280000000);
719 *
720 * // create instance
721 * CorpusCleaner corpus_cleaner(input_folder_path,
722 * output_folder_path,
723 * min_length,
724 * max_length,
725 * accept_language,
726 * store_rejected,
727 * execute_sentence_segment,
728 * language_threshold,
729 * perplexity_threshold,
730 * &generate_dedup_lsh,
731 * &deduplicator);
732 *
733 * // Execute cleaning pipeline
734 * corpus_cleaner.CleanPipeline();
735 * ```
736 * @param void: None
737 * @return None
738 * @attention
739 * CorpusCleaner processing is performed in the order set in Cleaner_array.
740 * For example, set cleaner_array as follows:
741 * ```cpp
742 * vector<void (CorpusCleaner::*)(Document &)> cleaner_list = {
743 * &CorpusCleaner::URLRemover ,
744 * &CorpusCleaner::LengthFilter,
745 * &CorpusCleaner::SpecialCharacterRemover
746 * };
747 * ```
748 * At this time, processing is performed in the order of
749 * 1. URLRemover, 2. LengthFilter, and 3. SpecialCharacterRemover.
750**/
752{
// NOTE(review): the doxygen extraction dropped original lines 751 (the
// CleanPipeline signature, `int32_t CorpusCleaner::CleanPipeline(void)`
// per the member index) and 756-765 (the entries of cleaner_list).
// The extracted text below is kept byte-identical; do not trust the empty
// initializer list — reconstruct it from the repository source.
 753 // Set CorpusCleaner process that will be executed.
 754 // They will be executed in the order you set them.
 755 vector<void (CorpusCleaner::*)(Document &)> cleaner_list = {
 766 };
 767
 768 cout << "### Start Clean Pipeline. ###" << endl;
 769 if(this->sentence_segment==true){
 770 cout << "### Execute Sentence Segmenter. ###" << endl;
 771 // Loop processing as many times as deduplicate_list
 772 // MoveFolder(this->output_path, this->intermediate_path);
 773 SentenceSegmenter(this->intermediate_path,this->output_path);
 774 RemoveFolder(this->intermediate_path);
 775 // OutputStats(stats);
 776 // copy output folder to intermediate folder
 777 MoveFolder(this->output_path, this->intermediate_path);
 778 }
 779
 780 vector<string> filename_list;
 781 vector<uint64_t> file_line_number_list;
 782
 783 // Get list of files in intermediate folder
 784 GetFileNameListWithoutExtention(this->intermediate_path,&filename_list);
// When sentence segmentation ran, the intermediate files are .jsonl;
// otherwise they are still raw .txt.
 785 string extention = (this->sentence_segment==true) ? ".jsonl":".txt";
 786 GetFileLineNumberList(this->intermediate_path,&filename_list,extention,&file_line_number_list);
 787
 788 cout << "### Excecute CleanPipeline. ###" << endl;
 789 // Execute the each CorpusCleaner processing on all files in the intermediate folder.
 790 for (int i=0;i<(int)filename_list.size();i++){
 791 // load data
 792 string filename = filename_list[i];
 793 uint64_t file_line_number = file_line_number_list[i];
 794 cout << "Start Cleaning "+this->intermediate_path+filename+extention << endl;
 795
 796 ifstream input_file(this->intermediate_path+filename+extention);
 797 string output_file_path(this->output_path+filename+".jsonl");
 798 string rejected_file_path(this->rejected_path+filename+".jsonl");
 799 string line="";
// line_count relies on deliberate unsigned wraparound: -1 + the first
// increment yields 0.
 800 uint64_t line_count=-1; // The fist incrementation is overflow.
 801 uint64_t removed_line_count = 0;
 802 chrono::system_clock::time_point start, end;
 803 start = chrono::system_clock::now();
 804
 805 while (getline(input_file, line)) {
 806 Document document;
 807 line_count++;
 808 // load data
 809 end = std::chrono::system_clock::now();
 810 uint32_t elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
 811 ProceedProgressBar(line_count+1,file_line_number,elapsed_time);
// Parse the line either as JSONL (after segmentation) or as raw text.
 812 if(this->sentence_segment==true){
 813 try{ReadDocumentFromJsonlOneLine(document,line);}
 814 catch(...){
 815 string exception_detail = "line: "+line;
 816 cout << "Exeption(ReadDocumentFromJsonlOneLine): "<< exception_detail << endl;
 817 StoreException("ReadDocumentFromJsonlOneLine","input_jsonl_line{"+exception_detail+"}");
 818 continue;
 819 }
 820 }
 821 else{
 822 try{ConvertTextToDocument(line,filename,to_string(line_count),document);}
 823 catch(...){
 824 string exception_detail = "line: "+line;
 825 cout << "Exeption(ConvertTextToDocument): "<< exception_detail << endl;
 826 StoreException("ConvertTextToDocument","input{"+exception_detail+"}");
 827 continue;
 828 }
 829 }
 830
 831
 832 // Loop processing as many times as cleaner_list
// NOTE(review): `stats` is scoped inside the try block, so the
// commented-out OutputStats(stats) below could not compile as-is.
 833 for (const auto& cleaner : cleaner_list) {
 834 try{Stats stats = PipelineStep(document,(cleaner));}
 835 catch(...){continue;}
 836 //OutputStats(stats);
 837 // if rejected, break and turn to next line.
 838 if(document.is_rejected){
 839 removed_line_count++;
 840 break;
 841 }
 842 }
 843
 844 // dump data
 845 try{
 846 if(document.is_rejected){
 847 if(this->store_rejected) WriteDocumentToJsonl(document,rejected_file_path);
 848 }
 849 else WriteDocumentToJsonl(document,output_file_path);
 850 }
 851 catch(...){
 852 string exception_detail = "document.text: "+document.text+" document.id: "+document.id;
 853 cout << "Exeption(WriteDocumentToJsonl): "<< exception_detail << endl;
 854 StoreException("WriteDocumentToJsonl","input_jsonl_line{"+exception_detail+"}");
 855 continue;
 856 }
 857 }
 858
 859 // output removed results
 860 printf("Removed line number: %ld\n",removed_line_count);
 861 printf("Remaining line number: %ld\n",file_line_number - removed_line_count);
 862 printf("Removed ratio: %.2f%%\n",double(removed_line_count) / file_line_number * 100);
 863 printf("Remaining ratio: %.2f%%\n",100 - double(removed_line_count) / file_line_number * 100);
 864 input_file.close();
 865 }
 866
 867 return 0;
 868}
869
void Normalizer(Document &document)
Neologd Normalize sentence.
CorpusCleaner(string input_path, string output_path, uint32_t min_length, uint32_t max_length, set< string > accept_language, bool store_rejected, bool sentence_segment, float language_threshold, double perplexity_threshold, GenerateDedupLSH *generate_dedup_lsh, LSHDeduplicator *deduplicator)
void LengthFilter(Document &document)
Remove too long sentence and too short sentence.
void SpecialCharacterRemover(Document &document)
Remove special character. For example, ☀, ♡, ☆, and so on.
void ZeroPunctuationFilter(Document &document)
Remove sentence without punctuation.
void PerplexityFilter(Document &document)
KenLM's Perplexity Quality filtering.
void MinhashDeduplication(Document &document)
MinHashLSH Deduplication files in the this->intermediate folder.
int32_t CleanPipeline(void)
Pipeline that sequentially executes the configured CorpusCleaner methods.
Stats PipelineStep(Document &document, void(CorpusCleaner::*cleaner)(Document &))
PipelineStep.
void LanguageFilter(Document &document)
Language filtering using fastText.
void EmojiRemover(Document &document)
Remove emoji. For example, 🤗, 🐉, 📊, and so on.
void StoreException(string function_name, string reference)
Save exception in file.
void QuotesRemover(Document &document)
Remove quotes. For example, [1], {245}, and so on.
void SentenceSegmenter(string input_folder_path, string output_folder_path)
Simple sentence splitter for japanese text.
void URLRemover(Document &document)
Remove URLs matching regular expression.
vector< string > CalculateLSH(wstring text)
Calculate minhash list of text.
Definition minhash.cpp:107
double PerplexityWithSentencePiece(const wstring sentence)
Perplexity sentence by KenLM with SentencePiece Tokenizing.
void InitializeSeen(void)
Initialize seen parameter.
Definition minhash.cpp:300
size_t SizeOfBlacklist(void)
Calculate size of blacklist (rough estimate)
Definition minhash.cpp:265
void InitializeBlacklist(void)
Initialize blacklist parameter.
Definition minhash.cpp:331
size_t SizeOfSeen(void)
Calculate size of blacklist (rough estimate)
Definition minhash.cpp:232
bool Apply(const vector< string > *lshs)
Calculate minhash list of text.
Definition minhash.cpp:200
void predictOneLine(string sentence, vector< pair< real, string > > &predictions, int32_t k, real threshold) const
Judge the language of one sentence.
void ReadDocumentFromJsonlOneLine(Document &document, string input_jsonl_line)
Loggging Document to output_file_path.
void ConvertInputFilesToJsonl(const string input_folder_path, const string output_folder_path)
Convert input files to jsonl that has Document's element.
void ConvertTextToDocument(string sentence, string filename, string file_line_count, Document &document)
Convert input files to jsonl that has Document's element.
void OutputStats(Stats stats)
Output statistics.
void WriteDocumentToJsonl(Document &document, string output_file_path)
Loggging Document to output_file_path.
Stats MakeStats(string process_name, string output_path, double elapsed_time)
Format statistics.
void ReadDocumentFromJsonlOneLine(Document &document, string input_jsonl_line)
Loggging Document to output_file_path.
void ConvertTextToDocument(string sentence, string filename, string file_line_count, Document &document)
Convert input files to jsonl that has Document's element.
void WriteDocumentToJsonl(Document &document, string output_file_path)
Loggging Document to output_file_path.
Stats MakeStats(string process_name, string output_path, double elapsed_time)
Format statistics.
string NormalizeNeologd(string sentence)
Neologd Normalized function.
Structure for storing statistical information for each process of CorpusCleaner.
float language_score
set< string > metadata
double perplexity
Structure for storing statistical information for each process of CorpusCleaner.
double elapsed_time
string file_name
uint32_t result_file_size
string process_name
void GetFileLineNumberList(const string folder_path, const vector< string > *file_list, const string file_extention, vector< uint64_t > *file_line_number_list)
Get file line number list.
Definition util.cpp:237
void RemoveFolder(const std::string &path)
Delete a folder with its contents.
Definition util.cpp:41
void ProceedProgressBar(unsigned long long line_count, unsigned long long file_line_number, uint32_t elapsed_time_ms)
Update progress bar.
Definition util.cpp:498
void CopyFolder(string source_folder, string target_folder)
copy source_folder to target_folder
Definition util.cpp:116
uint32_t strlen_utf8(string input)
Get exact length of UTF-8 string in C.
Definition util.cpp:17
string CalculateNextEmoji(string pre_emoji)
Derive the next emoji.
Definition util.cpp:177
void ReplaceSubstring(string &sentence, const string &target, const string &replacement)
Definition util.cpp:403
wstring ConvertUTF8ToWstring(const string &sentence)
Convert string to Wstring.
Definition util.cpp:270
void GetFileNameListWithoutExtention(const string folder_path, vector< string > *file_list)
Get filename list in folder_path.
Definition util.cpp:208
void SegmentSentence(string sentence, vector< string > &segments)
Segmentation Sentence.
Definition util.cpp:315
void MoveFolder(string source_folder, string target_folder)
copy source_folder to target_folder
Definition util.cpp:142
string EscapeWord(const string &input)
Escape word.
Definition util.cpp:474