5using namespace simdjson;
27 string input_jsonl_line)
30 simdjson::ondemand::parser parser;
31 simdjson::ondemand::document jsonl_line = parser.iterate(input_jsonl_line);
32 string_view line_view;
34 jsonl_line[
"text"].get(line_view);
35 document.
text = string(line_view);
37 jsonl_line[
"id"].get(line_view);
38 document.
id = string(line_view);
40 jsonl_line[
"is_rejected"].get(line_view);
44 jsonl_line[
"metadata"].get(line_view);
45 string tmp = string(line_view);
48 while(getline(ss, token,
',')){
52 jsonl_line[
"language"].get(line_view);
55 jsonl_line[
"language_score"].get(line_view);
58 jsonl_line[
"perplexity"].get(line_view);
75 string output_file_path)
78 ofstream output_file(output_file_path, ios::app);
82 output_file <<
"\"text\":\"" <<document.
text <<
"\",";
83 output_file <<
"\"id\":\"" << document.
id <<
"\",";
84 output_file <<
"\"is_rejected\":\"" << document.
is_rejected <<
"\",";
85 output_file <<
"\"metadata\":\"";
86 for (
auto iter = document.
metadata.begin(); iter != document.
metadata.end(); ++iter) {
87 output_file << *iter <<
",";
90 output_file <<
"\"language\":\"" <<document.
language <<
"\",";
91 output_file <<
"\"language_score\":\"" <<document.
language_score <<
"\",";
92 output_file <<
"\"perplexity\":\"" <<document.
perplexity <<
"\"";
93 output_file <<
"}"<< endl;
115 string file_line_count,
118 document.
text = sentence;
119 document.
id = filename+
"_"+file_line_count;
136 const string output_folder_path)
139 string target_line=
"",source_line=
"";
140 cout <<
"### Convert input file(.txt) to .jsonl. ###" << endl;
142 vector<string> filename_list;
143 vector<uint64_t> file_line_number_list;
151 for(
int i=0;i<(int)filename_list.size();i++){
152 string filename = filename_list[i];
153 uint64_t file_line_number = file_line_number_list[i];
154 ifstream input_file(input_folder_path+
"/"+filename+
".txt");
155 string output_file_path(output_folder_path+
"/"+filename+
".jsonl");
156 uint64_t line_count = 0;
157 chrono::system_clock::time_point start, end;
158 start = chrono::system_clock::now();
160 while(getline(input_file, line)){
162 end = std::chrono::system_clock::now();
163 uint32_t elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
212 cout <<
"######### "<<stats.
process_name<<
" #########"<<endl;
213 cout <<
"file_name:" << stats.
file_name << endl;
214 cout <<
"elapsed_time[s]:"<< stats.
elapsed_time << endl;
229 string filename = this->exception_path+
"/exception.txt";
230 ofstream output_file(filename, ios::app);
232 output_file <<
"Function Name: "<< function_name <<
" , ";
233 output_file <<
"Reference:" << reference <<
" , ";
243 set<string> accept_language,
245 bool sentence_segment,
246 float language_threshold,
247 double perplexity_threshold,
251 this->input_path = input_path;
252 this->output_path = output_path+
"/cleaned/";
253 this->intermediate_path = output_path+
"/intermediate/";
254 this->rejected_path = output_path+
"/rejected/";
255 this->exception_path = output_path+
"/exception/";
257 this->min_length = min_length;
258 this->max_length = max_length;
259 this->accept_language = accept_language;
260 this->store_rejected = store_rejected;
261 this->sentence_segment = sentence_segment;
262 this->language_threshold = language_threshold;
263 this->perplexity_threshold = perplexity_threshold;
264 this->generate_dedup_lsh=generate_dedup_lsh;
265 this->deduplicator=deduplicator;
267 if(filesystem::exists(this->output_path) |
268 filesystem::exists(this->rejected_path)) {
269 cout <<
"ERROR: output_path or rejected_path folder already exists. ";
270 cout <<
"Please RENAME to delete the selection." << endl;
277 mkdir(output_path.c_str(), 0777);
278 mkdir(this->intermediate_path.c_str(), 0777);
279 mkdir(this->output_path.c_str(), 0777);
280 mkdir(this->exception_path.c_str(), 0777);
281 mkdir(this->rejected_path.c_str(), 0777);
286 CopyFolder(this->input_path,this->intermediate_path);
312 if (line_length < this->min_length || this->max_length < line_length) {
337 if(document.
perplexity<=this->perplexity_threshold){
367 vector<pair<float, string>> predictions;
369 float threshold = 0.0;
374 pair<float,string> result;
375 if((
int)predictions.size()==0) result=make_pair(0.0,
"other");
376 else result = make_pair((
float)predictions[0].first,predictions[0].second);
382 if(accept_language.find(document.
language)!=accept_language.end()){
390 cout <<
"Exception:LanguageFilter" << endl;
409 static regex url_pattern(R
"((https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+))");
410 string sentence = regex_replace(document.
text,url_pattern,
"");
412 if(sentence!=document.
text) document.
metadata.insert(__func__);
413 document.
text = sentence;
437 string special_character =
"";
438 vector<string> start_character = {
"☀",
"←",
"⌀",
"⤀",
"⬀",
"🀀"};
439 vector<int> character_range = {512,112,256,128,256,256};
440 string sentence=document.
text;
442 for(
int i=0;i<(int)start_character.size();i++){
443 special_character = start_character[i];
445 for(
int j=0;j<character_range[i];j++){
451 if(sentence!=document.
text) document.
metadata.insert(__func__);
452 document.
text = sentence;
469 string sentence = document.
text;
473 for(
int i=0;i<1792;i++){
477 if(sentence!=document.
text) document.
metadata.insert(__func__);
478 document.
text = sentence;
492 static regex remaks_pattern(R
"((\[([0-9]+)\]|\{([0-9]+)\}))");
493 string sentence = regex_replace(document.
text,remaks_pattern,
"");
495 if(sentence!=document.
text) document.
metadata.insert(__func__);
496 document.
text = sentence;
514 vector<string> punctures = {
"、",
"、",
"。",
"。",
".",
".",
"?",
"?",
"!",
"!"};
515 string sentence = document.
text;
518 for(
auto puncture: punctures){
520 if(document.
text.find(puncture)!=string::npos){
544 if(sentence!=document.
text) document.
metadata.insert(__func__);
545 document.
text = sentence;
574 if(this->deduplicator->
Apply(&lshs)){
580 if(this->deduplicator->
SizeOfSeen()>=this->deduplicator->GetTotalBucketSize()){
581 cout <<
"MinhashDeduplicator: The size of Seen is more than total_bucket_size." << endl;
582 cout <<
"Now, clear seen and blacklist." << endl;
587 if(this->deduplicator->
SizeOfBlacklist()>=this->deduplicator->GetTotalBucketSize()){
588 cout <<
"MinhashDeduplicator: The size of blacklist is more than total_bucket_size." << endl;
589 cout <<
"Now, clear blacklist." << endl;
595 cout <<
"Exception:MinhashDeduplication" << endl;
619 string target_line=
"",source_line=
"";
620 vector<string> file_list;
624 for(
int i=0;i<(int)file_list.size();i++){
625 ifstream target_file(input_folder_path+
"/"+file_list[i]+
".txt");
626 string output_file_path(output_folder_path+
"/"+file_list[i]+
".jsonl");
627 int64_t line_count =-1;
628 while (getline(target_file, target_line)) {
629 vector<string> segments;
634 string exception_detail =
"line: "+target_line;
635 cout <<
"Exeption(ConvertTextToDocument): "<< exception_detail << endl;
636 StoreException(
"ConvertTextToDocument",
"input{"+exception_detail+
"}");
647 uint64_t sentence_count=0;
648 if((int64_t)segments.size()!=1){
649 for(
auto sentence:segments){
650 Document document_segmented = document;
651 document_segmented.
text = sentence;
652 document_segmented.
id = document.
id+
"_"+to_string(sentence_count);
653 document_segmented.
metadata.insert(__func__);
659 catch(...){
continue;}
675 chrono::system_clock::time_point start, end;
676 start = chrono::system_clock::now();
679 try{ (this->*cleaner)(document); }
681 cout <<
"Exeption(PipelineStep): "<<document.
id <<
" "<<document.
text << endl;
687 end = chrono::system_clock::now();
688 double elapsed = chrono::duration_cast<chrono::duration<double>>(end - start).count();
768 cout <<
"### Start Clean Pipeline. ###" << endl;
769 if(this->sentence_segment==
true){
770 cout <<
"### Execute Sentence Segmenter. ###" << endl;
777 MoveFolder(this->output_path, this->intermediate_path);
780 vector<string> filename_list;
781 vector<uint64_t> file_line_number_list;
785 string extention = (this->sentence_segment==
true) ?
".jsonl":
".txt";
788 cout <<
"### Excecute CleanPipeline. ###" << endl;
790 for (
int i=0;i<(int)filename_list.size();i++){
792 string filename = filename_list[i];
793 uint64_t file_line_number = file_line_number_list[i];
794 cout <<
"Start Cleaning "+this->intermediate_path+filename+extention << endl;
796 ifstream input_file(this->intermediate_path+filename+extention);
797 string output_file_path(this->output_path+filename+
".jsonl");
798 string rejected_file_path(this->rejected_path+filename+
".jsonl");
800 uint64_t line_count=-1;
801 uint64_t removed_line_count = 0;
802 chrono::system_clock::time_point start, end;
803 start = chrono::system_clock::now();
805 while (getline(input_file, line)) {
809 end = std::chrono::system_clock::now();
810 uint32_t elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
812 if(this->sentence_segment==
true){
815 string exception_detail =
"line: "+line;
816 cout <<
"Exeption(ReadDocumentFromJsonlOneLine): "<< exception_detail << endl;
817 StoreException(
"ReadDocumentFromJsonlOneLine",
"input_jsonl_line{"+exception_detail+
"}");
824 string exception_detail =
"line: "+line;
825 cout <<
"Exeption(ConvertTextToDocument): "<< exception_detail << endl;
826 StoreException(
"ConvertTextToDocument",
"input{"+exception_detail+
"}");
833 for (
const auto& cleaner : cleaner_list) {
835 catch(...){
continue;}
839 removed_line_count++;
852 string exception_detail =
"document.text: "+document.
text+
" document.id: "+document.
id;
853 cout <<
"Exeption(WriteDocumentToJsonl): "<< exception_detail << endl;
854 StoreException(
"WriteDocumentToJsonl",
"input_jsonl_line{"+exception_detail+
"}");
860 printf(
"Removed line number: %ld\n",removed_line_count);
861 printf(
"Remaining line number: %ld\n",file_line_number - removed_line_count);
862 printf(
"Removed ratio: %.2f%%\n",
double(removed_line_count) / file_line_number * 100);
863 printf(
"Remaining ratio: %.2f%%\n",100 -
double(removed_line_count) / file_line_number * 100);
void Normalizer(Document &document)
Normalize a sentence with NEologd normalization rules.
CorpusCleaner(string input_path, string output_path, uint32_t min_length, uint32_t max_length, set< string > accept_language, bool store_rejected, bool sentence_segment, float language_threshold, double perplexity_threshold, GenerateDedupLSH *generate_dedup_lsh, LSHDeduplicator *deduplicator)
void LengthFilter(Document &document)
Remove too long sentence and too short sentence.
void SpecialCharacterRemover(Document &document)
Remove special character. For example, ☀, ♡, ☆, and so on.
void ZeroPunctuationFilter(Document &document)
Remove sentence without punctuation.
void PerplexityFilter(Document &document)
KenLM's Perplexity Quality filtering.
void MinhashDeduplication(Document &document)
MinHashLSH Deduplication files in the this->intermediate folder.
int32_t CleanPipeline(void)
Pipeline that sequentially executes the configured CorpusCleaner methods.
Stats PipelineStep(Document &document, void(CorpusCleaner::*cleaner)(Document &))
Execute a single cleaner step on the document, measuring its elapsed time and collecting statistics.
void LanguageFilter(Document &document)
Language filtering using fastText.
void EmojiRemover(Document &document)
Remove emoji. For example, 🤗, 🐉, 📊, and so on.
void StoreException(string function_name, string reference)
Save exception in file.
void QuotesRemover(Document &document)
Remove reference marks such as [1], {245}, and so on.
void SentenceSegmenter(string input_folder_path, string output_folder_path)
Simple sentence splitter for Japanese text.
void URLRemover(Document &document)
Remove URLs matching regular expression.
vector< string > CalculateLSH(wstring text)
Calculate minhash list of text.
double PerplexityWithSentencePiece(const wstring sentence)
Perplexity sentence by KenLM with SentencePiece Tokenizing.
void InitializeSeen(void)
Initialize seen parameter.
size_t SizeOfBlacklist(void)
Calculate size of blacklist (rough estimate)
void InitializeBlacklist(void)
Initialize blacklist parameter.
size_t SizeOfSeen(void)
Calculate size of seen (rough estimate)
bool Apply(const vector< string > *lshs)
Judge whether the given LSH hash list is a duplicate (deduplication membership check).
void predictOneLine(string sentence, vector< pair< real, string > > &predictions, int32_t k, real threshold) const
Judge the language of one sentence.
void ReadDocumentFromJsonlOneLine(Document &document, string input_jsonl_line)
Read a Document from one JSONL line.
void ConvertInputFilesToJsonl(const string input_folder_path, const string output_folder_path)
Convert input files to JSONL files containing the Document's elements.
void ConvertTextToDocument(string sentence, string filename, string file_line_count, Document &document)
Convert a text sentence into a Document.
void OutputStats(Stats stats)
Output statistics.
void WriteDocumentToJsonl(Document &document, string output_file_path)
Logging Document to output_file_path.
Stats MakeStats(string process_name, string output_path, double elapsed_time)
Format statistics.
void ReadDocumentFromJsonlOneLine(Document &document, string input_jsonl_line)
Read a Document from one JSONL line.
void ConvertTextToDocument(string sentence, string filename, string file_line_count, Document &document)
Convert a text sentence into a Document.
void WriteDocumentToJsonl(Document &document, string output_file_path)
Logging Document to output_file_path.
Stats MakeStats(string process_name, string output_path, double elapsed_time)
Format statistics.
string NormalizeNeologd(string sentence)
NEologd normalization function.
Structure for storing statistical information for each process of CorpusCleaner.
Structure for storing statistical information for each process of CorpusCleaner.
uint32_t result_file_size
void GetFileLineNumberList(const string folder_path, const vector< string > *file_list, const string file_extention, vector< uint64_t > *file_line_number_list)
Get file line number list.
void RemoveFolder(const std::string &path)
Delete a folder with its contents.
void ProceedProgressBar(unsigned long long line_count, unsigned long long file_line_number, uint32_t elapsed_time_ms)
Update progress bar.
void CopyFolder(string source_folder, string target_folder)
Copy source_folder to target_folder.
uint32_t strlen_utf8(string input)
Get exact length of UTF-8 string in C.
string CalculateNextEmoji(string pre_emoji)
Derive the next emoji.
void ReplaceSubstring(string &sentence, const string &target, const string &replacement)
Replace occurrences of target in sentence with replacement.
wstring ConvertUTF8ToWstring(const string &sentence)
Convert string to Wstring.
void GetFileNameListWithoutExtention(const string folder_path, vector< string > *file_list)
Get filename list in folder_path.
void SegmentSentence(string sentence, vector< string > &segments)
Segmentation Sentence.
void MoveFolder(string source_folder, string target_folder)
Move source_folder to target_folder.
string EscapeWord(const string &input)
Escape word.