1#include <bits/stdc++.h>
6namespace fs = filesystem;
54 string file_line_count,
69 string intermediate_path;
72 string exception_path;
73 uint32_t min_length=5;
74 uint32_t max_length=1000;
75 set<string> accept_language{
"__label__ja"};
76 bool sentence_segment=
true;
77 bool store_rejected=
true;
78 float language_threshold=0.3;
79 double perplexity_threshold=999999;
92 set<string> accept_language,
94 bool sentence_segment,
95 float language_threshold,
96 double perplexity_threshold,
void Normalizer(Document &document)
Normalize the sentence using Neologd normalization.
CorpusCleaner(string input_path, string output_path, uint32_t min_length, uint32_t max_length, set< string > accept_language, bool store_rejected, bool sentence_segment, float language_threshold, double perplexity_threshold, GenerateDedupLSH *generate_dedup_lsh, LSHDeduplicator *deduplicator)
void LengthFilter(Document &document)
Remove sentences that are too long or too short.
void SpecialCharacterRemover(Document &document)
Remove special characters. For example, ☀, ♡, ☆, and so on.
void ZeroPunctuationFilter(Document &document)
Remove sentences without punctuation.
void PerplexityFilter(Document &document)
KenLM's Perplexity Quality filtering.
void MinhashDeduplication(Document &document)
Deduplicate files in the this->intermediate folder using MinHashLSH.
int32_t CleanPipeline(void)
Pipeline that sequentially executes the configured CorpusCleaner methods.
Stats PipelineStep(Document &document, void(CorpusCleaner::*cleaner)(Document &))
PipelineStep.
void LanguageFilter(Document &document)
Language filtering using fastText.
void EmojiRemover(Document &document)
Remove emoji. For example, 🤗, 🐉, 📊, and so on.
void StoreException(string function_name, string reference)
Save exception in file.
void QuotesRemover(Document &document)
Remove quotes. For example, [1], {245}, and so on.
void SentenceSegmenter(string input_folder_path, string output_folder_path)
Simple sentence splitter for Japanese text.
void URLRemover(Document &document)
Remove URLs matching regular expression.
void ReadDocumentFromJsonlOneLine(Document &document, string input_jsonl_line)
Logging Document to output_file_path.
void ConvertInputFilesToJsonl(string input_folder_path, string output_folder_path)
Convert input files to jsonl that has Document's element.
void ConvertTextToDocument(string sentence, string filename, string file_line_count, Document &document)
Convert input files to jsonl that has Document's element.
void OutputStats(Stats stats)
Output statistics.
struct _DOCUMENT Document
Structure for storing statistical information for each process of CorpusCleaner.
void WriteDocumentToJsonl(Document &document, string output_file_path)
Logging Document to output_file_path.
Stats MakeStats(string process_name, string output_path, double elapsed_time)
Format statistics.
struct _STATS Stats
Structure for storing statistical information for each process of CorpusCleaner.
Structure for storing statistical information for each process of CorpusCleaner.
Structure for storing statistical information for each process of CorpusCleaner.
uint32_t result_file_size