Corpus Cleaner
corpus_cleaner.hpp
Go to the documentation of this file.
1#include <bits/stdc++.h>
2#include "language_filter.hpp"
4#include "minhash.hpp"
5using namespace std;
6namespace fs = filesystem;
7
8
9/**
10 * @brief Structure for storing one text of the corpus and the per-document information attached to it by each process of CorpusCleaner
11 * @details
12 * Each process of CorpusCleaner obtains the following specific information.
13 * - text: one sentence of corpus
14 * - id: text identification
15 * - is_rejected: True if this text is eligible for deletion
16 * - metadata: tags added during the filtering process
17 * - language: Language determined by LanguageFilter
18 * - language_score: Language score calculated by LanguageFilter
19 * - perplexity: perplexity calculated by PerplexityFilter
20 * These will be used later for drawing processing, etc.
21 * @note
22**/
23typedef struct _DOCUMENT {
24 string text="";
25 string id="";
26 bool is_rejected=false;
27 set<string> metadata;
28 string language;
30 double perplexity=999999;
32
33
34/**
35 * @brief Structure for storing statistical information for each process of CorpusCleaner
36 * @details
37 * Each process of CorpusCleaner obtains the following statistical information.
38 * 1. CorpusCleaner processing name
39 * 2. Processed file name
40 * 3. Elapsed processing time
41 * 4. File size after processing
42 *
43 * These will be used later for drawing processing, etc.
44 * @note
45**/
46typedef struct _STATS {
48 string file_name;
// Free-function helpers declared alongside CorpusCleaner: conversion between
// raw text / JSONL lines and Document, plus creation and output of Stats.

// Fills `document` from one sentence together with the name of the file it
// belongs to and a line count for that file.
// NOTE(review): how filename / file_line_count are used (presumably to build
// the document id) is not visible in this declaration -- confirm.
52void ConvertTextToDocument(string sentence,
53 string filename,
54 string file_line_count,
55 Document &document);
// Converts the input files under input_folder_path to jsonl that has
// Document's elements, written under output_folder_path.
56void ConvertInputFilesToJsonl(string input_folder_path,string output_folder_path);
// Populates `document` from a single JSONL line (input_jsonl_line).
// NOTE(review): presumably the inverse of WriteDocumentToJsonl -- confirm
// against the implementation.
57void ReadDocumentFromJsonlOneLine(Document &document,string input_jsonl_line);
// Logs `document` to the file at output_file_path.
58void WriteDocumentToJsonl(Document &document,string output_file_path);
// Formats statistics: builds a Stats value from a process name, an output
// path and an elapsed time.
// NOTE(review): the unit of elapsed_time is not visible here -- confirm.
59Stats MakeStats(string process_name,
60 string output_path,
61 double elapsed_time);
// Outputs the given statistics; the destination is not visible in this
// declaration.
62void OutputStats(Stats stats);
63
65{
66private:
67 /***member value***/
68 string input_path;
69 string intermediate_path;
70 string output_path;
71 string rejected_path;
72 string exception_path;
73 uint32_t min_length=5;
74 uint32_t max_length=1000;
75 set<string> accept_language{"__label__ja"};
76 bool sentence_segment=true;
77 bool store_rejected=true;
78 float language_threshold=0.3;
79 double perplexity_threshold=999999;
80 fasttext::FastTextEx language_filter;
81 KenLMFilter kenlm_filter;
82 //TODO: add a vector of the resulting file size of each cleaning process, and analyze it at the end.
83 GenerateDedupLSH *generate_dedup_lsh;
84 LSHDeduplicator *deduplicator;
85
86public:
87 /***constructor***/
88 CorpusCleaner(string input_path,
89 string output_path,
90 uint32_t min_length,
91 uint32_t max_length,
92 set<string> accept_language,
93 bool store_rejected,
94 bool sentence_segment,
95 float language_threshold,
96 double perplexity_threshold,
97 GenerateDedupLSH *generate_dedup_lsh,
98 LSHDeduplicator *deduplicator);
99 /***destructor***/
101 /***member function***/
102 void Normalizer(Document &document);
103 void URLRemover(Document &document);
104 void SpecialCharacterRemover(Document &document);
105 void EmojiRemover(Document &document);
106 void QuotesRemover(Document &document);
107 void LengthFilter(Document &document);
108 void LanguageFilter(Document &document);
109 void PerplexityFilter(Document &document);
110 void MinhashDeduplication(Document &document);
111 void ZeroPunctuationFilter(Document &document);
112 void SentenceSegmenter(string input_folder_path,string output_folder_path);
113 Stats PipelineStep(Document &document, void (CorpusCleaner::*cleaner)(Document &));
114 int32_t CleanPipeline(void);
115 void StoreException(string function_name, string reference);
116};
void Normalizer(Document &document)
Neologd Normalize sentence.
CorpusCleaner(string input_path, string output_path, uint32_t min_length, uint32_t max_length, set< string > accept_language, bool store_rejected, bool sentence_segment, float language_threshold, double perplexity_threshold, GenerateDedupLSH *generate_dedup_lsh, LSHDeduplicator *deduplicator)
void LengthFilter(Document &document)
Remove sentences that are too long or too short.
void SpecialCharacterRemover(Document &document)
Remove special character. For example, ☀, ♡, ☆, and so on.
void ZeroPunctuationFilter(Document &document)
Remove sentences without punctuation.
void PerplexityFilter(Document &document)
KenLM's Perplexity Quality filtering.
void MinhashDeduplication(Document &document)
MinHashLSH deduplication of files in the this->intermediate folder.
int32_t CleanPipeline(void)
Pipeline that sequentially executes the configured CorpusCleaner methods.
Stats PipelineStep(Document &document, void(CorpusCleaner::*cleaner)(Document &))
PipelineStep.
void LanguageFilter(Document &document)
Language filtering using fastText.
void EmojiRemover(Document &document)
Remove emoji. For example, 🤗, 🐉, 📊, and so on.
void StoreException(string function_name, string reference)
Save exception in file.
void QuotesRemover(Document &document)
Remove quotes. For example, [1], {245}, and so on.
void SentenceSegmenter(string input_folder_path, string output_folder_path)
Simple sentence splitter for japanese text.
void URLRemover(Document &document)
Remove URLs matching regular expression.
void ReadDocumentFromJsonlOneLine(Document &document, string input_jsonl_line)
Read one JSONL line (input_jsonl_line) into a Document.
void ConvertInputFilesToJsonl(string input_folder_path, string output_folder_path)
Convert input files to jsonl that has Document's element.
void ConvertTextToDocument(string sentence, string filename, string file_line_count, Document &document)
Convert one sentence, together with its file name and line count, into a Document.
void OutputStats(Stats stats)
Output statistics.
struct _DOCUMENT Document
Structure for storing one text of the corpus and the per-document information attached to it by each process of CorpusCleaner.
void WriteDocumentToJsonl(Document &document, string output_file_path)
Logging Document to output_file_path.
Stats MakeStats(string process_name, string output_path, double elapsed_time)
Format statistics.
struct _STATS Stats
Structure for storing statistical information for each process of CorpusCleaner.
Structure for storing one text of the corpus and the per-document information attached to it by each process of CorpusCleaner.
float language_score
set< string > metadata
double perplexity
Structure for storing statistical information for each process of CorpusCleaner.
double elapsed_time
string file_name
uint32_t result_file_size
string process_name