Corpus Cleaner
main.cpp
Go to the documentation of this file.
1
2
3#include "corpus_cleaner.hpp"
4
5
6
/**
 * @brief Count the lines contained in a text file
 * @details
 *  The file is read line by line with getline(), so a last line that has
 *  no trailing newline is still counted. A file that cannot be opened
 *  yields 0.
 *  Example:
 *    string input_file = "../data/input/sample.txt";
 *    uint64_t lines = ConutLines(input_file);
 * @param const string& filename: path of the file to inspect
 * @return uint64_t: number of lines read from the file
**/
uint64_t ConutLines(const std::string& filename)
{
    std::ifstream reader(filename);
    uint64_t total = 0;
    // The stream turns false at end of file (or immediately, if it never opened).
    for (std::string buffer; std::getline(reader, buffer); ) {
        ++total;
    }
    // reader is closed by its destructor when it goes out of scope.
    return total;
}
25
26/**
27 * @brief split one file into multiple equal parts based on the number of lines
28 * @details
29 * @param string filename: file name
30 * @return uint64_t: count of file line
31**/
32void SplitFiles(const vector<string>& output_files, const string& input_file)
33{
34 ifstream input(input_file);
35
36 vector<ofstream> outputs((int)output_files.size());
37 // get file name of input
38 filesystem::path path_string(input_file);
39 string filename=path_string.filename();
40 cout << filename << endl;
41 cout << input_file<<endl;
42 for (int i = 0; i < (int)output_files.size(); i++) {
43 string output_filename = output_files[i]+"/"+filename;
44 outputs[i].open(output_filename);
45 if (!outputs[i]) {
46 cerr << "Error: Can't create file: " << output_filename << endl;
47 return;
48 }
49 }
50 // cout << "split"<<endl;
51
52 string line="";
53 uint64_t chunk_count=(ConutLines(input_file)+(uint64_t)output_files.size())/(uint64_t)output_files.size();
54 uint64_t file_index = 0;
55 uint64_t line_count = 0;
56 while (getline(input, line)) {
57
58 outputs[file_index] << line << endl;
59 line_count++;
60 if (line_count % chunk_count == 0) {
61 file_index++;
62 }
63 }
64 // cout << "split completed."<<endl;
65
66 input.close();
67 for (auto& output : outputs) {
68 output.close();
69 }
70}
71
72
/**
 * @brief Merge multiple files into one file by concatenating them in order
 * @details
 *  output_file is created (or truncated) and the raw bytes of every file in
 *  input_files are appended to it in list order. Input files that cannot be
 *  opened are reported and skipped; empty input files are skipped silently.
 *  If output_file cannot be opened or written, an error is printed and the
 *  function returns.
 * @param const vector<string>& input_files: file list that is merged
 * @param const string& output_file: merged file
 * @return void: None
**/
void MergeFiles(const std::vector<std::string>& input_files, const std::string& output_file)
{
    std::ofstream output(output_file, std::ios::binary);
    if (!output) {
        std::cerr << "Failed to open output file: " << output_file << std::endl;
        return;
    }

    for (const auto& input_file : input_files) {
        std::ifstream input(input_file, std::ios::binary);
        if (!input) {
            std::cerr << "Failed to open input file: " << input_file << std::endl;
            continue;
        }

        // Inserting a stream buffer that yields no characters sets failbit on
        // the destination stream, which would silently discard every later
        // input file. Skip empty inputs so the output stream stays usable.
        if (input.peek() == std::ifstream::traits_type::eof()) {
            continue;
        }

        // write input file to output file
        output << input.rdbuf();
        if (!output) {
            std::cerr << "Failed to write to output file: " << output_file << std::endl;
            return;
        }
    }
    // Both streams are closed by their destructors.
}
102
103
104void MultiProcessCorpusClean(const string input_folder_path,
105 const string output_folder_path)
106{
107 //const string input_folder_path=;
108 //const string output_folder_path=;
109
110 //string input_folder_path = "../../results/dataset/input/";
111 //string output_folder_path = "../../results/dataset/output/";
112 uint32_t min_length= 5;
113 uint32_t max_length = 5000;
114 set<string> accept_language{"__label__ja"};
115// RemoveFolder(output_folder_path);
116 bool store_rejected = true;
117 bool execute_sentence_segment = false; // TODO: switch true
118 double language_threshold = 0.3;
119 double perplexity_threshold = 40000;
120
121 const string blacklist_file_path = output_folder_path+"/blacklist.txt";
122
123 GenerateDedupLSH generate_dedup_lsh(4,200,20,10);
124 //LSHDeduplicator deduplicator(true,"../../results/dataset/blacklist.txt",true,5120000000);
125 LSHDeduplicator deduplicator(true,blacklist_file_path,true,1280000000);
126
127 // create instance
128 CorpusCleaner corpus_cleaner(input_folder_path,
129 output_folder_path,
130 min_length,
131 max_length,
132 accept_language,
133 store_rejected,
134 execute_sentence_segment,
135 language_threshold,
136 perplexity_threshold,
137 &generate_dedup_lsh,
138 &deduplicator);
139
140 // Execute cleaning pipeline
141 corpus_cleaner.CleanPipeline();
142}
143
144int main(void)
145{
146 // Please put the original .txt file in the original folder
147 const string original_folder_path = "../../results/dataset/original/";
148 const string base_folder_path = "../../results/dataset/";
149 const string results_folder_path = "../../results/dataset/cleaned/";
150 filesystem::create_directories(fs::path(results_folder_path));
151
152 // get file list
153 vector<string> filelist;
154 GetFileNameListWithoutExtention(original_folder_path,&filelist);
155
156 // Get the number of CPU threads and define it as thread_number = cpu thread count - 4; (value of 1 or more)
157 int32_t num_threads = std::thread::hardware_concurrency() - 4;
158 num_threads = (num_threads>0) ? num_threads : 1;
159 // num_threads=1;
160 cout << "Multi Process Number: "<< num_threads << endl;
161
162 // Create thread number folders in original_folder_path
163 vector<string> output_files;
164 for(int i=0;i<num_threads;i++){
165 string temp = base_folder_path+to_string(i)+"/input/";
166 fs::path new_directory(temp);
167 filesystem::create_directories(new_directory);
168 output_files.push_back(temp);
169 }
170 // Divide the files under original_folder_path into thread_number and put them into the input_folder_path+"i"th folder.
171 for(auto file:filelist) SplitFiles(output_files, original_folder_path+"/"+file+".txt");
172
173 // Use thread library to execute MultiProcessCorpusClean() in parallel for each thread
174 vector<thread> threads;
175 for(int i=0;i<num_threads;i++){
176 string input_folder_path = base_folder_path+to_string(i)+"/input/";
177 string output_folder_path = base_folder_path+to_string(i)+"/output/";
178 threads.emplace_back(MultiProcessCorpusClean,
179 input_folder_path,
180 output_folder_path);
181 }
182
183 // call each thread
184 for(auto& process : threads) {
185 process.join();
186 }
187
188 // merging all results files and make them into one
189 for(auto file:filelist){
190 vector<string> splited_filelist;
191 for(int i=0;i<num_threads;i++) splited_filelist.push_back(base_folder_path+to_string(i)+"/output/cleaned/"+file+".jsonl");
192 MergeFiles(splited_filelist,results_folder_path+file+".jsonl");
193 // for(int i=0;i<num_threads;i++) RemoveFolder(base_folder_path+to_string(i)+"/output/cleaned/");
194 }
195 return 0;
196}
197
198
199
200// #include "corpus_cleaner.hpp"
201
202// int main(void)
203// {
204// string input_folder_path = "../../results/dataset/input/";
205// string output_folder_path = "../../results/dataset/output/";
206// uint32_t min_length= 5;
207// uint32_t max_length = 5000;
208// set<string> accept_language{"__label__ja"};
209// // RemoveFolder(output_folder_path);
210// bool store_rejected = true;
211// bool execute_sentence_segment = false; // TODO: switch true
212// double language_threshold = 0.3;
213// double perplexity_threshold = 40000;
214
215// GenerateDedupLSH generate_dedup_lsh(4,200,20,10);
216// LSHDeduplicator deduplicator(true,"../../results/dataset/blacklist.txt",true,1280000000);
217
218// // create instance
219// CorpusCleaner corpus_cleaner(input_folder_path,
220// output_folder_path,
221// min_length,
222// max_length,
223// accept_language,
224// store_rejected,
225// execute_sentence_segment,
226// language_threshold,
227// perplexity_threshold,
228// &generate_dedup_lsh,
229// &deduplicator);
230
231// // Execute cleaning pipeline
232// corpus_cleaner.CleanPipeline();
233// return 0;
234// }
int32_t CleanPipeline(void)
Pipeline that sequentially executes the configured CorpusCleaner methods.
void SplitFiles(const vector< string > &output_files, const string &input_file)
split one file into multiple equal parts based on the number of lines
Definition main.cpp:32
void MergeFiles(const vector< string > &input_files, const string &output_file)
merge multiple files into one output file by concatenating them in order
Definition main.cpp:80
uint64_t ConutLines(const string &filename)
Get line count of filename file.
Definition main.cpp:16
int main(void)
Definition main.cpp:144
void MultiProcessCorpusClean(const string input_folder_path, const string output_folder_path)
Definition main.cpp:104
void GetFileNameListWithoutExtention(const string folder_path, vector< string > *file_list)
Get filename list in folder_path.
Definition util.cpp:208