Corpus Cleaner
util.cpp
Go to the documentation of this file.
1#include "util.hpp"
2using namespace std;
3
/**
 * @brief Count the code points of a UTF-8 encoded string
 * @details
 * Every byte that is not a UTF-8 continuation byte (bit pattern 10xxxxxx)
 * starts a new code point, so counting those bytes yields the length in
 * code points. All bytes of the string are inspected, including embedded
 * NUL bytes.
 * Example:
 * ```cpp
 * string input="こんにちわ。";
 * uint32_t length = strlen_utf8(input); // 6
 * ```
 * @param string input: UTF-8 text string.
 * @return uint32_t: number of code points in input
 * @ref https://ja.stackoverflow.com/questions/2988/c%E8%A8%80%E8%AA%9E%E3%81%A7%E3%81%AEutf-8%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E6%AD%A3%E7%A2%BA%E3%81%AA%E9%95%B7%E3%81%95%E3%82%92%E5%8F%96%E5%BE%97%E3%81%99%E3%82%8B
 * @attention The input is not validated; bytes are classified by bit pattern only.
**/
uint32_t strlen_utf8(std::string input)
{
    uint32_t length = 0;
    // Iterate over the whole string instead of stopping at the first NUL byte.
    for (const char byte : input) {
        // Continuation bytes (10xxxxxx) belong to the previous code point.
        if ((byte & 0xc0) != 0x80) ++length;
    }
    return length;
}
27
/**
 * @brief Delete a folder with its contents
 * @details
 * Removes the given path and, if it is a directory, everything below it.
 * Symbolic links found inside the folder are removed themselves; the
 * directories they point to are left untouched.
 * Nothing happens when the path does not exist.
 * Example:
 * ```cpp
 * string folder_path="data/intermediate";
 * RemoveFolder(folder_path);
 * ```
 * @param string path: folder path to be deleted
 * @return None
 * @attention Failures reported by the filesystem library propagate as
 *            std::filesystem::filesystem_error.
**/
void RemoveFolder(const std::string& path)
{
    const std::filesystem::path target_path = path;
    // Nothing to delete when the path is absent.
    if (!std::filesystem::exists(target_path)) return;
    // remove_all deletes recursively without following symlinks.
    std::filesystem::remove_all(target_path);
}
58
/**
 * @brief Copy source_path to target_path
 * @details
 * The copy is delegated to std::filesystem::copy with the
 * overwrite_existing option, so an existing target file is replaced.
 * Example:
 * ```cpp
 * string source_path="data/source/a.txt";
 * string target_path="data/target/a.txt";
 * CopyFile(source_path,target_path);
 * ```
 * @param string source_path: Copy source file path
 * @param string target_path: Copy target file path
 * @return None
 * @attention
**/
void CopyFile(std::string source_path, std::string target_path)
{
    std::filesystem::copy(std::filesystem::path(source_path),
                          std::filesystem::path(target_path),
                          std::filesystem::copy_options::overwrite_existing);
}
79
/**
 * @brief Move a file into target_folder
 * @details
 * The file keeps its own name: it is renamed to
 * target_folder / (file name of source_path).
 * Example:
 * ```cpp
 * string source_path="data/source/a.txt";
 * string target_folder="data/target/";
 * MoveFile(source_path,target_folder); // -> data/target/a.txt
 * ```
 * @param string source_path: path of the file to move
 * @param string target_folder: folder the file is moved into
 * @return None
 * @attention The move is a std::filesystem::rename call.
**/
void MoveFile(std::string source_path, std::string target_folder)
{
    const std::filesystem::path source_file(source_path);
    // Keep the original file name inside the target folder.
    const std::filesystem::path moved_file =
        std::filesystem::path(target_folder) / source_file.filename();
    std::filesystem::rename(source_file, moved_file);
}
101
/**
 * @brief Copy the files of source_folder to target_folder
 * @details
 * Only the regular files located directly in source_folder are copied;
 * sub-directories are skipped. Existing files in target_folder are
 * overwritten.
 * Example:
 * ```cpp
 * string source_folder="data/source";
 * string target_folder="data/target";
 * CopyFolder(source_folder,target_folder);
 * ```
 * @param string source_folder: Copy source folder path
 * @param string target_folder: Copy target folder path
 * @return None
 * @attention target_folder is not created by this function.
**/
void CopyFolder(std::string source_folder, std::string target_folder)
{
    const std::filesystem::path from_dir(source_folder);
    const std::filesystem::path to_dir(target_folder);
    for (const auto& entry : std::filesystem::directory_iterator(from_dir)) {
        const std::filesystem::path& entry_path = entry.path();
        // Skip everything that is not a regular file (e.g. sub-directories).
        if (!std::filesystem::is_regular_file(entry_path)) continue;
        std::filesystem::copy(entry_path,
                              to_dir / entry_path.filename(),
                              std::filesystem::copy_options::overwrite_existing);
    }
}
127
/**
 * @brief Move the contents of source_folder into target_folder
 * @details
 * Every entry (file or directory) located directly in source_folder is
 * renamed into target_folder. target_folder, including missing parent
 * directories, is created when it does not exist. source_folder itself is
 * not removed.
 * Example:
 * ```cpp
 * string source_folder="data/source";
 * string target_folder="data/target";
 * MoveFolder(source_folder,target_folder);
 * ```
 * @param string source_folder: folder whose entries are moved
 * @param string target_folder: folder the entries are moved into
 * @return None
 * @attention A failure to move a single entry is reported on stderr and
 *            the remaining entries are still processed.
**/
void MoveFolder(std::string source_folder, std::string target_folder)
{
    const std::filesystem::path source_dir(source_folder);
    const std::filesystem::path target_dir(target_folder);

    // Nothing to move when the source is absent.
    if (!std::filesystem::exists(source_dir)) {
        std::cerr << "Source directory does not exist: " << source_folder << std::endl;
        return;
    }
    // create_directories also creates missing parents of the target.
    if (!std::filesystem::exists(target_dir)) std::filesystem::create_directories(target_dir);

    // Snapshot the entries first: renaming entries out of a directory while
    // iterating over it leaves the iterator's view of the directory unspecified.
    std::vector<std::filesystem::path> entry_paths;
    for (const auto& entry : std::filesystem::directory_iterator(source_dir)) {
        entry_paths.push_back(entry.path());
    }

    // Move files and folders
    for (const auto& entry_path : entry_paths) {
        try {
            std::filesystem::rename(entry_path, target_dir / entry_path.filename());
            std::cout << "Moved: " << entry_path.filename() << std::endl;
        } catch (const std::filesystem::filesystem_error& e) {
            std::cerr << "Error moving " << entry_path << ": " << e.what() << std::endl;
        }
    }
}
162
163
/**
 * @brief Derive the next emoji
 * @details
 * Increments the last byte of the UTF-8 sequence. When the result is no
 * longer a continuation byte (0x80..0xBF), the last byte wraps to 0x80 and
 * the second-to-last byte is incremented (a single carry step).
 * Example:
 * ```cpp
 * string emoji="🌀";
 * CalculateNextEmoji(emoji); //return:🌁
 * ```
 * @param string pre_emoji: emoji string (one UTF-8 encoded character)
 * @return string: next emoji (pre_emoji + 1), or "" when pre_emoji is
 *         shorter than 2 bytes, starts with '\360' but is not 4 bytes long,
 *         or starts with '\342' but is not 3 bytes long
 * @ref https://guppy.eng.kagawa-u.ac.jp/OpenCampus/unicode.html
 * @note The carry is not propagated beyond the second-to-last byte.
**/
std::string CalculateNextEmoji(std::string pre_emoji)
{
    // A lead byte plus at least one continuation byte is required;
    // this also keeps the index arithmetic below in bounds.
    if (pre_emoji.size() < 2) return "";
    //If pre_emoji[0]='\360', pre_emoji must have 4 elements
    if (pre_emoji[0] == '\360' && pre_emoji.size() != 4) return "";
    //If pre_emoji[0]='\342', pre_emoji must have 3 elements
    if (pre_emoji[0] == '\342' && pre_emoji.size() != 3) return "";

    std::string emoji = pre_emoji;
    const std::string::size_type last = emoji.size() - 1;

    // Work on unsigned values so the result does not depend on whether
    // plain char is signed on the target platform.
    const unsigned char next_byte =
        static_cast<unsigned char>(static_cast<unsigned char>(emoji[last]) + 1);

    if (next_byte >= 0x80 && next_byte <= 0xBF) {
        emoji[last] = static_cast<char>(next_byte);
    } else {
        // Left the continuation-byte range: wrap and carry into the previous byte.
        emoji[last] = static_cast<char>(0x80);
        emoji[last - 1] =
            static_cast<char>(static_cast<unsigned char>(emoji[last - 1]) + 1);
    }
    return emoji;
}
193
/**
 * @brief Get filename list in folder_path
 * @details
 * Appends the stem (file name without its last extension) of every regular
 * file located directly in folder_path to file_list.
 * Example:
 * ```cpp
 * string input_path = "../data/input/";
 * vector<string> file_list;
 * GetFileNameListWithoutExtention(input_path, &file_list);
 * ```
 * @param string folder_path: folder path
 * @param vector<string> *file_list: (return) filename list; entries are appended
 * @return None
 * @attention The order of the entries is the directory iteration order.
**/
void GetFileNameListWithoutExtention(const std::string folder_path, std::vector<std::string> *file_list)
{
    const std::filesystem::path folder(folder_path);
    for (const auto& entry : std::filesystem::directory_iterator(folder)) {
        // Directories and other non-regular entries are ignored.
        if (!entry.is_regular_file()) continue;
        file_list->push_back(entry.path().stem().string());
    }
}
218
/**
 * @brief Get file line number list
 * @details
 * For every name in file_list the file
 * folder_path + name + file_extention is read and its number of lines is
 * appended to file_line_number_list. The path, file name and line count are
 * printed to stdout.
 * Example:
 * ```cpp
 * string folder_path = "../data/input/";
 * vector<string> file_list;
 * GetFileNameListWithoutExtention(folder_path,&file_list);
 * vector<uint64_t> file_line_number_list;
 * GetFileLineNumberList(folder_path,&file_list,".txt",&file_line_number_list);
 * ```
 * @param const string folder_path: folder path (concatenated as-is, so it must end with a path separator)
 * @param const vector<string> *file_list: filename list (without extension)
 * @param const string file_extention: file extention of file_list (".json",".txt", and so on.)
 * @param vector<uint64_t> *file_line_number_list: (return) file line number list; entries are appended
 * @return None
 * @attention A file that cannot be opened is reported on stderr and recorded with 0 lines.
**/
void GetFileLineNumberList(const std::string folder_path,
                           const std::vector<std::string> *file_list,
                           const std::string file_extention,
                           std::vector<uint64_t> *file_line_number_list)
{
    for (const std::string& file_name : *file_list) {
        const std::string file_path = folder_path + file_name + file_extention;
        std::cout << file_path << std::endl;

        std::ifstream input_file(file_path);
        if (!input_file) {
            // Keep the output list aligned with file_list, but do not hide the failure.
            std::cerr << "Failed to open file: " << file_path << std::endl;
        }

        uint64_t line_count = 0;
        std::string line;
        while (std::getline(input_file, line)) ++line_count;

        file_line_number_list->push_back(line_count);
        std::cout << "file_name:" << file_name + file_extention << std::endl;
        // Print the count just computed; indexing the output list by the loop
        // position is wrong when the list already held entries on entry.
        std::cout << "file_lines:" << line_count << std::endl;
    }
}
257
/**
 * @brief Convert string to Wstring
 * @details
 * The conversion facet is chosen from the width of wchar_t so that the
 * result is the platform's native wide encoding: UTF-16 code units when
 * wchar_t is 16 bits wide, otherwise one wchar_t per code point. This keeps
 * the function the exact inverse of ConvertWstringToUTF8, also for
 * characters outside the Basic Multilingual Plane (e.g. emoji).
 * Example:
 * ```cpp
 * string input= "こんにちわ。";
 * wstring input_w = ConvertUTF8ToWstring(input);
 * ```
 * @param const string& sentence: UTF-8 text sentence
 * @return wstring: text sentence converted to wstring
 * @attention Conversion errors are reported by std::wstring_convert
 *            (std::range_error for input it cannot convert).
**/
std::wstring ConvertUTF8ToWstring(const std::string& sentence)
{
    if constexpr (sizeof(wchar_t) == 2) {
        // 16-bit wchar_t: wide strings are UTF-16.
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
        return converter.from_bytes(sentence);
    } else {
        // 32-bit wchar_t: one element per code point, no surrogate pairs.
        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
        return converter.from_bytes(sentence);
    }
}
275
/**
 * @brief Convert Wstring to string
 * @details
 * The conversion facet is chosen from the width of wchar_t: the input is
 * read as UTF-16 when wchar_t is 16 bits wide, otherwise as one code point
 * per wchar_t. This keeps the function the exact inverse of
 * ConvertUTF8ToWstring, also for characters outside the Basic Multilingual
 * Plane (e.g. emoji).
 * Example:
 * ```cpp
 * wstring input_w = L"こんにちわ。";
 * string input = ConvertWstringToUTF8(input_w);
 * ```
 * @param const wstring& sentence: text sentence as wide string
 * @return string: text sentence converted to a UTF-8 string
 * @attention Conversion errors are reported by std::wstring_convert
 *            (std::range_error for input it cannot convert).
**/
std::string ConvertWstringToUTF8(const std::wstring& sentence)
{
    if constexpr (sizeof(wchar_t) == 2) {
        // 16-bit wchar_t: surrogate pairs must be combined into one code point.
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
        return converter.to_bytes(sentence);
    } else {
        // 32-bit wchar_t: every element already is a full code point.
        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
        return converter.to_bytes(sentence);
    }
}
291
292/**
293 * @brief Segmentation Sentence
294 * @details
295 * Segmentation sentence is following steps...
296 * 1. Check if there are sentences enclosed in quotation marks.
297 * 2. Puctuations enclosed in quotation marks are ignored.
298 * 3. Replace "。"(puctuation_list) to "。\n".
299 * 4. Remove the last character of the string.
300 * Example:
301 * ```cpp
302 * string sentence = "../data/input/";
303 * SegmentationSentence(sentence);
304 * ```
305 * @param string sentence: sentence
306 * @return string: segmentated sentence
307 * @note Be sure to use Normalizer to normalize your corpus before this process.
308 * @attention
309 * TODO: "." to be segmented. (ignore that "12.3", "wiki.txt". )
310 * In the case of Japanese sentences,
311 * if you are talking about a full stop, there should be a space at the end,
312 * and the missing mark should be \n(at the end of the sentence).
313 * Only in that case should it be used as a full stop.
314**/
315void SegmentSentence(string sentence, vector<string> &segments)
316{
317 wstring sentence_w = ConvertUTF8ToWstring(sentence);
318 wstring sentence_segmented=L"";
319 vector<wchar_t> quote_list = {L'「',L'(',L'"',L'['};
320 vector<wchar_t> quote_end_list = {L'」',L')',L'"',L']'};
321 vector<wchar_t> punctuation_list={L'。',L'。',L'?',L'?',L'!',L'!'};
322 wstring delimiter = L"<<<NEW_LINE>>>";
323
324 for (int i=0;i<(int)sentence_w.size();i++) {
325 bool found_quote=false;
326 int32_t quote_index = -1;
327
328 //Check if there are sentences enclosed in quotation marks
329 for(int j=0;j<(int)quote_list.size();j++){
330 if(sentence_w[i]==quote_list[j]){
331 found_quote=true;
332 quote_index=j;
333 }
334 }
335
336 //Puctuations enclosed in quotation marks are ignored
337 if (found_quote) {
338 for(int j=i+1;j<(int)sentence_w.size();j++){
339 if(sentence_w[j]==quote_end_list[quote_index]){
340 sentence_segmented+=sentence_w.substr(i,j-i+1);
341 // cout << "「」:"<<ConvertWstringToUTF8(sentence_w.substr(i,j-i+1))<<endl;
342 i=j;
343 found_quote=false;
344 break;
345 }
346 }
347 //If there is no end quote, the beginning quote is added to the "sentence_segmented".
348 if(found_quote) sentence_segmented += sentence_w[i];
349 }
350 else{
351 sentence_segmented += sentence_w[i];
352 // replace "。"(punctuation_list) to "。\n"
353 for(auto punctuation: punctuation_list){
354 if(sentence_w[i]==punctuation) {
355 sentence_segmented+=delimiter;
356 break;
357 }
358 }
359 }
360 }
361
362 size_t pos = 0;
363 size_t prevPos = 0;
364
365 // segmented sentence push to segments
366 while ((pos = sentence_segmented.find(delimiter, prevPos)) != wstring::npos) {
367 wstring sub = sentence_segmented.substr(prevPos, pos - prevPos);
368 segments.push_back(ConvertWstringToUTF8(sub));
369 prevPos = pos + delimiter.length();
370 }
371
372 // Add the last substring after the last delimiter
373 if (prevPos < sentence_segmented.length()) {
374 wstring sub = sentence_segmented.substr(prevPos);
375 segments.push_back(ConvertWstringToUTF8(sub));
376 }
377}
378
/**
 * @brief Remove leading and trailing white space
 * @details
 * The characters removed are space, tab, line feed and carriage return.
 * Example:
 * ```cpp
 * string sentence= " こんにちわ。 ";
 * sentence = Strip(sentence); //"こんにちわ。"
 * ```
 * @param const string& sentence: text sentence
 * @return string: sentence without leading/trailing white space
 *         ("" when the sentence consists of white space only)
 * @attention
**/
std::string Strip(const std::string& sentence)
{
    const char* const blank_chars = " \t\n\r";

    // Position of the first character that is kept
    const std::string::size_type first = sentence.find_first_not_of(blank_chars);
    // Empty or all-blank input
    if (first == std::string::npos) return "";

    // Position of the last character that is kept
    const std::string::size_type last = sentence.find_last_not_of(blank_chars);
    return sentence.substr(first, last - first + 1);
}
402
/**
 * @brief Replace every occurrence of target in sentence
 * @details
 * The sentence is modified in place. The search continues behind the
 * inserted replacement, so text introduced by a replacement is not
 * replaced again.
 * Example:
 * ```cpp
 * string sentence = "a-b-c";
 * ReplaceSubstring(sentence, "-", "+"); // "a+b+c"
 * ```
 * @param string& sentence: (in/out) text sentence
 * @param const string& target: substring to be replaced
 * @param const string& replacement: text inserted instead of target
 * @return None
 * @attention An empty target leaves the sentence unchanged.
**/
void ReplaceSubstring(std::string& sentence, const std::string& target, const std::string& replacement)
{
    // An empty target matches at every position and would never terminate.
    if (target.empty()) return;

    std::string::size_type pos = 0;
    while ((pos = sentence.find(target, pos)) != std::string::npos) {
        sentence.replace(pos, target.length(), replacement);
        // Continue behind the inserted text
        pos += replacement.length();
    }
}
411
/**
 * @brief Get file path without its file extention (.txt).
 * @details
 * The result is the parent path joined with the stem of the file name.
 * Example:
 * ```cpp
 * string path = "/path/to/input.txt";
 * cout << GetFilePathWithoutExtention(path)<< endl; // /path/to/input
 * ```
 * @param const string& file_path: file path
 * @return string: file path without the last extension
 * @attention
**/
std::string GetFilePathWithoutExtention(const std::string& file_path)
{
    const std::filesystem::path original(file_path);
    // parent directory + file name without its last extension
    return (original.parent_path() / original.stem()).string();
}
432
433
434/**
435 * @brief Extract file name list from file path list and add .jsonl extention
436 * @details
437 * Example:
438 * ```cpp
439 * vector<string> path_list,filename_list;
440 * path_list.push_back("/path/to/input.txt");
441 * path_list.push_back("/path/to/input2.txt");
442 * GetFileNameListAddedJsonl(pathlist,filename_list)
443 * for(auto filename:filename_list)cout << filename << endl;
444 * // /path/to/input.jsonl /path/to/input2.jsonl
445 * ```
446 * @param const vector<string> &file_path_list: original file path list
447 * @param vector<string> &jsonl_file_path_list: (output) file path list added jsonl extention
448 * @return void: None
449 * @attention
450**/
451void GetFileNameListAddedJsonl(const vector<string> &file_path_list,
452 vector<string> &jsonl_file_path_list)
453{
454 for(auto file_path:file_path_list){
455 string filepath_witout_extention = GetFilePathWithoutExtention(file_path);
456 jsonl_file_path_list.push_back(filepath_witout_extention+".jsonl");
457 }
458}
459
/**
 * @brief Escape word
 * @details
 * A double quote, a tab and a backslash are replaced by the two-character
 * sequences \", \t and \\. Every other character is copied unchanged.
 * Example:
 * ```cpp
 * string input = "\"";
 * string output = EscapeWord(input);
 * // output == "\\\""  (a backslash followed by a double quote)
 * ```
 * @param const string& input: text sentence
 * @return string: converted sentence
 * @attention
**/
std::string EscapeWord(const std::string& input)
{
    std::string escaped;
    for (const char ch : input) {
        switch (ch) {
            case '"':
                escaped += "\\\"";
                break;
            case '\t':
                escaped += "\\t";
                break;
            case '\\':
                escaped += "\\\\";
                break;
            default:
                escaped += ch;
                break;
        }
    }
    return escaped;
}
488
/**
 * @brief Update progress bar
 * @details
 * Redraws a one-line progress bar on stdout in the form
 * `50% |██████████          | 50/100 [00:01:05<00:01:05, 1300.00ms/it]`
 * (percentage, 20-cell bar, processed/total lines, elapsed time,
 * estimated remaining time, milliseconds per line).
 * A line break is printed once the percentage reaches 100.
 * The formatting state of std::cout (flags, fill character, precision) is
 * restored before returning.
 * Example:
 * ```cpp
 * ProceedProgressBar(50, 100, 65000);
 * ```
 * @param unsigned long long line_count: number of lines processed so far
 * @param unsigned long long file_line_number: total number of lines
 * @param uint32_t elapsed_time_ms: elapsed time in milliseconds
 * @return None
 * @attention Nothing is printed when line_count or file_line_number is 0.
**/
void ProceedProgressBar(unsigned long long line_count,unsigned long long file_line_number,uint32_t elapsed_time_ms)
{
    //Division by 0 would occur
    if (file_line_number == 0) return;
    if (line_count == 0) return;

    // Remember the stream state so the manipulators below do not leak out.
    const auto saved_flags = std::cout.flags();
    const auto saved_fill = std::cout.fill();
    const auto saved_precision = std::cout.precision();

    // Prints milliseconds as zero-padded hh:mm:ss. setw only affects the next
    // inserted value, so it has to be repeated in front of every number.
    const auto print_hms = [](unsigned long long total_ms) {
        const unsigned long long total_seconds = total_ms / 1000;
        std::cout << std::setfill('0')
                  << std::setw(2) << total_seconds / 3600 << ":"
                  << std::setw(2) << (total_seconds / 60) % 60 << ":"
                  << std::setw(2) << total_seconds % 60;
    };

    // clear the current terminal line
    std::cout << "\033[2K";

    const unsigned long long progress_percentage =
        (unsigned long long)((double(line_count)) / double(file_line_number) * 100);
    std::cout << "\r" << progress_percentage << "% |";

    // 20 cells, one per 5%; clamp so an overshoot cannot distort the bar.
    const unsigned long long bar_width = 20;
    unsigned long long filled_cells = progress_percentage / 5;
    if (filled_cells > bar_width) filled_cells = bar_width;
    for (unsigned long long i = 0; i < filled_cells; i++) std::cout << "█";
    for (unsigned long long i = filled_cells; i < bar_width; i++) std::cout << " ";
    std::cout << "| " << line_count << "/" << file_line_number;

    //passed time
    std::cout << " [";
    print_hms(elapsed_time_ms);

    //Remaining time, estimated from the average time per line
    const double msecond_per_iter = double(elapsed_time_ms) / line_count;
    const unsigned long long remaining_lines =
        file_line_number > line_count ? file_line_number - line_count : 0;
    const unsigned long long remaining_time_ms =
        (unsigned long long)(msecond_per_iter * double(remaining_lines));
    std::cout << "<";
    print_hms(remaining_time_ms);

    //millisecond per iteration
    std::cout << ", " << std::fixed << std::setprecision(2) << msecond_per_iter << "ms/it";
    std::cout << "]";

    std::cout.flags(saved_flags);
    std::cout.fill(saved_fill);
    std::cout.precision(saved_precision);

    std::cout << std::flush;
    if (progress_percentage == 100) std::cout << std::endl;
}
void GetFileLineNumberList(const string folder_path, const vector< string > *file_list, const string file_extention, vector< uint64_t > *file_line_number_list)
Get file line number list.
Definition util.cpp:237
string ConvertWstringToUTF8(const wstring &sentence)
Convert Wstring to string.
Definition util.cpp:286
void MoveFile(string source_path, string target_folder)
Move the source file into the target folder.
Definition util.cpp:94
void RemoveFolder(const std::string &path)
Delete a folder with its contents.
Definition util.cpp:41
void ProceedProgressBar(unsigned long long line_count, unsigned long long file_line_number, uint32_t elapsed_time_ms)
Update progress bar.
Definition util.cpp:498
void CopyFolder(string source_folder, string target_folder)
copy source_folder to target_folder
Definition util.cpp:116
uint32_t strlen_utf8(string input)
Get exact length of UTF-8 string in C.
Definition util.cpp:17
string GetFilePathWithoutExtention(const string &file_path)
Get file name from path without file extention (.txt).
Definition util.cpp:424
string CalculateNextEmoji(string pre_emoji)
Derive the next emoji.
Definition util.cpp:177
void ReplaceSubstring(string &sentence, const string &target, const string &replacement)
Definition util.cpp:403
void CopyFile(string source_path, string target_path)
copy source_file to target_file
Definition util.cpp:73
wstring ConvertUTF8ToWstring(const string &sentence)
Convert string to Wstring.
Definition util.cpp:270
string Strip(const string &sentence)
Remove leading and trailing white space.
Definition util.cpp:391
void GetFileNameListWithoutExtention(const string folder_path, vector< string > *file_list)
Get filename list in folder_path.
Definition util.cpp:208
void SegmentSentence(string sentence, vector< string > &segments)
Segmentation Sentence.
Definition util.cpp:315
void MoveFolder(string source_folder, string target_folder)
Move the contents of source_folder into target_folder.
Definition util.cpp:142
string EscapeWord(const string &input)
Escape word.
Definition util.cpp:474
void GetFileNameListAddedJsonl(const vector< string > &file_path_list, vector< string > &jsonl_file_path_list)
Extract file name list from file path list and add .jsonl extention.
Definition util.cpp:451