19 uint32_t i = 0, length = 0;
22 if ((input[i] & 0xc0) != 0x80) length++;
43 filesystem::path source_path = path;
45 if (!filesystem::exists(source_path)) {
49 for (
const auto& entry : filesystem::directory_iterator(path)) {
50 if (entry.is_directory()) {
53 filesystem::remove(entry.path());
56 filesystem::remove(path);
73void CopyFile(
string source_path,
string target_path)
75 filesystem::path source = source_path;
76 filesystem::path destination = target_path;
77 filesystem::copy(source, destination, filesystem::copy_options::overwrite_existing);
94void MoveFile(
string source_path,
string target_folder)
96 filesystem::path source = source_path;
97 filesystem::path destination = target_folder;
98 filesystem::path new_path = destination / source.filename();
99 filesystem::rename(source, new_path);
119 filesystem::path sourceDir = source_folder;
120 filesystem::path destinationDir = target_folder;
121 for (
const auto &file : filesystem::directory_iterator(sourceDir)) {
122 if (filesystem::is_regular_file(file.path())) {
123 filesystem::copy(file, destinationDir / file.path().filename(), filesystem::copy_options::overwrite_existing);
145 if (!filesystem::exists(source_folder)) {
146 std::cerr <<
"Source directory does not exist: " << source_folder << std::endl;
149 if (!filesystem::exists(target_folder)) filesystem::create_directory(target_folder);
152 for (
const auto& entry : filesystem::directory_iterator(source_folder)) {
154 filesystem::path newPath = target_folder / entry.path().filename();
155 filesystem::rename(entry.path(), newPath);
156 std::cout <<
"Moved: " << entry.path().filename() << std::endl;
157 }
catch (
const filesystem::filesystem_error& e) {
158 std::cerr <<
"Error moving " << entry.path() <<
": " << e.what() << std::endl;
179 string emoji=pre_emoji;
182 if(pre_emoji[0]==
'\360'&&pre_emoji.size()!=4)
return "";
184 if(pre_emoji[0]==
'\342'&&pre_emoji.size()!=3)
return "";
186 emoji[emoji.size()-1]++;
187 if(
int(emoji[emoji.size()-1])>=-64){
188 emoji[emoji.size()-1]=-128;
189 emoji[emoji.size()-2]++;
210 filesystem::path path = folder_path;
211 for (
const auto &entry : filesystem::directory_iterator(path)) {
212 if (entry.is_regular_file()) {
213 file_list->push_back(
string(entry.path().stem()));
238 const vector<string> *file_list,
239 const string file_extention,
240 vector<uint64_t> *file_line_number_list)
242 uint64_t line_count=0;
243 for(
int i=0;i<(int)file_list->size();i++){
244 ifstream input_file(folder_path+(*file_list)[i]+file_extention);
245 cout << folder_path+(*file_list)[i]+file_extention << endl;
248 while(getline(input_file,line)) line_count++;
250 file_line_number_list->push_back(line_count);
251 cout <<
"file_name:"<<(*file_list)[i]+file_extention<<endl;
252 cout <<
"file_lines:"<<(*file_line_number_list)[i]<<endl;
272 wstring_convert<codecvt_utf8_utf16<wchar_t>> converter;
273 return converter.from_bytes(sentence);
288 wstring_convert<codecvt_utf8<wchar_t> > converter;
289 return converter.to_bytes(sentence);
318 wstring sentence_segmented=L
"";
319 vector<wchar_t> quote_list = {L
'「',L
'(',L
'"',L
'['};
320 vector<wchar_t> quote_end_list = {L
'」',L
')',L
'"',L
']'};
321 vector<wchar_t> punctuation_list={L
'。',L
'。',L
'?',L
'?',L
'!',L
'!'};
322 wstring delimiter = L
"<<<NEW_LINE>>>";
324 for (
int i=0;i<(int)sentence_w.size();i++) {
325 bool found_quote=
false;
326 int32_t quote_index = -1;
329 for(
int j=0;j<(int)quote_list.size();j++){
330 if(sentence_w[i]==quote_list[j]){
338 for(
int j=i+1;j<(int)sentence_w.size();j++){
339 if(sentence_w[j]==quote_end_list[quote_index]){
340 sentence_segmented+=sentence_w.substr(i,j-i+1);
348 if(found_quote) sentence_segmented += sentence_w[i];
351 sentence_segmented += sentence_w[i];
353 for(
auto punctuation: punctuation_list){
354 if(sentence_w[i]==punctuation) {
355 sentence_segmented+=delimiter;
366 while ((pos = sentence_segmented.find(delimiter, prevPos)) != wstring::npos) {
367 wstring sub = sentence_segmented.substr(prevPos, pos - prevPos);
369 prevPos = pos + delimiter.length();
373 if (prevPos < sentence_segmented.length()) {
374 wstring sub = sentence_segmented.substr(prevPos);
394 size_t start = sentence.find_first_not_of(
" \t\n\r");
396 size_t end = sentence.find_last_not_of(
" \t\n\r");
398 if (start == string::npos)
return "";
400 return sentence.substr(start, end - start + 1);
406 while ((pos = sentence.find(target, pos)) != string::npos) {
407 sentence.replace(pos, target.length(), replacement);
408 pos += replacement.length();
426 filesystem::path path_object(file_path);
428 filesystem::path fs_file_path_witout_extention = path_object.parent_path() / path_object.stem();
429 string file_path_witout_extention = fs_file_path_witout_extention.string();
430 return file_path_witout_extention;
452 vector<string> &jsonl_file_path_list)
454 for(
auto file_path:file_path_list){
456 jsonl_file_path_list.push_back(filepath_witout_extention+
".jsonl");
477 for(
char word: input){
478 if(word == L
'\"') output+=
"\\\"";
479 else if(word ==
'\t') output+=
"\\t";
480 else if(word ==
'\\') output+=
"\\\\";
498void ProceedProgressBar(
unsigned long long line_count,
unsigned long long file_line_number,uint32_t elapsed_time_ms)
501 if(file_line_number==0)
return;
502 if(line_count==0)
return;
510 unsigned long long progress_percentage = (
unsigned long long)((
double(line_count))/double(file_line_number)*100);
512 cout <<
"\r" << progress_percentage <<
"% |";
513 for(
int i=0;i<int(progress_percentage/5);i++) cout <<
"█";
514 for(
int i=0;i<int(20-progress_percentage/5);i++) cout <<
" ";
516 cout <<
"| "<< line_count <<
"/" << file_line_number;
519 uint32_t hours = elapsed_time_ms / (1000 * 60 * 60);
520 uint32_t minutes = (elapsed_time_ms / (1000 * 60)) % 60;
521 uint32_t seconds = (elapsed_time_ms / (1000)) % 60;
523 cout << setw(2) <<
"[" << hours <<
":" << minutes <<
":"<< seconds;
526 uint32_t remaining_time = uint32_t(
double(elapsed_time_ms)/line_count*(file_line_number-line_count));
527 hours = remaining_time / (1000 * 60 * 60);
528 minutes = (remaining_time / (1000 * 60)) % 60;
529 seconds = (remaining_time / (1000)) % 60;
531 cout << setw(2) <<
"<"<<hours <<
":" << minutes <<
":" <<seconds;
534 double msecond_per_iter = double(elapsed_time_ms)/line_count;
537 cout << setprecision(2) <<
"," << msecond_per_iter <<
"ms/it" ;
540 if (progress_percentage==100) cout << endl;
void GetFileLineNumberList(const string folder_path, const vector< string > *file_list, const string file_extention, vector< uint64_t > *file_line_number_list)
Get file line number list.
string ConvertWstringToUTF8(const wstring &sentence)
Convert Wstring to string.
void MoveFile(string source_path, string target_folder)
Move the file at source_path into target_folder, keeping its file name.
void RemoveFolder(const std::string &path)
Delete a folder with its contents.
void ProceedProgressBar(unsigned long long line_count, unsigned long long file_line_number, uint32_t elapsed_time_ms)
Update progress bar.
void CopyFolder(string source_folder, string target_folder)
copy source_folder to target_folder
uint32_t strlen_utf8(string input)
Get exact length of UTF-8 string in C.
string GetFilePathWithoutExtention(const string &file_path)
Get the file path without its file extension (e.g. .txt).
string CalculateNextEmoji(string pre_emoji)
Derive the next emoji.
void ReplaceSubstring(string &sentence, const string &target, const string &replacement)
void CopyFile(string source_path, string target_path)
Copy the file at source_path to target_path, overwriting an existing target.
wstring ConvertUTF8ToWstring(const string &sentence)
Convert string to Wstring.
string Strip(const string &sentence)
Remove leading and trailing white space.
void GetFileNameListWithoutExtention(const string folder_path, vector< string > *file_list)
Get filename list in folder_path.
void SegmentSentence(string sentence, vector< string > &segments)
Split a sentence into segments.
void MoveFolder(string source_folder, string target_folder)
Move every entry in source_folder into target_folder.
string EscapeWord(const string &input)
Escape word.
void GetFileNameListAddedJsonl(const vector< string > &file_path_list, vector< string > &jsonl_file_path_list)
Extract file name list from file path list and add .jsonl extension.