Corpus Cleaner
Functions
util.hpp File Reference
#include <bits/stdc++.h>
#include <sys/stat.h>

Go to the source code of this file.

Functions

uint32_t strlen_utf8 (string input)
 Get exact length of UTF-8 string in C.
 
void RemoveFolder (const string &path)
 
string CalculateNextEmoji (string pre_emoji)
 Derive the next emoji.
 
void GetFileNameListWithoutExtention (string folder_path, vector< string > *file_list)
 Get filename list in folder_path.
 
void GetFileLineNumberList (const string folder_path, const vector< string > *file_list, const string file_extention, vector< uint64_t > *file_line_number_list)
 Get file line number list.
 
void CopyFolder (string source_folder, string target_folder)
 copy source_folder to target_folder
 
void CopyFile (string source_path, string target_path)
 copy source_file to target_file
 
void MoveFolder (string source_folder, string target_folder)
 move source_folder to target_folder
 
wstring ConvertUTF8ToWstring (const string &sentence)
 Convert string to Wstring.
 
string ConvertWstringToUTF8 (const wstring &sentence)
 Convert Wstring to string.
 
void SegmentSentence (string sentence, vector< string > &segments)
 Segmentation Sentence.
 
string Strip (const string &sentence)
 Remove leading and trailing white space.
 
void ReplaceSubstring (string &sentence, const string &target, const string &replacement)
 
string EscapeWord (const string &input)
 Escape word.
 
void ProceedProgressBar (unsigned long long line_count, unsigned long long file_line_number, uint32_t elapsed_time_ms)
 Update progress bar.
 

Function Documentation

◆ CalculateNextEmoji()

string CalculateNextEmoji ( string pre_emoji)

Derive the next emoji.

Example:

string emoji="🌀";
CalculateNextEmoji(emoji); //return:🌁
string CalculateNextEmoji(string pre_emoji)
Derive the next emoji.
Definition util.cpp:177
Parameters
string pre_emoji: emoji string
Returns
string: next emoji (pre_emoji + 1) https://guppy.eng.kagawa-u.ac.jp/OpenCampus/unicode.html
Note

Definition at line 177 of file util.cpp.

◆ ConvertUTF8ToWstring()

wstring ConvertUTF8ToWstring ( const string & sentence)

Convert string to Wstring.

Example:

string input= "こんにちわ。";
wstring input_w = ConvertUTF8ToWstring(input);
wstring ConvertUTF8ToWstring(const string &sentence)
Convert string to Wstring.
Definition util.cpp:270
Parameters
const string& sentence: text sentence (UTF-8)
Returns
wstring: text sentence converted to wstring
Attention

Definition at line 270 of file util.cpp.

◆ ConvertWstringToUTF8()

string ConvertWstringToUTF8 ( const wstring & sentence)

Convert Wstring to string.

Example: wstring input_w = L"こんにちわ。"; string input = ConvertWstringToUTF8(input_w);

Parameters
const wstring& sentence: text sentence (wstring)
Returns
string: text sentence converted to a UTF-8 string
Attention

Definition at line 286 of file util.cpp.

◆ CopyFile()

void CopyFile ( string source_path,
string target_path )

copy source_file to target_file

Example:

string source_path="data/source/a.txt";
string target_path="data/target/a.txt";
CopyFile(source_path,target_path);
void CopyFile(string source_path, string target_path)
copy source_file to target_file
Definition util.cpp:73
Parameters
string source_path: Copy source file path
string target_path: Copy target file path
Returns
None
Attention

Definition at line 73 of file util.cpp.

◆ CopyFolder()

void CopyFolder ( string source_folder,
string target_folder )

copy source_folder to target_folder

Example:

string source_folder="data/source";
string target_folder="data/target";
CopyFolder(source_folder,target_folder);
void CopyFolder(string source_folder, string target_folder)
copy source_folder to target_folder
Definition util.cpp:116
Parameters
string source_folder: Copy source folder path
string target_folder: Copy target folder path
Returns
None
Attention

Definition at line 116 of file util.cpp.

◆ EscapeWord()

string EscapeWord ( const string & input)

Escape word.

If the input contains ", ', or \, each is replaced with \", \', and \\ respectively. Example: string input = "\""; string output = EscapeWord(input); // output == "\\\""

Parameters
const string& input: text sentence
Returns
string: converted sentence
Attention

Definition at line 474 of file util.cpp.

◆ GetFileLineNumberList()

void GetFileLineNumberList ( const string folder_path,
const vector< string > * file_list,
const string file_extention,
vector< uint64_t > * file_line_number_list )

Get file line number list.

Example:

string folder_path = "../data/input/";
vector<string> file_list;
GetFileNameListWithoutExtention(folder_path,&file_list);
vector<uint64_t> file_line_number_list;
GetFileLineNumberList(folder_path,&file_list,".txt",&file_line_number_list);
void GetFileLineNumberList(const string folder_path, const vector< string > *file_list, const string file_extention, vector< uint64_t > *file_line_number_list)
Get file line number list.
Definition util.cpp:237
void GetFileNameListWithoutExtention(const string folder_path, vector< string > *file_list)
Get filename list in folder_path.
Definition util.cpp:208
Parameters
const string folder_path: folder path
const vector<string> *file_list: filename list
const string file_extention: file extension of the files in file_list (".json", ".txt", and so on)
vector<uint64_t> *file_line_number_list: (return) file line number list
Returns
None
Attention

Definition at line 237 of file util.cpp.

◆ GetFileNameListWithoutExtention()

void GetFileNameListWithoutExtention ( const string folder_path,
vector< string > * file_list )

Get filename list in folder_path.

Example:

string input_path = "../data/input/";
vector<string> file_list;
GetFileNameListWithoutExtention(input_path, &file_list);
Parameters
string folder_path: folder path
vector<string> *file_list: (return) filename list
Returns
None
Attention

Definition at line 208 of file util.cpp.

◆ MoveFolder()

void MoveFolder ( string source_folder,
string target_folder )

move source_folder to target_folder

Example:

string source_folder="data/source";
string target_folder="data/target";
MoveFolder(source_folder,target_folder);
void MoveFolder(string source_folder, string target_folder)
move source_folder to target_folder
Definition util.cpp:142
Parameters
string source_folder: Move source folder path
string target_folder: Move target folder path
Returns
None
Attention

Definition at line 142 of file util.cpp.

◆ ProceedProgressBar()

void ProceedProgressBar ( unsigned long long line_count,
unsigned long long file_line_number,
uint32_t elapsed_time_ms )

Update progress bar.

Example:

Parameters
uint64_t line_count
uint64_t file_line_number
uint32_t elapsed_time_ms
Returns
None
Attention

Definition at line 498 of file util.cpp.

◆ RemoveFolder()

void RemoveFolder ( const string & path)

◆ ReplaceSubstring()

void ReplaceSubstring ( string & sentence,
const string & target,
const string & replacement )

Definition at line 403 of file util.cpp.

◆ SegmentSentence()

void SegmentSentence ( string sentence,
vector< string > & segments )

Segmentation Sentence.

Segmentation sentence is following steps...

  1. Check if there are sentences enclosed in quotation marks.
  2. Punctuation marks enclosed in quotation marks are ignored.
  3. Replace "。" (puctuation_list) with "。\n".
  4. Remove the last character of the string. Example:
    string sentence = "../data/input/";
    vector<string> segments;
    SegmentSentence(sentence, segments);
    Parameters
    string sentence: sentence
    vector<string>& segments: (return) segmented sentences
    Returns
    None
    Note
    Be sure to use Normalizer to normalize your corpus before this process.
    Attention
    TODO: "." should also be treated as a segmentation point (while ignoring cases such as "12.3" and "wiki.txt"). In Japanese text, a "." used as a full stop is followed by a space or appears
    at the end of the sentence. Only in those cases should it be treated as a full stop.

Definition at line 315 of file util.cpp.

◆ Strip()

string Strip ( const string & sentence)

Remove leading and trailing white space.

Example:

string sentence= " こんにちわ。 ";
sentence = Strip(sentence); //"こんにちわ。"
string Strip(const string &sentence)
Remove leading and trailing white space.
Definition util.cpp:391
Parameters
const string& sentence: text sentence
Returns
string: sentence has been processed
Attention

Definition at line 391 of file util.cpp.

◆ strlen_utf8()

uint32_t strlen_utf8 ( string input)

Get exact length of UTF-8 string in C.

Example:

string input="ใ“ใ‚“ใซใกใ‚ใ€‚";
uint32_t length = strlen_utf8(input);
uint32_t strlen_utf8(string input)
Get exact length of UTF-8 string in C.
Definition util.cpp:17
Parameters
string input: text string.
Returns
uint32_t: length of input https://ja.stackoverflow.com/questions/2988/c%E8%A8%80%E8%AA%9E%E3%81%A7%E3%81%AEutf-8%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E6%AD%A3%E7%A2%BA%E3%81%AA%E9%95%B7%E3%81%95%E3%82%92%E5%8F%96%E5%BE%97%E3%81%99%E3%82%8B
Attention

Definition at line 17 of file util.cpp.