#include <minhash.hpp>

Public Member Functions
	GenerateDedupLSH (uint32_t n_gram, uint32_t n_minhash, uint32_t n_buckets, uint32_t bucket_size)

	~GenerateDedupLSH ()

vector< wstring >	NGramTokenize (wstring text, int32_t n)
	Tokenize a string into n-gram tokens.

uint64_t	GetMinhash (vector< wstring > *tokens, uint32_t seed)
	Calculate minhash of tokens list.

vector< string >	CalculateLSH (wstring text)
	Calculate minhash list of text.

Detailed Description

Definition at line 6 of file minhash.hpp.

Constructor & Destructor Documentation

◆ GenerateDedupLSH()

GenerateDedupLSH::GenerateDedupLSH	(	uint32_t	n_gram = 5,
		uint32_t	n_minhash = 200,
		uint32_t	n_buckets = 20,
		uint32_t	bucket_size = 10 )

Definition at line 6 of file minhash.cpp.

◆ ~GenerateDedupLSH()

GenerateDedupLSH::~GenerateDedupLSH ( )

Definition at line 19 of file minhash.cpp.

Member Function Documentation

◆ CalculateLSH()

vector< string > GenerateDedupLSH::CalculateLSH ( wstring text )

Calculate minhash list of text.

Generates a sequence of hash values from the text for use in deduplication.
If two documents share at least one hash value, the documents are considered duplicates.
The result is a list of hash values, each in the format '0+07ad0b7b163f434643387f3f4799a2d466bccd0c'.
The leading characters before the hash (e.g. '0+') indicate which hash value in the list it is.
This allows deduplication to be performed by pooling all of these hashes into a single hash table.

Example:

Parameters

wstring text: input sentence

Returns: vector<string> : hash list

References: https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py ; https://arxiv.org/abs/2107.06499 , Appendix A

Note

Definition at line 107 of file minhash.cpp.

◆ GetMinhash()

uint64_t GenerateDedupLSH::GetMinhash	(	vector< wstring > *	tokens,
		uint32_t	seed )

Calculate minhash of tokens list.

Example:

GenerateDedupLSH generate_dedupe_lsh(3);
wstring text = L"おはようございます。";
vector<wstring> tokens = generate_dedupe_lsh.NGramTokenize(text, 3);
uint64_t minhash = generate_dedupe_lsh.GetMinhash(&tokens,0);
//minhash == 2147483647

Parameters

vector<wstring>	*tokens: tokens list
uint32_t	seed: the seed for the MurmurHash3 calculation

Returns: uint64_t : minhash

Reference: https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py

Note

Definition at line 75 of file minhash.cpp.

◆ NGramTokenize()

vector< wstring > GenerateDedupLSH::NGramTokenize	(	wstring	text,
		int32_t	n )

Tokenize a string into n-gram tokens.

Example:

GenerateDedupLSH generate_dedupe_lsh;
generate_dedupe_lsh.NGramTokenize(L"おはようございます。", 3);
// {"おはよ", "はよう", "ようご", "うござ", "ござい", "ざいま", "います", "ます。"}

Parameters

wstring	text: input text
int32_t	n: the n of the n-gram (number of characters per token)

Returns: vector<wstring> : n-gram tokenized text

Reference: https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py

Note

Definition at line 36 of file minhash.cpp.

The documentation for this class was generated from the following files:

/home/corpus-cleaner/corpus_cleaner/minhash.hpp
/home/corpus-cleaner/corpus_cleaner/minhash.cpp

Public Member Functions

Detailed Description

Constructor & Destructor Documentation

◆ GenerateDedupLSH()

◆ ~GenerateDedupLSH()

Member Function Documentation

◆ CalculateLSH()

◆ GetMinhash()

◆ NGramTokenize()