1#include <bits/stdc++.h>
10 uint32_t n_minhash = 200;
11 uint32_t n_buckets=20;
12 uint32_t bucket_size=10;
18 uint32_t bucket_size);
23 uint64_t
GetMinhash(vector<wstring> *tokens,uint32_t seed);
32 string blacklist_path;
34 set<string> seen,blacklist;
35 size_t total_backet_size_mb=5000;
39 string blacklist_path,
41 size_t total_backet_size_mb);
44 bool Apply(
const vector<string> *lshs);
GenerateDedupLSH(uint32_t n_gram, uint32_t n_minhash, uint32_t n_buckets, uint32_t bucket_size)
vector< string > CalculateLSH(wstring text)
Calculate minhash list of text.
uint64_t GetMinhash(vector< wstring > *tokens, uint32_t seed)
Calculate minhash of tokens list.
vector< wstring > NGramTokenize(wstring text, int32_t n)
Tokenize a string into n-gram tokens.
size_t GetTotalBucketSize(void)
Get the total bucket size.
void InitializeSeen(void)
Initialize seen parameter.
size_t SizeOfBlacklist(void)
Calculate size of blacklist (rough estimate)
void InitializeBlacklist(void)
Initialize blacklist parameter.
size_t SizeOfSeen(void)
Calculate size of seen (rough estimate)
bool Apply(const vector< string > *lshs)
Apply the deduplicator to a list of LSH strings.
LSHDeduplicator(bool onlin_dedupe, string blacklist_path, bool store_blacklist, size_t total_backet_size_mb)
void LoadBlacklistToSeen(void)
Read Blacklist from file.
void StoreBlacklist(void)
Save Blacklist to file.