3#include "../scripts/smhasher/src/MurmurHash3.h"
7 uint32_t n_minhash=200,
9 uint32_t bucket_size=10)
11 assert(n_minhash==n_buckets*bucket_size);
12 this->n_gram = n_gram;
13 this->n_minhash = n_minhash;
14 this->n_buckets = n_buckets;
15 this->bucket_size = bucket_size;
38 vector<wstring> tokenized_text;
40 if ((int32_t)text.size() < n){
41 tokenized_text.push_back(text);
42 return tokenized_text;
45 for(int32_t i=0;i<(int32_t)text.size();i++){
46 if(i+n-1>=(int32_t)text.size()){
50 tokenized_text.push_back(text.substr(i,n));
54 return tokenized_text;
77 uint64_t minhash = UINT64_MAX;
79 for(
auto token:*tokens) {
80 MurmurHash3_x64_128(token.data(), token.length(), seed, &out);
83 minhash = min(minhash,out[0]);
111 vector<uint64_t> fingerprints;
113 for(uint32_t seed=0;seed<this->n_minhash;seed++){
117 uint64_t minhash = this->
GetMinhash(&tokens,seed);
120 fingerprints.push_back(minhash);
124 for(uint32_t bucket_idx=0;bucket_idx<this->n_buckets;bucket_idx++){
127 for(uint32_t fp_idx=bucket_idx*this->bucket_size;fp_idx<(bucket_idx+1)*this->bucket_size;fp_idx++){
128 sprintf(buffer,
"%016lx", fingerprints[fp_idx]);
131 temp = temp.substr(12);
134 lshs.push_back(to_string(bucket_idx)+
"+"+hash);
141 string blacklist_path=
"",
142 bool store_blacklist=
false,
143 size_t total_backet_size_mb = 5120)
145 this->online_dedup = online_dedup;
146 this->blacklist_path = blacklist_path;
147 this->store_blacklist = store_blacklist;
149 this->total_backet_size_mb=total_backet_size_mb;
151 if(this->store_blacklist) this->blacklist=this->seen;
202 if(lshs->size() == 0){
203 cout <<
"LSHs for deduplication are not caluculated. Filter \
204 `GenerateDedupLSH` must be composed before this filter." << endl;
209 bool is_duplicated=
false;
213 if (this->seen.find(lsh) != this->seen.end()){
214 is_duplicated =
true;
215 if(this->store_blacklist) this->blacklist.insert(lsh);
217 if(this->online_dedup) this->seen.insert(lsh);
220 return is_duplicated;
236 size_t element_size=0;
237 if(this->seen.empty())element_size=0;
239 auto itr = this->seen.begin();
242 string seen_first_element = *itr;
243 size_t element_unit_size = seen_first_element.length();
245 size_t element_size = element_unit_size * this->seen.size();
248 size_t node_overhead =
sizeof(
void*) * 2 * this->seen.size();
250 size_t string_overhead =
sizeof(std::string) * this->seen.size();
252 size_t container_overhead = 64;
253 size_t total_size = element_size + node_overhead + string_overhead + container_overhead;
268 size_t element_size=0;
269 if(this->blacklist.empty())element_size=0;
271 auto itr = this->blacklist.begin();
274 string blacklist_first_element = *itr;
275 size_t element_unit_size = blacklist_first_element.length();
277 size_t element_size = element_unit_size * this->blacklist.size();
280 size_t node_overhead =
sizeof(
void*) * 2 * this->blacklist.size();
282 size_t string_overhead =
sizeof(std::string) * this->blacklist.size();
284 size_t container_overhead = 64;
285 size_t total_size = element_size + node_overhead + string_overhead + container_overhead;
305 this->blacklist.clear();
308 if(this->store_blacklist) this->blacklist=this->seen;
315 cout << this->total_backet_size_mb <<endl;
317 if(this->
SizeOfSeen()>=this->total_backet_size_mb){
319 this->blacklist.clear();
334 this->blacklist.clear();
346 if(this->store_blacklist){
348 ofstream blacklist_file(this->blacklist_path);
349 for(
auto lsh: this->blacklist) blacklist_file<<lsh<<endl;
350 blacklist_file.close();
365 ifstream blacklist_file(this->blacklist_path);
367 while (getline(blacklist_file, line)) {
369 this->seen.insert(line);
371 blacklist_file.close();
383 return this->total_backet_size_mb;
GenerateDedupLSH(uint32_t n_gram, uint32_t n_minhash, uint32_t n_buckets, uint32_t bucket_size)
vector< string > CalculateLSH(wstring text)
Calculate minhash list of text.
uint64_t GetMinhash(vector< wstring > *tokens, uint32_t seed)
Calculate minhash of tokens list.
vector< wstring > NGramTokenize(wstring text, int32_t n)
Tokenize a string into n-gram tokens.
size_t GetTotalBucketSize(void)
Return the configured total bucket size (total_backet_size_mb).
void InitializeSeen(void)
Initialize seen parameter.
size_t SizeOfBlacklist(void)
Calculate size of blacklist (rough estimate)
void InitializeBlacklist(void)
Initialize blacklist parameter.
size_t SizeOfSeen(void)
Calculate size of seen (rough estimate)
bool Apply(const vector< string > *lshs)
Check the given LSHs against seen and return whether the text is duplicated.
LSHDeduplicator(bool onlin_dedupe, string blacklist_path, bool store_blacklist, size_t total_backet_size_mb)
void LoadBlacklistToSeen(void)
Read Blacklist from file.
void StoreBlacklist(void)
Save Blacklist to file.
string Strip(const string &sentence)
Remove leading and trailing white space.