Corpus Cleaner
minhash.hpp
Go to the documentation of this file.
1#include <bits/stdc++.h>
2
#define INF (1L<<30) // 2^30; parenthesized and without a trailing ';' so the macro expands safely inside larger expressions
4using namespace std;
5
// Body of class GenerateDedupLSH (name taken from the constructor below);
// the line carrying the class name (listing line 6) is absent from this listing.
// Declares n-gram tokenization, minhash calculation and LSH calculation for a text.
7{
8private:
9 uint32_t n_gram = 5; // default 5; presumably the n-gram length -- confirm in minhash.cpp
10 uint32_t n_minhash = 200; // default 200; presumably the number of minhash values per text -- confirm
11 uint32_t n_buckets=20; // default 20
12 uint32_t bucket_size=10; // default 10; NOTE(review): 20 * 10 == 200 == n_minhash default -- confirm whether this relation is required
13public:
14 /***constructor***/
15 GenerateDedupLSH(uint32_t n_gram, // the four parameters share the names of the private members above
16 uint32_t n_minhash,
17 uint32_t n_buckets,
18 uint32_t bucket_size);
19 /***destructor***/
// NOTE(review): listing line 20 (presumably the destructor declaration) is absent from this listing.
21 vector<wstring> NGramTokenize(wstring text, int32_t n); // tokenize a string into n-gram tokens
22 // int32_t HashfuncSigned32FromSeed(int32_t seed);
23 uint64_t GetMinhash(vector<wstring> *tokens,uint32_t seed); // calculate the minhash of a token list; takes a seed
24 vector<string> CalculateLSH(wstring text); // calculate the minhash/LSH list of a text, returned as strings
25 // wstring Apply(wstring document);//TODO
26};
27
// Body of class LSHDeduplicator (name taken from the constructor below);
// the line carrying the class name (listing line 28) is absent from this listing.
// Holds two string sets, `seen` and `blacklist`, plus blacklist-file settings.
29{
30private:
31 bool online_dedup;
32 string blacklist_path; // presumably the file used by StoreBlacklist / LoadBlacklistToSeen -- confirm
33 bool store_blacklist;
34 set<string> seen,blacklist; // TODO: unordered_set is faster than set.
35 size_t total_backet_size_mb=5000; // default 5000; unit presumably MB per the name; "backet" is presumably a misspelling of "bucket"
36public:
37 /***constructor***/
38 LSHDeduplicator(bool onlin_dedupe, // NOTE(review): spelled differently from the member online_dedup
39 string blacklist_path,
40 bool store_blacklist,
41 size_t total_backet_size_mb);
42 /***destructor***/
// NOTE(review): listing line 43 (presumably the destructor declaration) is absent from this listing.
44 bool Apply(const vector<string> *lshs);//TODO -- takes a list of LSH strings; the meaning of the bool result is not visible here
45 size_t SizeOfSeen(void); // presumably a rough size estimate of `seen`, by analogy with SizeOfBlacklist -- confirm
46 size_t SizeOfBlacklist(void); // calculate size of `blacklist` (rough estimate)
47 void InitializeSeen(void); // initialize `seen`
48 void StoreBlacklist(void); // save the blacklist to file
49 void LoadBlacklistToSeen(void); // read the blacklist from file (into `seen`, per the name)
50 size_t GetTotalBucketSize(void); // presumably returns total_backet_size_mb -- confirm
51 void InitializeBlacklist(void); // initialize `blacklist`
52};
GenerateDedupLSH(uint32_t n_gram, uint32_t n_minhash, uint32_t n_buckets, uint32_t bucket_size)
Definition minhash.cpp:6
vector< string > CalculateLSH(wstring text)
Calculate minhash list of text.
Definition minhash.cpp:107
uint64_t GetMinhash(vector< wstring > *tokens, uint32_t seed)
Calculate minhash of tokens list.
Definition minhash.cpp:75
vector< wstring > NGramTokenize(wstring text, int32_t n)
Tokenize a string into n-gram tokens.
Definition minhash.cpp:36
size_t GetTotalBucketSize(void)
Get the total bucket size.
Definition minhash.cpp:381
void InitializeSeen(void)
Initialize seen parameter.
Definition minhash.cpp:300
size_t SizeOfBlacklist(void)
Calculate size of blacklist (rough estimate)
Definition minhash.cpp:265
void InitializeBlacklist(void)
Initialize blacklist parameter.
Definition minhash.cpp:331
size_t SizeOfSeen(void)
Calculate size of seen (rough estimate)
Definition minhash.cpp:232
bool Apply(const vector< string > *lshs)
Apply deduplication to a list of LSH values.
Definition minhash.cpp:200
LSHDeduplicator(bool onlin_dedupe, string blacklist_path, bool store_blacklist, size_t total_backet_size_mb)
Definition minhash.cpp:140
void LoadBlacklistToSeen(void)
Read Blacklist from file.
Definition minhash.cpp:361
void StoreBlacklist(void)
Save Blacklist to file.
Definition minhash.cpp:344