Corpus Cleaner
minhash.cpp
#include "minhash.hpp"
#include "util.hpp"
#include "../scripts/smhasher/src/MurmurHash3.h"

/***constructor***/
GenerateDedupLSH::GenerateDedupLSH(uint32_t n_gram=5,
                                   uint32_t n_minhash=200,
                                   uint32_t n_buckets=20,
                                   uint32_t bucket_size=10)
{
    assert(n_minhash==n_buckets*bucket_size);
    this->n_gram = n_gram;
    this->n_minhash = n_minhash;
    this->n_buckets = n_buckets;
    this->bucket_size = bucket_size;
}
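
/**
 * Parameter relationship, shown with the default values above
 * (the n_gram argument is arbitrary here): the n_minhash fingerprints are
 * split into n_buckets buckets of bucket_size fingerprints each, so
 * n_minhash must equal n_buckets * bucket_size.
 * ```cpp
 * GenerateDedupLSH ok(5, 200, 20, 10);      // 200 == 20 * 10: passes the assert
 * // GenerateDedupLSH bad(5, 200, 10, 10);  // 200 != 10 * 10: the assert fires
 * ```
**/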

/***destructor***/
GenerateDedupLSH::~GenerateDedupLSH(){}

/**
 * @brief Tokenize a string into n-gram tokens.
 * @details
 * Example:
 * ```cpp
 * GenerateDedupLSH generate_dedupe_lsh;
 * generate_dedupe_lsh.NGramTokenize(L"おはようございます。", 3);
 * // {"おはよ", "はよう", "ようご", "うござ", "ござい", "ざいま", "います", "ます。"}
 * ```
 * @param wstring text: input text
 * @param int32_t n: the n of the n-gram
 * @return vector<wstring> : n-gram tokenized text
 * @ref https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py
 * @note
**/
vector<wstring> GenerateDedupLSH::NGramTokenize(wstring text, int32_t n)
{
    vector<wstring> tokenized_text;

    // If the text is shorter than n, return the whole text as a single token.
    if ((int32_t)text.size() < n){
        tokenized_text.push_back(text);
        return tokenized_text;
    }
    else{
        // Slide a window of width n over the text.
        for(int32_t i=0;i+n<=(int32_t)text.size();i++){
            tokenized_text.push_back(text.substr(i,n));
        }
        return tokenized_text;
    }
}

/**
 * @brief Calculate minhash of tokens list
 * @details
 * Example:
 * ```cpp
 * GenerateDedupLSH generate_dedupe_lsh(3);
 * wstring text = L"おはようございます。";
 * vector<wstring> tokens = generate_dedupe_lsh.NGramTokenize(text, 3);
 * uint64_t minhash = generate_dedupe_lsh.GetMinhash(&tokens,0);
 * //minhash == 2147483647
 * ```
 * @param vector<wstring> *tokens: tokens list
 * @param uint32_t seed: the seed for MurmurHash3's calculation
 * @return uint64_t : minhash
 * @ref https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py
 * @note
**/
uint64_t GenerateDedupLSH::GetMinhash(vector<wstring> *tokens,uint32_t seed)
{
    uint64_t minhash = UINT64_MAX;//INF
    uint64_t out[2] = {};
    for(const auto& token : *tokens) {
        // Hash each token; note the length is passed in characters (wchar_t units).
        MurmurHash3_x64_128(token.data(), token.length(), seed, &out);
        // Keep the smallest 64-bit hash value as the minhash.
        minhash = min(minhash,out[0]);
    }
    return minhash;
}

/**
 * @brief Calculate LSH list of text
 * @details
 * Generates a sequence of hash values for deduplication from text.
 * If two documents share even one hash value, they are considered duplicates.
 * Each hash value in the list has the format '0+07ad0b7b163f434643387f3f4799a2d466bccd0c';
 * the digits before '+' represent the bucket index.
 * This allows deduplication by pooling all duplicate-detection hashes in a single hash table.
 *
 * Example:
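 * A minimal sketch (the hash value shown is the illustrative format from
 * above, not a computed result):
 * ```cpp
 * GenerateDedupLSH generate_dedup_lsh;
 * vector<string> lshs = generate_dedup_lsh.CalculateLSH(L"おはようございます。");
 * // lshs.size() == 20 (one hash per bucket, with the default n_buckets=20)
 * // lshs[0] == "0+07ad0b7b163f434643387f3f4799a2d466bccd0c" (illustrative)
 * ```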
 * @param wstring text: input sentence
 * @return vector<string> : hash list
 * @ref
 * https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py
 * https://arxiv.org/abs/2107.06499 , Appendix A
 * @note
**/
vector<string> GenerateDedupLSH::CalculateLSH(wstring text)
{
    // Calculate n_buckets dedup hashes from n_minhash mmh3 hash values.
    // The tokens are identical for every seed, so tokenize once.
    vector<wstring> tokens = NGramTokenize(text,this->n_gram);
    // Create n_minhash fingerprint elements.
    vector<uint64_t> fingerprints;
    for(uint32_t seed=0;seed<this->n_minhash;seed++){
        // Calculate the minhash for this seed.
        uint64_t minhash = this->GetMinhash(&tokens,seed);
        fingerprints.push_back(minhash);
    }

    vector<string> lshs;
    for(uint32_t bucket_idx=0;bucket_idx<this->n_buckets;bucket_idx++){
        string hash = "";
        char buffer[200]={};
        for(uint32_t fp_idx=bucket_idx*this->bucket_size;fp_idx<(bucket_idx+1)*this->bucket_size;fp_idx++){
            snprintf(buffer, sizeof(buffer), "%016lx", fingerprints[fp_idx]);
            string temp(buffer);
            // Extract the last 4 hex digits.
            temp = temp.substr(12);
            hash+=temp;
        }
        lshs.push_back(to_string(bucket_idx)+"+"+hash);
    }
    return lshs;
}
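
/**
 * For reference, under the standard MinHash-LSH banding analysis
 * (cf. https://arxiv.org/abs/2107.06499 , Appendix A), two documents with
 * n-gram Jaccard similarity s share a given bucket hash with probability
 * about s^bucket_size, so at least one of the n_buckets hashes collides
 * with probability 1-(1-s^bucket_size)^n_buckets. A sketch of that estimate
 * (not part of this file):
 * ```cpp
 * #include <cmath>
 * double duplicate_probability(double s, int n_buckets, int bucket_size)
 * {
 *     return 1.0 - std::pow(1.0 - std::pow(s, bucket_size), n_buckets);
 * }
 * // duplicate_probability(0.9, 20, 10) ~= 0.9998 (near-duplicates are caught)
 * // duplicate_probability(0.5, 20, 10) ~= 0.019  (dissimilar pairs rarely collide)
 * ```
**/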

/***constructor***/
LSHDeduplicator::LSHDeduplicator(bool online_dedup=true,
                                 string blacklist_path="",
                                 bool store_blacklist=false,
                                 size_t total_backet_size_mb = 5120)
{
    this->online_dedup = online_dedup;
    this->blacklist_path = blacklist_path;
    this->store_blacklist = store_blacklist;
    this->total_backet_size_mb=total_backet_size_mb;
    if(this->blacklist_path!="") this->LoadBlacklistToSeen();
    if(this->store_blacklist) this->blacklist=this->seen;
}

/***destructor***/
LSHDeduplicator::~LSHDeduplicator()
{
    this->StoreBlacklist();
}

/**
 * @brief Judge whether the text is a duplicate, based on its LSH values
 * @details
 * Duplication is determined based on the hash values generated by `GenerateDedupLSH`.
 * If the target corpus has roughly 10^6 documents or fewer (several tens of GB
 * as a rough guide), deduplication is possible without any preprocessing.
 * To deduplicate without preprocessing (online), set the `online_dedup` flag to `true`.
 * The default value of `online_dedup` is `true`.
 *
 * For larger corpora, it becomes difficult to keep the hash values of all documents in memory.
 * Since duplicate documents usually make up only a few percent of all documents,
 * reading just the duplicate hash values from a file as a blacklist makes it
 * possible to process a corpus of several hundred GB.
 *
 * Duplicate hash values are read from the file specified by `blacklist_path`.
 * The blacklist file is assumed to contain one hash value generated by
 * `GenerateDedupLSH` per line.
 * When the `store_blacklist` flag is set to `true`,
 * duplicate hash values are recorded as a set of strings in the `LSHDeduplicator.blacklist` attribute.
 * This option is useful, for example, when creating a blacklist hash value file.
 * The default value of the `store_blacklist` flag is `false`.
 *
 * Example:
 * ```cpp
 * GenerateDedupLSH generate_dedup_lsh;
 * vector<string> d1 = generate_dedup_lsh.CalculateLSH(L"Hello, World.");
 * vector<string> d2 = generate_dedup_lsh.CalculateLSH(L"吾輩は猫である。名前はまだ無い。どこで生まれたかとんと見当がつかぬ。");
 * vector<string> d3 = generate_dedup_lsh.CalculateLSH(L"吾輩は鳥である。名前はまだ無い。どこで生まれたかとんと見当がつかぬ。");
 * LSHDeduplicator deduplicator;
 * cout << deduplicator.Apply(&d1) << endl;
 * //false
 * cout << deduplicator.Apply(&d2) << endl;
 * //false
 * cout << deduplicator.Apply(&d3) << endl;
 * //true
 * ```
 * @param vector<string> *lshs: lshs list generated by GenerateDedupLSH
 * @return bool : duplication(true), or not duplication(false)
 * @ref
 * https://github.com/HojiChar/HojiChar/blob/v0.9.0/hojichar/filters/deduplication.py
 * @note
**/
bool LSHDeduplicator::Apply(const vector<string> *lshs)
{
    if(lshs->size() == 0){
        cout << "LSHs for deduplication are not calculated. Filter \
        `GenerateDedupLSH` must be composed before this filter." << endl;
        assert(false);
        return false;
    }

    bool is_duplicated=false;

    for(auto lsh:*lshs){
        // If the lsh is already in this->seen, the document is a duplicate.
        if (this->seen.find(lsh) != this->seen.end()){
            is_duplicated = true;
            if(this->store_blacklist) this->blacklist.insert(lsh);
        }
        if(this->online_dedup) this->seen.insert(lsh);
    }

    return is_duplicated;
}
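
/**
 * Workflow sketch for large corpora, following the blacklist flow described
 * above (`corpus` and `Keep()` are hypothetical placeholders):
 * ```cpp
 * GenerateDedupLSH generate_dedup_lsh;
 * // Pass 1: online dedup over the whole corpus while recording duplicate
 * // hashes; the destructor writes them to blacklist.txt via StoreBlacklist().
 * {
 *     LSHDeduplicator collector(true, "blacklist.txt", true, 5120);
 *     for(wstring text : corpus){
 *         vector<string> lshs = generate_dedup_lsh.CalculateLSH(text);
 *         collector.Apply(&lshs);
 *     }
 * }
 * // Pass 2: reload only the blacklisted hashes and filter the corpus.
 * LSHDeduplicator deduplicator(false, "blacklist.txt", false, 5120);
 * for(wstring text : corpus){
 *     vector<string> lshs = generate_dedup_lsh.CalculateLSH(text);
 *     if(!deduplicator.Apply(&lshs)) Keep(text);
 * }
 * ```
**/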

/**
 * @brief Calculate size of seen (rough estimate)
 * @details
 * @param void: None
 * @return size_t : size of seen
 * @note
**/
size_t LSHDeduplicator::SizeOfSeen(void)
{
    // Estimate the payload size from the first element of seen.
    size_t element_size=0;
    if(!this->seen.empty()){
        auto itr = this->seen.begin();
        string seen_first_element = *itr;
        size_t element_unit_size = seen_first_element.length();//TODO:check this size
        // All elements of seen are the same size.
        element_size = element_unit_size * this->seen.size();
    }
    // overhead of node
    size_t node_overhead = sizeof(void*) * 2 * this->seen.size();
    // overhead of std::string
    size_t string_overhead = sizeof(std::string) * this->seen.size();
    // overhead of container structure (rough estimate)
    size_t container_overhead = 64;
    size_t total_size = element_size + node_overhead + string_overhead + container_overhead;

    return total_size;
}
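
/**
 * Worked example of the estimate above, assuming a 64-bit platform where
 * sizeof(void*) == 8 and sizeof(std::string) == 32 (typical for libstdc++):
 * each stored LSH is about 42 bytes ("<bucket_idx>+<40 hex chars>").
 * ```cpp
 * size_t n = 1000000;                     // 1M stored hashes (illustrative)
 * size_t element_size    = 42 * n;        // ~42 MB of string payload
 * size_t node_overhead   = 8 * 2 * n;     // ~16 MB of set node pointers
 * size_t string_overhead = 32 * n;        // ~32 MB of std::string objects
 * size_t total = element_size + node_overhead + string_overhead + 64;
 * // total ~= 90 MB for one million hashes
 * ```
**/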

/**
 * @brief Calculate size of blacklist (rough estimate)
 * @details
 * @param void: None
 * @return size_t : size of blacklist
 * @note
**/
size_t LSHDeduplicator::SizeOfBlacklist(void)
{
    // Estimate the payload size from the first element of blacklist.
    size_t element_size=0;
    if(!this->blacklist.empty()){
        auto itr = this->blacklist.begin();
        string blacklist_first_element = *itr;
        size_t element_unit_size = blacklist_first_element.length();//TODO:check this size
        // All elements of blacklist are the same size.
        element_size = element_unit_size * this->blacklist.size();
    }
    // overhead of node
    size_t node_overhead = sizeof(void*) * 2 * this->blacklist.size();
    // overhead of std::string
    size_t string_overhead = sizeof(std::string) * this->blacklist.size();
    // overhead of container structure (rough estimate)
    size_t container_overhead = 64;
    size_t total_size = element_size + node_overhead + string_overhead + container_overhead;

    return total_size;
}

/**
 * @brief Initialize seen parameter
 * @details
 * Store the blacklist to file if requested, clear `seen` and `blacklist`,
 * and reload the blacklist file into `seen`.
 * If the estimated size of `seen` exceeds `total_backet_size_mb`,
 * both sets are cleared to cap memory usage.
 * @param void: None
 * @return void: None
 * @note
**/
void LSHDeduplicator::InitializeSeen(void)
{
    if(this->store_blacklist) this->StoreBlacklist();
    this->seen.clear();
    this->blacklist.clear();

    if(this->blacklist_path!="") this->LoadBlacklistToSeen();
    if(this->store_blacklist) this->blacklist=this->seen;

    // this->seen includes this->blacklist, so if the blacklist exceeds
    // total_backet_size_mb, seen exceeds it as well.
    // SizeOfSeen() returns bytes, while total_backet_size_mb is in MB.
    if(this->SizeOfSeen()>=this->total_backet_size_mb*1024*1024){
        this->seen.clear();
        this->blacklist.clear();
    }
}
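
/**
 * Usage sketch: calling InitializeSeen() periodically caps memory while
 * keeping the blacklist file up to date (`batches` is a hypothetical
 * placeholder for chunks of the corpus):
 * ```cpp
 * GenerateDedupLSH generate_dedup_lsh;
 * LSHDeduplicator deduplicator(true, "blacklist.txt", true, 5120);
 * for(auto &batch : batches){
 *     for(wstring text : batch){
 *         vector<string> lshs = generate_dedup_lsh.CalculateLSH(text);
 *         deduplicator.Apply(&lshs);
 *     }
 *     deduplicator.InitializeSeen();  // flush blacklist, reload, trim memory
 * }
 * ```
**/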

/**
 * @brief Initialize blacklist parameter
 * @details
 * @param void: None
 * @return void: None
 * @note
**/
void LSHDeduplicator::InitializeBlacklist(void)
{
    if(this->store_blacklist) this->StoreBlacklist();
    this->blacklist.clear();
}

/**
 * @brief Save Blacklist to file
 * @details
 * @param void: None
 * @return void: None
 * @note
**/
void LSHDeduplicator::StoreBlacklist(void)
{
    if(this->store_blacklist){
        // output blacklist
        ofstream blacklist_file(this->blacklist_path);
        for(auto lsh: this->blacklist) blacklist_file<<lsh<<endl;
        blacklist_file.close();
    }
}

/**
 * @brief Read Blacklist from file
 * @details
 * @param void: None
 * @return void: None
 * @note
**/
void LSHDeduplicator::LoadBlacklistToSeen(void)
{
    string line="";
    //load blacklist from blacklist_path
    ifstream blacklist_file(this->blacklist_path);
    //insert each stripped line into seen
    while (getline(blacklist_file, line)) {
        // Strip() returns the trimmed string; assign the result back.
        line = Strip(line);
        this->seen.insert(line);
    }
    blacklist_file.close();
}

/**
 * @brief Get the total bucket size (in MB)
 * @details
 * @param void: None
 * @return size_t : total_backet_size_mb
 * @note
**/
size_t LSHDeduplicator::GetTotalBucketSize(void)
{
    return this->total_backet_size_mb;
}