diff options
author | CHRISTOPHER DYER <cdyer@CHRISTOPHERs-MacBook-Pro.local> | 2015-02-03 21:23:17 -0500 |
---|---|---|
committer | CHRISTOPHER DYER <cdyer@CHRISTOPHERs-MacBook-Pro.local> | 2015-02-03 21:23:17 -0500 |
commit | e2d9eb0ba94acd728a0706fa4209a36f67dd6d80 (patch) | |
tree | 04c7148abfe06102e5bbd728033acfb7eae9f87c /utils/dedup_corpus.cc | |
parent | 1bce604809399a0adc581fb0102bff11decf3436 (diff) |
tool to deduplicate corpus with hashing
Diffstat (limited to 'utils/dedup_corpus.cc')
-rw-r--r-- | utils/dedup_corpus.cc | 21 |
1 files changed, 21 insertions, 0 deletions
diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc new file mode 100644 index 00000000..818a6ffa --- /dev/null +++ b/utils/dedup_corpus.cc @@ -0,0 +1,21 @@ +#include <iostream> +#include "hash.h" + +using namespace std; + +#define INITIAL_SIZE 20000000 + +int main(int argc, char **argv) { + if (argc != 1) { + cerr << "Usage: " << argv[0] << " < file.txt\n"; + return 1; + } + SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE); + string line; + while(getline(cin, line)) { + uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17); + if (seen.insert(h).second) + cout << line << '\n'; + } +} + |