diff options
Diffstat (limited to 'utils/dedup_corpus.cc')
-rw-r--r-- | utils/dedup_corpus.cc | 21 |
1 files changed, 21 insertions, 0 deletions
diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc new file mode 100644 index 00000000..818a6ffa --- /dev/null +++ b/utils/dedup_corpus.cc @@ -0,0 +1,21 @@ +#include <iostream> +#include "hash.h" + +using namespace std; + +#define INITIAL_SIZE 20000000 + +int main(int argc, char **argv) { + if (argc != 1) { + cerr << "Usage: " << argv[0] << " < file.txt\n"; + return 1; + } + SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE); + string line; + while(getline(cin, line)) { + uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17); + if (seen.insert(h).second) + cout << line << '\n'; + } +} + |