diff options
author | Patrick Simianer <p@simianer.de> | 2015-02-26 14:24:41 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-02-26 14:24:41 +0100 |
commit | 29ddfafb0dea599965e6a881c25b396a6db2f40f (patch) | |
tree | 5755ec058361776657041ba088062b086eea6d68 /utils/dedup_corpus.cc | |
parent | 4223261682388944fe1b1cf31b9d51d88f9ad53b (diff) | |
parent | 03989754cb2511431e1df6001fca41b3806ad461 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'utils/dedup_corpus.cc')
-rw-r--r-- | utils/dedup_corpus.cc | 21 |
1 files changed, 21 insertions, 0 deletions
diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc new file mode 100644 index 00000000..818a6ffa --- /dev/null +++ b/utils/dedup_corpus.cc @@ -0,0 +1,21 @@ +#include <iostream> +#include "hash.h" + +using namespace std; + +#define INITIAL_SIZE 20000000 + +int main(int argc, char **argv) { + if (argc != 1) { + cerr << "Usage: " << argv[0] << " < file.txt\n"; + return 1; + } + SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE); + string line; + while(getline(cin, line)) { + uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17); + if (seen.insert(h).second) + cout << line << '\n'; + } +} + |