diff options
Diffstat (limited to 'utils/dedup_corpus.cc')
| -rw-r--r-- | utils/dedup_corpus.cc | 21 | 
1 files changed, 21 insertions, 0 deletions
| diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc new file mode 100644 index 00000000..818a6ffa --- /dev/null +++ b/utils/dedup_corpus.cc @@ -0,0 +1,21 @@ +#include <iostream> +#include "hash.h" + +using namespace std; + +#define INITIAL_SIZE 20000000 + +int main(int argc, char **argv) { +  if (argc != 1) { +    cerr << "Usage: " << argv[0] << " < file.txt\n"; +    return 1; +  } +  SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE); +  string line; +  while(getline(cin, line)) { +    uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17); +    if (seen.insert(h).second) +      cout << line << '\n'; +  } +} + | 
