summaryrefslogtreecommitdiff
path: root/utils/dedup_corpus.cc
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-02-26 14:24:41 +0100
committer Patrick Simianer <p@simianer.de> 2015-02-26 14:24:41 +0100
commit 29ddfafb0dea599965e6a881c25b396a6db2f40f (patch)
tree 5755ec058361776657041ba088062b086eea6d68 /utils/dedup_corpus.cc
parent 4223261682388944fe1b1cf31b9d51d88f9ad53b (diff)
parent 03989754cb2511431e1df6001fca41b3806ad461 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'utils/dedup_corpus.cc')
-rw-r--r-- utils/dedup_corpus.cc 21
1 files changed, 21 insertions, 0 deletions
diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc
new file mode 100644
index 00000000..818a6ffa
--- /dev/null
+++ b/utils/dedup_corpus.cc
@@ -0,0 +1,21 @@
+#include <iostream>
+#include "hash.h"
+
+using namespace std;
+
+#define INITIAL_SIZE 20000000  // Passed to the hash-set constructor in main(); presumably an expected-element hint -- confirm against SPARSE_HASH_SET in hash.h.
+
+int main(int argc, char **argv) {  // Copies stdin to stdout line by line, skipping any line whose 64-bit hash was already seen.
+ if (argc != 1) {  // No arguments are accepted; input is read from stdin only.
+ cerr << "Usage: " << argv[0] << " < file.txt\n";
+ return 1;  // Non-zero exit status when arguments were supplied.
+ }
+ SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE);  // Holds one 64-bit hash per distinct line, not the line text itself.
+ string line;
+ while(getline(cin, line)) {  // Loops until getline fails (end of input or a stream error).
+ uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17);  // Hashes the line's bytes; 17 is presumably the hash seed -- confirm in hash.h.
+ if (seen.insert(h).second)  // Print only when the insert reports h as newly added; distinct lines with equal hashes would be dropped as duplicates.
+ cout << line << '\n';  // Every emitted line is terminated with '\n'.
+ }
+}
+