summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCHRISTOPHER DYER <cdyer@CHRISTOPHERs-MacBook-Pro.local>2015-02-03 21:23:17 -0500
committerCHRISTOPHER DYER <cdyer@CHRISTOPHERs-MacBook-Pro.local>2015-02-03 21:23:17 -0500
commite2d9eb0ba94acd728a0706fa4209a36f67dd6d80 (patch)
tree04c7148abfe06102e5bbd728033acfb7eae9f87c
parent1bce604809399a0adc581fb0102bff11decf3436 (diff)
tool to deduplicate corpus with hashing
-rw-r--r--.gitignore1
-rw-r--r--utils/Makefile.am6
-rw-r--r--utils/dedup_corpus.cc21
-rw-r--r--utils/hash.h3
4 files changed, 30 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index dd8fcd7b..545ffdbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+utils/dedup_corpus
klm/lm/builder/dump_counts
klm/util/cat_compressed
example_extff/ff_example.lo
diff --git a/utils/Makefile.am b/utils/Makefile.am
index dd74ddc0..c858ac7e 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = reconstruct_weights atools
+bin_PROGRAMS = reconstruct_weights atools dedup_corpus
noinst_PROGRAMS = \
ts \
@@ -98,6 +98,10 @@ atools_SOURCES = atools.cc
atools_LDADD = libutils.a
atools_LDFLAGS = $(STATIC_FLAGS)
+dedup_corpus_SOURCES = dedup_corpus.cc
+dedup_corpus_LDADD = libutils.a
+dedup_corpus_LDFLAGS = $(STATIC_FLAGS)
+
phmt_SOURCES = phmt.cc
phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
ts_SOURCES = ts.cc
diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc
new file mode 100644
index 00000000..818a6ffa
--- /dev/null
+++ b/utils/dedup_corpus.cc
@@ -0,0 +1,21 @@
+#include <iostream>
+#include "hash.h"
+
+using namespace std;
+
+#define INITIAL_SIZE 20000000
+
+int main(int argc, char **argv) {  // Copies stdin to stdout, printing each line only the first time its 64-bit hash is seen.
+ if (argc != 1) {  // Input comes only from stdin; any command-line argument is treated as misuse.
+ cerr << "Usage: " << argv[0] << " < file.txt\n";
+ return 1;  // Non-zero exit status after printing the usage message to stderr.
+ }
+ SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE);  // Stores hashes, not the lines themselves; INITIAL_SIZE is handed to the set's constructor (presumably a size hint -- confirm against the container selected in hash.h).
+ string line;
+ while(getline(cin, line)) {  // getline drops the line terminator; the loop ends at EOF or on a stream error.
+ uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17);  // Hash the line's bytes; 17 is the third argument (looks like a fixed seed -- confirm in hash.h).
+ if (seen.insert(h).second)  // .second is true only when h was not already in the set. Only hashes are compared, so a different line with a colliding hash is dropped too.
+ cout << line << '\n';  // Always re-appends '\n', so a final input line lacking a newline gains one on output.
+ }
+}
+
diff --git a/utils/hash.h b/utils/hash.h
index 24d2b6ad..7de4db6d 100644
--- a/utils/hash.h
+++ b/utils/hash.h
@@ -13,7 +13,9 @@
# include <sparsehash/dense_hash_map>
# include <sparsehash/dense_hash_set>
# include <sparsehash/sparse_hash_map>
+# include <sparsehash/sparse_hash_set>
# define SPARSE_HASH_MAP google::sparse_hash_map
+# define SPARSE_HASH_SET google::sparse_hash_set
# define HASH_MAP google::dense_hash_map
# define HASH_SET google::dense_hash_set
# define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0)
@@ -29,6 +31,7 @@
namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
#endif
# define SPARSE_HASH_MAP std::unordered_map
+# define SPARSE_HASH_SET std::unordered_set
# define HASH_MAP std::unordered_map
# define HASH_SET std::unordered_set
# define HASH_MAP_DELETED(h,deleted)