From afd65846cf1456a8b49e8482b9a40777014f6883 Mon Sep 17 00:00:00 2001 From: CHRISTOPHER DYER Date: Tue, 3 Feb 2015 21:23:17 -0500 Subject: tool to dedupilate corpus with hashing --- .gitignore | 1 + utils/Makefile.am | 6 +++++- utils/dedup_corpus.cc | 21 +++++++++++++++++++++ utils/hash.h | 3 +++ 4 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 utils/dedup_corpus.cc diff --git a/.gitignore b/.gitignore index dd8fcd7b..545ffdbd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +utils/dedup_corpus klm/lm/builder/dump_counts klm/util/cat_compressed example_extff/ff_example.lo diff --git a/utils/Makefile.am b/utils/Makefile.am index dd74ddc0..c858ac7e 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = reconstruct_weights atools +bin_PROGRAMS = reconstruct_weights atools dedup_corpus noinst_PROGRAMS = \ ts \ @@ -98,6 +98,10 @@ atools_SOURCES = atools.cc atools_LDADD = libutils.a atools_LDFLAGS = $(STATIC_FLAGS) +dedup_corpus_SOURCES = dedup_corpus.cc +dedup_corpus_LDADD = libutils.a +dedup_corpus_LDFLAGS = $(STATIC_FLAGS) + phmt_SOURCES = phmt.cc phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) ts_SOURCES = ts.cc diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc new file mode 100644 index 00000000..818a6ffa --- /dev/null +++ b/utils/dedup_corpus.cc @@ -0,0 +1,21 @@ +#include +#include "hash.h" + +using namespace std; + +#define INITIAL_SIZE 20000000 + +int main(int argc, char **argv) { + if (argc != 1) { + cerr << "Usage: " << argv[0] << " < file.txt\n"; + return 1; + } + SPARSE_HASH_SET seen(INITIAL_SIZE); + string line; + while(getline(cin, line)) { + uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17); + if (seen.insert(h).second) + cout << line << '\n'; + } +} + diff --git a/utils/hash.h b/utils/hash.h index 24d2b6ad..7de4db6d 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -13,7 +13,9 @@ # include # include # include +# include # define SPARSE_HASH_MAP google::sparse_hash_map +# define SPARSE_HASH_SET google::sparse_hash_set # define HASH_MAP google::dense_hash_map # define HASH_SET google::dense_hash_set # define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0) @@ -29,6 +31,7 @@ namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } #endif # define SPARSE_HASH_MAP std::unordered_map +# define SPARSE_HASH_SET std::unordered_set # define HASH_MAP std::unordered_map # define HASH_SET std::unordered_set # define HASH_MAP_DELETED(h,deleted) -- cgit v1.2.3