diff options
author | CHRISTOPHER DYER <cdyer@CHRISTOPHERs-MacBook-Pro.local> | 2015-02-03 21:23:17 -0500 |
---|---|---|
committer | CHRISTOPHER DYER <cdyer@CHRISTOPHERs-MacBook-Pro.local> | 2015-02-03 21:23:17 -0500 |
commit | afd65846cf1456a8b49e8482b9a40777014f6883 (patch) | |
tree | bb653f7ffa506c70b63e10b95e77b5f2e37586e8 | |
parent | c485a6b8b1230e319b69adbb46788405d4e48c89 (diff) |
tool to deduplicate corpus with hashing
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | utils/Makefile.am | 6 | ||||
-rw-r--r-- | utils/dedup_corpus.cc | 21 | ||||
-rw-r--r-- | utils/hash.h | 3 |
4 files changed, 30 insertions, 1 deletion
@@ -1,3 +1,4 @@ +utils/dedup_corpus klm/lm/builder/dump_counts klm/util/cat_compressed example_extff/ff_example.lo diff --git a/utils/Makefile.am b/utils/Makefile.am index dd74ddc0..c858ac7e 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = reconstruct_weights atools +bin_PROGRAMS = reconstruct_weights atools dedup_corpus noinst_PROGRAMS = \ ts \ @@ -98,6 +98,10 @@ atools_SOURCES = atools.cc atools_LDADD = libutils.a atools_LDFLAGS = $(STATIC_FLAGS) +dedup_corpus_SOURCES = dedup_corpus.cc +dedup_corpus_LDADD = libutils.a +dedup_corpus_LDFLAGS = $(STATIC_FLAGS) + phmt_SOURCES = phmt.cc phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) ts_SOURCES = ts.cc diff --git a/utils/dedup_corpus.cc b/utils/dedup_corpus.cc new file mode 100644 index 00000000..818a6ffa --- /dev/null +++ b/utils/dedup_corpus.cc @@ -0,0 +1,21 @@ +#include <iostream> +#include "hash.h" + +using namespace std; + +#define INITIAL_SIZE 20000000 + +int main(int argc, char **argv) { + if (argc != 1) { + cerr << "Usage: " << argv[0] << " < file.txt\n"; + return 1; + } + SPARSE_HASH_SET<uint64_t> seen(INITIAL_SIZE); + string line; + while(getline(cin, line)) { + uint64_t h = cdec::MurmurHash3_64(&line[0], line.size(), 17); + if (seen.insert(h).second) + cout << line << '\n'; + } +} + diff --git a/utils/hash.h b/utils/hash.h index 24d2b6ad..7de4db6d 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -13,7 +13,9 @@ # include <sparsehash/dense_hash_map> # include <sparsehash/dense_hash_set> # include <sparsehash/sparse_hash_map> +# include <sparsehash/sparse_hash_set> # define SPARSE_HASH_MAP google::sparse_hash_map +# define SPARSE_HASH_SET google::sparse_hash_set # define HASH_MAP google::dense_hash_map # define HASH_SET google::dense_hash_set # define HASH_MAP_DELETED(h,deleted) do { (h).set_deleted_key(deleted); } while(0) @@ -29,6 +31,7 @@ namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } #endif 
# define SPARSE_HASH_MAP std::unordered_map +# define SPARSE_HASH_SET std::unordered_set # define HASH_MAP std::unordered_map # define HASH_SET std::unordered_set # define HASH_MAP_DELETED(h,deleted) |