summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2011-10-24 18:17:24 +0100
committerKenneth Heafield <github@kheafield.com>2011-10-24 18:17:24 +0100
commit3106cf8eca76df8b46d139b8f5ce5002200d660d (patch)
tree637930011a082de54bf21a97078cb67483ea248c /klm/util
parentb2171f53c6c597ac4326f63250269aa13df84718 (diff)
KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative.
Diffstat (limited to 'klm/util')
-rw-r--r--klm/util/mmap.cc2
-rw-r--r--klm/util/murmur_hash.cc15
-rw-r--r--klm/util/tokenize_piece.hh75
-rw-r--r--klm/util/tokenize_piece_test.cc94
4 files changed, 180 insertions, 6 deletions
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 5ce7adc9..279bafa8 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -15,7 +15,7 @@ namespace util {
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
- // Thanks Denis Filimonov for pointing on NFS likes msync first.
+ // Thanks Denis Filimonov for pointing out NFS likes msync first.
if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) {
std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl;
abort();
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index fec47fd9..ef5783fe 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -117,13 +117,18 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
return h;
}
+// Trick to test for 64-bit architecture at compile time.
+namespace {
+template <unsigned L> uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) {
+ return MurmurHash64A(key, len, seed);
+}
+template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) {
+ return MurmurHash64B(key, len, seed);
+}
+} // namespace
uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
- if (sizeof(int) == 4) {
- return MurmurHash64B(key, len, seed);
- } else {
- return MurmurHash64A(key, len, seed);
- }
+ return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
}
} // namespace util
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index ee1c7ab2..413bda0b 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -5,6 +5,9 @@
#include <boost/iterator/iterator_facade.hpp>
+#include <algorithm>
+#include <iostream>
+
/* Usage:
*
* for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) {
@@ -64,6 +67,78 @@ template <char d> class PieceIterator : public boost::iterator_facade<PieceItera
StringPiece after_;
};
+class MultiCharacter {
+ public:
+ explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
+
+ StringPiece Find(const StringPiece &in) const {
+ return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size());
+ }
+
+ private:
+ StringPiece delimiter_;
+};
+
+class AnyCharacter {
+ public:
+ explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
+
+ StringPiece Find(const StringPiece &in) const {
+ return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
+ }
+
+ private:
+ StringPiece chars_;
+};
+
+template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
+ public:
+ TokenIter() {}
+
+ TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) {
+ increment();
+ }
+
+ bool operator!() const {
+ return current_.data() == 0;
+ }
+ operator bool() const {
+ return current_.data() != 0;
+ }
+
+ static TokenIter<Find> end() {
+ return TokenIter<Find>();
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ do {
+ StringPiece found(finder_.Find(after_));
+ current_ = StringPiece(after_.data(), found.data() - after_.data());
+ if (found.data() == after_.data() + after_.size()) {
+ after_ = StringPiece(NULL, 0);
+ } else {
+ after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size());
+ }
+ } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
+ }
+
+ bool equal(const TokenIter<Find> &other) const {
+ return after_.data() == other.after_.data();
+ }
+
+ const StringPiece &dereference() const {
+ return current_;
+ }
+
+ StringPiece current_;
+ StringPiece after_;
+
+ Find finder_;
+};
+
} // namespace util
#endif // UTIL_TOKENIZE_PIECE__
diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc
new file mode 100644
index 00000000..e07ebcf5
--- /dev/null
+++ b/klm/util/tokenize_piece_test.cc
@@ -0,0 +1,94 @@
+#include "util/tokenize_piece.hh"
+#include "util/string_piece.hh"
+
+#define BOOST_TEST_MODULE TokenIteratorTest
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+
+namespace util {
+namespace {
+
+BOOST_AUTO_TEST_CASE(simple) {
+ PieceIterator<' '> it("single spaced words.");
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("single"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("spaced"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("words."), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(null_delimiter) {
+ const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0";
+ PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("first"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("second"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("third"), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece("fourth"), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(null_entries) {
+ const char str[] = "\0split\0\0 \0me\0 ";
+ PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1));
+ BOOST_REQUIRE(it);
+ const char first[] = "\0split\0\0";
+ BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ const char second[] = "\0me\0";
+ BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
+ const char str[] = "nodelimit at all";
+ TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(str), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+BOOST_AUTO_TEST_CASE(pipe_pipe_two) {
+ const char str[] = "|||";
+ TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(), *it);
+ ++it;
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(remove_empty) {
+ const char str[] = "|||";
+ TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
+ BOOST_CHECK(!it);
+}*/
+
+BOOST_AUTO_TEST_CASE(remove_empty_keep) {
+ const char str[] = " |||";
+ TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
+ BOOST_REQUIRE(it);
+ BOOST_CHECK_EQUAL(StringPiece(" "), *it);
+ ++it;
+ BOOST_CHECK(!it);
+}
+
+} // namespace
+} // namespace util