Diffstat (limited to 'klm/util')
-rw-r--r--  klm/util/mmap.cc                |  2
-rw-r--r--  klm/util/murmur_hash.cc         | 15
-rw-r--r--  klm/util/probing_hash_table.hh  |  4
-rw-r--r--  klm/util/tokenize_piece.hh      | 75
-rw-r--r--  klm/util/tokenize_piece_test.cc | 94
5 files changed, 182 insertions, 8 deletions
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 5ce7adc9..279bafa8 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -15,7 +15,7 @@ namespace util {
 
 scoped_mmap::~scoped_mmap() {
   if (data_ != (void*)-1) {
-    // Thanks Denis Filimonov for pointing on NFS likes msync first.
+    // Thanks Denis Filimonov for pointing out NFS likes msync first.
     if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) {
       std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl;
       abort();
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index fec47fd9..ef5783fe 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -117,13 +117,18 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
   return h;
 }
 
+// Trick to test for 64-bit architecture at compile time.
+namespace {
+template <unsigned L> uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) {
+  return MurmurHash64A(key, len, seed);
+}
+template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) {
+  return MurmurHash64B(key, len, seed);
+}
+} // namespace
+
 uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
-  if (sizeof(int) == 4) {
-    return MurmurHash64B(key, len, seed);
-  } else {
-    return MurmurHash64A(key, len, seed);
-  }
+  return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
 }
 
 } // namespace util
 
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 2ec342a6..8122d69c 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -61,14 +61,14 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
 #endif
     {}
 
-    template <class T> void Insert(const T &t) {
+    template <class T> MutableIterator Insert(const T &t) {
       if (++entries_ >= buckets_)
         UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
 #ifdef DEBUG
       assert(initialized_);
 #endif
       for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
-        if (equal_(i->GetKey(), invalid_)) { *i = t; return; }
+        if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
         if (++i == end_) { i = begin_; }
       }
     }
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
index ee1c7ab2..413bda0b 100644
--- a/klm/util/tokenize_piece.hh
+++ b/klm/util/tokenize_piece.hh
@@ -5,6 +5,9 @@
 
 #include <boost/iterator/iterator_facade.hpp>
 
+#include <algorithm>
+#include <iostream>
+
 /* Usage:
  *
  * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) {
@@ -64,6 +67,78 @@ template <char d> class PieceIterator : public boost::iterator_facade<PieceItera
     StringPiece after_;
 };
 
+class MultiCharacter {
+  public:
+    explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
+
+    StringPiece Find(const StringPiece &in) const {
+      return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size());
+    }
+
+  private:
+    StringPiece delimiter_;
+};
+
+class AnyCharacter {
+  public:
+    explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
+
+    StringPiece Find(const StringPiece &in) const {
+      return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
+    }
+
+  private:
+    StringPiece chars_;
+};
+
+template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
+  public:
+    TokenIter() {}
+
+    TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) {
+      increment();
+    }
+
+    bool operator!() const {
+      return current_.data() == 0;
+    }
+    operator bool() const {
+      return current_.data() != 0;
+    }
+
+    static TokenIter<Find> end() {
+      return TokenIter<Find>();
+    }
+
+  private:
+    friend class boost::iterator_core_access;
+
+    void increment() {
+      do {
+        StringPiece found(finder_.Find(after_));
+        current_ = StringPiece(after_.data(), found.data() - after_.data());
+        if (found.data() == after_.data() + after_.size()) {
+          after_ = StringPiece(NULL, 0);
+        } else {
+          after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size());
+        }
+      } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false.
+    }
+
+    bool equal(const TokenIter<Find> &other) const {
+      return after_.data() == other.after_.data();
+    }
+
+    const StringPiece &dereference() const {
+      return current_;
+    }
+
+    StringPiece current_;
+    StringPiece after_;
+
+    Find finder_;
+};
+
 } // namespace util
 
 #endif // UTIL_TOKENIZE_PIECE__
diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc
new file mode 100644
index 00000000..e07ebcf5
--- /dev/null
+++ b/klm/util/tokenize_piece_test.cc
@@ -0,0 +1,94 @@
+#include "util/tokenize_piece.hh"
+#include "util/string_piece.hh"
+
+#define BOOST_TEST_MODULE TokenIteratorTest
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+
+namespace util {
+namespace {
+
+BOOST_AUTO_TEST_CASE(simple) {
+  PieceIterator<' '> it("single spaced words.");
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("single"), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("spaced"), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("words."), *it);
+  ++it;
+  BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(null_delimiter) {
+  const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0";
+  PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1));
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("first"), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("second"), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("third"), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece("fourth"), *it);
+  ++it;
+  BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(null_entries) {
+  const char str[] = "\0split\0\0 \0me\0 ";
+  PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1));
+  BOOST_REQUIRE(it);
+  const char first[] = "\0split\0\0";
+  BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  const char second[] = "\0me\0";
+  BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it);
+  ++it;
+  BOOST_CHECK(!it);
+}
+
+/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
+  const char str[] = "nodelimit at all";
+  TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece(str), *it);
+  ++it;
+  BOOST_CHECK(!it);
+}
+BOOST_AUTO_TEST_CASE(pipe_pipe_two) {
+  const char str[] = "|||";
+  TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece(), *it);
+  ++it;
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece(), *it);
+  ++it;
+  BOOST_CHECK(!it);
+}
+
+BOOST_AUTO_TEST_CASE(remove_empty) {
+  const char str[] = "|||";
+  TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
+  BOOST_CHECK(!it);
+}*/
+
+BOOST_AUTO_TEST_CASE(remove_empty_keep) {
+  const char str[] = " |||";
+  TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
+  BOOST_REQUIRE(it);
+  BOOST_CHECK_EQUAL(StringPiece(" "), *it);
+  ++it;
+  BOOST_CHECK(!it);
+}
+
+} // namespace
+} // namespace util
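
For context, a minimal usage sketch of the TokenIter, MultiCharacter, and AnyCharacter classes this commit adds to tokenize_piece.hh. The sample line and the " ||| " field convention are illustrative assumptions borrowed from the commented-out tests above, not something the commit itself wires up; the StringPiece stream operator comes from util/string_piece.hh.

#include "util/tokenize_piece.hh"
#include "util/string_piece.hh"

#include <iostream>

int main() {
  // Hypothetical rule line with three fields separated by " ||| ".
  StringPiece line("el gato ||| the cat ||| 0.5");
  // Outer loop: split the line on the multi-character delimiter.
  for (util::TokenIter<util::MultiCharacter> field(line, util::MultiCharacter(" ||| ")); field; ++field) {
    // Inner loop: split each field on space or tab; SkipEmpty = true drops empty tokens.
    for (util::TokenIter<util::AnyCharacter, true> word(*field, util::AnyCharacter(" \t")); word; ++word) {
      std::cout << '[' << *word << "] ";
    }
    std::cout << '\n';
  }
  return 0;
}

Note that TokenIter only stores pointers into the original buffer (StringPiece is non-owning), so `line` must outlive the iterators.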

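The murmur_hash.cc hunk replaces a run-time `sizeof(int) == 4` branch with a function template specialized on the pointer size, so the backend is chosen at compile time. Below is a standalone sketch of that pattern with hypothetical names (it is not kenlm code); the dispatch happens entirely through the `<sizeof(void*)>` template argument.

#include <iostream>

namespace {

// Primary template: picked for any pointer size other than 4, i.e. 64-bit builds.
template <unsigned PtrSize> const char *BackendName() {
  return "64-bit backend (MurmurHash64A-style)";
}

// Specialization for 4-byte pointers, i.e. 32-bit builds.
template <> const char *BackendName<4>() {
  return "32-bit backend (MurmurHash64B-style)";
}

} // namespace

int main() {
  // sizeof(void*) is a compile-time constant, so the compiler selects the
  // specialization; no run-time branch remains, unlike the old sizeof(int) test.
  std::cout << BackendName<sizeof(void*)>() << std::endl;
  return 0;
}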