From 3106cf8eca76df8b46d139b8f5ce5002200d660d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 24 Oct 2011 18:17:24 +0100 Subject: KenLM update. EnumerateVocab moved up a namespace. Fix trie building when bigrams are pruned. Make Chris feel better about MurmurHashNative. --- klm/util/tokenize_piece_test.cc | 94 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 klm/util/tokenize_piece_test.cc (limited to 'klm/util/tokenize_piece_test.cc') diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc new file mode 100644 index 00000000..e07ebcf5 --- /dev/null +++ b/klm/util/tokenize_piece_test.cc @@ -0,0 +1,94 @@ +#include "util/tokenize_piece.hh" +#include "util/string_piece.hh" + +#define BOOST_TEST_MODULE TokenIteratorTest +#include + +#include + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(simple) { + PieceIterator<' '> it("single spaced words."); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("single"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("words."), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(null_delimiter) { + const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; + PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("first"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("second"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("third"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(null_entries) { + const char str[] = "\0split\0\0 \0me\0 "; + PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); + BOOST_REQUIRE(it); + const char first[] = "\0split\0\0"; + BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); + ++it; + BOOST_REQUIRE(it); + const char second[] = "\0me\0"; + BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); + ++it; + BOOST_CHECK(!it); +} + +/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { + const char str[] = "nodelimit at all"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(str), *it); + ++it; + BOOST_CHECK(!it); +} +BOOST_AUTO_TEST_CASE(pipe_pipe_two) { + const char str[] = "|||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(remove_empty) { + const char str[] = "|||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_CHECK(!it); +}*/ + +BOOST_AUTO_TEST_CASE(remove_empty_keep) { + const char str[] = " |||"; + TokenIter it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(" "), *it); + ++it; + BOOST_CHECK(!it); +} + +} // namespace +} // namespace util -- cgit v1.2.3