// Unit tests for chart-based (RuleScore / ChartState) language model scoring.
// Each test builds scores through RuleScore and compares them either against
// hard-coded expected values or against plain left-to-right model scoring.

#include "lm/left.hh"
#include "lm/model.hh"

#include "util/tokenize_piece.hh"

#include <algorithm>
#include <utility>
#include <vector>

#define BOOST_TEST_MODULE LeftTest
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>

namespace lm {
namespace ngram {
namespace {

// Appends the terminal `word` to the rule being scored.  Expects a RuleScore
// named `score` and a model named `m` to be in scope at the expansion site.
#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
// Checks that `word` maps to the vocabulary index `value` in model `m`.
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);

// Apparently some Boost versions use templates and are pretty strict about types matching.
// All three arguments are therefore cast to double before being handed to
// BOOST_CHECK_CLOSE (whose tolerance is expressed in percent).
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));

// Scores the rule "more loin", then prepends "little" and "to" to it as
// separate rules, checking the score and the resulting ChartState each time.
template <class M> void Short(const M &m) {
  ChartState base;
  {
    RuleScore<M> score(m, base);
    Term("more");
    Term("loin");
    SLOPPY_CHECK_CLOSE(-1.206319 - 0.3561665, score.Finish(), 0.001);
  }
  BOOST_CHECK(base.left.full);
  BOOST_CHECK_EQUAL(2, base.left.length);
  BOOST_CHECK_EQUAL(1, base.right.length);
  VCheck("loin", base.right.words[0]);

  ChartState more_left;
  {
    RuleScore<M> score(m, more_left);
    Term("little");
    score.NonTerminal(base, -1.206319 - 0.3561665);
    // p(little more loin | null context)
    SLOPPY_CHECK_CLOSE(-1.56538, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(3, more_left.left.length);
  BOOST_CHECK_EQUAL(1, more_left.right.length);
  VCheck("loin", more_left.right.words[0]);
  BOOST_CHECK(more_left.left.full);

  ChartState shorter;
  {
    RuleScore<M> score(m, shorter);
    Term("to");
    score.NonTerminal(base, -1.206319 - 0.3561665);
    // Note the looser tolerance (0.01) on this check compared to the others.
    SLOPPY_CHECK_CLOSE(-0.30103 - 1.687872 - 1.206319 - 0.3561665, score.Finish(), 0.01);
  }
  BOOST_CHECK_EQUAL(1, shorter.left.length);
  BOOST_CHECK_EQUAL(1, shorter.right.length);
  VCheck("loin", shorter.right.words[0]);
  BOOST_CHECK(shorter.left.full);
}

// Scores "on more", extends it on the left with "looking", and finally
// prefixes the result with the begin-of-sentence marker.
template <class M> void Charge(const M &m) {
  ChartState base;
  {
    RuleScore<M> score(m, base);
    Term("on");
    Term("more");
    SLOPPY_CHECK_CLOSE(-1.509559 -0.4771212 -1.206319, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(1, base.left.length);
  BOOST_CHECK_EQUAL(1, base.right.length);
  VCheck("more", base.right.words[0]);
  BOOST_CHECK(base.left.full);

  ChartState extend;
  {
    RuleScore<M> score(m, extend);
    Term("looking");
    score.NonTerminal(base, -1.509559 -0.4771212 -1.206319);
    SLOPPY_CHECK_CLOSE(-3.91039, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(2, extend.left.length);
  BOOST_CHECK_EQUAL(1, extend.right.length);
  VCheck("more", extend.right.words[0]);
  BOOST_CHECK(extend.left.full);

  ChartState tobos;
  {
    RuleScore<M> score(m, tobos);
    score.BeginSentence();
    score.NonTerminal(extend, -3.91039);
    SLOPPY_CHECK_CLOSE(-3.471169, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(0, tobos.left.length);
  BOOST_CHECK_EQUAL(1, tobos.right.length);
}

// Reference scorer: feeds `words` to m.Score() one at a time, in order, and
// returns the sum of the returned scores.  Scoring starts from
// m.BeginSentenceState() when `begin_sentence` is true and from
// m.NullContextState() otherwise.
template <class M> float LeftToRight(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) {
  float ret = 0.0;
  State right = begin_sentence ? m.BeginSentenceState() : m.NullContextState();
  for (std::vector<WordIndex>::const_iterator i = words.begin(); i != words.end(); ++i) {
    // Score() reads the incoming state and writes the outgoing one, so the
    // input has to be a copy of `right`.
    State copy(right);
    ret += m.Score(copy, *i, right);
  }
  return ret;
}

// Scores `words` by growing a chart state from the last word towards the
// first: every step forms a rule "word, <everything scored so far>".  When
// `begin_sentence` is true a final rule prefixes the begin-of-sentence marker.
// Returns the score reported by the last rule (0.0 if nothing was scored).
template <class M> float RightToLeft(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) {
  float ret = 0.0;
  // Start from a state whose left and right sides are explicitly empty.
  ChartState state;
  state.left.length = 0;
  state.right.length = 0;
  state.left.full = false;
  for (std::vector<WordIndex>::const_reverse_iterator i = words.rbegin(); i != words.rend(); ++i) {
    // RuleScore writes its result into `state`, so the previous value is
    // copied out before it is consumed as a non-terminal.
    ChartState copy(state);
    RuleScore<M> score(m, state);
    score.Terminal(*i);
    score.NonTerminal(copy, ret);
    ret = score.Finish();
  }
  if (begin_sentence) {
    ChartState copy(state);
    RuleScore<M> score(m, state);
    score.BeginSentence();
    score.NonTerminal(copy, ret);
    ret = score.Finish();
  }
  return ret;
}

// Scores `words` bottom-up: each word is first scored on its own, then
// adjacent (state, score) pairs are merged two at a time, level by level,
// until a single pair remains.  When `begin_sentence` is true the
// begin-of-sentence marker is prefixed in one last rule.  Returns 0.0 for an
// empty `words`.
template <class M> float TreeMiddle(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) {
  std::vector<std::pair<ChartState, float> > states(words.size());
  for (unsigned int i = 0; i < words.size(); ++i) {
    RuleScore<M> score(m, states[i].first);
    score.Terminal(words[i]);
    states[i].second = score.Finish();
  }
  while (states.size() > 1) {
    std::vector<std::pair<ChartState, float> > upper((states.size() + 1) / 2);
    for (unsigned int i = 0; i < states.size() / 2; ++i) {
      RuleScore<M> score(m, upper[i].first);
      score.NonTerminal(states[i*2].first, states[i*2].second);
      score.NonTerminal(states[i*2+1].first, states[i*2+1].second);
      upper[i].second = score.Finish();
    }
    // An odd element has no partner on this level; carry it up unchanged.
    if (states.size() % 2) {
      upper.back() = states.back();
    }
    std::swap(states, upper);
  }

  if (states.empty()) return 0.0;

  if (begin_sentence) {
    ChartState ignored;
    RuleScore<M> score(m, ignored);
    score.BeginSentence();
    score.NonTerminal(states.front().first, states.front().second);
    return score.Finish();
  } else {
    return states.front().second;
  }
}

// Replaces the contents of `out` with the vocabulary index of every
// space-separated token of `str`, in order.
// NOTE(review): the `true` template argument of TokenIter presumably makes it
// skip empty tokens -- confirm in util/tokenize_piece.hh.
template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vector<WordIndex> &out) {
  out.clear();
  for (util::TokenIter<util::SingleCharacter, true> i(str, ' '); i; ++i) {
    out.push_back(m.GetVocabulary().Index(*i));
  }
}

// Checks that RightToLeft and TreeMiddle agree with LeftToRight on `str`.
// Expects `m`, `words`, `expect` and `rest` to be in scope at the expansion
// site; `rest` is forwarded as the begin_sentence argument of all three.
#define TEXT_TEST(str) \
  LookupVocab(m, str, words); \
  expect = LeftToRight(m, words, rest); \
  SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
  SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001);

// Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m, bool rest = false) {
  std::vector<WordIndex> words;
  float expect;
  TEXT_TEST("in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
  TEXT_TEST("on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
  TEXT_TEST("on a little more loin also would consider higher to look good");
  TEXT_TEST("more loin also would consider higher to look good");
  TEXT_TEST("more loin also would consider higher to look");
  TEXT_TEST("also would consider higher to look");
  TEXT_TEST("also would consider higher");
  TEXT_TEST("would consider higher to look");
  TEXT_TEST("consider higher to look");
  TEXT_TEST("consider higher to");
  TEXT_TEST("consider higher");
}

// Same consistency checks as GrowBig, on a few short phrases.
template <class M> void GrowSmall(const M &m, bool rest = false) {
  std::vector<WordIndex> words;
  float expect;
  TEXT_TEST("in biarritz watching considering looking . </s>");
  TEXT_TEST("in biarritz watching considering looking .");
  TEXT_TEST("in biarritz");
}

// Scores "also would consider higher" piece by piece, checking intermediate
// scores and state lengths, including that combining two single-word states
// gives the same score as scoring the two words as terminals of one rule.
template <class M> void AlsoWouldConsiderHigher(const M &m) {
  ChartState also;
  {
    RuleScore<M> score(m, also);
    score.Terminal(m.GetVocabulary().Index("also"));
    SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
  }
  ChartState would;
  {
    RuleScore<M> score(m, would);
    score.Terminal(m.GetVocabulary().Index("would"));
    SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
  }
  ChartState combine_also_would;
  {
    RuleScore<M> score(m, combine_also_would);
    score.NonTerminal(also, -1.687872);
    score.NonTerminal(would, -1.687872);
    SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(2, combine_also_would.right.length);

  ChartState also_would;
  {
    RuleScore<M> score(m, also_would);
    score.Terminal(m.GetVocabulary().Index("also"));
    score.Terminal(m.GetVocabulary().Index("would"));
    SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(2, also_would.right.length);

  ChartState consider;
  {
    RuleScore<M> score(m, consider);
    score.Terminal(m.GetVocabulary().Index("consider"));
    SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(1, consider.left.length);
  BOOST_CHECK_EQUAL(1, consider.right.length);
  BOOST_CHECK(!consider.left.full);

  ChartState higher;
  float higher_score;
  {
    RuleScore<M> score(m, higher);
    score.Terminal(m.GetVocabulary().Index("higher"));
    higher_score = score.Finish();
  }
  SLOPPY_CHECK_CLOSE(-1.509559, higher_score, 0.001);
  BOOST_CHECK_EQUAL(1, higher.left.length);
  BOOST_CHECK_EQUAL(1, higher.right.length);
  BOOST_CHECK(!higher.left.full);
  VCheck("higher", higher.right.words[0]);
  SLOPPY_CHECK_CLOSE(-0.30103, higher.right.backoff[0], 0.001);

  ChartState consider_higher;
  {
    RuleScore<M> score(m, consider_higher);
    score.NonTerminal(consider, -1.687872);
    score.NonTerminal(higher, higher_score);
    SLOPPY_CHECK_CLOSE(-1.509559 - 1.687872 - 0.30103, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(2, consider_higher.left.length);
  BOOST_CHECK(!consider_higher.left.full);

  ChartState full;
  {
    RuleScore<M> score(m, full);
    score.NonTerminal(combine_also_would, -1.687872 - 2.0);
    score.NonTerminal(consider_higher, -1.509559 - 1.687872 - 0.30103);
    SLOPPY_CHECK_CLOSE(-10.6879, score.Finish(), 0.001);
  }
  BOOST_CHECK_EQUAL(4, full.right.length);
}

// Checks that `val` is close to the LeftToRight score of `str` (scored from
// the null context).  `val` is evaluated exactly once.  Expects `m` to be in
// scope at the expansion site.
#define CHECK_SCORE(str, val) \
{ \
  float got = val; \
  std::vector<WordIndex> indices; \
  LookupVocab(m, str, indices); \
  SLOPPY_CHECK_CLOSE(LeftToRight(m, indices), got, 0.001); \
}

// Builds "in biarritz watching considering looking . </s>" as an explicit
// binary tree (7 leaves -> 4 -> 2 -> 1) and checks the score at each checked
// node against LeftToRight scoring of the words that node covers.
template <class M> void FullGrow(const M &m) {
  std::vector<WordIndex> words;
  LookupVocab(m, "in biarritz watching considering looking . </s>", words);
  // The fixed-size arrays below index words[0..6]; abort the test instead of
  // reading out of bounds if tokenization did not yield exactly 7 entries.
  BOOST_REQUIRE(words.size() == 7u);

  ChartState lexical[7];
  float lexical_scores[7];
  for (unsigned int i = 0; i < 7; ++i) {
    RuleScore<M> score(m, lexical[i]);
    score.Terminal(words[i]);
    lexical_scores[i] = score.Finish();
  }
  CHECK_SCORE("in", lexical_scores[0]);
  CHECK_SCORE("biarritz", lexical_scores[1]);
  CHECK_SCORE("watching", lexical_scores[2]);
  CHECK_SCORE("</s>", lexical_scores[6]);

  ChartState l1[4];
  float l1_scores[4];
  {
    RuleScore<M> score(m, l1[0]);
    score.NonTerminal(lexical[0], lexical_scores[0]);
    score.NonTerminal(lexical[1], lexical_scores[1]);
    CHECK_SCORE("in biarritz", l1_scores[0] = score.Finish());
  }
  {
    RuleScore<M> score(m, l1[1]);
    score.NonTerminal(lexical[2], lexical_scores[2]);
    score.NonTerminal(lexical[3], lexical_scores[3]);
    CHECK_SCORE("watching considering", l1_scores[1] = score.Finish());
  }
  {
    RuleScore<M> score(m, l1[2]);
    score.NonTerminal(lexical[4], lexical_scores[4]);
    score.NonTerminal(lexical[5], lexical_scores[5]);
    CHECK_SCORE("looking .", l1_scores[2] = score.Finish());
  }
  BOOST_CHECK_EQUAL(l1[2].left.length, 1);
  // The seventh leaf has no partner; promote it to the next level as-is.
  l1[3] = lexical[6];
  l1_scores[3] = lexical_scores[6];

  ChartState l2[2];
  float l2_scores[2];
  {
    RuleScore<M> score(m, l2[0]);
    score.NonTerminal(l1[0], l1_scores[0]);
    score.NonTerminal(l1[1], l1_scores[1]);
    CHECK_SCORE("in biarritz watching considering", l2_scores[0] = score.Finish());
  }
  {
    RuleScore<M> score(m, l2[1]);
    score.NonTerminal(l1[2], l1_scores[2]);
    score.NonTerminal(l1[3], l1_scores[3]);
    CHECK_SCORE("looking . </s>", l2_scores[1] = score.Finish());
  }
  BOOST_CHECK_EQUAL(l2[1].left.length, 1);
  BOOST_CHECK(l2[1].left.full);

  ChartState top;
  {
    RuleScore<M> score(m, top);
    score.NonTerminal(l2[0], l2_scores[0]);
    score.NonTerminal(l2[1], l2_scores[1]);
    CHECK_SCORE("in biarritz watching considering looking . </s>", score.Finish());
  }
}

// Returns the model file to load: the first command-line argument given to
// the test binary if there is one, otherwise "test.arpa".
const char *FileLocation() {
  if (boost::unit_test::framework::master_test_suite().argc < 2) {
    return "test.arpa";
  }
  return boost::unit_test::framework::master_test_suite().argv[1];
}

// Loads the model file as model type M (with config.messages set to NULL) and
// runs every test routine above on it.
template <class M> void Everything() {
  Config config;
  config.messages = NULL;
  M m(FileLocation(), config);

  Short(m);
  Charge(m);
  GrowBig(m);
  AlsoWouldConsiderHigher(m);
  GrowSmall(m);
  FullGrow(m);
}

BOOST_AUTO_TEST_CASE(ProbingAll) {
  Everything<Model>();
}
BOOST_AUTO_TEST_CASE(TrieAll) {
  Everything<TrieModel>();
}
BOOST_AUTO_TEST_CASE(QuantTrieAll) {
  Everything<QuantTrieModel>();
}
BOOST_AUTO_TEST_CASE(ArrayQuantTrieAll) {
  Everything<QuantArrayTrieModel>();
}
BOOST_AUTO_TEST_CASE(ArrayTrieAll) {
  Everything<ArrayTrieModel>();
}

// RestProbingModel only runs the GrowBig consistency checks, with rest=true
// (so every scorer starts from the begin-of-sentence state).
BOOST_AUTO_TEST_CASE(RestProbing) {
  Config config;
  config.messages = NULL;
  RestProbingModel m(FileLocation(), config);
  GrowBig(m, true);
}

} // namespace
} // namespace ngram
} // namespace lm