diff options
Diffstat (limited to 'klm/lm/model.hh')
-rw-r--r-- | klm/lm/model.hh | 33 |
1 file changed, 23 insertions, 10 deletions
diff --git a/klm/lm/model.hh b/klm/lm/model.hh index fd9640c3..b85ccdcc 100644 --- a/klm/lm/model.hh +++ b/klm/lm/model.hh @@ -65,7 +65,7 @@ size_t hash_value(const State &state); namespace detail { // Should return the same results as SRI. -// Why VocabularyT instead of just Vocabulary? ModelFacade defines Vocabulary. +// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts. template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> { private: typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P; @@ -75,23 +75,37 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod // itself. static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config()); + /* Load the model from a file. It may be an ARPA or binary file. Binary + * files must have the format expected by this class or you'll get an + * exception. So TrieModel can only load ARPA or binary created by + * TrieModel. To classify binary files, call RecognizeBinary in + * lm/binary_format.hh. + */ GenericModel(const char *file, const Config &config = Config()); + /* Score p(new_word | in_state) and incorporate new_word into out_state. + * Note that in_state and out_state must be different references: + * &in_state != &out_state. + */ FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; - /* Slower call without in_state. Don't use this if you can avoid it. This - * is mostly a hack for Hieu to integrate it into Moses which sometimes - * forgets LM state (i.e. it doesn't store it with the phrase). Sigh. - * The context indices should be in an array. - * If context_rbegin != context_rend then *context_rbegin is the word - * before new_word. + /* Slower call without in_state. Try to remember state, but sometimes it + * would cost too much memory or your decoder isn't set up properly. 
+ * To use this function, make an array of WordIndex containing the context + * vocabulary ids in reverse order. Then, pass the bounds of the array: + * [context_rbegin, context_rend). The new_word is not part of the context + * array unless you intend to repeat words. */ FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; /* Get the state for a context. Don't use this if you can avoid it. Use * BeginSentenceState or EmptyContextState and extend from those. If * you're only going to use this state to call FullScore once, use - * FullScoreForgotState. */ + * FullScoreForgotState. + * To use this function, make an array of WordIndex containing the context + * vocabulary ids in reverse order. Then, pass the bounds of the array: + * [context_rbegin, context_rend). + */ void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const; private: @@ -131,9 +145,8 @@ typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingMod // Default implementation. No real reason for it to be the default. typedef ProbingModel Model; +// Smaller implementation. typedef ::lm::ngram::SortedVocabulary SortedVocabulary; -typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel; - typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel; } // namespace ngram |