Diffstat (limited to 'klm/lm/model.hh')
-rw-r--r--  klm/lm/model.hh  33
1 file changed, 23 insertions(+), 10 deletions(-)
diff --git a/klm/lm/model.hh b/klm/lm/model.hh
index fd9640c3..b85ccdcc 100644
--- a/klm/lm/model.hh
+++ b/klm/lm/model.hh
@@ -65,7 +65,7 @@ size_t hash_value(const State &state);
namespace detail {
// Should return the same results as SRI.
-// Why VocabularyT instead of just Vocabulary? ModelFacade defines Vocabulary.
+// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
@@ -75,23 +75,37 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// itself.
static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
+ /* Load the model from a file. It may be an ARPA or binary file. Binary
+ * files must have the format expected by this class or you'll get an
+ * exception. So TrieModel can only load ARPA or binary created by
+ * TrieModel. To classify binary files, call RecognizeBinary in
+ * lm/binary_format.hh.
+ */
GenericModel(const char *file, const Config &config = Config());
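A minimal loading sketch to go with the comment above (not part of this change); it assumes the kenlm headers are on the include path and "file.arpa" is a placeholder path:

#include "lm/model.hh"

int main() {
  // ARPA text or a binary previously built by the same (probing) model class;
  // a mismatched binary raises an exception per the comment above.
  lm::ngram::Model model("file.arpa");
  return 0;
}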
+ /* Score p(new_word | in_state) and incorporate new_word into out_state.
+ * Note that in_state and out_state must be different references:
+ * &in_state != &out_state.
+ */
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
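A hedged sketch of the usual FullScore loop, not part of this diff: keep the model's State alongside your hypothesis and feed each word's out_state back in as the next in_state. The word strings are placeholders and Model is the default probing typedef:

#include "lm/model.hh"

float ScoreSentence(const lm::ngram::Model &model) {
  const lm::ngram::Vocabulary &vocab = model.GetVocabulary();
  lm::ngram::State state(model.BeginSentenceState()), out_state;
  const char *words[] = {"the", "cat", "sat"};
  float total = 0.0;  // accumulated log10 probability
  for (int i = 0; i < 3; ++i) {
    // in_state and out_state are distinct objects, as required above.
    total += model.FullScore(state, vocab.Index(words[i]), out_state).prob;
    state = out_state;
  }
  // Close the sentence with </s>.
  total += model.FullScore(state, vocab.EndSentence(), out_state).prob;
  return total;
}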
- /* Slower call without in_state. Don't use this if you can avoid it. This
- * is mostly a hack for Hieu to integrate it into Moses which sometimes
- * forgets LM state (i.e. it doesn't store it with the phrase). Sigh.
- * The context indices should be in an array.
- * If context_rbegin != context_rend then *context_rbegin is the word
- * before new_word.
+ /* Slower call without in_state. Try to remember state, but sometimes it
+ * would cost too much memory or your decoder isn't set up properly.
+ * To use this function, make an array of WordIndex containing the context
+ * vocabulary ids in reverse order. Then, pass the bounds of the array:
+ * [context_rbegin, context_rend). The new_word is not part of the context
+ * array unless you intend to repeat words.
*/
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
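A sketch of the reverse-order context array described above (again not part of this diff; the words are placeholders): to score "sat" after "the cat", the id of "cat" comes first because it is the word immediately before new_word:

#include "lm/model.hh"

float ScoreWithoutState(const lm::ngram::Model &model) {
  const lm::ngram::Vocabulary &vocab = model.GetVocabulary();
  // Reverse order: context[0] is the word just before new_word.
  lm::WordIndex context[2] = {vocab.Index("cat"), vocab.Index("the")};
  lm::ngram::State out_state;
  return model.FullScoreForgotState(context, context + 2,
                                    vocab.Index("sat"), out_state).prob;
}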
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or EmptyContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
- * FullScoreForgotState. */
+ * FullScoreForgotState.
+ * To use this function, make an array of WordIndex containing the context
+ * vocabulary ids in reverse order. Then, pass the bounds of the array:
+ * [context_rbegin, context_rend).
+ */
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
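And a companion sketch for GetState with the same placeholder context in reverse order; the resulting State can then seed repeated FullScore calls:

#include "lm/model.hh"

void SeedState(const lm::ngram::Model &model, lm::ngram::State &state) {
  const lm::ngram::Vocabulary &vocab = model.GetVocabulary();
  lm::WordIndex context[2] = {vocab.Index("cat"), vocab.Index("the")};  // reverse order
  model.GetState(context, context + 2, state);
  // state now encodes the context; extend it with FullScore as usual.
}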
private:
@@ -131,9 +145,8 @@ typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingMod
// Default implementation. No real reason for it to be the default.
typedef ProbingModel Model;
+// Smaller implementation.
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
-typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel;
-
typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel;
} // namespace ngram
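Since ProbingModel and TrieModel are both instantiations of GenericModel, they expose the same interface sketched above; what differs is the in-memory data structure and the matching binary format. A hedged illustration with a placeholder path:

#include "lm/model.hh"

void LoadEither() {
  lm::ngram::ProbingModel hashed("file.arpa");  // default Model typedef, hash-based search
  lm::ngram::TrieModel packed("file.arpa");     // the smaller trie implementation
  // Each class reads ARPA directly, but a binary must have been built by the
  // matching class; RecognizeBinary in lm/binary_format.hh classifies binaries.
}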