summaryrefslogtreecommitdiff
path: root/klm/lm/virtual_interface.hh
diff options
context:
space:
mode:
Diffstat (limited to 'klm/lm/virtual_interface.hh')
-rw-r--r--klm/lm/virtual_interface.hh156
1 files changed, 156 insertions, 0 deletions
diff --git a/klm/lm/virtual_interface.hh b/klm/lm/virtual_interface.hh
new file mode 100644
index 00000000..621a129e
--- /dev/null
+++ b/klm/lm/virtual_interface.hh
@@ -0,0 +1,156 @@
+#ifndef LM_VIRTUAL_INTERFACE__
+#define LM_VIRTUAL_INTERFACE__
+
+#include "lm/word_index.hh"
+#include "util/string_piece.hh"
+
+#include <string>
+
+namespace lm {
+
+struct FullScoreReturn {
+ float prob;
+ unsigned char ngram_length;
+};
+
+namespace base {
+
+template <class T, class U, class V> class ModelFacade;
+
+/* Vocabulary interface. Call Index(string) and get a word index for use in
+ * calling Model. It provides faster convenience functions for <s>, </s>, and
+ * <unk> although you can also find these using Index.
+ *
+ * Some models do not load the mapping from index to string. If you need this,
+ * check if the model Vocabulary class implements such a function and access it
+ * directly.
+ *
+ * The Vocabulary object is always owned by the Model and can be retrieved from
+ * the Model using BaseVocabulary() for this abstract interface or
+ * GetVocabulary() for the actual implementation (in which case you'll need the
+ * actual implementation of the Model too).
+ */
+class Vocabulary {
+ public:
+ virtual ~Vocabulary();
+
+ WordIndex BeginSentence() const { return begin_sentence_; }
+ WordIndex EndSentence() const { return end_sentence_; }
+ WordIndex NotFound() const { return not_found_; }
+ // FullScoreReturn start index of unused word assignments.
+ WordIndex Available() const { return available_; }
+
+ /* Most implementations allow StringPiece lookups and need only override
+ * Index(StringPiece). SRI requires null termination and overrides all
+ * three methods.
+ */
+ virtual WordIndex Index(const StringPiece &str) const = 0;
+ virtual WordIndex Index(const std::string &str) const {
+ return Index(StringPiece(str));
+ }
+ virtual WordIndex Index(const char *str) const {
+ return Index(StringPiece(str));
+ }
+
+ protected:
+ // Call SetSpecial afterward.
+ Vocabulary() {}
+
+ Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available) {
+ SetSpecial(begin_sentence, end_sentence, not_found, available);
+ }
+
+ void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available);
+
+ WordIndex begin_sentence_, end_sentence_, not_found_, available_;
+
+ private:
+ // Disable copy constructors. They're private and undefined.
+ // Ersatz boost::noncopyable.
+ Vocabulary(const Vocabulary &);
+ Vocabulary &operator=(const Vocabulary &);
+};
+
+/* There are two ways to access a Model.
+ *
+ *
+ * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in ngram.hh).
+ * Every Model implements the scoring function:
+ * float Score(
+ * const Model::State &in_state,
+ * const WordIndex new_word,
+ * Model::State &out_state) const;
+ *
+ * It can also return the length of n-gram matched by the model:
+ * FullScoreReturn FullScore(
+ * const Model::State &in_state,
+ * const WordIndex new_word,
+ * Model::State &out_state) const;
+ *
+ * There are also accessor functions:
+ * const State &BeginSentenceState() const;
+ * const State &NullContextState() const;
+ * const Vocabulary &GetVocabulary() const;
+ * unsigned int Order() const;
+ *
+ * NB: In case you're wondering why the model implementation looks like it's
+ * missing these methods, see facade.hh.
+ *
+ * This is the fastest way to use a model and presents a normal State class to
+ * be included in hypothesis state structure.
+ *
+ *
+ * OPTION 2: Use the virtual interface below.
+ *
+ * The virtual interface allow you to decide which Model to use at runtime
+ * without templatizing everything on the Model type. However, each Model has
+ * its own State class, so a single State cannot be efficiently provided (it
+ * would require using the maximum memory of any Model's State or memory
+ * allocation with each lookup). This means you become responsible for
+ * allocating memory with size StateSize() and passing it to the Score or
+ * FullScore functions provided here.
+ *
+ * For example, cdec has a std::string containing the entire state of a
+ * hypothesis. It can reserve StateSize bytes in this string for the model
+ * state.
+ *
+ * All the State objects are POD, so it's ok to use raw memory for storing
+ * State.
+ */
+class Model {
+ public:
+ virtual ~Model();
+
+ size_t StateSize() const { return state_size_; }
+ const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
+ const void *NullContextMemory() const { return null_context_memory_; }
+
+ virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
+
+ virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
+
+ unsigned char Order() const { return order_; }
+
+ const Vocabulary &BaseVocabulary() const { return *base_vocab_; }
+
+ private:
+ template <class T, class U, class V> friend class ModelFacade;
+ explicit Model(size_t state_size) : state_size_(state_size) {}
+
+ const size_t state_size_;
+ const void *begin_sentence_memory_, *null_context_memory_;
+
+ const Vocabulary *base_vocab_;
+
+ unsigned char order_;
+
+ // Disable copy constructors. They're private and undefined.
+ // Ersatz boost::noncopyable.
+ Model(const Model &);
+ Model &operator=(const Model &);
+};
+
+} // mamespace base
+} // namespace lm
+
+#endif // LM_VIRTUAL_INTERFACE__