diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-18 23:24:01 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-10-18 23:24:01 +0000 |
commit | de379496ee411993dff94e52f393f6e19437a204 (patch) | |
tree | a3fdb3b299100384e0a82dd2bc424fd52177d411 /klm/lm/virtual_interface.hh | |
parent | 08ff0e0332b562dd9c1f36fce24439db81287c68 (diff) |
kenneth's LM preliminary integration
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@681 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'klm/lm/virtual_interface.hh')
-rw-r--r-- | klm/lm/virtual_interface.hh | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/klm/lm/virtual_interface.hh b/klm/lm/virtual_interface.hh new file mode 100644 index 00000000..621a129e --- /dev/null +++ b/klm/lm/virtual_interface.hh @@ -0,0 +1,156 @@ +#ifndef LM_VIRTUAL_INTERFACE__ +#define LM_VIRTUAL_INTERFACE__ + +#include "lm/word_index.hh" +#include "util/string_piece.hh" + +#include <string> + +namespace lm { + +struct FullScoreReturn { + float prob; + unsigned char ngram_length; +}; + +namespace base { + +template <class T, class U, class V> class ModelFacade; + +/* Vocabulary interface. Call Index(string) and get a word index for use in + * calling Model. It provides faster convenience functions for <s>, </s>, and + * <unk> although you can also find these using Index. + * + * Some models do not load the mapping from index to string. If you need this, + * check if the model Vocabulary class implements such a function and access it + * directly. + * + * The Vocabulary object is always owned by the Model and can be retrieved from + * the Model using BaseVocabulary() for this abstract interface or + * GetVocabulary() for the actual implementation (in which case you'll need the + * actual implementation of the Model too). + */ +class Vocabulary { + public: + virtual ~Vocabulary(); + + WordIndex BeginSentence() const { return begin_sentence_; } + WordIndex EndSentence() const { return end_sentence_; } + WordIndex NotFound() const { return not_found_; } + // FullScoreReturn start index of unused word assignments. + WordIndex Available() const { return available_; } + + /* Most implementations allow StringPiece lookups and need only override + * Index(StringPiece). SRI requires null termination and overrides all + * three methods. + */ + virtual WordIndex Index(const StringPiece &str) const = 0; + virtual WordIndex Index(const std::string &str) const { + return Index(StringPiece(str)); + } + virtual WordIndex Index(const char *str) const { + return Index(StringPiece(str)); + } + + protected: + // Call SetSpecial afterward. + Vocabulary() {} + + Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available) { + SetSpecial(begin_sentence, end_sentence, not_found, available); + } + + void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available); + + WordIndex begin_sentence_, end_sentence_, not_found_, available_; + + private: + // Disable copy constructors. They're private and undefined. + // Ersatz boost::noncopyable. + Vocabulary(const Vocabulary &); + Vocabulary &operator=(const Vocabulary &); +}; + +/* There are two ways to access a Model. + * + * + * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in ngram.hh). + * Every Model implements the scoring function: + * float Score( + * const Model::State &in_state, + * const WordIndex new_word, + * Model::State &out_state) const; + * + * It can also return the length of n-gram matched by the model: + * FullScoreReturn FullScore( + * const Model::State &in_state, + * const WordIndex new_word, + * Model::State &out_state) const; + * + * There are also accessor functions: + * const State &BeginSentenceState() const; + * const State &NullContextState() const; + * const Vocabulary &GetVocabulary() const; + * unsigned int Order() const; + * + * NB: In case you're wondering why the model implementation looks like it's + * missing these methods, see facade.hh. + * + * This is the fastest way to use a model and presents a normal State class to + * be included in hypothesis state structure. + * + * + * OPTION 2: Use the virtual interface below. + * + * The virtual interface allow you to decide which Model to use at runtime + * without templatizing everything on the Model type. However, each Model has + * its own State class, so a single State cannot be efficiently provided (it + * would require using the maximum memory of any Model's State or memory + * allocation with each lookup). This means you become responsible for + * allocating memory with size StateSize() and passing it to the Score or + * FullScore functions provided here. + * + * For example, cdec has a std::string containing the entire state of a + * hypothesis. It can reserve StateSize bytes in this string for the model + * state. + * + * All the State objects are POD, so it's ok to use raw memory for storing + * State. + */ +class Model { + public: + virtual ~Model(); + + size_t StateSize() const { return state_size_; } + const void *BeginSentenceMemory() const { return begin_sentence_memory_; } + const void *NullContextMemory() const { return null_context_memory_; } + + virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + + virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + + unsigned char Order() const { return order_; } + + const Vocabulary &BaseVocabulary() const { return *base_vocab_; } + + private: + template <class T, class U, class V> friend class ModelFacade; + explicit Model(size_t state_size) : state_size_(state_size) {} + + const size_t state_size_; + const void *begin_sentence_memory_, *null_context_memory_; + + const Vocabulary *base_vocab_; + + unsigned char order_; + + // Disable copy constructors. They're private and undefined. + // Ersatz boost::noncopyable. + Model(const Model &); + Model &operator=(const Model &); +}; + +} // mamespace base +} // namespace lm + +#endif // LM_VIRTUAL_INTERFACE__ |