diff options
Diffstat (limited to 'klm/lm/neural')
| -rw-r--r-- | klm/lm/neural/wordvecs.cc | 23 | ||||
| -rw-r--r-- | klm/lm/neural/wordvecs.hh | 38 | 
2 files changed, 61 insertions, 0 deletions
| diff --git a/klm/lm/neural/wordvecs.cc b/klm/lm/neural/wordvecs.cc new file mode 100644 index 00000000..09bb4260 --- /dev/null +++ b/klm/lm/neural/wordvecs.cc @@ -0,0 +1,23 @@ +#include "lm/neural/wordvecs.hh" + +#include "util/file_piece.hh" + +namespace lm { namespace neural { + +WordVecs::WordVecs(util::FilePiece &f) { +  const unsigned long lines = f.ReadULong(); +  const std::size_t vocab_mem = ngram::ProbingVocabulary::Size(lines, 1.5); +  vocab_backing_.reset(util::CallocOrThrow(vocab_mem)); +  vocab_.SetupMemory(vocab_backing_.get(), vocab_mem); +  const unsigned long width = f.ReadULong(); +  vecs_.resize(width, lines); +  for (unsigned long i = 0; i < lines; ++i) { +    WordIndex column = vocab_.Insert(f.ReadDelimited()); +    for (unsigned int row = 0; row < width; ++row) { +      vecs_(row,column) = f.ReadFloat(); +    } +  } +  vocab_.FinishedLoading(); +} + +}} // namespaces diff --git a/klm/lm/neural/wordvecs.hh b/klm/lm/neural/wordvecs.hh new file mode 100644 index 00000000..921a2b22 --- /dev/null +++ b/klm/lm/neural/wordvecs.hh @@ -0,0 +1,38 @@ +#ifndef LM_NEURAL_WORDVECS_H +#define LM_NEURAL_WORDVECS_H + +#include "util/scoped.hh" +#include "lm/vocab.hh" + +#include <Eigen/Dense> + +namespace util { class FilePiece; } + +namespace lm { +namespace neural { + +class WordVecs { +  public: +    // Columns of the matrix are word vectors.  The column index is the word. +    typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> Storage; + +    /* The file should begin with a line stating the number of word vectors and +     * the length of the vectors.  Then it's followed by lines containing a +     * word followed by floating-point values. +     */ +    explicit WordVecs(util::FilePiece &in); + +    const Storage &Vectors() const { return vecs_; } + +    WordIndex Index(StringPiece str) const { return vocab_.Index(str); } + +  private: +    util::scoped_malloc vocab_backing_; +    ngram::ProbingVocabulary vocab_; + +    Storage vecs_; +}; + +}} // namespaces + +#endif // LM_NEURAL_WORDVECS_H | 
