Merge with upstream

author: Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:11:38 -0500
committer: Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:11:38 -0500
commit: 1613f1fc44ca67820afd7e7b21eb54b316c8ce55 (patch)
tree: e02b77084f28a18df6b854f87a986124db44d717 /klm/lm/neural
parent: bd9308e22b5434aa220cc57d82ee867464a011f1 (diff)
parent: 796768086a687d3f1856fef6489c34fe4d373642 (diff)
2 files changed, 61 insertions, 0 deletions
diff --git a/klm/lm/neural/wordvecs.cc b/klm/lm/neural/wordvecs.cc
new file mode 100644
index 00000000..09bb4260
--- /dev/null
+++ b/klm/lm/neural/wordvecs.cc
@@ -0,0 +1,23 @@
+#include "lm/neural/wordvecs.hh"
+
+#include "util/file_piece.hh"
+
+namespace lm { namespace neural {
+
+// Builds the vocabulary and the vector matrix from f: two counts, then one word plus its floats per entry.
+WordVecs::WordVecs(util::FilePiece &f) {
+  const unsigned long lines = f.ReadULong();  // First count: number of word vectors.
+  const std::size_t vocab_mem = ngram::ProbingVocabulary::Size(lines, 1.5);
+  vocab_backing_.reset(util::CallocOrThrow(vocab_mem));  // vocab_backing_ owns the buffer vocab_ uses.
+  vocab_.SetupMemory(vocab_backing_.get(), vocab_mem);
+  const unsigned long width = f.ReadULong();  // Second count: length of every vector.
+  vecs_.resize(width, lines);  // One column per word, one row per vector component.
+  for (unsigned long i = 0; i < lines; ++i) {
+    WordIndex column = vocab_.Insert(f.ReadDelimited());  // NOTE(review): assumes column < lines -- confirm.
+    // row shares width's type: a narrower counter would wrap before reaching a width above UINT_MAX.
+    for (unsigned long row = 0; row < width; ++row) vecs_(row, column) = f.ReadFloat();
+  }
+  vocab_.FinishedLoading();
+}
+
+}} // namespaces
diff --git a/klm/lm/neural/wordvecs.hh b/klm/lm/neural/wordvecs.hh
new file mode 100644
index 00000000..921a2b22
--- /dev/null
+++ b/klm/lm/neural/wordvecs.hh
@@ -0,0 +1,38 @@
+#ifndef LM_NEURAL_WORDVECS_H
+#define LM_NEURAL_WORDVECS_H
+
+#include "util/scoped.hh"
+#include "lm/vocab.hh"
+
+#include <Eigen/Dense>
+
+namespace util { class FilePiece; }
+
+namespace lm {
+namespace neural {
+
+class WordVecs {  // Word vectors read from a util::FilePiece, with a vocabulary mapping words to columns.
+  public:
+    // Columns of the matrix are word vectors.  The column index is the word.
+    typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> Storage;
+
+    /* The file should begin with a line stating the number of word vectors and
+     * the length of the vectors.  Then it's followed by lines containing a
+     * word followed by floating-point values.
+     */
+    explicit WordVecs(util::FilePiece &in);
+    // Read-only access to the full matrix of vectors.
+    const Storage &Vectors() const { return vecs_; }
+    // Returns vocab_.Index(str); per the Storage comment, a word's index is its column in Vectors().
+    WordIndex Index(StringPiece str) const { return vocab_.Index(str); }
+
+  private:
+    util::scoped_malloc vocab_backing_;  // Buffer the constructor allocates and hands to vocab_.
+    ngram::ProbingVocabulary vocab_;
+    // Resized by the constructor to (vector length) rows by (number of vectors) columns.
+    Storage vecs_;
+};
+
+}} // namespaces
+
+#endif // LM_NEURAL_WORDVECS_H
author	Wu, Ke <wuke@cs.umd.edu>	2014-12-17 16:11:38 -0500
committer	Wu, Ke <wuke@cs.umd.edu>	2014-12-17 16:11:38 -0500
commit	1613f1fc44ca67820afd7e7b21eb54b316c8ce55 (patch)
tree	e02b77084f28a18df6b854f87a986124db44d717 /klm/lm/neural
parent	bd9308e22b5434aa220cc57d82ee867464a011f1 (diff)
parent	796768086a687d3f1856fef6489c34fe4d373642 (diff)