From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Fri, 18 Jan 2013 17:12:51 +0000
Subject: KenLM dffafbf with lmplz source (but not built)

---
 klm/lm/filter/vocab.cc | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 klm/lm/filter/vocab.cc

(limited to 'klm/lm/filter/vocab.cc')
diff --git a/klm/lm/filter/vocab.cc b/klm/lm/filter/vocab.cc
new file mode 100644
index 00000000..7ee4e84b
--- /dev/null
+++ b/klm/lm/filter/vocab.cc
@@ -0,0 +1,54 @@
+#include "lm/filter/vocab.hh"
+
+#include <istream>
+#include <iostream>
+
+#include <ctype.h>
+#include <err.h>
+
+namespace lm {
+namespace vocab {
+
+// Add each whitespace-delimited token of `in` to `out` until extraction stops
+// (normally at end of input).  Also switches the stream's exception mask to
+// badbit, so an unrecoverable read error throws instead of ending the loop.
+void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) {
+  in.exceptions(std::istream::badbit);
+  for (std::string token; in >> token; ) out.insert(token);
+}
+
+namespace {
+// Consume blanks after a word.  True if a newline or a failed read (end of
+// input) comes first; false if another word follows (its first char is put back).
+bool IsLineEnd(std::istream &in) {
+  for (int next = in.get(); in && next != '\n'; next = in.get()) {
+    if (isspace(next)) continue;
+    in.unget();
+    return false;
+  }
+  return true;
+}
+} // namespace
+
+// Index a one-sentence-per-line stream: for each whitespace-delimited word,
+// append to out[word] the 0-based id of every sentence it occurs in.  Lines
+// may be very long, so input is consumed word by word, never a whole line.
+// With the default skipws flag, operator>> skips wordless lines, which
+// therefore consume no id.  Returns the number of sentences read.
+unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
+  in.exceptions(std::istream::badbit);
+  unsigned int line_id = 0;
+  bool line_open = false;  // line_id has words but its end is not yet seen.
+  for (std::string token; in >> token; ) {
+    std::vector<unsigned int> &ids = out[token];
+    // line_id never decreases, so a repeat on this line is the last entry.
+    if (ids.empty() || ids.back() != line_id) ids.push_back(line_id);
+    line_open = !IsLineEnd(in);
+    if (!line_open) ++line_id;
+  }
+  // Count a final sentence whose end was never reported by IsLineEnd.
+  return line_open ? line_id + 1 : line_id;
+}
+
+} // namespace vocab
+} // namespace lm
-- 
cgit v1.2.3