Merge branch 'master' of github.com:redpony/cdec

author: armatthews <armatthe@cmu.edu> 2014-10-13 14:59:23 -0400
committer: armatthews <armatthe@cmu.edu> 2014-10-13 14:59:23 -0400
commit: 9a06ff1465eb3477ac3d1e92ab52e7eae40316a8 (patch)
tree: 808c266a3f510d00f37cd19c3f1da91d8fc683f7 /klm/lm/interpolate/arpa_to_stream.cc
parent: e51da099233df0a384b04fe5908b30e44040d13e (diff)
parent: d3e2ec203a5cf550320caa8023ac3dd103b0be7d (diff)
1 files changed, 47 insertions, 0 deletions
diff --git a/klm/lm/interpolate/arpa_to_stream.cc b/klm/lm/interpolate/arpa_to_stream.cc
new file mode 100644
index 00000000..f2696f39
--- /dev/null
+++ b/klm/lm/interpolate/arpa_to_stream.cc
@@ -0,0 +1,47 @@
+#include "lm/interpolate/arpa_to_stream.hh"
+
+// TODO: should this move out of builder?
+#include "lm/builder/ngram_stream.hh"
+#include "lm/read_arpa.hh"
+#include "lm/vocab.hh"
+
+namespace lm { namespace interpolate {
+
+// Builds the reader: in_ is constructed from fd and vocab_ from vocab, then
+// the ARPA header is consumed so that Run() can start directly on the body.
+ARPAToStream::ARPAToStream(int fd, ngram::GrowableVocab<ngram::WriteUniqueWords> &vocab)
+  : in_(fd),
+    vocab_(vocab) {
+  // Parsing the header fills counts_ with the n-gram counts and leaves in_
+  // positioned at the start of the ARPA body; Run() indexes counts_ by
+  // (order - 1) when reading each section.
+  ReadARPACounts(in_, counts_);
+}
+
+// Reads the body of the ARPA file from in_, writing the n-grams of order n
+// into streams[n - 1] (built from positions) and poisoning each when done.
+void ARPAToStream::Run(const util::stream::ChainPositions &positions) {
+  // Make one stream for each order.
+  builder::NGramStreams streams(positions);
+  PositiveProbWarn warn;
+  // Unigrams are handled specially because they're being inserted into the vocab.
+  ReadNGramHeader(in_, 1);
+  for (uint64_t i = 0; i < counts_[0]; ++i, ++streams[0]) {
+    streams[0]->begin()[0] = vocab_.FindOrInsert(Read1Gram(in_, streams[0]->Value().complete, warn));
+  }
+  // Finish off the unigram stream.
+  streams[0].Poison();
+  // TODO: don't waste backoff field for highest order.
+  for (unsigned char n = 2; n <= counts_.size(); ++n) {
+    ReadNGramHeader(in_, n);
+    builder::NGramStream &stream = streams[n - 1];
+    const uint64_t end = counts_[n - 1];
+    // The bound is 64-bit, so the index is too: std::size_t may be narrower.
+    for (uint64_t i = 0; i < end; ++i, ++stream) {
+      ReadNGram(in_, n, vocab_, stream->begin(), stream->Value().complete, warn);
+    }
+    stream.Poison();
+  }
+}
+
+}} // namespaces
author	armatthews <armatthe@cmu.edu>	2014-10-13 14:59:23 -0400
committer	armatthews <armatthe@cmu.edu>	2014-10-13 14:59:23 -0400
commit	9a06ff1465eb3477ac3d1e92ab52e7eae40316a8 (patch)
tree	808c266a3f510d00f37cd19c3f1da91d8fc683f7 /klm/lm/interpolate/arpa_to_stream.cc
parent	e51da099233df0a384b04fe5908b30e44040d13e (diff)
parent	d3e2ec203a5cf550320caa8023ac3dd103b0be7d (diff)