Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-10-13 19:03:48 +0100
committer: Patrick Simianer <p@simianer.de> 2014-10-13 19:03:48 +0100
commit: cb9fb7088dde35881516c088db402abe747d49fa (patch)
tree: a91e4935a7941f1b261f76d88ab41fa3078a1891 /klm/lm/interpolate/arpa_to_stream.cc
parent: 0a00e57e921c8eca8e02364db7d2e6607bfdcebc (diff)
parent: b1ed81ef3216b212295afa76c5d20a56fb647204 (diff)
1 files changed, 47 insertions, 0 deletions
diff --git a/klm/lm/interpolate/arpa_to_stream.cc b/klm/lm/interpolate/arpa_to_stream.cc
new file mode 100644
index 00000000..f2696f39
--- /dev/null
+++ b/klm/lm/interpolate/arpa_to_stream.cc
@@ -0,0 +1,47 @@
+#include "lm/interpolate/arpa_to_stream.hh"
+
+// TODO: should this move out of builder?
+#include "lm/builder/ngram_stream.hh"
+#include "lm/read_arpa.hh"
+#include "lm/vocab.hh"
+
+namespace lm { namespace interpolate {
+
+ARPAToStream::ARPAToStream(int fd, ngram::GrowableVocab<ngram::WriteUniqueWords> &vocab)
+  : in_(fd), vocab_(vocab) {
+    
+  // Read the ARPA file header.
+  //
+  // After the following call, counts_ will be correctly initialized,
+  // and in_ will be positioned for reading the body of the ARPA file.  
+  ReadARPACounts(in_, counts_);
+  
+}
+
+void ARPAToStream::Run(const util::stream::ChainPositions &positions) {
+  // Make one stream for each order.
+  builder::NGramStreams streams(positions);
+  PositiveProbWarn warn;
+
+  // Unigrams are handled specially because they're being inserted into the vocab.
+  ReadNGramHeader(in_, 1);
+  for (uint64_t i = 0; i < counts_[0]; ++i, ++streams[0]) {
+    streams[0]->begin()[0] = vocab_.FindOrInsert(Read1Gram(in_, streams[0]->Value().complete, warn));
+  }
+  // Finish off the unigram stream.
+  streams[0].Poison();
+
+  // TODO: don't waste backoff field for highest order.
+  for (unsigned char n = 2; n <= counts_.size(); ++n) {
+    ReadNGramHeader(in_, n);
+    builder::NGramStream &stream = streams[n - 1];
+    const uint64_t end = counts_[n - 1];
+    for (std::size_t i = 0; i < end; ++i, ++stream) {
+      ReadNGram(in_, n, vocab_, stream->begin(), stream->Value().complete, warn);
+    }
+    // Finish the stream for n-grams..
+    stream.Poison();
+  }
+}
+
+}} // namespaces
author	Patrick Simianer <p@simianer.de>	2014-10-13 19:03:48 +0100
committer	Patrick Simianer <p@simianer.de>	2014-10-13 19:03:48 +0100
commit	cb9fb7088dde35881516c088db402abe747d49fa (patch)
tree	a91e4935a7941f1b261f76d88ab41fa3078a1891 /klm/lm/interpolate/arpa_to_stream.cc
parent	0a00e57e921c8eca8e02364db7d2e6607bfdcebc (diff)
parent	b1ed81ef3216b212295afa76c5d20a56fb647204 (diff)