summary | refs | log | tree | commit | diff
path: root/klm/lm/interpolate/arpa_to_stream.cc
diff options
context:
space:
mode:
author: Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:11:38 -0500
committer: Wu, Ke <wuke@cs.umd.edu> 2014-12-17 16:11:38 -0500
commit: 7468e8d85e99b4619442c7afaf4a0d92870111bb (patch)
tree: a6f17da7c69048c8900260b5490bb9d8611be3bb /klm/lm/interpolate/arpa_to_stream.cc
parent: b6dd5a683db9dda2d634dd2fdb76606819594901 (diff)
parent: 1a79175f9a101d46cf27ca921213d5dd9300518f (diff)
Merge with upstream
Diffstat (limited to 'klm/lm/interpolate/arpa_to_stream.cc')
-rw-r--r-- klm/lm/interpolate/arpa_to_stream.cc 47
1 files changed, 47 insertions, 0 deletions
diff --git a/klm/lm/interpolate/arpa_to_stream.cc b/klm/lm/interpolate/arpa_to_stream.cc
new file mode 100644
index 00000000..f2696f39
--- /dev/null
+++ b/klm/lm/interpolate/arpa_to_stream.cc
@@ -0,0 +1,47 @@
+#include "lm/interpolate/arpa_to_stream.hh"
+
+// TODO: should this move out of builder?
+#include "lm/builder/ngram_stream.hh"
+#include "lm/read_arpa.hh"
+#include "lm/vocab.hh"
+
+namespace lm { namespace interpolate {
+
+// Wraps the ARPA file open on fd and binds the vocabulary that Run() will
+// populate. The file header is consumed here so that Run() can start directly
+// at the first n-gram section.
+ARPAToStream::ARPAToStream(int fd, ngram::GrowableVocab<ngram::WriteUniqueWords> &vocab)
+    : in_(fd),
+      vocab_(vocab) {
+  // Parse the header: this fills counts_ and leaves in_ positioned at the
+  // start of the ARPA body.
+  ReadARPACounts(in_, counts_);
+}
+
+// Reads the body of the ARPA file from in_ and writes every entry into the
+// stream for its order: streams[n - 1] receives the n-grams of order n, so
+// positions must provide at least counts_.size() chain positions.
+//
+// Unigrams are read first; each word is inserted into vocab_ and the returned
+// id is stored as the record's single word. Higher orders are read with
+// ReadNGram, which is handed vocab_ and the record's word array. Each stream
+// is poisoned as soon as its section of the file has been fully read.
+void ARPAToStream::Run(const util::stream::ChainPositions &positions) {
+  // Make one stream for each order.
+  builder::NGramStreams streams(positions);
+  PositiveProbWarn warn;
+
+  // Unigrams are handled specially because they're being inserted into the vocab.
+  ReadNGramHeader(in_, 1);
+  for (uint64_t i = 0; i < counts_[0]; ++i, ++streams[0]) {
+    streams[0]->begin()[0] = vocab_.FindOrInsert(Read1Gram(in_, streams[0]->Value().complete, warn));
+  }
+  // Finish off the unigram stream.
+  streams[0].Poison();
+
+  // TODO: don't waste backoff field for highest order.
+  for (unsigned char n = 2; n <= counts_.size(); ++n) {
+    ReadNGramHeader(in_, n);
+    builder::NGramStream &stream = streams[n - 1];
+    const uint64_t end = counts_[n - 1];
+    // The index must be as wide as the count: with a 32-bit std::size_t the
+    // comparison against a count of 2^32 or more would never become false.
+    for (uint64_t i = 0; i < end; ++i, ++stream) {
+      ReadNGram(in_, n, vocab_, stream->begin(), stream->Value().complete, warn);
+    }
+    // Finish the stream for this order.
+    stream.Poison();
+  }
+}
+
+}} // namespaces