diff options
author | armatthews <armatthe@cmu.edu> | 2014-10-13 14:59:23 -0400 |
---|---|---|
committer | armatthews <armatthe@cmu.edu> | 2014-10-13 14:59:23 -0400 |
commit | 9a06ff1465eb3477ac3d1e92ab52e7eae40316a8 (patch) | |
tree | 808c266a3f510d00f37cd19c3f1da91d8fc683f7 /klm/lm/interpolate/arpa_to_stream.cc | |
parent | e51da099233df0a384b04fe5908b30e44040d13e (diff) | |
parent | d3e2ec203a5cf550320caa8023ac3dd103b0be7d (diff) |
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'klm/lm/interpolate/arpa_to_stream.cc')
-rw-r--r-- | klm/lm/interpolate/arpa_to_stream.cc | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/klm/lm/interpolate/arpa_to_stream.cc b/klm/lm/interpolate/arpa_to_stream.cc new file mode 100644 index 00000000..f2696f39 --- /dev/null +++ b/klm/lm/interpolate/arpa_to_stream.cc @@ -0,0 +1,47 @@ +#include "lm/interpolate/arpa_to_stream.hh" + +// TODO: should this move out of builder? +#include "lm/builder/ngram_stream.hh" +#include "lm/read_arpa.hh" +#include "lm/vocab.hh" + +namespace lm { namespace interpolate { + +ARPAToStream::ARPAToStream(int fd, ngram::GrowableVocab<ngram::WriteUniqueWords> &vocab) + : in_(fd), vocab_(vocab) { + + // Read the ARPA file header. + // + // After the following call, counts_ will be correctly initialized, + // and in_ will be positioned for reading the body of the ARPA file. + ReadARPACounts(in_, counts_); + +} + +void ARPAToStream::Run(const util::stream::ChainPositions &positions) { + // Make one stream for each order. + builder::NGramStreams streams(positions); + PositiveProbWarn warn; + + // Unigrams are handled specially because they're being inserted into the vocab. + ReadNGramHeader(in_, 1); + for (uint64_t i = 0; i < counts_[0]; ++i, ++streams[0]) { + streams[0]->begin()[0] = vocab_.FindOrInsert(Read1Gram(in_, streams[0]->Value().complete, warn)); + } + // Finish off the unigram stream. + streams[0].Poison(); + + // TODO: don't waste backoff field for highest order. + for (unsigned char n = 2; n <= counts_.size(); ++n) { + ReadNGramHeader(in_, n); + builder::NGramStream &stream = streams[n - 1]; + const uint64_t end = counts_[n - 1]; + for (std::size_t i = 0; i < end; ++i, ++stream) { + ReadNGram(in_, n, vocab_, stream->begin(), stream->Value().complete, warn); + } + // Finish the stream for n-grams.. + stream.Poison(); + } +} + +}} // namespaces |