1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
#include "lm/interpolate/arpa_to_stream.hh"
// TODO: should this move out of builder?
#include "lm/builder/ngram_stream.hh"
#include "lm/read_arpa.hh"
#include "lm/vocab.hh"
namespace lm { namespace interpolate {
// Construct a reader over the ARPA file open on fd and parse its header
// immediately.
//
// fd:    file descriptor handed to in_ for reading.
// vocab: vocabulary held by reference in vocab_; Run() inserts the unigram
//        words into it.
//
// Once the constructor returns, counts_ has been filled in by ReadARPACounts
// and in_ is positioned for reading the body of the ARPA file.
ARPAToStream::ARPAToStream(int fd, ngram::GrowableVocab<ngram::WriteUniqueWords> &vocab)
    : in_(fd),
      vocab_(vocab) {
  ReadARPACounts(in_, counts_);
}
// Read the body of the ARPA file and write each order's n-grams to its own
// stream.
//
// positions: chain positions used to construct the per-order streams.  The
//   code indexes streams[0] through streams[counts_.size() - 1], so it
//   presumably needs one position per order in counts_ (not checked here).
//
// Unigrams are read first: each word is inserted into vocab_ and the id
// returned by FindOrInsert is stored in the first slot of the stream entry.
// Orders 2..counts_.size() are then read with ReadNGram.  Every stream is
// poisoned as soon as all entries of its order have been written.
void ARPAToStream::Run(const util::stream::ChainPositions &positions) {
  // Make one stream for each order.
  builder::NGramStreams streams(positions);
  PositiveProbWarn warn;

  // Unigrams are handled specially because they're being inserted into the vocab.
  ReadNGramHeader(in_, 1);
  for (uint64_t i = 0; i < counts_[0]; ++i, ++streams[0]) {
    streams[0]->begin()[0] = vocab_.FindOrInsert(Read1Gram(in_, streams[0]->Value().complete, warn));
  }
  // Finish off the unigram stream.
  streams[0].Poison();

  // TODO: don't waste backoff field for highest order.
  for (unsigned char n = 2; n <= counts_.size(); ++n) {
    ReadNGramHeader(in_, n);
    builder::NGramStream &stream = streams[n - 1];
    const uint64_t end = counts_[n - 1];
    // The counter has the same 64-bit type as the count it is compared with
    // (and as the unigram loop above).  A narrower std::size_t would wrap
    // before reaching a count that does not fit in it, and the loop would
    // never terminate.
    for (uint64_t i = 0; i < end; ++i, ++stream) {
      ReadNGram(in_, n, vocab_, stream->begin(), stream->Value().complete, warn);
    }
    // Finish the stream for this order.
    stream.Poison();
  }
}
}} // namespaces
|