From d3e2ec203a5cf550320caa8023ac3dd103b0be7d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 13 Oct 2014 00:42:37 -0400 Subject: new kenlm --- klm/lm/read_arpa.hh | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'klm/lm/read_arpa.hh') diff --git a/klm/lm/read_arpa.hh b/klm/lm/read_arpa.hh index 234d130c..64eeef30 100644 --- a/klm/lm/read_arpa.hh +++ b/klm/lm/read_arpa.hh @@ -1,5 +1,5 @@ -#ifndef LM_READ_ARPA__ -#define LM_READ_ARPA__ +#ifndef LM_READ_ARPA_H +#define LM_READ_ARPA_H #include "lm/lm_exception.hh" #include "lm/word_index.hh" @@ -28,7 +28,7 @@ void ReadEnd(util::FilePiece &in); extern const bool kARPASpaces[256]; -// Positive log probability warning. +// Positive log probability warning. class PositiveProbWarn { public: PositiveProbWarn() : action_(THROW_UP) {} @@ -48,17 +48,17 @@ template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &voca warn.Warn(prob); prob = 0.0; } - if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability"); - Weights &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))]; - value.prob = prob; - ReadBackoff(f, value); + UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability"); + WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces)); + Weights &w = unigrams[word]; + w.prob = prob; + ReadBackoff(f, w); } catch(util::Exception &e) { e << " in the 1-gram at byte " << f.Offset(); throw; } } -// Return true if a positive log probability came out. template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { ReadNGramHeader(f, 1); for (std::size_t i = 0; i < count; ++i) { @@ -67,16 +67,21 @@ template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::siz vocab.FinishedLoading(unigrams); } -// Return true if a positive log probability came out. 
-template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) { +// Read ngram, write vocab ids to indices_out. +template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) { try { weights.prob = f.ReadFloat(); if (weights.prob > 0.0) { warn.Warn(weights.prob); weights.prob = 0.0; } - for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) { - *vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces)); + for (unsigned char i = 0; i < n; ++i, ++indices_out) { + StringPiece word(f.ReadDelimited(kARPASpaces)); + WordIndex index = vocab.Index(word); + *indices_out = index; + // Check for words mapped to <unk> that are not the string <unk>. + UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)), + FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears"); } ReadBackoff(f, weights); } catch(util::Exception &e) { @@ -87,4 +92,4 @@ template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const uns } // namespace lm -#endif // LM_READ_ARPA__ +#endif // LM_READ_ARPA_H -- cgit v1.2.3