diff options
author | Chris Dyer <redpony@gmail.com> | 2014-10-13 00:42:37 -0400 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2014-10-13 00:42:37 -0400 |
commit | b1ed81ef3216b212295afa76c5d20a56fb647204 (patch) | |
tree | 9633cdc1b8a341dfa58b0b7fec0e2cae44d28835 /klm/lm/builder | |
parent | 1b17f61d359be6e1c3cea29f8c100db3bcdd73a0 (diff) |
new kenlm
Diffstat (limited to 'klm/lm/builder')
24 files changed, 958 insertions, 275 deletions
diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am index 38259c51..bb15ff04 100644 --- a/klm/lm/builder/Makefile.am +++ b/klm/lm/builder/Makefile.am @@ -1,4 +1,8 @@ -bin_PROGRAMS = lmplz +bin_PROGRAMS = lmplz dump_counts + +dump_counts_SOURCES = \ + print.cc \ + dump_counts_main.cc lmplz_SOURCES = \ lmplz_main.cc \ @@ -7,6 +11,7 @@ lmplz_SOURCES = \ corpus_count.cc \ corpus_count.hh \ discount.hh \ + hash_gamma.hh \ header_info.hh \ initial_probabilities.cc \ initial_probabilities.hh \ @@ -22,6 +27,7 @@ lmplz_SOURCES = \ print.hh \ sort.hh +dump_counts_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_THREAD_LIBS) lmplz_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_THREAD_LIBS) AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm diff --git a/klm/lm/builder/adjust_counts.cc b/klm/lm/builder/adjust_counts.cc index a6f48011..803c557d 100644 --- a/klm/lm/builder/adjust_counts.cc +++ b/klm/lm/builder/adjust_counts.cc @@ -1,8 +1,9 @@ #include "lm/builder/adjust_counts.hh" -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "util/stream/timer.hh" #include <algorithm> +#include <iostream> namespace lm { namespace builder { @@ -10,56 +11,78 @@ BadDiscountException::BadDiscountException() throw() {} BadDiscountException::~BadDiscountException() throw() {} namespace { -// Return last word in full that is different. +// Return last word in full that is different. const WordIndex* FindDifference(const NGram &full, const NGram &lower_last) { const WordIndex *cur_word = full.end() - 1; const WordIndex *pre_word = lower_last.end() - 1; - // Find last difference. + // Find last difference. 
for (; pre_word >= lower_last.begin() && *pre_word == *cur_word; --cur_word, --pre_word) {} return cur_word; } class StatCollector { public: - StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<Discount> &discounts) - : orders_(order), full_(orders_.back()), counts_(counts), discounts_(discounts) { + StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts) + : orders_(order), full_(orders_.back()), counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts) { memset(&orders_[0], 0, sizeof(OrderStat) * order); } ~StatCollector() {} - void CalculateDiscounts() { + void CalculateDiscounts(const DiscountConfig &config) { counts_.resize(orders_.size()); - discounts_.resize(orders_.size()); + counts_pruned_.resize(orders_.size()); for (std::size_t i = 0; i < orders_.size(); ++i) { const OrderStat &s = orders_[i]; counts_[i] = s.count; + counts_pruned_[i] = s.count_pruned; + } - for (unsigned j = 1; j < 4; ++j) { - // TODO: Specialize error message for j == 3, meaning 3+ - UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " - << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " - << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?"); - } - - // See equation (26) in Chen and Goodman. 
- discounts_[i].amount[0] = 0.0; - float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]); - for (unsigned j = 1; j < 4; ++j) { - discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]); - UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]); + discounts_ = config.overwrite; + discounts_.resize(orders_.size()); + for (std::size_t i = config.overwrite.size(); i < orders_.size(); ++i) { + const OrderStat &s = orders_[i]; + try { + for (unsigned j = 1; j < 4; ++j) { + // TODO: Specialize error message for j == 3, meaning 3+ + UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " + << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " + << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?"); + } + + // See equation (26) in Chen and Goodman. 
+ discounts_[i].amount[0] = 0.0; + float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]); + for (unsigned j = 1; j < 4; ++j) { + discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]); + UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]); + } + } catch (const BadDiscountException &e) { + switch (config.bad_action) { + case THROW_UP: + throw; + case COMPLAIN: + std::cerr << e.what() << " Substituting fallback discounts D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl; + case SILENT: + break; + } + discounts_[i] = config.fallback; } } } - void Add(std::size_t order_minus_1, uint64_t count) { + void Add(std::size_t order_minus_1, uint64_t count, bool pruned = false) { OrderStat &stat = orders_[order_minus_1]; ++stat.count; + if (!pruned) + ++stat.count_pruned; if (count < 5) ++stat.n[count]; } - void AddFull(uint64_t count) { + void AddFull(uint64_t count, bool pruned = false) { ++full_.count; + if (!pruned) + ++full_.count_pruned; if (count < 5) ++full_.n[count]; } @@ -68,24 +91,27 @@ class StatCollector { // n_1 in equation 26 of Chen and Goodman etc uint64_t n[5]; uint64_t count; + uint64_t count_pruned; }; std::vector<OrderStat> orders_; OrderStat &full_; std::vector<uint64_t> &counts_; + std::vector<uint64_t> &counts_pruned_; std::vector<Discount> &discounts_; }; -// Reads all entries in order like NGramStream does. +// Reads all entries in order like NGramStream does. // But deletes any entries that have <s> in the 1st (not 0th) position on the // way out by putting other entries in their place. This disrupts the sort -// order but we don't care because the data is going to be sorted again. 
+// order but we don't care because the data is going to be sorted again. class CollapseStream { public: - CollapseStream(const util::stream::ChainPosition &position) : + CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold) : current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), - block_(position) { + prune_threshold_(prune_threshold), + block_(position) { StartBlock(); } @@ -96,10 +122,18 @@ class CollapseStream { CollapseStream &operator++() { assert(block_); + if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) { memcpy(current_.Base(), copy_from_, current_.TotalSize()); UpdateCopyFrom(); + + // Mark highest order n-grams for later pruning + if(current_.Count() <= prune_threshold_) { + current_.Mark(); + } + } + current_.NextInMemory(); uint8_t *block_base = static_cast<uint8_t*>(block_->Get()); if (current_.Base() == block_base + block_->ValidSize()) { @@ -107,6 +141,12 @@ class CollapseStream { ++block_; StartBlock(); } + + // Mark highest order n-grams for later pruning + if(current_.Count() <= prune_threshold_) { + current_.Mark(); + } + return *this; } @@ -119,9 +159,15 @@ class CollapseStream { current_.ReBase(block_->Get()); copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize(); UpdateCopyFrom(); + + // Mark highest order n-grams for later pruning + if(current_.Count() <= prune_threshold_) { + current_.Mark(); + } + } - // Find last without bos. + // Find last without bos. 
void UpdateCopyFrom() { for (copy_from_ -= current_.TotalSize(); copy_from_ >= current_.Base(); copy_from_ -= current_.TotalSize()) { if (NGram(copy_from_, current_.Order()).begin()[1] != kBOS) break; @@ -132,83 +178,107 @@ class CollapseStream { // Goes backwards in the block uint8_t *copy_from_; - + uint64_t prune_threshold_; util::stream::Link block_; }; } // namespace -void AdjustCounts::Run(const ChainPositions &positions) { +void AdjustCounts::Run(const util::stream::ChainPositions &positions) { UTIL_TIMER("(%w s) Adjusted counts\n"); const std::size_t order = positions.size(); - StatCollector stats(order, counts_, discounts_); + StatCollector stats(order, counts_, counts_pruned_, discounts_); if (order == 1) { + // Only unigrams. Just collect stats. for (NGramStream full(positions[0]); full; ++full) stats.AddFull(full->Count()); - stats.CalculateDiscounts(); + + stats.CalculateDiscounts(discount_config_); return; } NGramStreams streams; streams.Init(positions, positions.size() - 1); - CollapseStream full(positions[positions.size() - 1]); + + CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back()); - // Initialization: <unk> has count 0 and so does <s>. + // Initialization: <unk> has count 0 and so does <s>. NGramStream *lower_valid = streams.begin(); streams[0]->Count() = 0; *streams[0]->begin() = kUNK; stats.Add(0, 0); (++streams[0])->Count() = 0; *streams[0]->begin() = kBOS; - // not in stats because it will get put in later. + // not in stats because it will get put in later. + std::vector<uint64_t> lower_counts(positions.size(), 0); + // iterate over full (the stream of the highest order ngrams) - for (; full; ++full) { + for (; full; ++full) { const WordIndex *different = FindDifference(*full, **lower_valid); std::size_t same = full->end() - 1 - different; - // Increment the adjusted count. + // Increment the adjusted count. if (same) ++streams[same - 1]->Count(); - // Output all the valid ones that changed. 
+ // Output all the valid ones that changed. for (; lower_valid >= &streams[same]; --lower_valid) { - stats.Add(lower_valid - streams.begin(), (*lower_valid)->Count()); + + // mjd: review this! + uint64_t order = (*lower_valid)->Order(); + uint64_t realCount = lower_counts[order - 1]; + if(order > 1 && prune_thresholds_[order - 1] && realCount <= prune_thresholds_[order - 1]) + (*lower_valid)->Mark(); + + stats.Add(lower_valid - streams.begin(), (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked()); ++*lower_valid; } + + // Count the true occurrences of lower-order n-grams + for (std::size_t i = 0; i < lower_counts.size(); ++i) { + if (i >= same) { + lower_counts[i] = 0; + } + lower_counts[i] += full->UnmarkedCount(); + } // This is here because bos is also const WordIndex *, so copy gets - // consistent argument types. + // consistent argument types. const WordIndex *full_end = full->end(); - // Initialize and mark as valid up to bos. + // Initialize and mark as valid up to bos. const WordIndex *bos; for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) { ++lower_valid; std::copy(bos, full_end, (*lower_valid)->begin()); (*lower_valid)->Count() = 1; } - // Now bos indicates where <s> is or is the 0th word of full. + // Now bos indicates where <s> is or is the 0th word of full. if (bos != full->begin()) { - // There is an <s> beyond the 0th word. + // There is an <s> beyond the 0th word. NGramStream &to = *++lower_valid; std::copy(bos, full_end, to->begin()); - to->Count() = full->Count(); + + // mjd: what is this doing? + to->Count() = full->UnmarkedCount(); } else { - stats.AddFull(full->Count()); + stats.AddFull(full->UnmarkedCount(), full->IsMarked()); } assert(lower_valid >= &streams[0]); } // Output everything valid. 
for (NGramStream *s = streams.begin(); s <= lower_valid; ++s) { - stats.Add(s - streams.begin(), (*s)->Count()); + if((*s)->Count() <= prune_thresholds_[(*s)->Order() - 1]) + (*s)->Mark(); + stats.Add(s - streams.begin(), (*s)->UnmarkedCount(), (*s)->IsMarked()); ++*s; } - // Poison everyone! Except the N-grams which were already poisoned by the input. + // Poison everyone! Except the N-grams which were already poisoned by the input. for (NGramStream *s = streams.begin(); s != streams.end(); ++s) s->Poison(); - stats.CalculateDiscounts(); + stats.CalculateDiscounts(discount_config_); // NOTE: See special early-return case for unigrams near the top of this function } diff --git a/klm/lm/builder/adjust_counts.hh b/klm/lm/builder/adjust_counts.hh index f38ff79d..a5435c28 100644 --- a/klm/lm/builder/adjust_counts.hh +++ b/klm/lm/builder/adjust_counts.hh @@ -1,24 +1,35 @@ -#ifndef LM_BUILDER_ADJUST_COUNTS__ -#define LM_BUILDER_ADJUST_COUNTS__ +#ifndef LM_BUILDER_ADJUST_COUNTS_H +#define LM_BUILDER_ADJUST_COUNTS_H #include "lm/builder/discount.hh" +#include "lm/lm_exception.hh" #include "util/exception.hh" #include <vector> #include <stdint.h> +namespace util { namespace stream { class ChainPositions; } } + namespace lm { namespace builder { -class ChainPositions; - class BadDiscountException : public util::Exception { public: BadDiscountException() throw(); ~BadDiscountException() throw(); }; +struct DiscountConfig { + // Overrides discounts for orders [1,discount_override.size()]. + std::vector<Discount> overwrite; + // If discounting fails for an order, copy them from here. + Discount fallback; + // What to do when discounts are out of range or would trigger divison by + // zero. It it does something other than THROW_UP, use fallback_discount. + WarningAction bad_action; +}; + /* Compute adjusted counts. * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. * Output: [1,N]-grams with adjusted counts. 
@@ -27,18 +38,32 @@ class BadDiscountException : public util::Exception { */ class AdjustCounts { public: - AdjustCounts(std::vector<uint64_t> &counts, std::vector<Discount> &discounts) - : counts_(counts), discounts_(discounts) {} + // counts: output + // counts_pruned: output + // discounts: mostly output. If the input already has entries, they will be kept. + // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned. + AdjustCounts( + const std::vector<uint64_t> &prune_thresholds, + std::vector<uint64_t> &counts, + std::vector<uint64_t> &counts_pruned, + const DiscountConfig &discount_config, + std::vector<Discount> &discounts) + : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), discount_config_(discount_config), discounts_(discounts) + {} - void Run(const ChainPositions &positions); + void Run(const util::stream::ChainPositions &positions); private: + const std::vector<uint64_t> &prune_thresholds_; std::vector<uint64_t> &counts_; + std::vector<uint64_t> &counts_pruned_; + + DiscountConfig discount_config_; std::vector<Discount> &discounts_; }; } // namespace builder } // namespace lm -#endif // LM_BUILDER_ADJUST_COUNTS__ +#endif // LM_BUILDER_ADJUST_COUNTS_H diff --git a/klm/lm/builder/adjust_counts_test.cc b/klm/lm/builder/adjust_counts_test.cc index 68b5f33e..073c5dfe 100644 --- a/klm/lm/builder/adjust_counts_test.cc +++ b/klm/lm/builder/adjust_counts_test.cc @@ -1,6 +1,6 @@ #include "lm/builder/adjust_counts.hh" -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "util/scoped.hh" #include <boost/thread/thread.hpp> @@ -61,19 +61,24 @@ BOOST_AUTO_TEST_CASE(Simple) { util::stream::ChainConfig config; config.total_memory = 100; config.block_count = 1; - Chains chains(4); + util::stream::Chains chains(4); for (unsigned i = 0; i < 4; ++i) { config.entry_size = NGram::TotalSize(i + 1); chains.push_back(config); } chains[3] >> WriteInput(); - ChainPositions 
for_adjust(chains); + util::stream::ChainPositions for_adjust(chains); for (unsigned i = 0; i < 4; ++i) { chains[i] >> boost::ref(outputs[i]); } chains >> util::stream::kRecycle; - BOOST_CHECK_THROW(AdjustCounts(counts, discount).Run(for_adjust), BadDiscountException); + std::vector<uint64_t> counts_pruned(4); + std::vector<uint64_t> prune_thresholds(4); + DiscountConfig discount_config; + discount_config.fallback = Discount(); + discount_config.bad_action = THROW_UP; + BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException); } BOOST_REQUIRE_EQUAL(4UL, counts.size()); BOOST_CHECK_EQUAL(4UL, counts[0]); diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index ccc06efc..590e79fa 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -2,6 +2,7 @@ #include "lm/builder/ngram.hh" #include "lm/lm_exception.hh" +#include "lm/vocab.hh" #include "lm/word_index.hh" #include "util/fake_ofstream.hh" #include "util/file.hh" @@ -37,60 +38,6 @@ struct VocabEntry { }; #pragma pack(pop) -const float kProbingMultiplier = 1.5; - -class VocabHandout { - public: - static std::size_t MemUsage(WordIndex initial_guess) { - if (initial_guess < 2) initial_guess = 2; - return util::CheckOverflow(Table::Size(initial_guess, kProbingMultiplier)); - } - - explicit VocabHandout(int fd, WordIndex initial_guess) : - table_backing_(util::CallocOrThrow(MemUsage(initial_guess))), - table_(table_backing_.get(), MemUsage(initial_guess)), - double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)), - word_list_(fd) { - Lookup("<unk>"); // Force 0 - Lookup("<s>"); // Force 1 - Lookup("</s>"); // Force 2 - } - - WordIndex Lookup(const StringPiece &word) { - VocabEntry entry; - entry.key = util::MurmurHashNative(word.data(), word.size()); - entry.value = table_.SizeNoSerialization(); - - Table::MutableIterator it; - if (table_.FindOrInsert(entry, it)) - return 
it->value; - word_list_ << word << '\0'; - UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh."); - if (Size() >= double_cutoff_) { - table_backing_.call_realloc(table_.DoubleTo()); - table_.Double(table_backing_.get()); - double_cutoff_ *= 2; - } - return entry.value; - } - - WordIndex Size() const { - return table_.SizeNoSerialization(); - } - - private: - // TODO: factor out a resizable probing hash table. - // TODO: use mremap on linux to get all zeros on resizes. - util::scoped_malloc table_backing_; - - typedef util::ProbingHashTable<VocabEntry, util::IdentityHash> Table; - Table table_; - - std::size_t double_cutoff_; - - util::FakeOFStream word_list_; -}; - class DedupeHash : public std::unary_function<const WordIndex *, bool> { public: explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {} @@ -127,6 +74,10 @@ struct DedupeEntry { } }; + +// TODO: don't have this here, should be with probing hash table defaults? 
+const float kProbingMultiplier = 1.5; + typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe; class Writer { @@ -220,37 +171,50 @@ float CorpusCount::DedupeMultiplier(std::size_t order) { } std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) { - return VocabHandout::MemUsage(vocab_estimate); + return ngram::GrowableVocab<ngram::WriteUniqueWords>::MemUsage(vocab_estimate); } -CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) +CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block, WarningAction disallowed_symbol) : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count), dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), - dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) { + dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)), + disallowed_symbol_action_(disallowed_symbol) { } -void CorpusCount::Run(const util::stream::ChainPosition &position) { - UTIL_TIMER("(%w s) Counted n-grams\n"); +namespace { + void ComplainDisallowed(StringPiece word, WarningAction &action) { + switch (action) { + case SILENT: + return; + case COMPLAIN: + std::cerr << "Warning: " << word << " appears in the input. All instances of <s>, </s>, and <unk> will be interpreted as whitespace." << std::endl; + action = SILENT; + return; + case THROW_UP: + UTIL_THROW(FormatLoadException, "Special word " << word << " is not allowed in the corpus. I plan to support models containing <unk> in the future. 
Pass --skip_symbols to convert these symbols to whitespace."); + } + } +} // namespace - VocabHandout vocab(vocab_write_, type_count_); +void CorpusCount::Run(const util::stream::ChainPosition &position) { + ngram::GrowableVocab<ngram::WriteUniqueWords> vocab(type_count_, vocab_write_); token_count_ = 0; type_count_ = 0; - const WordIndex end_sentence = vocab.Lookup("</s>"); + const WordIndex end_sentence = vocab.FindOrInsert("</s>"); Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); uint64_t count = 0; bool delimiters[256]; - memset(delimiters, 0, sizeof(delimiters)); - const char kDelimiterSet[] = "\0\t\n\r "; - for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) { - delimiters[static_cast<unsigned char>(*i)] = true; - } + util::BoolCharacter::Build("\0\t\n\r ", delimiters); try { while(true) { StringPiece line(from_.ReadLine()); writer.StartSentence(); for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) { - WordIndex word = vocab.Lookup(*w); - UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing <unk> in the future."); + WordIndex word = vocab.FindOrInsert(*w); + if (word <= 2) { + ComplainDisallowed(*w, disallowed_symbol_action_); + continue; + } writer.Append(word); ++count; } diff --git a/klm/lm/builder/corpus_count.hh b/klm/lm/builder/corpus_count.hh index aa0ed8ed..da4ff9fc 100644 --- a/klm/lm/builder/corpus_count.hh +++ b/klm/lm/builder/corpus_count.hh @@ -1,6 +1,7 @@ -#ifndef LM_BUILDER_CORPUS_COUNT__ -#define LM_BUILDER_CORPUS_COUNT__ +#ifndef LM_BUILDER_CORPUS_COUNT_H +#define LM_BUILDER_CORPUS_COUNT_H +#include "lm/lm_exception.hh" #include "lm/word_index.hh" #include "util/scoped.hh" @@ -28,7 +29,7 @@ class CorpusCount { // token_count: out. // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value. 
- CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block); + CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block, WarningAction disallowed_symbol); void Run(const util::stream::ChainPosition &position); @@ -40,8 +41,10 @@ class CorpusCount { std::size_t dedupe_mem_size_; util::scoped_malloc dedupe_mem_; + + WarningAction disallowed_symbol_action_; }; } // namespace builder } // namespace lm -#endif // LM_BUILDER_CORPUS_COUNT__ +#endif // LM_BUILDER_CORPUS_COUNT_H diff --git a/klm/lm/builder/corpus_count_test.cc b/klm/lm/builder/corpus_count_test.cc index 6d325ef5..26cb6346 100644 --- a/klm/lm/builder/corpus_count_test.cc +++ b/klm/lm/builder/corpus_count_test.cc @@ -45,7 +45,7 @@ BOOST_AUTO_TEST_CASE(Short) { NGramStream stream; uint64_t token_count; WordIndex type_count = 10; - CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize()); + CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), SILENT); chain >> boost::ref(counter) >> stream >> util::stream::kRecycle; const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"}; diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh index 4d0aa4fd..e2f40846 100644 --- a/klm/lm/builder/discount.hh +++ b/klm/lm/builder/discount.hh @@ -1,5 +1,5 @@ -#ifndef BUILDER_DISCOUNT__ -#define BUILDER_DISCOUNT__ +#ifndef LM_BUILDER_DISCOUNT_H +#define LM_BUILDER_DISCOUNT_H #include <algorithm> @@ -23,4 +23,4 @@ struct Discount { } // namespace builder } // namespace lm -#endif // BUILDER_DISCOUNT__ +#endif // LM_BUILDER_DISCOUNT_H diff --git a/klm/lm/builder/dump_counts_main.cc b/klm/lm/builder/dump_counts_main.cc new file mode 100644 index 00000000..fa001679 --- /dev/null +++ b/klm/lm/builder/dump_counts_main.cc @@ -0,0 
+1,36 @@ +#include "lm/builder/print.hh" +#include "lm/word_index.hh" +#include "util/file.hh" +#include "util/read_compressed.hh" + +#include <boost/lexical_cast.hpp> + +#include <iostream> +#include <vector> + +int main(int argc, char *argv[]) { + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" + "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" + "counts. Each record has order many vocabulary ids.\n" + "The vocabulary file contains the words delimited by NULL in order of id.\n" + "The vocabulary file may not be compressed because it is mmapped but the counts\n" + "file can be compressed.\n"; + return 1; + } + util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); + util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); + lm::builder::VocabReconstitute vocab(vocab_file.get()); + unsigned int order = boost::lexical_cast<unsigned int>(argv[3]); + std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t)); + while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { + UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); + const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin()); + for (const lm::WordIndex *i = words; i != words + order; ++i) { + UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); + std::cout << vocab.Lookup(*i) << ' '; + } + // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FakeOFStream. 
+ std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n'; + } +} diff --git a/klm/lm/builder/hash_gamma.hh b/klm/lm/builder/hash_gamma.hh new file mode 100644 index 00000000..4bef47e8 --- /dev/null +++ b/klm/lm/builder/hash_gamma.hh @@ -0,0 +1,19 @@ +#ifndef LM_BUILDER_HASH_GAMMA__ +#define LM_BUILDER_HASH_GAMMA__ + +#include <stdint.h> + +namespace lm { namespace builder { + +#pragma pack(push) +#pragma pack(4) + +struct HashGamma { + uint64_t hash_value; + float gamma; +}; + +#pragma pack(pop) + +}} // namespaces +#endif // LM_BUILDER_HASH_GAMMA__ diff --git a/klm/lm/builder/header_info.hh b/klm/lm/builder/header_info.hh index ccca1456..16f3f609 100644 --- a/klm/lm/builder/header_info.hh +++ b/klm/lm/builder/header_info.hh @@ -1,5 +1,5 @@ -#ifndef LM_BUILDER_HEADER_INFO__ -#define LM_BUILDER_HEADER_INFO__ +#ifndef LM_BUILDER_HEADER_INFO_H +#define LM_BUILDER_HEADER_INFO_H #include <string> #include <stdint.h> diff --git a/klm/lm/builder/initial_probabilities.cc b/klm/lm/builder/initial_probabilities.cc index 58b42a20..5d19a897 100644 --- a/klm/lm/builder/initial_probabilities.cc +++ b/klm/lm/builder/initial_probabilities.cc @@ -3,6 +3,8 @@ #include "lm/builder/discount.hh" #include "lm/builder/ngram_stream.hh" #include "lm/builder/sort.hh" +#include "lm/builder/hash_gamma.hh" +#include "util/murmur_hash.hh" #include "util/file.hh" #include "util/stream/chain.hh" #include "util/stream/io.hh" @@ -14,55 +16,182 @@ namespace lm { namespace builder { namespace { struct BufferEntry { - // Gamma from page 20 of Chen and Goodman. + // Gamma from page 20 of Chen and Goodman. float gamma; - // \sum_w a(c w) for all w. + // \sum_w a(c w) for all w. float denominator; }; -// Extract an array of gamma from an array of BufferEntry. +struct HashBufferEntry : public BufferEntry { + // Hash value of ngram. Used to join contexts with backoffs. + uint64_t hash_value; +}; + +// Reads all entries in order like NGramStream does. 
+// But deletes any entries that have CutoffCount below or equal to pruning +// threshold. +class PruneNGramStream { + public: + PruneNGramStream(const util::stream::ChainPosition &position) : + current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), + dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), + currentCount_(0), + block_(position) + { + StartBlock(); + } + + NGram &operator*() { return current_; } + NGram *operator->() { return &current_; } + + operator bool() const { + return block_; + } + + PruneNGramStream &operator++() { + assert(block_); + + if (current_.Order() > 1) { + if(currentCount_ > 0) { + if(dest_.Base() < current_.Base()) { + memcpy(dest_.Base(), current_.Base(), current_.TotalSize()); + } + dest_.NextInMemory(); + } + } else { + dest_.NextInMemory(); + } + + current_.NextInMemory(); + + uint8_t *block_base = static_cast<uint8_t*>(block_->Get()); + if (current_.Base() == block_base + block_->ValidSize()) { + block_->SetValidSize(dest_.Base() - block_base); + ++block_; + StartBlock(); + if (block_) { + currentCount_ = current_.CutoffCount(); + } + } else { + currentCount_ = current_.CutoffCount(); + } + + return *this; + } + + private: + void StartBlock() { + for (; ; ++block_) { + if (!block_) return; + if (block_->ValidSize()) break; + } + current_.ReBase(block_->Get()); + currentCount_ = current_.CutoffCount(); + + dest_.ReBase(block_->Get()); + } + + NGram current_; // input iterator + NGram dest_; // output iterator + + uint64_t currentCount_; + + util::stream::Link block_; +}; + +// Extract an array of HashedGamma from an array of BufferEntry. 
class OnlyGamma { public: + OnlyGamma(bool pruning) : pruning_(pruning) {} + void Run(const util::stream::ChainPosition &position) { for (util::stream::Link block_it(position); block_it; ++block_it) { - float *out = static_cast<float*>(block_it->Get()); - const float *in = out; - const float *end = static_cast<const float*>(block_it->ValidEnd()); - for (out += 1, in += 2; in < end; out += 1, in += 2) { - *out = *in; + if(pruning_) { + const HashBufferEntry *in = static_cast<const HashBufferEntry*>(block_it->Get()); + const HashBufferEntry *end = static_cast<const HashBufferEntry*>(block_it->ValidEnd()); + + // Just make it point to the beginning of the stream so it can be overwritten + // With HashGamma values. Do not attempt to interpret the values until set below. + HashGamma *out = static_cast<HashGamma*>(block_it->Get()); + for (; in < end; out += 1, in += 1) { + // buffering, otherwise might overwrite values too early + float gamma_buf = in->gamma; + uint64_t hash_buf = in->hash_value; + + out->gamma = gamma_buf; + out->hash_value = hash_buf; + } + block_it->SetValidSize((block_it->ValidSize() * sizeof(HashGamma)) / sizeof(HashBufferEntry)); + } + else { + float *out = static_cast<float*>(block_it->Get()); + const float *in = out; + const float *end = static_cast<const float*>(block_it->ValidEnd()); + for (out += 1, in += 2; in < end; out += 1, in += 2) { + *out = *in; + } + block_it->SetValidSize(block_it->ValidSize() / 2); } - block_it->SetValidSize(block_it->ValidSize() / 2); } } + + private: + bool pruning_; }; class AddRight { public: - AddRight(const Discount &discount, const util::stream::ChainPosition &input) - : discount_(discount), input_(input) {} + AddRight(const Discount &discount, const util::stream::ChainPosition &input, bool pruning) + : discount_(discount), input_(input), pruning_(pruning) {} void Run(const util::stream::ChainPosition &output) { NGramStream in(input_); util::stream::Stream out(output); std::vector<WordIndex> 
previous(in->Order() - 1); + // Silly windows requires this workaround to just get an invalid pointer when empty. + void *const previous_raw = previous.empty() ? NULL : static_cast<void*>(&previous[0]); const std::size_t size = sizeof(WordIndex) * previous.size(); + for(; in; ++out) { - memcpy(&previous[0], in->begin(), size); + memcpy(previous_raw, in->begin(), size); uint64_t denominator = 0; + uint64_t normalizer = 0; + uint64_t counts[4]; memset(counts, 0, sizeof(counts)); do { - denominator += in->Count(); - ++counts[std::min(in->Count(), static_cast<uint64_t>(3))]; - } while (++in && !memcmp(&previous[0], in->begin(), size)); + denominator += in->UnmarkedCount(); + + // Collect unused probability mass from pruning. + // Becomes 0 for unpruned ngrams. + normalizer += in->UnmarkedCount() - in->CutoffCount(); + + // Chen&Goodman do not mention counting based on cutoffs, but + // backoff becomes larger than 1 otherwise, so probably needs + // to count cutoffs. Counts normally without pruning. + if(in->CutoffCount() > 0) + ++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))]; + + } while (++in && !memcmp(previous_raw, in->begin(), size)); + BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get()); entry.denominator = static_cast<float>(denominator); entry.gamma = 0.0; for (unsigned i = 1; i <= 3; ++i) { entry.gamma += discount_.Get(i) * static_cast<float>(counts[i]); } + + // Makes model sum to 1 with pruning (I hope). + entry.gamma += normalizer; + entry.gamma /= entry.denominator; + + if(pruning_) { + // If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...), + // so add a hash value that identifies the current ngram. 
+ static_cast<HashBufferEntry*>(&entry)->hash_value = util::MurmurHashNative(previous_raw, size); + } } out.Poison(); } @@ -70,6 +199,7 @@ class AddRight { private: const Discount &discount_; const util::stream::ChainPosition input_; + bool pruning_; }; class MergeRight { @@ -82,7 +212,7 @@ class MergeRight { void Run(const util::stream::ChainPosition &primary) { util::stream::Stream summed(from_adder_); - NGramStream grams(primary); + PruneNGramStream grams(primary); // Without interpolation, the interpolation weight goes to <unk>. if (grams->Order() == 1 && !interpolate_unigrams_) { @@ -97,15 +227,16 @@ class MergeRight { ++summed; return; } - + std::vector<WordIndex> previous(grams->Order() - 1); const std::size_t size = sizeof(WordIndex) * previous.size(); for (; grams; ++summed) { memcpy(&previous[0], grams->begin(), size); const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get()); + do { Payload &pay = grams->Value(); - pay.uninterp.prob = discount_.Apply(pay.count) / sums.denominator; + pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator; pay.uninterp.gamma = sums.gamma; } while (++grams && !memcmp(&previous[0], grams->begin(), size)); } @@ -119,17 +250,29 @@ class MergeRight { } // namespace -void InitialProbabilities(const InitialProbabilitiesConfig &config, const std::vector<Discount> &discounts, Chains &primary, Chains &second_in, Chains &gamma_out) { - util::stream::ChainConfig gamma_config = config.adder_out; - gamma_config.entry_size = sizeof(BufferEntry); +void InitialProbabilities( + const InitialProbabilitiesConfig &config, + const std::vector<Discount> &discounts, + util::stream::Chains &primary, + util::stream::Chains &second_in, + util::stream::Chains &gamma_out, + const std::vector<uint64_t> &prune_thresholds) { for (size_t i = 0; i < primary.size(); ++i) { + util::stream::ChainConfig gamma_config = config.adder_out; + if(prune_thresholds[i] > 0) + gamma_config.entry_size = sizeof(HashBufferEntry); + 
else + gamma_config.entry_size = sizeof(BufferEntry); + util::stream::ChainPosition second(second_in[i].Add()); second_in[i] >> util::stream::kRecycle; gamma_out.push_back(gamma_config); - gamma_out[i] >> AddRight(discounts[i], second); + gamma_out[i] >> AddRight(discounts[i], second, prune_thresholds[i] > 0); + primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]); - // Don't bother with the OnlyGamma thread for something to discard. - if (i) gamma_out[i] >> OnlyGamma(); + + // Don't bother with the OnlyGamma thread for something to discard. + if (i) gamma_out[i] >> OnlyGamma(prune_thresholds[i] > 0); } } diff --git a/klm/lm/builder/initial_probabilities.hh b/klm/lm/builder/initial_probabilities.hh index 626388eb..c1010e08 100644 --- a/klm/lm/builder/initial_probabilities.hh +++ b/klm/lm/builder/initial_probabilities.hh @@ -1,14 +1,15 @@ -#ifndef LM_BUILDER_INITIAL_PROBABILITIES__ -#define LM_BUILDER_INITIAL_PROBABILITIES__ +#ifndef LM_BUILDER_INITIAL_PROBABILITIES_H +#define LM_BUILDER_INITIAL_PROBABILITIES_H #include "lm/builder/discount.hh" #include "util/stream/config.hh" #include <vector> +namespace util { namespace stream { class Chains; } } + namespace lm { namespace builder { -class Chains; struct InitialProbabilitiesConfig { // These should be small buffers to keep the adder from getting too far ahead @@ -26,9 +27,15 @@ struct InitialProbabilitiesConfig { * The values are bare floats and should be buffered for interpolation to * use. 
*/ -void InitialProbabilities(const InitialProbabilitiesConfig &config, const std::vector<Discount> &discounts, Chains &primary, Chains &second_in, Chains &gamma_out); +void InitialProbabilities( + const InitialProbabilitiesConfig &config, + const std::vector<Discount> &discounts, + util::stream::Chains &primary, + util::stream::Chains &second_in, + util::stream::Chains &gamma_out, + const std::vector<uint64_t> &prune_thresholds); } // namespace builder } // namespace lm -#endif // LM_BUILDER_INITIAL_PROBABILITIES__ +#endif // LM_BUILDER_INITIAL_PROBABILITIES_H diff --git a/klm/lm/builder/interpolate.cc b/klm/lm/builder/interpolate.cc index 50026806..a7947a42 100644 --- a/klm/lm/builder/interpolate.cc +++ b/klm/lm/builder/interpolate.cc @@ -1,18 +1,74 @@ #include "lm/builder/interpolate.hh" +#include "lm/builder/hash_gamma.hh" #include "lm/builder/joint_order.hh" -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "lm/builder/sort.hh" #include "lm/lm_exception.hh" +#include "util/fixed_array.hh" +#include "util/murmur_hash.hh" #include <assert.h> +#include <math.h> namespace lm { namespace builder { namespace { -class Callback { +/* Calculate q, the collapsed probability and backoff, as defined in + * @inproceedings{Heafield-rest, + * author = {Kenneth Heafield and Philipp Koehn and Alon Lavie}, + * title = {Language Model Rest Costs and Space-Efficient Storage}, + * year = {2012}, + * month = {July}, + * booktitle = {Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning}, + * address = {Jeju Island, Korea}, + * pages = {1169--1178}, + * url = {http://kheafield.com/professional/edinburgh/rest\_paper.pdf}, + * } + * This is particularly convenient to calculate during interpolation because + * the needed backoff terms are already accessed at the same time. 
+ */ +class OutputQ { public: - Callback(float uniform_prob, const ChainPositions &backoffs) : backoffs_(backoffs.size()), probs_(backoffs.size() + 2) { + explicit OutputQ(std::size_t order) : q_delta_(order) {} + + void Gram(unsigned order_minus_1, float full_backoff, ProbBackoff &out) { + float &q_del = q_delta_[order_minus_1]; + if (order_minus_1) { + // Divide by context's backoff (which comes in as out.backoff) + q_del = q_delta_[order_minus_1 - 1] / out.backoff * full_backoff; + } else { + q_del = full_backoff; + } + out.prob = log10f(out.prob * q_del); + // TODO: stop wastefully outputting this! + out.backoff = 0.0; + } + + private: + // Product of backoffs in the numerator divided by backoffs in the + // denominator. Does not include + std::vector<float> q_delta_; +}; + +/* Default: output probability and backoff */ +class OutputProbBackoff { + public: + explicit OutputProbBackoff(std::size_t /*order*/) {} + + void Gram(unsigned /*order_minus_1*/, float full_backoff, ProbBackoff &out) const { + // Correcting for numerical precision issues. Take that IRST. 
+ out.prob = std::min(0.0f, log10f(out.prob)); + out.backoff = log10f(full_backoff); + } +}; + +template <class Output> class Callback { + public: + Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds) + : backoffs_(backoffs.size()), probs_(backoffs.size() + 2), + prune_thresholds_(prune_thresholds), + output_(backoffs.size() + 1 /* order */) { probs_[0] = uniform_prob; for (std::size_t i = 0; i < backoffs.size(); ++i) { backoffs_.push_back(backoffs[i]); @@ -21,6 +77,10 @@ class Callback { ~Callback() { for (std::size_t i = 0; i < backoffs_.size(); ++i) { + if(prune_thresholds_[i + 1] > 0) + while(backoffs_[i]) + ++backoffs_[i]; + if (backoffs_[i]) { std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl; abort(); @@ -32,34 +92,66 @@ class Callback { Payload &pay = gram.Value(); pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1]; probs_[order_minus_1 + 1] = pay.complete.prob; - pay.complete.prob = log10(pay.complete.prob); - // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. 
+ + float out_backoff; if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) { - pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get())); - ++backoffs_[order_minus_1]; + if(prune_thresholds_[order_minus_1 + 1] > 0) { + //Compute hash value for current context + uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex)); + + const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get()); + while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1]) + hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get()); + + if(current_hash == hashed_backoff->hash_value) { + out_backoff = hashed_backoff->gamma; + ++backoffs_[order_minus_1]; + } else { + // Has been pruned away so it is not a context anymore + out_backoff = 1.0; + } + } else { + out_backoff = *static_cast<const float*>(backoffs_[order_minus_1].Get()); + ++backoffs_[order_minus_1]; + } } else { - // Not a context. - pay.complete.backoff = 0.0; + // Not a context. + out_backoff = 1.0; } + + output_.Gram(order_minus_1, out_backoff, pay.complete); } void Exit(unsigned, const NGram &) const {} private: - FixedArray<util::stream::Stream> backoffs_; + util::FixedArray<util::stream::Stream> backoffs_; std::vector<float> probs_; + const std::vector<uint64_t>& prune_thresholds_; + + Output output_; }; } // namespace -Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) - : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {} +Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds, bool output_q) + : uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>. 
+ backoffs_(backoffs), + prune_thresholds_(prune_thresholds), + output_q_(output_q) {} // perform order-wise interpolation -void Interpolate::Run(const ChainPositions &positions) { +void Interpolate::Run(const util::stream::ChainPositions &positions) { assert(positions.size() == backoffs_.size() + 1); - Callback callback(uniform_prob_, backoffs_); - JointOrder<Callback, SuffixOrder>(positions, callback); + if (output_q_) { + typedef Callback<OutputQ> C; + C callback(uniform_prob_, backoffs_, prune_thresholds_); + JointOrder<C, SuffixOrder>(positions, callback); + } else { + typedef Callback<OutputProbBackoff> C; + C callback(uniform_prob_, backoffs_, prune_thresholds_); + JointOrder<C, SuffixOrder>(positions, callback); + } } }} // namespaces diff --git a/klm/lm/builder/interpolate.hh b/klm/lm/builder/interpolate.hh index 9268d404..0acece92 100644 --- a/klm/lm/builder/interpolate.hh +++ b/klm/lm/builder/interpolate.hh @@ -1,9 +1,11 @@ -#ifndef LM_BUILDER_INTERPOLATE__ -#define LM_BUILDER_INTERPOLATE__ +#ifndef LM_BUILDER_INTERPOLATE_H +#define LM_BUILDER_INTERPOLATE_H -#include <stdint.h> +#include "util/stream/multi_stream.hh" + +#include <vector> -#include "lm/builder/multi_stream.hh" +#include <stdint.h> namespace lm { namespace builder { @@ -14,14 +16,18 @@ namespace lm { namespace builder { */ class Interpolate { public: - explicit Interpolate(uint64_t unigram_count, const ChainPositions &backoffs); + // Normally vocab_size is the unigram count-1 (since p(<s>) = 0) but might + // be larger when the user specifies a consistent vocabulary size. 
+ explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool output_q_); - void Run(const ChainPositions &positions); + void Run(const util::stream::ChainPositions &positions); private: float uniform_prob_; - ChainPositions backoffs_; + util::stream::ChainPositions backoffs_; + const std::vector<uint64_t> prune_thresholds_; + bool output_q_; }; }} // namespaces -#endif // LM_BUILDER_INTERPOLATE__ +#endif // LM_BUILDER_INTERPOLATE_H diff --git a/klm/lm/builder/joint_order.hh b/klm/lm/builder/joint_order.hh index b5620144..7235d4f7 100644 --- a/klm/lm/builder/joint_order.hh +++ b/klm/lm/builder/joint_order.hh @@ -1,14 +1,14 @@ -#ifndef LM_BUILDER_JOINT_ORDER__ -#define LM_BUILDER_JOINT_ORDER__ +#ifndef LM_BUILDER_JOINT_ORDER_H +#define LM_BUILDER_JOINT_ORDER_H -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "lm/lm_exception.hh" #include <string.h> namespace lm { namespace builder { -template <class Callback, class Compare> void JointOrder(const ChainPositions &positions, Callback &callback) { +template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { // Allow matching to reference streams[-1]. 
NGramStreams streams_with_dummy; streams_with_dummy.InitWithDummy(positions); @@ -40,4 +40,4 @@ template <class Callback, class Compare> void JointOrder(const ChainPositions &p }} // namespaces -#endif // LM_BUILDER_JOINT_ORDER__ +#endif // LM_BUILDER_JOINT_ORDER_H diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc index 2563deed..265dd216 100644 --- a/klm/lm/builder/lmplz_main.cc +++ b/klm/lm/builder/lmplz_main.cc @@ -1,4 +1,5 @@ #include "lm/builder/pipeline.hh" +#include "lm/lm_exception.hh" #include "util/file.hh" #include "util/file_piece.hh" #include "util/usage.hh" @@ -7,6 +8,7 @@ #include <boost/program_options.hpp> #include <boost/version.hpp> +#include <vector> namespace { class SizeNotify { @@ -25,6 +27,57 @@ boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, co return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value); } +// Parse and validate pruning thresholds then return vector of threshold counts +// for each n-grams order. +std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) { + // convert to vector of integers + std::vector<uint64_t> prune_thresholds; + prune_thresholds.reserve(order); + for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) { + try { + prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it)); + } catch(const boost::bad_lexical_cast &) { + UTIL_THROW(util::Exception, "Bad pruning threshold " << *it); + } + } + + // Fill with zeros by default. 
+ if (prune_thresholds.empty()) { + prune_thresholds.resize(order, 0); + return prune_thresholds; + } + + // validate pruning threshold if specified + // throw if each n-gram order has not threshold specified + UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order); + // threshold for unigram can only be 0 (no pruning) + UTIL_THROW_IF(prune_thresholds[0] != 0, util::Exception, "Unigram pruning is not implemented, so the first pruning threshold must be 0."); + + // check if threshold are not in decreasing order + uint64_t lower_threshold = 0; + for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) { + UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures."); + lower_threshold = *it; + } + + // Pad to all orders using the last value. + prune_thresholds.resize(order, prune_thresholds.back()); + return prune_thresholds; +} + +lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) { + lm::builder::Discount ret; + UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+"); + UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified"); + ret.amount[0] = 0.0; + for (unsigned i = 0; i < 3; ++i) { + float discount = boost::lexical_cast<float>(param[i < param.size() ? 
i : (param.size() - 1)]); + UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "]."); + ret.amount[i + 1] = discount; + } + return ret; +} + } // namespace int main(int argc, char *argv[]) { @@ -34,25 +87,36 @@ int main(int argc, char *argv[]) { lm::builder::PipelineConfig pipeline; std::string text, arpa; + std::vector<std::string> pruning; + std::vector<std::string> discount_fallback; + std::vector<std::string> discount_fallback_default; + discount_fallback_default.push_back("0.5"); + discount_fallback_default.push_back("1"); + discount_fallback_default.push_back("1.5"); options.add_options() - ("help", po::bool_switch(), "Show this help message") + ("help,h", po::bool_switch(), "Show this help message") ("order,o", po::value<std::size_t>(&pipeline.order) #if BOOST_VERSION >= 104200 ->required() #endif , "Order of the model") - ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") + ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.") + ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception") ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? 
"80%" : "1G"), "Sorting memory") ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") - ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table") ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)") - ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") + ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table") + ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes") + ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams") ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.") ("text", po::value<std::string>(&text), "Read text from a file instead of stdin") - ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout"); + ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout") + ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. 
See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.") + ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.") + ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. 
Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail."); po::variables_map vm; po::store(po::parse_command_line(argc, argv, options), vm); @@ -95,6 +159,29 @@ int main(int argc, char *argv[]) { } #endif + if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) { + std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl; + return 1; + } + + if (vm["skip_symbols"].as<bool>()) { + pipeline.disallowed_symbol_action = lm::COMPLAIN; + } else { + pipeline.disallowed_symbol_action = lm::THROW_UP; + } + + if (vm.count("discount_fallback")) { + pipeline.discount.fallback = ParseDiscountFallback(discount_fallback); + pipeline.discount.bad_action = lm::COMPLAIN; + } else { + // Unused, just here to prevent the compiler from complaining about uninitialized. + pipeline.discount.fallback = lm::builder::Discount(); + pipeline.discount.bad_action = lm::THROW_UP; + } + + // parse pruning thresholds. These depend on order, so it is not done as a notifier. 
+ pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order); + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; diff --git a/klm/lm/builder/ngram.hh b/klm/lm/builder/ngram.hh index f5681516..0472bcb1 100644 --- a/klm/lm/builder/ngram.hh +++ b/klm/lm/builder/ngram.hh @@ -1,5 +1,5 @@ -#ifndef LM_BUILDER_NGRAM__ -#define LM_BUILDER_NGRAM__ +#ifndef LM_BUILDER_NGRAM_H +#define LM_BUILDER_NGRAM_H #include "lm/weights.hh" #include "lm/word_index.hh" @@ -26,7 +26,7 @@ union Payload { class NGram { public: - NGram(void *begin, std::size_t order) + NGram(void *begin, std::size_t order) : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {} const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); } @@ -38,12 +38,12 @@ class NGram { end_ = begin_ + difference; } - // Would do operator++ but that can get confusing for a stream. + // Would do operator++ but that can get confusing for a stream. void NextInMemory() { ReBase(&Value() + 1); } - // Lower-case in deference to STL. + // Lower-case in deference to STL. const WordIndex *begin() const { return begin_; } WordIndex *begin() { return begin_; } const WordIndex *end() const { return end_; } @@ -61,7 +61,7 @@ class NGram { return order * sizeof(WordIndex) + sizeof(Payload); } std::size_t TotalSize() const { - // Compiler should optimize this. + // Compiler should optimize this. 
return TotalSize(Order()); } static std::size_t OrderFromSize(std::size_t size) { @@ -69,6 +69,31 @@ class NGram { assert(size == TotalSize(ret)); return ret; } + + // manipulate msb to signal that ngram can be pruned + /*mjd**********************************************************************/ + + bool IsMarked() const { + return Value().count >> (sizeof(Value().count) * 8 - 1); + } + + void Mark() { + Value().count |= (1ul << (sizeof(Value().count) * 8 - 1)); + } + + void Unmark() { + Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1)); + } + + uint64_t UnmarkedCount() const { + return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1)); + } + + uint64_t CutoffCount() const { + return IsMarked() ? 0 : UnmarkedCount(); + } + + /*mjd**********************************************************************/ private: WordIndex *begin_, *end_; @@ -81,4 +106,4 @@ const WordIndex kEOS = 2; } // namespace builder } // namespace lm -#endif // LM_BUILDER_NGRAM__ +#endif // LM_BUILDER_NGRAM_H diff --git a/klm/lm/builder/ngram_stream.hh b/klm/lm/builder/ngram_stream.hh index 3c994664..ab42734c 100644 --- a/klm/lm/builder/ngram_stream.hh +++ b/klm/lm/builder/ngram_stream.hh @@ -1,8 +1,9 @@ -#ifndef LM_BUILDER_NGRAM_STREAM__ -#define LM_BUILDER_NGRAM_STREAM__ +#ifndef LM_BUILDER_NGRAM_STREAM_H +#define LM_BUILDER_NGRAM_STREAM_H #include "lm/builder/ngram.hh" #include "util/stream/chain.hh" +#include "util/stream/multi_stream.hh" #include "util/stream/stream.hh" #include <cstddef> @@ -51,5 +52,7 @@ inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream & return chain; } +typedef util::stream::GenericStreams<NGramStream> NGramStreams; + }} // namespaces -#endif // LM_BUILDER_NGRAM_STREAM__ +#endif // LM_BUILDER_NGRAM_STREAM_H diff --git a/klm/lm/builder/pipeline.cc b/klm/lm/builder/pipeline.cc index 44a2313c..21064ab3 100644 --- a/klm/lm/builder/pipeline.cc +++ b/klm/lm/builder/pipeline.cc @@ -2,6 +2,7 @@ #include "lm/builder/adjust_counts.hh" 
#include "lm/builder/corpus_count.hh" +#include "lm/builder/hash_gamma.hh" #include "lm/builder/initial_probabilities.hh" #include "lm/builder/interpolate.hh" #include "lm/builder/print.hh" @@ -20,10 +21,13 @@ namespace lm { namespace builder { namespace { -void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<Discount> &discounts) { +void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts) { std::cerr << "Statistics:\n"; for (size_t i = 0; i < counts.size(); ++i) { - std::cerr << (i + 1) << ' ' << counts[i]; + std::cerr << (i + 1) << ' ' << counts_pruned[i]; + if(counts[i] != counts_pruned[i]) + std::cerr << "/" << counts[i]; + for (size_t d = 1; d <= 3; ++d) std::cerr << " D" << d << (d == 3 ? "+=" : "=") << discounts[i].amount[d]; std::cerr << '\n'; @@ -39,7 +43,7 @@ class Master { const PipelineConfig &Config() const { return config_; } - Chains &MutableChains() { return chains_; } + util::stream::Chains &MutableChains() { return chains_; } template <class T> Master &operator>>(const T &worker) { chains_ >> worker; @@ -64,7 +68,7 @@ class Master { } // For initial probabilities, but this is generic. - void SortAndReadTwice(const std::vector<uint64_t> &counts, Sorts<ContextOrder> &sorts, Chains &second, util::stream::ChainConfig second_config) { + void SortAndReadTwice(const std::vector<uint64_t> &counts, Sorts<ContextOrder> &sorts, util::stream::Chains &second, util::stream::ChainConfig second_config) { // Do merge first before allocating chain memory. for (std::size_t i = 1; i < config_.order; ++i) { sorts[i - 1].Merge(0); @@ -198,9 +202,9 @@ class Master { PipelineConfig config_; - Chains chains_; + util::stream::Chains chains_; // Often only unigrams, but sometimes all orders. 
- FixedArray<util::stream::FileBuffer> files_; + util::FixedArray<util::stream::FileBuffer> files_; }; void CountText(int text_file /* input */, int vocab_file /* output */, Master &master, uint64_t &token_count, std::string &text_file_name) { @@ -221,7 +225,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m WordIndex type_count = config.vocab_estimate; util::FilePiece text(text_file, NULL, &std::cerr); text_file_name = text.FileName(); - CorpusCount counter(text, vocab_file, token_count, type_count, chain.BlockSize() / chain.EntrySize()); + CorpusCount counter(text, vocab_file, token_count, type_count, chain.BlockSize() / chain.EntrySize(), config.disallowed_symbol_action); chain >> boost::ref(counter); util::stream::Sort<SuffixOrder, AddCombiner> sorter(chain, config.sort, SuffixOrder(config.order), AddCombiner()); @@ -231,21 +235,22 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m master.InitForAdjust(sorter, type_count); } -void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary, FixedArray<util::stream::FileBuffer> &gammas) { +void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary, + util::FixedArray<util::stream::FileBuffer> &gammas, const std::vector<uint64_t> &prune_thresholds) { const PipelineConfig &config = master.Config(); - Chains second(config.order); + util::stream::Chains second(config.order); { Sorts<ContextOrder> sorts; master.SetupSorts(sorts); - PrintStatistics(counts, discounts); - lm::ngram::ShowSizes(counts); + PrintStatistics(counts, counts_pruned, discounts); + lm::ngram::ShowSizes(counts_pruned); std::cerr << "=== 3/5 Calculating and sorting initial probabilities ===" << std::endl; - master.SortAndReadTwice(counts, sorts, second, 
config.initial_probs.adder_in); + master.SortAndReadTwice(counts_pruned, sorts, second, config.initial_probs.adder_in); } - Chains gamma_chains(config.order); - InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains); + util::stream::Chains gamma_chains(config.order); + InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds); // Don't care about gamma for 0. gamma_chains[0] >> util::stream::kRecycle; gammas.Init(config.order - 1); @@ -257,19 +262,25 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector master.SetupSorts(primary); } -void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &master, Sorts<SuffixOrder> &primary, FixedArray<util::stream::FileBuffer> &gammas) { +void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &master, Sorts<SuffixOrder> &primary, util::FixedArray<util::stream::FileBuffer> &gammas) { std::cerr << "=== 4/5 Calculating and writing order-interpolated probabilities ===" << std::endl; const PipelineConfig &config = master.Config(); master.MaximumLazyInput(counts, primary); - Chains gamma_chains(config.order - 1); - util::stream::ChainConfig read_backoffs(config.read_backoffs); - read_backoffs.entry_size = sizeof(float); + util::stream::Chains gamma_chains(config.order - 1); for (std::size_t i = 0; i < config.order - 1; ++i) { + util::stream::ChainConfig read_backoffs(config.read_backoffs); + + // Add 1 because here we are skipping unigrams + if(config.prune_thresholds[i + 1] > 0) + read_backoffs.entry_size = sizeof(HashGamma); + else + read_backoffs.entry_size = sizeof(float); + gamma_chains.push_back(read_backoffs); gamma_chains.back() >> gammas[i].Source(); } - master >> Interpolate(counts[0], ChainPositions(gamma_chains)); + master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), 
util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.output_q); gamma_chains >> util::stream::kRecycle; master.BufferFinal(counts); } @@ -291,32 +302,40 @@ void Pipeline(PipelineConfig config, int text_file, int out_arpa) { "Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ". Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size."); UTIL_TIMER("(%w s) Total wall time elapsed\n"); - Master master(config); - - util::scoped_fd vocab_file(config.vocab_file.empty() ? - util::MakeTemp(config.TempPrefix()) : - util::CreateOrThrow(config.vocab_file.c_str())); - uint64_t token_count; - std::string text_file_name; - CountText(text_file, vocab_file.get(), master, token_count, text_file_name); - std::vector<uint64_t> counts; - std::vector<Discount> discounts; - master >> AdjustCounts(counts, discounts); + Master master(config); + // master's destructor will wait for chains. But they might be deadlocked if + // this thread dies because e.g. it ran out of memory. + try { + util::scoped_fd vocab_file(config.vocab_file.empty() ? 
+ util::MakeTemp(config.TempPrefix()) : + util::CreateOrThrow(config.vocab_file.c_str())); + uint64_t token_count; + std::string text_file_name; + CountText(text_file, vocab_file.get(), master, token_count, text_file_name); + + std::vector<uint64_t> counts; + std::vector<uint64_t> counts_pruned; + std::vector<Discount> discounts; + master >> AdjustCounts(config.prune_thresholds, counts, counts_pruned, config.discount, discounts); + + { + util::FixedArray<util::stream::FileBuffer> gammas; + Sorts<SuffixOrder> primary; + InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds); + InterpolateProbabilities(counts_pruned, master, primary, gammas); + } - { - FixedArray<util::stream::FileBuffer> gammas; - Sorts<SuffixOrder> primary; - InitialProbabilities(counts, discounts, master, primary, gammas); - InterpolateProbabilities(counts, master, primary, gammas); + std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl; + VocabReconstitute vocab(vocab_file.get()); + UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?"); + HeaderInfo header_info(text_file_name, token_count); + master >> PrintARPA(vocab, counts_pruned, (config.verbose_header ? &header_info : NULL), out_arpa) >> util::stream::kRecycle; + master.MutableChains().Wait(true); + } catch (const util::Exception &e) { + std::cerr << e.what() << std::endl; + abort(); } - - std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl; - VocabReconstitute vocab(vocab_file.get()); - UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?"); - HeaderInfo header_info(text_file_name, token_count); - master >> PrintARPA(vocab, counts, (config.verbose_header ? 
&header_info : NULL), out_arpa) >> util::stream::kRecycle; - master.MutableChains().Wait(true); } }} // namespaces diff --git a/klm/lm/builder/pipeline.hh b/klm/lm/builder/pipeline.hh index 845e5481..09e1a4d5 100644 --- a/klm/lm/builder/pipeline.hh +++ b/klm/lm/builder/pipeline.hh @@ -1,8 +1,10 @@ -#ifndef LM_BUILDER_PIPELINE__ -#define LM_BUILDER_PIPELINE__ +#ifndef LM_BUILDER_PIPELINE_H +#define LM_BUILDER_PIPELINE_H +#include "lm/builder/adjust_counts.hh" #include "lm/builder/initial_probabilities.hh" #include "lm/builder/header_info.hh" +#include "lm/lm_exception.hh" #include "lm/word_index.hh" #include "util/stream/config.hh" #include "util/file_piece.hh" @@ -18,6 +20,8 @@ struct PipelineConfig { util::stream::SortConfig sort; InitialProbabilitiesConfig initial_probs; util::stream::ChainConfig read_backoffs; + + // Include a header in the ARPA with some statistics? bool verbose_header; // Estimated vocabulary size. Used for sizing CorpusCount memory and @@ -30,6 +34,34 @@ struct PipelineConfig { // Number of blocks to use. This will be overridden to 1 if everything fits. std::size_t block_count; + // n-gram count thresholds for pruning. 0 values means no pruning for + // corresponding n-gram order + std::vector<uint64_t> prune_thresholds; //mjd + + // What to do with discount failures. + DiscountConfig discount; + + // Compute collapsed q values instead of probability and backoff + bool output_q; + + /* Computing the perplexity of LMs with different vocabularies is hard. For + * example, the lowest perplexity is attained by a unigram model that + * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly + * interpolated models will sum to more than 1 because <unk> is duplicated + * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to + * 1 but comes with its own problems). This option will make the vocabulary + * a particular size by replicating <unk> multiple times for purposes of + * computing vocabulary size. 
It has no effect if the actual vocabulary is + * larger. This parameter serves the same purpose as IRSTLM's "dub". + */ + uint64_t vocab_size_for_unk; + + /* What to do the first time <s>, </s>, or <unk> appears in the input. If + * this is anything but THROW_UP, then the symbol will always be treated as + * whitespace. + */ + WarningAction disallowed_symbol_action; + const std::string &TempPrefix() const { return sort.temp_prefix; } std::size_t TotalMemory() const { return sort.total_memory; } }; @@ -38,4 +70,4 @@ struct PipelineConfig { void Pipeline(PipelineConfig config, int text_file, int out_arpa); }} // namespaces -#endif // LM_BUILDER_PIPELINE__ +#endif // LM_BUILDER_PIPELINE_H diff --git a/klm/lm/builder/print.cc b/klm/lm/builder/print.cc index 84bd81ca..aee6e134 100644 --- a/klm/lm/builder/print.cc +++ b/klm/lm/builder/print.cc @@ -42,22 +42,22 @@ PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> util::WriteOrThrow(out_fd, as_string.data(), as_string.size()); } -void PrintARPA::Run(const ChainPositions &positions) { +void PrintARPA::Run(const util::stream::ChainPositions &positions) { util::scoped_fd closer(out_fd_); UTIL_TIMER("(%w s) Wrote ARPA file\n"); util::FakeOFStream out(out_fd_); for (unsigned order = 1; order <= positions.size(); ++order) { out << "\\" << order << "-grams:" << '\n'; for (NGramStream stream(positions[order - 1]); stream; ++stream) { - // Correcting for numerical precision issues. Take that IRST. - out << std::min(0.0f, stream->Value().complete.prob) << '\t' << vocab_.Lookup(*stream->begin()); + // Correcting for numerical precision issues. Take that IRST. 
+ out << stream->Value().complete.prob << '\t' << vocab_.Lookup(*stream->begin()); for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { out << ' ' << vocab_.Lookup(*i); } - float backoff = stream->Value().complete.backoff; - if (backoff != 0.0) - out << '\t' << backoff; + if (order != positions.size()) + out << '\t' << stream->Value().complete.backoff; out << '\n'; + } out << '\n'; } diff --git a/klm/lm/builder/print.hh b/klm/lm/builder/print.hh index adbbb94a..9856cea8 100644 --- a/klm/lm/builder/print.hh +++ b/klm/lm/builder/print.hh @@ -1,8 +1,8 @@ -#ifndef LM_BUILDER_PRINT__ -#define LM_BUILDER_PRINT__ +#ifndef LM_BUILDER_PRINT_H +#define LM_BUILDER_PRINT_H #include "lm/builder/ngram.hh" -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "lm/builder/header_info.hh" #include "util/file.hh" #include "util/mmap.hh" @@ -59,7 +59,7 @@ template <class V> class Print { public: explicit Print(const VocabReconstitute &vocab, std::ostream &to) : vocab_(vocab), to_(to) {} - void Run(const ChainPositions &chains) { + void Run(const util::stream::ChainPositions &chains) { NGramStreams streams(chains); for (NGramStream *s = streams.begin(); s != streams.end(); ++s) { DumpStream(*s); @@ -92,7 +92,7 @@ class PrintARPA { // Takes ownership of out_fd upon Run(). 
explicit PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd); - void Run(const ChainPositions &positions); + void Run(const util::stream::ChainPositions &positions); private: const VocabReconstitute &vocab_; @@ -100,4 +100,4 @@ class PrintARPA { }; }} // namespaces -#endif // LM_BUILDER_PRINT__ +#endif // LM_BUILDER_PRINT_H diff --git a/klm/lm/builder/sort.hh b/klm/lm/builder/sort.hh index 9989389b..712bb8e3 100644 --- a/klm/lm/builder/sort.hh +++ b/klm/lm/builder/sort.hh @@ -1,7 +1,7 @@ -#ifndef LM_BUILDER_SORT__ -#define LM_BUILDER_SORT__ +#ifndef LM_BUILDER_SORT_H +#define LM_BUILDER_SORT_H -#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram_stream.hh" #include "lm/builder/ngram.hh" #include "lm/word_index.hh" #include "util/stream/sort.hh" @@ -14,24 +14,71 @@ namespace lm { namespace builder { +/** + * Abstract parent class for defining custom n-gram comparators. + */ template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit Comparator(std::size_t order) : order_(order) {} + /** + * Applies the comparator using the Compare method that must be defined in any class that inherits from this class. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + * + * @see ContextOrder::Compare + * @see PrefixOrder::Compare + * @see SuffixOrder::Compare + */ inline bool operator()(const void *lhs, const void *rhs) const { return static_cast<const Child*>(this)->Compare(static_cast<const WordIndex*>(lhs), static_cast<const WordIndex*>(rhs)); } + /** Gets the n-gram order defined for this comparator. 
*/ std::size_t Order() const { return order_; } protected: std::size_t order_; }; +/** + * N-gram comparator that compares n-grams according to their reverse (suffix) order. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c > a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ class SuffixOrder : public Comparator<SuffixOrder> { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {} + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the last word of each n-gram and ending with the first word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (std::size_t i = order_ - 1; i != 0; --i) { if (lhs[i] != rhs[i]) @@ -43,10 +90,40 @@ class SuffixOrder : public Comparator<SuffixOrder> { static const unsigned kMatchOffset = 1; }; + +/** + * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. 
+ * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c > x a c + * - a b c < x y z + */ class ContextOrder : public Comparator<ContextOrder> { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {} + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram; + * finally, this comparator compares the last word of each n-gram. + * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (int i = order_ - 2; i >= 0; --i) { if (lhs[i] != rhs[i]) @@ -56,10 +133,37 @@ class ContextOrder : public Comparator<ContextOrder> { } }; +/** + * N-gram comparator that compares n-grams according to their natural (prefix) order. + * + * This comparator compares n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. + * + * Some examples of n-gram comparisons as defined by this comparator: + * - a b c == a b c + * - a b c < a b d + * - a b c < a d b + * - a b c > a b b + * - a b c < x a c + * - a b c < x y z + */ class PrefixOrder : public Comparator<PrefixOrder> { public: + + /** + * Constructs a comparator capable of comparing two n-grams. + * + * @param order Number of words in each n-gram + */ explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {} + /** + * Compares two n-grams lexicographically, one word at a time, + * beginning with the first word of each n-gram and ending with the last word of each n-gram. 
+ * + * @param lhs A pointer to the n-gram on the left-hand side of the comparison + * @param rhs A pointer to the n-gram on the right-hand side of the comparison + */ inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { for (std::size_t i = 0; i < order_; ++i) { if (lhs[i] != rhs[i]) @@ -84,15 +188,52 @@ struct AddCombiner { }; // The combiner is only used on a single chain, so I didn't bother to allow -// that template. -template <class Compare> class Sorts : public FixedArray<util::stream::Sort<Compare> > { +// that template. +/** + * Represents an @ref util::FixedArray "array" capable of storing @ref util::stream::Sort "Sort" objects. + * + * In the anticipated use case, an instance of this class will maintain one @ref util::stream::Sort "Sort" object + * for each n-gram order (ranging from 1 up to the maximum n-gram order being processed). + * Use in this manner would enable the n-grams each n-gram order to be sorted, in parallel. + * + * @tparam Compare An @ref Comparator "ngram comparator" to use during sorting. + */ +template <class Compare> class Sorts : public util::FixedArray<util::stream::Sort<Compare> > { private: typedef util::stream::Sort<Compare> S; - typedef FixedArray<S> P; + typedef util::FixedArray<S> P; public: + + /** + * Constructs, but does not initialize. + * + * @ref util::FixedArray::Init() "Init" must be called before use. + * + * @see util::FixedArray::Init() + */ + Sorts() {} + + /** + * Constructs an @ref util::FixedArray "array" capable of storing a fixed number of @ref util::stream::Sort "Sort" objects. + * + * @param number The maximum number of @ref util::stream::Sort "sorters" that can be held by this @ref util::FixedArray "array" + * @see util::FixedArray::FixedArray() + */ + explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {} + + /** + * Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array". 
+ * + * The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator"; + * once constructed, a new worker @ref util::stream::Thread "thread" (owned by the @ref util::stream::Chain "chain") will sort the n-gram data stored + * in the @ref util::stream::Block "blocks" of the provided @ref util::stream::Chain "chain". + * + * @see util::stream::Sort::Sort() + * @see util::stream::Chain::operator>>() + */ void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare) { - new (P::end()) S(chain, config, compare); + new (P::end()) S(chain, config, compare); // use "placement new" syntax to initalize S in an already-allocated memory location P::Constructed(); } }; @@ -100,4 +241,4 @@ template <class Compare> class Sorts : public FixedArray<util::stream::Sort<Comp } // namespace builder } // namespace lm -#endif // LM_BUILDER_SORT__ +#endif // LM_BUILDER_SORT_H |