Diffstat (limited to 'klm/lm')
72 files changed, 4711 insertions, 346 deletions
diff --git a/klm/lm/Jamfile b/klm/lm/Jamfile deleted file mode 100644 index b1971d88..00000000 --- a/klm/lm/Jamfile +++ /dev/null @@ -1,14 +0,0 @@ -lib kenlm : bhiksha.cc binary_format.cc config.cc lm_exception.cc model.cc quantize.cc read_arpa.cc search_hashed.cc search_trie.cc trie.cc trie_sort.cc value_build.cc virtual_interface.cc vocab.cc ../util//kenutil : <include>.. : : <include>.. <library>../util//kenutil ; - -import testing ; - -run left_test.cc ../util//kenutil kenlm ../..//boost_unit_test_framework : : test.arpa ; -run model_test.cc ../util//kenutil kenlm ../..//boost_unit_test_framework : : test.arpa test_nounk.arpa ; - -exe query : ngram_query.cc kenlm ../util//kenutil ; -exe build_binary : build_binary.cc kenlm ../util//kenutil ; - -install legacy : build_binary query - : <location>$(TOP)/klm/lm <install-type>EXE <install-dependencies>on <link>shared:<dll-path>$(TOP)/klm/lm <link>shared:<install-type>LIB ; - -alias programs : build_binary query ; diff --git a/klm/lm/Makefile.am b/klm/lm/Makefile.am index a12c5f03..48b0ba34 100644 --- a/klm/lm/Makefile.am +++ b/klm/lm/Makefile.am @@ -1,7 +1,10 @@ -bin_PROGRAMS = build_binary +bin_PROGRAMS = build_binary ngram_query -build_binary_SOURCES = build_binary.cc -build_binary_LDADD = libklm.a ../util/libklm_util.a -lz +build_binary_SOURCES = build_binary_main.cc +build_binary_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz + +ngram_query_SOURCES = query_main.cc +ngram_query_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz #noinst_PROGRAMS = \ # ngram_test @@ -12,21 +15,49 @@ build_binary_LDADD = libklm.a ../util/libklm_util.a -lz noinst_LIBRARIES = libklm.a libklm_a_SOURCES = \ + bhiksha.hh \ + binary_format.hh \ + blank.hh \ + config.hh \ + enumerate_vocab.hh \ + facade.hh \ + left.hh \ + lm_exception.hh \ + max_order.hh \ + model.hh \ + model_type.hh \ + ngram_query.hh \ + partial.hh \ + quantize.hh \ + read_arpa.hh \ + return.hh \ + search_hashed.hh \ + search_trie.hh \ + sizes.hh \ + state.hh \ + trie.hh \ + trie_sort.hh \ + value.hh \ + value_build.hh \ + virtual_interface.hh \ + vocab.hh \ + weights.hh \ + word_index.hh \ bhiksha.cc \ binary_format.cc \ config.cc \ lm_exception.cc \ quantize.cc \ model.cc \ - ngram_query.cc \ read_arpa.cc \ search_hashed.cc \ search_trie.cc \ + sizes.cc \ trie.cc \ trie_sort.cc \ value_build.cc \ virtual_interface.cc \ vocab.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. 
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm diff --git a/klm/lm/bhiksha.cc b/klm/lm/bhiksha.cc index 870a4eee..088ea98d 100644 --- a/klm/lm/bhiksha.cc +++ b/klm/lm/bhiksha.cc @@ -50,7 +50,7 @@ std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &con } } // namespace -std::size_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) { +uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) { return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */; } diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh index 9734f3ab..8ff88654 100644 --- a/klm/lm/bhiksha.hh +++ b/klm/lm/bhiksha.hh @@ -33,7 +33,7 @@ class DontBhiksha { static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {} - static std::size_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } + static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { return util::RequiredBits(max_next); @@ -67,7 +67,7 @@ class ArrayBhiksha { static void UpdateConfigFromBinary(int fd, Config &config); - static std::size_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); + static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc index a56e998e..39c4a9b6 100644 --- a/klm/lm/binary_format.cc +++ b/klm/lm/binary_format.cc @@ -16,11 +16,11 @@ namespace ngram { namespace { const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0"; -// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). +// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; const long int kMagicVersion = 5; -// Old binary files built on 32-bit machines have this header. +// Old binary files built on 32-bit machines have this header. // TODO: eliminate with next binary release. struct OldSanity { char magic[sizeof(kMagicBytes)]; @@ -39,7 +39,7 @@ struct OldSanity { }; -// Test values aligned to 8 bytes. +// Test values aligned to 8 bytes. 
struct Sanity { char magic[ALIGN8(sizeof(kMagicBytes))]; float zero_f, one_f, minus_half_f; @@ -83,7 +83,13 @@ void WriteHeader(void *to, const Parameters ¶ms) { uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) { if (config.write_mmap) { std::size_t total = TotalHeaderSize(order) + memory_size; - backing.vocab.reset(util::MapZeroedWrite(config.write_mmap, total, backing.file), total, util::scoped_memory::MMAP_ALLOCATED); + backing.file.reset(util::CreateOrThrow(config.write_mmap)); + if (config.write_method == Config::WRITE_MMAP) { + backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED); + } else { + util::ResizeOrThrow(backing.file.get(), 0); + util::MapAnonymous(total, backing.vocab); + } strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order)); return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order); } else { @@ -95,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) { std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad; if (config.write_mmap) { - // Grow the file to accomodate the search, using zeros. + // Grow the file to accomodate the search, using zeros. try { util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size); } catch (util::ErrnoException &e) { @@ -108,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t return reinterpret_cast<uint8_t*>(backing.search.get()); } // mmap it now. - // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. + // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. std::size_t page_size = util::SizePage(); std::size_t alignment_cruft = adjusted_vocab % page_size; backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); @@ -116,23 +122,25 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t } else { util::MapAnonymous(memory_size, backing.search); return reinterpret_cast<uint8_t*>(backing.search.get()); - } + } } void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) { if (!config.write_mmap) return; - util::SyncOrThrow(backing.vocab.get(), backing.vocab.size()); switch (config.write_method) { case Config::WRITE_MMAP: + util::SyncOrThrow(backing.vocab.get(), backing.vocab.size()); util::SyncOrThrow(backing.search.get(), backing.search.size()); break; case Config::WRITE_AFTER: + util::SeekOrThrow(backing.file.get(), 0); + util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size()); util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad); util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size()); util::FSyncOrThrow(backing.file.get()); break; } - // header and vocab share the same mmap. The header is written here because we know the counts. + // header and vocab share the same mmap. 
The header is written here because we know the counts. Parameters params = Parameters(); params.counts = counts; params.fixed.order = counts.size(); @@ -141,6 +149,10 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_ params.fixed.has_vocabulary = config.include_vocab; params.fixed.search_version = search_version; WriteHeader(backing.vocab.get(), params); + if (config.write_method == Config::WRITE_AFTER) { + util::SeekOrThrow(backing.file.get(), 0); + util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size())); + } } namespace detail { @@ -148,7 +160,7 @@ namespace detail { bool IsBinaryFormat(int fd) { const uint64_t size = util::SizeFile(fd); if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false; - // Try reading the header. + // Try reading the header. util::scoped_memory memory; try { util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); @@ -200,10 +212,10 @@ void SeekPastHeader(int fd, const Parameters ¶ms) { util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); } -uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing) { +uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) { const uint64_t file_size = util::SizeFile(backing.file.get()); - // The header is smaller than a page, so we have to map the whole header as well. - std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size; + // The header is smaller than a page, so we have to map the whole header as well. + std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size); if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map) UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); @@ -221,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) { if (config.write_mmap || !config.messages) return; if (config.arpa_complain == Config::ALL) { *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; - } else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) { + } else if (config.arpa_complain == Config::EXPENSIVE && + (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." 
<< std::endl; } } diff --git a/klm/lm/binary_format.hh b/klm/lm/binary_format.hh index dd795f62..bf699d5f 100644 --- a/klm/lm/binary_format.hh +++ b/klm/lm/binary_format.hh @@ -70,7 +70,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet void SeekPastHeader(int fd, const Parameters ¶ms); -uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing); +uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing); void ComplainAboutARPA(const Config &config, ModelType model_type); @@ -90,7 +90,7 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to) new_config.probing_multiplier = params.fixed.probing_multiplier; detail::SeekPastHeader(backing.file.get(), params); To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config); - std::size_t memory_size = To::Size(params.counts, new_config); + uint64_t memory_size = To::Size(params.counts, new_config); uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing); to.InitializeFromBinary(start, params, new_config, backing.file.get()); } else { diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary_main.cc index c2ca1101..ab2c0c32 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary_main.cc @@ -1,25 +1,30 @@ #include "lm/model.hh" +#include "lm/sizes.hh" #include "util/file_piece.hh" +#include "util/usage.hh" +#include <algorithm> #include <cstdlib> #include <exception> #include <iostream> #include <iomanip> +#include <limits> #include <math.h> #include <stdlib.h> -#include <unistd.h> #ifdef WIN32 #include "util/getopt.hh" +#else +#include <unistd.h> #endif namespace lm { namespace ngram { namespace { -void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" +void Usage(const char *name, const char *default_mem) { + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for <unk> if the ARPA file does not have one.\n" " Default is -100. The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have <s> and </s>.\n" @@ -37,8 +42,11 @@ void Usage(const char *name) { "trie is a straightforward trie with bit-level packing. It uses the least\n" "memory and is still faster than SRI or IRST. Building the trie format uses an\n" "on-disk sort to save memory.\n" -"-t is the temporary directory prefix. Default is the output file name.\n" -"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n" +"-T is the temporary directory prefix. Default is the output file name.\n" +"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" +" with GNU sort. The number is followed by a unit: \% for percent of physical\n" +" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" +" Default unit is K for Kilobytes.\n" "-q turns quantization on and sets the number of bits (e.g. -q 8).\n" "-b sets backoff quantization bits. Requires -q and defaults to that value.\n" "-a compresses pointers using an array of offsets. 
The parameter is the\n" @@ -82,47 +90,6 @@ void ParseFileList(const char *from, std::vector<std::string> &to) { } } -void ShowSizes(const char *file, const lm::ngram::Config &config) { - std::vector<uint64_t> counts; - util::FilePiece f(file); - lm::ReadARPACounts(f, counts); - std::size_t sizes[6]; - sizes[0] = ProbingModel::Size(counts, config); - sizes[1] = RestProbingModel::Size(counts, config); - sizes[2] = TrieModel::Size(counts, config); - sizes[3] = QuantTrieModel::Size(counts, config); - sizes[4] = ArrayTrieModel::Size(counts, config); - sizes[5] = QuantArrayTrieModel::Size(counts, config); - std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t)); - std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t)); - std::size_t divide; - char prefix; - if (min_length < (1 << 10) * 10) { - prefix = ' '; - divide = 1; - } else if (min_length < (1 << 20) * 10) { - prefix = 'k'; - divide = 1 << 10; - } else if (min_length < (1ULL << 30) * 10) { - prefix = 'M'; - divide = 1 << 20; - } else { - prefix = 'G'; - divide = 1 << 30; - } - long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide)))); - std::cout << "Memory estimate:\ntype "; - // right align bytes. - for (long int i = 0; i < length - 2; ++i) std::cout << ' '; - std::cout << prefix << "B\n" - "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" - "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" - "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" - "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" - "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" - "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; -} - void ProbingQuantizationUnsupported() { std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; exit(1); @@ -135,11 +102,14 @@ void ProbingQuantizationUnsupported() { int main(int argc, char *argv[]) { using namespace lm::ngram; + const char *default_mem = util::GuessPhysicalMemory() ? 
"80%" : "1G"; + try { bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; lm::ngram::Config config; + config.building_memory = util::ParseSize(default_mem); int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:sir:")) != -1) { + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); @@ -160,12 +130,16 @@ int main(int argc, char *argv[]) { case 'p': config.probing_multiplier = ParseFloat(optarg); break; - case 't': + case 't': // legacy + case 'T': config.temporary_directory_prefix = optarg; break; - case 'm': + case 'm': // legacy config.building_memory = ParseUInt(optarg) * 1048576; break; + case 'S': + config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg)); + break; case 'w': set_write_method = true; if (!strcmp(optarg, "mmap")) { @@ -173,7 +147,7 @@ int main(int argc, char *argv[]) { } else if (!strcmp(optarg, "after")) { config.write_method = Config::WRITE_AFTER; } else { - Usage(argv[0]); + Usage(argv[0], default_mem); } break; case 's': @@ -188,7 +162,7 @@ int main(int argc, char *argv[]) { config.rest_function = Config::REST_LOWER; break; default: - Usage(argv[0]); + Usage(argv[0], default_mem); } } if (!quantize && set_backoff_bits) { @@ -211,7 +185,7 @@ int main(int argc, char *argv[]) { from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; } else { - Usage(argv[0]); + Usage(argv[0], default_mem); } if (!strcmp(model_type, "probing")) { if (!set_write_method) config.write_method = Config::WRITE_AFTER; @@ -241,7 +215,7 @@ int main(int argc, char *argv[]) { } } } else { - Usage(argv[0]); + Usage(argv[0], default_mem); } } catch (const std::exception &e) { diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am new file mode 100644 index 00000000..317e03ce --- /dev/null +++ b/klm/lm/builder/Makefile.am @@ -0,0 +1,28 @@ +bin_PROGRAMS = builder + +builder_SOURCES = \ + lmplz_main.cc \ + adjust_counts.cc \ + adjust_counts.hh \ + corpus_count.cc \ + corpus_count.hh \ + discount.hh \ + header_info.hh \ + initial_probabilities.cc \ + initial_probabilities.hh \ + interpolate.cc \ + interpolate.hh \ + joint_order.hh \ + multi_stream.hh \ + ngram.hh \ + ngram_stream.hh \ + pipeline.cc \ + pipeline.hh \ + print.cc \ + print.hh \ + sort.hh + +builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_THREAD_LIBS) + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + diff --git a/klm/lm/builder/README.md b/klm/lm/builder/README.md new file mode 100644 index 00000000..be0d35e2 --- /dev/null +++ b/klm/lm/builder/README.md @@ -0,0 +1,47 @@ +Dependencies +============ + +Boost >= 1.42.0 is required. + +For Ubuntu, +```bash +sudo apt-get install libboost1.48-all-dev +``` + +Alternatively, you can download, compile, and install it yourself: + +```bash +wget http://sourceforge.net/projects/boost/files/boost/1.52.0/boost_1_52_0.tar.gz/download -O boost_1_52_0.tar.gz +tar -xvzf boost_1_52_0.tar.gz +cd boost_1_52_0 +./bootstrap.sh +./b2 +sudo ./b2 install +``` + +Local install options (in a user-space prefix directory) are also possible. See http://www.boost.org/doc/libs/1_52_0/doc/html/bbv2/installation.html. + + +Building +======== + +```bash +bjam +``` +Your distribution might package bjam and boost-build separately from Boost. Both are required. 
+ +Usage +===== + +Run +```bash +$ bin/lmplz +``` +to see command line arguments + +Running +======= + +```bash +bin/lmplz -o 5 <text >text.arpa +``` diff --git a/klm/lm/builder/TODO b/klm/lm/builder/TODO new file mode 100644 index 00000000..cb5aef3a --- /dev/null +++ b/klm/lm/builder/TODO @@ -0,0 +1,5 @@ +More tests! +Sharding. +Some way to manage all the crazy config options. +Option to build the binary file directly. +Interpolation of different orders. diff --git a/klm/lm/builder/adjust_counts.cc b/klm/lm/builder/adjust_counts.cc new file mode 100644 index 00000000..a6f48011 --- /dev/null +++ b/klm/lm/builder/adjust_counts.cc @@ -0,0 +1,216 @@ +#include "lm/builder/adjust_counts.hh" +#include "lm/builder/multi_stream.hh" +#include "util/stream/timer.hh" + +#include <algorithm> + +namespace lm { namespace builder { + +BadDiscountException::BadDiscountException() throw() {} +BadDiscountException::~BadDiscountException() throw() {} + +namespace { +// Return last word in full that is different. +const WordIndex* FindDifference(const NGram &full, const NGram &lower_last) { + const WordIndex *cur_word = full.end() - 1; + const WordIndex *pre_word = lower_last.end() - 1; + // Find last difference. + for (; pre_word >= lower_last.begin() && *pre_word == *cur_word; --cur_word, --pre_word) {} + return cur_word; +} + +class StatCollector { + public: + StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<Discount> &discounts) + : orders_(order), full_(orders_.back()), counts_(counts), discounts_(discounts) { + memset(&orders_[0], 0, sizeof(OrderStat) * order); + } + + ~StatCollector() {} + + void CalculateDiscounts() { + counts_.resize(orders_.size()); + discounts_.resize(orders_.size()); + for (std::size_t i = 0; i < orders_.size(); ++i) { + const OrderStat &s = orders_[i]; + counts_[i] = s.count; + + for (unsigned j = 1; j < 4; ++j) { + // TODO: Specialize error message for j == 3, meaning 3+ + UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " + << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " + << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?"); + } + + // See equation (26) in Chen and Goodman. + discounts_[i].amount[0] = 0.0; + float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]); + for (unsigned j = 1; j < 4; ++j) { + discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]); + UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]); + } + } + } + + void Add(std::size_t order_minus_1, uint64_t count) { + OrderStat &stat = orders_[order_minus_1]; + ++stat.count; + if (count < 5) ++stat.n[count]; + } + + void AddFull(uint64_t count) { + ++full_.count; + if (count < 5) ++full_.n[count]; + } + + private: + struct OrderStat { + // n_1 in equation 26 of Chen and Goodman etc + uint64_t n[5]; + uint64_t count; + }; + + std::vector<OrderStat> orders_; + OrderStat &full_; + + std::vector<uint64_t> &counts_; + std::vector<Discount> &discounts_; +}; + +// Reads all entries in order like NGramStream does. +// But deletes any entries that have <s> in the 1st (not 0th) position on the +// way out by putting other entries in their place. 
This disrupts the sort +// order but we don't care because the data is going to be sorted again. +class CollapseStream { + public: + CollapseStream(const util::stream::ChainPosition &position) : + current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())), + block_(position) { + StartBlock(); + } + + const NGram &operator*() const { return current_; } + const NGram *operator->() const { return ¤t_; } + + operator bool() const { return block_; } + + CollapseStream &operator++() { + assert(block_); + if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) { + memcpy(current_.Base(), copy_from_, current_.TotalSize()); + UpdateCopyFrom(); + } + current_.NextInMemory(); + uint8_t *block_base = static_cast<uint8_t*>(block_->Get()); + if (current_.Base() == block_base + block_->ValidSize()) { + block_->SetValidSize(copy_from_ + current_.TotalSize() - block_base); + ++block_; + StartBlock(); + } + return *this; + } + + private: + void StartBlock() { + for (; ; ++block_) { + if (!block_) return; + if (block_->ValidSize()) break; + } + current_.ReBase(block_->Get()); + copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize(); + UpdateCopyFrom(); + } + + // Find last without bos. + void UpdateCopyFrom() { + for (copy_from_ -= current_.TotalSize(); copy_from_ >= current_.Base(); copy_from_ -= current_.TotalSize()) { + if (NGram(copy_from_, current_.Order()).begin()[1] != kBOS) break; + } + } + + NGram current_; + + // Goes backwards in the block + uint8_t *copy_from_; + + util::stream::Link block_; +}; + +} // namespace + +void AdjustCounts::Run(const ChainPositions &positions) { + UTIL_TIMER("(%w s) Adjusted counts\n"); + + const std::size_t order = positions.size(); + StatCollector stats(order, counts_, discounts_); + if (order == 1) { + // Only unigrams. Just collect stats. + for (NGramStream full(positions[0]); full; ++full) + stats.AddFull(full->Count()); + stats.CalculateDiscounts(); + return; + } + + NGramStreams streams; + streams.Init(positions, positions.size() - 1); + CollapseStream full(positions[positions.size() - 1]); + + // Initialization: <unk> has count 0 and so does <s>. + NGramStream *lower_valid = streams.begin(); + streams[0]->Count() = 0; + *streams[0]->begin() = kUNK; + stats.Add(0, 0); + (++streams[0])->Count() = 0; + *streams[0]->begin() = kBOS; + // not in stats because it will get put in later. + + // iterate over full (the stream of the highest order ngrams) + for (; full; ++full) { + const WordIndex *different = FindDifference(*full, **lower_valid); + std::size_t same = full->end() - 1 - different; + // Increment the adjusted count. + if (same) ++streams[same - 1]->Count(); + + // Output all the valid ones that changed. + for (; lower_valid >= &streams[same]; --lower_valid) { + stats.Add(lower_valid - streams.begin(), (*lower_valid)->Count()); + ++*lower_valid; + } + + // This is here because bos is also const WordIndex *, so copy gets + // consistent argument types. + const WordIndex *full_end = full->end(); + // Initialize and mark as valid up to bos. + const WordIndex *bos; + for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) { + ++lower_valid; + std::copy(bos, full_end, (*lower_valid)->begin()); + (*lower_valid)->Count() = 1; + } + // Now bos indicates where <s> is or is the 0th word of full. + if (bos != full->begin()) { + // There is an <s> beyond the 0th word. 
+ NGramStream &to = *++lower_valid; + std::copy(bos, full_end, to->begin()); + to->Count() = full->Count(); + } else { + stats.AddFull(full->Count()); + } + assert(lower_valid >= &streams[0]); + } + + // Output everything valid. + for (NGramStream *s = streams.begin(); s <= lower_valid; ++s) { + stats.Add(s - streams.begin(), (*s)->Count()); + ++*s; + } + // Poison everyone! Except the N-grams which were already poisoned by the input. + for (NGramStream *s = streams.begin(); s != streams.end(); ++s) + s->Poison(); + + stats.CalculateDiscounts(); + + // NOTE: See special early-return case for unigrams near the top of this function +} + +}} // namespaces diff --git a/klm/lm/builder/adjust_counts.hh b/klm/lm/builder/adjust_counts.hh new file mode 100644 index 00000000..f38ff79d --- /dev/null +++ b/klm/lm/builder/adjust_counts.hh @@ -0,0 +1,44 @@ +#ifndef LM_BUILDER_ADJUST_COUNTS__ +#define LM_BUILDER_ADJUST_COUNTS__ + +#include "lm/builder/discount.hh" +#include "util/exception.hh" + +#include <vector> + +#include <stdint.h> + +namespace lm { +namespace builder { + +class ChainPositions; + +class BadDiscountException : public util::Exception { + public: + BadDiscountException() throw(); + ~BadDiscountException() throw(); +}; + +/* Compute adjusted counts. + * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. + * Output: [1,N]-grams with adjusted counts. + * [1,N)-grams are in suffix order + * N-grams are in undefined order (they're going to be sorted anyway). + */ +class AdjustCounts { + public: + AdjustCounts(std::vector<uint64_t> &counts, std::vector<Discount> &discounts) + : counts_(counts), discounts_(discounts) {} + + void Run(const ChainPositions &positions); + + private: + std::vector<uint64_t> &counts_; + std::vector<Discount> &discounts_; +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_ADJUST_COUNTS__ + diff --git a/klm/lm/builder/adjust_counts_test.cc b/klm/lm/builder/adjust_counts_test.cc new file mode 100644 index 00000000..68b5f33e --- /dev/null +++ b/klm/lm/builder/adjust_counts_test.cc @@ -0,0 +1,106 @@ +#include "lm/builder/adjust_counts.hh" + +#include "lm/builder/multi_stream.hh" +#include "util/scoped.hh" + +#include <boost/thread/thread.hpp> +#define BOOST_TEST_MODULE AdjustCounts +#include <boost/test/unit_test.hpp> + +namespace lm { namespace builder { namespace { + +class KeepCopy { + public: + KeepCopy() : size_(0) {} + + void Run(const util::stream::ChainPosition &position) { + for (util::stream::Link link(position); link; ++link) { + mem_.call_realloc(size_ + link->ValidSize()); + memcpy(static_cast<uint8_t*>(mem_.get()) + size_, link->Get(), link->ValidSize()); + size_ += link->ValidSize(); + } + } + + uint8_t *Get() { return static_cast<uint8_t*>(mem_.get()); } + std::size_t Size() const { return size_; } + + private: + util::scoped_malloc mem_; + std::size_t size_; +}; + +struct Gram4 { + WordIndex ids[4]; + uint64_t count; +}; + +class WriteInput { + public: + void Run(const util::stream::ChainPosition &position) { + NGramStream input(position); + Gram4 grams[] = { + {{0,0,0,0},10}, + {{0,0,3,0},3}, + // bos + {{1,1,1,2},5}, + {{0,0,3,2},5}, + }; + for (size_t i = 0; i < sizeof(grams) / sizeof(Gram4); ++i, ++input) { + memcpy(input->begin(), grams[i].ids, sizeof(WordIndex) * 4); + input->Count() = grams[i].count; + } + input.Poison(); + } +}; + +BOOST_AUTO_TEST_CASE(Simple) { + KeepCopy outputs[4]; + std::vector<uint64_t> counts; + std::vector<Discount> discount; + { + util::stream::ChainConfig config; + 
config.total_memory = 100; + config.block_count = 1; + Chains chains(4); + for (unsigned i = 0; i < 4; ++i) { + config.entry_size = NGram::TotalSize(i + 1); + chains.push_back(config); + } + + chains[3] >> WriteInput(); + ChainPositions for_adjust(chains); + for (unsigned i = 0; i < 4; ++i) { + chains[i] >> boost::ref(outputs[i]); + } + chains >> util::stream::kRecycle; + BOOST_CHECK_THROW(AdjustCounts(counts, discount).Run(for_adjust), BadDiscountException); + } + BOOST_REQUIRE_EQUAL(4UL, counts.size()); + BOOST_CHECK_EQUAL(4UL, counts[0]); + // These are no longer set because the discounts are bad. +/* BOOST_CHECK_EQUAL(4UL, counts[1]); + BOOST_CHECK_EQUAL(3UL, counts[2]); + BOOST_CHECK_EQUAL(3UL, counts[3]);*/ + BOOST_REQUIRE_EQUAL(NGram::TotalSize(1) * 4, outputs[0].Size()); + NGram uni(outputs[0].Get(), 1); + BOOST_CHECK_EQUAL(kUNK, *uni.begin()); + BOOST_CHECK_EQUAL(0ULL, uni.Count()); + uni.NextInMemory(); + BOOST_CHECK_EQUAL(kBOS, *uni.begin()); + BOOST_CHECK_EQUAL(0ULL, uni.Count()); + uni.NextInMemory(); + BOOST_CHECK_EQUAL(0UL, *uni.begin()); + BOOST_CHECK_EQUAL(2ULL, uni.Count()); + uni.NextInMemory(); + BOOST_CHECK_EQUAL(2ULL, uni.Count()); + BOOST_CHECK_EQUAL(2UL, *uni.begin()); + + BOOST_REQUIRE_EQUAL(NGram::TotalSize(2) * 4, outputs[1].Size()); + NGram bi(outputs[1].Get(), 2); + BOOST_CHECK_EQUAL(0UL, *bi.begin()); + BOOST_CHECK_EQUAL(0UL, *(bi.begin() + 1)); + BOOST_CHECK_EQUAL(1ULL, bi.Count()); + bi.NextInMemory(); +} + +}}} // namespaces diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc new file mode 100644 index 00000000..abea4ed0 --- /dev/null +++ b/klm/lm/builder/corpus_count.cc @@ -0,0 +1,224 @@ +#include "lm/builder/corpus_count.hh" + +#include "lm/builder/ngram.hh" +#include "lm/lm_exception.hh" +#include "lm/word_index.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/murmur_hash.hh" +#include "util/probing_hash_table.hh" +#include "util/scoped.hh" +#include "util/stream/chain.hh" +#include "util/stream/timer.hh" +#include "util/tokenize_piece.hh" + +#include <boost/unordered_set.hpp> +#include <boost/unordered_map.hpp> + +#include <functional> + +#include <stdint.h> + +namespace lm { +namespace builder { +namespace { + +class VocabHandout { + public: + explicit VocabHandout(int fd) { + util::scoped_fd duped(util::DupOrThrow(fd)); + word_list_.reset(util::FDOpenOrThrow(duped)); + + Lookup("<unk>"); // Force 0 + Lookup("<s>"); // Force 1 + Lookup("</s>"); // Force 2 + } + + WordIndex Lookup(const StringPiece &word) { + uint64_t hashed = util::MurmurHashNative(word.data(), word.size()); + std::pair<Seen::iterator, bool> ret(seen_.insert(std::pair<uint64_t, lm::WordIndex>(hashed, seen_.size()))); + if (ret.second) { + char null_delimit = 0; + util::WriteOrThrow(word_list_.get(), word.data(), word.size()); + util::WriteOrThrow(word_list_.get(), &null_delimit, 1); + UTIL_THROW_IF(seen_.size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words. 
Change WordIndex to uint64_t in lm/word_index.hh."); + } + return ret.first->second; + } + + WordIndex Size() const { + return seen_.size(); + } + + private: + typedef boost::unordered_map<uint64_t, lm::WordIndex> Seen; + + Seen seen_; + + util::scoped_FILE word_list_; +}; + +class DedupeHash : public std::unary_function<const WordIndex *, bool> { + public: + explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {} + + std::size_t operator()(const WordIndex *start) const { + return util::MurmurHashNative(start, size_); + } + + private: + const std::size_t size_; +}; + +class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> { + public: + explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} + + bool operator()(const WordIndex *first, const WordIndex *second) const { + return !memcmp(first, second, size_); + } + + private: + const std::size_t size_; +}; + +struct DedupeEntry { + typedef WordIndex *Key; + Key GetKey() const { return key; } + Key key; + static DedupeEntry Construct(WordIndex *at) { + DedupeEntry ret; + ret.key = at; + return ret; + } +}; + +typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe; + +const float kProbingMultiplier = 1.5; + +class Writer { + public: + Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) + : block_(position), gram_(block_->Get(), order), + dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()), + dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), + buffer_(new WordIndex[order - 1]), + block_size_(position.GetChain().BlockSize()) { + dedupe_.Clear(DedupeEntry::Construct(&dedupe_invalid_[0])); + assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); + if (order == 1) { + // Add special words. AdjustCounts is responsible if order != 1. + AddUnigramWord(kUNK); + AddUnigramWord(kBOS); + } + } + + ~Writer() { + block_->SetValidSize(reinterpret_cast<const uint8_t*>(gram_.begin()) - static_cast<const uint8_t*>(block_->Get())); + (++block_).Poison(); + } + + // Write context with a bunch of <s> + void StartSentence() { + for (WordIndex *i = gram_.begin(); i != gram_.end() - 1; ++i) { + *i = kBOS; + } + } + + void Append(WordIndex word) { + *(gram_.end() - 1) = word; + Dedupe::MutableIterator at; + bool found = dedupe_.FindOrInsert(DedupeEntry::Construct(gram_.begin()), at); + if (found) { + // Already present. + NGram already(at->key, gram_.Order()); + ++(already.Count()); + // Shift left by one. + memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); + return; + } + // Complete the write. + gram_.Count() = 1; + // Prepare the next n-gram. + if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) { + NGram last(gram_); + gram_.NextInMemory(); + std::copy(last.begin() + 1, last.end(), gram_.begin()); + return; + } + // Block end. Need to store the context in a temporary buffer. 
+ std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); + dedupe_.Clear(DedupeEntry::Construct(&dedupe_invalid_[0])); + block_->SetValidSize(block_size_); + gram_.ReBase((++block_)->Get()); + std::copy(buffer_.get(), buffer_.get() + gram_.Order() - 1, gram_.begin()); + } + + private: + void AddUnigramWord(WordIndex index) { + *gram_.begin() = index; + gram_.Count() = 0; + gram_.NextInMemory(); + if (gram_.Base() == static_cast<uint8_t*>(block_->Get()) + block_size_) { + block_->SetValidSize(block_size_); + gram_.ReBase((++block_)->Get()); + } + } + + util::stream::Link block_; + + NGram gram_; + + // This is the memory behind the invalid value in dedupe_. + std::vector<WordIndex> dedupe_invalid_; + // Hash table combiner implementation. + Dedupe dedupe_; + + // Small buffer to hold existing ngrams when shifting across a block boundary. + boost::scoped_array<WordIndex> buffer_; + + const std::size_t block_size_; +}; + +} // namespace + +float CorpusCount::DedupeMultiplier(std::size_t order) { + return kProbingMultiplier * static_cast<float>(sizeof(DedupeEntry)) / static_cast<float>(NGram::TotalSize(order)); +} + +CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) + : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count), + dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), + dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) { + token_count_ = 0; + type_count_ = 0; +} + +void CorpusCount::Run(const util::stream::ChainPosition &position) { + UTIL_TIMER("(%w s) Counted n-grams\n"); + + VocabHandout vocab(vocab_write_); + const WordIndex end_sentence = vocab.Lookup("</s>"); + Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); + uint64_t count = 0; + StringPiece delimiters("\0\t\r ", 4); + try { + while(true) { + StringPiece line(from_.ReadLine()); + writer.StartSentence(); + for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) { + WordIndex word = vocab.Lookup(*w); + UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. 
I plan to support models containing <unk> in the future."); + writer.Append(word); + ++count; + } + writer.Append(end_sentence); + } + } catch (const util::EndOfFileException &e) {} + token_count_ = count; + type_count_ = vocab.Size(); +} + +} // namespace builder +} // namespace lm diff --git a/klm/lm/builder/corpus_count.hh b/klm/lm/builder/corpus_count.hh new file mode 100644 index 00000000..e255bad1 --- /dev/null +++ b/klm/lm/builder/corpus_count.hh @@ -0,0 +1,42 @@ +#ifndef LM_BUILDER_CORPUS_COUNT__ +#define LM_BUILDER_CORPUS_COUNT__ + +#include "lm/word_index.hh" +#include "util/scoped.hh" + +#include <cstddef> +#include <string> +#include <stdint.h> + +namespace util { +class FilePiece; +namespace stream { +class ChainPosition; +} // namespace stream +} // namespace util + +namespace lm { +namespace builder { + +class CorpusCount { + public: + // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size + static float DedupeMultiplier(std::size_t order); + + CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block); + + void Run(const util::stream::ChainPosition &position); + + private: + util::FilePiece &from_; + int vocab_write_; + uint64_t &token_count_; + WordIndex &type_count_; + + std::size_t dedupe_mem_size_; + util::scoped_malloc dedupe_mem_; +}; + +} // namespace builder +} // namespace lm +#endif // LM_BUILDER_CORPUS_COUNT__ diff --git a/klm/lm/builder/corpus_count_test.cc b/klm/lm/builder/corpus_count_test.cc new file mode 100644 index 00000000..8d53ca9d --- /dev/null +++ b/klm/lm/builder/corpus_count_test.cc @@ -0,0 +1,76 @@ +#include "lm/builder/corpus_count.hh" + +#include "lm/builder/ngram.hh" +#include "lm/builder/ngram_stream.hh" + +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/tokenize_piece.hh" +#include "util/stream/chain.hh" +#include "util/stream/stream.hh" + +#define BOOST_TEST_MODULE CorpusCountTest +#include <boost/test/unit_test.hpp> + +namespace lm { namespace builder { namespace { + +#define Check(str, count) { \ + BOOST_REQUIRE(stream); \ + w = stream->begin(); \ + for (util::TokenIter<util::AnyCharacter, true> t(str, " "); t; ++t, ++w) { \ + BOOST_CHECK_EQUAL(*t, v[*w]); \ + } \ + BOOST_CHECK_EQUAL((uint64_t)count, stream->Count()); \ + ++stream; \ +} + +BOOST_AUTO_TEST_CASE(Short) { + util::scoped_fd input_file(util::MakeTemp("corpus_count_test_temp")); + const char input[] = "looking on a little more loin\non a little more loin\non foo little more loin\nbar\n\n"; + // Blocks of 10 are + // looking on a little more loin </s> on a little[duplicate] more[duplicate] loin[duplicate] </s>[duplicate] on[duplicate] foo + // little more loin </s> bar </s> </s> + + util::WriteOrThrow(input_file.get(), input, sizeof(input) - 1); + util::FilePiece input_piece(input_file.release(), "temp file"); + + util::stream::ChainConfig config; + config.entry_size = NGram::TotalSize(3); + config.total_memory = config.entry_size * 20; + config.block_count = 2; + + util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab")); + + util::stream::Chain chain(config); + NGramStream stream; + uint64_t token_count; + WordIndex type_count; + CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize()); + chain >> boost::ref(counter) >> stream >> util::stream::kRecycle; + + const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"}; + + WordIndex *w; 
+ + Check("<s> <s> looking", 1); + Check("<s> looking on", 1); + Check("looking on a", 1); + Check("on a little", 2); + Check("a little more", 2); + Check("little more loin", 2); + Check("more loin </s>", 2); + Check("<s> <s> on", 2); + Check("<s> on a", 1); + Check("<s> on foo", 1); + Check("on foo little", 1); + Check("foo little more", 1); + Check("little more loin", 1); + Check("more loin </s>", 1); + Check("<s> <s> bar", 1); + Check("<s> bar </s>", 1); + Check("<s> <s> </s>", 1); + BOOST_CHECK(!stream); + BOOST_CHECK_EQUAL(sizeof(v) / sizeof(const char*), type_count); +} + +}}} // namespaces diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh new file mode 100644 index 00000000..4d0aa4fd --- /dev/null +++ b/klm/lm/builder/discount.hh @@ -0,0 +1,26 @@ +#ifndef BUILDER_DISCOUNT__ +#define BUILDER_DISCOUNT__ + +#include <algorithm> + +#include <stdint.h> + +namespace lm { +namespace builder { + +struct Discount { + float amount[4]; + + float Get(uint64_t count) const { + return amount[std::min<uint64_t>(count, 3)]; + } + + float Apply(uint64_t count) const { + return static_cast<float>(count) - Get(count); + } +}; + +} // namespace builder +} // namespace lm + +#endif // BUILDER_DISCOUNT__ diff --git a/klm/lm/builder/header_info.hh b/klm/lm/builder/header_info.hh new file mode 100644 index 00000000..ccca1456 --- /dev/null +++ b/klm/lm/builder/header_info.hh @@ -0,0 +1,20 @@ +#ifndef LM_BUILDER_HEADER_INFO__ +#define LM_BUILDER_HEADER_INFO__ + +#include <string> +#include <stdint.h> + +// Some configuration info that is used to add +// comments to the beginning of an ARPA file +struct HeaderInfo { + const std::string input_file; + const uint64_t token_count; + + HeaderInfo(const std::string& input_file_in, uint64_t token_count_in) + : input_file(input_file_in), token_count(token_count_in) {} + + // TODO: Add smoothing type + // TODO: More info if multiple models were interpolated +}; + +#endif diff --git a/klm/lm/builder/initial_probabilities.cc b/klm/lm/builder/initial_probabilities.cc new file mode 100644 index 00000000..58b42a20 --- /dev/null +++ b/klm/lm/builder/initial_probabilities.cc @@ -0,0 +1,136 @@ +#include "lm/builder/initial_probabilities.hh" + +#include "lm/builder/discount.hh" +#include "lm/builder/ngram_stream.hh" +#include "lm/builder/sort.hh" +#include "util/file.hh" +#include "util/stream/chain.hh" +#include "util/stream/io.hh" +#include "util/stream/stream.hh" + +#include <vector> + +namespace lm { namespace builder { + +namespace { +struct BufferEntry { + // Gamma from page 20 of Chen and Goodman. + float gamma; + // \sum_w a(c w) for all w. + float denominator; +}; + +// Extract an array of gamma from an array of BufferEntry. 
+class OnlyGamma { + public: + void Run(const util::stream::ChainPosition &position) { + for (util::stream::Link block_it(position); block_it; ++block_it) { + float *out = static_cast<float*>(block_it->Get()); + const float *in = out; + const float *end = static_cast<const float*>(block_it->ValidEnd()); + for (out += 1, in += 2; in < end; out += 1, in += 2) { + *out = *in; + } + block_it->SetValidSize(block_it->ValidSize() / 2); + } + } +}; + +class AddRight { + public: + AddRight(const Discount &discount, const util::stream::ChainPosition &input) + : discount_(discount), input_(input) {} + + void Run(const util::stream::ChainPosition &output) { + NGramStream in(input_); + util::stream::Stream out(output); + + std::vector<WordIndex> previous(in->Order() - 1); + const std::size_t size = sizeof(WordIndex) * previous.size(); + for(; in; ++out) { + memcpy(&previous[0], in->begin(), size); + uint64_t denominator = 0; + uint64_t counts[4]; + memset(counts, 0, sizeof(counts)); + do { + denominator += in->Count(); + ++counts[std::min(in->Count(), static_cast<uint64_t>(3))]; + } while (++in && !memcmp(&previous[0], in->begin(), size)); + BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get()); + entry.denominator = static_cast<float>(denominator); + entry.gamma = 0.0; + for (unsigned i = 1; i <= 3; ++i) { + entry.gamma += discount_.Get(i) * static_cast<float>(counts[i]); + } + entry.gamma /= entry.denominator; + } + out.Poison(); + } + + private: + const Discount &discount_; + const util::stream::ChainPosition input_; +}; + +class MergeRight { + public: + MergeRight(bool interpolate_unigrams, const util::stream::ChainPosition &from_adder, const Discount &discount) + : interpolate_unigrams_(interpolate_unigrams), from_adder_(from_adder), discount_(discount) {} + + // calculate the initial probability of each n-gram (before order-interpolation) + // Run() gets invoked once for each order + void Run(const util::stream::ChainPosition &primary) { + util::stream::Stream summed(from_adder_); + + NGramStream grams(primary); + + // Without interpolation, the interpolation weight goes to <unk>. 
+ if (grams->Order() == 1 && !interpolate_unigrams_) { + BufferEntry sums(*static_cast<const BufferEntry*>(summed.Get())); + assert(*grams->begin() == kUNK); + grams->Value().uninterp.prob = sums.gamma; + grams->Value().uninterp.gamma = 0.0; + while (++grams) { + grams->Value().uninterp.prob = discount_.Apply(grams->Count()) / sums.denominator; + grams->Value().uninterp.gamma = 0.0; + } + ++summed; + return; + } + + std::vector<WordIndex> previous(grams->Order() - 1); + const std::size_t size = sizeof(WordIndex) * previous.size(); + for (; grams; ++summed) { + memcpy(&previous[0], grams->begin(), size); + const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get()); + do { + Payload &pay = grams->Value(); + pay.uninterp.prob = discount_.Apply(pay.count) / sums.denominator; + pay.uninterp.gamma = sums.gamma; + } while (++grams && !memcmp(&previous[0], grams->begin(), size)); + } + } + + private: + bool interpolate_unigrams_; + util::stream::ChainPosition from_adder_; + Discount discount_; +}; + +} // namespace + +void InitialProbabilities(const InitialProbabilitiesConfig &config, const std::vector<Discount> &discounts, Chains &primary, Chains &second_in, Chains &gamma_out) { + util::stream::ChainConfig gamma_config = config.adder_out; + gamma_config.entry_size = sizeof(BufferEntry); + for (size_t i = 0; i < primary.size(); ++i) { + util::stream::ChainPosition second(second_in[i].Add()); + second_in[i] >> util::stream::kRecycle; + gamma_out.push_back(gamma_config); + gamma_out[i] >> AddRight(discounts[i], second); + primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]); + // Don't bother with the OnlyGamma thread for something to discard. + if (i) gamma_out[i] >> OnlyGamma(); + } +} + +}} // namespaces diff --git a/klm/lm/builder/initial_probabilities.hh b/klm/lm/builder/initial_probabilities.hh new file mode 100644 index 00000000..626388eb --- /dev/null +++ b/klm/lm/builder/initial_probabilities.hh @@ -0,0 +1,34 @@ +#ifndef LM_BUILDER_INITIAL_PROBABILITIES__ +#define LM_BUILDER_INITIAL_PROBABILITIES__ + +#include "lm/builder/discount.hh" +#include "util/stream/config.hh" + +#include <vector> + +namespace lm { +namespace builder { +class Chains; + +struct InitialProbabilitiesConfig { + // These should be small buffers to keep the adder from getting too far ahead + util::stream::ChainConfig adder_in; + util::stream::ChainConfig adder_out; + // SRILM doesn't normally interpolate unigrams. + bool interpolate_unigrams; +}; + +/* Compute initial (uninterpolated) probabilities + * primary: the normal chain of n-grams. Incoming is context sorted adjusted + * counts. Outgoing has uninterpolated probabilities for use by Interpolate. + * second_in: a second copy of the primary input. Discard the output. + * gamma_out: Computed gamma values are output on these chains in suffix order. + * The values are bare floats and should be buffered for interpolation to + * use. 
+ */ +void InitialProbabilities(const InitialProbabilitiesConfig &config, const std::vector<Discount> &discounts, Chains &primary, Chains &second_in, Chains &gamma_out); + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_INITIAL_PROBABILITIES__ diff --git a/klm/lm/builder/interpolate.cc b/klm/lm/builder/interpolate.cc new file mode 100644 index 00000000..50026806 --- /dev/null +++ b/klm/lm/builder/interpolate.cc @@ -0,0 +1,65 @@ +#include "lm/builder/interpolate.hh" + +#include "lm/builder/joint_order.hh" +#include "lm/builder/multi_stream.hh" +#include "lm/builder/sort.hh" +#include "lm/lm_exception.hh" + +#include <assert.h> + +namespace lm { namespace builder { +namespace { + +class Callback { + public: + Callback(float uniform_prob, const ChainPositions &backoffs) : backoffs_(backoffs.size()), probs_(backoffs.size() + 2) { + probs_[0] = uniform_prob; + for (std::size_t i = 0; i < backoffs.size(); ++i) { + backoffs_.push_back(backoffs[i]); + } + } + + ~Callback() { + for (std::size_t i = 0; i < backoffs_.size(); ++i) { + if (backoffs_[i]) { + std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl; + abort(); + } + } + } + + void Enter(unsigned order_minus_1, NGram &gram) { + Payload &pay = gram.Value(); + pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1]; + probs_[order_minus_1 + 1] = pay.complete.prob; + pay.complete.prob = log10(pay.complete.prob); + // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. + if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) { + pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get())); + ++backoffs_[order_minus_1]; + } else { + // Not a context. + pay.complete.backoff = 0.0; + } + } + + void Exit(unsigned, const NGram &) const {} + + private: + FixedArray<util::stream::Stream> backoffs_; + + std::vector<float> probs_; +}; +} // namespace + +Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) + : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {} + +// perform order-wise interpolation +void Interpolate::Run(const ChainPositions &positions) { + assert(positions.size() == backoffs_.size() + 1); + Callback callback(uniform_prob_, backoffs_); + JointOrder<Callback, SuffixOrder>(positions, callback); +} + +}} // namespaces diff --git a/klm/lm/builder/interpolate.hh b/klm/lm/builder/interpolate.hh new file mode 100644 index 00000000..9268d404 --- /dev/null +++ b/klm/lm/builder/interpolate.hh @@ -0,0 +1,27 @@ +#ifndef LM_BUILDER_INTERPOLATE__ +#define LM_BUILDER_INTERPOLATE__ + +#include <stdint.h> + +#include "lm/builder/multi_stream.hh" + +namespace lm { namespace builder { + +/* Interpolate step. + * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from + * InitialProbabilities. 
+ * Output: suffix sorted n-grams with complete probability + */ +class Interpolate { + public: + explicit Interpolate(uint64_t unigram_count, const ChainPositions &backoffs); + + void Run(const ChainPositions &positions); + + private: + float uniform_prob_; + ChainPositions backoffs_; +}; + +}} // namespaces +#endif // LM_BUILDER_INTERPOLATE__ diff --git a/klm/lm/builder/joint_order.hh b/klm/lm/builder/joint_order.hh new file mode 100644 index 00000000..b5620144 --- /dev/null +++ b/klm/lm/builder/joint_order.hh @@ -0,0 +1,43 @@ +#ifndef LM_BUILDER_JOINT_ORDER__ +#define LM_BUILDER_JOINT_ORDER__ + +#include "lm/builder/multi_stream.hh" +#include "lm/lm_exception.hh" + +#include <string.h> + +namespace lm { namespace builder { + +template <class Callback, class Compare> void JointOrder(const ChainPositions &positions, Callback &callback) { + // Allow matching to reference streams[-1]. + NGramStreams streams_with_dummy; + streams_with_dummy.InitWithDummy(positions); + NGramStream *streams = streams_with_dummy.begin() + 1; + + unsigned int order; + for (order = 0; order < positions.size() && streams[order]; ++order) {} + assert(order); // should always have <unk>. + unsigned int current = 0; + while (true) { + // Does the context match the lower one? + if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { + callback.Enter(current, *streams[current]); + // Transition to looking for extensions. + if (++current < order) continue; + } + // No extension left. + while(true) { + assert(current > 0); + --current; + callback.Exit(current, *streams[current]); + if (++streams[current]) break; + UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); + order = current; + if (!order) return; + } + } +} + +}} // namespaces + +#endif // LM_BUILDER_JOINT_ORDER__ diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc new file mode 100644 index 00000000..90b9dca2 --- /dev/null +++ b/klm/lm/builder/lmplz_main.cc @@ -0,0 +1,94 @@ +#include "lm/builder/pipeline.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include <iostream> + +#include <boost/program_options.hpp> + +namespace { +class SizeNotify { + public: + SizeNotify(std::size_t &out) : behind_(out) {} + + void operator()(const std::string &from) { + behind_ = util::ParseSize(from); + } + + private: + std::size_t &behind_; +}; + +boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) { + return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value); +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + namespace po = boost::program_options; + po::options_description options("Language model building options"); + lm::builder::PipelineConfig pipeline; + + options.add_options() + ("order,o", po::value<std::size_t>(&pipeline.order)->required(), "Order of the model") + ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") + ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") + ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? 
"80%" : "1G"), "Sorting memory") + ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") + ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") + ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") + ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)") + ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") + ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); + if (argc == 1) { + std::cerr << + "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" + "Please cite:\n" + "@inproceedings{kenlm,\n" + "author = {Kenneth Heafield},\n" + "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" + "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" + "month = {July}, year={2011},\n" + "address = {Edinburgh, UK},\n" + "publisher = {Association for Computational Linguistics},\n" + "}\n\n" + "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" + "the model (-o) is the only mandatory option. As this is an on-disk program,\n" + "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" + "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" + "Valid units are \% for percentage of memory (supported platforms only) and (in\n" + "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; + std::cerr << options << std::endl; + return 1; + } + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + po::notify(vm); + + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); + + lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; + // TODO: evaluate options for these. 
+ initial.adder_in.total_memory = 32768; + initial.adder_in.block_count = 2; + initial.adder_out.total_memory = 32768; + initial.adder_out.block_count = 2; + pipeline.read_backoffs = initial.adder_out; + + // Read from stdin + try { + lm::builder::Pipeline(pipeline, 0, 1); + } catch (const util::MallocException &e) { + std::cerr << e.what() << std::endl; + std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl; + return 1; + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/klm/lm/builder/multi_stream.hh b/klm/lm/builder/multi_stream.hh new file mode 100644 index 00000000..707a98c7 --- /dev/null +++ b/klm/lm/builder/multi_stream.hh @@ -0,0 +1,180 @@ +#ifndef LM_BUILDER_MULTI_STREAM__ +#define LM_BUILDER_MULTI_STREAM__ + +#include "lm/builder/ngram_stream.hh" +#include "util/scoped.hh" +#include "util/stream/chain.hh" + +#include <cstddef> +#include <new> + +#include <assert.h> +#include <stdlib.h> + +namespace lm { namespace builder { + +template <class T> class FixedArray { + public: + explicit FixedArray(std::size_t count) { + Init(count); + } + + FixedArray() : newed_end_(NULL) {} + + void Init(std::size_t count) { + assert(!block_.get()); + block_.reset(malloc(sizeof(T) * count)); + if (!block_.get()) throw std::bad_alloc(); + newed_end_ = begin(); + } + + FixedArray(const FixedArray &from) { + std::size_t size = from.newed_end_ - static_cast<const T*>(from.block_.get()); + Init(size); + for (std::size_t i = 0; i < size; ++i) { + new(end()) T(from[i]); + Constructed(); + } + } + + ~FixedArray() { clear(); } + + T *begin() { return static_cast<T*>(block_.get()); } + const T *begin() const { return static_cast<const T*>(block_.get()); } + // Always call Constructed after successful completion of new. 
+ T *end() { return newed_end_; } + const T *end() const { return newed_end_; } + + T &back() { return *(end() - 1); } + const T &back() const { return *(end() - 1); } + + std::size_t size() const { return end() - begin(); } + bool empty() const { return begin() == end(); } + + T &operator[](std::size_t i) { return begin()[i]; } + const T &operator[](std::size_t i) const { return begin()[i]; } + + template <class C> void push_back(const C &c) { + new (end()) T(c); + Constructed(); + } + + void clear() { + for (T *i = begin(); i != end(); ++i) + i->~T(); + newed_end_ = begin(); + } + + protected: + void Constructed() { + ++newed_end_; + } + + private: + util::scoped_malloc block_; + + T *newed_end_; +}; + +class Chains; + +class ChainPositions : public FixedArray<util::stream::ChainPosition> { + public: + ChainPositions() {} + + void Init(Chains &chains); + + explicit ChainPositions(Chains &chains) { + Init(chains); + } +}; + +class Chains : public FixedArray<util::stream::Chain> { + private: + template <class T, void (T::*ptr)(const ChainPositions &) = &T::Run> struct CheckForRun { + typedef Chains type; + }; + + public: + explicit Chains(std::size_t limit) : FixedArray<util::stream::Chain>(limit) {} + + template <class Worker> typename CheckForRun<Worker>::type &operator>>(const Worker &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + template <class Worker> typename CheckForRun<Worker>::type &operator>>(const boost::reference_wrapper<Worker> &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + Chains &operator>>(const util::stream::Recycler &recycler) { + for (util::stream::Chain *i = begin(); i != end(); ++i) + *i >> recycler; + return *this; + } + + void Wait(bool release_memory = true) { + threads_.clear(); + for (util::stream::Chain *i = begin(); i != end(); ++i) { + i->Wait(release_memory); + } + } + + private: + boost::ptr_vector<util::stream::Thread> threads_; + + Chains(const Chains &); + void operator=(const Chains &); +}; + +inline void ChainPositions::Init(Chains &chains) { + FixedArray<util::stream::ChainPosition>::Init(chains.size()); + for (util::stream::Chain *i = chains.begin(); i != chains.end(); ++i) { + new (end()) util::stream::ChainPosition(i->Add()); Constructed(); + } +} + +inline Chains &operator>>(Chains &chains, ChainPositions &positions) { + positions.Init(chains); + return chains; +} + +class NGramStreams : public FixedArray<NGramStream> { + public: + NGramStreams() {} + + // This puts a dummy NGramStream at the beginning (useful to algorithms that need to reference something at the beginning). 
+ void InitWithDummy(const ChainPositions &positions) { + FixedArray<NGramStream>::Init(positions.size() + 1); + new (end()) NGramStream(); Constructed(); + for (const util::stream::ChainPosition *i = positions.begin(); i != positions.end(); ++i) { + push_back(*i); + } + } + + // Limit restricts to positions[0,limit) + void Init(const ChainPositions &positions, std::size_t limit) { + FixedArray<NGramStream>::Init(limit); + for (const util::stream::ChainPosition *i = positions.begin(); i != positions.begin() + limit; ++i) { + push_back(*i); + } + } + void Init(const ChainPositions &positions) { + Init(positions, positions.size()); + } + + NGramStreams(const ChainPositions &positions) { + Init(positions); + } +}; + +inline Chains &operator>>(Chains &chains, NGramStreams &streams) { + ChainPositions positions; + chains >> positions; + streams.Init(positions); + return chains; +} + +}} // namespaces +#endif // LM_BUILDER_MULTI_STREAM__ diff --git a/klm/lm/builder/ngram.hh b/klm/lm/builder/ngram.hh new file mode 100644 index 00000000..2984ed0b --- /dev/null +++ b/klm/lm/builder/ngram.hh @@ -0,0 +1,84 @@ +#ifndef LM_BUILDER_NGRAM__ +#define LM_BUILDER_NGRAM__ + +#include "lm/weights.hh" +#include "lm/word_index.hh" + +#include <cstddef> + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +namespace lm { +namespace builder { + +struct Uninterpolated { + float prob; // Uninterpolated probability. + float gamma; // Interpolation weight for lower order. +}; + +union Payload { + uint64_t count; + Uninterpolated uninterp; + ProbBackoff complete; +}; + +class NGram { + public: + NGram(void *begin, std::size_t order) + : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {} + + const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); } + uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); } + + void ReBase(void *to) { + std::size_t difference = end_ - begin_; + begin_ = reinterpret_cast<WordIndex*>(to); + end_ = begin_ + difference; + } + + // Would do operator++ but that can get confusing for a stream. + void NextInMemory() { + ReBase(&Value() + 1); + } + + // Lower-case in deference to STL. + const WordIndex *begin() const { return begin_; } + WordIndex *begin() { return begin_; } + const WordIndex *end() const { return end_; } + WordIndex *end() { return end_; } + + const Payload &Value() const { return *reinterpret_cast<const Payload *>(end_); } + Payload &Value() { return *reinterpret_cast<Payload *>(end_); } + + uint64_t &Count() { return Value().count; } + const uint64_t Count() const { return Value().count; } + + std::size_t Order() const { return end_ - begin_; } + + static std::size_t TotalSize(std::size_t order) { + return order * sizeof(WordIndex) + sizeof(Payload); + } + std::size_t TotalSize() const { + // Compiler should optimize this. 
+ return TotalSize(Order()); + } + static std::size_t OrderFromSize(std::size_t size) { + std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); + assert(size == TotalSize(ret)); + return ret; + } + + private: + WordIndex *begin_, *end_; +}; + +const WordIndex kUNK = 0; +const WordIndex kBOS = 1; +const WordIndex kEOS = 2; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_NGRAM__ diff --git a/klm/lm/builder/ngram_stream.hh b/klm/lm/builder/ngram_stream.hh new file mode 100644 index 00000000..3c994664 --- /dev/null +++ b/klm/lm/builder/ngram_stream.hh @@ -0,0 +1,55 @@ +#ifndef LM_BUILDER_NGRAM_STREAM__ +#define LM_BUILDER_NGRAM_STREAM__ + +#include "lm/builder/ngram.hh" +#include "util/stream/chain.hh" +#include "util/stream/stream.hh" + +#include <cstddef> + +namespace lm { namespace builder { + +class NGramStream { + public: + NGramStream() : gram_(NULL, 0) {} + + NGramStream(const util::stream::ChainPosition &position) : gram_(NULL, 0) { + Init(position); + } + + void Init(const util::stream::ChainPosition &position) { + stream_.Init(position); + gram_ = NGram(stream_.Get(), NGram::OrderFromSize(position.GetChain().EntrySize())); + } + + NGram &operator*() { return gram_; } + const NGram &operator*() const { return gram_; } + + NGram *operator->() { return &gram_; } + const NGram *operator->() const { return &gram_; } + + void *Get() { return stream_.Get(); } + const void *Get() const { return stream_.Get(); } + + operator bool() const { return stream_; } + bool operator!() const { return !stream_; } + void Poison() { stream_.Poison(); } + + NGramStream &operator++() { + ++stream_; + gram_.ReBase(stream_.Get()); + return *this; + } + + private: + NGram gram_; + util::stream::Stream stream_; +}; + +inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream &str) { + str.Init(chain.Add()); + return chain; +} + +}} // namespaces +#endif // LM_BUILDER_NGRAM_STREAM__ diff --git a/klm/lm/builder/pipeline.cc b/klm/lm/builder/pipeline.cc new file mode 100644 index 00000000..14a1f721 --- /dev/null +++ b/klm/lm/builder/pipeline.cc @@ -0,0 +1,320 @@ +#include "lm/builder/pipeline.hh" + +#include "lm/builder/adjust_counts.hh" +#include "lm/builder/corpus_count.hh" +#include "lm/builder/initial_probabilities.hh" +#include "lm/builder/interpolate.hh" +#include "lm/builder/print.hh" +#include "lm/builder/sort.hh" + +#include "lm/sizes.hh" + +#include "util/exception.hh" +#include "util/file.hh" +#include "util/stream/io.hh" + +#include <algorithm> +#include <iostream> +#include <vector> + +namespace lm { namespace builder { + +namespace { +void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<Discount> &discounts) { + std::cerr << "Statistics:\n"; + for (size_t i = 0; i < counts.size(); ++i) { + std::cerr << (i + 1) << ' ' << counts[i]; + for (size_t d = 1; d <= 3; ++d) + std::cerr << " D" << d << (d == 3 ? "+=" : "=") << discounts[i].amount[d]; + std::cerr << '\n'; + } +} + +class Master { + public: + explicit Master(const PipelineConfig &config) + : config_(config), chains_(config.order), files_(config.order) { + config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block); + } + + const PipelineConfig &Config() const { return config_; } + + Chains &MutableChains() { return chains_; } + + template <class T> Master &operator>>(const T &worker) { + chains_ >> worker; + return *this; + } + + // This takes the (partially) sorted ngrams and sets up for adjusted counts. 
+ void InitForAdjust(util::stream::Sort<SuffixOrder, AddCombiner> &ngrams, WordIndex types) { + const std::size_t each_order_min = config_.minimum_block * config_.block_count; + // We know how many unigrams there are. Don't allocate more than needed to them. + const std::size_t min_chains = (config_.order - 1) * each_order_min + + std::min(types * NGram::TotalSize(1), each_order_min); + // Do merge sort with calculated laziness. + const std::size_t merge_using = ngrams.Merge(std::min(config_.TotalMemory() - min_chains, ngrams.DefaultLazy())); + + std::vector<uint64_t> count_bounds(1, types); + CreateChains(config_.TotalMemory() - merge_using, count_bounds); + ngrams.Output(chains_.back(), merge_using); + + // Setup unigram file. + files_.push_back(util::MakeTemp(config_.TempPrefix())); + } + + // For initial probabilities, but this is generic. + void SortAndReadTwice(const std::vector<uint64_t> &counts, Sorts<ContextOrder> &sorts, Chains &second, util::stream::ChainConfig second_config) { + // Do merge first before allocating chain memory. + for (std::size_t i = 1; i < config_.order; ++i) { + sorts[i - 1].Merge(0); + } + // There's no lazy merge, so just divide memory amongst the chains. + CreateChains(config_.TotalMemory(), counts); + chains_.back().ActivateProgress(); + chains_[0] >> files_[0].Source(); + second_config.entry_size = NGram::TotalSize(1); + second.push_back(second_config); + second.back() >> files_[0].Source(); + for (std::size_t i = 1; i < config_.order; ++i) { + util::scoped_fd fd(sorts[i - 1].StealCompleted()); + chains_[i].SetProgressTarget(util::SizeOrThrow(fd.get())); + chains_[i] >> util::stream::PRead(util::DupOrThrow(fd.get()), true); + second_config.entry_size = NGram::TotalSize(i + 1); + second.push_back(second_config); + second.back() >> util::stream::PRead(fd.release(), true); + } + } + + // There is no sort after this, so go for broke on lazy merging. + template <class Compare> void MaximumLazyInput(const std::vector<uint64_t> &counts, Sorts<Compare> &sorts) { + // Determine the minimum we can use for all the chains. + std::size_t min_chains = 0; + for (std::size_t i = 0; i < config_.order; ++i) { + min_chains += std::min(counts[i] * NGram::TotalSize(i + 1), static_cast<uint64_t>(config_.minimum_block)); + } + std::size_t for_merge = min_chains > config_.TotalMemory() ? 0 : (config_.TotalMemory() - min_chains); + std::vector<std::size_t> laziness; + // Prioritize longer n-grams. + for (util::stream::Sort<SuffixOrder> *i = sorts.end() - 1; i >= sorts.begin(); --i) { + laziness.push_back(i->Merge(for_merge)); + assert(for_merge >= laziness.back()); + for_merge -= laziness.back(); + } + std::reverse(laziness.begin(), laziness.end()); + + CreateChains(for_merge + min_chains, counts); + chains_.back().ActivateProgress(); + chains_[0] >> files_[0].Source(); + for (std::size_t i = 1; i < config_.order; ++i) { + sorts[i - 1].Output(chains_[i], laziness[i - 1]); + } + } + + void BufferFinal(const std::vector<uint64_t> &counts) { + chains_[0] >> files_[0].Sink(); + for (std::size_t i = 1; i < config_.order; ++i) { + files_.push_back(util::MakeTemp(config_.TempPrefix())); + chains_[i] >> files_[i].Sink(); + } + chains_.Wait(true); + // Use less memory. Because we can. 
+ CreateChains(std::min(config_.sort.buffer_size * config_.order, config_.TotalMemory()), counts); + for (std::size_t i = 0; i < config_.order; ++i) { + chains_[i] >> files_[i].Source(); + } + } + + template <class Compare> void SetupSorts(Sorts<Compare> &sorts) { + sorts.Init(config_.order - 1); + // Unigrams don't get sorted because their order is always the same. + chains_[0] >> files_[0].Sink(); + for (std::size_t i = 1; i < config_.order; ++i) { + sorts.push_back(chains_[i], config_.sort, Compare(i + 1)); + } + chains_.Wait(true); + } + + private: + // Create chains, allocating memory to them. Totally heuristic. Count + // bounds are upper bounds on the counts or not present. + void CreateChains(std::size_t remaining_mem, const std::vector<uint64_t> &count_bounds) { + std::vector<std::size_t> assignments; + assignments.reserve(config_.order); + // Start by assigning maximum memory usage (to be refined later). + for (std::size_t i = 0; i < count_bounds.size(); ++i) { + assignments.push_back(static_cast<std::size_t>(std::min( + static_cast<uint64_t>(remaining_mem), + count_bounds[i] * static_cast<uint64_t>(NGram::TotalSize(i + 1))))); + } + assignments.resize(config_.order, remaining_mem); + + // Now we know how much memory everybody wants. How much will they get? + // Proportional to this. + std::vector<float> portions; + // Indices of orders that have yet to be assigned. + std::vector<std::size_t> unassigned; + for (std::size_t i = 0; i < config_.order; ++i) { + portions.push_back(static_cast<float>((i+1) * NGram::TotalSize(i+1))); + unassigned.push_back(i); + } + /*If somebody doesn't eat their full dinner, give it to the rest of the + * family. Then somebody else might not eat their full dinner etc. Ends + * when everybody unassigned is hungry. + */ + float sum; + bool found_more; + std::vector<std::size_t> block_count(config_.order); + do { + sum = 0.0; + for (std::size_t i = 0; i < unassigned.size(); ++i) { + sum += portions[unassigned[i]]; + } + found_more = false; + // If the proportional assignment is more than needed, give it just what it needs. + for (std::vector<std::size_t>::iterator i = unassigned.begin(); i != unassigned.end();) { + if (assignments[*i] <= remaining_mem * (portions[*i] / sum)) { + remaining_mem -= assignments[*i]; + block_count[*i] = 1; + i = unassigned.erase(i); + found_more = true; + } else { + ++i; + } + } + } while (found_more); + for (std::vector<std::size_t>::iterator i = unassigned.begin(); i != unassigned.end(); ++i) { + assignments[*i] = remaining_mem * (portions[*i] / sum); + block_count[*i] = config_.block_count; + } + chains_.clear(); + std::cerr << "Chain sizes:"; + for (std::size_t i = 0; i < config_.order; ++i) { + std::cerr << ' ' << (i+1) << ":" << assignments[i]; + chains_.push_back(util::stream::ChainConfig(NGram::TotalSize(i + 1), block_count[i], assignments[i])); + } + std::cerr << std::endl; + } + + PipelineConfig config_; + + Chains chains_; + // Often only unigrams, but sometimes all orders. 
+ FixedArray<util::stream::FileBuffer> files_; +}; + +void CountText(int text_file /* input */, int vocab_file /* output */, Master &master, uint64_t &token_count, std::string &text_file_name) { + const PipelineConfig &config = master.Config(); + std::cerr << "=== 1/5 Counting and sorting n-grams ===" << std::endl; + + UTIL_THROW_IF(config.TotalMemory() < config.assume_vocab_hash_size, util::Exception, "Vocab hash size estimate " << config.assume_vocab_hash_size << " exceeds total memory " << config.TotalMemory()); + std::size_t memory_for_chain = + // This much memory to work with after vocab hash table. + static_cast<float>(config.TotalMemory() - config.assume_vocab_hash_size) / + // Solve for block size including the dedupe multiplier for one block. + (static_cast<float>(config.block_count) + CorpusCount::DedupeMultiplier(config.order)) * + // Chain likes memory expressed in terms of total memory. + static_cast<float>(config.block_count); + util::stream::Chain chain(util::stream::ChainConfig(NGram::TotalSize(config.order), config.block_count, memory_for_chain)); + + WordIndex type_count; + util::FilePiece text(text_file, NULL, &std::cerr); + text_file_name = text.FileName(); + CorpusCount counter(text, vocab_file, token_count, type_count, chain.BlockSize() / chain.EntrySize()); + chain >> boost::ref(counter); + + util::stream::Sort<SuffixOrder, AddCombiner> sorter(chain, config.sort, SuffixOrder(config.order), AddCombiner()); + chain.Wait(true); + std::cerr << "=== 2/5 Calculating and sorting adjusted counts ===" << std::endl; + master.InitForAdjust(sorter, type_count); +} + +void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary, FixedArray<util::stream::FileBuffer> &gammas) { + const PipelineConfig &config = master.Config(); + Chains second(config.order); + + { + Sorts<ContextOrder> sorts; + master.SetupSorts(sorts); + PrintStatistics(counts, discounts); + lm::ngram::ShowSizes(counts); + std::cerr << "=== 3/5 Calculating and sorting initial probabilities ===" << std::endl; + master.SortAndReadTwice(counts, sorts, second, config.initial_probs.adder_in); + } + + Chains gamma_chains(config.order); + InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains); + // Don't care about gamma for 0. + gamma_chains[0] >> util::stream::kRecycle; + gammas.Init(config.order - 1); + for (std::size_t i = 1; i < config.order; ++i) { + gammas.push_back(util::MakeTemp(config.TempPrefix())); + gamma_chains[i] >> gammas[i - 1].Sink(); + } + // Has to be done here due to gamma_chains scope. 
+ master.SetupSorts(primary); +} + +void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &master, Sorts<SuffixOrder> &primary, FixedArray<util::stream::FileBuffer> &gammas) { + std::cerr << "=== 4/5 Calculating and writing order-interpolated probabilities ===" << std::endl; + const PipelineConfig &config = master.Config(); + master.MaximumLazyInput(counts, primary); + + Chains gamma_chains(config.order - 1); + util::stream::ChainConfig read_backoffs(config.read_backoffs); + read_backoffs.entry_size = sizeof(float); + for (std::size_t i = 0; i < config.order - 1; ++i) { + gamma_chains.push_back(read_backoffs); + gamma_chains.back() >> gammas[i].Source(); + } + master >> Interpolate(counts[0], ChainPositions(gamma_chains)); + gamma_chains >> util::stream::kRecycle; + master.BufferFinal(counts); +} + +} // namespace + +void Pipeline(PipelineConfig config, int text_file, int out_arpa) { + // Some fail-fast sanity checks. + if (config.sort.buffer_size * 4 > config.TotalMemory()) { + config.sort.buffer_size = config.TotalMemory() / 4; + std::cerr << "Warning: changing sort block size to " << config.sort.buffer_size << " bytes due to low total memory." << std::endl; + } + if (config.minimum_block < NGram::TotalSize(config.order)) { + config.minimum_block = NGram::TotalSize(config.order); + std::cerr << "Warning: raising minimum block to " << config.minimum_block << " to fit an ngram in every block." << std::endl; + } + UTIL_THROW_IF(config.sort.buffer_size < config.minimum_block, util::Exception, "Sort block size " << config.sort.buffer_size << " is below the minimum block size " << config.minimum_block << "."); + UTIL_THROW_IF(config.TotalMemory() < config.minimum_block * config.order * config.block_count, util::Exception, + "Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ". Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size."); + + UTIL_TIMER("(%w s) Total wall time elapsed\n"); + Master master(config); + + util::scoped_fd vocab_file(config.vocab_file.empty() ? + util::MakeTemp(config.TempPrefix()) : + util::CreateOrThrow(config.vocab_file.c_str())); + uint64_t token_count; + std::string text_file_name; + CountText(text_file, vocab_file.get(), master, token_count, text_file_name); + + std::vector<uint64_t> counts; + std::vector<Discount> discounts; + master >> AdjustCounts(counts, discounts); + + { + FixedArray<util::stream::FileBuffer> gammas; + Sorts<SuffixOrder> primary; + InitialProbabilities(counts, discounts, master, primary, gammas); + InterpolateProbabilities(counts, master, primary, gammas); + } + + std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl; + VocabReconstitute vocab(vocab_file.get()); + UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?"); + HeaderInfo header_info(text_file_name, token_count); + master >> PrintARPA(vocab, counts, (config.verbose_header ? 
&header_info : NULL), out_arpa) >> util::stream::kRecycle; + master.MutableChains().Wait(true); +} + +}} // namespaces diff --git a/klm/lm/builder/pipeline.hh b/klm/lm/builder/pipeline.hh new file mode 100644 index 00000000..f1d6c5f6 --- /dev/null +++ b/klm/lm/builder/pipeline.hh @@ -0,0 +1,40 @@ +#ifndef LM_BUILDER_PIPELINE__ +#define LM_BUILDER_PIPELINE__ + +#include "lm/builder/initial_probabilities.hh" +#include "lm/builder/header_info.hh" +#include "util/stream/config.hh" +#include "util/file_piece.hh" + +#include <string> +#include <cstddef> + +namespace lm { namespace builder { + +struct PipelineConfig { + std::size_t order; + std::string vocab_file; + util::stream::SortConfig sort; + InitialProbabilitiesConfig initial_probs; + util::stream::ChainConfig read_backoffs; + bool verbose_header; + + // Amount of memory to assume that the vocabulary hash table will use. This + // is subtracted from total memory for CorpusCount. + std::size_t assume_vocab_hash_size; + + // Minimum block size to tolerate. + std::size_t minimum_block; + + // Number of blocks to use. This will be overridden to 1 if everything fits. + std::size_t block_count; + + const std::string &TempPrefix() const { return sort.temp_prefix; } + std::size_t TotalMemory() const { return sort.total_memory; } +}; + +// Takes ownership of text_file. +void Pipeline(PipelineConfig config, int text_file, int out_arpa); + +}} // namespaces +#endif // LM_BUILDER_PIPELINE__ diff --git a/klm/lm/builder/print.cc b/klm/lm/builder/print.cc new file mode 100644 index 00000000..b0323221 --- /dev/null +++ b/klm/lm/builder/print.cc @@ -0,0 +1,135 @@ +#include "lm/builder/print.hh" + +#include "util/double-conversion/double-conversion.h" +#include "util/double-conversion/utils.h" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/scoped.hh" +#include "util/stream/timer.hh" + +#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE +#include <boost/lexical_cast.hpp> + +#include <sstream> + +#include <string.h> + +namespace lm { namespace builder { + +VocabReconstitute::VocabReconstitute(int fd) { + uint64_t size = util::SizeOrThrow(fd); + util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_); + const char *const start = static_cast<const char*>(memory_.get()); + const char *i; + for (i = start; i != start + size; i += strlen(i) + 1) { + map_.push_back(i); + } + // Last one for LookupPiece. + map_.push_back(i); +} + +namespace { +class OutputManager { + public: + static const std::size_t kOutBuf = 1048576; + + // Does not take ownership of out. + explicit OutputManager(int out) + : buf_(util::MallocOrThrow(kOutBuf)), + builder_(static_cast<char*>(buf_.get()), kOutBuf), + // Mostly the default but with inf instead. And no flags. + convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0), + fd_(out) {} + + ~OutputManager() { + Flush(); + } + + OutputManager &operator<<(float value) { + // Odd, but this is the largest number found in the comments. + EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); + convert_.ToShortestSingle(value, &builder_); + return *this; + } + + OutputManager &operator<<(StringPiece str) { + if (str.size() > kOutBuf) { + Flush(); + util::WriteOrThrow(fd_, str.data(), str.size()); + } else { + EnsureRemaining(str.size()); + builder_.AddSubstring(str.data(), str.size()); + } + return *this; + } + + // Inefficient! 
+ OutputManager &operator<<(unsigned val) { + return *this << boost::lexical_cast<std::string>(val); + } + + OutputManager &operator<<(char c) { + EnsureRemaining(1); + builder_.AddCharacter(c); + return *this; + } + + void Flush() { + util::WriteOrThrow(fd_, buf_.get(), builder_.position()); + builder_.Reset(); + } + + private: + void EnsureRemaining(std::size_t amount) { + if (static_cast<std::size_t>(builder_.size() - builder_.position()) < amount) { + Flush(); + } + } + + util::scoped_malloc buf_; + double_conversion::StringBuilder builder_; + double_conversion::DoubleToStringConverter convert_; + int fd_; +}; +} // namespace + +PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd) + : vocab_(vocab), out_fd_(out_fd) { + std::stringstream stream; + + if (header_info) { + stream << "# Input file: " << header_info->input_file << '\n'; + stream << "# Token count: " << header_info->token_count << '\n'; + stream << "# Smoothing: Modified Kneser-Ney" << '\n'; + } + stream << "\\data\\\n"; + for (size_t i = 0; i < counts.size(); ++i) { + stream << "ngram " << (i+1) << '=' << counts[i] << '\n'; + } + stream << '\n'; + std::string as_string(stream.str()); + util::WriteOrThrow(out_fd, as_string.data(), as_string.size()); +} + +void PrintARPA::Run(const ChainPositions &positions) { + UTIL_TIMER("(%w s) Wrote ARPA file\n"); + OutputManager out(out_fd_); + for (unsigned order = 1; order <= positions.size(); ++order) { + out << "\\" << order << "-grams:" << '\n'; + for (NGramStream stream(positions[order - 1]); stream; ++stream) { + // Correcting for numerical precision issues. Take that IRST. + out << std::min(0.0f, stream->Value().complete.prob) << '\t' << vocab_.Lookup(*stream->begin()); + for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { + out << ' ' << vocab_.Lookup(*i); + } + float backoff = stream->Value().complete.backoff; + if (backoff != 0.0) + out << '\t' << backoff; + out << '\n'; + } + out << '\n'; + } + out << "\\end\\\n"; +} + +}} // namespaces diff --git a/klm/lm/builder/print.hh b/klm/lm/builder/print.hh new file mode 100644 index 00000000..aa932e75 --- /dev/null +++ b/klm/lm/builder/print.hh @@ -0,0 +1,102 @@ +#ifndef LM_BUILDER_PRINT__ +#define LM_BUILDER_PRINT__ + +#include "lm/builder/ngram.hh" +#include "lm/builder/multi_stream.hh" +#include "lm/builder/header_info.hh" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/string_piece.hh" + +#include <ostream> + +#include <assert.h> + +// Warning: print routines read all unigrams before all bigrams before all +// trigrams etc. So if other parts of the chain move jointly, you'll have to +// buffer. + +namespace lm { namespace builder { + +class VocabReconstitute { + public: + // fd must be alive for life of this object; does not take ownership. + explicit VocabReconstitute(int fd); + + const char *Lookup(WordIndex index) const { + assert(index < map_.size() - 1); + return map_[index]; + } + + StringPiece LookupPiece(WordIndex index) const { + return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); + } + + std::size_t Size() const { + // There's an extra entry to support StringPiece lengths. + return map_.size() - 1; + } + + private: + util::scoped_memory memory_; + std::vector<const char*> map_; +}; + +// Not defined, only specialized. 
+template <class T> void PrintPayload(std::ostream &to, const Payload &payload); +template <> inline void PrintPayload<uint64_t>(std::ostream &to, const Payload &payload) { + to << payload.count; +} +template <> inline void PrintPayload<Uninterpolated>(std::ostream &to, const Payload &payload) { + to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma); +} +template <> inline void PrintPayload<ProbBackoff>(std::ostream &to, const Payload &payload) { + to << payload.complete.prob << ' ' << payload.complete.backoff; +} + +// template parameter is the type stored. +template <class V> class Print { + public: + explicit Print(const VocabReconstitute &vocab, std::ostream &to) : vocab_(vocab), to_(to) {} + + void Run(const ChainPositions &chains) { + NGramStreams streams(chains); + for (NGramStream *s = streams.begin(); s != streams.end(); ++s) { + DumpStream(*s); + } + } + + void Run(const util::stream::ChainPosition &position) { + NGramStream stream(position); + DumpStream(stream); + } + + private: + void DumpStream(NGramStream &stream) { + for (; stream; ++stream) { + PrintPayload<V>(to_, stream->Value()); + for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) { + to_ << ' ' << vocab_.Lookup(*w) << '=' << *w; + } + to_ << '\n'; + } + } + + const VocabReconstitute &vocab_; + std::ostream &to_; +}; + +class PrintARPA { + public: + // header_info may be NULL to disable the header + explicit PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd); + + void Run(const ChainPositions &positions); + + private: + const VocabReconstitute &vocab_; + int out_fd_; +}; + +}} // namespaces +#endif // LM_BUILDER_PRINT__ diff --git a/klm/lm/builder/sort.hh b/klm/lm/builder/sort.hh new file mode 100644 index 00000000..9989389b --- /dev/null +++ b/klm/lm/builder/sort.hh @@ -0,0 +1,103 @@ +#ifndef LM_BUILDER_SORT__ +#define LM_BUILDER_SORT__ + +#include "lm/builder/multi_stream.hh" +#include "lm/builder/ngram.hh" +#include "lm/word_index.hh" +#include "util/stream/sort.hh" + +#include "util/stream/timer.hh" + +#include <functional> +#include <string> + +namespace lm { +namespace builder { + +template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> { + public: + explicit Comparator(std::size_t order) : order_(order) {} + + inline bool operator()(const void *lhs, const void *rhs) const { + return static_cast<const Child*>(this)->Compare(static_cast<const WordIndex*>(lhs), static_cast<const WordIndex*>(rhs)); + } + + std::size_t Order() const { return order_; } + + protected: + std::size_t order_; +}; + +class SuffixOrder : public Comparator<SuffixOrder> { + public: + explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {} + + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (std::size_t i = order_ - 1; i != 0; --i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return lhs[0] < rhs[0]; + } + + static const unsigned kMatchOffset = 1; +}; + +class ContextOrder : public Comparator<ContextOrder> { + public: + explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {} + + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (int i = order_ - 2; i >= 0; --i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return lhs[order_ - 1] < rhs[order_ - 1]; + } +}; + +class PrefixOrder : public Comparator<PrefixOrder> { + public: + explicit PrefixOrder(std::size_t order) : 
Comparator<PrefixOrder>(order) {} + + inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const { + for (std::size_t i = 0; i < order_; ++i) { + if (lhs[i] != rhs[i]) + return lhs[i] < rhs[i]; + } + return false; + } + + static const unsigned kMatchOffset = 0; +}; + +// Sum counts for the same n-gram. +struct AddCombiner { + bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { + NGram first(first_void, compare.Order()); + // There isn't a const version of NGram. + NGram second(const_cast<void*>(second_void), compare.Order()); + if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; + first.Count() += second.Count(); + return true; + } +}; + +// The combiner is only used on a single chain, so I didn't bother to allow +// that template. +template <class Compare> class Sorts : public FixedArray<util::stream::Sort<Compare> > { + private: + typedef util::stream::Sort<Compare> S; + typedef FixedArray<S> P; + + public: + void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare) { + new (P::end()) S(chain, config, compare); + P::Constructed(); + } +}; + +} // namespace builder +} // namespace lm + +#endif // LM_BUILDER_SORT__ diff --git a/klm/lm/config.cc b/klm/lm/config.cc index f9d988ca..9520c41c 100644 --- a/klm/lm/config.cc +++ b/klm/lm/config.cc @@ -6,6 +6,7 @@ namespace lm { namespace ngram { Config::Config() : + show_progress(true), messages(&std::cerr), enumerate_vocab(NULL), unknown_missing(COMPLAIN), diff --git a/klm/lm/config.hh b/klm/lm/config.hh index 739cee9c..0de7b7c6 100644 --- a/klm/lm/config.hh +++ b/klm/lm/config.hh @@ -11,46 +11,52 @@ /* Configuration for ngram model. Separate header to reduce pollution. */ namespace lm { - + class EnumerateVocab; namespace ngram { struct Config { - // EFFECTIVE FOR BOTH ARPA AND BINARY READS + // EFFECTIVE FOR BOTH ARPA AND BINARY READS + + // (default true) print progress bar to messages + bool show_progress; // Where to log messages including the progress bar. Set to NULL for // silence. std::ostream *messages; + std::ostream *ProgressMessages() const { + return show_progress ? messages : 0; + } + // This will be called with every string in the vocabulary. See // enumerate_vocab.hh for more detail. Config does not take ownership; you - // are still responsible for deleting it (or stack allocating). + // are still responsible for deleting it (or stack allocating). EnumerateVocab *enumerate_vocab; - // ONLY EFFECTIVE WHEN READING ARPA - // What to do when <unk> isn't in the provided model. + // What to do when <unk> isn't in the provided model. WarningAction unknown_missing; - // What to do when <s> or </s> is missing from the model. - // If THROW_UP, the exception will be of type util::SpecialWordMissingException. + // What to do when <s> or </s> is missing from the model. + // If THROW_UP, the exception will be of type util::SpecialWordMissingException. WarningAction sentence_marker_missing; // What to do with a positive log probability. For COMPLAIN and SILENT, map - // to 0. + // to 0. WarningAction positive_log_probability; - // The probability to substitute for <unk> if it's missing from the model. + // The probability to substitute for <unk> if it's missing from the model. // No effect if the model has <unk> or unknown_missing == THROW_UP. float unknown_missing_logprob; // Size multiplier for probing hash table. Must be > 1. Space is linear in // this. 
Time is probing_multiplier / (probing_multiplier - 1). No effect - // for sorted variant. + // for sorted variant. // If you find yourself setting this to a low number, consider using the - // TrieModel which has lower memory consumption. + // TrieModel which has lower memory consumption. float probing_multiplier; // Amount of memory to use for building. The actual memory usage will be @@ -58,10 +64,10 @@ struct Config { // models. std::size_t building_memory; - // Template for temporary directory appropriate for passing to mkdtemp. + // Template for temporary directory appropriate for passing to mkdtemp. // The characters XXXXXX are appended before passing to mkdtemp. Only // applies to trie. If NULL, defaults to write_mmap. If that's NULL, - // defaults to input file name. + // defaults to input file name. const char *temporary_directory_prefix; // Level of complaining to do when loading from ARPA instead of binary format. @@ -69,49 +75,46 @@ struct Config { ARPALoadComplain arpa_complain; // While loading an ARPA file, also write out this binary format file. Set - // to NULL to disable. + // to NULL to disable. const char *write_mmap; enum WriteMethod { - WRITE_MMAP, // Map the file directly. - WRITE_AFTER // Write after we're done. + WRITE_MMAP, // Map the file directly. + WRITE_AFTER // Write after we're done. }; WriteMethod write_method; - // Include the vocab in the binary file? Only effective if write_mmap != NULL. + // Include the vocab in the binary file? Only effective if write_mmap != NULL. bool include_vocab; - // Left rest options. Only used when the model includes rest costs. + // Left rest options. Only used when the model includes rest costs. enum RestFunction { REST_MAX, // Maximum of any score to the left - REST_LOWER, // Use lower-order files given below. + REST_LOWER, // Use lower-order files given below. }; RestFunction rest_function; - // Only used for REST_LOWER. + // Only used for REST_LOWER. std::vector<std::string> rest_lower_files; - // Quantization options. Only effective for QuantTrieModel. One value is // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used - // to quantize (and one of the remaining backoffs will be 0). + // to quantize (and one of the remaining backoffs will be 0). uint8_t prob_bits, backoff_bits; // Bhiksha compression (simple form). Only works with trie. uint8_t pointer_bhiksha_bits; - - + // ONLY EFFECTIVE WHEN READING BINARY - + // How to get the giant array into memory: lazy mmap, populate, read etc. - // See util/mmap.hh for details of MapMethod. + // See util/mmap.hh for details of MapMethod. util::LoadMethod load_method; - - // Set defaults. + // Set defaults. 
Config(); }; diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc new file mode 100644 index 00000000..f8568ac4 --- /dev/null +++ b/klm/lm/filter/arpa_io.cc @@ -0,0 +1,108 @@ +#include "lm/filter/arpa_io.hh" +#include "util/file_piece.hh" + +#include <iostream> +#include <ostream> +#include <string> +#include <vector> + +#include <ctype.h> +#include <errno.h> +#include <string.h> + +namespace lm { + +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { + *this << message; +} + +ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { + *this << message << " in line " << line; +} + +ARPAInputException::~ARPAInputException() throw() {} + +ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() { + *this << message << " in file " << file_name; +} + +ARPAOutputException::~ARPAOutputException() throw() {} + +// Seeking is the responsibility of the caller. +void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) { + out << "\n\\data\\\n"; + for (unsigned int i = 0; i < number.size(); ++i) { + out << "ngram " << i+1 << "=" << number[i] << '\n'; + } + out << '\n'; +} + +size_t SizeNeededForCounts(const std::vector<uint64_t> &number) { + std::ostringstream buf; + WriteCounts(buf, number); + return buf.tellp(); +} + +bool IsEntirelyWhiteSpace(const StringPiece &line) { + for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) { + if (!isspace(line.data()[i])) return false; + } + return true; +} + +ARPAOutput::ARPAOutput(const char *name, size_t buffer_size) : file_name_(name), buffer_(new char[buffer_size]) { + try { + file_.exceptions(std::ostream::eofbit | std::ostream::failbit | std::ostream::badbit); + if (!file_.rdbuf()->pubsetbuf(buffer_.get(), buffer_size)) { + std::cerr << "Warning: could not enlarge buffer for " << name << std::endl; + buffer_.reset(); + } + file_.open(name, std::ios::out | std::ios::binary); + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Opening", file_name_); + } +} + +void ARPAOutput::ReserveForCounts(std::streampos reserve) { + try { + for (std::streampos i = 0; i < reserve; i += std::streampos(1)) { + file_ << '\n'; + } + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Writing blanks to reserve space for counts to ", file_name_); + } +} + +void ARPAOutput::BeginLength(unsigned int length) { + fast_counter_ = 0; + try { + file_ << '\\' << length << "-grams:" << '\n'; + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Writing n-gram header to ", file_name_); + } +} + +void ARPAOutput::EndLength(unsigned int length) { + try { + file_ << '\n'; + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Writing blank at end of count list to ", file_name_); + } + if (length > counts_.size()) { + counts_.resize(length); + } + counts_[length - 1] = fast_counter_; +} + +void ARPAOutput::Finish() { + try { + file_ << "\\end\\\n"; + file_.seekp(0); + WriteCounts(file_, counts_); + file_ << std::flush; + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Finishing including writing counts at beginning to ", file_name_); + } +} + +} // namespace lm diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh new file mode 100644 index 00000000..5b31620b --- /dev/null +++ b/klm/lm/filter/arpa_io.hh @@ -0,0 +1,115 @@ +#ifndef LM_FILTER_ARPA_IO__ +#define LM_FILTER_ARPA_IO__ +/* Input and output for ARPA format language 
model files. + */ +#include "lm/read_arpa.hh" +#include "util/exception.hh" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" + +#include <boost/noncopyable.hpp> +#include <boost/scoped_array.hpp> + +#include <fstream> +#include <string> +#include <vector> + +#include <err.h> +#include <string.h> +#include <stdint.h> + +namespace util { class FilePiece; } + +namespace lm { + +class ARPAInputException : public util::Exception { + public: + explicit ARPAInputException(const StringPiece &message) throw(); + explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); + virtual ~ARPAInputException() throw(); +}; + +class ARPAOutputException : public util::ErrnoException { + public: + ARPAOutputException(const char *prefix, const std::string &file_name) throw(); + virtual ~ARPAOutputException() throw(); + + const std::string &File() const throw() { return file_name_; } + + private: + const std::string file_name_; +}; + +// Handling for the counts of n-grams at the beginning of ARPA files. +size_t SizeNeededForCounts(const std::vector<uint64_t> &number); + +/* Writes an ARPA file. This has to be seekable so the counts can be written + * at the end. Hence, I just have it own a std::fstream instead of accepting + * a separately held std::ostream. TODO: use the fast one from estimation. + */ +class ARPAOutput : boost::noncopyable { + public: + explicit ARPAOutput(const char *name, size_t buffer_size = 65536); + + void ReserveForCounts(std::streampos reserve); + + void BeginLength(unsigned int length); + + void AddNGram(const StringPiece &line) { + try { + file_ << line << '\n'; + } catch (const std::ios_base::failure &f) { + throw ARPAOutputException("Writing an n-gram", file_name_); + } + ++fast_counter_; + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + AddNGram(line); + } + + template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + AddNGram(line); + } + + void EndLength(unsigned int length); + + void Finish(); + + private: + const std::string file_name_; + boost::scoped_array<char> buffer_; + std::fstream file_; + size_t fast_counter_; + std::vector<uint64_t> counts_; +}; + + +template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { + ReadNGramHeader(in, length); + out.BeginLength(length); + for (uint64_t i = 0; i < number; ++i) { + StringPiece line = in.ReadLine(); + util::TokenIter<util::SingleCharacter> tabber(line, '\t'); + if (!tabber) throw ARPAInputException("blank line", line); + if (!++tabber) throw ARPAInputException("no tab", line); + + out.AddNGram(*tabber, line); + } + out.EndLength(length); +} + +template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) { + std::vector<uint64_t> number; + ReadARPACounts(in_lm, number); + out.ReserveForCounts(SizeNeededForCounts(number)); + for (unsigned int i = 0; i < number.size(); ++i) { + ReadNGrams(in_lm, i + 1, number[i], out); + } + ReadEnd(in_lm); + out.Finish(); +} + +} // namespace lm + +#endif // LM_FILTER_ARPA_IO__ diff --git a/klm/lm/filter/count_io.hh b/klm/lm/filter/count_io.hh new file mode 100644 index 00000000..97c0fa25 --- /dev/null +++ b/klm/lm/filter/count_io.hh @@ -0,0 +1,91 @@ +#ifndef LM_FILTER_COUNT_IO__ +#define LM_FILTER_COUNT_IO__ + +#include <fstream> +#include <iostream> +#include <string> + +#include <err.h> + +#include "util/file_piece.hh" + +namespace lm { + +class CountOutput : boost::noncopyable { + public: + 
explicit CountOutput(const char *name) : file_(name, std::ios::out) {} + + void AddNGram(const StringPiece &line) { + if (!(file_ << line << '\n')) { + err(3, "Writing counts file failed"); + } + } + + template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + AddNGram(line); + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + AddNGram(line); + } + + private: + std::fstream file_; +}; + +class CountBatch { + public: + explicit CountBatch(std::streamsize initial_read) + : initial_read_(initial_read) { + buffer_.reserve(initial_read); + } + + void Read(std::istream &in) { + buffer_.resize(initial_read_); + in.read(&*buffer_.begin(), initial_read_); + buffer_.resize(in.gcount()); + char got; + while (in.get(got) && got != '\n') + buffer_.push_back(got); + } + + template <class Output> void Send(Output &out) { + for (util::TokenIter<util::SingleCharacter> line(StringPiece(&*buffer_.begin(), buffer_.size()), '\n'); line; ++line) { + util::TokenIter<util::SingleCharacter> tabber(*line, '\t'); + if (!tabber) { + std::cerr << "Warning: empty n-gram count line being removed\n"; + continue; + } + util::TokenIter<util::SingleCharacter, true> words(*tabber, ' '); + if (!words) { + std::cerr << "Line has a tab but no words.\n"; + continue; + } + out.AddNGram(words, util::TokenIter<util::SingleCharacter, true>::end(), *line); + } + } + + private: + std::streamsize initial_read_; + + // This could have been a std::string but that's less happy with raw writes. + std::vector<char> buffer_; +}; + +template <class Output> void ReadCount(util::FilePiece &in_file, Output &out) { + try { + while (true) { + StringPiece line = in_file.ReadLine(); + util::TokenIter<util::SingleCharacter> tabber(line, '\t'); + if (!tabber) { + std::cerr << "Warning: empty n-gram count line being removed\n"; + continue; + } + out.AddNGram(*tabber, line); + } + } catch (const util::EndOfFileException &e) {} +} + +} // namespace lm + +#endif // LM_FILTER_COUNT_IO__ diff --git a/klm/lm/filter/filter_main.cc b/klm/lm/filter/filter_main.cc new file mode 100644 index 00000000..1a4ba84f --- /dev/null +++ b/klm/lm/filter/filter_main.cc @@ -0,0 +1,248 @@ +#include "lm/filter/arpa_io.hh" +#include "lm/filter/format.hh" +#include "lm/filter/phrase.hh" +#ifndef NTHREAD +#include "lm/filter/thread.hh" +#endif +#include "lm/filter/vocab.hh" +#include "lm/filter/wrapper.hh" +#include "util/file_piece.hh" + +#include <boost/ptr_container/ptr_vector.hpp> + +#include <cstring> +#include <fstream> +#include <iostream> +#include <memory> + +namespace lm { +namespace { + +void DisplayHelp(const char *name) { + std::cerr + << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" + "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" + " parser.\n" + "single mode treats the entire input as a single sentence.\n" + "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" + " a separate line. 
A separate file is created for each file by appending the\n" + " 0-indexed line number to the output file name.\n" + "union mode produces one filtered model that is the union of models created by\n" + " multiple mode.\n\n" + "context means only the context (all but last word) has to pass the filter, but\n" + " the entire n-gram is output.\n\n" + "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" + " phrases can generate the n-gram when assembled in arbitrary order and\n" + " clipped. Currently works with multiple or union mode.\n\n" + "The file format is set by [raw|arpa] with default arpa:\n" + "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" + " text. This is useful for ngram count files.\n" + "arpa means the ARPA file format for n-gram language models.\n\n" +#ifndef NTHREAD + "threads:m sets m threads (default: conccurrency detected by boost)\n" + "batch_size:m sets the batch size for threading. Expect memory usage from this\n" + " of 2*threads*batch_size n-grams.\n\n" +#else + "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" + " threading, compile without this flag against Boost >=1.42.0.\n\n" +#endif + "There are two inputs: vocabulary and model. Either may be given as a file\n" + " while the other is on stdin. Specify the type given as a file using\n" + " vocab: or model: before the file name. \n\n" + "For ARPA format, the output must be seekable. For raw format, it can be a\n" + " stream i.e. /dev/stdout\n"; +} + +typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; +typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; + +struct Config { + Config() : +#ifndef NTHREAD + batch_size(25000), + threads(boost::thread::hardware_concurrency()), +#endif + phrase(false), + context(false), + format(FORMAT_ARPA) + { +#ifndef NTHREAD + if (!threads) threads = 1; +#endif + } + +#ifndef NTHREAD + size_t batch_size; + size_t threads; +#endif + bool phrase; + bool context; + FilterMode mode; + Format format; +}; + +template <class Format, class Filter, class OutputBuffer, class Output> void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { +#ifndef NTHREAD + if (config.threads == 1) { +#endif + Format::RunFilter(in_lm, filter, output); +#ifndef NTHREAD + } else { + typedef Controller<Filter, OutputBuffer, Output> Threaded; + Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); + Format::RunFilter(in_lm, threading, output); + } +#endif +} + +template <class Format, class Filter, class OutputBuffer, class Output> void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { + if (config.context) { + ContextFilter<Filter> context_filter(filter); + RunThreadedFilter<Format, ContextFilter<Filter>, OutputBuffer, Output>(config, in_lm, context_filter, output); + } else { + RunThreadedFilter<Format, Filter, OutputBuffer, Output>(config, in_lm, filter, output); + } +} + +template <class Format, class Binary> void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { + typedef BinaryFilter<Binary> Filter; + RunContextFilter<Format, Filter, BinaryOutputBuffer, typename Format::Output>(config, in_lm, Filter(binary), out); +} + +template <class Format> void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { + if (config.mode == 
MODE_MULTIPLE) { + if (config.phrase) { + typedef phrase::Multiple Filter; + phrase::Substrings substrings; + typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); + RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(substrings), out); + } else { + typedef vocab::Multiple Filter; + boost::unordered_map<std::string, std::vector<unsigned int> > words; + typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); + RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(words), out); + } + return; + } + + typename Format::Output out(out_name); + + if (config.mode == MODE_COPY) { + Format::Copy(in_lm, out); + return; + } + + if (config.mode == MODE_SINGLE) { + vocab::Single::Words words; + vocab::ReadSingle(in_vocab, words); + DispatchBinaryFilter<Format, vocab::Single>(config, in_lm, vocab::Single(words), out); + return; + } + + if (config.mode == MODE_UNION) { + if (config.phrase) { + phrase::Substrings substrings; + phrase::ReadMultiple(in_vocab, substrings); + DispatchBinaryFilter<Format, phrase::Union>(config, in_lm, phrase::Union(substrings), out); + } else { + vocab::Union::Words words; + vocab::ReadMultiple(in_vocab, words); + DispatchBinaryFilter<Format, vocab::Union>(config, in_lm, vocab::Union(words), out); + } + return; + } +} + +} // namespace +} // namespace lm + +int main(int argc, char *argv[]) { + if (argc < 4) { + lm::DisplayHelp(argv[0]); + return 1; + } + + // I used to have boost::program_options, but some users didn't want to compile boost. + lm::Config config; + config.mode = lm::MODE_UNSET; + for (int i = 1; i < argc - 2; ++i) { + const char *str = argv[i]; + if (!std::strcmp(str, "copy")) { + config.mode = lm::MODE_COPY; + } else if (!std::strcmp(str, "single")) { + config.mode = lm::MODE_SINGLE; + } else if (!std::strcmp(str, "multiple")) { + config.mode = lm::MODE_MULTIPLE; + } else if (!std::strcmp(str, "union")) { + config.mode = lm::MODE_UNION; + } else if (!std::strcmp(str, "phrase")) { + config.phrase = true; + } else if (!std::strcmp(str, "context")) { + config.context = true; + } else if (!std::strcmp(str, "arpa")) { + config.format = lm::FORMAT_ARPA; + } else if (!std::strcmp(str, "raw")) { + config.format = lm::FORMAT_COUNT; +#ifndef NTHREAD + } else if (!std::strncmp(str, "threads:", 8)) { + config.threads = boost::lexical_cast<size_t>(str + 8); + if (!config.threads) { + std::cerr << "Specify at least one thread." << std::endl; + return 1; + } + } else if (!std::strncmp(str, "batch_size:", 11)) { + config.batch_size = boost::lexical_cast<size_t>(str + 11); + if (config.batch_size < 5000) { + std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; + if (!config.batch_size) return 1; + } +#endif + } else { + lm::DisplayHelp(argv[0]); + return 1; + } + } + + if (config.mode == lm::MODE_UNSET) { + lm::DisplayHelp(argv[0]); + return 1; + } + + if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { + std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." 
<< std::endl; + return 1; + } + + bool cmd_is_model = true; + const char *cmd_input = argv[argc - 2]; + if (!strncmp(cmd_input, "vocab:", 6)) { + cmd_is_model = false; + cmd_input += 6; + } else if (!strncmp(cmd_input, "model:", 6)) { + cmd_input += 6; + } else if (strchr(cmd_input, ':')) { + errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); + } else { + std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; + } + std::ifstream cmd_file; + std::istream *vocab; + if (cmd_is_model) { + vocab = &std::cin; + } else { + cmd_file.open(cmd_input, std::ios::in); + if (!cmd_file) { + err(2, "Could not open input file %s", cmd_input); + } + vocab = &cmd_file; + } + + util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); + + if (config.format == lm::FORMAT_ARPA) { + lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]); + } else if (config.format == lm::FORMAT_COUNT) { + lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]); + } + return 0; +} diff --git a/klm/lm/filter/format.hh b/klm/lm/filter/format.hh new file mode 100644 index 00000000..7f945b0d --- /dev/null +++ b/klm/lm/filter/format.hh @@ -0,0 +1,250 @@ +#ifndef LM_FILTER_FORMAT_H__ +#define LM_FITLER_FORMAT_H__ + +#include "lm/filter/arpa_io.hh" +#include "lm/filter/count_io.hh" + +#include <boost/lexical_cast.hpp> +#include <boost/ptr_container/ptr_vector.hpp> + +#include <iosfwd> + +namespace lm { + +template <class Single> class MultipleOutput { + private: + typedef boost::ptr_vector<Single> Singles; + typedef typename Singles::iterator SinglesIterator; + + public: + MultipleOutput(const char *prefix, size_t number) { + files_.reserve(number); + std::string tmp; + for (unsigned int i = 0; i < number; ++i) { + tmp = prefix; + tmp += boost::lexical_cast<std::string>(i); + files_.push_back(new Single(tmp.c_str())); + } + } + + void AddNGram(const StringPiece &line) { + for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) + i->AddNGram(line); + } + + template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) + i->AddNGram(begin, end, line); + } + + void SingleAddNGram(size_t offset, const StringPiece &line) { + files_[offset].AddNGram(line); + } + + template <class Iterator> void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) { + files_[offset].AddNGram(begin, end, line); + } + + protected: + Singles files_; +}; + +class MultipleARPAOutput : public MultipleOutput<ARPAOutput> { + public: + MultipleARPAOutput(const char *prefix, size_t number) : MultipleOutput<ARPAOutput>(prefix, number) {} + + void ReserveForCounts(std::streampos reserve) { + for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) + i->ReserveForCounts(reserve); + } + + void BeginLength(unsigned int length) { + for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) + i->BeginLength(length); + } + + void EndLength(unsigned int length) { + for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) + i->EndLength(length); + } + + void Finish() { + for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) + i->Finish(); + } +}; + +template <class Filter, class Output> class DispatchInput { + public: 
+ DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {} + +/* template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { + filter_.AddNGram(begin, end, line, output_); + }*/ + + void AddNGram(const StringPiece &ngram, const StringPiece &line) { + filter_.AddNGram(ngram, line, output_); + } + + protected: + Filter &filter_; + Output &output_; +}; + +template <class Filter, class Output> class DispatchARPAInput : public DispatchInput<Filter, Output> { + private: + typedef DispatchInput<Filter, Output> B; + + public: + DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {} + + void ReserveForCounts(std::streampos reserve) { B::output_.ReserveForCounts(reserve); } + void BeginLength(unsigned int length) { B::output_.BeginLength(length); } + + void EndLength(unsigned int length) { + B::filter_.Flush(); + B::output_.EndLength(length); + } + void Finish() { B::output_.Finish(); } +}; + +struct ARPAFormat { + typedef ARPAOutput Output; + typedef MultipleARPAOutput Multiple; + static void Copy(util::FilePiece &in, Output &out) { + ReadARPA(in, out); + } + template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { + DispatchARPAInput<Filter, Out> dispatcher(filter, output); + ReadARPA(in, dispatcher); + } +}; + +struct CountFormat { + typedef CountOutput Output; + typedef MultipleOutput<Output> Multiple; + static void Copy(util::FilePiece &in, Output &out) { + ReadCount(in, out); + } + template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { + DispatchInput<Filter, Out> dispatcher(filter, output); + ReadCount(in, dispatcher); + } +}; + +/* For multithreading, the buffer classes hold batches of filter inputs and + * outputs in memory. The strings get reused a lot, so keep them around + * instead of clearing each time. + */ +class InputBuffer { + public: + InputBuffer() : actual_(0) {} + + void Reserve(size_t size) { lines_.reserve(size); } + + template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + if (lines_.size() == actual_) lines_.resize(lines_.size() + 1); + // TODO avoid this copy. 
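// Note on the copy below: the batch does not own the memory behind the incoming `line`
// and `ngram` StringPieces, so the line is deep-copied into the batch's own std::string
// and the ngram StringPiece is re-based to the same offset inside that copy before the
// batch is handed off to a filter worker thread.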
+ std::string &copied = lines_[actual_].line; + copied.assign(line.data(), line.size()); + lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size()); + ++actual_; + } + + template <class Filter, class Output> void CallFilter(Filter &filter, Output &output) const { + for (std::vector<Line>::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) { + filter.AddNGram(i->ngram, i->line, output); + } + } + + void Clear() { actual_ = 0; } + bool Empty() { return actual_ == 0; } + size_t Size() { return actual_; } + + private: + struct Line { + std::string line; + StringPiece ngram; + }; + + size_t actual_; + + std::vector<Line> lines_; +}; + +class BinaryOutputBuffer { + public: + BinaryOutputBuffer() {} + + void Reserve(size_t size) { + lines_.reserve(size); + } + + void AddNGram(const StringPiece &line) { + lines_.push_back(line); + } + + template <class Output> void Flush(Output &output) { + for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { + output.AddNGram(*i); + } + lines_.clear(); + } + + private: + std::vector<StringPiece> lines_; +}; + +class MultipleOutputBuffer { + public: + MultipleOutputBuffer() : last_(NULL) {} + + void Reserve(size_t size) { + annotated_.reserve(size); + } + + void AddNGram(const StringPiece &line) { + annotated_.resize(annotated_.size() + 1); + annotated_.back().line = line; + } + + void SingleAddNGram(size_t offset, const StringPiece &line) { + if ((line.data() == last_.data()) && (line.length() == last_.length())) { + annotated_.back().systems.push_back(offset); + } else { + annotated_.resize(annotated_.size() + 1); + annotated_.back().systems.push_back(offset); + annotated_.back().line = line; + last_ = line; + } + } + + template <class Output> void Flush(Output &output) { + for (std::vector<Annotated>::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) { + if (i->systems.empty()) { + output.AddNGram(i->line); + } else { + for (std::vector<size_t>::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) { + output.SingleAddNGram(*j, i->line); + } + } + } + annotated_.clear(); + } + + private: + struct Annotated { + // If this is empty, send to all systems. + // A filter should never send to all systems and send to a single one. + std::vector<size_t> systems; + StringPiece line; + }; + + StringPiece last_; + + std::vector<Annotated> annotated_; +}; + +} // namespace lm + +#endif // LM_FILTER_FORMAT_H__ diff --git a/klm/lm/filter/phrase.cc b/klm/lm/filter/phrase.cc new file mode 100644 index 00000000..1bef2a3f --- /dev/null +++ b/klm/lm/filter/phrase.cc @@ -0,0 +1,281 @@ +#include "lm/filter/phrase.hh" + +#include "lm/filter/format.hh" + +#include <algorithm> +#include <functional> +#include <iostream> +#include <queue> +#include <string> +#include <vector> + +#include <ctype.h> + +namespace lm { +namespace phrase { + +unsigned int ReadMultiple(std::istream &in, Substrings &out) { + bool sentence_content = false; + unsigned int sentence_id = 0; + std::vector<Hash> phrase; + std::string word; + while (in) { + char c; + // Gather a word. + while (!isspace(c = in.get()) && in) word += c; + // Treat EOF like a newline. + if (!in) c = '\n'; + // Add the word to the phrase. + if (!word.empty()) { + phrase.push_back(util::MurmurHashNative(word.data(), word.size())); + word.clear(); + } + if (c == ' ') continue; + // It's more than just a space. Close out the phrase. 
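// Example: the input line "in the<TAB>car" yields two phrases, [in the] and [car],
// both recorded under the same 0-indexed sentence id; the next input line starts a new id.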
+ if (!phrase.empty()) { + sentence_content = true; + out.AddPhrase(sentence_id, phrase.begin(), phrase.end()); + phrase.clear(); + } + if (c == '\t' || c == '\v') continue; + // It's more than a space or tab: a newline. + if (sentence_content) { + ++sentence_id; + sentence_content = false; + } + } + if (!in.eof()) in.exceptions(std::istream::failbit | std::istream::badbit); + return sentence_id + sentence_content; +} + +namespace detail { const StringPiece kEndSentence("</s>"); } + +namespace { + +typedef unsigned int Sentence; +typedef std::vector<Sentence> Sentences; + +class Vertex; + +class Arc { + public: + Arc() {} + + // For arcs from one vertex to another. + void SetPhrase(Vertex &from, Vertex &to, const Sentences &intersect) { + Set(to, intersect); + from_ = &from; + } + + /* For arcs from before the n-gram begins to somewhere in the n-gram (right + * aligned). These have no from_ vertex; it implictly matches every + * sentence. This also handles when the n-gram is a substring of a phrase. + */ + void SetRight(Vertex &to, const Sentences &complete) { + Set(to, complete); + from_ = NULL; + } + + Sentence Current() const { + return *current_; + } + + bool Empty() const { + return current_ == last_; + } + + /* When this function returns: + * If Empty() then there's nothing left from this intersection. + * + * If Current() == to then to is part of the intersection. + * + * Otherwise, Current() > to. In this case, to is not part of the + * intersection and neither is anything < Current(). To determine if + * any value >= Current() is in the intersection, call LowerBound again + * with the value. + */ + void LowerBound(const Sentence to); + + private: + void Set(Vertex &to, const Sentences &sentences); + + const Sentence *current_; + const Sentence *last_; + Vertex *from_; +}; + +struct ArcGreater : public std::binary_function<const Arc *, const Arc *, bool> { + bool operator()(const Arc *first, const Arc *second) const { + return first->Current() > second->Current(); + } +}; + +class Vertex { + public: + Vertex() : current_(0) {} + + Sentence Current() const { + return current_; + } + + bool Empty() const { + return incoming_.empty(); + } + + void LowerBound(const Sentence to); + + private: + friend class Arc; + + void AddIncoming(Arc *arc) { + if (!arc->Empty()) incoming_.push(arc); + } + + unsigned int current_; + std::priority_queue<Arc*, std::vector<Arc*>, ArcGreater> incoming_; +}; + +void Arc::LowerBound(const Sentence to) { + current_ = std::lower_bound(current_, last_, to); + // If *current_ > to, don't advance from_. The intervening values of + // from_ may be useful for another one of its outgoing arcs. + if (!from_ || Empty() || (Current() > to)) return; + assert(Current() == to); + from_->LowerBound(to); + if (from_->Empty()) { + current_ = last_; + return; + } + assert(from_->Current() >= to); + if (from_->Current() > to) { + current_ = std::lower_bound(current_ + 1, last_, from_->Current()); + } +} + +void Arc::Set(Vertex &to, const Sentences &sentences) { + current_ = &*sentences.begin(); + last_ = &*sentences.end(); + to.AddIncoming(this); +} + +void Vertex::LowerBound(const Sentence to) { + if (Empty()) return; + // Union lower bound. + while (true) { + Arc *top = incoming_.top(); + if (top->Current() > to) { + current_ = top->Current(); + return; + } + // If top->Current() == to, we still need to verify that's an actual + // element and not just a bound. 
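// Net effect for callers: after LowerBound(to), Current() == to only when `to` was
// confirmed to be present in one of the incoming arcs; any larger Current() is only a
// candidate bound that must be re-checked by calling LowerBound again with that value.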
+ incoming_.pop(); + top->LowerBound(to); + if (!top->Empty()) { + incoming_.push(top); + if (top->Current() == to) { + current_ = to; + return; + } + } else if (Empty()) { + return; + } + } +} + +void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, Vertex *const vertices, Arc *free_arc) { + assert(!hashes.empty()); + + const Hash *const first_word = &*hashes.begin(); + const Hash *const last_word = &*hashes.end() - 1; + + Hash hash = 0; + const Sentences *found; + // Phrases starting at or before the first word in the n-gram. + { + Vertex *vertex = vertices; + for (const Hash *word = first_word; ; ++word, ++vertex) { + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word); + // Now hash is [hashes.begin(), word]. + if (word == last_word) { + if (phrase.FindSubstring(hash, found)) + (free_arc++)->SetRight(*vertex, *found); + break; + } + if (!phrase.FindRight(hash, found)) break; + (free_arc++)->SetRight(*vertex, *found); + } + } + + // Phrases starting at the second or later word in the n-gram. + Vertex *vertex_from = vertices; + for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) { + hash = 0; + Vertex *vertex_to = vertex_from + 1; + for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) { + // Notice that word_to and vertex_to have the same index. + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to); + // Now hash covers [word_from, word_to]. + if (word_to == last_word) { + if (phrase.FindLeft(hash, found)) + (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found); + break; + } + if (!phrase.FindPhrase(hash, found)) break; + (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found); + } + } +} + +} // namespace + +namespace detail { + +} // namespace detail + +bool Union::Evaluate() { + assert(!hashes_.empty()); + // Usually there are at most 6 words in an n-gram, so stack allocation is reasonable. + Vertex vertices[hashes_.size()]; + // One for every substring. + Arc arcs[((hashes_.size() + 1) * hashes_.size()) / 2]; + BuildGraph(substrings_, hashes_, vertices, arcs); + Vertex &last_vertex = vertices[hashes_.size() - 1]; + + unsigned int lower = 0; + while (true) { + last_vertex.LowerBound(lower); + if (last_vertex.Empty()) return false; + if (last_vertex.Current() == lower) return true; + lower = last_vertex.Current(); + } +} + +template <class Output> void Multiple::Evaluate(const StringPiece &line, Output &output) { + assert(!hashes_.empty()); + // Usually there are at most 6 words in an n-gram, so stack allocation is reasonable. + Vertex vertices[hashes_.size()]; + // One for every substring. 
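// (An n-word n-gram has n * (n + 1) / 2 contiguous substrings, which is the arc count used below.)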
+ Arc arcs[((hashes_.size() + 1) * hashes_.size()) / 2]; + BuildGraph(substrings_, hashes_, vertices, arcs); + Vertex &last_vertex = vertices[hashes_.size() - 1]; + + unsigned int lower = 0; + while (true) { + last_vertex.LowerBound(lower); + if (last_vertex.Empty()) return; + if (last_vertex.Current() == lower) { + output.SingleAddNGram(lower, line); + ++lower; + } else { + lower = last_vertex.Current(); + } + } +} + +template void Multiple::Evaluate<CountFormat::Multiple>(const StringPiece &line, CountFormat::Multiple &output); +template void Multiple::Evaluate<ARPAFormat::Multiple>(const StringPiece &line, ARPAFormat::Multiple &output); +template void Multiple::Evaluate<MultipleOutputBuffer>(const StringPiece &line, MultipleOutputBuffer &output); + +} // namespace phrase +} // namespace lm diff --git a/klm/lm/filter/phrase.hh b/klm/lm/filter/phrase.hh new file mode 100644 index 00000000..b4edff41 --- /dev/null +++ b/klm/lm/filter/phrase.hh @@ -0,0 +1,154 @@ +#ifndef LM_FILTER_PHRASE_H__ +#define LM_FILTER_PHRASE_H__ + +#include "util/murmur_hash.hh" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" + +#include <boost/unordered_map.hpp> + +#include <iosfwd> +#include <vector> + +#define LM_FILTER_PHRASE_METHOD(caps, lower) \ +bool Find##caps(Hash key, const std::vector<unsigned int> *&out) const {\ + Table::const_iterator i(table_.find(key));\ + if (i==table_.end()) return false; \ + out = &i->second.lower; \ + return true; \ +} + +namespace lm { +namespace phrase { + +typedef uint64_t Hash; + +class Substrings { + private: + /* This is the value in a hash table where the key is a string. It indicates + * four sets of sentences: + * substring is sentences with a phrase containing the key as a substring. + * left is sentencess with a phrase that begins with the key (left aligned). + * right is sentences with a phrase that ends with the key (right aligned). + * phrase is sentences where the key is a phrase. + * Each set is encoded as a vector of sentence ids in increasing order. + */ + struct SentenceRelation { + std::vector<unsigned int> substring, left, right, phrase; + }; + /* Most of the CPU is hash table lookups, so let's not complicate it with + * vector equality comparisons. If a collision happens, the SentenceRelation + * structure will contain the union of sentence ids over the colliding strings. + * In that case, the filter will be slightly more permissive. + * The key here is the same as boost's hash of std::vector<std::string>. + */ + typedef boost::unordered_map<Hash, SentenceRelation> Table; + + public: + Substrings() {} + + /* If the string isn't a substring of any phrase, return NULL. Otherwise, + * return a pointer to std::vector<unsigned int> listing sentences with + * matching phrases. This set may be empty for Left, Right, or Phrase. + * Example: const std::vector<unsigned int> *FindSubstring(Hash key) + */ + LM_FILTER_PHRASE_METHOD(Substring, substring) + LM_FILTER_PHRASE_METHOD(Left, left) + LM_FILTER_PHRASE_METHOD(Right, right) + LM_FILTER_PHRASE_METHOD(Phrase, phrase) + +#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization + // sentence_id must be non-decreasing. Iterators are over words in the phrase. + template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { + // Iterate over all substrings. 
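// Example: for the phrase "a b c" the loops below hash a, a b, a b c, b, b c, and c;
// every substring is added to `substring`, prefixes are also added to `left`, suffixes
// to `right`, and the complete phrase to `phrase`.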
+ for (Iterator start = begin; start != end; ++start) { + Hash hash = 0; + SentenceRelation *relation; + for (Iterator finish = start; finish != end; ++finish) { + hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish); + // Now hash is of [start, finish]. + relation = &table_[hash]; + AppendSentence(relation->substring, sentence_id); + if (start == begin) AppendSentence(relation->left, sentence_id); + } + AppendSentence(relation->right, sentence_id); + if (start == begin) AppendSentence(relation->phrase, sentence_id); + } + } + + private: + void AppendSentence(std::vector<unsigned int> &vec, unsigned int sentence_id) { + if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id); + } + + Table table_; +}; + +// Read a file with one sentence per line containing tab-delimited phrases of +// space-separated words. +unsigned int ReadMultiple(std::istream &in, Substrings &out); + +namespace detail { +extern const StringPiece kEndSentence; + +template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) { + hashes.clear(); + if (i == end) return; + // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags. + if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) { + ++i; + } + for (; i != end && (*i != kEndSentence); ++i) { + hashes.push_back(util::MurmurHashNative(i->data(), i->size())); + } +} + +} // namespace detail + +class Union { + public: + explicit Union(const Substrings &substrings) : substrings_(substrings) {} + + template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { + detail::MakeHashes(begin, end, hashes_); + return hashes_.empty() || Evaluate(); + } + + private: + bool Evaluate(); + + std::vector<Hash> hashes_; + + const Substrings &substrings_; +}; + +class Multiple { + public: + explicit Multiple(const Substrings &substrings) : substrings_(substrings) {} + + template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + detail::MakeHashes(begin, end, hashes_); + if (hashes_.empty()) { + output.AddNGram(line); + return; + } + Evaluate(line, output); + } + + template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); + } + + void Flush() const {} + + private: + template <class Output> void Evaluate(const StringPiece &line, Output &output); + + std::vector<Hash> hashes_; + + const Substrings &substrings_; +}; + +} // namespace phrase +} // namespace lm +#endif // LM_FILTER_PHRASE_H__ diff --git a/klm/lm/filter/thread.hh b/klm/lm/filter/thread.hh new file mode 100644 index 00000000..e785b263 --- /dev/null +++ b/klm/lm/filter/thread.hh @@ -0,0 +1,167 @@ +#ifndef LM_FILTER_THREAD_H__ +#define LM_FILTER_THREAD_H__ + +#include "util/thread_pool.hh" + +#include <boost/utility/in_place_factory.hpp> + +#include <deque> +#include <stack> + +namespace lm { + +template <class OutputBuffer> class ThreadBatch { + public: + ThreadBatch() {} + + void Reserve(size_t size) { + input_.Reserve(size); + output_.Reserve(size); + } + + // File reading thread. + InputBuffer &Fill(uint64_t sequence) { + sequence_ = sequence; + // Why wait until now to clear instead of after output? free in the same + // thread as allocated. + input_.Clear(); + return input_; + } + + // Filter worker thread. 
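// A batch cycles from the reading thread to a filter worker, then to the output worker,
// and back to the reader, so its buffers are reused rather than reallocated (see the
// Controller class below).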
+ template <class Filter> void CallFilter(Filter &filter) { + input_.CallFilter(filter, output_); + } + + uint64_t Sequence() const { return sequence_; } + + // File writing thread. + template <class RealOutput> void Flush(RealOutput &output) { + output_.Flush(output); + } + + private: + InputBuffer input_; + OutputBuffer output_; + + uint64_t sequence_; +}; + +template <class Batch, class Filter> class FilterWorker { + public: + typedef Batch *Request; + + FilterWorker(const Filter &filter, util::PCQueue<Request> &done) : filter_(filter), done_(done) {} + + void operator()(Request request) { + request->CallFilter(filter_); + done_.Produce(request); + } + + private: + Filter filter_; + + util::PCQueue<Request> &done_; +}; + +// There should only be one OutputWorker. +template <class Batch, class Output> class OutputWorker { + public: + typedef Batch *Request; + + OutputWorker(Output &output, util::PCQueue<Request> &done) : output_(output), done_(done), base_sequence_(0) {} + + void operator()(Request request) { + assert(request->Sequence() >= base_sequence_); + // Assemble the output in order. + uint64_t pos = request->Sequence() - base_sequence_; + if (pos >= ordering_.size()) { + ordering_.resize(pos + 1, NULL); + } + ordering_[pos] = request; + while (!ordering_.empty() && ordering_.front()) { + ordering_.front()->Flush(output_); + done_.Produce(ordering_.front()); + ordering_.pop_front(); + ++base_sequence_; + } + } + + private: + Output &output_; + + util::PCQueue<Request> &done_; + + std::deque<Request> ordering_; + + uint64_t base_sequence_; +}; + +template <class Filter, class OutputBuffer, class RealOutput> class Controller : boost::noncopyable { + private: + typedef ThreadBatch<OutputBuffer> Batch; + + public: + Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output) + : batch_size_(batch_size), queue_size_(queue), + batches_(queue), + to_read_(queue), + output_(queue, 1, boost::in_place(boost::ref(output), boost::ref(to_read_)), NULL), + filter_(queue, workers, boost::in_place(boost::ref(filter), boost::ref(output_.In())), NULL), + sequence_(0) { + for (size_t i = 0; i < queue; ++i) { + batches_[i].Reserve(batch_size); + local_read_.push(&batches_[i]); + } + NewInput(); + } + + void AddNGram(const StringPiece &ngram, const StringPiece &line, RealOutput &output) { + input_->AddNGram(ngram, line, output); + if (input_->Size() == batch_size_) { + FlushInput(); + NewInput(); + } + } + + void Flush() { + FlushInput(); + while (local_read_.size() < queue_size_) { + MoveRead(); + } + NewInput(); + } + + private: + void FlushInput() { + if (input_->Empty()) return; + filter_.Produce(local_read_.top()); + local_read_.pop(); + if (local_read_.empty()) MoveRead(); + } + + void NewInput() { + input_ = &local_read_.top()->Fill(sequence_++); + } + + void MoveRead() { + local_read_.push(to_read_.Consume()); + } + + const size_t batch_size_; + const size_t queue_size_; + + std::vector<Batch> batches_; + + util::PCQueue<Batch*> to_read_; + std::stack<Batch*> local_read_; + util::ThreadPool<OutputWorker<Batch, RealOutput> > output_; + util::ThreadPool<FilterWorker<Batch, Filter> > filter_; + + uint64_t sequence_; + InputBuffer *input_; +}; + +} // namespace lm + +#endif // LM_FILTER_THREAD_H__ diff --git a/klm/lm/filter/vocab.cc b/klm/lm/filter/vocab.cc new file mode 100644 index 00000000..7ee4e84b --- /dev/null +++ b/klm/lm/filter/vocab.cc @@ -0,0 +1,54 @@ +#include "lm/filter/vocab.hh" + +#include <istream> +#include <iostream> + +#include <ctype.h> 
+#include <err.h> + +namespace lm { +namespace vocab { + +void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) { + in.exceptions(std::istream::badbit); + std::string word; + while (in >> word) { + out.insert(word); + } +} + +namespace { +bool IsLineEnd(std::istream &in) { + int got; + do { + got = in.get(); + if (!in) return true; + if (got == '\n') return true; + } while (isspace(got)); + in.unget(); + return false; +} +}// namespace + +// Read space separated words in enter separated lines. These lines can be +// very long, so don't read an entire line at a time. +unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) { + in.exceptions(std::istream::badbit); + unsigned int sentence = 0; + bool used_id = false; + std::string word; + while (in >> word) { + used_id = true; + std::vector<unsigned int> &posting = out[word]; + if (posting.empty() || (posting.back() != sentence)) + posting.push_back(sentence); + if (IsLineEnd(in)) { + ++sentence; + used_id = false; + } + } + return sentence + used_id; +} + +} // namespace vocab +} // namespace lm diff --git a/klm/lm/filter/vocab.hh b/klm/lm/filter/vocab.hh new file mode 100644 index 00000000..7f0fadaa --- /dev/null +++ b/klm/lm/filter/vocab.hh @@ -0,0 +1,133 @@ +#ifndef LM_FILTER_VOCAB_H__ +#define LM_FILTER_VOCAB_H__ + +// Vocabulary-based filters for language models. + +#include "util/multi_intersection.hh" +#include "util/string_piece.hh" +#include "util/string_piece_hash.hh" +#include "util/tokenize_piece.hh" + +#include <boost/noncopyable.hpp> +#include <boost/range/iterator_range.hpp> +#include <boost/unordered/unordered_map.hpp> +#include <boost/unordered/unordered_set.hpp> + +#include <string> +#include <vector> + +namespace lm { +namespace vocab { + +void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out); + +// Read one sentence vocabulary per line. Return the number of sentences. +unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out); + +/* Is this a special tag like <s> or <UNK>? This actually includes anything + * surrounded with < and >, which most tokenizers separate for real words, so + * this should not catch real words as it looks at a single token. + */ +inline bool IsTag(const StringPiece &value) { + // The parser should never give an empty string. 
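// For example "<s>", "</s>" and "<UNK>" pass this test, while "a<b>" does not because
// the token does not begin with '<'.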
+ assert(!value.empty()); + return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>'); +} + +class Single { + public: + typedef boost::unordered_set<std::string> Words; + + explicit Single(const Words &vocab) : vocab_(vocab) {} + + template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { + for (Iterator i = begin; i != end; ++i) { + if (IsTag(*i)) continue; + if (FindStringPiece(vocab_, *i) == vocab_.end()) return false; + } + return true; + } + + private: + const Words &vocab_; +}; + +class Union { + public: + typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words; + + explicit Union(const Words &vocabs) : vocabs_(vocabs) {} + + template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { + sets_.clear(); + + for (Iterator i(begin); i != end; ++i) { + if (IsTag(*i)) continue; + Words::const_iterator found(FindStringPiece(vocabs_, *i)); + if (vocabs_.end() == found) return false; + sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end())); + } + return (sets_.empty() || util::FirstIntersection(sets_)); + } + + private: + const Words &vocabs_; + + std::vector<boost::iterator_range<const unsigned int*> > sets_; +}; + +class Multiple { + public: + typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words; + + Multiple(const Words &vocabs) : vocabs_(vocabs) {} + + private: + // Callback from AllIntersection that does AddNGram. + template <class Output> class Callback { + public: + Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {} + + void operator()(unsigned int index) { + out_.SingleAddNGram(index, line_); + } + + private: + Output &out_; + const StringPiece &line_; + }; + + public: + template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + sets_.clear(); + for (Iterator i(begin); i != end; ++i) { + if (IsTag(*i)) continue; + Words::const_iterator found(FindStringPiece(vocabs_, *i)); + if (vocabs_.end() == found) return; + sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end())); + } + if (sets_.empty()) { + output.AddNGram(line); + return; + } + + Callback<Output> cb(output, line); + util::AllIntersection(sets_, cb); + } + + template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); + } + + void Flush() const {} + + private: + const Words &vocabs_; + + std::vector<boost::iterator_range<const unsigned int*> > sets_; +}; + +} // namespace vocab +} // namespace lm + +#endif // LM_FILTER_VOCAB_H__ diff --git a/klm/lm/filter/wrapper.hh b/klm/lm/filter/wrapper.hh new file mode 100644 index 00000000..90b07a08 --- /dev/null +++ b/klm/lm/filter/wrapper.hh @@ -0,0 +1,58 @@ +#ifndef LM_FILTER_WRAPPER_H__ +#define LM_FILTER_WRAPPER_H__ + +#include "util/string_piece.hh" + +#include <algorithm> +#include <string> +#include <vector> + +namespace lm { + +// Provide a single-output filter with the same interface as a +// multiple-output filter so clients code against one interface. +template <class Binary> class BinaryFilter { + public: + // Binary modes are just references (and a set) and it makes the API cleaner to copy them. 
+ explicit BinaryFilter(Binary binary) : binary_(binary) {} + + template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { + if (binary_.PassNGram(begin, end)) + output.AddNGram(line); + } + + template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); + } + + void Flush() const {} + + private: + Binary binary_; +}; + +// Wrap another filter to pay attention only to context words +template <class FilterT> class ContextFilter { + public: + typedef FilterT Filter; + + explicit ContextFilter(Filter &backend) : backend_(backend) {} + + template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { + pieces_.clear(); + // TODO: this copy could be avoided by a lookahead iterator. + std::copy(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), std::back_insert_iterator<std::vector<StringPiece> >(pieces_)); + backend_.AddNGram(pieces_.begin(), pieces_.end() - !pieces_.empty(), line, output); + } + + void Flush() const {} + + private: + std::vector<StringPiece> pieces_; + + Filter backend_; +}; + +} // namespace lm + +#endif // LM_FILTER_WRAPPER_H__ diff --git a/klm/lm/fragment_main.cc b/klm/lm/fragment_main.cc new file mode 100644 index 00000000..0267cd4e --- /dev/null +++ b/klm/lm/fragment_main.cc @@ -0,0 +1,37 @@ +#include "lm/binary_format.hh" +#include "lm/model.hh" +#include "lm/left.hh" +#include "util/tokenize_piece.hh" + +template <class Model> void Query(const char *name) { + Model model(name); + std::string line; + lm::ngram::ChartState ignored; + while (getline(std::cin, line)) { + lm::ngram::RuleScore<Model> scorer(model, ignored); + for (util::TokenIter<util::SingleCharacter, true> i(line, ' '); i; ++i) { + scorer.Terminal(model.GetVocabulary().Index(*i)); + } + std::cout << scorer.Finish() << '\n'; + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Expected model file name." << std::endl; + return 1; + } + const char *name = argv[1]; + lm::ngram::ModelType model_type = lm::ngram::PROBING; + lm::ngram::RecognizeBinary(name, model_type); + switch (model_type) { + case lm::ngram::PROBING: + Query<lm::ngram::ProbingModel>(name); + break; + case lm::ngram::REST_PROBING: + Query<lm::ngram::RestProbingModel>(name); + break; + default: + std::cerr << "Model type not supported yet." << std::endl; + } +} diff --git a/klm/lm/max_order.cc b/klm/lm/kenlm_max_order_main.cc index 94221201..94221201 100644 --- a/klm/lm/max_order.cc +++ b/klm/lm/kenlm_max_order_main.cc diff --git a/klm/lm/left.hh b/klm/lm/left.hh index 8c27232e..85c1ea37 100644 --- a/klm/lm/left.hh +++ b/klm/lm/left.hh @@ -51,36 +51,36 @@ namespace ngram { template <class M> class RuleScore { public: - explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(out), left_done_(false), prob_(0.0) { + explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(&out), left_done_(false), prob_(0.0) { out.left.length = 0; out.right.length = 0; } void BeginSentence() { - out_.right = model_.BeginSentenceState(); - // out_.left is empty. + out_->right = model_.BeginSentenceState(); + // out_->left is empty. 
left_done_ = true; } void Terminal(WordIndex word) { - State copy(out_.right); - FullScoreReturn ret(model_.FullScore(copy, word, out_.right)); + State copy(out_->right); + FullScoreReturn ret(model_.FullScore(copy, word, out_->right)); if (left_done_) { prob_ += ret.prob; return; } if (ret.independent_left) { prob_ += ret.prob; left_done_ = true; return; } - out_.left.pointers[out_.left.length++] = ret.extend_left; + out_->left.pointers[out_->left.length++] = ret.extend_left; prob_ += ret.rest; - if (out_.right.length != copy.length + 1) + if (out_->right.length != copy.length + 1) left_done_ = true; } // Faster version of NonTerminal for the case where the rule begins with a non-terminal. void BeginNonTerminal(const ChartState &in, float prob = 0.0) { prob_ = prob; - out_ = in; + *out_ = in; left_done_ = in.left.full; } @@ -89,23 +89,23 @@ template <class M> class RuleScore { if (!in.left.length) { if (in.left.full) { - for (const float *i = out_.right.backoff; i < out_.right.backoff + out_.right.length; ++i) prob_ += *i; + for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i; left_done_ = true; - out_.right = in.right; + out_->right = in.right; } return; } - if (!out_.right.length) { - out_.right = in.right; + if (!out_->right.length) { + out_->right = in.right; if (left_done_) { prob_ += model_.UnRest(in.left.pointers, in.left.pointers + in.left.length, 1); return; } - if (out_.left.length) { + if (out_->left.length) { left_done_ = true; } else { - out_.left = in.left; + out_->left = in.left; left_done_ = in.left.full; } return; @@ -113,10 +113,10 @@ template <class M> class RuleScore { float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; float *back = backoffs, *back2 = backoffs2; - unsigned char next_use = out_.right.length; + unsigned char next_use = out_->right.length; // First word - if (ExtendLeft(in, next_use, 1, out_.right.backoff, back)) return; + if (ExtendLeft(in, next_use, 1, out_->right.backoff, back)) return; // Words after the first, so extending a bigram to begin with for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) { @@ -127,54 +127,58 @@ template <class M> class RuleScore { if (in.left.full) { for (const float *i = back; i != back + next_use; ++i) prob_ += *i; left_done_ = true; - out_.right = in.right; + out_->right = in.right; return; } // Right state was minimized, so it's already independent of the new words to the left. if (in.right.length < in.left.length) { - out_.right = in.right; + out_->right = in.right; return; } // Shift exisiting words down. - for (WordIndex *i = out_.right.words + next_use - 1; i >= out_.right.words; --i) { + for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) { *(i + in.right.length) = *i; } // Add words from in.right. - std::copy(in.right.words, in.right.words + in.right.length, out_.right.words); + std::copy(in.right.words, in.right.words + in.right.length, out_->right.words); // Assemble backoff composed on the existing state's backoff followed by the new state's backoff. 
- std::copy(in.right.backoff, in.right.backoff + in.right.length, out_.right.backoff); - std::copy(back, back + next_use, out_.right.backoff + in.right.length); - out_.right.length = in.right.length + next_use; + std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff); + std::copy(back, back + next_use, out_->right.backoff + in.right.length); + out_->right.length = in.right.length + next_use; } float Finish() { // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram. - out_.left.full = left_done_ || (out_.left.length == model_.Order() - 1); + out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1); return prob_; } void Reset() { prob_ = 0.0; left_done_ = false; - out_.left.length = 0; - out_.right.length = 0; + out_->left.length = 0; + out_->right.length = 0; + } + void Reset(ChartState &replacement) { + out_ = &replacement; + Reset(); } private: bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) { ProcessRet(model_.ExtendLeft( - out_.right.words, out_.right.words + next_use, // Words to extend into + out_->right.words, out_->right.words + next_use, // Words to extend into back_in, // Backoffs to use in.left.pointers[extend_length - 1], extend_length, // Words to be extended back_out, // Backoffs for the next score next_use)); // Length of n-gram to use in next scoring. - if (next_use != out_.right.length) { + if (next_use != out_->right.length) { left_done_ = true; if (!next_use) { // Early exit. - out_.right = in.right; + out_->right = in.right; prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1); return true; } @@ -193,13 +197,13 @@ template <class M> class RuleScore { left_done_ = true; return; } - out_.left.pointers[out_.left.length++] = ret.extend_left; + out_->left.pointers[out_->left.length++] = ret.extend_left; prob_ += ret.rest; } const M &model_; - ChartState &out_; + ChartState *out_; bool left_done_; diff --git a/klm/lm/max_order.hh b/klm/lm/max_order.hh index bc8687cd..3eb97ccd 100644 --- a/klm/lm/max_order.hh +++ b/klm/lm/max_order.hh @@ -4,9 +4,6 @@ * (kMaxOrder - 1) * sizeof(float) bytes instead of * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead */ -#ifndef KENLM_MAX_ORDER -#define KENLM_MAX_ORDER 6 -#endif #ifndef KENLM_ORDER_MESSAGE -#define KENLM_ORDER_MESSAGE "Edit klm/lm/max_order.hh." +#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." 
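// KENLM_MAX_ORDER no longer has an in-header default here; it is expected to be defined
// by the build system (typically via a flag such as -DKENLM_MAX_ORDER=6) or added to
// this header by hand.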
#endif diff --git a/klm/lm/model.cc b/klm/lm/model.cc index b46333a4..a40fd2fb 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -12,6 +12,7 @@ #include <functional> #include <numeric> #include <cmath> +#include <limits> namespace lm { namespace ngram { @@ -19,23 +20,24 @@ namespace detail { template <class Search, class VocabularyT> const ModelType GenericModel<Search, VocabularyT>::kModelType = Search::kModelType; -template <class Search, class VocabularyT> size_t GenericModel<Search, VocabularyT>::Size(const std::vector<uint64_t> &counts, const Config &config) { +template <class Search, class VocabularyT> uint64_t GenericModel<Search, VocabularyT>::Size(const std::vector<uint64_t> &counts, const Config &config) { return VocabularyT::Size(counts[0], config) + Search::Size(counts, config); } template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::SetupMemory(void *base, const std::vector<uint64_t> &counts, const Config &config) { + size_t goal_size = util::CheckOverflow(Size(counts, config)); uint8_t *start = static_cast<uint8_t*>(base); size_t allocated = VocabularyT::Size(counts[0], config); vocab_.SetupMemory(start, allocated, counts[0], config); start += allocated; start = search_.SetupMemory(start, counts, config); - if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != Size(counts, config)) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << Size(counts, config)); + if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << goal_size); } template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) { LoadLM(file, config, *this); - // g++ prints warnings unless these are fully initialized. + // g++ prints warnings unless these are fully initialized. State begin_sentence = State(); begin_sentence.length = 1; begin_sentence.words[0] = vocab_.BeginSentence(); @@ -49,38 +51,43 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge } namespace { -void CheckMaxOrder(size_t order) { - UTIL_THROW_IF(order > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << order << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); +void CheckCounts(const std::vector<uint64_t> &counts) { + UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". 
" << KENLM_ORDER_MESSAGE); + if (sizeof(uint64_t) > sizeof(std::size_t)) { + for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) { + UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); + } + } } } // namespace template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { - CheckMaxOrder(params.counts.size()); + CheckCounts(params.counts); SetupMemory(start, params.counts, config); vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); search_.LoadedBinary(); } template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) { - // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any. - util::FilePiece f(backing_.file.release(), file, config.messages); + // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any. + util::FilePiece f(backing_.file.release(), file, config.ProgressMessages()); try { std::vector<uint64_t> counts; // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. ReadARPACounts(f, counts); - CheckMaxOrder(counts.size()); + CheckCounts(counts); if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); - std::size_t vocab_size = VocabularyT::Size(counts[0], config); - // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. + std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); + // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config); if (config.write_mmap) { WriteWordsWrapper wrap(config.enumerate_vocab); vocab_.ConfigureEnumerate(&wrap, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); - wrap.Write(backing_.file.get()); + wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config)); } else { vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); @@ -88,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT if (!vocab_.SawUnk()) { assert(config.unknown_missing != THROW_UP); - // Default probabilities for unknown. + // Default probabilities for unknown. search_.UnknownUnigram().backoff = 0.0; search_.UnknownUnigram().prob = config.unknown_missing_logprob; } @@ -140,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, } template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { - // Generate a state from context. + // Generate a state from context. 
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); if (context_rend == context_rbegin) { out_state.length = 0; @@ -184,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, ret.rest = ptr.Rest(); ret.prob = ptr.Prob(); ret.extend_left = extend_pointer; - // If this function is called, then it does depend on left words. + // If this function is called, then it does depend on left words. ret.independent_left = false; } float subtract_me = ret.rest; @@ -192,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, next_use = extend_length; ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret); next_use -= extend_length; - // Charge backoffs. + // Charge backoffs. for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b; ret.prob -= subtract_me; ret.rest -= subtract_me; @@ -202,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, namespace { // Do a paraonoid copy of history, assuming new_word has already been copied // (hence the -1). out_state.length could be zero so I avoided using -// std::copy. +// std::copy. void CopyRemainingHistory(const WordIndex *from, State &out_state) { WordIndex *out = out_state.words + 1; const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1; @@ -210,18 +217,19 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) { } } // namespace -/* Ugly optimized function. Produce a score excluding backoff. - * The search goes in increasing order of ngram length. +/* Ugly optimized function. Produce a score excluding backoff. + * The search goes in increasing order of ngram length. * Context goes backward, so context_begin is the word immediately preceeding - * new_word. + * new_word. */ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff( const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const { + assert(new_word < vocab_.Bound()); FullScoreReturn ret; - // ret.ngram_length contains the last known non-blank ngram length. + // ret.ngram_length contains the last known non-blank ngram length. ret.ngram_length = 1; typename Search::Node node; @@ -230,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, ret.prob = uni.Prob(); ret.rest = uni.Rest(); - // This is the length of the context that should be used for continuation to the right. + // This is the length of the context that should be used for continuation to the right. out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; - // We'll write the word anyway since it will probably be used and does no harm being there. + // We'll write the word anyway since it will probably be used and does no harm being there. out_state.words[0] = new_word; if (context_rbegin == context_rend) return ret; diff --git a/klm/lm/model.hh b/klm/lm/model.hh index 6dee9419..13ff864e 100644 --- a/klm/lm/model.hh +++ b/klm/lm/model.hh @@ -41,7 +41,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod * does not include small non-mapped control structures, such as this class * itself. 
*/ - static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config()); + static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config()); /* Load the model from a file. It may be an ARPA or binary file. Binary * files must have the format expected by this class or you'll get an diff --git a/klm/lm/model_test.cc b/klm/lm/model_test.cc index 32084b5b..eb159094 100644 --- a/klm/lm/model_test.cc +++ b/klm/lm/model_test.cc @@ -1,6 +1,7 @@ #include "lm/model.hh" #include <stdlib.h> +#include <string.h> #define BOOST_TEST_MODULE ModelTest #include <boost/test/unit_test.hpp> @@ -22,17 +23,20 @@ std::ostream &operator<<(std::ostream &o, const State &state) { namespace { +// Stupid bjam reverses the command line arguments randomly. const char *TestLocation() { - if (boost::unit_test::framework::master_test_suite().argc < 2) { + if (boost::unit_test::framework::master_test_suite().argc < 3) { return "test.arpa"; } - return boost::unit_test::framework::master_test_suite().argv[1]; + char **argv = boost::unit_test::framework::master_test_suite().argv; + return argv[strstr(argv[1], "nounk") ? 2 : 1]; } const char *TestNoUnkLocation() { if (boost::unit_test::framework::master_test_suite().argc < 3) { return "test_nounk.arpa"; } - return boost::unit_test::framework::master_test_suite().argv[2]; + char **argv = boost::unit_test::framework::master_test_suite().argv; + return argv[strstr(argv[1], "nounk") ? 1 : 2]; } template <class Model> State GetState(const Model &model, const char *word, const State &in) { diff --git a/klm/lm/partial.hh b/klm/lm/partial.hh new file mode 100644 index 00000000..1dede359 --- /dev/null +++ b/klm/lm/partial.hh @@ -0,0 +1,167 @@ +#ifndef LM_PARTIAL__ +#define LM_PARTIAL__ + +#include "lm/return.hh" +#include "lm/state.hh" + +#include <algorithm> + +#include <assert.h> + +namespace lm { +namespace ngram { + +struct ExtendReturn { + float adjust; + bool make_full; + unsigned char next_use; +}; + +template <class Model> ExtendReturn ExtendLoop( + const Model &model, + unsigned char seen, const WordIndex *add_rbegin, const WordIndex *add_rend, const float *backoff_start, + const uint64_t *pointers, const uint64_t *pointers_end, + uint64_t *&pointers_write, + float *backoff_write) { + unsigned char add_length = add_rend - add_rbegin; + + float backoff_buf[2][KENLM_MAX_ORDER - 1]; + float *backoff_in = backoff_buf[0], *backoff_out = backoff_buf[1]; + std::copy(backoff_start, backoff_start + add_length, backoff_in); + + ExtendReturn value; + value.make_full = false; + value.adjust = 0.0; + value.next_use = add_length; + + unsigned char i = 0; + unsigned char length = pointers_end - pointers; + // pointers_write is NULL means that the existing left state is full, so we should use completed probabilities. + if (pointers_write) { + // Using full context, writing to new left state. + for (; i < length; ++i) { + FullScoreReturn ret(model.ExtendLeft( + add_rbegin, add_rbegin + value.next_use, + backoff_in, + pointers[i], i + seen + 1, + backoff_out, + value.next_use)); + std::swap(backoff_in, backoff_out); + if (ret.independent_left) { + value.adjust += ret.prob; + value.make_full = true; + ++i; + break; + } + value.adjust += ret.rest; + *pointers_write++ = ret.extend_left; + if (value.next_use != add_length) { + value.make_full = true; + ++i; + break; + } + } + } + // Using some of the new context. 
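// At this point the left state is (or has just become) full, so each remaining pointer
// is charged its final probability (ret.prob) rather than the provisional rest cost
// (ret.rest) accumulated in the loop above.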
+ for (; i < length && value.next_use; ++i) { + FullScoreReturn ret(model.ExtendLeft( + add_rbegin, add_rbegin + value.next_use, + backoff_in, + pointers[i], i + seen + 1, + backoff_out, + value.next_use)); + std::swap(backoff_in, backoff_out); + value.adjust += ret.prob; + } + float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1); + // Using none of the new context. + value.adjust += unrest; + + std::copy(backoff_in, backoff_in + value.next_use, backoff_write); + return value; +} + +template <class Model> float RevealBefore(const Model &model, const Right &reveal, const unsigned char seen, bool reveal_full, Left &left, Right &right) { + assert(seen < reveal.length || reveal_full); + uint64_t *pointers_write = reveal_full ? NULL : left.pointers; + float backoff_buffer[KENLM_MAX_ORDER - 1]; + ExtendReturn value(ExtendLoop( + model, + seen, reveal.words + seen, reveal.words + reveal.length, reveal.backoff + seen, + left.pointers, left.pointers + left.length, + pointers_write, + left.full ? backoff_buffer : (right.backoff + right.length))); + if (reveal_full) { + left.length = 0; + value.make_full = true; + } else { + left.length = pointers_write - left.pointers; + value.make_full |= (left.length == model.Order() - 1); + } + if (left.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; + } else { + // If left wasn't full when it came in, put words into right state. + std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length); + right.length += value.next_use; + left.full = value.make_full || (right.length == model.Order() - 1); + } + return value.adjust; +} + +template <class Model> float RevealAfter(const Model &model, Left &left, Right &right, const Left &reveal, unsigned char seen) { + assert(seen < reveal.length || reveal.full); + uint64_t *pointers_write = left.full ? NULL : (left.pointers + left.length); + ExtendReturn value(ExtendLoop( + model, + seen, right.words, right.words + right.length, right.backoff, + reveal.pointers + seen, reveal.pointers + reveal.length, + pointers_write, + right.backoff)); + if (reveal.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += right.backoff[i]; + right.length = 0; + value.make_full = true; + } else { + right.length = value.next_use; + value.make_full |= (right.length == model.Order() - 1); + } + if (!left.full) { + left.length = pointers_write - left.pointers; + left.full = value.make_full || (left.length == model.Order() - 1); + } + return value.adjust; +} + +template <class Model> float Subsume(const Model &model, Left &first_left, const Right &first_right, const Left &second_left, Right &second_right, const unsigned int between_length) { + assert(first_right.length < KENLM_MAX_ORDER); + assert(second_left.length < KENLM_MAX_ORDER); + assert(between_length < KENLM_MAX_ORDER - 1); + uint64_t *pointers_write = first_left.full ? NULL : (first_left.pointers + first_left.length); + float backoff_buffer[KENLM_MAX_ORDER - 1]; + ExtendReturn value(ExtendLoop( + model, + between_length, first_right.words, first_right.words + first_right.length, first_right.backoff, + second_left.pointers, second_left.pointers + second_left.length, + pointers_write, + second_left.full ? 
backoff_buffer : (second_right.backoff + second_right.length))); + if (second_left.full) { + for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; + } else { + std::copy(first_right.words, first_right.words + value.next_use, second_right.words + second_right.length); + second_right.length += value.next_use; + value.make_full |= (second_right.length == model.Order() - 1); + } + if (!first_left.full) { + first_left.length = pointers_write - first_left.pointers; + first_left.full = value.make_full || second_left.full || (first_left.length == model.Order() - 1); + } + assert(first_left.length < KENLM_MAX_ORDER); + assert(second_right.length < KENLM_MAX_ORDER); + return value.adjust; +} + +} // namespace ngram +} // namespace lm + +#endif // LM_PARTIAL__ diff --git a/klm/lm/partial_test.cc b/klm/lm/partial_test.cc new file mode 100644 index 00000000..8d309c85 --- /dev/null +++ b/klm/lm/partial_test.cc @@ -0,0 +1,199 @@ +#include "lm/partial.hh" + +#include "lm/left.hh" +#include "lm/model.hh" +#include "util/tokenize_piece.hh" + +#define BOOST_TEST_MODULE PartialTest +#include <boost/test/unit_test.hpp> +#include <boost/test/floating_point_comparison.hpp> + +namespace lm { +namespace ngram { +namespace { + +const char *TestLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} + +Config SilentConfig() { + Config config; + config.arpa_complain = Config::NONE; + config.messages = NULL; + return config; +} + +struct ModelFixture { + ModelFixture() : m(TestLocation(), SilentConfig()) {} + + RestProbingModel m; +}; + +BOOST_FIXTURE_TEST_SUITE(suite, ModelFixture) + +BOOST_AUTO_TEST_CASE(SimpleBefore) { + Left left; + left.full = false; + left.length = 0; + Right right; + right.length = 0; + + Right reveal; + reveal.length = 1; + WordIndex period = m.GetVocabulary().Index("."); + reveal.words[0] = period; + reveal.backoff[0] = -0.845098; + + BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 0, false, left, right), 0.001); + BOOST_CHECK_EQUAL(0, left.length); + BOOST_CHECK(!left.full); + BOOST_CHECK_EQUAL(1, right.length); + BOOST_CHECK_EQUAL(period, right.words[0]); + BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001); + + WordIndex more = m.GetVocabulary().Index("more"); + reveal.words[1] = more; + reveal.backoff[1] = -0.4771212; + reveal.length = 2; + BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 1, false, left, right), 0.001); + BOOST_CHECK_EQUAL(0, left.length); + BOOST_CHECK(!left.full); + BOOST_CHECK_EQUAL(2, right.length); + BOOST_CHECK_EQUAL(period, right.words[0]); + BOOST_CHECK_EQUAL(more, right.words[1]); + BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001); + BOOST_CHECK_CLOSE(-0.4771212, right.backoff[1], 0.001); +} + +BOOST_AUTO_TEST_CASE(AlsoWouldConsider) { + WordIndex would = m.GetVocabulary().Index("would"); + WordIndex consider = m.GetVocabulary().Index("consider"); + + ChartState current; + current.left.length = 1; + current.left.pointers[0] = would; + current.left.full = false; + current.right.length = 1; + current.right.words[0] = would; + current.right.backoff[0] = -0.30103; + + Left after; + after.full = false; + after.length = 1; + after.pointers[0] = consider; + + // adjustment for would consider + BOOST_CHECK_CLOSE(-1.687872 - -0.2922095 - 0.30103, RevealAfter(m, current.left, current.right, after, 0), 0.001); + + BOOST_CHECK_EQUAL(2, current.left.length); + BOOST_CHECK_EQUAL(would, current.left.pointers[0]); + 
BOOST_CHECK_EQUAL(false, current.left.full); + + WordIndex also = m.GetVocabulary().Index("also"); + Right before; + before.length = 1; + before.words[0] = also; + before.backoff[0] = -0.30103; + // r(would) = -0.2922095 [i would], r(would -> consider) = -1.988902 [b(would) + p(consider)] + // p(also -> would) = -2, p(also would -> consider) = -3 + BOOST_CHECK_CLOSE(-2 + 0.2922095 -3 + 1.988902, RevealBefore(m, before, 0, false, current.left, current.right), 0.001); + BOOST_CHECK_EQUAL(0, current.left.length); + BOOST_CHECK(current.left.full); + BOOST_CHECK_EQUAL(2, current.right.length); + BOOST_CHECK_EQUAL(would, current.right.words[0]); + BOOST_CHECK_EQUAL(also, current.right.words[1]); +} + +BOOST_AUTO_TEST_CASE(EndSentence) { + WordIndex loin = m.GetVocabulary().Index("loin"); + WordIndex period = m.GetVocabulary().Index("."); + WordIndex eos = m.GetVocabulary().EndSentence(); + + ChartState between; + between.left.length = 1; + between.left.pointers[0] = eos; + between.left.full = true; + between.right.length = 0; + + Right before; + before.words[0] = period; + before.words[1] = loin; + before.backoff[0] = -0.845098; + before.backoff[1] = 0.0; + + before.length = 1; + BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001); + BOOST_CHECK_EQUAL(0, between.left.length); +} + +float ScoreFragment(const RestProbingModel &model, unsigned int *begin, unsigned int *end, ChartState &out) { + RuleScore<RestProbingModel> scorer(model, out); + for (unsigned int *i = begin; i < end; ++i) { + scorer.Terminal(*i); + } + return scorer.Finish(); +} + +void CheckAdjustment(const RestProbingModel &model, float expect, const Right &before_in, bool before_full, ChartState between, const Left &after_in) { + Right before(before_in); + Left after(after_in); + after.full = false; + float got = 0.0; + for (unsigned int i = 1; i < 5; ++i) { + if (before_in.length >= i) { + before.length = i; + got += RevealBefore(model, before, i - 1, false, between.left, between.right); + } + if (after_in.length >= i) { + after.length = i; + got += RevealAfter(model, between.left, between.right, after, i - 1); + } + } + if (after_in.full) { + after.full = true; + got += RevealAfter(model, between.left, between.right, after, after.length); + } + if (before_full) { + got += RevealBefore(model, before, before.length, true, between.left, between.right); + } + // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this. 
+ BOOST_CHECK(fabs(expect - got) < 0.001); +} + +void FullDivide(const RestProbingModel &model, StringPiece str) { + std::vector<WordIndex> indices; + for (util::TokenIter<util::SingleCharacter, true> i(str, ' '); i; ++i) { + indices.push_back(model.GetVocabulary().Index(*i)); + } + ChartState full_state; + float full = ScoreFragment(model, &indices.front(), &indices.back() + 1, full_state); + + ChartState before_state; + before_state.left.full = false; + RuleScore<RestProbingModel> before_scorer(model, before_state); + float before_score = 0.0; + for (unsigned int before = 0; before < indices.size(); ++before) { + for (unsigned int after = before; after <= indices.size(); ++after) { + ChartState after_state, between_state; + float after_score = ScoreFragment(model, &indices.front() + after, &indices.front() + indices.size(), after_state); + float between_score = ScoreFragment(model, &indices.front() + before, &indices.front() + after, between_state); + CheckAdjustment(model, full - before_score - after_score - between_score, before_state.right, before_state.left.full, between_state, after_state.left); + } + before_scorer.Terminal(indices[before]); + before_score = before_scorer.Finish(); + } +} + +BOOST_AUTO_TEST_CASE(Strings) { + FullDivide(m, "also would consider"); + FullDivide(m, "looking on a little more loin . </s>"); + FullDivide(m, "in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>"); +} + +BOOST_AUTO_TEST_SUITE_END() +} // namespace +} // namespace ngram +} // namespace lm diff --git a/klm/lm/quantize.hh b/klm/lm/quantize.hh index abed0112..8ce2378a 100644 --- a/klm/lm/quantize.hh +++ b/klm/lm/quantize.hh @@ -24,7 +24,7 @@ class DontQuantize { public: static const ModelType kModelTypeAdd = static_cast<ModelType>(0); static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {} - static std::size_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } + static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } static uint8_t MiddleBits(const Config &/*config*/) { return 63; } static uint8_t LongestBits(const Config &/*config*/) { return 31; } @@ -138,9 +138,9 @@ class SeparatelyQuantize { static void UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &counts, Config &config); - static std::size_t Size(uint8_t order, const Config &config) { - size_t longest_table = (static_cast<size_t>(1) << static_cast<size_t>(config.prob_bits)) * sizeof(float); - size_t middle_table = (static_cast<size_t>(1) << static_cast<size_t>(config.backoff_bits)) * sizeof(float) + longest_table; + static uint64_t Size(uint8_t order, const Config &config) { + uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float); + uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table; // unigrams are currently not quantized so no need for a table. 
return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8; } diff --git a/klm/lm/ngram_query.cc b/klm/lm/query_main.cc index 49757d9a..49757d9a 100644 --- a/klm/lm/ngram_query.cc +++ b/klm/lm/query_main.cc diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index 70727e4c..9ea08798 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -1,13 +1,15 @@ #include "lm/read_arpa.hh" #include "lm/blank.hh" +#include "util/file.hh" +#include <cmath> #include <cstdlib> #include <iostream> +#include <sstream> #include <vector> #include <ctype.h> -#include <math.h> #include <string.h> #include <stdint.h> @@ -31,12 +33,27 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) { const char kBinaryMagic[] = "mmap lm http://kheafield.com/code"; +// strtoull isn't portable enough :-( +uint64_t ReadCount(const std::string &from) { + std::stringstream stream(from); + uint64_t ret; + stream >> ret; + UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from); + return ret; +} + } // namespace void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { number.clear(); - StringPiece line; - while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} + StringPiece line = in.ReadLine(); + // In general, ARPA files can have arbitrary text before "\data\" + // But in KenLM, we require such lines to start with "#", so that + // we can do stricter error checking + while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) { + line = in.ReadLine(); + } + if (line != "\\data\\") { if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) { UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); @@ -52,15 +69,11 @@ void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { // So strtol doesn't go off the end of line. 
std::string remaining(line.data() + 6, line.size() - 6); char *end_ptr; - unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10); + unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10); if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line); if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line); ++end_ptr; - const char *start = end_ptr; - long int count = std::strtol(start, &end_ptr, 10); - if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count); - if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line); - number.push_back(count); + number.push_back(ReadCount(end_ptr)); } } @@ -103,7 +116,7 @@ void ReadBackoff(util::FilePiece &in, float &backoff) { int float_class = _fpclass(backoff); UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff); #else - int float_class = fpclassify(backoff); + int float_class = std::fpclassify(backoff); UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff); #endif } diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc index 13942309..2d6f15b2 100644 --- a/klm/lm/search_hashed.cc +++ b/klm/lm/search_hashed.cc @@ -231,27 +231,27 @@ template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * template <> void HashedSearch<BackoffValue>::DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { NoRestBuild build; - ApplyBuild(f, counts, config, vocab, warn, build); + ApplyBuild(f, counts, vocab, warn, build); } -template <> void HashedSearch<RestValue>::DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { +template <> void HashedSearch<RestValue>::DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { switch (config.rest_function) { case Config::REST_MAX: { MaxRestBuild build; - ApplyBuild(f, counts, config, vocab, warn, build); + ApplyBuild(f, counts, vocab, warn, build); } break; case Config::REST_LOWER: { LowerRestBuild<ProbingModel> build(config, counts.size(), vocab); - ApplyBuild(f, counts, config, vocab, warn, build); + ApplyBuild(f, counts, vocab, warn, build); } break; } } -template <class Value> template <class Build> void HashedSearch<Value>::ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build) { +template <class Value> template <class Build> void HashedSearch<Value>::ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build) { for (WordIndex i = 0; i < counts[0]; ++i) { build.SetRest(&i, (unsigned int)1, unigram_.Raw()[i]); } diff --git a/klm/lm/search_hashed.hh b/klm/lm/search_hashed.hh index 7e8c1220..00595796 100644 --- a/klm/lm/search_hashed.hh +++ b/klm/lm/search_hashed.hh @@ -74,8 +74,8 @@ template <class Value> class HashedSearch { // TODO: move 
probing_multiplier here with next binary file format update. static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {} - static std::size_t Size(const std::vector<uint64_t> &counts, const Config &config) { - std::size_t ret = Unigram::Size(counts[0]); + static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) { + uint64_t ret = Unigram::Size(counts[0]); for (unsigned char n = 1; n < counts.size() - 1; ++n) { ret += Middle::Size(counts[n], config.probing_multiplier); } @@ -147,7 +147,7 @@ template <class Value> class HashedSearch { // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild. void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn); - template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build); + template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build); class Unigram { public: @@ -160,8 +160,8 @@ template <class Value> class HashedSearch { #endif {} - static std::size_t Size(uint64_t count) { - return (count + 1) * sizeof(ProbBackoff); // +1 for hallucinate <unk> + static uint64_t Size(uint64_t count) { + return (count + 1) * sizeof(typename Value::Weights); // +1 for hallucinate <unk> } const typename Value::Weights &Lookup(WordIndex index) const { diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 832cc9f7..1b0d9b26 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -55,7 +55,7 @@ struct ProbPointer { uint64_t index; }; -// Array of n-grams and float indices. +// Array of n-grams and float indices. class BackoffMessages { public: void Init(std::size_t entry_size) { @@ -89,7 +89,7 @@ class BackoffMessages { if (!HasExtension(weights.backoff)) { weights.backoff = kExtensionBackoff; UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed."); - WriteOrThrow(unigrams, &weights, sizeof(weights)); + util::WriteOrThrow(unigrams, &weights, sizeof(weights)); } const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + sizeof(WordIndex)); base[write_to.array][write_to.index] += weights.backoff; @@ -100,7 +100,7 @@ class BackoffMessages { void Apply(float *const *const base, RecordReader &reader) { FinishedAdding(); if (current_ == allocated_) return; - // We'll also use the same buffer to record messages to blanks that they extend. + // We'll also use the same buffer to record messages to blanks that they extend. WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_); const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex); for (reader.Rewind(); reader && (current_ != allocated_); ) { @@ -109,7 +109,7 @@ class BackoffMessages { ++reader; break; case 1: - // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends. + // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends. 
for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w; current_ += entry_size_; break; @@ -126,7 +126,7 @@ class BackoffMessages { break; } } - // Now this is a list of blanks that extend right. + // Now this is a list of blanks that extend right. entry_size_ = sizeof(WordIndex) * order; Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get())); current_ = (uint8_t*)backing_.get(); @@ -153,7 +153,7 @@ class BackoffMessages { private: void FinishedAdding() { Resize(current_ - (uint8_t*)backing_.get()); - // Sort requests in same order as files. + // Sort requests in same order as files. std::sort( util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)), util::SizedIterator(util::SizedProxy(current_, entry_size_)), @@ -220,7 +220,7 @@ class SRISucks { } private: - // This used to be one array. Then I needed to separate it by order for quantization to work. + // This used to be one array. Then I needed to separate it by order for quantization to work. std::vector<float> values_[KENLM_MAX_ORDER - 1]; BackoffMessages messages_[KENLM_MAX_ORDER - 1]; @@ -253,7 +253,7 @@ class FindBlanks { ++counts_.back(); } - // Unigrams wrote one past. + // Unigrams wrote one past. void Cleanup() { --counts_[0]; } @@ -270,15 +270,15 @@ class FindBlanks { SRISucks &sri_; }; -// Phase to actually write n-grams to the trie. +// Phase to actually write n-grams to the trie. template <class Quant, class Bhiksha> class WriteEntries { public: - WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : + WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : contexts_(contexts), quant_(quant), unigrams_(unigrams), middle_(middle), - longest_(longest), + longest_(longest), bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)), order_(order), sri_(sri) {} @@ -328,7 +328,7 @@ struct Gram { const WordIndex *begin, *end; - // For queue, this is the direction we want. + // For queue, this is the direction we want. bool operator<(const Gram &other) const { return std::lexicographical_compare(other.begin, other.end, begin, end); } @@ -353,7 +353,7 @@ template <class Doing> class BlankManager { been_length_ = length; return; } - // There are blanks to insert starting with order blank. + // There are blanks to insert starting with order blank. unsigned char blank = cur - to + 1; UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; @@ -363,7 +363,7 @@ template <class Doing> class BlankManager { assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; - // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. + // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. 
basis_[blank - 1] = kBadProb; } *pre = *cur; @@ -377,7 +377,7 @@ template <class Doing> class BlankManager { unsigned char been_length_; float basis_[KENLM_MAX_ORDER]; - + Doing &doing_; }; @@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re } void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { - // Fill unigram probabilities. + // Fill unigram probabilities. try { rewind(file); for (WordIndex i = 0; i < unigram_count; ++i) { @@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve util::scoped_memory unigrams; MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri); - RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder); + RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder); fixed_counts = finder.Counts(); } unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); @@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve inputs[i-2].Rewind(); } if (Quant::kTrain) { - util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing"); + util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), + config.ProgressMessages(), "Quantizing"); for (unsigned char i = 2; i < counts.size(); ++i) { TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant); } @@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Rewind(); } - // Fill entries except unigram probabilities. + // Fill entries except unigram probabilities. { WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri); - RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer); + RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer); } - // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. + // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. for (unsigned char order = 2; order <= counts.size(); ++order) { const RecordReader &context = contexts[order - 2]; if (context) { @@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve } /* Set ending offsets so the last entry will be sized properly */ - // Last entry for unigrams was already set. + // Last entry for unigrams was already set. 
if (out.middle_begin_ != out.middle_end_) { for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) { i->FinishedLoading((i+1)->InsertIndex(), config); } (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config); - } + } } template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) { @@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ } else { temporary_prefix = file; } - // At least 1MB sorting memory. + // At least 1MB sorting memory. SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab); BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); diff --git a/klm/lm/search_trie.hh b/klm/lm/search_trie.hh index 10b22ab1..1264baf5 100644 --- a/klm/lm/search_trie.hh +++ b/klm/lm/search_trie.hh @@ -44,8 +44,8 @@ template <class Quant, class Bhiksha> class TrieSearch { Bhiksha::UpdateConfigFromBinary(fd, config); } - static std::size_t Size(const std::vector<uint64_t> &counts, const Config &config) { - std::size_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); + static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) { + uint64_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); for (unsigned char i = 1; i < counts.size() - 1; ++i) { ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], counts[i+1], config); } diff --git a/klm/lm/sizes.cc b/klm/lm/sizes.cc new file mode 100644 index 00000000..55ad586c --- /dev/null +++ b/klm/lm/sizes.cc @@ -0,0 +1,63 @@ +#include "lm/sizes.hh" +#include "lm/model.hh" +#include "util/file_piece.hh" + +#include <vector> +#include <iomanip> + +namespace lm { +namespace ngram { + +void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config) { + uint64_t sizes[6]; + sizes[0] = ProbingModel::Size(counts, config); + sizes[1] = RestProbingModel::Size(counts, config); + sizes[2] = TrieModel::Size(counts, config); + sizes[3] = QuantTrieModel::Size(counts, config); + sizes[4] = ArrayTrieModel::Size(counts, config); + sizes[5] = QuantArrayTrieModel::Size(counts, config); + uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); + uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); + uint64_t divide; + char prefix; + if (min_length < (1 << 10) * 10) { + prefix = ' '; + divide = 1; + } else if (min_length < (1 << 20) * 10) { + prefix = 'k'; + divide = 1 << 10; + } else if (min_length < (1ULL << 30) * 10) { + prefix = 'M'; + divide = 1 << 20; + } else { + prefix = 'G'; + divide = 1 << 30; + } + long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide)))); + std::cerr << "Memory estimate for binary LM:\ntype "; + + // right align bytes. 
+ for (long int i = 0; i < length - 2; ++i) std::cerr << ' '; + + std::cerr << prefix << "B\n" + "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" + "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" + "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" + "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" + "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" + "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; +} + +void ShowSizes(const std::vector<uint64_t> &counts) { + lm::ngram::Config config; + ShowSizes(counts, config); +} + +void ShowSizes(const char *file, const lm::ngram::Config &config) { + std::vector<uint64_t> counts; + util::FilePiece f(file); + lm::ReadARPACounts(f, counts); + ShowSizes(counts, config); +} + +}} //namespaces diff --git a/klm/lm/sizes.hh b/klm/lm/sizes.hh new file mode 100644 index 00000000..85abade7 --- /dev/null +++ b/klm/lm/sizes.hh @@ -0,0 +1,17 @@ +#ifndef LM_SIZES__ +#define LM_SIZES__ + +#include <vector> + +#include <stdint.h> + +namespace lm { namespace ngram { + +struct Config; + +void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config); +void ShowSizes(const std::vector<uint64_t> &counts); +void ShowSizes(const char *file, const lm::ngram::Config &config); + +}} // namespaces +#endif // LM_SIZES__ diff --git a/klm/lm/sri_test.cc b/klm/lm/sri_test.cc deleted file mode 100644 index e697d722..00000000 --- a/klm/lm/sri_test.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "lm/sri.hh" - -#include <stdlib.h> - -#define BOOST_TEST_MODULE SRITest -#include <boost/test/unit_test.hpp> - -namespace lm { -namespace sri { -namespace { - -#define StartTest(word, ngram, score) \ - ret = model.FullScore( \ - state, \ - model.GetVocabulary().Index(word), \ - out);\ - BOOST_CHECK_CLOSE(score, ret.prob, 0.001); \ - BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \ - BOOST_CHECK_EQUAL(std::min<unsigned char>(ngram, 5 - 1), out.valid_length_); - -#define AppendTest(word, ngram, score) \ - StartTest(word, ngram, score) \ - state = out; - -template <class M> void Starters(M &model) { - FullScoreReturn ret; - Model::State state(model.BeginSentenceState()); - Model::State out; - - StartTest("looking", 2, -0.4846522); - - // , probability plus <s> backoff - StartTest(",", 1, -1.383514 + -0.4149733); - // <unk> probability plus <s> backoff - StartTest("this_is_not_found", 0, -1.995635 + -0.4149733); -} - -template <class M> void Continuation(M &model) { - FullScoreReturn ret; - Model::State state(model.BeginSentenceState()); - Model::State out; - - AppendTest("looking", 2, -0.484652); - AppendTest("on", 3, -0.348837); - AppendTest("a", 4, -0.0155266); - AppendTest("little", 5, -0.00306122); - State preserve = state; - AppendTest("the", 1, -4.04005); - AppendTest("biarritz", 1, -1.9889); - AppendTest("not_found", 0, -2.29666); - AppendTest("more", 1, -1.20632); - AppendTest(".", 2, -0.51363); - AppendTest("</s>", 3, -0.0191651); - - state = preserve; - AppendTest("more", 5, 
-0.00181395); - AppendTest("loin", 5, -0.0432557); -} - -BOOST_AUTO_TEST_CASE(starters) { Model m("test.arpa", 5); Starters(m); } -BOOST_AUTO_TEST_CASE(continuation) { Model m("test.arpa", 5); Continuation(m); } - -} // namespace -} // namespace sri -} // namespace lm diff --git a/klm/lm/state.hh b/klm/lm/state.hh index 830e40aa..d8e6c132 100644 --- a/klm/lm/state.hh +++ b/klm/lm/state.hh @@ -47,6 +47,8 @@ class State { unsigned char length; }; +typedef State Right; + inline uint64_t hash_value(const State &state, uint64_t seed = 0) { return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length, seed); } @@ -54,14 +56,14 @@ inline uint64_t hash_value(const State &state, uint64_t seed = 0) { struct Left { bool operator==(const Left &other) const { return - (length == other.length) && - pointers[length - 1] == other.pointers[length - 1] && - full == other.full; + length == other.length && + (!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full)); } int Compare(const Left &other) const { if (length < other.length) return -1; if (length > other.length) return 1; + if (length == 0) return 0; // Must be full. if (pointers[length - 1] > other.pointers[length - 1]) return 1; if (pointers[length - 1] < other.pointers[length - 1]) return -1; return (int)full - (int)other.full; diff --git a/klm/lm/trie.cc b/klm/lm/trie.cc index 0f1ca574..d9895f89 100644 --- a/klm/lm/trie.cc +++ b/klm/lm/trie.cc @@ -36,7 +36,7 @@ bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_ } } // namespace -std::size_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { +uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits; // Extra entry for next pointer at the end. // +7 then / 8 to round up bits and convert to bytes @@ -57,7 +57,7 @@ void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits) max_vocab_ = max_vocab; } -template <class Bhiksha> std::size_t BitPackedMiddle<Bhiksha>::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { +template <class Bhiksha> uint64_t BitPackedMiddle<Bhiksha>::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { return Bhiksha::Size(entries + 1, max_ptr, config) + BaseSize(entries, max_vocab, quant_bits + Bhiksha::InlineBits(entries + 1, max_ptr, config)); } diff --git a/klm/lm/trie.hh b/klm/lm/trie.hh index 034a1414..9ea3c546 100644 --- a/klm/lm/trie.hh +++ b/klm/lm/trie.hh @@ -49,7 +49,7 @@ class Unigram { unigram_ = static_cast<UnigramValue*>(start); } - static std::size_t Size(uint64_t count) { + static uint64_t Size(uint64_t count) { // +1 in case unknown doesn't appear. +1 for the final next. 
return (count + 2) * sizeof(UnigramValue); } @@ -84,7 +84,7 @@ class BitPacked { } protected: - static std::size_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); + static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits); @@ -99,7 +99,7 @@ class BitPacked { template <class Bhiksha> class BitPackedMiddle : public BitPacked { public: - static std::size_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); // next_source need not be initialized. BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config); @@ -128,7 +128,7 @@ template <class Bhiksha> class BitPackedMiddle : public BitPacked { class BitPackedLongest : public BitPacked { public: - static std::size_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { return BaseSize(entries, max_vocab, quant_bits); } diff --git a/klm/lm/trie_sort.cc b/klm/lm/trie_sort.cc index 0d83221e..dc542bb3 100644 --- a/klm/lm/trie_sort.cc +++ b/klm/lm/trie_sort.cc @@ -22,12 +22,6 @@ namespace lm { namespace ngram { namespace trie { - -void WriteOrThrow(FILE *to, const void *data, size_t size) { - assert(size); - if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); -} - namespace { typedef util::SizedIterator NGramIter; @@ -71,13 +65,13 @@ class PartialViewProxy { typedef util::ProxyIterator<PartialViewProxy> PartialIter; -FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) { - util::scoped_fd file(maker.Make()); +FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) { + util::scoped_fd file(util::MakeTemp(temp_prefix)); util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); return util::FDOpenOrThrow(file); } -FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) { +FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) { const size_t context_size = sizeof(WordIndex) * (order - 1); // Sort just the contexts using the same memory. PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size)); @@ -90,17 +84,17 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make #endif (context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1))); - util::scoped_FILE out(maker.MakeFile()); + util::scoped_FILE out(util::FMakeTemp(temp_prefix)); // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. 
if (context_begin == context_end) return out.release(); PartialIter i(context_begin); - WriteOrThrow(out.get(), i->Data(), context_size); + util::WriteOrThrow(out.get(), i->Data(), context_size); const void *previous = i->Data(); ++i; for (; i != context_end; ++i) { if (memcmp(previous, i->Data(), context_size)) { - WriteOrThrow(out.get(), i->Data(), context_size); + util::WriteOrThrow(out.get(), i->Data(), context_size); previous = i->Data(); } } @@ -116,23 +110,23 @@ struct ThrowCombine { // Useful for context files that just contain records with no value. struct FirstCombine { void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const { - WriteOrThrow(out, first, entry_size); + util::WriteOrThrow(out, first, entry_size); } }; -template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) { +template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) { std::size_t entry_size = sizeof(WordIndex) * order + weights_size; RecordReader first, second; first.Init(first_file, entry_size); second.Init(second_file, entry_size); - util::scoped_FILE out_file(maker.MakeFile()); + util::scoped_FILE out_file(util::FMakeTemp(temp_prefix)); EntryCompare less(order); while (first && second) { if (less(first.Data(), second.Data())) { - WriteOrThrow(out_file.get(), first.Data(), entry_size); + util::WriteOrThrow(out_file.get(), first.Data(), entry_size); ++first; } else if (less(second.Data(), first.Data())) { - WriteOrThrow(out_file.get(), second.Data(), entry_size); + util::WriteOrThrow(out_file.get(), second.Data(), entry_size); ++second; } else { combine(entry_size, first.Data(), second.Data(), out_file.get()); @@ -140,7 +134,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f } } for (RecordReader &remains = (first ? first : second); remains; ++remains) { - WriteOrThrow(out_file.get(), remains.Data(), entry_size); + util::WriteOrThrow(out_file.get(), remains.Data(), entry_size); } return out_file.release(); } @@ -164,7 +158,7 @@ void RecordReader::Init(FILE *file, std::size_t entry_size) { void RecordReader::Overwrite(const void *start, std::size_t amount) { long internal = (uint8_t*)start - (uint8_t*)data_.get(); UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); - WriteOrThrow(file_, start, amount); + util::WriteOrThrow(file_, start, amount); long forward = entry_size_ - internal - amount; #if !defined(_WIN32) && !defined(_WIN64) if (forward) @@ -183,9 +177,8 @@ void RecordReader::Rewind() { } SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { - util::TempMaker maker(file_prefix); PositiveProbWarn warn(config.positive_log_probability); - unigram_.reset(maker.Make()); + unigram_.reset(util::MakeTemp(file_prefix)); { // In case <unk> appears. 
size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); @@ -208,7 +201,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); for (unsigned char order = 2; order <= counts.size(); ++order) { - ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer); + ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer); } ReadEnd(f); } @@ -233,7 +226,7 @@ class Closer { }; } // namespace -void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { +void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { ReadNGramHeader(f, order); const size_t count = counts[order - 1]; // Size of weights. Does it include backoff? @@ -267,8 +260,8 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo std::sort #endif (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order))); - files.push_back(DiskFlush(begin, out_end, maker)); - contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order)); + files.push_back(DiskFlush(begin, out_end, file_prefix)); + contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order)); done += (out_end - begin) / entry_size; } @@ -276,10 +269,10 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo // All individual files created. Merge them. 
while (files.size() > 1) { - files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine())); + files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine())); files_closer.PopFront(); files_closer.PopFront(); - contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine())); + contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine())); contexts_closer.PopFront(); contexts_closer.PopFront(); } diff --git a/klm/lm/trie_sort.hh b/klm/lm/trie_sort.hh index 1e6fce51..1afd9562 100644 --- a/klm/lm/trie_sort.hh +++ b/klm/lm/trie_sort.hh @@ -18,7 +18,6 @@ namespace util { class FilePiece; -class TempMaker; } // namespace util namespace lm { @@ -29,8 +28,6 @@ struct Config; namespace trie { -void WriteOrThrow(FILE *to, const void *data, size_t size); - class EntryCompare : public std::binary_function<const void*, const void*, bool> { public: explicit EntryCompare(unsigned char order) : order_(order) {} @@ -103,7 +100,7 @@ class SortedFiles { } private: - void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); + void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); util::scoped_fd unigram_; diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc index 5de68f16..fd7f96dc 100644 --- a/klm/lm/vocab.cc +++ b/klm/lm/vocab.cc @@ -80,14 +80,14 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) { buffer_.push_back(0); } -void WriteWordsWrapper::Write(int fd) { - util::SeekEnd(fd); +void WriteWordsWrapper::Write(int fd, uint64_t start) { + util::SeekOrThrow(fd, start); util::WriteOrThrow(fd, buffer_.data(), buffer_.size()); } SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {} -std::size_t SortedVocabulary::Size(std::size_t entries, const Config &/*config*/) { +uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) { // Lead with the number of entries. return sizeof(uint64_t) + sizeof(uint64_t) * entries; } @@ -116,7 +116,9 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) { } *end_ = hashed; if (enumerate_) { - strings_to_enumerate_[end_ - begin_].assign(str.data(), str.size()); + void *copied = string_backing_.Allocate(str.size()); + memcpy(copied, str.data(), str.size()); + strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast<const char*>(copied), str.size()); } ++end_; // This is 1 + the offset where it was inserted to make room for unk. 
@@ -126,7 +128,7 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) { void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { if (enumerate_) { if (!strings_to_enumerate_.empty()) { - util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); + util::PairedIterator<ProbBackoff*, StringPiece*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); util::JointSort(begin_, end_, values); } for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) { @@ -134,6 +136,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { enumerate_->Add(i + 1, strings_to_enumerate_[i]); } strings_to_enumerate_.clear(); + string_backing_.FreeAll(); } else { util::JointSort(begin_, end_, reorder_vocab + 1); } @@ -165,7 +168,7 @@ struct ProbingVocabularyHeader { ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {} -std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) { +uint64_t ProbingVocabulary::Size(uint64_t entries, const Config &config) { return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier); } diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh index a25432f9..3902f117 100644 --- a/klm/lm/vocab.hh +++ b/klm/lm/vocab.hh @@ -4,6 +4,7 @@ #include "lm/enumerate_vocab.hh" #include "lm/lm_exception.hh" #include "lm/virtual_interface.hh" +#include "util/pool.hh" #include "util/probing_hash_table.hh" #include "util/sorted_uniform.hh" #include "util/string_piece.hh" @@ -35,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab { void Add(WordIndex index, const StringPiece &str); - void Write(int fd); + void Write(int fd, uint64_t start); private: EnumerateVocab *inner_; @@ -62,7 +63,7 @@ class SortedVocabulary : public base::Vocabulary { } // Size for purposes of file writing - static size_t Size(std::size_t entries, const Config &config); + static uint64_t Size(uint64_t entries, const Config &config); // Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary. WordIndex Bound() const { return bound_; } @@ -96,7 +97,9 @@ class SortedVocabulary : public base::Vocabulary { EnumerateVocab *enumerate_; // Actual strings. Used only when loading from ARPA and enumerate_ != NULL - std::vector<std::string> strings_to_enumerate_; + util::Pool string_backing_; + + std::vector<StringPiece> strings_to_enumerate_; }; #pragma pack(push) @@ -129,7 +132,7 @@ class ProbingVocabulary : public base::Vocabulary { return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; } - static size_t Size(std::size_t entries, const Config &config); + static uint64_t Size(uint64_t entries, const Config &config); // Vocab words are [0, Bound()). WordIndex Bound() const { return bound_; } diff --git a/klm/lm/word_index.hh b/klm/lm/word_index.hh index 67841c30..e09557a7 100644 --- a/klm/lm/word_index.hh +++ b/klm/lm/word_index.hh @@ -2,8 +2,11 @@ #ifndef LM_WORD_INDEX__ #define LM_WORD_INDEX__ +#include <limits.h> + namespace lm { typedef unsigned int WordIndex; +const WordIndex kMaxWordIndex = UINT_MAX; } // namespace lm typedef lm::WordIndex LMWordIndex; |
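The new lm/partial.hh header added above exposes RevealBefore and RevealAfter for incrementally revealing context around a chart state. Below is a minimal sketch of driving it directly, assuming a RestProbingModel loaded from the test.arpa fixture and mirroring the SimpleBefore case in partial_test.cc (the backoff constant is the value that test uses); it is an illustration, not part of the patch.

#include "lm/model.hh"
#include "lm/partial.hh"

#include <iostream>

int main() {
  using namespace lm::ngram;

  Config config;
  config.arpa_complain = Config::NONE;
  RestProbingModel model("test.arpa", config);

  // Empty chart state: nothing known to the left or right yet.
  Left left;
  left.full = false;
  left.length = 0;
  Right right;
  right.length = 0;

  // Reveal one word of right context: "." with its backoff from test.arpa.
  Right reveal;
  reveal.length = 1;
  reveal.words[0] = model.GetVocabulary().Index(".");
  reveal.backoff[0] = -0.845098;

  // seen = 0 words already accounted for, reveal_full = false.
  float adjust = RevealBefore(model, reveal, 0, false, left, right);
  std::cout << "log10 adjustment: " << adjust << "\n";
  return 0;
}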
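The size estimation that build_binary prints is now factored out into lm/sizes.hh. A minimal sketch, assuming an ARPA file named test.arpa, of calling it directly to get the per-structure memory estimates on stderr; like the sketch above, it assumes the program is linked against the kenlm libraries built in this directory.

#include "lm/config.hh"
#include "lm/sizes.hh"

int main() {
  // Default Config: default probing multiplier and quantization bits.
  lm::ngram::Config config;
  // Reads the ARPA counts and writes the estimate table to stderr.
  lm::ngram::ShowSizes("test.arpa", config);
  return 0;
}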