diff options
34 files changed, 2097 insertions, 634 deletions
diff --git a/klm/lm/Makefile.am b/klm/lm/Makefile.am index 48f9889e..eb71c0f5 100644 --- a/klm/lm/Makefile.am +++ b/klm/lm/Makefile.am @@ -7,11 +7,17 @@ noinst_LIBRARIES = libklm.a libklm_a_SOURCES = \ - exception.cc \ - ngram.cc \ - ngram_build_binary.cc \ + binary_format.cc \ + config.cc \ + lm_exception.cc \ + model.cc \ ngram_query.cc \ - virtual_interface.cc + read_arpa.cc \ + search_hashed.cc \ + search_trie.cc \ + trie.cc \ + virtual_interface.cc \ + vocab.cc AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc new file mode 100644 index 00000000..2a075b6b --- /dev/null +++ b/klm/lm/binary_format.cc @@ -0,0 +1,191 @@ +#include "lm/binary_format.hh" + +#include "lm/lm_exception.hh" +#include "util/file_piece.hh" + +#include <limits> +#include <string> + +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +namespace lm { +namespace ngram { +namespace { +const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; +const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 1\n\0"; +const long int kMagicVersion = 1; + +// Test values. +struct Sanity { + char magic[sizeof(kMagicBytes)]; + float zero_f, one_f, minus_half_f; + WordIndex one_word_index, max_word_index; + uint64_t one_uint64; + + void SetToReference() { + std::memcpy(magic, kMagicBytes, sizeof(magic)); + zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; + one_word_index = 1; + max_word_index = std::numeric_limits<WordIndex>::max(); + one_uint64 = 1; + } +}; + +const char *kModelNames[3] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "bit packed trie"}; + +std::size_t Align8(std::size_t in) { + std::size_t off = in % 8; + if (!off) return in; + return in + 8 - off; +} + +std::size_t TotalHeaderSize(unsigned char order) { + return Align8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); +} + +void ReadLoop(int fd, void *to_void, std::size_t size) { + uint8_t *to = static_cast<uint8_t*>(to_void); + while (size) { + ssize_t ret = read(fd, to, size); + if (ret == -1) UTIL_THROW(util::ErrnoException, "Failed to read from binary file"); + if (ret == 0) UTIL_THROW(util::ErrnoException, "Binary file too short"); + to += ret; + size -= ret; + } +} + +void WriteHeader(void *to, const Parameters ¶ms) { + Sanity header = Sanity(); + header.SetToReference(); + memcpy(to, &header, sizeof(Sanity)); + char *out = reinterpret_cast<char*>(to) + sizeof(Sanity); + + *reinterpret_cast<FixedWidthParameters*>(out) = params.fixed; + out += sizeof(FixedWidthParameters); + + uint64_t *counts = reinterpret_cast<uint64_t*>(out); + for (std::size_t i = 0; i < params.counts.size(); ++i) { + counts[i] = params.counts[i]; + } +} + +} // namespace +namespace detail { + +bool IsBinaryFormat(int fd) { + const off_t size = util::SizeFile(fd); + if (size == util::kBadSize || (size <= static_cast<off_t>(sizeof(Sanity)))) return false; + // Try reading the header. + util::scoped_memory memory; + try { + util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); + } catch (const util::Exception &e) { + return false; + } + Sanity reference_header = Sanity(); + reference_header.SetToReference(); + if (!memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true; + if (!memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) { + char *end_ptr; + const char *begin_version = static_cast<const char*>(memory.get()) + strlen(kMagicBeforeVersion); + long int version = strtol(begin_version, &end_ptr, 10); + if ((end_ptr != begin_version) && version != kMagicVersion) { + UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to rebuild your binary LM from the ARPA. Sorry."); + } + UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture."); + } + return false; +} + +void ReadHeader(int fd, Parameters &out) { + if ((off_t)-1 == lseek(fd, sizeof(Sanity), SEEK_SET)) UTIL_THROW(util::ErrnoException, "Seek failed in binary file"); + ReadLoop(fd, &out.fixed, sizeof(out.fixed)); + if (out.fixed.probing_multiplier < 1.0) + UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0."); + + out.counts.resize(static_cast<std::size_t>(out.fixed.order)); + ReadLoop(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); +} + +void MatchCheck(ModelType model_type, const Parameters ¶ms) { + if (params.fixed.model_type != model_type) { + if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *))) + UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this is not implemented for in this inference code."); + UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]); + } +} + +uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing) { + const off_t file_size = util::SizeFile(backing.file.get()); + // The header is smaller than a page, so we have to map the whole header as well. + std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size; + if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map) + UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); + + util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.memory); + + if (config.enumerate_vocab && !params.fixed.has_vocabulary) + UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary."); + + if (config.enumerate_vocab) { + if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET)) + UTIL_THROW(util::ErrnoException, "Failed to seek in binary file to vocab words"); + } + return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(params.counts.size()); +} + +uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) { + if (config.probing_multiplier <= 1.0) UTIL_THROW(FormatLoadException, "probing multiplier must be > 1.0"); + if (config.write_mmap) { + std::size_t total_map = TotalHeaderSize(counts.size()) + memory_size; + // Write out an mmap file. + backing.memory.reset(util::MapZeroedWrite(config.write_mmap, total_map, backing.file), total_map, util::scoped_memory::MMAP_ALLOCATED); + + Parameters params; + params.counts = counts; + params.fixed.order = counts.size(); + params.fixed.probing_multiplier = config.probing_multiplier; + params.fixed.model_type = model_type; + params.fixed.has_vocabulary = config.include_vocab; + + WriteHeader(backing.memory.get(), params); + + if (params.fixed.has_vocabulary) { + if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET)) + UTIL_THROW(util::ErrnoException, "Failed to seek in binary file " << config.write_mmap << " to vocab words"); + } + return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(counts.size()); + } else { + backing.memory.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED); + return reinterpret_cast<uint8_t*>(backing.memory.get()); + } +} + +void ComplainAboutARPA(const Config &config, ModelType model_type) { + if (config.write_mmap || !config.messages) return; + if (config.arpa_complain == Config::ALL) { + *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; + } else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) { + *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; + } +} + +} // namespace detail + +bool RecognizeBinary(const char *file, ModelType &recognized) { + util::scoped_fd fd(util::OpenReadOrThrow(file)); + if (!detail::IsBinaryFormat(fd.get())) return false; + Parameters params; + detail::ReadHeader(fd.get(), params); + recognized = params.fixed.model_type; + return true; +} + +} // namespace ngram +} // namespace lm diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc new file mode 100644 index 00000000..4db631a2 --- /dev/null +++ b/klm/lm/build_binary.cc @@ -0,0 +1,13 @@ +#include "lm/model.hh" + +#include <iostream> + +int main(int argc, char *argv[]) { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " input.arpa output.mmap" << std::endl; + return 1; + } + lm::ngram::Config config; + config.write_mmap = argv[2]; + lm::ngram::Model(argv[1], config); +} diff --git a/klm/lm/config.cc b/klm/lm/config.cc new file mode 100644 index 00000000..2831d578 --- /dev/null +++ b/klm/lm/config.cc @@ -0,0 +1,22 @@ +#include "lm/config.hh" + +#include <iostream> + +namespace lm { +namespace ngram { + +Config::Config() : + messages(&std::cerr), + enumerate_vocab(NULL), + unknown_missing(COMPLAIN), + unknown_missing_prob(0.0), + probing_multiplier(1.5), + building_memory(1073741824ULL), // 1 GB + temporary_directory_prefix(NULL), + arpa_complain(ALL), + write_mmap(NULL), + include_vocab(true), + load_method(util::POPULATE_OR_READ) {} + +} // namespace ngram +} // namespace lm diff --git a/klm/lm/lm_exception.cc b/klm/lm/lm_exception.cc new file mode 100644 index 00000000..ab2ec52f --- /dev/null +++ b/klm/lm/lm_exception.cc @@ -0,0 +1,21 @@ +#include "lm/lm_exception.hh" + +#include<errno.h> +#include<stdio.h> + +namespace lm { + +LoadException::LoadException() throw() {} +LoadException::~LoadException() throw() {} +VocabLoadException::VocabLoadException() throw() {} +VocabLoadException::~VocabLoadException() throw() {} + +FormatLoadException::FormatLoadException() throw() {} +FormatLoadException::~FormatLoadException() throw() {} + +SpecialWordMissingException::SpecialWordMissingException(StringPiece which) throw() { + *this << "Missing special word " << which; +} +SpecialWordMissingException::~SpecialWordMissingException() throw() {} + +} // namespace lm diff --git a/klm/lm/model.cc b/klm/lm/model.cc new file mode 100644 index 00000000..6921d4d9 --- /dev/null +++ b/klm/lm/model.cc @@ -0,0 +1,239 @@ +#include "lm/model.hh" + +#include "lm/lm_exception.hh" +#include "lm/search_hashed.hh" +#include "lm/search_trie.hh" +#include "lm/read_arpa.hh" +#include "util/murmur_hash.hh" + +#include <algorithm> +#include <functional> +#include <numeric> +#include <cmath> + +namespace lm { +namespace ngram { + +size_t hash_value(const State &state) { + return util::MurmurHashNative(state.history_, sizeof(WordIndex) * state.valid_length_); +} + +namespace detail { + +template <class Search, class VocabularyT> size_t GenericModel<Search, VocabularyT>::Size(const std::vector<uint64_t> &counts, const Config &config) { + if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit ngram.hh's kMaxOrder to at least this value and recompile."); + if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); + return VocabularyT::Size(counts[0], config) + Search::Size(counts, config); +} + +template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::SetupMemory(void *base, const std::vector<uint64_t> &counts, const Config &config) { + uint8_t *start = static_cast<uint8_t*>(base); + size_t allocated = VocabularyT::Size(counts[0], config); + vocab_.SetupMemory(start, allocated, counts[0], config); + start += allocated; + start = search_.SetupMemory(start, counts, config); + if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != Size(counts, config)) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << Size(counts, config)); +} + +template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) { + LoadLM(file, config, *this); + + // g++ prints warnings unless these are fully initialized. + State begin_sentence = State(); + begin_sentence.valid_length_ = 1; + begin_sentence.history_[0] = vocab_.BeginSentence(); + begin_sentence.backoff_[0] = search_.unigram.Lookup(begin_sentence.history_[0]).backoff; + State null_context = State(); + null_context.valid_length_ = 0; + P::Init(begin_sentence, null_context, vocab_, search_.middle.size() + 2); +} + +template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { + SetupMemory(start, params.counts, config); + vocab_.LoadedBinary(fd, config.enumerate_vocab); + search_.unigram.LoadedBinary(); + for (typename std::vector<Middle>::iterator i = search_.middle.begin(); i != search_.middle.end(); ++i) { + i->LoadedBinary(); + } + search_.longest.LoadedBinary(); +} + +template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, util::FilePiece &f, void *start, const Parameters ¶ms, const Config &config) { + SetupMemory(start, params.counts, config); + + if (config.write_mmap) { + WriteWordsWrapper wrap(config.enumerate_vocab, backing_.file.get()); + vocab_.ConfigureEnumerate(&wrap, params.counts[0]); + search_.InitializeFromARPA(file, f, params.counts, config, vocab_); + } else { + vocab_.ConfigureEnumerate(config.enumerate_vocab, params.counts[0]); + search_.InitializeFromARPA(file, f, params.counts, config, vocab_); + } + // TODO: fail faster? + if (!vocab_.SawUnk()) { + switch(config.unknown_missing) { + case Config::THROW_UP: + { + SpecialWordMissingException e("<unk>"); + e << " and configuration was set to throw if unknown is missing"; + throw e; + } + case Config::COMPLAIN: + if (config.messages) *config.messages << "Language model is missing <unk>. Substituting probability " << config.unknown_missing_prob << "." << std::endl; + // There's no break;. This is by design. + case Config::SILENT: + // Default probabilities for unknown. + search_.unigram.Unknown().backoff = 0.0; + search_.unigram.Unknown().prob = config.unknown_missing_prob; + break; + } + } + if (std::fabs(search_.unigram.Unknown().backoff) > 0.0000001) UTIL_THROW(FormatLoadException, "Backoff for unknown word should be zero, but was given as " << search_.unigram.Unknown().backoff); +} + +template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { + unsigned char backoff_start; + FullScoreReturn ret = ScoreExceptBackoff(in_state.history_, in_state.history_ + in_state.valid_length_, new_word, backoff_start, out_state); + if (backoff_start - 1 < in_state.valid_length_) { + ret.prob = std::accumulate(in_state.backoff_ + backoff_start - 1, in_state.backoff_ + in_state.valid_length_, ret.prob); + } + return ret; +} + +template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const { + unsigned char backoff_start; + context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); + FullScoreReturn ret = ScoreExceptBackoff(context_rbegin, context_rend, new_word, backoff_start, out_state); + ret.prob += SlowBackoffLookup(context_rbegin, context_rend, backoff_start); + return ret; +} + +template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { + context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); + if (context_rend == context_rbegin || *context_rbegin == 0) { + out_state.valid_length_ = 0; + return; + } + float ignored_prob; + typename Search::Node node; + search_.LookupUnigram(*context_rbegin, ignored_prob, out_state.backoff_[0], node); + float *backoff_out = out_state.backoff_ + 1; + const WordIndex *i = context_rbegin + 1; + for (; i < context_rend; ++i, ++backoff_out) { + if (!search_.LookupMiddleNoProb(search_.middle[i - context_rbegin - 1], *i, *backoff_out, node)) { + out_state.valid_length_ = i - context_rbegin; + std::copy(context_rbegin, i, out_state.history_); + return; + } + } + std::copy(context_rbegin, context_rend, out_state.history_); + out_state.valid_length_ = static_cast<unsigned char>(context_rend - context_rbegin); +} + +template <class Search, class VocabularyT> float GenericModel<Search, VocabularyT>::SlowBackoffLookup( + const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const { + // Add the backoff weights for n-grams of order start to (context_rend - context_rbegin). + if (context_rend - context_rbegin < static_cast<std::ptrdiff_t>(start)) return 0.0; + float ret = 0.0; + if (start == 1) { + ret += search_.unigram.Lookup(*context_rbegin).backoff; + start = 2; + } + typename Search::Node node; + if (!search_.FastMakeNode(context_rbegin, context_rbegin + start - 1, node)) { + return 0.0; + } + float backoff; + // i is the order of the backoff we're looking for. + for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i) { + if (!search_.LookupMiddleNoProb(search_.middle[i - context_rbegin - 1], *i, backoff, node)) break; + ret += backoff; + } + return ret; +} + +/* Ugly optimized function. Produce a score excluding backoff. + * The search goes in increasing order of ngram length. + * Context goes backward, so context_begin is the word immediately preceeding + * new_word. + */ +template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff( + const WordIndex *context_rbegin, + const WordIndex *context_rend, + const WordIndex new_word, + unsigned char &backoff_start, + State &out_state) const { + FullScoreReturn ret; + typename Search::Node node; + float *backoff_out(out_state.backoff_); + search_.LookupUnigram(new_word, ret.prob, *backoff_out, node); + if (new_word == 0) { + ret.ngram_length = out_state.valid_length_ = 0; + // All of backoff. + backoff_start = 1; + return ret; + } + out_state.history_[0] = new_word; + if (context_rbegin == context_rend) { + ret.ngram_length = out_state.valid_length_ = 1; + // No backoff because we don't have the history for it. + backoff_start = P::Order(); + return ret; + } + ++backoff_out; + + // Ok now we now that the bigram contains known words. Start by looking it up. + + const WordIndex *hist_iter = context_rbegin; + typename std::vector<Middle>::const_iterator mid_iter = search_.middle.begin(); + for (; ; ++mid_iter, ++hist_iter, ++backoff_out) { + if (hist_iter == context_rend) { + // Ran out of history. No backoff. + backoff_start = P::Order(); + std::copy(context_rbegin, context_rend, out_state.history_ + 1); + ret.ngram_length = out_state.valid_length_ = (context_rend - context_rbegin) + 1; + // ret.prob was already set. + return ret; + } + + if (mid_iter == search_.middle.end()) break; + + if (!search_.LookupMiddle(*mid_iter, *hist_iter, ret.prob, *backoff_out, node)) { + // Didn't find an ngram using hist_iter. + // The history used in the found n-gram is [context_rbegin, hist_iter). + std::copy(context_rbegin, hist_iter, out_state.history_ + 1); + // Therefore, we found a (hist_iter - context_rbegin + 1)-gram including the last word. + ret.ngram_length = out_state.valid_length_ = (hist_iter - context_rbegin) + 1; + backoff_start = mid_iter - search_.middle.begin() + 1; + // ret.prob was already set. + return ret; + } + } + + // It passed every lookup in search_.middle. That means it's at least a (P::Order() - 1)-gram. + // All that's left is to check search_.longest. + + if (!search_.LookupLongest(*hist_iter, ret.prob, node)) { + // It's an (P::Order()-1)-gram + std::copy(context_rbegin, context_rbegin + P::Order() - 2, out_state.history_ + 1); + ret.ngram_length = out_state.valid_length_ = P::Order() - 1; + backoff_start = P::Order() - 1; + // ret.prob was already set. + return ret; + } + // It's an P::Order()-gram + // out_state.valid_length_ is still P::Order() - 1 because the next lookup will only need that much. + std::copy(context_rbegin, context_rbegin + P::Order() - 2, out_state.history_ + 1); + out_state.valid_length_ = P::Order() - 1; + ret.ngram_length = P::Order(); + backoff_start = P::Order(); + return ret; +} + +template class GenericModel<ProbingHashedSearch, ProbingVocabulary>; +template class GenericModel<SortedHashedSearch, SortedVocabulary>; +template class GenericModel<trie::TrieSearch, SortedVocabulary>; + +} // namespace detail +} // namespace ngram +} // namespace lm diff --git a/klm/lm/model_test.cc b/klm/lm/model_test.cc new file mode 100644 index 00000000..159628d4 --- /dev/null +++ b/klm/lm/model_test.cc @@ -0,0 +1,200 @@ +#include "lm/model.hh" + +#include <stdlib.h> + +#define BOOST_TEST_MODULE ModelTest +#include <boost/test/unit_test.hpp> + +namespace lm { +namespace ngram { +namespace { + +#define StartTest(word, ngram, score) \ + ret = model.FullScore( \ + state, \ + model.GetVocabulary().Index(word), \ + out);\ + BOOST_CHECK_CLOSE(score, ret.prob, 0.001); \ + BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \ + BOOST_CHECK_EQUAL(std::min<unsigned char>(ngram, 5 - 1), out.valid_length_); + +#define AppendTest(word, ngram, score) \ + StartTest(word, ngram, score) \ + state = out; + +template <class M> void Starters(const M &model) { + FullScoreReturn ret; + Model::State state(model.BeginSentenceState()); + Model::State out; + + StartTest("looking", 2, -0.4846522); + + // , probability plus <s> backoff + StartTest(",", 1, -1.383514 + -0.4149733); + // <unk> probability plus <s> backoff + StartTest("this_is_not_found", 0, -1.995635 + -0.4149733); +} + +template <class M> void Continuation(const M &model) { + FullScoreReturn ret; + Model::State state(model.BeginSentenceState()); + Model::State out; + + AppendTest("looking", 2, -0.484652); + AppendTest("on", 3, -0.348837); + AppendTest("a", 4, -0.0155266); + AppendTest("little", 5, -0.00306122); + State preserve = state; + AppendTest("the", 1, -4.04005); + AppendTest("biarritz", 1, -1.9889); + AppendTest("not_found", 0, -2.29666); + AppendTest("more", 1, -1.20632); + AppendTest(".", 2, -0.51363); + AppendTest("</s>", 3, -0.0191651); + + state = preserve; + AppendTest("more", 5, -0.00181395); + AppendTest("loin", 5, -0.0432557); +} + +#define StatelessTest(word, provide, ngram, score) \ + ret = model.FullScoreForgotState(indices + num_words - word, indices + num_words - word + provide, indices[num_words - word - 1], state); \ + BOOST_CHECK_CLOSE(score, ret.prob, 0.001); \ + BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \ + model.GetState(indices + num_words - word, indices + num_words - word + provide, before); \ + ret = model.FullScore(before, indices[num_words - word - 1], out); \ + BOOST_CHECK(state == out); \ + BOOST_CHECK_CLOSE(score, ret.prob, 0.001); \ + BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); + +template <class M> void Stateless(const M &model) { + const char *words[] = {"<s>", "looking", "on", "a", "little", "the", "biarritz", "not_found", "more", ".", "</s>"}; + const size_t num_words = sizeof(words) / sizeof(const char*); + // Silience "array subscript is above array bounds" when extracting end pointer. + WordIndex indices[num_words + 1]; + for (unsigned int i = 0; i < num_words; ++i) { + indices[num_words - 1 - i] = model.GetVocabulary().Index(words[i]); + } + FullScoreReturn ret; + State state, out, before; + + ret = model.FullScoreForgotState(indices + num_words - 1, indices + num_words, indices[num_words - 2], state); + BOOST_CHECK_CLOSE(-0.484652, ret.prob, 0.001); + StatelessTest(1, 1, 2, -0.484652); + + // looking + StatelessTest(1, 2, 2, -0.484652); + // on + AppendTest("on", 3, -0.348837); + StatelessTest(2, 3, 3, -0.348837); + StatelessTest(2, 2, 3, -0.348837); + StatelessTest(2, 1, 2, -0.4638903); + // a + StatelessTest(3, 4, 4, -0.0155266); + // little + AppendTest("little", 5, -0.00306122); + StatelessTest(4, 5, 5, -0.00306122); + // the + AppendTest("the", 1, -4.04005); + StatelessTest(5, 5, 1, -4.04005); + // No context of the. + StatelessTest(5, 0, 1, -1.687872); + // biarritz + StatelessTest(6, 1, 1, -1.9889); + // not found + StatelessTest(7, 1, 0, -2.29666); + StatelessTest(7, 0, 0, -1.995635); + + WordIndex unk[1]; + unk[0] = 0; + model.GetState(unk, unk + 1, state); + BOOST_CHECK_EQUAL(0, state.valid_length_); +} + +//const char *kExpectedOrderProbing[] = {"<unk>", ",", ".", "</s>", "<s>", "a", "also", "beyond", "biarritz", "call", "concerns", "consider", "considering", "for", "higher", "however", "i", "immediate", "in", "is", "little", "loin", "look", "looking", "more", "on", "screening", "small", "the", "to", "watch", "watching", "what", "would"}; + +class ExpectEnumerateVocab : public EnumerateVocab { + public: + ExpectEnumerateVocab() {} + + void Add(WordIndex index, const StringPiece &str) { + BOOST_CHECK_EQUAL(seen.size(), index); + seen.push_back(std::string(str.data(), str.length())); + } + + void Check(const base::Vocabulary &vocab) { + BOOST_CHECK_EQUAL(34, seen.size()); + BOOST_REQUIRE(!seen.empty()); + BOOST_CHECK_EQUAL("<unk>", seen[0]); + for (WordIndex i = 0; i < seen.size(); ++i) { + BOOST_CHECK_EQUAL(i, vocab.Index(seen[i])); + } + } + + void Clear() { + seen.clear(); + } + + std::vector<std::string> seen; +}; + +template <class ModelT> void LoadingTest() { + Config config; + config.arpa_complain = Config::NONE; + config.messages = NULL; + ExpectEnumerateVocab enumerate; + config.enumerate_vocab = &enumerate; + ModelT m("test.arpa", config); + enumerate.Check(m.GetVocabulary()); + Starters(m); + Continuation(m); + Stateless(m); +} + +BOOST_AUTO_TEST_CASE(probing) { + LoadingTest<Model>(); +} + +BOOST_AUTO_TEST_CASE(sorted) { + LoadingTest<SortedModel>(); +} +BOOST_AUTO_TEST_CASE(trie) { + LoadingTest<TrieModel>(); +} + +template <class ModelT> void BinaryTest() { + Config config; + config.write_mmap = "test.binary"; + config.messages = NULL; + ExpectEnumerateVocab enumerate; + config.enumerate_vocab = &enumerate; + + { + ModelT copy_model("test.arpa", config); + enumerate.Check(copy_model.GetVocabulary()); + enumerate.Clear(); + } + + config.write_mmap = NULL; + + ModelT binary("test.binary", config); + enumerate.Check(binary.GetVocabulary()); + Starters(binary); + Continuation(binary); + Stateless(binary); + unlink("test.binary"); +} + +BOOST_AUTO_TEST_CASE(write_and_read_probing) { + BinaryTest<Model>(); +} +BOOST_AUTO_TEST_CASE(write_and_read_sorted) { + BinaryTest<SortedModel>(); +} +BOOST_AUTO_TEST_CASE(write_and_read_trie) { + BinaryTest<TrieModel>(); +} + +} // namespace +} // namespace ngram +} // namespace lm diff --git a/klm/lm/ngram.cc b/klm/lm/ngram.cc deleted file mode 100644 index a87c82aa..00000000 --- a/klm/lm/ngram.cc +++ /dev/null @@ -1,522 +0,0 @@ -#include "lm/ngram.hh" - -#include "lm/exception.hh" -#include "util/file_piece.hh" -#include "util/joint_sort.hh" -#include "util/murmur_hash.hh" -#include "util/probing_hash_table.hh" - -#include <algorithm> -#include <functional> -#include <numeric> -#include <limits> -#include <string> - -#include <cmath> -#include <fcntl.h> -#include <errno.h> -#include <stdlib.h> -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> - -namespace lm { -namespace ngram { - -size_t hash_value(const State &state) { - return util::MurmurHashNative(state.history_, sizeof(WordIndex) * state.valid_length_); -} - -namespace detail { -uint64_t HashForVocab(const char *str, std::size_t len) { - // This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000 - // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit. - return util::MurmurHash64A(str, len, 0); -} - -void Prob::SetBackoff(float to) { - UTIL_THROW(FormatLoadException, "Attempt to set backoff " << to << " for the highest order n-gram"); -} - -// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok. -const uint64_t kUnknownHash = HashForVocab("<unk>", 5); -// Sadly some LMs have <UNK>. -const uint64_t kUnknownCapHash = HashForVocab("<UNK>", 5); - -} // namespace detail - -SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL) {} - -std::size_t SortedVocabulary::Size(std::size_t entries, float ignored) { - // Lead with the number of entries. - return sizeof(uint64_t) + sizeof(Entry) * entries; -} - -void SortedVocabulary::Init(void *start, std::size_t allocated, std::size_t entries) { - assert(allocated >= Size(entries)); - // Leave space for number of entries. - begin_ = reinterpret_cast<Entry*>(reinterpret_cast<uint64_t*>(start) + 1); - end_ = begin_; - saw_unk_ = false; -} - -WordIndex SortedVocabulary::Insert(const StringPiece &str) { - uint64_t hashed = detail::HashForVocab(str); - if (hashed == detail::kUnknownHash || hashed == detail::kUnknownCapHash) { - saw_unk_ = true; - return 0; - } - end_->key = hashed; - ++end_; - // This is 1 + the offset where it was inserted to make room for unk. - return end_ - begin_; -} - -bool SortedVocabulary::FinishedLoading(detail::ProbBackoff *reorder_vocab) { - util::JointSort(begin_, end_, reorder_vocab + 1); - SetSpecial(Index("<s>"), Index("</s>"), 0, end_ - begin_ + 1); - // Save size. - *(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_; - return saw_unk_; -} - -void SortedVocabulary::LoadedBinary() { - end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1); - SetSpecial(Index("<s>"), Index("</s>"), 0, end_ - begin_ + 1); -} - -namespace detail { - -template <class Search> MapVocabulary<Search>::MapVocabulary() {} - -template <class Search> void MapVocabulary<Search>::Init(void *start, std::size_t allocated, std::size_t entries) { - lookup_ = Lookup(start, allocated); - available_ = 1; - // Later if available_ != expected_available_ then we can throw UnknownMissingException. - saw_unk_ = false; -} - -template <class Search> WordIndex MapVocabulary<Search>::Insert(const StringPiece &str) { - uint64_t hashed = HashForVocab(str); - // Prevent unknown from going into the table. - if (hashed == kUnknownHash || hashed == kUnknownCapHash) { - saw_unk_ = true; - return 0; - } else { - lookup_.Insert(Lookup::Packing::Make(hashed, available_)); - return available_++; - } -} - -template <class Search> bool MapVocabulary<Search>::FinishedLoading(ProbBackoff *reorder_vocab) { - lookup_.FinishedInserting(); - SetSpecial(Index("<s>"), Index("</s>"), 0, available_); - return saw_unk_; -} - -template <class Search> void MapVocabulary<Search>::LoadedBinary() { - lookup_.LoadedBinary(); - SetSpecial(Index("<s>"), Index("</s>"), 0, available_); -} - -/* All of the entropy is in low order bits and boost::hash does poorly with - * these. Odd numbers near 2^64 chosen by mashing on the keyboard. There is a - * stable point: 0. But 0 is <unk> which won't be queried here anyway. - */ -inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { - uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast<uint64_t>(next) * 17894857484156487943ULL); - return ret; -} - -uint64_t ChainedWordHash(const WordIndex *word, const WordIndex *word_end) { - if (word == word_end) return 0; - uint64_t current = static_cast<uint64_t>(*word); - for (++word; word != word_end; ++word) { - current = CombineWordHash(current, *word); - } - return current; -} - -bool IsEntirelyWhiteSpace(const StringPiece &line) { - for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) { - if (!isspace(line.data()[i])) return false; - } - return true; -} - -void ReadARPACounts(util::FilePiece &in, std::vector<size_t> &number) { - number.clear(); - StringPiece line; - if (!IsEntirelyWhiteSpace(line = in.ReadLine())) UTIL_THROW(FormatLoadException, "First line was \"" << line << "\" not blank"); - if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "second line was \"" << line << "\" not \\data\\."); - while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { - if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \""); - // So strtol doesn't go off the end of line. - std::string remaining(line.data() + 6, line.size() - 6); - char *end_ptr; - unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10); - if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line); - if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line); - ++end_ptr; - const char *start = end_ptr; - long int count = std::strtol(start, &end_ptr, 10); - if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count); - if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line); - number.push_back(count); - } -} - -void ReadNGramHeader(util::FilePiece &in, unsigned int length) { - StringPiece line; - while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} - std::stringstream expected; - expected << '\\' << length << "-grams:"; - if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead."); -} - -// Special unigram reader because unigram's data structure is different and because we're inserting vocab words. -template <class Voc> void Read1Grams(util::FilePiece &f, const size_t count, Voc &vocab, ProbBackoff *unigrams) { - ReadNGramHeader(f, 1); - for (size_t i = 0; i < count; ++i) { - try { - float prob = f.ReadFloat(); - if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability"); - ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited())]; - value.prob = prob; - switch (f.get()) { - case '\t': - value.SetBackoff(f.ReadFloat()); - if ((f.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); - break; - case '\n': - value.ZeroBackoff(); - break; - default: - UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); - } - } catch(util::Exception &e) { - e << " in the " << i << "th 1-gram at byte " << f.Offset(); - throw; - } - } - if (f.ReadLine().size()) UTIL_THROW(FormatLoadException, "Expected blank line after unigrams at byte " << f.Offset()); -} - -template <class Voc, class Store> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, Store &store) { - ReadNGramHeader(f, n); - - // vocab ids of words in reverse order - WordIndex vocab_ids[n]; - typename Store::Packing::Value value; - for (size_t i = 0; i < count; ++i) { - try { - value.prob = f.ReadFloat(); - for (WordIndex *vocab_out = &vocab_ids[n-1]; vocab_out >= vocab_ids; --vocab_out) { - *vocab_out = vocab.Index(f.ReadDelimited()); - } - uint64_t key = ChainedWordHash(vocab_ids, vocab_ids + n); - - switch (f.get()) { - case '\t': - value.SetBackoff(f.ReadFloat()); - if ((f.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); - break; - case '\n': - value.ZeroBackoff(); - break; - default: - UTIL_THROW(FormatLoadException, "Expected tab or newline after n-gram"); - } - store.Insert(Store::Packing::Make(key, value)); - } catch(util::Exception &e) { - e << " in the " << i << "th " << n << "-gram at byte " << f.Offset(); - throw; - } - } - - if (f.ReadLine().size()) UTIL_THROW(FormatLoadException, "Expected blank line after " << n << "-grams at byte " << f.Offset()); - store.FinishedInserting(); -} - -template <class Search, class VocabularyT> size_t GenericModel<Search, VocabularyT>::Size(const std::vector<size_t> &counts, const Config &config) { - if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit ngram.hh's kMaxOrder to at least this value and recompile."); - if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); - size_t memory_size = VocabularyT::Size(counts[0], config.probing_multiplier); - memory_size += sizeof(ProbBackoff) * (counts[0] + 1); // +1 for hallucinate <unk> - for (unsigned char n = 2; n < counts.size(); ++n) { - memory_size += Middle::Size(counts[n - 1], config.probing_multiplier); - } - memory_size += Longest::Size(counts.back(), config.probing_multiplier); - return memory_size; -} - -template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::SetupMemory(char *base, const std::vector<size_t> &counts, const Config &config) { - char *start = base; - size_t allocated = VocabularyT::Size(counts[0], config.probing_multiplier); - vocab_.Init(start, allocated, counts[0]); - start += allocated; - unigram_ = reinterpret_cast<ProbBackoff*>(start); - start += sizeof(ProbBackoff) * (counts[0] + 1); - for (unsigned int n = 2; n < counts.size(); ++n) { - allocated = Middle::Size(counts[n - 1], config.probing_multiplier); - middle_.push_back(Middle(start, allocated)); - start += allocated; - } - allocated = Longest::Size(counts.back(), config.probing_multiplier); - longest_ = Longest(start, allocated); - start += allocated; - if (static_cast<std::size_t>(start - base) != Size(counts, config)) UTIL_THROW(FormatLoadException, "The data structures took " << (start - base) << " but Size says they should take " << Size(counts, config)); -} - -const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 0\n\0"; -struct BinaryFileHeader { - char magic[sizeof(kMagicBytes)]; - float zero_f, one_f, minus_half_f; - WordIndex one_word_index, max_word_index; - uint64_t one_uint64; - - void SetToReference() { - std::memcpy(magic, kMagicBytes, sizeof(magic)); - zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; - one_word_index = 1; - max_word_index = std::numeric_limits<WordIndex>::max(); - one_uint64 = 1; - } -}; - -bool IsBinaryFormat(int fd, off_t size) { - if (size == util::kBadSize || (size <= static_cast<off_t>(sizeof(BinaryFileHeader)))) return false; - // Try reading the header. - util::scoped_mmap memory(mmap(NULL, sizeof(BinaryFileHeader), PROT_READ, MAP_FILE | MAP_PRIVATE, fd, 0), sizeof(BinaryFileHeader)); - if (memory.get() == MAP_FAILED) return false; - BinaryFileHeader reference_header = BinaryFileHeader(); - reference_header.SetToReference(); - if (!memcmp(memory.get(), &reference_header, sizeof(BinaryFileHeader))) return true; - if (!memcmp(memory.get(), "mmap lm ", 8)) UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Was it built on a different machine or with a different compiler?"); - return false; -} - -std::size_t Align8(std::size_t in) { - std::size_t off = in % 8; - if (!off) return in; - return in + 8 - off; -} - -std::size_t TotalHeaderSize(unsigned int order) { - return Align8(sizeof(BinaryFileHeader) + 1 /* order */ + sizeof(uint64_t) * order /* counts */ + sizeof(float) /* probing multiplier */ + 1 /* search_tag */); -} - -void ReadBinaryHeader(const void *from, off_t size, std::vector<size_t> &out, float &probing_multiplier, unsigned char &search_tag) { - const char *from_char = reinterpret_cast<const char*>(from); - if (size < static_cast<off_t>(1 + sizeof(BinaryFileHeader))) UTIL_THROW(FormatLoadException, "File too short to have count information."); - // Skip over the BinaryFileHeader which was read by IsBinaryFormat. - from_char += sizeof(BinaryFileHeader); - unsigned char order = *reinterpret_cast<const unsigned char*>(from_char); - if (size < static_cast<off_t>(TotalHeaderSize(order))) UTIL_THROW(FormatLoadException, "File too short to have full header."); - out.resize(static_cast<std::size_t>(order)); - const uint64_t *counts = reinterpret_cast<const uint64_t*>(from_char + 1); - for (std::size_t i = 0; i < out.size(); ++i) { - out[i] = static_cast<std::size_t>(counts[i]); - } - const float *probing_ptr = reinterpret_cast<const float*>(counts + out.size()); - probing_multiplier = *probing_ptr; - search_tag = *reinterpret_cast<const char*>(probing_ptr + 1); -} - -void WriteBinaryHeader(void *to, const std::vector<size_t> &from, float probing_multiplier, char search_tag) { - BinaryFileHeader header = BinaryFileHeader(); - header.SetToReference(); - memcpy(to, &header, sizeof(BinaryFileHeader)); - char *out = reinterpret_cast<char*>(to) + sizeof(BinaryFileHeader); - *reinterpret_cast<unsigned char*>(out) = static_cast<unsigned char>(from.size()); - uint64_t *counts = reinterpret_cast<uint64_t*>(out + 1); - for (std::size_t i = 0; i < from.size(); ++i) { - counts[i] = from[i]; - } - float *probing_ptr = reinterpret_cast<float*>(counts + from.size()); - *probing_ptr = probing_multiplier; - *reinterpret_cast<char*>(probing_ptr + 1) = search_tag; -} - -template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, Config config) : mapped_file_(util::OpenReadOrThrow(file)) { - const off_t file_size = util::SizeFile(mapped_file_.get()); - - std::vector<size_t> counts; - - if (IsBinaryFormat(mapped_file_.get(), file_size)) { - memory_.reset(util::MapForRead(file_size, config.prefault, mapped_file_.get()), file_size); - - unsigned char search_tag; - ReadBinaryHeader(memory_.begin(), file_size, counts, config.probing_multiplier, search_tag); - if (config.probing_multiplier < 1.0) UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << config.probing_multiplier << " which is < 1.0."); - if (search_tag != Search::kBinaryTag) UTIL_THROW(FormatLoadException, "The binary file has a different search strategy than the one requested."); - size_t memory_size = Size(counts, config); - - char *start = reinterpret_cast<char*>(memory_.get()) + TotalHeaderSize(counts.size()); - if (memory_size != static_cast<size_t>(memory_.end() - start)) UTIL_THROW(FormatLoadException, "The mmap file " << file << " has size " << file_size << " but " << (memory_size + TotalHeaderSize(counts.size())) << " was expected based on the number of counts and configuration."); - - SetupMemory(start, counts, config); - vocab_.LoadedBinary(); - for (typename std::vector<Middle>::iterator i = middle_.begin(); i != middle_.end(); ++i) { - i->LoadedBinary(); - } - longest_.LoadedBinary(); - - } else { - if (config.probing_multiplier <= 1.0) UTIL_THROW(FormatLoadException, "probing multiplier must be > 1.0"); - - util::FilePiece f(file, mapped_file_.release(), config.messages); - ReadARPACounts(f, counts); - size_t memory_size = Size(counts, config); - char *start; - - if (config.write_mmap) { - // Write out an mmap file. - util::MapZeroedWrite(config.write_mmap, TotalHeaderSize(counts.size()) + memory_size, mapped_file_, memory_); - WriteBinaryHeader(memory_.get(), counts, config.probing_multiplier, Search::kBinaryTag); - start = reinterpret_cast<char*>(memory_.get()) + TotalHeaderSize(counts.size()); - } else { - memory_.reset(util::MapAnonymous(memory_size), memory_size); - start = reinterpret_cast<char*>(memory_.get()); - } - SetupMemory(start, counts, config); - try { - LoadFromARPA(f, counts, config); - } catch (FormatLoadException &e) { - e << " in file " << file; - throw; - } - } - - // g++ prints warnings unless these are fully initialized. - State begin_sentence = State(); - begin_sentence.valid_length_ = 1; - begin_sentence.history_[0] = vocab_.BeginSentence(); - begin_sentence.backoff_[0] = unigram_[begin_sentence.history_[0]].backoff; - State null_context = State(); - null_context.valid_length_ = 0; - P::Init(begin_sentence, null_context, vocab_, counts.size()); -} - -template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::LoadFromARPA(util::FilePiece &f, const std::vector<size_t> &counts, const Config &config) { - // Read the unigrams. - Read1Grams(f, counts[0], vocab_, unigram_); - bool saw_unk = vocab_.FinishedLoading(unigram_); - if (!saw_unk) { - switch(config.unknown_missing) { - case Config::THROW_UP: - { - SpecialWordMissingException e("<unk>"); - e << " and configuration was set to throw if unknown is missing"; - throw e; - } - case Config::COMPLAIN: - if (config.messages) *config.messages << "Language model is missing <unk>. Substituting probability " << config.unknown_missing_prob << "." << std::endl; - // There's no break;. This is by design. - case Config::SILENT: - // Default probabilities for unknown. - unigram_[0].backoff = 0.0; - unigram_[0].prob = config.unknown_missing_prob; - break; - } - } - - // Read the n-grams. - for (unsigned int n = 2; n < counts.size(); ++n) { - ReadNGrams(f, n, counts[n-1], vocab_, middle_[n-2]); - } - ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab_, longest_); - if (std::fabs(unigram_[0].backoff) > 0.0000001) UTIL_THROW(FormatLoadException, "Backoff for unknown word should be zero, but was given as " << unigram_[0].backoff); -} - -/* Ugly optimized function. - * in_state contains the previous ngram's length and backoff probabilites to - * be used here. out_state is populated with the found ngram length and - * backoffs that the next call will find useful. - * - * The search goes in increasing order of ngram length. - */ -template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore( - const State &in_state, - const WordIndex new_word, - State &out_state) const { - - FullScoreReturn ret; - // This is end pointer passed to SumBackoffs. - const ProbBackoff &unigram = unigram_[new_word]; - if (new_word == 0) { - ret.ngram_length = out_state.valid_length_ = 0; - // all of backoff. - ret.prob = std::accumulate( - in_state.backoff_, - in_state.backoff_ + in_state.valid_length_, - unigram.prob); - return ret; - } - float *backoff_out(out_state.backoff_); - *backoff_out = unigram.backoff; - ret.prob = unigram.prob; - out_state.history_[0] = new_word; - if (in_state.valid_length_ == 0) { - ret.ngram_length = out_state.valid_length_ = 1; - // No backoff because NGramLength() == 0 and unknown can't have backoff. - return ret; - } - ++backoff_out; - - // Ok now we now that the bigram contains known words. Start by looking it up. - - uint64_t lookup_hash = static_cast<uint64_t>(new_word); - const WordIndex *hist_iter = in_state.history_; - const WordIndex *const hist_end = hist_iter + in_state.valid_length_; - typename std::vector<Middle>::const_iterator mid_iter = middle_.begin(); - for (; ; ++mid_iter, ++hist_iter, ++backoff_out) { - if (hist_iter == hist_end) { - // Used history [in_state.history_, hist_end) and ran out. No backoff. - std::copy(in_state.history_, hist_end, out_state.history_ + 1); - ret.ngram_length = out_state.valid_length_ = in_state.valid_length_ + 1; - // ret.prob was already set. - return ret; - } - lookup_hash = CombineWordHash(lookup_hash, *hist_iter); - if (mid_iter == middle_.end()) break; - typename Middle::ConstIterator found; - if (!mid_iter->Find(lookup_hash, found)) { - // Didn't find an ngram using hist_iter. - // The history used in the found n-gram is [in_state.history_, hist_iter). - std::copy(in_state.history_, hist_iter, out_state.history_ + 1); - // Therefore, we found a (hist_iter - in_state.history_ + 1)-gram including the last word. - ret.ngram_length = out_state.valid_length_ = (hist_iter - in_state.history_) + 1; - ret.prob = std::accumulate( - in_state.backoff_ + (mid_iter - middle_.begin()), - in_state.backoff_ + in_state.valid_length_, - ret.prob); - return ret; - } - *backoff_out = found->GetValue().backoff; - ret.prob = found->GetValue().prob; - } - - typename Longest::ConstIterator found; - if (!longest_.Find(lookup_hash, found)) { - // It's an (P::Order()-1)-gram - std::copy(in_state.history_, in_state.history_ + P::Order() - 2, out_state.history_ + 1); - ret.ngram_length = out_state.valid_length_ = P::Order() - 1; - ret.prob += in_state.backoff_[P::Order() - 2]; - return ret; - } - // It's an P::Order()-gram - // out_state.valid_length_ is still P::Order() - 1 because the next lookup will only need that much. - std::copy(in_state.history_, in_state.history_ + P::Order() - 2, out_state.history_ + 1); - out_state.valid_length_ = P::Order() - 1; - ret.ngram_length = P::Order(); - ret.prob = found->GetValue().prob; - return ret; -} - -template class GenericModel<ProbingSearch, MapVocabulary<ProbingSearch> >; -template class GenericModel<SortedUniformSearch, SortedVocabulary>; -} // namespace detail -} // namespace ngram -} // namespace lm diff --git a/klm/lm/ngram_query.cc b/klm/lm/ngram_query.cc index d1970260..74457a74 100644 --- a/klm/lm/ngram_query.cc +++ b/klm/lm/ngram_query.cc @@ -1,4 +1,4 @@ -#include "lm/ngram.hh" +#include "lm/model.hh" #include <cstdlib> #include <fstream> diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc new file mode 100644 index 00000000..8e9a770d --- /dev/null +++ b/klm/lm/read_arpa.cc @@ -0,0 +1,154 @@ +#include "lm/read_arpa.hh" + +#include <cstdlib> +#include <vector> + +#include <ctype.h> +#include <inttypes.h> + +namespace lm { + +namespace { + +bool IsEntirelyWhiteSpace(const StringPiece &line) { + for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) { + if (!isspace(line.data()[i])) return false; + } + return true; +} + +template <class F> void GenericReadARPACounts(F &in, std::vector<uint64_t> &number) { + number.clear(); + StringPiece line; + if (!IsEntirelyWhiteSpace(line = in.ReadLine())) { + if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) { + UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, run\nzcat " << in.FileName() << " |kenlm/build_binary /dev/stdin " << in.FileName() << ".binary\nIf this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); + } + UTIL_THROW(FormatLoadException, "First line was \"" << static_cast<int>(line.data()[1]) << "\" not blank"); + } + if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "second line was \"" << line << "\" not \\data\\."); + while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { + if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \""); + // So strtol doesn't go off the end of line. + std::string remaining(line.data() + 6, line.size() - 6); + char *end_ptr; + unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10); + if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line); + if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line); + ++end_ptr; + const char *start = end_ptr; + long int count = std::strtol(start, &end_ptr, 10); + if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count); + if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line); + number.push_back(count); + } +} + +template <class F> void GenericReadNGramHeader(F &in, unsigned int length) { + StringPiece line; + while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} + std::stringstream expected; + expected << '\\' << length << "-grams:"; + if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead. "); +} + +template <class F> void GenericReadEnd(F &in) { + StringPiece line; + do { + line = in.ReadLine(); + } while (IsEntirelyWhiteSpace(line)); + if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line); +} + +class FakeFilePiece { + public: + explicit FakeFilePiece(std::istream &in) : in_(in) { + in_.exceptions(std::ios::failbit | std::ios::badbit | std::ios::eofbit); + } + + StringPiece ReadLine() throw(util::EndOfFileException) { + getline(in_, buffer_); + return StringPiece(buffer_); + } + + float ReadFloat() { + float ret; + in_ >> ret; + return ret; + } + + const char *FileName() const { + // This only used for error messages and we don't know the file name. . . + return "$file"; + } + + private: + std::istream &in_; + std::string buffer_; +}; + +} // namespace + +void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { + GenericReadARPACounts(in, number); +} +void ReadARPACounts(std::istream &in, std::vector<uint64_t> &number) { + FakeFilePiece fake(in); + GenericReadARPACounts(fake, number); +} +void ReadNGramHeader(util::FilePiece &in, unsigned int length) { + GenericReadNGramHeader(in, length); +} +void ReadNGramHeader(std::istream &in, unsigned int length) { + FakeFilePiece fake(in); + GenericReadNGramHeader(fake, length); +} + +void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { + switch (in.get()) { + case '\t': + { + float got = in.ReadFloat(); + if (got != 0.0) + UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff."); + } + break; + case '\n': + break; + default: + UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); + } +} + +void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) { + switch (in.get()) { + case '\t': + weights.backoff = in.ReadFloat(); + if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); + break; + case '\n': + weights.backoff = 0.0; + break; + default: + UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); + } +} + +void ReadEnd(util::FilePiece &in) { + GenericReadEnd(in); + StringPiece line; + try { + while (true) { + line = in.ReadLine(); + if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line); + } + } catch (const util::EndOfFileException &e) { + return; + } +} +void ReadEnd(std::istream &in) { + FakeFilePiece fake(in); + GenericReadEnd(fake); +} + +} // namespace lm diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc new file mode 100644 index 00000000..9cb662a6 --- /dev/null +++ b/klm/lm/search_hashed.cc @@ -0,0 +1,66 @@ +#include "lm/search_hashed.hh" + +#include "lm/lm_exception.hh" +#include "lm/read_arpa.hh" +#include "lm/vocab.hh" + +#include "util/file_piece.hh" + +#include <string> + +namespace lm { +namespace ngram { + +namespace { + +/* All of the entropy is in low order bits and boost::hash does poorly with + * these. Odd numbers near 2^64 chosen by mashing on the keyboard. There is a + * stable point: 0. But 0 is <unk> which won't be queried here anyway. + */ +inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { + uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast<uint64_t>(next) * 17894857484156487943ULL); + return ret; +} + +uint64_t ChainedWordHash(const WordIndex *word, const WordIndex *word_end) { + if (word == word_end) return 0; + uint64_t current = static_cast<uint64_t>(*word); + for (++word; word != word_end; ++word) { + current = CombineWordHash(current, *word); + } + return current; +} + +template <class Voc, class Store> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, Store &store) { + ReadNGramHeader(f, n); + + // vocab ids of words in reverse order + WordIndex vocab_ids[n]; + typename Store::Packing::Value value; + for (size_t i = 0; i < count; ++i) { + ReadNGram(f, n, vocab, vocab_ids, value); + uint64_t key = ChainedWordHash(vocab_ids, vocab_ids + n); + store.Insert(Store::Packing::Make(key, value)); + } + + store.FinishedInserting(); +} + +} // namespace +namespace detail { + +template <class MiddleT, class LongestT> template <class Voc> void TemplateHashedSearch<MiddleT, LongestT>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &/*config*/, Voc &vocab) { + Read1Grams(f, counts[0], vocab, unigram.Raw()); + // Read the n-grams. + for (unsigned int n = 2; n < counts.size(); ++n) { + ReadNGrams(f, n, counts[n-1], vocab, middle[n-2]); + } + ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, longest); +} + +template void TemplateHashedSearch<ProbingHashedSearch::Middle, ProbingHashedSearch::Longest>::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &, ProbingVocabulary &vocab); +template void TemplateHashedSearch<SortedHashedSearch::Middle, SortedHashedSearch::Longest>::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &, SortedVocabulary &vocab); + +} // namespace detail +} // namespace ngram +} // namespace lm diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc new file mode 100644 index 00000000..182e27f5 --- /dev/null +++ b/klm/lm/search_trie.cc @@ -0,0 +1,402 @@ +#include "lm/search_trie.hh" + +#include "lm/lm_exception.hh" +#include "lm/read_arpa.hh" +#include "lm/trie.hh" +#include "lm/vocab.hh" +#include "lm/weights.hh" +#include "lm/word_index.hh" +#include "util/ersatz_progress.hh" +#include "util/file_piece.hh" +#include "util/scoped.hh" + +#include <algorithm> +#include <cstring> +#include <cstdio> +#include <deque> +#include <iostream> +#include <limits> +//#include <parallel/algorithm> +#include <vector> + +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> + +namespace lm { +namespace ngram { +namespace trie { +namespace { + +template <unsigned char Order> class FullEntry { + public: + typedef ProbBackoff Weights; + static const unsigned char kOrder = Order; + + // reverse order + WordIndex words[Order]; + Weights weights; + + bool operator<(const FullEntry<Order> &other) const { + for (const WordIndex *i = words, *j = other.words; i != words + Order; ++i, ++j) { + if (*i < *j) return true; + if (*i > *j) return false; + } + return false; + } +}; + +template <unsigned char Order> class ProbEntry { + public: + typedef Prob Weights; + static const unsigned char kOrder = Order; + + // reverse order + WordIndex words[Order]; + Weights weights; + + bool operator<(const ProbEntry<Order> &other) const { + for (const WordIndex *i = words, *j = other.words; i != words + Order; ++i, ++j) { + if (*i < *j) return true; + if (*i > *j) return false; + } + return false; + } +}; + +void WriteOrThrow(FILE *to, const void *data, size_t size) { + if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); +} + +void ReadOrThrow(FILE *from, void *data, size_t size) { + if (1 != std::fread(data, size, 1, from)) UTIL_THROW(util::ErrnoException, "Short read; requested size " << size); +} + +void CopyOrThrow(FILE *from, FILE *to, size_t size) { + const size_t kBufSize = 512; + char buf[kBufSize]; + for (size_t i = 0; i < size; i += kBufSize) { + std::size_t amount = std::min(size - i, kBufSize); + ReadOrThrow(from, buf, amount); + WriteOrThrow(to, buf, amount); + } +} + +template <class Entry> std::string DiskFlush(const Entry *begin, const Entry *end, const std::string &file_prefix, std::size_t batch) { + std::stringstream assembled; + assembled << file_prefix << static_cast<unsigned int>(Entry::kOrder) << '_' << batch; + std::string ret(assembled.str()); + util::scoped_FILE out(fopen(ret.c_str(), "w")); + if (!out.get()) UTIL_THROW(util::ErrnoException, "Couldn't open " << assembled.str().c_str() << " for writing"); + for (const Entry *group_begin = begin; group_begin != end;) { + const Entry *group_end = group_begin; + for (++group_end; (group_end != end) && !memcmp(group_begin->words, group_end->words, sizeof(WordIndex) * (Entry::kOrder - 1)); ++group_end) {} + WriteOrThrow(out.get(), group_begin->words, sizeof(WordIndex) * (Entry::kOrder - 1)); + WordIndex group_size = group_end - group_begin; + WriteOrThrow(out.get(), &group_size, sizeof(group_size)); + for (const Entry *i = group_begin; i != group_end; ++i) { + WriteOrThrow(out.get(), &i->words[Entry::kOrder - 1], sizeof(WordIndex)); + WriteOrThrow(out.get(), &i->weights, sizeof(typename Entry::Weights)); + } + group_begin = group_end; + } + return ret; +} + +class SortedFileReader { + public: + SortedFileReader() {} + + void Init(const std::string &name, unsigned char order) { + file_.reset(fopen(name.c_str(), "r")); + if (!file_.get()) UTIL_THROW(util::ErrnoException, "Opening " << name << " for read"); + header_.resize(order - 1); + NextHeader(); + } + + // Preceding words. + const WordIndex *Header() const { + return &*header_.begin(); + } + const std::vector<WordIndex> &HeaderVector() const { return header_;} + + std::size_t HeaderBytes() const { return header_.size() * sizeof(WordIndex); } + + void NextHeader() { + if (1 != fread(&*header_.begin(), HeaderBytes(), 1, file_.get()) && !Ended()) { + UTIL_THROW(util::ErrnoException, "Short read of counts"); + } + } + + void ReadCount(WordIndex &to) { + ReadOrThrow(file_.get(), &to, sizeof(WordIndex)); + } + + void ReadWord(WordIndex &to) { + ReadOrThrow(file_.get(), &to, sizeof(WordIndex)); + } + + template <class Weights> void ReadWeights(Weights &to) { + ReadOrThrow(file_.get(), &to, sizeof(Weights)); + } + + bool Ended() { + return feof(file_.get()); + } + + FILE *File() { return file_.get(); } + + private: + util::scoped_FILE file_; + + std::vector<WordIndex> header_; +}; + +void CopyFullRecord(SortedFileReader &from, FILE *to, std::size_t weights_size) { + WriteOrThrow(to, from.Header(), from.HeaderBytes()); + WordIndex count; + from.ReadCount(count); + WriteOrThrow(to, &count, sizeof(WordIndex)); + + CopyOrThrow(from.File(), to, (weights_size + sizeof(WordIndex)) * count); +} + +void MergeSortedFiles(const char *first_name, const char *second_name, const char *out, std::size_t weights_size, unsigned char order) { + SortedFileReader first, second; + first.Init(first_name, order); + second.Init(second_name, order); + util::scoped_FILE out_file(fopen(out, "w")); + if (!out_file.get()) UTIL_THROW(util::ErrnoException, "Could not open " << out << " for write"); + while (!first.Ended() && !second.Ended()) { + if (first.HeaderVector() < second.HeaderVector()) { + CopyFullRecord(first, out_file.get(), weights_size); + first.NextHeader(); + continue; + } + if (first.HeaderVector() > second.HeaderVector()) { + CopyFullRecord(second, out_file.get(), weights_size); + second.NextHeader(); + continue; + } + // Merge at the entry level. + WriteOrThrow(out_file.get(), first.Header(), first.HeaderBytes()); + WordIndex first_count, second_count; + first.ReadCount(first_count); second.ReadCount(second_count); + WordIndex total_count = first_count + second_count; + WriteOrThrow(out_file.get(), &total_count, sizeof(WordIndex)); + + WordIndex first_word, second_word; + first.ReadWord(first_word); second.ReadWord(second_word); + WordIndex first_index = 0, second_index = 0; + while (true) { + if (first_word < second_word) { + WriteOrThrow(out_file.get(), &first_word, sizeof(WordIndex)); + CopyOrThrow(first.File(), out_file.get(), weights_size); + if (++first_index == first_count) break; + first.ReadWord(first_word); + } else { + WriteOrThrow(out_file.get(), &second_word, sizeof(WordIndex)); + CopyOrThrow(second.File(), out_file.get(), weights_size); + if (++second_index == second_count) break; + second.ReadWord(second_word); + } + } + if (first_index == first_count) { + WriteOrThrow(out_file.get(), &second_word, sizeof(WordIndex)); + CopyOrThrow(second.File(), out_file.get(), (second_count - second_index) * (weights_size + sizeof(WordIndex)) - sizeof(WordIndex)); + } else { + WriteOrThrow(out_file.get(), &first_word, sizeof(WordIndex)); + CopyOrThrow(first.File(), out_file.get(), (first_count - first_index) * (weights_size + sizeof(WordIndex)) - sizeof(WordIndex)); + } + first.NextHeader(); + second.NextHeader(); + } + + for (SortedFileReader &remaining = first.Ended() ? second : first; !remaining.Ended(); remaining.NextHeader()) { + CopyFullRecord(remaining, out_file.get(), weights_size); + } +} + +template <class Entry> void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix) { + ConvertToSorted<FullEntry<Entry::kOrder - 1> >(f, vocab, counts, mem, file_prefix); + + ReadNGramHeader(f, Entry::kOrder); + const size_t count = counts[Entry::kOrder - 1]; + const size_t batch_size = std::min(count, mem.size() / sizeof(Entry)); + Entry *const begin = reinterpret_cast<Entry*>(mem.get()); + std::deque<std::string> files; + for (std::size_t batch = 0, done = 0; done < count; ++batch) { + Entry *out = begin; + Entry *out_end = out + std::min(count - done, batch_size); + for (; out != out_end; ++out) { + ReadNGram(f, Entry::kOrder, vocab, out->words, out->weights); + } + //__gnu_parallel::sort(begin, out_end); + std::sort(begin, out_end); + + files.push_back(DiskFlush(begin, out_end, file_prefix, batch)); + done += out_end - begin; + } + + // All individual files created. Merge them. + + std::size_t merge_count = 0; + while (files.size() > 1) { + std::stringstream assembled; + assembled << file_prefix << static_cast<unsigned int>(Entry::kOrder) << "_merge_" << (merge_count++); + files.push_back(assembled.str()); + MergeSortedFiles(files[0].c_str(), files[1].c_str(), files.back().c_str(), sizeof(typename Entry::Weights), Entry::kOrder); + if (std::remove(files[0].c_str())) UTIL_THROW(util::ErrnoException, "Could not remove " << files[0]); + files.pop_front(); + if (std::remove(files[0].c_str())) UTIL_THROW(util::ErrnoException, "Could not remove " << files[0]); + files.pop_front(); + } + if (!files.empty()) { + std::stringstream assembled; + assembled << file_prefix << static_cast<unsigned int>(Entry::kOrder) << "_merged"; + std::string merged_name(assembled.str()); + if (std::rename(files[0].c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << files[0].c_str() << " to " << merged_name.c_str()); + } +} + +template <> void ConvertToSorted<FullEntry<1> >(util::FilePiece &/*f*/, const SortedVocabulary &/*vocab*/, const std::vector<uint64_t> &/*counts*/, util::scoped_memory &/*mem*/, const std::string &/*file_prefix*/) {} + +void ARPAToSortedFiles(util::FilePiece &f, const std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { + { + std::string unigram_name = file_prefix + "unigrams"; + util::scoped_fd unigram_file; + util::scoped_mmap unigram_mmap; + unigram_mmap.reset(util::MapZeroedWrite(unigram_name.c_str(), counts[0] * sizeof(ProbBackoff), unigram_file), counts[0] * sizeof(ProbBackoff)); + Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get())); + } + + util::scoped_memory mem; + mem.reset(malloc(buffer), buffer, util::scoped_memory::ARRAY_ALLOCATED); + if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); + ConvertToSorted<ProbEntry<5> >(f, vocab, counts, mem, file_prefix); + ReadEnd(f); +} + +struct RecursiveInsertParams { + WordIndex *words; + SortedFileReader *files; + unsigned char max_order; + // This is an array of size order - 2. + BitPackedMiddle *middle; + // This has exactly one entry. + BitPackedLongest *longest; +}; + +uint64_t RecursiveInsert(RecursiveInsertParams ¶ms, unsigned char order) { + SortedFileReader &file = params.files[order - 2]; + const uint64_t ret = (order == params.max_order) ? params.longest->InsertIndex() : params.middle[order - 2].InsertIndex(); + if (std::memcmp(params.words, file.Header(), sizeof(WordIndex) * (order - 1))) + return ret; + WordIndex count; + file.ReadCount(count); + WordIndex key; + if (order == params.max_order) { + Prob value; + for (WordIndex i = 0; i < count; ++i) { + file.ReadWord(key); + file.ReadWeights(value); + params.longest->Insert(key, value.prob); + } + file.NextHeader(); + return ret; + } + ProbBackoff value; + for (WordIndex i = 0; i < count; ++i) { + file.ReadWord(params.words[order - 1]); + file.ReadWeights(value); + params.middle[order - 2].Insert( + params.words[order - 1], + value.prob, + value.backoff, + RecursiveInsert(params, order + 1)); + } + file.NextHeader(); + return ret; +} + +void BuildTrie(const std::string &file_prefix, const std::vector<uint64_t> &counts, std::ostream *messages, TrieSearch &out) { + UnigramValue *unigrams = out.unigram.Raw(); + // Load unigrams. Leave the next pointers uninitialized. + { + std::string name(file_prefix + "unigrams"); + util::scoped_FILE file(fopen(name.c_str(), "r")); + if (!file.get()) UTIL_THROW(util::ErrnoException, "Opening " << name << " failed"); + for (WordIndex i = 0; i < counts[0]; ++i) { + ReadOrThrow(file.get(), &unigrams[i].weights, sizeof(ProbBackoff)); + } + unlink(name.c_str()); + } + + // inputs[0] is bigrams. + SortedFileReader inputs[counts.size() - 1]; + for (unsigned char i = 2; i <= counts.size(); ++i) { + std::stringstream assembled; + assembled << file_prefix << static_cast<unsigned int>(i) << "_merged"; + inputs[i-2].Init(assembled.str(), i); + unlink(assembled.str().c_str()); + } + + // words[0] is unigrams. + WordIndex words[counts.size()]; + RecursiveInsertParams params; + params.words = words; + params.files = inputs; + params.max_order = static_cast<unsigned char>(counts.size()); + params.middle = &*out.middle.begin(); + params.longest = &out.longest; + { + util::ErsatzProgress progress(messages, "Building trie", counts[0]); + for (words[0] = 0; words[0] < counts[0]; ++words[0], ++progress) { + unigrams[words[0]].next = RecursiveInsert(params, 2); + } + } + + /* Set ending offsets so the last entry will be sized properly */ + if (!out.middle.empty()) { + unigrams[counts[0]].next = out.middle.front().InsertIndex(); + for (size_t i = 0; i < out.middle.size() - 1; ++i) { + out.middle[i].FinishedLoading(out.middle[i+1].InsertIndex()); + } + out.middle.back().FinishedLoading(out.longest.InsertIndex()); + } else { + unigrams[counts[0]].next = out.longest.InsertIndex(); + } +} + +} // namespace + +void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab) { + std::string temporary_directory; + if (config.temporary_directory_prefix) { + temporary_directory = config.temporary_directory_prefix; + } else if (config.write_mmap) { + temporary_directory = config.write_mmap; + } else { + temporary_directory = file; + } + // Null on end is kludge to ensure null termination. + temporary_directory += "-tmp-XXXXXX\0"; + if (!mkdtemp(&temporary_directory[0])) { + UTIL_THROW(util::ErrnoException, "Failed to make a temporary directory based on the name " << temporary_directory.c_str()); + } + // Chop off null kludge. + temporary_directory.resize(strlen(temporary_directory.c_str())); + // Add directory delimiter. Assumes a real operating system. + temporary_directory += '/'; + ARPAToSortedFiles(f, counts, config.building_memory, temporary_directory.c_str(), vocab); + BuildTrie(temporary_directory.c_str(), counts, config.messages, *this); + if (rmdir(temporary_directory.c_str())) { + std::cerr << "Failed to delete " << temporary_directory << std::endl; + } +} + +} // namespace trie +} // namespace ngram +} // namespace lm diff --git a/klm/lm/sri.cc b/klm/lm/sri.cc index 7bd23d76..b634d200 100644 --- a/klm/lm/sri.cc +++ b/klm/lm/sri.cc @@ -1,4 +1,4 @@ -#include "lm/exception.hh" +#include "lm/lm_exception.hh" #include "lm/sri.hh" #include <Ngram.h> @@ -31,8 +31,7 @@ void Vocabulary::FinishedLoading() { SetSpecial( sri_->ssIndex(), sri_->seIndex(), - sri_->unkIndex(), - sri_->highIndex() + 1); + sri_->unkIndex()); } namespace { diff --git a/klm/lm/trie.cc b/klm/lm/trie.cc new file mode 100644 index 00000000..8ed7b2a2 --- /dev/null +++ b/klm/lm/trie.cc @@ -0,0 +1,167 @@ +#include "lm/trie.hh" + +#include "util/bit_packing.hh" +#include "util/exception.hh" +#include "util/proxy_iterator.hh" +#include "util/sorted_uniform.hh" + +#include <assert.h> + +namespace lm { +namespace ngram { +namespace trie { +namespace { + +// Assumes key is first. +class JustKeyProxy { + public: + JustKeyProxy() : inner_(), base_(), key_mask_(), total_bits_() {} + + operator uint64_t() const { return GetKey(); } + + uint64_t GetKey() const { + uint64_t bit_off = inner_ * static_cast<uint64_t>(total_bits_); + return util::ReadInt57(base_ + bit_off / 8, bit_off & 7, key_mask_); + } + + private: + friend class util::ProxyIterator<JustKeyProxy>; + friend bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t total_bits, uint64_t begin_index, uint64_t end_index, WordIndex key, uint64_t &at_index); + + JustKeyProxy(const void *base, uint64_t index, uint64_t key_mask, uint8_t total_bits) + : inner_(index), base_(static_cast<const uint8_t*>(base)), key_mask_(key_mask), total_bits_(total_bits) {} + + // This is a read-only iterator. + JustKeyProxy &operator=(const JustKeyProxy &other); + + typedef uint64_t value_type; + + typedef uint64_t InnerIterator; + uint64_t &Inner() { return inner_; } + const uint64_t &Inner() const { return inner_; } + + // The address in bits is base_ * 8 + inner_ * total_bits_. + uint64_t inner_; + const uint8_t *const base_; + const uint64_t key_mask_; + const uint8_t total_bits_; +}; + +bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t total_bits, uint64_t begin_index, uint64_t end_index, WordIndex key, uint64_t &at_index) { + util::ProxyIterator<JustKeyProxy> begin_it(JustKeyProxy(base, begin_index, key_mask, total_bits)); + util::ProxyIterator<JustKeyProxy> end_it(JustKeyProxy(base, end_index, key_mask, total_bits)); + util::ProxyIterator<JustKeyProxy> out; + if (!util::SortedUniformFind(begin_it, end_it, key, out)) return false; + at_index = out.Inner(); + return true; +} +} // namespace + +std::size_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { + uint8_t total_bits = util::RequiredBits(max_vocab) + 31 + remaining_bits; + // Extra entry for next pointer at the end. + // +7 then / 8 to round up bits and convert to bytes + // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault. + // Note that this waste is O(order), not O(number of ngrams). + return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t); +} + +void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits) { + util::BitPackingSanity(); + word_bits_ = util::RequiredBits(max_vocab); + word_mask_ = (1ULL << word_bits_) - 1ULL; + if (word_bits_ > 57) UTIL_THROW(util::Exception, "Sorry, word indices more than " << (1ULL << 57) << " are not implemented. Edit util/bit_packing.hh and fix the bit packing functions."); + prob_bits_ = 31; + total_bits_ = word_bits_ + prob_bits_ + remaining_bits; + + base_ = static_cast<uint8_t*>(base); + insert_index_ = 0; +} + +std::size_t BitPackedMiddle::Size(uint64_t entries, uint64_t max_vocab, uint64_t max_ptr) { + return BaseSize(entries, max_vocab, 32 + util::RequiredBits(max_ptr)); +} + +void BitPackedMiddle::Init(void *base, uint64_t max_vocab, uint64_t max_next) { + backoff_bits_ = 32; + next_bits_ = util::RequiredBits(max_next); + if (next_bits_ > 57) UTIL_THROW(util::Exception, "Sorry, this does not support more than " << (1ULL << 57) << " n-grams of a particular order. Edit util/bit_packing.hh and fix the bit packing functions."); + next_mask_ = (1ULL << next_bits_) - 1; + + BaseInit(base, max_vocab, backoff_bits_ + next_bits_); +} + +void BitPackedMiddle::Insert(WordIndex word, float prob, float backoff, uint64_t next) { + assert(word <= word_mask_); + assert(next <= next_mask_); + uint64_t at_pointer = insert_index_ * total_bits_; + + util::WriteInt57(base_ + (at_pointer >> 3), at_pointer & 7, word); + at_pointer += word_bits_; + util::WriteNonPositiveFloat31(base_ + (at_pointer >> 3), at_pointer & 7, prob); + at_pointer += prob_bits_; + util::WriteFloat32(base_ + (at_pointer >> 3), at_pointer & 7, backoff); + at_pointer += backoff_bits_; + util::WriteInt57(base_ + (at_pointer >> 3), at_pointer & 7, next); + + ++insert_index_; +} + +bool BitPackedMiddle::Find(WordIndex word, float &prob, float &backoff, NodeRange &range) const { + uint64_t at_pointer; + if (!FindBitPacked(base_, word_mask_, total_bits_, range.begin, range.end, word, at_pointer)) return false; + at_pointer *= total_bits_; + at_pointer += word_bits_; + prob = util::ReadNonPositiveFloat31(base_ + (at_pointer >> 3), at_pointer & 7); + at_pointer += prob_bits_; + backoff = util::ReadFloat32(base_ + (at_pointer >> 3), at_pointer & 7); + at_pointer += backoff_bits_; + range.begin = util::ReadInt57(base_ + (at_pointer >> 3), at_pointer & 7, next_mask_); + // Read the next entry's pointer. + at_pointer += total_bits_; + range.end = util::ReadInt57(base_ + (at_pointer >> 3), at_pointer & 7, next_mask_); + return true; +} + +bool BitPackedMiddle::FindNoProb(WordIndex word, float &backoff, NodeRange &range) const { + uint64_t at_pointer; + if (!FindBitPacked(base_, word_mask_, total_bits_, range.begin, range.end, word, at_pointer)) return false; + at_pointer *= total_bits_; + at_pointer += word_bits_; + at_pointer += prob_bits_; + backoff = util::ReadFloat32(base_ + (at_pointer >> 3), at_pointer & 7); + at_pointer += backoff_bits_; + range.begin = util::ReadInt57(base_ + (at_pointer >> 3), at_pointer & 7, next_mask_); + // Read the next entry's pointer. + at_pointer += total_bits_; + range.end = util::ReadInt57(base_ + (at_pointer >> 3), at_pointer & 7, next_mask_); + return true; +} + +void BitPackedMiddle::FinishedLoading(uint64_t next_end) { + assert(next_end <= next_mask_); + uint64_t last_next_write = (insert_index_ + 1) * total_bits_ - next_bits_; + util::WriteInt57(base_ + (last_next_write >> 3), last_next_write & 7, next_end); +} + + +void BitPackedLongest::Insert(WordIndex index, float prob) { + assert(index <= word_mask_); + uint64_t at_pointer = insert_index_ * total_bits_; + util::WriteInt57(base_ + (at_pointer >> 3), at_pointer & 7, index); + at_pointer += word_bits_; + util::WriteNonPositiveFloat31(base_ + (at_pointer >> 3), at_pointer & 7, prob); + ++insert_index_; +} + +bool BitPackedLongest::Find(WordIndex word, float &prob, const NodeRange &node) const { + uint64_t at_pointer; + if (!FindBitPacked(base_, word_mask_, total_bits_, node.begin, node.end, word, at_pointer)) return false; + at_pointer = at_pointer * total_bits_ + word_bits_; + prob = util::ReadNonPositiveFloat31(base_ + (at_pointer >> 3), at_pointer & 7); + return true; +} + +} // namespace trie +} // namespace ngram +} // namespace lm diff --git a/klm/lm/virtual_interface.cc b/klm/lm/virtual_interface.cc index 9c7151f9..c5a64972 100644 --- a/klm/lm/virtual_interface.cc +++ b/klm/lm/virtual_interface.cc @@ -1,17 +1,16 @@ #include "lm/virtual_interface.hh" -#include "lm/exception.hh" +#include "lm/lm_exception.hh" namespace lm { namespace base { Vocabulary::~Vocabulary() {} -void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available) { +void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { begin_sentence_ = begin_sentence; end_sentence_ = end_sentence; not_found_ = not_found; - available_ = available; if (begin_sentence_ == not_found_) throw SpecialWordMissingException("<s>"); if (end_sentence_ == not_found_) throw SpecialWordMissingException("</s>"); } diff --git a/klm/lm/virtual_interface.hh b/klm/lm/virtual_interface.hh index 621a129e..f15f8789 100644 --- a/klm/lm/virtual_interface.hh +++ b/klm/lm/virtual_interface.hh @@ -37,8 +37,6 @@ class Vocabulary { WordIndex BeginSentence() const { return begin_sentence_; } WordIndex EndSentence() const { return end_sentence_; } WordIndex NotFound() const { return not_found_; } - // FullScoreReturn start index of unused word assignments. - WordIndex Available() const { return available_; } /* Most implementations allow StringPiece lookups and need only override * Index(StringPiece). SRI requires null termination and overrides all @@ -56,13 +54,13 @@ class Vocabulary { // Call SetSpecial afterward. Vocabulary() {} - Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available) { - SetSpecial(begin_sentence, end_sentence, not_found, available); + Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { + SetSpecial(begin_sentence, end_sentence, not_found); } - void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found, WordIndex available); + void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); - WordIndex begin_sentence_, end_sentence_, not_found_, available_; + WordIndex begin_sentence_, end_sentence_, not_found_; private: // Disable copy constructors. They're private and undefined. @@ -97,7 +95,7 @@ class Vocabulary { * missing these methods, see facade.hh. * * This is the fastest way to use a model and presents a normal State class to - * be included in hypothesis state structure. + * be included in a hypothesis state structure. * * * OPTION 2: Use the virtual interface below. diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc new file mode 100644 index 00000000..c30428b2 --- /dev/null +++ b/klm/lm/vocab.cc @@ -0,0 +1,187 @@ +#include "lm/vocab.hh" + +#include "lm/enumerate_vocab.hh" +#include "lm/lm_exception.hh" +#include "lm/config.hh" +#include "lm/weights.hh" +#include "util/exception.hh" +#include "util/joint_sort.hh" +#include "util/murmur_hash.hh" +#include "util/probing_hash_table.hh" + +#include <string> + +namespace lm { +namespace ngram { + +namespace detail { +uint64_t HashForVocab(const char *str, std::size_t len) { + // This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000 + // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit. + return util::MurmurHash64A(str, len, 0); +} +} // namespace detail + +namespace { +// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok. +const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5); +// Sadly some LMs have <UNK>. +const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5); + +void ReadWords(int fd, EnumerateVocab *enumerate) { + if (!enumerate) return; + const std::size_t kInitialRead = 16384; + std::string buf; + buf.reserve(kInitialRead + 100); + buf.resize(kInitialRead); + WordIndex index = 0; + while (true) { + ssize_t got = read(fd, &buf[0], kInitialRead); + if (got == -1) UTIL_THROW(util::ErrnoException, "Reading vocabulary words"); + if (got == 0) return; + buf.resize(got); + while (buf[buf.size() - 1]) { + char next_char; + ssize_t ret = read(fd, &next_char, 1); + if (ret == -1) UTIL_THROW(util::ErrnoException, "Reading vocabulary words"); + if (ret == 0) UTIL_THROW(FormatLoadException, "Missing null terminator on a vocab word."); + buf.push_back(next_char); + } + // Ok now we have null terminated strings. + for (const char *i = buf.data(); i != buf.data() + buf.size();) { + std::size_t length = strlen(i); + enumerate->Add(index++, StringPiece(i, length)); + i += length + 1 /* null byte */; + } + } +} + +void WriteOrThrow(int fd, const void *data_void, std::size_t size) { + const uint8_t *data = static_cast<const uint8_t*>(data_void); + while (size) { + ssize_t ret = write(fd, data, size); + if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed"); + data += ret; + size -= ret; + } +} + +} // namespace + +WriteWordsWrapper::WriteWordsWrapper(EnumerateVocab *inner, int fd) : inner_(inner), fd_(fd) {} +WriteWordsWrapper::~WriteWordsWrapper() {} + +void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) { + if (inner_) inner_->Add(index, str); + WriteOrThrow(fd_, str.data(), str.size()); + char null_byte = 0; + // Inefficient because it's unbuffered. Sue me. + WriteOrThrow(fd_, &null_byte, 1); +} + +SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {} + +std::size_t SortedVocabulary::Size(std::size_t entries, const Config &/*config*/) { + // Lead with the number of entries. + return sizeof(uint64_t) + sizeof(Entry) * entries; +} + +void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config) { + assert(allocated >= Size(entries, config)); + // Leave space for number of entries. + begin_ = reinterpret_cast<Entry*>(reinterpret_cast<uint64_t*>(start) + 1); + end_ = begin_; + saw_unk_ = false; +} + +void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) { + enumerate_ = to; + if (enumerate_) { + enumerate_->Add(0, "<unk>"); + strings_to_enumerate_.resize(max_entries); + } +} + +WordIndex SortedVocabulary::Insert(const StringPiece &str) { + uint64_t hashed = detail::HashForVocab(str); + if (hashed == kUnknownHash || hashed == kUnknownCapHash) { + saw_unk_ = true; + return 0; + } + end_->key = hashed; + if (enumerate_) { + strings_to_enumerate_[end_ - begin_].assign(str.data(), str.size()); + } + ++end_; + // This is 1 + the offset where it was inserted to make room for unk. + return end_ - begin_; +} + +void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { + if (enumerate_) { + util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); + util::JointSort(begin_, end_, values); + for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) { + // <unk> strikes again: +1 here. + enumerate_->Add(i + 1, strings_to_enumerate_[i]); + } + strings_to_enumerate_.clear(); + } else { + util::JointSort(begin_, end_, reorder_vocab + 1); + } + SetSpecial(Index("<s>"), Index("</s>"), 0); + // Save size. + *(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_; +} + +void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { + end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1); + ReadWords(fd, to); + SetSpecial(Index("<s>"), Index("</s>"), 0); +} + +ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {} + +std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) { + return Lookup::Size(entries, config.probing_multiplier); +} + +void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) { + lookup_ = Lookup(start, allocated); + available_ = 1; + saw_unk_ = false; +} + +void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) { + enumerate_ = to; + if (enumerate_) { + enumerate_->Add(0, "<unk>"); + } +} + +WordIndex ProbingVocabulary::Insert(const StringPiece &str) { + uint64_t hashed = detail::HashForVocab(str); + // Prevent unknown from going into the table. + if (hashed == kUnknownHash || hashed == kUnknownCapHash) { + saw_unk_ = true; + return 0; + } else { + if (enumerate_) enumerate_->Add(available_, str); + lookup_.Insert(Lookup::Packing::Make(hashed, available_)); + return available_++; + } +} + +void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) { + lookup_.FinishedInserting(); + SetSpecial(Index("<s>"), Index("</s>"), 0); +} + +void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { + lookup_.LoadedBinary(); + ReadWords(fd, to); + SetSpecial(Index("<s>"), Index("</s>"), 0); +} + +} // namespace ngram +} // namespace lm diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index be84736c..9e38e0f1 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -20,6 +20,7 @@ noinst_LIBRARIES = libklm_util.a libklm_util_a_SOURCES = \ ersatz_progress.cc \ + bit_packing.cc \ exception.cc \ file_piece.cc \ mmap.cc \ diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc new file mode 100644 index 00000000..dd14ffe1 --- /dev/null +++ b/klm/util/bit_packing.cc @@ -0,0 +1,27 @@ +#include "util/bit_packing.hh" +#include "util/exception.hh" + +namespace util { + +namespace { +template <bool> struct StaticCheck {}; +template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; }; + +typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize; + +} // namespace + +uint8_t RequiredBits(uint64_t max_value) { + if (!max_value) return 0; + uint8_t ret = 1; + while (max_value >>= 1) ++ret; + return ret; +} + +void BitPackingSanity() { + const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; + if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); + // TODO: more checks. +} + +} // namespace util diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh new file mode 100644 index 00000000..422ed873 --- /dev/null +++ b/klm/util/bit_packing.hh @@ -0,0 +1,88 @@ +#ifndef UTIL_BIT_PACKING__ +#define UTIL_BIT_PACKING__ + +/* Bit-level packing routines */ + +#include <assert.h> +#ifdef __APPLE__ +#include <architecture/byte_order.h> +#else +#include <endian.h> +#endif + +#include <inttypes.h> + +#if __BYTE_ORDER != __LITTLE_ENDIAN +#error The bit aligned storage functions assume little endian architecture +#endif + +namespace util { + +/* WARNING WARNING WARNING: + * The write functions assume that memory is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast and that + * storage is little endian. This is the case on x86_64. It may not be the + * case on 32-bit x86 but my target audience is large language models for which + * 64-bit is necessary. + */ + +/* Pack integers up to 57 bits using their least significant digits. + * The length is specified using mask: + * Assumes mask == (1 << length) - 1 where length <= 57. + */ +inline uint64_t ReadInt57(const void *base, uint8_t bit, uint64_t mask) { + return (*reinterpret_cast<const uint64_t*>(base) >> bit) & mask; +} +/* Assumes value <= mask and mask == (1 << length) - 1 where length <= 57. + * Assumes the memory is zero initially. + */ +inline void WriteInt57(void *base, uint8_t bit, uint64_t value) { + *reinterpret_cast<uint64_t*>(base) |= (value << bit); +} + +namespace detail { typedef union { float f; uint32_t i; } FloatEnc; } +inline float ReadFloat32(const void *base, uint8_t bit) { + detail::FloatEnc encoded; + encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit; + return encoded.f; +} +inline void WriteFloat32(void *base, uint8_t bit, float value) { + detail::FloatEnc encoded; + encoded.f = value; + WriteInt57(base, bit, encoded.i); +} + +inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) { + detail::FloatEnc encoded; + encoded.i = *reinterpret_cast<const uint64_t*>(base) >> bit; + // Sign bit set means negative. + encoded.i |= 0x80000000; + return encoded.f; +} +inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) { + assert(value <= 0.0); + detail::FloatEnc encoded; + encoded.f = value; + encoded.i &= ~0x80000000; + WriteInt57(base, bit, encoded.i); +} + +void BitPackingSanity(); + +// Return bits required to store integers upto max_value. Not the most +// efficient implementation, but this is only called a few times to size tries. +uint8_t RequiredBits(uint64_t max_value); + +struct BitsMask { + void FromMax(uint64_t max_value) { + bits = RequiredBits(max_value); + mask = (1 << bits) - 1; + } + uint8_t bits; + uint64_t mask; +}; + +} // namespace util + +#endif // UTIL_BIT_PACKING__ diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc index 09e3a106..55c182bd 100644 --- a/klm/util/ersatz_progress.cc +++ b/klm/util/ersatz_progress.cc @@ -13,10 +13,7 @@ ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::s ErsatzProgress::~ErsatzProgress() { if (!out_) return; - for (; stones_written_ < kWidth; ++stones_written_) { - (*out_) << '*'; - } - *out_ << '\n'; + Finished(); } ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete) @@ -36,8 +33,8 @@ void ErsatzProgress::Milestone() { for (; stones_written_ < stone; ++stones_written_) { (*out_) << '*'; } - - if (current_ >= complete_) { + if (stone == kWidth) { + (*out_) << std::endl; next_ = std::numeric_limits<std::size_t>::max(); } else { next_ = std::max(next_, (stone * complete_) / kWidth); diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh index ea6c3bb9..92c345fe 100644 --- a/klm/util/ersatz_progress.hh +++ b/klm/util/ersatz_progress.hh @@ -19,7 +19,7 @@ class ErsatzProgress { ~ErsatzProgress(); ErsatzProgress &operator++() { - if (++current_ == next_) Milestone(); + if (++current_ >= next_) Milestone(); return *this; } @@ -33,6 +33,10 @@ class ErsatzProgress { Milestone(); } + void Finished() { + Set(complete_); + } + private: void Milestone(); diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index 2b439499..e7bd8659 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,19 +2,23 @@ #include "util/exception.hh" -#include <iostream> #include <string> #include <limits> #include <assert.h> -#include <cstdlib> #include <ctype.h> +#include <err.h> #include <fcntl.h> +#include <stdlib.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> +#ifdef HAVE_ZLIB +#include <zlib.h> +#endif + namespace util { EndOfFileException::EndOfFileException() throw() { @@ -26,6 +30,13 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a float"; } +GZException::GZException(void *file) { +#ifdef HAVE_ZLIB + int num; + *this << gzerror(file, &num) << " from zlib"; +#endif // HAVE_ZLIB +} + int OpenReadOrThrow(const char *name) { int ret = open(name, O_RDONLY); if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading"); @@ -38,42 +49,73 @@ off_t SizeFile(int fd) { return sb.st_size; } -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : +FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) : file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(const char *name, int fd, std::ostream *show_progress, off_t min_buffer) : +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) : file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) { - if (total_size_ == kBadSize) { - fallback_to_read_ = true; - if (show_progress) - *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; - } else { - fallback_to_read_ = false; +FilePiece::~FilePiece() { +#ifdef HAVE_ZLIB + if (gz_file_) { + // zlib took ownership + file_.release(); + int ret; + if (Z_OK != (ret = gzclose(gz_file_))) { + errx(1, "could not close file %s using zlib", file_name_.c_str()); + } } +#endif +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) { +#ifdef HAVE_ZLIB + gz_file_ = NULL; +#endif + file_name_ = name; + default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; at_end_ = false; + + if (total_size_ == kBadSize) { + // So the assertion passes. + fallback_to_read_ = false; + if (show_progress) + *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; + TransitionToRead(); + } else { + fallback_to_read_ = false; + } Shift(); + // gzip detect. + if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) { +#ifndef HAVE_ZLIB + UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in."); +#endif + if (!fallback_to_read_) { + at_end_ = false; + TransitionToRead(); + } + } } -float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) { +float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) { SkipSpaces(); while (last_space_ < position_) { if (at_end_) { // Hallucinate a null off the end of the file. std::string buffer(position_, position_end_); char *end; - float ret = std::strtof(buffer.c_str(), &end); + float ret = strtof(buffer.c_str(), &end); if (buffer.c_str() == end) throw ParseNumberException(buffer); position_ += end - buffer.c_str(); return ret; @@ -81,20 +123,20 @@ float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) { Shift(); } char *end; - float ret = std::strtof(position_, &end); + float ret = strtof(position_, &end); if (end == position_) throw ParseNumberException(ReadDelimited()); position_ = end; return ret; } -void FilePiece::SkipSpaces() throw (EndOfFileException) { +void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) { for (; ; ++position_) { if (position_ == position_end_) Shift(); if (!isspace(*position_)) return; } } -const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) { +const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) { for (const char *i = position_; i <= last_space_; ++i) { if (isspace(*i)) return i; } @@ -108,7 +150,7 @@ const char *FilePiece::FindDelimiterOrEOF() throw (EndOfFileException) { return position_end_; } -StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) { +StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) { const char *start = position_; do { for (const char *i = start; i < position_end_; ++i) { @@ -124,17 +166,19 @@ StringPiece FilePiece::ReadLine(char delim) throw (EndOfFileException) { } while (!at_end_); StringPiece ret(position_, position_end_ - position_); position_ = position_end_; - return position_; + return ret; } -void FilePiece::Shift() throw(EndOfFileException) { - if (at_end_) throw EndOfFileException(); +void FilePiece::Shift() throw(GZException, EndOfFileException) { + if (at_end_) { + progress_.Finished(); + throw EndOfFileException(); + } off_t desired_begin = position_ - data_.begin() + mapped_offset_; - progress_.Set(desired_begin); if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. - if (fallback_to_read_) ReadShift(desired_begin); + if (fallback_to_read_) ReadShift(); for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { if (isspace(*last_space_)) break; @@ -163,28 +207,41 @@ void FilePiece::MMapShift(off_t desired_begin) throw() { data_.reset(); data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED); if (data_.get() == MAP_FAILED) { - fallback_to_read_ = true; if (desired_begin) { if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either."); } + // The mmap was scheduled to end the file, but now we're going to read it. + at_end_ = false; + TransitionToRead(); return; } mapped_offset_ = mapped_offset; position_ = data_.begin() + ignore; position_end_ = data_.begin() + mapped_size; + + progress_.Set(desired_begin); +} + +void FilePiece::TransitionToRead() throw (GZException) { + assert(!fallback_to_read_); + fallback_to_read_ = true; + data_.reset(); + data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); + if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_); + position_ = data_.begin(); + position_end_ = position_; + +#ifdef HAVE_ZLIB + assert(!gz_file_); + gz_file_ = gzdopen(file_.get(), "r"); + if (!gz_file_) { + UTIL_THROW(GZException, "zlib failed to open " << file_name_); + } +#endif } -void FilePiece::ReadShift(off_t desired_begin) throw() { +void FilePiece::ReadShift() throw(GZException, EndOfFileException) { assert(fallback_to_read_); - if (data_.source() != scoped_memory::MALLOC_ALLOCATED) { - // First call. - data_.reset(); - data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); - if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_); - position_ = data_.begin(); - position_end_ = position_; - } - // Bytes [data_.begin(), position_) have been consumed. // Bytes [position_, position_end_) have been read into the buffer. @@ -215,9 +272,23 @@ void FilePiece::ReadShift(off_t desired_begin) throw() { } } - ssize_t read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); + ssize_t read_return; +#ifdef HAVE_ZLIB + read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); + if (read_return == -1) throw GZException(gz_file_); + if (total_size_ != kBadSize) { + // Just get the position, don't actually seek. Apparently this is how you do it. . . + off_t ret = lseek(file_.get(), 0, SEEK_CUR); + if (ret != -1) progress_.Set(ret); + } +#else + read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); if (read_return == -1) UTIL_THROW(ErrnoException, "read failed"); - if (read_return == 0) at_end_ = true; + progress_.Set(mapped_offset_); +#endif + if (read_return == 0) { + at_end_ = true; + } position_end_ += read_return; } diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 704f0ac6..11d4a751 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -11,6 +11,8 @@ #include <cstddef> +#define HAVE_ZLIB + namespace util { class EndOfFileException : public Exception { @@ -25,6 +27,13 @@ class ParseNumberException : public Exception { ~ParseNumberException() throw() {} }; +class GZException : public Exception { + public: + explicit GZException(void *file); + GZException() throw() {} + ~GZException() throw() {} +}; + int OpenReadOrThrow(const char *name); // Return value for SizeFile when it can't size properly. @@ -34,40 +43,42 @@ off_t SizeFile(int fd); class FilePiece { public: // 32 MB default. - explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException); // Takes ownership of fd. name is used for messages. - explicit FilePiece(const char *name, int fd, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432) throw(GZException); + + ~FilePiece(); - char get() throw(EndOfFileException) { - if (position_ == position_end_) Shift(); + char get() throw(GZException, EndOfFileException) { + if (position_ == position_end_) { + Shift(); + if (at_end_) throw EndOfFileException(); + } return *(position_++); } // Memory backing the returned StringPiece may vanish on the next call. // Leaves the delimiter, if any, to be returned by get(). - StringPiece ReadDelimited() throw(EndOfFileException) { + StringPiece ReadDelimited() throw(GZException, EndOfFileException) { SkipSpaces(); return Consume(FindDelimiterOrEOF()); } // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. // It is similar to getline in that way. - StringPiece ReadLine(char delim = '\n') throw(EndOfFileException); + StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException); - float ReadFloat() throw(EndOfFileException, ParseNumberException); + float ReadFloat() throw(GZException, EndOfFileException, ParseNumberException); - void SkipSpaces() throw (EndOfFileException); + void SkipSpaces() throw (GZException, EndOfFileException); off_t Offset() const { return position_ - data_.begin() + mapped_offset_; } - // Only for testing. - void ForceFallbackToRead() { - fallback_to_read_ = true; - } + const std::string &FileName() const { return file_name_; } private: - void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); + void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw(GZException); StringPiece Consume(const char *to) { StringPiece ret(position_, to - position_); @@ -75,12 +86,14 @@ class FilePiece { return ret; } - const char *FindDelimiterOrEOF() throw(EndOfFileException); + const char *FindDelimiterOrEOF() throw(EndOfFileException, GZException); - void Shift() throw (EndOfFileException); + void Shift() throw (EndOfFileException, GZException); // Backends to Shift(). void MMapShift(off_t desired_begin) throw (); - void ReadShift(off_t desired_begin) throw (); + + void TransitionToRead() throw (GZException); + void ReadShift() throw (GZException, EndOfFileException); const char *position_, *last_space_, *position_end_; @@ -98,6 +111,12 @@ class FilePiece { bool fallback_to_read_; ErsatzProgress progress_; + + std::string file_name_; + +#ifdef HAVE_ZLIB + void *gz_file_; +#endif // HAVE_ZLIB }; } // namespace util diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index befb7866..23e79fe0 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,15 +1,19 @@ #include "util/file_piece.hh" +#include "util/scoped.hh" + #define BOOST_TEST_MODULE FilePieceTest #include <boost/test/unit_test.hpp> #include <fstream> #include <iostream> +#include <stdio.h> + namespace util { namespace { /* mmap implementation */ -BOOST_AUTO_TEST_CASE(MMapLine) { +BOOST_AUTO_TEST_CASE(MMapReadLine) { std::fstream ref("file_piece.cc", std::ios::in); FilePiece test("file_piece.cc", NULL, 1); std::string ref_line; @@ -20,13 +24,17 @@ BOOST_AUTO_TEST_CASE(MMapLine) { BOOST_CHECK_EQUAL(ref_line, test_line); } } + BOOST_CHECK_THROW(test.get(), EndOfFileException); } /* read() implementation */ -BOOST_AUTO_TEST_CASE(ReadLine) { +BOOST_AUTO_TEST_CASE(StreamReadLine) { std::fstream ref("file_piece.cc", std::ios::in); - FilePiece test("file_piece.cc", NULL, 1); - test.ForceFallbackToRead(); + + scoped_FILE catter(popen("cat file_piece.cc", "r")); + BOOST_REQUIRE(catter.get()); + + FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -35,7 +43,47 @@ BOOST_AUTO_TEST_CASE(ReadLine) { BOOST_CHECK_EQUAL(ref_line, test_line); } } + BOOST_CHECK_THROW(test.get(), EndOfFileException); } +#ifdef HAVE_ZLIB + +// gzip file +BOOST_AUTO_TEST_CASE(PlainZipReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz")); + FilePiece test("file_piece.cc.gz", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} +// gzip stream +BOOST_AUTO_TEST_CASE(StreamZipReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + scoped_FILE catter(popen("gzip <file_piece.cc", "r")); + BOOST_REQUIRE(catter.get()); + + FilePiece test(dup(fileno(catter.get())), "file_piece.cc", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +#endif + } // namespace } // namespace util diff --git a/klm/util/joint_sort.hh b/klm/util/joint_sort.hh index a2f1c01d..cf3d8432 100644 --- a/klm/util/joint_sort.hh +++ b/klm/util/joint_sort.hh @@ -119,6 +119,12 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi } // namespace detail +template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > { + public: + PairedIterator(const KeyIter &key, const ValueIter &value) : + ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {} +}; + template <class KeyIter, class ValueIter, class Less> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > full_begin(detail::JointProxy<KeyIter, ValueIter>(key_begin, value_begin)); detail::LessWrapper<detail::JointProxy<KeyIter, ValueIter>, Less> less_wrap(less); diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 648b5d0a..8685170f 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -53,10 +53,8 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int if (prefault) { flags |= MAP_POPULATE; } - int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; -#else - int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; #endif + int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; void *ret = mmap(NULL, size, protect, flags, fd, offset); if (ret == MAP_FAILED) { UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset); @@ -64,8 +62,40 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int return ret; } -void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset) { - return MapOrThrow(size, false, MAP_FILE | MAP_PRIVATE, prefault, fd, offset); +namespace { +void ReadAll(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast<uint8_t*>(to_void); + while (amount) { + ssize_t ret = read(fd, to, amount); + if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); + amount -= ret; + to += ret; + } +} +} // namespace + +void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) { + switch (method) { + case LAZY: + out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; + case POPULATE_OR_LAZY: +#ifdef MAP_POPULATE + case POPULATE_OR_READ: +#endif + out.reset(MapOrThrow(size, false, MAP_FILE | MAP_SHARED, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; +#ifndef MAP_POPULATE + case POPULATE_OR_READ: +#endif + case READ: + out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED); + if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc"); + if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed."); + ReadAll(fd, out.get(), size); + break; + } } void *MapAnonymous(std::size_t size) { @@ -78,14 +108,14 @@ void *MapAnonymous(std::size_t size) { | MAP_PRIVATE, false, -1, 0); } -void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem) { +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); if (-1 == file.get()) UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing"); if (-1 == ftruncate(file.get(), size)) UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed"); try { - mem.reset(MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0), size); + return MapOrThrow(size, true, MAP_FILE | MAP_SHARED, false, file.get(), 0); } catch (ErrnoException &e) { e << " in file " << name; throw; diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh index c9068ec9..0a504d89 100644 --- a/klm/util/mmap.hh +++ b/klm/util/mmap.hh @@ -6,6 +6,7 @@ #include <cstddef> +#include <inttypes.h> #include <sys/types.h> namespace util { @@ -19,8 +20,8 @@ class scoped_mmap { void *get() const { return data_; } - const char *begin() const { return reinterpret_cast<char*>(data_); } - const char *end() const { return reinterpret_cast<char*>(data_) + size_; } + const uint8_t *begin() const { return reinterpret_cast<uint8_t*>(data_); } + const uint8_t *end() const { return reinterpret_cast<uint8_t*>(data_) + size_; } std::size_t size() const { return size_; } void reset(void *data, std::size_t size) { @@ -79,23 +80,27 @@ class scoped_memory { scoped_memory &operator=(const scoped_memory &); }; -struct scoped_mapped_file { - scoped_fd fd; - scoped_mmap mem; -}; +typedef enum { + // mmap with no prepopulate + LAZY, + // On linux, pass MAP_POPULATE to mmap. + POPULATE_OR_LAZY, + // Populate on Linux. malloc and read on non-Linux. + POPULATE_OR_READ, + // malloc and read. + READ +} LoadMethod; + // Wrapper around mmap to check it worked and hide some platform macros. void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0); -void *MapForRead(std::size_t size, bool prefault, int fd, off_t offset = 0); +void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out); void *MapAnonymous(std::size_t size); // Open file name with mmap of size bytes, all of which are initially zero. -void MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file, scoped_mmap &mem); -inline void MapZeroedWrite(const char *name, std::size_t size, scoped_mapped_file &out) { - MapZeroedWrite(name, size, out.fd, out.mem); -} - +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); + } // namespace util -#endif // UTIL_SCOPED__ +#endif // UTIL_MMAP__ diff --git a/klm/util/proxy_iterator.hh b/klm/util/proxy_iterator.hh index 1c5b7089..121a45fa 100644 --- a/klm/util/proxy_iterator.hh +++ b/klm/util/proxy_iterator.hh @@ -78,6 +78,8 @@ template <class Proxy> class ProxyIterator { const Proxy *operator->() const { return &p_; } Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } + const InnerIterator &Inner() { return p_.Inner(); } + private: InnerIterator &I() { return p_.Inner(); } const InnerIterator &I() const { return p_.Inner(); } diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc index 61394ffc..2c6d5394 100644 --- a/klm/util/scoped.cc +++ b/klm/util/scoped.cc @@ -9,4 +9,8 @@ scoped_fd::~scoped_fd() { if (fd_ != -1 && close(fd_)) err(1, "Could not close file %i", fd_); } +scoped_FILE::~scoped_FILE() { + if (file_ && fclose(file_)) err(1, "Could not close file"); +} + } // namespace util diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh index ef62a74f..52864481 100644 --- a/klm/util/scoped.hh +++ b/klm/util/scoped.hh @@ -4,6 +4,7 @@ /* Other scoped objects in the style of scoped_ptr. */ #include <cstddef> +#include <cstdio> namespace util { @@ -61,6 +62,24 @@ class scoped_fd { scoped_fd &operator=(const scoped_fd &); }; +class scoped_FILE { + public: + explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {} + + ~scoped_FILE(); + + std::FILE *get() { return file_; } + const std::FILE *get() const { return file_; } + + void reset(std::FILE *to = NULL) { + scoped_FILE other(file_); + file_ = to; + } + + private: + std::FILE *file_; +}; + } // namespace util #endif // UTIL_SCOPED__ diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh index 96ec4866..a8e208fb 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -65,7 +65,7 @@ template <class PackingT> class SortedUniformMap { public: // Offer consistent API with probing hash. - static std::size_t Size(std::size_t entries, float ignore = 0.0) { + static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { return sizeof(uint64_t) + entries * Packing::kBytes; } @@ -75,7 +75,7 @@ template <class PackingT> class SortedUniformMap { #endif {} - SortedUniformMap(void *start, std::size_t allocated) : + SortedUniformMap(void *start, std::size_t /*allocated*/) : begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)), end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start)) #ifdef DEBUG diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc index 6917a6bc..5b4e98f5 100644 --- a/klm/util/string_piece.cc +++ b/klm/util/string_piece.cc @@ -30,14 +30,14 @@ #include "util/string_piece.hh" -#ifdef USE_BOOST +#ifdef HAVE_BOOST #include <boost/functional/hash/hash.hpp> #endif #include <algorithm> #include <iostream> -#ifdef USE_ICU +#ifdef HAVE_ICU U_NAMESPACE_BEGIN #endif @@ -46,12 +46,12 @@ std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { return o; } -#ifdef USE_BOOST +#ifdef HAVE_BOOST size_t hash_value(const StringPiece &str) { return boost::hash_range(str.data(), str.data() + str.length()); } #endif -#ifdef USE_ICU +#ifdef HAVE_ICU U_NAMESPACE_END #endif diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 58008d13..3ac2f8a7 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -1,4 +1,4 @@ -/* If you use ICU in your program, then compile with -DUSE_ICU -licui18n. If +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If * you don't use ICU, then this will use the Google implementation from Chrome. * This has been modified from the original version to let you choose. */ @@ -49,14 +49,14 @@ #define BASE_STRING_PIECE_H__ //Uncomment this line if you use ICU in your code. -//#define USE_ICU +//#define HAVE_ICU //Uncomment this line if you want boost hashing for your StringPieces. -//#define USE_BOOST +//#define HAVE_BOOST #include <cstring> #include <iosfwd> -#ifdef USE_ICU +#ifdef HAVE_ICU #include <unicode/stringpiece.h> U_NAMESPACE_BEGIN #else @@ -230,7 +230,7 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) { // allow StringPiece to be logged (needed for unit testing). extern std::ostream& operator<<(std::ostream& o, const StringPiece& piece); -#ifdef USE_BOOST +#ifdef HAVE_BOOST size_t hash_value(const StringPiece &str); /* Support for lookup of StringPiece in boost::unordered_map<std::string> */ @@ -253,7 +253,7 @@ template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece } #endif -#ifdef USE_ICU +#ifdef HAVE_ICU U_NAMESPACE_END #endif |