From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 18 Jan 2013 17:12:51 +0000 Subject: KenLM dffafbf with lmplz source (but not built) --- klm/util/stream/sort.hh | 542 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 542 insertions(+) create mode 100644 klm/util/stream/sort.hh (limited to 'klm/util/stream/sort.hh') diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh new file mode 100644 index 00000000..be6c11ea --- /dev/null +++ b/klm/util/stream/sort.hh @@ -0,0 +1,542 @@ +/* Usage: + * Sort sorter(temp, compare); + * Chain(config) >> Read(file) >> sorter.Unsorted(); + * Stream stream; + * Chain chain(config) >> sorter.Sorted(internal_config, lazy_config) >> stream; + * + * Note that sorter must outlive any threads that use Unsorted or Sorted. + * + * Combiners take the form: + * bool operator()(void *into, const void *option, const Compare &compare) const + * which returns true iff a combination happened. The sorting algorithm + * guarantees compare(into, option). But it does not guarantee + * compare(option, into). + * Currently, combining is only done in merge steps, not during on-the-fly + * sort. Use a hash table for that. + */ + +#ifndef UTIL_STREAM_SORT__ +#define UTIL_STREAM_SORT__ + +#include "util/stream/chain.hh" +#include "util/stream/config.hh" +#include "util/stream/io.hh" +#include "util/stream/stream.hh" +#include "util/stream/timer.hh" + +#include "util/file.hh" +#include "util/scoped.hh" +#include "util/sized_iterator.hh" + +#include +#include +#include +#include + +namespace util { +namespace stream { + +struct NeverCombine { + template bool operator()(const void *, const void *, const Compare &) const { + return false; + } +}; + +// Manage the offsets of sorted blocks in a file. +class Offsets { + public: + explicit Offsets(int fd) : log_(fd) { + Reset(); + } + + int File() const { return log_; } + + void Append(uint64_t length) { + if (!length) return; + ++block_count_; + if (length == cur_.length) { + ++cur_.run; + return; + } + WriteOrThrow(log_, &cur_, sizeof(Entry)); + cur_.length = length; + cur_.run = 1; + } + + void FinishedAppending() { + WriteOrThrow(log_, &cur_, sizeof(Entry)); + SeekOrThrow(log_, sizeof(Entry)); // Skip 0,0 at beginning. + cur_.run = 0; + if (block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + } + + uint64_t RemainingBlocks() const { return block_count_; } + + uint64_t TotalOffset() const { return output_sum_; } + + uint64_t PeekSize() const { + return cur_.length; + } + + uint64_t NextSize() { + assert(block_count_); + uint64_t ret = cur_.length; + output_sum_ += ret; + + --cur_.run; + --block_count_; + if (!cur_.run && block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + return ret; + } + + void Reset() { + SeekOrThrow(log_, 0); + ResizeOrThrow(log_, 0); + cur_.length = 0; + cur_.run = 0; + block_count_ = 0; + output_sum_ = 0; + } + + private: + int log_; + + struct Entry { + uint64_t length; + uint64_t run; + }; + Entry cur_; + + uint64_t block_count_; + + uint64_t output_sum_; +}; + +// A priority queue of entries backed by file buffers +template class MergeQueue { + public: + MergeQueue(int fd, std::size_t buffer_size, std::size_t entry_size, const Compare &compare) + : queue_(Greater(compare)), in_(fd), buffer_size_(buffer_size), entry_size_(entry_size) {} + + void Push(void *base, uint64_t offset, uint64_t amount) { + queue_.push(Entry(base, in_, offset, amount, buffer_size_)); + } + + const void *Top() const { + return queue_.top().Current(); + } + + void Pop() { + Entry top(queue_.top()); + queue_.pop(); + if (top.Increment(in_, buffer_size_, entry_size_)) + queue_.push(top); + } + + std::size_t Size() const { + return queue_.size(); + } + + bool Empty() const { + return queue_.empty(); + } + + private: + // Priority queue contains these entries. + class Entry { + public: + Entry() {} + + Entry(void *base, int fd, uint64_t offset, uint64_t amount, std::size_t buf_size) { + offset_ = offset; + remaining_ = amount; + buffer_end_ = static_cast(base) + buf_size; + Read(fd, buf_size); + } + + bool Increment(int fd, std::size_t buf_size, std::size_t entry_size) { + current_ += entry_size; + if (current_ != buffer_end_) return true; + return Read(fd, buf_size); + } + + const void *Current() const { return current_; } + + private: + bool Read(int fd, std::size_t buf_size) { + current_ = buffer_end_ - buf_size; + std::size_t amount; + if (static_cast(buf_size) < remaining_) { + amount = buf_size; + } else if (!remaining_) { + return false; + } else { + amount = remaining_; + buffer_end_ = current_ + remaining_; + } + PReadOrThrow(fd, current_, amount, offset_); + offset_ += amount; + assert(current_ <= buffer_end_); + remaining_ -= amount; + return true; + } + + // Buffer + uint8_t *current_, *buffer_end_; + // File + uint64_t remaining_, offset_; + }; + + // Wrapper comparison function for queue entries. + class Greater : public std::binary_function { + public: + explicit Greater(const Compare &compare) : compare_(compare) {} + + bool operator()(const Entry &first, const Entry &second) const { + return compare_(second.Current(), first.Current()); + } + + private: + const Compare compare_; + }; + + typedef std::priority_queue, Greater> Queue; + Queue queue_; + + const int in_; + const std::size_t buffer_size_; + const std::size_t entry_size_; +}; + +/* A worker object that merges. If the number of pieces to merge exceeds the + * arity, it outputs multiple sorted blocks, recording to out_offsets. + * However, users will only every see a single sorted block out output because + * Sort::Sorted insures the arity is higher than the number of pieces before + * returning this. + */ +template class MergingReader { + public: + MergingReader(int in, Offsets *in_offsets, Offsets *out_offsets, std::size_t buffer_size, std::size_t total_memory, const Compare &compare, const Combine &combine) : + compare_(compare), combine_(combine), + in_(in), + in_offsets_(in_offsets), out_offsets_(out_offsets), + buffer_size_(buffer_size), total_memory_(total_memory) {} + + void Run(const ChainPosition &position) { + Run(position, false); + } + + void Run(const ChainPosition &position, bool assert_one) { + // Special case: nothing to read. + if (!in_offsets_->RemainingBlocks()) { + Link l(position); + l.Poison(); + return; + } + // If there's just one entry, just read. + if (in_offsets_->RemainingBlocks() == 1) { + // Sequencing is important. + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t amount = in_offsets_->NextSize(); + ReadSingle(offset, amount, position); + if (out_offsets_) out_offsets_->Append(amount); + return; + } + + Stream str(position); + scoped_malloc buffer(MallocOrThrow(total_memory_)); + uint8_t *const buffer_end = static_cast(buffer.get()) + total_memory_; + + const std::size_t entry_size = position.GetChain().EntrySize(); + + while (in_offsets_->RemainingBlocks()) { + // Use bigger buffers if there's less remaining. + uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks()); + per_buffer -= per_buffer % entry_size; + assert(per_buffer); + + // Populate queue. + MergeQueue queue(in_, per_buffer, entry_size, compare_); + for (uint8_t *buf = static_cast(buffer.get()); + in_offsets_->RemainingBlocks() && (buf + std::min(per_buffer, in_offsets_->PeekSize()) <= buffer_end);) { + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t size = in_offsets_->NextSize(); + queue.Push(buf, offset, size); + buf += static_cast(std::min(size, per_buffer)); + } + // This shouldn't happen but it's probably better to die than loop indefinitely. + if (queue.Size() < 2 && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: not merging at least two stripes." << std::endl; + abort(); + } + if (assert_one && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: should only be one merge group for lazy sort" << std::endl; + abort(); + } + + uint64_t written = 0; + // Merge including combiner support. + memcpy(str.Get(), queue.Top(), entry_size); + for (queue.Pop(); !queue.Empty(); queue.Pop()) { + if (!combine_(str.Get(), queue.Top(), compare_)) { + ++written; ++str; + memcpy(str.Get(), queue.Top(), entry_size); + } + } + ++written; ++str; + if (out_offsets_) + out_offsets_->Append(written * entry_size); + } + str.Poison(); + } + + private: + void ReadSingle(uint64_t offset, const uint64_t size, const ChainPosition &position) { + // Special case: only one to read. + const uint64_t end = offset + size; + const uint64_t block_size = position.GetChain().BlockSize(); + Link l(position); + for (; offset + block_size < end; ++l, offset += block_size) { + PReadOrThrow(in_, l->Get(), block_size, offset); + l->SetValidSize(block_size); + } + PReadOrThrow(in_, l->Get(), end - offset, offset); + l->SetValidSize(end - offset); + (++l).Poison(); + return; + } + + Compare compare_; + Combine combine_; + + int in_; + + protected: + Offsets *in_offsets_; + + private: + Offsets *out_offsets_; + + std::size_t buffer_size_; + std::size_t total_memory_; +}; + +// The lazy step owns the remaining files. This keeps track of them. +template class OwningMergingReader : public MergingReader { + private: + typedef MergingReader P; + public: + OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine) + : P(data, NULL, NULL, buffer, lazy, compare, combine), + data_(data), + offsets_(offsets) {} + + void Run(const ChainPosition &position) { + P::in_offsets_ = &offsets_; + scoped_fd data(data_); + scoped_fd offsets_file(offsets_.File()); + P::Run(position, true); + } + + private: + int data_; + Offsets offsets_; +}; + +// Don't use this directly. Worker that sorts blocks. +template class BlockSorter { + public: + BlockSorter(Offsets &offsets, const Compare &compare) : + offsets_(&offsets), compare_(compare) {} + + void Run(const ChainPosition &position) { + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + // Record the size of each block in a separate file. + offsets_->Append(link->ValidSize()); + void *end = static_cast(link->Get()) + link->ValidSize(); + std::sort( + SizedIt(link->Get(), entry_size), + SizedIt(end, entry_size), + compare_); + } + offsets_->FinishedAppending(); + } + + private: + Offsets *offsets_; + SizedCompare compare_; +}; + +class BadSortConfig : public Exception { + public: + BadSortConfig() throw() {} + ~BadSortConfig() throw() {} +}; + +template class Sort { + public: + Sort(Chain &in, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = Combine()) + : config_(config), + data_(MakeTemp(config.temp_prefix)), + offsets_file_(MakeTemp(config.temp_prefix)), offsets_(offsets_file_.get()), + compare_(compare), combine_(combine), + entry_size_(in.EntrySize()) { + UTIL_THROW_IF(!entry_size_, BadSortConfig, "Sorting entries of size 0"); + // Make buffer_size a multiple of the entry_size. + config_.buffer_size -= config_.buffer_size % entry_size_; + UTIL_THROW_IF(!config_.buffer_size, BadSortConfig, "Sort buffer too small"); + UTIL_THROW_IF(config_.total_memory < config_.buffer_size * 4, BadSortConfig, "Sorting memory " << config_.total_memory << " is too small for four buffers (two read and two write)."); + in >> BlockSorter(offsets_, compare_) >> WriteAndRecycle(data_.get()); + } + + uint64_t Size() const { + return SizeOrThrow(data_.get()); + } + + // Do merge sort, terminating when lazy merge could be done with the + // specified memory. Return the minimum memory necessary to do lazy merge. + std::size_t Merge(std::size_t lazy_memory) { + if (offsets_.RemainingBlocks() <= 1) return 0; + const uint64_t lazy_arity = std::max(1, lazy_memory / config_.buffer_size); + uint64_t size = Size(); + /* No overflow because + * offsets_.RemainingBlocks() * config_.buffer_size <= lazy_memory || + * size < lazy_memory + */ + if (offsets_.RemainingBlocks() <= lazy_arity || size <= static_cast(lazy_memory)) + return std::min(size, offsets_.RemainingBlocks() * config_.buffer_size); + + scoped_fd data2(MakeTemp(config_.temp_prefix)); + int fd_in = data_.get(), fd_out = data2.get(); + scoped_fd offsets2_file(MakeTemp(config_.temp_prefix)); + Offsets offsets2(offsets2_file.get()); + Offsets *offsets_in = &offsets_, *offsets_out = &offsets2; + + // Double buffered writing. + ChainConfig chain_config; + chain_config.entry_size = entry_size_; + chain_config.block_count = 2; + chain_config.total_memory = config_.buffer_size * 2; + Chain chain(chain_config); + + while (offsets_in->RemainingBlocks() > lazy_arity) { + if (size <= static_cast(lazy_memory)) break; + std::size_t reading_memory = config_.total_memory - 2 * config_.buffer_size; + if (size < static_cast(reading_memory)) { + reading_memory = static_cast(size); + } + SeekOrThrow(fd_in, 0); + chain >> + MergingReader( + fd_in, + offsets_in, offsets_out, + config_.buffer_size, + reading_memory, + compare_, combine_) >> + WriteAndRecycle(fd_out); + chain.Wait(); + offsets_out->FinishedAppending(); + ResizeOrThrow(fd_in, 0); + offsets_in->Reset(); + std::swap(fd_in, fd_out); + std::swap(offsets_in, offsets_out); + size = SizeOrThrow(fd_in); + } + + SeekOrThrow(fd_in, 0); + if (fd_in == data2.get()) { + data_.reset(data2.release()); + offsets_file_.reset(offsets2_file.release()); + offsets_ = offsets2; + } + if (offsets_.RemainingBlocks() <= 1) return 0; + // No overflow because the while loop exited. + return std::min(size, offsets_.RemainingBlocks() * static_cast(config_.buffer_size)); + } + + // Output to chain, using this amount of memory, maximum, for lazy merge + // sort. + void Output(Chain &out, std::size_t lazy_memory) { + Merge(lazy_memory); + out.SetProgressTarget(Size()); + out >> OwningMergingReader(data_.get(), offsets_, config_.buffer_size, lazy_memory, compare_, combine_); + data_.release(); + offsets_file_.release(); + } + + /* If a pipeline step is reading sorted input and writing to a different + * sort order, then there's a trade-off between using RAM to read lazily + * (avoiding copying the file) and using RAM to increase block size and, + * therefore, decrease the number of merge sort passes in the next + * iteration. + * + * Merge sort takes log_{arity}(pieces) passes. Thus, each time the chain + * block size is multiplied by arity, the number of output passes decreases + * by one. Up to a constant, then, log_{arity}(chain) is the number of + * passes saved. Chain simply divides the memory evenly over all blocks. + * + * Lazy sort saves this many passes (up to a constant) + * log_{arity}((memory-lazy)/block_count) + 1 + * Non-lazy sort saves this many passes (up to the same constant): + * log_{arity}(memory/block_count) + * Add log_{arity}(block_count) to both: + * log_{arity}(memory-lazy) + 1 versus log_{arity}(memory) + * Take arity to the power of both sizes (arity > 1) + * (memory - lazy)*arity versus memory + * Solve for lazy + * lazy = memory * (arity - 1) / arity + */ + std::size_t DefaultLazy() { + float arity = static_cast(config_.total_memory / config_.buffer_size); + return static_cast(static_cast(config_.total_memory) * (arity - 1.0) / arity); + } + + // Same as Output with default lazy memory setting. + void Output(Chain &out) { + Output(out, DefaultLazy()); + } + + // Completely merge sort and transfer ownership to the caller. + int StealCompleted() { + // Merge all the way. + Merge(0); + SeekOrThrow(data_.get(), 0); + offsets_file_.reset(); + return data_.release(); + } + + private: + SortConfig config_; + + scoped_fd data_; + + scoped_fd offsets_file_; + Offsets offsets_; + + const Compare compare_; + const Combine combine_; + const std::size_t entry_size_; +}; + +// returns bytes to be read on demand. +template uint64_t BlockingSort(Chain &chain, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = NeverCombine()) { + Sort sorter(chain, config, compare, combine); + chain.Wait(true); + uint64_t size = sorter.Size(); + sorter.Output(chain); + return size; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_SORT__ -- cgit v1.2.3 From 8b9aae7cff1efd1be195cdd000b21546bd5fca04 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 19 Jan 2013 19:09:48 -0500 Subject: updated version of boost.m4 and automatically build kenneth's LM builder --- Makefile.am | 2 + configure.ac | 7 +- corpus/cut-corpus.pl | 2 +- klm/lm/builder/Makefile.am | 28 +++ klm/util/Makefile.am | 2 +- klm/util/double-conversion/Makefile.am | 2 +- klm/util/stream/Makefile.am | 20 ++ klm/util/stream/sort.hh | 3 +- m4/boost.m4 | 322 +++++++++++++++++++++++++-------- 9 files changed, 311 insertions(+), 77 deletions(-) create mode 100644 klm/lm/builder/Makefile.am create mode 100644 klm/util/stream/Makefile.am (limited to 'klm/util/stream/sort.hh') diff --git a/Makefile.am b/Makefile.am index c2444928..17190d27 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,8 +5,10 @@ SUBDIRS = \ utils \ mteval \ klm/util/double-conversion \ + klm/util/stream \ klm/util \ klm/lm \ + klm/lm/builder \ klm/search \ decoder \ training \ diff --git a/configure.ac b/configure.ac index d6030752..a1e5ad84 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[2013-01-15]) +AC_INIT([cdec],[2013-01-19]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -15,7 +15,10 @@ BOOST_REQUIRE([1.44]) BOOST_PROGRAM_OPTIONS BOOST_SYSTEM BOOST_SERIALIZATION +BOOST_CHRONO +BOOST_TIMER BOOST_TEST +BOOST_THREADS AM_PATH_PYTHON AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H)) AC_CHECK_LIB(dl, dlopen) @@ -111,8 +114,10 @@ AC_CONFIG_FILES([word-aligner/Makefile]) # KenLM stuff AC_CONFIG_FILES([klm/util/double-conversion/Makefile]) +AC_CONFIG_FILES([klm/util/stream/Makefile]) AC_CONFIG_FILES([klm/util/Makefile]) AC_CONFIG_FILES([klm/lm/Makefile]) +AC_CONFIG_FILES([klm/lm/builder/Makefile]) AC_CONFIG_FILES([klm/search/Makefile]) # training stuff diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index 7daac0e2..0af3b23c 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -22,7 +22,7 @@ for my $ff (@ind) { while(<>) { chomp; - my @fields = split / \|\|\| /; + my @fields = split /\s*\|\|\|\s*/; my @sf; for my $i (@o) { my $y = $fields[$i]; diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am new file mode 100644 index 00000000..00444256 --- /dev/null +++ b/klm/lm/builder/Makefile.am @@ -0,0 +1,28 @@ +bin_PROGRAMS = builder + +builder_SOURCES = \ + main.cc \ + adjust_counts.cc \ + adjust_counts.hh \ + corpus_count.cc \ + corpus_count.hh \ + discount.hh \ + header_info.hh \ + initial_probabilities.cc \ + initial_probabilities.hh \ + interpolate.cc \ + interpolate.hh \ + joint_order.hh \ + multi_stream.hh \ + ngram.hh \ + ngram_stream.hh \ + pipeline.cc \ + pipeline.hh \ + print.cc \ + print.hh \ + sort.hh + +builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_TIMER_LIBS) $(BOOST_CHRONO_LIBS) $(BOOST_THREAD_LIBS) + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 294ebc0a..248cc844 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -54,4 +54,4 @@ libklm_util_a_SOURCES = \ string_piece.cc \ usage.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/double-conversion/Makefile.am b/klm/util/double-conversion/Makefile.am index eb6616f7..dfcfb009 100644 --- a/klm/util/double-conversion/Makefile.am +++ b/klm/util/double-conversion/Makefile.am @@ -20,4 +20,4 @@ libklm_util_double_a_SOURCES = \ fixed-dtoa.cc \ strtod.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/Makefile.am b/klm/util/stream/Makefile.am new file mode 100644 index 00000000..f18cbedb --- /dev/null +++ b/klm/util/stream/Makefile.am @@ -0,0 +1,20 @@ +noinst_LIBRARIES = libklm_util_stream.a + +libklm_util_stream_a_SOURCES = \ + block.hh \ + chain.cc \ + chain.hh \ + config.hh \ + io.cc \ + io.hh \ + line_input.cc \ + line_input.hh \ + multi_progress.cc \ + multi_progress.hh \ + sort.hh \ + stream.hh \ + timer.hh + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + +#-I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index be6c11ea..df57fa41 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,7 +259,8 @@ template class MergingReader { while (in_offsets_->RemainingBlocks()) { // Use bigger buffers if there's less remaining. - uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks()); + uint64_t per_buffer = std::max(static_cast(buffer_size_), + static_cast(total_memory_ / in_offsets_->RemainingBlocks())); per_buffer -= per_buffer % entry_size; assert(per_buffer); diff --git a/m4/boost.m4 b/m4/boost.m4 index 7e0ed075..027e039b 100644 --- a/m4/boost.m4 +++ b/m4/boost.m4 @@ -1,5 +1,5 @@ # boost.m4: Locate Boost headers and libraries for autoconf-based projects. -# Copyright (C) 2007, 2008, 2009 Benoit Sigoure +# Copyright (C) 2007, 2008, 2009, 2010, 2011 Benoit Sigoure # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ # along with this program. If not, see . m4_define([_BOOST_SERIAL], [m4_translit([ -# serial 12 +# serial 16 ], [# ], [])]) @@ -45,15 +45,19 @@ m4_define([_BOOST_SERIAL], [m4_translit([ # Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL. If you don't, don't worry, # simply read the README, it will show you what to do step by step. -m4_pattern_forbid([^_?BOOST_]) +m4_pattern_forbid([^_?(BOOST|Boost)_]) # _BOOST_SED_CPP(SED-PROGRAM, PROGRAM, # [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # -------------------------------------------------------- # Same as AC_EGREP_CPP, but leave the result in conftest.i. -# PATTERN is *not* overquoted, as in AC_EGREP_CPP. It could be useful -# to turn this into a macro which extracts the value of any macro. +# +# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP. It is expanded +# in double-quotes, so escape your double quotes. +# +# It could be useful to turn this into a macro which extracts the +# value of any macro. m4_define([_BOOST_SED_CPP], [AC_LANG_PREPROC_REQUIRE()dnl AC_REQUIRE([AC_PROG_SED])dnl @@ -98,6 +102,7 @@ set x $boost_version_req 0 0 0 IFS=$boost_save_IFS shift boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"` +boost_version_req_string=$[1].$[2].$[3] AC_ARG_WITH([boost], [AS_HELP_STRING([--with-boost=DIR], [prefix of Boost $1 @<:@guess@:>@])])dnl @@ -113,9 +118,9 @@ if test x"$BOOST_ROOT" != x; then fi fi AC_SUBST([DISTCHECK_CONFIGURE_FLAGS], - ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"]) + ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl boost_save_CPPFLAGS=$CPPFLAGS - AC_CACHE_CHECK([for Boost headers version >= $boost_version_req], + AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string], [boost_cv_inc_path], [boost_cv_inc_path=no AC_LANG_PUSH([C++])dnl @@ -183,24 +188,25 @@ AC_LANG_POP([C++])dnl ]) case $boost_cv_inc_path in #( no) - boost_errmsg="cannot find Boost headers version >= $boost_version_req" + boost_errmsg="cannot find Boost headers version >= $boost_version_req_string" m4_if([$2], [], [AC_MSG_ERROR([$boost_errmsg])], [AC_MSG_NOTICE([$boost_errmsg])]) $2 ;;#( yes) BOOST_CPPFLAGS= - AC_DEFINE([HAVE_BOOST], [1], - [Defined if the requested minimum BOOST version is satisfied]) ;;#( *) - AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"]) + AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl ;; esac + if test x"$boost_cv_inc_path" != xno; then + AC_DEFINE([HAVE_BOOST], [1], + [Defined if the requested minimum BOOST version is satisfied]) AC_CACHE_CHECK([for Boost's header version], [boost_cv_lib_version], [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl - _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;g;}], + _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}], [#include boost-lib-version = BOOST_LIB_VERSION], [boost_cv_lib_version=`cat conftest.i`])]) @@ -211,6 +217,7 @@ boost-lib-version = BOOST_LIB_VERSION], AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version]) ;; esac +fi CPPFLAGS=$boost_save_CPPFLAGS ])# BOOST_REQUIRE @@ -220,7 +227,7 @@ CPPFLAGS=$boost_save_CPPFLAGS # on the command line, static versions of the libraries will be looked up. AC_DEFUN([BOOST_STATIC], [AC_ARG_ENABLE([static-boost], - [AC_HELP_STRING([--enable-static-boost], + [AS_HELP_STRING([--enable-static-boost], [Prefer the static boost libraries over the shared ones [no]])], [enable_static_boost=yes], [enable_static_boost=no])])# BOOST_STATIC @@ -290,6 +297,7 @@ dnl The else branch is huge and wasn't intended on purpose. AC_LANG_PUSH([C++])dnl AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl +AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl BOOST_FIND_HEADER([$3]) boost_save_CPPFLAGS=$CPPFLAGS @@ -371,8 +379,8 @@ for boost_rtopt_ in $boost_rtopt '' -d; do boost_tmp_lib=$with_boost test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include} for boost_ldpath in "$boost_tmp_lib/lib" '' \ - /opt/local/lib /usr/local/lib /opt/lib /usr/lib \ - "$with_boost" C:/Boost/lib /lib /usr/lib64 /lib64 + /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \ + "$with_boost" C:/Boost/lib /lib* do test -e "$boost_ldpath" || continue boost_save_LDFLAGS=$LDFLAGS @@ -395,7 +403,16 @@ dnl generated only once above (before we start the for loops). LDFLAGS=$boost_save_LDFLAGS LIBS=$boost_save_LIBS if test x"$Boost_lib" = xyes; then - Boost_lib_LDFLAGS="-L$boost_ldpath -R$boost_ldpath" + # Because Boost is often installed in non-standard locations we want to + # hardcode the path to the library (with rpath). Here we assume that + # Libtool's macro was already invoked so we can steal its variable + # hardcode_libdir_flag_spec in order to get the right flags for ld. + boost_save_libdir=$libdir + libdir=$boost_ldpath + eval boost_rpath=\"$hardcode_libdir_flag_spec\" + libdir=$boost_save_libdir + Boost_lib_LDFLAGS="-L$boost_ldpath $boost_rpath" + Boost_lib_LDPATH="$boost_ldpath" break 6 else boost_failed_libs="$boost_failed_libs@$boost_lib@" @@ -410,14 +427,17 @@ rm -f conftest.$ac_objext ]) case $Boost_lib in #( no) _AC_MSG_LOG_CONFTEST - AC_MSG_ERROR([cannot not find the flags to link with Boost $1]) + AC_MSG_ERROR([cannot find the flags to link with Boost $1]) ;; esac -AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS]) -AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS]) +AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl +AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl CPPFLAGS=$boost_save_CPPFLAGS AS_VAR_POPDEF([Boost_lib])dnl AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl +AS_VAR_POPDEF([Boost_lib_LDPATH])dnl AS_VAR_POPDEF([Boost_lib_LIBS])dnl AC_LANG_POP([C++])dnl fi @@ -432,17 +452,31 @@ fi # The page http://beta.boost.org/doc/libs is useful: it gives the first release # version of each library (among other things). +# BOOST_DEFUN(LIBRARY, CODE) +# -------------------------- +# Define BOOST_ as a macro that runs CODE. +# +# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN. +m4_define([BOOST_DEFUN], +[m4_indir([AC_DEFUN], + m4_toupper([BOOST_$1]), +[m4_pushdef([BOOST_Library], [$1])dnl +$2 +m4_popdef([BOOST_Library])dnl +]) +]) + # BOOST_ARRAY() # ------------- # Look for Boost.Array -AC_DEFUN([BOOST_ARRAY], +BOOST_DEFUN([Array], [BOOST_FIND_HEADER([boost/array.hpp])]) # BOOST_ASIO() # ------------ # Look for Boost.Asio (new in Boost 1.35). -AC_DEFUN([BOOST_ASIO], +BOOST_DEFUN([Asio], [AC_REQUIRE([BOOST_SYSTEM])dnl BOOST_FIND_HEADER([boost/asio.hpp])]) @@ -450,14 +484,41 @@ BOOST_FIND_HEADER([boost/asio.hpp])]) # BOOST_BIND() # ------------ # Look for Boost.Bind -AC_DEFUN([BOOST_BIND], +BOOST_DEFUN([Bind], [BOOST_FIND_HEADER([boost/bind.hpp])]) +# BOOST_CHRONO() +# ------------------ +# Look for Boost.Chrono +BOOST_DEFUN([Chrono], +[# Do we have to check for Boost.System? This link-time dependency was +# added as of 1.35.0. If we have a version <1.35, we must not attempt to +# find Boost.System as it didn't exist by then. +if test $boost_major_version -ge 135; then + BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([chrono], [$1], + [boost/chrono.hpp], + [boost::chrono::system_clock::time_point d = boost::chrono::system_clock::now();]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS + +])# BOOST_CHRONO + + # BOOST_CONVERSION() # ------------------ # Look for Boost.Conversion (cast / lexical_cast) -AC_DEFUN([BOOST_CONVERSION], +BOOST_DEFUN([Conversion], [BOOST_FIND_HEADER([boost/cast.hpp]) BOOST_FIND_HEADER([boost/lexical_cast.hpp]) ])# BOOST_CONVERSION @@ -467,12 +528,31 @@ BOOST_FIND_HEADER([boost/lexical_cast.hpp]) # ----------------------------------- # Look for Boost.Date_Time. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_DATE_TIME], +BOOST_DEFUN([Date_Time], [BOOST_FIND_LIB([date_time], [$1], [boost/date_time/posix_time/posix_time.hpp], [boost::posix_time::ptime t;]) ])# BOOST_DATE_TIME +# BOOST_TIMER([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Timer. For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Timer], +[#check for Boost.System +BOOST_SYSTEM([$1]) +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([timer], [$1], + [boost/timer/timer.hpp], + [boost::timer::auto_cpu_timer t;]) +AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS +])# BOOST_TIMER # BOOST_FILESYSTEM([PREFERRED-RT-OPT]) # ------------------------------------ @@ -480,7 +560,7 @@ AC_DEFUN([BOOST_DATE_TIME], # the documentation of BOOST_FIND_LIB above. # Do not check for boost/filesystem.hpp because this file was introduced in # 1.34. -AC_DEFUN([BOOST_FILESYSTEM], +BOOST_DEFUN([Filesystem], [# Do we have to check for Boost.System? This link-time dependency was # added as of 1.35.0. If we have a version <1.35, we must not attempt to # find Boost.System as it didn't exist by then. @@ -494,6 +574,9 @@ LIBS="$LIBS $BOOST_SYSTEM_LIBS" LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" BOOST_FIND_LIB([filesystem], [$1], [boost/filesystem/path.hpp], [boost::filesystem::path p;]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_FILESYSTEM_LIBS], ["$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi LIBS=$boost_filesystem_save_LIBS LDFLAGS=$boost_filesystem_save_LDFLAGS ])# BOOST_FILESYSTEM @@ -502,7 +585,7 @@ LDFLAGS=$boost_filesystem_save_LDFLAGS # BOOST_FOREACH() # --------------- # Look for Boost.Foreach -AC_DEFUN([BOOST_FOREACH], +BOOST_DEFUN([Foreach], [BOOST_FIND_HEADER([boost/foreach.hpp])]) @@ -513,14 +596,14 @@ AC_DEFUN([BOOST_FOREACH], # standalone. It can't be compiled because it triggers the following error: # boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std' # does not name a type -AC_DEFUN([BOOST_FORMAT], +BOOST_DEFUN([Format], [BOOST_FIND_HEADER([boost/format.hpp])]) # BOOST_FUNCTION() # ---------------- # Look for Boost.Function -AC_DEFUN([BOOST_FUNCTION], +BOOST_DEFUN([Function], [BOOST_FIND_HEADER([boost/function.hpp])]) @@ -528,37 +611,60 @@ AC_DEFUN([BOOST_FUNCTION], # ------------------------------- # Look for Boost.Graphs. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_GRAPH], +BOOST_DEFUN([Graph], [BOOST_FIND_LIB([graph], [$1], [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;]) ])# BOOST_GRAPH # BOOST_IOSTREAMS([PREFERRED-RT-OPT]) -# ------------------------------- +# ----------------------------------- # Look for Boost.IOStreams. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_IOSTREAMS], +BOOST_DEFUN([IOStreams], [BOOST_FIND_LIB([iostreams], [$1], [boost/iostreams/device/file_descriptor.hpp], - [boost::iostreams::file_descriptor fd(0); fd.close();]) + [boost::iostreams::file_descriptor fd; fd.close();]) ])# BOOST_IOSTREAMS # BOOST_HASH() # ------------ # Look for Boost.Functional/Hash -AC_DEFUN([BOOST_HASH], +BOOST_DEFUN([Hash], [BOOST_FIND_HEADER([boost/functional/hash.hpp])]) # BOOST_LAMBDA() # -------------- # Look for Boost.Lambda -AC_DEFUN([BOOST_LAMBDA], +BOOST_DEFUN([Lambda], [BOOST_FIND_HEADER([boost/lambda/lambda.hpp])]) +# BOOST_LOG([PREFERRED-RT-OPT]) +# ----------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log], +[BOOST_FIND_LIB([log], [$1], + [boost/log/core/core.hpp], + [boost::log::attribute a; a.get_value();]) +])# BOOST_LOG + + +# BOOST_LOG_SETUP([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log_Setup], +[AC_REQUIRE([BOOST_LOG])dnl +BOOST_FIND_LIB([log_setup], [$1], + [boost/log/utility/init/from_settings.hpp], + [boost::log::basic_settings bs; bs.empty();]) +])# BOOST_LOG_SETUP + + # BOOST_MATH() # ------------ # Look for Boost.Math @@ -567,21 +673,21 @@ AC_DEFUN([BOOST_LAMBDA], # libboost_math_c99f, libboost_math_c99l, libboost_math_tr1, # libboost_math_tr1f, libboost_math_tr1l). This macro must be fixed to do the # right thing anyway. -AC_DEFUN([BOOST_MATH], +BOOST_DEFUN([Math], [BOOST_FIND_HEADER([boost/math/special_functions.hpp])]) # BOOST_MULTIARRAY() # ------------------ # Look for Boost.MultiArray -AC_DEFUN([BOOST_MULTIARRAY], +BOOST_DEFUN([MultiArray], [BOOST_FIND_HEADER([boost/multi_array.hpp])]) # BOOST_NUMERIC_CONVERSION() # -------------------------- # Look for Boost.NumericConversion (policy-based numeric conversion) -AC_DEFUN([BOOST_NUMERIC_CONVERSION], +BOOST_DEFUN([Numeric_Conversion], [BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp]) ])# BOOST_NUMERIC_CONVERSION @@ -589,32 +695,76 @@ AC_DEFUN([BOOST_NUMERIC_CONVERSION], # BOOST_OPTIONAL() # ---------------- # Look for Boost.Optional -AC_DEFUN([BOOST_OPTIONAL], +BOOST_DEFUN([Optional], [BOOST_FIND_HEADER([boost/optional.hpp])]) # BOOST_PREPROCESSOR() # -------------------- # Look for Boost.Preprocessor -AC_DEFUN([BOOST_PREPROCESSOR], +BOOST_DEFUN([Preprocessor], [BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])]) +# BOOST_UNORDERED() +# ----------------- +# Look for Boost.Unordered +BOOST_DEFUN([Unordered], +[BOOST_FIND_HEADER([boost/unordered_map.hpp])]) + + +# BOOST_UUID() +# ------------ +# Look for Boost.Uuid +BOOST_DEFUN([Uuid], +[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])]) + + # BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT]) # ----------------------------------------- -# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, see -# the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_PROGRAM_OPTIONS], +# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Program_Options], [BOOST_FIND_LIB([program_options], [$1], [boost/program_options.hpp], [boost::program_options::options_description d("test");]) ])# BOOST_PROGRAM_OPTIONS + +# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG) +# ------------------------------------ +# Save VARIABLE, and define it via `python-config --FLAG`. +# Substitute BOOST_PYTHON_VARIABLE. +m4_define([_BOOST_PYTHON_CONFIG], +[AC_SUBST([BOOST_PYTHON_$1], + [`python-config --$2 2>/dev/null`])dnl +boost_python_save_$1=$$1 +$1="$$1 $BOOST_PYTHON_$1"]) + + +# BOOST_PYTHON([PREFERRED-RT-OPT]) +# -------------------------------- +# Look for Boost.Python. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Python], +[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes]) +_BOOST_PYTHON_CONFIG([LDFLAGS], [ldflags]) +_BOOST_PYTHON_CONFIG([LIBS], [libs]) +m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl +BOOST_FIND_LIB([python], [$1], + [boost/python.hpp], + [], [BOOST_PYTHON_MODULE(empty) {}]) +CPPFLAGS=$boost_python_save_CPPFLAGS +LDFLAGS=$boost_python_save_LDFLAGS +LIBS=$boost_python_save_LIBS +])# BOOST_PYTHON + + # BOOST_REF() # ----------- # Look for Boost.Ref -AC_DEFUN([BOOST_REF], +BOOST_DEFUN([Ref], [BOOST_FIND_HEADER([boost/ref.hpp])]) @@ -622,7 +772,7 @@ AC_DEFUN([BOOST_REF], # ------------------------------- # Look for Boost.Regex. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_REGEX], +BOOST_DEFUN([Regex], [BOOST_FIND_LIB([regex], [$1], [boost/regex.hpp], [boost::regex exp("*"); boost::regex_match("foo", exp);]) @@ -633,19 +783,19 @@ AC_DEFUN([BOOST_REGEX], # --------------------------------------- # Look for Boost.Serialization. For the documentation of PREFERRED-RT-OPT, see # the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SERIALIZATION], +BOOST_DEFUN([Serialization], [BOOST_FIND_LIB([serialization], [$1], [boost/archive/text_oarchive.hpp], [std::ostream* o = 0; // Cheap way to get an ostream... boost::archive::text_oarchive t(*o);]) -])# BOOST_SIGNALS +])# BOOST_SERIALIZATION # BOOST_SIGNALS([PREFERRED-RT-OPT]) # --------------------------------- # Look for Boost.Signals. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SIGNALS], +BOOST_DEFUN([Signals], [BOOST_FIND_LIB([signals], [$1], [boost/signal.hpp], [boost::signal s;]) @@ -655,7 +805,7 @@ AC_DEFUN([BOOST_SIGNALS], # BOOST_SMART_PTR() # ----------------- # Look for Boost.SmartPtr -AC_DEFUN([BOOST_SMART_PTR], +BOOST_DEFUN([Smart_Ptr], [BOOST_FIND_HEADER([boost/scoped_ptr.hpp]) BOOST_FIND_HEADER([boost/shared_ptr.hpp]) ]) @@ -664,14 +814,14 @@ BOOST_FIND_HEADER([boost/shared_ptr.hpp]) # BOOST_STATICASSERT() # -------------------- # Look for Boost.StaticAssert -AC_DEFUN([BOOST_STATICASSERT], +BOOST_DEFUN([StaticAssert], [BOOST_FIND_HEADER([boost/static_assert.hpp])]) # BOOST_STRING_ALGO() # ------------------- # Look for Boost.StringAlgo -AC_DEFUN([BOOST_STRING_ALGO], +BOOST_DEFUN([String_Algo], [BOOST_FIND_HEADER([boost/algorithm/string.hpp]) ]) @@ -681,7 +831,7 @@ AC_DEFUN([BOOST_STRING_ALGO], # Look for Boost.System. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. This library was introduced in Boost # 1.35.0. -AC_DEFUN([BOOST_SYSTEM], +BOOST_DEFUN([System], [BOOST_FIND_LIB([system], [$1], [boost/system/error_code.hpp], [boost::system::error_code e; e.clear();]) @@ -692,7 +842,7 @@ AC_DEFUN([BOOST_SYSTEM], # ------------------------------ # Look for Boost.Test. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_TEST], +BOOST_DEFUN([Test], [m4_pattern_allow([^BOOST_CHECK$])dnl BOOST_FIND_LIB([unit_test_framework], [$1], [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);], @@ -707,25 +857,49 @@ BOOST_FIND_LIB([unit_test_framework], [$1], # Look for Boost.Thread. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. # FIXME: Provide an alias "BOOST_THREAD". -AC_DEFUN([BOOST_THREADS], +BOOST_DEFUN([Threads], [dnl Having the pthread flag is required at least on GCC3 where dnl boost/thread.hpp would complain if we try to compile without dnl -pthread on GNU/Linux. AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl boost_threads_save_LIBS=$LIBS +boost_threads_save_LDFLAGS=$LDFLAGS boost_threads_save_CPPFLAGS=$CPPFLAGS -LIBS="$LIBS $boost_cv_pthread_flag" +# Link-time dependency from thread to system was added as of 1.49.0. +if test $boost_major_version -ge 149; then +BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" # Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3, # boost/thread.hpp will trigger a #error if -pthread isn't used: # boost/config/requires_threads.hpp:47:5: #error "Compiler threading support # is not turned on. Please set the correct command line options for # threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)" CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag" -BOOST_FIND_LIB([thread], [$1], - [boost/thread.hpp], [boost::thread t; boost::mutex m;]) -BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $boost_cv_pthread_flag" + +# When compiling for the Windows platform, the threads library is named +# differently. +case $host_os in + (*mingw*) + BOOST_FIND_LIB([thread_win32], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + BOOST_THREAD_LDFLAGS=$BOOST_THREAD_WIN32_LDFLAGS + BOOST_THREAD_LDPATH=$BOOST_THREAD_WIN32_LDPATH + BOOST_THREAD_LIBS=$BOOST_THREAD_WIN32_LIBS + ;; + (*) + BOOST_FIND_LIB([thread], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + ;; +esac + +BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +BOOST_THREAD_LDFLAGS="$BOOST_SYSTEM_LDFLAGS" BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag" LIBS=$boost_threads_save_LIBS +LDFLAGS=$boost_threads_save_LDFLAGS CPPFLAGS=$boost_threads_save_CPPFLAGS ])# BOOST_THREADS @@ -733,14 +907,14 @@ CPPFLAGS=$boost_threads_save_CPPFLAGS # BOOST_TOKENIZER() # ----------------- # Look for Boost.Tokenizer -AC_DEFUN([BOOST_TOKENIZER], +BOOST_DEFUN([Tokenizer], [BOOST_FIND_HEADER([boost/tokenizer.hpp])]) # BOOST_TRIBOOL() # --------------- # Look for Boost.Tribool -AC_DEFUN([BOOST_TRIBOOL], +BOOST_DEFUN([Tribool], [BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp]) BOOST_FIND_HEADER([boost/logic/tribool.hpp]) ]) @@ -749,14 +923,14 @@ BOOST_FIND_HEADER([boost/logic/tribool.hpp]) # BOOST_TUPLE() # ------------- # Look for Boost.Tuple -AC_DEFUN([BOOST_TUPLE], +BOOST_DEFUN([Tuple], [BOOST_FIND_HEADER([boost/tuple/tuple.hpp])]) # BOOST_TYPETRAITS() # -------------------- # Look for Boost.TypeTraits -AC_DEFUN([BOOST_TYPETRAITS], +BOOST_DEFUN([TypeTraits], [BOOST_FIND_HEADER([boost/type_traits.hpp])]) @@ -764,14 +938,14 @@ AC_DEFUN([BOOST_TYPETRAITS], # --------------- # Look for Boost.Utility (noncopyable, result_of, base-from-member idiom, # etc.) -AC_DEFUN([BOOST_UTILITY], +BOOST_DEFUN([Utility], [BOOST_FIND_HEADER([boost/utility.hpp])]) # BOOST_VARIANT() # --------------- # Look for Boost.Variant. -AC_DEFUN([BOOST_VARIANT], +BOOST_DEFUN([Variant], [BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp]) BOOST_FIND_HEADER([boost/variant.hpp])]) @@ -782,15 +956,15 @@ BOOST_FIND_HEADER([boost/variant.hpp])]) # call BOOST_THREADS first. # Look for Boost.Wave. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_WAVE], +BOOST_DEFUN([Wave], [AC_REQUIRE([BOOST_FILESYSTEM])dnl AC_REQUIRE([BOOST_DATE_TIME])dnl boost_wave_save_LIBS=$LIBS boost_wave_save_LDFLAGS=$LDFLAGS m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl -LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS\ +LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \ $BOOST_THREAD_LIBS" -LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS\ +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \ $BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS" BOOST_FIND_LIB([wave], [$1], [boost/wave.hpp], @@ -803,7 +977,7 @@ LDFLAGS=$boost_wave_save_LDFLAGS # BOOST_XPRESSIVE() # ----------------- # Look for Boost.Xpressive (new since 1.36.0). -AC_DEFUN([BOOST_XPRESSIVE], +BOOST_DEFUN([Xpressive], [BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])]) @@ -893,8 +1067,9 @@ AC_DEFUN([_BOOST_FIND_COMPILER_TAG], [AC_REQUIRE([AC_PROG_CXX])dnl AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag], -[AC_LANG_PUSH([C++])dnl - boost_cv_lib_tag=unknown +[boost_cv_lib_tag=unknown +if test x$boost_cv_inc_path != xno; then + AC_LANG_PUSH([C++])dnl # The following tests are mostly inspired by boost/config/auto_link.hpp # The list is sorted to most recent/common to oldest compiler (in order # to increase the likelihood of finding the right compiler with the @@ -908,8 +1083,12 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] # como, edg, kcc, bck, mp, sw, tru, xlc # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines # the same defines as GCC's). - # TODO: Move the test on GCC 4.4 up once it's released. for i in \ + _BOOST_gcc_test(4, 8) \ + _BOOST_gcc_test(4, 7) \ + _BOOST_gcc_test(4, 6) \ + _BOOST_gcc_test(4, 5) \ + _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(4, 3) \ _BOOST_gcc_test(4, 2) \ _BOOST_gcc_test(4, 1) \ @@ -929,7 +1108,6 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] "defined __ICC && (defined __unix || defined __unix__) @ il" \ "defined __ICL @ iw" \ "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \ - _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(2, 95) \ "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \ "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \ @@ -969,7 +1147,7 @@ AC_LANG_POP([C++])dnl boost_cv_lib_tag= ;; esac -])dnl end of AC_CACHE_CHECK +fi])dnl end of AC_CACHE_CHECK ])# _BOOST_FIND_COMPILER_TAG -- cgit v1.2.3 From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 20 Jan 2013 12:31:03 +0000 Subject: Better delimiters, cross-platform fixes --- klm/lm/builder/corpus_count.cc | 3 ++- klm/lm/filter/arpa_io.cc | 36 +++++++++++------------------------- klm/lm/filter/arpa_io.hh | 27 ++++++++++----------------- klm/util/stream/sort.hh | 5 +++-- klm/util/stream/timer.hh | 8 +++++--- 5 files changed, 31 insertions(+), 48 deletions(-) (limited to 'klm/util/stream/sort.hh') diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index 8c3de57d..abea4ed0 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) { const WordIndex end_sentence = vocab.Lookup(""); Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); uint64_t count = 0; + StringPiece delimiters("\0\t\r ", 4); try { while(true) { StringPiece line(from_.ReadLine()); writer.StartSentence(); - for (util::TokenIter w(line, " \t"); w; ++w) { + for (util::TokenIter w(line, delimiters); w; ++w) { WordIndex word = vocab.Lookup(*w); UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing in the future."); writer.Append(word); diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc index caf8df95..f8568ac4 100644 --- a/klm/lm/filter/arpa_io.cc +++ b/klm/lm/filter/arpa_io.cc @@ -12,38 +12,24 @@ namespace lm { -ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") { - what_.append(message.data(), message.size()); +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { + *this << message; } ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { - what_ = "Error: "; - what_.append(message.data(), message.size()); - what_ += " in line '"; - what_.append(line.data(), line.size()); - what_ += "'."; + *this << message << " in line " << line; } -ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() - : what_(std::string(message) + " file " + file_name), file_name_(file_name) { - if (errno) { - char buf[1024]; - buf[0] = 0; -#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE - const char *add = buf; - if (!strerror_r(errno, buf, 1024)) { -#else - const char *add = strerror_r(errno, buf, 1024); - if (add) { -#endif - what_ += " :"; - what_ += add; - } - } +ARPAInputException::~ARPAInputException() throw() {} + +ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() { + *this << message << " in file " << file_name; } +ARPAOutputException::~ARPAOutputException() throw() {} + // Seeking is the responsibility of the caller. -void WriteCounts(std::ostream &out, const std::vector &number) { +void WriteCounts(std::ostream &out, const std::vector &number) { out << "\n\\data\\\n"; for (unsigned int i = 0; i < number.size(); ++i) { out << "ngram " << i+1 << "=" << number[i] << '\n'; @@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector &number) { out << '\n'; } -size_t SizeNeededForCounts(const std::vector &number) { +size_t SizeNeededForCounts(const std::vector &number) { std::ostringstream buf; WriteCounts(buf, number); return buf.tellp(); diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh index 90f48447..5b31620b 100644 --- a/klm/lm/filter/arpa_io.hh +++ b/klm/lm/filter/arpa_io.hh @@ -16,6 +16,7 @@ #include #include +#include namespace util { class FilePiece; } @@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception { public: explicit ARPAInputException(const StringPiece &message) throw(); explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); - virtual ~ARPAInputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } - - private: - std::string what_; + virtual ~ARPAInputException() throw(); }; -class ARPAOutputException : public std::exception { +class ARPAOutputException : public util::ErrnoException { public: ARPAOutputException(const char *prefix, const std::string &file_name) throw(); - virtual ~ARPAOutputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } + virtual ~ARPAOutputException() throw(); const std::string &File() const throw() { return file_name_; } private: - std::string what_; const std::string file_name_; }; // Handling for the counts of n-grams at the beginning of ARPA files. -size_t SizeNeededForCounts(const std::vector &number); +size_t SizeNeededForCounts(const std::vector &number); /* Writes an ARPA file. This has to be seekable so the counts can be written * at the end. Hence, I just have it own a std::fstream instead of accepting - * a separately held std::ostream. + * a separately held std::ostream. TODO: use the fast one from estimation. */ class ARPAOutput : boost::noncopyable { public: @@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable { boost::scoped_array buffer_; std::fstream file_; size_t fast_counter_; - std::vector counts_; + std::vector counts_; }; -template void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) { +template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { ReadNGramHeader(in, length); out.BeginLength(length); - for (size_t i = 0; i < number; ++i) { + for (uint64_t i = 0; i < number; ++i) { StringPiece line = in.ReadLine(); util::TokenIter tabber(line, '\t'); if (!tabber) throw ARPAInputException("blank line", line); @@ -107,7 +100,7 @@ template void ReadNGrams(util::FilePiece &in, unsigned int length } template void ReadARPA(util::FilePiece &in_lm, Output &out) { - std::vector number; + std::vector number; ReadARPACounts(in_lm, number); out.ReserveForCounts(SizeNeededForCounts(number)); for (unsigned int i = 0; i < number.size(); ++i) { diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index df57fa41..a86f160f 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,8 +259,9 @@ template class MergingReader { while (in_offsets_->RemainingBlocks()) { // Use bigger buffers if there's less remaining. - uint64_t per_buffer = std::max(static_cast(buffer_size_), - static_cast(total_memory_ / in_offsets_->RemainingBlocks())); + uint64_t per_buffer = static_cast(std::max( + buffer_size_, + static_cast((static_cast(total_memory_) / in_offsets_->RemainingBlocks())))); per_buffer -= per_buffer % entry_size; assert(per_buffer); diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh index 50e94fe8..7e1a5885 100644 --- a/klm/util/stream/timer.hh +++ b/klm/util/stream/timer.hh @@ -1,14 +1,16 @@ #ifndef UTIL_STREAM_TIMER__ #define UTIL_STREAM_TIMER__ -#include +// Sorry Jon, this was adding library dependencies in Moses and people complained. + +/*#include #if BOOST_VERSION >= 104800 #include #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) #else -//#warning Using Boost older than 1.48. Timing information will not be available. +//#warning Using Boost older than 1.48. Timing information will not be available.*/ #define UTIL_TIMER(str) -#endif +//#endif #endif // UTIL_STREAM_TIMER__ -- cgit v1.2.3 From b35a7f3a96ff8ae42e15922dd6949bf9f5d15501 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 22 Jan 2013 21:37:49 +0000 Subject: KenLM 58da338b --- klm/lm/Makefile.am | 4 +- klm/lm/build_binary.cc | 228 -------------------------------- klm/lm/build_binary_main.cc | 228 ++++++++++++++++++++++++++++++++ klm/lm/builder/Makefile.am | 2 +- klm/lm/builder/discount.hh | 2 +- klm/lm/builder/lmplz_main.cc | 94 +++++++++++++ klm/lm/builder/main.cc | 94 ------------- klm/lm/filter/filter_main.cc | 248 ++++++++++++++++++++++++++++++++++ klm/lm/filter/main.cc | 249 ----------------------------------- klm/lm/filter/phrase.hh | 1 + klm/lm/filter/vocab.hh | 1 + klm/lm/fragment.cc | 37 ------ klm/lm/fragment_main.cc | 37 ++++++ klm/lm/kenlm_max_order_main.cc | 6 + klm/lm/max_order.cc | 6 - klm/lm/ngram_query.cc | 47 ------- klm/lm/query_main.cc | 47 +++++++ klm/util/Makefile.am | 1 + klm/util/double-conversion/strtod.cc | 4 + klm/util/file.cc | 47 +++++-- klm/util/file_piece.cc | 22 +++- klm/util/file_piece.hh | 10 ++ klm/util/file_piece_test.cc | 14 ++ klm/util/have.hh | 4 - klm/util/read_compressed.cc | 28 +++- klm/util/read_compressed.hh | 7 + klm/util/read_compressed_test.cc | 55 +++++--- klm/util/stream/io.cc | 8 +- klm/util/stream/sort.hh | 12 +- klm/util/string_piece.cc | 3 +- klm/util/string_piece.hh | 41 ------ klm/util/string_piece_hash.hh | 43 ++++++ klm/util/usage.cc | 2 +- 33 files changed, 875 insertions(+), 757 deletions(-) delete mode 100644 klm/lm/build_binary.cc create mode 100644 klm/lm/build_binary_main.cc create mode 100644 klm/lm/builder/lmplz_main.cc delete mode 100644 klm/lm/builder/main.cc create mode 100644 klm/lm/filter/filter_main.cc delete mode 100644 klm/lm/filter/main.cc delete mode 100644 klm/lm/fragment.cc create mode 100644 klm/lm/fragment_main.cc create mode 100644 klm/lm/kenlm_max_order_main.cc delete mode 100644 klm/lm/max_order.cc delete mode 100644 klm/lm/ngram_query.cc create mode 100644 klm/lm/query_main.cc create mode 100644 klm/util/string_piece_hash.hh (limited to 'klm/util/stream/sort.hh') diff --git a/klm/lm/Makefile.am b/klm/lm/Makefile.am index 45f40c43..48b0ba34 100644 --- a/klm/lm/Makefile.am +++ b/klm/lm/Makefile.am @@ -1,9 +1,9 @@ bin_PROGRAMS = build_binary ngram_query -build_binary_SOURCES = build_binary.cc +build_binary_SOURCES = build_binary_main.cc build_binary_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz -ngram_query_SOURCES = ngram_query.cc +ngram_query_SOURCES = query_main.cc ngram_query_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz #noinst_PROGRAMS = \ diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc deleted file mode 100644 index ab2c0c32..00000000 --- a/klm/lm/build_binary.cc +++ /dev/null @@ -1,228 +0,0 @@ -#include "lm/model.hh" -#include "lm/sizes.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef WIN32 -#include "util/getopt.hh" -#else -#include -#endif - -namespace lm { -namespace ngram { -namespace { - -void Usage(const char *name, const char *default_mem) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" -"-u sets the log10 probability for if the ARPA file does not have one.\n" -" Default is -100. The ARPA file will always take precedence.\n" -"-s allows models to be built even if they do not have and .\n" -"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" -"-w mmap|after determines how writing is done.\n" -" mmap maps the binary file and writes to it. Default for trie.\n" -" after allocates anonymous memory, builds, and writes. Default for probing.\n" -"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" -" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" -" the same data structure as being built. All files must have the same\n" -" vocabulary. For probing, the unigrams must be in the same order.\n\n" -"type is either probing or trie. Default is probing.\n\n" -"probing uses a probing hash table. It is the fastest but uses the most memory.\n" -"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" -"trie is a straightforward trie with bit-level packing. It uses the least\n" -"memory and is still faster than SRI or IRST. Building the trie format uses an\n" -"on-disk sort to save memory.\n" -"-T is the temporary directory prefix. Default is the output file name.\n" -"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" -" with GNU sort. The number is followed by a unit: \% for percent of physical\n" -" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" -" Default unit is K for Kilobytes.\n" -"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" -"-b sets backoff quantization bits. Requires -q and defaults to that value.\n" -"-a compresses pointers using an array of offsets. The parameter is the\n" -" maximum number of bits encoded by the array. Memory is minimized subject\n" -" to the maximum, so pick 255 to minimize memory.\n\n" -"Get a memory estimate by passing an ARPA file without an output file name.\n"; - exit(1); -} - -// I could really use boost::lexical_cast right about now. -float ParseFloat(const char *from) { - char *end; - float ret = strtod(from, &end); - if (*end) throw util::ParseNumberException(from); - return ret; -} -unsigned long int ParseUInt(const char *from) { - char *end; - unsigned long int ret = strtoul(from, &end, 10); - if (*end) throw util::ParseNumberException(from); - return ret; -} - -uint8_t ParseBitCount(const char *from) { - unsigned long val = ParseUInt(from); - if (val > 25) { - util::ParseNumberException e(from); - e << " bit counts are limited to 25."; - } - return val; -} - -void ParseFileList(const char *from, std::vector &to) { - to.clear(); - while (true) { - const char *i; - for (i = from; *i && *i != ' '; ++i) {} - to.push_back(std::string(from, i - from)); - if (!*i) break; - from = i + 1; - } -} - -void ProbingQuantizationUnsupported() { - std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; - exit(1); -} - -} // namespace ngram -} // namespace lm -} // namespace - -int main(int argc, char *argv[]) { - using namespace lm::ngram; - - const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; - - try { - bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; - lm::ngram::Config config; - config.building_memory = util::ParseSize(default_mem); - int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { - switch(opt) { - case 'q': - config.prob_bits = ParseBitCount(optarg); - if (!set_backoff_bits) config.backoff_bits = config.prob_bits; - quantize = true; - break; - case 'b': - config.backoff_bits = ParseBitCount(optarg); - set_backoff_bits = true; - break; - case 'a': - config.pointer_bhiksha_bits = ParseBitCount(optarg); - bhiksha = true; - break; - case 'u': - config.unknown_missing_logprob = ParseFloat(optarg); - break; - case 'p': - config.probing_multiplier = ParseFloat(optarg); - break; - case 't': // legacy - case 'T': - config.temporary_directory_prefix = optarg; - break; - case 'm': // legacy - config.building_memory = ParseUInt(optarg) * 1048576; - break; - case 'S': - config.building_memory = std::min(static_cast(std::numeric_limits::max()), util::ParseSize(optarg)); - break; - case 'w': - set_write_method = true; - if (!strcmp(optarg, "mmap")) { - config.write_method = Config::WRITE_MMAP; - } else if (!strcmp(optarg, "after")) { - config.write_method = Config::WRITE_AFTER; - } else { - Usage(argv[0], default_mem); - } - break; - case 's': - config.sentence_marker_missing = lm::SILENT; - break; - case 'i': - config.positive_log_probability = lm::SILENT; - break; - case 'r': - rest = true; - ParseFileList(optarg, config.rest_lower_files); - config.rest_function = Config::REST_LOWER; - break; - default: - Usage(argv[0], default_mem); - } - } - if (!quantize && set_backoff_bits) { - std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; - abort(); - } - if (optind + 1 == argc) { - ShowSizes(argv[optind], config); - return 0; - } - const char *model_type; - const char *from_file; - - if (optind + 2 == argc) { - model_type = "probing"; - from_file = argv[optind]; - config.write_mmap = argv[optind + 1]; - } else if (optind + 3 == argc) { - model_type = argv[optind]; - from_file = argv[optind + 1]; - config.write_mmap = argv[optind + 2]; - } else { - Usage(argv[0], default_mem); - } - if (!strcmp(model_type, "probing")) { - if (!set_write_method) config.write_method = Config::WRITE_AFTER; - if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); - if (rest) { - RestProbingModel(from_file, config); - } else { - ProbingModel(from_file, config); - } - } else if (!strcmp(model_type, "trie")) { - if (rest) { - std::cerr << "Rest + trie is not supported yet." << std::endl; - return 1; - } - if (!set_write_method) config.write_method = Config::WRITE_MMAP; - if (quantize) { - if (bhiksha) { - QuantArrayTrieModel(from_file, config); - } else { - QuantTrieModel(from_file, config); - } - } else { - if (bhiksha) { - ArrayTrieModel(from_file, config); - } else { - TrieModel(from_file, config); - } - } - } else { - Usage(argv[0], default_mem); - } - } - catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - std::cerr << "ERROR" << std::endl; - return 1; - } - std::cerr << "SUCCESS" << std::endl; - return 0; -} diff --git a/klm/lm/build_binary_main.cc b/klm/lm/build_binary_main.cc new file mode 100644 index 00000000..ab2c0c32 --- /dev/null +++ b/klm/lm/build_binary_main.cc @@ -0,0 +1,228 @@ +#include "lm/model.hh" +#include "lm/sizes.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef WIN32 +#include "util/getopt.hh" +#else +#include +#endif + +namespace lm { +namespace ngram { +namespace { + +void Usage(const char *name, const char *default_mem) { + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" +"-u sets the log10 probability for if the ARPA file does not have one.\n" +" Default is -100. The ARPA file will always take precedence.\n" +"-s allows models to be built even if they do not have and .\n" +"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-w mmap|after determines how writing is done.\n" +" mmap maps the binary file and writes to it. Default for trie.\n" +" after allocates anonymous memory, builds, and writes. Default for probing.\n" +"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" +" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" +" the same data structure as being built. All files must have the same\n" +" vocabulary. For probing, the unigrams must be in the same order.\n\n" +"type is either probing or trie. Default is probing.\n\n" +"probing uses a probing hash table. It is the fastest but uses the most memory.\n" +"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" +"trie is a straightforward trie with bit-level packing. It uses the least\n" +"memory and is still faster than SRI or IRST. Building the trie format uses an\n" +"on-disk sort to save memory.\n" +"-T is the temporary directory prefix. Default is the output file name.\n" +"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" +" with GNU sort. The number is followed by a unit: \% for percent of physical\n" +" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" +" Default unit is K for Kilobytes.\n" +"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" +"-b sets backoff quantization bits. Requires -q and defaults to that value.\n" +"-a compresses pointers using an array of offsets. The parameter is the\n" +" maximum number of bits encoded by the array. Memory is minimized subject\n" +" to the maximum, so pick 255 to minimize memory.\n\n" +"Get a memory estimate by passing an ARPA file without an output file name.\n"; + exit(1); +} + +// I could really use boost::lexical_cast right about now. +float ParseFloat(const char *from) { + char *end; + float ret = strtod(from, &end); + if (*end) throw util::ParseNumberException(from); + return ret; +} +unsigned long int ParseUInt(const char *from) { + char *end; + unsigned long int ret = strtoul(from, &end, 10); + if (*end) throw util::ParseNumberException(from); + return ret; +} + +uint8_t ParseBitCount(const char *from) { + unsigned long val = ParseUInt(from); + if (val > 25) { + util::ParseNumberException e(from); + e << " bit counts are limited to 25."; + } + return val; +} + +void ParseFileList(const char *from, std::vector &to) { + to.clear(); + while (true) { + const char *i; + for (i = from; *i && *i != ' '; ++i) {} + to.push_back(std::string(from, i - from)); + if (!*i) break; + from = i + 1; + } +} + +void ProbingQuantizationUnsupported() { + std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; + exit(1); +} + +} // namespace ngram +} // namespace lm +} // namespace + +int main(int argc, char *argv[]) { + using namespace lm::ngram; + + const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; + + try { + bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; + lm::ngram::Config config; + config.building_memory = util::ParseSize(default_mem); + int opt; + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { + switch(opt) { + case 'q': + config.prob_bits = ParseBitCount(optarg); + if (!set_backoff_bits) config.backoff_bits = config.prob_bits; + quantize = true; + break; + case 'b': + config.backoff_bits = ParseBitCount(optarg); + set_backoff_bits = true; + break; + case 'a': + config.pointer_bhiksha_bits = ParseBitCount(optarg); + bhiksha = true; + break; + case 'u': + config.unknown_missing_logprob = ParseFloat(optarg); + break; + case 'p': + config.probing_multiplier = ParseFloat(optarg); + break; + case 't': // legacy + case 'T': + config.temporary_directory_prefix = optarg; + break; + case 'm': // legacy + config.building_memory = ParseUInt(optarg) * 1048576; + break; + case 'S': + config.building_memory = std::min(static_cast(std::numeric_limits::max()), util::ParseSize(optarg)); + break; + case 'w': + set_write_method = true; + if (!strcmp(optarg, "mmap")) { + config.write_method = Config::WRITE_MMAP; + } else if (!strcmp(optarg, "after")) { + config.write_method = Config::WRITE_AFTER; + } else { + Usage(argv[0], default_mem); + } + break; + case 's': + config.sentence_marker_missing = lm::SILENT; + break; + case 'i': + config.positive_log_probability = lm::SILENT; + break; + case 'r': + rest = true; + ParseFileList(optarg, config.rest_lower_files); + config.rest_function = Config::REST_LOWER; + break; + default: + Usage(argv[0], default_mem); + } + } + if (!quantize && set_backoff_bits) { + std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; + abort(); + } + if (optind + 1 == argc) { + ShowSizes(argv[optind], config); + return 0; + } + const char *model_type; + const char *from_file; + + if (optind + 2 == argc) { + model_type = "probing"; + from_file = argv[optind]; + config.write_mmap = argv[optind + 1]; + } else if (optind + 3 == argc) { + model_type = argv[optind]; + from_file = argv[optind + 1]; + config.write_mmap = argv[optind + 2]; + } else { + Usage(argv[0], default_mem); + } + if (!strcmp(model_type, "probing")) { + if (!set_write_method) config.write_method = Config::WRITE_AFTER; + if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); + if (rest) { + RestProbingModel(from_file, config); + } else { + ProbingModel(from_file, config); + } + } else if (!strcmp(model_type, "trie")) { + if (rest) { + std::cerr << "Rest + trie is not supported yet." << std::endl; + return 1; + } + if (!set_write_method) config.write_method = Config::WRITE_MMAP; + if (quantize) { + if (bhiksha) { + QuantArrayTrieModel(from_file, config); + } else { + QuantTrieModel(from_file, config); + } + } else { + if (bhiksha) { + ArrayTrieModel(from_file, config); + } else { + TrieModel(from_file, config); + } + } + } else { + Usage(argv[0], default_mem); + } + } + catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + std::cerr << "ERROR" << std::endl; + return 1; + } + std::cerr << "SUCCESS" << std::endl; + return 0; +} diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am index b5c147fd..317e03ce 100644 --- a/klm/lm/builder/Makefile.am +++ b/klm/lm/builder/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = builder builder_SOURCES = \ - main.cc \ + lmplz_main.cc \ adjust_counts.cc \ adjust_counts.hh \ corpus_count.cc \ diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh index 754fb20d..4d0aa4fd 100644 --- a/klm/lm/builder/discount.hh +++ b/klm/lm/builder/discount.hh @@ -3,7 +3,7 @@ #include -#include +#include namespace lm { namespace builder { diff --git a/klm/lm/builder/lmplz_main.cc b/klm/lm/builder/lmplz_main.cc new file mode 100644 index 00000000..90b9dca2 --- /dev/null +++ b/klm/lm/builder/lmplz_main.cc @@ -0,0 +1,94 @@ +#include "lm/builder/pipeline.hh" +#include "util/file.hh" +#include "util/file_piece.hh" +#include "util/usage.hh" + +#include + +#include + +namespace { +class SizeNotify { + public: + SizeNotify(std::size_t &out) : behind_(out) {} + + void operator()(const std::string &from) { + behind_ = util::ParseSize(from); + } + + private: + std::size_t &behind_; +}; + +boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { + return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); +} + +} // namespace + +int main(int argc, char *argv[]) { + try { + namespace po = boost::program_options; + po::options_description options("Language model building options"); + lm::builder::PipelineConfig pipeline; + + options.add_options() + ("order,o", po::value(&pipeline.order)->required(), "Order of the model") + ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") + ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") + ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") + ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") + ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") + ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") + ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") + ("vocab_file", po::value(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") + ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); + if (argc == 1) { + std::cerr << + "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" + "Please cite:\n" + "@inproceedings{kenlm,\n" + "author = {Kenneth Heafield},\n" + "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" + "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" + "month = {July}, year={2011},\n" + "address = {Edinburgh, UK},\n" + "publisher = {Association for Computational Linguistics},\n" + "}\n\n" + "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" + "the model (-o) is the only mandatory option. As this is an on-disk program,\n" + "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" + "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" + "Valid units are \% for percentage of memory (supported platforms only) and (in\n" + "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; + std::cerr << options << std::endl; + return 1; + } + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + po::notify(vm); + + util::NormalizeTempPrefix(pipeline.sort.temp_prefix); + + lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; + // TODO: evaluate options for these. + initial.adder_in.total_memory = 32768; + initial.adder_in.block_count = 2; + initial.adder_out.total_memory = 32768; + initial.adder_out.block_count = 2; + pipeline.read_backoffs = initial.adder_out; + + // Read from stdin + try { + lm::builder::Pipeline(pipeline, 0, 1); + } catch (const util::MallocException &e) { + std::cerr << e.what() << std::endl; + std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; + return 1; + } + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } +} diff --git a/klm/lm/builder/main.cc b/klm/lm/builder/main.cc deleted file mode 100644 index 90b9dca2..00000000 --- a/klm/lm/builder/main.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "lm/builder/pipeline.hh" -#include "util/file.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include - -#include - -namespace { -class SizeNotify { - public: - SizeNotify(std::size_t &out) : behind_(out) {} - - void operator()(const std::string &from) { - behind_ = util::ParseSize(from); - } - - private: - std::size_t &behind_; -}; - -boost::program_options::typed_value *SizeOption(std::size_t &to, const char *default_value) { - return boost::program_options::value()->notifier(SizeNotify(to))->default_value(default_value); -} - -} // namespace - -int main(int argc, char *argv[]) { - try { - namespace po = boost::program_options; - po::options_description options("Language model building options"); - lm::builder::PipelineConfig pipeline; - - options.add_options() - ("order,o", po::value(&pipeline.order)->required(), "Order of the model") - ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)") - ("temp_prefix,T", po::value(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") - ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory") - ("vocab_memory", SizeOption(pipeline.assume_vocab_hash_size, "50M"), "Assume that the vocabulary hash table will use this much memory for purposes of calculating total memory in the count step") - ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow") - ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)") - ("block_count", po::value(&pipeline.block_count)->default_value(2), "Block count (per order)") - ("vocab_file", po::value(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file") - ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc."); - if (argc == 1) { - std::cerr << - "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n" - "Please cite:\n" - "@inproceedings{kenlm,\n" - "author = {Kenneth Heafield},\n" - "title = {{KenLM}: Faster and Smaller Language Model Queries},\n" - "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n" - "month = {July}, year={2011},\n" - "address = {Edinburgh, UK},\n" - "publisher = {Association for Computational Linguistics},\n" - "}\n\n" - "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n" - "the model (-o) is the only mandatory option. As this is an on-disk program,\n" - "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n" - "Memory sizes are specified like GNU sort: a number followed by a unit character.\n" - "Valid units are \% for percentage of memory (supported platforms only) and (in\n" - "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n\n"; - std::cerr << options << std::endl; - return 1; - } - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - po::notify(vm); - - util::NormalizeTempPrefix(pipeline.sort.temp_prefix); - - lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; - // TODO: evaluate options for these. - initial.adder_in.total_memory = 32768; - initial.adder_in.block_count = 2; - initial.adder_out.total_memory = 32768; - initial.adder_out.block_count = 2; - pipeline.read_backoffs = initial.adder_out; - - // Read from stdin - try { - lm::builder::Pipeline(pipeline, 0, 1); - } catch (const util::MallocException &e) { - std::cerr << e.what() << std::endl; - std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as() << std::endl; - return 1; - } - util::PrintUsage(std::cerr); - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } -} diff --git a/klm/lm/filter/filter_main.cc b/klm/lm/filter/filter_main.cc new file mode 100644 index 00000000..1a4ba84f --- /dev/null +++ b/klm/lm/filter/filter_main.cc @@ -0,0 +1,248 @@ +#include "lm/filter/arpa_io.hh" +#include "lm/filter/format.hh" +#include "lm/filter/phrase.hh" +#ifndef NTHREAD +#include "lm/filter/thread.hh" +#endif +#include "lm/filter/vocab.hh" +#include "lm/filter/wrapper.hh" +#include "util/file_piece.hh" + +#include + +#include +#include +#include +#include + +namespace lm { +namespace { + +void DisplayHelp(const char *name) { + std::cerr + << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" + "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" + " parser.\n" + "single mode treats the entire input as a single sentence.\n" + "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" + " a separate line. A separate file is created for each file by appending the\n" + " 0-indexed line number to the output file name.\n" + "union mode produces one filtered model that is the union of models created by\n" + " multiple mode.\n\n" + "context means only the context (all but last word) has to pass the filter, but\n" + " the entire n-gram is output.\n\n" + "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" + " phrases can generate the n-gram when assembled in arbitrary order and\n" + " clipped. Currently works with multiple or union mode.\n\n" + "The file format is set by [raw|arpa] with default arpa:\n" + "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" + " text. This is useful for ngram count files.\n" + "arpa means the ARPA file format for n-gram language models.\n\n" +#ifndef NTHREAD + "threads:m sets m threads (default: conccurrency detected by boost)\n" + "batch_size:m sets the batch size for threading. Expect memory usage from this\n" + " of 2*threads*batch_size n-grams.\n\n" +#else + "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" + " threading, compile without this flag against Boost >=1.42.0.\n\n" +#endif + "There are two inputs: vocabulary and model. Either may be given as a file\n" + " while the other is on stdin. Specify the type given as a file using\n" + " vocab: or model: before the file name. \n\n" + "For ARPA format, the output must be seekable. For raw format, it can be a\n" + " stream i.e. /dev/stdout\n"; +} + +typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; +typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; + +struct Config { + Config() : +#ifndef NTHREAD + batch_size(25000), + threads(boost::thread::hardware_concurrency()), +#endif + phrase(false), + context(false), + format(FORMAT_ARPA) + { +#ifndef NTHREAD + if (!threads) threads = 1; +#endif + } + +#ifndef NTHREAD + size_t batch_size; + size_t threads; +#endif + bool phrase; + bool context; + FilterMode mode; + Format format; +}; + +template void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { +#ifndef NTHREAD + if (config.threads == 1) { +#endif + Format::RunFilter(in_lm, filter, output); +#ifndef NTHREAD + } else { + typedef Controller Threaded; + Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); + Format::RunFilter(in_lm, threading, output); + } +#endif +} + +template void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { + if (config.context) { + ContextFilter context_filter(filter); + RunThreadedFilter, OutputBuffer, Output>(config, in_lm, context_filter, output); + } else { + RunThreadedFilter(config, in_lm, filter, output); + } +} + +template void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { + typedef BinaryFilter Filter; + RunContextFilter(config, in_lm, Filter(binary), out); +} + +template void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { + if (config.mode == MODE_MULTIPLE) { + if (config.phrase) { + typedef phrase::Multiple Filter; + phrase::Substrings substrings; + typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); + RunContextFilter(config, in_lm, Filter(substrings), out); + } else { + typedef vocab::Multiple Filter; + boost::unordered_map > words; + typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); + RunContextFilter(config, in_lm, Filter(words), out); + } + return; + } + + typename Format::Output out(out_name); + + if (config.mode == MODE_COPY) { + Format::Copy(in_lm, out); + return; + } + + if (config.mode == MODE_SINGLE) { + vocab::Single::Words words; + vocab::ReadSingle(in_vocab, words); + DispatchBinaryFilter(config, in_lm, vocab::Single(words), out); + return; + } + + if (config.mode == MODE_UNION) { + if (config.phrase) { + phrase::Substrings substrings; + phrase::ReadMultiple(in_vocab, substrings); + DispatchBinaryFilter(config, in_lm, phrase::Union(substrings), out); + } else { + vocab::Union::Words words; + vocab::ReadMultiple(in_vocab, words); + DispatchBinaryFilter(config, in_lm, vocab::Union(words), out); + } + return; + } +} + +} // namespace +} // namespace lm + +int main(int argc, char *argv[]) { + if (argc < 4) { + lm::DisplayHelp(argv[0]); + return 1; + } + + // I used to have boost::program_options, but some users didn't want to compile boost. + lm::Config config; + config.mode = lm::MODE_UNSET; + for (int i = 1; i < argc - 2; ++i) { + const char *str = argv[i]; + if (!std::strcmp(str, "copy")) { + config.mode = lm::MODE_COPY; + } else if (!std::strcmp(str, "single")) { + config.mode = lm::MODE_SINGLE; + } else if (!std::strcmp(str, "multiple")) { + config.mode = lm::MODE_MULTIPLE; + } else if (!std::strcmp(str, "union")) { + config.mode = lm::MODE_UNION; + } else if (!std::strcmp(str, "phrase")) { + config.phrase = true; + } else if (!std::strcmp(str, "context")) { + config.context = true; + } else if (!std::strcmp(str, "arpa")) { + config.format = lm::FORMAT_ARPA; + } else if (!std::strcmp(str, "raw")) { + config.format = lm::FORMAT_COUNT; +#ifndef NTHREAD + } else if (!std::strncmp(str, "threads:", 8)) { + config.threads = boost::lexical_cast(str + 8); + if (!config.threads) { + std::cerr << "Specify at least one thread." << std::endl; + return 1; + } + } else if (!std::strncmp(str, "batch_size:", 11)) { + config.batch_size = boost::lexical_cast(str + 11); + if (config.batch_size < 5000) { + std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; + if (!config.batch_size) return 1; + } +#endif + } else { + lm::DisplayHelp(argv[0]); + return 1; + } + } + + if (config.mode == lm::MODE_UNSET) { + lm::DisplayHelp(argv[0]); + return 1; + } + + if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { + std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl; + return 1; + } + + bool cmd_is_model = true; + const char *cmd_input = argv[argc - 2]; + if (!strncmp(cmd_input, "vocab:", 6)) { + cmd_is_model = false; + cmd_input += 6; + } else if (!strncmp(cmd_input, "model:", 6)) { + cmd_input += 6; + } else if (strchr(cmd_input, ':')) { + errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); + } else { + std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; + } + std::ifstream cmd_file; + std::istream *vocab; + if (cmd_is_model) { + vocab = &std::cin; + } else { + cmd_file.open(cmd_input, std::ios::in); + if (!cmd_file) { + err(2, "Could not open input file %s", cmd_input); + } + vocab = &cmd_file; + } + + util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); + + if (config.format == lm::FORMAT_ARPA) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } else if (config.format == lm::FORMAT_COUNT) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } + return 0; +} diff --git a/klm/lm/filter/main.cc b/klm/lm/filter/main.cc deleted file mode 100644 index c42243e2..00000000 --- a/klm/lm/filter/main.cc +++ /dev/null @@ -1,249 +0,0 @@ -#include "lm/filter/arpa_io.hh" -#include "lm/filter/format.hh" -#include "lm/filter/phrase.hh" -#ifndef NTHREAD -#include "lm/filter/thread.hh" -#endif -#include "lm/filter/vocab.hh" -#include "lm/filter/wrapper.hh" -#include "util/file_piece.hh" - -#include - -#include -#include -#include -#include - -namespace lm { -namespace { - -void DisplayHelp(const char *name) { - std::cerr - << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" - "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" - " parser.\n" - "single mode treats the entire input as a single sentence.\n" - "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" - " a separate line. A separate file is created for each file by appending the\n" - " 0-indexed line number to the output file name.\n" - "union mode produces one filtered model that is the union of models created by\n" - " multiple mode.\n\n" - "context means only the context (all but last word) has to pass the filter, but\n" - " the entire n-gram is output.\n\n" - "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" - " phrases can generate the n-gram when assembled in arbitrary order and\n" - " clipped. Currently works with multiple or union mode.\n\n" - "The file format is set by [raw|arpa] with default arpa:\n" - "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" - " text. This is useful for ngram count files.\n" - "arpa means the ARPA file format for n-gram language models.\n\n" -#ifndef NTHREAD - "threads:m sets m threads (default: conccurrency detected by boost)\n" - "batch_size:m sets the batch size for threading. Expect memory usage from this\n" - " of 2*threads*batch_size n-grams.\n\n" -#else - "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" - " threading, compile without this flag against Boost >=1.42.0.\n\n" -#endif - "There are two inputs: vocabulary and model. Either may be given as a file\n" - " while the other is on stdin. Specify the type given as a file using\n" - " vocab: or model: before the file name. \n\n" - "For ARPA format, the output must be seekable. For raw format, it can be a\n" - " stream i.e. /dev/stdout\n"; -} - -typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION} FilterMode; -typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; - -struct Config { - Config() : -#ifndef NTHREAD - batch_size(25000), - threads(boost::thread::hardware_concurrency()), -#endif - phrase(false), - context(false), - format(FORMAT_ARPA) - { -#ifndef NTHREAD - if (!threads) threads = 1; -#endif - } - -#ifndef NTHREAD - size_t batch_size; - size_t threads; -#endif - bool phrase; - bool context; - FilterMode mode; - Format format; -}; - -template void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { -#ifndef NTHREAD - if (config.threads == 1) { -#endif - Format::RunFilter(in_lm, filter, output); -#ifndef NTHREAD - } else { - typedef Controller Threaded; - Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); - Format::RunFilter(in_lm, threading, output); - } -#endif -} - -template void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { - if (config.context) { - ContextFilter context_filter(filter); - RunThreadedFilter, OutputBuffer, Output>(config, in_lm, context_filter, output); - } else { - RunThreadedFilter(config, in_lm, filter, output); - } -} - -template void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { - typedef BinaryFilter Filter; - RunContextFilter(config, in_lm, Filter(binary), out); -} - -template void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { - if (config.mode == MODE_MULTIPLE) { - if (config.phrase) { - typedef phrase::Multiple Filter; - phrase::Substrings substrings; - typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); - RunContextFilter(config, in_lm, Filter(substrings), out); - } else { - typedef vocab::Multiple Filter; - boost::unordered_map > words; - typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); - RunContextFilter(config, in_lm, Filter(words), out); - } - return; - } - - typename Format::Output out(out_name); - - if (config.mode == MODE_COPY) { - Format::Copy(in_lm, out); - return; - } - - if (config.mode == MODE_SINGLE) { - vocab::Single::Words words; - vocab::ReadSingle(in_vocab, words); - DispatchBinaryFilter(config, in_lm, vocab::Single(words), out); - return; - } - - if (config.mode == MODE_UNION) { - if (config.phrase) { - phrase::Substrings substrings; - phrase::ReadMultiple(in_vocab, substrings); - DispatchBinaryFilter(config, in_lm, phrase::Union(substrings), out); - } else { - vocab::Union::Words words; - vocab::ReadMultiple(in_vocab, words); - DispatchBinaryFilter(config, in_lm, vocab::Union(words), out); - } - return; - } -} - -} // namespace -} // namespace lm - -int main(int argc, char *argv[]) { - if (argc < 4) { - lm::DisplayHelp(argv[0]); - return 1; - } - - // I used to have boost::program_options, but some users didn't want to compile boost. - lm::Config config; - boost::optional mode; - for (int i = 1; i < argc - 2; ++i) { - const char *str = argv[i]; - if (!std::strcmp(str, "copy")) { - mode = lm::MODE_COPY; - } else if (!std::strcmp(str, "single")) { - mode = lm::MODE_SINGLE; - } else if (!std::strcmp(str, "multiple")) { - mode = lm::MODE_MULTIPLE; - } else if (!std::strcmp(str, "union")) { - mode = lm::MODE_UNION; - } else if (!std::strcmp(str, "phrase")) { - config.phrase = true; - } else if (!std::strcmp(str, "context")) { - config.context = true; - } else if (!std::strcmp(str, "arpa")) { - config.format = lm::FORMAT_ARPA; - } else if (!std::strcmp(str, "raw")) { - config.format = lm::FORMAT_COUNT; -#ifndef NTHREAD - } else if (!std::strncmp(str, "threads:", 8)) { - config.threads = boost::lexical_cast(str + 8); - if (!config.threads) { - std::cerr << "Specify at least one thread." << std::endl; - return 1; - } - } else if (!std::strncmp(str, "batch_size:", 11)) { - config.batch_size = boost::lexical_cast(str + 11); - if (config.batch_size < 5000) { - std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; - if (!config.batch_size) return 1; - } -#endif - } else { - lm::DisplayHelp(argv[0]); - return 1; - } - } - - if (!mode) { - lm::DisplayHelp(argv[0]); - return 1; - } - config.mode = *mode; - - if (config.phrase && config.mode != lm::MODE_UNION && mode != lm::MODE_MULTIPLE) { - std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl; - return 1; - } - - bool cmd_is_model = true; - const char *cmd_input = argv[argc - 2]; - if (!strncmp(cmd_input, "vocab:", 6)) { - cmd_is_model = false; - cmd_input += 6; - } else if (!strncmp(cmd_input, "model:", 6)) { - cmd_input += 6; - } else if (strchr(cmd_input, ':')) { - errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); - } else { - std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; - } - std::ifstream cmd_file; - std::istream *vocab; - if (cmd_is_model) { - vocab = &std::cin; - } else { - cmd_file.open(cmd_input, std::ios::in); - if (!cmd_file) { - err(2, "Could not open input file %s", cmd_input); - } - vocab = &cmd_file; - } - - util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); - - if (config.format == lm::FORMAT_ARPA) { - lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); - } else if (config.format == lm::FORMAT_COUNT) { - lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); - } - return 0; -} diff --git a/klm/lm/filter/phrase.hh b/klm/lm/filter/phrase.hh index 07479dea..b4edff41 100644 --- a/klm/lm/filter/phrase.hh +++ b/klm/lm/filter/phrase.hh @@ -57,6 +57,7 @@ class Substrings { LM_FILTER_PHRASE_METHOD(Right, right) LM_FILTER_PHRASE_METHOD(Phrase, phrase) +#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization // sentence_id must be non-decreasing. Iterators are over words in the phrase. template void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { // Iterate over all substrings. diff --git a/klm/lm/filter/vocab.hh b/klm/lm/filter/vocab.hh index e2b6adff..7f0fadaa 100644 --- a/klm/lm/filter/vocab.hh +++ b/klm/lm/filter/vocab.hh @@ -5,6 +5,7 @@ #include "util/multi_intersection.hh" #include "util/string_piece.hh" +#include "util/string_piece_hash.hh" #include "util/tokenize_piece.hh" #include diff --git a/klm/lm/fragment.cc b/klm/lm/fragment.cc deleted file mode 100644 index 0267cd4e..00000000 --- a/klm/lm/fragment.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "lm/binary_format.hh" -#include "lm/model.hh" -#include "lm/left.hh" -#include "util/tokenize_piece.hh" - -template void Query(const char *name) { - Model model(name); - std::string line; - lm::ngram::ChartState ignored; - while (getline(std::cin, line)) { - lm::ngram::RuleScore scorer(model, ignored); - for (util::TokenIter i(line, ' '); i; ++i) { - scorer.Terminal(model.GetVocabulary().Index(*i)); - } - std::cout << scorer.Finish() << '\n'; - } -} - -int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Expected model file name." << std::endl; - return 1; - } - const char *name = argv[1]; - lm::ngram::ModelType model_type = lm::ngram::PROBING; - lm::ngram::RecognizeBinary(name, model_type); - switch (model_type) { - case lm::ngram::PROBING: - Query(name); - break; - case lm::ngram::REST_PROBING: - Query(name); - break; - default: - std::cerr << "Model type not supported yet." << std::endl; - } -} diff --git a/klm/lm/fragment_main.cc b/klm/lm/fragment_main.cc new file mode 100644 index 00000000..0267cd4e --- /dev/null +++ b/klm/lm/fragment_main.cc @@ -0,0 +1,37 @@ +#include "lm/binary_format.hh" +#include "lm/model.hh" +#include "lm/left.hh" +#include "util/tokenize_piece.hh" + +template void Query(const char *name) { + Model model(name); + std::string line; + lm::ngram::ChartState ignored; + while (getline(std::cin, line)) { + lm::ngram::RuleScore scorer(model, ignored); + for (util::TokenIter i(line, ' '); i; ++i) { + scorer.Terminal(model.GetVocabulary().Index(*i)); + } + std::cout << scorer.Finish() << '\n'; + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Expected model file name." << std::endl; + return 1; + } + const char *name = argv[1]; + lm::ngram::ModelType model_type = lm::ngram::PROBING; + lm::ngram::RecognizeBinary(name, model_type); + switch (model_type) { + case lm::ngram::PROBING: + Query(name); + break; + case lm::ngram::REST_PROBING: + Query(name); + break; + default: + std::cerr << "Model type not supported yet." << std::endl; + } +} diff --git a/klm/lm/kenlm_max_order_main.cc b/klm/lm/kenlm_max_order_main.cc new file mode 100644 index 00000000..94221201 --- /dev/null +++ b/klm/lm/kenlm_max_order_main.cc @@ -0,0 +1,6 @@ +#include "lm/max_order.hh" +#include + +int main(int argc, char *argv[]) { + std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; +} diff --git a/klm/lm/max_order.cc b/klm/lm/max_order.cc deleted file mode 100644 index 94221201..00000000 --- a/klm/lm/max_order.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "lm/max_order.hh" -#include - -int main(int argc, char *argv[]) { - std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; -} diff --git a/klm/lm/ngram_query.cc b/klm/lm/ngram_query.cc deleted file mode 100644 index 49757d9a..00000000 --- a/klm/lm/ngram_query.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "lm/ngram_query.hh" - -int main(int argc, char *argv[]) { - if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { - std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl; - std::cerr << "Input is wrapped in and unless null is passed." << std::endl; - return 1; - } - try { - bool sentence_context = (argc == 2); - using namespace lm::ngram; - ModelType model_type; - if (RecognizeBinary(argv[1], model_type)) { - switch(model_type) { - case PROBING: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case REST_PROBING: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case QUANT_TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case ARRAY_TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - case QUANT_ARRAY_TRIE: - Query(argv[1], sentence_context, std::cin, std::cout); - break; - default: - std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; - abort(); - } - } else { - Query(argv[1], sentence_context, std::cin, std::cout); - } - std::cerr << "Total time including destruction:\n"; - util::PrintUsage(std::cerr); - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } - return 0; -} diff --git a/klm/lm/query_main.cc b/klm/lm/query_main.cc new file mode 100644 index 00000000..49757d9a --- /dev/null +++ b/klm/lm/query_main.cc @@ -0,0 +1,47 @@ +#include "lm/ngram_query.hh" + +int main(int argc, char *argv[]) { + if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { + std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl; + std::cerr << "Input is wrapped in and unless null is passed." << std::endl; + return 1; + } + try { + bool sentence_context = (argc == 2); + using namespace lm::ngram; + ModelType model_type; + if (RecognizeBinary(argv[1], model_type)) { + switch(model_type) { + case PROBING: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case REST_PROBING: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case ARRAY_TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_ARRAY_TRIE: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + default: + std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; + abort(); + } + } else { + Query(argv[1], sentence_context, std::cin, std::cout); + } + std::cerr << "Total time including destruction:\n"; + util::PrintUsage(std::cerr); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } + return 0; +} diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 248cc844..7f873e96 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -38,6 +38,7 @@ libklm_util_a_SOURCES = \ sized_iterator.hh \ sorted_uniform.hh \ string_piece.hh \ + string_piece_hash.hh \ thread_pool.hh \ tokenize_piece.hh \ usage.hh \ diff --git a/klm/util/double-conversion/strtod.cc b/klm/util/double-conversion/strtod.cc index 9758989f..e298766a 100644 --- a/klm/util/double-conversion/strtod.cc +++ b/klm/util/double-conversion/strtod.cc @@ -506,7 +506,9 @@ float Strtof(Vector buffer, int exponent) { double double_previous = Double(double_guess).PreviousDouble(); float f1 = static_cast(double_previous); +#ifndef NDEBUG float f2 = float_guess; +#endif float f3 = static_cast(double_next); float f4; if (is_correct) { @@ -515,7 +517,9 @@ float Strtof(Vector buffer, int exponent) { double double_next2 = Double(double_next).NextDouble(); f4 = static_cast(double_next2); } +#ifndef NDEBUG ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); +#endif // If the guess doesn't lie near a single-precision boundary we can simply // return its float-value. diff --git a/klm/util/file.cc b/klm/util/file.cc index 9a6d2e64..86d9b12d 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -22,6 +22,7 @@ #include #include #include +#include #else #include #endif @@ -99,15 +100,15 @@ uint64_t SizeOrThrow(int fd) { } void ResizeOrThrow(int fd, uint64_t to) { - UTIL_THROW_IF_ARG( #if defined(_WIN32) || defined(_WIN64) - _chsize_s + errno_t ret = _chsize_s #elif defined(OS_ANDROID) - ftruncate64 + int ret = ftruncate64 #else - ftruncate + int ret = ftruncate #endif - (fd, to), FDException, (fd), "while resizing to " << to << " bytes"); + (fd, to); + UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes"); } std::size_t PartialRead(int fd, void *to, std::size_t amount) { @@ -150,9 +151,21 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { uint8_t *to = static_cast(to_void); #if defined(_WIN32) || defined(_WIN64) - UTIL_THROW(Exception, "TODO: PReadOrThrow for windows using ReadFile http://stackoverflow.com/questions/766477/are-there-equivalents-to-pread-on-different-platforms"); -#else + UTIL_THROW(Exception, "This pread implementation for windows is broken. Please send me a patch that does not change the file pointer. Atomically. Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread."); + const std::size_t kMaxDWORD = static_cast(4294967295UL); +#endif for (;size ;) { +#if defined(_WIN32) || defined(_WIN64) + /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */ + // size_t might be 64-bit. DWORD is always 32. + DWORD reading = static_cast(std::min(kMaxDWORD, size)); + DWORD ret; + OVERLAPPED overlapped; + memset(&overlapped, 0, sizeof(OVERLAPPED)); + overlapped.Offset = static_cast(off); + overlapped.OffsetHigh = static_cast(off >> 32); + UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), Exception, "ReadFile failed for offset " << off); +#else ssize_t ret; errno = 0; do { @@ -166,11 +179,11 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); } +#endif size -= ret; off += ret; to += ret; } -#endif } void WriteOrThrow(int fd, const void *data_void, std::size_t size) { @@ -218,15 +231,15 @@ typedef CheckOffT::True IgnoredType; // Can't we all just get along? void InternalSeek(int fd, int64_t off, int whence) { - UTIL_THROW_IF_ARG( + if ( #if defined(_WIN32) || defined(_WIN64) - (__int64)-1 == _lseeki64(fd, off, whence), + (__int64)-1 == _lseeki64(fd, off, whence) #elif defined(OS_ANDROID) - (off64_t)-1 == lseek64(fd, off, whence), + (off64_t)-1 == lseek64(fd, off, whence) #else - (off_t)-1 == lseek(fd, off, whence), + (off_t)-1 == lseek(fd, off, whence) #endif - FDException, (fd), "while seeking to " << off << " whence " << whence); + ) UTIL_THROW_ARG(FDException, (fd), "while seeking to " << off << " whence " << whence); } } // namespace @@ -386,7 +399,13 @@ void NormalizeTempPrefix(std::string &base) { struct stat sb; // It's fine for it to not exist. if (-1 == stat(base.c_str(), &sb)) return; - if (S_ISDIR(sb.st_mode)) base += '/'; + if ( +#if defined(_WIN32) || defined(_WIN64) + sb.st_mode & _S_IFDIR +#else + S_ISDIR(sb.st_mode) +#endif + ) base += '/'; } int MakeTemp(const std::string &base) { diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index fbfa0e0e..4d143857 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -49,6 +49,18 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std: Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); } +FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : + total_size_(kBadSize), page_(SizePage()) { + InitializeNoRead("istream", min_buffer); + + fallback_to_read_ = true; + data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); + position_ = data_.begin(); + position_end_ = position_; + + fell_back_.Reset(stream); +} + FilePiece::~FilePiece() {} StringPiece FilePiece::ReadLine(char delim) { @@ -83,7 +95,8 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { +// Factored out so that istream can call this. +void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { file_name_ = name; default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); @@ -91,6 +104,10 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s position_end_ = NULL; mapped_offset_ = 0; at_end_ = false; +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { + InitializeNoRead(name, min_buffer); if (total_size_ == kBadSize) { // So the assertion passes. @@ -239,8 +256,7 @@ void FilePiece::TransitionToRead() { assert(!fallback_to_read_); fallback_to_read_ = true; data_.reset(); - data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); - UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_); + data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); position_ = data_.begin(); position_end_ = position_; diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index 53310976..c07c6011 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -9,6 +9,7 @@ #include "util/string_piece.hh" #include +#include #include #include @@ -31,6 +32,13 @@ class FilePiece { // Takes ownership of fd. name is used for messages. explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is + * much faster. But sometimes you just have an istream like Boost's HTTP + * server and want to parse it the same way. + * name is just used for messages and FileName(). + */ + explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); + ~FilePiece(); char get() { @@ -71,6 +79,8 @@ class FilePiece { const std::string &FileName() const { return file_name_; } private: + void InitializeNoRead(const char *name, std::size_t min_buffer); + // Calls InitializeNoRead, so don't call both. void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template T ReadNumber(); diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index 91e4c559..7336007d 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -24,6 +24,20 @@ std::string FileLocation() { return ret; } +/* istream */ +BOOST_AUTO_TEST_CASE(IStream) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + std::fstream backing(FileLocation().c_str(), std::ios::in); + FilePiece test(backing); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + BOOST_CHECK_EQUAL(ref_line, test_line); + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { std::fstream ref(FileLocation().c_str(), std::ios::in); diff --git a/klm/util/have.hh b/klm/util/have.hh index e9a4d946..6e18529d 100644 --- a/klm/util/have.hh +++ b/klm/util/have.hh @@ -10,8 +10,4 @@ //#define HAVE_ICU #endif -#ifndef HAVE_BOOST -//#define HAVE_BOOST -#endif - #endif // UTIL_HAVE__ diff --git a/klm/util/read_compressed.cc b/klm/util/read_compressed.cc index 7a1a8fb5..b81549e4 100644 --- a/klm/util/read_compressed.cc +++ b/klm/util/read_compressed.cc @@ -320,6 +320,23 @@ class XZip : public ReadBase { }; #endif // HAVE_XZLIB +class IStreamReader : public ReadBase { + public: + explicit IStreamReader(std::istream &stream) : stream_(stream) {} + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + if (!stream_.read(static_cast(to), amount)) { + UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error"); + amount = stream_.gcount(); + } + ReadCount(thunk) += amount; + return amount; + } + + private: + std::istream &stream_; +}; + enum MagicResult { UNKNOWN, GZIP, BZIP, XZIP }; @@ -329,7 +346,7 @@ MagicResult DetectMagic(const void *from_void) { if (header[0] == 0x1f && header[1] == 0x8b) { return GZIP; } - if (header[0] == 'B' && header[1] == 'Z') { + if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') { return BZIP; } const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; @@ -387,6 +404,10 @@ ReadCompressed::ReadCompressed(int fd) { Reset(fd); } +ReadCompressed::ReadCompressed(std::istream &in) { + Reset(in); +} + ReadCompressed::ReadCompressed() {} ReadCompressed::~ReadCompressed() {} @@ -396,6 +417,11 @@ void ReadCompressed::Reset(int fd) { internal_.reset(ReadFactory(fd, raw_amount_)); } +void ReadCompressed::Reset(std::istream &in) { + internal_.reset(); + internal_.reset(new IStreamReader(in)); +} + std::size_t ReadCompressed::Read(void *to, std::size_t amount) { return internal_->Read(to, amount, *this); } diff --git a/klm/util/read_compressed.hh b/klm/util/read_compressed.hh index 83ca9fb2..8b54c9e8 100644 --- a/klm/util/read_compressed.hh +++ b/klm/util/read_compressed.hh @@ -45,6 +45,10 @@ class ReadCompressed { // Takes ownership of fd. explicit ReadCompressed(int fd); + // Try to avoid using this. Use the fd instead. + // There is no decompression support for istreams. + explicit ReadCompressed(std::istream &in); + // Must call Reset later. ReadCompressed(); @@ -53,6 +57,9 @@ class ReadCompressed { // Takes ownership of fd. void Reset(int fd); + // Same advice as the constructor. + void Reset(std::istream &in); + std::size_t Read(void *to, std::size_t amount); uint64_t RawAmount() const { return raw_amount_; } diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc index 6fd97e5e..9cb4a4b9 100644 --- a/klm/util/read_compressed_test.cc +++ b/klm/util/read_compressed_test.cc @@ -25,19 +25,34 @@ void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) { } } -void TestRandom(const char *compressor) { - const uint32_t kSize4 = 100000 / 4; +const uint32_t kSize4 = 100000 / 4; + +std::string WriteRandom() { char name[] = "tempXXXXXX"; + scoped_fd original(mkstemp(name)); + BOOST_REQUIRE(original.get() > 0); + for (uint32_t i = 0; i < kSize4; ++i) { + WriteOrThrow(original.get(), &i, sizeof(uint32_t)); + } + return name; +} - // Write test file. - { - scoped_fd original(mkstemp(name)); - BOOST_REQUIRE(original.get() > 0); - for (uint32_t i = 0; i < kSize4; ++i) { - WriteOrThrow(original.get(), &i, sizeof(uint32_t)); - } +void VerifyRead(ReadCompressed &reader) { + for (uint32_t i = 0; i < kSize4; ++i) { + uint32_t got; + ReadLoop(reader, &got, sizeof(uint32_t)); + BOOST_CHECK_EQUAL(i, got); } + char ignored; + BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); + // Test double EOF call. + BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); +} + +void TestRandom(const char *compressor) { + std::string name(WriteRandom()); + char gzname[] = "tempXXXXXX"; scoped_fd gzipped(mkstemp(gzname)); @@ -52,20 +67,11 @@ void TestRandom(const char *compressor) { command += "\""; BOOST_REQUIRE_EQUAL(0, system(command.c_str())); - BOOST_CHECK_EQUAL(0, unlink(name)); + BOOST_CHECK_EQUAL(0, unlink(name.c_str())); BOOST_CHECK_EQUAL(0, unlink(gzname)); ReadCompressed reader(gzipped.release()); - for (uint32_t i = 0; i < kSize4; ++i) { - uint32_t got; - ReadLoop(reader, &got, sizeof(uint32_t)); - BOOST_CHECK_EQUAL(i, got); - } - - char ignored; - BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); - // Test double EOF call. - BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); + VerifyRead(reader); } BOOST_AUTO_TEST_CASE(Uncompressed) { @@ -90,5 +96,14 @@ BOOST_AUTO_TEST_CASE(ReadXZ) { } #endif +BOOST_AUTO_TEST_CASE(IStream) { + std::string name(WriteRandom()); + std::fstream stream(name.c_str(), std::ios::in); + BOOST_CHECK_EQUAL(0, unlink(name.c_str())); + ReadCompressed reader; + reader.Reset(stream); + VerifyRead(reader); +} + } // namespace } // namespace util diff --git a/klm/util/stream/io.cc b/klm/util/stream/io.cc index c7ad2980..0459f706 100644 --- a/klm/util/stream/io.cc +++ b/klm/util/stream/io.cc @@ -29,15 +29,17 @@ void Read::Run(const ChainPosition &position) { void PRead::Run(const ChainPosition &position) { scoped_fd owner; if (own_) owner.reset(file_); - uint64_t size = SizeOrThrow(file_); + const uint64_t size = SizeOrThrow(file_); UTIL_THROW_IF(size % static_cast(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize()); - std::size_t block_size = position.GetChain().BlockSize(); + const std::size_t block_size = position.GetChain().BlockSize(); + const uint64_t block_size64 = static_cast(block_size); Link link(position); uint64_t offset = 0; - for (; offset + block_size < size; offset += block_size, ++link) { + for (; offset + block_size64 < size; offset += block_size64, ++link) { PReadOrThrow(file_, link->Get(), block_size, offset); link->SetValidSize(block_size); } + // size - offset is <= block_size, so it casts to 32-bit fine. if (size - offset) { PReadOrThrow(file_, link->Get(), size - offset, offset); link->SetValidSize(size - offset); diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index a86f160f..16aa6a03 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -365,10 +365,14 @@ template class BlockSorter { // Record the size of each block in a separate file. offsets_->Append(link->ValidSize()); void *end = static_cast(link->Get()) + link->ValidSize(); - std::sort( - SizedIt(link->Get(), entry_size), - SizedIt(end, entry_size), - compare_); +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (SizedIt(link->Get(), entry_size), + SizedIt(end, entry_size), + compare_); } offsets_->FinishedAppending(); } diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc index b422cefc..ec394b96 100644 --- a/klm/util/string_piece.cc +++ b/klm/util/string_piece.cc @@ -17,7 +17,8 @@ void StringPiece::CopyToString(std::string* target) const { } size_type StringPiece::find(const StringPiece& s, size_type pos) const { - if (length_ < 0 || pos > static_cast(length_)) + // Not sure why length_ < 0 was here since it's std::size_t. + if (/*length_ < 0 || */pos > static_cast(length_)) return npos; const char* result = std::search(ptr_ + pos, ptr_ + length_, diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh index 51481646..9cf4c7f6 100644 --- a/klm/util/string_piece.hh +++ b/klm/util/string_piece.hh @@ -50,10 +50,6 @@ #include "util/have.hh" -#ifdef HAVE_BOOST -#include -#endif // HAVE_BOOST - #include #include #include @@ -256,46 +252,9 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { return o.write(piece.data(), static_cast(piece.size())); } -#ifdef HAVE_BOOST -inline size_t hash_value(const StringPiece &str) { - return boost::hash_range(str.data(), str.data() + str.length()); -} - -/* Support for lookup of StringPiece in boost::unordered_map */ -struct StringPieceCompatibleHash : public std::unary_function { - size_t operator()(const StringPiece &str) const { - return hash_value(str); - } -}; - -struct StringPieceCompatibleEquals : public std::binary_function { - bool operator()(const StringPiece &first, const StringPiece &second) const { - return first == second; - } -}; -template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { -#if BOOST_VERSION < 104200 - std::string temp(key.data(), key.size()); - return t.find(temp); -#else - return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); -#endif -} - -template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { -#if BOOST_VERSION < 104200 - std::string temp(key.data(), key.size()); - return t.find(temp); -#else - return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); -#endif -} -#endif - #ifdef HAVE_ICU U_NAMESPACE_END using U_NAMESPACE_QUALIFIER StringPiece; #endif - #endif // BASE_STRING_PIECE_H__ diff --git a/klm/util/string_piece_hash.hh b/klm/util/string_piece_hash.hh new file mode 100644 index 00000000..f206b1d8 --- /dev/null +++ b/klm/util/string_piece_hash.hh @@ -0,0 +1,43 @@ +#ifndef UTIL_STRING_PIECE_HASH__ +#define UTIL_STRING_PIECE_HASH__ + +#include "util/string_piece.hh" + +#include +#include + +inline size_t hash_value(const StringPiece &str) { + return boost::hash_range(str.data(), str.data() + str.length()); +} + +/* Support for lookup of StringPiece in boost::unordered_map */ +struct StringPieceCompatibleHash : public std::unary_function { + size_t operator()(const StringPiece &str) const { + return hash_value(str); + } +}; + +struct StringPieceCompatibleEquals : public std::binary_function { + bool operator()(const StringPiece &first, const StringPiece &second) const { + return first == second; + } +}; +template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { +#if BOOST_VERSION < 104200 + std::string temp(key.data(), key.size()); + return t.find(temp); +#else + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +#endif +} + +#endif // UTIL_STRING_PIECE_HASH__ diff --git a/klm/util/usage.cc b/klm/util/usage.cc index 16a004bb..b8e125d0 100644 --- a/klm/util/usage.cc +++ b/klm/util/usage.cc @@ -81,7 +81,7 @@ template uint64_t ParseNum(const std::string &arg) { UTIL_THROW_IF_ARG(stream >> throwaway, SizeParseError, (arg), "because there was more cruft " << throwaway << " after the number."); // Silly sort, using kilobytes as your default unit. - if (after.empty()) after == "K"; + if (after.empty()) after = "K"; if (after == "%") { uint64_t mem = GuessPhysicalMemory(); UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined."); -- cgit v1.2.3