From d884099e0db8b4510847ec106b59ef7dca3c245b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 18 Jan 2013 17:12:51 +0000 Subject: KenLM dffafbf with lmplz source (but not built) --- klm/util/stream/block.hh | 43 +++ klm/util/stream/chain.cc | 155 +++++++++++ klm/util/stream/chain.hh | 198 ++++++++++++++ klm/util/stream/config.hh | 32 +++ klm/util/stream/io.cc | 64 +++++ klm/util/stream/io.hh | 76 ++++++ klm/util/stream/io_test.cc | 38 +++ klm/util/stream/line_input.cc | 52 ++++ klm/util/stream/line_input.hh | 22 ++ klm/util/stream/multi_progress.cc | 86 ++++++ klm/util/stream/multi_progress.hh | 90 +++++++ klm/util/stream/sort.hh | 542 ++++++++++++++++++++++++++++++++++++++ klm/util/stream/sort_test.cc | 62 +++++ klm/util/stream/stream.hh | 74 ++++++ klm/util/stream/stream_test.cc | 35 +++ klm/util/stream/timer.hh | 14 + 16 files changed, 1583 insertions(+) create mode 100644 klm/util/stream/block.hh create mode 100644 klm/util/stream/chain.cc create mode 100644 klm/util/stream/chain.hh create mode 100644 klm/util/stream/config.hh create mode 100644 klm/util/stream/io.cc create mode 100644 klm/util/stream/io.hh create mode 100644 klm/util/stream/io_test.cc create mode 100644 klm/util/stream/line_input.cc create mode 100644 klm/util/stream/line_input.hh create mode 100644 klm/util/stream/multi_progress.cc create mode 100644 klm/util/stream/multi_progress.hh create mode 100644 klm/util/stream/sort.hh create mode 100644 klm/util/stream/sort_test.cc create mode 100644 klm/util/stream/stream.hh create mode 100644 klm/util/stream/stream_test.cc create mode 100644 klm/util/stream/timer.hh (limited to 'klm/util/stream') diff --git a/klm/util/stream/block.hh b/klm/util/stream/block.hh new file mode 100644 index 00000000..11aa991e --- /dev/null +++ b/klm/util/stream/block.hh @@ -0,0 +1,43 @@ +#ifndef UTIL_STREAM_BLOCK__ +#define UTIL_STREAM_BLOCK__ + +#include +#include + +namespace util { +namespace stream { + +class Block { + public: + Block() : mem_(NULL), valid_size_(0) {} + + Block(void *mem, std::size_t size) : mem_(mem), valid_size_(size) {} + + void SetValidSize(std::size_t to) { valid_size_ = to; } + // Read might fill in less than Allocated at EOF. + std::size_t ValidSize() const { return valid_size_; } + + void *Get() { return mem_; } + const void *Get() const { return mem_; } + + const void *ValidEnd() const { + return reinterpret_cast(mem_) + valid_size_; + } + + operator bool() const { return mem_ != NULL; } + bool operator!() const { return mem_ == NULL; } + + private: + friend class Link; + void SetToPoison() { + mem_ = NULL; + } + + void *mem_; + std::size_t valid_size_; +}; + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_BLOCK__ diff --git a/klm/util/stream/chain.cc b/klm/util/stream/chain.cc new file mode 100644 index 00000000..46708c60 --- /dev/null +++ b/klm/util/stream/chain.cc @@ -0,0 +1,155 @@ +#include "util/stream/chain.hh" + +#include "util/stream/io.hh" + +#include "util/exception.hh" +#include "util/pcqueue.hh" + +#include +#include +#include + +#include +#include + +namespace util { +namespace stream { + +ChainConfigException::ChainConfigException() throw() { *this << "Chain configured with "; } +ChainConfigException::~ChainConfigException() throw() {} + +Thread::~Thread() { + thread_.join(); +} + +void Thread::UnhandledException(const std::exception &e) { + std::cerr << e.what() << std::endl; + abort(); +} + +void Recycler::Run(const ChainPosition &position) { + for (Link l(position); l; ++l) { + l->SetValidSize(position.GetChain().BlockSize()); + } +} + +const Recycler kRecycle = Recycler(); + +Chain::Chain(const ChainConfig &config) : config_(config), complete_called_(false) { + UTIL_THROW_IF(!config.entry_size, ChainConfigException, "zero-size entries."); + UTIL_THROW_IF(!config.block_count, ChainConfigException, "block count zero"); + UTIL_THROW_IF(config.total_memory < config.entry_size * config.block_count, ChainConfigException, config.total_memory << " total memory, too small for " << config.block_count << " blocks of containing entries of size " << config.entry_size); + // Round down block size to a multiple of entry size. + block_size_ = config.total_memory / (config.block_count * config.entry_size) * config.entry_size; +} + +Chain::~Chain() { + Wait(); +} + +ChainPosition Chain::Add() { + if (!Running()) Start(); + PCQueue &in = queues_.back(); + queues_.push_back(new PCQueue(config_.block_count)); + return ChainPosition(in, queues_.back(), this, progress_); +} + +Chain &Chain::operator>>(const WriteAndRecycle &writer) { + threads_.push_back(new Thread(Complete(), writer)); + return *this; +} + +void Chain::Wait(bool release_memory) { + if (queues_.empty()) { + assert(threads_.empty()); + return; // Nothing to wait for. + } + if (!complete_called_) CompleteLoop(); + threads_.clear(); + for (std::size_t i = 0; queues_.front().Consume(); ++i) { + if (i == config_.block_count) { + std::cerr << "Chain ending without poison." << std::endl; + abort(); + } + } + queues_.clear(); + progress_.Finished(); + complete_called_ = false; + if (release_memory) memory_.reset(); +} + +void Chain::Start() { + Wait(false); + if (!memory_.get()) { + // Allocate memory. + assert(threads_.empty()); + assert(queues_.empty()); + std::size_t malloc_size = block_size_ * config_.block_count; + memory_.reset(MallocOrThrow(malloc_size)); + } + // This queue can accomodate all blocks. + queues_.push_back(new PCQueue(config_.block_count)); + // Populate the lead queue with blocks. + uint8_t *base = static_cast(memory_.get()); + for (std::size_t i = 0; i < config_.block_count; ++i) { + queues_.front().Produce(Block(base, block_size_)); + base += block_size_; + } +} + +ChainPosition Chain::Complete() { + assert(Running()); + UTIL_THROW_IF(complete_called_, util::Exception, "CompleteLoop() called twice"); + complete_called_ = true; + return ChainPosition(queues_.back(), queues_.front(), this, progress_); +} + +Link::Link() : in_(NULL), out_(NULL), poisoned_(true) {} + +void Link::Init(const ChainPosition &position) { + UTIL_THROW_IF(in_, util::Exception, "Link::Init twice"); + in_ = position.in_; + out_ = position.out_; + poisoned_ = false; + progress_ = position.progress_; + in_->Consume(current_); +} + +Link::Link(const ChainPosition &position) : in_(NULL) { + Init(position); +} + +Link::~Link() { + if (current_) { + // Probably an exception unwinding. + std::cerr << "Last input should have been poison." << std::endl; +// abort(); + } else { + if (!poisoned_) { + // Pass the poison! + out_->Produce(current_); + } + } +} + +Link &Link::operator++() { + assert(current_); + progress_ += current_.ValidSize(); + out_->Produce(current_); + in_->Consume(current_); + if (!current_) { + poisoned_ = true; + out_->Produce(current_); + } + return *this; +} + +void Link::Poison() { + assert(!poisoned_); + current_.SetToPoison(); + out_->Produce(current_); + poisoned_ = true; +} + +} // namespace stream +} // namespace util diff --git a/klm/util/stream/chain.hh b/klm/util/stream/chain.hh new file mode 100644 index 00000000..154b9b33 --- /dev/null +++ b/klm/util/stream/chain.hh @@ -0,0 +1,198 @@ +#ifndef UTIL_STREAM_CHAIN__ +#define UTIL_STREAM_CHAIN__ + +#include "util/stream/block.hh" +#include "util/stream/config.hh" +#include "util/stream/multi_progress.hh" +#include "util/scoped.hh" + +#include +#include + +#include + +#include + +namespace util { +template class PCQueue; +namespace stream { + +class ChainConfigException : public Exception { + public: + ChainConfigException() throw(); + ~ChainConfigException() throw(); +}; + +class Chain; +// Specifies position in chain for Link constructor. +class ChainPosition { + public: + const Chain &GetChain() const { return *chain_; } + private: + friend class Chain; + friend class Link; + ChainPosition(PCQueue &in, PCQueue &out, Chain *chain, MultiProgress &progress) + : in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {} + + PCQueue *in_, *out_; + + Chain *chain_; + + WorkerProgress progress_; +}; + +// Position is usually ChainPosition but if there are multiple streams involved, this can be ChainPositions. +class Thread { + public: + template Thread(const Position &position, const Worker &worker) + : thread_(boost::ref(*this), position, worker) {} + + ~Thread(); + + template void operator()(const Position &position, Worker &worker) { + try { + worker.Run(position); + } catch (const std::exception &e) { + UnhandledException(e); + } + } + + private: + void UnhandledException(const std::exception &e); + + boost::thread thread_; +}; + +class Recycler { + public: + void Run(const ChainPosition &position); +}; + +extern const Recycler kRecycle; +class WriteAndRecycle; + +class Chain { + private: + template struct CheckForRun { + typedef Chain type; + }; + + public: + explicit Chain(const ChainConfig &config); + + ~Chain(); + + void ActivateProgress() { + assert(!Running()); + progress_.Activate(); + } + + void SetProgressTarget(uint64_t target) { + progress_.SetTarget(target); + } + + std::size_t EntrySize() const { + return config_.entry_size; + } + std::size_t BlockSize() const { + return block_size_; + } + + // Two ways to add to the chain: Add() or operator>>. + ChainPosition Add(); + + // This is for adding threaded workers with a Run method. + template typename CheckForRun::type &operator>>(const Worker &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + // Avoid copying the worker. + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + // Note that Link and Stream also define operator>> outside this class. + + // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor. + void CompleteLoop() { + threads_.push_back(new Thread(Complete(), kRecycle)); + } + + Chain &operator>>(const Recycler &recycle) { + CompleteLoop(); + return *this; + } + + Chain &operator>>(const WriteAndRecycle &writer); + + // Chains are reusable. Call Wait to wait for everything to finish and free memory. + void Wait(bool release_memory = true); + + // Waits for the current chain to complete (if any) then starts again. + void Start(); + + bool Running() const { return !queues_.empty(); } + + private: + ChainPosition Complete(); + + ChainConfig config_; + + std::size_t block_size_; + + scoped_malloc memory_; + + boost::ptr_vector > queues_; + + bool complete_called_; + + boost::ptr_vector threads_; + + MultiProgress progress_; +}; + +// Create the link in the worker thread using the position token. +class Link { + public: + // Either default construct and Init or just construct all at once. + Link(); + void Init(const ChainPosition &position); + + explicit Link(const ChainPosition &position); + + ~Link(); + + Block &operator*() { return current_; } + const Block &operator*() const { return current_; } + + Block *operator->() { return ¤t_; } + const Block *operator->() const { return ¤t_; } + + Link &operator++(); + + operator bool() const { return current_; } + + void Poison(); + + private: + Block current_; + PCQueue *in_, *out_; + + bool poisoned_; + + WorkerProgress progress_; +}; + +inline Chain &operator>>(Chain &chain, Link &link) { + link.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_CHAIN__ diff --git a/klm/util/stream/config.hh b/klm/util/stream/config.hh new file mode 100644 index 00000000..1eeb3a8a --- /dev/null +++ b/klm/util/stream/config.hh @@ -0,0 +1,32 @@ +#ifndef UTIL_STREAM_CONFIG__ +#define UTIL_STREAM_CONFIG__ + +#include +#include + +namespace util { namespace stream { + +struct ChainConfig { + ChainConfig() {} + + ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory) + : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {} + + std::size_t entry_size; + std::size_t block_count; + // Chain's constructor will make this a multiple of entry_size. + std::size_t total_memory; +}; + +struct SortConfig { + std::string temp_prefix; + + // Size of each input/output buffer. + std::size_t buffer_size; + + // Total memory to use when running alone. + std::size_t total_memory; +}; + +}} // namespaces +#endif // UTIL_STREAM_CONFIG__ diff --git a/klm/util/stream/io.cc b/klm/util/stream/io.cc new file mode 100644 index 00000000..c7ad2980 --- /dev/null +++ b/klm/util/stream/io.cc @@ -0,0 +1,64 @@ +#include "util/stream/io.hh" + +#include "util/file.hh" +#include "util/stream/chain.hh" + +#include + +namespace util { +namespace stream { + +ReadSizeException::ReadSizeException() throw() {} +ReadSizeException::~ReadSizeException() throw() {} + +void Read::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + std::size_t got = util::ReadOrEOF(file_, link->Get(), block_size); + UTIL_THROW_IF(got % entry_size, ReadSizeException, "File ended with " << got << " bytes, not a multiple of " << entry_size << "."); + if (got == 0) { + link.Poison(); + return; + } else { + link->SetValidSize(got); + } + } +} + +void PRead::Run(const ChainPosition &position) { + scoped_fd owner; + if (own_) owner.reset(file_); + uint64_t size = SizeOrThrow(file_); + UTIL_THROW_IF(size % static_cast(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize()); + std::size_t block_size = position.GetChain().BlockSize(); + Link link(position); + uint64_t offset = 0; + for (; offset + block_size < size; offset += block_size, ++link) { + PReadOrThrow(file_, link->Get(), block_size, offset); + link->SetValidSize(block_size); + } + if (size - offset) { + PReadOrThrow(file_, link->Get(), size - offset, offset); + link->SetValidSize(size - offset); + ++link; + } + link.Poison(); +} + +void Write::Run(const ChainPosition &position) { + for (Link link(position); link; ++link) { + WriteOrThrow(file_, link->Get(), link->ValidSize()); + } +} + +void WriteAndRecycle::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + for (Link link(position); link; ++link) { + WriteOrThrow(file_, link->Get(), link->ValidSize()); + link->SetValidSize(block_size); + } +} + +} // namespace stream +} // namespace util diff --git a/klm/util/stream/io.hh b/klm/util/stream/io.hh new file mode 100644 index 00000000..934b6b3f --- /dev/null +++ b/klm/util/stream/io.hh @@ -0,0 +1,76 @@ +#ifndef UTIL_STREAM_IO__ +#define UTIL_STREAM_IO__ + +#include "util/exception.hh" +#include "util/file.hh" + +namespace util { +namespace stream { + +class ChainPosition; + +class ReadSizeException : public util::Exception { + public: + ReadSizeException() throw(); + ~ReadSizeException() throw(); +}; + +class Read { + public: + explicit Read(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// Like read but uses pread so that the file can be accessed from multiple threads. +class PRead { + public: + explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {} + void Run(const ChainPosition &position); + private: + int file_; + bool own_; +}; + +class Write { + public: + explicit Write(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +class WriteAndRecycle { + public: + explicit WriteAndRecycle(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// Reuse the same file over and over again to buffer output. +class FileBuffer { + public: + explicit FileBuffer(int fd) : file_(fd) {} + + WriteAndRecycle Sink() const { + util::SeekOrThrow(file_.get(), 0); + return WriteAndRecycle(file_.get()); + } + + PRead Source() const { + return PRead(file_.get()); + } + + uint64_t Size() const { + return SizeOrThrow(file_.get()); + } + + private: + scoped_fd file_; +}; + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_IO__ diff --git a/klm/util/stream/io_test.cc b/klm/util/stream/io_test.cc new file mode 100644 index 00000000..82108335 --- /dev/null +++ b/klm/util/stream/io_test.cc @@ -0,0 +1,38 @@ +#include "util/stream/io.hh" + +#include "util/stream/chain.hh" +#include "util/file.hh" + +#define BOOST_TEST_MODULE IOTest +#include + +#include + +namespace util { namespace stream { namespace { + +BOOST_AUTO_TEST_CASE(CopyFile) { + std::string temps("io_test_temp"); + + scoped_fd in(MakeTemp(temps)); + for (uint64_t i = 0; i < 100000; ++i) { + WriteOrThrow(in.get(), &i, sizeof(uint64_t)); + } + SeekOrThrow(in.get(), 0); + scoped_fd out(MakeTemp(temps)); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 1024; + config.block_count = 10; + + Chain(config) >> PRead(in.get()) >> Write(out.get()); + + SeekOrThrow(out.get(), 0); + for (uint64_t i = 0; i < 100000; ++i) { + uint64_t got; + ReadOrThrow(out.get(), &got, sizeof(uint64_t)); + BOOST_CHECK_EQUAL(i, got); + } +} + +}}} // namespaces diff --git a/klm/util/stream/line_input.cc b/klm/util/stream/line_input.cc new file mode 100644 index 00000000..dafa5020 --- /dev/null +++ b/klm/util/stream/line_input.cc @@ -0,0 +1,52 @@ +#include "util/stream/line_input.hh" + +#include "util/exception.hh" +#include "util/file.hh" +#include "util/read_compressed.hh" +#include "util/stream/chain.hh" + +#include +#include + +namespace util { namespace stream { + +void LineInput::Run(const ChainPosition &position) { + ReadCompressed reader(fd_); + // Holding area for beginning of line to be placed in next block. + std::vector carry; + + for (Link block(position); ; ++block) { + char *to = static_cast(block->Get()); + char *begin = to; + char *end = to + position.GetChain().BlockSize(); + std::copy(carry.begin(), carry.end(), to); + to += carry.size(); + while (to != end) { + std::size_t got = reader.Read(to, end - to); + if (!got) { + // EOF + block->SetValidSize(to - begin); + ++block; + block.Poison(); + return; + } + to += got; + } + + // Find the last newline. + char *newline; + for (newline = to - 1; ; --newline) { + UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ". Is this a text file?"); + if (*newline == '\n') break; + } + + // Copy everything after the last newline to the carry. + carry.clear(); + carry.resize(to - (newline + 1)); + std::copy(newline + 1, to, &*carry.begin()); + + block->SetValidSize(newline + 1 - begin); + } +} + +}} // namespaces diff --git a/klm/util/stream/line_input.hh b/klm/util/stream/line_input.hh new file mode 100644 index 00000000..86db1dd0 --- /dev/null +++ b/klm/util/stream/line_input.hh @@ -0,0 +1,22 @@ +#ifndef UTIL_STREAM_LINE_INPUT__ +#define UTIL_STREAM_LINE_INPUT__ +namespace util {namespace stream { + +class ChainPosition; + +/* Worker that reads input into blocks, ensuring that blocks contain whole + * lines. Assumes that the maximum size of a line is less than the block size + */ +class LineInput { + public: + // Takes ownership upon thread execution. + explicit LineInput(int fd); + + void Run(const ChainPosition &position); + + private: + int fd_; +}; + +}} // namespaces +#endif // UTIL_STREAM_LINE_INPUT__ diff --git a/klm/util/stream/multi_progress.cc b/klm/util/stream/multi_progress.cc new file mode 100644 index 00000000..8ba10386 --- /dev/null +++ b/klm/util/stream/multi_progress.cc @@ -0,0 +1,86 @@ +#include "util/stream/multi_progress.hh" + +// TODO: merge some functionality with the simple progress bar? +#include "util/ersatz_progress.hh" + +#include +#include + +#include + +#if !defined(_WIN32) && !defined(_WIN64) +#include +#endif + +namespace util { namespace stream { + +namespace { +const char kDisplayCharacters[] = "-+*#0123456789"; + +uint64_t Next(unsigned char stone, uint64_t complete) { + return (static_cast(stone + 1) * complete + MultiProgress::kWidth - 1) / MultiProgress::kWidth; +} + +} // namespace + +MultiProgress::MultiProgress() : active_(false), complete_(std::numeric_limits::max()), character_handout_(0) {} + +MultiProgress::~MultiProgress() { + if (active_ && complete_ != std::numeric_limits::max()) + std::cerr << '\n'; +} + +void MultiProgress::Activate() { + active_ = +#if !defined(_WIN32) && !defined(_WIN64) + // Is stderr a terminal? + (isatty(2) == 1) +#else + true +#endif + ; +} + +void MultiProgress::SetTarget(uint64_t complete) { + if (!active_) return; + complete_ = complete; + if (!complete) complete_ = 1; + memset(display_, 0, sizeof(display_)); + character_handout_ = 0; + std::cerr << kProgressBanner; +} + +WorkerProgress MultiProgress::Add() { + if (!active_) + return WorkerProgress(std::numeric_limits::max(), *this, '\0'); + std::size_t character_index; + { + boost::unique_lock lock(mutex_); + character_index = character_handout_++; + if (character_handout_ == sizeof(kDisplayCharacters) - 1) + character_handout_ = 0; + } + return WorkerProgress(Next(0, complete_), *this, kDisplayCharacters[character_index]); +} + +void MultiProgress::Finished() { + if (!active_ || complete_ == std::numeric_limits::max()) return; + std::cerr << '\n'; + complete_ = std::numeric_limits::max(); +} + +void MultiProgress::Milestone(WorkerProgress &worker) { + if (!active_ || complete_ == std::numeric_limits::max()) return; + unsigned char stone = std::min(static_cast(kWidth), worker.current_ * kWidth / complete_); + for (char *i = &display_[worker.stone_]; i < &display_[stone]; ++i) { + *i = worker.character_; + } + worker.next_ = Next(stone, complete_); + worker.stone_ = stone; + { + boost::unique_lock lock(mutex_); + std::cerr << '\r' << display_ << std::flush; + } +} + +}} // namespaces diff --git a/klm/util/stream/multi_progress.hh b/klm/util/stream/multi_progress.hh new file mode 100644 index 00000000..c4dd45a9 --- /dev/null +++ b/klm/util/stream/multi_progress.hh @@ -0,0 +1,90 @@ +/* Progress bar suitable for chains of workers */ +#ifndef UTIL_MULTI_PROGRESS__ +#define UTIL_MULTI_PROGRESS__ + +#include + +#include + +#include + +namespace util { namespace stream { + +class WorkerProgress; + +class MultiProgress { + public: + static const unsigned char kWidth = 100; + + MultiProgress(); + + ~MultiProgress(); + + // Turns on showing (requires SetTarget too). + void Activate(); + + void SetTarget(uint64_t complete); + + WorkerProgress Add(); + + void Finished(); + + private: + friend class WorkerProgress; + void Milestone(WorkerProgress &worker); + + bool active_; + + uint64_t complete_; + + boost::mutex mutex_; + + // \0 at the end. + char display_[kWidth + 1]; + + std::size_t character_handout_; + + MultiProgress(const MultiProgress &); + MultiProgress &operator=(const MultiProgress &); +}; + +class WorkerProgress { + public: + // Default contrutor must be initialized with operator= later. + WorkerProgress() : parent_(NULL) {} + + // Not threadsafe for the same worker by default. + WorkerProgress &operator++() { + if (++current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + WorkerProgress &operator+=(uint64_t amount) { + current_ += amount; + if (current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + private: + friend class MultiProgress; + WorkerProgress(uint64_t next, MultiProgress &parent, char character) + : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {} + + uint64_t current_, next_; + + MultiProgress *parent_; + + // Previous milestone reached. + unsigned char stone_; + + // Character to display in bar. + char character_; +}; + +}} // namespaces + +#endif // UTIL_MULTI_PROGRESS__ diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh new file mode 100644 index 00000000..be6c11ea --- /dev/null +++ b/klm/util/stream/sort.hh @@ -0,0 +1,542 @@ +/* Usage: + * Sort sorter(temp, compare); + * Chain(config) >> Read(file) >> sorter.Unsorted(); + * Stream stream; + * Chain chain(config) >> sorter.Sorted(internal_config, lazy_config) >> stream; + * + * Note that sorter must outlive any threads that use Unsorted or Sorted. + * + * Combiners take the form: + * bool operator()(void *into, const void *option, const Compare &compare) const + * which returns true iff a combination happened. The sorting algorithm + * guarantees compare(into, option). But it does not guarantee + * compare(option, into). + * Currently, combining is only done in merge steps, not during on-the-fly + * sort. Use a hash table for that. + */ + +#ifndef UTIL_STREAM_SORT__ +#define UTIL_STREAM_SORT__ + +#include "util/stream/chain.hh" +#include "util/stream/config.hh" +#include "util/stream/io.hh" +#include "util/stream/stream.hh" +#include "util/stream/timer.hh" + +#include "util/file.hh" +#include "util/scoped.hh" +#include "util/sized_iterator.hh" + +#include +#include +#include +#include + +namespace util { +namespace stream { + +struct NeverCombine { + template bool operator()(const void *, const void *, const Compare &) const { + return false; + } +}; + +// Manage the offsets of sorted blocks in a file. +class Offsets { + public: + explicit Offsets(int fd) : log_(fd) { + Reset(); + } + + int File() const { return log_; } + + void Append(uint64_t length) { + if (!length) return; + ++block_count_; + if (length == cur_.length) { + ++cur_.run; + return; + } + WriteOrThrow(log_, &cur_, sizeof(Entry)); + cur_.length = length; + cur_.run = 1; + } + + void FinishedAppending() { + WriteOrThrow(log_, &cur_, sizeof(Entry)); + SeekOrThrow(log_, sizeof(Entry)); // Skip 0,0 at beginning. + cur_.run = 0; + if (block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + } + + uint64_t RemainingBlocks() const { return block_count_; } + + uint64_t TotalOffset() const { return output_sum_; } + + uint64_t PeekSize() const { + return cur_.length; + } + + uint64_t NextSize() { + assert(block_count_); + uint64_t ret = cur_.length; + output_sum_ += ret; + + --cur_.run; + --block_count_; + if (!cur_.run && block_count_) { + ReadOrThrow(log_, &cur_, sizeof(Entry)); + assert(cur_.length); + assert(cur_.run); + } + return ret; + } + + void Reset() { + SeekOrThrow(log_, 0); + ResizeOrThrow(log_, 0); + cur_.length = 0; + cur_.run = 0; + block_count_ = 0; + output_sum_ = 0; + } + + private: + int log_; + + struct Entry { + uint64_t length; + uint64_t run; + }; + Entry cur_; + + uint64_t block_count_; + + uint64_t output_sum_; +}; + +// A priority queue of entries backed by file buffers +template class MergeQueue { + public: + MergeQueue(int fd, std::size_t buffer_size, std::size_t entry_size, const Compare &compare) + : queue_(Greater(compare)), in_(fd), buffer_size_(buffer_size), entry_size_(entry_size) {} + + void Push(void *base, uint64_t offset, uint64_t amount) { + queue_.push(Entry(base, in_, offset, amount, buffer_size_)); + } + + const void *Top() const { + return queue_.top().Current(); + } + + void Pop() { + Entry top(queue_.top()); + queue_.pop(); + if (top.Increment(in_, buffer_size_, entry_size_)) + queue_.push(top); + } + + std::size_t Size() const { + return queue_.size(); + } + + bool Empty() const { + return queue_.empty(); + } + + private: + // Priority queue contains these entries. + class Entry { + public: + Entry() {} + + Entry(void *base, int fd, uint64_t offset, uint64_t amount, std::size_t buf_size) { + offset_ = offset; + remaining_ = amount; + buffer_end_ = static_cast(base) + buf_size; + Read(fd, buf_size); + } + + bool Increment(int fd, std::size_t buf_size, std::size_t entry_size) { + current_ += entry_size; + if (current_ != buffer_end_) return true; + return Read(fd, buf_size); + } + + const void *Current() const { return current_; } + + private: + bool Read(int fd, std::size_t buf_size) { + current_ = buffer_end_ - buf_size; + std::size_t amount; + if (static_cast(buf_size) < remaining_) { + amount = buf_size; + } else if (!remaining_) { + return false; + } else { + amount = remaining_; + buffer_end_ = current_ + remaining_; + } + PReadOrThrow(fd, current_, amount, offset_); + offset_ += amount; + assert(current_ <= buffer_end_); + remaining_ -= amount; + return true; + } + + // Buffer + uint8_t *current_, *buffer_end_; + // File + uint64_t remaining_, offset_; + }; + + // Wrapper comparison function for queue entries. + class Greater : public std::binary_function { + public: + explicit Greater(const Compare &compare) : compare_(compare) {} + + bool operator()(const Entry &first, const Entry &second) const { + return compare_(second.Current(), first.Current()); + } + + private: + const Compare compare_; + }; + + typedef std::priority_queue, Greater> Queue; + Queue queue_; + + const int in_; + const std::size_t buffer_size_; + const std::size_t entry_size_; +}; + +/* A worker object that merges. If the number of pieces to merge exceeds the + * arity, it outputs multiple sorted blocks, recording to out_offsets. + * However, users will only every see a single sorted block out output because + * Sort::Sorted insures the arity is higher than the number of pieces before + * returning this. + */ +template class MergingReader { + public: + MergingReader(int in, Offsets *in_offsets, Offsets *out_offsets, std::size_t buffer_size, std::size_t total_memory, const Compare &compare, const Combine &combine) : + compare_(compare), combine_(combine), + in_(in), + in_offsets_(in_offsets), out_offsets_(out_offsets), + buffer_size_(buffer_size), total_memory_(total_memory) {} + + void Run(const ChainPosition &position) { + Run(position, false); + } + + void Run(const ChainPosition &position, bool assert_one) { + // Special case: nothing to read. + if (!in_offsets_->RemainingBlocks()) { + Link l(position); + l.Poison(); + return; + } + // If there's just one entry, just read. + if (in_offsets_->RemainingBlocks() == 1) { + // Sequencing is important. + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t amount = in_offsets_->NextSize(); + ReadSingle(offset, amount, position); + if (out_offsets_) out_offsets_->Append(amount); + return; + } + + Stream str(position); + scoped_malloc buffer(MallocOrThrow(total_memory_)); + uint8_t *const buffer_end = static_cast(buffer.get()) + total_memory_; + + const std::size_t entry_size = position.GetChain().EntrySize(); + + while (in_offsets_->RemainingBlocks()) { + // Use bigger buffers if there's less remaining. + uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks()); + per_buffer -= per_buffer % entry_size; + assert(per_buffer); + + // Populate queue. + MergeQueue queue(in_, per_buffer, entry_size, compare_); + for (uint8_t *buf = static_cast(buffer.get()); + in_offsets_->RemainingBlocks() && (buf + std::min(per_buffer, in_offsets_->PeekSize()) <= buffer_end);) { + uint64_t offset = in_offsets_->TotalOffset(); + uint64_t size = in_offsets_->NextSize(); + queue.Push(buf, offset, size); + buf += static_cast(std::min(size, per_buffer)); + } + // This shouldn't happen but it's probably better to die than loop indefinitely. + if (queue.Size() < 2 && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: not merging at least two stripes." << std::endl; + abort(); + } + if (assert_one && in_offsets_->RemainingBlocks()) { + std::cerr << "Bug in sort implementation: should only be one merge group for lazy sort" << std::endl; + abort(); + } + + uint64_t written = 0; + // Merge including combiner support. + memcpy(str.Get(), queue.Top(), entry_size); + for (queue.Pop(); !queue.Empty(); queue.Pop()) { + if (!combine_(str.Get(), queue.Top(), compare_)) { + ++written; ++str; + memcpy(str.Get(), queue.Top(), entry_size); + } + } + ++written; ++str; + if (out_offsets_) + out_offsets_->Append(written * entry_size); + } + str.Poison(); + } + + private: + void ReadSingle(uint64_t offset, const uint64_t size, const ChainPosition &position) { + // Special case: only one to read. + const uint64_t end = offset + size; + const uint64_t block_size = position.GetChain().BlockSize(); + Link l(position); + for (; offset + block_size < end; ++l, offset += block_size) { + PReadOrThrow(in_, l->Get(), block_size, offset); + l->SetValidSize(block_size); + } + PReadOrThrow(in_, l->Get(), end - offset, offset); + l->SetValidSize(end - offset); + (++l).Poison(); + return; + } + + Compare compare_; + Combine combine_; + + int in_; + + protected: + Offsets *in_offsets_; + + private: + Offsets *out_offsets_; + + std::size_t buffer_size_; + std::size_t total_memory_; +}; + +// The lazy step owns the remaining files. This keeps track of them. +template class OwningMergingReader : public MergingReader { + private: + typedef MergingReader P; + public: + OwningMergingReader(int data, const Offsets &offsets, std::size_t buffer, std::size_t lazy, const Compare &compare, const Combine &combine) + : P(data, NULL, NULL, buffer, lazy, compare, combine), + data_(data), + offsets_(offsets) {} + + void Run(const ChainPosition &position) { + P::in_offsets_ = &offsets_; + scoped_fd data(data_); + scoped_fd offsets_file(offsets_.File()); + P::Run(position, true); + } + + private: + int data_; + Offsets offsets_; +}; + +// Don't use this directly. Worker that sorts blocks. +template class BlockSorter { + public: + BlockSorter(Offsets &offsets, const Compare &compare) : + offsets_(&offsets), compare_(compare) {} + + void Run(const ChainPosition &position) { + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + // Record the size of each block in a separate file. + offsets_->Append(link->ValidSize()); + void *end = static_cast(link->Get()) + link->ValidSize(); + std::sort( + SizedIt(link->Get(), entry_size), + SizedIt(end, entry_size), + compare_); + } + offsets_->FinishedAppending(); + } + + private: + Offsets *offsets_; + SizedCompare compare_; +}; + +class BadSortConfig : public Exception { + public: + BadSortConfig() throw() {} + ~BadSortConfig() throw() {} +}; + +template class Sort { + public: + Sort(Chain &in, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = Combine()) + : config_(config), + data_(MakeTemp(config.temp_prefix)), + offsets_file_(MakeTemp(config.temp_prefix)), offsets_(offsets_file_.get()), + compare_(compare), combine_(combine), + entry_size_(in.EntrySize()) { + UTIL_THROW_IF(!entry_size_, BadSortConfig, "Sorting entries of size 0"); + // Make buffer_size a multiple of the entry_size. + config_.buffer_size -= config_.buffer_size % entry_size_; + UTIL_THROW_IF(!config_.buffer_size, BadSortConfig, "Sort buffer too small"); + UTIL_THROW_IF(config_.total_memory < config_.buffer_size * 4, BadSortConfig, "Sorting memory " << config_.total_memory << " is too small for four buffers (two read and two write)."); + in >> BlockSorter(offsets_, compare_) >> WriteAndRecycle(data_.get()); + } + + uint64_t Size() const { + return SizeOrThrow(data_.get()); + } + + // Do merge sort, terminating when lazy merge could be done with the + // specified memory. Return the minimum memory necessary to do lazy merge. + std::size_t Merge(std::size_t lazy_memory) { + if (offsets_.RemainingBlocks() <= 1) return 0; + const uint64_t lazy_arity = std::max(1, lazy_memory / config_.buffer_size); + uint64_t size = Size(); + /* No overflow because + * offsets_.RemainingBlocks() * config_.buffer_size <= lazy_memory || + * size < lazy_memory + */ + if (offsets_.RemainingBlocks() <= lazy_arity || size <= static_cast(lazy_memory)) + return std::min(size, offsets_.RemainingBlocks() * config_.buffer_size); + + scoped_fd data2(MakeTemp(config_.temp_prefix)); + int fd_in = data_.get(), fd_out = data2.get(); + scoped_fd offsets2_file(MakeTemp(config_.temp_prefix)); + Offsets offsets2(offsets2_file.get()); + Offsets *offsets_in = &offsets_, *offsets_out = &offsets2; + + // Double buffered writing. + ChainConfig chain_config; + chain_config.entry_size = entry_size_; + chain_config.block_count = 2; + chain_config.total_memory = config_.buffer_size * 2; + Chain chain(chain_config); + + while (offsets_in->RemainingBlocks() > lazy_arity) { + if (size <= static_cast(lazy_memory)) break; + std::size_t reading_memory = config_.total_memory - 2 * config_.buffer_size; + if (size < static_cast(reading_memory)) { + reading_memory = static_cast(size); + } + SeekOrThrow(fd_in, 0); + chain >> + MergingReader( + fd_in, + offsets_in, offsets_out, + config_.buffer_size, + reading_memory, + compare_, combine_) >> + WriteAndRecycle(fd_out); + chain.Wait(); + offsets_out->FinishedAppending(); + ResizeOrThrow(fd_in, 0); + offsets_in->Reset(); + std::swap(fd_in, fd_out); + std::swap(offsets_in, offsets_out); + size = SizeOrThrow(fd_in); + } + + SeekOrThrow(fd_in, 0); + if (fd_in == data2.get()) { + data_.reset(data2.release()); + offsets_file_.reset(offsets2_file.release()); + offsets_ = offsets2; + } + if (offsets_.RemainingBlocks() <= 1) return 0; + // No overflow because the while loop exited. + return std::min(size, offsets_.RemainingBlocks() * static_cast(config_.buffer_size)); + } + + // Output to chain, using this amount of memory, maximum, for lazy merge + // sort. + void Output(Chain &out, std::size_t lazy_memory) { + Merge(lazy_memory); + out.SetProgressTarget(Size()); + out >> OwningMergingReader(data_.get(), offsets_, config_.buffer_size, lazy_memory, compare_, combine_); + data_.release(); + offsets_file_.release(); + } + + /* If a pipeline step is reading sorted input and writing to a different + * sort order, then there's a trade-off between using RAM to read lazily + * (avoiding copying the file) and using RAM to increase block size and, + * therefore, decrease the number of merge sort passes in the next + * iteration. + * + * Merge sort takes log_{arity}(pieces) passes. Thus, each time the chain + * block size is multiplied by arity, the number of output passes decreases + * by one. Up to a constant, then, log_{arity}(chain) is the number of + * passes saved. Chain simply divides the memory evenly over all blocks. + * + * Lazy sort saves this many passes (up to a constant) + * log_{arity}((memory-lazy)/block_count) + 1 + * Non-lazy sort saves this many passes (up to the same constant): + * log_{arity}(memory/block_count) + * Add log_{arity}(block_count) to both: + * log_{arity}(memory-lazy) + 1 versus log_{arity}(memory) + * Take arity to the power of both sizes (arity > 1) + * (memory - lazy)*arity versus memory + * Solve for lazy + * lazy = memory * (arity - 1) / arity + */ + std::size_t DefaultLazy() { + float arity = static_cast(config_.total_memory / config_.buffer_size); + return static_cast(static_cast(config_.total_memory) * (arity - 1.0) / arity); + } + + // Same as Output with default lazy memory setting. + void Output(Chain &out) { + Output(out, DefaultLazy()); + } + + // Completely merge sort and transfer ownership to the caller. + int StealCompleted() { + // Merge all the way. + Merge(0); + SeekOrThrow(data_.get(), 0); + offsets_file_.reset(); + return data_.release(); + } + + private: + SortConfig config_; + + scoped_fd data_; + + scoped_fd offsets_file_; + Offsets offsets_; + + const Compare compare_; + const Combine combine_; + const std::size_t entry_size_; +}; + +// returns bytes to be read on demand. +template uint64_t BlockingSort(Chain &chain, const SortConfig &config, const Compare &compare = Compare(), const Combine &combine = NeverCombine()) { + Sort sorter(chain, config, compare, combine); + chain.Wait(true); + uint64_t size = sorter.Size(); + sorter.Output(chain); + return size; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_SORT__ diff --git a/klm/util/stream/sort_test.cc b/klm/util/stream/sort_test.cc new file mode 100644 index 00000000..fd7705cd --- /dev/null +++ b/klm/util/stream/sort_test.cc @@ -0,0 +1,62 @@ +#include "util/stream/sort.hh" + +#define BOOST_TEST_MODULE SortTest +#include + +#include + +#include + +namespace util { namespace stream { namespace { + +struct CompareUInt64 : public std::binary_function { + bool operator()(const void *first, const void *second) const { + return *static_cast(first) < *reinterpret_cast(second); + } +}; + +const uint64_t kSize = 100000; + +struct Putter { + Putter(std::vector &shuffled) : shuffled_(shuffled) {} + + void Run(const ChainPosition &position) { + Stream put_shuffled(position); + for (uint64_t i = 0; i < shuffled_.size(); ++i, ++put_shuffled) { + *static_cast(put_shuffled.Get()) = shuffled_[i]; + } + put_shuffled.Poison(); + } + std::vector &shuffled_; +}; + +BOOST_AUTO_TEST_CASE(FromShuffled) { + std::vector shuffled; + shuffled.reserve(kSize); + for (uint64_t i = 0; i < kSize; ++i) { + shuffled.push_back(i); + } + std::random_shuffle(shuffled.begin(), shuffled.end()); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 800; + config.block_count = 3; + + SortConfig merge_config; + merge_config.temp_prefix = "sort_test_temp"; + merge_config.buffer_size = 800; + merge_config.total_memory = 3300; + + Chain chain(config); + chain >> Putter(shuffled); + BlockingSort(chain, merge_config, CompareUInt64(), NeverCombine()); + Stream sorted; + chain >> sorted >> kRecycle; + for (uint64_t i = 0; i < kSize; ++i, ++sorted) { + BOOST_CHECK_EQUAL(i, *static_cast(sorted.Get())); + } + BOOST_CHECK(!sorted); +} + +}}} // namespaces diff --git a/klm/util/stream/stream.hh b/klm/util/stream/stream.hh new file mode 100644 index 00000000..6ff45b82 --- /dev/null +++ b/klm/util/stream/stream.hh @@ -0,0 +1,74 @@ +#ifndef UTIL_STREAM_STREAM__ +#define UTIL_STREAM_STREAM__ + +#include "util/stream/chain.hh" + +#include + +#include +#include + +namespace util { +namespace stream { + +class Stream : boost::noncopyable { + public: + Stream() : current_(NULL), end_(NULL) {} + + void Init(const ChainPosition &position) { + entry_size_ = position.GetChain().EntrySize(); + block_size_ = position.GetChain().BlockSize(); + block_it_.Init(position); + StartBlock(); + } + + explicit Stream(const ChainPosition &position) { + Init(position); + } + + operator bool() const { return current_ != NULL; } + bool operator!() const { return current_ == NULL; } + + const void *Get() const { return current_; } + void *Get() { return current_; } + + void Poison() { + block_it_->SetValidSize(current_ - static_cast(block_it_->Get())); + ++block_it_; + block_it_.Poison(); + } + + Stream &operator++() { + assert(*this); + assert(current_ < end_); + current_ += entry_size_; + if (current_ == end_) { + ++block_it_; + StartBlock(); + } + return *this; + } + + private: + void StartBlock() { + for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {} + current_ = static_cast(block_it_->Get()); + end_ = current_ + block_it_->ValidSize(); + } + + uint8_t *current_, *end_; + + std::size_t entry_size_; + std::size_t block_size_; + + Link block_it_; +}; + +inline Chain &operator>>(Chain &chain, Stream &stream) { + stream.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_STREAM__ diff --git a/klm/util/stream/stream_test.cc b/klm/util/stream/stream_test.cc new file mode 100644 index 00000000..6575d50d --- /dev/null +++ b/klm/util/stream/stream_test.cc @@ -0,0 +1,35 @@ +#include "util/stream/io.hh" + +#include "util/stream/stream.hh" +#include "util/file.hh" + +#define BOOST_TEST_MODULE StreamTest +#include + +#include + +namespace util { namespace stream { namespace { + +BOOST_AUTO_TEST_CASE(StreamTest) { + scoped_fd in(MakeTemp("io_test_temp")); + for (uint64_t i = 0; i < 100000; ++i) { + WriteOrThrow(in.get(), &i, sizeof(uint64_t)); + } + SeekOrThrow(in.get(), 0); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 100; + config.block_count = 12; + + Stream s; + Chain chain(config); + chain >> Read(in.get()) >> s >> kRecycle; + uint64_t i = 0; + for (; s; ++s, ++i) { + BOOST_CHECK_EQUAL(i, *static_cast(s.Get())); + } + BOOST_CHECK_EQUAL(100000ULL, i); +} + +}}} // namespaces diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh new file mode 100644 index 00000000..50e94fe8 --- /dev/null +++ b/klm/util/stream/timer.hh @@ -0,0 +1,14 @@ +#ifndef UTIL_STREAM_TIMER__ +#define UTIL_STREAM_TIMER__ + +#include + +#if BOOST_VERSION >= 104800 +#include +#define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) +#else +//#warning Using Boost older than 1.48. Timing information will not be available. +#define UTIL_TIMER(str) +#endif + +#endif // UTIL_STREAM_TIMER__ -- cgit v1.2.3 From 8b9aae7cff1efd1be195cdd000b21546bd5fca04 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 19 Jan 2013 19:09:48 -0500 Subject: updated version of boost.m4 and automatically build kenneth's LM builder --- Makefile.am | 2 + configure.ac | 7 +- corpus/cut-corpus.pl | 2 +- klm/lm/builder/Makefile.am | 28 +++ klm/util/Makefile.am | 2 +- klm/util/double-conversion/Makefile.am | 2 +- klm/util/stream/Makefile.am | 20 ++ klm/util/stream/sort.hh | 3 +- m4/boost.m4 | 322 +++++++++++++++++++++++++-------- 9 files changed, 311 insertions(+), 77 deletions(-) create mode 100644 klm/lm/builder/Makefile.am create mode 100644 klm/util/stream/Makefile.am (limited to 'klm/util/stream') diff --git a/Makefile.am b/Makefile.am index c2444928..17190d27 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,8 +5,10 @@ SUBDIRS = \ utils \ mteval \ klm/util/double-conversion \ + klm/util/stream \ klm/util \ klm/lm \ + klm/lm/builder \ klm/search \ decoder \ training \ diff --git a/configure.ac b/configure.ac index d6030752..a1e5ad84 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[2013-01-15]) +AC_INIT([cdec],[2013-01-19]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -15,7 +15,10 @@ BOOST_REQUIRE([1.44]) BOOST_PROGRAM_OPTIONS BOOST_SYSTEM BOOST_SERIALIZATION +BOOST_CHRONO +BOOST_TIMER BOOST_TEST +BOOST_THREADS AM_PATH_PYTHON AC_CHECK_HEADER(dlfcn.h,AC_DEFINE(HAVE_DLFCN_H)) AC_CHECK_LIB(dl, dlopen) @@ -111,8 +114,10 @@ AC_CONFIG_FILES([word-aligner/Makefile]) # KenLM stuff AC_CONFIG_FILES([klm/util/double-conversion/Makefile]) +AC_CONFIG_FILES([klm/util/stream/Makefile]) AC_CONFIG_FILES([klm/util/Makefile]) AC_CONFIG_FILES([klm/lm/Makefile]) +AC_CONFIG_FILES([klm/lm/builder/Makefile]) AC_CONFIG_FILES([klm/search/Makefile]) # training stuff diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index 7daac0e2..0af3b23c 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -22,7 +22,7 @@ for my $ff (@ind) { while(<>) { chomp; - my @fields = split / \|\|\| /; + my @fields = split /\s*\|\|\|\s*/; my @sf; for my $i (@o) { my $y = $fields[$i]; diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am new file mode 100644 index 00000000..00444256 --- /dev/null +++ b/klm/lm/builder/Makefile.am @@ -0,0 +1,28 @@ +bin_PROGRAMS = builder + +builder_SOURCES = \ + main.cc \ + adjust_counts.cc \ + adjust_counts.hh \ + corpus_count.cc \ + corpus_count.hh \ + discount.hh \ + header_info.hh \ + initial_probabilities.cc \ + initial_probabilities.hh \ + interpolate.cc \ + interpolate.hh \ + joint_order.hh \ + multi_stream.hh \ + ngram.hh \ + ngram_stream.hh \ + pipeline.cc \ + pipeline.hh \ + print.cc \ + print.hh \ + sort.hh + +builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_TIMER_LIBS) $(BOOST_CHRONO_LIBS) $(BOOST_THREAD_LIBS) + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am index 294ebc0a..248cc844 100644 --- a/klm/util/Makefile.am +++ b/klm/util/Makefile.am @@ -54,4 +54,4 @@ libklm_util_a_SOURCES = \ string_piece.cc \ usage.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/double-conversion/Makefile.am b/klm/util/double-conversion/Makefile.am index eb6616f7..dfcfb009 100644 --- a/klm/util/double-conversion/Makefile.am +++ b/klm/util/double-conversion/Makefile.am @@ -20,4 +20,4 @@ libklm_util_double_a_SOURCES = \ fixed-dtoa.cc \ strtod.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm -I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/Makefile.am b/klm/util/stream/Makefile.am new file mode 100644 index 00000000..f18cbedb --- /dev/null +++ b/klm/util/stream/Makefile.am @@ -0,0 +1,20 @@ +noinst_LIBRARIES = libklm_util_stream.a + +libklm_util_stream_a_SOURCES = \ + block.hh \ + chain.cc \ + chain.hh \ + config.hh \ + io.cc \ + io.hh \ + line_input.cc \ + line_input.hh \ + multi_progress.cc \ + multi_progress.hh \ + sort.hh \ + stream.hh \ + timer.hh + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm + +#-I$(top_srcdir)/klm/util/double-conversion diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index be6c11ea..df57fa41 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,7 +259,8 @@ template class MergingReader { while (in_offsets_->RemainingBlocks()) { // Use bigger buffers if there's less remaining. - uint64_t per_buffer = std::max(buffer_size_, total_memory_ / in_offsets_->RemainingBlocks()); + uint64_t per_buffer = std::max(static_cast(buffer_size_), + static_cast(total_memory_ / in_offsets_->RemainingBlocks())); per_buffer -= per_buffer % entry_size; assert(per_buffer); diff --git a/m4/boost.m4 b/m4/boost.m4 index 7e0ed075..027e039b 100644 --- a/m4/boost.m4 +++ b/m4/boost.m4 @@ -1,5 +1,5 @@ # boost.m4: Locate Boost headers and libraries for autoconf-based projects. -# Copyright (C) 2007, 2008, 2009 Benoit Sigoure +# Copyright (C) 2007, 2008, 2009, 2010, 2011 Benoit Sigoure # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ # along with this program. If not, see . m4_define([_BOOST_SERIAL], [m4_translit([ -# serial 12 +# serial 16 ], [# ], [])]) @@ -45,15 +45,19 @@ m4_define([_BOOST_SERIAL], [m4_translit([ # Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL. If you don't, don't worry, # simply read the README, it will show you what to do step by step. -m4_pattern_forbid([^_?BOOST_]) +m4_pattern_forbid([^_?(BOOST|Boost)_]) # _BOOST_SED_CPP(SED-PROGRAM, PROGRAM, # [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # -------------------------------------------------------- # Same as AC_EGREP_CPP, but leave the result in conftest.i. -# PATTERN is *not* overquoted, as in AC_EGREP_CPP. It could be useful -# to turn this into a macro which extracts the value of any macro. +# +# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP. It is expanded +# in double-quotes, so escape your double quotes. +# +# It could be useful to turn this into a macro which extracts the +# value of any macro. m4_define([_BOOST_SED_CPP], [AC_LANG_PREPROC_REQUIRE()dnl AC_REQUIRE([AC_PROG_SED])dnl @@ -98,6 +102,7 @@ set x $boost_version_req 0 0 0 IFS=$boost_save_IFS shift boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"` +boost_version_req_string=$[1].$[2].$[3] AC_ARG_WITH([boost], [AS_HELP_STRING([--with-boost=DIR], [prefix of Boost $1 @<:@guess@:>@])])dnl @@ -113,9 +118,9 @@ if test x"$BOOST_ROOT" != x; then fi fi AC_SUBST([DISTCHECK_CONFIGURE_FLAGS], - ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"]) + ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl boost_save_CPPFLAGS=$CPPFLAGS - AC_CACHE_CHECK([for Boost headers version >= $boost_version_req], + AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string], [boost_cv_inc_path], [boost_cv_inc_path=no AC_LANG_PUSH([C++])dnl @@ -183,24 +188,25 @@ AC_LANG_POP([C++])dnl ]) case $boost_cv_inc_path in #( no) - boost_errmsg="cannot find Boost headers version >= $boost_version_req" + boost_errmsg="cannot find Boost headers version >= $boost_version_req_string" m4_if([$2], [], [AC_MSG_ERROR([$boost_errmsg])], [AC_MSG_NOTICE([$boost_errmsg])]) $2 ;;#( yes) BOOST_CPPFLAGS= - AC_DEFINE([HAVE_BOOST], [1], - [Defined if the requested minimum BOOST version is satisfied]) ;;#( *) - AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"]) + AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl ;; esac + if test x"$boost_cv_inc_path" != xno; then + AC_DEFINE([HAVE_BOOST], [1], + [Defined if the requested minimum BOOST version is satisfied]) AC_CACHE_CHECK([for Boost's header version], [boost_cv_lib_version], [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl - _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;g;}], + _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}], [#include boost-lib-version = BOOST_LIB_VERSION], [boost_cv_lib_version=`cat conftest.i`])]) @@ -211,6 +217,7 @@ boost-lib-version = BOOST_LIB_VERSION], AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version]) ;; esac +fi CPPFLAGS=$boost_save_CPPFLAGS ])# BOOST_REQUIRE @@ -220,7 +227,7 @@ CPPFLAGS=$boost_save_CPPFLAGS # on the command line, static versions of the libraries will be looked up. AC_DEFUN([BOOST_STATIC], [AC_ARG_ENABLE([static-boost], - [AC_HELP_STRING([--enable-static-boost], + [AS_HELP_STRING([--enable-static-boost], [Prefer the static boost libraries over the shared ones [no]])], [enable_static_boost=yes], [enable_static_boost=no])])# BOOST_STATIC @@ -290,6 +297,7 @@ dnl The else branch is huge and wasn't intended on purpose. AC_LANG_PUSH([C++])dnl AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl +AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl BOOST_FIND_HEADER([$3]) boost_save_CPPFLAGS=$CPPFLAGS @@ -371,8 +379,8 @@ for boost_rtopt_ in $boost_rtopt '' -d; do boost_tmp_lib=$with_boost test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include} for boost_ldpath in "$boost_tmp_lib/lib" '' \ - /opt/local/lib /usr/local/lib /opt/lib /usr/lib \ - "$with_boost" C:/Boost/lib /lib /usr/lib64 /lib64 + /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \ + "$with_boost" C:/Boost/lib /lib* do test -e "$boost_ldpath" || continue boost_save_LDFLAGS=$LDFLAGS @@ -395,7 +403,16 @@ dnl generated only once above (before we start the for loops). LDFLAGS=$boost_save_LDFLAGS LIBS=$boost_save_LIBS if test x"$Boost_lib" = xyes; then - Boost_lib_LDFLAGS="-L$boost_ldpath -R$boost_ldpath" + # Because Boost is often installed in non-standard locations we want to + # hardcode the path to the library (with rpath). Here we assume that + # Libtool's macro was already invoked so we can steal its variable + # hardcode_libdir_flag_spec in order to get the right flags for ld. + boost_save_libdir=$libdir + libdir=$boost_ldpath + eval boost_rpath=\"$hardcode_libdir_flag_spec\" + libdir=$boost_save_libdir + Boost_lib_LDFLAGS="-L$boost_ldpath $boost_rpath" + Boost_lib_LDPATH="$boost_ldpath" break 6 else boost_failed_libs="$boost_failed_libs@$boost_lib@" @@ -410,14 +427,17 @@ rm -f conftest.$ac_objext ]) case $Boost_lib in #( no) _AC_MSG_LOG_CONFTEST - AC_MSG_ERROR([cannot not find the flags to link with Boost $1]) + AC_MSG_ERROR([cannot find the flags to link with Boost $1]) ;; esac -AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS]) -AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS]) +AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl +AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl +AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl CPPFLAGS=$boost_save_CPPFLAGS AS_VAR_POPDEF([Boost_lib])dnl AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl +AS_VAR_POPDEF([Boost_lib_LDPATH])dnl AS_VAR_POPDEF([Boost_lib_LIBS])dnl AC_LANG_POP([C++])dnl fi @@ -432,17 +452,31 @@ fi # The page http://beta.boost.org/doc/libs is useful: it gives the first release # version of each library (among other things). +# BOOST_DEFUN(LIBRARY, CODE) +# -------------------------- +# Define BOOST_ as a macro that runs CODE. +# +# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN. +m4_define([BOOST_DEFUN], +[m4_indir([AC_DEFUN], + m4_toupper([BOOST_$1]), +[m4_pushdef([BOOST_Library], [$1])dnl +$2 +m4_popdef([BOOST_Library])dnl +]) +]) + # BOOST_ARRAY() # ------------- # Look for Boost.Array -AC_DEFUN([BOOST_ARRAY], +BOOST_DEFUN([Array], [BOOST_FIND_HEADER([boost/array.hpp])]) # BOOST_ASIO() # ------------ # Look for Boost.Asio (new in Boost 1.35). -AC_DEFUN([BOOST_ASIO], +BOOST_DEFUN([Asio], [AC_REQUIRE([BOOST_SYSTEM])dnl BOOST_FIND_HEADER([boost/asio.hpp])]) @@ -450,14 +484,41 @@ BOOST_FIND_HEADER([boost/asio.hpp])]) # BOOST_BIND() # ------------ # Look for Boost.Bind -AC_DEFUN([BOOST_BIND], +BOOST_DEFUN([Bind], [BOOST_FIND_HEADER([boost/bind.hpp])]) +# BOOST_CHRONO() +# ------------------ +# Look for Boost.Chrono +BOOST_DEFUN([Chrono], +[# Do we have to check for Boost.System? This link-time dependency was +# added as of 1.35.0. If we have a version <1.35, we must not attempt to +# find Boost.System as it didn't exist by then. +if test $boost_major_version -ge 135; then + BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([chrono], [$1], + [boost/chrono.hpp], + [boost::chrono::system_clock::time_point d = boost::chrono::system_clock::now();]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS + +])# BOOST_CHRONO + + # BOOST_CONVERSION() # ------------------ # Look for Boost.Conversion (cast / lexical_cast) -AC_DEFUN([BOOST_CONVERSION], +BOOST_DEFUN([Conversion], [BOOST_FIND_HEADER([boost/cast.hpp]) BOOST_FIND_HEADER([boost/lexical_cast.hpp]) ])# BOOST_CONVERSION @@ -467,12 +528,31 @@ BOOST_FIND_HEADER([boost/lexical_cast.hpp]) # ----------------------------------- # Look for Boost.Date_Time. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_DATE_TIME], +BOOST_DEFUN([Date_Time], [BOOST_FIND_LIB([date_time], [$1], [boost/date_time/posix_time/posix_time.hpp], [boost::posix_time::ptime t;]) ])# BOOST_DATE_TIME +# BOOST_TIMER([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Timer. For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Timer], +[#check for Boost.System +BOOST_SYSTEM([$1]) +boost_system_save_LIBS=$LIBS +boost_system_save_LDFLAGS=$LDFLAGS +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" +BOOST_FIND_LIB([timer], [$1], + [boost/timer/timer.hpp], + [boost::timer::auto_cpu_timer t;]) +AC_SUBST([BOOST_SYSTEM_LIBS], ["$BOOST_SYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +LIBS=$boost_system_save_LIBS +LDFLAGS=$boost_system_save_LDFLAGS +])# BOOST_TIMER # BOOST_FILESYSTEM([PREFERRED-RT-OPT]) # ------------------------------------ @@ -480,7 +560,7 @@ AC_DEFUN([BOOST_DATE_TIME], # the documentation of BOOST_FIND_LIB above. # Do not check for boost/filesystem.hpp because this file was introduced in # 1.34. -AC_DEFUN([BOOST_FILESYSTEM], +BOOST_DEFUN([Filesystem], [# Do we have to check for Boost.System? This link-time dependency was # added as of 1.35.0. If we have a version <1.35, we must not attempt to # find Boost.System as it didn't exist by then. @@ -494,6 +574,9 @@ LIBS="$LIBS $BOOST_SYSTEM_LIBS" LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" BOOST_FIND_LIB([filesystem], [$1], [boost/filesystem/path.hpp], [boost::filesystem::path p;]) +if test $enable_static_boost = yes && test $boost_major_version -ge 135; then + AC_SUBST([BOOST_FILESYSTEM_LIBS], ["$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"]) +fi LIBS=$boost_filesystem_save_LIBS LDFLAGS=$boost_filesystem_save_LDFLAGS ])# BOOST_FILESYSTEM @@ -502,7 +585,7 @@ LDFLAGS=$boost_filesystem_save_LDFLAGS # BOOST_FOREACH() # --------------- # Look for Boost.Foreach -AC_DEFUN([BOOST_FOREACH], +BOOST_DEFUN([Foreach], [BOOST_FIND_HEADER([boost/foreach.hpp])]) @@ -513,14 +596,14 @@ AC_DEFUN([BOOST_FOREACH], # standalone. It can't be compiled because it triggers the following error: # boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std' # does not name a type -AC_DEFUN([BOOST_FORMAT], +BOOST_DEFUN([Format], [BOOST_FIND_HEADER([boost/format.hpp])]) # BOOST_FUNCTION() # ---------------- # Look for Boost.Function -AC_DEFUN([BOOST_FUNCTION], +BOOST_DEFUN([Function], [BOOST_FIND_HEADER([boost/function.hpp])]) @@ -528,37 +611,60 @@ AC_DEFUN([BOOST_FUNCTION], # ------------------------------- # Look for Boost.Graphs. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_GRAPH], +BOOST_DEFUN([Graph], [BOOST_FIND_LIB([graph], [$1], [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;]) ])# BOOST_GRAPH # BOOST_IOSTREAMS([PREFERRED-RT-OPT]) -# ------------------------------- +# ----------------------------------- # Look for Boost.IOStreams. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_IOSTREAMS], +BOOST_DEFUN([IOStreams], [BOOST_FIND_LIB([iostreams], [$1], [boost/iostreams/device/file_descriptor.hpp], - [boost::iostreams::file_descriptor fd(0); fd.close();]) + [boost::iostreams::file_descriptor fd; fd.close();]) ])# BOOST_IOSTREAMS # BOOST_HASH() # ------------ # Look for Boost.Functional/Hash -AC_DEFUN([BOOST_HASH], +BOOST_DEFUN([Hash], [BOOST_FIND_HEADER([boost/functional/hash.hpp])]) # BOOST_LAMBDA() # -------------- # Look for Boost.Lambda -AC_DEFUN([BOOST_LAMBDA], +BOOST_DEFUN([Lambda], [BOOST_FIND_HEADER([boost/lambda/lambda.hpp])]) +# BOOST_LOG([PREFERRED-RT-OPT]) +# ----------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log], +[BOOST_FIND_LIB([log], [$1], + [boost/log/core/core.hpp], + [boost::log::attribute a; a.get_value();]) +])# BOOST_LOG + + +# BOOST_LOG_SETUP([PREFERRED-RT-OPT]) +# ----------------------------------- +# Look for Boost.Log For the documentation of PREFERRED-RT-OPT, see the +# documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Log_Setup], +[AC_REQUIRE([BOOST_LOG])dnl +BOOST_FIND_LIB([log_setup], [$1], + [boost/log/utility/init/from_settings.hpp], + [boost::log::basic_settings bs; bs.empty();]) +])# BOOST_LOG_SETUP + + # BOOST_MATH() # ------------ # Look for Boost.Math @@ -567,21 +673,21 @@ AC_DEFUN([BOOST_LAMBDA], # libboost_math_c99f, libboost_math_c99l, libboost_math_tr1, # libboost_math_tr1f, libboost_math_tr1l). This macro must be fixed to do the # right thing anyway. -AC_DEFUN([BOOST_MATH], +BOOST_DEFUN([Math], [BOOST_FIND_HEADER([boost/math/special_functions.hpp])]) # BOOST_MULTIARRAY() # ------------------ # Look for Boost.MultiArray -AC_DEFUN([BOOST_MULTIARRAY], +BOOST_DEFUN([MultiArray], [BOOST_FIND_HEADER([boost/multi_array.hpp])]) # BOOST_NUMERIC_CONVERSION() # -------------------------- # Look for Boost.NumericConversion (policy-based numeric conversion) -AC_DEFUN([BOOST_NUMERIC_CONVERSION], +BOOST_DEFUN([Numeric_Conversion], [BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp]) ])# BOOST_NUMERIC_CONVERSION @@ -589,32 +695,76 @@ AC_DEFUN([BOOST_NUMERIC_CONVERSION], # BOOST_OPTIONAL() # ---------------- # Look for Boost.Optional -AC_DEFUN([BOOST_OPTIONAL], +BOOST_DEFUN([Optional], [BOOST_FIND_HEADER([boost/optional.hpp])]) # BOOST_PREPROCESSOR() # -------------------- # Look for Boost.Preprocessor -AC_DEFUN([BOOST_PREPROCESSOR], +BOOST_DEFUN([Preprocessor], [BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])]) +# BOOST_UNORDERED() +# ----------------- +# Look for Boost.Unordered +BOOST_DEFUN([Unordered], +[BOOST_FIND_HEADER([boost/unordered_map.hpp])]) + + +# BOOST_UUID() +# ------------ +# Look for Boost.Uuid +BOOST_DEFUN([Uuid], +[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])]) + + # BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT]) # ----------------------------------------- -# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, see -# the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_PROGRAM_OPTIONS], +# Look for Boost.Program_options. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Program_Options], [BOOST_FIND_LIB([program_options], [$1], [boost/program_options.hpp], [boost::program_options::options_description d("test");]) ])# BOOST_PROGRAM_OPTIONS + +# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG) +# ------------------------------------ +# Save VARIABLE, and define it via `python-config --FLAG`. +# Substitute BOOST_PYTHON_VARIABLE. +m4_define([_BOOST_PYTHON_CONFIG], +[AC_SUBST([BOOST_PYTHON_$1], + [`python-config --$2 2>/dev/null`])dnl +boost_python_save_$1=$$1 +$1="$$1 $BOOST_PYTHON_$1"]) + + +# BOOST_PYTHON([PREFERRED-RT-OPT]) +# -------------------------------- +# Look for Boost.Python. For the documentation of PREFERRED-RT-OPT, +# see the documentation of BOOST_FIND_LIB above. +BOOST_DEFUN([Python], +[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes]) +_BOOST_PYTHON_CONFIG([LDFLAGS], [ldflags]) +_BOOST_PYTHON_CONFIG([LIBS], [libs]) +m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl +BOOST_FIND_LIB([python], [$1], + [boost/python.hpp], + [], [BOOST_PYTHON_MODULE(empty) {}]) +CPPFLAGS=$boost_python_save_CPPFLAGS +LDFLAGS=$boost_python_save_LDFLAGS +LIBS=$boost_python_save_LIBS +])# BOOST_PYTHON + + # BOOST_REF() # ----------- # Look for Boost.Ref -AC_DEFUN([BOOST_REF], +BOOST_DEFUN([Ref], [BOOST_FIND_HEADER([boost/ref.hpp])]) @@ -622,7 +772,7 @@ AC_DEFUN([BOOST_REF], # ------------------------------- # Look for Boost.Regex. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_REGEX], +BOOST_DEFUN([Regex], [BOOST_FIND_LIB([regex], [$1], [boost/regex.hpp], [boost::regex exp("*"); boost::regex_match("foo", exp);]) @@ -633,19 +783,19 @@ AC_DEFUN([BOOST_REGEX], # --------------------------------------- # Look for Boost.Serialization. For the documentation of PREFERRED-RT-OPT, see # the documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SERIALIZATION], +BOOST_DEFUN([Serialization], [BOOST_FIND_LIB([serialization], [$1], [boost/archive/text_oarchive.hpp], [std::ostream* o = 0; // Cheap way to get an ostream... boost::archive::text_oarchive t(*o);]) -])# BOOST_SIGNALS +])# BOOST_SERIALIZATION # BOOST_SIGNALS([PREFERRED-RT-OPT]) # --------------------------------- # Look for Boost.Signals. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_SIGNALS], +BOOST_DEFUN([Signals], [BOOST_FIND_LIB([signals], [$1], [boost/signal.hpp], [boost::signal s;]) @@ -655,7 +805,7 @@ AC_DEFUN([BOOST_SIGNALS], # BOOST_SMART_PTR() # ----------------- # Look for Boost.SmartPtr -AC_DEFUN([BOOST_SMART_PTR], +BOOST_DEFUN([Smart_Ptr], [BOOST_FIND_HEADER([boost/scoped_ptr.hpp]) BOOST_FIND_HEADER([boost/shared_ptr.hpp]) ]) @@ -664,14 +814,14 @@ BOOST_FIND_HEADER([boost/shared_ptr.hpp]) # BOOST_STATICASSERT() # -------------------- # Look for Boost.StaticAssert -AC_DEFUN([BOOST_STATICASSERT], +BOOST_DEFUN([StaticAssert], [BOOST_FIND_HEADER([boost/static_assert.hpp])]) # BOOST_STRING_ALGO() # ------------------- # Look for Boost.StringAlgo -AC_DEFUN([BOOST_STRING_ALGO], +BOOST_DEFUN([String_Algo], [BOOST_FIND_HEADER([boost/algorithm/string.hpp]) ]) @@ -681,7 +831,7 @@ AC_DEFUN([BOOST_STRING_ALGO], # Look for Boost.System. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. This library was introduced in Boost # 1.35.0. -AC_DEFUN([BOOST_SYSTEM], +BOOST_DEFUN([System], [BOOST_FIND_LIB([system], [$1], [boost/system/error_code.hpp], [boost::system::error_code e; e.clear();]) @@ -692,7 +842,7 @@ AC_DEFUN([BOOST_SYSTEM], # ------------------------------ # Look for Boost.Test. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_TEST], +BOOST_DEFUN([Test], [m4_pattern_allow([^BOOST_CHECK$])dnl BOOST_FIND_LIB([unit_test_framework], [$1], [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);], @@ -707,25 +857,49 @@ BOOST_FIND_LIB([unit_test_framework], [$1], # Look for Boost.Thread. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. # FIXME: Provide an alias "BOOST_THREAD". -AC_DEFUN([BOOST_THREADS], +BOOST_DEFUN([Threads], [dnl Having the pthread flag is required at least on GCC3 where dnl boost/thread.hpp would complain if we try to compile without dnl -pthread on GNU/Linux. AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl boost_threads_save_LIBS=$LIBS +boost_threads_save_LDFLAGS=$LDFLAGS boost_threads_save_CPPFLAGS=$CPPFLAGS -LIBS="$LIBS $boost_cv_pthread_flag" +# Link-time dependency from thread to system was added as of 1.49.0. +if test $boost_major_version -ge 149; then +BOOST_SYSTEM([$1]) +fi # end of the Boost.System check. +m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl +LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS" # Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3, # boost/thread.hpp will trigger a #error if -pthread isn't used: # boost/config/requires_threads.hpp:47:5: #error "Compiler threading support # is not turned on. Please set the correct command line options for # threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)" CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag" -BOOST_FIND_LIB([thread], [$1], - [boost/thread.hpp], [boost::thread t; boost::mutex m;]) -BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $boost_cv_pthread_flag" + +# When compiling for the Windows platform, the threads library is named +# differently. +case $host_os in + (*mingw*) + BOOST_FIND_LIB([thread_win32], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + BOOST_THREAD_LDFLAGS=$BOOST_THREAD_WIN32_LDFLAGS + BOOST_THREAD_LDPATH=$BOOST_THREAD_WIN32_LDPATH + BOOST_THREAD_LIBS=$BOOST_THREAD_WIN32_LIBS + ;; + (*) + BOOST_FIND_LIB([thread], [$1], + [boost/thread.hpp], [boost::thread t; boost::mutex m;]) + ;; +esac + +BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag" +BOOST_THREAD_LDFLAGS="$BOOST_SYSTEM_LDFLAGS" BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag" LIBS=$boost_threads_save_LIBS +LDFLAGS=$boost_threads_save_LDFLAGS CPPFLAGS=$boost_threads_save_CPPFLAGS ])# BOOST_THREADS @@ -733,14 +907,14 @@ CPPFLAGS=$boost_threads_save_CPPFLAGS # BOOST_TOKENIZER() # ----------------- # Look for Boost.Tokenizer -AC_DEFUN([BOOST_TOKENIZER], +BOOST_DEFUN([Tokenizer], [BOOST_FIND_HEADER([boost/tokenizer.hpp])]) # BOOST_TRIBOOL() # --------------- # Look for Boost.Tribool -AC_DEFUN([BOOST_TRIBOOL], +BOOST_DEFUN([Tribool], [BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp]) BOOST_FIND_HEADER([boost/logic/tribool.hpp]) ]) @@ -749,14 +923,14 @@ BOOST_FIND_HEADER([boost/logic/tribool.hpp]) # BOOST_TUPLE() # ------------- # Look for Boost.Tuple -AC_DEFUN([BOOST_TUPLE], +BOOST_DEFUN([Tuple], [BOOST_FIND_HEADER([boost/tuple/tuple.hpp])]) # BOOST_TYPETRAITS() # -------------------- # Look for Boost.TypeTraits -AC_DEFUN([BOOST_TYPETRAITS], +BOOST_DEFUN([TypeTraits], [BOOST_FIND_HEADER([boost/type_traits.hpp])]) @@ -764,14 +938,14 @@ AC_DEFUN([BOOST_TYPETRAITS], # --------------- # Look for Boost.Utility (noncopyable, result_of, base-from-member idiom, # etc.) -AC_DEFUN([BOOST_UTILITY], +BOOST_DEFUN([Utility], [BOOST_FIND_HEADER([boost/utility.hpp])]) # BOOST_VARIANT() # --------------- # Look for Boost.Variant. -AC_DEFUN([BOOST_VARIANT], +BOOST_DEFUN([Variant], [BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp]) BOOST_FIND_HEADER([boost/variant.hpp])]) @@ -782,15 +956,15 @@ BOOST_FIND_HEADER([boost/variant.hpp])]) # call BOOST_THREADS first. # Look for Boost.Wave. For the documentation of PREFERRED-RT-OPT, see the # documentation of BOOST_FIND_LIB above. -AC_DEFUN([BOOST_WAVE], +BOOST_DEFUN([Wave], [AC_REQUIRE([BOOST_FILESYSTEM])dnl AC_REQUIRE([BOOST_DATE_TIME])dnl boost_wave_save_LIBS=$LIBS boost_wave_save_LDFLAGS=$LDFLAGS m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl -LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS\ +LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \ $BOOST_THREAD_LIBS" -LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS\ +LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \ $BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS" BOOST_FIND_LIB([wave], [$1], [boost/wave.hpp], @@ -803,7 +977,7 @@ LDFLAGS=$boost_wave_save_LDFLAGS # BOOST_XPRESSIVE() # ----------------- # Look for Boost.Xpressive (new since 1.36.0). -AC_DEFUN([BOOST_XPRESSIVE], +BOOST_DEFUN([Xpressive], [BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])]) @@ -893,8 +1067,9 @@ AC_DEFUN([_BOOST_FIND_COMPILER_TAG], [AC_REQUIRE([AC_PROG_CXX])dnl AC_REQUIRE([AC_CANONICAL_HOST])dnl AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag], -[AC_LANG_PUSH([C++])dnl - boost_cv_lib_tag=unknown +[boost_cv_lib_tag=unknown +if test x$boost_cv_inc_path != xno; then + AC_LANG_PUSH([C++])dnl # The following tests are mostly inspired by boost/config/auto_link.hpp # The list is sorted to most recent/common to oldest compiler (in order # to increase the likelihood of finding the right compiler with the @@ -908,8 +1083,12 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] # como, edg, kcc, bck, mp, sw, tru, xlc # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines # the same defines as GCC's). - # TODO: Move the test on GCC 4.4 up once it's released. for i in \ + _BOOST_gcc_test(4, 8) \ + _BOOST_gcc_test(4, 7) \ + _BOOST_gcc_test(4, 6) \ + _BOOST_gcc_test(4, 5) \ + _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(4, 3) \ _BOOST_gcc_test(4, 2) \ _BOOST_gcc_test(4, 1) \ @@ -929,7 +1108,6 @@ AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag] "defined __ICC && (defined __unix || defined __unix__) @ il" \ "defined __ICL @ iw" \ "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \ - _BOOST_gcc_test(4, 4) \ _BOOST_gcc_test(2, 95) \ "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \ "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \ @@ -969,7 +1147,7 @@ AC_LANG_POP([C++])dnl boost_cv_lib_tag= ;; esac -])dnl end of AC_CACHE_CHECK +fi])dnl end of AC_CACHE_CHECK ])# _BOOST_FIND_COMPILER_TAG -- cgit v1.2.3 From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 20 Jan 2013 12:31:03 +0000 Subject: Better delimiters, cross-platform fixes --- klm/lm/builder/corpus_count.cc | 3 ++- klm/lm/filter/arpa_io.cc | 36 +++++++++++------------------------- klm/lm/filter/arpa_io.hh | 27 ++++++++++----------------- klm/util/stream/sort.hh | 5 +++-- klm/util/stream/timer.hh | 8 +++++--- 5 files changed, 31 insertions(+), 48 deletions(-) (limited to 'klm/util/stream') diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index 8c3de57d..abea4ed0 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) { const WordIndex end_sentence = vocab.Lookup(""); Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); uint64_t count = 0; + StringPiece delimiters("\0\t\r ", 4); try { while(true) { StringPiece line(from_.ReadLine()); writer.StartSentence(); - for (util::TokenIter w(line, " \t"); w; ++w) { + for (util::TokenIter w(line, delimiters); w; ++w) { WordIndex word = vocab.Lookup(*w); UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing in the future."); writer.Append(word); diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc index caf8df95..f8568ac4 100644 --- a/klm/lm/filter/arpa_io.cc +++ b/klm/lm/filter/arpa_io.cc @@ -12,38 +12,24 @@ namespace lm { -ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") { - what_.append(message.data(), message.size()); +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { + *this << message; } ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { - what_ = "Error: "; - what_.append(message.data(), message.size()); - what_ += " in line '"; - what_.append(line.data(), line.size()); - what_ += "'."; + *this << message << " in line " << line; } -ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() - : what_(std::string(message) + " file " + file_name), file_name_(file_name) { - if (errno) { - char buf[1024]; - buf[0] = 0; -#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE - const char *add = buf; - if (!strerror_r(errno, buf, 1024)) { -#else - const char *add = strerror_r(errno, buf, 1024); - if (add) { -#endif - what_ += " :"; - what_ += add; - } - } +ARPAInputException::~ARPAInputException() throw() {} + +ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() { + *this << message << " in file " << file_name; } +ARPAOutputException::~ARPAOutputException() throw() {} + // Seeking is the responsibility of the caller. -void WriteCounts(std::ostream &out, const std::vector &number) { +void WriteCounts(std::ostream &out, const std::vector &number) { out << "\n\\data\\\n"; for (unsigned int i = 0; i < number.size(); ++i) { out << "ngram " << i+1 << "=" << number[i] << '\n'; @@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector &number) { out << '\n'; } -size_t SizeNeededForCounts(const std::vector &number) { +size_t SizeNeededForCounts(const std::vector &number) { std::ostringstream buf; WriteCounts(buf, number); return buf.tellp(); diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh index 90f48447..5b31620b 100644 --- a/klm/lm/filter/arpa_io.hh +++ b/klm/lm/filter/arpa_io.hh @@ -16,6 +16,7 @@ #include #include +#include namespace util { class FilePiece; } @@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception { public: explicit ARPAInputException(const StringPiece &message) throw(); explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); - virtual ~ARPAInputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } - - private: - std::string what_; + virtual ~ARPAInputException() throw(); }; -class ARPAOutputException : public std::exception { +class ARPAOutputException : public util::ErrnoException { public: ARPAOutputException(const char *prefix, const std::string &file_name) throw(); - virtual ~ARPAOutputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } + virtual ~ARPAOutputException() throw(); const std::string &File() const throw() { return file_name_; } private: - std::string what_; const std::string file_name_; }; // Handling for the counts of n-grams at the beginning of ARPA files. -size_t SizeNeededForCounts(const std::vector &number); +size_t SizeNeededForCounts(const std::vector &number); /* Writes an ARPA file. This has to be seekable so the counts can be written * at the end. Hence, I just have it own a std::fstream instead of accepting - * a separately held std::ostream. + * a separately held std::ostream. TODO: use the fast one from estimation. */ class ARPAOutput : boost::noncopyable { public: @@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable { boost::scoped_array buffer_; std::fstream file_; size_t fast_counter_; - std::vector counts_; + std::vector counts_; }; -template void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) { +template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { ReadNGramHeader(in, length); out.BeginLength(length); - for (size_t i = 0; i < number; ++i) { + for (uint64_t i = 0; i < number; ++i) { StringPiece line = in.ReadLine(); util::TokenIter tabber(line, '\t'); if (!tabber) throw ARPAInputException("blank line", line); @@ -107,7 +100,7 @@ template void ReadNGrams(util::FilePiece &in, unsigned int length } template void ReadARPA(util::FilePiece &in_lm, Output &out) { - std::vector number; + std::vector number; ReadARPACounts(in_lm, number); out.ReserveForCounts(SizeNeededForCounts(number)); for (unsigned int i = 0; i < number.size(); ++i) { diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index df57fa41..a86f160f 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,8 +259,9 @@ template class MergingReader { while (in_offsets_->RemainingBlocks()) { // Use bigger buffers if there's less remaining. - uint64_t per_buffer = std::max(static_cast(buffer_size_), - static_cast(total_memory_ / in_offsets_->RemainingBlocks())); + uint64_t per_buffer = static_cast(std::max( + buffer_size_, + static_cast((static_cast(total_memory_) / in_offsets_->RemainingBlocks())))); per_buffer -= per_buffer % entry_size; assert(per_buffer); diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh index 50e94fe8..7e1a5885 100644 --- a/klm/util/stream/timer.hh +++ b/klm/util/stream/timer.hh @@ -1,14 +1,16 @@ #ifndef UTIL_STREAM_TIMER__ #define UTIL_STREAM_TIMER__ -#include +// Sorry Jon, this was adding library dependencies in Moses and people complained. + +/*#include #if BOOST_VERSION >= 104800 #include #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) #else -//#warning Using Boost older than 1.48. Timing information will not be available. +//#warning Using Boost older than 1.48. Timing information will not be available.*/ #define UTIL_TIMER(str) -#endif +//#endif #endif // UTIL_STREAM_TIMER__ -- cgit v1.2.3