diff options
| -rw-r--r-- | klm/lm/builder/corpus_count.cc | 3 | ||||
| -rw-r--r-- | klm/lm/filter/arpa_io.cc | 36 | ||||
| -rw-r--r-- | klm/lm/filter/arpa_io.hh | 27 | ||||
| -rw-r--r-- | klm/util/stream/sort.hh | 5 | ||||
| -rw-r--r-- | klm/util/stream/timer.hh | 8 | 
5 files changed, 31 insertions, 48 deletions
| diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc index 8c3de57d..abea4ed0 100644 --- a/klm/lm/builder/corpus_count.cc +++ b/klm/lm/builder/corpus_count.cc @@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {    const WordIndex end_sentence = vocab.Lookup("</s>");    Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);    uint64_t count = 0; +  StringPiece delimiters("\0\t\r ", 4);    try {      while(true) {        StringPiece line(from_.ReadLine());        writer.StartSentence(); -      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) { +      for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {          WordIndex word = vocab.Lookup(*w);          UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");          writer.Append(word); diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc index caf8df95..f8568ac4 100644 --- a/klm/lm/filter/arpa_io.cc +++ b/klm/lm/filter/arpa_io.cc @@ -12,38 +12,24 @@  namespace lm { -ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") { -  what_.append(message.data(), message.size()); +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { +  *this << message;  }  ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { -  what_ = "Error: "; -  what_.append(message.data(), message.size()); -  what_ += " in line '"; -  what_.append(line.data(), line.size()); -  what_ += "'."; +  *this << message << " in line " << line;  } -ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() -  : what_(std::string(message) + " file " + file_name), file_name_(file_name) { -  if (errno) { -    char buf[1024]; -    buf[0] = 0; -#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE -    const char *add = buf; -    if (!strerror_r(errno, buf, 1024)) { -#else -    const char *add = strerror_r(errno, buf, 1024); -    if (add) { -#endif -      what_ += " :"; -      what_ += add; -    } -  } +ARPAInputException::~ARPAInputException() throw() {} + +ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() { +  *this << message << " in file " << file_name;  } +ARPAOutputException::~ARPAOutputException() throw() {} +  // Seeking is the responsibility of the caller. -void WriteCounts(std::ostream &out, const std::vector<size_t> &number) { +void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {    out << "\n\\data\\\n";    for (unsigned int i = 0; i < number.size(); ++i) {      out << "ngram " << i+1 << "=" << number[i] << '\n'; @@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {    out << '\n';  } -size_t SizeNeededForCounts(const std::vector<size_t> &number) { +size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {    std::ostringstream buf;    WriteCounts(buf, number);    return buf.tellp(); diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh index 90f48447..5b31620b 100644 --- a/klm/lm/filter/arpa_io.hh +++ b/klm/lm/filter/arpa_io.hh @@ -16,6 +16,7 @@  #include <err.h>  #include <string.h> +#include <stdint.h>  namespace util { class FilePiece; } @@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception {    public:      explicit ARPAInputException(const StringPiece &message) throw();      explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); -    virtual ~ARPAInputException() throw() {} - -    const char *what() const throw() { return what_.c_str(); } - -  private: -    std::string what_; +    virtual ~ARPAInputException() throw();  }; -class ARPAOutputException : public std::exception { +class ARPAOutputException : public util::ErrnoException {    public:      ARPAOutputException(const char *prefix, const std::string &file_name) throw(); -    virtual ~ARPAOutputException() throw() {} - -    const char *what() const throw() { return what_.c_str(); } +    virtual ~ARPAOutputException() throw();      const std::string &File() const throw() { return file_name_; }    private: -    std::string what_;      const std::string file_name_;  };  // Handling for the counts of n-grams at the beginning of ARPA files. -size_t SizeNeededForCounts(const std::vector<size_t> &number); +size_t SizeNeededForCounts(const std::vector<uint64_t> &number);  /* Writes an ARPA file.  This has to be seekable so the counts can be written   * at the end.  Hence, I just have it own a std::fstream instead of accepting - * a separately held std::ostream.   + * a separately held std::ostream.  TODO: use the fast one from estimation.   */  class ARPAOutput : boost::noncopyable {    public: @@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable {      boost::scoped_array<char> buffer_;      std::fstream file_;      size_t fast_counter_; -    std::vector<size_t> counts_; +    std::vector<uint64_t> counts_;  }; -template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) { +template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {    ReadNGramHeader(in, length);    out.BeginLength(length); -  for (size_t i = 0; i < number; ++i) { +  for (uint64_t i = 0; i < number; ++i) {      StringPiece line = in.ReadLine();      util::TokenIter<util::SingleCharacter> tabber(line, '\t');      if (!tabber) throw ARPAInputException("blank line", line); @@ -107,7 +100,7 @@ template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length  }  template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) { -  std::vector<size_t> number; +  std::vector<uint64_t> number;    ReadARPACounts(in_lm, number);    out.ReserveForCounts(SizeNeededForCounts(number));    for (unsigned int i = 0; i < number.size(); ++i) { diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh index df57fa41..a86f160f 100644 --- a/klm/util/stream/sort.hh +++ b/klm/util/stream/sort.hh @@ -259,8 +259,9 @@ template <class Compare, class Combine> class MergingReader {        while (in_offsets_->RemainingBlocks()) {          // Use bigger buffers if there's less remaining. -        uint64_t per_buffer = std::max(static_cast<uint64_t>(buffer_size_), -                                       static_cast<uint64_t>(total_memory_ / in_offsets_->RemainingBlocks())); +        uint64_t per_buffer = static_cast<uint64_t>(std::max<std::size_t>( +            buffer_size_, +            static_cast<std::size_t>((static_cast<uint64_t>(total_memory_) / in_offsets_->RemainingBlocks()))));          per_buffer -= per_buffer % entry_size;          assert(per_buffer); diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh index 50e94fe8..7e1a5885 100644 --- a/klm/util/stream/timer.hh +++ b/klm/util/stream/timer.hh @@ -1,14 +1,16 @@  #ifndef UTIL_STREAM_TIMER__  #define UTIL_STREAM_TIMER__ -#include <boost/version.hpp> +// Sorry Jon, this was adding library dependencies in Moses and people complained. + +/*#include <boost/version.hpp>  #if BOOST_VERSION >= 104800  #include <boost/timer/timer.hpp>  #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str))  #else -//#warning Using Boost older than 1.48. Timing information will not be available. +//#warning Using Boost older than 1.48. Timing information will not be available.*/  #define UTIL_TIMER(str)  -#endif +//#endif  #endif // UTIL_STREAM_TIMER__ | 
