From dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 20 Jan 2013 12:31:03 +0000 Subject: Better delimiters, cross-platform fixes --- klm/lm/filter/arpa_io.cc | 36 +++++++++++------------------------- klm/lm/filter/arpa_io.hh | 27 ++++++++++----------------- 2 files changed, 21 insertions(+), 42 deletions(-) (limited to 'klm/lm/filter') diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc index caf8df95..f8568ac4 100644 --- a/klm/lm/filter/arpa_io.cc +++ b/klm/lm/filter/arpa_io.cc @@ -12,38 +12,24 @@ namespace lm { -ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") { - what_.append(message.data(), message.size()); +ARPAInputException::ARPAInputException(const StringPiece &message) throw() { + *this << message; } ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() { - what_ = "Error: "; - what_.append(message.data(), message.size()); - what_ += " in line '"; - what_.append(line.data(), line.size()); - what_ += "'."; + *this << message << " in line " << line; } -ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() - : what_(std::string(message) + " file " + file_name), file_name_(file_name) { - if (errno) { - char buf[1024]; - buf[0] = 0; -#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE - const char *add = buf; - if (!strerror_r(errno, buf, 1024)) { -#else - const char *add = strerror_r(errno, buf, 1024); - if (add) { -#endif - what_ += " :"; - what_ += add; - } - } +ARPAInputException::~ARPAInputException() throw() {} + +ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() { + *this << message << " in file " << file_name; } +ARPAOutputException::~ARPAOutputException() throw() {} + // Seeking is the responsibility of the caller. -void WriteCounts(std::ostream &out, const std::vector &number) { +void WriteCounts(std::ostream &out, const std::vector &number) { out << "\n\\data\\\n"; for (unsigned int i = 0; i < number.size(); ++i) { out << "ngram " << i+1 << "=" << number[i] << '\n'; @@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector &number) { out << '\n'; } -size_t SizeNeededForCounts(const std::vector &number) { +size_t SizeNeededForCounts(const std::vector &number) { std::ostringstream buf; WriteCounts(buf, number); return buf.tellp(); diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh index 90f48447..5b31620b 100644 --- a/klm/lm/filter/arpa_io.hh +++ b/klm/lm/filter/arpa_io.hh @@ -16,6 +16,7 @@ #include #include +#include namespace util { class FilePiece; } @@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception { public: explicit ARPAInputException(const StringPiece &message) throw(); explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw(); - virtual ~ARPAInputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } - - private: - std::string what_; + virtual ~ARPAInputException() throw(); }; -class ARPAOutputException : public std::exception { +class ARPAOutputException : public util::ErrnoException { public: ARPAOutputException(const char *prefix, const std::string &file_name) throw(); - virtual ~ARPAOutputException() throw() {} - - const char *what() const throw() { return what_.c_str(); } + virtual ~ARPAOutputException() throw(); const std::string &File() const throw() { return file_name_; } private: - std::string what_; const std::string file_name_; }; // Handling for the counts of n-grams at the beginning of ARPA files. -size_t SizeNeededForCounts(const std::vector &number); +size_t SizeNeededForCounts(const std::vector &number); /* Writes an ARPA file. This has to be seekable so the counts can be written * at the end. Hence, I just have it own a std::fstream instead of accepting - * a separately held std::ostream. + * a separately held std::ostream. TODO: use the fast one from estimation. */ class ARPAOutput : boost::noncopyable { public: @@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable { boost::scoped_array buffer_; std::fstream file_; size_t fast_counter_; - std::vector counts_; + std::vector counts_; }; -template void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) { +template void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) { ReadNGramHeader(in, length); out.BeginLength(length); - for (size_t i = 0; i < number; ++i) { + for (uint64_t i = 0; i < number; ++i) { StringPiece line = in.ReadLine(); util::TokenIter tabber(line, '\t'); if (!tabber) throw ARPAInputException("blank line", line); @@ -107,7 +100,7 @@ template void ReadNGrams(util::FilePiece &in, unsigned int length } template void ReadARPA(util::FilePiece &in_lm, Output &out) { - std::vector number; + std::vector number; ReadARPACounts(in_lm, number); out.ReserveForCounts(SizeNeededForCounts(number)); for (unsigned int i = 0; i < number.size(); ++i) { -- cgit v1.2.3