From 2753c37d0b59df79be15d88222eb0f2ec6caf903 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Sun, 20 Jan 2013 12:31:03 +0000
Subject: Better delimiters, cross-platform fixes

---
 klm/lm/builder/corpus_count.cc |  3 ++-
 klm/lm/filter/arpa_io.cc       | 36 +++++++++++-------------------------
 klm/lm/filter/arpa_io.hh       | 27 ++++++++++-----------------
 3 files changed, 23 insertions(+), 43 deletions(-)

(limited to 'klm/lm')

diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index 8c3de57d..abea4ed0 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
   const WordIndex end_sentence = vocab.Lookup("</s>");
   Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
   uint64_t count = 0;
+  StringPiece delimiters("\0\t\r ", 4);
   try {
     while(true) {
       StringPiece line(from_.ReadLine());
       writer.StartSentence();
-      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+      for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
         WordIndex word = vocab.Lookup(*w);
         UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");
         writer.Append(word);
diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc
index caf8df95..f8568ac4 100644
--- a/klm/lm/filter/arpa_io.cc
+++ b/klm/lm/filter/arpa_io.cc
@@ -12,38 +12,24 @@
 
 namespace lm {
 
-ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") {
-  what_.append(message.data(), message.size());
+ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
+  *this << message;
 }
 
 ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
-  what_ = "Error: ";
-  what_.append(message.data(), message.size());
-  what_ += " in line '";
-  what_.append(line.data(), line.size());
-  what_ += "'.";
+  *this << message << " in line " << line;
 }
 
-ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw()
-  : what_(std::string(message) + " file " + file_name), file_name_(file_name) {
-  if (errno) {
-    char buf[1024];
-    buf[0] = 0;
-#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
-    const char *add = buf;
-    if (!strerror_r(errno, buf, 1024)) {
-#else
-    const char *add = strerror_r(errno, buf, 1024);
-    if (add) {
-#endif
-      what_ += " :";
-      what_ += add;
-    }
-  }
+ARPAInputException::~ARPAInputException() throw() {}
+
+ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() : file_name_(file_name) {  // const member returned by File(); must be set here
+  *this << message << " in file " << file_name;
 }
 
+ARPAOutputException::~ARPAOutputException() throw() {}
+
 // Seeking is the responsibility of the caller.
-void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
+void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
   out << "\n\\data\\\n";
   for (unsigned int i = 0; i < number.size(); ++i) {
     out << "ngram " << i+1 << "=" << number[i] << '\n';
@@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
   out << '\n';
 }
 
-size_t SizeNeededForCounts(const std::vector<size_t> &number) {
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
   std::ostringstream buf;
   WriteCounts(buf, number);
   return buf.tellp();
diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh
index 90f48447..5b31620b 100644
--- a/klm/lm/filter/arpa_io.hh
+++ b/klm/lm/filter/arpa_io.hh
@@ -16,6 +16,7 @@
 
 #include <err.h>
 #include <string.h>
+#include <stdint.h>
 
 namespace util { class FilePiece; }
 
@@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception {
   public:
     explicit ARPAInputException(const StringPiece &message) throw();
     explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
-    virtual ~ARPAInputException() throw() {}
-
-    const char *what() const throw() { return what_.c_str(); }
-
-  private:
-    std::string what_;
+    virtual ~ARPAInputException() throw();
 };
 
-class ARPAOutputException : public std::exception {
+class ARPAOutputException : public util::ErrnoException {
   public:
     ARPAOutputException(const char *prefix, const std::string &file_name) throw();
-    virtual ~ARPAOutputException() throw() {}
-
-    const char *what() const throw() { return what_.c_str(); }
+    virtual ~ARPAOutputException() throw();
 
     const std::string &File() const throw() { return file_name_; }
 
   private:
-    std::string what_;
     const std::string file_name_;
 };
 
 // Handling for the counts of n-grams at the beginning of ARPA files.
-size_t SizeNeededForCounts(const std::vector<size_t> &number);
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
 
 /* Writes an ARPA file.  This has to be seekable so the counts can be written
  * at the end.  Hence, I just have it own a std::fstream instead of accepting
- * a separately held std::ostream.  
+ * a separately held std::ostream.  TODO: use the fast one from estimation.
  */
 class ARPAOutput : boost::noncopyable {
   public:
@@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable {
     boost::scoped_array<char> buffer_;
     std::fstream file_;
     size_t fast_counter_;
-    std::vector<size_t> counts_;
+    std::vector<uint64_t> counts_;
 };
 
 
-template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) {
+template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
   ReadNGramHeader(in, length);
   out.BeginLength(length);
-  for (size_t i = 0; i < number; ++i) {
+  for (uint64_t i = 0; i < number; ++i) {
     StringPiece line = in.ReadLine();
     util::TokenIter<util::SingleCharacter> tabber(line, '\t');
     if (!tabber) throw ARPAInputException("blank line", line);
@@ -107,7 +100,7 @@ template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length
 }
 
 template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
-  std::vector<size_t> number;
+  std::vector<uint64_t> number;
   ReadARPACounts(in_lm, number);
   out.ReserveForCounts(SizeNeededForCounts(number));
   for (unsigned int i = 0; i < number.size(); ++i) {
-- 
cgit v1.2.3