summary | refs | log | tree | commit | diff
path: root/klm
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2013-01-20 12:31:03 +0000
committer Kenneth Heafield <github@kheafield.com> 2013-01-20 12:31:03 +0000
commit dc16aa2accc7d9033d9c31c7bbc5e581d43a5101 (patch)
tree 7ee4b4155447607ad8f0a0e9f8226199403ed77f /klm
parent c18c2497707bed72ace95db459e541261213c7e2 (diff)
Better delimiters, cross-platform fixes
Diffstat (limited to 'klm')
-rw-r--r-- klm/lm/builder/corpus_count.cc 3
-rw-r--r-- klm/lm/filter/arpa_io.cc 36
-rw-r--r-- klm/lm/filter/arpa_io.hh 27
-rw-r--r-- klm/util/stream/sort.hh 5
-rw-r--r-- klm/util/stream/timer.hh 8
5 files changed, 31 insertions, 48 deletions
diff --git a/klm/lm/builder/corpus_count.cc b/klm/lm/builder/corpus_count.cc
index 8c3de57d..abea4ed0 100644
--- a/klm/lm/builder/corpus_count.cc
+++ b/klm/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
const WordIndex end_sentence = vocab.Lookup("</s>");
Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
uint64_t count = 0;
+ StringPiece delimiters("\0\t\r ", 4);
try {
while(true) {
StringPiece line(from_.ReadLine());
writer.StartSentence();
- for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+ for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
WordIndex word = vocab.Lookup(*w);
UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing <unk> in the future.");
writer.Append(word);
diff --git a/klm/lm/filter/arpa_io.cc b/klm/lm/filter/arpa_io.cc
index caf8df95..f8568ac4 100644
--- a/klm/lm/filter/arpa_io.cc
+++ b/klm/lm/filter/arpa_io.cc
@@ -12,38 +12,24 @@
namespace lm {
-ARPAInputException::ARPAInputException(const StringPiece &message) throw() : what_("Error: ") {
- what_.append(message.data(), message.size());
+ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
+ *this << message;
}
ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
- what_ = "Error: ";
- what_.append(message.data(), message.size());
- what_ += " in line '";
- what_.append(line.data(), line.size());
- what_ += "'.";
+ *this << message << " in line " << line;
}
-ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw()
- : what_(std::string(message) + " file " + file_name), file_name_(file_name) {
- if (errno) {
- char buf[1024];
- buf[0] = 0;
-#if (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
- const char *add = buf;
- if (!strerror_r(errno, buf, 1024)) {
-#else
- const char *add = strerror_r(errno, buf, 1024);
- if (add) {
-#endif
- what_ += " :";
- what_ += add;
- }
- }
+ARPAInputException::~ARPAInputException() throw() {}
+
+ARPAOutputException::ARPAOutputException(const char *message, const std::string &file_name) throw() {
+ *this << message << " in file " << file_name;
}
+ARPAOutputException::~ARPAOutputException() throw() {}
+
// Seeking is the responsibility of the caller.
-void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
+void WriteCounts(std::ostream &out, const std::vector<uint64_t> &number) {
out << "\n\\data\\\n";
for (unsigned int i = 0; i < number.size(); ++i) {
out << "ngram " << i+1 << "=" << number[i] << '\n';
@@ -51,7 +37,7 @@ void WriteCounts(std::ostream &out, const std::vector<size_t> &number) {
out << '\n';
}
-size_t SizeNeededForCounts(const std::vector<size_t> &number) {
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
std::ostringstream buf;
WriteCounts(buf, number);
return buf.tellp();
diff --git a/klm/lm/filter/arpa_io.hh b/klm/lm/filter/arpa_io.hh
index 90f48447..5b31620b 100644
--- a/klm/lm/filter/arpa_io.hh
+++ b/klm/lm/filter/arpa_io.hh
@@ -16,6 +16,7 @@
#include <err.h>
#include <string.h>
+#include <stdint.h>
namespace util { class FilePiece; }
@@ -25,34 +26,26 @@ class ARPAInputException : public util::Exception {
public:
explicit ARPAInputException(const StringPiece &message) throw();
explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
- virtual ~ARPAInputException() throw() {}
-
- const char *what() const throw() { return what_.c_str(); }
-
- private:
- std::string what_;
+ virtual ~ARPAInputException() throw();
};
-class ARPAOutputException : public std::exception {
+class ARPAOutputException : public util::ErrnoException {
public:
ARPAOutputException(const char *prefix, const std::string &file_name) throw();
- virtual ~ARPAOutputException() throw() {}
-
- const char *what() const throw() { return what_.c_str(); }
+ virtual ~ARPAOutputException() throw();
const std::string &File() const throw() { return file_name_; }
private:
- std::string what_;
const std::string file_name_;
};
// Handling for the counts of n-grams at the beginning of ARPA files.
-size_t SizeNeededForCounts(const std::vector<size_t> &number);
+size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
/* Writes an ARPA file. This has to be seekable so the counts can be written
* at the end. Hence, I just have it own a std::fstream instead of accepting
- * a separately held std::ostream.
+ * a separately held std::ostream. TODO: use the fast one from estimation.
*/
class ARPAOutput : boost::noncopyable {
public:
@@ -88,14 +81,14 @@ class ARPAOutput : boost::noncopyable {
boost::scoped_array<char> buffer_;
std::fstream file_;
size_t fast_counter_;
- std::vector<size_t> counts_;
+ std::vector<uint64_t> counts_;
};
-template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, size_t number, Output &out) {
+template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
ReadNGramHeader(in, length);
out.BeginLength(length);
- for (size_t i = 0; i < number; ++i) {
+ for (uint64_t i = 0; i < number; ++i) {
StringPiece line = in.ReadLine();
util::TokenIter<util::SingleCharacter> tabber(line, '\t');
if (!tabber) throw ARPAInputException("blank line", line);
@@ -107,7 +100,7 @@ template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length
}
template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
- std::vector<size_t> number;
+ std::vector<uint64_t> number;
ReadARPACounts(in_lm, number);
out.ReserveForCounts(SizeNeededForCounts(number));
for (unsigned int i = 0; i < number.size(); ++i) {
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
index df57fa41..a86f160f 100644
--- a/klm/util/stream/sort.hh
+++ b/klm/util/stream/sort.hh
@@ -259,8 +259,9 @@ template <class Compare, class Combine> class MergingReader {
while (in_offsets_->RemainingBlocks()) {
// Use bigger buffers if there's less remaining.
- uint64_t per_buffer = std::max(static_cast<uint64_t>(buffer_size_),
- static_cast<uint64_t>(total_memory_ / in_offsets_->RemainingBlocks()));
+ uint64_t per_buffer = static_cast<uint64_t>(std::max<std::size_t>(
+ buffer_size_,
+ static_cast<std::size_t>((static_cast<uint64_t>(total_memory_) / in_offsets_->RemainingBlocks()))));
per_buffer -= per_buffer % entry_size;
assert(per_buffer);
diff --git a/klm/util/stream/timer.hh b/klm/util/stream/timer.hh
index 50e94fe8..7e1a5885 100644
--- a/klm/util/stream/timer.hh
+++ b/klm/util/stream/timer.hh
@@ -1,14 +1,16 @@
#ifndef UTIL_STREAM_TIMER__
#define UTIL_STREAM_TIMER__
-#include <boost/version.hpp>
+// Sorry Jon, this was adding library dependencies in Moses and people complained.
+
+/*#include <boost/version.hpp>
#if BOOST_VERSION >= 104800
#include <boost/timer/timer.hpp>
#define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str))
#else
-//#warning Using Boost older than 1.48. Timing information will not be available.
+//#warning Using Boost older than 1.48. Timing information will not be available.*/
#define UTIL_TIMER(str)
-#endif
+//#endif
#endif // UTIL_STREAM_TIMER__