summary refs log tree commit diff
path: root/klm/util
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2012-05-16 13:24:08 -0700
committer Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-05-26 22:59:54 -0400
commit 2b63fa0755954edf467a2421997eaf72771260cf (patch)
tree ffb22b22540cd59f20f7de6bfed4313f8b946407 /klm/util
parent e331ea8e69489cfd727c0ad106c76efa69f3e06c (diff)
Big kenlm change includes lower order models for probing only. And other stuff.
Diffstat (limited to 'klm/util')
-rw-r--r-- klm/util/Jamfile 2
-rw-r--r-- klm/util/Makefile.am 3
-rw-r--r-- klm/util/bit_packing.hh 7
-rw-r--r-- klm/util/ersatz_progress.cc 8
-rw-r--r-- klm/util/ersatz_progress.hh 4
-rw-r--r-- klm/util/file.cc 10
-rw-r--r-- klm/util/file.hh 3
-rw-r--r-- klm/util/file_piece.cc 17
-rw-r--r-- klm/util/file_piece.hh 10
-rw-r--r-- klm/util/have.hh 10
-rw-r--r-- klm/util/mmap.cc 14
-rw-r--r-- klm/util/murmur_hash.cc 11
-rw-r--r-- klm/util/murmur_hash.hh 6
-rw-r--r-- klm/util/probing_hash_table.hh 21
-rw-r--r-- klm/util/usage.cc 46
-rw-r--r-- klm/util/usage.hh 8
16 files changed, 136 insertions, 44 deletions
diff --git a/klm/util/Jamfile b/klm/util/Jamfile
index b8c14347..3ee2c2c2 100644
--- a/klm/util/Jamfile
+++ b/klm/util/Jamfile
@@ -1,4 +1,4 @@
-lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc ../..//z : <include>.. : : <include>.. ;
+lib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc usage.cc ../..//z : <include>.. : : <include>.. ;
import testing ;
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index a8d6299b..5ceccf2c 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -25,6 +25,7 @@ libklm_util_a_SOURCES = \
file.cc \
file_piece.cc \
mmap.cc \
- murmur_hash.cc
+ murmur_hash.cc \
+ usage.cc
AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I..
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 73a5cb22..dcbd814c 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -174,6 +174,13 @@ struct BitsMask {
uint64_t mask;
};
+struct BitAddress { // Plain value pair: a base pointer plus a 64-bit offset.
+ BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {} // Stores both arguments unchanged.
+ // NOTE(review): offset is presumably counted in bits relative to base (going by the struct name) -- confirm against callers.
+ void *base;
+ uint64_t offset;
+};
+
} // namespace util
#endif // UTIL_BIT_PACKING__
diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc
index a82ce672..07b14e26 100644
--- a/klm/util/ersatz_progress.cc
+++ b/klm/util/ersatz_progress.cc
@@ -12,17 +12,17 @@ namespace { const unsigned char kWidth = 100; }
ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::size_t>::max()), complete_(next_), out_(NULL) {}
ErsatzProgress::~ErsatzProgress() {
- if (!out_) return;
- Finished();
+ if (out_) Finished();
}
-ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete)
+ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std::string &message)
: current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
if (!out_) {
next_ = std::numeric_limits<std::size_t>::max();
return;
}
- *out_ << message << "\n----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
+ if (!message.empty()) *out_ << message << '\n';
+ *out_ << "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
}
void ErsatzProgress::Milestone() {
diff --git a/klm/util/ersatz_progress.hh b/klm/util/ersatz_progress.hh
index 92c345fe..f709dc51 100644
--- a/klm/util/ersatz_progress.hh
+++ b/klm/util/ersatz_progress.hh
@@ -1,7 +1,7 @@
#ifndef UTIL_ERSATZ_PROGRESS__
#define UTIL_ERSATZ_PROGRESS__
-#include <iosfwd>
+#include <iostream>
#include <string>
// Ersatz version of boost::progress so core language model doesn't depend on
@@ -14,7 +14,7 @@ class ErsatzProgress {
ErsatzProgress();
// Null means no output. The null value is useful for passing along the ostream pointer from another caller.
- ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete);
+ explicit ErsatzProgress(std::size_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
~ErsatzProgress();
diff --git a/klm/util/file.cc b/klm/util/file.cc
index de206bc8..1bd056fc 100644
--- a/klm/util/file.cc
+++ b/klm/util/file.cc
@@ -43,16 +43,6 @@ int OpenReadOrThrow(const char *name) {
return ret;
}
-int CreateOrThrow(const char *name) {
- int ret;
-#if defined(_WIN32) || defined(_WIN64)
- UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
-#else
- UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
-#endif
- return ret;
-}
-
uint64_t SizeFile(int fd) {
#if defined(_WIN32) || defined(_WIN64)
__int64 ret = _filelengthi64(fd);
diff --git a/klm/util/file.hh b/klm/util/file.hh
index 72c8ea76..5c57e2a9 100644
--- a/klm/util/file.hh
+++ b/klm/util/file.hh
@@ -65,10 +65,7 @@ class scoped_FILE {
std::FILE *file_;
};
-// Open for read only.
int OpenReadOrThrow(const char *name);
-// Create file if it doesn't exist, truncate if it does. Opened for write.
-int CreateOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1;
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index 081e662b..7b6a01dd 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -18,31 +18,35 @@
#include <sys/types.h>
#include <sys/stat.h>
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+#endif
+
namespace util {
ParseNumberException::ParseNumberException(StringPiece value) throw() {
*this << "Could not parse \"" << value << "\" into a number";
}
+GZException::GZException(void *file) {
#ifdef HAVE_ZLIB
-GZException::GZException(gzFile file) {
int num;
- *this << gzerror( file, &num) << " from zlib";
-}
+ *this << gzerror(file, &num) << " from zlib";
#endif // HAVE_ZLIB
+}
// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
- progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
+ progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
Initialize(name, show_progress, min_buffer);
}
FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
- progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
+ progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
Initialize(name, show_progress, min_buffer);
}
@@ -149,8 +153,9 @@ template <class T> T FilePiece::ReadNumber() {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
+ if (position_ >= position_end_) throw EndOfFileException();
// Hallucinate a null off the end of the file.
- std::string buffer(position_, position_end_);
+ std::string buffer(position_, position_end_ - position_);
char *end;
T ret;
ParseNumber(buffer.c_str(), end, ret);
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index af93d8aa..b81ac0e2 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -13,10 +13,6 @@
#include <stdint.h>
-#ifdef HAVE_ZLIB
-#include <zlib.h>
-#endif
-
namespace util {
class ParseNumberException : public Exception {
@@ -27,9 +23,7 @@ class ParseNumberException : public Exception {
class GZException : public Exception {
public:
-#ifdef HAVE_ZLIB
- explicit GZException(gzFile file);
-#endif
+ explicit GZException(void *file);
GZException() throw() {}
~GZException() throw() {}
};
@@ -123,7 +117,7 @@ class FilePiece {
std::string file_name_;
#ifdef HAVE_ZLIB
- gzFile gz_file_;
+ void *gz_file_;
#endif // HAVE_ZLIB
};
diff --git a/klm/util/have.hh b/klm/util/have.hh
index f2f0cf90..b8181e99 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -3,13 +3,21 @@
#define UTIL_HAVE__
#ifndef HAVE_ZLIB
+#if !defined(_WIN32) && !defined(_WIN64)
#define HAVE_ZLIB
#endif
+#endif
-// #define HAVE_ICU
+#ifndef HAVE_ICU
+//#define HAVE_ICU
+#endif
#ifndef HAVE_BOOST
#define HAVE_BOOST
#endif
+#ifndef HAVE_THREADS
+//#define HAVE_THREADS
+#endif
+
#endif // UTIL_HAVE__
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index 2db35b56..e0d2570b 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -171,6 +171,20 @@ void *MapZeroedWrite(int fd, std::size_t size) {
return MapOrThrow(size, true, kFileFlags, false, fd, 0);
}
+namespace { // File-local: CreateOrThrow is only reachable from this translation unit.
+ // Creates name if it does not exist, truncates it if it does, and opens it read/write.  Returns the file descriptor.
+int CreateOrThrow(const char *name) {
+ int ret;
+#if defined(_WIN32) || defined(_WIN64) // Windows CRT variant of the same open call.
+ UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); // -1 from _open is handed to UTIL_THROW_IF as an ErrnoException.
+#else // Everything else: new files get owner read/write, group and others read-only.
+ UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); // -1 from open is handed to UTIL_THROW_IF as an ErrnoException.
+#endif
+ return ret;
+}
+
+} // namespace
+
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
file.reset(CreateOrThrow(name));
try {
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index 6accc21a..4f519312 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -23,7 +23,7 @@ namespace util {
// 64-bit hash for 64-bit platforms
-uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
+uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed )
{
const uint64_t m = 0xc6a4a7935bd1e995ULL;
const int r = 47;
@@ -81,7 +81,7 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
// 64-bit hash for 32-bit platforms
-uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
+uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed )
{
const unsigned int m = 0x5bd1e995;
const int r = 24;
@@ -150,17 +150,18 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
return h;
}
+
// Trick to test for 64-bit architecture at compile time.
namespace {
-template <unsigned L> uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) {
+template <unsigned L> inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) {
return MurmurHash64A(key, len, seed);
}
-template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) {
+template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) {
return MurmurHash64B(key, len, seed);
}
} // namespace
-uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
+uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) {
return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
}
diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh
index 638aaeb2..ae7e88de 100644
--- a/klm/util/murmur_hash.hh
+++ b/klm/util/murmur_hash.hh
@@ -5,9 +5,9 @@
namespace util {
-uint64_t MurmurHash64A(const void * key, std::size_t len, unsigned int seed = 0);
-uint64_t MurmurHash64B(const void * key, std::size_t len, unsigned int seed = 0);
-uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed = 0);
+uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0);
+uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0);
+uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);
} // namespace util
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index f466cebc..3354b68e 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -78,12 +78,33 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
}
}
+ // Return true if the value was found (and not inserted), false if t was inserted; either way out is set to the matching slot. This is consistent with Find but the opposite of hash_map!
+ template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
+#ifdef DEBUG
+ assert(initialized_);
+#endif
+ for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { // Start at the key's home bucket (hash modulo bucket count) and probe forward.
+ Key got(i->GetKey());
+ if (equal_(got, t.GetKey())) { out = i; return true; } // Key already present: report the existing slot, leave the table untouched.
+ if (equal_(got, invalid_)) { // Slot holds the invalid key, which this loop treats as an empty slot.
+ UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); // entries_ is incremented as part of the condition, i.e. before the fullness check is decided.
+ *i = t;
+ out = i;
+ return false;
+ }
+ if (++i == end_) i = begin_; // Wrap around to the first bucket after the last one.
+ }
+ }
+
void FinishedInserting() {}
void LoadedBinary() {}
// Don't change anything related to GetKey,
template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
+#ifdef DEBUG
+ assert(initialized_);
+#endif
for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) {
Key got(i->GetKey());
if (equal_(got, key)) { out = i; return true; }
diff --git a/klm/util/usage.cc b/klm/util/usage.cc
new file mode 100644
index 00000000..e5cf76f0
--- /dev/null
+++ b/klm/util/usage.cc
@@ -0,0 +1,46 @@
+#include "util/usage.hh"
+
+#include <fstream>
+#include <ostream>
+
+#include <string.h>
+#include <ctype.h>
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/resource.h>
+#include <sys/time.h>
+#endif
+
+namespace util {
+
+namespace { // File-local helpers.
+#if !defined(_WIN32) && !defined(_WIN64) // Only compiled on non-Windows builds.
+float FloatSec(const struct timeval &tv) { // Converts a timeval (seconds plus microseconds) into fractional seconds.
+ return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000.0); // 1000000.0 is a double literal, so the sum is formed in double and narrowed to float by the return type.
+}
+#endif
+} // namespace
+
+void PrintUsage(std::ostream &out) { // Writes this process's user and system CPU seconds to out, followed by the VmPeak/VmRSS lines of /proc/self/status when that file is readable.
+#if !defined(_WIN32) && !defined(_WIN64) // On Windows the whole body is compiled out and the function does nothing.
+ struct rusage usage;
+ if (getrusage(RUSAGE_SELF, &usage)) { // Nonzero return means getrusage failed.
+ perror("getrusage"); // Failure is reported via perror, not on out.  NOTE(review): <stdio.h> is not included directly in this file.
+ return;
+ }
+ out << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
+
+ // Linux doesn't set memory usage :-(.  So scan /proc/self/status for the memory lines instead.
+ std::ifstream status("/proc/self/status", std::ios::in);
+ std::string line; // NOTE(review): <string> is not included directly in this file either.
+ while (getline(status, line)) { // If the file could not be opened, getline fails at once and nothing more is printed.
+ if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
+ out << "VmRSS: " << (line.c_str() + 7) << '\n'; // Echo the rest of the line after the 7-character "VmRSS:\t" prefix.
+ break; // VmRSS ends the scan, so a VmPeak line is only printed if it appears earlier in the file.
+ } else if (!strncmp(line.c_str(), "VmPeak:\t", 8)) {
+ out << "VmPeak: " << (line.c_str() + 8) << '\n'; // Echo the rest of the line after the 8-character "VmPeak:\t" prefix.
+ }
+ }
+#endif
+}
+
+} // namespace util
diff --git a/klm/util/usage.hh b/klm/util/usage.hh
new file mode 100644
index 00000000..d331ff74
--- /dev/null
+++ b/klm/util/usage.hh
@@ -0,0 +1,8 @@
+#ifndef UTIL_USAGE__
+#define UTIL_USAGE__
+#include <iosfwd>
+
+namespace util {
+void PrintUsage(std::ostream &to); // Prints CPU time and memory usage figures for the current process to the given stream.
+} // namespace util
+#endif // UTIL_USAGE__