summaryrefslogtreecommitdiff
path: root/klm/util
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2011-01-25 22:30:48 +0200
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-01-25 22:30:48 +0200
commit: c4ade3091b812ca135ae6520fa7173e1bbf28754 (patch)
tree: 2528af208f6dafd0c27dcbec0d2da291a9c93ca2 /klm/util
parent: d04c0ca2d9df0e147239b18e90650ca8bd51d594 (diff)
update kenlm
Diffstat (limited to 'klm/util')
-rw-r--r-- klm/util/bit_packing.cc | 2
-rw-r--r-- klm/util/bit_packing.hh | 17
-rw-r--r-- klm/util/ersatz_progress.cc | 1
-rw-r--r-- klm/util/file_piece.cc | 24
-rw-r--r-- klm/util/file_piece.hh | 35
-rw-r--r-- klm/util/key_value_packing.hh | 4
-rw-r--r-- klm/util/probing_hash_table.hh | 14
-rw-r--r-- klm/util/sorted_uniform.hh | 10
8 files changed, 69 insertions, 38 deletions
diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc
index 9d4fdf27..681da5f2 100644
--- a/klm/util/bit_packing.cc
+++ b/klm/util/bit_packing.cc
@@ -22,7 +22,7 @@ uint8_t RequiredBits(uint64_t max_value) {
}
void BitPackingSanity() {
- const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
+ const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
char mem[57+8];
memset(mem, 0, sizeof(mem));
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 636547b1..70cfc2d2 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -53,29 +53,32 @@ inline void WriteInt57(void *base, uint8_t bit, uint8_t length, uint64_t value)
*reinterpret_cast<uint64_t*>(base) |= (value << BitPackShift(bit, length));
}
-namespace detail { typedef union { float f; uint32_t i; } FloatEnc; }
+typedef union { float f; uint32_t i; } FloatEnc;
+
inline float ReadFloat32(const void *base, uint8_t bit) {
- detail::FloatEnc encoded;
+ FloatEnc encoded;
encoded.i = *reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, 32);
return encoded.f;
}
inline void WriteFloat32(void *base, uint8_t bit, float value) {
- detail::FloatEnc encoded;
+ FloatEnc encoded;
encoded.f = value;
WriteInt57(base, bit, 32, encoded.i);
}
+const uint32_t kSignBit = 0x80000000;
+
inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) {
- detail::FloatEnc encoded;
+ FloatEnc encoded;
encoded.i = *reinterpret_cast<const uint64_t*>(base) >> BitPackShift(bit, 31);
// Sign bit set means negative.
- encoded.i |= 0x80000000;
+ encoded.i |= kSignBit;
return encoded.f;
}
inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) {
- detail::FloatEnc encoded;
+ FloatEnc encoded;
encoded.f = value;
- encoded.i &= ~0x80000000;
+ encoded.i &= ~kSignBit;
WriteInt57(base, bit, 31, encoded.i);
}
diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc
index 55c182bd..a82ce672 100644
--- a/klm/util/ersatz_progress.cc
+++ b/klm/util/ersatz_progress.cc
@@ -36,6 +36,7 @@ void ErsatzProgress::Milestone() {
if (stone == kWidth) {
(*out_) << std::endl;
next_ = std::numeric_limits<std::size_t>::max();
+ out_ = NULL;
} else {
next_ = std::max(next_, (stone * complete_) / kWidth);
}
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index 5a667ebb..81eb9bb9 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -37,6 +37,9 @@ GZException::GZException(void *file) {
#endif // HAVE_ZLIB
}
+// Lookup table indexed by unsigned char value: true for ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace in the C locale). Sigh, writing it out literally is the only way I could come up with to do a _const_ bool array.
+const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
int OpenReadOrThrow(const char *name) {
int ret = open(name, O_RDONLY);
if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading");
@@ -107,13 +110,6 @@ unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException,
return ReadNumber<unsigned long int>();
}
-void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) {
- for (; ; ++position_) {
- if (position_ == position_end_) Shift();
- if (!isspace(*position_)) return;
- }
-}
-
void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) {
#ifdef HAVE_ZLIB
gz_file_ = NULL;
@@ -190,20 +186,6 @@ template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileExcepti
return ret;
}
-const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) {
- for (const char *i = position_; i <= last_space_; ++i) {
- if (isspace(*i)) return i;
- }
- while (!at_end_) {
- size_t skip = position_end_ - position_;
- Shift();
- for (const char *i = position_ + skip; i <= last_space_; ++i) {
- if (isspace(*i)) return i;
- }
- }
- return position_end_;
-}
-
void FilePiece::Shift() throw(GZException, EndOfFileException) {
if (at_end_) {
progress_.Finished();
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index b7697e71..f5249fcf 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -36,10 +36,13 @@ class GZException : public Exception {
int OpenReadOrThrow(const char *name);
+extern const bool kSpaces[256];
+
// Return value for SizeFile when it can't size properly.
const off_t kBadSize = -1;
off_t SizeFile(int fd);
+// Memory backing a StringPiece returned by a FilePiece method (e.g. ReadDelimited) may vanish on the next call.
class FilePiece {
public:
// 32 MB default.
@@ -57,12 +60,12 @@ class FilePiece {
return *(position_++);
}
- // Memory backing the returned StringPiece may vanish on the next call.
- // Leaves the delimiter, if any, to be returned by get().
- StringPiece ReadDelimited() throw(GZException, EndOfFileException) {
- SkipSpaces();
- return Consume(FindDelimiterOrEOF());
+ // Leaves the delimiter, if any, to be returned by get(). Delimiters are the characters flagged in delim; the default kSpaces matches isspace() in the C locale.
+ StringPiece ReadDelimited(const bool *delim = kSpaces) throw(GZException, EndOfFileException) {
+ SkipSpaces(delim);
+ return Consume(FindDelimiterOrEOF(delim));
}
+
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
// It is similar to getline in that way.
StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException);
@@ -72,7 +75,13 @@ class FilePiece {
long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException);
unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException);
- void SkipSpaces() throw (GZException, EndOfFileException);
+ // Skip characters flagged in delim; the default kSpaces matches isspace in the C locale.
+ void SkipSpaces(const bool *delim = kSpaces) throw (GZException, EndOfFileException) {
+ for (; ; ++position_) {
+ if (position_ == position_end_) Shift();
+ if (!delim[static_cast<unsigned char>(*position_)]) return;
+ }
+ }
off_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
@@ -91,7 +100,19 @@ class FilePiece {
return ret;
}
- const char *FindDelimiterOrEOF() throw(EndOfFileException, GZException);
+ const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException) {
+ for (const char *i = position_; i < position_end_; ++i) {
+ if (delim[static_cast<unsigned char>(*i)]) return i;
+ }
+ while (!at_end_) {
+ size_t skip = position_end_ - position_;
+ Shift();
+ for (const char *i = position_ + skip; i < position_end_; ++i) {
+ if (delim[static_cast<unsigned char>(*i)]) return i;
+ }
+ }
+ return position_end_;
+ }
void Shift() throw (EndOfFileException, GZException);
// Backends to Shift().
diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh
index 450512ac..b84a5aad 100644
--- a/klm/util/key_value_packing.hh
+++ b/klm/util/key_value_packing.hh
@@ -18,6 +18,8 @@ template <class Key, class Value> struct Entry {
const Key &GetKey() const { return key; }
const Value &GetValue() const { return value; }
+ Value &MutableValue() { return value; }
+
void Set(const Key &key_in, const Value &value_in) {
SetKey(key_in);
SetValue(value_in);
@@ -77,6 +79,8 @@ template <class KeyT, class ValueT> class ByteAlignedPacking {
const Key &GetKey() const { return key; }
const Value &GetValue() const { return value; }
+ Value &MutableValue() { return value; }
+
void Set(const Key &key_in, const Value &value_in) {
SetKey(key_in);
SetValue(value_in);
diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh
index 7b5cdc22..00be0ed7 100644
--- a/klm/util/probing_hash_table.hh
+++ b/klm/util/probing_hash_table.hh
@@ -77,6 +77,16 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
void LoadedBinary() {}
+ // Like Find, but yields a MutableIterator. Don't change anything related to GetKey through it.
+ template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
+ for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) {
+ Key got(i->GetKey());
+ if (equal_(got, key)) { out = i; return true; }
+ if (equal_(got, invalid_)) return false;
+ if (++i == end_) i = begin_;
+ }
+ }
+
template <class Key> bool Find(const Key key, ConstIterator &out) const {
#ifdef DEBUG
assert(initialized_);
@@ -84,8 +94,8 @@ template <class PackingT, class HashT, class EqualT = std::equal_to<typename Pac
for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) {
Key got(i->GetKey());
if (equal_(got, key)) { out = i; return true; }
- if (equal_(got, invalid_)) { return false; }
- if (++i == end_) { i = begin_; }
+ if (equal_(got, invalid_)) return false;
+ if (++i == end_) i = begin_;
}
}
diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh
index a8e208fb..05826b51 100644
--- a/klm/util/sorted_uniform.hh
+++ b/klm/util/sorted_uniform.hh
@@ -62,6 +62,7 @@ template <class PackingT> class SortedUniformMap {
public:
typedef PackingT Packing;
typedef typename Packing::ConstIterator ConstIterator;
+ typedef typename Packing::MutableIterator MutableIterator;
public:
// Offer consistent API with probing hash.
@@ -113,6 +114,15 @@ template <class PackingT> class SortedUniformMap {
*size_ptr_ = (end_ - begin_);
}
+ // Don't use this to change the key.
+ template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
+#ifdef DEBUG
+ assert(initialized_);
+ assert(loaded_);
+#endif
+ return SortedUniformFind<MutableIterator, Key>(begin_, end_, key, out);
+ }
+
// Do not call before FinishedInserting.
template <class Key> bool Find(const Key key, ConstIterator &out) const {
#ifdef DEBUG