From 931a036dc3cf9e1deafc10e78e94a0ebe3c8004f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 25 Jan 2011 22:30:48 +0200 Subject: update kenlm --- klm/util/bit_packing.cc | 2 +- klm/util/bit_packing.hh | 17 ++++++++++------- klm/util/ersatz_progress.cc | 1 + klm/util/file_piece.cc | 24 +++--------------------- klm/util/file_piece.hh | 35 ++++++++++++++++++++++++++++------- klm/util/key_value_packing.hh | 4 ++++ klm/util/probing_hash_table.hh | 14 ++++++++++++-- klm/util/sorted_uniform.hh | 10 ++++++++++ 8 files changed, 69 insertions(+), 38 deletions(-) (limited to 'klm/util') diff --git a/klm/util/bit_packing.cc b/klm/util/bit_packing.cc index 9d4fdf27..681da5f2 100644 --- a/klm/util/bit_packing.cc +++ b/klm/util/bit_packing.cc @@ -22,7 +22,7 @@ uint8_t RequiredBits(uint64_t max_value) { } void BitPackingSanity() { - const detail::FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; + const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); char mem[57+8]; memset(mem, 0, sizeof(mem)); diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 636547b1..70cfc2d2 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -53,29 +53,32 @@ inline void WriteInt57(void *base, uint8_t bit, uint8_t length, uint64_t value) *reinterpret_cast(base) |= (value << BitPackShift(bit, length)); } -namespace detail { typedef union { float f; uint32_t i; } FloatEnc; } +typedef union { float f; uint32_t i; } FloatEnc; + inline float ReadFloat32(const void *base, uint8_t bit) { - detail::FloatEnc encoded; + FloatEnc encoded; encoded.i = *reinterpret_cast(base) >> BitPackShift(bit, 32); return encoded.f; } inline void WriteFloat32(void *base, uint8_t bit, float value) { - detail::FloatEnc encoded; + FloatEnc encoded; encoded.f = value; WriteInt57(base, bit, 32, encoded.i); } +const uint32_t kSignBit = 0x80000000; + inline float ReadNonPositiveFloat31(const void *base, uint8_t bit) { - detail::FloatEnc encoded; + FloatEnc encoded; encoded.i = *reinterpret_cast(base) >> BitPackShift(bit, 31); // Sign bit set means negative. - encoded.i |= 0x80000000; + encoded.i |= kSignBit; return encoded.f; } inline void WriteNonPositiveFloat31(void *base, uint8_t bit, float value) { - detail::FloatEnc encoded; + FloatEnc encoded; encoded.f = value; - encoded.i &= ~0x80000000; + encoded.i &= ~kSignBit; WriteInt57(base, bit, 31, encoded.i); } diff --git a/klm/util/ersatz_progress.cc b/klm/util/ersatz_progress.cc index 55c182bd..a82ce672 100644 --- a/klm/util/ersatz_progress.cc +++ b/klm/util/ersatz_progress.cc @@ -36,6 +36,7 @@ void ErsatzProgress::Milestone() { if (stone == kWidth) { (*out_) << std::endl; next_ = std::numeric_limits::max(); + out_ = NULL; } else { next_ = std::max(next_, (stone * complete_) / kWidth); } diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index 5a667ebb..81eb9bb9 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -37,6 +37,9 @@ GZException::GZException(void *file) { #endif // HAVE_ZLIB } +// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). +const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + int OpenReadOrThrow(const char *name) { int ret = open(name, O_RDONLY); if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading"); @@ -107,13 +110,6 @@ unsigned long int FilePiece::ReadULong() throw(GZException, EndOfFileException, return ReadNumber(); } -void FilePiece::SkipSpaces() throw (GZException, EndOfFileException) { - for (; ; ++position_) { - if (position_ == position_end_) Shift(); - if (!isspace(*position_)) return; - } -} - void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) throw (GZException) { #ifdef HAVE_ZLIB gz_file_ = NULL; @@ -190,20 +186,6 @@ template T FilePiece::ReadNumber() throw(GZException, EndOfFileExcepti return ret; } -const char *FilePiece::FindDelimiterOrEOF() throw (GZException, EndOfFileException) { - for (const char *i = position_; i <= last_space_; ++i) { - if (isspace(*i)) return i; - } - while (!at_end_) { - size_t skip = position_end_ - position_; - Shift(); - for (const char *i = position_ + skip; i <= last_space_; ++i) { - if (isspace(*i)) return i; - } - } - return position_end_; -} - void FilePiece::Shift() throw(GZException, EndOfFileException) { if (at_end_) { progress_.Finished(); diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index b7697e71..f5249fcf 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -36,10 +36,13 @@ class GZException : public Exception { int OpenReadOrThrow(const char *name); +extern const bool kSpaces[256]; + // Return value for SizeFile when it can't size properly. const off_t kBadSize = -1; off_t SizeFile(int fd); +// Memory backing the returned StringPiece may vanish on the next call. class FilePiece { public: // 32 MB default. @@ -57,12 +60,12 @@ class FilePiece { return *(position_++); } - // Memory backing the returned StringPiece may vanish on the next call. - // Leaves the delimiter, if any, to be returned by get(). - StringPiece ReadDelimited() throw(GZException, EndOfFileException) { - SkipSpaces(); - return Consume(FindDelimiterOrEOF()); + // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). + StringPiece ReadDelimited(const bool *delim = kSpaces) throw(GZException, EndOfFileException) { + SkipSpaces(delim); + return Consume(FindDelimiterOrEOF(delim)); } + // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. // It is similar to getline in that way. StringPiece ReadLine(char delim = '\n') throw(GZException, EndOfFileException); @@ -72,7 +75,13 @@ class FilePiece { long int ReadLong() throw(GZException, EndOfFileException, ParseNumberException); unsigned long int ReadULong() throw(GZException, EndOfFileException, ParseNumberException); - void SkipSpaces() throw (GZException, EndOfFileException); + // Skip spaces defined by isspace. + void SkipSpaces(const bool *delim = kSpaces) throw (GZException, EndOfFileException) { + for (; ; ++position_) { + if (position_ == position_end_) Shift(); + if (!delim[static_cast(*position_)]) return; + } + } off_t Offset() const { return position_ - data_.begin() + mapped_offset_; @@ -91,7 +100,19 @@ class FilePiece { return ret; } - const char *FindDelimiterOrEOF() throw(EndOfFileException, GZException); + const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException) { + for (const char *i = position_; i < position_end_; ++i) { + if (delim[static_cast(*i)]) return i; + } + while (!at_end_) { + size_t skip = position_end_ - position_; + Shift(); + for (const char *i = position_ + skip; i < position_end_; ++i) { + if (delim[static_cast(*i)]) return i; + } + } + return position_end_; + } void Shift() throw (EndOfFileException, GZException); // Backends to Shift(). diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh index 450512ac..b84a5aad 100644 --- a/klm/util/key_value_packing.hh +++ b/klm/util/key_value_packing.hh @@ -18,6 +18,8 @@ template struct Entry { const Key &GetKey() const { return key; } const Value &GetValue() const { return value; } + Value &MutableValue() { return value; } + void Set(const Key &key_in, const Value &value_in) { SetKey(key_in); SetValue(value_in); @@ -77,6 +79,8 @@ template class ByteAlignedPacking { const Key &GetKey() const { return key; } const Value &GetValue() const { return value; } + Value &MutableValue() { return value; } + void Set(const Key &key_in, const Value &value_in) { SetKey(key_in); SetValue(value_in); diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 7b5cdc22..00be0ed7 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -77,6 +77,16 @@ template bool UnsafeMutableFind(const Key key, MutableIterator &out) { + for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { out = i; return true; } + if (equal_(got, invalid_)) return false; + if (++i == end_) i = begin_; + } + } + template bool Find(const Key key, ConstIterator &out) const { #ifdef DEBUG assert(initialized_); @@ -84,8 +94,8 @@ template GetKey()); if (equal_(got, key)) { out = i; return true; } - if (equal_(got, invalid_)) { return false; } - if (++i == end_) { i = begin_; } + if (equal_(got, invalid_)) return false; + if (++i == end_) i = begin_; } } diff --git a/klm/util/sorted_uniform.hh b/klm/util/sorted_uniform.hh index a8e208fb..05826b51 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -62,6 +62,7 @@ template class SortedUniformMap { public: typedef PackingT Packing; typedef typename Packing::ConstIterator ConstIterator; + typedef typename Packing::MutableIterator MutableIterator; public: // Offer consistent API with probing hash. @@ -113,6 +114,15 @@ template class SortedUniformMap { *size_ptr_ = (end_ - begin_); } + // Don't use this to change the key. + template bool UnsafeMutableFind(const Key key, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); + assert(loaded_); +#endif + return SortedUniformFind(begin_, end_, key, out); + } + // Do not call before FinishedInserting. template bool Find(const Key key, ConstIterator &out) const { #ifdef DEBUG -- cgit v1.2.3