Updated kenlm. Includes left state support but not the cdec-side use of it. Updated binary format.

author: Kenneth Heafield <kenlm@kheafield.com> 2011-09-21 18:23:50 -0400
committer: Kenneth Heafield <kenlm@kheafield.com> 2011-09-21 18:23:50 -0400
commit: 83cae8bd92a2ebffcf2b8b4d2500766da008fe3d (patch)
tree: 575a54bbebc74eaafef6e9ebe4b37e6ad211b632 /klm/util
parent: 4b619c0ca5b8c521c6fb4c3df1c4b43756baa306 (diff)
14 files changed, 504 insertions, 239 deletions
diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh
index 9f47d559..33266b94 100644
--- a/klm/util/bit_packing.hh
+++ b/klm/util/bit_packing.hh
@@ -86,6 +86,20 @@ inline void WriteFloat32(void *base, uint64_t bit_off, float value) {
 
 const uint32_t kSignBit = 0x80000000;
 
+inline void SetSign(float &to) {
+  FloatEnc bits;  // FloatEnc exposes the value both as .f (float) and .i (integer bits)
+  bits.f = to;
+  bits.i = bits.i | kSignBit;  // kSignBit is 0x80000000: switch the top bit on
+  to = bits.f;
+}
+
+inline void UnsetSign(float &to) {
+  FloatEnc bits;  // FloatEnc exposes the value both as .f (float) and .i (integer bits)
+  bits.f = to;
+  bits.i = bits.i & ~kSignBit;  // kSignBit is 0x80000000: switch the top bit off
+  to = bits.f;
+}
+
 inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) {
   FloatEnc encoded;
   encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31);
diff --git a/klm/util/exception.cc b/klm/util/exception.cc
index 62280970..96951495 100644
--- a/klm/util/exception.cc
+++ b/klm/util/exception.cc
@@ -79,4 +79,9 @@ ErrnoException::ErrnoException() throw() : errno_(errno) {
 
 ErrnoException::~ErrnoException() throw() {}
 
+EndOfFileException::EndOfFileException() throw() {
+  *this << "End of file";  // streams the default text into the exception (presumably its what() message)
+}
+EndOfFileException::~EndOfFileException() throw() {}
+
 } // namespace util
diff --git a/klm/util/exception.hh b/klm/util/exception.hh
index 81675a57..6d6a37cb 100644
--- a/klm/util/exception.hh
+++ b/klm/util/exception.hh
@@ -105,6 +105,12 @@ class ErrnoException : public Exception {
     int errno_;
 };
 
+class EndOfFileException : public Exception {  // its constructor (exception.cc) streams "End of file" into the object
+  public:
+    EndOfFileException() throw();
+    ~EndOfFileException() throw();
+};
+
 } // namespace util
 
 #endif // UTIL_EXCEPTION__
diff --git a/klm/util/file.cc b/klm/util/file.cc
new file mode 100644
index 00000000..d707568e
--- /dev/null
+++ b/klm/util/file.cc
@@ -0,0 +1,74 @@
+#include "util/file.hh"
+#include "util/exception.hh"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+namespace util {
+
+scoped_fd::~scoped_fd() {
+  // Nothing to do when no descriptor is held (-1) or close() reports success (0).
+  if (fd_ == -1 || !close(fd_)) return;
+  std::cerr << "Could not close file " << fd_ << std::endl;
+  std::abort();
+}
+
+scoped_FILE::~scoped_FILE() {
+  // Nothing to do when no stream is held or std::fclose() reports success (0).
+  if (!file_ || !std::fclose(file_)) return;
+  std::cerr << "Could not close file " << std::endl;
+  std::abort();
+}
+
+int OpenReadOrThrow(const char *name) {
+  const int fd = open(name, O_RDONLY);  // read-only open; -1 means it failed
+  UTIL_THROW_IF(-1 == fd, ErrnoException, "while opening " << name);
+  return fd;
+}
+
+int CreateOrThrow(const char *name) {
+  const int fd = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);  // create or truncate, read-write, owner read/write mode bits
+  UTIL_THROW_IF(-1 == fd, ErrnoException, "while creating " << name);
+  return fd;
+}
+
+off_t SizeFile(int fd) {
+  struct stat info;
+  if (-1 == fstat(fd, &info)) return kBadSize;  // the descriptor could not be stat'ed
+  return (info.st_size || S_ISREG(info.st_mode)) ? info.st_size : kBadSize;  // a zero size is only reported for regular files
+}
+
+// Read exactly amount bytes from fd into to_void, looping over short reads.  Throws ErrnoException when read() fails and Exception on premature EOF.
+void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
+  uint8_t *to = static_cast<uint8_t*>(to_void);
+  for (ssize_t ret; amount; amount -= ret, to += ret) {
+    // A read interrupted by a signal fails with EINTR without transferring anything, so retry it instead of throwing.
+    do { ret = read(fd, to, amount); } while (ret == -1 && errno == EINTR);
+    if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
+    if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
+  }
+}
+
+// Write exactly size bytes from data_void to fd, looping over short writes.  Throws util::ErrnoException when write() makes no progress.
+void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
+  const uint8_t *data = static_cast<const uint8_t*>(data_void);
+  for (ssize_t ret; size; data += ret, size -= ret) {
+    // A write interrupted by a signal fails with EINTR without transferring anything, so retry it instead of throwing.
+    do { ret = write(fd, data, size); } while (ret == -1 && errno == EINTR);
+    if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed");
+  }
+}
+
+void RemoveOrThrow(const char *name) {
+  UTIL_THROW_IF(std::remove(name) != 0, util::ErrnoException, "Could not remove " << name);  // std::remove returns nonzero on failure
+}
+
+} // namespace util
diff --git a/klm/util/file.hh b/klm/util/file.hh
new file mode 100644
index 00000000..d6cca41d
--- /dev/null
+++ b/klm/util/file.hh
@@ -0,0 +1,74 @@
+#ifndef UTIL_FILE__
+#define UTIL_FILE__
+
+#include <cstdio>
+#include <unistd.h>
+
+namespace util {
+
+class scoped_fd {  // owns a file descriptor; the out-of-line destructor closes it
+  public:
+    scoped_fd() : fd_(-1) {}  // -1 marks "no descriptor held"
+
+    explicit scoped_fd(int fd) : fd_(fd) {}
+
+    ~scoped_fd();
+    // Dispose of the held descriptor through a temporary's destructor, then hold `to`.
+    void reset(int to) {
+      scoped_fd other(fd_);
+      fd_ = to;
+    }
+
+    int get() const { return fd_; }
+
+    int operator*() const { return fd_; }
+    // Give up the descriptor without closing it: returns it and leaves this object empty.
+    int release() {
+      int ret = fd_;
+      fd_ = -1;
+      return ret;
+    }
+    // True when a descriptor is held.  const so it also works on const objects.
+    operator bool() const { return fd_ != -1; }
+
+  private:
+    int fd_;
+    // Private copy operations: copying is disallowed.
+    scoped_fd(const scoped_fd &);
+    scoped_fd &operator=(const scoped_fd &);
+};
+
+// Owns a std::FILE*; the out-of-line destructor closes it with std::fclose.
+class scoped_FILE {
+  public:
+    explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
+    ~scoped_FILE();
+    std::FILE *get() { return file_; }
+    const std::FILE *get() const { return file_; }
+    void reset(std::FILE *to = NULL) {  // dispose of the held stream through a temporary, then hold `to`
+      scoped_FILE other(file_);
+      file_ = to;
+    }
+  private:
+    std::FILE *file_;
+    // Non-copyable, like scoped_fd: a copy would close the same stream twice.
+    scoped_FILE(const scoped_FILE &);
+    scoped_FILE &operator=(const scoped_FILE &);
+};
+
+int OpenReadOrThrow(const char *name);
+
+int CreateOrThrow(const char *name);
+
+// Return value for SizeFile when it can't size properly.  
+const off_t kBadSize = -1;
+off_t SizeFile(int fd);
+
+void ReadOrThrow(int fd, void *to, std::size_t size);
+void WriteOrThrow(int fd, const void *data_void, std::size_t size);
+
+void RemoveOrThrow(const char *name);
+
+} // namespace util
+
+#endif // UTIL_FILE__
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index cbe4234f..b57582a0 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -1,6 +1,7 @@
 #include "util/file_piece.hh"
 
 #include "util/exception.hh"
+#include "util/file.hh"
 
 #include <iostream>
 #include <string>
@@ -21,11 +22,6 @@
 
 namespace util {
 
-EndOfFileException::EndOfFileException() throw() {
-  *this << "End of file";
-}
-EndOfFileException::~EndOfFileException() throw() {}
-
 ParseNumberException::ParseNumberException(StringPiece value) throw() {
   *this << "Could not parse \"" << value << "\" into a number";
 }
@@ -40,18 +36,6 @@ GZException::GZException(void *file) {
 // Sigh this is the only way I could come up with to do a _const_ bool.  It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). 
 const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 
-int OpenReadOrThrow(const char *name) {
-  int ret;
-  UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
-  return ret;
-}
-
-off_t SizeFile(int fd) {
-  struct stat sb;
-  if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
-  return sb.st_size;
-}
-
 FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : 
   file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
   progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index a5c00910..a627f38c 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -3,9 +3,9 @@
 
 #include "util/ersatz_progress.hh"
 #include "util/exception.hh"
+#include "util/file.hh"
 #include "util/have.hh"
 #include "util/mmap.hh"
-#include "util/scoped.hh"
 #include "util/string_piece.hh"
 
 #include <string>
@@ -14,12 +14,6 @@
 
 namespace util {
 
-class EndOfFileException : public Exception {
-  public:
-    EndOfFileException() throw();
-    ~EndOfFileException() throw();
-};
-
 class ParseNumberException : public Exception {
   public:
     explicit ParseNumberException(StringPiece value) throw();
@@ -33,14 +27,8 @@ class GZException : public Exception {
     ~GZException() throw() {}
 };
 
-int OpenReadOrThrow(const char *name);
-
 extern const bool kSpaces[256];
 
-// Return value for SizeFile when it can't size properly.  
-const off_t kBadSize = -1;
-off_t SizeFile(int fd);
-
 // Memory backing the returned StringPiece may vanish on the next call.  
 class FilePiece {
   public:
diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc
index e7c0643b..5ce7adc9 100644
--- a/klm/util/mmap.cc
+++ b/klm/util/mmap.cc
@@ -1,6 +1,6 @@
 #include "util/exception.hh"
+#include "util/file.hh"
 #include "util/mmap.hh"
-#include "util/scoped.hh"
 
 #include <iostream>
 
@@ -66,20 +66,6 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
   return ret;
 }
 
-namespace {
-void ReadAll(int fd, void *to_void, std::size_t amount) {
-  uint8_t *to = static_cast<uint8_t*>(to_void);
-  while (amount) {
-    ssize_t ret = read(fd, to, amount);
-    if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed.");
-    if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
-    amount -= ret;
-    to += ret;
-  }
-}
-
-} // namespace
-
 const int kFileFlags =
 #ifdef MAP_FILE
   MAP_FILE | MAP_SHARED
@@ -106,7 +92,7 @@ void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_m
       out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
       if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc");
       if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed.");
-      ReadAll(fd, out.get(), size);
+      ReadOrThrow(fd, out.get(), size);
       break;
   }
 }
diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh
index e4439fa4..b0eb6672 100644
--- a/klm/util/mmap.hh
+++ b/klm/util/mmap.hh
@@ -2,8 +2,6 @@
 #define UTIL_MMAP__
 // Utilities for mmaped files.  
 
-#include "util/scoped.hh"
-
 #include <cstddef>
 
 #include <inttypes.h>
@@ -11,6 +9,8 @@
 
 namespace util {
 
+class scoped_fd;
+
 // (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.  
 class scoped_mmap {
   public:
diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc
index fec47fd9..d58a0727 100644
--- a/klm/util/murmur_hash.cc
+++ b/klm/util/murmur_hash.cc
@@ -1,129 +1,129 @@
-/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
- * code is released to the public domain. For business purposes, Murmurhash is
- * under the MIT license."
- * This is modified from the original:
- * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.  
- * length changed to unsigned int.  
- * placed in namespace util
- * add MurmurHashNative
- * default option = 0 for seed
- */
-
-#include "util/murmur_hash.hh"
-
-namespace util {
-
-//-----------------------------------------------------------------------------
-// MurmurHash2, 64-bit versions, by Austin Appleby
-
-// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment 
-// and endian-ness issues if used across multiple platforms.
-
-// 64-bit hash for 64-bit platforms
-
-uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
-{
-  const uint64_t m = 0xc6a4a7935bd1e995ULL;
-  const int r = 47;
-
-  uint64_t h = seed ^ (len * m);
-
-  const uint64_t * data = (const uint64_t *)key;
-  const uint64_t * end = data + (len/8);
-
-  while(data != end)
-  {
-    uint64_t k = *data++;
-
-    k *= m; 
-    k ^= k >> r; 
-    k *= m; 
-    
-    h ^= k;
-    h *= m; 
-  }
-
-  const unsigned char * data2 = (const unsigned char*)data;
-
-  switch(len & 7)
-  {
-  case 7: h ^= uint64_t(data2[6]) << 48;
-  case 6: h ^= uint64_t(data2[5]) << 40;
-  case 5: h ^= uint64_t(data2[4]) << 32;
-  case 4: h ^= uint64_t(data2[3]) << 24;
-  case 3: h ^= uint64_t(data2[2]) << 16;
-  case 2: h ^= uint64_t(data2[1]) << 8;
-  case 1: h ^= uint64_t(data2[0]);
-          h *= m;
-  };
- 
-  h ^= h >> r;
-  h *= m;
-  h ^= h >> r;
-
-  return h;
-} 
-
-
-// 64-bit hash for 32-bit platforms
-
-uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
-{
-  const unsigned int m = 0x5bd1e995;
-  const int r = 24;
-
-  unsigned int h1 = seed ^ len;
-  unsigned int h2 = 0;
-
-  const unsigned int * data = (const unsigned int *)key;
-
-  while(len >= 8)
-  {
-    unsigned int k1 = *data++;
-    k1 *= m; k1 ^= k1 >> r; k1 *= m;
-    h1 *= m; h1 ^= k1;
-    len -= 4;
-
-    unsigned int k2 = *data++;
-    k2 *= m; k2 ^= k2 >> r; k2 *= m;
-    h2 *= m; h2 ^= k2;
-    len -= 4;
-  }
-
-  if(len >= 4)
-  {
-    unsigned int k1 = *data++;
-    k1 *= m; k1 ^= k1 >> r; k1 *= m;
-    h1 *= m; h1 ^= k1;
-    len -= 4;
-  }
-
-  switch(len)
-  {
-  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
-  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
-  case 1: h2 ^= ((unsigned char*)data)[0];
-      h2 *= m;
-  };
-
-  h1 ^= h2 >> 18; h1 *= m;
-  h2 ^= h1 >> 22; h2 *= m;
-  h1 ^= h2 >> 17; h1 *= m;
-  h2 ^= h1 >> 19; h2 *= m;
-
-  uint64_t h = h1;
-
-  h = (h << 32) | h2;
-
-  return h;
-}
-
-uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
-  if (sizeof(int) == 4) {
-    return MurmurHash64B(key, len, seed);
-  } else {
-    return MurmurHash64A(key, len, seed);
-  }
-}
-
-} // namespace util
+/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
+ * code is released to the public domain. For business purposes, Murmurhash is
+ * under the MIT license."
+ * This is modified from the original:
+ * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.  
+ * length changed to unsigned int.  
+ * placed in namespace util
+ * add MurmurHashNative
+ * default option = 0 for seed
+ */
+
+#include "util/murmur_hash.hh"
+
+namespace util {
+
+//-----------------------------------------------------------------------------
+// MurmurHash2, 64-bit versions, by Austin Appleby
+
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment 
+// and endian-ness issues if used across multiple platforms.
+
+// 64-bit hash for 64-bit platforms
+
+uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )
+{
+  const uint64_t m = 0xc6a4a7935bd1e995ULL;
+  const int r = 47;
+
+  uint64_t h = seed ^ (len * m);
+  // The key is read as whole 8-byte words through a uint64_t pointer; the len & 7 leftover bytes are handled after the loop.
+  const uint64_t * data = (const uint64_t *)key;
+  const uint64_t * end = data + (len/8);
+
+  while(data != end)
+  {
+    uint64_t k = *data++;
+
+    k *= m; 
+    k ^= k >> r; 
+    k *= m; 
+    
+    h ^= k;
+    h *= m; 
+  }
+
+  const unsigned char * data2 = (const unsigned char*)data;
+  // The cases fall through on purpose, so each leftover byte is XORed in at its own bit offset.
+  switch(len & 7)
+  {
+  case 7: h ^= uint64_t(data2[6]) << 48;
+  case 6: h ^= uint64_t(data2[5]) << 40;
+  case 5: h ^= uint64_t(data2[4]) << 32;
+  case 4: h ^= uint64_t(data2[3]) << 24;
+  case 3: h ^= uint64_t(data2[2]) << 16;
+  case 2: h ^= uint64_t(data2[1]) << 8;
+  case 1: h ^= uint64_t(data2[0]);
+          h *= m;
+  };
+  // Final mixing of the accumulated state.
+  h ^= h >> r;
+  h *= m;
+  h ^= h >> r;
+
+  return h;
+} 
+
+
+// 64-bit hash for 32-bit platforms
+
+uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
+{
+  const unsigned int m = 0x5bd1e995;
+  const int r = 24;
+
+  unsigned int h1 = seed ^ len;
+  unsigned int h2 = 0;
+
+  const unsigned int * data = (const unsigned int *)key;
+  // Each pass consumes two words read through the unsigned int pointer: the first is mixed into h1, the second into h2.
+  while(len >= 8)
+  {
+    unsigned int k1 = *data++;
+    k1 *= m; k1 ^= k1 >> r; k1 *= m;
+    h1 *= m; h1 ^= k1;
+    len -= 4;
+
+    unsigned int k2 = *data++;
+    k2 *= m; k2 ^= k2 >> r; k2 *= m;
+    h2 *= m; h2 ^= k2;
+    len -= 4;
+  }
+  // One more whole word, if at least 4 bytes remain, goes into h1.
+  if(len >= 4)
+  {
+    unsigned int k1 = *data++;
+    k1 *= m; k1 ^= k1 >> r; k1 *= m;
+    h1 *= m; h1 ^= k1;
+    len -= 4;
+  }
+  // Up to 3 trailing bytes are folded into h2; the cases fall through on purpose.
+  switch(len)
+  {
+  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
+  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
+  case 1: h2 ^= ((unsigned char*)data)[0];
+      h2 *= m;
+  };
+  // Cross-mix the two halves.
+  h1 ^= h2 >> 18; h1 *= m;
+  h2 ^= h1 >> 22; h2 *= m;
+  h1 ^= h2 >> 17; h1 *= m;
+  h2 ^= h1 >> 19; h2 *= m;
+
+  uint64_t h = h1;
+  // h1 is shifted up by 32 bits and h2 is ORed into the low part of the result.
+  h = (h << 32) | h2;
+
+  return h;
+}
+
+uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) {
+  if (sizeof(int) == 4) {  // NOTE(review): keys on the width of int, which is 4 on common 64-bit ABIs too, so 64B is chosen there - confirm intended
+    return MurmurHash64B(key, len, seed);
+  } else {
+    return MurmurHash64A(key, len, seed);
+  }
+}
+
+} // namespace util
diff --git a/klm/util/scoped.cc b/klm/util/scoped.cc
deleted file mode 100644
index a4cc5016..00000000
--- a/klm/util/scoped.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "util/scoped.hh"
-
-#include <iostream>
-
-#include <stdlib.h>
-#include <unistd.h>
-
-namespace util {
-
-scoped_fd::~scoped_fd() {
-  if (fd_ != -1 && close(fd_)) {
-    std::cerr << "Could not close file " << fd_ << std::endl;
-    abort();
-  }
-}
-
-scoped_FILE::~scoped_FILE() {
-  if (file_ && fclose(file_)) {
-    std::cerr << "Could not close file " << std::endl;
-    abort();
-  }
-}
-
-} // namespace util
diff --git a/klm/util/scoped.hh b/klm/util/scoped.hh
index d36a7df3..12e6652b 100644
--- a/klm/util/scoped.hh
+++ b/klm/util/scoped.hh
@@ -1,10 +1,11 @@
 #ifndef UTIL_SCOPED__
 #define UTIL_SCOPED__
 
-/* Other scoped objects in the style of scoped_ptr. */
+#include "util/exception.hh"
 
+/* Other scoped objects in the style of scoped_ptr. */
 #include <cstddef>
-#include <cstdio>
+#include <cstdlib>
 
 namespace util {
 
@@ -34,52 +35,33 @@ template <class T, class R, R (*Free)(T*)> class scoped_thing {
     scoped_thing &operator=(const scoped_thing &);
 };
 
-class scoped_fd {
+class scoped_malloc {  // holds a pointer and std::free()s it on destruction
   public:
-    scoped_fd() : fd_(-1) {}
+    scoped_malloc() : p_(NULL) {}
 
-    explicit scoped_fd(int fd) : fd_(fd) {}
+    scoped_malloc(void *p) : p_(p) {}  // NOTE(review): not explicit, so a void* converts implicitly
 
-    ~scoped_fd();
+    ~scoped_malloc() { std::free(p_); }
 
-    void reset(int to) {
-      scoped_fd other(fd_);
-      fd_ = to;
+    void reset(void *p = NULL) {  // frees the old pointer through a temporary, then holds p
+      scoped_malloc other(p_);
+      p_ = p;
     }
 
-    int get() const { return fd_; }
-
-    int operator*() const { return fd_; }
-
-    int release() {
-      int ret = fd_;
-      fd_ = -1;
-      return ret;
+    void call_realloc(std::size_t to) {  // resizes the held block with std::realloc; throws util::ErrnoException when it returns NULL
+      void *ret;
+      UTIL_THROW_IF(!(ret = std::realloc(p_, to)), util::ErrnoException, "realloc to " << to << " bytes failed.");
+      p_ = ret;  // replaced only after success, so a failed realloc leaves the old pointer held
     }
 
-  private:
-    int fd_;
-
-    scoped_fd(const scoped_fd &);
-    scoped_fd &operator=(const scoped_fd &);
-};
-
-class scoped_FILE {
-  public:
-    explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {}
-
-    ~scoped_FILE();
-
-    std::FILE *get() { return file_; }
-    const std::FILE *get() const { return file_; }
-
-    void reset(std::FILE *to = NULL) {
-      scoped_FILE other(file_);
-      file_ = to;
-    }
+    void *get() { return p_; }
+    const void *get() const { return p_; }
 
   private:
-    std::FILE *file_;
+    void *p_;
+
+    scoped_malloc(const scoped_malloc &);
+    scoped_malloc &operator=(const scoped_malloc &);
 };
 
 // Hat tip to boost.  
diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh
new file mode 100644
index 00000000..47dfc245
--- /dev/null
+++ b/klm/util/sized_iterator.hh
@@ -0,0 +1,107 @@
+#ifndef UTIL_SIZED_ITERATOR__
+#define UTIL_SIZED_ITERATOR__
+
+#include "util/proxy_iterator.hh"
+
+#include <functional>
+#include <string>
+
+#include <inttypes.h>
+#include <string.h>
+
+namespace util {
+
+class SizedInnerIterator {  // position in a buffer of fixed-size entries; arithmetic is in whole entries
+  public:
+    SizedInnerIterator() : ptr_(NULL), size_(0) {}  // defined (null) state rather than indeterminate members
+
+    SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast<uint8_t*>(ptr)), size_(size) {}
+
+    bool operator==(const SizedInnerIterator &other) const {
+      return ptr_ == other.ptr_;
+    }
+    bool operator<(const SizedInnerIterator &other) const {
+      return ptr_ < other.ptr_;
+    }
+    SizedInnerIterator &operator+=(std::ptrdiff_t amount) {
+      ptr_ += amount * static_cast<std::ptrdiff_t>(size_);  // signed math so a negative amount steps backwards
+      return *this;
+    }
+    std::ptrdiff_t operator-(const SizedInnerIterator &other) const {
+      return (ptr_ - other.ptr_) / static_cast<std::ptrdiff_t>(size_);  // dividing by the unsigned size_ would turn a negative distance into a huge positive one
+    }
+    // Raw access to the entry this iterator addresses and to the per-entry byte size.
+    const void *Data() const { return ptr_; }
+    void *Data() { return ptr_; }
+    std::size_t EntrySize() const { return size_; }
+
+  private:
+    uint8_t *ptr_;
+    std::size_t size_;
+};
+
+class SizedProxy {  // stands in for a reference to one fixed-size entry; assignment copies the entry's bytes
+  public:
+    SizedProxy() {}
+
+    SizedProxy(void *ptr, std::size_t size) : inner_(ptr, size) {}
+    // Copy the entry's bytes out into an owning std::string.
+    operator std::string() const {
+      const char *const bytes = static_cast<const char*>(inner_.Data());
+      return std::string(bytes, inner_.EntrySize());
+    }
+    // Overwrite this entry with another entry's bytes; the length copied is this entry's size.
+    SizedProxy &operator=(const SizedProxy &from) {
+      memcpy(Data(), from.Data(), inner_.EntrySize());
+      return *this;
+    }
+    // Overwrite this entry from a string; copies EntrySize() bytes regardless of from.size().
+    SizedProxy &operator=(const std::string &from) {
+      memcpy(Data(), from.data(), inner_.EntrySize());
+      return *this;
+    }
+
+    const void *Data() const { return inner_.Data(); }
+    void *Data() { return inner_.Data(); }
+
+  private:
+    friend class util::ProxyIterator<SizedProxy>;
+
+    typedef std::string value_type;
+    typedef SizedInnerIterator InnerIterator;
+
+    InnerIterator &Inner() { return inner_; }
+    const InnerIterator &Inner() const { return inner_; }
+    InnerIterator inner_;
+};
+
+typedef ProxyIterator<SizedProxy> SizedIterator;
+
+inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); }  // wraps a SizedProxy(ptr, size) in a SizedIterator
+
+// Useful wrapper for a comparison function, e.g. for sort: adapts a delegate that compares raw entry pointers.
+// The argument/result typedefs are spelled out instead of inheriting std::binary_function, which C++17 removed.
+template <class Delegate, class Proxy = SizedProxy> class SizedCompare {
+  public:
+    typedef const Proxy &first_argument_type;
+    typedef const Proxy &second_argument_type;
+    typedef bool result_type;
+
+    explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {}
+
+    // Every overload hands the delegate pointers to the two entries' bytes, whether they
+    // live behind a proxy or inside a std::string.
+    bool operator()(const Proxy &first, const Proxy &second) const { return delegate_(first.Data(), second.Data()); }
+    bool operator()(const Proxy &first, const std::string &second) const { return delegate_(first.Data(), second.data()); }
+    bool operator()(const std::string &first, const Proxy &second) const { return delegate_(first.data(), second.Data()); }
+    bool operator()(const std::string &first, const std::string &second) const { return delegate_(first.data(), second.data()); }
+
+    const Delegate &GetDelegate() const { return delegate_; }
+
+  private:
+    // Held by value, copied from the constructor argument.
+    const Delegate delegate_;
+};
+
+} // namespace util
+#endif // UTIL_SIZED_ITERATOR__
diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh
new file mode 100644
index 00000000..ee1c7ab2
--- /dev/null
+++ b/klm/util/tokenize_piece.hh
@@ -0,0 +1,69 @@
+#ifndef UTIL_TOKENIZE_PIECE__
+#define UTIL_TOKENIZE_PIECE__
+
+#include "util/string_piece.hh"
+
+#include <boost/iterator/iterator_facade.hpp>
+
+/* Usage:
+ *
+ * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) {
+ *   std::cout << *i << "\n";
+ * }
+ *
+ */
+
+namespace util {
+
+// Tokenize a StringPiece using an iterator interface.  boost::tokenizer doesn't work with StringPiece.
+template <char d> class PieceIterator : public boost::iterator_facade<PieceIterator<d>, const StringPiece, boost::forward_traversal_tag> {
+  public:
+    // A default-constructed iterator is the end iterator, the same value end() returns.
+    PieceIterator() {}
+
+    // Tokenize str on the delimiter d; the initial increment() moves to the first token.
+    explicit PieceIterator(const StringPiece &str) : after_(str) { increment(); }
+
+    // The end state is encoded as after_ having a null data pointer.
+    bool operator!() const { return !after_.data(); }
+    operator bool() const { return !!after_.data(); }
+
+    // Named form of the default-constructed end iterator.
+    static PieceIterator<d> end() {
+      return PieceIterator<d>();
+    }
+
+  private:
+    friend class boost::iterator_core_access;
+
+    // Advance to the next token: skip a run of delimiters, then take characters up to the next delimiter.
+    void increment() {
+      const char *const limit = after_.data() + after_.size();
+      const char *start = after_.data();
+      while (start != limit && *start == d) ++start;
+      if (start == limit) {
+        // End condition: nothing but delimiters remained.
+        after_.clear();
+        return;
+      }
+      const char *finish = start;
+      while (finish != limit && *finish != d) ++finish;
+      current_ = StringPiece(start, finish - start);
+      after_ = StringPiece(finish, limit - finish);
+    }
+
+    // Iterators compare equal when their unread remainders begin at the same address.
+    bool equal(const PieceIterator &other) const {
+      return other.after_.data() == after_.data();
+    }
+
+    // The token found by the latest increment().
+    const StringPiece &dereference() const { return current_; }
+
+    StringPiece current_;  // token most recently produced by increment()
+    StringPiece after_;    // input not yet consumed
+};
+
+} // namespace util
+
+#endif // UTIL_TOKENIZE_PIECE__
author	Kenneth Heafield <kenlm@kheafield.com>	2011-09-21 18:23:50 -0400
committer	Kenneth Heafield <kenlm@kheafield.com>	2011-09-21 18:23:50 -0400
commit	83cae8bd92a2ebffcf2b8b4d2500766da008fe3d (patch)
tree	575a54bbebc74eaafef6e9ebe4b37e6ad211b632 /klm/util
parent	4b619c0ca5b8c521c6fb4c3df1c4b43756baa306 (diff)