KenLM 58da338b

author: Kenneth Heafield <github@kheafield.com> 2013-01-22 21:37:49 +0000
committer: Kenneth Heafield <github@kheafield.com> 2013-01-22 21:37:49 +0000
commit: 516c132fb683b5bf77ae3230a1b3709beb57618e (patch)
tree: d37fb5d1d8f4273dac3509291495ff9797c399e0 /klm
parent: 53532304714256f692fd5f7305b2fab10a7d7cca (diff)
27 files changed, 224 insertions, 106 deletions
diff --git a/klm/lm/Makefile.am b/klm/lm/Makefile.am
index 45f40c43..48b0ba34 100644
--- a/klm/lm/Makefile.am
+++ b/klm/lm/Makefile.am
@@ -1,9 +1,9 @@
 bin_PROGRAMS = build_binary ngram_query
 
-build_binary_SOURCES = build_binary.cc
+build_binary_SOURCES = build_binary_main.cc
 build_binary_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz
 
-ngram_query_SOURCES = ngram_query.cc
+ngram_query_SOURCES = query_main.cc
 ngram_query_LDADD = libklm.a ../util/libklm_util.a ../util/double-conversion/libklm_util_double.a -lz
 
 #noinst_PROGRAMS = \
diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary_main.cc
index ab2c0c32..ab2c0c32 100644
--- a/klm/lm/build_binary.cc
+++ b/klm/lm/build_binary_main.cc
diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am
index b5c147fd..317e03ce 100644
--- a/klm/lm/builder/Makefile.am
+++ b/klm/lm/builder/Makefile.am
@@ -1,7 +1,7 @@
 bin_PROGRAMS = builder
 
 builder_SOURCES = \
-  main.cc \
+  lmplz_main.cc \
   adjust_counts.cc \
   adjust_counts.hh \
   corpus_count.cc \
diff --git a/klm/lm/builder/discount.hh b/klm/lm/builder/discount.hh
index 754fb20d..4d0aa4fd 100644
--- a/klm/lm/builder/discount.hh
+++ b/klm/lm/builder/discount.hh
@@ -3,7 +3,7 @@
 
 #include <algorithm>
 
-#include <inttypes.h>
+#include <stdint.h>
 
 namespace lm {
 namespace builder {
diff --git a/klm/lm/builder/main.cc b/klm/lm/builder/lmplz_main.cc
index 90b9dca2..90b9dca2 100644
--- a/klm/lm/builder/main.cc
+++ b/klm/lm/builder/lmplz_main.cc
diff --git a/klm/lm/filter/main.cc b/klm/lm/filter/filter_main.cc
index c42243e2..1a4ba84f 100644
--- a/klm/lm/filter/main.cc
+++ b/klm/lm/filter/filter_main.cc
@@ -53,7 +53,7 @@ void DisplayHelp(const char *name) {
     "    stream i.e. /dev/stdout\n";
 }
 
-typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION} FilterMode;
+typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode;
 typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
 
 struct Config {
@@ -162,19 +162,19 @@ int main(int argc, char *argv[]) {
     return 1;
   }
 
-  // I used to have boost::program_options, but some users didn't want to compile boost.  
+  // I used to have boost::program_options, but some users didn't want to compile boost.
   lm::Config config;
-  boost::optional<lm::FilterMode> mode;
+  config.mode = lm::MODE_UNSET;
   for (int i = 1; i < argc - 2; ++i) {
     const char *str = argv[i];
     if (!std::strcmp(str, "copy")) {
-      mode = lm::MODE_COPY;
+      config.mode = lm::MODE_COPY;
     } else if (!std::strcmp(str, "single")) {
-      mode = lm::MODE_SINGLE;
+      config.mode = lm::MODE_SINGLE;
     } else if (!std::strcmp(str, "multiple")) {
-      mode = lm::MODE_MULTIPLE;
+      config.mode = lm::MODE_MULTIPLE;
     } else if (!std::strcmp(str, "union")) {
-      mode = lm::MODE_UNION;
+      config.mode = lm::MODE_UNION;
     } else if (!std::strcmp(str, "phrase")) {
       config.phrase = true;
     } else if (!std::strcmp(str, "context")) {
@@ -203,13 +203,12 @@ int main(int argc, char *argv[]) {
     }
   }
   
-  if (!mode) {
+  if (config.mode == lm::MODE_UNSET) {
     lm::DisplayHelp(argv[0]);
     return 1;
   }
-  config.mode = *mode;
 
-  if (config.phrase && config.mode != lm::MODE_UNION && mode != lm::MODE_MULTIPLE) {
+  if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) {
     std::cerr << "Phrase constraint currently only works in multiple or union mode.  If you really need it for single, put everything on one line and use union." << std::endl;
     return 1;
   }
diff --git a/klm/lm/filter/phrase.hh b/klm/lm/filter/phrase.hh
index 07479dea..b4edff41 100644
--- a/klm/lm/filter/phrase.hh
+++ b/klm/lm/filter/phrase.hh
@@ -57,6 +57,7 @@ class Substrings {
     LM_FILTER_PHRASE_METHOD(Right, right)
     LM_FILTER_PHRASE_METHOD(Phrase, phrase)
 
+#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
     // sentence_id must be non-decreasing.  Iterators are over words in the phrase.  
     template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
       // Iterate over all substrings.  
diff --git a/klm/lm/filter/vocab.hh b/klm/lm/filter/vocab.hh
index e2b6adff..7f0fadaa 100644
--- a/klm/lm/filter/vocab.hh
+++ b/klm/lm/filter/vocab.hh
@@ -5,6 +5,7 @@
 
 #include "util/multi_intersection.hh"
 #include "util/string_piece.hh"
+#include "util/string_piece_hash.hh"
 #include "util/tokenize_piece.hh"
 
 #include <boost/noncopyable.hpp>
diff --git a/klm/lm/fragment.cc b/klm/lm/fragment_main.cc
index 0267cd4e..0267cd4e 100644
--- a/klm/lm/fragment.cc
+++ b/klm/lm/fragment_main.cc
diff --git a/klm/lm/max_order.cc b/klm/lm/kenlm_max_order_main.cc
index 94221201..94221201 100644
--- a/klm/lm/max_order.cc
+++ b/klm/lm/kenlm_max_order_main.cc
diff --git a/klm/lm/ngram_query.cc b/klm/lm/query_main.cc
index 49757d9a..49757d9a 100644
--- a/klm/lm/ngram_query.cc
+++ b/klm/lm/query_main.cc
diff --git a/klm/util/Makefile.am b/klm/util/Makefile.am
index 248cc844..7f873e96 100644
--- a/klm/util/Makefile.am
+++ b/klm/util/Makefile.am
@@ -38,6 +38,7 @@ libklm_util_a_SOURCES = \
   sized_iterator.hh \
   sorted_uniform.hh \
   string_piece.hh \
+  string_piece_hash.hh \
   thread_pool.hh \
   tokenize_piece.hh \
   usage.hh \
diff --git a/klm/util/double-conversion/strtod.cc b/klm/util/double-conversion/strtod.cc
index 9758989f..e298766a 100644
--- a/klm/util/double-conversion/strtod.cc
+++ b/klm/util/double-conversion/strtod.cc
@@ -506,7 +506,9 @@ float Strtof(Vector<const char> buffer, int exponent) {
   double double_previous = Double(double_guess).PreviousDouble();
 
   float f1 = static_cast<float>(double_previous);
+#ifndef NDEBUG
   float f2 = float_guess;
+#endif
   float f3 = static_cast<float>(double_next);
   float f4;
   if (is_correct) {
@@ -515,7 +517,9 @@ float Strtof(Vector<const char> buffer, int exponent) {
     double double_next2 = Double(double_next).NextDouble();
     f4 = static_cast<float>(double_next2);
   }
+#ifndef NDEBUG
   ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4);
+#endif
 
   // If the guess doesn't lie near a single-precision boundary we can simply
   // return its float-value.
diff --git a/klm/util/file.cc b/klm/util/file.cc
index 9a6d2e64..86d9b12d 100644
--- a/klm/util/file.cc
+++ b/klm/util/file.cc
@@ -22,6 +22,7 @@
 #include <io.h>
 #include <algorithm>
 #include <limits.h>
+#include <limits>
 #else
 #include <unistd.h>
 #endif
@@ -99,15 +100,15 @@ uint64_t SizeOrThrow(int fd) {
 }
 
 void ResizeOrThrow(int fd, uint64_t to) {
-  UTIL_THROW_IF_ARG(
 #if defined(_WIN32) || defined(_WIN64)
-    _chsize_s
+    errno_t ret = _chsize_s
 #elif defined(OS_ANDROID)
-    ftruncate64
+    int ret = ftruncate64
 #else
-    ftruncate
+    int ret = ftruncate
 #endif
-    (fd, to), FDException, (fd), "while resizing to " << to << " bytes");
+    (fd, to);
+  UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes");
 }
 
 std::size_t PartialRead(int fd, void *to, std::size_t amount) {
@@ -150,9 +151,21 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
 void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
   uint8_t *to = static_cast<uint8_t*>(to_void);
 #if defined(_WIN32) || defined(_WIN64)
-  UTIL_THROW(Exception, "TODO: PReadOrThrow for windows using ReadFile http://stackoverflow.com/questions/766477/are-there-equivalents-to-pread-on-different-platforms");
-#else
+  UTIL_THROW(Exception, "This pread implementation for windows is broken.  Please send me a patch that does not change the file pointer.  Atomically.  Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread.");
+  const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL);
+#endif
   for (;size ;) {
+#if defined(_WIN32) || defined(_WIN64)
+    /* BROKEN: changes file pointer.  Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */
+    // size_t might be 64-bit.  DWORD is always 32.
+    DWORD reading = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, size));
+    DWORD ret;
+    OVERLAPPED overlapped;
+    memset(&overlapped, 0, sizeof(OVERLAPPED));
+    overlapped.Offset = static_cast<DWORD>(off);
+    overlapped.OffsetHigh = static_cast<DWORD>(off >> 32);
+    UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), Exception, "ReadFile failed for offset " << off);
+#else
     ssize_t ret;
     errno = 0;
     do {
@@ -166,11 +179,11 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
       UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
       UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off);
     }
+#endif
     size -= ret;
     off += ret;
     to += ret;
   }
-#endif
 }
 
 void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
@@ -218,15 +231,15 @@ typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
 
 // Can't we all just get along?  
 void InternalSeek(int fd, int64_t off, int whence) {
-  UTIL_THROW_IF_ARG(
+  if (
 #if defined(_WIN32) || defined(_WIN64)
-    (__int64)-1 == _lseeki64(fd, off, whence),
+    (__int64)-1 == _lseeki64(fd, off, whence)
 #elif defined(OS_ANDROID)
-    (off64_t)-1 == lseek64(fd, off, whence),
+    (off64_t)-1 == lseek64(fd, off, whence)
 #else
-    (off_t)-1 == lseek(fd, off, whence),
+    (off_t)-1 == lseek(fd, off, whence)
 #endif
-    FDException, (fd), "while seeking to " << off << " whence " << whence);
+  ) UTIL_THROW_ARG(FDException, (fd), "while seeking to " << off << " whence " << whence);
 }
 } // namespace
 
@@ -386,7 +399,13 @@ void NormalizeTempPrefix(std::string &base) {
   struct stat sb;
   // It's fine for it to not exist.
   if (-1 == stat(base.c_str(), &sb)) return;
-  if (S_ISDIR(sb.st_mode)) base += '/';
+  if (
+#if defined(_WIN32) || defined(_WIN64)
+    sb.st_mode & _S_IFDIR
+#else
+    S_ISDIR(sb.st_mode)
+#endif
+    ) base += '/';
 }
 
 int MakeTemp(const std::string &base) {
diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc
index fbfa0e0e..4d143857 100644
--- a/klm/util/file_piece.cc
+++ b/klm/util/file_piece.cc
@@ -49,6 +49,18 @@ FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std:
   Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer);
 }
 
+FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) :
+  total_size_(kBadSize), page_(SizePage()) {
+  InitializeNoRead(name ? name : "istream", min_buffer);  // NULL name (the default) keeps the "istream" label
+  // Start out in read-fallback mode with an allocated but empty buffer.
+  fallback_to_read_ = true;
+  data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
+  position_ = data_.begin();
+  position_end_ = position_;
+  // Bind the fallback reader to the caller's stream.
+  fell_back_.Reset(stream);
+}
+
 FilePiece::~FilePiece() {}
 
 StringPiece FilePiece::ReadLine(char delim) {
@@ -83,7 +95,8 @@ unsigned long int FilePiece::ReadULong() {
   return ReadNumber<unsigned long int>();
 }
 
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer)  {
+// Factored out so that istream can call this.
+void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) {
   file_name_ = name;
 
   default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
@@ -91,6 +104,10 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
   position_end_ = NULL;
   mapped_offset_ = 0;
   at_end_ = false;
+}
+
+void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
+  InitializeNoRead(name, min_buffer);
 
   if (total_size_ == kBadSize) {
     // So the assertion passes.  
@@ -239,8 +256,7 @@ void FilePiece::TransitionToRead() {
   assert(!fallback_to_read_);
   fallback_to_read_ = true;
   data_.reset();
-  data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
-  UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_);
+  data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
   position_ = data_.begin();
   position_end_ = position_;
 
diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh
index 53310976..c07c6011 100644
--- a/klm/util/file_piece.hh
+++ b/klm/util/file_piece.hh
@@ -9,6 +9,7 @@
 #include "util/string_piece.hh"
 
 #include <cstddef>
+#include <iosfwd>
 #include <string>
 
 #include <stdint.h>
@@ -31,6 +32,13 @@ class FilePiece {
     // Takes ownership of fd.  name is used for messages.
     explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
 
+    /* Read from an istream.  Don't use this if you can avoid it.  Raw fd IO is
+     * much faster.  But sometimes you just have an istream like Boost's HTTP
+     * server and want to parse it the same way.
+     * name is just used for messages and FileName().
+     */
+    explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576);
+
     ~FilePiece();
 
     char get() {
@@ -71,6 +79,8 @@ class FilePiece {
     const std::string &FileName() const { return file_name_; }
 
   private:
+    void InitializeNoRead(const char *name, std::size_t min_buffer);
+    // Calls InitializeNoRead, so don't call both.
     void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
 
     template <class T> T ReadNumber();
diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc
index 91e4c559..7336007d 100644
--- a/klm/util/file_piece_test.cc
+++ b/klm/util/file_piece_test.cc
@@ -24,6 +24,20 @@ std::string FileLocation() {
   return ret;
 }
 
+/* istream */
+BOOST_AUTO_TEST_CASE(IStream) {
+  std::fstream ref(FileLocation().c_str(), std::ios::in);
+  std::fstream backing(FileLocation().c_str(), std::ios::in);
+  FilePiece test(backing);
+  std::string ref_line;
+  while (getline(ref, ref_line)) {
+    StringPiece test_line(test.ReadLine());
+    BOOST_CHECK_EQUAL(ref_line, test_line);
+  }
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+}
+
 /* mmap implementation */
 BOOST_AUTO_TEST_CASE(MMapReadLine) {
   std::fstream ref(FileLocation().c_str(), std::ios::in);
diff --git a/klm/util/have.hh b/klm/util/have.hh
index e9a4d946..6e18529d 100644
--- a/klm/util/have.hh
+++ b/klm/util/have.hh
@@ -10,8 +10,4 @@
 //#define HAVE_ICU
 #endif
 
-#ifndef HAVE_BOOST
-//#define HAVE_BOOST
-#endif
-
 #endif // UTIL_HAVE__
diff --git a/klm/util/read_compressed.cc b/klm/util/read_compressed.cc
index 7a1a8fb5..b81549e4 100644
--- a/klm/util/read_compressed.cc
+++ b/klm/util/read_compressed.cc
@@ -320,6 +320,23 @@ class XZip : public ReadBase {
 };
 #endif // HAVE_XZLIB
 
+class IStreamReader : public ReadBase {
+  public:
+    explicit IStreamReader(std::istream &stream) : stream_(stream) {}
+
+    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+      if (!stream_.read(static_cast<char*>(to), amount)) {
+        UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error");
+        amount = stream_.gcount();
+      }
+      ReadCount(thunk) += amount;
+      return amount;
+    }
+
+  private:
+    std::istream &stream_;
+};
+
 enum MagicResult {
   UNKNOWN, GZIP, BZIP, XZIP
 };
@@ -329,7 +346,7 @@ MagicResult DetectMagic(const void *from_void) {
   if (header[0] == 0x1f && header[1] == 0x8b) {
     return GZIP;
   }
-  if (header[0] == 'B' && header[1] == 'Z') {
+  if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') {
     return BZIP;
   }
   const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
@@ -387,6 +404,10 @@ ReadCompressed::ReadCompressed(int fd) {
   Reset(fd);
 }
 
+ReadCompressed::ReadCompressed(std::istream &in) {
+  Reset(in);
+}
+
 ReadCompressed::ReadCompressed() {}
 
 ReadCompressed::~ReadCompressed() {}
@@ -396,6 +417,11 @@ void ReadCompressed::Reset(int fd) {
   internal_.reset(ReadFactory(fd, raw_amount_));
 }
 
+void ReadCompressed::Reset(std::istream &in) {
+  internal_.reset();
+  internal_.reset(new IStreamReader(in));
+}
+
 std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
   return internal_->Read(to, amount, *this);
 }
diff --git a/klm/util/read_compressed.hh b/klm/util/read_compressed.hh
index 83ca9fb2..8b54c9e8 100644
--- a/klm/util/read_compressed.hh
+++ b/klm/util/read_compressed.hh
@@ -45,6 +45,10 @@ class ReadCompressed {
     // Takes ownership of fd.   
     explicit ReadCompressed(int fd);
 
+    // Try to avoid using this.  Use the fd instead.
+    // There is no decompression support for istreams.
+    explicit ReadCompressed(std::istream &in);
+
     // Must call Reset later.
     ReadCompressed();
 
@@ -53,6 +57,9 @@ class ReadCompressed {
     // Takes ownership of fd.  
     void Reset(int fd);
 
+    // Same advice as the constructor.
+    void Reset(std::istream &in);
+
     std::size_t Read(void *to, std::size_t amount);
 
     uint64_t RawAmount() const { return raw_amount_; }
diff --git a/klm/util/read_compressed_test.cc b/klm/util/read_compressed_test.cc
index 6fd97e5e..9cb4a4b9 100644
--- a/klm/util/read_compressed_test.cc
+++ b/klm/util/read_compressed_test.cc
@@ -25,19 +25,34 @@ void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) {
   }
 }
 
-void TestRandom(const char *compressor) {
-  const uint32_t kSize4 = 100000 / 4;
+const uint32_t kSize4 = 100000 / 4;
+
+std::string WriteRandom() {
   char name[] = "tempXXXXXX";
+  scoped_fd original(mkstemp(name));
+  BOOST_REQUIRE(original.get() >= 0);  // mkstemp signals failure with -1; fd 0 is a valid descriptor
+  for (uint32_t i = 0; i < kSize4; ++i) {
+    WriteOrThrow(original.get(), &i, sizeof(uint32_t));  // raw bytes of the counter 0..kSize4-1
+  }
+  return name;  // the file is not removed here; callers unlink it
+}
 
-  // Write test file.  
-  {
-    scoped_fd original(mkstemp(name));
-    BOOST_REQUIRE(original.get() > 0);
-    for (uint32_t i = 0; i < kSize4; ++i) {
-      WriteOrThrow(original.get(), &i, sizeof(uint32_t));
-    }
+void VerifyRead(ReadCompressed &reader) {
+  for (uint32_t i = 0; i < kSize4; ++i) {
+    uint32_t got;
+    ReadLoop(reader, &got, sizeof(uint32_t));
+    BOOST_CHECK_EQUAL(i, got);
   }
 
+  char ignored;
+  BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+  // Test double EOF call.
+  BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+}
+
+void TestRandom(const char *compressor) {
+  std::string name(WriteRandom());
+
   char gzname[] = "tempXXXXXX";
   scoped_fd gzipped(mkstemp(gzname));
 
@@ -52,20 +67,11 @@ void TestRandom(const char *compressor) {
   command += "\"";
   BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
 
-  BOOST_CHECK_EQUAL(0, unlink(name));
+  BOOST_CHECK_EQUAL(0, unlink(name.c_str()));
   BOOST_CHECK_EQUAL(0, unlink(gzname));
 
   ReadCompressed reader(gzipped.release());
-  for (uint32_t i = 0; i < kSize4; ++i) {
-    uint32_t got;
-    ReadLoop(reader, &got, sizeof(uint32_t));
-    BOOST_CHECK_EQUAL(i, got);
-  }
-
-  char ignored;
-  BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
-  // Test double EOF call.
-  BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
+  VerifyRead(reader);
 }
 
 BOOST_AUTO_TEST_CASE(Uncompressed) {
@@ -90,5 +96,14 @@ BOOST_AUTO_TEST_CASE(ReadXZ) {
 }
 #endif
 
+BOOST_AUTO_TEST_CASE(IStream) {
+  std::string name(WriteRandom());
+  std::fstream stream(name.c_str(), std::ios::in | std::ios::binary);  // payload is raw uint32_t bytes, not text
+  BOOST_CHECK_EQUAL(0, unlink(name.c_str()));  // relies on the open stream staying readable after unlink (POSIX behaviour)
+  ReadCompressed reader;
+  reader.Reset(stream);
+  VerifyRead(reader);
+}
+
 } // namespace
 } // namespace util
diff --git a/klm/util/stream/io.cc b/klm/util/stream/io.cc
index c7ad2980..0459f706 100644
--- a/klm/util/stream/io.cc
+++ b/klm/util/stream/io.cc
@@ -29,15 +29,17 @@ void Read::Run(const ChainPosition &position) {
 void PRead::Run(const ChainPosition &position) {
   scoped_fd owner;
   if (own_) owner.reset(file_);
-  uint64_t size = SizeOrThrow(file_);
+  const uint64_t size = SizeOrThrow(file_);
   UTIL_THROW_IF(size % static_cast<uint64_t>(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize());
-  std::size_t block_size = position.GetChain().BlockSize();
+  const std::size_t block_size = position.GetChain().BlockSize();
+  const uint64_t block_size64 = static_cast<uint64_t>(block_size);
   Link link(position);
   uint64_t offset = 0;
-  for (; offset + block_size < size; offset += block_size, ++link) {
+  for (; offset + block_size64 < size; offset += block_size64, ++link) {
     PReadOrThrow(file_, link->Get(), block_size, offset);
     link->SetValidSize(block_size);
   }
+  // size - offset is <= block_size, so it casts to 32-bit fine.
   if (size - offset) {
     PReadOrThrow(file_, link->Get(), size - offset, offset);
     link->SetValidSize(size - offset);
diff --git a/klm/util/stream/sort.hh b/klm/util/stream/sort.hh
index a86f160f..16aa6a03 100644
--- a/klm/util/stream/sort.hh
+++ b/klm/util/stream/sort.hh
@@ -365,10 +365,14 @@ template <class Compare> class BlockSorter {
         // Record the size of each block in a separate file.    
         offsets_->Append(link->ValidSize());
         void *end = static_cast<uint8_t*>(link->Get()) + link->ValidSize();
-        std::sort(
-            SizedIt(link->Get(), entry_size),
-            SizedIt(end, entry_size),
-            compare_);
+#if defined(_WIN32) || defined(_WIN64)
+        std::stable_sort
+#else
+        std::sort
+#endif
+          (SizedIt(link->Get(), entry_size),
+           SizedIt(end, entry_size),
+           compare_);
       }
       offsets_->FinishedAppending();
     }
diff --git a/klm/util/string_piece.cc b/klm/util/string_piece.cc
index b422cefc..ec394b96 100644
--- a/klm/util/string_piece.cc
+++ b/klm/util/string_piece.cc
@@ -17,7 +17,8 @@ void StringPiece::CopyToString(std::string* target) const {
 }
 
 size_type StringPiece::find(const StringPiece& s, size_type pos) const {
-  if (length_ < 0 || pos > static_cast<size_type>(length_))
+  // Not sure why length_ < 0 was here since it's std::size_t.
+  if (/*length_ < 0 || */pos > static_cast<size_type>(length_))
     return npos;
 
   const char* result = std::search(ptr_ + pos, ptr_ + length_,
diff --git a/klm/util/string_piece.hh b/klm/util/string_piece.hh
index 51481646..9cf4c7f6 100644
--- a/klm/util/string_piece.hh
+++ b/klm/util/string_piece.hh
@@ -50,10 +50,6 @@
 
 #include "util/have.hh"
 
-#ifdef HAVE_BOOST
-#include <boost/functional/hash/hash.hpp>
-#endif // HAVE_BOOST
-
 #include <cstring>
 #include <iosfwd>
 #include <ostream>
@@ -256,46 +252,9 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
   return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
 }
 
-#ifdef HAVE_BOOST
-inline size_t hash_value(const StringPiece &str) {
-  return boost::hash_range(str.data(), str.data() + str.length());
-}
-
-/* Support for lookup of StringPiece in boost::unordered_map<std::string> */
-struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> {
-  size_t operator()(const StringPiece &str) const {
-    return hash_value(str);
-  }
-};
-
-struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const std::string &, bool> {
-  bool operator()(const StringPiece &first, const StringPiece &second) const {
-    return first == second;
-  }
-};
-template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
-#if BOOST_VERSION < 104200
-  std::string temp(key.data(), key.size());
-  return t.find(temp);
-#else
-  return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
-#endif
-}
-
-template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
-#if BOOST_VERSION < 104200
-  std::string temp(key.data(), key.size());
-  return t.find(temp);
-#else
-  return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
-#endif
-}
-#endif
-
 #ifdef HAVE_ICU
 U_NAMESPACE_END
 using U_NAMESPACE_QUALIFIER StringPiece;
 #endif
 
-
 #endif  // BASE_STRING_PIECE_H__
diff --git a/klm/util/string_piece_hash.hh b/klm/util/string_piece_hash.hh
new file mode 100644
index 00000000..f206b1d8
--- /dev/null
+++ b/klm/util/string_piece_hash.hh
@@ -0,0 +1,43 @@
+#ifndef UTIL_STRING_PIECE_HASH__
+#define UTIL_STRING_PIECE_HASH__
+
+#include "util/string_piece.hh"
+
+#include <boost/functional/hash.hpp>
+#include <boost/version.hpp>
+
+inline size_t hash_value(const StringPiece &str) {
+  return boost::hash_range(str.data(), str.data() + str.length());
+}
+
+/* Support for lookup of StringPiece in boost::unordered_map<std::string> */
+struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> {
+  size_t operator()(const StringPiece &str) const {
+    return hash_value(str);
+  }
+};
+
+struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const std::string &, bool> {
+  bool operator()(const StringPiece &first, const StringPiece &second) const {
+    return first == second;
+  }
+};
+template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
+#if BOOST_VERSION < 104200
+  std::string temp(key.data(), key.size());
+  return t.find(temp);
+#else
+  return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+#endif
+}
+
+template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
+#if BOOST_VERSION < 104200
+  std::string temp(key.data(), key.size());
+  return t.find(temp);
+#else
+  return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
+#endif
+}
+
+#endif // UTIL_STRING_PIECE_HASH__
diff --git a/klm/util/usage.cc b/klm/util/usage.cc
index 16a004bb..b8e125d0 100644
--- a/klm/util/usage.cc
+++ b/klm/util/usage.cc
@@ -81,7 +81,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {
   UTIL_THROW_IF_ARG(stream >> throwaway, SizeParseError, (arg), "because there was more cruft " << throwaway << " after the number.");
 
   // Silly sort, using kilobytes as your default unit.  
-  if (after.empty()) after == "K";
+  if (after.empty()) after = "K";
   if (after == "%") {
     uint64_t mem = GuessPhysicalMemory();
     UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined.");
author	Kenneth Heafield <github@kheafield.com>	2013-01-22 21:37:49 +0000
committer	Kenneth Heafield <github@kheafield.com>	2013-01-22 21:37:49 +0000
commit	516c132fb683b5bf77ae3230a1b3709beb57618e (patch)
tree	d37fb5d1d8f4273dac3509291495ff9797c399e0 /klm
parent	53532304714256f692fd5f7305b2fab10a7d7cca (diff)