KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591

author: Kenneth Heafield <github@kheafield.com> 2014-01-27 17:42:19 -0800
committer: Kenneth Heafield <github@kheafield.com> 2014-01-27 17:42:19 -0800
commit: 783c57b2d3312738ddcf992ac55ff750afe7cb47 (patch)
tree: c4811dab0d916836b8631f3c7df94f284a490b9b /klm/lm/binary_format.hh
parent: f7e051a05d65ef25c2ada0b84cd82bfb375ef265 (diff)
1 files changed, 55 insertions, 57 deletions
diff --git a/klm/lm/binary_format.hh b/klm/lm/binary_format.hh
index bf699d5f..f33f88d7 100644
--- a/klm/lm/binary_format.hh
+++ b/klm/lm/binary_format.hh
@@ -17,6 +17,8 @@
 namespace lm {
 namespace ngram {
 
+extern const char *kModelNames[6];
+
 /*Inspect a file to determine if it is a binary lm.  If not, return false.  
  * If so, return true and set recognized to the type.  This is the only API in
  * this header designed for use by decoder authors.  
@@ -42,67 +44,63 @@ struct Parameters {
   std::vector<uint64_t> counts;
 };
 
-struct Backing {
-  // File behind memory, if any.  
-  util::scoped_fd file;
-  // Vocabulary lookup table.  Not to be confused with the vocab words themselves.  
-  util::scoped_memory vocab;
-  // Raw block of memory backing the language model data structures
-  util::scoped_memory search;
-};
-
-// Create just enough of a binary file to write vocabulary to it.  
-uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
-// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.  
-uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);
-
-// Write header to binary file.  This is done last to prevent incomplete files
-// from loading.   
-void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts,  std::size_t vocab_pad, Backing &backing);
+class BinaryFormat {  // Holds the file descriptor and memory regions declared below; declares the read and write entry points for the binary file.
+  public:
+    explicit BinaryFormat(const Config &config);
+
+    // Reading a binary file:
+    // Takes ownership of fd.
+    void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
+    // Used to read parts of the file to update the config object before figuring out full size.
+    void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
+    // Actually load the binary file and return a pointer to the beginning of the search area.
+    void *LoadBinary(std::size_t size);
+
+    uint64_t VocabStringReadingOffset() const {  // Asserts that the offset is not kInvalidOffset before returning it.
+      assert(vocab_string_offset_ != kInvalidOffset);
+      return vocab_string_offset_;
+    }
 
-namespace detail {
+    // Writing a binary file or initializing in RAM from ARPA:
+    // Size for vocabulary.
+    void *SetupJustVocab(std::size_t memory_size, uint8_t order);
+    // Warning: can change the vocabulary base pointer (vocab_base is passed by reference).
+    void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
+    // Warning: can change vocabulary and search base addresses (both are passed by reference).
+    void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
+    // Write the header at the beginning of the file.
+    void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
+
+  private:
+    void MapFile(void *&vocab_base, void *&search_base);
+
+    // Copied from configuration.
+    const Config::WriteMethod write_method_;
+    const char *write_mmap_;
+    util::LoadMethod load_method_;
+
+    // File behind memory, if any.
+    util::scoped_fd file_;
+
+    // If there is a file involved, a single mapping.
+    util::scoped_memory mapping_;
+
+    // If the data is only in memory, separately allocate each because the trie
+    // knows vocab's size before it knows search's size (because SRILM might
+    // have pruned).
+    util::scoped_memory memory_vocab_, memory_search_;
+
+    // Memory ranges.  Note that these may not be contiguous and may not all
+    // exist.
+    std::size_t header_size_, vocab_size_, vocab_pad_;
+    // aka end of search.
+    uint64_t vocab_string_offset_;
+
+    static const uint64_t kInvalidOffset = (uint64_t)-1;  // Sentinel (all bits set); checked by VocabStringReadingOffset().
+};
 
 bool IsBinaryFormat(int fd);
 
-void ReadHeader(int fd, Parameters &params);
-
-void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
-
-void SeekPastHeader(int fd, const Parameters &params);
-
-uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing);
-
-void ComplainAboutARPA(const Config &config, ModelType model_type);
-
-} // namespace detail
-
-template <class To> void LoadLM(const char *file, const Config &config, To &to) {
-  Backing &backing = to.MutableBacking();
-  backing.file.reset(util::OpenReadOrThrow(file));
-
-  try {
-    if (detail::IsBinaryFormat(backing.file.get())) {
-      Parameters params;
-      detail::ReadHeader(backing.file.get(), params);
-      detail::MatchCheck(To::kModelType, To::kVersion, params);
-      // Replace the run-time configured probing_multiplier with the one in the file.  
-      Config new_config(config);
-      new_config.probing_multiplier = params.fixed.probing_multiplier;
-      detail::SeekPastHeader(backing.file.get(), params);
-      To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
-      uint64_t memory_size = To::Size(params.counts, new_config);
-      uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
-      to.InitializeFromBinary(start, params, new_config, backing.file.get());
-    } else {
-      detail::ComplainAboutARPA(config, To::kModelType);
-      to.InitializeFromARPA(file, config);
-    }
-  } catch (util::Exception &e) {
-    e << " File: " << file;
-    throw;
-  }
-}
-
 } // namespace ngram
 } // namespace lm
 #endif // LM_BINARY_FORMAT__
author	Kenneth Heafield <github@kheafield.com>	2014-01-27 17:42:19 -0800
committer	Kenneth Heafield <github@kheafield.com>	2014-01-27 17:42:19 -0800
commit	783c57b2d3312738ddcf992ac55ff750afe7cb47 (patch)
tree	c4811dab0d916836b8631f3c7df94f284a490b9b /klm/lm/binary_format.hh
parent	f7e051a05d65ef25c2ada0b84cd82bfb375ef265 (diff)