summaryrefslogtreecommitdiff
path: root/klm/lm/binary_format.hh
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2014-01-27 17:42:19 -0800
committerKenneth Heafield <github@kheafield.com>2014-01-27 17:42:19 -0800
commit783c57b2d3312738ddcf992ac55ff750afe7cb47 (patch)
treec4811dab0d916836b8631f3c7df94f284a490b9b /klm/lm/binary_format.hh
parentf7e051a05d65ef25c2ada0b84cd82bfb375ef265 (diff)
KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591
Diffstat (limited to 'klm/lm/binary_format.hh')
-rw-r--r--klm/lm/binary_format.hh112
1 files changed, 55 insertions, 57 deletions
diff --git a/klm/lm/binary_format.hh b/klm/lm/binary_format.hh
index bf699d5f..f33f88d7 100644
--- a/klm/lm/binary_format.hh
+++ b/klm/lm/binary_format.hh
@@ -17,6 +17,8 @@
namespace lm {
namespace ngram {
+extern const char *kModelNames[6];
+
/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
@@ -42,67 +44,63 @@ struct Parameters {
std::vector<uint64_t> counts;
};
-struct Backing {
- // File behind memory, if any.
- util::scoped_fd file;
- // Vocabulary lookup table. Not to be confused with the vocab words themselves.
- util::scoped_memory vocab;
- // Raw block of memory backing the language model data structures
- util::scoped_memory search;
-};
-
-// Create just enough of a binary file to write vocabulary to it.
-uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
-// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
-uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);
-
-// Write header to binary file. This is done last to prevent incomplete files
-// from loading.
-void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);
+class BinaryFormat {
+ public:
+ explicit BinaryFormat(const Config &config);
+
+ // Reading a binary file:
+ // Takes ownership of fd.
+ void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);
+ // Used to read parts of the file to update the config object before figuring out full size.
+ void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
+ // Actually load the binary file and return a pointer to the beginning of the search area.
+ void *LoadBinary(std::size_t size);
+
+ uint64_t VocabStringReadingOffset() const {
+ assert(vocab_string_offset_ != kInvalidOffset);
+ return vocab_string_offset_;
+ }
-namespace detail {
+ // Writing a binary file or initializing in RAM from ARPA:
+ // memory_size is the size for the vocabulary.
+ void *SetupJustVocab(std::size_t memory_size, uint8_t order);
+ // Warning: can change the vocabulary base pointer.
+ void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
+ // Warning: can change vocabulary and search base addresses.
+ void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
+ // Write the header at the beginning of the file.
+ void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
+
+ private:
+ void MapFile(void *&vocab_base, void *&search_base);
+
+ // Copied from configuration.
+ const Config::WriteMethod write_method_;
+ const char *write_mmap_;
+ util::LoadMethod load_method_;
+
+ // File behind memory, if any.
+ util::scoped_fd file_;
+
+ // If there is a file involved, a single mapping.
+ util::scoped_memory mapping_;
+
+ // If the data is only in memory, separately allocate each because the trie
+ // knows vocab's size before it knows search's size (because SRILM might
+ // have pruned).
+ util::scoped_memory memory_vocab_, memory_search_;
+
+ // Memory ranges. Note that these may not be contiguous and may not all
+ // exist.
+ std::size_t header_size_, vocab_size_, vocab_pad_;
+ // Offset of the vocabulary strings, aka end of search. Asserted != kInvalidOffset in VocabStringReadingOffset().
+ uint64_t vocab_string_offset_;
+
+ static const uint64_t kInvalidOffset = (uint64_t)-1; // Sentinel: all bits set.
+};
bool IsBinaryFormat(int fd);
-void ReadHeader(int fd, Parameters &params);
-
-void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params);
-
-void SeekPastHeader(int fd, const Parameters &params);
-
-uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing);
-
-void ComplainAboutARPA(const Config &config, ModelType model_type);
-
-} // namespace detail
-
-template <class To> void LoadLM(const char *file, const Config &config, To &to) {
- Backing &backing = to.MutableBacking();
- backing.file.reset(util::OpenReadOrThrow(file));
-
- try {
- if (detail::IsBinaryFormat(backing.file.get())) {
- Parameters params;
- detail::ReadHeader(backing.file.get(), params);
- detail::MatchCheck(To::kModelType, To::kVersion, params);
- // Replace the run-time configured probing_multiplier with the one in the file.
- Config new_config(config);
- new_config.probing_multiplier = params.fixed.probing_multiplier;
- detail::SeekPastHeader(backing.file.get(), params);
- To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
- uint64_t memory_size = To::Size(params.counts, new_config);
- uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
- to.InitializeFromBinary(start, params, new_config, backing.file.get());
- } else {
- detail::ComplainAboutARPA(config, To::kModelType);
- to.InitializeFromARPA(file, config);
- }
- } catch (util::Exception &e) {
- e << " File: " << file;
- throw;
- }
-}
-
} // namespace ngram
} // namespace lm
#endif // LM_BINARY_FORMAT__