// Source: root/klm/lm/binary_format.hh (blob f33f88d7578b4df1f8433d553308828946b91af3)
#ifndef LM_BINARY_FORMAT__
#define LM_BINARY_FORMAT__

#include "lm/config.hh"
#include "lm/model_type.hh"
#include "lm/read_arpa.hh"

#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

#include <stdint.h>

namespace lm {
namespace ngram {

// Table of six model-name C strings.  Only declared here; the definition lives
// in another translation unit.
// NOTE(review): presumably indexed by ModelType -- confirm against the definition.
extern const char *kModelNames[6];

/* Inspect a file to determine if it is a binary lm.
 *
 * file:       the file to inspect (NOTE(review): presumably a path name --
 *             confirm against the definition).
 * recognized: output; set to the model type when the file is a binary lm.
 *
 * Returns false if the file is not a binary lm.  If it is, returns true and
 * sets recognized to the type.
 *
 * This is the only API in this header designed for use by decoder authors.
 */
bool RecognizeBinary(const char *file, ModelType &recognized);

// The fixed-size portion of Parameters (see the `fixed` member there).
// NOTE(review): the field order and types presumably define on-disk layout of
// existing binary files -- do not reorder without confirming.
struct FixedWidthParameters {
  // NOTE(review): presumably the n-gram order of the model -- confirm.
  unsigned char order;
  // NOTE(review): presumably the sizing multiplier for probing hash tables -- confirm.
  float probing_multiplier;
  // What type of model is this?
  ModelType model_type;
  // Does the end of the file have the actual strings in the vocabulary?
  bool has_vocabulary;
  // Version number of the search data structure
  // (NOTE(review): presumably checked when loading -- confirm).
  unsigned int search_version;
};

// Round a size up to the next multiple of 8; the result has type std::ptrdiff_t.
//
// This is a macro instead of an inline function so constants can be assigned
// using it (the expansion is a constant expression whenever `a` is).
//
// The argument is converted to std::ptrdiff_t before any arithmetic so that
// ALIGN8(0) is 0.  The earlier form, based on ((a)-1)/8, returned 8 for a
// signed zero and, for an unsigned zero, wrapped to the maximum value and then
// overflowed the signed multiplication.  Results for all positive sizes are
// unchanged.  `a` is evaluated exactly once.
#define ALIGN8(a) ((static_cast<std::ptrdiff_t>(a) + 7) / 8 * 8)

// Parameters stored in the header of a binary file.
struct Parameters {
  // The fixed-size fields.
  FixedWidthParameters fixed;
  // Variable-length list of 64-bit counts.
  // NOTE(review): presumably one n-gram count per order (counts.size() ==
  // fixed.order) -- confirm against the reader/writer.
  std::vector<uint64_t> counts;
};

// Owns the storage behind a model: the file and mapping when a binary file is
// being read or written, or separately allocated memory when the model only
// lives in RAM.  The public interface is split into a reading path
// (InitializeBinary / ReadForConfig / LoadBinary) and a writing / build-from-ARPA
// path (SetupJustVocab / GrowForSearch / WriteVocabWords / FinishFile).
//
// This class uses assert and std::string, so the header must include <cassert>
// and <string> itself rather than rely on transitive includes.
class BinaryFormat {
  public:
    explicit BinaryFormat(const Config &config);

    // ---- Reading a binary file ----

    // Takes ownership of fd.
    void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params);

    // Used to read parts of the file to update the config object before
    // figuring out full size.  Copies `amount` bytes into `to`; the offset is
    // measured from the end of the header, not from the start of the file.
    void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;

    // Actually load the binary file and return a pointer to the beginning of
    // the search area.
    void *LoadBinary(std::size_t size);

    // Offset of the vocabulary strings (aka end of search).  Asserts that the
    // offset has been set, i.e. that it is no longer kInvalidOffset.
    uint64_t VocabStringReadingOffset() const {
      assert(vocab_string_offset_ != kInvalidOffset);
      return vocab_string_offset_;
    }

    // ---- Writing a binary file or initializing in RAM from ARPA ----

    // Size for vocabulary.
    void *SetupJustVocab(std::size_t memory_size, uint8_t order);

    // Warning: can change the vocabulary base pointer.
    void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);

    // Warning: can change vocabulary and search base addresses.
    void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);

    // Write the header at the beginning of the file.
    void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);

  private:
    void MapFile(void *&vocab_base, void *&search_base);

    // Copied from configuration.
    const Config::WriteMethod write_method_;
    const char *write_mmap_;
    util::LoadMethod load_method_;

    // File behind memory, if any.
    util::scoped_fd file_;

    // If there is a file involved, a single mapping.
    util::scoped_memory mapping_;

    // If the data is only in memory, separately allocate each because the trie
    // knows vocab's size before it knows search's size (because SRILM might
    // have pruned).
    util::scoped_memory memory_vocab_;
    util::scoped_memory memory_search_;

    // Memory ranges.  Note that these may not be contiguous and may not all
    // exist.
    std::size_t header_size_;
    std::size_t vocab_size_;
    std::size_t vocab_pad_;
    // aka end of search.
    uint64_t vocab_string_offset_;

    // Sentinel meaning "vocab_string_offset_ has not been set": all bits one.
    static const uint64_t kInvalidOffset = static_cast<uint64_t>(-1);
};

// Test an already-open file descriptor.
// NOTE(review): presumably returns true when the file behind fd carries the
// binary lm header -- confirm against the definition.  Whether fd's read
// position is changed and who keeps ownership of fd are not visible from this
// declaration.
bool IsBinaryFormat(int fd);

} // namespace ngram
} // namespace lm
#endif // LM_BINARY_FORMAT__