#ifndef LM_NGRAM__
#define LM_NGRAM__

#include "lm/facade.hh"
#include "lm/ngram_config.hh"
#include "util/key_value_packing.hh"
#include "util/mmap.hh"
#include "util/probing_hash_table.hh"
#include "util/scoped.hh"
#include "util/sorted_uniform.hh"
#include "util/string_piece.hh"

#include <algorithm>
#include <functional>
#include <memory>
#include <vector>

#include <stdint.h>

namespace util { class FilePiece; }

namespace lm {
namespace ngram {

// If you need a higher order, change this and recompile.  
// Having this limit means State can hold its backoff array inline as
// (kMaxOrder - 1) * sizeof(float) bytes, instead of
// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
// for a heap-allocated array.
const std::size_t kMaxOrder = 6;
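// For example, with kMaxOrder = 6 and 32-bit WordIndex and float, a State
// carries 5 history words (20 bytes) + 5 backoffs (20 bytes) + 1 length
// byte inline, with no pointer chasing and no per-State allocation.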

// This is a POD.  
class State {
  public:
    bool operator==(const State &other) const {
      if (valid_length_ != other.valid_length_) return false;
      const WordIndex *end = history_ + valid_length_;
      for (const WordIndex *first = history_, *second = other.history_;
          first != end; ++first, ++second) {
        if (*first != *second) return false;
      }
      // If the histories are equal, so are the backoffs.  
      return true;
    }

    // You shouldn't need to touch anything below this line, but the members are public so State will qualify as a POD.  
    // This order minimizes the total size of the struct if WordIndex is 64-bit, float is 32-bit, and 64-bit integers require 64-bit alignment.  
    WordIndex history_[kMaxOrder - 1];
    float backoff_[kMaxOrder - 1];
    unsigned char valid_length_;
};

size_t hash_value(const State &state);
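
// Usage sketch (hypothetical; in practice States come from the model's
// scoring functions rather than being built by hand):
//
//   State a, b;
//   a.valid_length_ = b.valid_length_ = 0;  // both represent empty histories
//   assert(a == b);                         // no words to compare, so equal
//   size_t h = hash_value(a);               // boost::hash-style free function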

namespace detail {

uint64_t HashForVocab(const char *str, std::size_t len);
inline uint64_t HashForVocab(const StringPiece &str) {
  return HashForVocab(str.data(), str.length());
}

struct Prob {
  float prob;
  // There is no backoff at the highest order; the out-of-line definition in
  // the .cc treats any attempt to set one as a loading error.
  void SetBackoff(float to);
  void ZeroBackoff() {}
};
// No inheritance so this will be a POD.  
struct ProbBackoff {
  float prob;
  float backoff;
  void SetBackoff(float to) { backoff = to; }
  void ZeroBackoff() { backoff = 0.0; }
};

} // namespace detail

// Vocabulary backed by sorted uniform search: it stores only uint64_t hashes and uses their offsets as word indices.  
class SortedVocabulary : public base::Vocabulary {
  private:
    // Sorted uniform requires a GetKey function.  
    struct Entry {
      uint64_t GetKey() const { return key; }
      uint64_t key;
      bool operator<(const Entry &other) const {
        return key < other.key;
      }
    };

  public:
    SortedVocabulary();

    WordIndex Index(const StringPiece &str) const {
      const Entry *found;
      if (util::SortedUniformFind<const Entry *, uint64_t>(begin_, end_, detail::HashForVocab(str), found)) {
        return found - begin_ + 1; // +1 because <unk> is 0 and does not appear in the lookup table.
      } else {
        return 0;
      }
    }

    // Ignores the second argument; it exists for consistency with the probing hash table, which takes a float (the multiplier) here.  
    static size_t Size(std::size_t entries, float ignored = 0.0);

    // Everything else is for populating.  I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
    void Init(void *start, std::size_t allocated, std::size_t entries);

    WordIndex Insert(const StringPiece &str);

    // Returns true if <unk> was seen.  Sorts the entries, reordering reorder_vocab in tandem so it stays aligned with the sorted IDs.  
    bool FinishedLoading(detail::ProbBackoff *reorder_vocab);

    void LoadedBinary();

  private:
    Entry *begin_, *end_;

    bool saw_unk_;
};
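
// Lookup sketch (hypothetical; Init, Insert, and FinishedLoading are driven
// by the model's loading code, so callers normally only use Index):
//
//   const SortedVocabulary &vocab = ...;  // owned by the model
//   WordIndex known = vocab.Index(StringPiece("iran"));    // > 0 if present
//   WordIndex unk = vocab.Index(StringPiece("zxqj-oov"));  // 0 means <unk>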

namespace detail {

// Vocabulary storing a map from uint64_t to WordIndex.  
template <class Search> class MapVocabulary : public base::Vocabulary {
  public:
    MapVocabulary();

    WordIndex Index(const StringPiece &str) const {
      typename Lookup::ConstIterator i;
      return lookup_.Find(HashForVocab(str), i) ? i->GetValue() : 0;
    }

    static size_t Size(std::size_t entries, float probing_multiplier) {
      return Lookup::Size(entries, probing_multiplier);
    }

    // Everything else is for populating.  I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
    void Init(void *start, std::size_t allocated, std::size_t entries);

    WordIndex Insert(const StringPiece &str);

    // Returns true if <unk> was seen.  Does nothing with reorder_vocab.  
    bool FinishedLoading(ProbBackoff *reorder_vocab);

    void LoadedBinary();

  private:
    typedef typename Search::template Table<WordIndex>::T Lookup;
    Lookup lookup_;

    bool saw_unk_;
};

// std::identity is an SGI extension :-(
struct IdentityHash : public std::unary_function<uint64_t, size_t> {
  size_t operator()(uint64_t arg) const { return static_cast<size_t>(arg); }
};

// Should return the same results as SRI.  
// Why VocabularyT instead of just Vocabulary?  ModelFacade defines Vocabulary.  
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
  private:
    typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
  public:
    // Get the size of memory that will be mapped given ngram counts.  This
    // does not include small non-mapped control structures, such as this class
    // itself.  
    static size_t Size(const std::vector<size_t> &counts, const Config &config = Config());

    GenericModel(const char *file, Config config = Config());

    FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;

  private:
    // Appears after Size in the cc.  
    void SetupMemory(char *start, const std::vector<size_t> &counts, const Config &config);

    void LoadFromARPA(util::FilePiece &f, const std::vector<size_t> &counts, const Config &config);

    util::scoped_fd mapped_file_;

    // memory_ is the raw block of memory backing vocab_, unigram_, [middle.begin(), middle.end()), and longest_.  
    util::scoped_mmap memory_;
    
    VocabularyT vocab_;

    ProbBackoff *unigram_;

    typedef typename Search::template Table<ProbBackoff>::T Middle;
    std::vector<Middle> middle_;

    typedef typename Search::template Table<Prob>::T Longest;
    Longest longest_;
};

struct ProbingSearch {
  typedef float Init;

  static const unsigned char kBinaryTag = 1;

  template <class Value> struct Table {
    typedef util::ByteAlignedPacking<uint64_t, Value> Packing;
    typedef util::ProbingHashTable<Packing, IdentityHash> T;
  };
};

struct SortedUniformSearch {
  // This is ignored.
  typedef float Init;

  static const unsigned char kBinaryTag = 2;

  template <class Value> struct Table {
    typedef util::ByteAlignedPacking<uint64_t, Value> Packing;
    typedef util::SortedUniformMap<Packing> T;
  };
};
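
// Illustration: how a Search policy resolves to concrete table types.  These
// mirror the Middle and Longest typedefs inside GenericModel above; the
// names on the left are hypothetical.
//
//   typedef ProbingSearch::Table<ProbBackoff>::T ProbingMiddle;  // orders 2..n-1
//   typedef SortedUniformSearch::Table<Prob>::T SortedLongest;   // order n only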

} // namespace detail

// These must also be instantiated in the cc file.  
typedef detail::MapVocabulary<detail::ProbingSearch> Vocabulary;
typedef detail::GenericModel<detail::ProbingSearch, Vocabulary> Model;

// SortedVocabulary was defined above.  
typedef detail::GenericModel<detail::SortedUniformSearch, SortedVocabulary> SortedModel;
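
// Minimal query sketch.  This assumes the accessors base::ModelFacade
// (lm/facade.hh) provides, e.g. BeginSentenceState() and GetVocabulary();
// the ARPA file name is hypothetical.
//
//   Model model("file.arpa");
//   State in(model.BeginSentenceState()), out;
//   FullScoreReturn ret(model.FullScore(
//       in, model.GetVocabulary().Index(StringPiece("iran")), out));
//   // ret.prob is the log10 probability; out is the context for the next word.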

} // namespace ngram
} // namespace lm

#endif // LM_NGRAM__