summaryrefslogtreecommitdiff
path: root/klm/lm/search_hashed.hh
blob: 6dc11fb322af742adee3ceb16b1aefbcaa10cf20 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#ifndef LM_SEARCH_HASHED__
#define LM_SEARCH_HASHED__

#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "lm/read_arpa.hh"
#include "lm/weights.hh"

#include "util/key_value_packing.hh"
#include "util/probing_hash_table.hh"
#include "util/sorted_uniform.hh"

#include <algorithm>
#include <vector>

namespace util { class FilePiece; }

namespace lm {
namespace ngram {
struct Backing;
namespace detail {

// Mix the next word into a rolling 64-bit context hash.  The existing state
// and the word index (offset by one) are each multiplied by a large odd
// constant, then combined with XOR.
inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) {
  const uint64_t scaled_history = current * 8978948897894561157ULL;
  const uint64_t scaled_word = static_cast<uint64_t>(1 + next) * 17894857484156487943ULL;
  return scaled_history ^ scaled_word;
}

struct HashedSearch {
  typedef uint64_t Node;

  class Unigram {
    public:
      Unigram() {}

      Unigram(void *start, std::size_t /*allocated*/) : unigram_(static_cast<ProbBackoff*>(start)) {}

      static std::size_t Size(uint64_t count) {
        return (count + 1) * sizeof(ProbBackoff); // +1 for hallucinate <unk>
      }

      const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index]; }

      ProbBackoff &Unknown() { return unigram_[0]; }

      void LoadedBinary() {}

      // For building.
      ProbBackoff *Raw() { return unigram_; }

    private:
      ProbBackoff *unigram_;
  };

  Unigram unigram;

  bool LookupUnigram(WordIndex word, float &prob, float &backoff, Node &next) const {
    const ProbBackoff &entry = unigram.Lookup(word);
    prob = entry.prob;
    backoff = entry.backoff;
    next = static_cast<Node>(word);
    return true;
  }
};

// Shared implementation for hash-based n-gram search.  MiddleT holds
// (prob, backoff) entries for the middle orders; LongestT holds prob-only
// entries for the highest order.  Keys are 64-bit rolling history hashes.
template <class MiddleT, class LongestT> struct TemplateHashedSearch : public HashedSearch {
  typedef MiddleT Middle;
  // One table per middle order: middle[i] is the (i+2)-gram table.
  std::vector<Middle> middle;

  typedef LongestT Longest;
  // Highest-order n-grams have no backoff, hence the separate Prob table.
  Longest longest;

  // Total bytes required for all tables given per-order counts.  Must stay
  // in sync with the carving order in SetupMemory below.
  static std::size_t Size(const std::vector<uint64_t> &counts, const Config &config) {
    std::size_t ret = Unigram::Size(counts[0]);
    for (unsigned char n = 1; n < counts.size() - 1; ++n) {
      ret += Middle::Size(counts[n], config.probing_multiplier);
    }
    return ret + Longest::Size(counts.back(), config.probing_multiplier);
  }

  // Carve the tables out of one contiguous allocation starting at start:
  // unigram first, then each middle order ascending, then longest.  Returns
  // the first byte past the consumed region.
  uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
    std::size_t allocated = Unigram::Size(counts[0]);
    unigram = Unigram(start, allocated);
    start += allocated;
    for (unsigned int n = 2; n < counts.size(); ++n) {
      allocated = Middle::Size(counts[n - 1], config.probing_multiplier);
      middle.push_back(Middle(start, allocated));
      start += allocated;
    }
    allocated = Longest::Size(counts.back(), config.probing_multiplier);
    longest = Longest(start, allocated);
    start += allocated;
    return start;
  }

  // Populate the tables from an ARPA file; defined out of line (presumably
  // in the accompanying .cc).
  template <class Voc> void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing);

  // Extend node with word, then look it up in the given middle table.  On a
  // hit, fills prob and backoff and returns true.  Note node is updated even
  // on a miss, so the caller can keep extending the history.
  bool LookupMiddle(const Middle &middle, WordIndex word, float &prob, float &backoff, Node &node) const {
    node = CombineWordHash(node, word);
    typename Middle::ConstIterator found;
    if (!middle.Find(node, found)) return false;
    prob = found->GetValue().prob;
    backoff = found->GetValue().backoff;
    return true;
  }

  // Same as LookupMiddle but only retrieves the backoff weight.
  bool LookupMiddleNoProb(const Middle &middle, WordIndex word, float &backoff, Node &node) const {
    node = CombineWordHash(node, word);
    typename Middle::ConstIterator found;
    if (!middle.Find(node, found)) return false;
    backoff = found->GetValue().backoff;
    return true;
  }

  // Extend node with word and look it up in the highest-order table, which
  // stores probability only (no backoff).
  bool LookupLongest(WordIndex word, float &prob, Node &node) const {
    node = CombineWordHash(node, word);
    typename Longest::ConstIterator found;
    if (!longest.Find(node, found)) return false;
    prob = found->GetValue().prob;
    return true;
  }

  // Generate a node without necessarily checking that it actually exists.
  // May optionally return false if the node is known not to exist; this
  // implementation always returns true.
  bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
    assert(begin != end);
    node = static_cast<Node>(*begin);
    for (const WordIndex *i = begin + 1; i < end; ++i) {
      node = CombineWordHash(node, *i);
    }
    return true;
  }
};

// Keys are already 64-bit hashes, so the table's hash functor just passes
// them through (truncating to size_t).  std::identity is an SGI extension,
// hence this hand-rolled version.
struct IdentityHash : public std::unary_function<uint64_t, size_t> {
  size_t operator()(uint64_t key) const { return static_cast<size_t>(key); }
};

// Hashed search backed by probing hash tables, keyed directly on the 64-bit
// node hash via IdentityHash.
struct ProbingHashedSearch : public TemplateHashedSearch<
  util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, ProbBackoff>, IdentityHash>,
  util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, Prob>, IdentityHash> > {

  static const ModelType kModelType = HASH_PROBING;
};

// Hashed search backed by sorted-uniform maps (interpolation-style search
// over sorted key arrays).  Constructor is defined out of line.
struct SortedHashedSearch : public TemplateHashedSearch<
  util::SortedUniformMap<util::ByteAlignedPacking<uint64_t, ProbBackoff> >,
  util::SortedUniformMap<util::ByteAlignedPacking<uint64_t, Prob> > > {

  SortedHashedSearch();
  
  static const ModelType kModelType = HASH_SORTED;
};

} // namespace detail
} // namespace ngram
} // namespace lm

#endif // LM_SEARCH_HASHED__