summaryrefslogtreecommitdiff
path: root/klm/lm/bhiksha.cc
blob: 088ea98d425506e14f93d6ab1cbd3f7fe2391482 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include "lm/bhiksha.hh"
#include "lm/config.hh"
#include "util/file.hh"
#include "util/exception.hh"

#include <limits>

namespace lm {
namespace ngram {
namespace trie {

DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) : 
  next_(util::BitsMask::ByMax(max_next)) {}

const uint8_t kArrayBhikshaVersion = 0;

// TODO: put this in binary file header instead when I change the binary file format again.  
void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) {
  uint8_t version;
  uint8_t configured_bits;
  util::ReadOrThrow(fd, &version, 1);
  util::ReadOrThrow(fd, &configured_bits, 1);
  if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion);
  config.pointer_bhiksha_bits = configured_bits;
}

namespace {

// Find argmin_{chopped \in [0, RequiredBits(max_next)]} ChoppedDelta(max_offset)
uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
  uint8_t required = util::RequiredBits(max_next);
  uint8_t best_chop = 0;
  int64_t lowest_change = std::numeric_limits<int64_t>::max();
  // There are probably faster ways but I don't care because this is only done once per order at construction time.  
  for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
    int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
      - max_offset * static_cast<int64_t>(chop); /* savings in bits*/
    if (change < lowest_change) {
      lowest_change = change;
      best_chop = chop;
    }
  }
  return best_chop;
}

std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &config) {
  uint8_t required = util::RequiredBits(max_next);
  uint8_t chopping = ChopBits(max_offset, max_next, config);
  return (max_next >> (required - chopping)) + 1 /* we store 0 too */;
}
} // namespace

uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) {
  return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */;
}

uint8_t ArrayBhiksha::InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
  return util::RequiredBits(max_next) - ChopBits(max_offset, max_next, config);
}

namespace {

void *AlignTo8(void *from) {
  uint8_t *val = reinterpret_cast<uint8_t*>(from);
  std::size_t remainder = reinterpret_cast<std::size_t>(val) & 7;
  if (!remainder) return val;
  return val + 8 - remainder;
}

} // namespace

ArrayBhiksha::ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_next, const Config &config)
  : next_inline_(util::BitsMask::ByBits(InlineBits(max_offset, max_next, config))),
    offset_begin_(reinterpret_cast<const uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */),
    offset_end_(offset_begin_ + ArrayCount(max_offset, max_next, config)),
    write_to_(reinterpret_cast<uint64_t*>(AlignTo8(base)) + 1 /* 8-byte header */ + 1 /* first entry is 0 */),
    original_base_(base) {}

void ArrayBhiksha::FinishedLoading(const Config &config) {
  // *offset_begin_ = 0 but without a const_cast.
  *(write_to_ - (write_to_ - offset_begin_)) = 0;

  if (write_to_ != offset_end_) UTIL_THROW(util::Exception, "Did not get all the array entries that were expected.");

  uint8_t *head_write = reinterpret_cast<uint8_t*>(original_base_);
  *(head_write++) = kArrayBhikshaVersion;
  *(head_write++) = config.pointer_bhiksha_bits;
}

void ArrayBhiksha::LoadedBinary() {
}

} // namespace trie
} // namespace ngram
} // namespace lm