summaryrefslogtreecommitdiff
path: root/klm/lm/builder/adjust_counts_test.cc
blob: 073c5dfebd7fa23323240bcac0eea565ded412c1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#include "lm/builder/adjust_counts.hh"

#include "lm/builder/ngram_stream.hh"
#include "util/scoped.hh"

#include <boost/thread/thread.hpp>
#define BOOST_TEST_MODULE AdjustCounts
#include <boost/test/unit_test.hpp>

namespace lm { namespace builder { namespace {

// Chain worker used by the test: it concatenates the valid bytes of every
// block it is handed into one heap buffer that can be inspected afterwards.
class KeepCopy {
  public:
    KeepCopy() : size_(0) {}

    // Walks the chain starting at `position` until the link evaluates false,
    // growing the buffer and appending each block's valid bytes.
    void Run(const util::stream::ChainPosition &position) {
      util::stream::Link block(position);
      while (block) {
        const std::size_t bytes = block->ValidSize();
        // Make room for the new bytes, then copy them behind what is already stored.
        mem_.call_realloc(size_ + bytes);
        memcpy(Get() + size_, block->Get(), bytes);
        size_ += bytes;
        ++block;
      }
    }

    // Start of the accumulated bytes.
    uint8_t *Get() { return static_cast<uint8_t*>(mem_.get()); }

    // Number of bytes accumulated so far.
    std::size_t Size() const { return size_; }

  private:
    util::scoped_malloc mem_;
    std::size_t size_;
};

// Plain record describing one test 4-gram: four word ids followed by a count.
// Field order matters: WriteInput fills an array of these with positional
// aggregate initializers.
struct Gram4 {
  WordIndex ids[4];  // word ids, copied into the stream entry by WriteInput
  uint64_t count;    // value WriteInput assigns to the entry's Count()
};

// Chain worker that produces the test's input: a fixed list of four 4-grams
// with counts, written one per stream entry and followed by Poison().
class WriteInput {
  public:
    void Run(const util::stream::ChainPosition &position) {
      NGramStream input(position);
      const Gram4 fixtures[] = {
        {{0,0,0,0},10},
        {{0,0,3,0},3},
        // bos
        {{1,1,1,2},5},
        {{0,0,3,2},5},
      };
      const Gram4 *const last = fixtures + sizeof(fixtures) / sizeof(fixtures[0]);
      for (const Gram4 *gram = fixtures; gram != last; ++gram) {
        // Fill the current entry (ids, then count) before advancing the stream.
        memcpy(input->begin(), gram->ids, sizeof(gram->ids));
        input->Count() = gram->count;
        ++input;
      }
      input.Poison();
    }
};

// Runs AdjustCounts over the four 4-grams produced by WriteInput with
// bad_action = THROW_UP, expects BadDiscountException, and then inspects the
// unigram and bigram bytes captured by the KeepCopy workers.
BOOST_AUTO_TEST_CASE(Simple) {
  KeepCopy outputs[4];
  std::vector<uint64_t> counts;
  std::vector<Discount> discount;
  // NOTE(review): presumably this inner scope exists so the chains are torn
  // down (and nothing is still writing into outputs[]) before the checks
  // below run -- confirm against util::stream::Chains.
  {
    util::stream::ChainConfig config;
    config.total_memory = 100;
    config.block_count = 1;
    util::stream::Chains chains(4);
    // One chain per n-gram order 1..4; entry_size is NGram::TotalSize(order).
    for (unsigned i = 0; i < 4; ++i) {
      config.entry_size = NGram::TotalSize(i + 1);
      chains.push_back(config);
    }

    // The 4-gram input is written onto the highest-order chain.
    chains[3] >> WriteInput();
    // Positions are captured here, before the KeepCopy workers are attached,
    // and are what AdjustCounts is run on below.
    util::stream::ChainPositions for_adjust(chains);
    for (unsigned i = 0; i < 4; ++i) {
      // boost::ref hands over outputs[i] itself; it is read after this scope.
      chains[i] >> boost::ref(outputs[i]);
    }
    chains >> util::stream::kRecycle;
    std::vector<uint64_t> counts_pruned(4);
    // Value-initialized, i.e. every threshold is 0.
    std::vector<uint64_t> prune_thresholds(4);
    DiscountConfig discount_config;
    discount_config.fallback = Discount();
    discount_config.bad_action = THROW_UP;
    BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException);
  }
  // Even though Run() threw, counts has one slot per order and the unigram
  // slot is filled in.
  BOOST_REQUIRE_EQUAL(4UL, counts.size());
  BOOST_CHECK_EQUAL(4UL, counts[0]);
  // These are no longer set because the discounts are bad.
/*  BOOST_CHECK_EQUAL(4UL, counts[1]);
  BOOST_CHECK_EQUAL(3UL, counts[2]);
  BOOST_CHECK_EQUAL(3UL, counts[3]);*/
  // The order-1 output must hold exactly four unigram records; walk them in
  // memory order.
  BOOST_REQUIRE_EQUAL(NGram::TotalSize(1) * 4, outputs[0].Size());
  NGram uni(outputs[0].Get(), 1);
  // Record 1: kUNK with count 0.
  BOOST_CHECK_EQUAL(kUNK, *uni.begin());
  BOOST_CHECK_EQUAL(0ULL, uni.Count());
  uni.NextInMemory();
  // Record 2: kBOS with count 0.
  BOOST_CHECK_EQUAL(kBOS, *uni.begin());
  BOOST_CHECK_EQUAL(0ULL, uni.Count());
  uni.NextInMemory();
  // Record 3: word id 0 with count 2.
  BOOST_CHECK_EQUAL(0UL, *uni.begin());
  BOOST_CHECK_EQUAL(2ULL, uni.Count());
  uni.NextInMemory();
  // Record 4: word id 2 with count 2.
  BOOST_CHECK_EQUAL(2ULL, uni.Count());
  BOOST_CHECK_EQUAL(2UL, *uni.begin());

  // The order-2 output must hold four bigram records; only the first one
  // (0 0, count 1) is checked.
  BOOST_REQUIRE_EQUAL(NGram::TotalSize(2) * 4, outputs[1].Size());
  NGram bi(outputs[1].Get(), 2);
  BOOST_CHECK_EQUAL(0UL, *bi.begin());
  BOOST_CHECK_EQUAL(0UL, *(bi.begin() + 1));
  BOOST_CHECK_EQUAL(1ULL, bi.Count());
  // Advances to the second bigram, but nothing is asserted about it.
  bi.NextInMemory();
}

}}} // namespaces