blob: e255bad136d04a33ec2d7578b8b88fc3bf4e726b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
#ifndef LM_BUILDER_CORPUS_COUNT__
#define LM_BUILDER_CORPUS_COUNT__
#include "lm/word_index.hh"
#include "util/scoped.hh"
#include <cstddef>
#include <string>
#include <stdint.h>
// Forward declarations only: this header uses util::FilePiece and
// util::stream::ChainPosition solely through references, so their full
// definitions are not required here.
namespace util {
class FilePiece;
namespace stream {
class ChainPosition;
} // namespace stream
} // namespace util
namespace lm {
namespace builder {
// Interface for the corpus counting stage: constructed over an input FilePiece
// and caller-supplied counters, then driven through Run() with a stream chain
// position.
// NOTE(review): only declarations are visible in this header; statements below
// marked "presumably" are inferred from names and should be confirmed against
// the implementation file.
//
// The class holds reference members (from_, token_count_, type_count_), so the
// objects they are bound to must outlive this instance.
class CorpusCount {
public:
// Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size
static float DedupeMultiplier(std::size_t order);
// from:              input to read; presumably bound to from_.
// vocab_write:       presumably a file descriptor that receives the vocabulary - TODO confirm.
// token_count:       non-const reference; presumably receives the number of tokens seen - TODO confirm.
// type_count:        non-const reference; presumably receives the number of distinct word types - TODO confirm.
// entries_per_block: presumably used to size the dedupe memory (dedupe_mem_size_) - TODO confirm.
CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block);
// Entry point taking the position in a util::stream chain this stage operates on.
// Non-const: may modify this object's state.
void Run(const util::stream::ChainPosition &position);
private:
// Input source, held by reference (not owned).
util::FilePiece &from_;
// Presumably the constructor's vocab_write argument - TODO confirm.
int vocab_write_;
// References to caller-owned counters (not owned).
uint64_t &token_count_;
WordIndex &type_count_;
// Presumably the size of the buffer held in dedupe_mem_ - TODO confirm units (bytes vs. entries).
std::size_t dedupe_mem_size_;
// Scoped holder for the dedupe memory; presumably frees it on destruction - verify in util/scoped.hh.
util::scoped_malloc dedupe_mem_;
};
} // namespace builder
} // namespace lm
#endif // LM_BUILDER_CORPUS_COUNT__
|