blob: 09e1a4d525b2cb8756dea7441f6a4aa91316b2ec (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
#ifndef LM_BUILDER_PIPELINE_H
#define LM_BUILDER_PIPELINE_H
#include "lm/builder/adjust_counts.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/stream/config.hh"
#include "util/file_piece.hh"
#include <cstddef>
#include <string>
#include <vector>
#include <stdint.h>
namespace lm { namespace builder {
struct PipelineConfig {
  // N-gram order for the build.
  // NOTE(review): presumably the maximum order of the model -- confirm
  // against the Pipeline() implementation.
  std::size_t order;

  // Name of a vocabulary file.
  // NOTE(review): whether this is read or written is not visible from this
  // header -- confirm against the Pipeline() implementation.
  std::string vocab_file;

  // Sort settings. TempPrefix() and TotalMemory() below read their values
  // from here.
  util::stream::SortConfig sort;

  // Settings for the initial-probabilities step.
  InitialProbabilitiesConfig initial_probs;

  // Chain settings used when reading backoffs (per the field name; see
  // util/stream/config.hh for ChainConfig).
  util::stream::ChainConfig read_backoffs;

  // Should the ARPA output carry a header with some statistics?
  bool verbose_header;

  // Rough guess at the vocabulary size. CorpusCount uses it both to size
  // its memory and to size its initial probing hash table.
  lm::WordIndex vocab_estimate;

  // Smallest block size that is still acceptable.
  std::size_t minimum_block;

  // How many blocks to use. Overridden to 1 when everything fits.
  std::size_t block_count;

  // Count thresholds used for pruning n-grams. A value of 0 means no
  // pruning for the corresponding n-gram order.
  std::vector<uint64_t> prune_thresholds;

  // Policy for handling discount failures.
  DiscountConfig discount;

  // When set, collapsed q values are computed in place of probability and
  // backoff.
  bool output_q;

  /* Perplexity is hard to compare between LMs whose vocabularies differ.
   * For instance, the lowest perplexity belongs to a unigram model that
   * predicts p(<unk>) = 1 and has no other vocabulary. Linearly
   * interpolated models also sum to more than 1, because <unk> is
   * duplicated (SRI just pretends p(<unk>) = 0 for these purposes, which
   * makes the sum 1 but comes with its own problems).
   *
   * This option forces the vocabulary to a particular size by replicating
   * <unk> multiple times for the purpose of computing vocabulary size. It
   * has no effect when the actual vocabulary is larger. It serves the same
   * purpose as IRSTLM's "dub" parameter.
   */
  uint64_t vocab_size_for_unk;

  /* Action taken the first time <s>, </s>, or <unk> shows up in the input.
   * For any setting other than THROW_UP, the symbol is always treated as
   * whitespace.
   */
  WarningAction disallowed_symbol_action;

  // Shortcut for the temporary-file prefix held in the sort settings.
  // Returns a reference into this object, so it is only valid while this
  // PipelineConfig is alive.
  const std::string &TempPrefix() const {
    return sort.temp_prefix;
  }

  // Shortcut for the total memory value held in the sort settings.
  std::size_t TotalMemory() const {
    return sort.total_memory;
  }
};
// Entry point of the builder pipeline, driven by the given configuration.
// config is taken by value, so the caller's PipelineConfig object is not
// modified by this call.
// Takes ownership of text_file and out_arpa.
// NOTE(review): text_file and out_arpa are plain ints, presumably file
// descriptors for the input text and the ARPA output -- confirm against the
// definition.
void Pipeline(PipelineConfig config, int text_file, int out_arpa);
}} // namespaces
#endif // LM_BUILDER_PIPELINE_H
|