summaryrefslogtreecommitdiff
path: root/klm/lm/builder/corpus_count.hh
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-02-21 14:13:55 +0000
committerPaul Baltescu <pauldb89@gmail.com>2013-02-21 14:13:55 +0000
commitbca26d953a774b8efca12f30407390b3f5eef9d0 (patch)
treefe922de5c89b1844f677d550dcc24e87edd67a55 /klm/lm/builder/corpus_count.hh
parent54a1c0e2bde259e3acc9c0a8ec8da3c7704e80ca (diff)
parent95c364f2cb002241c4a62bedb1c5ef6f1e9a7f22 (diff)
Merge branch 'master' of https://github.com/pauldb89/cdec
Diffstat (limited to 'klm/lm/builder/corpus_count.hh')
-rw-r--r--klm/lm/builder/corpus_count.hh42
1 files changed, 42 insertions, 0 deletions
diff --git a/klm/lm/builder/corpus_count.hh b/klm/lm/builder/corpus_count.hh
new file mode 100644
index 00000000..e255bad1
--- /dev/null
+++ b/klm/lm/builder/corpus_count.hh
@@ -0,0 +1,42 @@
+#ifndef LM_BUILDER_CORPUS_COUNT__
+#define LM_BUILDER_CORPUS_COUNT__
+
+#include "lm/word_index.hh"
+#include "util/scoped.hh"
+
+#include <cstddef>
+#include <string>
+#include <stdint.h>
+
+namespace util {
+class FilePiece;
+namespace stream {
+class ChainPosition;
+} // namespace stream
+} // namespace util
+
+namespace lm {
+namespace builder {
+
+class CorpusCount {
+ public:
+ // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size
+ static float DedupeMultiplier(std::size_t order);
+
+ CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block);
+
+ void Run(const util::stream::ChainPosition &position);
+
+ private:
+ util::FilePiece &from_;
+ int vocab_write_;
+ uint64_t &token_count_;
+ WordIndex &type_count_;
+
+ std::size_t dedupe_mem_size_;
+ util::scoped_malloc dedupe_mem_;
+};
+
+} // namespace builder
+} // namespace lm
+#endif // LM_BUILDER_CORPUS_COUNT__