summary | refs | log | tree | commit | diff
path: root/klm/lm/builder/dump_counts_main.cc
diff options
context:
space:
mode:
author armatthews <armatthe@cmu.edu> 2014-10-13 14:59:23 -0400
committer armatthews <armatthe@cmu.edu> 2014-10-13 14:59:23 -0400
commit b26cda84e05d4523eee069234a975a0153bf8608 (patch)
tree 61c9da4f8dd6070f27c8e81812a76fc0a8cf2d8d /klm/lm/builder/dump_counts_main.cc
parent cd7bc67f475fdfd07fba003ac4cca40e83944740 (diff)
parent b1ed81ef3216b212295afa76c5d20a56fb647204 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'klm/lm/builder/dump_counts_main.cc')
-rw-r--r-- klm/lm/builder/dump_counts_main.cc 36
1 files changed, 36 insertions, 0 deletions
diff --git a/klm/lm/builder/dump_counts_main.cc b/klm/lm/builder/dump_counts_main.cc
new file mode 100644
index 00000000..fa001679
--- /dev/null
+++ b/klm/lm/builder/dump_counts_main.cc
@@ -0,0 +1,36 @@
+#include "lm/builder/print.hh"
+#include "lm/word_index.hh"
+#include "util/file.hh"
+#include "util/read_compressed.hh"
+
+#include <boost/lexical_cast.hpp>
+
+#include <cstring>
+#include <iostream>
+#include <vector>
+
+// Dumps a binary n-gram counts file as text, one record per line.
+//
+// Usage: <program> counts vocabulary order
+//   argv[1]  counts file, read through util::ReadCompressed.  It is consumed as
+//            fixed-size records: `order` vocabulary ids followed by one 8-byte count.
+//   argv[2]  vocabulary file, handed to lm::builder::VocabReconstitute.
+//   argv[3]  n-gram order, parsed with boost::lexical_cast<unsigned int>.
+//
+// For each record the word looked up for every id is written to stdout followed
+// by a space, then the count and a newline.
+//
+// Returns 1 (after printing usage to stderr) when the argument count is wrong.
+// A truncated final record or an id that is not below vocab.Size() is reported
+// through UTIL_THROW_IF with util::Exception; nothing in this function catches it.
+int main(int argc, char *argv[]) {
+  if (argc != 4) {
+    std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
+      "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
+      "counts. Each record has order many vocabulary ids.\n"
+      "The vocabulary file contains the words delimited by NULL in order of id.\n"
+      "The vocabulary file may not be compressed because it is mmapped but the counts\n"
+      "file can be compressed.\n";
+    return 1;
+  }
+  util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
+  util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
+  lm::builder::VocabReconstitute vocab(vocab_file.get());
+  unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
+  // Buffer for exactly one record: `order` ids plus the trailing count.
+  std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
+  while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
+    UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
+    const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
+    for (const lm::WordIndex *i = words; i != words + order; ++i) {
+      UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
+      std::cout << vocab.Lookup(*i) << ' ';
+    }
+    // The count starts immediately after the `order` ids.  With 4-byte ids and an
+    // odd order that address is not a multiple of 8, so dereferencing it as a
+    // uint64_t* would be a misaligned access (undefined behaviour, and a fault on
+    // strict-alignment CPUs).  Copy the bytes into a properly aligned local instead.
+    uint64_t count;
+    std::memcpy(&count, words + order, sizeof(count));
+    // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FakeOFStream.
+    std::cout << count << '\n';
+  }
+}