diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-11-10 02:02:04 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-11-10 02:02:04 +0000 |
commit | 15b03336564d5e57e50693f19dd81b45076af5d4 (patch) | |
tree | c2072893a43f4c75f0ad5ebe3080bfa901faf18f /klm/lm/read_arpa.cc | |
parent | 1336aecfe930546f8836ffe65dd5ff78434084eb (diff) |
new version of klm
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@706 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'klm/lm/read_arpa.cc')
-rw-r--r-- | klm/lm/read_arpa.cc | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc new file mode 100644 index 00000000..8e9a770d --- /dev/null +++ b/klm/lm/read_arpa.cc @@ -0,0 +1,154 @@ +#include "lm/read_arpa.hh" + +#include <cstdlib> +#include <vector> + +#include <ctype.h> +#include <inttypes.h> + +namespace lm { + +namespace { + +bool IsEntirelyWhiteSpace(const StringPiece &line) { + for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) { + if (!isspace(line.data()[i])) return false; + } + return true; +} + +template <class F> void GenericReadARPACounts(F &in, std::vector<uint64_t> &number) { + number.clear(); + StringPiece line; + if (!IsEntirelyWhiteSpace(line = in.ReadLine())) { + if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) { + UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, run\nzcat " << in.FileName() << " |kenlm/build_binary /dev/stdin " << in.FileName() << ".binary\nIf this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); + } + UTIL_THROW(FormatLoadException, "First line was \"" << static_cast<int>(line.data()[1]) << "\" not blank"); + } + if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "second line was \"" << line << "\" not \\data\\."); + while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { + if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \""); + // So strtol doesn't go off the end of line. + std::string remaining(line.data() + 6, line.size() - 6); + char *end_ptr; + unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10); + if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line); + if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line); + ++end_ptr; + const char *start = end_ptr; + long int count = std::strtol(start, &end_ptr, 10); + if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count); + if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line); + number.push_back(count); + } +} + +template <class F> void GenericReadNGramHeader(F &in, unsigned int length) { + StringPiece line; + while (IsEntirelyWhiteSpace(line = in.ReadLine())) {} + std::stringstream expected; + expected << '\\' << length << "-grams:"; + if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead. "); +} + +template <class F> void GenericReadEnd(F &in) { + StringPiece line; + do { + line = in.ReadLine(); + } while (IsEntirelyWhiteSpace(line)); + if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line); +} + +class FakeFilePiece { + public: + explicit FakeFilePiece(std::istream &in) : in_(in) { + in_.exceptions(std::ios::failbit | std::ios::badbit | std::ios::eofbit); + } + + StringPiece ReadLine() throw(util::EndOfFileException) { + getline(in_, buffer_); + return StringPiece(buffer_); + } + + float ReadFloat() { + float ret; + in_ >> ret; + return ret; + } + + const char *FileName() const { + // This only used for error messages and we don't know the file name. . . + return "$file"; + } + + private: + std::istream &in_; + std::string buffer_; +}; + +} // namespace + +void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) { + GenericReadARPACounts(in, number); +} +void ReadARPACounts(std::istream &in, std::vector<uint64_t> &number) { + FakeFilePiece fake(in); + GenericReadARPACounts(fake, number); +} +void ReadNGramHeader(util::FilePiece &in, unsigned int length) { + GenericReadNGramHeader(in, length); +} +void ReadNGramHeader(std::istream &in, unsigned int length) { + FakeFilePiece fake(in); + GenericReadNGramHeader(fake, length); +} + +void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) { + switch (in.get()) { + case '\t': + { + float got = in.ReadFloat(); + if (got != 0.0) + UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff."); + } + break; + case '\n': + break; + default: + UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); + } +} + +void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) { + switch (in.get()) { + case '\t': + weights.backoff = in.ReadFloat(); + if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff"); + break; + case '\n': + weights.backoff = 0.0; + break; + default: + UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram"); + } +} + +void ReadEnd(util::FilePiece &in) { + GenericReadEnd(in); + StringPiece line; + try { + while (true) { + line = in.ReadLine(); + if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line); + } + } catch (const util::EndOfFileException &e) { + return; + } +} +void ReadEnd(std::istream &in) { + FakeFilePiece fake(in); + GenericReadEnd(fake); +} + +} // namespace lm |