Diffstat (limited to 'decoder/factored_lexicon_helper.cc')
-rw-r--r-- | decoder/factored_lexicon_helper.cc | 80
1 files changed, 80 insertions, 0 deletions
diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc
new file mode 100644
index 00000000..7203b325
--- /dev/null
+++ b/decoder/factored_lexicon_helper.cc
@@ -0,0 +1,80 @@
+#include "factored_lexicon_helper.h"
+
+#include "filelib.h"
+#include "stringlib.h"
+
+using namespace std;
+
+FactoredLexiconHelper::FactoredLexiconHelper() :
+  kNULL(TD::Convert("<eps>")),
+  has_src_(false),
+  has_trg_(false) { InitEscape(); }
+
+FactoredLexiconHelper::FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmapfile) :
+  kNULL(TD::Convert("<eps>")),
+  has_src_(false),
+  has_trg_(false) {
+  if (srcfile.size() && srcfile != "*") {
+    ReadFile rf(srcfile);
+    has_src_ = true;
+    istream& in = *rf.stream();
+    string line;
+    while(in) {
+      getline(in, line);
+      if (!in) continue;
+      vector<WordID> v;
+      TD::ConvertSentence(line, &v);
+      src_.push_back(v);
+    }
+  }
+  if (trgmapfile.size() && trgmapfile != "*") {
+    ReadFile rf(trgmapfile);
+    has_trg_ = true;
+    istream& in = *rf.stream();
+    string line;
+    vector<string> v;
+    while(in) {
+      getline(in, line);
+      if (!in) continue;
+      SplitOnWhitespace(line, &v);
+      if (v.size() != 2) {
+        cerr << "Error reading line in map file: " << line << endl;
+        abort();
+      }
+      WordID& to = trgmap_[TD::Convert(v[0])];
+      if (to != 0) {
+        cerr << "Duplicate entry for word " << v[0] << endl;
+        abort();
+      }
+      to = TD::Convert(v[1]);
+    }
+  }
+  InitEscape();
+}
+
+void FactoredLexiconHelper::InitEscape() {
+  escape_[TD::Convert("=")] = TD::Convert("__EQ");
+  escape_[TD::Convert(";")] = TD::Convert("__SC");
+  escape_[TD::Convert(",")] = TD::Convert("__CO");
+}
+
+void FactoredLexiconHelper::PrepareForInput(const SentenceMetadata& smeta) {
+  if (has_src_) {
+    const int id = smeta.GetSentenceID();
+    assert(id < src_.size());
+    cur_src_ = src_[id];
+  } else {
+    cur_src_.resize(smeta.GetSourceLength());
+    for (int i = 0; i < cur_src_.size(); ++i) {
+      const vector<LatticeArc>& arcs = smeta.GetSourceLattice()[i];
+      assert(arcs.size() == 1);  // only sentences supported for now
+      cur_src_[i] = arcs[0].label;
+    }
+  }
+  if (cur_src_.size() != smeta.GetSourceLength()) {
+    cerr << "Length mismatch between mapped source and real source in sentence id=" << smeta.GetSentenceID() << endl;
+    cerr << " mapped len=" << cur_src_.size() << endl;
+    cerr << " actual len=" << smeta.GetSourceLength() << endl;
+  }
+}
+
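For context, here is a minimal sketch of how a caller might drive this helper per sentence, based only on the constructor and PrepareForInput() shown in this diff. The accessor SourceWordAtPosition() and the file paths "source.factors" and "trg.map" are assumptions for illustration; the real interface lives in factored_lexicon_helper.h, which is not part of this commit.

// Sketch only, not part of this commit. Assumes an accessor declared in
// factored_lexicon_helper.h; the two file paths are placeholders.
#include "factored_lexicon_helper.h"
#include "sentence_metadata.h"

void ScoreSentence(const SentenceMetadata& smeta) {
  // Pass "*" (or an empty string) for either argument to skip loading
  // that resource, as the constructor above checks for both.
  FactoredLexiconHelper lex("source.factors", "trg.map");

  // Must run once per input sentence before any lookups, so the cached
  // source (cur_src_) is aligned with the source lattice positions.
  lex.PrepareForInput(smeta);

  for (int i = 0; i < smeta.GetSourceLength(); ++i) {
    // Assumed accessor: mapped (or surface) source word at position i.
    const WordID src = lex.SourceWordAtPosition(i);
    (void)src;  // a real feature function would pair this with target words
  }
}

Note that when no source factor file is given, PrepareForInput() falls back to reading surface words from the source lattice, and it currently asserts that each lattice position has exactly one arc, i.e. plain sentences only.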