summaryrefslogtreecommitdiff
path: root/decoder/grammar.cc
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
committerChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
commit851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree8c68ee77205badc056b8ab5b332e67e3e98017df /decoder/grammar.cc
parentdc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'decoder/grammar.cc')
-rw-r--r--decoder/grammar.cc164
1 files changed, 164 insertions, 0 deletions
diff --git a/decoder/grammar.cc b/decoder/grammar.cc
new file mode 100644
index 00000000..e19bd344
--- /dev/null
+++ b/decoder/grammar.cc
@@ -0,0 +1,164 @@
+#include "grammar.h"
+
+#include <algorithm>
+#include <utility>
+#include <map>
+
+#include "filelib.h"
+#include "tdict.h"
+
+using namespace std;
+
+const vector<TRulePtr> Grammar::NO_RULES;
+
+RuleBin::~RuleBin() {}
+GrammarIter::~GrammarIter() {}
+Grammar::~Grammar() {}
+
+bool Grammar::HasRuleForSpan(int i, int j, int distance) const {
+ (void) i;
+ (void) j;
+ (void) distance;
+ return true; // always true by default
+}
+
+struct TextRuleBin : public RuleBin {
+ int GetNumRules() const {
+ return rules_.size();
+ }
+ TRulePtr GetIthRule(int i) const {
+ return rules_[i];
+ }
+ void AddRule(TRulePtr t) {
+ rules_.push_back(t);
+ }
+ int Arity() const {
+ return rules_.front()->Arity();
+ }
+ void Dump() const {
+ for (int i = 0; i < rules_.size(); ++i)
+ cerr << rules_[i]->AsString() << endl;
+ }
+ private:
+ vector<TRulePtr> rules_;
+};
+
+struct TextGrammarNode : public GrammarIter {
+ TextGrammarNode() : rb_(NULL) {}
+ ~TextGrammarNode() {
+ delete rb_;
+ }
+ const GrammarIter* Extend(int symbol) const {
+ map<WordID, TextGrammarNode>::const_iterator i = tree_.find(symbol);
+ if (i == tree_.end()) return NULL;
+ return &i->second;
+ }
+
+ const RuleBin* GetRules() const {
+ if (rb_) {
+ //rb_->Dump();
+ }
+ return rb_;
+ }
+
+ map<WordID, TextGrammarNode> tree_;
+ TextRuleBin* rb_;
+};
+
+struct TGImpl {
+ TextGrammarNode root_;
+};
+
+TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {}
+TextGrammar::TextGrammar(const string& file) :
+ max_span_(10),
+ pimpl_(new TGImpl) {
+ ReadFromFile(file);
+}
+
+const GrammarIter* TextGrammar::GetRoot() const {
+ return &pimpl_->root_;
+}
+
+void TextGrammar::AddRule(const TRulePtr& rule) {
+ if (rule->IsUnary()) {
+ rhs2unaries_[rule->f().front()].push_back(rule);
+ unaries_.push_back(rule);
+ } else {
+ TextGrammarNode* cur = &pimpl_->root_;
+ for (int i = 0; i < rule->f_.size(); ++i)
+ cur = &cur->tree_[rule->f_[i]];
+ if (cur->rb_ == NULL)
+ cur->rb_ = new TextRuleBin;
+ cur->rb_->AddRule(rule);
+ }
+}
+
+void TextGrammar::ReadFromFile(const string& filename) {
+ ReadFile in(filename);
+ istream& in_file = *in.stream();
+ assert(in_file);
+ long long int rule_count = 0;
+ bool fl = false;
+ while(in_file) {
+ string line;
+ getline(in_file, line);
+ if (line.empty()) continue;
+ ++rule_count;
+ if (rule_count % 50000 == 0) { cerr << '.' << flush; fl = true; }
+ if (rule_count % 2000000 == 0) { cerr << " [" << rule_count << "]\n"; fl = false; }
+ TRulePtr rule(TRule::CreateRuleSynchronous(line));
+ if (rule) {
+ AddRule(rule);
+ } else {
+ if (fl) { cerr << endl; }
+ cerr << "Skipping badly formatted rule in line " << rule_count << " of " << filename << endl;
+ fl = false;
+ }
+ }
+ if (fl) cerr << endl;
+ cerr << " " << rule_count << " rules read.\n";
+}
+
+bool TextGrammar::HasRuleForSpan(int i, int j, int distance) const {
+ return (max_span_ >= distance);
+}
+
+GlueGrammar::GlueGrammar(const string& file) : TextGrammar(file) {}
+
+GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) {
+ TRulePtr stop_glue(new TRule("[" + goal_nt + "] ||| [" + default_nt + ",1] ||| [" + default_nt + ",1]"));
+ TRulePtr glue(new TRule("[" + goal_nt + "] ||| [" + goal_nt + ",1] ["
+ + default_nt + ",2] ||| [" + goal_nt + ",1] [" + default_nt + ",2] ||| Glue=1"));
+
+ AddRule(stop_glue);
+ AddRule(glue);
+ //cerr << "GLUE: " << stop_glue->AsString() << endl;
+ //cerr << "GLUE: " << glue->AsString() << endl;
+}
+
+bool GlueGrammar::HasRuleForSpan(int i, int j, int distance) const {
+ (void) j;
+ return (i == 0);
+}
+
+PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat) :
+ has_rule_(input.size() + 1) {
+ for (int i = 0; i < input.size(); ++i) {
+ const vector<LatticeArc>& alts = input[i];
+ for (int k = 0; k < alts.size(); ++k) {
+ const int j = alts[k].dist2next + i;
+ has_rule_[i].insert(j);
+ const string& src = TD::Convert(alts[k].label);
+ TRulePtr pt(new TRule("[" + cat + "] ||| " + src + " ||| " + src + " ||| PassThrough=1"));
+ AddRule(pt);
+// cerr << "PT: " << pt->AsString() << endl;
+ }
+ }
+}
+
+bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const {
+ const set<int>& hr = has_rule_[i];
+ if (i == j) { return !hr.empty(); }
+ return (hr.find(j) != hr.end());
+}