path: root/decoder/apply_models.cc
author    Chris Dyer <redpony@gmail.com>  2009-12-17 13:57:54 -0500
committer Chris Dyer <redpony@gmail.com>  2009-12-17 13:57:54 -0500
commit    bba4ff830c8722cdcaf29e36c1ff5821a912ae5d (patch)
tree      268f2f8118aca09b3cc40dca8b2be7de8295acd5 /decoder/apply_models.cc
parent    04ae1beeaeceb0161a64d33112f21956f9741bde (diff)
added non-pruning intersection and a CRF tagger
- the linear-chain tagger is more of a proof of concept than a real tagger -- the context-free assumptions made in a number of places mean that the algorithms used may not be as efficient as they could be, but the model is as powerful as any CRF
- it would be easy to add latent variables or semi-CRF support (or both!)
- i've added a couple basic features that are often used for POS tagging
- non-pruning intersection is useful for lexical word alignment models and the tagger
- a sample POS tagger model will be committed later
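With this change, ApplyModelSet dispatches on an IntersectionConfiguration instead of a bare pruning configuration: algorithm 1 keeps the existing cube-pruning rescorer, while algorithm 0 selects the new non-pruning (full intersection) rescorer. A minimal sketch of what a call site might look like follows; only the ApplyModelSet signature is taken from the diff below, while the header names and the two-argument IntersectionConfiguration constructor are assumptions for illustration, not the actual cdec API.

// Hypothetical call site -- everything not visible in the diff is assumed.
#include "apply_models.h"        // header names are assumptions about the surrounding tree
#include "hg.h"
#include "ff.h"
#include "sentence_metadata.h"

void RescoreFull(const Hypergraph& forest,
                 const SentenceMetadata& smeta,
                 const ModelSet& models,
                 Hypergraph* rescored) {
  // algorithm 0 = full (non-pruning) intersection; 1 = cube pruning with a pop limit.
  // The constructor shown here is assumed from the fields used in ApplyModelSet.
  IntersectionConfiguration config(0 /* algorithm */, 200 /* pop_limit, unused for 0 */);
  ApplyModelSet(forest, smeta, models, config, rescored);
  // *rescored now carries one node per (input node, feature-function state) pair,
  // so no derivations have been pruned away.
}

Because every distinct model state at a node becomes its own output node, the full intersection is exact but can grow the forest considerably; as the commit message notes, it is intended for cases like the tagger and lexical word alignment models, where the state space is small.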
Diffstat (limited to 'decoder/apply_models.cc')
-rw-r--r--  decoder/apply_models.cc  89
1 files changed, 78 insertions, 11 deletions
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index b1d002f4..a340aa1a 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -296,14 +296,69 @@ public:
 };
 struct NoPruningRescorer {
-  NoPruningRescorer(const ModelSet& m, const Hypergraph& i, Hypergraph* o) :
+  NoPruningRescorer(const ModelSet& m, const SentenceMetadata &sm, const Hypergraph& i, Hypergraph* o) :
       models(m),
+      smeta(sm),
       in(i),
-      out(*o) {
+      out(*o),
+      nodemap(i.nodes_.size()) {
     cerr << " Rescoring forest (full intersection)\n";
   }
-  void RescoreNode(const int node_num, const bool is_goal) {
+  typedef unordered_map<string, int, boost::hash<string> > State2NodeIndex;
+
+  void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, State2NodeIndex* state2node) {
+    const int arity = in_edge.Arity();
+    Hypergraph::TailNodeVector ends(arity);
+    for (int i = 0; i < arity; ++i)
+      ends[i] = nodemap[in_edge.tail_nodes_[i]].size();
+
+    Hypergraph::TailNodeVector tail_iter(arity, 0);
+    bool done = false;
+    while (!done) {
+      Hypergraph::TailNodeVector tail(arity);
+      for (int i = 0; i < arity; ++i)
+        tail[i] = nodemap[in_edge.tail_nodes_[i]][tail_iter[i]];
+      Hypergraph::Edge* new_edge = out.AddEdge(in_edge.rule_, tail);
+      new_edge->feature_values_ = in_edge.feature_values_;
+      new_edge->i_ = in_edge.i_;
+      new_edge->j_ = in_edge.j_;
+      new_edge->prev_i_ = in_edge.prev_i_;
+      new_edge->prev_j_ = in_edge.prev_j_;
+      string head_state;
+      if (is_goal) {
+        assert(tail.size() == 1);
+        const string& ant_state = out.nodes_[tail.front()].state_;
+        models.AddFinalFeatures(ant_state, new_edge);
+      } else {
+        prob_t edge_estimate; // this is a full intersection, so we disregard this
+        models.AddFeaturesToEdge(smeta, out, new_edge, &head_state, &edge_estimate);
+      }
+      int& head_plus1 = (*state2node)[head_state];
+      if (!head_plus1) {
+        head_plus1 = out.AddNode(in_edge.rule_->GetLHS(), head_state)->id_ + 1;
+        nodemap[in_edge.head_node_].push_back(head_plus1 - 1);
+      }
+      const int head_index = head_plus1 - 1;
+      out.ConnectEdgeToHeadNode(new_edge->id_, head_index);
+
+      int ii = 0;
+      for (; ii < arity; ++ii) {
+        ++tail_iter[ii];
+        if (tail_iter[ii] < ends[ii]) break;
+        tail_iter[ii] = 0;
+      }
+      done = (ii == arity);
+    }
+  }
+
+  void ProcessOneNode(const int node_num, const bool is_goal) {
+    State2NodeIndex state2node;
+    const Hypergraph::Node& node = in.nodes_[node_num];
+    for (int i = 0; i < node.in_edges_.size(); ++i) {
+      const Hypergraph::Edge& edge = in.edges_[node.in_edges_[i]];
+      ExpandEdge(edge, is_goal, &state2node);
+    }
   }
   void Apply() {
@@ -316,29 +371,41 @@ struct NoPruningRescorer {
     cerr << " ";
     for (int i = 0; i < in.nodes_.size(); ++i) {
       if (i % every == 0) cerr << '.';
-      RescoreNode(i, i == goal_id);
+      ProcessOneNode(i, i == goal_id);
     }
     cerr << endl;
   }
  private:
   const ModelSet& models;
+  const SentenceMetadata& smeta;
   const Hypergraph& in;
   Hypergraph& out;
+
+  vector<vector<int> > nodemap;
 };
 // each node in the graph has one of these, it keeps track of
 void ApplyModelSet(const Hypergraph& in,
                    const SentenceMetadata& smeta,
                    const ModelSet& models,
-                   const PruningConfiguration& config,
+                   const IntersectionConfiguration& config,
                    Hypergraph* out) {
-  int pl = config.pop_limit;
-  if (pl > 100 && in.nodes_.size() > 80000) {
-    cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n";
-    pl = 30;
+  // TODO special handling when all models are stateless
+  if (config.algorithm == 1) {
+    int pl = config.pop_limit;
+    if (pl > 100 && in.nodes_.size() > 80000) {
+      cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n";
+      pl = 30;
+    }
+    CubePruningRescorer ma(models, smeta, in, pl, out);
+    ma.Apply();
+  } else if (config.algorithm == 0) {
+    NoPruningRescorer ma(models, smeta, in, out);
+    ma.Apply();
+  } else {
+    cerr << "Don't understand intersection algorithm " << config.algorithm << endl;
+    exit(1);
   }
-  CubePruningRescorer ma(models, smeta, in, pl, out);
-  ma.Apply();
 }
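For readers following the ExpandEdge hunk above: the tail_iter/ends pair acts as a mixed-radix counter, so every way of choosing one rescored variant per antecedent node gets its own new edge in the output forest. Below is a self-contained sketch of just that enumeration using plain STL types; the names are illustrative and not taken from cdec.

#include <iostream>
#include <vector>

// Visit every combination that picks one element from each group, the same
// way ExpandEdge advances tail_iter against ends: bump the lowest digit and
// carry into the next whenever a digit wraps back to zero.
// Assumes every group is non-empty, as in the decoder where each tail node
// already has at least one rescored variant by the time its head is processed.
void EnumerateCombinations(const std::vector<std::vector<int> >& groups) {
  const int arity = groups.size();
  std::vector<int> ends(arity), iter(arity, 0);
  for (int i = 0; i < arity; ++i) ends[i] = groups[i].size();
  bool done = false;
  while (!done) {
    for (int i = 0; i < arity; ++i)      // visit the current combination
      std::cout << groups[i][iter[i]] << ' ';
    std::cout << '\n';
    int i = 0;                           // advance the mixed-radix counter
    for (; i < arity; ++i) {
      if (++iter[i] < ends[i]) break;    // no carry needed
      iter[i] = 0;                       // wrap this digit, carry to the next
    }
    done = (i == arity);                 // every digit wrapped: all combinations seen
  }
}

int main() {
  std::vector<std::vector<int> > groups(2);
  groups[0].push_back(10); groups[0].push_back(11);
  groups[1].push_back(20); groups[1].push_back(21); groups[1].push_back(22);
  EnumerateCombinations(groups);         // prints the 2 * 3 = 6 combinations
  return 0;
}

In the decoder, the "visit" step builds a new edge for the chosen tails, asks the models for the resulting head state, and reuses an existing output node (via state2node and nodemap) whenever that state has already been seen at the current input node; the work per input edge is therefore the product of the variant counts of its tail nodes, which is why this pass is reserved for models with small state spaces.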