summaryrefslogtreecommitdiff
path: root/src/hg_intersect.cc
diff options
context:
space:
mode:
authorChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
committerChris Dyer <redpony@gmail.com>2009-12-14 20:35:11 -0500
commit851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree8c68ee77205badc056b8ab5b332e67e3e98017df /src/hg_intersect.cc
parentdc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'src/hg_intersect.cc')
-rw-r--r--src/hg_intersect.cc121
1 files changed, 0 insertions, 121 deletions
diff --git a/src/hg_intersect.cc b/src/hg_intersect.cc
deleted file mode 100644
index a5e8913a..00000000
--- a/src/hg_intersect.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "hg_intersect.h"
-
-#include <vector>
-#include <tr1/unordered_map>
-#include <boost/lexical_cast.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "tdict.h"
-#include "hg.h"
-#include "trule.h"
-#include "wordid.h"
-#include "bottom_up_parser.h"
-
-using boost::lexical_cast;
-using namespace std::tr1;
-using namespace std;
-
-struct RuleFilter {
- unordered_map<vector<WordID>, bool, boost::hash<vector<WordID> > > exists_;
- bool true_lattice;
- RuleFilter(const Lattice& target, int max_phrase_size) {
- true_lattice = false;
- for (int i = 0; i < target.size(); ++i) {
- vector<WordID> phrase;
- int lim = min(static_cast<int>(target.size()), i + max_phrase_size);
- for (int j = i; j < lim; ++j) {
- if (target[j].size() > 1) { true_lattice = true; break; }
- phrase.push_back(target[j][0].label);
- exists_[phrase] = true;
- }
- }
- vector<WordID> sos(1, TD::Convert("<s>"));
- exists_[sos] = true;
- }
- bool operator()(const TRule& r) const {
- // TODO do some smarter filtering for lattices
- if (true_lattice) return false; // don't filter "true lattice" input
- const vector<WordID>& e = r.e();
- for (int i = 0; i < e.size(); ++i) {
- if (e[i] <= 0) continue;
- vector<WordID> phrase;
- for (int j = i; j < e.size(); ++j) {
- if (e[j] <= 0) break;
- phrase.push_back(e[j]);
- if (exists_.count(phrase) == 0) return true;
- }
- }
- return false;
- }
-};
-
-bool HG::Intersect(const Lattice& target, Hypergraph* hg) {
- vector<bool> rem(hg->edges_.size(), false);
- const RuleFilter filter(target, 15); // TODO make configurable
- for (int i = 0; i < rem.size(); ++i)
- rem[i] = filter(*hg->edges_[i].rule_);
- hg->PruneEdges(rem);
-
- const int nedges = hg->edges_.size();
- const int nnodes = hg->nodes_.size();
-
- TextGrammar* g = new TextGrammar;
- GrammarPtr gp(g);
- vector<int> cats(nnodes);
- // each node in the translation forest becomes a "non-terminal" in the new
- // grammar, create the labels here
- for (int i = 0; i < nnodes; ++i)
- cats[i] = TD::Convert("CAT_" + lexical_cast<string>(i)) * -1;
-
- // construct the grammar
- for (int i = 0; i < nedges; ++i) {
- const Hypergraph::Edge& edge = hg->edges_[i];
- const vector<WordID>& tgt = edge.rule_->e();
- const vector<WordID>& src = edge.rule_->f();
- TRulePtr rule(new TRule);
- rule->prev_i = edge.i_;
- rule->prev_j = edge.j_;
- rule->lhs_ = cats[edge.head_node_];
- vector<WordID>& f = rule->f_;
- vector<WordID>& e = rule->e_;
- f.resize(tgt.size()); // swap source and target, since the parser
- e.resize(src.size()); // parses using the source side!
- Hypergraph::TailNodeVector tn(edge.tail_nodes_.size());
- int ntc = 0;
- for (int j = 0; j < tgt.size(); ++j) {
- const WordID& cur = tgt[j];
- if (cur > 0) {
- f[j] = cur;
- } else {
- tn[ntc++] = cur;
- f[j] = cats[edge.tail_nodes_[-cur]];
- }
- }
- ntc = 0;
- for (int j = 0; j < src.size(); ++j) {
- const WordID& cur = src[j];
- if (cur > 0) {
- e[j] = cur;
- } else {
- e[j] = tn[ntc++];
- }
- }
- rule->scores_ = edge.feature_values_;
- rule->parent_rule_ = edge.rule_;
- rule->ComputeArity();
- //cerr << "ADD: " << rule->AsString() << endl;
-
- g->AddRule(rule);
- }
- g->SetMaxSpan(target.size() + 1);
- const string& new_goal = TD::Convert(cats.back() * -1);
- vector<GrammarPtr> grammars(1, gp);
- Hypergraph tforest;
- ExhaustiveBottomUpParser parser(new_goal, grammars);
- if (!parser.Parse(target, &tforest))
- return false;
- else
- hg->swap(tforest);
- return true;
-}
-