diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
commit | 0172721855098ca02b207231a654dffa5e4eb1c9 (patch) | |
tree | 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /decoder/aligner.cc | |
parent | 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff) |
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/aligner.cc')
-rw-r--r-- | decoder/aligner.cc | 319 |
1 files changed, 319 insertions, 0 deletions
diff --git a/decoder/aligner.cc b/decoder/aligner.cc new file mode 100644 index 00000000..bad97b74 --- /dev/null +++ b/decoder/aligner.cc @@ -0,0 +1,319 @@ +#include "aligner.h" + +#include "array2d.h" +#include "hg.h" +#include "sentence_metadata.h" +#include "inside_outside.h" +#include "viterbi.h" +#include <set> + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr<Array2D<bool> > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignerTools::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) { + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) (*out) << ' '; else need_space = true; + (*out) << i << '-' << j; + } + (*out) << endl; +} + +// used with lexical models since they may not fully generate the +// source string +void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g, + vector<set<int> >* src_cov) { + src_cov->clear(); + src_cov->resize(g.edges_.size()); + + for (int i = 0; i < g.edges_.size(); ++i) { + const Hypergraph::Edge& edge = g.edges_[i]; + set<int>& cov = (*src_cov)[i]; + // no words + if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) + continue; + // aligned to NULL (crf ibm variant only) + if (edge.prev_i_ == -1 || edge.i_ == -1) + continue; + assert(edge.j_ >= 0); + assert(edge.prev_j_ >= 0); + if (edge.Arity() == 0) { + for (int k = edge.prev_i_; k < edge.prev_j_; ++k) + cov.insert(k); + } else { + // note: this code, which handles mixed NT and terminal + // rules assumes that nodes uniquely define a src and trg + // span. + int k = edge.prev_i_; + int j = 0; + const vector<WordID>& f = edge.rule_->e(); // rules are inverted + while (k < edge.prev_j_) { + if (f[j] > 0) { + cov.insert(k); + // cerr << "src: " << k << endl; + ++k; + ++j; + } else { + const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]]; + assert(tailnode.in_edges_.size() > 0); + // any edge will do: + const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; + //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span + k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span + ++j; + } + } + } + } +} + +int SourceEdgeCoveragesUsingTree(const Hypergraph& g, + int node_id, + int span_start, + vector<int>* spans, + vector<set<int> >* src_cov) { + const Hypergraph::Node& node = g.nodes_[node_id]; + int k = -1; + for (int i = 0; i < node.in_edges_.size(); ++i) { + const int edge_id = node.in_edges_[i]; + const Hypergraph::Edge& edge = g.edges_[edge_id]; + set<int>& cov = (*src_cov)[edge_id]; + const vector<WordID>& f = edge.rule_->e(); // rules are inverted + int j = 0; + k = span_start; + while (j < f.size()) { + if (f[j] > 0) { + cov.insert(k); + ++k; + ++j; + } else { + const int tail_node_id = edge.tail_nodes_[-f[j]]; + int &right_edge = (*spans)[tail_node_id]; + if (right_edge < 0) + right_edge = SourceEdgeCoveragesUsingTree(g, tail_node_id, k, spans, src_cov); + k = right_edge; + ++j; + } + } + } + return k; +} + +void SourceEdgeCoveragesUsingTree(const Hypergraph& g, + vector<set<int> >* src_cov) { + src_cov->clear(); + src_cov->resize(g.edges_.size()); + vector<int> span_sizes(g.nodes_.size(), -1); + SourceEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, src_cov); +} + +int TargetEdgeCoveragesUsingTree(const Hypergraph& g, + int node_id, + int span_start, + vector<int>* spans, + vector<set<int> >* trg_cov) { + const Hypergraph::Node& node = g.nodes_[node_id]; + int k = -1; + for (int i = 0; i < node.in_edges_.size(); ++i) { + const int edge_id = node.in_edges_[i]; + const Hypergraph::Edge& edge = g.edges_[edge_id]; + set<int>& cov = (*trg_cov)[edge_id]; + int ntc = 0; + const vector<WordID>& e = edge.rule_->f(); // rules are inverted + int j = 0; + k = span_start; + while (j < e.size()) { + if (e[j] > 0) { + cov.insert(k); + ++k; + ++j; + } else { + const int tail_node_id = edge.tail_nodes_[ntc]; + ++ntc; + int &right_edge = (*spans)[tail_node_id]; + if (right_edge < 0) + right_edge = TargetEdgeCoveragesUsingTree(g, tail_node_id, k, spans, trg_cov); + k = right_edge; + ++j; + } + } + // cerr << "node=" << node_id << ": k=" << k << endl; + } + return k; +} + +void TargetEdgeCoveragesUsingTree(const Hypergraph& g, + vector<set<int> >* trg_cov) { + trg_cov->clear(); + trg_cov->resize(g.edges_.size()); + vector<int> span_sizes(g.nodes_.size(), -1); + TargetEdgeCoveragesUsingTree(g, g.nodes_.size() - 1, 0, &span_sizes, trg_cov); +} + +struct TransitionEventWeightFunction { + inline SparseVector<prob_t> operator()(const Hypergraph::Edge& e) const { + SparseVector<prob_t> result; + result.set_value(e.id_, e.edge_prob_); + return result; + } +}; + +// this code is rather complicated since it must deal with generating alignments +// when lattices are specified as input as well as with models that do not generate +// full sentence pairs (like lexical alignment models) +void AlignerTools::WriteAlignment(const Lattice& src_lattice, + const Lattice& trg_lattice, + const Hypergraph& in_g, + ostream* out, + bool map_instead_of_viterbi, + const vector<bool>* edges) { + bool fix_up_src_spans = false; + const Hypergraph* g = &in_g; + if (!src_lattice.IsSentence() || + !trg_lattice.IsSentence()) { + if (map_instead_of_viterbi) { + cerr << " Lattice alignment: using Viterbi instead of MAP alignment\n"; + } + map_instead_of_viterbi = false; + fix_up_src_spans = !src_lattice.IsSentence(); + } + if (!map_instead_of_viterbi || edges) { + Hypergraph* new_hg = in_g.CreateViterbiHypergraph(edges); + for (int i = 0; i < new_hg->edges_.size(); ++i) + new_hg->edges_[i].edge_prob_ = prob_t::One(); + g = new_hg; + } + + vector<prob_t> edge_posteriors(g->edges_.size(), prob_t::Zero()); + vector<WordID> trg_sent; + vector<WordID> src_sent; + if (fix_up_src_spans) { + ViterbiESentence(*g, &src_sent); + } else { + src_sent.resize(src_lattice.size()); + for (int i = 0; i < src_sent.size(); ++i) + src_sent[i] = src_lattice[i][0].label; + } + + ViterbiFSentence(*g, &trg_sent); + + if (edges || !map_instead_of_viterbi) { + for (int i = 0; i < edge_posteriors.size(); ++i) + edge_posteriors[i] = prob_t::One(); + } else { + SparseVector<prob_t> posts; + const prob_t z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, TransitionEventWeightFunction>(*g, &posts); + for (int i = 0; i < edge_posteriors.size(); ++i) + edge_posteriors[i] = posts[i] / z; + } + vector<set<int> > src_cov(g->edges_.size()); + vector<set<int> > trg_cov(g->edges_.size()); + TargetEdgeCoveragesUsingTree(*g, &trg_cov); + + if (fix_up_src_spans) + SourceEdgeCoveragesUsingTree(*g, &src_cov); + else + SourceEdgeCoveragesUsingParseIndices(*g, &src_cov); + + // figure out the src and reference size; + int src_size = src_sent.size(); + int ref_size = trg_sent.size(); + Array2D<prob_t> align(src_size, ref_size, prob_t::Zero()); + for (int c = 0; c < g->edges_.size(); ++c) { + const prob_t& p = edge_posteriors[c]; + const set<int>& srcs = src_cov[c]; + const set<int>& trgs = trg_cov[c]; + for (set<int>::const_iterator si = srcs.begin(); + si != srcs.end(); ++si) { + for (set<int>::const_iterator ti = trgs.begin(); + ti != trgs.end(); ++ti) { + align(*si, *ti) += p; + } + } + } + if (g != &in_g) { delete g; g = NULL; } + + prob_t threshold(0.9); + const bool use_soft_threshold = true; // TODO configure + + Array2D<bool> grid(src_size, ref_size, false); + for (int j = 0; j < ref_size; ++j) { + if (use_soft_threshold) { + threshold = prob_t::Zero(); + for (int i = 0; i < src_size; ++i) + if (align(i, j) > threshold) threshold = align(i, j); + //threshold *= prob_t(0.99); + } + for (int i = 0; i < src_size; ++i) + grid(i, j) = align(i, j) >= threshold; + } + if (out == &cout) { + // TODO need to do some sort of verbose flag + cerr << align << endl; + cerr << grid << endl; + } + (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; + SerializePharaohFormat(grid, out); +}; + |