From 851e389dffdd6996ea32d70defb8906de80b9edc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 14 Dec 2009 20:35:11 -0500 Subject: few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec --- src/aligner.cc | 204 --------------------------------------------------------- 1 file changed, 204 deletions(-) delete mode 100644 src/aligner.cc (limited to 'src/aligner.cc') diff --git a/src/aligner.cc b/src/aligner.cc deleted file mode 100644 index d9d067e5..00000000 --- a/src/aligner.cc +++ /dev/null @@ -1,204 +0,0 @@ -#include "aligner.h" - -#include "array2d.h" -#include "hg.h" -#include "inside_outside.h" -#include - -using namespace std; - -struct EdgeCoverageInfo { - set src_indices; - set trg_indices; -}; - -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - while (i < al.size()) { - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); - i = 0; - while (i < al.size()) { - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignerTools::SerializePharaohFormat(const Array2D& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - -// compute the coverage vectors of each edge -// prereq: all derivations yield the same string pair -void ComputeCoverages(const Hypergraph& g, - vector* pcovs) { - for (int i = 0; i < g.edges_.size(); ++i) { - const Hypergraph::Edge& edge = g.edges_[i]; - EdgeCoverageInfo& cov = (*pcovs)[i]; - // no words - if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0) - continue; - // aligned to NULL (crf ibm variant only) - if (edge.prev_i_ == -1 || edge.i_ == -1) - continue; - assert(edge.j_ >= 0); - assert(edge.prev_j_ >= 0); - if (edge.Arity() == 0) { - for (int k = edge.i_; k < edge.j_; ++k) - cov.trg_indices.insert(k); - for (int k = edge.prev_i_; k < edge.prev_j_; ++k) - cov.src_indices.insert(k); - } else { - // note: this code, which handles mixed NT and terminal - // rules assumes that nodes uniquely define a src and trg - // span. - int k = edge.prev_i_; - int j = 0; - const vector& f = edge.rule_->e(); // rules are inverted - while (k < edge.prev_j_) { - if (f[j] > 0) { - cov.src_indices.insert(k); - // cerr << "src: " << k << endl; - ++k; - ++j; - } else { - const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[-f[j]]]; - assert(tailnode.in_edges_.size() > 0); - // any edge will do: - const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; - //cerr << "skip " << (rep_edge.prev_j_ - rep_edge.prev_i_) << endl; // src span - k += (rep_edge.prev_j_ - rep_edge.prev_i_); // src span - ++j; - } - } - int tc = 0; - const vector& e = edge.rule_->f(); // rules are inverted - k = edge.i_; - j = 0; - // cerr << edge.rule_->AsString() << endl; - // cerr << "i=" << k << " j=" << edge.j_ << endl; - while (k < edge.j_) { - //cerr << " k=" << k << endl; - if (e[j] > 0) { - cov.trg_indices.insert(k); - // cerr << "trg: " << k << endl; - ++k; - ++j; - } else { - assert(tc < edge.tail_nodes_.size()); - const Hypergraph::Node& tailnode = g.nodes_[edge.tail_nodes_[tc]]; - assert(tailnode.in_edges_.size() > 0); - // any edge will do: - const Hypergraph::Edge& rep_edge = g.edges_[tailnode.in_edges_.front()]; - // cerr << "t skip " << (rep_edge.j_ - rep_edge.i_) << endl; // src span - k += (rep_edge.j_ - rep_edge.i_); // src span - ++j; - ++tc; - } - } - //abort(); - } - } -} - -void AlignerTools::WriteAlignment(const string& input, - const Lattice& ref, - const Hypergraph& g, - bool map_instead_of_viterbi) { - if (!map_instead_of_viterbi) { - assert(!"not implemented!"); - } - vector edge_posteriors(g.edges_.size()); - { - SparseVector posts; - InsideOutside, TransitionEventWeightFunction>(g, &posts); - for (int i = 0; i < edge_posteriors.size(); ++i) - edge_posteriors[i] = posts[i]; - } - vector edge2cov(g.edges_.size()); - ComputeCoverages(g, &edge2cov); - - Lattice src; - // currently only dealing with src text, even if the - // model supports lattice translation (which it probably does) - LatticeTools::ConvertTextToLattice(input, &src); - // TODO assert that src is a "real lattice" - - Array2D align(src.size(), ref.size(), prob_t::Zero()); - for (int c = 0; c < g.edges_.size(); ++c) { - const prob_t& p = edge_posteriors[c]; - const EdgeCoverageInfo& eci = edge2cov[c]; - for (set::const_iterator si = eci.src_indices.begin(); - si != eci.src_indices.end(); ++si) { - for (set::const_iterator ti = eci.trg_indices.begin(); - ti != eci.trg_indices.end(); ++ti) { - align(*si, *ti) += p; - } - } - } - prob_t threshold(0.9); - const bool use_soft_threshold = true; // TODO configure - - Array2D grid(src.size(), ref.size(), false); - for (int j = 0; j < ref.size(); ++j) { - if (use_soft_threshold) { - threshold = prob_t::Zero(); - for (int i = 0; i < src.size(); ++i) - if (align(i, j) > threshold) threshold = align(i, j); - //threshold *= prob_t(0.99); - } - for (int i = 0; i < src.size(); ++i) - grid(i, j) = align(i, j) >= threshold; - } - cerr << align << endl; - cerr << grid << endl; - SerializePharaohFormat(grid, &cout); -}; - -- cgit v1.2.3