From 851e389dffdd6996ea32d70defb8906de80b9edc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 14 Dec 2009 20:35:11 -0500 Subject: few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec --- src/hg.cc | 486 -------------------------------------------------------------- 1 file changed, 486 deletions(-) delete mode 100644 src/hg.cc (limited to 'src/hg.cc') diff --git a/src/hg.cc b/src/hg.cc deleted file mode 100644 index 7bd79394..00000000 --- a/src/hg.cc +++ /dev/null @@ -1,486 +0,0 @@ -#include "hg.h" - -#include -#include -#include -#include -#include - -#include "viterbi.h" -#include "inside_outside.h" -#include "tdict.h" - -using namespace std; - -double Hypergraph::NumberOfPaths() const { - return Inside(*this); -} - -prob_t Hypergraph::ComputeEdgePosteriors(double scale, vector* posts) const { - const ScaledEdgeProb weight(scale); - SparseVector pv; - const double inside = InsideOutside, - EdgeFeaturesWeightFunction>(*this, &pv, weight); - posts->resize(edges_.size()); - for (int i = 0; i < edges_.size(); ++i) - (*posts)[i] = prob_t(pv.value(i)); - return prob_t(inside); -} - -prob_t Hypergraph::ComputeBestPathThroughEdges(vector* post) const { - vector in(edges_.size()); - vector out(edges_.size()); - post->resize(edges_.size()); - - vector ins_node_best(nodes_.size()); - for (int i = 0; i < nodes_.size(); ++i) { - const Node& node = nodes_[i]; - prob_t& node_ins_best = ins_node_best[i]; - if (node.in_edges_.empty()) node_ins_best = prob_t::One(); - for (int j = 0; j < node.in_edges_.size(); ++j) { - const Edge& edge = edges_[node.in_edges_[j]]; - prob_t& in_edge_sco = in[node.in_edges_[j]]; - in_edge_sco = edge.edge_prob_; - for (int k = 0; k < edge.tail_nodes_.size(); ++k) - in_edge_sco *= ins_node_best[edge.tail_nodes_[k]]; - if (in_edge_sco > node_ins_best) node_ins_best = in_edge_sco; - } - } - const prob_t ins_sco = ins_node_best[nodes_.size() - 1]; - - // sanity check - int tots = 0; - for (int i = 0; i < nodes_.size(); ++i) { if (nodes_[i].out_edges_.empty()) tots++; } - assert(tots == 1); - - // compute outside scores, potentially using inside scores - vector out_node_best(nodes_.size()); - for (int i = nodes_.size() - 1; i >= 0; --i) { - const Node& node = nodes_[i]; - prob_t& node_out_best = out_node_best[node.id_]; - if (node.out_edges_.empty()) node_out_best = prob_t::One(); - for (int j = 0; j < node.out_edges_.size(); ++j) { - const Edge& edge = edges_[node.out_edges_[j]]; - prob_t sco = edge.edge_prob_ * out_node_best[edge.head_node_]; - for (int k = 0; k < edge.tail_nodes_.size(); ++k) { - if (edge.tail_nodes_[k] != i) - sco *= ins_node_best[edge.tail_nodes_[k]]; - } - if (sco > node_out_best) node_out_best = sco; - } - for (int j = 0; j < node.in_edges_.size(); ++j) { - out[node.in_edges_[j]] = node_out_best; - } - } - - for (int i = 0; i < in.size(); ++i) - (*post)[i] = in[i] * out[i]; - // for (int i = 0; i < in.size(); ++i) - // cerr << "edge " << i << ": " << log((*post)[i]) << endl; - - return ins_sco; -} - -void Hypergraph::PushWeightsToSource(double scale) { - vector posts; - ComputeEdgePosteriors(scale, &posts); - for (int i = 0; i < nodes_.size(); ++i) { - const Hypergraph::Node& node = nodes_[i]; - prob_t z = prob_t::Zero(); - for (int j = 0; j < node.out_edges_.size(); ++j) - z += posts[node.out_edges_[j]]; - for (int j = 0; j < node.out_edges_.size(); ++j) { - edges_[node.out_edges_[j]].edge_prob_ = posts[node.out_edges_[j]] / z; - } - } -} - -void Hypergraph::PushWeightsToGoal(double scale) { - vector posts; - ComputeEdgePosteriors(scale, &posts); - for (int i = 0; i < nodes_.size(); ++i) { - const Hypergraph::Node& node = nodes_[i]; - prob_t z = prob_t::Zero(); - for (int j = 0; j < node.in_edges_.size(); ++j) - z += posts[node.in_edges_[j]]; - for (int j = 0; j < node.in_edges_.size(); ++j) { - edges_[node.in_edges_[j]].edge_prob_ = posts[node.in_edges_[j]] / z; - } - } -} - -void Hypergraph::PruneEdges(const std::vector& prune_edge) { - assert(prune_edge.size() == edges_.size()); - TopologicallySortNodesAndEdges(nodes_.size() - 1, &prune_edge); -} - -void Hypergraph::DensityPruneInsideOutside(const double scale, - const bool use_sum_prod_semiring, - const double density, - const vector* preserve_mask) { - assert(density >= 1.0); - const int plen = ViterbiPathLength(*this); - vector bp; - int rnum = min(static_cast(edges_.size()), static_cast(density * static_cast(plen))); - if (rnum == edges_.size()) { - cerr << "No pruning required: denisty already sufficient"; - return; - } - vector io(edges_.size()); - if (use_sum_prod_semiring) - ComputeEdgePosteriors(scale, &io); - else - ComputeBestPathThroughEdges(&io); - assert(edges_.size() == io.size()); - vector sorted = io; - nth_element(sorted.begin(), sorted.begin() + rnum, sorted.end(), greater()); - const double cutoff = sorted[rnum]; - vector prune(edges_.size()); - for (int i = 0; i < edges_.size(); ++i) { - prune[i] = (io[i] < cutoff); - if (preserve_mask && (*preserve_mask)[i]) prune[i] = false; - } - PruneEdges(prune); -} - -void Hypergraph::BeamPruneInsideOutside( - const double scale, - const bool use_sum_prod_semiring, - const double alpha, - const vector* preserve_mask) { - assert(alpha > 0.0); - assert(scale > 0.0); - vector io(edges_.size()); - if (use_sum_prod_semiring) - ComputeEdgePosteriors(scale, &io); - else - ComputeBestPathThroughEdges(&io); - assert(edges_.size() == io.size()); - prob_t best; // initializes to zero - for (int i = 0; i < io.size(); ++i) - if (io[i] > best) best = io[i]; - const prob_t aprob(exp(-alpha)); - const prob_t cutoff = best * aprob; - // cerr << "aprob = " << aprob << "\t CUTOFF=" << cutoff << endl; - vector prune(edges_.size()); - //cerr << preserve_mask.size() << " " << edges_.size() << endl; - int pc = 0; - for (int i = 0; i < io.size(); ++i) { - const bool prune_edge = (io[i] < cutoff); - if (prune_edge) ++pc; - prune[i] = (io[i] < cutoff); - if (preserve_mask && (*preserve_mask)[i]) prune[i] = false; - } - // cerr << "Beam pruning " << pc << "/" << io.size() << " edges\n"; - PruneEdges(prune); -} - -void Hypergraph::PrintGraphviz() const { - int ei = 0; - cerr << "digraph G {\n rankdir=LR;\n nodesep=.05;\n"; - for (vector::const_iterator i = edges_.begin(); - i != edges_.end(); ++i) { - const Edge& edge=*i; - ++ei; - static const string none = ""; - string rule = (edge.rule_ ? edge.rule_->AsString(false) : none); - - cerr << " A_" << ei << " [label=\"" << rule << " p=" << edge.edge_prob_ - << " F:" << edge.feature_values_ - << "\" shape=\"rect\"];\n"; - for (int i = 0; i < edge.tail_nodes_.size(); ++i) { - cerr << " " << edge.tail_nodes_[i] << " -> A_" << ei << ";\n"; - } - cerr << " A_" << ei << " -> " << edge.head_node_ << ";\n"; - } - for (vector::const_iterator ni = nodes_.begin(); - ni != nodes_.end(); ++ni) { - cerr << " " << ni->id_ << "[label=\"" << (ni->cat_ < 0 ? TD::Convert(ni->cat_ * -1) : "") - //cerr << " " << ni->id_ << "[label=\"" << ni->cat_ - << " n=" << ni->id_ -// << ",x=" << &*ni -// << ",in=" << ni->in_edges_.size() -// << ",out=" << ni->out_edges_.size() - << "\"];\n"; - } - cerr << "}\n"; -} - -void Hypergraph::Union(const Hypergraph& other) { - if (&other == this) return; - if (nodes_.empty()) { nodes_ = other.nodes_; edges_ = other.edges_; return; } - int noff = nodes_.size(); - int eoff = edges_.size(); - int ogoal = other.nodes_.size() - 1; - int cgoal = noff - 1; - // keep a single goal node, so add nodes.size - 1 - nodes_.resize(nodes_.size() + ogoal); - // add all edges - edges_.resize(edges_.size() + other.edges_.size()); - - for (int i = 0; i < ogoal; ++i) { - const Node& on = other.nodes_[i]; - Node& cn = nodes_[i + noff]; - cn.id_ = i + noff; - cn.in_edges_.resize(on.in_edges_.size()); - for (int j = 0; j < on.in_edges_.size(); ++j) - cn.in_edges_[j] = on.in_edges_[j] + eoff; - - cn.out_edges_.resize(on.out_edges_.size()); - for (int j = 0; j < on.out_edges_.size(); ++j) - cn.out_edges_[j] = on.out_edges_[j] + eoff; - } - - for (int i = 0; i < other.edges_.size(); ++i) { - const Edge& oe = other.edges_[i]; - Edge& ce = edges_[i + eoff]; - ce.id_ = i + eoff; - ce.rule_ = oe.rule_; - ce.feature_values_ = oe.feature_values_; - if (oe.head_node_ == ogoal) { - ce.head_node_ = cgoal; - nodes_[cgoal].in_edges_.push_back(ce.id_); - } else { - ce.head_node_ = oe.head_node_ + noff; - } - ce.tail_nodes_.resize(oe.tail_nodes_.size()); - for (int j = 0; j < oe.tail_nodes_.size(); ++j) - ce.tail_nodes_[j] = oe.tail_nodes_[j] + noff; - } - - TopologicallySortNodesAndEdges(cgoal); -} - -int Hypergraph::MarkReachable(const Node& node, - vector* rmap, - const vector* prune_edges) const { - int total = 0; - if (!(*rmap)[node.id_]) { - total = 1; - (*rmap)[node.id_] = true; - for (int i = 0; i < node.in_edges_.size(); ++i) { - if (!(prune_edges && (*prune_edges)[node.in_edges_[i]])) { - for (int j = 0; j < edges_[node.in_edges_[i]].tail_nodes_.size(); ++j) - total += MarkReachable(nodes_[edges_[node.in_edges_[i]].tail_nodes_[j]], rmap, prune_edges); - } - } - } - return total; -} - -void Hypergraph::PruneUnreachable(int goal_node_id) { - TopologicallySortNodesAndEdges(goal_node_id, NULL); -} - -void Hypergraph::RemoveNoncoaccessibleStates(int goal_node_id) { - if (goal_node_id < 0) goal_node_id += nodes_.size(); - assert(goal_node_id >= 0); - assert(goal_node_id < nodes_.size()); - - // TODO finish implementation - abort(); -} - -void Hypergraph::TopologicallySortNodesAndEdges(int goal_index, - const vector* prune_edges) { - vector sedges(edges_.size()); - // figure out which nodes are reachable from the goal - vector reachable(nodes_.size(), false); - int num_reachable = MarkReachable(nodes_[goal_index], &reachable, prune_edges); - vector snodes(num_reachable); snodes.clear(); - - // enumerate all reachable nodes in topologically sorted order - vector old_node_to_new_id(nodes_.size(), -1); - vector node_to_incount(nodes_.size(), -1); - vector node_processed(nodes_.size(), false); - typedef map > PQueue; - PQueue pri_q; - for (int i = 0; i < nodes_.size(); ++i) { - if (!reachable[i]) - continue; - const int inedges = nodes_[i].in_edges_.size(); - int incount = inedges; - for (int j = 0; j < inedges; ++j) - if (edges_[nodes_[i].in_edges_[j]].tail_nodes_.size() == 0 || - (prune_edges && (*prune_edges)[nodes_[i].in_edges_[j]])) - --incount; - // cerr << &nodes_[i] <<" : incount=" << incount << "\tout=" << nodes_[i].out_edges_.size() << "\t(in-edges=" << inedges << ")\n"; - assert(node_to_incount[i] == -1); - node_to_incount[i] = incount; - pri_q[incount].insert(i); - } - - int edge_count = 0; - while (!pri_q.empty()) { - PQueue::iterator iter = pri_q.find(0); - assert(iter != pri_q.end()); - assert(!iter->second.empty()); - - // get first node with incount = 0 - const int cur_index = *iter->second.begin(); - const Node& node = nodes_[cur_index]; - assert(reachable[cur_index]); - //cerr << "node: " << node << endl; - const int new_node_index = snodes.size(); - old_node_to_new_id[cur_index] = new_node_index; - snodes.push_back(node); - Node& new_node = snodes.back(); - new_node.id_ = new_node_index; - new_node.out_edges_.clear(); - - // fix up edges - we can now process the in edges and - // the out edges of their tails - int oi = 0; - for (int i = 0; i < node.in_edges_.size(); ++i, ++oi) { - if (prune_edges && (*prune_edges)[node.in_edges_[i]]) { - --oi; - continue; - } - new_node.in_edges_[oi] = edge_count; - Edge& edge = sedges[edge_count]; - edge.id_ = edge_count; - ++edge_count; - const Edge& old_edge = edges_[node.in_edges_[i]]; - edge.rule_ = old_edge.rule_; - edge.feature_values_ = old_edge.feature_values_; - edge.head_node_ = new_node_index; - edge.tail_nodes_.resize(old_edge.tail_nodes_.size()); - edge.edge_prob_ = old_edge.edge_prob_; - edge.i_ = old_edge.i_; - edge.j_ = old_edge.j_; - edge.prev_i_ = old_edge.prev_i_; - edge.prev_j_ = old_edge.prev_j_; - for (int j = 0; j < old_edge.tail_nodes_.size(); ++j) { - const Node& old_tail_node = nodes_[old_edge.tail_nodes_[j]]; - edge.tail_nodes_[j] = old_node_to_new_id[old_tail_node.id_]; - snodes[edge.tail_nodes_[j]].out_edges_.push_back(edge_count-1); - assert(edge.tail_nodes_[j] != new_node_index); - } - } - assert(oi <= new_node.in_edges_.size()); - new_node.in_edges_.resize(oi); - - for (int i = 0; i < node.out_edges_.size(); ++i) { - const Edge& edge = edges_[node.out_edges_[i]]; - const int next_index = edge.head_node_; - assert(cur_index != next_index); - if (!reachable[next_index]) continue; - if (prune_edges && (*prune_edges)[edge.id_]) continue; - - bool dontReduce = false; - for (int j = 0; j < edge.tail_nodes_.size() && !dontReduce; ++j) { - int tail_index = edge.tail_nodes_[j]; - dontReduce = (tail_index != cur_index) && !node_processed[tail_index]; - } - if (dontReduce) - continue; - - const int incount = node_to_incount[next_index]; - if (incount <= 0) { - cerr << "incount = " << incount << ", should be > 0!\n"; - cerr << "do you have a cycle in your hypergraph?\n"; - abort(); - } - PQueue::iterator it = pri_q.find(incount); - assert(it != pri_q.end()); - it->second.erase(next_index); - if (it->second.empty()) pri_q.erase(it); - - // reinsert node with reduced incount - pri_q[incount-1].insert(next_index); - --node_to_incount[next_index]; - } - - // remove node from set - iter->second.erase(cur_index); - if (iter->second.empty()) - pri_q.erase(iter); - node_processed[cur_index] = true; - } - - sedges.resize(edge_count); - nodes_.swap(snodes); - edges_.swap(sedges); - assert(nodes_.back().out_edges_.size() == 0); -} - -TRulePtr Hypergraph::kEPSRule; -TRulePtr Hypergraph::kUnaryRule; - -void Hypergraph::EpsilonRemove(WordID eps) { - if (!kEPSRule) { - kEPSRule.reset(new TRule("[X] ||| ||| ")); - kUnaryRule.reset(new TRule("[X] ||| [X,1] ||| [X,1]")); - } - vector kill(edges_.size(), false); - for (int i = 0; i < edges_.size(); ++i) { - const Edge& edge = edges_[i]; - if (edge.tail_nodes_.empty() && - edge.rule_->f_.size() == 1 && - edge.rule_->f_[0] == eps) { - kill[i] = true; - if (!edge.feature_values_.empty()) { - Node& node = nodes_[edge.head_node_]; - if (node.in_edges_.size() != 1) { - cerr << "[WARNING] edge with features going into non-empty node - can't promote\n"; - // this *probably* means that there are multiple derivations of the - // same sequence via different paths through the input forest - // this needs to be investigated and fixed - } else { - for (int j = 0; j < node.out_edges_.size(); ++j) - edges_[node.out_edges_[j]].feature_values_ += edge.feature_values_; - // cerr << "PROMOTED " << edge.feature_values_ << endl; - } - } - } - } - bool created_eps = false; - PruneEdges(kill); - for (int i = 0; i < nodes_.size(); ++i) { - const Node& node = nodes_[i]; - if (node.in_edges_.empty()) { - for (int j = 0; j < node.out_edges_.size(); ++j) { - Edge& edge = edges_[node.out_edges_[j]]; - if (edge.rule_->Arity() == 2) { - assert(edge.rule_->f_.size() == 2); - assert(edge.rule_->e_.size() == 2); - edge.rule_ = kUnaryRule; - int cur = node.id_; - int t = -1; - assert(edge.tail_nodes_.size() == 2); - for (int i = 0; i < 2; ++i) if (edge.tail_nodes_[i] != cur) { t = edge.tail_nodes_[i]; } - assert(t != -1); - edge.tail_nodes_.resize(1); - edge.tail_nodes_[0] = t; - } else { - edge.rule_ = kEPSRule; - edge.rule_->f_[0] = eps; - edge.rule_->e_[0] = eps; - edge.tail_nodes_.clear(); - created_eps = true; - } - } - } - } - vector k2(edges_.size(), false); - PruneEdges(k2); - if (created_eps) EpsilonRemove(eps); -} - -struct EdgeWeightSorter { - const Hypergraph& hg; - EdgeWeightSorter(const Hypergraph& h) : hg(h) {} - bool operator()(int a, int b) const { - return hg.edges_[a].edge_prob_ > hg.edges_[b].edge_prob_; - } -}; - -void Hypergraph::SortInEdgesByEdgeWeights() { - for (int i = 0; i < nodes_.size(); ++i) { - Node& node = nodes_[i]; - sort(node.in_edges_.begin(), node.in_edges_.end(), EdgeWeightSorter(*this)); - } -} - -- cgit v1.2.3