From 851e389dffdd6996ea32d70defb8906de80b9edc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 14 Dec 2009 20:35:11 -0500 Subject: few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec --- src/hg.h | 225 --------------------------------------------------------------- 1 file changed, 225 deletions(-) delete mode 100644 src/hg.h (limited to 'src/hg.h') diff --git a/src/hg.h b/src/hg.h deleted file mode 100644 index 7a2658b8..00000000 --- a/src/hg.h +++ /dev/null @@ -1,225 +0,0 @@ -#ifndef _HG_H_ -#define _HG_H_ - -#include -#include - -#include "small_vector.h" -#include "sparse_vector.h" -#include "wordid.h" -#include "trule.h" -#include "prob.h" - -// class representing an acyclic hypergraph -// - edges have 1 head, 0..n tails -class Hypergraph { - public: - Hypergraph() {} - - // SmallVector is a fast, small vector implementation for sizes <= 2 - typedef SmallVector TailNodeVector; - - // TODO get rid of state_ and cat_? - struct Node { - Node() : id_(), cat_() {} - int id_; // equal to this object's position in the nodes_ vector - WordID cat_; // non-terminal category if <0, 0 if not set - std::vector in_edges_; // contents refer to positions in edges_ - std::vector out_edges_; // contents refer to positions in edges_ - std::string state_; // opaque state - }; - - // TODO get rid of edge_prob_? (can be computed on the fly as the dot - // product of the weight vector and the feature values) - struct Edge { - Edge() : i_(-1), j_(-1), prev_i_(-1), prev_j_(-1) {} - inline int Arity() const { return tail_nodes_.size(); } - int head_node_; // refers to a position in nodes_ - TailNodeVector tail_nodes_; // contents refer to positions in nodes_ - TRulePtr rule_; - SparseVector feature_values_; - prob_t edge_prob_; // dot product of weights and feat_values - int id_; // equal to this object's position in the edges_ vector - - // span info. typically, i_ and j_ refer to indices in the source sentence - // if a synchronous parse has been executed i_ and j_ will refer to indices - // in the target sentence / lattice and prev_i_ prev_j_ will refer to - // positions in the source. Note: it is up to the translator implementation - // to properly set these values. For some models (like the Forest-input - // phrase based model) it may not be straightforward to do. if these values - // are not properly set, most things will work but alignment and any features - // that depend on them will be broken. - short int i_; - short int j_; - short int prev_i_; - short int prev_j_; - }; - - void swap(Hypergraph& other) { - other.nodes_.swap(nodes_); - other.edges_.swap(edges_); - } - - void ResizeNodes(int size) { - nodes_.resize(size); - for (int i = 0; i < size; ++i) nodes_[i].id_ = i; - } - - // reserves space in the nodes vector to prevent memory locations - // from changing - void ReserveNodes(size_t n, size_t e = 0) { - nodes_.reserve(n); - if (e) edges_.reserve(e); - } - - Edge* AddEdge(const TRulePtr& rule, const TailNodeVector& tail) { - edges_.push_back(Edge()); - Edge* edge = &edges_.back(); - edge->rule_ = rule; - edge->tail_nodes_ = tail; - edge->id_ = edges_.size() - 1; - for (int i = 0; i < edge->tail_nodes_.size(); ++i) - nodes_[edge->tail_nodes_[i]].out_edges_.push_back(edge->id_); - return edge; - } - - Node* AddNode(const WordID& cat, const std::string& state = "") { - nodes_.push_back(Node()); - nodes_.back().cat_ = cat; - nodes_.back().state_ = state; - nodes_.back().id_ = nodes_.size() - 1; - return &nodes_.back(); - } - - void ConnectEdgeToHeadNode(const int edge_id, const int head_id) { - edges_[edge_id].head_node_ = head_id; - nodes_[head_id].in_edges_.push_back(edge_id); - } - - // TODO remove this - use the version that takes indices - void ConnectEdgeToHeadNode(Edge* edge, Node* head) { - edge->head_node_ = head->id_; - head->in_edges_.push_back(edge->id_); - } - - // merge the goal node from other with this goal node - void Union(const Hypergraph& other); - - void PrintGraphviz() const; - - // compute the total number of paths in the forest - double NumberOfPaths() const; - - // BEWARE. this assumes that the source and target language - // strings are identical and that there are no loops. - // It assumes a bunch of other things about where the - // epsilons will be. It tries to assert failure if you - // break these assumptions, but it may not. - // TODO - make this work - void EpsilonRemove(WordID eps); - - // multiple the weights vector by the edge feature vector - // (inner product) to set the edge probabilities - template - void Reweight(const V& weights) { - for (int i = 0; i < edges_.size(); ++i) { - Edge& e = edges_[i]; - e.edge_prob_.logeq(e.feature_values_.dot(weights)); - } - } - - // computes inside and outside scores for each - // edge in the hypergraph - // alpha->size = edges_.size = beta->size - // returns inside prob of goal node - prob_t ComputeEdgePosteriors(double scale, - std::vector* posts) const; - - // find the score of the very best path passing through each edge - prob_t ComputeBestPathThroughEdges(std::vector* posts) const; - - // move weights as near to the source as possible, resulting in a - // stochastic automaton. ONLY FUNCTIONAL FOR *LATTICES*. - // See M. Mohri and M. Riley. A Weight Pushing Algorithm for Large - // Vocabulary Speech Recognition. 2001. - // the log semiring (NOT tropical) is used - void PushWeightsToSource(double scale = 1.0); - // same, except weights are pushed to the goal, works for HGs, - // not just lattices - void PushWeightsToGoal(double scale = 1.0); - - void SortInEdgesByEdgeWeights(); - - void PruneUnreachable(int goal_node_id); // DEPRECATED - - void RemoveNoncoaccessibleStates(int goal_node_id = -1); - - // remove edges from the hypergraph if prune_edge[edge_id] is true - void PruneEdges(const std::vector& prune_edge); - - // if you don't know, use_sum_prod_semiring should be false - void DensityPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double density, - const std::vector* preserve_mask = NULL); - - // prunes any edge whose score on the best path taking that edge is more than alpha away - // from the score of the global best past (or the highest edge posterior) - void BeamPruneInsideOutside(const double scale, const bool use_sum_prod_semiring, const double alpha, - const std::vector* preserve_mask = NULL); - - void clear() { - nodes_.clear(); - edges_.clear(); - } - - inline size_t NumberOfEdges() const { return edges_.size(); } - inline size_t NumberOfNodes() const { return nodes_.size(); } - inline bool empty() const { return nodes_.empty(); } - - // nodes_ is sorted in topological order - std::vector nodes_; - // edges_ is not guaranteed to be in any particular order - std::vector edges_; - - // reorder nodes_ so they are in topological order - // source nodes at 0 sink nodes at size-1 - void TopologicallySortNodesAndEdges(int goal_idx, - const std::vector* prune_edges = NULL); - private: - // returns total nodes reachable - int MarkReachable(const Node& node, - std::vector* rmap, - const std::vector* prune_edges) const; - - static TRulePtr kEPSRule; - static TRulePtr kUnaryRule; -}; - -// common WeightFunctions, map an edge -> WeightType -// for generic Viterbi/Inside algorithms -struct EdgeProb { - inline const prob_t& operator()(const Hypergraph::Edge& e) const { return e.edge_prob_; } -}; - -struct ScaledEdgeProb { - ScaledEdgeProb(const double& alpha) : alpha_(alpha) {} - inline prob_t operator()(const Hypergraph::Edge& e) const { return e.edge_prob_.pow(alpha_); } - const double alpha_; -}; - -struct EdgeFeaturesWeightFunction { - inline const SparseVector& operator()(const Hypergraph::Edge& e) const { return e.feature_values_; } -}; - -struct TransitionEventWeightFunction { - inline SparseVector operator()(const Hypergraph::Edge& e) const { - SparseVector result; - result.set_value(e.id_, prob_t::One()); - return result; - } -}; - -struct TransitionCountWeightFunction { - inline double operator()(const Hypergraph::Edge& e) const { (void)e; return 1.0; } -}; - -#endif -- cgit v1.2.3