From 8372086f2fc4bd765fdd05e8cf95faeb147a6587 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 30 Mar 2014 23:50:17 -0400 Subject: almost complete tree to string translator --- decoder/decoder.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'decoder/decoder.cc') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 31049216..43e2640d 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -490,8 +490,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } formalism = LowercaseString(str("formalism",conf)); - if (formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign" && formalism != "rescore") { - cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lextrans', 'lexalign', 'rescore', or 'tagger'\n"; + if (formalism != "t2s" && formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign" && formalism != "rescore") { + cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 't2s', 'csplit', 'lextrans', 'lexalign', 'rescore', or 'tagger'\n"; cerr << dcmdline_options << endl; exit(1); } @@ -626,6 +626,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream // set up translation back end if (formalism == "scfg") translator.reset(new SCFGTranslator(conf)); + else if (formalism == "t2s") + translator.reset(new Tree2StringTranslator(conf)); else if (formalism == "fst") translator.reset(new FSTTranslator(conf)); else if (formalism == "pb") -- cgit v1.2.3 From b9e6e7e24cc48021090b689e143288e2b7f2b5fc Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 7 Apr 2014 22:56:34 -0400 Subject: track node state for smarter union --- Makefile.am | 2 +- decoder/Makefile.am | 1 + decoder/apply_models.cc | 306 +++++++++++++++++----------------- decoder/bottom_up_parser.cc | 10 ++ decoder/decoder.cc | 13 ++ decoder/fst_translator.cc | 6 + decoder/hg.cc | 47 ++---- decoder/hg.h | 10 +- decoder/lexalign.cc | 5 + decoder/lextrans.cc | 5 + decoder/node_state_hash.h | 36 ++++ decoder/nt_span.h | 2 +- decoder/tagger.cc | 5 + decoder/tree2string_translator.cc | 14 +- mteval/Makefile.am | 8 +- tests/tools/filter-stderr.pl | 1 + utils/Makefile.am | 3 +- utils/hash.h | 21 +-- utils/murmur_hash.h | 186 --------------------- utils/murmur_hash3.cc | 340 ++++++++++++++++++++++++++++++++++++++ utils/murmur_hash3.h | 67 ++++++++ 21 files changed, 699 insertions(+), 389 deletions(-) create mode 100644 decoder/node_state_hash.h delete mode 100644 utils/murmur_hash.h create mode 100644 utils/murmur_hash3.cc create mode 100644 utils/murmur_hash3.h (limited to 'decoder/decoder.cc') diff --git a/Makefile.am b/Makefile.am index 598293d1..88327477 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,13 +3,13 @@ # cyclic dependencies between these directories! 
SUBDIRS = \ utils \ - mteval \ klm/util/double-conversion \ klm/util \ klm/util/stream \ klm/lm \ klm/lm/builder \ klm/search \ + mteval \ decoder \ training \ word-aligner \ diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 5c91fe65..c85f17ed 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -144,6 +144,7 @@ libcdec_a_SOURCES = \ lattice.cc \ lexalign.cc \ lextrans.cc \ + node_state_hash.h \ tree_fragment.cc \ tree_fragment.h \ maxtrans_blunsom.cc \ diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 9a8f60be..9f8bbead 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -19,6 +19,7 @@ namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } #include +#include "node_state_hash.h" #include "verbose.h" #include "hg.h" #include "ff.h" @@ -229,7 +230,7 @@ public: D.clear(); } - void IncorporateIntoPlusLMForest(Candidate* item, State2Node* s2n, CandidateList* freelist) { + void IncorporateIntoPlusLMForest(size_t head_node_hash, Candidate* item, State2Node* s2n, CandidateList* freelist) { Hypergraph::Edge* new_edge = out.AddEdge(item->out_edge_); new_edge->edge_prob_ = item->out_edge_.edge_prob_; Candidate*& o_item = (*s2n)[item->state_]; @@ -238,6 +239,7 @@ public: int& node_id = o_item->node_index_; if (node_id < 0) { Hypergraph::Node* new_node = out.AddNode(in.nodes_[item->in_edge_->head_node_].cat_); + new_node->node_hash = cdec::HashNode(head_node_hash, item->state_); // ID is combination of existing state + residual state node_states_.push_back(item->state_); node_id = new_node->id_; } @@ -287,7 +289,7 @@ public: cand.pop_back(); // cerr << "POPPED: " << *item << endl; PushSucc(*item, is_goal, &cand, &unique_cands); - IncorporateIntoPlusLMForest(item, &state2node, &freelist); + IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist); ++pops; } D_v.resize(state2node.size()); @@ -306,112 +308,112 @@ public: } void KBestFast(const int vert_index, const bool is_goal) { - // cerr << "KBest(" << vert_index << ")\n"; - CandidateList& D_v = D[vert_index]; - assert(D_v.empty()); - const Hypergraph::Node& v = in.nodes_[vert_index]; - // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; - const vector& in_edges = v.in_edges_; - CandidateHeap cand; - CandidateList freelist; - cand.reserve(in_edges.size()); - //init with j<0,0> for all rules-edges that lead to node-(NT-span) - for (int i = 0; i < in_edges.size(); ++i) { - const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; - const JVector j(edge.tail_nodes_.size(), 0); - cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); - } - // cerr << " making heap of " << cand.size() << " candidates\n"; - make_heap(cand.begin(), cand.end(), HeapCandCompare()); - State2Node state2node; // "buf" in Figure 2 - int pops = 0; - while(!cand.empty() && pops < pop_limit_) { - pop_heap(cand.begin(), cand.end(), HeapCandCompare()); - Candidate* item = cand.back(); - cand.pop_back(); - // cerr << "POPPED: " << *item << endl; - - PushSuccFast(*item, is_goal, &cand); - IncorporateIntoPlusLMForest(item, &state2node, &freelist); - ++pops; - } - D_v.resize(state2node.size()); - int c = 0; - for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i){ - D_v[c++] = i->second; - // cerr << "MERGED: " << *i->second << endl; - } - //cerr <<"Node id: "<< vert_index<< endl; - //#ifdef MEASURE_CA - // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << 
")"<& in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + //init with j<0,0> for all rules-edges that lead to node-(NT-span) + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); + } + // cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); + // cerr << "POPPED: " << *item << endl; + + PushSuccFast(*item, is_goal, &cand); + IncorporateIntoPlusLMForest(v.node_hash, item, &state2node, &freelist); + ++pops; + } + D_v.resize(state2node.size()); + int c = 0; + for (auto& i : state2node) { + D_v[c++] = i.second; + // cerr << "MERGED: " << *i.second << endl; + } + //cerr <<"Node id: "<< vert_index<< endl; + //#ifdef MEASURE_CA + // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")"<& in_edges = v.in_edges_; - CandidateHeap cand; - CandidateList freelist; - cand.reserve(in_edges.size()); - UniqueCandidateSet unique_accepted; - //init with j<0,0> for all rules-edges that lead to node-(NT-span) - for (int i = 0; i < in_edges.size(); ++i) { - const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; - const JVector j(edge.tail_nodes_.size(), 0); - cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); - } - // cerr << " making heap of " << cand.size() << " candidates\n"; - make_heap(cand.begin(), cand.end(), HeapCandCompare()); - State2Node state2node; // "buf" in Figure 2 - int pops = 0; - while(!cand.empty() && pops < pop_limit_) { - pop_heap(cand.begin(), cand.end(), HeapCandCompare()); - Candidate* item = cand.back(); - cand.pop_back(); + // cerr << "KBest(" << vert_index << ")\n"; + CandidateList& D_v = D[vert_index]; + assert(D_v.empty()); + const Hypergraph::Node& v = in.nodes_[vert_index]; + // cerr << " has " << v.in_edges_.size() << " in-coming edges\n"; + const vector& in_edges = v.in_edges_; + CandidateHeap cand; + CandidateList freelist; + cand.reserve(in_edges.size()); + UniqueCandidateSet unique_accepted; + //init with j<0,0> for all rules-edges that lead to node-(NT-span) + for (int i = 0; i < in_edges.size(); ++i) { + const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; + const JVector j(edge.tail_nodes_.size(), 0); + cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); + } + // cerr << " making heap of " << cand.size() << " candidates\n"; + make_heap(cand.begin(), cand.end(), HeapCandCompare()); + State2Node state2node; // "buf" in Figure 2 + int pops = 0; + while(!cand.empty() && pops < pop_limit_) { + pop_heap(cand.begin(), cand.end(), HeapCandCompare()); + Candidate* item = cand.back(); + cand.pop_back(); bool is_new = unique_accepted.insert(item).second; - assert(is_new); // these should all be unique! 
- // cerr << "POPPED: " << *item << endl; - - PushSuccFast2(*item, is_goal, &cand, &unique_accepted); - IncorporateIntoPlusLMForest(item, &state2node, &freelist); - ++pops; - } - D_v.resize(state2node.size()); - int c = 0; - for (State2Node::iterator i = state2node.begin(); i != state2node.end(); ++i){ - D_v[c++] = i->second; - // cerr << "MERGED: " << *i->second << endl; - } - //cerr <<"Node id: "<< vert_index<< endl; - //#ifdef MEASURE_CA - // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")"<second; + // cerr << "MERGED: " << *i->second << endl; + } + //cerr <<"Node id: "<< vert_index<< endl; + //#ifdef MEASURE_CA + // cerr << "countInProcess (pop/tot): node id: " << vert_index << " (" << count_in_process_pop << "/" << count_in_process_tot << ")"<tail_nodes_[i]].size()) { - Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); - cand.push_back(new_cand); - push_heap(cand.begin(), cand.end(), HeapCandCompare()); - } - if(item.j_[i]!=0){ - return; - } - } + CandidateHeap& cand = *pcand; + for (int i = 0; i < item.j_.size(); ++i) { + JVector j = item.j_; + ++j[i]; + if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + } + if(item.j_[i]!=0){ + return; + } + } } //PushSucc only if all ancest Cand are added void PushSuccFast2(const Candidate& item, const bool is_goal, CandidateHeap* pcand, UniqueCandidateSet* ps){ - CandidateHeap& cand = *pcand; - for (int i = 0; i < item.j_.size(); ++i) { - JVector j = item.j_; - ++j[i]; - if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { - Candidate query_unique(*item.in_edge_, j); - if (HasAllAncestors(&query_unique,ps)) { - Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); - cand.push_back(new_cand); - push_heap(cand.begin(), cand.end(), HeapCandCompare()); - } - } - } + CandidateHeap& cand = *pcand; + for (int i = 0; i < item.j_.size(); ++i) { + JVector j = item.j_; + ++j[i]; + if (j[i] < D[item.in_edge_->tail_nodes_[i]].size()) { + Candidate query_unique(*item.in_edge_, j); + if (HasAllAncestors(&query_unique,ps)) { + Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); + cand.push_back(new_cand); + push_heap(cand.begin(), cand.end(), HeapCandCompare()); + } + } + } } bool HasAllAncestors(const Candidate* item, UniqueCandidateSet* cs){ - for (int i = 0; i < item->j_.size(); ++i) { - JVector j = item->j_; - --j[i]; - if (j[i] >=0) { - Candidate query_unique(*item->in_edge_, j); - if (cs->count(&query_unique) == 0) { - return false; - } - } - } - return true; + for (int i = 0; i < item->j_.size(); ++i) { + JVector j = item->j_; + --j[i]; + if (j[i] >=0) { + Candidate query_unique(*item->in_edge_, j); + if (cs->count(&query_unique) == 0) { + return false; + } + } + } + return true; } const ModelSet& models; @@ -491,7 +493,7 @@ public: FFStates node_states_; // for each node in the out-HG what is // its q function value? const int pop_limit_; - const int strategy_; //switch Cube Pruning strategy: 1 normal, 2 fast (alg 2), 3 fast_2 (alg 3). (see: Gesmundo A., Henderson J,. Faster Cube Pruning, IWSLT 2010) + const int strategy_; //switch Cube Pruning strategy: 1 normal, 2 fast (alg 2), 3 fast_2 (alg 3). (see: Gesmundo A., Henderson J,. 
Faster Cube Pruning, IWSLT 2010) }; struct NoPruningRescorer { @@ -507,7 +509,7 @@ struct NoPruningRescorer { typedef unordered_map > State2NodeIndex; - void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, State2NodeIndex* state2node) { + void ExpandEdge(const Hypergraph::Edge& in_edge, bool is_goal, size_t head_node_hash, State2NodeIndex* state2node) { const int arity = in_edge.Arity(); Hypergraph::TailNodeVector ends(arity); for (int i = 0; i < arity; ++i) @@ -531,7 +533,9 @@ struct NoPruningRescorer { } int& head_plus1 = (*state2node)[head_state]; if (!head_plus1) { - head_plus1 = out.AddNode(in_edge.rule_->GetLHS())->id_ + 1; + HG::Node* new_node = out.AddNode(in_edge.rule_->GetLHS()); + new_node->node_hash = cdec::HashNode(head_node_hash, head_state); // ID is combination of existing state + residual state + head_plus1 = new_node->id_ + 1; node_states_.push_back(head_state); nodemap[in_edge.head_node_].push_back(head_plus1 - 1); } @@ -553,7 +557,7 @@ struct NoPruningRescorer { const Hypergraph::Node& node = in.nodes_[node_num]; for (int i = 0; i < node.in_edges_.size(); ++i) { const Hypergraph::Edge& edge = in.edges_[node.in_edges_[i]]; - ExpandEdge(edge, is_goal, &state2node); + ExpandEdge(edge, is_goal, node.node_hash, &state2node); } } @@ -605,16 +609,16 @@ void ApplyModelSet(const Hypergraph& in, cerr << " Note: reducing pop_limit to " << pl << " for very large forest\n"; } if (config.algorithm == IntersectionConfiguration::CUBE) { - CubePruningRescorer ma(models, smeta, in, pl, out); - ma.Apply(); + CubePruningRescorer ma(models, smeta, in, pl, out); + ma.Apply(); } else if (config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING){ - CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP); - ma.Apply(); + CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP); + ma.Apply(); } else if (config.algorithm == IntersectionConfiguration::FAST_CUBE_PRUNING_2){ - CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP_2); - ma.Apply(); + CubePruningRescorer ma(models, smeta, in, pl, out, FAST_CP_2); + ma.Apply(); } } else { diff --git a/decoder/bottom_up_parser.cc b/decoder/bottom_up_parser.cc index ff4c7a90..b30f1ec6 100644 --- a/decoder/bottom_up_parser.cc +++ b/decoder/bottom_up_parser.cc @@ -7,6 +7,8 @@ #include #include +#include "node_state_hash.h" +#include "nt_span.h" #include "hg.h" #include "array2d.h" #include "tdict.h" @@ -356,5 +358,13 @@ bool ExhaustiveBottomUpParser::Parse(const Lattice& input, kEPS = TD::Convert("*EPS*"); PassiveChart chart(goal_sym_, grammars_, input, forest); const bool result = chart.Parse(); + + if (result) { + for (auto& node : forest->nodes_) { + Span prev; + const Span s = forest->NodeSpan(node.id_, &prev); + node.node_hash = cdec::HashNode(node.cat_, s.l, s.r, prev.l, prev.r); + } + } return result; } diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 43e2640d..354ea2d9 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -750,6 +750,11 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { return false; } + // this is mainly used for debugging, eventually this will be an assertion + if (!forest.AreNodesUniquelyIdentified()) { + if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; + } + const bool show_tree_structure=conf.count("show_tree_structure"); if (!SILENT) forest_stats(forest," Init. 
forest",show_tree_structure,oracle.show_derivation); if (conf.count("show_expected_length")) { @@ -813,6 +818,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { forest.swap(rescored_forest); forest.Reweight(cur_weights); if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,oracle.show_derivation, conf.count("extract_rules"), extract_file); + // this is mainly used for debugging, eventually this will be an assertion + if (!forest.AreNodesUniquelyIdentified()) { + if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; + } } if (conf.count("show_partition")) { @@ -984,6 +993,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { forest.edges_[i].rule_ = forest.edges_[i].rule_->parent_rule_; } forest.Reweight(last_weights); + // this is mainly used for debugging, eventually this will be an assertion + if (!forest.AreNodesUniquelyIdentified()) { + if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; + } if (!SILENT) forest_stats(forest," Constr. forest",show_tree_structure,oracle.show_derivation); if (!SILENT) cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl; if (conf.count("show_partition")) { diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc index 074de4c9..4253b652 100644 --- a/decoder/fst_translator.cc +++ b/decoder/fst_translator.cc @@ -67,6 +67,12 @@ struct FSTTranslatorImpl { Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); forest->ConnectEdgeToHeadNode(hg_edge, goal); forest->Reweight(weights); + + // since we don't do any pruning, the node_hash will be the same for + // every run of the composer + int nc = 0; + for (auto& node : forest->nodes_) + node.node_hash = ++nc; } if (add_pass_through_rules) fst->ClearPassThroughTranslations(); diff --git a/decoder/hg.cc b/decoder/hg.cc index 7240a8ab..405169c6 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -1,14 +1,17 @@ -//TODO: lazily generate feature vectors for hyperarcs (because some of them will be pruned). this means 1) storing ref to rule for those features 2) providing ff interface for regenerating its feature vector from hyperedge+states and probably 3) still caching feat. vect on hyperedge once it's been generated. ff would normally just contribute its weighted score and result state, not component features. however, the hypergraph drops the state used by ffs after rescoring is done, so recomputation would have to start at the leaves and work bottom up. question: which takes more space, feature id+value, or state? 
- #include "hg.h" #include #include #include -#include #include #include #include +#ifndef HAVE_OLD_CPP +# include +#else +# include +namespace std { using std::tr1::unordered_set; } +#endif #include "viterbi.h" #include "inside_outside.h" @@ -17,28 +20,21 @@ using namespace std; -#if 0 -Hypergraph::Edge const* Hypergraph::ViterbiGoalEdge() const -{ - Edge const* r=0; - for (unsigned i=0,e=edges_.size();iIsGoal() && (!r || e.edge_prob_ > r->edge_prob_)) - r=&e; - } - return r; +bool Hypergraph::AreNodesUniquelyIdentified() const { + unordered_set s(nodes_.size() * 3 + 7); + for (const auto& n : nodes_) + if (!s.insert(n.node_hash).second) + return false; + return true; } -#endif -Hypergraph::Edge const* Hypergraph::ViterbiSortInEdges() -{ +Hypergraph::Edge const* Hypergraph::ViterbiSortInEdges() { NodeProbs nv; ComputeNodeViterbi(&nv); return SortInEdgesByNodeViterbi(nv); } -Hypergraph::Edge const* Hypergraph::SortInEdgesByNodeViterbi(NodeProbs const& nv) -{ +Hypergraph::Edge const* Hypergraph::SortInEdgesByNodeViterbi(NodeProbs const& nv) { EdgeProbs ev; ComputeEdgeViterbi(nv,&ev); return ViterbiSortInEdges(ev); @@ -375,9 +371,7 @@ bool Hypergraph::PruneInsideOutside(double alpha,double density,const EdgeMask* void Hypergraph::PrintGraphviz() const { int ei = 0; cerr << "digraph G {\n rankdir=LR;\n nodesep=.05;\n"; - for (vector::const_iterator i = edges_.begin(); - i != edges_.end(); ++i) { - const Edge& edge=*i; + for (const auto& edge : edges_) { ++ei; static const string none = ""; string rule = (edge.rule_ ? edge.rule_->AsString(false) : none); @@ -399,14 +393,9 @@ void Hypergraph::PrintGraphviz() const { } cerr << " A_" << ei << " -> " << edge.head_node_ << ";\n"; } - for (vector::const_iterator ni = nodes_.begin(); - ni != nodes_.end(); ++ni) { - cerr << " " << ni->id_ << "[label=\"" << (ni->cat_ < 0 ? TD::Convert(ni->cat_ * -1) : "") - //cerr << " " << ni->id_ << "[label=\"" << ni->cat_ - << " n=" << ni->id_ -// << ",x=" << &*ni -// << ",in=" << ni->in_edges_.size() -// << ",out=" << ni->out_edges_.size() + for (const auto& node : nodes_) { + cerr << " " << node.id_ << "[label=\"" << (node.cat_ < 0 ? TD::Convert(node.cat_ * -1) : "") + << " n=" << node.id_ << "\"];\n"; } cerr << "}\n"; diff --git a/decoder/hg.h b/decoder/hg.h index 343b99cf..43fb275b 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -142,13 +142,15 @@ namespace HG { // TODO get rid of cat_? // TODO keep cat_ and add span and/or state? :) struct Node { - Node() : id_(), cat_() {} + Node() : node_hash(), id_(), cat_() {} + size_t node_hash; // hash of all the information that makes this node unique int id_; // equal to this object's position in the nodes_ vector WordID cat_; // non-terminal category if <0, 0 if not set WordID NT() const { return -cat_; } EdgesVector in_edges_; // an in edge is an edge with this node as its head. (in edges come from the bottom up to us) indices in edges_ EdgesVector out_edges_; // an out edge is an edge with this node as its tail. (out edges leave us up toward the top/goal). 
indices in edges_ void copy_fixed(Node const& o) { // nonstructural fields only - structural ones are managed by sorting/pruning/subsetting + node_hash = o.node_hash; cat_=o.cat_; } void copy_reindex(Node const& o,indices_after const& n2,indices_after const& e2) { @@ -192,13 +194,14 @@ public: SetNodeOrigin(nodeid,r); return r; } - Span NodeSpan(int nodeid) const { + Span NodeSpan(int nodeid, Span* prev = nullptr) const { Span s; Node const &n=nodes_[nodeid]; if (!n.in_edges_.empty()) { Edge const& e=edges_[n.in_edges_.front()]; s.l=e.i_; s.r=e.j_; + if (prev) { prev->l = e.prev_i_; prev->r = e.prev_j_; } } return s; } @@ -262,6 +265,9 @@ public: for (int i = 0; i < size; ++i) nodes_[i].id_ = i; } + // if all node states are unique, return true + bool AreNodesUniquelyIdentified() const; + // reserves space in the nodes vector to prevent memory locations // from changing void ReserveNodes(size_t n, size_t e = 0) { diff --git a/decoder/lexalign.cc b/decoder/lexalign.cc index 6adb1892..11f20de7 100644 --- a/decoder/lexalign.cc +++ b/decoder/lexalign.cc @@ -124,6 +124,11 @@ bool LexicalAlign::TranslateImpl(const string& input, pimpl_->BuildTrellis(lattice, *smeta, forest); forest->is_linear_chain_ = true; forest->Reweight(weights); + // since we don't do any pruning, the node_hash will be the same for + // every run of the composer + int nc = 0; + for (auto& node : forest->nodes_) + node.node_hash = ++nc; return true; } diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc index 8c3269bf..74a18c3f 100644 --- a/decoder/lextrans.cc +++ b/decoder/lextrans.cc @@ -280,6 +280,11 @@ bool LexicalTrans::TranslateImpl(const string& input, smeta->SetSourceLength(lattice.size()); if (!pimpl_->BuildTrellis(lattice, *smeta, forest)) return false; forest->Reweight(weights); + // since we don't do any pruning, the node_hash will be the same for + // every run of the composer + int nc = 0; + for (auto& node : forest->nodes_) + node.node_hash = ++nc; return true; } diff --git a/decoder/node_state_hash.h b/decoder/node_state_hash.h new file mode 100644 index 00000000..cdc05877 --- /dev/null +++ b/decoder/node_state_hash.h @@ -0,0 +1,36 @@ +#ifndef _NODE_STATE_HASH_ +#define _NODE_STATE_HASH_ + +#include +#include +#include "murmur_hash3.h" +#include "ffset.h" + +namespace cdec { + + struct FirstPassNode { + FirstPassNode(int cat, int i, int j, int pi, int pj) : lhs(cat), s(i), t(j), u(pi), v(pj) {} + int32_t lhs; + short s; + short t; + short u; + short v; + }; + + inline uint64_t HashNode(int cat, int i, int j, int pi, int pj) { + FirstPassNode fpn(cat, i, j, pi, pj); + return MurmurHash3_64(&fpn, sizeof(FirstPassNode), 2654435769U); + } + + inline uint64_t HashNode(uint64_t old_hash, const FFState& state) { + uint8_t buf[1024]; + std::memcpy(buf, &old_hash, sizeof(uint64_t)); + assert(state.size() < (1024u - sizeof(uint64_t))); + std::memcpy(&buf[sizeof(uint64_t)], state.begin(), state.size()); + return MurmurHash3_64(buf, sizeof(uint64_t) + state.size(), 2654435769U); + } + +} + +#endif + diff --git a/decoder/nt_span.h b/decoder/nt_span.h index a918f301..6ff9391f 100644 --- a/decoder/nt_span.h +++ b/decoder/nt_span.h @@ -7,7 +7,7 @@ struct Span { int l,r; - Span() : l(-1) { } + Span() : l(-1), r(-1) { } bool is_null() const { return l<0; } void print(std::ostream &o,char const* for_null="") const { if (is_null()) diff --git a/decoder/tagger.cc b/decoder/tagger.cc index 63e855c8..30fb055f 100644 --- a/decoder/tagger.cc +++ b/decoder/tagger.cc @@ -108,6 +108,11 @@ bool Tagger::TranslateImpl(const string& 
input, pimpl_->BuildTrellis(sequence, forest); forest->Reweight(weights); forest->is_linear_chain_ = true; + // since we don't do any pruning, the node_hash will be the same for + // every run of the composer + int nc = 0; + for (auto& node : forest->nodes_) + node.node_hash = ++nc; return true; } diff --git a/decoder/tree2string_translator.cc b/decoder/tree2string_translator.cc index f288ab4e..8d12d01d 100644 --- a/decoder/tree2string_translator.cc +++ b/decoder/tree2string_translator.cc @@ -184,13 +184,19 @@ struct Tree2StringTranslatorImpl { // TD::Convert(input_tree.nodes[s.input_node_idx].lhs & cdec::ALL_MASK) << endl; if (s.node->rules.size()) { int& node_id = tree2hg[s.input_node_idx]; - if (node_id < 0) - node_id = hg.AddNode(-(input_tree.nodes[s.input_node_idx].lhs & cdec::ALL_MASK))->id_; + if (node_id < 0) { + HG::Node* new_node = hg.AddNode(-(input_tree.nodes[s.input_node_idx].lhs & cdec::ALL_MASK)); + new_node->node_hash = s.input_node_idx + 1; + node_id = new_node->id_; + } TailNodeVector tail; for (auto n : s.future_work) { int& nix = tree2hg[n]; - if (nix < 0) - nix = hg.AddNode(-(input_tree.nodes[n].lhs & cdec::ALL_MASK))->id_; + if (nix < 0) { + HG::Node* new_node = hg.AddNode(-(input_tree.nodes[n].lhs & cdec::ALL_MASK)); + new_node->node_hash = n + 1; + nix = new_node->id_; + } tail.push_back(nix); } for (auto& r : s.node->rules) { diff --git a/mteval/Makefile.am b/mteval/Makefile.am index 681e798e..08591c9a 100644 --- a/mteval/Makefile.am +++ b/mteval/Makefile.am @@ -1,6 +1,7 @@ bin_PROGRAMS = \ fast_score \ - mbr_kbest + mbr_kbest\ + marginalize noinst_PROGRAMS = \ scorer_test @@ -46,4 +47,7 @@ mbr_kbest_LDADD = libmteval.a ../utils/libutils.a scorer_test_SOURCES = scorer_test.cc scorer_test_LDADD = libmteval.a ../utils/libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) -AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/mteval/test_data\" -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir) -I$(top_srcdir)/utils +marginalize_SOURCES = marginalize.cc +marginalize_LDADD = libmteval.a ../klm/search/libksearch.a ../klm/lm/libklm.a ../klm/util/libklm_util.a ../klm/util/double-conversion/libklm_util_double.a ../utils/libutils.a + +AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/mteval/test_data\" -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir) -I$(top_srcdir)/utils -I$(top_srcdir)/klm diff --git a/tests/tools/filter-stderr.pl b/tests/tools/filter-stderr.pl index 4a762324..54fe9210 100755 --- a/tests/tools/filter-stderr.pl +++ b/tests/tools/filter-stderr.pl @@ -13,6 +13,7 @@ if (/Init.*\s+Viterbi:\s+($REAL)/) { # -LM Viterbi: australia is have diplomatic relations with north korea one of the few countries . print "-lm_trans $1\n"; } +if (/NODES NOT UNIQUELY IDENTIFIED/) { print "NODES_NOT_UNIQUE 1\n"; } #Constr. forest (nodes/edges): 111/305 #Constr. forest (paths): 9899 if (/Constr\. 
forest\s+\(nodes\/edges\): (\d+)\/(\d+)/) { print "constr_nodes $1\nconstr_edges $2\n"; } diff --git a/utils/Makefile.am b/utils/Makefile.am index c0ce3509..341fd80b 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -39,7 +39,8 @@ libutils_a_SOURCES = \ kernel_string_subseq.h \ logval.h \ m.h \ - murmur_hash.h \ + murmur_hash3.h \ + murmur_hash3.cc \ named_enum.h \ null_deleter.h \ null_traits.h \ diff --git a/utils/hash.h b/utils/hash.h index e1426ffb..24d2b6ad 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -3,7 +3,7 @@ #include -#include "murmur_hash.h" +#include "murmur_hash3.h" #ifdef HAVE_CONFIG_H #include "config.h" @@ -44,23 +44,21 @@ const unsigned GOLDEN_MEAN_FRACTION=2654435769U; // assumes C is POD template -struct murmur_hash -{ - typedef MurmurInt result_type; +struct murmur_hash { + typedef size_t result_type; typedef C /*const&*/ argument_type; result_type operator()(argument_type const& c) const { - return MurmurHash((void*)&c,sizeof(c)); + return cdec::MurmurHash3_64((void*)&c, sizeof(c), GOLDEN_MEAN_FRACTION); } }; // murmur_hash_array isn't std guaranteed safe (you need to use string::data()) template <> -struct murmur_hash -{ - typedef MurmurInt result_type; +struct murmur_hash { + typedef size_t result_type; typedef std::string /*const&*/ argument_type; result_type operator()(argument_type const& c) const { - return MurmurHash(c.data(),c.size()); + return cdec::MurmurHash3_64(c.data(), c.size(), GOLDEN_MEAN_FRACTION); } }; @@ -68,10 +66,10 @@ struct murmur_hash template struct murmur_hash_array { - typedef MurmurInt result_type; + typedef size_t result_type; typedef C /*const&*/ argument_type; result_type operator()(argument_type const& c) const { - return MurmurHash(&*c.begin(),c.size()*sizeof(*c.begin())); + return cdec::MurmurHash3_64(&*c.begin(), c.size()*sizeof(*c.begin()), GOLDEN_MEAN_FRACTION); } }; @@ -95,7 +93,6 @@ typename H::mapped_type & get_or_construct(H &ht,K const& k,C0 const& c0) { } } - // get_or_call (0 arg) template typename H::mapped_type & get_or_call(H &ht,K const& k,F const& f) { diff --git a/utils/murmur_hash.h b/utils/murmur_hash.h deleted file mode 100644 index 6063d524..00000000 --- a/utils/murmur_hash.h +++ /dev/null @@ -1,186 +0,0 @@ -#ifndef _MURMUR_HASH_H_ -#define _MURMUR_HASH_H_ - -//NOTE: quite fast, nice collision properties, but endian dependent hash values - -#include "have_64_bits.h" -typedef uintptr_t MurmurInt; - -// MurmurHash2, by Austin Appleby - -static const uint32_t DEFAULT_SEED=2654435769U; - -#if HAVE_64_BITS -//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); - -inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) -{ - const uint64_t m = 0xc6a4a7935bd1e995ULL; - const int r = 47; - - uint64_t h = seed ^ (len * m); - - const uint64_t * data = (const uint64_t *)key; - const uint64_t * end = data + (len/8); - - while(data != end) - { - uint64_t k = *data++; - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - const unsigned char * data2 = (const unsigned char*)data; - - switch(len & 7) - { - case 7: h ^= uint64_t(data2[6]) << 48; - case 6: h ^= uint64_t(data2[5]) << 40; - case 5: h ^= uint64_t(data2[4]) << 32; - case 4: h ^= uint64_t(data2[3]) << 24; - case 3: h ^= uint64_t(data2[2]) << 16; - case 2: h ^= uint64_t(data2[1]) << 8; - case 1: h ^= uint64_t(data2[0]); - h *= m; - }; - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - -inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) -{ - return 
(uint32_t) MurmurHash64(key,len,seed); -} - -inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) -{ - return MurmurHash64(key,len,seed); -} - -#else -// 32-bit - -// Note - This code makes a few assumptions about how your machine behaves - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 -inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const uint32_t m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - uint32_t h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - uint32_t k = *(uint32_t *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { - return MurmurHash32(key,len,seed); -} - -// 64-bit hash for 32-bit platforms - -inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) -{ - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = seed ^ len; - uint32_t h2 = 0; - - const uint32_t * data = (const uint32_t *)key; - - while(len >= 8) - { - uint32_t k1 = *data++; - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; - - uint32_t k2 = *data++; - k2 *= m; k2 ^= k2 >> r; k2 *= m; - h2 *= m; h2 ^= k2; - len -= 4; - } - - if(len >= 4) - { - uint32_t k1 = *data++; - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; - } - - switch(len) - { - case 3: h2 ^= ((unsigned char*)data)[2] << 16; - case 2: h2 ^= ((unsigned char*)data)[1] << 8; - case 1: h2 ^= ((unsigned char*)data)[0]; - h2 *= m; - }; - - h1 ^= h2 >> 18; h1 *= m; - h2 ^= h1 >> 22; h2 *= m; - h1 ^= h2 >> 17; h1 *= m; - h2 ^= h1 >> 19; h2 *= m; - - uint64_t h = h1; - - h = (h << 32) | h2; - - return h; -} - -#endif -//32bit - -#endif diff --git a/utils/murmur_hash3.cc b/utils/murmur_hash3.cc new file mode 100644 index 00000000..68a71d02 --- /dev/null +++ b/utils/murmur_hash3.cc @@ -0,0 +1,340 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
+ +#include "murmur_hash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +namespace cdec { + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= 
k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; 
+ h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + +} // namespace cdec + + diff --git a/utils/murmur_hash3.h b/utils/murmur_hash3.h new file mode 100644 index 00000000..a125d775 --- /dev/null +++ b/utils/murmur_hash3.h @@ -0,0 +1,67 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) && (_MSC_VER < 1600) + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +namespace cdec { + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +namespace { + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wunused-function" + #endif + template inline void cdecMurmurHashNativeBackend(const void * key, int len, uint32_t seed, void * out) { + MurmurHash3_x86_128(key, len, seed, out); + } + template <> inline void cdecMurmurHashNativeBackend<4>(const void * key, int len, uint32_t seed, void * out) { + MurmurHash3_x64_128(key, len, seed, out); + } + #ifdef __clang__ + #pragma clang diagnostic pop + #endif +} // namespace + +inline uint64_t MurmurHash3_64(const void * key, int len, uint32_t seed) { + uint64_t out[2]; + cdecMurmurHashNativeBackend(key, len, seed, &out); + return out[0]; +} + +inline void MurmurHash3_128(const void * key, int len, uint32_t seed, void * out) { + cdecMurmurHashNativeBackend(key, len, seed, out); +} + +} + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ -- cgit v1.2.3 From d033a045aa46ff876ad2c9f6929e2095b2481cdf Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 25 Apr 2014 23:45:32 -0400 Subject: check for non-rescorable hypergraphs --- decoder/decoder.cc | 5 +++++ decoder/hg.cc | 7 +++++++ decoder/hg.h | 4 ++++ tests/system_tests/cfg_rescore/input.cfg | 5 ++--- tests/system_tests/cfg_rescore/input.txt | 2 +- training/utils/grammar_convert.cc | 16 ++++++++++++---- 6 files changed, 31 insertions(+), 8 deletions(-) (limited to 'decoder/decoder.cc') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 354ea2d9..41f36822 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -755,6 +755,11 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { if (!SILENT) cerr << " *** NODES NOT UNIQUELY IDENTIFIED ***\n"; } + if (!forest.ArePreGoalEdgesArity1()) { + cerr << "Pre-goal edges are not arity-1. The decoder requires this.\n"; + abort(); + } + const bool show_tree_structure=conf.count("show_tree_structure"); if (!SILENT) forest_stats(forest," Init. 
forest",show_tree_structure,oracle.show_derivation); if (conf.count("show_expected_length")) { diff --git a/decoder/hg.cc b/decoder/hg.cc index e456fa7c..46543b01 100644 --- a/decoder/hg.cc +++ b/decoder/hg.cc @@ -28,6 +28,13 @@ bool Hypergraph::AreNodesUniquelyIdentified() const { return true; } +bool Hypergraph::ArePreGoalEdgesArity1() const { + auto& n = nodes_.back(); + for (auto eid : n.in_edges_) + if (edges_[eid].Arity() != 1) return false; + return true; +} + Hypergraph::Edge const* Hypergraph::ViterbiSortInEdges() { NodeProbs nv; ComputeNodeViterbi(&nv); diff --git a/decoder/hg.h b/decoder/hg.h index 43fb275b..4ed27d87 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -268,6 +268,10 @@ public: // if all node states are unique, return true bool AreNodesUniquelyIdentified() const; + // the feature function interface assumes that pre-goal edges are + // arity 1 (this simplifies the "final transition" feature computation) + bool ArePreGoalEdgesArity1() const; + // reserves space in the nodes vector to prevent memory locations // from changing void ReserveNodes(size_t n, size_t e = 0) { diff --git a/tests/system_tests/cfg_rescore/input.cfg b/tests/system_tests/cfg_rescore/input.cfg index 0073cb7b..75602c75 100644 --- a/tests/system_tests/cfg_rescore/input.cfg +++ b/tests/system_tests/cfg_rescore/input.cfg @@ -1,9 +1,8 @@ -[S] ||| [S1] -[S1] ||| [NP1] [VP] ||| Active=1 +[S] ||| [NP1] [VP] ||| Active=1 +[S] ||| [NP2] [VPSV] by [NP1] ||| Passive=1 [VP] ||| [V] [NP2] [V] ||| ate [VPSV] ||| was eaten -[S1] ||| [NP2] [VPSV] by [NP1] ||| Passive=1 [NP1] ||| John [NP2] ||| broccoli [NP2] ||| the broccoli ||| Definite=1 diff --git a/tests/system_tests/cfg_rescore/input.txt b/tests/system_tests/cfg_rescore/input.txt index 71fc26bc..2999a5fb 100644 --- a/tests/system_tests/cfg_rescore/input.txt +++ b/tests/system_tests/cfg_rescore/input.txt @@ -1 +1 @@ -{"rules":[1,"[S] ||| [S1] ||| [1]",2,"[S1] ||| [NP1] [VP] ||| [1] [2] ||| Active=1",3,"[VP] ||| [V] [NP2] ||| [1] [2]",4,"[V] ||| ate ||| ate",5,"[VPSV] ||| was eaten ||| was eaten",6,"[S1] ||| [NP2] [VPSV] by [NP1] ||| [1] [2] by [3] ||| Passive=1",7,"[NP1] ||| John ||| John",8,"[NP2] ||| broccoli ||| broccoli",9,"[NP2] ||| the broccoli ||| the broccoli ||| 
Definite=1"],"features":["PhraseModel_0","PhraseModel_1","PhraseModel_2","PhraseModel_3","PhraseModel_4","PhraseModel_5","PhraseModel_6","PhraseModel_7","PhraseModel_8","PhraseModel_9","PhraseModel_10","PhraseModel_11","PhraseModel_12","PhraseModel_13","PhraseModel_14","PhraseModel_15","PhraseModel_16","PhraseModel_17","PhraseModel_18","PhraseModel_19","PhraseModel_20","PhraseModel_21","PhraseModel_22","PhraseModel_23","PhraseModel_24","PhraseModel_25","PhraseModel_26","PhraseModel_27","PhraseModel_28","PhraseModel_29","PhraseModel_30","PhraseModel_31","PhraseModel_32","PhraseModel_33","PhraseModel_34","PhraseModel_35","PhraseModel_36","PhraseModel_37","PhraseModel_38","PhraseModel_39","PhraseModel_40","PhraseModel_41","PhraseModel_42","PhraseModel_43","PhraseModel_44","PhraseModel_45","PhraseModel_46","PhraseModel_47","PhraseModel_48","PhraseModel_49","PhraseModel_50","PhraseModel_51","PhraseModel_52","PhraseModel_53","PhraseModel_54","PhraseModel_55","PhraseModel_56","PhraseModel_57","PhraseModel_58","PhraseModel_59","PhraseModel_60","PhraseModel_61","PhraseModel_62","PhraseModel_63","PhraseModel_64","PhraseModel_65","PhraseModel_66","PhraseModel_67","PhraseModel_68","PhraseModel_69","PhraseModel_70","PhraseModel_71","PhraseModel_72","PhraseModel_73","PhraseModel_74","PhraseModel_75","PhraseModel_76","PhraseModel_77","PhraseModel_78","PhraseModel_79","PhraseModel_80","PhraseModel_81","PhraseModel_82","PhraseModel_83","PhraseModel_84","PhraseModel_85","PhraseModel_86","PhraseModel_87","PhraseModel_88","PhraseModel_89","PhraseModel_90","PhraseModel_91","PhraseModel_92","PhraseModel_93","PhraseModel_94","PhraseModel_95","PhraseModel_96","PhraseModel_97","PhraseModel_98","PhraseModel_99","Active","Passive","Definite"],"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":7}],"node":{"in_edges":[0],"cat":"NP1","node_hash":"0000000000000007"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":4}],"node":{"in_edges":[1],"cat":"V","node_hash":"0000000000000004"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":8},{"tail":[],"spans":[-1,-1,-1,-1],"feats":[102,1],"rule":9}],"node":{"in_edges":[2,3],"cat":"NP2","node_hash":"0000000000000009"},"edges":[{"tail":[1,2],"spans":[-1,-1,-1,-1],"feats":[],"rule":3}],"node":{"in_edges":[4],"cat":"VP","node_hash":"0000000000000003"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":5}],"node":{"in_edges":[5],"cat":"VPSV","node_hash":"0000000000000005"},"edges":[{"tail":[0,3],"spans":[-1,-1,-1,-1],"feats":[100,1],"rule":2},{"tail":[2,4,0],"spans":[-1,-1,-1,-1],"feats":[101,1],"rule":6}],"node":{"in_edges":[6,7],"cat":"S1","node_hash":"0000000000000006"},"edges":[{"tail":[5],"spans":[-1,-1,-1,-1],"feats":[],"rule":1}],"node":{"in_edges":[8],"cat":"S","node_hash":"0000000000000001"}} +{"rules":[1,"[S] ||| [NP1] [VP] ||| [1] [2] ||| Active=1",2,"[S] ||| [NP2] [VPSV] by [NP1] ||| [1] [2] by [3] ||| Passive=1",3,"[VP] ||| [V] [NP2] ||| [1] [2]",4,"[V] ||| ate ||| ate",5,"[VPSV] ||| was eaten ||| was eaten",6,"[NP1] ||| John ||| John",7,"[NP2] ||| broccoli ||| broccoli",8,"[NP2] ||| the broccoli ||| the broccoli ||| Definite=1",9,"[Goal] ||| [X] ||| 
[1]"],"features":["PhraseModel_0","PhraseModel_1","PhraseModel_2","PhraseModel_3","PhraseModel_4","PhraseModel_5","PhraseModel_6","PhraseModel_7","PhraseModel_8","PhraseModel_9","PhraseModel_10","PhraseModel_11","PhraseModel_12","PhraseModel_13","PhraseModel_14","PhraseModel_15","PhraseModel_16","PhraseModel_17","PhraseModel_18","PhraseModel_19","PhraseModel_20","PhraseModel_21","PhraseModel_22","PhraseModel_23","PhraseModel_24","PhraseModel_25","PhraseModel_26","PhraseModel_27","PhraseModel_28","PhraseModel_29","PhraseModel_30","PhraseModel_31","PhraseModel_32","PhraseModel_33","PhraseModel_34","PhraseModel_35","PhraseModel_36","PhraseModel_37","PhraseModel_38","PhraseModel_39","PhraseModel_40","PhraseModel_41","PhraseModel_42","PhraseModel_43","PhraseModel_44","PhraseModel_45","PhraseModel_46","PhraseModel_47","PhraseModel_48","PhraseModel_49","PhraseModel_50","PhraseModel_51","PhraseModel_52","PhraseModel_53","PhraseModel_54","PhraseModel_55","PhraseModel_56","PhraseModel_57","PhraseModel_58","PhraseModel_59","PhraseModel_60","PhraseModel_61","PhraseModel_62","PhraseModel_63","PhraseModel_64","PhraseModel_65","PhraseModel_66","PhraseModel_67","PhraseModel_68","PhraseModel_69","PhraseModel_70","PhraseModel_71","PhraseModel_72","PhraseModel_73","PhraseModel_74","PhraseModel_75","PhraseModel_76","PhraseModel_77","PhraseModel_78","PhraseModel_79","PhraseModel_80","PhraseModel_81","PhraseModel_82","PhraseModel_83","PhraseModel_84","PhraseModel_85","PhraseModel_86","PhraseModel_87","PhraseModel_88","PhraseModel_89","PhraseModel_90","PhraseModel_91","PhraseModel_92","PhraseModel_93","PhraseModel_94","PhraseModel_95","PhraseModel_96","PhraseModel_97","PhraseModel_98","PhraseModel_99","Active","Passive","Definite"],"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":6}],"node":{"in_edges":[0],"cat":"NP1","node_hash":"0000000000000006"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":4}],"node":{"in_edges":[1],"cat":"V","node_hash":"0000000000000004"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":7},{"tail":[],"spans":[-1,-1,-1,-1],"feats":[102,1],"rule":8}],"node":{"in_edges":[2,3],"cat":"NP2","node_hash":"0000000000000008"},"edges":[{"tail":[1,2],"spans":[-1,-1,-1,-1],"feats":[],"rule":3}],"node":{"in_edges":[4],"cat":"VP","node_hash":"0000000000000003"},"edges":[{"tail":[],"spans":[-1,-1,-1,-1],"feats":[],"rule":5}],"node":{"in_edges":[5],"cat":"VPSV","node_hash":"0000000000000005"},"edges":[{"tail":[0,3],"spans":[-1,-1,-1,-1],"feats":[100,1],"rule":1},{"tail":[2,4,0],"spans":[-1,-1,-1,-1],"feats":[101,1],"rule":2}],"node":{"in_edges":[6,7],"cat":"S","node_hash":"0000000000000002"},"edges":[{"tail":[5],"spans":[-1,-1,-1,-1],"feats":[],"rule":9}],"node":{"in_edges":[8],"cat":"Goal","node_hash":"000000000000003D"}} diff --git a/training/utils/grammar_convert.cc b/training/utils/grammar_convert.cc index 58d1957c..5c1b4d4a 100644 --- a/training/utils/grammar_convert.cc +++ b/training/utils/grammar_convert.cc @@ -56,15 +56,22 @@ int GetOrCreateNode(const WordID& lhs, map* lhs2node, Hypergraph* h return node_id - 1; } +void AddDummyGoalNode(Hypergraph* hg) { + static const int kGOAL = -TD::Convert("Goal"); + static TRulePtr kGOAL_RULE(new TRule("[Goal] ||| [X] ||| [1]")); + unsigned old_goal_node_idx = hg->nodes_.size() - 1; + HG::Node* goal_node = hg->AddNode(kGOAL); + goal_node->node_hash = goal_node->id_ * 10 + 1; + TailNodeVector tail(1, old_goal_node_idx); + HG::Edge* new_edge = hg->AddEdge(kGOAL_RULE, tail); + hg->ConnectEdgeToHeadNode(new_edge, goal_node); +} 
+ void FilterAndCheckCorrectness(int goal, Hypergraph* hg) { if (goal < 0) { cerr << "Error! [S] not found in grammar!\n"; exit(1); } - if (hg->nodes_[goal].in_edges_.size() != 1) { - cerr << "Error! [S] has more than one rewrite!\n"; - exit(1); - } int old_size = hg->nodes_.size(); hg->TopologicallySortNodesAndEdges(goal); if (hg->nodes_.size() != old_size) { @@ -319,6 +326,7 @@ int main(int argc, char **argv) { if (line.empty()) { int goal = lhs2node[kSTART] - 1; FilterAndCheckCorrectness(goal, &hg); + AddDummyGoalNode(&hg); ProcessHypergraph(w, conf, "", &hg); hg.clear(); lhs2node.clear(); -- cgit v1.2.3 From bb3f703d572e9f4a4b971bfa2483e0caf060587d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 17 May 2014 17:46:20 -0400 Subject: stub for t2t translator --- decoder/decoder.cc | 8 +++++--- decoder/translator.h | 3 ++- decoder/tree2string_translator.cc | 12 +++++++----- 3 files changed, 14 insertions(+), 9 deletions(-) (limited to 'decoder/decoder.cc') diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 41f36822..6783cad0 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -490,8 +490,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream } formalism = LowercaseString(str("formalism",conf)); - if (formalism != "t2s" && formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign" && formalism != "rescore") { - cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 't2s', 'csplit', 'lextrans', 'lexalign', 'rescore', or 'tagger'\n"; + if (formalism != "t2s" && formalism != "t2t" && formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign" && formalism != "rescore") { + cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 't2s', 't2t', 'csplit', 'lextrans', 'lexalign', 'rescore', or 'tagger'\n"; cerr << dcmdline_options << endl; exit(1); } @@ -627,7 +627,9 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream if (formalism == "scfg") translator.reset(new SCFGTranslator(conf)); else if (formalism == "t2s") - translator.reset(new Tree2StringTranslator(conf)); + translator.reset(new Tree2StringTranslator(conf, false)); + else if (formalism == "t2t") + translator.reset(new Tree2StringTranslator(conf, true)); else if (formalism == "fst") translator.reset(new FSTTranslator(conf)); else if (formalism == "pb") diff --git a/decoder/translator.h b/decoder/translator.h index 72b2f0b0..ba218a0b 100644 --- a/decoder/translator.h +++ b/decoder/translator.h @@ -101,7 +101,8 @@ class RescoreTranslator : public Translator { class Tree2StringTranslatorImpl; class Tree2StringTranslator : public Translator { public: - Tree2StringTranslator(const boost::program_options::variables_map& conf); + Tree2StringTranslator(const boost::program_options::variables_map& conf, + bool has_multiple_states); virtual std::string GetDecoderType() const; protected: bool TranslateImpl(const std::string& src, diff --git a/decoder/tree2string_translator.cc b/decoder/tree2string_translator.cc index 5d7aa5e2..101ed21c 100644 --- a/decoder/tree2string_translator.cc +++ b/decoder/tree2string_translator.cc @@ -23,7 +23,7 @@ struct Tree2StringGrammarNode { // this needs to be rewritten so it is fast and checks errors well // use a lexer probably -void ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root) { +void 
ReadTree2StringGrammar(istream* in, Tree2StringGrammarNode* root, bool has_multiple_states) { string line; while(getline(*in, line)) { size_t pos = line.find("|||"); @@ -143,7 +143,8 @@ struct Tree2StringTranslatorImpl { vector> root; bool add_pass_through_rules; unsigned remove_grammars; - Tree2StringTranslatorImpl(const boost::program_options::variables_map& conf) : + Tree2StringTranslatorImpl(const boost::program_options::variables_map& conf, + bool has_multiple_states) : add_pass_through_rules(conf.count("add_pass_through_rules")) { if (conf.count("grammar")) { const vector gf = conf["grammar"].as>(); @@ -152,7 +153,7 @@ struct Tree2StringTranslatorImpl { for (auto& f : gf) { ReadFile rf(f); root[gc].reset(new Tree2StringGrammarNode); - ReadTree2StringGrammar(rf.stream(), &*root[gc++]); + ReadTree2StringGrammar(rf.stream(), &*root[gc++], has_multiple_states); } } } @@ -357,8 +358,9 @@ struct Tree2StringTranslatorImpl { } }; -Tree2StringTranslator::Tree2StringTranslator(const boost::program_options::variables_map& conf) : - pimpl_(new Tree2StringTranslatorImpl(conf)) {} +Tree2StringTranslator::Tree2StringTranslator(const boost::program_options::variables_map& conf, + bool has_multiple_states) : + pimpl_(new Tree2StringTranslatorImpl(conf, has_multiple_states)) {} bool Tree2StringTranslator::TranslateImpl(const string& input, SentenceMetadata* smeta, -- cgit v1.2.3
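
A note on the node-identity scheme introduced in "track node state for smarter union": a node's hash is built in two stages, first from its category and spans when the forest is composed (bottom_up_parser.cc), then re-mixed with the residual feature-function state each time the forest is rescored (apply_models.cc). The sketch below restates decoder/node_state_hash.h and the hg.cc uniqueness check in self-contained form; it assumes only the cdec::MurmurHash3_64 entry point from utils/murmur_hash3.h, and a plain byte vector stands in for the real FFState type from ffset.h.

// Self-contained sketch of the two-stage node hash; a byte vector is a
// stand-in for FFState, everything else mirrors node_state_hash.h / hg.cc.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <unordered_set>
#include <vector>
#include "murmur_hash3.h"

// First pass: a node is identified by category plus target and source spans.
struct FirstPassNode {
  int32_t lhs;
  short s, t, u, v;
};

uint64_t HashNode(int cat, int i, int j, int pi, int pj) {
  FirstPassNode fpn = {cat, (short)i, (short)j, (short)pi, (short)pj};
  return cdec::MurmurHash3_64(&fpn, sizeof(FirstPassNode), 2654435769U);
}

// Rescoring pass: the new node is the old node plus residual feature state,
// so the hash of the old node is mixed with the state's bytes.
uint64_t HashNode(uint64_t old_hash, const std::vector<uint8_t>& state) {
  uint8_t buf[1024];
  assert(state.size() < 1024u - sizeof(uint64_t));
  std::memcpy(buf, &old_hash, sizeof(uint64_t));
  std::memcpy(&buf[sizeof(uint64_t)], state.data(), state.size());
  return cdec::MurmurHash3_64(buf, sizeof(uint64_t) + state.size(), 2654435769U);
}

// The check behind Hypergraph::AreNodesUniquelyIdentified(): any rejected
// insert means two nodes share an identity and a union/rescore is unsafe.
bool AllUnique(const std::vector<uint64_t>& hashes) {
  std::unordered_set<uint64_t> seen(hashes.size() * 3 + 7);
  for (uint64_t h : hashes)
    if (!seen.insert(h).second) return false;
  return true;
}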
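
The new header's convenience wrappers can be smoke-tested directly. Below is a hypothetical test program, assuming murmur_hash3.h is on the include path; the seed is the GOLDEN_MEAN_FRACTION constant that utils/hash.h now passes everywhere, and the resulting values are platform-dependent because the backend (the x86 vs. x64 128-bit variant) is selected by pointer width.

// Hypothetical smoke test for the MurmurHash3_64/MurmurHash3_128 wrappers.
// Output differs between 32- and 64-bit builds by design of the dispatch.
#include <cstdio>
#include <cstring>
#include "murmur_hash3.h"

int main() {
  const char* key = "broccoli";
  const uint32_t seed = 2654435769U;  // GOLDEN_MEAN_FRACTION from utils/hash.h
  const int len = (int)std::strlen(key);
  uint64_t h64 = cdec::MurmurHash3_64(key, len, seed);
  uint64_t h128[2];
  cdec::MurmurHash3_128(key, len, seed, h128);
  std::printf("64:  %016llx\n128: %016llx %016llx\n",
              (unsigned long long)h64,
              (unsigned long long)h128[0],
              (unsigned long long)h128[1]);
  return 0;
}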
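
"check for non-rescorable hypergraphs" adds a second invariant alongside unique node hashes: every in-edge of the goal node must have arity 1, because the feature-function interface computes its final-transition features over a single antecedent. That is also why grammar_convert stops insisting that [S] have exactly one rewrite and instead caps every forest with a dummy "[Goal] ||| [X] ||| [1]" node. A reduced sketch of the check, with simplified stand-ins for the HG::Node and HG::Edge types:

// Reduced sketch of Hypergraph::ArePreGoalEdgesArity1(); Node/Edge keep only
// the fields the check touches, and the goal node is assumed to be last.
#include <vector>

struct Edge {
  std::vector<int> tail_nodes_;
  int Arity() const { return (int)tail_nodes_.size(); }
};

struct Node {
  std::vector<int> in_edges_;  // indices into the edge vector
};

bool ArePreGoalEdgesArity1(const std::vector<Node>& nodes,
                           const std::vector<Edge>& edges) {
  const Node& goal = nodes.back();
  for (int eid : goal.in_edges_)
    if (edges[eid].Arity() != 1) return false;
  return true;
}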