From 6d3cf2f3aeaa5d008f5031f70da8d728181486bc Mon Sep 17 00:00:00 2001 From: "graehl@gmail.com" Date: Sun, 15 Aug 2010 07:39:01 +0000 Subject: really fixed binarization. test git-svn-id: https://ws10smt.googlecode.com/svn/trunk@555 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/cdec.cc | 2 +- decoder/cfg.cc | 70 +++++++++++++++++++++++++++---------------- decoder/cfg.h | 26 +++++++++++----- decoder/cfg_binarize.h | 5 ++-- decoder/cfg_format.h | 10 +++---- decoder/cfg_options.h | 38 +++++++++++++++--------- decoder/cfg_test.cc | 80 +++++++++++++++++++++++++++++++++++--------------- decoder/hg_cfg.h | 5 +++- decoder/hg_test.h | 16 ++++++---- 9 files changed, 166 insertions(+), 86 deletions(-) diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 5898b245..0a02801e 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -670,7 +670,7 @@ int main(int argc, char** argv) { maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen); HgCFG hgcfg(forest); - cfg_options.maybe_output(hgcfg); + cfg_options.prepare(hgcfg); if (!fsa_ffs.empty()) { Timer t("Target FSA rescoring:"); if (!has_late_models) diff --git a/decoder/cfg.cc b/decoder/cfg.cc index c0598f16..be07c2c5 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -8,6 +8,12 @@ #include "fast_lexical_cast.hpp" //#include "indices_after.h" +#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr< > BestRHS; // faster to use trie? maybe. BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position). HASH_MAP_EMPTY(bestp,null_rhs); @@ -129,6 +135,7 @@ void CFG::UniqRules(NTHandle ni) { } // post: oi = number of new adj adj.resize(oi); + return oi; } void CFG::SortLocalBestFirst(NTHandle ni) { @@ -145,10 +152,12 @@ namespace { CFG::BinRhs null_bin_rhs(std::numeric_limits::min(),std::numeric_limits::min()); // index i >= N.size()? then it's in M[i-N.size()] -WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +//WordID first,WordID second, +string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) { int nn=N.size(); ostringstream o; +#undef BinNameOWORD #define BinNameOWORD(w) \ do { \ int n=w; if (n>0) o << TD::Convert(n); \ @@ -161,8 +170,12 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) BinNameOWORD(b.first); o<<'+'; BinNameOWORD(b.second); -#undef BinNameOWORD - return TD::Convert(o.str()); + return o.str(); +} + +WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +{ + return TD::Convert(BinStr(b,N,M)); } }//ns @@ -177,33 +190,44 @@ void CFG::Binarize(CFGBinarize const& b) { cerr << "Binarizing "< > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left. HASH_MAP_EMPTY(bin2lhs,null_bin_rhs); - int rhsmin=b.bin_unary?0:1; // iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones. + int rhsmin=b.bin_unary?0:1; NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation Rules new_rules; //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) - int newnt=-nts.size(); + int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0) int newruleid=rules.size(); BinRhs bin; for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { NT const& nt=*n; for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { + CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n"); RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_... if (rhs.empty()) continue; - bin.second=rhs.back(); - for (int r=rhs.size()-2;r>=rhsmin;--r) { // pairs from right to left (normally we leave the last pair alone) - bin.first=rhs[r]; - if (newnt==(bin.second=(get_default(bin2lhs,bin,newnt)))) { - new_nts.push_back(NT(newruleid)); - new_rules.push_back(Rule(-newnt,bin)); - ++newruleid; - if (b.bin_name_nts) - new_nts.back().from.nt=BinName(bin,nts,new_nts); - --newnt; + int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT + if (rhsmin<=r) { // means r>=0 also + bin.second=rhs[r+1]; + int bin_to; // the replacement for bin + assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); + // also true at start/end of loop: + for (;;) { // pairs from right to left (normally we leave the last pair alone) + + bin.first=rhs[r]; + bin_to=get_default(bin2lhs,bin,newnt); + CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<") CFGSHOW(bin_to); + if (newnt==bin_to) { // it's new! + new_nts.push_back(NT(newruleid++)); + //now -newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs + new_rules.push_back(Rule(-newnt,bin)); + --newnt; + if (b.bin_name_nts) + new_nts.back().from.nt=BinName(bin,nts,new_nts); + } + bin.second=bin_to; + --r; + if (r void visit_rhs_nts(V &v) const { @@ -171,9 +181,11 @@ struct CFG { bool Empty() const { return nts.empty(); } void UnindexRules(); // save some space? void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules) - void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids. - inline void UniqRules() { - for (int i=0,e=nts.size();i!=e;++i) UniqRules(i); + int UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids. returns number of rules kept + inline int UniqRules() { + int nkept=0; + for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i); + return nkept; } void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest. when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this. stable in case of ties diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index c5303622..82c4dd1a 100755 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h @@ -18,7 +18,6 @@ struct CFGBinarize { bool bin_l2r; bool bin_unary; bool bin_name_nts; - bool bin_uniq; bool bin_topo; template // template to support both printable_opts and boost nonprintable void AddOptions(Opts *opts) { @@ -27,7 +26,6 @@ struct CFGBinarize { ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.") ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)") ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'") - ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob") ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order). otherwise the virtual NTs will all appear after the regular NTs") ; } @@ -45,7 +43,6 @@ struct CFGBinarize { } void set_defaults() { bin_topo=false; - bin_uniq=true; bin_at=0; bin_unary=false; bin_name_nts=true; @@ -65,6 +62,8 @@ struct CFGBinarize { o << "greedy count>="<123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph") ("nt_span",defaulted_value(&nt_span),"prefix A(i,j) for NT coming from hypergraph node with category A on span [i,j). this is after --nt_prefix if any") @@ -44,7 +44,7 @@ struct CFGFormat { o< // template to support both printable_opts and boost nonprintable void AddOptions(Opts *opts) { @@ -22,6 +25,8 @@ struct CFGOptions { ("cfg_output", defaulted_value(&out),"write final target CFG (before FSA rescoring) to this file") ("source_cfg_output", defaulted_value(&source_out),"write source CFG (after prelm-scoring, prelm-prune) to this file") ("cfg_unbin_output", defaulted_value(&unbin_out),"write pre-binarization CFG to this file") //TODO: + ("cfg_uniq", defaulted_value(&uniq),"in case of duplicate rules, keep only the one with highest prob") + ; binarize.AddOptions(opts); format.AddOptions(opts); @@ -29,10 +34,6 @@ struct CFGOptions { void Validate() { format.Validate(); binarize.Validate(); -// if (out.empty()) binarize.bin_name_nts=false; - } - char const* description() const { - return "CFG output options"; } void maybe_output_source(Hypergraph const& hg) { if (source_out.empty()) return; @@ -41,24 +42,33 @@ struct CFGOptions { CFG cfg(hg,false,format.features,format.goal_nt()); cfg.Print(o.get(),format); } - void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") { - WriteFile o(cfg_output); - std::cerr<<"Printing target"< #include #include "cfg.h" #include "hg_test.h" #include "cfg_options.h" -#define CSHOW_V 1 +/* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg. or you can define CSHOW_V and see lots of output */ + +using namespace boost; + +#define CSHOW_V 0 + #if CSHOW_V -# define CSHOWDO(x) x +# define CSHOWDO(x) x; #else # define CSHOWDO(x) #endif #define CSHOW(x) CSHOWDO(cerr<<#x<<'='< HgW; // hg file,weights + +struct CFGTest : public TestWithParam { + string hgfile; + Hypergraph hg; + CFG cfg; + CFGFormat form; + FeatureVector weights; + + static void JsonFN(Hypergraph &hg,CFG &cfg,FeatureVector &featw,std::string file ,std::string const& wts="Model_0 1 EgivenF 1 f1 1") { - FeatureVector featw; istringstream ws(wts); EXPECT_TRUE(ws>>featw); CSHOW(featw) @@ -25,35 +36,56 @@ struct CFGTest : public HGSetup { hg.Reweight(featw); cfg.Init(hg,true,true,false); } - static void SetUpTestCase() { } static void TearDownTestCase() { } + CFGTest() { + hgfile=GetParam().first; + JsonFN(hg,cfg,weights,hgfile,GetParam().second); + CSHOWDO(cerr<<"\nCFG Test: ") + CSHOW(hgfile); + form.nt_span=true; + form.comma_nt=false; + } + ~CFGTest() { } }; -TEST_F(CFGTest,Binarize) { - Hypergraph hg; - CFG cfg; - JsonFN(hg,cfg,perro_json,perro_wts); - CSHOW("\nCFG Test.\n"); +TEST_P(CFGTest,Binarize) { CFGBinarize b; - CFGFormat form; - form.nt_span=true; - for (int i=-1;i<16;++i) { - b.bin_l2r=i>=0; - b.bin_unary=i&1; - b.bin_name_nts=i&2; - b.bin_uniq=i&4; - b.bin_topo=i&8; - CFG cc=cfg; - EXPECT_EQ(cc,cfg); - CSHOW("\nBinarizing: "<=0) { + int f=i<<1; + b.bin_l2r=1; + b.bin_unary=(f>>=1)&1; + b.bin_topo=(f>>=1)&1; + uniq=(f>>=1)&1; + } else + b.bin_l2r=0; + CFG cc=uniq?cfgu:cfg; + CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "< using namespace std; - +using namespace testing; #pragma GCC diagnostic ignored "-Wunused-variable" namespace { -char const* small_json="small.json.gz"; +typedef char const* Name; -char const* perro_json="perro.json.gz"; -char const* perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1"; +Name urdu_json="urdu.json.gz"; +Name urdu_wts="Arity_0 1.70741473606976 Arity_1 1.12426238048012 Arity_2 1.14986187839554 Glue -0.04589037041388 LanguageModel 1.09051 PassThrough -3.66226367902928 PhraseModel_0 -1.94633451863252 PhraseModel_1 -0.1475347695476 PhraseModel_2 -1.614818994946 WordPenalty -3.0 WordPenaltyFsa -0.56028442964748 ShorterThanPrev -10 LongerThanPrev -10"; +Name small_json="small.json.gz"; +Name small_wts="Model_0 -2 Model_1 -.5 Model_2 -1.1 Model_3 -1 Model_4 -1 Model_5 .5 Model_6 .2 Model_7 -.3"; +Name perro_json="perro.json.gz"; +Name perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1"; } // you can inherit from this or just use the static methods -struct HGSetup : public testing::Test { +struct HGSetup : public Test { enum { HG, HG_int, @@ -52,7 +56,7 @@ struct HGSetup : public testing::Test { }; namespace { -char const* HGjsons[]= { +Name HGjsons[]= { "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}", "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}", "{\"rules\":[1,\"[X] ||| \",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}", -- cgit v1.2.3