diff options
author | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-15 07:39:01 +0000 |
---|---|---|
committer | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-15 07:39:01 +0000 |
commit | 80c952989dfbc05b482a7a8265d0ca73079ee894 (patch) | |
tree | e1306f8c49006d34ab6418d8678c051e04358ef9 | |
parent | 214c741eee9b01ccd05a1825ad9ed647adef41db (diff) |
really fixed binarization. test
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@555 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/cdec.cc | 2 | ||||
-rwxr-xr-x | decoder/cfg.cc | 70 | ||||
-rwxr-xr-x | decoder/cfg.h | 26 | ||||
-rwxr-xr-x | decoder/cfg_binarize.h | 5 | ||||
-rwxr-xr-x | decoder/cfg_format.h | 10 | ||||
-rwxr-xr-x | decoder/cfg_options.h | 38 | ||||
-rwxr-xr-x | decoder/cfg_test.cc | 80 | ||||
-rwxr-xr-x | decoder/hg_cfg.h | 5 | ||||
-rwxr-xr-x | decoder/hg_test.h | 16 |
9 files changed, 166 insertions, 86 deletions
diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 5898b245..0a02801e 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -670,7 +670,7 @@ int main(int argc, char** argv) { maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen); HgCFG hgcfg(forest); - cfg_options.maybe_output(hgcfg); + cfg_options.prepare(hgcfg); if (!fsa_ffs.empty()) { Timer t("Target FSA rescoring:"); if (!has_late_models) diff --git a/decoder/cfg.cc b/decoder/cfg.cc index c0598f16..be07c2c5 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -8,6 +8,12 @@ #include "fast_lexical_cast.hpp" //#include "indices_after.h" +#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x) +#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s) +#define CFGSHOW(x) CFGSHOWC(x,"\n") +#define CFGSHOWS(x) CFGSHOWC(x," ") +#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y) + using namespace std; typedef CFG::Rule Rule; @@ -108,7 +114,7 @@ struct prob_pos { }; }//ns -void CFG::UniqRules(NTHandle ni) { +int CFG::UniqRules(NTHandle ni) { typedef HASH_MAP<RHS,prob_pos,boost::hash<RHS> > BestRHS; // faster to use trie? maybe. BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position). HASH_MAP_EMPTY(bestp,null_rhs); @@ -129,6 +135,7 @@ void CFG::UniqRules(NTHandle ni) { } // post: oi = number of new adj adj.resize(oi); + return oi; } void CFG::SortLocalBestFirst(NTHandle ni) { @@ -145,10 +152,12 @@ namespace { CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min()); // index i >= N.size()? then it's in M[i-N.size()] -WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +//WordID first,WordID second, +string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) { int nn=N.size(); ostringstream o; +#undef BinNameOWORD #define BinNameOWORD(w) \ do { \ int n=w; if (n>0) o << TD::Convert(n); \ @@ -161,8 +170,12 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) BinNameOWORD(b.first); o<<'+'; BinNameOWORD(b.second); -#undef BinNameOWORD - return TD::Convert(o.str()); + return o.str(); +} + +WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +{ + return TD::Convert(BinStr(b,N,M)); } }//ns @@ -177,33 +190,44 @@ void CFG::Binarize(CFGBinarize const& b) { cerr << "Binarizing "<<b<<endl; HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left. HASH_MAP_EMPTY(bin2lhs,null_bin_rhs); - int rhsmin=b.bin_unary?0:1; // iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones. + int rhsmin=b.bin_unary?0:1; NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation Rules new_rules; //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) - int newnt=-nts.size(); + int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0) int newruleid=rules.size(); BinRhs bin; for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) { NT const& nt=*n; for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { + CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n"); RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_... if (rhs.empty()) continue; - bin.second=rhs.back(); - for (int r=rhs.size()-2;r>=rhsmin;--r) { // pairs from right to left (normally we leave the last pair alone) - bin.first=rhs[r]; - if (newnt==(bin.second=(get_default(bin2lhs,bin,newnt)))) { - new_nts.push_back(NT(newruleid)); - new_rules.push_back(Rule(-newnt,bin)); - ++newruleid; - if (b.bin_name_nts) - new_nts.back().from.nt=BinName(bin,nts,new_nts); - --newnt; + int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT + if (rhsmin<=r) { // means r>=0 also + bin.second=rhs[r+1]; + int bin_to; // the replacement for bin + assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); + // also true at start/end of loop: + for (;;) { // pairs from right to left (normally we leave the last pair alone) + + bin.first=rhs[r]; + bin_to=get_default(bin2lhs,bin,newnt); + CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to); + if (newnt==bin_to) { // it's new! + new_nts.push_back(NT(newruleid++)); + //now -newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs + new_rules.push_back(Rule(-newnt,bin)); + --newnt; + if (b.bin_name_nts) + new_nts.back().from.nt=BinName(bin,nts,new_nts); + } + bin.second=bin_to; + --r; + if (r<rhsmin) break; } - } - if (rhsmin<rhs.size()) { - rhs[rhsmin]=bin.second; + rhs[rhsmin]=bin_to; rhs.resize(rhsmin+1); } } @@ -246,9 +270,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus prob_t &crp=cfgr.p; crp=e.edge_prob_; cfgr.lhs=e.head_node_; -#if CFG_DEBUG - cfgr.rule=e.rule_; -#endif + IF_CFG_TRULE(cfgr.rule=e.rule_;) if (copy_features) cfgr.f=e.feature_values_; if (push_weights) crp /=np[e.head_node_]; TRule const& er=*e.rule_; @@ -287,9 +309,7 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const { f.print_lhs(o,*this,r.lhs); f.print_rhs(o,*this,r.rhs.begin(),r.rhs.end()); f.print_features(o,r.p,r.f); -#if CFG_DEBUG - if (r.rule) o<<f.partsep<<*r.rule; -#endif + IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;) } void CFG::Print(std::ostream &o,CFGFormat const& f) const { diff --git a/decoder/cfg.h b/decoder/cfg.h index b6dd6d99..c07a6901 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -5,12 +5,22 @@ #ifndef CFG_DEBUG # define CFG_DEBUG 0 #endif +#ifndef CFG_KEEP_TRULE +# define CFG_KEEP_TRULE 0 +#endif + #if CFG_DEBUG -# define IF_CFG_DEBUG(x) x +# define IF_CFG_DEBUG(x) x; #else # define IF_CFG_DEBUG(x) #endif +#if CFG_KEEP_TRULE +# define IF_CFG_TRULE(x) x; +#else +# define IF_CFG_TRULE(x) +#endif + /* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg. this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector). but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph. so hg pointer may be null? multiple types of CFG? always copy the feature vector? especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg question: how much does making a copy (essentially) of hg simplify things? is the space used worth it? is the node in/out edges index really that much of a waste? is the use of indices that annoying? @@ -76,7 +86,7 @@ struct CFG { // for binarizing - no costs/probs Rule() : lhs(-1) { } bool is_null() const { return lhs<0; } - void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_DEBUG(rule.reset();) } + void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_TRULE(rule.reset();) } Rule(int lhs,BinRhs const& binrhs) : lhs(lhs),rhs(2),p(1) { rhs[0]=binrhs.first; @@ -87,14 +97,14 @@ struct CFG { RHS rhs; prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top) FeatureVector f; // may be empty, unless copy_features on Init - IF_CFG_DEBUG(TRulePtr rule;) + IF_CFG_TRULE(TRulePtr rule;) void Swap(Rule &o) { using namespace std; swap(lhs,o.lhs); swap(rhs,o.rhs); swap(p,o.p); swap(f,o.f); - IF_CFG_DEBUG(swap(rule,o.rule);) + IF_CFG_TRULE(swap(rule,o.rule);) } template<class V> void visit_rhs_nts(V &v) const { @@ -171,9 +181,11 @@ struct CFG { bool Empty() const { return nts.empty(); } void UnindexRules(); // save some space? void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules) - void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids. - inline void UniqRules() { - for (int i=0,e=nts.size();i!=e;++i) UniqRules(i); + int UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids. returns number of rules kept + inline int UniqRules() { + int nkept=0; + for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i); + return nkept; } void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest. when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this. stable in case of ties diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index c5303622..82c4dd1a 100755 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h @@ -18,7 +18,6 @@ struct CFGBinarize { bool bin_l2r; bool bin_unary; bool bin_name_nts; - bool bin_uniq; bool bin_topo; template <class Opts> // template to support both printable_opts and boost nonprintable void AddOptions(Opts *opts) { @@ -27,7 +26,6 @@ struct CFGBinarize { ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.") ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)") ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'") - ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob") ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order). otherwise the virtual NTs will all appear after the regular NTs") ; } @@ -45,7 +43,6 @@ struct CFGBinarize { } void set_defaults() { bin_topo=false; - bin_uniq=true; bin_at=0; bin_unary=false; bin_name_nts=true; @@ -65,6 +62,8 @@ struct CFGBinarize { o << "greedy count>="<<bin_at; if (bin_name_nts) o << " named-NTs"; + if (bin_topo) + o<<" preserve-topo-order"; } o<<')'; } diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h index a9b3fd9f..c6a594b8 100755 --- a/decoder/cfg_format.h +++ b/decoder/cfg_format.h @@ -11,7 +11,7 @@ struct CFGFormat { bool identity_scfg; bool features; bool logprob_feat; - bool cfg_comma_nt; + bool comma_nt; bool nt_span; std::string goal_nt_name; std::string nt_prefix; @@ -27,7 +27,7 @@ struct CFGFormat { ("features",defaulted_value(&features),"print the CFG feature vector") ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.") ("logprob_feat_name",defaulted_value(&logprob_feat_name),"alternate name for the LogProb feature") - ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side") + ("cfg_comma_nt",defaulted_value(&comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side") ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any") ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph") ("nt_span",defaulted_value(&nt_span),"prefix A(i,j) for NT coming from hypergraph node with category A on span [i,j). this is after --nt_prefix if any") @@ -44,7 +44,7 @@ struct CFGFormat { o<<logprob_feat_name<<"(logprob) "; if (nt_span) o<<"named-NTs "; - if (cfg_comma_nt) + if (comma_nt) o<<",N "; o << "CFG output format"; o<<"]"; @@ -58,7 +58,7 @@ struct CFGFormat { void print_source_nt(std::ostream &o,CFG const&cfg,int id,int position=1) const { o<<'['; print_nt_name(o,cfg,id); - if (cfg_comma_nt) o<<','<<position; + if (comma_nt) o<<','<<position; o<<']'; } @@ -116,7 +116,7 @@ struct CFGFormat { identity_scfg=false; features=true; logprob_feat=true; - cfg_comma_nt=true; + comma_nt=true; goal_nt_name="S"; logprob_feat_name="LogProb"; nt_prefix=""; diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h index 5dca168e..331363d2 100755 --- a/decoder/cfg_options.h +++ b/decoder/cfg_options.h @@ -10,11 +10,14 @@ struct CFGOptions { CFGFormat format; CFGBinarize binarize; std::string out,source_out,unbin_out; + bool uniq; void set_defaults() { format.set_defaults(); binarize.set_defaults(); - out=""; + out=source_out=unbin_out=""; + uniq=false; } + CFGOptions() { set_defaults(); } template <class Opts> // template to support both printable_opts and boost nonprintable void AddOptions(Opts *opts) { @@ -22,6 +25,8 @@ struct CFGOptions { ("cfg_output", defaulted_value(&out),"write final target CFG (before FSA rescoring) to this file") ("source_cfg_output", defaulted_value(&source_out),"write source CFG (after prelm-scoring, prelm-prune) to this file") ("cfg_unbin_output", defaulted_value(&unbin_out),"write pre-binarization CFG to this file") //TODO: + ("cfg_uniq", defaulted_value(&uniq),"in case of duplicate rules, keep only the one with highest prob") + ; binarize.AddOptions(opts); format.AddOptions(opts); @@ -29,10 +34,6 @@ struct CFGOptions { void Validate() { format.Validate(); binarize.Validate(); -// if (out.empty()) binarize.bin_name_nts=false; - } - char const* description() const { - return "CFG output options"; } void maybe_output_source(Hypergraph const& hg) { if (source_out.empty()) return; @@ -41,24 +42,33 @@ struct CFGOptions { CFG cfg(hg,false,format.features,format.goal_nt()); cfg.Print(o.get(),format); } - void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") { - WriteFile o(cfg_output); - std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n'; - cfg.Print(o.get(),format); - } - - void maybe_output(HgCFG &hgcfg) { + // executes all options except source_cfg_output, building target hgcfg + void prepare(HgCFG &hgcfg) { if (out.empty() && unbin_out.empty()) return; CFG &cfg=hgcfg.GetCFG(); maybe_print(cfg,unbin_out); + maybe_uniq(hgcfg); maybe_binarize(hgcfg); maybe_print(cfg,out,""); } + char const* description() const { + return "CFG output options"; + } + void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") { + WriteFile o(cfg_output); + std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n'; + cfg.Print(o.get(),format); + } + + void maybe_uniq(HgCFG &hgcfg) { + if (hgcfg.uniqed) return; + hgcfg.GetCFG().UniqRules(); + hgcfg.uniqed=true; + } void maybe_binarize(HgCFG &hgcfg) { if (hgcfg.binarized) return; - CFG &cfg=hgcfg.GetCFG(); - cfg.Binarize(binarize); + hgcfg.GetCFG().Binarize(binarize); hgcfg.binarized=true; } }; diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc index 81efa768..cde4706c 100755 --- a/decoder/cfg_test.cc +++ b/decoder/cfg_test.cc @@ -1,23 +1,34 @@ +#include <boost/tuple/tuple.hpp> #include <gtest/gtest.h> #include "cfg.h" #include "hg_test.h" #include "cfg_options.h" -#define CSHOW_V 1 +/* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg. or you can define CSHOW_V and see lots of output */ + +using namespace boost; + +#define CSHOW_V 0 + #if CSHOW_V -# define CSHOWDO(x) x +# define CSHOWDO(x) x; #else # define CSHOWDO(x) #endif #define CSHOW(x) CSHOWDO(cerr<<#x<<'='<<x<<endl;) -struct CFGTest : public HGSetup { - CFGTest() { } - ~CFGTest() { } - static void JsonFN(Hypergraph hg,CFG &cfg,std::string file +typedef std::pair<string,string> HgW; // hg file,weights + +struct CFGTest : public TestWithParam<HgW> { + string hgfile; + Hypergraph hg; + CFG cfg; + CFGFormat form; + FeatureVector weights; + + static void JsonFN(Hypergraph &hg,CFG &cfg,FeatureVector &featw,std::string file ,std::string const& wts="Model_0 1 EgivenF 1 f1 1") { - FeatureVector featw; istringstream ws(wts); EXPECT_TRUE(ws>>featw); CSHOW(featw) @@ -25,35 +36,56 @@ struct CFGTest : public HGSetup { hg.Reweight(featw); cfg.Init(hg,true,true,false); } - static void SetUpTestCase() { } static void TearDownTestCase() { } + CFGTest() { + hgfile=GetParam().first; + JsonFN(hg,cfg,weights,hgfile,GetParam().second); + CSHOWDO(cerr<<"\nCFG Test: ") + CSHOW(hgfile); + form.nt_span=true; + form.comma_nt=false; + } + ~CFGTest() { } }; -TEST_F(CFGTest,Binarize) { - Hypergraph hg; - CFG cfg; - JsonFN(hg,cfg,perro_json,perro_wts); - CSHOW("\nCFG Test.\n"); +TEST_P(CFGTest,Binarize) { CFGBinarize b; - CFGFormat form; - form.nt_span=true; - for (int i=-1;i<16;++i) { - b.bin_l2r=i>=0; - b.bin_unary=i&1; - b.bin_name_nts=i&2; - b.bin_uniq=i&4; - b.bin_topo=i&8; - CFG cc=cfg; - EXPECT_EQ(cc,cfg); - CSHOW("\nBinarizing: "<<b); + b.bin_name_nts=1; + CFG cfgu=cfg; + EXPECT_EQ(cfgu,cfg); + int nrules=cfg.rules.size(); + CSHOWDO(cerr<<"\nUniqing: "<<nrules<<"\n"); + int nrem=cfgu.UniqRules(); + cerr<<"\nCFG "<<hgfile<<" Uniqed - remaining: "<<nrem<<" of "<<nrules<<"\n"; + if (nrem==nrules) + EXPECT_EQ(cfgu,cfg); + for (int i=-1;i<8;++i) { + bool uniq; + if (i>=0) { + int f=i<<1; + b.bin_l2r=1; + b.bin_unary=(f>>=1)&1; + b.bin_topo=(f>>=1)&1; + uniq=(f>>=1)&1; + } else + b.bin_l2r=0; + CFG cc=uniq?cfgu:cfg; + CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "<<i<<" "<<b); cc.Binarize(b); CSHOWDO(cc.Print(cerr,form);cerr<<"\n\n";); } } +INSTANTIATE_TEST_CASE_P(HypergraphsWeights,CFGTest, + Values( + HgW(perro_json,perro_wts) + , HgW(small_json,small_wts) + ,HgW(urdu_json,urdu_wts) + )); + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h index 64a0e767..ba936990 100755 --- a/decoder/hg_cfg.h +++ b/decoder/hg_cfg.h @@ -7,7 +7,9 @@ class Hypergraph; // in case you might want the CFG whether or not you apply FSA models: struct HgCFG { - HgCFG(Hypergraph const& ih) : ih(ih) { have_cfg=binarized=false;have_features=false; } + HgCFG(Hypergraph const& ih) : ih(ih) { + have_cfg=binarized=have_features=uniqed=false; + } Hypergraph const& ih; CFG cfg; bool have_cfg; @@ -17,6 +19,7 @@ struct HgCFG { to.Init(ih,true,want_features,true); } bool binarized; + bool uniqed; CFG &GetCFG() { if (!have_cfg) { diff --git a/decoder/hg_test.h b/decoder/hg_test.h index c1bc05bd..3da6533c 100755 --- a/decoder/hg_test.h +++ b/decoder/hg_test.h @@ -8,20 +8,24 @@ #include <gtest/gtest.h> using namespace std; - +using namespace testing; #pragma GCC diagnostic ignored "-Wunused-variable" namespace { -char const* small_json="small.json.gz"; +typedef char const* Name; -char const* perro_json="perro.json.gz"; -char const* perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1"; +Name urdu_json="urdu.json.gz"; +Name urdu_wts="Arity_0 1.70741473606976 Arity_1 1.12426238048012 Arity_2 1.14986187839554 Glue -0.04589037041388 LanguageModel 1.09051 PassThrough -3.66226367902928 PhraseModel_0 -1.94633451863252 PhraseModel_1 -0.1475347695476 PhraseModel_2 -1.614818994946 WordPenalty -3.0 WordPenaltyFsa -0.56028442964748 ShorterThanPrev -10 LongerThanPrev -10"; +Name small_json="small.json.gz"; +Name small_wts="Model_0 -2 Model_1 -.5 Model_2 -1.1 Model_3 -1 Model_4 -1 Model_5 .5 Model_6 .2 Model_7 -.3"; +Name perro_json="perro.json.gz"; +Name perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1"; } // you can inherit from this or just use the static methods -struct HGSetup : public testing::Test { +struct HGSetup : public Test { enum { HG, HG_int, @@ -52,7 +56,7 @@ struct HGSetup : public testing::Test { }; namespace { -char const* HGjsons[]= { +Name HGjsons[]= { "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}", "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}", "{\"rules\":[1,\"[X] ||| <s>\",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}", |