diff options
Diffstat (limited to 'decoder')
| -rw-r--r-- | decoder/cdec.cc | 2 | ||||
| -rwxr-xr-x | decoder/cfg.cc | 70 | ||||
| -rwxr-xr-x | decoder/cfg.h | 26 | ||||
| -rwxr-xr-x | decoder/cfg_binarize.h | 5 | ||||
| -rwxr-xr-x | decoder/cfg_format.h | 10 | ||||
| -rwxr-xr-x | decoder/cfg_options.h | 38 | ||||
| -rwxr-xr-x | decoder/cfg_test.cc | 80 | ||||
| -rwxr-xr-x | decoder/hg_cfg.h | 5 | ||||
| -rwxr-xr-x | decoder/hg_test.h | 16 | 
9 files changed, 166 insertions, 86 deletions
| diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 5898b245..0a02801e 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -670,7 +670,7 @@ int main(int argc, char** argv) {      maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen);      HgCFG hgcfg(forest); -    cfg_options.maybe_output(hgcfg); +    cfg_options.prepare(hgcfg);      if (!fsa_ffs.empty()) {        Timer t("Target FSA rescoring:");        if (!has_late_models) diff --git a/decoder/cfg.cc b/decoder/cfg.cc index c0598f16..be07c2c5 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -8,6 +8,12 @@  #include "fast_lexical_cast.hpp"  //#include "indices_after.h" +#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x) +#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s) +#define CFGSHOW(x) CFGSHOWC(x,"\n") +#define CFGSHOWS(x) CFGSHOWC(x," ") +#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y) +  using namespace std;  typedef CFG::Rule Rule; @@ -108,7 +114,7 @@ struct prob_pos {  };  }//ns -void CFG::UniqRules(NTHandle ni) { +int CFG::UniqRules(NTHandle ni) {    typedef HASH_MAP<RHS,prob_pos,boost::hash<RHS> > BestRHS; // faster to use trie? maybe.    BestRHS bestp; // once inserted, the position part (output index) never changes.  but the prob may be improved (overwrite ruleid at that position).    HASH_MAP_EMPTY(bestp,null_rhs); @@ -129,6 +135,7 @@ void CFG::UniqRules(NTHandle ni) {    }    // post: oi = number of new adj    adj.resize(oi); +  return oi;  }  void CFG::SortLocalBestFirst(NTHandle ni) { @@ -145,10 +152,12 @@ namespace {  CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());  // index i >= N.size()?  then it's in M[i-N.size()] -WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +//WordID first,WordID second, +string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)  {    int nn=N.size();    ostringstream o; +#undef BinNameOWORD  #define BinNameOWORD(w)                                 \    do {                                                  \      int n=w; if (n>0) o << TD::Convert(n);              \ @@ -161,8 +170,12 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)    BinNameOWORD(b.first);    o<<'+';    BinNameOWORD(b.second); -#undef BinNameOWORD -  return TD::Convert(o.str()); +  return o.str(); +} + +WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) +{ +  return TD::Convert(BinStr(b,N,M));  }  }//ns @@ -177,33 +190,44 @@ void CFG::Binarize(CFGBinarize const& b) {    cerr << "Binarizing "<<b<<endl;    HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.    HASH_MAP_EMPTY(bin2lhs,null_bin_rhs); -  int rhsmin=b.bin_unary?0:1;    // iterate using indices and not iterators because we'll be adding to both nts and rules list?  we could instead pessimistically reserve space for both, but this is simpler.  also: store original end of nts since we won't need to reprocess newly added ones. +  int rhsmin=b.bin_unary?0:1;    NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation    Rules new_rules;    //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target) -  int newnt=-nts.size(); +  int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)    int newruleid=rules.size();    BinRhs bin;    for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {      NT const& nt=*n;      for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { +      CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n");        RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_...        if (rhs.empty()) continue; -      bin.second=rhs.back(); -      for (int r=rhs.size()-2;r>=rhsmin;--r) { // pairs from right to left (normally we leave the last pair alone) -        bin.first=rhs[r]; -        if (newnt==(bin.second=(get_default(bin2lhs,bin,newnt)))) { -          new_nts.push_back(NT(newruleid)); -          new_rules.push_back(Rule(-newnt,bin)); -          ++newruleid; -          if (b.bin_name_nts) -            new_nts.back().from.nt=BinName(bin,nts,new_nts); -          --newnt; +      int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT +      if (rhsmin<=r) { // means r>=0 also +        bin.second=rhs[r+1]; +        int bin_to; // the replacement for bin +        assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size()); +        // also true at start/end of loop: +        for (;;) { // pairs from right to left (normally we leave the last pair alone) + +          bin.first=rhs[r]; +          bin_to=get_default(bin2lhs,bin,newnt); +          CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to); +          if (newnt==bin_to) { // it's new! +            new_nts.push_back(NT(newruleid++)); +            //now -newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs +            new_rules.push_back(Rule(-newnt,bin)); +            --newnt; +            if (b.bin_name_nts) +              new_nts.back().from.nt=BinName(bin,nts,new_nts); +          } +          bin.second=bin_to; +          --r; +          if (r<rhsmin) break;          } -      } -      if (rhsmin<rhs.size()) { -        rhs[rhsmin]=bin.second; +        rhs[rhsmin]=bin_to;          rhs.resize(rhsmin+1);        }      } @@ -246,9 +270,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus      prob_t &crp=cfgr.p;      crp=e.edge_prob_;      cfgr.lhs=e.head_node_; -#if CFG_DEBUG -    cfgr.rule=e.rule_; -#endif +    IF_CFG_TRULE(cfgr.rule=e.rule_;)      if (copy_features) cfgr.f=e.feature_values_;      if (push_weights) crp /=np[e.head_node_];      TRule const& er=*e.rule_; @@ -287,9 +309,7 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {    f.print_lhs(o,*this,r.lhs);    f.print_rhs(o,*this,r.rhs.begin(),r.rhs.end());    f.print_features(o,r.p,r.f); -#if CFG_DEBUG -  if (r.rule) o<<f.partsep<<*r.rule; -#endif +  IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;)  }  void CFG::Print(std::ostream &o,CFGFormat const& f) const { diff --git a/decoder/cfg.h b/decoder/cfg.h index b6dd6d99..c07a6901 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -5,12 +5,22 @@  #ifndef CFG_DEBUG  # define CFG_DEBUG 0  #endif +#ifndef CFG_KEEP_TRULE +# define CFG_KEEP_TRULE 0 +#endif +  #if CFG_DEBUG -# define IF_CFG_DEBUG(x) x +# define IF_CFG_DEBUG(x) x;  #else  # define IF_CFG_DEBUG(x)  #endif +#if CFG_KEEP_TRULE +# define IF_CFG_TRULE(x) x; +#else +# define IF_CFG_TRULE(x) +#endif +  /* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg.  this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector).  but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph.  so hg pointer may be null?  multiple types of CFG?  always copy the feature vector?  especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg     question: how much does making a copy (essentially) of hg simplify things?  is the space used worth it?  is the node in/out edges index really that much of a waste?  is the use of indices that annoying? @@ -76,7 +86,7 @@ struct CFG {      // for binarizing - no costs/probs      Rule() : lhs(-1) {  }      bool is_null() const { return lhs<0; } -    void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_DEBUG(rule.reset();) } +    void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_TRULE(rule.reset();) }      Rule(int lhs,BinRhs const& binrhs) : lhs(lhs),rhs(2),p(1) {        rhs[0]=binrhs.first; @@ -87,14 +97,14 @@ struct CFG {      RHS rhs;      prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top)      FeatureVector f; // may be empty, unless copy_features on Init -    IF_CFG_DEBUG(TRulePtr rule;) +    IF_CFG_TRULE(TRulePtr rule;)      void Swap(Rule &o) {        using namespace std;        swap(lhs,o.lhs);        swap(rhs,o.rhs);        swap(p,o.p);        swap(f,o.f); -      IF_CFG_DEBUG(swap(rule,o.rule);) +      IF_CFG_TRULE(swap(rule,o.rule);)      }      template<class V>      void visit_rhs_nts(V &v) const { @@ -171,9 +181,11 @@ struct CFG {    bool Empty() const { return nts.empty(); }    void UnindexRules(); // save some space?    void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules) -  void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids.  keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best.  as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first.  but you may also call this on unsorted ruleids. -  inline void UniqRules() { -    for (int i=0,e=nts.size();i!=e;++i) UniqRules(i); +  int UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids.  keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best.  as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first.  but you may also call this on unsorted ruleids.  returns number of rules kept +  inline int UniqRules() { +    int nkept=0; +    for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i); +    return nkept;    }    void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest.  when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this.  stable in case of ties diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h index c5303622..82c4dd1a 100755 --- a/decoder/cfg_binarize.h +++ b/decoder/cfg_binarize.h @@ -18,7 +18,6 @@ struct CFGBinarize {    bool bin_l2r;    bool bin_unary;    bool bin_name_nts; -  bool bin_uniq;    bool bin_topo;    template <class Opts> // template to support both printable_opts and boost nonprintable    void AddOptions(Opts *opts) { @@ -27,7 +26,6 @@ struct CFGBinarize {        ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")        ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")        ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'") -      ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob")        ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order).  otherwise the virtual NTs will all appear after the regular NTs")      ;    } @@ -45,7 +43,6 @@ struct CFGBinarize {    }    void set_defaults() {      bin_topo=false; -    bin_uniq=true;      bin_at=0;      bin_unary=false;      bin_name_nts=true; @@ -65,6 +62,8 @@ struct CFGBinarize {          o << "greedy count>="<<bin_at;        if (bin_name_nts)          o << " named-NTs"; +      if (bin_topo) +        o<<" preserve-topo-order";      }      o<<')';    } diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h index a9b3fd9f..c6a594b8 100755 --- a/decoder/cfg_format.h +++ b/decoder/cfg_format.h @@ -11,7 +11,7 @@ struct CFGFormat {    bool identity_scfg;    bool features;    bool logprob_feat; -  bool cfg_comma_nt; +  bool comma_nt;    bool nt_span;    std::string goal_nt_name;    std::string nt_prefix; @@ -27,7 +27,7 @@ struct CFGFormat {        ("features",defaulted_value(&features),"print the CFG feature vector")        ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.")        ("logprob_feat_name",defaulted_value(&logprob_feat_name),"alternate name for the LogProb feature") -      ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side") +      ("cfg_comma_nt",defaulted_value(&comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side")        ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any")        ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph")        ("nt_span",defaulted_value(&nt_span),"prefix A(i,j) for NT coming from hypergraph node with category A on span [i,j).  this is after --nt_prefix if any") @@ -44,7 +44,7 @@ struct CFGFormat {        o<<logprob_feat_name<<"(logprob) ";      if (nt_span)        o<<"named-NTs "; -    if (cfg_comma_nt) +    if (comma_nt)        o<<",N ";      o << "CFG output format";      o<<"]"; @@ -58,7 +58,7 @@ struct CFGFormat {    void print_source_nt(std::ostream &o,CFG const&cfg,int id,int position=1) const {      o<<'[';      print_nt_name(o,cfg,id); -    if (cfg_comma_nt) o<<','<<position; +    if (comma_nt) o<<','<<position;      o<<']';    } @@ -116,7 +116,7 @@ struct CFGFormat {      identity_scfg=false;      features=true;      logprob_feat=true; -    cfg_comma_nt=true; +    comma_nt=true;      goal_nt_name="S";      logprob_feat_name="LogProb";      nt_prefix=""; diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h index 5dca168e..331363d2 100755 --- a/decoder/cfg_options.h +++ b/decoder/cfg_options.h @@ -10,11 +10,14 @@ struct CFGOptions {    CFGFormat format;    CFGBinarize binarize;    std::string out,source_out,unbin_out; +  bool uniq;    void set_defaults() {      format.set_defaults();      binarize.set_defaults(); -    out=""; +    out=source_out=unbin_out=""; +    uniq=false;    } +    CFGOptions() { set_defaults(); }    template <class Opts> // template to support both printable_opts and boost nonprintable    void AddOptions(Opts *opts) { @@ -22,6 +25,8 @@ struct CFGOptions {        ("cfg_output", defaulted_value(&out),"write final target CFG (before FSA rescoring) to this file")        ("source_cfg_output", defaulted_value(&source_out),"write source CFG (after prelm-scoring, prelm-prune) to this file")        ("cfg_unbin_output", defaulted_value(&unbin_out),"write pre-binarization CFG to this file") //TODO: +      ("cfg_uniq", defaulted_value(&uniq),"in case of duplicate rules, keep only the one with highest prob") +      ;      binarize.AddOptions(opts);      format.AddOptions(opts); @@ -29,10 +34,6 @@ struct CFGOptions {    void Validate() {      format.Validate();      binarize.Validate(); -//    if (out.empty()) binarize.bin_name_nts=false; -  } -  char const* description() const { -    return "CFG output options";    }    void maybe_output_source(Hypergraph const& hg) {      if (source_out.empty()) return; @@ -41,24 +42,33 @@ struct CFGOptions {      CFG cfg(hg,false,format.features,format.goal_nt());      cfg.Print(o.get(),format);    } -  void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") { -      WriteFile o(cfg_output); -      std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n'; -      cfg.Print(o.get(),format); -  } - -  void maybe_output(HgCFG &hgcfg) { +  // executes all options except source_cfg_output, building target hgcfg +  void prepare(HgCFG &hgcfg) {      if (out.empty() && unbin_out.empty()) return;      CFG &cfg=hgcfg.GetCFG();      maybe_print(cfg,unbin_out); +    maybe_uniq(hgcfg);      maybe_binarize(hgcfg);      maybe_print(cfg,out,"");    } +  char const* description() const { +    return "CFG output options"; +  } +  void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") { +    WriteFile o(cfg_output); +    std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n'; +    cfg.Print(o.get(),format); +  } + +  void maybe_uniq(HgCFG &hgcfg) { +    if (hgcfg.uniqed) return; +    hgcfg.GetCFG().UniqRules(); +    hgcfg.uniqed=true; +  }    void maybe_binarize(HgCFG &hgcfg) {      if (hgcfg.binarized) return; -    CFG &cfg=hgcfg.GetCFG(); -    cfg.Binarize(binarize); +    hgcfg.GetCFG().Binarize(binarize);      hgcfg.binarized=true;    }  }; diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc index 81efa768..cde4706c 100755 --- a/decoder/cfg_test.cc +++ b/decoder/cfg_test.cc @@ -1,23 +1,34 @@ +#include <boost/tuple/tuple.hpp>  #include <gtest/gtest.h>  #include "cfg.h"  #include "hg_test.h"  #include "cfg_options.h" -#define CSHOW_V 1 +/* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg.  or you can define CSHOW_V and see lots of output */ + +using namespace boost; + +#define CSHOW_V 0 +  #if CSHOW_V -# define CSHOWDO(x) x +# define CSHOWDO(x) x;  #else  # define CSHOWDO(x)  #endif  #define CSHOW(x) CSHOWDO(cerr<<#x<<'='<<x<<endl;) -struct CFGTest : public HGSetup { -  CFGTest() {  } -  ~CFGTest() {  } -  static void JsonFN(Hypergraph hg,CFG &cfg,std::string file +typedef std::pair<string,string> HgW; // hg file,weights + +struct CFGTest : public TestWithParam<HgW> { +  string hgfile; +  Hypergraph hg; +  CFG cfg; +  CFGFormat form; +  FeatureVector weights; + +  static void JsonFN(Hypergraph &hg,CFG &cfg,FeatureVector &featw,std::string file                       ,std::string const& wts="Model_0 1 EgivenF 1 f1 1")    { -    FeatureVector featw;      istringstream ws(wts);      EXPECT_TRUE(ws>>featw);      CSHOW(featw) @@ -25,35 +36,56 @@ struct CFGTest : public HGSetup {      hg.Reweight(featw);      cfg.Init(hg,true,true,false);    } -    static void SetUpTestCase() {    }    static void TearDownTestCase() {    } +  CFGTest() { +    hgfile=GetParam().first; +    JsonFN(hg,cfg,weights,hgfile,GetParam().second); +    CSHOWDO(cerr<<"\nCFG Test: ") +    CSHOW(hgfile); +    form.nt_span=true; +    form.comma_nt=false; +  } +  ~CFGTest() {  }  }; -TEST_F(CFGTest,Binarize) { -  Hypergraph hg; -  CFG cfg; -  JsonFN(hg,cfg,perro_json,perro_wts); -  CSHOW("\nCFG Test.\n"); +TEST_P(CFGTest,Binarize) {    CFGBinarize b; -  CFGFormat form; -  form.nt_span=true; -  for (int i=-1;i<16;++i) { -    b.bin_l2r=i>=0; -    b.bin_unary=i&1; -    b.bin_name_nts=i&2; -    b.bin_uniq=i&4; -    b.bin_topo=i&8; -    CFG cc=cfg; -    EXPECT_EQ(cc,cfg); -    CSHOW("\nBinarizing: "<<b); +  b.bin_name_nts=1; +  CFG cfgu=cfg; +  EXPECT_EQ(cfgu,cfg); +  int nrules=cfg.rules.size(); +  CSHOWDO(cerr<<"\nUniqing: "<<nrules<<"\n"); +  int nrem=cfgu.UniqRules(); +  cerr<<"\nCFG "<<hgfile<<" Uniqed - remaining: "<<nrem<<" of "<<nrules<<"\n"; +  if (nrem==nrules) +    EXPECT_EQ(cfgu,cfg); +  for (int i=-1;i<8;++i) { +    bool uniq; +    if (i>=0) { +      int f=i<<1; +      b.bin_l2r=1; +      b.bin_unary=(f>>=1)&1; +      b.bin_topo=(f>>=1)&1; +      uniq=(f>>=1)&1; +    } else +      b.bin_l2r=0; +    CFG cc=uniq?cfgu:cfg; +    CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "<<i<<" "<<b);      cc.Binarize(b);      CSHOWDO(cc.Print(cerr,form);cerr<<"\n\n";);    }  } +INSTANTIATE_TEST_CASE_P(HypergraphsWeights,CFGTest, +                        Values( +                          HgW(perro_json,perro_wts) +                          , HgW(small_json,small_wts) +                            ,HgW(urdu_json,urdu_wts) +                          )); +  int main(int argc, char **argv) {    testing::InitGoogleTest(&argc, argv);    return RUN_ALL_TESTS(); diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h index 64a0e767..ba936990 100755 --- a/decoder/hg_cfg.h +++ b/decoder/hg_cfg.h @@ -7,7 +7,9 @@ class Hypergraph;  // in case you might want the CFG whether or not you apply FSA models:  struct HgCFG { -  HgCFG(Hypergraph const& ih) : ih(ih) { have_cfg=binarized=false;have_features=false; } +  HgCFG(Hypergraph const& ih) : ih(ih) { +    have_cfg=binarized=have_features=uniqed=false; +  }    Hypergraph const& ih;    CFG cfg;    bool have_cfg; @@ -17,6 +19,7 @@ struct HgCFG {      to.Init(ih,true,want_features,true);    }    bool binarized; +  bool uniqed;    CFG &GetCFG()    {      if (!have_cfg) { diff --git a/decoder/hg_test.h b/decoder/hg_test.h index c1bc05bd..3da6533c 100755 --- a/decoder/hg_test.h +++ b/decoder/hg_test.h @@ -8,20 +8,24 @@  #include <gtest/gtest.h>  using namespace std; - +using namespace testing;  #pragma GCC diagnostic ignored "-Wunused-variable"  namespace { -char const* small_json="small.json.gz"; +typedef char const* Name; -char const* perro_json="perro.json.gz"; -char const* perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1"; +Name urdu_json="urdu.json.gz"; +Name urdu_wts="Arity_0 1.70741473606976 Arity_1 1.12426238048012 Arity_2 1.14986187839554 Glue -0.04589037041388 LanguageModel 1.09051 PassThrough -3.66226367902928 PhraseModel_0 -1.94633451863252 PhraseModel_1 -0.1475347695476 PhraseModel_2 -1.614818994946 WordPenalty -3.0 WordPenaltyFsa -0.56028442964748 ShorterThanPrev -10 LongerThanPrev -10"; +Name small_json="small.json.gz"; +Name small_wts="Model_0 -2 Model_1 -.5 Model_2 -1.1 Model_3 -1 Model_4 -1 Model_5 .5 Model_6 .2 Model_7 -.3"; +Name perro_json="perro.json.gz"; +Name perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1";  }  // you can inherit from this or just use the static methods -struct HGSetup : public testing::Test { +struct HGSetup : public Test {    enum {      HG,      HG_int, @@ -52,7 +56,7 @@ struct HGSetup : public testing::Test {  };  namespace { -char const* HGjsons[]= { +Name HGjsons[]= {    "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}",  "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}",    "{\"rules\":[1,\"[X] ||| <s>\",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}", | 
