diff options
| author | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-15 08:50:14 +0000 | 
|---|---|---|
| committer | graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-15 08:50:14 +0000 | 
| commit | 6f3ec4aa23606af60c4abb602bc2a87353a87e61 (patch) | |
| tree | 56fd3349ddbb3650c16f837ce3606dac59e8ce3e /decoder | |
| parent | 80c952989dfbc05b482a7a8265d0ca73079ee894 (diff) | |
files required for test
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@556 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
| -rwxr-xr-x | decoder/cfg.cc | 59 | ||||
| -rwxr-xr-x | decoder/cfg.h | 16 | ||||
| -rwxr-xr-x | decoder/cfg_test.cc | 7 | ||||
| -rw-r--r-- | decoder/test_data/perro.json.gz | bin | 0 -> 608 bytes | |||
| -rw-r--r-- | decoder/test_data/urdu.json.gz | bin | 0 -> 253497 bytes | 
5 files changed, 57 insertions, 25 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc index be07c2c5..3076c75e 100755 --- a/decoder/cfg.cc +++ b/decoder/cfg.cc @@ -7,12 +7,12 @@  #include <limits>  #include "fast_lexical_cast.hpp"  //#include "indices_after.h" +#include "show.h" + +#define DUNIQ(x) x +#define DBIN(x) +#define DCFG(x) IF_CFG_DEBUG(x) -#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x) -#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s) -#define CFGSHOW(x) CFGSHOWC(x,"\n") -#define CFGSHOWS(x) CFGSHOWC(x," ") -#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y)  using namespace std; @@ -38,7 +38,6 @@ namespace {  typedef std::vector<char> Seen; // 0 = unseen, 1 = seen+finished, 2 = open (for cycle detection; seen but not finished)  enum { UNSEEN=0,SEEN,OPEN }; -  // bottom -> top topo order (rev head->tails topo)  template <class OutOrder>  struct CFGTopo { @@ -120,22 +119,28 @@ int CFG::UniqRules(NTHandle ni) {    HASH_MAP_EMPTY(bestp,null_rhs);    Ruleids &adj=nts[ni].ruleids;    Ruleids oldadj=adj; -  int oi=0; +  int newpos=0;    for (int i=0,e=oldadj.size();i!=e;++i) { // this beautiful complexity is to ensure that adj' is a subsequence of adj (without duplicates)      int ri=oldadj[i];      Rule const& r=rules[ri]; -    prob_pos pi(r.p,oi); +    prob_pos pi(r.p,newpos);      prob_pos &oldpi=get_default(bestp,r.rhs,pi); -    if (oldpi.pos==oi) {// newly inserted -      adj[oi++]=ri; -    } else if (oldpi.prob<pi.prob) { // we improve prev. best (overwrite it @old pos) -      oldpi.prob=pi.prob; -      adj[oldpi.pos]=ri; // replace worse rule w/ better +    if (oldpi.pos==newpos) {// newly inserted +      adj[newpos++]=ri; +    } else { +      SHOWP(DUNIQ,"Uniq duplicate: ") SHOW4(DUNIQ,oldpi.prob,pi.prob,oldpi.pos,newpos); +      SHOW(DUNIQ,ShowRule(ri)); +      SHOW(DUNIQ,ShowRule(adj[oldpi.pos])); +      if (oldpi.prob<pi.prob) { // we improve prev. best (overwrite it @old pos) +        oldpi.prob=pi.prob; +        adj[oldpi.pos]=ri; // replace worse rule w/ better +      }      } +    } -  // post: oi = number of new adj -  adj.resize(oi); -  return oi; +  // post: newpos = number of new adj +  adj.resize(newpos); +  return newpos;  }  void CFG::SortLocalBestFirst(NTHandle ni) { @@ -201,8 +206,9 @@ void CFG::Binarize(CFGBinarize const& b) {    for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {      NT const& nt=*n;      for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) { -      CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n"); -      RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_... +      RuleHandle ruleid=*ir; +      SHOW2(DBIN,ruleid,ShowRule(ruleid)) +      RHS &rhs=rules[ruleid].rhs; // we're going to binarize this while adding newly created rules to new_...        if (rhs.empty()) continue;        int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT        if (rhsmin<=r) { // means r>=0 also @@ -214,7 +220,7 @@ void CFG::Binarize(CFGBinarize const& b) {            bin.first=rhs[r];            bin_to=get_default(bin2lhs,bin,newnt); -          CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to); +          SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);            if (newnt==bin_to) { // it's new!              new_nts.push_back(NT(newruleid++));              //now -newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs @@ -232,7 +238,7 @@ void CFG::Binarize(CFGBinarize const& b) {        }      }    } -#if 0 +#if 1    batched_append_swap(nts,new_nts);    batched_append_swap(rules,new_rules);  #else @@ -304,6 +310,10 @@ void CFG::Clear() {    hg_=0;  } +namespace { +CFGFormat form; +} +  void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {    Rule const& r=rules[rulei];    f.print_lhs(o,*this,r.lhs); @@ -311,6 +321,12 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {    f.print_features(o,r.p,r.f);    IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;)  } +void CFG::PrintRule(std::ostream &o,RuleHandle rulei) const { +  PrintRule(o,rulei,form); +} +string CFG::ShowRule(RuleHandle i) const { +  ostringstream o;PrintRule(o,i);return o.str(); +}  void CFG::Print(std::ostream &o,CFGFormat const& f) const {    assert(!uninit); @@ -332,10 +348,9 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {  }  void CFG::Print(std::ostream &o) const { -  Print(o,CFGFormat()); +  Print(o,form);  } -  std::ostream &operator<<(std::ostream &o,CFG const &x) {    x.Print(o);    return o; diff --git a/decoder/cfg.h b/decoder/cfg.h index c07a6901..90a5dd5a 100755 --- a/decoder/cfg.h +++ b/decoder/cfg.h @@ -3,7 +3,7 @@  // for now, debug means remembering and printing the TRule behind each CFG rule  #ifndef CFG_DEBUG -# define CFG_DEBUG 0 +# define CFG_DEBUG 1  #endif  #ifndef CFG_KEEP_TRULE  # define CFG_KEEP_TRULE 0 @@ -98,6 +98,9 @@ struct CFG {      prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top)      FeatureVector f; // may be empty, unless copy_features on Init      IF_CFG_TRULE(TRulePtr rule;) +    int size() const { // for stats only +      return rhs.size(); +    }      void Swap(Rule &o) {        using namespace std;        swap(lhs,o.lhs); @@ -187,6 +190,13 @@ struct CFG {      for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i);      return nkept;    } +  int rules_size() const { +    const int sz=rules.size(); +    int sum=sz; +    for (int i=0;i<sz;++i) +      sum+=rules[i].size(); +    return sum; +  }    void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest.  when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this.  stable in case of ties    inline void SortLocalBestFirst() { @@ -194,8 +204,10 @@ struct CFG {    }    void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true);    void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h -  void PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& format) const;    void Print(std::ostream &o) const; // default format +  void PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& format) const; +  void PrintRule(std::ostream &o,RuleHandle rulei) const; +  std::string ShowRule(RuleHandle rulei) const;    void Swap(CFG &o) { // make sure this includes all fields (easier to see here than in .cc)      using namespace std;      swap(uninit,o.uninit); diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc index cde4706c..c61f9f2c 100755 --- a/decoder/cfg_test.cc +++ b/decoder/cfg_test.cc @@ -3,6 +3,7 @@  #include "cfg.h"  #include "hg_test.h"  #include "cfg_options.h" +#include "show.h"  /* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg.  or you can define CSHOW_V and see lots of output */ @@ -60,8 +61,11 @@ TEST_P(CFGTest,Binarize) {    CSHOWDO(cerr<<"\nUniqing: "<<nrules<<"\n");    int nrem=cfgu.UniqRules();    cerr<<"\nCFG "<<hgfile<<" Uniqed - remaining: "<<nrem<<" of "<<nrules<<"\n"; -  if (nrem==nrules) +  if (nrem==nrules) {      EXPECT_EQ(cfgu,cfg); +    //TODO - check that 1best is still the same (that we removed only worse edges) +  } +    for (int i=-1;i<8;++i) {      bool uniq;      if (i>=0) { @@ -75,6 +79,7 @@ TEST_P(CFGTest,Binarize) {      CFG cc=uniq?cfgu:cfg;      CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "<<i<<" "<<b);      cc.Binarize(b); +    cerr<<"Binarized "<<b<<" rules size "<<cfg.rules_size()<<" => "<<cc.rules_size()<<"\n";      CSHOWDO(cc.Print(cerr,form);cerr<<"\n\n";);    }  } diff --git a/decoder/test_data/perro.json.gz b/decoder/test_data/perro.json.gz Binary files differnew file mode 100644 index 00000000..41de5758 --- /dev/null +++ b/decoder/test_data/perro.json.gz diff --git a/decoder/test_data/urdu.json.gz b/decoder/test_data/urdu.json.gz Binary files differnew file mode 100644 index 00000000..84535402 --- /dev/null +++ b/decoder/test_data/urdu.json.gz  | 
