really fixed binarization. test

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@555 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-15 07:39:01 +0000
committer: graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-15 07:39:01 +0000
commit: 80c952989dfbc05b482a7a8265d0ca73079ee894 (patch)
tree: e1306f8c49006d34ab6418d8678c051e04358ef9
parent: 214c741eee9b01ccd05a1825ad9ed647adef41db (diff)
9 files changed, 166 insertions, 86 deletions
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index 5898b245..0a02801e 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -670,7 +670,7 @@ int main(int argc, char** argv) {
     maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen);
 
     HgCFG hgcfg(forest);
-    cfg_options.maybe_output(hgcfg);
+    cfg_options.prepare(hgcfg);
     if (!fsa_ffs.empty()) {
       Timer t("Target FSA rescoring:");
       if (!has_late_models)
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index c0598f16..be07c2c5 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -8,6 +8,12 @@
 #include "fast_lexical_cast.hpp"
 //#include "indices_after.h"
 
+#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x) // stream x to stderr; expands to nothing unless CFG_DEBUG is nonzero
+#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s) // prints the expression text, '=', its value, then separator s
+#define CFGSHOW(x) CFGSHOWC(x,"\n") // expr=value ending the line
+#define CFGSHOWS(x) CFGSHOWC(x," ") // expr=value followed by a space
+#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y) // x then y on one line
+
 using namespace std;
 
 typedef CFG::Rule Rule;
@@ -108,7 +114,7 @@ struct prob_pos {
 };
 }//ns
 
-void CFG::UniqRules(NTHandle ni) {
+int CFG::UniqRules(NTHandle ni) {
   typedef HASH_MAP<RHS,prob_pos,boost::hash<RHS> > BestRHS; // faster to use trie? maybe.
   BestRHS bestp; // once inserted, the position part (output index) never changes.  but the prob may be improved (overwrite ruleid at that position).
   HASH_MAP_EMPTY(bestp,null_rhs);
@@ -129,6 +135,7 @@ void CFG::UniqRules(NTHandle ni) {
   }
   // post: oi = number of new adj
   adj.resize(oi);
+  return oi;
 }
 
 void CFG::SortLocalBestFirst(NTHandle ni) {
@@ -145,10 +152,12 @@ namespace {
 CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
 
 // index i >= N.size()?  then it's in M[i-N.size()]
-WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
+//WordID first,WordID second,
+string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
 {
   int nn=N.size();
   ostringstream o;
+#undef BinNameOWORD
 #define BinNameOWORD(w)                                 \
   do {                                                  \
     int n=w; if (n>0) o << TD::Convert(n);              \
@@ -161,8 +170,12 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
   BinNameOWORD(b.first);
   o<<'+';
   BinNameOWORD(b.second);
-#undef BinNameOWORD
-  return TD::Convert(o.str());
+  return o.str();
+}
+
+WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M) // WordID that TD::Convert gives to the BinStr(b,N,M) label
+{
+  return TD::Convert(BinStr(b,N,M));
+}
 
 }//ns
@@ -177,33 +190,44 @@ void CFG::Binarize(CFGBinarize const& b) {
   cerr << "Binarizing "<<b<<endl;
   HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.
   HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);
-  int rhsmin=b.bin_unary?0:1;
   // iterate using indices and not iterators because we'll be adding to both nts and rules list?  we could instead pessimistically reserve space for both, but this is simpler.  also: store original end of nts since we won't need to reprocess newly added ones.
+  int rhsmin=b.bin_unary?0:1;
   NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation
   Rules new_rules;
   //TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target)
-  int newnt=-nts.size();
+  int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
   int newruleid=rules.size();
   BinRhs bin;
   for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
     NT const& nt=*n;
     for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {
+      CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n");
       RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_...
       if (rhs.empty()) continue;
-      bin.second=rhs.back();
-      for (int r=rhs.size()-2;r>=rhsmin;--r) { // pairs from right to left (normally we leave the last pair alone)
-        bin.first=rhs[r];
-        if (newnt==(bin.second=(get_default(bin2lhs,bin,newnt)))) {
-          new_nts.push_back(NT(newruleid));
-          new_rules.push_back(Rule(-newnt,bin));
-          ++newruleid;
-          if (b.bin_name_nts)
-            new_nts.back().from.nt=BinName(bin,nts,new_nts);
-          --newnt;
+      int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT
+      if (rhsmin<=r) { // means r>=0 also
+        bin.second=rhs[r+1];
+        int bin_to; // the replacement for bin
+        assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size());
+        // also true at start/end of loop:
+        for (;;) { // pairs from right to left (normally we leave the last pair alone)
+
+          bin.first=rhs[r];
+          bin_to=get_default(bin2lhs,bin,newnt);
+          CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to);
+          if (newnt==bin_to) { // it's new!
+            new_nts.push_back(NT(newruleid++));
+            //now -newnt is the index of the last (after new_nts is appended) nt.  bin is its rhs.  bin_to is its lhs
+            new_rules.push_back(Rule(-newnt,bin));
+            --newnt;
+            if (b.bin_name_nts)
+              new_nts.back().from.nt=BinName(bin,nts,new_nts);
+          }
+          bin.second=bin_to;
+          --r;
+          if (r<rhsmin) break;
         }
-      }
-      if (rhsmin<rhs.size()) {
-        rhs[rhsmin]=bin.second;
+        rhs[rhsmin]=bin_to;
         rhs.resize(rhsmin+1);
       }
     }
@@ -246,9 +270,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus
     prob_t &crp=cfgr.p;
     crp=e.edge_prob_;
     cfgr.lhs=e.head_node_;
-#if CFG_DEBUG
-    cfgr.rule=e.rule_;
-#endif
+    IF_CFG_TRULE(cfgr.rule=e.rule_;)
     if (copy_features) cfgr.f=e.feature_values_;
     if (push_weights) crp /=np[e.head_node_];
     TRule const& er=*e.rule_;
@@ -287,9 +309,7 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {
   f.print_lhs(o,*this,r.lhs);
   f.print_rhs(o,*this,r.rhs.begin(),r.rhs.end());
   f.print_features(o,r.p,r.f);
-#if CFG_DEBUG
-  if (r.rule) o<<f.partsep<<*r.rule;
-#endif
+  IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;)
 }
 
 void CFG::Print(std::ostream &o,CFGFormat const& f) const {
diff --git a/decoder/cfg.h b/decoder/cfg.h
index b6dd6d99..c07a6901 100755
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
@@ -5,12 +5,22 @@
 #ifndef CFG_DEBUG
 # define CFG_DEBUG 0
 #endif
+#ifndef CFG_KEEP_TRULE
+# define CFG_KEEP_TRULE 0
+#endif
+
 #if CFG_DEBUG
-# define IF_CFG_DEBUG(x) x
+# define IF_CFG_DEBUG(x) x;
 #else
 # define IF_CFG_DEBUG(x)
 #endif
 
+#if CFG_KEEP_TRULE
+# define IF_CFG_TRULE(x) x;
+#else
+# define IF_CFG_TRULE(x)
+#endif
+
 /* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg.  this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector).  but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph.  so hg pointer may be null?  multiple types of CFG?  always copy the feature vector?  especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg
 
    question: how much does making a copy (essentially) of hg simplify things?  is the space used worth it?  is the node in/out edges index really that much of a waste?  is the use of indices that annoying?
@@ -76,7 +86,7 @@ struct CFG {
     // for binarizing - no costs/probs
     Rule() : lhs(-1) {  }
     bool is_null() const { return lhs<0; }
-    void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_DEBUG(rule.reset();) }
+    void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_TRULE(rule.reset();) }
 
     Rule(int lhs,BinRhs const& binrhs) : lhs(lhs),rhs(2),p(1) {
       rhs[0]=binrhs.first;
@@ -87,14 +97,14 @@ struct CFG {
     RHS rhs;
     prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top)
     FeatureVector f; // may be empty, unless copy_features on Init
-    IF_CFG_DEBUG(TRulePtr rule;)
+    IF_CFG_TRULE(TRulePtr rule;)
     void Swap(Rule &o) {
       using namespace std;
       swap(lhs,o.lhs);
       swap(rhs,o.rhs);
       swap(p,o.p);
       swap(f,o.f);
-      IF_CFG_DEBUG(swap(rule,o.rule);)
+      IF_CFG_TRULE(swap(rule,o.rule);)
     }
     template<class V>
     void visit_rhs_nts(V &v) const {
@@ -171,9 +181,11 @@ struct CFG {
   bool Empty() const { return nts.empty(); }
   void UnindexRules(); // save some space?
   void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules)
-  void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids.  keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best.  as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first.  but you may also call this on unsorted ruleids.
-  inline void UniqRules() {
-    for (int i=0,e=nts.size();i!=e;++i) UniqRules(i);
+  int UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids.  keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best.  as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first.  but you may also call this on unsorted ruleids.  returns number of rules kept
+  inline int UniqRules() { // whole-grammar overload: applies UniqRules(i) to every nt
+    int nkept=0; // running total of the per-nt return values
+    for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i);
+    return nkept;
   }
 
   void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest.  when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this.  stable in case of ties
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index c5303622..82c4dd1a 100755
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
@@ -18,7 +18,6 @@ struct CFGBinarize {
   bool bin_l2r;
   bool bin_unary;
   bool bin_name_nts;
-  bool bin_uniq;
   bool bin_topo;
   template <class Opts> // template to support both printable_opts and boost nonprintable
   void AddOptions(Opts *opts) {
@@ -27,7 +26,6 @@ struct CFGBinarize {
       ("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")
       ("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")
       ("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")
-      ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob")
       ("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order).  otherwise the virtual NTs will all appear after the regular NTs")
     ;
   }
@@ -45,7 +43,6 @@ struct CFGBinarize {
   }
   void set_defaults() {
     bin_topo=false;
-    bin_uniq=true;
     bin_at=0;
     bin_unary=false;
     bin_name_nts=true;
@@ -65,6 +62,8 @@ struct CFGBinarize {
         o << "greedy count>="<<bin_at;
       if (bin_name_nts)
         o << " named-NTs";
+      if (bin_topo)
+        o<<" preserve-topo-order";
     }
     o<<')';
   }
diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h
index a9b3fd9f..c6a594b8 100755
--- a/decoder/cfg_format.h
+++ b/decoder/cfg_format.h
@@ -11,7 +11,7 @@ struct CFGFormat {
   bool identity_scfg;
   bool features;
   bool logprob_feat;
-  bool cfg_comma_nt;
+  bool comma_nt;
   bool nt_span;
   std::string goal_nt_name;
   std::string nt_prefix;
@@ -27,7 +27,7 @@ struct CFGFormat {
       ("features",defaulted_value(&features),"print the CFG feature vector")
       ("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.")
       ("logprob_feat_name",defaulted_value(&logprob_feat_name),"alternate name for the LogProb feature")
-      ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side")
+      ("cfg_comma_nt",defaulted_value(&comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side")
       ("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any")
       ("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph")
       ("nt_span",defaulted_value(&nt_span),"prefix A(i,j) for NT coming from hypergraph node with category A on span [i,j).  this is after --nt_prefix if any")
@@ -44,7 +44,7 @@ struct CFGFormat {
       o<<logprob_feat_name<<"(logprob) ";
     if (nt_span)
       o<<"named-NTs ";
-    if (cfg_comma_nt)
+    if (comma_nt)
       o<<",N ";
     o << "CFG output format";
     o<<"]";
@@ -58,7 +58,7 @@ struct CFGFormat {
   void print_source_nt(std::ostream &o,CFG const&cfg,int id,int position=1) const {
     o<<'[';
     print_nt_name(o,cfg,id);
-    if (cfg_comma_nt) o<<','<<position;
+    if (comma_nt) o<<','<<position;
     o<<']';
   }
 
@@ -116,7 +116,7 @@ struct CFGFormat {
     identity_scfg=false;
     features=true;
     logprob_feat=true;
-    cfg_comma_nt=true;
+    comma_nt=true;
     goal_nt_name="S";
     logprob_feat_name="LogProb";
     nt_prefix="";
diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h
index 5dca168e..331363d2 100755
--- a/decoder/cfg_options.h
+++ b/decoder/cfg_options.h
@@ -10,11 +10,14 @@ struct CFGOptions {
   CFGFormat format;
   CFGBinarize binarize;
   std::string out,source_out,unbin_out;
+  bool uniq;
   void set_defaults() {
     format.set_defaults();
     binarize.set_defaults();
-    out="";
+    out=source_out=unbin_out="";
+    uniq=false;
   }
+
   CFGOptions() { set_defaults(); }
   template <class Opts> // template to support both printable_opts and boost nonprintable
   void AddOptions(Opts *opts) {
@@ -22,6 +25,8 @@ struct CFGOptions {
       ("cfg_output", defaulted_value(&out),"write final target CFG (before FSA rescoring) to this file")
       ("source_cfg_output", defaulted_value(&source_out),"write source CFG (after prelm-scoring, prelm-prune) to this file")
       ("cfg_unbin_output", defaulted_value(&unbin_out),"write pre-binarization CFG to this file") //TODO:
+      ("cfg_uniq", defaulted_value(&uniq),"in case of duplicate rules, keep only the one with highest prob")
+
     ;
     binarize.AddOptions(opts);
     format.AddOptions(opts);
@@ -29,10 +34,6 @@ struct CFGOptions {
   void Validate() {
     format.Validate();
     binarize.Validate();
-//    if (out.empty()) binarize.bin_name_nts=false;
-  }
-  char const* description() const {
-    return "CFG output options";
   }
   void maybe_output_source(Hypergraph const& hg) {
     if (source_out.empty()) return;
@@ -41,24 +42,39 @@ struct CFGOptions {
     CFG cfg(hg,false,format.features,format.goal_nt());
     cfg.Print(o.get(),format);
   }
-  void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") {
-      WriteFile o(cfg_output);
-      std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n';
-      cfg.Print(o.get(),format);
-  }
-
-  void maybe_output(HgCFG &hgcfg) {
+  // executes all options except source_cfg_output, building target hgcfg
+  // does nothing at all unless cfg_output or cfg_unbin_output was given.
+  void prepare(HgCFG &hgcfg) {
     if (out.empty() && unbin_out.empty()) return;
     CFG &cfg=hgcfg.GetCFG();
     maybe_print(cfg,unbin_out);
+    if (uniq) maybe_uniq(hgcfg); // cfg_uniq defaults to false: only drop duplicate rules on request
     maybe_binarize(hgcfg);
     maybe_print(cfg,out,"");
   }
 
+  char const* description() const {
+    return "CFG output options";
+  }
+  // print cfg in `format` to the file cfg_output, announcing it on stderr; desc labels the grammar in that message.
+  // an empty cfg_output means this output was not requested, so nothing is opened or printed.
+  void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") {
+    if (cfg_output.empty()) return;
+    WriteFile o(cfg_output);
+    std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n';
+    cfg.Print(o.get(),format);
+  }
+
+  // remove duplicate rules via CFG::UniqRules() unless hgcfg is already marked uniqed
+  void maybe_uniq(HgCFG &hgcfg) {
+    if (hgcfg.uniqed) return;
+    hgcfg.GetCFG().UniqRules();
+    hgcfg.uniqed=true;
+  }
+  // binarize with the `binarize` options unless hgcfg is already marked binarized
   void maybe_binarize(HgCFG &hgcfg) {
     if (hgcfg.binarized) return;
-    CFG &cfg=hgcfg.GetCFG();
-    cfg.Binarize(binarize);
+    hgcfg.GetCFG().Binarize(binarize);
     hgcfg.binarized=true;
   }
 };
diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc
index 81efa768..cde4706c 100755
--- a/decoder/cfg_test.cc
+++ b/decoder/cfg_test.cc
@@ -1,23 +1,34 @@
+#include <boost/tuple/tuple.hpp>
 #include <gtest/gtest.h>
 #include "cfg.h"
 #include "hg_test.h"
 #include "cfg_options.h"
 
-#define CSHOW_V 1
+/* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg.  or you can define CSHOW_V and see lots of output */
+
+using namespace boost;
+
+#define CSHOW_V 0
+
 #if CSHOW_V
-# define CSHOWDO(x) x
+# define CSHOWDO(x) x;
 #else
 # define CSHOWDO(x)
 #endif
 #define CSHOW(x) CSHOWDO(cerr<<#x<<'='<<x<<endl;)
 
-struct CFGTest : public HGSetup {
-  CFGTest() {  }
-  ~CFGTest() {  }
-  static void JsonFN(Hypergraph hg,CFG &cfg,std::string file
+typedef std::pair<string,string> HgW; // hg file,weights
+
+struct CFGTest : public TestWithParam<HgW> {
+  string hgfile;
+  Hypergraph hg;
+  CFG cfg;
+  CFGFormat form;
+  FeatureVector weights;
+
+  static void JsonFN(Hypergraph &hg,CFG &cfg,FeatureVector &featw,std::string file
                      ,std::string const& wts="Model_0 1 EgivenF 1 f1 1")
   {
-    FeatureVector featw;
     istringstream ws(wts);
     EXPECT_TRUE(ws>>featw);
     CSHOW(featw)
@@ -25,35 +36,56 @@ struct CFGTest : public HGSetup {
     hg.Reweight(featw);
     cfg.Init(hg,true,true,false);
   }
-
   static void SetUpTestCase() {
   }
   static void TearDownTestCase() {
   }
+  CFGTest() {
+    hgfile=GetParam().first;
+    JsonFN(hg,cfg,weights,hgfile,GetParam().second);
+    CSHOWDO(cerr<<"\nCFG Test: ")
+    CSHOW(hgfile);
+    form.nt_span=true;
+    form.comma_nt=false;
+  }
+  ~CFGTest() {  }
 };
 
-TEST_F(CFGTest,Binarize) {
-  Hypergraph hg;
-  CFG cfg;
-  JsonFN(hg,cfg,perro_json,perro_wts);
-  CSHOW("\nCFG Test.\n");
+TEST_P(CFGTest,Binarize) { // runs Binarize() under each option combination, on the plain and the uniqued grammar
   CFGBinarize b;
-  CFGFormat form;
-  form.nt_span=true;
-  for (int i=-1;i<16;++i) {
-    b.bin_l2r=i>=0;
-    b.bin_unary=i&1;
-    b.bin_name_nts=i&2;
-    b.bin_uniq=i&4;
-    b.bin_topo=i&8;
-    CFG cc=cfg;
-    EXPECT_EQ(cc,cfg);
-    CSHOW("\nBinarizing: "<<b);
+  b.bin_name_nts=1;
+  CFG cfgu=cfg; // cfgu becomes the uniqued copy; cfg stays untouched for comparison
+  EXPECT_EQ(cfgu,cfg);
+  int nrules=cfg.rules.size();
+  CSHOWDO(cerr<<"\nUniqing: "<<nrules<<"\n");
+  int nrem=cfgu.UniqRules();
+  cerr<<"\nCFG "<<hgfile<<" Uniqed - remaining: "<<nrem<<" of "<<nrules<<"\n";
+  if (nrem==nrules) // nothing was dropped, so uniquing must have been a no-op
+    EXPECT_EQ(cfgu,cfg);
+  for (int i=-1;i<8;++i) { // i==-1: bin_l2r off; i in 0..7: bin_l2r on, bits of i choose unary/topo/uniq
+    bool uniq=false; // was left uninitialized (and then read below) on the i==-1 pass
+    if (i>=0) {
+      int f=i<<1; // pre-shifted so that each (f>>=1)&1 below yields the next bit of i
+      b.bin_l2r=1;
+      b.bin_unary=(f>>=1)&1;
+      b.bin_topo=(f>>=1)&1;
+      uniq=(f>>=1)&1;
+    } else
+      b.bin_l2r=0;
+    CFG cc=uniq?cfgu:cfg;
+    CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "<<i<<" "<<b);
     cc.Binarize(b);
     CSHOWDO(cc.Print(cerr,form);cerr<<"\n\n";);
   }
 }
 
+INSTANTIATE_TEST_CASE_P(HypergraphsWeights,CFGTest,
+                        Values(
+                          HgW(perro_json,perro_wts)
+                          , HgW(small_json,small_wts)
+                            ,HgW(urdu_json,urdu_wts)
+                          ));
+
 int main(int argc, char **argv) {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h
index 64a0e767..ba936990 100755
--- a/decoder/hg_cfg.h
+++ b/decoder/hg_cfg.h
@@ -7,7 +7,9 @@ class Hypergraph;
 
 // in case you might want the CFG whether or not you apply FSA models:
 struct HgCFG {
-  HgCFG(Hypergraph const& ih) : ih(ih) { have_cfg=binarized=false;have_features=false; }
+  HgCFG(Hypergraph const& ih) : ih(ih) { // stores a reference to ih, not a copy
+    have_cfg=binarized=have_features=uniqed=false; // every status flag starts cleared
+  }
   Hypergraph const& ih;
   CFG cfg;
   bool have_cfg;
@@ -17,6 +19,7 @@ struct HgCFG {
     to.Init(ih,true,want_features,true);
   }
   bool binarized;
+  bool uniqed;
   CFG &GetCFG()
   {
     if (!have_cfg) {
diff --git a/decoder/hg_test.h b/decoder/hg_test.h
index c1bc05bd..3da6533c 100755
--- a/decoder/hg_test.h
+++ b/decoder/hg_test.h
@@ -8,20 +8,24 @@
 #include <gtest/gtest.h>
 
 using namespace std;
-
+using namespace testing;
 #pragma GCC diagnostic ignored "-Wunused-variable"
 
 namespace {
 
-char const* small_json="small.json.gz";
+typedef char const* Name;
 
-char const* perro_json="perro.json.gz";
-char const* perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1";
+Name urdu_json="urdu.json.gz";
+Name urdu_wts="Arity_0 1.70741473606976 Arity_1 1.12426238048012 Arity_2 1.14986187839554 Glue -0.04589037041388 LanguageModel 1.09051 PassThrough -3.66226367902928 PhraseModel_0 -1.94633451863252 PhraseModel_1 -0.1475347695476 PhraseModel_2 -1.614818994946 WordPenalty -3.0 WordPenaltyFsa -0.56028442964748 ShorterThanPrev -10 LongerThanPrev -10";
+Name small_json="small.json.gz";
+Name small_wts="Model_0 -2 Model_1 -.5 Model_2 -1.1 Model_3 -1 Model_4 -1 Model_5 .5 Model_6 .2 Model_7 -.3";
+Name perro_json="perro.json.gz";
+Name perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1";
 
 }
 
 // you can inherit from this or just use the static methods
-struct HGSetup : public testing::Test {
+struct HGSetup : public Test {
   enum {
     HG,
     HG_int,
@@ -52,7 +56,7 @@ struct HGSetup : public testing::Test {
 };
 
 namespace {
-char const* HGjsons[]= {
+Name HGjsons[]= {
   "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}",
 "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}",
   "{\"rules\":[1,\"[X] ||| <s>\",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}",
author	graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-15 07:39:01 +0000
committer	graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-15 07:39:01 +0000
commit	80c952989dfbc05b482a7a8265d0ca73079ee894 (patch)
tree	e1306f8c49006d34ab6418d8678c051e04358ef9
parent	214c741eee9b01ccd05a1825ad9ed647adef41db (diff)