summary refs log tree commit diff
path: root/decoder
diff options
context:
space:
mode:
author graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-15 07:39:01 +0000
committer graehl@gmail.com <graehl@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-15 07:39:01 +0000
commit 80c952989dfbc05b482a7a8265d0ca73079ee894 (patch)
tree e1306f8c49006d34ab6418d8678c051e04358ef9 /decoder
parent 214c741eee9b01ccd05a1825ad9ed647adef41db (diff)
really fixed binarization. test
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@555 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder')
-rw-r--r-- decoder/cdec.cc 2
-rwxr-xr-x decoder/cfg.cc 70
-rwxr-xr-x decoder/cfg.h 26
-rwxr-xr-x decoder/cfg_binarize.h 5
-rwxr-xr-x decoder/cfg_format.h 10
-rwxr-xr-x decoder/cfg_options.h 38
-rwxr-xr-x decoder/cfg_test.cc 80
-rwxr-xr-x decoder/hg_cfg.h 5
-rwxr-xr-x decoder/hg_test.h 16
9 files changed, 166 insertions, 86 deletions
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index 5898b245..0a02801e 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -670,7 +670,7 @@ int main(int argc, char** argv) {
maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen);
HgCFG hgcfg(forest);
- cfg_options.maybe_output(hgcfg);
+ cfg_options.prepare(hgcfg);
if (!fsa_ffs.empty()) {
Timer t("Target FSA rescoring:");
if (!has_late_models)
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index c0598f16..be07c2c5 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -8,6 +8,12 @@
#include "fast_lexical_cast.hpp"
//#include "indices_after.h"
+#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x)
+#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s)
+#define CFGSHOW(x) CFGSHOWC(x,"\n")
+#define CFGSHOWS(x) CFGSHOWC(x," ")
+#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y)
+
using namespace std;
typedef CFG::Rule Rule;
@@ -108,7 +114,7 @@ struct prob_pos {
};
}//ns
-void CFG::UniqRules(NTHandle ni) {
+int CFG::UniqRules(NTHandle ni) {
typedef HASH_MAP<RHS,prob_pos,boost::hash<RHS> > BestRHS; // faster to use trie? maybe.
BestRHS bestp; // once inserted, the position part (output index) never changes. but the prob may be improved (overwrite ruleid at that position).
HASH_MAP_EMPTY(bestp,null_rhs);
@@ -129,6 +135,7 @@ void CFG::UniqRules(NTHandle ni) {
}
// post: oi = number of new adj
adj.resize(oi);
+ return oi;
}
void CFG::SortLocalBestFirst(NTHandle ni) {
@@ -145,10 +152,12 @@ namespace {
CFG::BinRhs null_bin_rhs(std::numeric_limits<int>::min(),std::numeric_limits<int>::min());
// index i >= N.size()? then it's in M[i-N.size()]
-WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
+//WordID first,WordID second,
+string BinStr(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
{
int nn=N.size();
ostringstream o;
+#undef BinNameOWORD
#define BinNameOWORD(w) \
do { \
int n=w; if (n>0) o << TD::Convert(n); \
@@ -161,8 +170,12 @@ WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
BinNameOWORD(b.first);
o<<'+';
BinNameOWORD(b.second);
-#undef BinNameOWORD
- return TD::Convert(o.str());
+ return o.str();
+}
+
+WordID BinName(CFG::BinRhs const& b,CFG::NTs const& N,CFG::NTs const& M)
+{
+ return TD::Convert(BinStr(b,N,M));
}
}//ns
@@ -177,33 +190,44 @@ void CFG::Binarize(CFGBinarize const& b) {
cerr << "Binarizing "<<b<<endl;
HASH_MAP<BinRhs,NTHandle,boost::hash<BinRhs> > bin2lhs; // we're going to hash cons rather than build an explicit trie from right to left.
HASH_MAP_EMPTY(bin2lhs,null_bin_rhs);
- int rhsmin=b.bin_unary?0:1;
// iterate using indices and not iterators because we'll be adding to both nts and rules list? we could instead pessimistically reserve space for both, but this is simpler. also: store original end of nts since we won't need to reprocess newly added ones.
+ int rhsmin=b.bin_unary?0:1;
NTs new_nts; // these will be appended at the end, so we don't have to worry about iterator invalidation
Rules new_rules;
//TODO: this could be factored easily into in-place (append to new_* like below) and functional (nondestructive copy) versions (copy orig to target and append to target)
- int newnt=-nts.size();
+ int newnt=-nts.size(); // we're going to store binary rhs with -nt to keep distinct from words (>=0)
int newruleid=rules.size();
BinRhs bin;
for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
NT const& nt=*n;
for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {
+ CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n");
RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_...
if (rhs.empty()) continue;
- bin.second=rhs.back();
- for (int r=rhs.size()-2;r>=rhsmin;--r) { // pairs from right to left (normally we leave the last pair alone)
- bin.first=rhs[r];
- if (newnt==(bin.second=(get_default(bin2lhs,bin,newnt)))) {
- new_nts.push_back(NT(newruleid));
- new_rules.push_back(Rule(-newnt,bin));
- ++newruleid;
- if (b.bin_name_nts)
- new_nts.back().from.nt=BinName(bin,nts,new_nts);
- --newnt;
+ int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT
+ if (rhsmin<=r) { // means r>=0 also
+ bin.second=rhs[r+1];
+ int bin_to; // the replacement for bin
+ assert(newruleid==rules.size()+new_rules.size());assert(-newnt==nts.size()+new_nts.size());
+ // also true at start/end of loop:
+ for (;;) { // pairs from right to left (normally we leave the last pair alone)
+
+ bin.first=rhs[r];
+ bin_to=get_default(bin2lhs,bin,newnt);
+ CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to);
+ if (newnt==bin_to) { // it's new!
+ new_nts.push_back(NT(newruleid++));
+ //now -newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs
+ new_rules.push_back(Rule(-newnt,bin));
+ --newnt;
+ if (b.bin_name_nts)
+ new_nts.back().from.nt=BinName(bin,nts,new_nts);
+ }
+ bin.second=bin_to;
+ --r;
+ if (r<rhsmin) break;
}
- }
- if (rhsmin<rhs.size()) {
- rhs[rhsmin]=bin.second;
+ rhs[rhsmin]=bin_to;
rhs.resize(rhsmin+1);
}
}
@@ -246,9 +270,7 @@ void CFG::Init(Hypergraph const& hg,bool target_side,bool copy_features,bool pus
prob_t &crp=cfgr.p;
crp=e.edge_prob_;
cfgr.lhs=e.head_node_;
-#if CFG_DEBUG
- cfgr.rule=e.rule_;
-#endif
+ IF_CFG_TRULE(cfgr.rule=e.rule_;)
if (copy_features) cfgr.f=e.feature_values_;
if (push_weights) crp /=np[e.head_node_];
TRule const& er=*e.rule_;
@@ -287,9 +309,7 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {
f.print_lhs(o,*this,r.lhs);
f.print_rhs(o,*this,r.rhs.begin(),r.rhs.end());
f.print_features(o,r.p,r.f);
-#if CFG_DEBUG
- if (r.rule) o<<f.partsep<<*r.rule;
-#endif
+ IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;)
}
void CFG::Print(std::ostream &o,CFGFormat const& f) const {
diff --git a/decoder/cfg.h b/decoder/cfg.h
index b6dd6d99..c07a6901 100755
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
@@ -5,12 +5,22 @@
#ifndef CFG_DEBUG
# define CFG_DEBUG 0
#endif
+#ifndef CFG_KEEP_TRULE
+# define CFG_KEEP_TRULE 0
+#endif
+
#if CFG_DEBUG
-# define IF_CFG_DEBUG(x) x
+# define IF_CFG_DEBUG(x) x;
#else
# define IF_CFG_DEBUG(x)
#endif
+#if CFG_KEEP_TRULE
+# define IF_CFG_TRULE(x) x;
+#else
+# define IF_CFG_TRULE(x)
+#endif
+
/* for target FSA intersection, we want to produce a simple (feature weighted) CFG using the target projection of a hg. this is essentially isomorphic to the hypergraph, and we're copying part of the rule info (we'll maintain a pointer to the original hg edge for posterity/debugging; and perhaps avoid making a copy of the feature vector). but we may also want to support CFG read from text files (w/ features), without needing to have a backing hypergraph. so hg pointer may be null? multiple types of CFG? always copy the feature vector? especially if we choose to binarize, we won't want to rely on 1:1 alignment w/ hg
question: how much does making a copy (essentially) of hg simplify things? is the space used worth it? is the node in/out edges index really that much of a waste? is the use of indices that annoying?
@@ -76,7 +86,7 @@ struct CFG {
// for binarizing - no costs/probs
Rule() : lhs(-1) { }
bool is_null() const { return lhs<0; }
- void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_DEBUG(rule.reset();) }
+ void set_null() { lhs=-1; rhs.clear();f.clear(); IF_CFG_TRULE(rule.reset();) }
Rule(int lhs,BinRhs const& binrhs) : lhs(lhs),rhs(2),p(1) {
rhs[0]=binrhs.first;
@@ -87,14 +97,14 @@ struct CFG {
RHS rhs;
prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top)
FeatureVector f; // may be empty, unless copy_features on Init
- IF_CFG_DEBUG(TRulePtr rule;)
+ IF_CFG_TRULE(TRulePtr rule;)
void Swap(Rule &o) {
using namespace std;
swap(lhs,o.lhs);
swap(rhs,o.rhs);
swap(p,o.p);
swap(f,o.f);
- IF_CFG_DEBUG(swap(rule,o.rule);)
+ IF_CFG_TRULE(swap(rule,o.rule);)
}
template<class V>
void visit_rhs_nts(V &v) const {
@@ -171,9 +181,11 @@ struct CFG {
bool Empty() const { return nts.empty(); }
void UnindexRules(); // save some space?
void ReindexRules(); // scan over rules and rebuild NT::ruleids (e.g. after using UniqRules)
- void UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids.
- inline void UniqRules() {
- for (int i=0,e=nts.size();i!=e;++i) UniqRules(i);
+ int UniqRules(NTHandle ni); // keep only the highest prob rule for each rhs and lhs=nt - doesn't remove from Rules; just removes from nts[ni].ruleids. keeps the same order in this sense: for a given signature (rhs), that signature's first representative in the old ruleids will become the new position of the best. as a consequence, if you SortLocalBestFirst() then UniqRules(), the result is still best first. but you may also call this on unsorted ruleids. returns number of rules kept
+ inline int UniqRules() {
+ int nkept=0;
+ for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i);
+ return nkept;
}
void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest. when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this. stable in case of ties
diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h
index c5303622..82c4dd1a 100755
--- a/decoder/cfg_binarize.h
+++ b/decoder/cfg_binarize.h
@@ -18,7 +18,6 @@ struct CFGBinarize {
bool bin_l2r;
bool bin_unary;
bool bin_name_nts;
- bool bin_uniq;
bool bin_topo;
template <class Opts> // template to support both printable_opts and boost nonprintable
void AddOptions(Opts *opts) {
@@ -27,7 +26,6 @@ struct CFGBinarize {
("cfg_binarize_unary", defaulted_value(&bin_unary),"if true, a rule-completing production A->BC may be binarized as A->U U->BC if U->BC would be used at least cfg_binarize_at times.")
("cfg_binarize_l2r", defaulted_value(&bin_l2r),"force left to right (a (b (c d))) binarization (ignore _at threshold)")
("cfg_binarize_name_nts", defaulted_value(&bin_name_nts),"create named virtual NT tokens e.g. 'A12+the' when binarizing 'B->[A12] the cat'")
- ("cfg_binarize_uniq", defaulted_value(&bin_uniq),"in case of duplicate rules, keep only the one with highest prob")
("cfg_binarize_topo", defaulted_value(&bin_topo),"reorder nonterminals after binarization to maintain definition before use (topological order). otherwise the virtual NTs will all appear after the regular NTs")
;
}
@@ -45,7 +43,6 @@ struct CFGBinarize {
}
void set_defaults() {
bin_topo=false;
- bin_uniq=true;
bin_at=0;
bin_unary=false;
bin_name_nts=true;
@@ -65,6 +62,8 @@ struct CFGBinarize {
o << "greedy count>="<<bin_at;
if (bin_name_nts)
o << " named-NTs";
+ if (bin_topo)
+ o<<" preserve-topo-order";
}
o<<')';
}
diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h
index a9b3fd9f..c6a594b8 100755
--- a/decoder/cfg_format.h
+++ b/decoder/cfg_format.h
@@ -11,7 +11,7 @@ struct CFGFormat {
bool identity_scfg;
bool features;
bool logprob_feat;
- bool cfg_comma_nt;
+ bool comma_nt;
bool nt_span;
std::string goal_nt_name;
std::string nt_prefix;
@@ -27,7 +27,7 @@ struct CFGFormat {
("features",defaulted_value(&features),"print the CFG feature vector")
("logprob_feat",defaulted_value(&logprob_feat),"print a LogProb=-1.5 feature irrespective of --features.")
("logprob_feat_name",defaulted_value(&logprob_feat_name),"alternate name for the LogProb feature")
- ("cfg_comma_nt",defaulted_value(&cfg_comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side")
+ ("cfg_comma_nt",defaulted_value(&comma_nt),"if false, omit the usual [NP,1] ',1' variable index in the source side")
("goal_nt_name",defaulted_value(&goal_nt_name),"if nonempty, the first production will be '[goal_nt_name] ||| [x123] ||| LogProb=y' where x123 is the actual goal nt, and y is the pushed prob, if any")
("nt_prefix",defaulted_value(&nt_prefix),"NTs are [<nt_prefix>123] where 123 is the node number starting at 0, and the highest node (last in file) is the goal node in an acyclic hypergraph")
("nt_span",defaulted_value(&nt_span),"prefix A(i,j) for NT coming from hypergraph node with category A on span [i,j). this is after --nt_prefix if any")
@@ -44,7 +44,7 @@ struct CFGFormat {
o<<logprob_feat_name<<"(logprob) ";
if (nt_span)
o<<"named-NTs ";
- if (cfg_comma_nt)
+ if (comma_nt)
o<<",N ";
o << "CFG output format";
o<<"]";
@@ -58,7 +58,7 @@ struct CFGFormat {
void print_source_nt(std::ostream &o,CFG const&cfg,int id,int position=1) const {
o<<'[';
print_nt_name(o,cfg,id);
- if (cfg_comma_nt) o<<','<<position;
+ if (comma_nt) o<<','<<position;
o<<']';
}
@@ -116,7 +116,7 @@ struct CFGFormat {
identity_scfg=false;
features=true;
logprob_feat=true;
- cfg_comma_nt=true;
+ comma_nt=true;
goal_nt_name="S";
logprob_feat_name="LogProb";
nt_prefix="";
diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h
index 5dca168e..331363d2 100755
--- a/decoder/cfg_options.h
+++ b/decoder/cfg_options.h
@@ -10,11 +10,14 @@ struct CFGOptions {
CFGFormat format;
CFGBinarize binarize;
std::string out,source_out,unbin_out;
+ bool uniq;
void set_defaults() {
format.set_defaults();
binarize.set_defaults();
- out="";
+ out=source_out=unbin_out="";
+ uniq=false;
}
+
CFGOptions() { set_defaults(); }
template <class Opts> // template to support both printable_opts and boost nonprintable
void AddOptions(Opts *opts) {
@@ -22,6 +25,8 @@ struct CFGOptions {
("cfg_output", defaulted_value(&out),"write final target CFG (before FSA rescoring) to this file")
("source_cfg_output", defaulted_value(&source_out),"write source CFG (after prelm-scoring, prelm-prune) to this file")
("cfg_unbin_output", defaulted_value(&unbin_out),"write pre-binarization CFG to this file") //TODO:
+ ("cfg_uniq", defaulted_value(&uniq),"in case of duplicate rules, keep only the one with highest prob")
+
;
binarize.AddOptions(opts);
format.AddOptions(opts);
@@ -29,10 +34,6 @@ struct CFGOptions {
void Validate() {
format.Validate();
binarize.Validate();
-// if (out.empty()) binarize.bin_name_nts=false;
- }
- char const* description() const {
- return "CFG output options";
}
void maybe_output_source(Hypergraph const& hg) {
if (source_out.empty()) return;
@@ -41,24 +42,33 @@ struct CFGOptions {
CFG cfg(hg,false,format.features,format.goal_nt());
cfg.Print(o.get(),format);
}
- void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") {
- WriteFile o(cfg_output);
- std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n';
- cfg.Print(o.get(),format);
- }
-
- void maybe_output(HgCFG &hgcfg) {
+ // executes all options except source_cfg_output, building target hgcfg: print unbinarized, uniq, binarize, print final. returns without doing anything when neither out nor unbin_out is set
+ void prepare(HgCFG &hgcfg) {
if (out.empty() && unbin_out.empty()) return;
CFG &cfg=hgcfg.GetCFG();
maybe_print(cfg,unbin_out);
+ maybe_uniq(hgcfg);
maybe_binarize(hgcfg);
maybe_print(cfg,out,"");
}
+ char const* description() const {
+ return "CFG output options";
+ }
+ void maybe_print(CFG &cfg,std::string cfg_output,char const* desc=" unbinarized") { // prints cfg in the configured format to the file named cfg_output; desc only labels the log line. no-op when cfg_output is empty
+ if (cfg_output.empty()) return; WriteFile o(cfg_output); // prepare() calls this for both unbin_out and out, and only bails out early when both are empty, so one of them can be "" here
+ std::cerr<<"Printing target"<<desc<<" CFG to "<<cfg_output<<": "<<format<<'\n';
+ cfg.Print(o.get(),format);
+ }
+
+ void maybe_uniq(HgCFG &hgcfg) { // runs UniqRules() on the target CFG at most once per HgCFG; hgcfg.uniqed records that it was done
+ if (hgcfg.uniqed) return; // NOTE(review): the uniq member (cfg_uniq option) is not checked here - confirm whether callers are expected to gate on it
+ hgcfg.GetCFG().UniqRules();
+ hgcfg.uniqed=true;
+ }
void maybe_binarize(HgCFG &hgcfg) {
if (hgcfg.binarized) return;
- CFG &cfg=hgcfg.GetCFG();
- cfg.Binarize(binarize);
+ hgcfg.GetCFG().Binarize(binarize);
hgcfg.binarized=true;
}
};
diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc
index 81efa768..cde4706c 100755
--- a/decoder/cfg_test.cc
+++ b/decoder/cfg_test.cc
@@ -1,23 +1,34 @@
+#include <boost/tuple/tuple.hpp>
#include <gtest/gtest.h>
#include "cfg.h"
#include "hg_test.h"
#include "cfg_options.h"
-#define CSHOW_V 1
+/* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg. or you can define CSHOW_V and see lots of output */
+
+using namespace boost;
+
+#define CSHOW_V 0
+
#if CSHOW_V
-# define CSHOWDO(x) x
+# define CSHOWDO(x) x;
#else
# define CSHOWDO(x)
#endif
#define CSHOW(x) CSHOWDO(cerr<<#x<<'='<<x<<endl;)
-struct CFGTest : public HGSetup {
- CFGTest() { }
- ~CFGTest() { }
- static void JsonFN(Hypergraph hg,CFG &cfg,std::string file
+typedef std::pair<string,string> HgW; // hg file,weights
+
+struct CFGTest : public TestWithParam<HgW> {
+ string hgfile;
+ Hypergraph hg;
+ CFG cfg;
+ CFGFormat form;
+ FeatureVector weights;
+
+ static void JsonFN(Hypergraph &hg,CFG &cfg,FeatureVector &featw,std::string file
,std::string const& wts="Model_0 1 EgivenF 1 f1 1")
{
- FeatureVector featw;
istringstream ws(wts);
EXPECT_TRUE(ws>>featw);
CSHOW(featw)
@@ -25,35 +36,56 @@ struct CFGTest : public HGSetup {
hg.Reweight(featw);
cfg.Init(hg,true,true,false);
}
-
static void SetUpTestCase() {
}
static void TearDownTestCase() {
}
+ CFGTest() {
+ hgfile=GetParam().first;
+ JsonFN(hg,cfg,weights,hgfile,GetParam().second);
+ CSHOWDO(cerr<<"\nCFG Test: ")
+ CSHOW(hgfile);
+ form.nt_span=true;
+ form.comma_nt=false;
+ }
+ ~CFGTest() { }
};
-TEST_F(CFGTest,Binarize) {
- Hypergraph hg;
- CFG cfg;
- JsonFN(hg,cfg,perro_json,perro_wts);
- CSHOW("\nCFG Test.\n");
+TEST_P(CFGTest,Binarize) { // binarizes copies of the fixture cfg (as loaded, and after UniqRules) under each option combination; output is only inspected when CSHOW_V is on
CFGBinarize b;
- CFGFormat form;
- form.nt_span=true;
- for (int i=-1;i<16;++i) {
- b.bin_l2r=i>=0;
- b.bin_unary=i&1;
- b.bin_name_nts=i&2;
- b.bin_uniq=i&4;
- b.bin_topo=i&8;
- CFG cc=cfg;
- EXPECT_EQ(cc,cfg);
- CSHOW("\nBinarizing: "<<b);
+ b.bin_name_nts=1;
+ CFG cfgu=cfg;
+ EXPECT_EQ(cfgu,cfg);
+ int nrules=cfg.rules.size();
+ CSHOWDO(cerr<<"\nUniqing: "<<nrules<<"\n");
+ int nrem=cfgu.UniqRules();
+ cerr<<"\nCFG "<<hgfile<<" Uniqed - remaining: "<<nrem<<" of "<<nrules<<"\n";
+ if (nrem==nrules)
+ EXPECT_EQ(cfgu,cfg);
+ for (int i=-1;i<8;++i) {
+ bool uniq=false; // the i==-1 pass never assigns uniq below, yet it is read after the branch; start from a defined value (the non-uniqued cfg)
+ if (i>=0) {
+ int f=i<<1; // pre-shifted so that each (f>>=1)&1 below yields the next bit of i: bit0=unary, bit1=topo, bit2=uniq
+ b.bin_l2r=1;
+ b.bin_unary=(f>>=1)&1;
+ b.bin_topo=(f>>=1)&1;
+ uniq=(f>>=1)&1;
+ } else
+ b.bin_l2r=0;
+ CFG cc=uniq?cfgu:cfg;
+ CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "<<i<<" "<<b);
cc.Binarize(b);
CSHOWDO(cc.Print(cerr,form);cerr<<"\n\n";);
}
}
+INSTANTIATE_TEST_CASE_P(HypergraphsWeights,CFGTest,
+ Values(
+ HgW(perro_json,perro_wts)
+ , HgW(small_json,small_wts)
+ ,HgW(urdu_json,urdu_wts)
+ ));
+
int main(int argc, char **argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h
index 64a0e767..ba936990 100755
--- a/decoder/hg_cfg.h
+++ b/decoder/hg_cfg.h
@@ -7,7 +7,9 @@ class Hypergraph;
// in case you might want the CFG whether or not you apply FSA models:
struct HgCFG {
- HgCFG(Hypergraph const& ih) : ih(ih) { have_cfg=binarized=false;have_features=false; }
+ HgCFG(Hypergraph const& ih) : ih(ih) {
+ have_cfg=binarized=have_features=uniqed=false;
+ }
Hypergraph const& ih;
CFG cfg;
bool have_cfg;
@@ -17,6 +19,7 @@ struct HgCFG {
to.Init(ih,true,want_features,true);
}
bool binarized;
+ bool uniqed;
CFG &GetCFG()
{
if (!have_cfg) {
diff --git a/decoder/hg_test.h b/decoder/hg_test.h
index c1bc05bd..3da6533c 100755
--- a/decoder/hg_test.h
+++ b/decoder/hg_test.h
@@ -8,20 +8,24 @@
#include <gtest/gtest.h>
using namespace std;
-
+using namespace testing;
#pragma GCC diagnostic ignored "-Wunused-variable"
namespace {
-char const* small_json="small.json.gz";
+typedef char const* Name;
-char const* perro_json="perro.json.gz";
-char const* perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1";
+Name urdu_json="urdu.json.gz";
+Name urdu_wts="Arity_0 1.70741473606976 Arity_1 1.12426238048012 Arity_2 1.14986187839554 Glue -0.04589037041388 LanguageModel 1.09051 PassThrough -3.66226367902928 PhraseModel_0 -1.94633451863252 PhraseModel_1 -0.1475347695476 PhraseModel_2 -1.614818994946 WordPenalty -3.0 WordPenaltyFsa -0.56028442964748 ShorterThanPrev -10 LongerThanPrev -10";
+Name small_json="small.json.gz";
+Name small_wts="Model_0 -2 Model_1 -.5 Model_2 -1.1 Model_3 -1 Model_4 -1 Model_5 .5 Model_6 .2 Model_7 -.3";
+Name perro_json="perro.json.gz";
+Name perro_wts="SameFirstLetter 1 LongerThanPrev 1 ShorterThanPrev 1 GlueTop 0.0 Glue -1.0 EgivenF -0.5 FgivenE -0.5 LexEgivenF -0.5 LexFgivenE -0.5 LM 1";
}
// you can inherit from this or just use the static methods
-struct HGSetup : public testing::Test {
+struct HGSetup : public Test {
enum {
HG,
HG_int,
@@ -52,7 +56,7 @@ struct HGSetup : public testing::Test {
};
namespace {
-char const* HGjsons[]= {
+Name HGjsons[]= {
"{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}",
"{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| b\",3,\"[X] ||| a [1]\",4,\"[X] ||| [1] b\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,0.1],\"rule\":1},{\"tail\":[],\"feats\":[0,0.1],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X\"},\"edges\":[{\"tail\":[0],\"feats\":[0,0.3],\"rule\":3},{\"tail\":[0],\"feats\":[0,0.2],\"rule\":4}],\"node\":{\"in_edges\":[2,3],\"cat\":\"Goal\"}}",
"{\"rules\":[1,\"[X] ||| <s>\",2,\"[X] ||| X [1]\",3,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[0,-2,1,-99],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.5,1,-0.8],\"rule\":2},{\"tail\":[0],\"feats\":[0,-0.7,1,-0.9],\"rule\":3}],\"node\":{\"in_edges\":[1,2]}}",