summaryrefslogtreecommitdiff
path: root/decoder
diff options
context:
space:
mode:
Diffstat (limited to 'decoder')
-rwxr-xr-xdecoder/cfg.cc59
-rwxr-xr-xdecoder/cfg.h16
-rwxr-xr-xdecoder/cfg_test.cc7
-rw-r--r--decoder/test_data/perro.json.gzbin0 -> 608 bytes
-rw-r--r--decoder/test_data/urdu.json.gzbin0 -> 253497 bytes
5 files changed, 57 insertions, 25 deletions
diff --git a/decoder/cfg.cc b/decoder/cfg.cc
index be07c2c5..3076c75e 100755
--- a/decoder/cfg.cc
+++ b/decoder/cfg.cc
@@ -7,12 +7,12 @@
#include <limits>
#include "fast_lexical_cast.hpp"
//#include "indices_after.h"
+#include "show.h"
+
+#define DUNIQ(x) x
+#define DBIN(x)
+#define DCFG(x) IF_CFG_DEBUG(x)
-#define CFGPRINT(x) IF_CFG_DEBUG(std::cerr<<x)
-#define CFGSHOWC(x,s) CFGPRINT(#x<<"="<<x<<s)
-#define CFGSHOW(x) CFGSHOWC(x,"\n")
-#define CFGSHOWS(x) CFGSHOWC(x," ")
-#define CFGSHOW2(x,y) CFGSHOWS(x) CFGSHOW(y)
using namespace std;
@@ -38,7 +38,6 @@ namespace {
typedef std::vector<char> Seen; // 0 = unseen, 1 = seen+finished, 2 = open (for cycle detection; seen but not finished)
enum { UNSEEN=0,SEEN,OPEN };
-
// bottom -> top topo order (rev head->tails topo)
template <class OutOrder>
struct CFGTopo {
@@ -120,22 +119,28 @@ int CFG::UniqRules(NTHandle ni) {
HASH_MAP_EMPTY(bestp,null_rhs);
Ruleids &adj=nts[ni].ruleids;
Ruleids oldadj=adj;
- int oi=0;
+ int newpos=0;
for (int i=0,e=oldadj.size();i!=e;++i) { // this beautiful complexity is to ensure that adj' is a subsequence of adj (without duplicates)
int ri=oldadj[i];
Rule const& r=rules[ri];
- prob_pos pi(r.p,oi);
+ prob_pos pi(r.p,newpos);
prob_pos &oldpi=get_default(bestp,r.rhs,pi);
- if (oldpi.pos==oi) {// newly inserted
- adj[oi++]=ri;
- } else if (oldpi.prob<pi.prob) { // we improve prev. best (overwrite it @old pos)
- oldpi.prob=pi.prob;
- adj[oldpi.pos]=ri; // replace worse rule w/ better
+ if (oldpi.pos==newpos) {// newly inserted
+ adj[newpos++]=ri;
+ } else {
+ SHOWP(DUNIQ,"Uniq duplicate: ") SHOW4(DUNIQ,oldpi.prob,pi.prob,oldpi.pos,newpos);
+ SHOW(DUNIQ,ShowRule(ri));
+ SHOW(DUNIQ,ShowRule(adj[oldpi.pos]));
+ if (oldpi.prob<pi.prob) { // we improve prev. best (overwrite it @old pos)
+ oldpi.prob=pi.prob;
+ adj[oldpi.pos]=ri; // replace worse rule w/ better
+ }
}
+
}
- // post: oi = number of new adj
- adj.resize(oi);
- return oi;
+ // post: newpos = number of new adj
+ adj.resize(newpos);
+ return newpos;
}
void CFG::SortLocalBestFirst(NTHandle ni) {
@@ -201,8 +206,9 @@ void CFG::Binarize(CFGBinarize const& b) {
for (NTs::const_iterator n=nts.begin(),nn=nts.end();n!=nn;++n) {
NT const& nt=*n;
for (Ruleids::const_iterator ir=nt.ruleids.begin(),er=nt.ruleids.end();ir!=er;++ir) {
- CFGPRINT("Rule id# ") CFGSHOWS(*ir);IF_CFG_DEBUG(PrintRule(cerr<<" '",*ir,CFGFormat());cerr<<"'\n");
- RHS &rhs=rules[*ir].rhs; // we're going to binarize this while adding newly created rules to new_...
+ RuleHandle ruleid=*ir;
+ SHOW2(DBIN,ruleid,ShowRule(ruleid))
+ RHS &rhs=rules[ruleid].rhs; // we're going to binarize this while adding newly created rules to new_...
if (rhs.empty()) continue;
int r=rhs.size()-2; // loop below: [r,r+1) is to be reduced into a (maybe new) binary NT
if (rhsmin<=r) { // means r>=0 also
@@ -214,7 +220,7 @@ void CFG::Binarize(CFGBinarize const& b) {
bin.first=rhs[r];
bin_to=get_default(bin2lhs,bin,newnt);
- CFGSHOWS(r) CFGSHOWS(newnt) CFGPRINT("bin="<<BinStr(bin,nts,new_nts)<<"=>") CFGSHOW(bin_to);
+ SHOW(DBIN,r) SHOW(DBIN,newnt) SHOWP(DBIN,"bin="<<BinStr(bin,nts,new_nts)<<"=>") SHOW(DBIN,bin_to);
if (newnt==bin_to) { // it's new!
new_nts.push_back(NT(newruleid++));
//now -newnt is the index of the last (after new_nts is appended) nt. bin is its rhs. bin_to is its lhs
@@ -232,7 +238,7 @@ void CFG::Binarize(CFGBinarize const& b) {
}
}
}
-#if 0
+#if 1
batched_append_swap(nts,new_nts);
batched_append_swap(rules,new_rules);
#else
@@ -304,6 +310,10 @@ void CFG::Clear() {
hg_=0;
}
+namespace {
+CFGFormat form;
+}
+
void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {
Rule const& r=rules[rulei];
f.print_lhs(o,*this,r.lhs);
@@ -311,6 +321,12 @@ void CFG::PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& f) const {
f.print_features(o,r.p,r.f);
IF_CFG_TRULE(if (r.rule) o<<f.partsep<<*r.rule;)
}
+void CFG::PrintRule(std::ostream &o,RuleHandle rulei) const {
+ PrintRule(o,rulei,form);
+}
+string CFG::ShowRule(RuleHandle i) const {
+ ostringstream o;PrintRule(o,i);return o.str();
+}
void CFG::Print(std::ostream &o,CFGFormat const& f) const {
assert(!uninit);
@@ -332,10 +348,9 @@ void CFG::Print(std::ostream &o,CFGFormat const& f) const {
}
void CFG::Print(std::ostream &o) const {
- Print(o,CFGFormat());
+ Print(o,form);
}
-
std::ostream &operator<<(std::ostream &o,CFG const &x) {
x.Print(o);
return o;
diff --git a/decoder/cfg.h b/decoder/cfg.h
index c07a6901..90a5dd5a 100755
--- a/decoder/cfg.h
+++ b/decoder/cfg.h
@@ -3,7 +3,7 @@
// for now, debug means remembering and printing the TRule behind each CFG rule
#ifndef CFG_DEBUG
-# define CFG_DEBUG 0
+# define CFG_DEBUG 1
#endif
#ifndef CFG_KEEP_TRULE
# define CFG_KEEP_TRULE 0
@@ -98,6 +98,9 @@ struct CFG {
prob_t p; // h unused for now (there's nothing admissable, and p is already using 1st pass inside as pushed toward top)
FeatureVector f; // may be empty, unless copy_features on Init
IF_CFG_TRULE(TRulePtr rule;)
+ int size() const { // for stats only
+ return rhs.size();
+ }
void Swap(Rule &o) {
using namespace std;
swap(lhs,o.lhs);
@@ -187,6 +190,13 @@ struct CFG {
for (int i=0,e=nts.size();i!=e;++i) nkept+=UniqRules(i);
return nkept;
}
+ int rules_size() const {
+ const int sz=rules.size();
+ int sum=sz;
+ for (int i=0;i<sz;++i)
+ sum+=rules[i].size();
+ return sum;
+ }
void SortLocalBestFirst(NTHandle ni); // post: nts[ni].ruleids lists rules from highest p to lowest. when doing best-first earley intersection/parsing, you don't want to use the global marginal viterbi; you want to ignore outside in ordering edges for a node, so call this. stable in case of ties
inline void SortLocalBestFirst() {
@@ -194,8 +204,10 @@ struct CFG {
}
void Init(Hypergraph const& hg,bool target_side=true,bool copy_features=false,bool push_weights=true);
void Print(std::ostream &o,CFGFormat const& format) const; // see cfg_format.h
- void PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& format) const;
void Print(std::ostream &o) const; // default format
+ void PrintRule(std::ostream &o,RuleHandle rulei,CFGFormat const& format) const;
+ void PrintRule(std::ostream &o,RuleHandle rulei) const;
+ std::string ShowRule(RuleHandle rulei) const;
void Swap(CFG &o) { // make sure this includes all fields (easier to see here than in .cc)
using namespace std;
swap(uninit,o.uninit);
diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc
index cde4706c..c61f9f2c 100755
--- a/decoder/cfg_test.cc
+++ b/decoder/cfg_test.cc
@@ -3,6 +3,7 @@
#include "cfg.h"
#include "hg_test.h"
#include "cfg_options.h"
+#include "show.h"
/* TODO: easiest way to get meaningful confirmations that things work: implement conversion back to hg, and compare viterbi/inside etc. stats for equality to original hg. or you can define CSHOW_V and see lots of output */
@@ -60,8 +61,11 @@ TEST_P(CFGTest,Binarize) {
CSHOWDO(cerr<<"\nUniqing: "<<nrules<<"\n");
int nrem=cfgu.UniqRules();
cerr<<"\nCFG "<<hgfile<<" Uniqed - remaining: "<<nrem<<" of "<<nrules<<"\n";
- if (nrem==nrules)
+ if (nrem==nrules) {
EXPECT_EQ(cfgu,cfg);
+ //TODO - check that 1best is still the same (that we removed only worse edges)
+ }
+
for (int i=-1;i<8;++i) {
bool uniq;
if (i>=0) {
@@ -75,6 +79,7 @@ TEST_P(CFGTest,Binarize) {
CFG cc=uniq?cfgu:cfg;
CSHOW("\nBinarizing "<<(uniq?"uniqued ":"")<<": "<<i<<" "<<b);
cc.Binarize(b);
+ cerr<<"Binarized "<<b<<" rules size "<<cfg.rules_size()<<" => "<<cc.rules_size()<<"\n";
CSHOWDO(cc.Print(cerr,form);cerr<<"\n\n";);
}
}
diff --git a/decoder/test_data/perro.json.gz b/decoder/test_data/perro.json.gz
new file mode 100644
index 00000000..41de5758
--- /dev/null
+++ b/decoder/test_data/perro.json.gz
Binary files differ
diff --git a/decoder/test_data/urdu.json.gz b/decoder/test_data/urdu.json.gz
new file mode 100644
index 00000000..84535402
--- /dev/null
+++ b/decoder/test_data/urdu.json.gz
Binary files differ