| author | olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 21:59:50 +0000 |
|---|---|---|
| committer | olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 21:59:50 +0000 |
| commit | c12e7241e8908def96943b1a4056e536ea91eded (patch) | |
| tree | c24b9cf0d2a90239b01eb6432e683292c95bb06f /extools/extract.cc | |
| parent | a034f92b1fe0c6368ebb140bc691f0718dd23a23 (diff) | |
Adding backoff grammar and BackoffRule feature.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@191 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/extract.cc')
| -rw-r--r-- | extools/extract.cc | 44 |
1 file changed, 41 insertions, 3 deletions
diff --git a/extools/extract.cc b/extools/extract.cc
index 6ad124d2..c2c413e2 100644
--- a/extools/extract.cc
+++ b/extools/extract.cc
@@ -173,12 +173,15 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
                                      const int max_syms,
                                      const bool permit_adjacent_nonterminals,
                                      const bool require_aligned_terminal,
-                                     RuleObserver* observer) {
+                                     RuleObserver* observer,
+                                     vector<WordID>* all_cats) {
+  const char bkoff_mrkr = '_';
   queue<RuleItem> q;  // agenda for BFS
   int max_len = -1;
   unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans;
   vector<vector<ParallelSpan> > spans_by_start(sentence.f_len);
   set<int> starts;
+  WordID bkoff;
   for (int i = 0; i < phrases.size(); ++i) {
     fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]);
     max_len = max(max_len, phrases[i].i2 - phrases[i].i1);
@@ -281,6 +284,42 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
             if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
               cur_terminal_align.push_back(make_pair(i,j));
         observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
+
+        if(!all_cats->empty()) {
+          //produce the backoff grammar if the category wordIDs are available
+          for (int i = 0; i < cur_rhs_f.size(); ++i) {
+            if(cur_rhs_f[i] < 0) {
+              //cerr << cur_rhs_f[i] << ": (cats,f) |" << TD::Convert(-cur_rhs_f[i]) << endl;
+              string nonterm = TD::Convert(-cur_rhs_f[i]);
+              nonterm+=bkoff_mrkr;
+              bkoff = -TD::Convert(nonterm);
+              cur_rhs_f[i]=bkoff;
+              vector<WordID> rhs_f_bkoff;
+              vector<WordID> rhs_e_bkoff;
+              vector<pair<short,short> > bkoff_align;
+              bkoff_align.clear();
+              bkoff_align.push_back(make_pair(0,0));
+
+              for (int cat = 0; cat < all_cats->size(); ++cat) {
+                rhs_f_bkoff.clear();
+                rhs_e_bkoff.clear();
+                rhs_f_bkoff.push_back(-(*all_cats)[cat]);
+                rhs_e_bkoff.push_back(0);
+                observer->CountRule(bkoff,rhs_f_bkoff,rhs_e_bkoff,bkoff_align);
+
+              }
+            }//else
+              //cerr << cur_rhs_f[i] << ": (words,f) |" << TD::Convert(cur_rhs_f[i]) << endl;
+          }
+          /*for (int i=0; i < cur_rhs_e.size(); ++i)
+            if(cur_rhs_e[i] <= 0)
+              cerr << cur_rhs_e[i] << ": (cats,e) |" << TD::Convert(1-cur_rhs_e[i]) << endl;
+            else
+              cerr << cur_rhs_e[i] << ": (words,e) |" << TD::Convert(cur_rhs_e[i]) << endl;
+          */
+
+          observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
+        }
       }
     }
   }
@@ -337,5 +376,4 @@ ostream& operator<<(ostream& os, const RuleStatistics& s) {
     }
   }
   return os;
-}
-
+}
\ No newline at end of file
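
The patch itself is best read in the diff above, but the backoff naming scheme it introduces can be illustrated in isolation: each source-side nonterminal X is rewritten to a backed-off symbol X_, and one unary backoff rule per known category is then reported. The sketch below is a hypothetical, self-contained toy, not the project's code: SymbolTable stands in for cdec's TD string dictionary, the printed lines stand in for calls to RuleObserver::CountRule, and the category labels NP/VP are assumed examples.

```cpp
// Illustrative sketch only (not cdec code): derive a backed-off nonterminal
// from an existing category and emit one unary backoff rule per known
// category. SymbolTable is a hypothetical stand-in for cdec's TD dictionary;
// printing stands in for RuleObserver::CountRule.
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

typedef int WordID;

struct SymbolTable {
  std::map<std::string, WordID> ids;
  std::vector<std::string> strs;
  WordID Convert(const std::string& s) {
    std::map<std::string, WordID>::const_iterator it = ids.find(s);
    if (it != ids.end()) return it->second;
    const WordID id = static_cast<WordID>(strs.size()) + 1;  // IDs start at 1
    strs.push_back(s);
    ids[s] = id;
    return id;
  }
  const std::string& Convert(WordID id) const { return strs[id - 1]; }
};

int main() {
  SymbolTable td;
  const char bkoff_mrkr = '_';  // same marker character the patch appends

  // Categories the extractor knows about (assumed example labels).
  std::vector<WordID> all_cats;
  all_cats.push_back(td.Convert("NP"));
  all_cats.push_back(td.Convert("VP"));

  // A nonterminal occurrence on the source RHS; extract.cc stores these negated.
  WordID rhs_nonterm = -td.Convert("NP");

  // Rename it to its backed-off form, e.g. NP becomes NP_.
  std::string nonterm = td.Convert(-rhs_nonterm);
  nonterm += bkoff_mrkr;
  const WordID bkoff = -td.Convert(nonterm);
  std::cout << "backed-off symbol: " << td.Convert(-bkoff) << "\n";

  // One unary backoff rule per category, e.g. NP_ -> NP, NP_ -> VP.
  for (std::size_t cat = 0; cat < all_cats.size(); ++cat)
    std::cout << "backoff rule: " << td.Convert(-bkoff) << " -> "
              << td.Convert(all_cats[cat]) << "\n";
  return 0;
}
```

In the actual change, nothing is printed: each backed-off symbol is built by appending '_' to the original nonterminal's string, and every unary rule over all_cats is passed to observer->CountRule with a single-zero target side and a dummy (0,0) alignment, alongside the re-counted original rule whose nonterminal has been replaced by its backed-off form.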