summary | refs | log | tree | commit | diff
path: root/extools/extract.cc
diff options
context:
space:
mode:
author olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-08 21:59:50 +0000
committer olivia.buzek <olivia.buzek@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-08 21:59:50 +0000
commit c12e7241e8908def96943b1a4056e536ea91eded (patch)
tree c24b9cf0d2a90239b01eb6432e683292c95bb06f /extools/extract.cc
parent a034f92b1fe0c6368ebb140bc691f0718dd23a23 (diff)
Adding backoff grammar and BackoffRule feature.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@191 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/extract.cc')
-rw-r--r-- extools/extract.cc 44
1 files changed, 41 insertions, 3 deletions
diff --git a/extools/extract.cc b/extools/extract.cc
index 6ad124d2..c2c413e2 100644
--- a/extools/extract.cc
+++ b/extools/extract.cc
@@ -173,12 +173,15 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
const int max_syms,
const bool permit_adjacent_nonterminals,
const bool require_aligned_terminal,
- RuleObserver* observer) {
+ RuleObserver* observer,
+ vector<WordID>* all_cats) {
+ const char bkoff_mrkr = '_';
queue<RuleItem> q; // agenda for BFS
int max_len = -1;
unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans;
vector<vector<ParallelSpan> > spans_by_start(sentence.f_len);
set<int> starts;
+ WordID bkoff;
for (int i = 0; i < phrases.size(); ++i) {
fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]);
max_len = max(max_len, phrases[i].i2 - phrases[i].i1);
@@ -281,6 +284,42 @@ void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
cur_terminal_align.push_back(make_pair(i,j));
observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
+
+ if(!all_cats->empty()) {
+ //produce the backoff grammar if the category wordIDs are available
+ for (int i = 0; i < cur_rhs_f.size(); ++i) {
+ if(cur_rhs_f[i] < 0) {
+ //cerr << cur_rhs_f[i] << ": (cats,f) |" << TD::Convert(-cur_rhs_f[i]) << endl;
+ string nonterm = TD::Convert(-cur_rhs_f[i]);
+ nonterm+=bkoff_mrkr;
+ bkoff = -TD::Convert(nonterm);
+ cur_rhs_f[i]=bkoff;
+ vector<WordID> rhs_f_bkoff;
+ vector<WordID> rhs_e_bkoff;
+ vector<pair<short,short> > bkoff_align;
+ bkoff_align.clear();
+ bkoff_align.push_back(make_pair(0,0));
+
+ for (int cat = 0; cat < all_cats->size(); ++cat) {
+ rhs_f_bkoff.clear();
+ rhs_e_bkoff.clear();
+ rhs_f_bkoff.push_back(-(*all_cats)[cat]);
+ rhs_e_bkoff.push_back(0);
+ observer->CountRule(bkoff,rhs_f_bkoff,rhs_e_bkoff,bkoff_align);
+
+ }
+ }//else
+ //cerr << cur_rhs_f[i] << ": (words,f) |" << TD::Convert(cur_rhs_f[i]) << endl;
+ }
+ /*for (int i=0; i < cur_rhs_e.size(); ++i)
+ if(cur_rhs_e[i] <= 0)
+ cerr << cur_rhs_e[i] << ": (cats,e) |" << TD::Convert(1-cur_rhs_e[i]) << endl;
+ else
+ cerr << cur_rhs_e[i] << ": (words,e) |" << TD::Convert(cur_rhs_e[i]) << endl;
+ */
+
+ observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
+ }
}
}
}
@@ -337,5 +376,4 @@ ostream& operator<<(ostream& os, const RuleStatistics& s) {
}
}
return os;
-}
-
+}
\ No newline at end of file