summaryrefslogtreecommitdiff
path: root/extools/featurize_grammar.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extools/featurize_grammar.cc')
-rw-r--r--extools/featurize_grammar.cc34
1 files changed, 11 insertions, 23 deletions
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc
index 8be057b0..771948ce 100644
--- a/extools/featurize_grammar.cc
+++ b/extools/featurize_grammar.cc
@@ -228,15 +228,11 @@ int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end
while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; }
const WordID w = TD::Convert(string(buf, start, ptr - start));
-
- if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV))
- p->push_back(1 * w);
- else {
- if (w == kDIV) return ptr;
- p->push_back(w);
- }
+ if (w == kDIV) return ptr;
+ p->push_back(w);
}
- return ptr;
+ assert(p->size() > 0);
+ return ptr;
}
void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
@@ -251,8 +247,10 @@ void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* coun
cur_key->clear();
// key is: "[X] ||| word word word"
int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
- cur_key->push_back(kDIV);
- ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+ if (buf[tmpp] != '\t') {
+ cur_key->push_back(kDIV);
+ ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+ }
++ptr;
int start = ptr;
int end = ptr;
@@ -294,7 +292,6 @@ void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* coun
}
}
-
void LexTranslationTable::createTTable(const char* buf){
AnnotatedParallelSentence sent;
sent.ParseInputLine(buf);
@@ -657,20 +654,11 @@ int main(int argc, char** argv){
fs1.getline(buf, MAX_LINE_LENGTH);
if (buf[0] == 0) continue;
ParseLine(buf, &cur_key, &cur_counts);
- //src.resize(cur_key.size() - 4);
- src.resize(cur_key.size() - 3);
+ src.resize(cur_key.size() - 2);
for (int i = 0; i < src.size(); ++i) src.at(i) = cur_key.at(i+2);
- cerr << "Key: "; for (vector<WordID>::const_iterator wit=cur_key.begin(); wit!=cur_key.end(); ++wit) cerr << TD::Convert(*wit) << " "; cerr << endl;
-
lhs = cur_key[0];
- cerr << buf << endl;
for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) {
-
- cerr << "READ: <"; for (vector<WordID>::const_iterator wit=src.begin(); wit!=src.end(); ++wit) cerr << TD::Convert(*wit) << " ";
- cerr << "|||"; for (vector<WordID>::const_iterator wit=it->first.begin(); wit!=it->first.end(); ++wit) cerr << " " << TD::Convert(*wit);
- cerr << ">\n";
-
for (int i = 0; i < extractors.size(); ++i)
extractors[i]->ObserveFilteredRule(lhs, src, it->first);
}
@@ -681,7 +669,7 @@ int main(int argc, char** argv){
cin.getline(buf, MAX_LINE_LENGTH);
if (buf[0] == 0) continue;
ParseLine(buf, &cur_key, &cur_counts);
- src.resize(cur_key.size() - 3);
+ src.resize(cur_key.size() - 2);
for (int i = 0; i < src.size(); ++i) src[i] = cur_key[i+2];
lhs = cur_key[0];
for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) {
@@ -697,7 +685,7 @@ int main(int argc, char** argv){
fs2.getline(buf, MAX_LINE_LENGTH);
if (buf[0] == 0) continue;
ParseLine(buf, &cur_key, &cur_counts);
- src.resize(cur_key.size() - 3);
+ src.resize(cur_key.size() - 2);
for (int i = 0; i < src.size(); ++i) src[i] = cur_key[i+2];
lhs = cur_key[0];