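Fix bugs in featurize_grammar key parsing: stop reading a phrase at the kDIV divider, skip the second phrase when the key is followed by a tab, size src as cur_key.size() - 2, and remove debug output to cerr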

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@180 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-07 20:59:59 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-07 20:59:59 +0000
commit: ffe002f8792dd8693c12e9bc6a7f715ca170acfc (patch)
tree: cf4eb0121c47a57caa135da08e65f02e0b8cfa12 /extools/featurize_grammar.cc
parent: e1b840374b3f07185db38b6ada0384120ee166e9 (diff)
1 files changed, 11 insertions, 23 deletions
diff --git a/extools/featurize_grammar.cc b/extools/featurize_grammar.cc
index 8be057b0..771948ce 100644
--- a/extools/featurize_grammar.cc
+++ b/extools/featurize_grammar.cc
@@ -228,15 +228,11 @@ int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end
     while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
     if (ptr == start) {cerr << "Warning! empty token.\n"; return ptr; }
     const WordID w = TD::Convert(string(buf, start, ptr - start));
-
-    if((IsBracket(buf[start]) and IsBracket(buf[ptr-1])) or( w == kDIV))
-      p->push_back(1 * w);
-    else {
-      if (w == kDIV) return ptr;
-      p->push_back(w);
-    }
+    if (w == kDIV) return ptr;
+    p->push_back(w);
   }
-  return ptr;
+  assert(p->size() > 0);
+  return ptr;  
 }
 
 void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
@@ -251,8 +247,10 @@ void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* coun
   cur_key->clear();
   // key is: "[X] ||| word word word"
   int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
-  cur_key->push_back(kDIV);
-  ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+  if (buf[tmpp] != '\t') {
+    cur_key->push_back(kDIV);
+    ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
+  }
   ++ptr;
   int start = ptr;
   int end = ptr;
@@ -294,7 +292,6 @@ void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* coun
   }
 }
 
-
 void LexTranslationTable::createTTable(const char* buf){
   AnnotatedParallelSentence sent;
   sent.ParseInputLine(buf);
@@ -657,20 +654,11 @@ int main(int argc, char** argv){
     fs1.getline(buf, MAX_LINE_LENGTH);
     if (buf[0] == 0) continue;
     ParseLine(buf, &cur_key, &cur_counts);
-    //src.resize(cur_key.size() - 4);
-    src.resize(cur_key.size() - 3);
+    src.resize(cur_key.size() - 2);
     for (int i = 0; i < src.size(); ++i) src.at(i) = cur_key.at(i+2);
 
-    cerr << "Key: "; for (vector<WordID>::const_iterator wit=cur_key.begin(); wit!=cur_key.end(); ++wit) cerr << TD::Convert(*wit) << " "; cerr << endl;
-
     lhs = cur_key[0];
-    cerr << buf << endl;
     for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) {
-
-      cerr << "READ: <"; for (vector<WordID>::const_iterator wit=src.begin(); wit!=src.end(); ++wit) cerr << TD::Convert(*wit) << " ";
-      cerr << "|||"; for (vector<WordID>::const_iterator wit=it->first.begin(); wit!=it->first.end(); ++wit) cerr << " " << TD::Convert(*wit);
-      cerr << ">\n";
-
       for (int i = 0; i < extractors.size(); ++i)
         extractors[i]->ObserveFilteredRule(lhs, src, it->first);
     }
@@ -681,7 +669,7 @@ int main(int argc, char** argv){
     cin.getline(buf, MAX_LINE_LENGTH);
     if (buf[0] == 0) continue;
     ParseLine(buf, &cur_key, &cur_counts);
-    src.resize(cur_key.size() - 3);
+    src.resize(cur_key.size() - 2);
     for (int i = 0; i < src.size(); ++i) src[i] = cur_key[i+2];
     lhs = cur_key[0];
     for (ID2RuleStatistics::const_iterator it = cur_counts.begin(); it != cur_counts.end(); ++it) {
@@ -697,7 +685,7 @@ int main(int argc, char** argv){
     fs2.getline(buf, MAX_LINE_LENGTH);
     if (buf[0] == 0) continue;
     ParseLine(buf, &cur_key, &cur_counts);
-    src.resize(cur_key.size() - 3);
+    src.resize(cur_key.size() - 2);
     for (int i = 0; i < src.size(); ++i) src[i] = cur_key[i+2];
     lhs = cur_key[0];
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-07 20:59:59 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-07 20:59:59 +0000
commit	ffe002f8792dd8693c12e9bc6a7f715ca170acfc (patch)
tree	cf4eb0121c47a57caa135da08e65f02e0b8cfa12 /extools/featurize_grammar.cc
parent	e1b840374b3f07185db38b6ada0384120ee166e9 (diff)