(1) Allow the n-gram prefixes and the token separator used in feature instantiations to be configured. (2) Minor refactoring.

author: Waleed Ammar <wammar@cs.cmu.edu> 2012-07-02 16:51:21 -0400
committer: Waleed Ammar <wammar@cs.cmu.edu> 2012-07-02 16:51:21 -0400
commit: 5584ad22d3d3fae7ea195fe6cd7cf56cd0f35ba0 (patch)
tree: d8022f0f613d32551417a520f6c6e29c0c648963 /decoder/ff_ngrams.cc
parent: dc9e428224d95adf63da6460a2031348de295ca1 (diff)
1 files changed, 72 insertions, 13 deletions
diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc
index d6d79f5e..9c13fdbb 100644
--- a/decoder/ff_ngrams.cc
+++ b/decoder/ff_ngrams.cc
@@ -48,6 +48,9 @@ struct State {
 
 namespace {
   string Escape(const string& x) {
+    if (x.find('=') == string::npos && x.find(';') == string::npos) {
+      return x;
+    }
     string y = x;
     for (int i = 0; i < y.size(); ++i) {
       if (y[i] == '=') y[i]='_';
@@ -57,10 +60,17 @@ namespace {
   }
 }
 
-static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) {
+static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector<string>& prefixes, string& target_separator) {
   vector<string> const& argv=SplitOnWhitespace(in);
   *explicit_markers = false;
   *order = 3;
+  prefixes.push_back("NOT-USED");
+  prefixes.push_back("U:"); // default unigram prefix
+  prefixes.push_back("B:"); // default bigram prefix
+  prefixes.push_back("T:"); // ...etc
+  prefixes.push_back("4:"); // ...etc
+  prefixes.push_back("5:"); // max allowed!
+  target_separator = "_";
 #define LMSPEC_NEXTARG if (i==argv.end()) {            \
     cerr << "Missing argument for "<<*last<<". "; goto usage; \
     } else { ++i; }
@@ -73,6 +83,30 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order)
       case 'x':
         *explicit_markers = true;
         break;
+      case 'U':
+	LMSPEC_NEXTARG;
+	prefixes[1] = *i;
+	break;
+      case 'B':
+	LMSPEC_NEXTARG;
+	prefixes[2] = *i;
+	break;
+      case 'T':
+	LMSPEC_NEXTARG;
+	prefixes[3] = *i;
+	break;
+      case '4':
+	LMSPEC_NEXTARG;
+	prefixes[4] = *i;
+	break;
+      case '5':
+	LMSPEC_NEXTARG;
+	prefixes[5] = *i;
+	break;
+      case 'S':
+	LMSPEC_NEXTARG;
+	target_separator = *i;
+	break;
       case 'o':
         LMSPEC_NEXTARG; *order=atoi((*i).c_str());
         break;
@@ -86,7 +120,29 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order)
   }
   return true;
 usage:
-  cerr << "NgramFeatures is incorrect!\n";
+  cerr << "Wrong parameters for NgramFeatures.\n\n"
+
+       << "NgramFeatures Usage: \n"			     
+       << " feature_function=NgramFeatures filename.lm [-x] [-o <order>] \n"
+       << " [-U <unigram-prefix>] [-B <bigram-prefix>][-T <trigram-prefix>]\n"
+       << " [-4 <4-gram-prefix>] [-5 <5-gram-prefix>] [-S <separator>]\n\n" 
+    
+       << "Defaults: \n"
+       << "  <order>          = 3\n" 
+       << "  <unigram-prefix> = U:\n"
+       << "  <bigram-prefix>  = B:\n"
+       << "  <trigram-prefix> = T:\n"
+       << "  <4-gram-prefix>  = 4:\n"
+       << "  <5-gram-prefix>  = 5:\n"
+       << "  <separator>      = _\n"
+       << "  -x (i.e. explicit sos/eos markers) is turned off\n\n"
+
+       << "Example configuration: \n"
+       << "  feature_function=NgramFeatures -o 3 -T tri: -S |\n\n"
+
+       << "Example feature instantiation: \n"
+       << "  tri:a|b|c \n\n";
+
   return false;
 }
 
@@ -158,16 +214,12 @@ class NgramDetectorImpl {
       int& fid = ft->fids[curword];
       ++n;
       if (!fid) {
-        const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.)
         ostringstream os;
-        os << code[n] << ':';
+        os << prefixes_[n];
         for (int i = n-1; i >= 0; --i) {
-          os << (i != n-1 ? "_" : "");
+          os << (i != n-1 ? target_separator_ : "");
           const string& tok = TD::Convert(buf[i]);
-          if (tok.find('=') == string::npos)
-            os << tok;
-          else
-            os << Escape(tok);
+	  os << Escape(tok);
         }
         fid = FD::Convert(os.str());
       }
@@ -297,7 +349,8 @@ class NgramDetectorImpl {
   }
 
  public:
-  explicit NgramDetectorImpl(bool explicit_markers, unsigned order) :
+  explicit NgramDetectorImpl(bool explicit_markers, unsigned order,
+			     vector<string>& prefixes, string& target_separator) :
       kCDEC_UNK(TD::Convert("<unk>")) ,
       add_sos_eos_(!explicit_markers) {
     order_ = order;
@@ -305,6 +358,8 @@ class NgramDetectorImpl {
     unscored_size_offset_ = (order_ - 1) * sizeof(WordID);
     is_complete_offset_ = unscored_size_offset_ + 1;
     unscored_words_offset_ = is_complete_offset_ + 1;
+    prefixes_ = prefixes;
+    target_separator_ = target_separator;
 
     // special handling of beginning / ending sentence markers
     dummy_state_ = new char[state_size_];
@@ -340,6 +395,8 @@ class NgramDetectorImpl {
   char* dummy_state_;
   vector<const void*> dummy_ants_;
   TRulePtr dummy_rule_;
+  vector<string> prefixes_;
+  string target_separator_;
   struct FidTree {
     map<WordID, int> fids;
     map<WordID, FidTree> levels;
@@ -348,11 +405,13 @@ class NgramDetectorImpl {
 };
 
 NgramDetector::NgramDetector(const string& param) {
-  string filename, mapfile, featname;
+  string filename, mapfile, featname, target_separator;  // target_separator: string written between the tokens of a feature name
+  vector<string> prefixes;  // filled by ParseArgs: prefixes[n] is the feature-name prefix for n-grams of length n (index 0 is a placeholder)
   bool explicit_markers = false;
   unsigned order = 3;
-  ParseArgs(param, &explicit_markers, &order);
-  pimpl_ = new NgramDetectorImpl(explicit_markers, order);
+  ParseArgs(param, &explicit_markers, &order, prefixes, target_separator);  // NOTE(review): the bool result is ignored, so construction continues even after the usage message is printed
+  pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes, 
+				 target_separator);
   SetStateSize(pimpl_->ReserveStateSize());
 }
author	Waleed Ammar <wammar@cs.cmu.edu>	2012-07-02 16:51:21 -0400
committer	Waleed Ammar <wammar@cs.cmu.edu>	2012-07-02 16:51:21 -0400
commit	5584ad22d3d3fae7ea195fe6cd7cf56cd0f35ba0 (patch)
tree	d8022f0f613d32551417a520f6c6e29c0c648963 /decoder/ff_ngrams.cc
parent	dc9e428224d95adf63da6460a2031348de295ca1 (diff)