From 5584ad22d3d3fae7ea195fe6cd7cf56cd0f35ba0 Mon Sep 17 00:00:00 2001 From: Waleed Ammar Date: Mon, 2 Jul 2012 16:51:21 -0400 Subject: (1) allow prefixes and separators used in feature instantiations to be configured. (2) minor refactoring. --- decoder/ff_ngrams.cc | 85 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 13 deletions(-) (limited to 'decoder/ff_ngrams.cc') diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index d6d79f5e..9c13fdbb 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -48,6 +48,9 @@ struct State { namespace { string Escape(const string& x) { + if (x.find('=') == string::npos && x.find(';') == string::npos) { + return x; + } string y = x; for (int i = 0; i < y.size(); ++i) { if (y[i] == '=') y[i]='_'; @@ -57,10 +60,17 @@ namespace { } } -static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) { +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order, vector& prefixes, string& target_separator) { vector const& argv=SplitOnWhitespace(in); *explicit_markers = false; *order = 3; + prefixes.push_back("NOT-USED"); + prefixes.push_back("U:"); // default unigram prefix + prefixes.push_back("B:"); // default bigram prefix + prefixes.push_back("T:"); // ...etc + prefixes.push_back("4:"); // ...etc + prefixes.push_back("5:"); // max allowed! + target_separator = "_"; #define LMSPEC_NEXTARG if (i==argv.end()) { \ cerr << "Missing argument for "<<*last<<". "; goto usage; \ } else { ++i; } @@ -73,6 +83,30 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) case 'x': *explicit_markers = true; break; + case 'U': + LMSPEC_NEXTARG; + prefixes[1] = *i; + break; + case 'B': + LMSPEC_NEXTARG; + prefixes[2] = *i; + break; + case 'T': + LMSPEC_NEXTARG; + prefixes[3] = *i; + break; + case '4': + LMSPEC_NEXTARG; + prefixes[4] = *i; + break; + case '5': + LMSPEC_NEXTARG; + prefixes[5] = *i; + break; + case 'S': + LMSPEC_NEXTARG; + target_separator = *i; + break; case 'o': LMSPEC_NEXTARG; *order=atoi((*i).c_str()); break; @@ -86,7 +120,29 @@ static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) } return true; usage: - cerr << "NgramFeatures is incorrect!\n"; + cerr << "Wrong parameters for NgramFeatures.\n\n" + + << "NgramFeatures Usage: \n" + << " feature_function=NgramFeatures filename.lm [-x] [-o ] \n" + << " [-U ] [-B ][-T ]\n" + << " [-4 <4-gram-prefix>] [-5 <5-gram-prefix>] [-S ]\n\n" + + << "Defaults: \n" + << " = 3\n" + << " = U:\n" + << " = B:\n" + << " = T:\n" + << " <4-gram-prefix> = 4:\n" + << " <5-gram-prefix> = 5:\n" + << " = _\n" + << " -x (i.e. explicit sos/eos markers) is turned off\n\n" + + << "Example configuration: \n" + << " feature_function=NgramFeatures -o 3 -T tri: -S |\n\n" + + << "Example feature instantiation: \n" + << " tri:a|b|c \n\n"; + return false; } @@ -158,16 +214,12 @@ class NgramDetectorImpl { int& fid = ft->fids[curword]; ++n; if (!fid) { - const char* code="_UBT456789"; // prefix code (unigram, bigram, etc.) ostringstream os; - os << code[n] << ':'; + os << prefixes_[n]; for (int i = n-1; i >= 0; --i) { - os << (i != n-1 ? "_" : ""); + os << (i != n-1 ? target_separator_ : ""); const string& tok = TD::Convert(buf[i]); - if (tok.find('=') == string::npos) - os << tok; - else - os << Escape(tok); + os << Escape(tok); } fid = FD::Convert(os.str()); } @@ -297,7 +349,8 @@ class NgramDetectorImpl { } public: - explicit NgramDetectorImpl(bool explicit_markers, unsigned order) : + explicit NgramDetectorImpl(bool explicit_markers, unsigned order, + vector& prefixes, string& target_separator) : kCDEC_UNK(TD::Convert("")) , add_sos_eos_(!explicit_markers) { order_ = order; @@ -305,6 +358,8 @@ class NgramDetectorImpl { unscored_size_offset_ = (order_ - 1) * sizeof(WordID); is_complete_offset_ = unscored_size_offset_ + 1; unscored_words_offset_ = is_complete_offset_ + 1; + prefixes_ = prefixes; + target_separator_ = target_separator; // special handling of beginning / ending sentence markers dummy_state_ = new char[state_size_]; @@ -340,6 +395,8 @@ class NgramDetectorImpl { char* dummy_state_; vector dummy_ants_; TRulePtr dummy_rule_; + vector prefixes_; + string target_separator_; struct FidTree { map fids; map levels; @@ -348,11 +405,13 @@ class NgramDetectorImpl { }; NgramDetector::NgramDetector(const string& param) { - string filename, mapfile, featname; + string filename, mapfile, featname, target_separator; + vector prefixes; bool explicit_markers = false; unsigned order = 3; - ParseArgs(param, &explicit_markers, &order); - pimpl_ = new NgramDetectorImpl(explicit_markers, order); + ParseArgs(param, &explicit_markers, &order, prefixes, target_separator); + pimpl_ = new NgramDetectorImpl(explicit_markers, order, prefixes, + target_separator); SetStateSize(pimpl_->ReserveStateSize()); } -- cgit v1.2.3