Actually use the -n feature_name option in LanguageModel. Add an FF factory usage facility and an FF feature-ids facility (not used yet).

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@186 ec762483-ff6d-05da-a07a-a48fb63a330f
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-08 19:21:02 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-08 19:21:02 +0000
commit: b8cd8c32c7a89f99f3d8b8813d11517e89ecf022 (patch)
tree: c2867ddd9a2690e74b03062d71308ad1081d0bcb /decoder/ff_lm.cc
parent: e202d85c10942fd586088207a55226cc1eb1a01e (diff)
1 files changed, 59 insertions, 38 deletions
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index e6f7912e..9e6f02b7 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -1,3 +1,7 @@
+char const* usage_name="LanguageModel";
+char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]";
+char const* usage_verbose="-n determines the name of the feature (and its weight).  -o defaults to 3.  -m defaults to effectively infinite, otherwise says what order lm probs to use (up to).  you could use -o > -m but that would be wasteful.  -o < -m means some ngrams are scored longer (whenever a word is inserted by a rule next to a variable) than the state would ordinarily allow.  NOTE: multiple LanguageModel features are allowed, but they will wastefully duplicate state, except in the special case of -o 1 (which uses no state).  subsequent references to the same a.lm.gz. unless they specify -m, will reuse the same SRI LM in memory; this means that the -m used in the first load of a.lm.gz will take effect.";
+
 //TODO: backoff wordclasses for named entity xltns, esp. numbers.  e.g. digits -> @.  idealy rule features would specify replacement lm tokens/classes
 
 //TODO: extra int in state to hold "GAP" token is not needed.  if there are less than (N-1) words, then null terminate the e.g. left words.  however, this would mean treating gapless items differently.  not worth the potential bugs right now.
@@ -38,7 +42,12 @@
 
 using namespace std;
 
-// intend to have a 0-state prelm-pass heuristic LM that is better than 1gram (like how estimated_features are lower order estimates).  NgramShare will keep track of all loaded lms and reuse them.
+string LanguageModel::usage(bool param,bool verbose) {
+  return usage_helper(usage_name,usage_short,usage_verbose,param,verbose);
+}
+
+
+// NgramShare will keep track of all loaded lms and reuse them.
 //TODO: ref counting by shared_ptr?  for now, first one to load LM needs to stick around as long as all subsequent users.
 
 #include <boost/shared_ptr.hpp>
@@ -167,27 +176,31 @@ struct LMClient {
 };
 
 class LanguageModelImpl {
+  void init(int order) {
+    //all these used to be const members, but that has no performance implication, and now there's less duplication.
+    order_=order;
+    state_size_ = OrderToStateSize(order)-1;
+    unigram=(order<=1);
+    floor_=-100;
+    kSTART = TD::Convert("<s>");
+    kSTOP = TD::Convert("</s>");
+    kUNKNOWN = TD::Convert("<unk>");
+    kNONE = -1;
+    kSTAR = TD::Convert("<{STAR}>");
+  }
+
  public:
-  explicit LanguageModelImpl(int order) :
-    ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
-      floor_(-100.0),
-      kSTART(TD::Convert("<s>")),
-      kSTOP(TD::Convert("</s>")),
-      kUNKNOWN(TD::Convert("<unk>")),
-      kNONE(-1),
-      kSTAR(TD::Convert("<{STAR}>"))
-  , unigram(order<=1) {}
+  explicit LanguageModelImpl(int order) : ngram_(*TD::dict_, order)
+  {
+    init(order);
+  }
+
+
 //TODO: show that unigram special case (0 state) computes what it should.
-  LanguageModelImpl(int order, const string& f) :
-      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
-      floor_(-100.0),
-      kSTART(TD::Convert("<s>")),
-      kSTOP(TD::Convert("</s>")),
-      kUNKNOWN(TD::Convert("<unk>")),
-      kNONE(-1),
-      kSTAR(TD::Convert("<{STAR}>"))
-  , unigram(order<=1)
+  LanguageModelImpl(int order, const string& f, int load_order=0) :
+    ngram_(*TD::dict_, load_order ? load_order : order)
   {
+    init(order);
     File file(f.c_str(), "r", 0);
     assert(file);
     cerr << "Reading " << order_ << "-gram LM from " << f << endl;
@@ -407,16 +420,16 @@ public:
  protected:
   Ngram ngram_;
   vector<WordID> buffer_;
-  const int order_;
-  const int state_size_;
-  const double floor_;
+  int order_;
+  int state_size_;
+  double floor_;
  public:
-  const WordID kSTART;
-  const WordID kSTOP;
-  const WordID kUNKNOWN;
-  const WordID kNONE;
-  const WordID kSTAR;
-  const bool unigram;
+  WordID kSTART;
+  WordID kSTOP;
+  WordID kUNKNOWN;
+  WordID kNONE;
+  WordID kSTAR;
+  bool unigram;
 };
 
 struct ClientLMI : public LanguageModelImpl
@@ -436,32 +449,33 @@ struct ReuseLMI : public LanguageModelImpl
 {
   ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng)
   {}
-  virtual double WordProb(int word, int* context) {
+  double WordProb(int word, int* context) {
     return ng->wordProb(word, (VocabIndex*)context);
   }
 protected:
   Ngram *ng;
 };
 
-LanguageModelImpl *make_lm_impl(int order, string const& f)
+LanguageModelImpl *make_lm_impl(int order, string const& f, int load_order)
 {
   if (f.find("lm://") == 0) {
     return new ClientLMI(order,f.substr(5));
-  } else if (ngs.have(f)) {
+  } else if (load_order==0 && ngs.have(f)) {
     cerr<<"Reusing already loaded Ngram LM: "<<f<<endl;
     return new ReuseLMI(order,ngs.get(f));
   } else {
-    LanguageModelImpl *r=new LanguageModelImpl(order,f);
+    LanguageModelImpl *r=new LanguageModelImpl(order,f,load_order);
     ngs.add(f,r->get_lm());
     return r;
   }
 }
 
-bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename)
+bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename, int &load_order)
 {
   vector<string> const& argv=SplitOnWhitespace(in);
   featurename="LanguageModel";
   order=3;
+  load_order=0;
 #define LMSPEC_NEXTARG if (i==argv.end()) {            \
     cerr << "Missing argument for "<<*last<<". "; goto usage; \
     } else { ++i; }
@@ -477,6 +491,9 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string
       case 'n':
         LMSPEC_NEXTARG; featurename=*i;
         break;
+      case 'm':
+        LMSPEC_NEXTARG; load_order=lexical_cast<int>(*i);
+        break;
 #undef LMSPEC_NEXTARG
       default:
       fail:
@@ -495,18 +512,22 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string
   if (order > 0 && !filename.empty())
     return true;
 usage:
-  cerr<<"LanguageModel specification should be: [-o order>0] [-n featurename] filename"<<endl<<" you provided: "<<in<<endl;
+  cerr<<usage_name<<" specification should be: "<<usage_short<<"; you provided: "<<in<<usage_verbose<<endl;
   return false;
 }
 
 
 LanguageModel::LanguageModel(const string& param) {
-  int order;
+  int order,load_order;
   string featurename,filename;
-  if (!parse_lmspec(param,order,featurename,filename))
+  if (!parse_lmspec(param,order,featurename,filename,load_order))
     abort();
-  fid_=FD::Convert("LanguageModel");
-  pimpl_ = make_lm_impl(order,filename);
+  cerr<<"LM feature name: "<<featurename<<" from file "<<filename<<" order "<<order;
+  if (load_order)
+    cerr<<" loading LM as order "<<load_order;
+  cerr<<endl;
+  fid_=FD::Convert(featurename);
+  pimpl_ = make_lm_impl(order,filename,load_order);
+  //TODO: see whether order_ can be set later to mutate an already-used FF (e.g. for multipass).  The comment in ff.h says to change the state size only in the constructor; clone instead?  Features with different -n names built from the same LM filename are already possible, so this is not urgent.
   SetStateSize(LanguageModelImpl::OrderToStateSize(order));
 }
author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-08 19:21:02 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-08 19:21:02 +0000
commit	b8cd8c32c7a89f99f3d8b8813d11517e89ecf022 (patch)
tree	c2867ddd9a2690e74b03062d71308ad1081d0bcb /decoder/ff_lm.cc
parent	e202d85c10942fd586088207a55226cc1eb1a01e (diff)