diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 19:21:02 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-08 19:21:02 +0000 |
commit | 52a8d49e81c14b6f7ed3afb5bdb50b17391995a8 (patch) | |
tree | 2cd84e2840204de6fbfaa70be5b2818b9c805b51 | |
parent | 71b39bcf60182d1686966db34225a670d13e3594 (diff) |
actually use -n feature_name in LanguageModel. FF factory usage facility, FF feature ids facility (not used yet)
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@186 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/ff.cc | 13 | ||||
-rw-r--r-- | decoder/ff.h | 13 | ||||
-rw-r--r-- | decoder/ff_factory.cc | 12 | ||||
-rw-r--r-- | decoder/ff_factory.h | 8 | ||||
-rw-r--r-- | decoder/ff_lm.cc | 97 | ||||
-rw-r--r-- | decoder/ff_lm.h | 1 | ||||
-rw-r--r-- | decoder/sparse_vector.h | 4 |
7 files changed, 107 insertions(+), 41 deletions(-)
diff --git a/decoder/ff.cc b/decoder/ff.cc index 261e9a17..73dbbdc9 100644 --- a/decoder/ff.cc +++ b/decoder/ff.cc @@ -16,6 +16,19 @@ void FeatureFunction::FinalTraversalFeatures(const void* ant_state, (void) features; } +string FeatureFunction::usage_helper(std::string const& name,std::string const& params,std::string const& details,bool sp,bool sd) { + string r=name; + if (sp) { + r+=": "; + r+=params; + } + if (sd) { + r+="\n"; + r+=details; + } + return r; +} + // Hiero and Joshua use log_10(e) as the value, so I do to WordPenalty::WordPenalty(const string& param) : fid_(FD::Convert("WordPenalty")), diff --git a/decoder/ff.h b/decoder/ff.h index 630b3208..c6c9cf8f 100644 --- a/decoder/ff.h +++ b/decoder/ff.h @@ -19,6 +19,17 @@ class FeatureFunction { explicit FeatureFunction(int state_size) : state_size_(state_size) {} virtual ~FeatureFunction(); + // override this. not virtual because we want to expose this to factory template for help before creating a FF + static std::string usage(bool show_params,bool show_details) { + return usage_helper("FIXME_feature_needs_name","[no parameters]","[no documentation yet]",show_params,show_details); + } + + static std::string usage_helper(std::string const& name,std::string const& params,std::string const& details,bool show_params,bool show_details); + +public: + + typedef std::vector<WordID> Features; + virtual Features features() { return Features(); } // returns the number of bytes of context that this feature function will // (maximally) use. By default, 0 ("stateless" models in Hiero/Joshua). 
// NOTE: this value is fixed for the instance of your class, you cannot @@ -144,7 +155,7 @@ class ModelSet { bool empty() const { return models_.empty(); } private: std::vector<const FeatureFunction*> models_; - std::vector<double> weights_; + std::vector<double> weights_; int state_size_; std::vector<int> model_state_pos_; }; diff --git a/decoder/ff_factory.cc b/decoder/ff_factory.cc index 1854e0bb..d66cd883 100644 --- a/decoder/ff_factory.cc +++ b/decoder/ff_factory.cc @@ -14,6 +14,13 @@ void FFRegistry::DisplayList() const { } } +string FFRegistry::usage(string const& ffname,bool params,bool verbose) const { + map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname); + return it == reg_.end() + ? "Unknown feature " + ffname + : it->second->usage(params,verbose); +} + shared_ptr<FeatureFunction> FFRegistry::Create(const string& ffname, const string& param) const { map<string, shared_ptr<FFFactoryBase> >::const_iterator it = reg_.find(ffname); shared_ptr<FeatureFunction> res; @@ -33,3 +40,8 @@ void FFRegistry::Register(const string& ffname, FFFactoryBase* factory) { reg_[ffname].reset(factory); } + +void FFRegistry::Register(FFFactoryBase* factory) +{ + Register(factory->usage(false,false),factory); +} diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index bc586567..75911f38 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -17,8 +17,10 @@ class FFRegistry { friend class FFFactoryBase; public: boost::shared_ptr<FeatureFunction> Create(const std::string& ffname, const std::string& param) const; + std::string usage(std::string const& ffname,bool params=true,bool verbose=true) const; void DisplayList() const; void Register(const std::string& ffname, FFFactoryBase* factory); + void Register(FFFactoryBase* factory); private: FFRegistry() {} std::map<std::string, boost::shared_ptr<FFFactoryBase> > reg_; @@ -27,6 +29,7 @@ class FFRegistry { struct FFFactoryBase { virtual ~FFFactoryBase(); virtual 
boost::shared_ptr<FeatureFunction> Create(const std::string& param) const = 0; + virtual std::string usage(bool params,bool verbose) const = 0; }; template<class FF> @@ -34,6 +37,11 @@ class FFFactory : public FFFactoryBase { boost::shared_ptr<FeatureFunction> Create(const std::string& param) const { return boost::shared_ptr<FeatureFunction>(new FF(param)); } + // called with false,false just gives feature name + virtual std::string usage(bool params,bool verbose) const { + return FF::usage(params,verbose); + } + }; #endif diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index e6f7912e..9e6f02b7 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -1,3 +1,7 @@ +char const* usage_name="LanguageModel"; +char const* usage_short="srilm.gz [-n FeatureName] [-o StateOrder] [-m LimitLoadOrder]"; +char const* usage_verbose="-n determines the name of the feature (and its weight). -o defaults to 3. -m defaults to effectively infinite, otherwise says what order lm probs to use (up to). you could use -o > -m but that would be wasteful. -o < -m means some ngrams are scored longer (whenever a word is inserted by a rule next to a variable) than the state would ordinarily allow. NOTE: multiple LanguageModel features are allowed, but they will wastefully duplicate state, except in the special case of -o 1 (which uses no state). subsequent references to the same a.lm.gz. unless they specify -m, will reuse the same SRI LM in memory; this means that the -m used in the first load of a.lm.gz will take effect."; + //TODO: backoff wordclasses for named entity xltns, esp. numbers. e.g. digits -> @. idealy rule features would specify replacement lm tokens/classes //TODO: extra int in state to hold "GAP" token is not needed. if there are less than (N-1) words, then null terminate the e.g. left words. however, this would mean treating gapless items differently. not worth the potential bugs right now. 
@@ -38,7 +42,12 @@ using namespace std; -// intend to have a 0-state prelm-pass heuristic LM that is better than 1gram (like how estimated_features are lower order estimates). NgramShare will keep track of all loaded lms and reuse them. +string LanguageModel::usage(bool param,bool verbose) { + return usage_helper(usage_name,usage_short,usage_verbose,param,verbose); +} + + +// NgramShare will keep track of all loaded lms and reuse them. //TODO: ref counting by shared_ptr? for now, first one to load LM needs to stick around as long as all subsequent users. #include <boost/shared_ptr.hpp> @@ -167,27 +176,31 @@ struct LMClient { }; class LanguageModelImpl { + void init(int order) { + //all these used to be const members, but that has no performance implication, and now there's less duplication. + order_=order; + state_size_ = OrderToStateSize(order)-1; + unigram=(order<=1); + floor_=-100; + kSTART = TD::Convert("<s>"); + kSTOP = TD::Convert("</s>"); + kUNKNOWN = TD::Convert("<unk>"); + kNONE = -1; + kSTAR = TD::Convert("<{STAR}>"); + } + public: - explicit LanguageModelImpl(int order) : - ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - kSTART(TD::Convert("<s>")), - kSTOP(TD::Convert("</s>")), - kUNKNOWN(TD::Convert("<unk>")), - kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) - , unigram(order<=1) {} + explicit LanguageModelImpl(int order) : ngram_(*TD::dict_, order) + { + init(order); + } + + //TODO: show that unigram special case (0 state) computes what it should. - LanguageModelImpl(int order, const string& f) : - ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1), - floor_(-100.0), - kSTART(TD::Convert("<s>")), - kSTOP(TD::Convert("</s>")), - kUNKNOWN(TD::Convert("<unk>")), - kNONE(-1), - kSTAR(TD::Convert("<{STAR}>")) - , unigram(order<=1) + LanguageModelImpl(int order, const string& f, int load_order=0) : + ngram_(*TD::dict_, load_order ? 
load_order : order) { + init(order); File file(f.c_str(), "r", 0); assert(file); cerr << "Reading " << order_ << "-gram LM from " << f << endl; @@ -407,16 +420,16 @@ public: protected: Ngram ngram_; vector<WordID> buffer_; - const int order_; - const int state_size_; - const double floor_; + int order_; + int state_size_; + double floor_; public: - const WordID kSTART; - const WordID kSTOP; - const WordID kUNKNOWN; - const WordID kNONE; - const WordID kSTAR; - const bool unigram; + WordID kSTART; + WordID kSTOP; + WordID kUNKNOWN; + WordID kNONE; + WordID kSTAR; + bool unigram; }; struct ClientLMI : public LanguageModelImpl @@ -436,32 +449,33 @@ struct ReuseLMI : public LanguageModelImpl { ReuseLMI(int order, Ngram *ng) : LanguageModelImpl(order), ng(ng) {} - virtual double WordProb(int word, int* context) { + double WordProb(int word, int* context) { return ng->wordProb(word, (VocabIndex*)context); } protected: Ngram *ng; }; -LanguageModelImpl *make_lm_impl(int order, string const& f) +LanguageModelImpl *make_lm_impl(int order, string const& f, int load_order) { if (f.find("lm://") == 0) { return new ClientLMI(order,f.substr(5)); - } else if (ngs.have(f)) { + } else if (load_order==0 && ngs.have(f)) { cerr<<"Reusing already loaded Ngram LM: "<<f<<endl; return new ReuseLMI(order,ngs.get(f)); } else { - LanguageModelImpl *r=new LanguageModelImpl(order,f); + LanguageModelImpl *r=new LanguageModelImpl(order,f,load_order); ngs.add(f,r->get_lm()); return r; } } -bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename) +bool parse_lmspec(std::string const& in, int &order, string &featurename, string &filename, int &load_order) { vector<string> const& argv=SplitOnWhitespace(in); featurename="LanguageModel"; order=3; + load_order=0; #define LMSPEC_NEXTARG if (i==argv.end()) { \ cerr << "Missing argument for "<<*last<<". 
"; goto usage; \ } else { ++i; } @@ -477,6 +491,9 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string case 'n': LMSPEC_NEXTARG; featurename=*i; break; + case 'm': + LMSPEC_NEXTARG; load_order=lexical_cast<int>(*i); + break; #undef LMSPEC_NEXTARG default: fail: @@ -495,18 +512,22 @@ bool parse_lmspec(std::string const& in, int &order, string &featurename, string if (order > 0 && !filename.empty()) return true; usage: - cerr<<"LanguageModel specification should be: [-o order>0] [-n featurename] filename"<<endl<<" you provided: "<<in<<endl; + cerr<<usage_name<<" specification should be: "<<usage_short<<"; you provided: "<<in<<usage_verbose<<endl; return false; } LanguageModel::LanguageModel(const string& param) { - int order; + int order,load_order; string featurename,filename; - if (!parse_lmspec(param,order,featurename,filename)) + if (!parse_lmspec(param,order,featurename,filename,load_order)) abort(); - fid_=FD::Convert("LanguageModel"); - pimpl_ = make_lm_impl(order,filename); + cerr<<"LM feature name: "<<featurename<<" from file "<<filename<<" order "<<order; + if (load_order) + cerr<<" loading LM as order "<<load_order; + cerr<<endl; + fid_=FD::Convert(featurename); + pimpl_ = make_lm_impl(order,filename,load_order); //TODO: see if it's actually possible to set order_ later to mutate an already used FF for e.g. multipass. comment in ff.h says only to change state size in constructor. clone instead? differently -n named ones from same lm filename are already possible, so no urgency. 
SetStateSize(LanguageModelImpl::OrderToStateSize(order)); } diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h index 10a3e9a3..5ea41068 100644 --- a/decoder/ff_lm.h +++ b/decoder/ff_lm.h @@ -18,6 +18,7 @@ class LanguageModel : public FeatureFunction { virtual void FinalTraversalFeatures(const void* context, SparseVector<double>* features) const; std::string DebugStateToString(const void* state) const; + static std::string usage(bool param,bool verbose); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index 7794fd5e..be91f324 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -19,9 +19,9 @@ public: typedef typename std::map<int, T>::const_iterator const_iterator; SparseVector() {} explicit SparseVector(std::vector<T> const& v) { - MapType::iterator p=values_.end(); + typename MapType::iterator p=values_.begin(); for (unsigned i=0;i<v.size();++i) - p=values_.insert(p,MapType::value_type(i,v[i])); //faster + p=values_.insert(p,typename MapType::value_type(i,v[i])); //faster } const T operator[](int index) const { |