| author    | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-05 22:17:55 +0000 |
|-----------|------------------------------------------------------|---------------------------|
| committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-05 22:17:55 +0000 |
| commit    | 3f184d8e18dbe5bb6b25f8bb4bd4d00fafb6ef40 (patch)     |                           |
| tree      | 6f0009901813fd7399ca0580b8552abeaf878d26             |                           |
| parent    | d89e3cd42d17e94f87609f828b6c04b3a9a0523c (diff)      |                           |
0 size 1gram lm works
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@144 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/ff_lm.cc | 18 |
1 file changed, 11 insertions, 7 deletions
```diff
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 7e3f6e2b..cdfbb96a 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -2,7 +2,7 @@
 //TODO: allow features to reorder by heuristic*weight the rules' terminal phrases (or of hyperedges'). if first pass has pruning, then compute over whole ruleset as part of heuristic
 
-//TODO: verify that this is true: if ngram order is bigger than lm state's, then the longest possible ngram scores are still used. if you really want a lower order, a truncated copy of the LM should be small enough. otherwise, an option to null out words outside of the order's window would need to be implemented.
+//NOTE: if ngram order is bigger than lm state's, then the longest possible ngram scores are still used. if you really want a lower order, a truncated copy of the LM should be small enough. otherwise, an option to null out words outside of the order's window would need to be implemented.
 
 #include "ff_lm.h"
 
@@ -160,13 +160,14 @@ struct LMClient {
 class LanguageModelImpl {
  public:
   explicit LanguageModelImpl(int order) :
-      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
+      ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
       floor_(-100.0),
       kSTART(TD::Convert("<s>")),
       kSTOP(TD::Convert("</s>")),
       kUNKNOWN(TD::Convert("<unk>")),
       kNONE(-1),
-      kSTAR(TD::Convert("<{STAR}>")) {}
+      kSTAR(TD::Convert("<{STAR}>"))
+      , unigram(order<=1) {}
 
   LanguageModelImpl(int order, const string& f) :
       ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
@@ -175,7 +176,9 @@ class LanguageModelImpl {
       kSTOP(TD::Convert("</s>")),
       kUNKNOWN(TD::Convert("<unk>")),
       kNONE(-1),
-      kSTAR(TD::Convert("<{STAR}>")) {
+      kSTAR(TD::Convert("<{STAR}>"))
+      , unigram(order<=1)
+  {
     File file(f.c_str(), "r", 0);
     assert(file);
     cerr << "Reading " << order_ << "-gram LM from " << f << endl;
@@ -264,7 +267,7 @@ class LanguageModelImpl {
 //TODO: make sure that Vocab_None is set to kNONE in srilm (-1), or that SRILM otherwise interprets -1 as a terminator and not a word
 
   double EstimateProb(const void* state) {
-    if (!order_) return 0.;
+    if (unigram) return 0.;
     int len = StateSize(state);
     // cerr << "residual len: " << len << endl;
     buffer_.resize(len + 1);
@@ -278,7 +281,7 @@ class LanguageModelImpl {
 
   // for <s> (n-1 left words) and (n-1 right words) </s>
   double FinalTraversalCost(const void* state) {
-    if (!order_) return 0.;
+    if (unigram) return 0.;
     int slen = StateSize(state);
     int len = slen + 2;
     // cerr << "residual len: " << len << endl;
@@ -328,7 +331,7 @@ class LanguageModelImpl {
 
   //NOTE: this is where the scoring of words happens (heuristic happens in EstimateProb)
   double LookupWords(const TRule& rule, const vector<const void*>& ant_states, void* vstate) {
-    if (order_==0)
+    if (unigram)
       return stateless_cost(rule);
     int len = rule.ELength() - rule.Arity();
     for (int i = 0; i < ant_states.size(); ++i)
@@ -399,6 +402,7 @@ public:
   const WordID kUNKNOWN;
   const WordID kNONE;
   const WordID kSTAR;
+  const bool unigram;
 };
 
 struct ClientLMI : public LanguageModelImpl
```
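The core of the change is the new `unigram` flag (`order <= 1`), which replaces the old `!order_` / `order_ == 0` checks: a unigram model scores each word independently of context, so no LM state needs to cross hypergraph node boundaries, and `EstimateProb` / `FinalTraversalCost` can return 0 while `LookupWords` falls through to `stateless_cost(rule)`. Below is a minimal standalone sketch of that factorization; the `logprob` table and the simplified `stateless_cost` signature here are hypothetical stand-ins for the SRILM-backed `ngram_` member and the real `stateless_cost(const TRule&)`, not code from the repository.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

const double kFloor = -100.0;  // analogous to floor_(-100.0) in the diff

// Hypothetical per-word log-probabilities standing in for a real 1-gram LM.
const std::map<std::string, double> logprob = {
  {"the", -1.0}, {"cat", -2.3}, {"sat", -2.5}
};

// Sketch of the stateless path taken when unigram is true: the phrase score
// is just the sum of independent per-word scores; unknown words get kFloor.
double stateless_cost(const std::vector<std::string>& words) {
  double total = 0.0;
  for (const std::string& w : words) {
    auto it = logprob.find(w);
    total += (it == logprob.end()) ? kFloor : it->second;
  }
  return total;
}

int main() {
  // No state is carried between phrases, which is why EstimateProb and
  // FinalTraversalCost can simply return 0 in the unigram case.
  std::cout << stateless_cost({"the", "cat", "sat"}) << "\n";  // prints -5.8
}
```

Because nothing crosses boundaries, the heuristic and final-traversal terms contribute nothing for a unigram model, which is exactly what the `if (unigram) return 0.;` early exits encode.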