author	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-05 22:17:55 +0000
committer	graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-05 22:17:55 +0000
commit	3f184d8e18dbe5bb6b25f8bb4bd4d00fafb6ef40 (patch)
tree	6f0009901813fd7399ca0580b8552abeaf878d26
parent	d89e3cd42d17e94f87609f828b6c04b3a9a0523c (diff)
0 size 1gram lm works
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@144 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--	decoder/ff_lm.cc	18
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 7e3f6e2b..cdfbb96a 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -2,7 +2,7 @@
//TODO: allow features to reorder by heuristic*weight the rules' terminal phrases (or of hyperedges'). if first pass has pruning, then compute over whole ruleset as part of heuristic
-//TODO: verify that this is true: if ngram order is bigger than lm state's, then the longest possible ngram scores are still used. if you really want a lower order, a truncated copy of the LM should be small enough. otherwise, an option to null out words outside of the order's window would need to be implemented.
+//NOTE: if ngram order is bigger than lm state's, then the longest possible ngram scores are still used. if you really want a lower order, a truncated copy of the LM should be small enough. otherwise, an option to null out words outside of the order's window would need to be implemented.
#include "ff_lm.h"
@@ -160,13 +160,14 @@ struct LMClient {
class LanguageModelImpl {
public:
explicit LanguageModelImpl(int order) :
- ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
+ ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
floor_(-100.0),
kSTART(TD::Convert("<s>")),
kSTOP(TD::Convert("</s>")),
kUNKNOWN(TD::Convert("<unk>")),
kNONE(-1),
- kSTAR(TD::Convert("<{STAR}>")) {}
+ kSTAR(TD::Convert("<{STAR}>"))
+ , unigram(order<=1) {}
LanguageModelImpl(int order, const string& f) :
ngram_(*TD::dict_, order), buffer_(), order_(order), state_size_(OrderToStateSize(order) - 1),
@@ -175,7 +176,9 @@ class LanguageModelImpl {
kSTOP(TD::Convert("</s>")),
kUNKNOWN(TD::Convert("<unk>")),
kNONE(-1),
- kSTAR(TD::Convert("<{STAR}>")) {
+ kSTAR(TD::Convert("<{STAR}>"))
+ , unigram(order<=1)
+ {
File file(f.c_str(), "r", 0);
assert(file);
cerr << "Reading " << order_ << "-gram LM from " << f << endl;
@@ -264,7 +267,7 @@ class LanguageModelImpl {
//TODO: make sure that Vocab_None is set to kNONE in srilm (-1), or that SRILM otherwise interprets -1 as a terminator and not a word
double EstimateProb(const void* state) {
- if (!order_) return 0.;
+ if (unigram) return 0.;
int len = StateSize(state);
// cerr << "residual len: " << len << endl;
buffer_.resize(len + 1);
@@ -278,7 +281,7 @@ class LanguageModelImpl {
// for <s> (n-1 left words) and (n-1 right words) </s>
double FinalTraversalCost(const void* state) {
- if (!order_) return 0.;
+ if (unigram) return 0.;
int slen = StateSize(state);
int len = slen + 2;
// cerr << "residual len: " << len << endl;
@@ -328,7 +331,7 @@ class LanguageModelImpl {
//NOTE: this is where the scoring of words happens (heuristic happens in EstimateProb)
double LookupWords(const TRule& rule, const vector<const void*>& ant_states, void* vstate) {
- if (order_==0)
+ if (unigram)
return stateless_cost(rule);
int len = rule.ELength() - rule.Arity();
for (int i = 0; i < ant_states.size(); ++i)
@@ -399,6 +402,7 @@ public:
const WordID kUNKNOWN;
const WordID kNONE;
const WordID kSTAR;
+ const bool unigram;
};
struct ClientLMI : public LanguageModelImpl
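
For readers skimming the patch: the change replaces the old order_==0 / !order_ guards with a precomputed unigram flag (order <= 1), so a 1-gram LM, not just a 0-order one, skips all state-dependent scoring (EstimateProb and FinalTraversalCost return 0, and LookupWords falls back to a stateless per-rule cost). Below is a minimal self-contained sketch of that gating pattern, assuming illustrative stand-ins throughout; ToyLM, WordProb, StatelessCost, and the placeholder scores are hypothetical and are not cdec's actual API or probabilities.

#include <iostream>
#include <string>
#include <vector>

// Toy model of the gating this commit introduces: with order <= 1 there
// is no cross-hyperedge n-gram context, so the heuristic estimate and
// the final <s>...</s> traversal cost are both 0, and each rule can be
// scored statelessly, one word at a time.
struct ToyLM {
  explicit ToyLM(int order) : order_(order), unigram(order <= 1) {}

  // stand-in for a real per-word log-prob lookup (illustrative constants)
  double WordProb(const std::string& w) const {
    return w == "<unk>" ? -100.0 : -1.0;
  }

  // sum of context-free word scores: the only cost a unigram LM ever pays
  double StatelessCost(const std::vector<std::string>& words) const {
    double p = 0.0;
    for (const auto& w : words) p += WordProb(w);
    return p;
  }

  double EstimateProb(const std::vector<std::string>& state) const {
    if (unigram) return 0.0;  // no residual boundary context to score
    // ... real n-gram heuristic over the state's boundary words ...
    return -1.0 * state.size();  // placeholder
  }

  double FinalTraversalCost(const std::vector<std::string>& state) const {
    if (unigram) return 0.0;  // <s>/</s> add no context cost at order 1
    // ... real scoring with <s> prepended and </s> appended ...
    return -2.0;  // placeholder
  }

  const int order_;
  const bool unigram;  // order <= 1: skip all state-dependent scoring
};

int main() {
  ToyLM uni(1), tri(3);
  std::vector<std::string> words = {"a", "b", "<unk>"};
  std::cout << uni.StatelessCost(words) << " "       // -102
            << uni.EstimateProb(words) << "\n";      // 0: unigram gate fires
  std::cout << tri.EstimateProb(words) << "\n";      // placeholder heuristic
  return 0;
}

Keeping the flag as a const bool member computed once in the initializer list (as the patch does with unigram(order<=1)) avoids re-deriving the condition at every call site and makes the intent of the three guarded methods explicit.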