author    | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-31 05:10:54 +0000
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-31 05:10:54 +0000
commit    | 526efd6515eb9efdcc1fe756c3cec4981ca10186 (patch)
tree      | 788fbbc985492f76afac5bdbef769e8fb6410ea2
parent    | 56400a763cb2e56738fe82034dec5769b6d171a8 (diff)
shorten_left comment
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@464 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x | decoder/ff_fsa.h        | 17
-rwxr-xr-x | decoder/ff_lm_fsa.h     |  4
-rwxr-xr-x | decoder/ff_sample_fsa.h |  7
3 files changed, 23 insertions, 5 deletions
diff --git a/decoder/ff_fsa.h b/decoder/ff_fsa.h
index 8d2b0488..de777fd5 100755
--- a/decoder/ff_fsa.h
+++ b/decoder/ff_fsa.h
@@ -8,6 +8,13 @@
   state is some fixed width byte array. could actually be a void *, WordID sequence, whatever.
 
+  TODO: maybe an ff that wants to know about SentenceMetadata should store a ref to
+  it permanently rather than get passed it for every operation. we're never
+  decoding more than 1 sentence at once and it's annoying to pass it. the same
+  could apply to the result edge as well, since so far i only use it for logging
+  when USE_INFO_EDGE 1 - this would make the most sense if the same change
+  happened to ff.h at the same time.
+
   TODO: there is a confusing array of default-implemented, supposedly slightly more efficient overrides enabled; however, the two key differences are: do you score a phrase, or just a word at a time (the latter constraining you to obey markov_order() everywhere)? you have to implement the word case no matter what.
 
   TODO: considerable simplification of implementation if Scan implementors are required to update state in place (using a temporary copy if they need it), or e.g. using memmove (copy from end to beginning) to rotate state right.
@@ -153,6 +160,16 @@ public:
   int markov_order() const { return 0; } // override if you use state. order 0 implies state_bytes()==0 as well, as far as scoring/splitting is concerned (you can still track state, though)
   //TODO: if we wanted, we could mark certain states as maximal-context, but this would lose our fixed amount of left context in ff_from_fsa, and also lose our vector operations (we'd have to scan left words 1 at a time, always checking to see where you change from h to inside - BUT, we could detect equivalent LM states, which would be nice).
+
+
+  // if [i,end) are unscored words of length <= markov_order, score some of them on the right and return the number scored, i.e. [end-r,end) will have been scored for return value r. CAREFUL: for ngram you sometimes have to remember to pay all of the backoff once you see a few more words to the left.
+  template <class Accum>
+  int early_score_words(SentenceMetadata const& smeta,Hypergraph::Edge const& edge,WordID const* i,WordID const* end,Accum *accum) const {
+    return 0;
+  }
+
+  // this isn't currently used at all. this left-shortening is not recommended (it wasn't worth the computational expense for ngram): specifically for bottom-up scoring (ff_from_fsa), you can return a shorter left-words context - but this means e.g. for ngram tracking that a backoff occurred where the final BO cost isn't yet known. you would also have to remember any necessary info in your own state - in the future, ff_from_fsa on a list of fsa features would only shorten it to the max
+
   Features features() const { return features_; }
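The default early_score_words above scores nothing and returns 0. To make the intended contract concrete - score as many of the rightmost unscored words as you can now, report how many, and leave the rest for later - here is a stand-alone sketch. It is an illustration only: ToyUnigram, its hard-coded log-probabilities, and the use of int/double in place of cdec's WordID and Accum types are all hypothetical, and the SentenceMetadata/edge arguments are omitted.

// Stand-alone illustration of the early_score_words contract: given unscored
// words [i,end), score some suffix [end-r,end) now and return r; the caller
// accounts for the still-unscored prefix later. ToyUnigram is a hypothetical
// stand-in for a real LM, not a cdec type.
#include <cstdio>
#include <map>

struct ToyUnigram {
  std::map<int,double> logp; // word id -> log10 prob
  double word_logp(int w) const {
    std::map<int,double>::const_iterator it=logp.find(w);
    return it==logp.end() ? -99 : it->second; // floor unknown words
  }
  // a unigram model has markov_order 0, so every word is context-independent
  // and the whole span can be scored early (r == end-i).
  int early_score_words(int const* i,int const* end,double *accum) const {
    int r=0;
    for (int const* p=i;p<end;++p,++r) *accum+=word_logp(*p);
    return r;
  }
};

int main() {
  ToyUnigram lm;
  lm.logp[1]=-0.5; lm.logp[2]=-1.0;
  int words[]={1,2,3};
  double acc=0;
  int r=lm.early_score_words(words,words+3,&acc);
  std::printf("scored %d words, total logprob %g\n",r,acc); // 3, -100.5
  return 0;
}

For markov_order >= 1 an implementor would return a smaller r, and, per the CAREFUL note in the comment, still owes backoff mass once more left context becomes visible.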
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 9ba7b2c5..df11ab18 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -44,7 +44,9 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
   template <class Accum>
   void ScanAccum(SentenceMetadata const& /* smeta */,Hypergraph::Edge const& edge,WordID w,void const* old_st,void *new_st,Accum *a) const {
+#if USE_INFO_EDGE
     Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
+#endif
     if (!ctxlen_) {
       Add(floored(pimpl_->WordProb(w,&empty_context)),a);
       return;
@@ -72,7 +74,9 @@ struct LanguageModelFsa : public FsaFeatureFunctionBase<LanguageModelFsa> {
   //FIXME: there is a bug in here somewhere, or else the 3gram LM we use gives different scores for phrases (impossible? BOW nonzero when shortening context past what the LM has?)
   template <class Accum>
   void ScanPhraseAccum(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& edge,WordID const* begin,WordID const* end,void const* old_st,void *new_st,Accum *a) const {
+# if USE_INFO_EDGE
     Hypergraph::Edge &de=(Hypergraph::Edge &)edge;
+# endif
     if (begin==end) return; // otherwise w/ shortening it's possible to end up with no words at all.
     /* // this is forcing unigram prob always. we will instead build the phrase
     if (!ctxlen_) {
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index a129806d..9f44f1a4 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -15,8 +15,7 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
   WordPenaltyFsa(std::string const& param) {
     Init();
-    return;
-    //below are all defaults:
+    return; //below are all defaults:
     set_state_bytes(0);
     start.clear();
     h_start.clear();
@@ -104,7 +103,6 @@ struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
 };
 
 // similar example feature; base type exposes stateful type, defines markov_order 1, state size = sizeof(State)
-// also buggy right now: give it a bonus weight
 struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
   typedef FsaTypedBase<int,ShorterThanPrev> Base;
   static std::string usage(bool param,bool verbose) {
@@ -119,8 +117,7 @@ struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
     return std::strlen(TD::Convert(w));
   }
   ShorterThanPrev(std::string const& param)
-    : Base(-1,4/* ,singleton_sentence(TD::se) */)
-    // start, h_start, end_phrase
+    : Base(-1,4,singleton_sentence(TD::se)) // start, h_start, end_phrase
     // estimate: anything <4 chars is usually shorter than previous
   {
     Init();
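Both ScanAccum and ScanPhraseAccum now wrap the de alias in USE_INFO_EDGE guards: the alias exists only for logging, so in non-logging builds it would be an unused variable. Here is a minimal stand-alone sketch of that guard pattern, assuming a hypothetical Edge struct and EDGE_INFO macro in place of cdec's real types and logging macros:

// Debug-only locals compiled in only when USE_INFO_EDGE is enabled, so
// release builds neither pay for them nor warn about unused variables.
#include <iostream>
#include <sstream>

#ifndef USE_INFO_EDGE
# define USE_INFO_EDGE 0
#endif

struct Edge { std::ostringstream info; }; // hypothetical stand-in

#if USE_INFO_EDGE
# define EDGE_INFO(e,msg) do { (e).info << msg; } while(0)
#else
# define EDGE_INFO(e,msg) do {} while(0)
#endif

double scan_word(Edge const& edge,int w) {
#if USE_INFO_EDGE
  Edge &de=const_cast<Edge&>(edge); // non-const alias, used only for logging
  EDGE_INFO(de,"[w=" << w << "]");
#else
  (void)edge; // no logging: avoid an unused-parameter warning
#endif
  return -0.1*w; // placeholder per-word score
}

int main() {
  Edge e;
  std::cout << scan_word(e,3) << "\n"; // -0.3
#if USE_INFO_EDGE
  std::cout << e.info.str() << "\n";   // [w=3]
#endif
  return 0;
}

Building with -DUSE_INFO_EDGE=1 turns the logging path on; the default build compiles the alias and the macro body away entirely.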