diff options
Diffstat (limited to 'decoder/apply_fsa_models.cc')
-rwxr-xr-x | decoder/apply_fsa_models.cc | 76 |
1 files changed, 71 insertions, 5 deletions
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc index 8771863c..b43c02a4 100755 --- a/decoder/apply_fsa_models.cc +++ b/decoder/apply_fsa_models.cc @@ -11,6 +11,10 @@ #include "hg_cfg.h" #include "utoa.h" #include "hash.h" +#include "value_array.h" + +#define DFSA(x) x +#define DPFSA(x) x using namespace std; @@ -26,31 +30,91 @@ typedef CFG::RuleHandle RuleHandle; namespace { +/* + +1) A -> x . * (trie) + +this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps? + +vs. just + +2) * -> x . y + +here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first. + +vs. + +3) * -> x . * + +with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all. +*/ + +#define TRIE_START_LHS 1 +// option 1) above. 0 would be option 3), which is dumb + // if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently. if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion. this is effectively left-branching binarization, of course. template <class K,class V,class Hash> -struct prefix_map_type { +struct fsa_map_type { typedef std::map<K,V> type; }; //template typedef -#define PREFIX_MAP(k,v) prefix_map_type<k,v,boost::hash<k> >::type -typedef NTHandle LHS; +#define FSA_MAP(k,v) fsa_map_type<k,v,boost::hash<k> >::type +typedef WordID LHS; // so this will be -NTHandle. + + struct PrefixTrieNode { prob_t backward; // (viterbi) backward prob (for cost pushing) - typedef PREFIX_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS +# +#if TRIE_START_LHS + typedef FSA_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS Completed completed; - typedef PREFIX_MAP(WordID,PrefixTrieNode *) Adj; +#else + bool complete; // may also have successors, of course +#endif + // instead of completed map, we have trie start w/ lhs.? + + // outgoing edges will be ordered highest p to worst p + struct Edge { + DPFSA(prob_t p;) // we can probably just store deltas, but for debugging remember the full p + prob_t delta; // p[i]=delta*p[i-1], with p(-1)=1 + PrefixTrieNode *dest; + WordID w; + }; + typedef FSA_MAP(WordID,Edge) BuildAdj; + BuildAdj build_adj; //TODO: move builder elsewhere? + typedef ValueArray<Edge> Adj; +// typedef vector<Edge> Adj; Adj adj; //TODO: }; +#if TRIE_START_LHS +//Trie starts with lhs, then continues w/ rhs +#else +// just rhs. i think item names should exclude lhs if possible (most sharing). get prefix cost w/ forward = viterbi (global best-first admissable h only) and it should be ok? +#endif + +// costs are pushed. struct PrefixTrie { CFG const* cfgp; CFG const& cfg() const { return *cfgp; } PrefixTrie(CFG const& cfg) : cfgp(&cfg) { + //TODO: } }; +// these should go in a global best-first queue +struct Item { + prob_t forward; + /* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all + constrained paths of length that end in state X[k]->x.y*/ + prob_t inner; + /* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all + paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */ + +}; + }//anon ns @@ -102,6 +166,8 @@ void ApplyFsa::ApplyBottomUp() void ApplyFsa::ApplyEarley() { hgcfg.GiveCFG(cfg); + cfg.SortLocalBestFirst(); + // don't need to uniq - option to do that already exists in cfg_options //TODO: } |