summaryrefslogtreecommitdiff
path: root/decoder/apply_fsa_models.cc
diff options
context:
space:
mode:
authorgraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-18 21:26:55 +0000
committergraehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-08-18 21:26:55 +0000
commit769aadfc431f2eec68c889b65b8939a4d35f56e9 (patch)
tree31a496a1a49416afde4d5cee668969acc7497e9f /decoder/apply_fsa_models.cc
parent242774be8f3e4c08d1406ca4ecc9abcc1ca7d190 (diff)
ValueArray instead of string for state - same LM decode scores
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@593 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/apply_fsa_models.cc')
-rwxr-xr-xdecoder/apply_fsa_models.cc76
1 files changed, 71 insertions, 5 deletions
diff --git a/decoder/apply_fsa_models.cc b/decoder/apply_fsa_models.cc
index 8771863c..b43c02a4 100755
--- a/decoder/apply_fsa_models.cc
+++ b/decoder/apply_fsa_models.cc
@@ -11,6 +11,10 @@
#include "hg_cfg.h"
#include "utoa.h"
#include "hash.h"
+#include "value_array.h"
+
+#define DFSA(x) x
+#define DPFSA(x) x
using namespace std;
@@ -26,31 +30,91 @@ typedef CFG::RuleHandle RuleHandle;
namespace {
+/*
+
+1) A -> x . * (trie)
+
+this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps?
+
+vs. just
+
+2) * -> x . y
+
+here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first.
+
+vs.
+
+3) * -> x . *
+
+with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all.
+*/
+
+#define TRIE_START_LHS 1
+// option 1) above. 0 would be option 3), which is dumb
+
// if we don't greedy-binarize, we want to encode recognized prefixes p (X -> p . rest) efficiently. if we're doing this, we may as well also push costs so we can best-first select rules in a lazy fashion. this is effectively left-branching binarization, of course.
template <class K,class V,class Hash>
-struct prefix_map_type {
+struct fsa_map_type {
typedef std::map<K,V> type;
};
//template typedef
-#define PREFIX_MAP(k,v) prefix_map_type<k,v,boost::hash<k> >::type
-typedef NTHandle LHS;
+#define FSA_MAP(k,v) fsa_map_type<k,v,boost::hash<k> >::type
+typedef WordID LHS; // so this will be -NTHandle.
+
+
struct PrefixTrieNode {
prob_t backward; // (viterbi) backward prob (for cost pushing)
- typedef PREFIX_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS
+#
+#if TRIE_START_LHS
+ typedef FSA_MAP(LHS,RuleHandle) Completed; // can only have one rule w/ a given signature (duplicates should be collapsed when making CFG). but there may be multiple rules, with different LHS
Completed completed;
- typedef PREFIX_MAP(WordID,PrefixTrieNode *) Adj;
+#else
+ bool complete; // may also have successors, of course
+#endif
+ // instead of completed map, we have trie start w/ lhs.?
+
+ // outgoing edges will be ordered highest p to worst p
+ struct Edge {
+ DPFSA(prob_t p;) // we can probably just store deltas, but for debugging remember the full p
+ prob_t delta; // p[i]=delta*p[i-1], with p(-1)=1
+ PrefixTrieNode *dest;
+ WordID w;
+ };
+ typedef FSA_MAP(WordID,Edge) BuildAdj;
+ BuildAdj build_adj; //TODO: move builder elsewhere?
+ typedef ValueArray<Edge> Adj;
+// typedef vector<Edge> Adj;
Adj adj;
//TODO:
};
+#if TRIE_START_LHS
+//Trie starts with lhs, then continues w/ rhs
+#else
+// just rhs. i think item names should exclude lhs if possible (most sharing). get prefix cost w/ forward = viterbi (global best-first admissable h only) and it should be ok?
+#endif
+
+// costs are pushed.
struct PrefixTrie {
CFG const* cfgp;
CFG const& cfg() const { return *cfgp; }
PrefixTrie(CFG const& cfg) : cfgp(&cfg) {
+
//TODO:
}
};
+// these should go in a global best-first queue
+struct Item {
+ prob_t forward;
+ /* The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all
+ constrained paths of length that end in state X[k]->x.y*/
+ prob_t inner;
+ /* The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all
+ paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] */
+
+};
+
}//anon ns
@@ -102,6 +166,8 @@ void ApplyFsa::ApplyBottomUp()
void ApplyFsa::ApplyEarley()
{
hgcfg.GiveCFG(cfg);
+ cfg.SortLocalBestFirst();
+ // don't need to uniq - option to do that already exists in cfg_options
//TODO:
}