faster alignment mode when full translation inference is not required

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@731 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-11-22 23:00:34 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-11-22 23:00:34 +0000
commit: c23a059654229dd85dd64b0e4c8a7f8fba456734 (patch)
tree: 25086e78902e1fa6767ff7f21c245e0174c08315
parent: a7ea5514a446482a4407fd522530fefd9231be66 (diff)
2 files changed, 14 insertions, 0 deletions
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index daf82f10..f47b7385 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -368,6 +368,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
     ("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices")
     ("promise_power",po::value<double>()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams.  0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit).  note: for the same pop_limit, this gives more search error unless very close to 0 (recommend disabled; even 0.01 is slightly worse than 0) which is a bad sign and suggests this isn't doing a good job; further it's slightly slower to LM cube rescore with 0.01 compared to 0, as well as giving (very insignificantly) lower BLEU.  TODO: test under more conditions, or try idea with different formula, or prob. cube beams.")
         ("lextrans_use_null", "Support source-side null words in lexical translation")
+        ("lextrans_align_only", "Only used in alignment mode. Limit target words generated by reference")
         ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
         ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
         ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index c3bd775f..874645de 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -14,6 +14,7 @@ using namespace std;
 struct LexicalTransImpl {
   LexicalTransImpl(const boost::program_options::variables_map& conf) :
       use_null(conf.count("lextrans_use_null") > 0),
+      align_only_(conf.count("lextrans_align_only") > 0),
       psg_file_(),
       kXCAT(TD::Convert("X")*-1),
       kNULL(TD::Convert("<eps>")),
@@ -75,6 +76,13 @@ struct LexicalTransImpl {
     // hack to tell the feature function system how big the sentence pair is
     const int f_start = (use_null ? -1 : 0);
     int prev_node_id = -1;
+    set<WordID> target_vocab; // filled only when align_only_ is true, with labels taken from the reference lattice
+    if (align_only_) {
+      const Lattice& ref = smeta.GetReference();
+      for (int i = 0; i < ref.size(); ++i) {
+        target_vocab.insert(ref[i][0].label);
+      }
+    }
     for (int i = 0; i < e_len; ++i) {  // for each word in the *target*
       Hypergraph::Node* node = forest->AddNode(kXCAT);
       const int new_node_id = node->id_;
@@ -93,6 +101,10 @@ struct LexicalTransImpl {
         assert(rb);
         for (int k = 0; k < rb->GetNumRules(); ++k) {
           TRulePtr rule = rb->GetIthRule(k);
+          if (align_only_) {
+            if (target_vocab.count(rule->f_[0]) == 0)
+              continue;
+          }
           Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
           edge->i_ = j;
           edge->j_ = j+1;
@@ -122,6 +134,7 @@ struct LexicalTransImpl {
 
  private:
   const bool use_null;
+  const bool align_only_;
   ifstream* psg_file_;
   const WordID kXCAT;
   const WordID kNULL;
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-11-22 23:00:34 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-11-22 23:00:34 +0000
commit	c23a059654229dd85dd64b0e4c8a7f8fba456734 (patch)
tree	25086e78902e1fa6767ff7f21c245e0174c08315
parent	a7ea5514a446482a4407fd522530fefd9231be66 (diff)