From c23a059654229dd85dd64b0e4c8a7f8fba456734 Mon Sep 17 00:00:00 2001 From: redpony Date: Mon, 22 Nov 2010 23:00:34 +0000 Subject: faster alignment mode when full translation inference is not required git-svn-id: https://ws10smt.googlecode.com/svn/trunk@731 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/decoder.cc | 1 + decoder/lextrans.cc | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index daf82f10..f47b7385 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -368,6 +368,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices") ("promise_power",po::value()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams. 0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit). note: for the same pop_limit, this gives more search error unless very close to 0 (recommend disabled; even 0.01 is slightly worse than 0) which is a bad sign and suggests this isn't doing a good job; further it's slightly slower to LM cube rescore with 0.01 compared to 0, as well as giving (very insignificantly) lower BLEU. TODO: test under more conditions, or try idea with different formula, or prob. cube beams.") ("lextrans_use_null", "Support source-side null words in lexical translation") + ("lextrans_align_only", "Only used in alignment mode. Limit target words generated by reference") ("tagger_tagset,t", po::value(), "(Tagger) file containing tag set") ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc index c3bd775f..874645de 100644 --- a/decoder/lextrans.cc +++ b/decoder/lextrans.cc @@ -14,6 +14,7 @@ using namespace std; struct LexicalTransImpl { LexicalTransImpl(const boost::program_options::variables_map& conf) : use_null(conf.count("lextrans_use_null") > 0), + align_only_(conf.count("lextrans_align_only") > 0), psg_file_(), kXCAT(TD::Convert("X")*-1), kNULL(TD::Convert("")), @@ -75,6 +76,13 @@ struct LexicalTransImpl { // hack to tell the feature function system how big the sentence pair is const int f_start = (use_null ? -1 : 0); int prev_node_id = -1; + set target_vocab; // only set for alignment_only mode + if (align_only_) { + const Lattice& ref = smeta.GetReference(); + for (int i = 0; i < ref.size(); ++i) { + target_vocab.insert(ref[i][0].label); + } + } for (int i = 0; i < e_len; ++i) { // for each word in the *target* Hypergraph::Node* node = forest->AddNode(kXCAT); const int new_node_id = node->id_; @@ -93,6 +101,10 @@ struct LexicalTransImpl { assert(rb); for (int k = 0; k < rb->GetNumRules(); ++k) { TRulePtr rule = rb->GetIthRule(k); + if (align_only_) { + if (target_vocab.count(rule->f_[0]) == 0) + continue; + } Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); edge->i_ = j; edge->j_ = j+1; @@ -122,6 +134,7 @@ struct LexicalTransImpl { private: const bool use_null; + const bool align_only_; ifstream* psg_file_; const WordID kXCAT; const WordID kNULL; -- cgit v1.2.3