summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- decoder/decoder.cc 1
-rw-r--r-- decoder/lextrans.cc 13
2 files changed, 14 insertions, 0 deletions
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index daf82f10..f47b7385 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -368,6 +368,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices")
("promise_power",po::value<double>()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams. 0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit). note: for the same pop_limit, this gives more search error unless very close to 0 (recommend disabled; even 0.01 is slightly worse than 0) which is a bad sign and suggests this isn't doing a good job; further it's slightly slower to LM cube rescore with 0.01 compared to 0, as well as giving (very insignificantly) lower BLEU. TODO: test under more conditions, or try idea with different formula, or prob. cube beams.")
("lextrans_use_null", "Support source-side null words in lexical translation")
+ ("lextrans_align_only", "Only used in alignment mode. Limit target words generated by reference")
("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index c3bd775f..874645de 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -14,6 +14,7 @@ using namespace std;
struct LexicalTransImpl {
LexicalTransImpl(const boost::program_options::variables_map& conf) :
use_null(conf.count("lextrans_use_null") > 0),
+ align_only_(conf.count("lextrans_align_only") > 0),
psg_file_(),
kXCAT(TD::Convert("X")*-1),
kNULL(TD::Convert("<eps>")),
@@ -75,6 +76,13 @@ struct LexicalTransImpl {
// hack to tell the feature function system how big the sentence pair is
const int f_start = (use_null ? -1 : 0);
int prev_node_id = -1;
+ set<WordID> target_vocab; // only set for alignment_only mode
+ if (align_only_) {
+ const Lattice& ref = smeta.GetReference();
+ for (int i = 0; i < ref.size(); ++i) {
+ target_vocab.insert(ref[i][0].label);
+ }
+ }
for (int i = 0; i < e_len; ++i) { // for each word in the *target*
Hypergraph::Node* node = forest->AddNode(kXCAT);
const int new_node_id = node->id_;
@@ -93,6 +101,10 @@ struct LexicalTransImpl {
assert(rb);
for (int k = 0; k < rb->GetNumRules(); ++k) {
TRulePtr rule = rb->GetIthRule(k);
+ if (align_only_) {
+ if (target_vocab.count(rule->f_[0]) == 0)
+ continue;
+ }
Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
edge->i_ = j;
edge->j_ = j+1;
@@ -122,6 +134,7 @@ struct LexicalTransImpl {
private:
const bool use_null;
+ const bool align_only_;
ifstream* psg_file_;
const WordID kXCAT;
const WordID kNULL;