diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-09 17:04:29 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2010-12-09 17:04:29 -0500 |
commit | 9a8cbe4db88e63378b6d3c4ec96438819f1f1131 (patch) | |
tree | abf1a23739a033eaabd62f61e39ac249d9cf7717 | |
parent | 61bfaf15c02a0555d8ffa5dd4e6ae32f09354610 (diff) |
major refactor of markov features for word alignment
-rw-r--r-- | decoder/cdec_ff.cc | 4 |
-rw-r--r-- | decoder/ff_wordalign.cc | 431 |
-rw-r--r-- | decoder/ff_wordalign.h | 100 |
-rw-r--r-- | decoder/lextrans.cc | 2 |
-rwxr-xr-x | word-aligner/aligner.pl | 14 |
-rw-r--r-- | word-aligner/makefiles/makefile.grammars | 8 |
-rwxr-xr-x | word-aligner/support/generate_word_pair_features.pl | 4 |
7 files changed, 163 insertions, 400 deletions
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index d6cf4572..e87ab5ab 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -50,13 +50,9 @@ void register_feature_functions() { #endif ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>); ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>); - ff_registry.Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>); ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>); ff_registry.Register("NewJump", new FFFactory<NewJump>); - ff_registry.Register("MarkovJump", new FFFactory<MarkovJump>); - ff_registry.Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>); ff_registry.Register("SourceBigram", new FFFactory<SourceBigram>); - ff_registry.Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>); ff_registry.Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>); ff_registry.Register("AlignerResults", new FFFactory<AlignerResults>); ff_registry.Register("CSplit_BasicFeatures", new FFFactory<BasicCSplitFeatures>); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 980c64ad..338f1a72 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -6,7 +6,13 @@ #include <sstream> #include <string> #include <cmath> +#include <tr1/unordered_map> +#include <boost/tuple/tuple.hpp> +#include "boost/tuple/tuple_comparison.hpp" +#include <boost/functional/hash.hpp> + +#include "factored_lexicon_helper.h" #include "verbose.h" #include "alignment_pharaoh.h" #include "stringlib.h" @@ -25,43 +31,6 @@ using namespace std; // TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature -Model2BinaryFeatures::Model2BinaryFeatures(const string& ) : - fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { - for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { - for (int j = 0; j < i; ++j) { - for (int k = 0; k < 
MAX_SENTENCE_SIZE; ++k) { - int& val = fids_[i][j][k]; - val = -1; - if (j < i) { - ostringstream os; - os << "M2FL:" << i << ":TI:" << k << "_SI:" << j; - val = FD::Convert(os.str()); - } - } - } - } -} - -void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& /*ant_states*/, - SparseVector<double>* features, - SparseVector<double>* // estimated_features - , - void* // state - ) const { - // if the source word is either null or the generated word - // has no position in the reference - if (edge.i_ == -1 || edge.prev_i_ == -1) - return; - - assert(smeta.GetTargetLength() > 0); - const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_]; - features->set_value(fid, 1.0); -// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; -} - - RelativeSentencePosition::RelativeSentencePosition(const string& param) : fid_(FD::Convert("RelativeSentencePosition")) { if (!param.empty()) { @@ -119,87 +88,6 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme // cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; } -MarkovJumpFClass::MarkovJumpFClass(const string& param) : - FeatureFunction(1), - fids_(MAX_SENTENCE_SIZE) { - cerr << " MarkovJumpFClass" << endl; - cerr << "Reading source POS tags from " << param << endl; - ReadFile rf(param); - istream& in = *rf.stream(); - set<WordID> classes; - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - vector<WordID> v; - TD::ConvertSentence(line, &v); - pos_.push_back(v); - for (int i = 0; i < v.size(); ++i) - classes.insert(v[i]); - } - cerr << " (" << pos_.size() << " lines)\n"; - cerr << " Classes: " << classes.size() << endl; - for 
(int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) { - map<WordID, map<int, int> >& cfids = fids_[ss]; - for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) { - map<int, int> &fids = cfids[*i]; - for (int j = -ss; j <= ss; ++j) { - ostringstream os; - os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j; - fids[j] = FD::Convert(os.str()); - } - } - } -} - -void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, - int prev_src_pos, - int cur_src_pos, - SparseVector<double>* features) const { - if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i) - return; - - const int jumpsize = cur_src_pos - prev_src_pos; - - assert(smeta.GetSentenceID() < pos_.size()); - const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; - const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; - features->set_value(fid, 1.0); -} - -void MarkovJumpFClass::FinalTraversalFeatures(const void* context, - SparseVector<double>* features) const { - int left_index = *static_cast<const unsigned char*>(context); -// int right_index = cur_flen; - // TODO -} - -void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_states, - SparseVector<double>* features, - SparseVector<double>* /* estimated_features */, - void* state) const { - unsigned char& dpstate = *((unsigned char*)state); - if (edge.Arity() == 0) { - dpstate = static_cast<unsigned int>(edge.i_); - } else if (edge.Arity() == 1) { - dpstate = *((unsigned char*)ant_states[0]); - } else if (edge.Arity() == 2) { - int left_index = *((unsigned char*)ant_states[0]); - int right_index = *((unsigned char*)ant_states[1]); - if (right_index == -1) - dpstate = static_cast<unsigned int>(left_index); - else - dpstate = static_cast<unsigned int>(right_index); -// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index]; -// cerr << edge.i_ << "," << edge.j_ << ": fclass=" 
<< TD::Convert(cur_fclass) << " j=" << jumpsize << endl; -// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; -// features->set_value(fid, 1.0); - FireFeature(smeta, left_index, right_index, features); - } -} - LexNullJump::LexNullJump(const string& param) : FeatureFunction(1), fid_lex_null_(FD::Convert("JumpLexNull")), @@ -239,107 +127,71 @@ void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, } } -MarkovJump::MarkovJump(const string& param) : +NewJump::NewJump(const string& param) : FeatureFunction(1), - fid_(FD::Convert("MarkovJump")), - fid_lex_null_(FD::Convert("JumpLexNull")), - fid_null_lex_(FD::Convert("JumpNullLex")), - fid_null_null_(FD::Convert("JumpNullNull")), - fid_lex_lex_(FD::Convert("JumpLexLex")), - binary_params_(false) { - cerr << " MarkovJump"; + kBOS_(TD::Convert("BOS")), + kEOS_(TD::Convert("EOS")) { + cerr << " NewJump"; vector<string> argv; + set<string> permitted; + permitted.insert("use_binned_log_lengths"); + permitted.insert("flen"); + permitted.insert("elen"); + permitted.insert("fprev"); + permitted.insert("f0"); + permitted.insert("f-1"); + permitted.insert("f+1"); + // also permitted f:FILENAME int argc = SplitOnWhitespace(param, &argv); - if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) { - cerr << "MarkovJump: expected parameters to be -b or +b\n"; - exit(1); - } - binary_params_ = argv[0] == "+b"; - if (binary_params_) { - flen2jump2fid_.resize(MAX_SENTENCE_SIZE); - for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { - map<int, int>& jump2fid = flen2jump2fid_[i]; - for (int jump = -i; jump <= i; ++jump) { - ostringstream os; - os << "Jump:FLen:" << i << "_J:" << jump; - jump2fid[jump] = FD::Convert(os.str()); - } - } - } else { - cerr << " (Blunsom & Cohn definition)"; - } - cerr << endl; -} - -// TODO handle NULLs according to Och 2000? 
-void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_states, - SparseVector<double>* features, - SparseVector<double>* /* estimated_features */, - void* state) const { - unsigned char& dpstate = *((unsigned char*)state); - const int flen = smeta.GetSourceLength(); - if (edge.Arity() == 0) { - dpstate = static_cast<unsigned int>(edge.i_); - if (edge.prev_i_ == 0) { // first word in sentence - if (edge.i_ >= 0 && binary_params_) { - const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second; - features->set_value(fid, 1.0); - } else if (edge.i_ < 0 && binary_params_) { - // handled by bigram features - } - } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) { - if (edge.i_ >= 0 && binary_params_) { - int jumpsize = flen - edge.i_; - const int fid = flen2jump2fid_[flen].find(jumpsize)->second; - features->set_value(fid, 1.0); - } else if (edge.i_ < 0 && binary_params_) { - // handled by bigram features - } - } - } else if (edge.Arity() == 1) { - dpstate = *((unsigned char*)ant_states[0]); - } else if (edge.Arity() == 2) { - int left_index = *((unsigned char*)ant_states[0]); - int right_index = *((unsigned char*)ant_states[1]); - if (right_index == -1) - dpstate = static_cast<unsigned int>(left_index); - else - dpstate = static_cast<unsigned int>(right_index); - if (left_index == kNULL_i || right_index == kNULL_i) { - if (left_index == kNULL_i && right_index == kNULL_i) - features->set_value(fid_null_null_, 1.0); - else if (left_index == kNULL_i) - features->set_value(fid_null_lex_, 1.0); - else - features->set_value(fid_lex_null_, 1.0); - + set<string> config; + string f_file; + for (int i = 0; i < argc; ++i) { + if (argv[i].size() > 2 && argv[i].find("f:") == 0) { + assert(f_file.empty()); // only one f file! 
+ f_file = argv[i].substr(2); + cerr << " source_file=" << f_file; } else { - features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled - const int jumpsize = right_index - left_index; - - if (binary_params_) { - const int fid = flen2jump2fid_[flen].find(jumpsize)->second; - features->set_value(fid, 1.0); + if (permitted.count(argv[i])) { + assert(config.count(argv[i]) == 0); + config.insert(argv[i]); + cerr << " " << argv[i]; } else { - features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def + cerr << "\nNewJump: don't understand param '" << argv[i] << "'\n"; + abort(); } } - } else { - assert(!"something really unexpected is happening"); } -} - -NewJump::NewJump(const string& param) : - FeatureFunction(1) { - cerr << " NewJump"; - vector<string> argv; - int argc = SplitOnWhitespace(param, &argv); - set<string> config; - for (int i = 0; i < argc; ++i) config.insert(argv[i]); cerr << endl; use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0; + f0_ = config.count("f0") > 0; + fm1_ = config.count("f-1") > 0; + fp1_ = config.count("f+1") > 0; + fprev_ = config.count("fprev") > 0; + elen_ = config.count("elen") > 0; + flen_ = config.count("flen") > 0; + if (f0_ || fm1_ || fp1_ || fprev_) { + if (f_file.empty()) { + cerr << "NewJump: conditioning on src but f:FILE not specified!\n"; + abort(); + } + ReadFile rf(f_file); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) continue; + vector<WordID> v; + TD::ConvertSentence(line, &v); + src_.push_back(v); + } + } + fid_str_ = "J"; + if (flen_) fid_str_ += "F"; + if (elen_) fid_str_ += "E"; + if (f0_) fid_str_ += "C"; + if (fm1_) fid_str_ += "L"; + if (fp1_) fid_str_ += "R"; + if (fprev_) fid_str_ += "P"; } // do a log transform on the length (of a sentence, a jump, etc) @@ -351,33 +203,66 @@ int BinnedLogLength(int len) { return res; } +// <0>=jump size <1>=jump_dir <2>=flen, <3>=elen, <4>=f0, <5>=f-1, <6>=f+1, <7>=fprev 
+typedef boost::tuple<short, char, short, short, WordID, WordID, WordID, WordID> NewJumpFeatureKey; + +struct KeyHash : unary_function<NewJumpFeatureKey, size_t> { + size_t operator()(const NewJumpFeatureKey& k) const { + size_t h = 0x37473DEF321; + boost::hash_combine(h, k.get<0>()); + boost::hash_combine(h, k.get<1>()); + boost::hash_combine(h, k.get<2>()); + boost::hash_combine(h, k.get<3>()); + boost::hash_combine(h, k.get<4>()); + boost::hash_combine(h, k.get<5>()); + boost::hash_combine(h, k.get<6>()); + boost::hash_combine(h, k.get<7>()); + return h; + } +}; + void NewJump::FireFeature(const SentenceMetadata& smeta, const int prev_src_index, const int cur_src_index, SparseVector<double>* features) const { + const int id = smeta.GetSentenceID(); const int src_len = smeta.GetSourceLength(); const int raw_jump = cur_src_index - prev_src_index; + short jump_magnitude = raw_jump; char jtype = 0; - int jump_magnitude = raw_jump; if (raw_jump > 0) { jtype = 'R'; } // Right else if (raw_jump == 0) { jtype = 'S'; } // Stay else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left - int effective_length = src_len; + int effective_src_len = src_len; + int effective_trg_len = smeta.GetTargetLength(); if (use_binned_log_lengths_) { jump_magnitude = BinnedLogLength(jump_magnitude); - effective_length = BinnedLogLength(src_len); - } - - if (true) { - static map<int, map<int, int> > len2jump2fid; - int& fid = len2jump2fid[src_len][raw_jump]; - if (!fid) { - ostringstream os; - os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude; - fid = FD::Convert(os.str()); - } - features->set_value(fid, 1.0); + effective_src_len = BinnedLogLength(src_len); + effective_trg_len = BinnedLogLength(effective_trg_len); + } + NewJumpFeatureKey key(jump_magnitude,jtype,0,0,0,0,0); + using boost::get; + if (flen_) get<2>(key) = effective_src_len; + if (elen_) get<3>(key) = effective_trg_len; + if (f0_) get<4>(key) = GetSourceWord(id, cur_src_index); + if (fm1_) 
get<5>(key) = GetSourceWord(id, cur_src_index - 1); + if (fp1_) get<6>(key) = GetSourceWord(id, cur_src_index + 1); + if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index); + + static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids; + int& fid = fids[key]; + if (!fid) { + ostringstream os; + os << fid_str_ << ':' << jtype << jump_magnitude; + if (flen_) os << ':' << get<2>(key); + if (elen_) os << ':' << get<3>(key); + if (f0_) os << ':' << TD::Convert(get<4>(key)); + if (fm1_) os << ':' << TD::Convert(get<5>(key)); + if (fp1_) os << ':' << TD::Convert(get<6>(key)); + if (fprev_) os << ':' << TD::Convert(get<7>(key)); + fid = FD::Convert(os.str()); } + features->set_value(fid, 1.0); } void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, @@ -387,6 +272,7 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, SparseVector<double>* /* estimated_features */, void* state) const { unsigned char& dpstate = *((unsigned char*)state); + // IMPORTANT: this only fires on non-Null transitions! const int flen = smeta.GetSourceLength(); if (edge.Arity() == 0) { dpstate = static_cast<unsigned int>(edge.i_); @@ -427,6 +313,23 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, SourceBigram::SourceBigram(const std::string& param) : FeatureFunction(sizeof(WordID) + sizeof(int)) { + fid_str_ = "SB:"; + if (param.size() > 0) { + vector<string> argv; + int argc = SplitOnWhitespace(param, &argv); + if (argc != 2) { + cerr << "SourceBigram [FEATURE_NAME_PREFIX PATH]\n"; + abort(); + } + fid_str_ = argv[0] + ":"; + lexmap_.reset(new FactoredLexiconHelper(argv[1], "*")); + } else { + lexmap_.reset(new FactoredLexiconHelper); + } +} + +void SourceBigram::PrepareForInput(const SentenceMetadata& smeta) { + lexmap_->PrepareForInput(smeta); } void SourceBigram::FinalTraversalFeatures(const void* context, @@ -445,7 +348,7 @@ void SourceBigram::FireFeature(WordID left, // TODO important important !!! escape strings !!! 
if (!fid) { ostringstream os; - os << "SB:"; + os << fid_str_; if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } os << '_'; if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } @@ -465,85 +368,7 @@ void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, int& out_word_count = *(static_cast<int*>(context) + 1); const int arity = edge.Arity(); if (arity == 0) { - out_context = edge.rule_->f()[0]; - out_word_count = edge.rule_->EWords(); - assert(out_word_count == 1); // this is only defined for lex translation! - // revisit this if you want to translate into null words - } else if (arity == 2) { - WordID left = *static_cast<const WordID*>(ant_contexts[0]); - WordID right = *static_cast<const WordID*>(ant_contexts[1]); - int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1); - int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1); - if (left_wc == 1 && right_wc == 1) - FireFeature(-1, left, features); - FireFeature(left, right, features); - out_word_count = left_wc + right_wc; - out_context = right; - } -} -// state: POS of src word used, number of trg words generated -SourcePOSBigram::SourcePOSBigram(const std::string& param) : - FeatureFunction(sizeof(WordID) + sizeof(int)) { - cerr << "Reading source POS tags from " << param << endl; - ReadFile rf(param); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - vector<WordID> v; - TD::ConvertSentence(line, &v); - pos_.push_back(v); - } - cerr << " (" << pos_.size() << " lines)\n"; -} - -void SourcePOSBigram::FinalTraversalFeatures(const void* context, - SparseVector<double>* features) const { - WordID left = *static_cast<const WordID*>(context); - int left_wc = *(static_cast<const int*>(context) + 1); - if (left_wc == 1) - FireFeature(-1, left, features); - FireFeature(left, -1, features); -} - -void SourcePOSBigram::FireFeature(WordID left, - WordID right, - SparseVector<double>* features) const { - 
int& fid = fmap_[left][right]; - if (!fid) { - ostringstream os; - os << "SP:"; - if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } - os << '_'; - if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } - fid = FD::Convert(os.str()); - if (fid == 0) fid = -1; - } - if (fid < 0) return; - features->set_value(fid, 1.0); -} - -void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* /* estimated_features */, - void* context) const { - WordID& out_context = *static_cast<WordID*>(context); - int& out_word_count = *(static_cast<int*>(context) + 1); - const int arity = edge.Arity(); - if (arity == 0) { - assert(smeta.GetSentenceID() < pos_.size()); - const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()]; - if (edge.i_ >= 0) { // non-NULL source - assert(edge.i_ < pos_sent.size()); - out_context = pos_sent[edge.i_]; - } else { // NULL source - // should assert that source is kNULL? - static const WordID kNULL = TD::Convert("<eps>"); - out_context = kNULL; - } + out_context = lexmap_->SourceWordAtPosition(edge.i_); out_word_count = edge.rule_->EWords(); assert(out_word_count == 1); // this is only defined for lex translation! 
// revisit this if you want to translate into null words diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index 418c8768..a1ffd9ca 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -3,7 +3,9 @@ #include "ff.h" #include "array2d.h" +#include "factored_lexicon_helper.h" +#include <boost/scoped_ptr.hpp> #include <boost/multi_array.hpp> class RelativeSentencePosition : public FeatureFunction { @@ -23,64 +25,6 @@ class RelativeSentencePosition : public FeatureFunction { std::map<WordID, int> fids_; // fclass -> fid }; -class Model2BinaryFeatures : public FeatureFunction { - public: - Model2BinaryFeatures(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const; - private: - boost::multi_array<int, 3> fids_; -}; - -class MarkovJump : public FeatureFunction { - public: - MarkovJump(const std::string& param); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const; - private: - const int fid_; - const int fid_lex_null_; - const int fid_null_lex_; - const int fid_null_null_; - const int fid_lex_lex_; - - bool binary_params_; - std::vector<std::map<int, int> > flen2jump2fid_; -}; - -class MarkovJumpFClass : public FeatureFunction { - public: - MarkovJumpFClass(const std::string& param); - virtual void FinalTraversalFeatures(const void* context, - SparseVector<double>* features) const; - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* 
estimated_features, - void* context) const; - - void FireFeature(const SentenceMetadata& smeta, - int prev_src_pos, - int cur_src_pos, - SparseVector<double>* features) const; - - private: - std::vector<std::map<WordID, std::map<int, int> > > fids_; // flen -> fclass -> jumpsize -> fid - std::vector<std::vector<WordID> > pos_; -}; - typedef std::map<WordID, int> Class2FID; typedef std::map<WordID, Class2FID> Class2Class2FID; typedef std::map<WordID, Class2Class2FID> Class2Class2Class2FID; @@ -89,6 +33,7 @@ class SourceBigram : public FeatureFunction { SourceBigram(const std::string& param); virtual void FinalTraversalFeatures(const void* context, SparseVector<double>* features) const; + void PrepareForInput(const SentenceMetadata& smeta); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -100,7 +45,9 @@ class SourceBigram : public FeatureFunction { void FireFeature(WordID src, WordID trg, SparseVector<double>* features) const; + std::string fid_str_; mutable Class2Class2FID fmap_; + boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source }; class LexNullJump : public FeatureFunction { @@ -136,30 +83,27 @@ class NewJump : public FeatureFunction { const int cur_src_index, SparseVector<double>* features) const; + WordID GetSourceWord(int sentence_id, int index) const { + if (index < 0) return kBOS_; + assert(src_.size() > sentence_id); + const std::vector<WordID>& v = src_[sentence_id]; + if (index >= v.size()) return kEOS_; + return v[index]; + } + + const WordID kBOS_; + const WordID kEOS_; bool use_binned_log_lengths_; + bool flen_; + bool elen_; + bool f0_; + bool fm1_; + bool fp1_; + bool fprev_; + std::vector<std::vector<WordID> > src_; std::string fid_str_; // identifies configuration uniquely }; -class SourcePOSBigram : public FeatureFunction { - public: - SourcePOSBigram(const std::string& param); - virtual void FinalTraversalFeatures(const void* context, - 
SparseVector<double>* features) const; - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - private: - void FireFeature(WordID src, - WordID trg, - SparseVector<double>* features) const; - mutable Class2Class2FID fmap_; - std::vector<std::vector<WordID> > pos_; -}; - class LexicalTranslationTrigger : public FeatureFunction { public: LexicalTranslationTrigger(const std::string& param); diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc index 149cd68d..f237295c 100644 --- a/decoder/lextrans.cc +++ b/decoder/lextrans.cc @@ -81,7 +81,7 @@ struct LexicalTransImpl { for (int i = 0; i < ref.size(); ++i) { target_vocab.insert(ref[i][0].label); } - bool all_sources_to_all_targets_ = false; + bool all_sources_to_all_targets_ = false; // TODO configure this set<WordID> trgs_used; for (int i = 0; i < e_len; ++i) { // for each word in the *target* Hypergraph::Node* node = forest->AddNode(kXCAT); diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 81ac4198..f5ee5d3f 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -120,17 +120,19 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz feature_function=LexicalPairIdentity -feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second +# stem translation feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map +# POS translation +feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second feature_function=InputIdentity feature_function=OutputIdentity feature_function=RelativeSentencePosition 
$align_dir/grammars/corpus.class.$first -# the following two are deprecated -feature_function=MarkovJump +b -feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first +feature_function=NewJump +feature_function=NewJump use_binned_log_lengths flen +# jump distance and src and destination class type +feature_function=NewJump use_binned_log_lengths f0 fprev f:$align_dir/grammars/corpus.class.$first feature_function=SourceBigram -# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does -feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first +feature_function=SourceBigram SC $align_dir/grammars/corpus.class.$first EOT close CDEC; open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!"; diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index be0644df..1a069abf 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -1,14 +1,13 @@ -all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml +all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map clean: - $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs* + $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs* SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip ZCAT = zcat EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl -GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl ORTHONORM_E = 
$(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl @@ -84,6 +83,3 @@ corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@ -corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e - $(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@ - diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl index b28f6feb..54b89ce1 100755 --- a/word-aligner/support/generate_word_pair_features.pl +++ b/word-aligner/support/generate_word_pair_features.pl @@ -92,7 +92,7 @@ my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NULL = 1; my $ADD_MODEL1 = 1; -my $ADD_NOMODEL1 = 1; +my $ADD_NOMODEL1 = 0; my $BEAM_RATIO = 50; my $BIN_ORTHO = 1; my $BIN_DLEN = 1; @@ -171,7 +171,7 @@ for my $f (sort keys %fdict) { } if ($im1 > $MIN_MAGNITUDE) { push @feats, "InvModel1=$im1" if $im1; - } else { + } elsif ($ADD_NOMODEL1) { push @feats, 'NoInvModel1=1'; } my $am1 = sprintf("%.5g", sqrt($m1 * $im1)); |