author    Chris Dyer <cdyer@cs.cmu.edu>  2010-12-09 17:04:29 -0500
committer Chris Dyer <cdyer@cs.cmu.edu>  2010-12-09 17:04:29 -0500
commit    35142ef52f15d610ca08fa622b83594cf111ce4a (patch)
tree      c2196761993353bca47c7073e6cb5d996c4dad8f /decoder/ff_wordalign.cc
parent    a80c69d266886d9911eb91833811d7f8393ac64d (diff)
major refactor of Markov features for word alignment
Diffstat (limited to 'decoder/ff_wordalign.cc')
-rw-r--r--  decoder/ff_wordalign.cc | 431
1 file changed, 128 insertions(+), 303 deletions(-)
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 980c64ad..338f1a72 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -6,7 +6,13 @@
#include <sstream>
#include <string>
#include <cmath>
+#include <tr1/unordered_map>
+#include <boost/tuple/tuple.hpp>
+#include "boost/tuple/tuple_comparison.hpp"
+#include <boost/functional/hash.hpp>
+
+#include "factored_lexicon_helper.h"
#include "verbose.h"
#include "alignment_pharaoh.h"
#include "stringlib.h"
@@ -25,43 +31,6 @@ using namespace std;
// TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature
-Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
- fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
- for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
- for (int j = 0; j < i; ++j) {
- for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) {
- int& val = fids_[i][j][k];
- val = -1;
- if (j < i) {
- ostringstream os;
- os << "M2FL:" << i << ":TI:" << k << "_SI:" << j;
- val = FD::Convert(os.str());
- }
- }
- }
- }
-}
-
-void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& /*ant_states*/,
- SparseVector<double>* features,
- SparseVector<double>* // estimated_features
- ,
- void* // state
- ) const {
- // if the source word is either null or the generated word
- // has no position in the reference
- if (edge.i_ == -1 || edge.prev_i_ == -1)
- return;
-
- assert(smeta.GetTargetLength() > 0);
- const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_];
- features->set_value(fid, 1.0);
-// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
-}
-
-
RelativeSentencePosition::RelativeSentencePosition(const string& param) :
fid_(FD::Convert("RelativeSentencePosition")) {
if (!param.empty()) {
@@ -119,87 +88,6 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme
// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
}
-MarkovJumpFClass::MarkovJumpFClass(const string& param) :
- FeatureFunction(1),
- fids_(MAX_SENTENCE_SIZE) {
- cerr << " MarkovJumpFClass" << endl;
- cerr << "Reading source POS tags from " << param << endl;
- ReadFile rf(param);
- istream& in = *rf.stream();
- set<WordID> classes;
- while(in) {
- string line;
- getline(in, line);
- if (line.empty()) continue;
- vector<WordID> v;
- TD::ConvertSentence(line, &v);
- pos_.push_back(v);
- for (int i = 0; i < v.size(); ++i)
- classes.insert(v[i]);
- }
- cerr << " (" << pos_.size() << " lines)\n";
- cerr << " Classes: " << classes.size() << endl;
- for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) {
- map<WordID, map<int, int> >& cfids = fids_[ss];
- for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) {
- map<int, int> &fids = cfids[*i];
- for (int j = -ss; j <= ss; ++j) {
- ostringstream os;
- os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j;
- fids[j] = FD::Convert(os.str());
- }
- }
- }
-}
-
-void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta,
- int prev_src_pos,
- int cur_src_pos,
- SparseVector<double>* features) const {
- if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i)
- return;
-
- const int jumpsize = cur_src_pos - prev_src_pos;
-
- assert(smeta.GetSentenceID() < pos_.size());
- const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos];
- const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
- features->set_value(fid, 1.0);
-}
-
-void MarkovJumpFClass::FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const {
- int left_index = *static_cast<const unsigned char*>(context);
-// int right_index = cur_flen;
- // TODO
-}
-
-void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_states,
- SparseVector<double>* features,
- SparseVector<double>* /* estimated_features */,
- void* state) const {
- unsigned char& dpstate = *((unsigned char*)state);
- if (edge.Arity() == 0) {
- dpstate = static_cast<unsigned int>(edge.i_);
- } else if (edge.Arity() == 1) {
- dpstate = *((unsigned char*)ant_states[0]);
- } else if (edge.Arity() == 2) {
- int left_index = *((unsigned char*)ant_states[0]);
- int right_index = *((unsigned char*)ant_states[1]);
- if (right_index == -1)
- dpstate = static_cast<unsigned int>(left_index);
- else
- dpstate = static_cast<unsigned int>(right_index);
-// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index];
-// cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl;
-// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
-// features->set_value(fid, 1.0);
- FireFeature(smeta, left_index, right_index, features);
- }
-}
-
LexNullJump::LexNullJump(const string& param) :
FeatureFunction(1),
fid_lex_null_(FD::Convert("JumpLexNull")),
@@ -239,107 +127,71 @@ void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
}
-MarkovJump::MarkovJump(const string& param) :
+NewJump::NewJump(const string& param) :
FeatureFunction(1),
- fid_(FD::Convert("MarkovJump")),
- fid_lex_null_(FD::Convert("JumpLexNull")),
- fid_null_lex_(FD::Convert("JumpNullLex")),
- fid_null_null_(FD::Convert("JumpNullNull")),
- fid_lex_lex_(FD::Convert("JumpLexLex")),
- binary_params_(false) {
- cerr << " MarkovJump";
+ kBOS_(TD::Convert("BOS")),
+ kEOS_(TD::Convert("EOS")) {
+ cerr << " NewJump";
vector<string> argv;
+ set<string> permitted;
+ permitted.insert("use_binned_log_lengths");
+ permitted.insert("flen");
+ permitted.insert("elen");
+ permitted.insert("fprev");
+ permitted.insert("f0");
+ permitted.insert("f-1");
+ permitted.insert("f+1");
+ // also permitted f:FILENAME
int argc = SplitOnWhitespace(param, &argv);
- if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) {
- cerr << "MarkovJump: expected parameters to be -b or +b\n";
- exit(1);
- }
- binary_params_ = argv[0] == "+b";
- if (binary_params_) {
- flen2jump2fid_.resize(MAX_SENTENCE_SIZE);
- for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
- map<int, int>& jump2fid = flen2jump2fid_[i];
- for (int jump = -i; jump <= i; ++jump) {
- ostringstream os;
- os << "Jump:FLen:" << i << "_J:" << jump;
- jump2fid[jump] = FD::Convert(os.str());
- }
- }
- } else {
- cerr << " (Blunsom & Cohn definition)";
- }
- cerr << endl;
-}
-
-// TODO handle NULLs according to Och 2000?
-void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& ant_states,
- SparseVector<double>* features,
- SparseVector<double>* /* estimated_features */,
- void* state) const {
- unsigned char& dpstate = *((unsigned char*)state);
- const int flen = smeta.GetSourceLength();
- if (edge.Arity() == 0) {
- dpstate = static_cast<unsigned int>(edge.i_);
- if (edge.prev_i_ == 0) { // first word in sentence
- if (edge.i_ >= 0 && binary_params_) {
- const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;
- features->set_value(fid, 1.0);
- } else if (edge.i_ < 0 && binary_params_) {
- // handled by bigram features
- }
- } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {
- if (edge.i_ >= 0 && binary_params_) {
- int jumpsize = flen - edge.i_;
- const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
- features->set_value(fid, 1.0);
- } else if (edge.i_ < 0 && binary_params_) {
- // handled by bigram features
- }
- }
- } else if (edge.Arity() == 1) {
- dpstate = *((unsigned char*)ant_states[0]);
- } else if (edge.Arity() == 2) {
- int left_index = *((unsigned char*)ant_states[0]);
- int right_index = *((unsigned char*)ant_states[1]);
- if (right_index == -1)
- dpstate = static_cast<unsigned int>(left_index);
- else
- dpstate = static_cast<unsigned int>(right_index);
- if (left_index == kNULL_i || right_index == kNULL_i) {
- if (left_index == kNULL_i && right_index == kNULL_i)
- features->set_value(fid_null_null_, 1.0);
- else if (left_index == kNULL_i)
- features->set_value(fid_null_lex_, 1.0);
- else
- features->set_value(fid_lex_null_, 1.0);
-
+ set<string> config;
+ string f_file;
+ for (int i = 0; i < argc; ++i) {
+ if (argv[i].size() > 2 && argv[i].find("f:") == 0) {
+ assert(f_file.empty()); // only one f file!
+ f_file = argv[i].substr(2);
+ cerr << " source_file=" << f_file;
} else {
- features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled
- const int jumpsize = right_index - left_index;
-
- if (binary_params_) {
- const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
- features->set_value(fid, 1.0);
+ if (permitted.count(argv[i])) {
+ assert(config.count(argv[i]) == 0);
+ config.insert(argv[i]);
+ cerr << " " << argv[i];
} else {
- features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def
+ cerr << "\nNewJump: don't understand param '" << argv[i] << "'\n";
+ abort();
}
}
- } else {
- assert(!"something really unexpected is happening");
}
-}
-
-NewJump::NewJump(const string& param) :
- FeatureFunction(1) {
- cerr << " NewJump";
- vector<string> argv;
- int argc = SplitOnWhitespace(param, &argv);
- set<string> config;
- for (int i = 0; i < argc; ++i) config.insert(argv[i]);
cerr << endl;
use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0;
+ f0_ = config.count("f0") > 0;
+ fm1_ = config.count("f-1") > 0;
+ fp1_ = config.count("f+1") > 0;
+ fprev_ = config.count("fprev") > 0;
+ elen_ = config.count("elen") > 0;
+ flen_ = config.count("flen") > 0;
+ if (f0_ || fm1_ || fp1_ || fprev_) {
+ if (f_file.empty()) {
+ cerr << "NewJump: conditioning on src but f:FILE not specified!\n";
+ abort();
+ }
+ ReadFile rf(f_file);
+ istream& in = *rf.stream();
+ string line;
+ while(in) {
+ getline(in, line);
+ if (!in) continue;
+ vector<WordID> v;
+ TD::ConvertSentence(line, &v);
+ src_.push_back(v);
+ }
+ }
+ fid_str_ = "J";
+ if (flen_) fid_str_ += "F";
+ if (elen_) fid_str_ += "E";
+ if (f0_) fid_str_ += "C";
+ if (fm1_) fid_str_ += "L";
+ if (fp1_) fid_str_ += "R";
+ if (fprev_) fid_str_ += "P";
}
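
NewJump can therefore be declared with any subset of the permitted flags plus an
f:FILENAME source file. As a hedged illustration, assuming the usual cdec.ini
feature_function= syntax (the file name source.txt is a placeholder):

    feature_function=NewJump use_binned_log_lengths flen f0 f:source.txt

With flen and f0 enabled, fid_str_ becomes "JFC", so fired features get names
like JFC:R2:3:haus (jump direction and magnitude, binned source length, current
source word; "haus" is an example word).
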
// do a log transform on the length (of a sentence, a jump, etc)
@@ -351,33 +203,66 @@ int BinnedLogLength(int len) {
return res;
}
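
Only the tail of BinnedLogLength is visible in this hunk. Its purpose is to map
raw lengths and jump widths onto coarse logarithmic bins so that features
generalize across nearby values. A minimal sketch of such a transform follows;
the base and cutoffs are illustrative assumptions, not the constants in the file:

    // illustrative only: bin = number of binary digits of len, so
    // 1, 2-3, 4-7, 8-15, ... each fall into one bin
    int BinnedLogLengthSketch(int len) {
      int res = 0;
      while (len > 0) { ++res; len >>= 1; }
      return res;
    }
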
+// <0>=jump_size, <1>=jump_dir, <2>=flen, <3>=elen, <4>=f0, <5>=f-1, <6>=f+1, <7>=fprev
+typedef boost::tuple<short, char, short, short, WordID, WordID, WordID, WordID> NewJumpFeatureKey;
+
+struct KeyHash : unary_function<NewJumpFeatureKey, size_t> {
+ size_t operator()(const NewJumpFeatureKey& k) const {
+ size_t h = 0x37473DEF321;
+ boost::hash_combine(h, k.get<0>());
+ boost::hash_combine(h, k.get<1>());
+ boost::hash_combine(h, k.get<2>());
+ boost::hash_combine(h, k.get<3>());
+ boost::hash_combine(h, k.get<4>());
+ boost::hash_combine(h, k.get<5>());
+ boost::hash_combine(h, k.get<6>());
+ boost::hash_combine(h, k.get<7>());
+ return h;
+ }
+};
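
This hash functor is what lets FireFeature below memoize feature IDs in a static
unordered_map: the string formatting and FD::Convert call happen only the first
time a given key is seen. A minimal, self-contained sketch of the same pattern
(FD::Convert replaced by a stub, since the cdec headers are not shown here):

    #include <cstddef>
    #include <string>
    #include <tr1/unordered_map>
    #include <boost/tuple/tuple.hpp>
    #include "boost/tuple/tuple_comparison.hpp"  // operator== for tuple keys
    #include <boost/functional/hash.hpp>

    typedef boost::tuple<short, char> MiniKey;

    struct MiniKeyHash {
      std::size_t operator()(const MiniKey& k) const {
        std::size_t h = 0;
        boost::hash_combine(h, k.get<0>());
        boost::hash_combine(h, k.get<1>());
        return h;
      }
    };

    // stand-in for FD::Convert, which interns a feature name as an integer id
    int StubConvert(const std::string& name) { return static_cast<int>(name.size()); }

    int LookupFid(const MiniKey& key) {
      static std::tr1::unordered_map<MiniKey, int, MiniKeyHash> fids;
      int& fid = fids[key];                 // value-initialized to 0 when first inserted
      if (!fid) fid = StubConvert("J:R2");  // format and convert only once per key
      return fid;
    }
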
+
void NewJump::FireFeature(const SentenceMetadata& smeta,
const int prev_src_index,
const int cur_src_index,
SparseVector<double>* features) const {
+ const int id = smeta.GetSentenceID();
const int src_len = smeta.GetSourceLength();
const int raw_jump = cur_src_index - prev_src_index;
+ short jump_magnitude = raw_jump;
char jtype = 0;
- int jump_magnitude = raw_jump;
if (raw_jump > 0) { jtype = 'R'; } // Right
else if (raw_jump == 0) { jtype = 'S'; } // Stay
else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left
- int effective_length = src_len;
+ int effective_src_len = src_len;
+ int effective_trg_len = smeta.GetTargetLength();
if (use_binned_log_lengths_) {
jump_magnitude = BinnedLogLength(jump_magnitude);
- effective_length = BinnedLogLength(src_len);
- }
-
- if (true) {
- static map<int, map<int, int> > len2jump2fid;
- int& fid = len2jump2fid[src_len][raw_jump];
- if (!fid) {
- ostringstream os;
- os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude;
- fid = FD::Convert(os.str());
- }
- features->set_value(fid, 1.0);
+ effective_src_len = BinnedLogLength(src_len);
+ effective_trg_len = BinnedLogLength(effective_trg_len);
+ }
+  NewJumpFeatureKey key(jump_magnitude, jtype, 0, 0, 0, 0, 0, 0);
+ using boost::get;
+ if (flen_) get<2>(key) = effective_src_len;
+ if (elen_) get<3>(key) = effective_trg_len;
+ if (f0_) get<4>(key) = GetSourceWord(id, cur_src_index);
+ if (fm1_) get<5>(key) = GetSourceWord(id, cur_src_index - 1);
+ if (fp1_) get<6>(key) = GetSourceWord(id, cur_src_index + 1);
+ if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index);
+
+ static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids;
+ int& fid = fids[key];
+ if (!fid) {
+ ostringstream os;
+ os << fid_str_ << ':' << jtype << jump_magnitude;
+ if (flen_) os << ':' << get<2>(key);
+ if (elen_) os << ':' << get<3>(key);
+ if (f0_) os << ':' << TD::Convert(get<4>(key));
+ if (fm1_) os << ':' << TD::Convert(get<5>(key));
+ if (fp1_) os << ':' << TD::Convert(get<6>(key));
+ if (fprev_) os << ':' << TD::Convert(get<7>(key));
+ fid = FD::Convert(os.str());
}
+ features->set_value(fid, 1.0);
}
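
For concreteness: with prev_src_index = 5 and cur_src_index = 2, raw_jump is -3,
so jtype is 'L' and jump_magnitude becomes 3; with use_binned_log_lengths that 3
is further collapsed into its log bin before it enters the key.
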
void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
@@ -387,6 +272,7 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
SparseVector<double>* /* estimated_features */,
void* state) const {
unsigned char& dpstate = *((unsigned char*)state);
+ // IMPORTANT: this only fires on non-Null transitions!
const int flen = smeta.GetSourceLength();
if (edge.Arity() == 0) {
dpstate = static_cast<unsigned int>(edge.i_);
@@ -427,6 +313,23 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
SourceBigram::SourceBigram(const std::string& param) :
FeatureFunction(sizeof(WordID) + sizeof(int)) {
+ fid_str_ = "SB:";
+ if (param.size() > 0) {
+ vector<string> argv;
+ int argc = SplitOnWhitespace(param, &argv);
+ if (argc != 2) {
+ cerr << "SourceBigram [FEATURE_NAME_PREFIX PATH]\n";
+ abort();
+ }
+ fid_str_ = argv[0] + ":";
+ lexmap_.reset(new FactoredLexiconHelper(argv[1], "*"));
+ } else {
+ lexmap_.reset(new FactoredLexiconHelper);
+ }
+}
+
+void SourceBigram::PrepareForInput(const SentenceMetadata& smeta) {
+ lexmap_->PrepareForInput(smeta);
}
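
Because the "SB:" prefix is now configurable and the source-side annotation file
is loaded through FactoredLexiconHelper, the SourcePOSBigram class deleted below
becomes a configuration of SourceBigram. A hedged example, again assuming the
cdec.ini feature_function= syntax (tags.txt is a placeholder for a file of
per-sentence source POS tags):

    feature_function=SourceBigram SP tags.txt

This yields the same SP:NN_VB style feature names that SourcePOSBigram produced.
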
void SourceBigram::FinalTraversalFeatures(const void* context,
@@ -445,7 +348,7 @@ void SourceBigram::FireFeature(WordID left,
// TODO important important !!! escape strings !!!
if (!fid) {
ostringstream os;
- os << "SB:";
+ os << fid_str_;
if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
os << '_';
if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
@@ -465,85 +368,7 @@ void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
int& out_word_count = *(static_cast<int*>(context) + 1);
const int arity = edge.Arity();
if (arity == 0) {
- out_context = edge.rule_->f()[0];
- out_word_count = edge.rule_->EWords();
- assert(out_word_count == 1); // this is only defined for lex translation!
- // revisit this if you want to translate into null words
- } else if (arity == 2) {
- WordID left = *static_cast<const WordID*>(ant_contexts[0]);
- WordID right = *static_cast<const WordID*>(ant_contexts[1]);
- int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
- int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
- if (left_wc == 1 && right_wc == 1)
- FireFeature(-1, left, features);
- FireFeature(left, right, features);
- out_word_count = left_wc + right_wc;
- out_context = right;
- }
-}
-// state: POS of src word used, number of trg words generated
-SourcePOSBigram::SourcePOSBigram(const std::string& param) :
- FeatureFunction(sizeof(WordID) + sizeof(int)) {
- cerr << "Reading source POS tags from " << param << endl;
- ReadFile rf(param);
- istream& in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (line.empty()) continue;
- vector<WordID> v;
- TD::ConvertSentence(line, &v);
- pos_.push_back(v);
- }
- cerr << " (" << pos_.size() << " lines)\n";
-}
-
-void SourcePOSBigram::FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const {
- WordID left = *static_cast<const WordID*>(context);
- int left_wc = *(static_cast<const int*>(context) + 1);
- if (left_wc == 1)
- FireFeature(-1, left, features);
- FireFeature(left, -1, features);
-}
-
-void SourcePOSBigram::FireFeature(WordID left,
- WordID right,
- SparseVector<double>* features) const {
- int& fid = fmap_[left][right];
- if (!fid) {
- ostringstream os;
- os << "SP:";
- if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
- os << '_';
- if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
- fid = FD::Convert(os.str());
- if (fid == 0) fid = -1;
- }
- if (fid < 0) return;
- features->set_value(fid, 1.0);
-}
-
-void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* /* estimated_features */,
- void* context) const {
- WordID& out_context = *static_cast<WordID*>(context);
- int& out_word_count = *(static_cast<int*>(context) + 1);
- const int arity = edge.Arity();
- if (arity == 0) {
- assert(smeta.GetSentenceID() < pos_.size());
- const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()];
- if (edge.i_ >= 0) { // non-NULL source
- assert(edge.i_ < pos_sent.size());
- out_context = pos_sent[edge.i_];
- } else { // NULL source
- // should assert that source is kNULL?
- static const WordID kNULL = TD::Convert("<eps>");
- out_context = kNULL;
- }
+ out_context = lexmap_->SourceWordAtPosition(edge.i_);
out_word_count = edge.rule_->EWords();
assert(out_word_count == 1); // this is only defined for lex translation!
// revisit this if you want to translate into null words
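
The FactoredLexiconHelper API appears in this diff only through its call sites:
two constructors, PrepareForInput, and SourceWordAtPosition. The interface those
calls imply can be sketched as below; this is an inference from the diff, and the
real class lives in factored_lexicon_helper.h and may differ:

    #include <string>

    class SentenceMetadata;     // cdec type, defined elsewhere
    typedef int WordID;         // cdec typedef, defined elsewhere

    class FactoredLexiconHelper {
     public:
      FactoredLexiconHelper();  // default: pass source words through unchanged
      FactoredLexiconHelper(const std::string& srcfile, const std::string& trgmap);
      void PrepareForInput(const SentenceMetadata& smeta);  // select the current sentence
      WordID SourceWordAtPosition(int i) const;  // mapped source word; presumably an
                                                 // epsilon/NULL token when i == -1
    };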