author     Chris Dyer <cdyer@cs.cmu.edu>  2010-12-09 17:04:29 -0500
committer  Chris Dyer <cdyer@cs.cmu.edu>  2010-12-09 17:04:29 -0500
commit     35142ef52f15d610ca08fa622b83594cf111ce4a (patch)
tree       c2196761993353bca47c7073e6cb5d996c4dad8f
parent     a80c69d266886d9911eb91833811d7f8393ac64d (diff)
major refactor of markov features for word alignment
-rw-r--r--  decoder/cdec_ff.cc                                   |   4
-rw-r--r--  decoder/ff_wordalign.cc                              | 431
-rw-r--r--  decoder/ff_wordalign.h                               | 100
-rw-r--r--  decoder/lextrans.cc                                  |   2
-rwxr-xr-x  word-aligner/aligner.pl                              |  14
-rw-r--r--  word-aligner/makefiles/makefile.grammars             |   8
-rwxr-xr-x  word-aligner/support/generate_word_pair_features.pl |   4
7 files changed, 163 insertions, 400 deletions
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index d6cf4572..e87ab5ab 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -50,13 +50,9 @@ void register_feature_functions() {
#endif
ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>);
ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
- ff_registry.Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);
ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>);
ff_registry.Register("NewJump", new FFFactory<NewJump>);
- ff_registry.Register("MarkovJump", new FFFactory<MarkovJump>);
- ff_registry.Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>);
ff_registry.Register("SourceBigram", new FFFactory<SourceBigram>);
- ff_registry.Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>);
ff_registry.Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>);
ff_registry.Register("AlignerResults", new FFFactory<AlignerResults>);
ff_registry.Register("CSplit_BasicFeatures", new FFFactory<BasicCSplitFeatures>);
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 980c64ad..338f1a72 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -6,7 +6,13 @@
#include <sstream>
#include <string>
#include <cmath>
+#include <tr1/unordered_map>
+#include <boost/tuple/tuple.hpp>
+#include "boost/tuple/tuple_comparison.hpp"
+#include <boost/functional/hash.hpp>
+
+#include "factored_lexicon_helper.h"
#include "verbose.h"
#include "alignment_pharaoh.h"
#include "stringlib.h"
@@ -25,43 +31,6 @@ using namespace std;
// TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature
-Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
- fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
- for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
- for (int j = 0; j < i; ++j) {
- for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) {
- int& val = fids_[i][j][k];
- val = -1;
- if (j < i) {
- ostringstream os;
- os << "M2FL:" << i << ":TI:" << k << "_SI:" << j;
- val = FD::Convert(os.str());
- }
- }
- }
- }
-}
-
-void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& /*ant_states*/,
- SparseVector<double>* features,
- SparseVector<double>* // estimated_features
- ,
- void* // state
- ) const {
- // if the source word is either null or the generated word
- // has no position in the reference
- if (edge.i_ == -1 || edge.prev_i_ == -1)
- return;
-
- assert(smeta.GetTargetLength() > 0);
- const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_];
- features->set_value(fid, 1.0);
-// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
-}
-
-
RelativeSentencePosition::RelativeSentencePosition(const string& param) :
fid_(FD::Convert("RelativeSentencePosition")) {
if (!param.empty()) {
@@ -119,87 +88,6 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme
// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
}
-MarkovJumpFClass::MarkovJumpFClass(const string& param) :
- FeatureFunction(1),
- fids_(MAX_SENTENCE_SIZE) {
- cerr << " MarkovJumpFClass" << endl;
- cerr << "Reading source POS tags from " << param << endl;
- ReadFile rf(param);
- istream& in = *rf.stream();
- set<WordID> classes;
- while(in) {
- string line;
- getline(in, line);
- if (line.empty()) continue;
- vector<WordID> v;
- TD::ConvertSentence(line, &v);
- pos_.push_back(v);
- for (int i = 0; i < v.size(); ++i)
- classes.insert(v[i]);
- }
- cerr << " (" << pos_.size() << " lines)\n";
- cerr << " Classes: " << classes.size() << endl;
- for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) {
- map<WordID, map<int, int> >& cfids = fids_[ss];
- for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) {
- map<int, int> &fids = cfids[*i];
- for (int j = -ss; j <= ss; ++j) {
- ostringstream os;
- os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j;
- fids[j] = FD::Convert(os.str());
- }
- }
- }
-}
-
-void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta,
- int prev_src_pos,
- int cur_src_pos,
- SparseVector<double>* features) const {
- if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i)
- return;
-
- const int jumpsize = cur_src_pos - prev_src_pos;
-
- assert(smeta.GetSentenceID() < pos_.size());
- const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos];
- const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
- features->set_value(fid, 1.0);
-}
-
-void MarkovJumpFClass::FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const {
- int left_index = *static_cast<const unsigned char*>(context);
-// int right_index = cur_flen;
- // TODO
-}
-
-void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_states,
- SparseVector<double>* features,
- SparseVector<double>* /* estimated_features */,
- void* state) const {
- unsigned char& dpstate = *((unsigned char*)state);
- if (edge.Arity() == 0) {
- dpstate = static_cast<unsigned int>(edge.i_);
- } else if (edge.Arity() == 1) {
- dpstate = *((unsigned char*)ant_states[0]);
- } else if (edge.Arity() == 2) {
- int left_index = *((unsigned char*)ant_states[0]);
- int right_index = *((unsigned char*)ant_states[1]);
- if (right_index == -1)
- dpstate = static_cast<unsigned int>(left_index);
- else
- dpstate = static_cast<unsigned int>(right_index);
-// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index];
-// cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl;
-// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
-// features->set_value(fid, 1.0);
- FireFeature(smeta, left_index, right_index, features);
- }
-}
-
LexNullJump::LexNullJump(const string& param) :
FeatureFunction(1),
fid_lex_null_(FD::Convert("JumpLexNull")),
@@ -239,107 +127,71 @@ void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
}
-MarkovJump::MarkovJump(const string& param) :
+NewJump::NewJump(const string& param) :
FeatureFunction(1),
- fid_(FD::Convert("MarkovJump")),
- fid_lex_null_(FD::Convert("JumpLexNull")),
- fid_null_lex_(FD::Convert("JumpNullLex")),
- fid_null_null_(FD::Convert("JumpNullNull")),
- fid_lex_lex_(FD::Convert("JumpLexLex")),
- binary_params_(false) {
- cerr << " MarkovJump";
+ kBOS_(TD::Convert("BOS")),
+ kEOS_(TD::Convert("EOS")) {
+ cerr << " NewJump";
vector<string> argv;
+ set<string> permitted;
+ permitted.insert("use_binned_log_lengths");
+ permitted.insert("flen");
+ permitted.insert("elen");
+ permitted.insert("fprev");
+ permitted.insert("f0");
+ permitted.insert("f-1");
+ permitted.insert("f+1");
+ // also permitted f:FILENAME
int argc = SplitOnWhitespace(param, &argv);
- if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) {
- cerr << "MarkovJump: expected parameters to be -b or +b\n";
- exit(1);
- }
- binary_params_ = argv[0] == "+b";
- if (binary_params_) {
- flen2jump2fid_.resize(MAX_SENTENCE_SIZE);
- for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
- map<int, int>& jump2fid = flen2jump2fid_[i];
- for (int jump = -i; jump <= i; ++jump) {
- ostringstream os;
- os << "Jump:FLen:" << i << "_J:" << jump;
- jump2fid[jump] = FD::Convert(os.str());
- }
- }
- } else {
- cerr << " (Blunsom & Cohn definition)";
- }
- cerr << endl;
-}
-
-// TODO handle NULLs according to Och 2000?
-void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& ant_states,
- SparseVector<double>* features,
- SparseVector<double>* /* estimated_features */,
- void* state) const {
- unsigned char& dpstate = *((unsigned char*)state);
- const int flen = smeta.GetSourceLength();
- if (edge.Arity() == 0) {
- dpstate = static_cast<unsigned int>(edge.i_);
- if (edge.prev_i_ == 0) { // first word in sentence
- if (edge.i_ >= 0 && binary_params_) {
- const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;
- features->set_value(fid, 1.0);
- } else if (edge.i_ < 0 && binary_params_) {
- // handled by bigram features
- }
- } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {
- if (edge.i_ >= 0 && binary_params_) {
- int jumpsize = flen - edge.i_;
- const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
- features->set_value(fid, 1.0);
- } else if (edge.i_ < 0 && binary_params_) {
- // handled by bigram features
- }
- }
- } else if (edge.Arity() == 1) {
- dpstate = *((unsigned char*)ant_states[0]);
- } else if (edge.Arity() == 2) {
- int left_index = *((unsigned char*)ant_states[0]);
- int right_index = *((unsigned char*)ant_states[1]);
- if (right_index == -1)
- dpstate = static_cast<unsigned int>(left_index);
- else
- dpstate = static_cast<unsigned int>(right_index);
- if (left_index == kNULL_i || right_index == kNULL_i) {
- if (left_index == kNULL_i && right_index == kNULL_i)
- features->set_value(fid_null_null_, 1.0);
- else if (left_index == kNULL_i)
- features->set_value(fid_null_lex_, 1.0);
- else
- features->set_value(fid_lex_null_, 1.0);
-
+ set<string> config;
+ string f_file;
+ for (int i = 0; i < argc; ++i) {
+ if (argv[i].size() > 2 && argv[i].find("f:") == 0) {
+ assert(f_file.empty()); // only one f file!
+ f_file = argv[i].substr(2);
+ cerr << " source_file=" << f_file;
} else {
- features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled
- const int jumpsize = right_index - left_index;
-
- if (binary_params_) {
- const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
- features->set_value(fid, 1.0);
+ if (permitted.count(argv[i])) {
+ assert(config.count(argv[i]) == 0);
+ config.insert(argv[i]);
+ cerr << " " << argv[i];
} else {
- features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def
+ cerr << "\nNewJump: don't understand param '" << argv[i] << "'\n";
+ abort();
}
}
- } else {
- assert(!"something really unexpected is happening");
}
-}
-
-NewJump::NewJump(const string& param) :
- FeatureFunction(1) {
- cerr << " NewJump";
- vector<string> argv;
- int argc = SplitOnWhitespace(param, &argv);
- set<string> config;
- for (int i = 0; i < argc; ++i) config.insert(argv[i]);
cerr << endl;
use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0;
+ f0_ = config.count("f0") > 0;
+ fm1_ = config.count("f-1") > 0;
+ fp1_ = config.count("f+1") > 0;
+ fprev_ = config.count("fprev") > 0;
+ elen_ = config.count("elen") > 0;
+ flen_ = config.count("flen") > 0;
+ if (f0_ || fm1_ || fp1_ || fprev_) {
+ if (f_file.empty()) {
+ cerr << "NewJump: conditioning on src but f:FILE not specified!\n";
+ abort();
+ }
+ ReadFile rf(f_file);
+ istream& in = *rf.stream();
+ string line;
+ while(in) {
+ getline(in, line);
+ if (!in) continue;
+ vector<WordID> v;
+ TD::ConvertSentence(line, &v);
+ src_.push_back(v);
+ }
+ }
+ fid_str_ = "J";
+ if (flen_) fid_str_ += "F";
+ if (elen_) fid_str_ += "E";
+ if (f0_) fid_str_ += "C";
+ if (fm1_) fid_str_ += "L";
+ if (fp1_) fid_str_ += "R";
+ if (fprev_) fid_str_ += "P";
}
// do a log transform on the length (of a sentence, a jump, etc)
@@ -351,33 +203,66 @@ int BinnedLogLength(int len) {
return res;
}
+// <0>=jump size <1>=jump_dir <2>=flen, <3>=elen, <4>=f0, <5>=f-1, <6>=f+1, <7>=fprev
+typedef boost::tuple<short, char, short, short, WordID, WordID, WordID, WordID> NewJumpFeatureKey;
+
+struct KeyHash : unary_function<NewJumpFeatureKey, size_t> {
+ size_t operator()(const NewJumpFeatureKey& k) const {
+ size_t h = 0x37473DEF321;
+ boost::hash_combine(h, k.get<0>());
+ boost::hash_combine(h, k.get<1>());
+ boost::hash_combine(h, k.get<2>());
+ boost::hash_combine(h, k.get<3>());
+ boost::hash_combine(h, k.get<4>());
+ boost::hash_combine(h, k.get<5>());
+ boost::hash_combine(h, k.get<6>());
+ boost::hash_combine(h, k.get<7>());
+ return h;
+ }
+};
+
void NewJump::FireFeature(const SentenceMetadata& smeta,
const int prev_src_index,
const int cur_src_index,
SparseVector<double>* features) const {
+ const int id = smeta.GetSentenceID();
const int src_len = smeta.GetSourceLength();
const int raw_jump = cur_src_index - prev_src_index;
+ short jump_magnitude = raw_jump;
char jtype = 0;
- int jump_magnitude = raw_jump;
if (raw_jump > 0) { jtype = 'R'; } // Right
else if (raw_jump == 0) { jtype = 'S'; } // Stay
else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left
- int effective_length = src_len;
+ int effective_src_len = src_len;
+ int effective_trg_len = smeta.GetTargetLength();
if (use_binned_log_lengths_) {
jump_magnitude = BinnedLogLength(jump_magnitude);
- effective_length = BinnedLogLength(src_len);
- }
-
- if (true) {
- static map<int, map<int, int> > len2jump2fid;
- int& fid = len2jump2fid[src_len][raw_jump];
- if (!fid) {
- ostringstream os;
- os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude;
- fid = FD::Convert(os.str());
- }
- features->set_value(fid, 1.0);
+ effective_src_len = BinnedLogLength(src_len);
+ effective_trg_len = BinnedLogLength(effective_trg_len);
+ }
+ NewJumpFeatureKey key(jump_magnitude,jtype,0,0,0,0,0);
+ using boost::get;
+ if (flen_) get<2>(key) = effective_src_len;
+ if (elen_) get<3>(key) = effective_trg_len;
+ if (f0_) get<4>(key) = GetSourceWord(id, cur_src_index);
+ if (fm1_) get<5>(key) = GetSourceWord(id, cur_src_index - 1);
+ if (fp1_) get<6>(key) = GetSourceWord(id, cur_src_index + 1);
+ if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index);
+
+ static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids;
+ int& fid = fids[key];
+ if (!fid) {
+ ostringstream os;
+ os << fid_str_ << ':' << jtype << jump_magnitude;
+ if (flen_) os << ':' << get<2>(key);
+ if (elen_) os << ':' << get<3>(key);
+ if (f0_) os << ':' << TD::Convert(get<4>(key));
+ if (fm1_) os << ':' << TD::Convert(get<5>(key));
+ if (fp1_) os << ':' << TD::Convert(get<6>(key));
+ if (fprev_) os << ':' << TD::Convert(get<7>(key));
+ fid = FD::Convert(os.str());
}
+ features->set_value(fid, 1.0);
}
void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
@@ -387,6 +272,7 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
SparseVector<double>* /* estimated_features */,
void* state) const {
unsigned char& dpstate = *((unsigned char*)state);
+ // IMPORTANT: this only fires on non-Null transitions!
const int flen = smeta.GetSourceLength();
if (edge.Arity() == 0) {
dpstate = static_cast<unsigned int>(edge.i_);
@@ -427,6 +313,23 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
SourceBigram::SourceBigram(const std::string& param) :
FeatureFunction(sizeof(WordID) + sizeof(int)) {
+ fid_str_ = "SB:";
+ if (param.size() > 0) {
+ vector<string> argv;
+ int argc = SplitOnWhitespace(param, &argv);
+ if (argc != 2) {
+ cerr << "SourceBigram [FEATURE_NAME_PREFIX PATH]\n";
+ abort();
+ }
+ fid_str_ = argv[0] + ":";
+ lexmap_.reset(new FactoredLexiconHelper(argv[1], "*"));
+ } else {
+ lexmap_.reset(new FactoredLexiconHelper);
+ }
+}
+
+void SourceBigram::PrepareForInput(const SentenceMetadata& smeta) {
+ lexmap_->PrepareForInput(smeta);
}
void SourceBigram::FinalTraversalFeatures(const void* context,
@@ -445,7 +348,7 @@ void SourceBigram::FireFeature(WordID left,
// TODO important important !!! escape strings !!!
if (!fid) {
ostringstream os;
- os << "SB:";
+ os << fid_str_;
if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
os << '_';
if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
@@ -465,85 +368,7 @@ void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
int& out_word_count = *(static_cast<int*>(context) + 1);
const int arity = edge.Arity();
if (arity == 0) {
- out_context = edge.rule_->f()[0];
- out_word_count = edge.rule_->EWords();
- assert(out_word_count == 1); // this is only defined for lex translation!
- // revisit this if you want to translate into null words
- } else if (arity == 2) {
- WordID left = *static_cast<const WordID*>(ant_contexts[0]);
- WordID right = *static_cast<const WordID*>(ant_contexts[1]);
- int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
- int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
- if (left_wc == 1 && right_wc == 1)
- FireFeature(-1, left, features);
- FireFeature(left, right, features);
- out_word_count = left_wc + right_wc;
- out_context = right;
- }
-}
-// state: POS of src word used, number of trg words generated
-SourcePOSBigram::SourcePOSBigram(const std::string& param) :
- FeatureFunction(sizeof(WordID) + sizeof(int)) {
- cerr << "Reading source POS tags from " << param << endl;
- ReadFile rf(param);
- istream& in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (line.empty()) continue;
- vector<WordID> v;
- TD::ConvertSentence(line, &v);
- pos_.push_back(v);
- }
- cerr << " (" << pos_.size() << " lines)\n";
-}
-
-void SourcePOSBigram::FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const {
- WordID left = *static_cast<const WordID*>(context);
- int left_wc = *(static_cast<const int*>(context) + 1);
- if (left_wc == 1)
- FireFeature(-1, left, features);
- FireFeature(left, -1, features);
-}
-
-void SourcePOSBigram::FireFeature(WordID left,
- WordID right,
- SparseVector<double>* features) const {
- int& fid = fmap_[left][right];
- if (!fid) {
- ostringstream os;
- os << "SP:";
- if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
- os << '_';
- if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
- fid = FD::Convert(os.str());
- if (fid == 0) fid = -1;
- }
- if (fid < 0) return;
- features->set_value(fid, 1.0);
-}
-
-void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* /* estimated_features */,
- void* context) const {
- WordID& out_context = *static_cast<WordID*>(context);
- int& out_word_count = *(static_cast<int*>(context) + 1);
- const int arity = edge.Arity();
- if (arity == 0) {
- assert(smeta.GetSentenceID() < pos_.size());
- const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()];
- if (edge.i_ >= 0) { // non-NULL source
- assert(edge.i_ < pos_sent.size());
- out_context = pos_sent[edge.i_];
- } else { // NULL source
- // should assert that source is kNULL?
- static const WordID kNULL = TD::Convert("<eps>");
- out_context = kNULL;
- }
+ out_context = lexmap_->SourceWordAtPosition(edge.i_);
out_word_count = edge.rule_->EWords();
assert(out_word_count == 1); // this is only defined for lex translation!
// revisit this if you want to translate into null words
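
The new NewJump::FireFeature above avoids re-building feature-name strings on every edge: it packs the fired configuration into a boost::tuple key, hashes it with boost::hash_combine, and calls FD::Convert only on the first occurrence of each key. A self-contained sketch of that memoization pattern, with FD::Convert replaced by a local interner (the key layout and feature-string format here are simplified stand-ins):

// Illustrative sketch of the feature-ID memoization in NewJump::FireFeature.
#include <boost/functional/hash.hpp>
#include <boost/tuple/tuple.hpp>
#include <boost/tuple/tuple_comparison.hpp>  // provides operator== for tuple keys
#include <iostream>
#include <sstream>
#include <string>
#include <tr1/unordered_map>

typedef boost::tuple<short, char, short> Key;  // <0>=jump size <1>=direction <2>=src len

struct KeyHash {
  size_t operator()(const Key& k) const {
    size_t h = 0;
    boost::hash_combine(h, k.get<0>());
    boost::hash_combine(h, k.get<1>());
    boost::hash_combine(h, k.get<2>());
    return h;
  }
};

int InternName(const std::string& name) {  // stand-in for FD::Convert
  static std::tr1::unordered_map<std::string, int> ids;
  static int next_id = 0;
  int& id = ids[name];
  if (!id) id = ++next_id;
  return id;
}

int JumpFeatureID(short jump_magnitude, char jtype, short effective_src_len) {
  static std::tr1::unordered_map<Key, int, KeyHash> fids;  // per-process cache, as in the diff
  int& fid = fids[Key(jump_magnitude, jtype, effective_src_len)];
  if (!fid) {  // first occurrence: pay for string construction once
    std::ostringstream os;
    os << "JF:" << jtype << jump_magnitude << ':' << effective_src_len;
    fid = InternName(os.str());
  }
  return fid;
}

int main() {
  std::cout << JumpFeatureID(3, 'R', 20) << '\n';  // interns "JF:R3:20"
  std::cout << JumpFeatureID(3, 'R', 20) << '\n';  // cache hit, same ID
  std::cout << JumpFeatureID(2, 'L', 20) << '\n';  // new key, new ID
  return 0;
}
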
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
index 418c8768..a1ffd9ca 100644
--- a/decoder/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -3,7 +3,9 @@
#include "ff.h"
#include "array2d.h"
+#include "factored_lexicon_helper.h"
+#include <boost/scoped_ptr.hpp>
#include <boost/multi_array.hpp>
class RelativeSentencePosition : public FeatureFunction {
@@ -23,64 +25,6 @@ class RelativeSentencePosition : public FeatureFunction {
std::map<WordID, int> fids_; // fclass -> fid
};
-class Model2BinaryFeatures : public FeatureFunction {
- public:
- Model2BinaryFeatures(const std::string& param);
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* out_context) const;
- private:
- boost::multi_array<int, 3> fids_;
-};
-
-class MarkovJump : public FeatureFunction {
- public:
- MarkovJump(const std::string& param);
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* out_context) const;
- private:
- const int fid_;
- const int fid_lex_null_;
- const int fid_null_lex_;
- const int fid_null_null_;
- const int fid_lex_lex_;
-
- bool binary_params_;
- std::vector<std::map<int, int> > flen2jump2fid_;
-};
-
-class MarkovJumpFClass : public FeatureFunction {
- public:
- MarkovJumpFClass(const std::string& param);
- virtual void FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const;
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const;
-
- void FireFeature(const SentenceMetadata& smeta,
- int prev_src_pos,
- int cur_src_pos,
- SparseVector<double>* features) const;
-
- private:
- std::vector<std::map<WordID, std::map<int, int> > > fids_; // flen -> fclass -> jumpsize -> fid
- std::vector<std::vector<WordID> > pos_;
-};
-
typedef std::map<WordID, int> Class2FID;
typedef std::map<WordID, Class2FID> Class2Class2FID;
typedef std::map<WordID, Class2Class2FID> Class2Class2Class2FID;
@@ -89,6 +33,7 @@ class SourceBigram : public FeatureFunction {
SourceBigram(const std::string& param);
virtual void FinalTraversalFeatures(const void* context,
SparseVector<double>* features) const;
+ void PrepareForInput(const SentenceMetadata& smeta);
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
@@ -100,7 +45,9 @@ class SourceBigram : public FeatureFunction {
void FireFeature(WordID src,
WordID trg,
SparseVector<double>* features) const;
+ std::string fid_str_;
mutable Class2Class2FID fmap_;
+ boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source
};
class LexNullJump : public FeatureFunction {
@@ -136,30 +83,27 @@ class NewJump : public FeatureFunction {
const int cur_src_index,
SparseVector<double>* features) const;
+ WordID GetSourceWord(int sentence_id, int index) const {
+ if (index < 0) return kBOS_;
+ assert(src_.size() > sentence_id);
+ const std::vector<WordID>& v = src_[sentence_id];
+ if (index >= v.size()) return kEOS_;
+ return v[index];
+ }
+
+ const WordID kBOS_;
+ const WordID kEOS_;
bool use_binned_log_lengths_;
+ bool flen_;
+ bool elen_;
+ bool f0_;
+ bool fm1_;
+ bool fp1_;
+ bool fprev_;
+ std::vector<std::vector<WordID> > src_;
std::string fid_str_; // identifies configuration uniquely
};
-class SourcePOSBigram : public FeatureFunction {
- public:
- SourcePOSBigram(const std::string& param);
- virtual void FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const;
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const;
- private:
- void FireFeature(WordID src,
- WordID trg,
- SparseVector<double>* features) const;
- mutable Class2Class2FID fmap_;
- std::vector<std::vector<WordID> > pos_;
-};
-
class LexicalTranslationTrigger : public FeatureFunction {
public:
LexicalTranslationTrigger(const std::string& param);
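
The GetSourceWord helper added to NewJump above gives the f-1/f+1/fprev conditioning a well-defined token even at sentence edges. A minimal standalone version of that boundary logic, with WordID and the BOS/EOS values as simplified stand-ins for TD::Convert("BOS")/TD::Convert("EOS"):

// Sketch of NewJump::GetSourceWord's boundary handling from ff_wordalign.h.
#include <cassert>
#include <iostream>
#include <vector>

typedef int WordID;
static const WordID kBOS = -1;  // stand-in for TD::Convert("BOS")
static const WordID kEOS = -2;  // stand-in for TD::Convert("EOS")

WordID GetSourceWord(const std::vector<std::vector<WordID> >& src,
                     int sentence_id, int index) {
  if (index < 0) return kBOS;  // e.g. f-1 at the first source position
  assert(sentence_id < static_cast<int>(src.size()));
  const std::vector<WordID>& v = src[sentence_id];
  if (index >= static_cast<int>(v.size())) return kEOS;  // e.g. f+1 at the end
  return v[index];
}

int main() {
  std::vector<std::vector<WordID> > src(1, std::vector<WordID>(3, 7));
  std::cout << GetSourceWord(src, 0, -1) << ' '    // -1 (BOS)
            << GetSourceWord(src, 0, 1)  << ' '    // 7
            << GetSourceWord(src, 0, 3)  << '\n';  // -2 (EOS)
  return 0;
}
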
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index 149cd68d..f237295c 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -81,7 +81,7 @@ struct LexicalTransImpl {
for (int i = 0; i < ref.size(); ++i) {
target_vocab.insert(ref[i][0].label);
}
- bool all_sources_to_all_targets_ = false;
+ bool all_sources_to_all_targets_ = false; // TODO configure this
set<WordID> trgs_used;
for (int i = 0; i < e_len; ++i) { // for each word in the *target*
Hypergraph::Node* node = forest->AddNode(kXCAT);
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 81ac4198..f5ee5d3f 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -120,17 +120,19 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz
feature_function=LexicalPairIdentity
-feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
+# stem translation
feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map
+# POS translation
+feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
feature_function=InputIdentity
feature_function=OutputIdentity
feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first
-# the following two are deprecated
-feature_function=MarkovJump +b
-feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first
+feature_function=NewJump
+feature_function=NewJump use_binned_log_lengths flen
+# jump distance and src and destination class type
+feature_function=NewJump use_binned_log_lengths f0 fprev f:$align_dir/grammars/corpus.class.$first
feature_function=SourceBigram
-# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does
-feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first
+feature_function=SourceBigram SC $align_dir/grammars/corpus.class.$first
EOT
close CDEC;
open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!";
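
Each NewJump line in the agenda above instantiates the feature with a different flag set, and the constructor in ff_wordalign.cc folds those flags into a distinguishing feature-name prefix (fid_str_), so the instances populate disjoint feature namespaces. An illustrative sketch of that prefix logic, mirroring the constructor:

// Sketch of how NewJump builds fid_str_ from its configuration flags.
#include <iostream>
#include <string>

std::string NewJumpPrefix(bool flen, bool elen, bool f0,
                          bool fm1, bool fp1, bool fprev) {
  std::string s = "J";
  if (flen)  s += "F";  // condition on source length
  if (elen)  s += "E";  // condition on target length
  if (f0)    s += "C";  // current source word/class
  if (fm1)   s += "L";  // word left of current
  if (fp1)   s += "R";  // word right of current
  if (fprev) s += "P";  // previously aligned source word
  return s;
}

int main() {
  // The three instances configured in aligner.pl above:
  std::cout << NewJumpPrefix(false, false, false, false, false, false) << '\n';  // J
  std::cout << NewJumpPrefix(true,  false, false, false, false, false) << '\n';  // JF
  std::cout << NewJumpPrefix(false, false, true,  false, false, true)  << '\n';  // JCP
  return 0;
}
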
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index be0644df..1a069abf 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,14 +1,13 @@
-all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
+all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map
clean:
- $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs*
+ $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs*
SUPPORT_DIR = $(SCRIPT_DIR)/support
GZIP = /usr/bin/gzip
ZCAT = zcat
EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
-GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl
GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl
ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
@@ -84,6 +83,3 @@ corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1
wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1
$(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@
-corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e
- $(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@
-
diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl
index b28f6feb..54b89ce1 100755
--- a/word-aligner/support/generate_word_pair_features.pl
+++ b/word-aligner/support/generate_word_pair_features.pl
@@ -92,7 +92,7 @@ my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NULL = 1;
my $ADD_MODEL1 = 1;
-my $ADD_NOMODEL1 = 1;
+my $ADD_NOMODEL1 = 0;
my $BEAM_RATIO = 50;
my $BIN_ORTHO = 1;
my $BIN_DLEN = 1;
@@ -171,7 +171,7 @@ for my $f (sort keys %fdict) {
}
if ($im1 > $MIN_MAGNITUDE) {
push @feats, "InvModel1=$im1" if $im1;
- } else {
+ } elsif ($ADD_NOMODEL1) {
push @feats, 'NoInvModel1=1';
}
my $am1 = sprintf("%.5g", sqrt($m1 * $im1));