summaryrefslogtreecommitdiff
path: root/decoder/ff_wordalign.cc
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/ff_wordalign.cc')
-rw-r--r--decoder/ff_wordalign.cc256
1 files changed, 255 insertions, 1 deletions
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 5f42b438..980c64ad 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -1,10 +1,13 @@
#include "ff_wordalign.h"
+#include <algorithm>
+#include <iterator>
#include <set>
#include <sstream>
#include <string>
#include <cmath>
+#include "verbose.h"
#include "alignment_pharaoh.h"
#include "stringlib.h"
#include "sentence_metadata.h"
@@ -20,6 +23,8 @@ static const int kNULL_i = 255; // -1 as an unsigned char
using namespace std;
+// TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature
+
Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
@@ -195,6 +200,45 @@ void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
}
+LexNullJump::LexNullJump(const string& param) :
+ FeatureFunction(1),
+ fid_lex_null_(FD::Convert("JumpLexNull")),
+ fid_null_lex_(FD::Convert("JumpNullLex")),
+ fid_null_null_(FD::Convert("JumpNullNull")),
+ fid_lex_lex_(FD::Convert("JumpLexLex")) {}
+
+void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const vector<const void*>& ant_states,
+ SparseVector<double>* features,
+ SparseVector<double>* /* estimated_features */,
+ void* state) const {
+ char& dpstate = *((char*)state);
+ if (edge.Arity() == 0) {
+ // dpstate is 'N' = null or 'L' = lex
+ if (edge.i_ < 0) { dpstate = 'N'; } else { dpstate = 'L'; }
+ } else if (edge.Arity() == 1) {
+ dpstate = *((unsigned char*)ant_states[0]);
+ } else if (edge.Arity() == 2) {
+ char left = *((char*)ant_states[0]);
+ char right = *((char*)ant_states[1]);
+ dpstate = right;
+ if (left == 'N') {
+ if (right == 'N')
+ features->set_value(fid_null_null_, 1.0);
+ else
+ features->set_value(fid_null_lex_, 1.0);
+ } else { // left == 'L'
+ if (right == 'N')
+ features->set_value(fid_lex_null_, 1.0);
+ else
+ features->set_value(fid_lex_lex_, 1.0);
+ }
+ } else {
+ assert(!"something really unexpected is happening");
+ }
+}
+
MarkovJump::MarkovJump(const string& param) :
FeatureFunction(1),
fid_(FD::Convert("MarkovJump")),
@@ -287,6 +331,100 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
}
+NewJump::NewJump(const string& param) :
+ FeatureFunction(1) {
+ cerr << " NewJump";
+ vector<string> argv;
+ int argc = SplitOnWhitespace(param, &argv);
+ set<string> config;
+ for (int i = 0; i < argc; ++i) config.insert(argv[i]);
+ cerr << endl;
+ use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0;
+}
+
+// do a log transform on the length (of a sentence, a jump, etc)
+// this basically means that large distances that are close to each other
+// are put into the same bin
+int BinnedLogLength(int len) {
+ int res = static_cast<int>(log(len+1) / log(1.3));
+ if (res > 16) res = 16;
+ return res;
+}
+
+void NewJump::FireFeature(const SentenceMetadata& smeta,
+ const int prev_src_index,
+ const int cur_src_index,
+ SparseVector<double>* features) const {
+ const int src_len = smeta.GetSourceLength();
+ const int raw_jump = cur_src_index - prev_src_index;
+ char jtype = 0;
+ int jump_magnitude = raw_jump;
+ if (raw_jump > 0) { jtype = 'R'; } // Right
+ else if (raw_jump == 0) { jtype = 'S'; } // Stay
+ else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left
+ int effective_length = src_len;
+ if (use_binned_log_lengths_) {
+ jump_magnitude = BinnedLogLength(jump_magnitude);
+ effective_length = BinnedLogLength(src_len);
+ }
+
+ if (true) {
+ static map<int, map<int, int> > len2jump2fid;
+ int& fid = len2jump2fid[src_len][raw_jump];
+ if (!fid) {
+ ostringstream os;
+ os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude;
+ fid = FD::Convert(os.str());
+ }
+ features->set_value(fid, 1.0);
+ }
+}
+
+void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const vector<const void*>& ant_states,
+ SparseVector<double>* features,
+ SparseVector<double>* /* estimated_features */,
+ void* state) const {
+ unsigned char& dpstate = *((unsigned char*)state);
+ const int flen = smeta.GetSourceLength();
+ if (edge.Arity() == 0) {
+ dpstate = static_cast<unsigned int>(edge.i_);
+ if (edge.prev_i_ == 0) { // first target word in sentence
+ if (edge.i_ >= 0) { // generated from non-Null token?
+ FireFeature(smeta,
+ -1, // previous src = beginning of sentence index
+ edge.i_, // current src
+ features);
+ }
+ } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) { // last word
+ if (edge.i_ >= 0) { // generated from non-Null token?
+ FireFeature(smeta,
+ edge.i_, // previous src = last word position
+ flen, // current src
+ features);
+ }
+ }
+ } else if (edge.Arity() == 1) {
+ dpstate = *((unsigned char*)ant_states[0]);
+ } else if (edge.Arity() == 2) {
+ int left_index = *((unsigned char*)ant_states[0]);
+ int right_index = *((unsigned char*)ant_states[1]);
+ if (right_index == -1)
+ dpstate = static_cast<unsigned int>(left_index);
+ else
+ dpstate = static_cast<unsigned int>(right_index);
+ if (left_index != kNULL_i && right_index != kNULL_i) {
+ FireFeature(smeta,
+ left_index, // previous src index
+ right_index, // current src index
+ features);
+ }
+ } else {
+ assert(!"something really unexpected is happening");
+ }
+}
+
SourceBigram::SourceBigram(const std::string& param) :
FeatureFunction(sizeof(WordID) + sizeof(int)) {
}
@@ -626,6 +764,122 @@ void InputIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
}
+WordPairFeatures::WordPairFeatures(const string& param) {
+ vector<string> argv;
+ int argc = SplitOnWhitespace(param, &argv);
+ if (argc != 1) {
+ cerr << "WordPairFeature /path/to/feature_values.table\n";
+ abort();
+ }
+ set<WordID> all_srcs;
+ {
+ ReadFile rf(argv[0]);
+ istream& in = *rf.stream();
+ string buf;
+ while (in) {
+ getline(in, buf);
+ if (buf.empty()) continue;
+ int start = 0;
+ while(start < buf.size() && buf[start] == ' ') ++start;
+ int end = start;
+ while(end < buf.size() && buf[end] != ' ') ++end;
+ const WordID src = TD::Convert(buf.substr(start, end - start));
+ all_srcs.insert(src);
+ }
+ }
+ if (all_srcs.empty()) {
+ cerr << "WordPairFeature " << param << " loaded empty file!\n";
+ return;
+ }
+ fkeys_.reserve(all_srcs.size());
+ copy(all_srcs.begin(), all_srcs.end(), back_inserter(fkeys_));
+ values_.resize(all_srcs.size());
+ if (!SILENT) { cerr << "WordPairFeature: " << all_srcs.size() << " sources\n"; }
+ ReadFile rf(argv[0]);
+ istream& in = *rf.stream();
+ string buf;
+ double val = 0;
+ WordID cur_src = 0;
+ map<WordID, SparseVector<float> > *pv = NULL;
+ const WordID kBARRIER = TD::Convert("|||");
+ while (in) {
+ getline(in, buf);
+ if (buf.size() == 0) continue;
+ int start = 0;
+ while(start < buf.size() && buf[start] == ' ') ++start;
+ int end = start;
+ while(end < buf.size() && buf[end] != ' ') ++end;
+ const WordID src = TD::Convert(buf.substr(start, end - start));
+ if (cur_src != src) {
+ cur_src = src;
+ size_t ind = distance(fkeys_.begin(), lower_bound(fkeys_.begin(), fkeys_.end(), cur_src));
+ pv = &values_[ind];
+ }
+ end += 1;
+ start = end;
+ while(end < buf.size() && buf[end] != ' ') ++end;
+ WordID x = TD::Convert(buf.substr(start, end - start));
+ if (x != kBARRIER) {
+ cerr << "1 Format error: " << buf << endl;
+ abort();
+ }
+ start = end + 1;
+ end = start + 1;
+ while(end < buf.size() && buf[end] != ' ') ++end;
+ WordID trg = TD::Convert(buf.substr(start, end - start));
+ if (trg == kBARRIER) {
+ cerr << "2 Format error: " << buf << endl;
+ abort();
+ }
+ start = end + 1;
+ end = start + 1;
+ while(end < buf.size() && buf[end] != ' ') ++end;
+ WordID x2 = TD::Convert(buf.substr(start, end - start));
+ if (x2 != kBARRIER) {
+ cerr << "3 Format error: " << buf << endl;
+ abort();
+ }
+ start = end + 1;
+
+ SparseVector<float>& v = (*pv)[trg];
+ while(start < buf.size()) {
+ end = start + 1;
+ while(end < buf.size() && buf[end] != '=' && buf[end] != ' ') ++end;
+ if (end == buf.size() || buf[end] != '=') { cerr << "4 Format error: " << buf << endl; abort(); }
+ const int fid = FD::Convert(buf.substr(start, end - start));
+ start = end + 1;
+ while(start < buf.size() && buf[start] == ' ') ++start;
+ end = start + 1;
+ while(end < buf.size() && buf[end] != ' ') ++end;
+ assert(end > start);
+ if (end < buf.size()) buf[end] = 0;
+ val = strtod(&buf.c_str()[start], NULL);
+ v.set_value(fid, val);
+ start = end + 1;
+ }
+ }
+}
-
+void WordPairFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const {
+ if (edge.Arity() == 0) {
+ assert(edge.rule_->EWords() == 1);
+ assert(edge.rule_->FWords() == 1);
+ const WordID trg = edge.rule_->e()[0];
+ const WordID src = edge.rule_->f()[0];
+ size_t ind = distance(fkeys_.begin(), lower_bound(fkeys_.begin(), fkeys_.end(), src));
+ if (ind == fkeys_.size() || fkeys_[ind] != src) {
+ cerr << "WordPairFeatures no source entries for " << TD::Convert(src) << endl;
+ abort();
+ }
+ const map<WordID, SparseVector<float> >::const_iterator it = values_[ind].find(trg);
+ // TODO optional strict flag to make sure there are features for all pairs?
+ if (it != values_[ind].end())
+ (*features) += it->second;
+ }
+}