major refactor of markov features for word alignment

author: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-09 17:04:29 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-09 17:04:29 -0500
commit: 9a8cbe4db88e63378b6d3c4ec96438819f1f1131 (patch)
tree: abf1a23739a033eaabd62f61e39ac249d9cf7717
parent: 61bfaf15c02a0555d8ffa5dd4e6ae32f09354610 (diff)
7 files changed, 163 insertions, 400 deletions
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index d6cf4572..e87ab5ab 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -50,13 +50,9 @@ void register_feature_functions() {
 #endif
   ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>);
   ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
-  ff_registry.Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);
   ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>);
   ff_registry.Register("NewJump", new FFFactory<NewJump>);
-  ff_registry.Register("MarkovJump", new FFFactory<MarkovJump>);
-  ff_registry.Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>);
   ff_registry.Register("SourceBigram", new FFFactory<SourceBigram>);
-  ff_registry.Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>);
   ff_registry.Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>);
   ff_registry.Register("AlignerResults", new FFFactory<AlignerResults>);
   ff_registry.Register("CSplit_BasicFeatures", new FFFactory<BasicCSplitFeatures>);
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 980c64ad..338f1a72 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -6,7 +6,13 @@
 #include <sstream>
 #include <string>
 #include <cmath>
+#include <tr1/unordered_map>
 
+#include <boost/tuple/tuple.hpp>
+#include "boost/tuple/tuple_comparison.hpp"
+#include <boost/functional/hash.hpp>
+
+#include "factored_lexicon_helper.h"
 #include "verbose.h"
 #include "alignment_pharaoh.h"
 #include "stringlib.h"
@@ -25,43 +31,6 @@ using namespace std;
 
 // TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature
 
-Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
-    fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
-  for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
-    for (int j = 0; j < i; ++j) {
-      for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) {
-        int& val = fids_[i][j][k];
-        val = -1;
-        if (j < i) {
-          ostringstream os;
-          os << "M2FL:" << i << ":TI:" << k << "_SI:" << j;
-          val = FD::Convert(os.str());
-        }
-      }
-    }
-  }
-}
-
-void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                                 const Hypergraph::Edge& edge,
-                                                 const vector<const void*>& /*ant_states*/,
-                                                 SparseVector<double>* features,
-                                                 SparseVector<double>* // estimated_features
-                                                 ,
-                                                 void* // state
-  ) const {
-  // if the source word is either null or the generated word
-  // has no position in the reference
-  if (edge.i_ == -1 || edge.prev_i_ == -1)
-    return;
-
-  assert(smeta.GetTargetLength() > 0);
-  const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_];
-  features->set_value(fid, 1.0);
-//  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
-}
-
-
 RelativeSentencePosition::RelativeSentencePosition(const string& param) :
     fid_(FD::Convert("RelativeSentencePosition")) {
   if (!param.empty()) {
@@ -119,87 +88,6 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme
 //  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
 }
 
-MarkovJumpFClass::MarkovJumpFClass(const string& param) :
-    FeatureFunction(1),
-    fids_(MAX_SENTENCE_SIZE) {
-  cerr << "    MarkovJumpFClass" << endl;
-  cerr << "Reading source POS tags from " << param << endl;
-  ReadFile rf(param);
-  istream& in = *rf.stream();
-  set<WordID> classes;
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    vector<WordID> v;
-    TD::ConvertSentence(line, &v);
-    pos_.push_back(v);
-    for (int i = 0; i < v.size(); ++i)
-      classes.insert(v[i]);
-  }
-  cerr << "  (" << pos_.size() << " lines)\n";
-  cerr << "  Classes: " << classes.size() << endl;
-  for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) {
-    map<WordID, map<int, int> >& cfids = fids_[ss];
-    for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) {
-      map<int, int> &fids = cfids[*i];
-      for (int j = -ss; j <= ss; ++j) {
-        ostringstream os;
-        os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j;
-        fids[j] = FD::Convert(os.str());
-      }
-    }
-  }
-}
-
-void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta,
-                                   int prev_src_pos,
-                                   int cur_src_pos,
-                                   SparseVector<double>* features) const {
-  if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i)
-    return;
-
-  const int jumpsize = cur_src_pos - prev_src_pos;
-
-  assert(smeta.GetSentenceID() < pos_.size());
-  const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos];
-  const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
-  features->set_value(fid, 1.0);
-}
-
-void MarkovJumpFClass::FinalTraversalFeatures(const void* context,
-                                      SparseVector<double>* features) const {
-  int left_index = *static_cast<const unsigned char*>(context);
-//  int right_index = cur_flen;
-  // TODO
-}
-
-void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_states,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* /* estimated_features */,
-                                     void* state) const {
-  unsigned char& dpstate = *((unsigned char*)state);
-  if (edge.Arity() == 0) {
-    dpstate = static_cast<unsigned int>(edge.i_);
-  } else if (edge.Arity() == 1) {
-    dpstate = *((unsigned char*)ant_states[0]);
-  } else if (edge.Arity() == 2) {
-    int left_index = *((unsigned char*)ant_states[0]);
-    int right_index = *((unsigned char*)ant_states[1]);
-    if (right_index == -1)
-      dpstate = static_cast<unsigned int>(left_index);
-    else
-      dpstate = static_cast<unsigned int>(right_index);
-//    const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index];
-//    cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl;
-//    const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
-//    features->set_value(fid, 1.0);
-    FireFeature(smeta, left_index, right_index, features);
-  }
-}
-
 LexNullJump::LexNullJump(const string& param) :
     FeatureFunction(1),
     fid_lex_null_(FD::Convert("JumpLexNull")),
@@ -239,107 +127,71 @@ void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
-MarkovJump::MarkovJump(const string& param) :
+NewJump::NewJump(const string& param) :
     FeatureFunction(1),
-    fid_(FD::Convert("MarkovJump")),
-    fid_lex_null_(FD::Convert("JumpLexNull")),
-    fid_null_lex_(FD::Convert("JumpNullLex")),
-    fid_null_null_(FD::Convert("JumpNullNull")),
-    fid_lex_lex_(FD::Convert("JumpLexLex")),
-    binary_params_(false) {
-  cerr << "    MarkovJump";
+    kBOS_(TD::Convert("BOS")),  // sentinel returned by GetSourceWord for a negative index
+    kEOS_(TD::Convert("EOS")) {  // sentinel returned by GetSourceWord for an index past the sentence end
+  cerr << "    NewJump";
   vector<string> argv;
+  set<string> permitted;  // the option tokens this constructor accepts verbatim
+  permitted.insert("use_binned_log_lengths");
+  permitted.insert("flen");
+  permitted.insert("elen");
+  permitted.insert("fprev");
+  permitted.insert("f0");
+  permitted.insert("f-1");
+  permitted.insert("f+1");
+  // a token of the form f:FILENAME is also accepted; it is handled separately in the loop below
   int argc = SplitOnWhitespace(param, &argv);
-  if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) {
-    cerr << "MarkovJump: expected parameters to be -b or +b\n";
-    exit(1);
-  }
-  binary_params_ = argv[0] == "+b";
-  if (binary_params_) {
-    flen2jump2fid_.resize(MAX_SENTENCE_SIZE);
-    for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
-      map<int, int>& jump2fid = flen2jump2fid_[i];
-      for (int jump = -i; jump <= i; ++jump) {
-        ostringstream os;
-        os << "Jump:FLen:" << i << "_J:" << jump;
-        jump2fid[jump] = FD::Convert(os.str());
-      }
-    }
-  } else {
-    cerr << " (Blunsom & Cohn definition)";
-  }
-  cerr << endl;
-}
-
-// TODO handle NULLs according to Och 2000?
-void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                       const Hypergraph::Edge& edge,
-                                       const vector<const void*>& ant_states,
-                                       SparseVector<double>* features,
-                                       SparseVector<double>* /* estimated_features */,
-                                       void* state) const {
-  unsigned char& dpstate = *((unsigned char*)state);
-  const int flen = smeta.GetSourceLength();
-  if (edge.Arity() == 0) {
-    dpstate = static_cast<unsigned int>(edge.i_);
-    if (edge.prev_i_ == 0) {     // first word in sentence
-      if (edge.i_ >= 0 && binary_params_) {
-        const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;
-        features->set_value(fid, 1.0);
-      } else if (edge.i_ < 0 && binary_params_) {
-        // handled by bigram features
-      }
-    } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {
-      if (edge.i_ >= 0 && binary_params_) {
-        int jumpsize = flen - edge.i_;
-        const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
-        features->set_value(fid, 1.0);
-      } else if (edge.i_ < 0 && binary_params_) {
-        // handled by bigram features
-      }
-    }
-  } else if (edge.Arity() == 1) {
-    dpstate = *((unsigned char*)ant_states[0]);
-  } else if (edge.Arity() == 2) {
-    int left_index = *((unsigned char*)ant_states[0]);
-    int right_index = *((unsigned char*)ant_states[1]);
-    if (right_index == -1)
-      dpstate = static_cast<unsigned int>(left_index);
-    else
-      dpstate = static_cast<unsigned int>(right_index);
-    if (left_index == kNULL_i || right_index == kNULL_i) {
-      if (left_index == kNULL_i && right_index == kNULL_i)
-        features->set_value(fid_null_null_, 1.0);
-      else if (left_index == kNULL_i)
-        features->set_value(fid_null_lex_, 1.0);
-      else
-        features->set_value(fid_lex_null_, 1.0);
-
+  set<string> config;  // the option tokens actually present in param
+  string f_file;  // path taken from the f:FILENAME token, empty if none was given
+  for (int i = 0; i < argc; ++i) {
+    if (argv[i].size() > 2 && argv[i].find("f:") == 0) {  // token starts with "f:" and has a non-empty remainder
+      assert(f_file.empty());  // only one f file!
+      f_file = argv[i].substr(2);
+      cerr << " source_file=" << f_file;
     } else {
-      features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled
-      const int jumpsize = right_index - left_index;
-
-      if (binary_params_) {
-        const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
-        features->set_value(fid, 1.0);
+      if (permitted.count(argv[i])) {
+        assert(config.count(argv[i]) == 0);  // each option may be given at most once
+        config.insert(argv[i]);
+        cerr << " " << argv[i];
       } else {
-        features->set_value(fid_, fabs(jumpsize - 1));  // Blunsom and Cohn def
+        cerr << "\nNewJump: don't understand param '" << argv[i] << "'\n";
+        abort();  // unknown option tokens are fatal
       }
     }
-  } else {
-    assert(!"something really unexpected is happening");
   }
-}
-
-NewJump::NewJump(const string& param) :
-    FeatureFunction(1) {
-  cerr << "    NewJump";
-  vector<string> argv;
-  int argc = SplitOnWhitespace(param, &argv);
-  set<string> config;
-  for (int i = 0; i < argc; ++i) config.insert(argv[i]);
   cerr << endl;
   use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0;
+  f0_ = config.count("f0") > 0;
+  fm1_ = config.count("f-1") > 0;
+  fp1_ = config.count("f+1") > 0;
+  fprev_ = config.count("fprev") > 0;
+  elen_ = config.count("elen") > 0;
+  flen_ = config.count("flen") > 0;
+  if (f0_ || fm1_ || fp1_ || fprev_) {  // these options look up source words, so a source file must be loaded
+    if (f_file.empty()) {
+      cerr << "NewJump: conditioning on src but f:FILE not specified!\n";
+      abort();
+    }
+    ReadFile rf(f_file);
+    istream& in = *rf.stream();
+    string line;
+    while(in) {  // one src_ entry per line read; empty lines are kept as empty sentences
+      getline(in, line);
+      if (!in) continue;  // getline failed: skip this pass and let the loop condition end the read
+      vector<WordID> v;
+      TD::ConvertSentence(line, &v);
+      src_.push_back(v);
+    }
+  }
+  fid_str_ = "J";  // feature-name prefix: "J" plus one letter per enabled option; NOTE(review): use_binned_log_lengths adds no letter
+  if (flen_) fid_str_ += "F";
+  if (elen_) fid_str_ += "E";
+  if (f0_) fid_str_ += "C";
+  if (fm1_) fid_str_ += "L";
+  if (fp1_) fid_str_ += "R";
+  if (fprev_) fid_str_ += "P";
 }
 
 // do a log transform on the length (of a sentence, a jump, etc)
@@ -351,33 +203,66 @@ int BinnedLogLength(int len) {
   return res;
 }
 
+// Cache key for NewJump feature ids: <0>=jump magnitude, <1>=jump direction ('R', 'S' or 'L'), <2>=flen, <3>=elen, <4>=f0, <5>=f-1, <6>=f+1, <7>=fprev
+typedef boost::tuple<short, char, short, short, WordID, WordID, WordID, WordID> NewJumpFeatureKey;
+
+struct KeyHash : unary_function<NewJumpFeatureKey, size_t> {  // hash functor that lets the tuple key a tr1::unordered_map
+  size_t operator()(const NewJumpFeatureKey& k) const {
+    size_t h = 0x37473DEF321;  // fixed seed; NOTE(review): constant is wider than 32 bits, so it is truncated where size_t is 32-bit
+    boost::hash_combine(h, k.get<0>());  // fold each of the eight tuple fields into the running hash
+    boost::hash_combine(h, k.get<1>());
+    boost::hash_combine(h, k.get<2>());
+    boost::hash_combine(h, k.get<3>());
+    boost::hash_combine(h, k.get<4>());
+    boost::hash_combine(h, k.get<5>());
+    boost::hash_combine(h, k.get<6>());
+    boost::hash_combine(h, k.get<7>());
+    return h;
+  }
+};
+
 void NewJump::FireFeature(const SentenceMetadata& smeta,
                           const int prev_src_index,
                           const int cur_src_index,
                           SparseVector<double>* features) const {
+  const int id = smeta.GetSentenceID();  // passed to GetSourceWord as the sentence id for source-word lookups
   const int src_len = smeta.GetSourceLength();
   const int raw_jump = cur_src_index - prev_src_index;
+  short jump_magnitude = raw_jump;  // made non-negative below for leftward jumps; stored as short to match key field <0>
   char jtype = 0;
-  int jump_magnitude = raw_jump;
   if (raw_jump > 0) { jtype = 'R'; } // Right
   else if (raw_jump == 0) { jtype = 'S'; } // Stay
   else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left
-  int effective_length = src_len;
+  int effective_src_len = src_len;  // replaced by its binned value when use_binned_log_lengths_ is set
+  int effective_trg_len = smeta.GetTargetLength();  // likewise for the target length
   if (use_binned_log_lengths_) {
     jump_magnitude = BinnedLogLength(jump_magnitude);
-    effective_length = BinnedLogLength(src_len);
-  }
-
-  if (true) {
-    static map<int, map<int, int> > len2jump2fid;
-    int& fid = len2jump2fid[src_len][raw_jump];
-    if (!fid) {
-      ostringstream os;
-      os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude;
-      fid = FD::Convert(os.str());
-    }
-    features->set_value(fid, 1.0);
+    effective_src_len = BinnedLogLength(src_len);
+    effective_trg_len = BinnedLogLength(effective_trg_len);
+  }
+  NewJumpFeatureKey key(jump_magnitude,jtype,0,0,0,0,0);  // seven initializers for eight fields; NOTE(review): presumably boost::tuple value-initializes the omitted <7> - confirm
+  using boost::get;
+  if (flen_)  get<2>(key) = effective_src_len;  // fields of disabled options keep their initial value
+  if (elen_)  get<3>(key) = effective_trg_len;
+  if (f0_)    get<4>(key) = GetSourceWord(id, cur_src_index);
+  if (fm1_)   get<5>(key) = GetSourceWord(id, cur_src_index - 1);
+  if (fp1_)   get<6>(key) = GetSourceWord(id, cur_src_index + 1);
+  if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index);
+
+  static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids;  // NOTE(review): function-local static, so shared by all NewJump instances while the key omits fid_str_ - confirm differently configured instances cannot build equal keys
+  int& fid = fids[key];
+  if (!fid) {  // 0 is treated as "no id cached yet for this key"
+    ostringstream os;
+    os << fid_str_ << ':' << jtype << jump_magnitude;  // name = config prefix, direction letter, magnitude, then one ':'-separated field per enabled option
+    if (flen_)  os << ':' << get<2>(key);
+    if (elen_)  os << ':' << get<3>(key);
+    if (f0_)    os << ':' << TD::Convert(get<4>(key));
+    if (fm1_)   os << ':' << TD::Convert(get<5>(key));
+    if (fp1_)   os << ':' << TD::Convert(get<6>(key));
+    if (fprev_) os << ':' << TD::Convert(get<7>(key));    
+    fid = FD::Convert(os.str());
   }
+  features->set_value(fid, 1.0);  // the feature is binary: it fires with value 1.0
 }
 
 void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
@@ -387,6 +272,7 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                        SparseVector<double>* /* estimated_features */,
                                        void* state) const {
   unsigned char& dpstate = *((unsigned char*)state);
+  // IMPORTANT: this only fires on non-Null transitions!
   const int flen = smeta.GetSourceLength();
   if (edge.Arity() == 0) {
     dpstate = static_cast<unsigned int>(edge.i_);
@@ -427,6 +313,23 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
 
 SourceBigram::SourceBigram(const std::string& param) :
     FeatureFunction(sizeof(WordID) + sizeof(int)) {
+  fid_str_ = "SB:";  // default feature-name prefix, used when param is empty
+  if (param.size() > 0) {
+    vector<string> argv;
+    int argc = SplitOnWhitespace(param, &argv);
+    if (argc != 2) {  // a non-empty param must hold exactly two tokens: FEATURE_NAME_PREFIX and PATH
+      cerr << "SourceBigram [FEATURE_NAME_PREFIX PATH]\n";
+      abort();
+    }
+    fid_str_ = argv[0] + ":";  // caller-chosen prefix replaces the default
+    lexmap_.reset(new FactoredLexiconHelper(argv[1], "*"));  // NOTE(review): presumably loads the alternate source view from PATH; meaning of "*" is defined in factored_lexicon_helper.h - confirm
+  } else {
+    lexmap_.reset(new FactoredLexiconHelper);  // no PATH given: default-constructed helper
+  }
+}
+
+void SourceBigram::PrepareForInput(const SentenceMetadata& smeta) {  // hands the sentence metadata to the lexicon helper
+  lexmap_->PrepareForInput(smeta);  // lexmap_ is always set by the constructor, so it is non-null here
 }
 
 void SourceBigram::FinalTraversalFeatures(const void* context,
@@ -445,7 +348,7 @@ void SourceBigram::FireFeature(WordID left,
   // TODO important important !!! escape strings !!!
   if (!fid) {
     ostringstream os;
-    os << "SB:";
+    os << fid_str_;
     if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
     os << '_';
     if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
@@ -465,85 +368,7 @@ void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   int& out_word_count = *(static_cast<int*>(context) + 1);
   const int arity = edge.Arity();
   if (arity == 0) {
-    out_context = edge.rule_->f()[0];
-    out_word_count = edge.rule_->EWords();
-    assert(out_word_count == 1); // this is only defined for lex translation!
-    // revisit this if you want to translate into null words
-  } else if (arity == 2) {
-    WordID left = *static_cast<const WordID*>(ant_contexts[0]);
-    WordID right = *static_cast<const WordID*>(ant_contexts[1]);
-    int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
-    int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
-    if (left_wc == 1 && right_wc == 1)
-      FireFeature(-1, left, features);
-    FireFeature(left, right, features);
-    out_word_count = left_wc + right_wc;
-    out_context = right;
-  }
-}
-// state: POS of src word used, number of trg words generated
-SourcePOSBigram::SourcePOSBigram(const std::string& param) :
-    FeatureFunction(sizeof(WordID) + sizeof(int)) {
-  cerr << "Reading source POS tags from " << param << endl;
-  ReadFile rf(param);
-  istream& in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    vector<WordID> v;
-    TD::ConvertSentence(line, &v);
-    pos_.push_back(v);
-  }
-  cerr << "  (" << pos_.size() << " lines)\n";
-}
-
-void SourcePOSBigram::FinalTraversalFeatures(const void* context,
-                                      SparseVector<double>* features) const {
-  WordID left = *static_cast<const WordID*>(context);
-  int left_wc = *(static_cast<const int*>(context) + 1);
-  if (left_wc == 1)
-    FireFeature(-1, left, features);
-  FireFeature(left, -1, features);
-}
-
-void SourcePOSBigram::FireFeature(WordID left,
-                   WordID right,
-                   SparseVector<double>* features) const {
-  int& fid = fmap_[left][right];
-  if (!fid) {
-    ostringstream os;
-    os << "SP:";
-    if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
-    os << '_';
-    if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
-    fid = FD::Convert(os.str());
-    if (fid == 0) fid = -1;
-  }
-  if (fid < 0) return;
-  features->set_value(fid, 1.0);
-}
-
-void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                            SparseVector<double>* /* estimated_features */,
-                                     void* context) const {
-  WordID& out_context = *static_cast<WordID*>(context);
-  int& out_word_count = *(static_cast<int*>(context) + 1);
-  const int arity = edge.Arity();
-  if (arity == 0) {
-    assert(smeta.GetSentenceID() < pos_.size());
-    const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()];
-    if (edge.i_ >= 0) {  // non-NULL source
-      assert(edge.i_ < pos_sent.size());
-      out_context = pos_sent[edge.i_];
-    } else { // NULL source
-      // should assert that source is kNULL?
-      static const WordID kNULL = TD::Convert("<eps>");
-      out_context = kNULL;
-    }
+    out_context = lexmap_->SourceWordAtPosition(edge.i_);
     out_word_count = edge.rule_->EWords();
     assert(out_word_count == 1); // this is only defined for lex translation!
     // revisit this if you want to translate into null words
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
index 418c8768..a1ffd9ca 100644
--- a/decoder/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -3,7 +3,9 @@
 
 #include "ff.h"
 #include "array2d.h"
+#include "factored_lexicon_helper.h"
 
+#include <boost/scoped_ptr.hpp>
 #include <boost/multi_array.hpp>
 
 class RelativeSentencePosition : public FeatureFunction {
@@ -23,64 +25,6 @@ class RelativeSentencePosition : public FeatureFunction {
   std::map<WordID, int> fids_;  // fclass -> fid
 };
 
-class Model2BinaryFeatures : public FeatureFunction {
- public:
-  Model2BinaryFeatures(const std::string& param);
- protected:
-  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* out_context) const;
- private:
-  boost::multi_array<int, 3> fids_;
-};
-
-class MarkovJump : public FeatureFunction {
- public:
-  MarkovJump(const std::string& param);
- protected:
-  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* out_context) const;
- private:
-  const int fid_;
-  const int fid_lex_null_;
-  const int fid_null_lex_;
-  const int fid_null_null_;
-  const int fid_lex_lex_;
-
-  bool binary_params_;
-  std::vector<std::map<int, int> > flen2jump2fid_;
-};
-
-class MarkovJumpFClass : public FeatureFunction {
- public:
-  MarkovJumpFClass(const std::string& param);
-  virtual void FinalTraversalFeatures(const void* context,
-                                      SparseVector<double>* features) const;
- protected:
-  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* context) const;
-
-  void FireFeature(const SentenceMetadata& smeta,
-                   int prev_src_pos,
-                   int cur_src_pos,
-                   SparseVector<double>* features) const;
-
- private:
-  std::vector<std::map<WordID, std::map<int, int> > > fids_;  // flen -> fclass -> jumpsize -> fid
-  std::vector<std::vector<WordID> > pos_;
-};
-
 typedef std::map<WordID, int> Class2FID;
 typedef std::map<WordID, Class2FID> Class2Class2FID;
 typedef std::map<WordID, Class2Class2FID> Class2Class2Class2FID;
@@ -89,6 +33,7 @@ class SourceBigram : public FeatureFunction {
   SourceBigram(const std::string& param);
   virtual void FinalTraversalFeatures(const void* context,
                                       SparseVector<double>* features) const;
+  void PrepareForInput(const SentenceMetadata& smeta);
  protected:
   virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                      const Hypergraph::Edge& edge,
@@ -100,7 +45,9 @@ class SourceBigram : public FeatureFunction {
   void FireFeature(WordID src,
                    WordID trg,
                    SparseVector<double>* features) const;
+  std::string fid_str_;
   mutable Class2Class2FID fmap_;
+  boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source
 };
 
 class LexNullJump : public FeatureFunction {
@@ -136,30 +83,27 @@ class NewJump : public FeatureFunction {
                    const int cur_src_index,
                    SparseVector<double>* features) const;
 
+  WordID GetSourceWord(int sentence_id, int index) const {  // word at position index of sentence sentence_id in src_, or a BOS/EOS sentinel outside the sentence
+    if (index < 0) return kBOS_;  // before the first word
+    assert(src_.size() > sentence_id);  // the sentence must exist in src_; NOTE(review): signed/unsigned comparison, and no check remains if asserts are compiled out
+    const std::vector<WordID>& v = src_[sentence_id];
+    if (index >= v.size()) return kEOS_;  // at or past the end of the sentence (index is known to be non-negative here)
+    return v[index];
+  }
+
+  const WordID kBOS_;
+  const WordID kEOS_;
   bool use_binned_log_lengths_;
+  bool flen_;
+  bool elen_;
+  bool f0_;
+  bool fm1_;
+  bool fp1_;
+  bool fprev_;
+  std::vector<std::vector<WordID> > src_;
   std::string fid_str_;  // identifies configuration uniquely
 };
 
-class SourcePOSBigram : public FeatureFunction {
- public:
-  SourcePOSBigram(const std::string& param);
-  virtual void FinalTraversalFeatures(const void* context,
-                                      SparseVector<double>* features) const;
- protected:
-  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* context) const;
- private:
-  void FireFeature(WordID src,
-                   WordID trg,
-                   SparseVector<double>* features) const;
-  mutable Class2Class2FID fmap_;
-  std::vector<std::vector<WordID> > pos_;
-};
-
 class LexicalTranslationTrigger : public FeatureFunction {
  public:
   LexicalTranslationTrigger(const std::string& param);
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index 149cd68d..f237295c 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -81,7 +81,7 @@ struct LexicalTransImpl {
     for (int i = 0; i < ref.size(); ++i) {
       target_vocab.insert(ref[i][0].label);
     }
-    bool all_sources_to_all_targets_ = false;
+    bool all_sources_to_all_targets_ = false; // TODO configure this
     set<WordID> trgs_used;
     for (int i = 0; i < e_len; ++i) {  // for each word in the *target*
       Hypergraph::Node* node = forest->AddNode(kXCAT);
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 81ac4198..f5ee5d3f 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -120,17 +120,19 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
 
 feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz
 feature_function=LexicalPairIdentity
-feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
+# stem translation
 feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map
+# POS translation
+feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
 feature_function=InputIdentity
 feature_function=OutputIdentity
 feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first
-# the following two are deprecated
-feature_function=MarkovJump +b
-feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first
+feature_function=NewJump
+feature_function=NewJump use_binned_log_lengths flen
+# jump distance and src and destination class type
+feature_function=NewJump use_binned_log_lengths f0 fprev f:$align_dir/grammars/corpus.class.$first
 feature_function=SourceBigram
-# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does
-feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first
+feature_function=SourceBigram SC $align_dir/grammars/corpus.class.$first
 EOT
   close CDEC;
   open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!";
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index be0644df..1a069abf 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,14 +1,13 @@
-all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
+all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map
 
 clean:
-	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs*
+	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs*
 
 SUPPORT_DIR = $(SCRIPT_DIR)/support
 GZIP = /usr/bin/gzip
 ZCAT = zcat
 EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
 EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
-GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl
 GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl
 ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
 ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
@@ -84,6 +83,3 @@ corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1
 wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1
 	$(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@
 
-corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e
-	$(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@
-
diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl
index b28f6feb..54b89ce1 100755
--- a/word-aligner/support/generate_word_pair_features.pl
+++ b/word-aligner/support/generate_word_pair_features.pl
@@ -92,7 +92,7 @@ my $ADD_ID = 1;
 my $ADD_PUNC = 1;
 my $ADD_NULL = 1;
 my $ADD_MODEL1 = 1;
-my $ADD_NOMODEL1 = 1;
+my $ADD_NOMODEL1 = 0;
 my $BEAM_RATIO = 50;
 my $BIN_ORTHO = 1;
 my $BIN_DLEN = 1;
@@ -171,7 +171,7 @@ for my $f (sort keys %fdict) {
         }
         if ($im1 > $MIN_MAGNITUDE) {
           push @feats, "InvModel1=$im1" if $im1;
-        } else {
+        } elsif ($ADD_NOMODEL1) {
           push @feats, 'NoInvModel1=1';
         }
         my $am1 = sprintf("%.5g", sqrt($m1 * $im1));
author	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-09 17:04:29 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-09 17:04:29 -0500
commit	9a8cbe4db88e63378b6d3c4ec96438819f1f1131 (patch)
tree	abf1a23739a033eaabd62f61e39ac249d9cf7717
parent	61bfaf15c02a0555d8ffa5dd4e6ae32f09354610 (diff)