major refactor of markov features for word alignment

author: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-09 17:04:29 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2010-12-09 17:04:29 -0500
commit: 35142ef52f15d610ca08fa622b83594cf111ce4a (patch)
tree: c2196761993353bca47c7073e6cb5d996c4dad8f /decoder/ff_wordalign.cc
parent: a80c69d266886d9911eb91833811d7f8393ac64d (diff)
1 files changed, 128 insertions, 303 deletions
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 980c64ad..338f1a72 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -6,7 +6,13 @@
 #include <sstream>
 #include <string>
 #include <cmath>
+#include <tr1/unordered_map>
 
+#include <boost/tuple/tuple.hpp>
+#include "boost/tuple/tuple_comparison.hpp"
+#include <boost/functional/hash.hpp>
+
+#include "factored_lexicon_helper.h"
 #include "verbose.h"
 #include "alignment_pharaoh.h"
 #include "stringlib.h"
@@ -25,43 +31,6 @@ using namespace std;
 
 // TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature
 
-Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
-    fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
-  for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
-    for (int j = 0; j < i; ++j) {
-      for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) {
-        int& val = fids_[i][j][k];
-        val = -1;
-        if (j < i) {
-          ostringstream os;
-          os << "M2FL:" << i << ":TI:" << k << "_SI:" << j;
-          val = FD::Convert(os.str());
-        }
-      }
-    }
-  }
-}
-
-void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                                 const Hypergraph::Edge& edge,
-                                                 const vector<const void*>& /*ant_states*/,
-                                                 SparseVector<double>* features,
-                                                 SparseVector<double>* // estimated_features
-                                                 ,
-                                                 void* // state
-  ) const {
-  // if the source word is either null or the generated word
-  // has no position in the reference
-  if (edge.i_ == -1 || edge.prev_i_ == -1)
-    return;
-
-  assert(smeta.GetTargetLength() > 0);
-  const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_];
-  features->set_value(fid, 1.0);
-//  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
-}
-
-
 RelativeSentencePosition::RelativeSentencePosition(const string& param) :
     fid_(FD::Convert("RelativeSentencePosition")) {
   if (!param.empty()) {
@@ -119,87 +88,6 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme
 //  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
 }
 
-MarkovJumpFClass::MarkovJumpFClass(const string& param) :
-    FeatureFunction(1),
-    fids_(MAX_SENTENCE_SIZE) {
-  cerr << "    MarkovJumpFClass" << endl;
-  cerr << "Reading source POS tags from " << param << endl;
-  ReadFile rf(param);
-  istream& in = *rf.stream();
-  set<WordID> classes;
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    vector<WordID> v;
-    TD::ConvertSentence(line, &v);
-    pos_.push_back(v);
-    for (int i = 0; i < v.size(); ++i)
-      classes.insert(v[i]);
-  }
-  cerr << "  (" << pos_.size() << " lines)\n";
-  cerr << "  Classes: " << classes.size() << endl;
-  for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) {
-    map<WordID, map<int, int> >& cfids = fids_[ss];
-    for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) {
-      map<int, int> &fids = cfids[*i];
-      for (int j = -ss; j <= ss; ++j) {
-        ostringstream os;
-        os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j;
-        fids[j] = FD::Convert(os.str());
-      }
-    }
-  }
-}
-
-void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta,
-                                   int prev_src_pos,
-                                   int cur_src_pos,
-                                   SparseVector<double>* features) const {
-  if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i)
-    return;
-
-  const int jumpsize = cur_src_pos - prev_src_pos;
-
-  assert(smeta.GetSentenceID() < pos_.size());
-  const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos];
-  const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
-  features->set_value(fid, 1.0);
-}
-
-void MarkovJumpFClass::FinalTraversalFeatures(const void* context,
-                                      SparseVector<double>* features) const {
-  int left_index = *static_cast<const unsigned char*>(context);
-//  int right_index = cur_flen;
-  // TODO
-}
-
-void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_states,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* /* estimated_features */,
-                                     void* state) const {
-  unsigned char& dpstate = *((unsigned char*)state);
-  if (edge.Arity() == 0) {
-    dpstate = static_cast<unsigned int>(edge.i_);
-  } else if (edge.Arity() == 1) {
-    dpstate = *((unsigned char*)ant_states[0]);
-  } else if (edge.Arity() == 2) {
-    int left_index = *((unsigned char*)ant_states[0]);
-    int right_index = *((unsigned char*)ant_states[1]);
-    if (right_index == -1)
-      dpstate = static_cast<unsigned int>(left_index);
-    else
-      dpstate = static_cast<unsigned int>(right_index);
-//    const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index];
-//    cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl;
-//    const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
-//    features->set_value(fid, 1.0);
-    FireFeature(smeta, left_index, right_index, features);
-  }
-}
-
 LexNullJump::LexNullJump(const string& param) :
     FeatureFunction(1),
     fid_lex_null_(FD::Convert("JumpLexNull")),
@@ -239,107 +127,71 @@ void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
-MarkovJump::MarkovJump(const string& param) :
+NewJump::NewJump(const string& param) :
     FeatureFunction(1),
-    fid_(FD::Convert("MarkovJump")),
-    fid_lex_null_(FD::Convert("JumpLexNull")),
-    fid_null_lex_(FD::Convert("JumpNullLex")),
-    fid_null_null_(FD::Convert("JumpNullNull")),
-    fid_lex_lex_(FD::Convert("JumpLexLex")),
-    binary_params_(false) {
-  cerr << "    MarkovJump";
+    kBOS_(TD::Convert("BOS")),
+    kEOS_(TD::Convert("EOS")) {
+  cerr << "    NewJump";
   vector<string> argv;
+  set<string> permitted;
+  permitted.insert("use_binned_log_lengths");
+  permitted.insert("flen");
+  permitted.insert("elen");
+  permitted.insert("fprev");
+  permitted.insert("f0");
+  permitted.insert("f-1");
+  permitted.insert("f+1");
+  // also permitted f:FILENAME
   int argc = SplitOnWhitespace(param, &argv);
-  if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) {
-    cerr << "MarkovJump: expected parameters to be -b or +b\n";
-    exit(1);
-  }
-  binary_params_ = argv[0] == "+b";
-  if (binary_params_) {
-    flen2jump2fid_.resize(MAX_SENTENCE_SIZE);
-    for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
-      map<int, int>& jump2fid = flen2jump2fid_[i];
-      for (int jump = -i; jump <= i; ++jump) {
-        ostringstream os;
-        os << "Jump:FLen:" << i << "_J:" << jump;
-        jump2fid[jump] = FD::Convert(os.str());
-      }
-    }
-  } else {
-    cerr << " (Blunsom & Cohn definition)";
-  }
-  cerr << endl;
-}
-
-// TODO handle NULLs according to Och 2000?
-void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                       const Hypergraph::Edge& edge,
-                                       const vector<const void*>& ant_states,
-                                       SparseVector<double>* features,
-                                       SparseVector<double>* /* estimated_features */,
-                                       void* state) const {
-  unsigned char& dpstate = *((unsigned char*)state);
-  const int flen = smeta.GetSourceLength();
-  if (edge.Arity() == 0) {
-    dpstate = static_cast<unsigned int>(edge.i_);
-    if (edge.prev_i_ == 0) {     // first word in sentence
-      if (edge.i_ >= 0 && binary_params_) {
-        const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;
-        features->set_value(fid, 1.0);
-      } else if (edge.i_ < 0 && binary_params_) {
-        // handled by bigram features
-      }
-    } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {
-      if (edge.i_ >= 0 && binary_params_) {
-        int jumpsize = flen - edge.i_;
-        const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
-        features->set_value(fid, 1.0);
-      } else if (edge.i_ < 0 && binary_params_) {
-        // handled by bigram features
-      }
-    }
-  } else if (edge.Arity() == 1) {
-    dpstate = *((unsigned char*)ant_states[0]);
-  } else if (edge.Arity() == 2) {
-    int left_index = *((unsigned char*)ant_states[0]);
-    int right_index = *((unsigned char*)ant_states[1]);
-    if (right_index == -1)
-      dpstate = static_cast<unsigned int>(left_index);
-    else
-      dpstate = static_cast<unsigned int>(right_index);
-    if (left_index == kNULL_i || right_index == kNULL_i) {
-      if (left_index == kNULL_i && right_index == kNULL_i)
-        features->set_value(fid_null_null_, 1.0);
-      else if (left_index == kNULL_i)
-        features->set_value(fid_null_lex_, 1.0);
-      else
-        features->set_value(fid_lex_null_, 1.0);
-
+  set<string> config;
+  string f_file;
+  for (int i = 0; i < argc; ++i) {
+    if (argv[i].size() > 2 && argv[i].find("f:") == 0) {
+      assert(f_file.empty());  // only one f file!
+      f_file = argv[i].substr(2);
+      cerr << " source_file=" << f_file;
     } else {
-      features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled
-      const int jumpsize = right_index - left_index;
-
-      if (binary_params_) {
-        const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
-        features->set_value(fid, 1.0);
+      if (permitted.count(argv[i])) {
+        assert(config.count(argv[i]) == 0);
+        config.insert(argv[i]);
+        cerr << " " << argv[i];
       } else {
-        features->set_value(fid_, fabs(jumpsize - 1));  // Blunsom and Cohn def
+        cerr << "\nNewJump: don't understand param '" << argv[i] << "'\n";
+        abort();
       }
     }
-  } else {
-    assert(!"something really unexpected is happening");
   }
-}
-
-NewJump::NewJump(const string& param) :
-    FeatureFunction(1) {
-  cerr << "    NewJump";
-  vector<string> argv;
-  int argc = SplitOnWhitespace(param, &argv);
-  set<string> config;
-  for (int i = 0; i < argc; ++i) config.insert(argv[i]);
   cerr << endl;
   use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0;
+  f0_ = config.count("f0") > 0;
+  fm1_ = config.count("f-1") > 0;
+  fp1_ = config.count("f+1") > 0;
+  fprev_ = config.count("fprev") > 0;
+  elen_ = config.count("elen") > 0;
+  flen_ = config.count("flen") > 0;
+  if (f0_ || fm1_ || fp1_ || fprev_) {
+    if (f_file.empty()) {
+      cerr << "NewJump: conditioning on src but f:FILE not specified!\n";
+      abort();
+    }
+    ReadFile rf(f_file);
+    istream& in = *rf.stream();
+    string line;
+    while(in) {
+      getline(in, line);
+      if (!in) continue;
+      vector<WordID> v;
+      TD::ConvertSentence(line, &v);
+      src_.push_back(v);
+    }
+  }
+  fid_str_ = "J";
+  if (flen_) fid_str_ += "F";
+  if (elen_) fid_str_ += "E";
+  if (f0_) fid_str_ += "C";
+  if (fm1_) fid_str_ += "L";
+  if (fp1_) fid_str_ += "R";
+  if (fprev_) fid_str_ += "P";
 }
 
 // do a log transform on the length (of a sentence, a jump, etc)
@@ -351,33 +203,66 @@ int BinnedLogLength(int len) {
   return res;
 }
 
+// <0>=jump size <1>=jump_dir <2>=flen, <3>=elen, <4>=f0, <5>=f-1, <6>=f+1, <7>=fprev
+typedef boost::tuple<short, char, short, short, WordID, WordID, WordID, WordID> NewJumpFeatureKey;
+
+struct KeyHash : unary_function<NewJumpFeatureKey, size_t> {
+  size_t operator()(const NewJumpFeatureKey& k) const {
+    size_t h = 0x37473DEF321;
+    boost::hash_combine(h, k.get<0>());
+    boost::hash_combine(h, k.get<1>());
+    boost::hash_combine(h, k.get<2>());
+    boost::hash_combine(h, k.get<3>());
+    boost::hash_combine(h, k.get<4>());
+    boost::hash_combine(h, k.get<5>());
+    boost::hash_combine(h, k.get<6>());
+    boost::hash_combine(h, k.get<7>());
+    return h;
+  }
+};
+
 void NewJump::FireFeature(const SentenceMetadata& smeta,
                           const int prev_src_index,
                           const int cur_src_index,
                           SparseVector<double>* features) const {
+  const int id = smeta.GetSentenceID();
   const int src_len = smeta.GetSourceLength();
   const int raw_jump = cur_src_index - prev_src_index;
+  short jump_magnitude = raw_jump;
   char jtype = 0;
-  int jump_magnitude = raw_jump;
   if (raw_jump > 0) { jtype = 'R'; } // Right
   else if (raw_jump == 0) { jtype = 'S'; } // Stay
   else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left
-  int effective_length = src_len;
+  int effective_src_len = src_len;
+  int effective_trg_len = smeta.GetTargetLength();
   if (use_binned_log_lengths_) {
     jump_magnitude = BinnedLogLength(jump_magnitude);
-    effective_length = BinnedLogLength(src_len);
-  }
-
-  if (true) {
-    static map<int, map<int, int> > len2jump2fid;
-    int& fid = len2jump2fid[src_len][raw_jump];
-    if (!fid) {
-      ostringstream os;
-      os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude;
-      fid = FD::Convert(os.str());
-    }
-    features->set_value(fid, 1.0);
+    effective_src_len = BinnedLogLength(src_len);
+    effective_trg_len = BinnedLogLength(effective_trg_len);
+  }
+  NewJumpFeatureKey key(jump_magnitude,jtype,0,0,0,0,0);
+  using boost::get;
+  if (flen_)  get<2>(key) = effective_src_len;
+  if (elen_)  get<3>(key) = effective_trg_len;
+  if (f0_)    get<4>(key) = GetSourceWord(id, cur_src_index);
+  if (fm1_)   get<5>(key) = GetSourceWord(id, cur_src_index - 1);
+  if (fp1_)   get<6>(key) = GetSourceWord(id, cur_src_index + 1);
+  if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index);
+
+  static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids;
+  int& fid = fids[key];
+  if (!fid) {
+    ostringstream os;
+    os << fid_str_ << ':' << jtype << jump_magnitude;
+    if (flen_)  os << ':' << get<2>(key);
+    if (elen_)  os << ':' << get<3>(key);
+    if (f0_)    os << ':' << TD::Convert(get<4>(key));
+    if (fm1_)   os << ':' << TD::Convert(get<5>(key));
+    if (fp1_)   os << ':' << TD::Convert(get<6>(key));
+    if (fprev_) os << ':' << TD::Convert(get<7>(key));    
+    fid = FD::Convert(os.str());
   }
+  features->set_value(fid, 1.0);
 }
 
 void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
@@ -387,6 +272,7 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                        SparseVector<double>* /* estimated_features */,
                                        void* state) const {
   unsigned char& dpstate = *((unsigned char*)state);
+  // IMPORTANT: this only fires on non-Null transitions!
   const int flen = smeta.GetSourceLength();
   if (edge.Arity() == 0) {
     dpstate = static_cast<unsigned int>(edge.i_);
@@ -427,6 +313,23 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
 
 SourceBigram::SourceBigram(const std::string& param) :
     FeatureFunction(sizeof(WordID) + sizeof(int)) {
+  fid_str_ = "SB:";
+  if (param.size() > 0) {
+    vector<string> argv;
+    int argc = SplitOnWhitespace(param, &argv);
+    if (argc != 2) {
+      cerr << "SourceBigram [FEATURE_NAME_PREFIX PATH]\n";
+      abort();
+    }
+    fid_str_ = argv[0] + ":";
+    lexmap_.reset(new FactoredLexiconHelper(argv[1], "*"));
+  } else {
+    lexmap_.reset(new FactoredLexiconHelper);
+  }
+}
+
+void SourceBigram::PrepareForInput(const SentenceMetadata& smeta) {
+  lexmap_->PrepareForInput(smeta);
 }
 
 void SourceBigram::FinalTraversalFeatures(const void* context,
@@ -445,7 +348,7 @@ void SourceBigram::FireFeature(WordID left,
   // TODO important important !!! escape strings !!!
   if (!fid) {
     ostringstream os;
-    os << "SB:";
+    os << fid_str_;
     if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
     os << '_';
     if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
@@ -465,85 +368,7 @@ void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   int& out_word_count = *(static_cast<int*>(context) + 1);
   const int arity = edge.Arity();
   if (arity == 0) {
-    out_context = edge.rule_->f()[0];
-    out_word_count = edge.rule_->EWords();
-    assert(out_word_count == 1); // this is only defined for lex translation!
-    // revisit this if you want to translate into null words
-  } else if (arity == 2) {
-    WordID left = *static_cast<const WordID*>(ant_contexts[0]);
-    WordID right = *static_cast<const WordID*>(ant_contexts[1]);
-    int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
-    int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1);
-    if (left_wc == 1 && right_wc == 1)
-      FireFeature(-1, left, features);
-    FireFeature(left, right, features);
-    out_word_count = left_wc + right_wc;
-    out_context = right;
-  }
-}
-// state: POS of src word used, number of trg words generated
-SourcePOSBigram::SourcePOSBigram(const std::string& param) :
-    FeatureFunction(sizeof(WordID) + sizeof(int)) {
-  cerr << "Reading source POS tags from " << param << endl;
-  ReadFile rf(param);
-  istream& in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    vector<WordID> v;
-    TD::ConvertSentence(line, &v);
-    pos_.push_back(v);
-  }
-  cerr << "  (" << pos_.size() << " lines)\n";
-}
-
-void SourcePOSBigram::FinalTraversalFeatures(const void* context,
-                                      SparseVector<double>* features) const {
-  WordID left = *static_cast<const WordID*>(context);
-  int left_wc = *(static_cast<const int*>(context) + 1);
-  if (left_wc == 1)
-    FireFeature(-1, left, features);
-  FireFeature(left, -1, features);
-}
-
-void SourcePOSBigram::FireFeature(WordID left,
-                   WordID right,
-                   SparseVector<double>* features) const {
-  int& fid = fmap_[left][right];
-  if (!fid) {
-    ostringstream os;
-    os << "SP:";
-    if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
-    os << '_';
-    if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); }
-    fid = FD::Convert(os.str());
-    if (fid == 0) fid = -1;
-  }
-  if (fid < 0) return;
-  features->set_value(fid, 1.0);
-}
-
-void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                            SparseVector<double>* /* estimated_features */,
-                                     void* context) const {
-  WordID& out_context = *static_cast<WordID*>(context);
-  int& out_word_count = *(static_cast<int*>(context) + 1);
-  const int arity = edge.Arity();
-  if (arity == 0) {
-    assert(smeta.GetSentenceID() < pos_.size());
-    const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()];
-    if (edge.i_ >= 0) {  // non-NULL source
-      assert(edge.i_ < pos_sent.size());
-      out_context = pos_sent[edge.i_];
-    } else { // NULL source
-      // should assert that source is kNULL?
-      static const WordID kNULL = TD::Convert("<eps>");
-      out_context = kNULL;
-    }
+    out_context = lexmap_->SourceWordAtPosition(edge.i_);
     out_word_count = edge.rule_->EWords();
     assert(out_word_count == 1); // this is only defined for lex translation!
     // revisit this if you want to translate into null words
author	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-09 17:04:29 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2010-12-09 17:04:29 -0500
commit	35142ef52f15d610ca08fa622b83594cf111ce4a (patch)
tree	c2196761993353bca47c7073e6cb5d996c4dad8f /decoder/ff_wordalign.cc
parent	a80c69d266886d9911eb91833811d7f8393ac64d (diff)