diff options
| -rw-r--r-- | decoder/cdec_ff.cc | 4 | ||||
| -rw-r--r-- | decoder/ff_wordalign.cc | 431 | ||||
| -rw-r--r-- | decoder/ff_wordalign.h | 100 | ||||
| -rw-r--r-- | decoder/lextrans.cc | 2 | ||||
| -rwxr-xr-x | word-aligner/aligner.pl | 14 | ||||
| -rw-r--r-- | word-aligner/makefiles/makefile.grammars | 8 | ||||
| -rwxr-xr-x | word-aligner/support/generate_word_pair_features.pl | 4 | 
7 files changed, 163 insertions, 400 deletions
| diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index d6cf4572..e87ab5ab 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -50,13 +50,9 @@ void register_feature_functions() {  #endif    ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>);    ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>); -  ff_registry.Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);    ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>);    ff_registry.Register("NewJump", new FFFactory<NewJump>); -  ff_registry.Register("MarkovJump", new FFFactory<MarkovJump>); -  ff_registry.Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>);    ff_registry.Register("SourceBigram", new FFFactory<SourceBigram>); -  ff_registry.Register("SourcePOSBigram", new FFFactory<SourcePOSBigram>);    ff_registry.Register("BlunsomSynchronousParseHack", new FFFactory<BlunsomSynchronousParseHack>);    ff_registry.Register("AlignerResults", new FFFactory<AlignerResults>);    ff_registry.Register("CSplit_BasicFeatures", new FFFactory<BasicCSplitFeatures>); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 980c64ad..338f1a72 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -6,7 +6,13 @@  #include <sstream>  #include <string>  #include <cmath> +#include <tr1/unordered_map> +#include <boost/tuple/tuple.hpp> +#include "boost/tuple/tuple_comparison.hpp" +#include <boost/functional/hash.hpp> + +#include "factored_lexicon_helper.h"  #include "verbose.h"  #include "alignment_pharaoh.h"  #include "stringlib.h" @@ -25,43 +31,6 @@ using namespace std;  // TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature -Model2BinaryFeatures::Model2BinaryFeatures(const string& ) : -    fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { -  for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { -    for (int j = 0; j < i; ++j) { -      for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) { -        int& val = fids_[i][j][k]; -        val = -1; -        if (j < i) { -          ostringstream os; -          os << "M2FL:" << i << ":TI:" << k << "_SI:" << j; -          val = FD::Convert(os.str()); -        } -      } -    } -  } -} - -void Model2BinaryFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                                 const Hypergraph::Edge& edge, -                                                 const vector<const void*>& /*ant_states*/, -                                                 SparseVector<double>* features, -                                                 SparseVector<double>* // estimated_features -                                                 , -                                                 void* // state -  ) const { -  // if the source word is either null or the generated word -  // has no position in the reference -  if (edge.i_ == -1 || edge.prev_i_ == -1) -    return; - -  assert(smeta.GetTargetLength() > 0); -  const int fid = fids_[smeta.GetSourceLength()][edge.i_][edge.prev_i_]; -  features->set_value(fid, 1.0); -//  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; -} - -  RelativeSentencePosition::RelativeSentencePosition(const string& param) :      fid_(FD::Convert("RelativeSentencePosition")) {    if (!param.empty()) { @@ -119,87 +88,6 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme  //  cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;  } -MarkovJumpFClass::MarkovJumpFClass(const string& param) : -    FeatureFunction(1), -    fids_(MAX_SENTENCE_SIZE) { -  cerr << "    MarkovJumpFClass" << endl; -  cerr << "Reading source POS tags from " << param << endl; -  ReadFile rf(param); -  istream& in = *rf.stream(); -  set<WordID> classes; -  while(in) { -    string line; -    getline(in, line); -    if (line.empty()) continue; -    vector<WordID> v; -    TD::ConvertSentence(line, &v); -    pos_.push_back(v); -    for (int i = 0; i < v.size(); ++i) -      classes.insert(v[i]); -  } -  cerr << "  (" << pos_.size() << " lines)\n"; -  cerr << "  Classes: " << classes.size() << endl; -  for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) { -    map<WordID, map<int, int> >& cfids = fids_[ss]; -    for (set<WordID>::iterator i = classes.begin(); i != classes.end(); ++i) { -      map<int, int> &fids = cfids[*i]; -      for (int j = -ss; j <= ss; ++j) { -        ostringstream os; -        os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j; -        fids[j] = FD::Convert(os.str()); -      } -    } -  } -} - -void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, -                                   int prev_src_pos, -                                   int cur_src_pos, -                                   SparseVector<double>* features) const { -  if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i) -    return; - -  const int jumpsize = cur_src_pos - prev_src_pos; - -  assert(smeta.GetSentenceID() < pos_.size()); -  const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; -  const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; -  features->set_value(fid, 1.0); -} - -void MarkovJumpFClass::FinalTraversalFeatures(const void* context, -                                      SparseVector<double>* features) const { -  int left_index = *static_cast<const unsigned char*>(context); -//  int right_index = cur_flen; -  // TODO -} - -void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                     const Hypergraph::Edge& edge, -                                     const std::vector<const void*>& ant_states, -                                     SparseVector<double>* features, -                                     SparseVector<double>* /* estimated_features */, -                                     void* state) const { -  unsigned char& dpstate = *((unsigned char*)state); -  if (edge.Arity() == 0) { -    dpstate = static_cast<unsigned int>(edge.i_); -  } else if (edge.Arity() == 1) { -    dpstate = *((unsigned char*)ant_states[0]); -  } else if (edge.Arity() == 2) { -    int left_index = *((unsigned char*)ant_states[0]); -    int right_index = *((unsigned char*)ant_states[1]); -    if (right_index == -1) -      dpstate = static_cast<unsigned int>(left_index); -    else -      dpstate = static_cast<unsigned int>(right_index); -//    const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index]; -//    cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl; -//    const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; -//    features->set_value(fid, 1.0); -    FireFeature(smeta, left_index, right_index, features); -  } -} -  LexNullJump::LexNullJump(const string& param) :      FeatureFunction(1),      fid_lex_null_(FD::Convert("JumpLexNull")), @@ -239,107 +127,71 @@ void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,    }  } -MarkovJump::MarkovJump(const string& param) : +NewJump::NewJump(const string& param) :      FeatureFunction(1), -    fid_(FD::Convert("MarkovJump")), -    fid_lex_null_(FD::Convert("JumpLexNull")), -    fid_null_lex_(FD::Convert("JumpNullLex")), -    fid_null_null_(FD::Convert("JumpNullNull")), -    fid_lex_lex_(FD::Convert("JumpLexLex")), -    binary_params_(false) { -  cerr << "    MarkovJump"; +    kBOS_(TD::Convert("BOS")), +    kEOS_(TD::Convert("EOS")) { +  cerr << "    NewJump";    vector<string> argv; +  set<string> permitted; +  permitted.insert("use_binned_log_lengths"); +  permitted.insert("flen"); +  permitted.insert("elen"); +  permitted.insert("fprev"); +  permitted.insert("f0"); +  permitted.insert("f-1"); +  permitted.insert("f+1"); +  // also permitted f:FILENAME    int argc = SplitOnWhitespace(param, &argv); -  if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) { -    cerr << "MarkovJump: expected parameters to be -b or +b\n"; -    exit(1); -  } -  binary_params_ = argv[0] == "+b"; -  if (binary_params_) { -    flen2jump2fid_.resize(MAX_SENTENCE_SIZE); -    for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { -      map<int, int>& jump2fid = flen2jump2fid_[i]; -      for (int jump = -i; jump <= i; ++jump) { -        ostringstream os; -        os << "Jump:FLen:" << i << "_J:" << jump; -        jump2fid[jump] = FD::Convert(os.str()); -      } -    } -  } else { -    cerr << " (Blunsom & Cohn definition)"; -  } -  cerr << endl; -} - -// TODO handle NULLs according to Och 2000? -void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                       const Hypergraph::Edge& edge, -                                       const vector<const void*>& ant_states, -                                       SparseVector<double>* features, -                                       SparseVector<double>* /* estimated_features */, -                                       void* state) const { -  unsigned char& dpstate = *((unsigned char*)state); -  const int flen = smeta.GetSourceLength(); -  if (edge.Arity() == 0) { -    dpstate = static_cast<unsigned int>(edge.i_); -    if (edge.prev_i_ == 0) {     // first word in sentence -      if (edge.i_ >= 0 && binary_params_) { -        const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second; -        features->set_value(fid, 1.0); -      } else if (edge.i_ < 0 && binary_params_) { -        // handled by bigram features -      } -    } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) { -      if (edge.i_ >= 0 && binary_params_) { -        int jumpsize = flen - edge.i_; -        const int fid = flen2jump2fid_[flen].find(jumpsize)->second; -        features->set_value(fid, 1.0); -      } else if (edge.i_ < 0 && binary_params_) { -        // handled by bigram features -      } -    } -  } else if (edge.Arity() == 1) { -    dpstate = *((unsigned char*)ant_states[0]); -  } else if (edge.Arity() == 2) { -    int left_index = *((unsigned char*)ant_states[0]); -    int right_index = *((unsigned char*)ant_states[1]); -    if (right_index == -1) -      dpstate = static_cast<unsigned int>(left_index); -    else -      dpstate = static_cast<unsigned int>(right_index); -    if (left_index == kNULL_i || right_index == kNULL_i) { -      if (left_index == kNULL_i && right_index == kNULL_i) -        features->set_value(fid_null_null_, 1.0); -      else if (left_index == kNULL_i) -        features->set_value(fid_null_lex_, 1.0); -      else -        features->set_value(fid_lex_null_, 1.0); - +  set<string> config; +  string f_file; +  for (int i = 0; i < argc; ++i) { +    if (argv[i].size() > 2 && argv[i].find("f:") == 0) { +      assert(f_file.empty());  // only one f file! +      f_file = argv[i].substr(2); +      cerr << " source_file=" << f_file;      } else { -      features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled -      const int jumpsize = right_index - left_index; - -      if (binary_params_) { -        const int fid = flen2jump2fid_[flen].find(jumpsize)->second; -        features->set_value(fid, 1.0); +      if (permitted.count(argv[i])) { +        assert(config.count(argv[i]) == 0); +        config.insert(argv[i]); +        cerr << " " << argv[i];        } else { -        features->set_value(fid_, fabs(jumpsize - 1));  // Blunsom and Cohn def +        cerr << "\nNewJump: don't understand param '" << argv[i] << "'\n"; +        abort();        }      } -  } else { -    assert(!"something really unexpected is happening");    } -} - -NewJump::NewJump(const string& param) : -    FeatureFunction(1) { -  cerr << "    NewJump"; -  vector<string> argv; -  int argc = SplitOnWhitespace(param, &argv); -  set<string> config; -  for (int i = 0; i < argc; ++i) config.insert(argv[i]);    cerr << endl;    use_binned_log_lengths_ = config.count("use_binned_log_lengths") > 0; +  f0_ = config.count("f0") > 0; +  fm1_ = config.count("f-1") > 0; +  fp1_ = config.count("f+1") > 0; +  fprev_ = config.count("fprev") > 0; +  elen_ = config.count("elen") > 0; +  flen_ = config.count("flen") > 0; +  if (f0_ || fm1_ || fp1_ || fprev_) { +    if (f_file.empty()) { +      cerr << "NewJump: conditioning on src but f:FILE not specified!\n"; +      abort(); +    } +    ReadFile rf(f_file); +    istream& in = *rf.stream(); +    string line; +    while(in) { +      getline(in, line); +      if (!in) continue; +      vector<WordID> v; +      TD::ConvertSentence(line, &v); +      src_.push_back(v); +    } +  } +  fid_str_ = "J"; +  if (flen_) fid_str_ += "F"; +  if (elen_) fid_str_ += "E"; +  if (f0_) fid_str_ += "C"; +  if (fm1_) fid_str_ += "L"; +  if (fp1_) fid_str_ += "R"; +  if (fprev_) fid_str_ += "P";  }  // do a log transform on the length (of a sentence, a jump, etc) @@ -351,33 +203,66 @@ int BinnedLogLength(int len) {    return res;  } +// <0>=jump size <1>=jump_dir <2>=flen, <3>=elen, <4>=f0, <5>=f-1, <6>=f+1, <7>=fprev +typedef boost::tuple<short, char, short, short, WordID, WordID, WordID, WordID> NewJumpFeatureKey; + +struct KeyHash : unary_function<NewJumpFeatureKey, size_t> { +  size_t operator()(const NewJumpFeatureKey& k) const { +    size_t h = 0x37473DEF321; +    boost::hash_combine(h, k.get<0>()); +    boost::hash_combine(h, k.get<1>()); +    boost::hash_combine(h, k.get<2>()); +    boost::hash_combine(h, k.get<3>()); +    boost::hash_combine(h, k.get<4>()); +    boost::hash_combine(h, k.get<5>()); +    boost::hash_combine(h, k.get<6>()); +    boost::hash_combine(h, k.get<7>()); +    return h; +  } +}; +  void NewJump::FireFeature(const SentenceMetadata& smeta,                            const int prev_src_index,                            const int cur_src_index,                            SparseVector<double>* features) const { +  const int id = smeta.GetSentenceID();    const int src_len = smeta.GetSourceLength();    const int raw_jump = cur_src_index - prev_src_index; +  short jump_magnitude = raw_jump;    char jtype = 0; -  int jump_magnitude = raw_jump;    if (raw_jump > 0) { jtype = 'R'; } // Right    else if (raw_jump == 0) { jtype = 'S'; } // Stay    else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left -  int effective_length = src_len; +  int effective_src_len = src_len; +  int effective_trg_len = smeta.GetTargetLength();    if (use_binned_log_lengths_) {      jump_magnitude = BinnedLogLength(jump_magnitude); -    effective_length = BinnedLogLength(src_len); -  } - -  if (true) { -    static map<int, map<int, int> > len2jump2fid; -    int& fid = len2jump2fid[src_len][raw_jump]; -    if (!fid) { -      ostringstream os; -      os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude; -      fid = FD::Convert(os.str()); -    } -    features->set_value(fid, 1.0); +    effective_src_len = BinnedLogLength(src_len); +    effective_trg_len = BinnedLogLength(effective_trg_len); +  } +  NewJumpFeatureKey key(jump_magnitude,jtype,0,0,0,0,0); +  using boost::get; +  if (flen_)  get<2>(key) = effective_src_len; +  if (elen_)  get<3>(key) = effective_trg_len; +  if (f0_)    get<4>(key) = GetSourceWord(id, cur_src_index); +  if (fm1_)   get<5>(key) = GetSourceWord(id, cur_src_index - 1); +  if (fp1_)   get<6>(key) = GetSourceWord(id, cur_src_index + 1); +  if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index); + +  static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids; +  int& fid = fids[key]; +  if (!fid) { +    ostringstream os; +    os << fid_str_ << ':' << jtype << jump_magnitude; +    if (flen_)  os << ':' << get<2>(key); +    if (elen_)  os << ':' << get<3>(key); +    if (f0_)    os << ':' << TD::Convert(get<4>(key)); +    if (fm1_)   os << ':' << TD::Convert(get<5>(key)); +    if (fp1_)   os << ':' << TD::Convert(get<6>(key)); +    if (fprev_) os << ':' << TD::Convert(get<7>(key));     +    fid = FD::Convert(os.str());    } +  features->set_value(fid, 1.0);  }  void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta, @@ -387,6 +272,7 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,                                         SparseVector<double>* /* estimated_features */,                                         void* state) const {    unsigned char& dpstate = *((unsigned char*)state); +  // IMPORTANT: this only fires on non-Null transitions!    const int flen = smeta.GetSourceLength();    if (edge.Arity() == 0) {      dpstate = static_cast<unsigned int>(edge.i_); @@ -427,6 +313,23 @@ void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,  SourceBigram::SourceBigram(const std::string& param) :      FeatureFunction(sizeof(WordID) + sizeof(int)) { +  fid_str_ = "SB:"; +  if (param.size() > 0) { +    vector<string> argv; +    int argc = SplitOnWhitespace(param, &argv); +    if (argc != 2) { +      cerr << "SourceBigram [FEATURE_NAME_PREFIX PATH]\n"; +      abort(); +    } +    fid_str_ = argv[0] + ":"; +    lexmap_.reset(new FactoredLexiconHelper(argv[1], "*")); +  } else { +    lexmap_.reset(new FactoredLexiconHelper); +  } +} + +void SourceBigram::PrepareForInput(const SentenceMetadata& smeta) { +  lexmap_->PrepareForInput(smeta);  }  void SourceBigram::FinalTraversalFeatures(const void* context, @@ -445,7 +348,7 @@ void SourceBigram::FireFeature(WordID left,    // TODO important important !!! escape strings !!!    if (!fid) {      ostringstream os; -    os << "SB:"; +    os << fid_str_;      if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }      os << '_';      if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } @@ -465,85 +368,7 @@ void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,    int& out_word_count = *(static_cast<int*>(context) + 1);    const int arity = edge.Arity();    if (arity == 0) { -    out_context = edge.rule_->f()[0]; -    out_word_count = edge.rule_->EWords(); -    assert(out_word_count == 1); // this is only defined for lex translation! -    // revisit this if you want to translate into null words -  } else if (arity == 2) { -    WordID left = *static_cast<const WordID*>(ant_contexts[0]); -    WordID right = *static_cast<const WordID*>(ant_contexts[1]); -    int left_wc = *(static_cast<const int*>(ant_contexts[0]) + 1); -    int right_wc = *(static_cast<const int*>(ant_contexts[0]) + 1); -    if (left_wc == 1 && right_wc == 1) -      FireFeature(-1, left, features); -    FireFeature(left, right, features); -    out_word_count = left_wc + right_wc; -    out_context = right; -  } -} -// state: POS of src word used, number of trg words generated -SourcePOSBigram::SourcePOSBigram(const std::string& param) : -    FeatureFunction(sizeof(WordID) + sizeof(int)) { -  cerr << "Reading source POS tags from " << param << endl; -  ReadFile rf(param); -  istream& in = *rf.stream(); -  while(in) { -    string line; -    getline(in, line); -    if (line.empty()) continue; -    vector<WordID> v; -    TD::ConvertSentence(line, &v); -    pos_.push_back(v); -  } -  cerr << "  (" << pos_.size() << " lines)\n"; -} - -void SourcePOSBigram::FinalTraversalFeatures(const void* context, -                                      SparseVector<double>* features) const { -  WordID left = *static_cast<const WordID*>(context); -  int left_wc = *(static_cast<const int*>(context) + 1); -  if (left_wc == 1) -    FireFeature(-1, left, features); -  FireFeature(left, -1, features); -} - -void SourcePOSBigram::FireFeature(WordID left, -                   WordID right, -                   SparseVector<double>* features) const { -  int& fid = fmap_[left][right]; -  if (!fid) { -    ostringstream os; -    os << "SP:"; -    if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); } -    os << '_'; -    if (right < 0) { os << "EOS"; } else { os << TD::Convert(right); } -    fid = FD::Convert(os.str()); -    if (fid == 0) fid = -1; -  } -  if (fid < 0) return; -  features->set_value(fid, 1.0); -} - -void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                     const Hypergraph::Edge& edge, -                                     const std::vector<const void*>& ant_contexts, -                                     SparseVector<double>* features, -                                            SparseVector<double>* /* estimated_features */, -                                     void* context) const { -  WordID& out_context = *static_cast<WordID*>(context); -  int& out_word_count = *(static_cast<int*>(context) + 1); -  const int arity = edge.Arity(); -  if (arity == 0) { -    assert(smeta.GetSentenceID() < pos_.size()); -    const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()]; -    if (edge.i_ >= 0) {  // non-NULL source -      assert(edge.i_ < pos_sent.size()); -      out_context = pos_sent[edge.i_]; -    } else { // NULL source -      // should assert that source is kNULL? -      static const WordID kNULL = TD::Convert("<eps>"); -      out_context = kNULL; -    } +    out_context = lexmap_->SourceWordAtPosition(edge.i_);      out_word_count = edge.rule_->EWords();      assert(out_word_count == 1); // this is only defined for lex translation!      // revisit this if you want to translate into null words diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index 418c8768..a1ffd9ca 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -3,7 +3,9 @@  #include "ff.h"  #include "array2d.h" +#include "factored_lexicon_helper.h" +#include <boost/scoped_ptr.hpp>  #include <boost/multi_array.hpp>  class RelativeSentencePosition : public FeatureFunction { @@ -23,64 +25,6 @@ class RelativeSentencePosition : public FeatureFunction {    std::map<WordID, int> fids_;  // fclass -> fid  }; -class Model2BinaryFeatures : public FeatureFunction { - public: -  Model2BinaryFeatures(const std::string& param); - protected: -  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                     const Hypergraph::Edge& edge, -                                     const std::vector<const void*>& ant_contexts, -                                     SparseVector<double>* features, -                                     SparseVector<double>* estimated_features, -                                     void* out_context) const; - private: -  boost::multi_array<int, 3> fids_; -}; - -class MarkovJump : public FeatureFunction { - public: -  MarkovJump(const std::string& param); - protected: -  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                     const Hypergraph::Edge& edge, -                                     const std::vector<const void*>& ant_contexts, -                                     SparseVector<double>* features, -                                     SparseVector<double>* estimated_features, -                                     void* out_context) const; - private: -  const int fid_; -  const int fid_lex_null_; -  const int fid_null_lex_; -  const int fid_null_null_; -  const int fid_lex_lex_; - -  bool binary_params_; -  std::vector<std::map<int, int> > flen2jump2fid_; -}; - -class MarkovJumpFClass : public FeatureFunction { - public: -  MarkovJumpFClass(const std::string& param); -  virtual void FinalTraversalFeatures(const void* context, -                                      SparseVector<double>* features) const; - protected: -  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                     const Hypergraph::Edge& edge, -                                     const std::vector<const void*>& ant_contexts, -                                     SparseVector<double>* features, -                                     SparseVector<double>* estimated_features, -                                     void* context) const; - -  void FireFeature(const SentenceMetadata& smeta, -                   int prev_src_pos, -                   int cur_src_pos, -                   SparseVector<double>* features) const; - - private: -  std::vector<std::map<WordID, std::map<int, int> > > fids_;  // flen -> fclass -> jumpsize -> fid -  std::vector<std::vector<WordID> > pos_; -}; -  typedef std::map<WordID, int> Class2FID;  typedef std::map<WordID, Class2FID> Class2Class2FID;  typedef std::map<WordID, Class2Class2FID> Class2Class2Class2FID; @@ -89,6 +33,7 @@ class SourceBigram : public FeatureFunction {    SourceBigram(const std::string& param);    virtual void FinalTraversalFeatures(const void* context,                                        SparseVector<double>* features) const; +  void PrepareForInput(const SentenceMetadata& smeta);   protected:    virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,                                       const Hypergraph::Edge& edge, @@ -100,7 +45,9 @@ class SourceBigram : public FeatureFunction {    void FireFeature(WordID src,                     WordID trg,                     SparseVector<double>* features) const; +  std::string fid_str_;    mutable Class2Class2FID fmap_; +  boost::scoped_ptr<FactoredLexiconHelper> lexmap_; // different view (stemmed, etc) of source  };  class LexNullJump : public FeatureFunction { @@ -136,30 +83,27 @@ class NewJump : public FeatureFunction {                     const int cur_src_index,                     SparseVector<double>* features) const; +  WordID GetSourceWord(int sentence_id, int index) const { +    if (index < 0) return kBOS_; +    assert(src_.size() > sentence_id); +    const std::vector<WordID>& v = src_[sentence_id]; +    if (index >= v.size()) return kEOS_; +    return v[index]; +  } + +  const WordID kBOS_; +  const WordID kEOS_;    bool use_binned_log_lengths_; +  bool flen_; +  bool elen_; +  bool f0_; +  bool fm1_; +  bool fp1_; +  bool fprev_; +  std::vector<std::vector<WordID> > src_;    std::string fid_str_;  // identifies configuration uniquely  }; -class SourcePOSBigram : public FeatureFunction { - public: -  SourcePOSBigram(const std::string& param); -  virtual void FinalTraversalFeatures(const void* context, -                                      SparseVector<double>* features) const; - protected: -  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, -                                     const Hypergraph::Edge& edge, -                                     const std::vector<const void*>& ant_contexts, -                                     SparseVector<double>* features, -                                     SparseVector<double>* estimated_features, -                                     void* context) const; - private: -  void FireFeature(WordID src, -                   WordID trg, -                   SparseVector<double>* features) const; -  mutable Class2Class2FID fmap_; -  std::vector<std::vector<WordID> > pos_; -}; -  class LexicalTranslationTrigger : public FeatureFunction {   public:    LexicalTranslationTrigger(const std::string& param); diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc index 149cd68d..f237295c 100644 --- a/decoder/lextrans.cc +++ b/decoder/lextrans.cc @@ -81,7 +81,7 @@ struct LexicalTransImpl {      for (int i = 0; i < ref.size(); ++i) {        target_vocab.insert(ref[i][0].label);      } -    bool all_sources_to_all_targets_ = false; +    bool all_sources_to_all_targets_ = false; // TODO configure this      set<WordID> trgs_used;      for (int i = 0; i < e_len; ++i) {  // for each word in the *target*        Hypergraph::Node* node = forest->AddNode(kXCAT); diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index 81ac4198..f5ee5d3f 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -120,17 +120,19 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz  feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz  feature_function=LexicalPairIdentity -feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second +# stem translation  feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map +# POS translation +feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second  feature_function=InputIdentity  feature_function=OutputIdentity  feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first -# the following two are deprecated -feature_function=MarkovJump +b -feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first +feature_function=NewJump +feature_function=NewJump use_binned_log_lengths flen +# jump distance and src and destination class type +feature_function=NewJump use_binned_log_lengths f0 fprev f:$align_dir/grammars/corpus.class.$first  feature_function=SourceBigram -# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does -feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first +feature_function=SourceBigram SC $align_dir/grammars/corpus.class.$first  EOT    close CDEC;    open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!"; diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index be0644df..1a069abf 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -1,14 +1,13 @@ -all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml +all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map  clean: -	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs* +	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* freq* wordpairs*  SUPPORT_DIR = $(SCRIPT_DIR)/support  GZIP = /usr/bin/gzip  ZCAT = zcat  EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl  EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl -GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl  GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl  ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl  ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl @@ -84,6 +83,3 @@ corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1  wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1  	$(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@ -corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e -	$(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@ - diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl index b28f6feb..54b89ce1 100755 --- a/word-aligner/support/generate_word_pair_features.pl +++ b/word-aligner/support/generate_word_pair_features.pl @@ -92,7 +92,7 @@ my $ADD_ID = 1;  my $ADD_PUNC = 1;  my $ADD_NULL = 1;  my $ADD_MODEL1 = 1; -my $ADD_NOMODEL1 = 1; +my $ADD_NOMODEL1 = 0;  my $BEAM_RATIO = 50;  my $BIN_ORTHO = 1;  my $BIN_DLEN = 1; @@ -171,7 +171,7 @@ for my $f (sort keys %fdict) {          }          if ($im1 > $MIN_MAGNITUDE) {            push @feats, "InvModel1=$im1" if $im1; -        } else { +        } elsif ($ADD_NOMODEL1) {            push @feats, 'NoInvModel1=1';          }          my $am1 = sprintf("%.5g", sqrt($m1 * $im1)); | 
