alternative def of neighborhoods

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@739 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-12-01 05:27:13 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-12-01 05:27:13 +0000
commit: d52db01a2e224869c6ea72a4a234e888c6fd756c (patch)
tree: c5aff0967b4fcca2ac879aecb4ac68317d3582aa
parent: ca83fbe4c043b2e4e18a21f91e74dfa922eda44e (diff)
11 files changed, 823 insertions, 406 deletions
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index 3953118c..d6cf4572 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -51,6 +51,8 @@ void register_feature_functions() {
   ff_registry.Register("RuleShape", new FFFactory<RuleShapeFeatures>);
   ff_registry.Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
   ff_registry.Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);
+  ff_registry.Register("LexNullJump", new FFFactory<LexNullJump>);
+  ff_registry.Register("NewJump", new FFFactory<NewJump>);
   ff_registry.Register("MarkovJump", new FFFactory<MarkovJump>);
   ff_registry.Register("MarkovJumpFClass", new FFFactory<MarkovJumpFClass>);
   ff_registry.Register("SourceBigram", new FFFactory<SourceBigram>);
@@ -64,6 +66,7 @@ void register_feature_functions() {
   ff_registry.Register("OutputIdentity", new FFFactory<OutputIdentity>);
   ff_registry.Register("InputIdentity", new FFFactory<InputIdentity>);
   ff_registry.Register("LexicalTranslationTrigger", new FFFactory<LexicalTranslationTrigger>);
+  ff_registry.Register("WordPairFeatures", new FFFactory<WordPairFeatures>);
   ff_registry.Register("WordSet", new FFFactory<WordSet>);
 #ifdef HAVE_GLC
   ff_registry.Register("ContextCRF", new FFFactory<Model1Features>);
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 5f42b438..980c64ad 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -1,10 +1,13 @@
 #include "ff_wordalign.h"
 
+#include <algorithm>
+#include <iterator>
 #include <set>
 #include <sstream>
 #include <string>
 #include <cmath>
 
+#include "verbose.h"
 #include "alignment_pharaoh.h"
 #include "stringlib.h"
 #include "sentence_metadata.h"
@@ -20,6 +23,8 @@ static const int kNULL_i = 255;  // -1 as an unsigned char
 
 using namespace std;
 
+// TODO new feature: if a word is translated as itself and there is a transition back to the same word, fire a feature
+
 Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
     fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) {
   for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
@@ -195,6 +200,45 @@ void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
+LexNullJump::LexNullJump(const string& param) :  // param is not read here
+    FeatureFunction(1),  // presumably the per-node state size; TraversalFeaturesImpl stores a single char
+    fid_lex_null_(FD::Convert("JumpLexNull")),    // left child lexical, right child null
+    fid_null_lex_(FD::Convert("JumpNullLex")),    // left child null, right child lexical
+    fid_null_null_(FD::Convert("JumpNullNull")),  // both children null
+    fid_lex_lex_(FD::Convert("JumpLexLex")) {}    // both children lexical
+
+void LexNullJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                        const Hypergraph::Edge& edge,
+                                        const vector<const void*>& ant_states,
+                                        SparseVector<double>* features,
+                                        SparseVector<double>* /* estimated_features */,
+                                        void* state) const {
+  // The state is a single char: 'N' = null, 'L' = lex.
+  char& out = *((char*)state);
+  const int arity = edge.Arity();
+  if (arity == 0) {  // leaf: classify by the sign of the source index
+    out = (edge.i_ < 0) ? 'N' : 'L';
+    return;
+  }
+  if (arity == 1) {  // unary: pass the child's state through unchanged
+    out = *((unsigned char*)ant_states[0]);
+    return;
+  }
+  if (arity != 2) {
+    assert(!"something really unexpected is happening");
+    return;
+  }
+  const char lhs = *((char*)ant_states[0]);
+  const char rhs = *((char*)ant_states[1]);
+  out = rhs;  // the right child's state is what propagates upward
+  int fid;
+  if (lhs == 'N')
+    fid = (rhs == 'N') ? fid_null_null_ : fid_null_lex_;
+  else  // lhs == 'L'
+    fid = (rhs == 'N') ? fid_lex_null_ : fid_lex_lex_;
+  features->set_value(fid, 1.0);
+}
+
 MarkovJump::MarkovJump(const string& param) :
     FeatureFunction(1),
     fid_(FD::Convert("MarkovJump")),
@@ -287,6 +331,100 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
+NewJump::NewJump(const string& param) :
+    FeatureFunction(1) {
+  cerr << "    NewJump";
+  vector<string> tokens;  // whitespace-separated options taken from param
+  const int ntokens = SplitOnWhitespace(param, &tokens);
+  set<string> options;
+  for (int k = 0; k < ntokens; ++k) options.insert(tokens[k]);
+  cerr << endl;
+  use_binned_log_lengths_ = (options.find("use_binned_log_lengths") != options.end());
+}
+
+// Maps a length (of a sentence, a jump, etc) to a logarithmic bin:
+// floor of log base 1.3 of (len+1), capped at 16, so that large lengths
+// which are close to each other share a bin.
+int BinnedLogLength(int len) {
+  static const int kMaxBin = 16;
+  const int bin = static_cast<int>(log(len+1) / log(1.3));
+  return (bin > kMaxBin) ? kMaxBin : bin;
+}
+
+void NewJump::FireFeature(const SentenceMetadata& smeta,  // sets one jump feature to 1.0 in *features
+                          const int prev_src_index,       // source index of the previous target word
+                          const int cur_src_index,        // source index of the current target word
+                          SparseVector<double>* features) const {
+  const int src_len = smeta.GetSourceLength();
+  const int raw_jump = cur_src_index - prev_src_index;
+  char jtype = 0;
+  int jump_magnitude = raw_jump;
+  if (raw_jump > 0) { jtype = 'R'; } // Right
+  else if (raw_jump == 0) { jtype = 'S'; } // Stay
+  else { jtype = 'L'; jump_magnitude = raw_jump * -1; } // Left
+  int effective_length = src_len;
+  if (use_binned_log_lengths_) {  // coarsen both the jump size and the sentence length
+    jump_magnitude = BinnedLogLength(jump_magnitude);
+    effective_length = BinnedLogLength(src_len);
+  }
+
+  // Memoized feature ids keyed on the raw (length, jump) pair. One table per binning
+  // mode, so instances configured differently never reuse each other's feature ids.
+  static map<int, map<int, int> > len2jump2fid[2];
+  int& fid = len2jump2fid[use_binned_log_lengths_ ? 1 : 0][src_len][raw_jump];
+  if (!fid) {  // 0 means "not cached yet"; assumes FD::Convert never returns 0 — TODO confirm
+    ostringstream os;
+    os << fid_str_ << ":FLen" << effective_length << ":" << jtype << jump_magnitude;
+    fid = FD::Convert(os.str());
+  }
+  features->set_value(fid, 1.0);
+}
+
+void NewJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,  // fires jump features while combining edges
+                                       const Hypergraph::Edge& edge,
+                                       const vector<const void*>& ant_states,
+                                       SparseVector<double>* features,
+                                       SparseVector<double>* /* estimated_features */,
+                                       void* state) const {
+  unsigned char& dpstate = *((unsigned char*)state);  // source index passed upward; kNULL_i marks null
+  const int flen = smeta.GetSourceLength();
+  if (edge.Arity() == 0) {
+    dpstate = static_cast<unsigned int>(edge.i_);  // an i_ of -1 is stored as 255, i.e. kNULL_i
+    if (edge.prev_i_ == 0) {     // first target word in sentence
+      if (edge.i_ >= 0) {   // generated from non-Null token?
+        FireFeature(smeta,
+                    -1,  // previous src = beginning of sentence index
+                    edge.i_, // current src
+                    features);
+      }
+    } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {  // last word
+      if (edge.i_ >= 0) {  // generated from non-Null token?
+        FireFeature(smeta,
+                    edge.i_,  // previous src = last word position
+                    flen,     // current src
+                    features);
+      }
+    }
+  } else if (edge.Arity() == 1) {
+    dpstate = *((unsigned char*)ant_states[0]);  // unary: pass the child's index through
+  } else if (edge.Arity() == 2) {
+    int left_index = *((unsigned char*)ant_states[0]);
+    int right_index = *((unsigned char*)ant_states[1]);
+    if (right_index == kNULL_i)  // was "== -1", which a value read from an unsigned char never equals
+      dpstate = static_cast<unsigned int>(left_index);  // right is null: keep the left source index
+    else
+      dpstate = static_cast<unsigned int>(right_index);
+    if (left_index != kNULL_i && right_index != kNULL_i) {  // no jump feature across a null
+      FireFeature(smeta,
+                  left_index,          // previous src index
+                  right_index,         // current src index
+                  features);
+    }
+  } else {
+    assert(!"something really unexpected is happening");
+  }
+}
+
 SourceBigram::SourceBigram(const std::string& param) :
     FeatureFunction(sizeof(WordID) + sizeof(int)) {
 }
@@ -626,6 +764,122 @@ void InputIdentity::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
+WordPairFeatures::WordPairFeatures(const string& param) {  // param: path of a table with "src ||| trg ||| name=value ..." lines
+  vector<string> argv;
+  int argc = SplitOnWhitespace(param, &argv);  // split param on whitespace
+  if (argc != 1) {  // exactly one argument (the table path) is accepted
+    cerr << "WordPairFeature /path/to/feature_values.table\n";
+    abort();
+  }
+  set<WordID> all_srcs;
+  {  // pass 1: collect the distinct source words (first token of each non-empty line)
+    ReadFile rf(argv[0]);
+    istream& in = *rf.stream();
+    string buf;
+    while (in) {
+      getline(in, buf);
+      if (buf.empty()) continue;
+      int start = 0;
+      while(start < buf.size() && buf[start] == ' ') ++start;  // skip leading spaces
+      int end = start;
+      while(end < buf.size() && buf[end] != ' ') ++end;  // end of the source token
+      const WordID src = TD::Convert(buf.substr(start, end - start));
+      all_srcs.insert(src);
+    }
+  }
+  if (all_srcs.empty()) {
+    cerr << "WordPairFeature " << param << " loaded empty file!\n";
+    return;  // fkeys_ and values_ stay empty
+  }
+  fkeys_.reserve(all_srcs.size());
+  copy(all_srcs.begin(), all_srcs.end(), back_inserter(fkeys_));  // a std::set iterates in sorted order, so fkeys_ is sorted
+  values_.resize(all_srcs.size());  // one target->features map per source key
+  if (!SILENT) { cerr << "WordPairFeature: " << all_srcs.size() << " sources\n"; }
+  ReadFile rf(argv[0]);  // pass 2: read the same file again and fill values_
+  istream& in = *rf.stream();
+  string buf;
+  double val = 0;
+  WordID cur_src = 0;  // NOTE(review): if TD::Convert can return 0 for the first source word, pv stays NULL below — verify
+  map<WordID, SparseVector<float> > *pv = NULL;  // target->features map of the current source word
+  const WordID kBARRIER = TD::Convert("|||");
+  while (in) {
+    getline(in, buf);
+    if (buf.size() == 0) continue;
+    int start = 0;
+    while(start < buf.size() && buf[start] == ' ') ++start;
+    int end = start;
+    while(end < buf.size() && buf[end] != ' ') ++end;
+    const WordID src = TD::Convert(buf.substr(start, end - start));
+    if (cur_src != src) {  // source word changed: find its slot by binary search over fkeys_
+      cur_src = src;
+      size_t ind = distance(fkeys_.begin(), lower_bound(fkeys_.begin(), fkeys_.end(), cur_src));
+      pv = &values_[ind];
+    }
+    end += 1;  // step over the single space after the source token
+    start = end;
+    while(end < buf.size() && buf[end] != ' ') ++end;
+    WordID x = TD::Convert(buf.substr(start, end - start));  // NOTE(review): substr throws if start > buf.size() (line holding only a source token) — confirm inputs are well formed
+    if (x != kBARRIER) {  // "|||" must follow the source word
+      cerr << "1 Format error: " << buf << endl;
+      abort();
+    }
+    start = end + 1;
+    end = start + 1;
+    while(end < buf.size() && buf[end] != ' ') ++end;
+    WordID trg = TD::Convert(buf.substr(start, end - start));  // target word
+    if (trg == kBARRIER) {  // a second "|||" here means the target word is missing
+      cerr << "2 Format error: " << buf << endl;
+      abort();
+    }
+    start = end + 1;
+    end = start + 1;
+    while(end < buf.size() && buf[end] != ' ') ++end;
+    WordID x2 = TD::Convert(buf.substr(start, end - start));
+    if (x2 != kBARRIER) {  // "|||" must follow the target word
+      cerr << "3 Format error: " << buf << endl;
+      abort();
+    }
+    start = end + 1;
+
+    SparseVector<float>& v = (*pv)[trg];  // creates the entry for this (src, trg) pair if absent
+    while(start < buf.size()) {  // parse the remaining "name=value" items
+      end = start + 1;
+      while(end < buf.size() && buf[end] != '=' && buf[end] != ' ') ++end;
+      if (end == buf.size() || buf[end] != '=') { cerr << "4 Format error: " << buf << endl; abort(); }
+      const int fid = FD::Convert(buf.substr(start, end - start));  // feature name is the text before '='
+      start = end + 1;
+      while(start < buf.size() && buf[start] == ' ') ++start;  // spaces between '=' and the number are skipped
+      end = start + 1;
+      while(end < buf.size() && buf[end] != ' ') ++end;
+      assert(end > start);
+      if (end < buf.size()) buf[end] = 0;  // overwrite the delimiter with NUL before strtod reads from start
+      val = strtod(&buf.c_str()[start], NULL);
+      v.set_value(fid, val);
+      start = end + 1;
+    }
+  }
+}
 
-
+void WordPairFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const {
+  if (edge.Arity() != 0) return;  // only edges without tail nodes are scored
+  assert(edge.rule_->EWords() == 1);
+  assert(edge.rule_->FWords() == 1);
+  const WordID src = edge.rule_->f()[0];
+  const WordID trg = edge.rule_->e()[0];
+  // binary search; fkeys_ is filled in sorted order from a std::set by the constructor
+  const vector<WordID>::const_iterator key = lower_bound(fkeys_.begin(), fkeys_.end(), src);
+  if (key == fkeys_.end() || *key != src) {
+    cerr << "WordPairFeatures no source entries for " << TD::Convert(src) << endl;
+    abort();
+  }
+  const map<WordID, SparseVector<float> >& by_trg = values_[distance(fkeys_.begin(), key)];
+  const map<WordID, SparseVector<float> >::const_iterator hit = by_trg.find(trg);
+  // TODO optional strict flag to make sure there are features for all pairs?
+  if (hit != by_trg.end()) (*features) += hit->second;
+}
 
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
index 0714229c..418c8768 100644
--- a/decoder/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -103,6 +103,43 @@ class SourceBigram : public FeatureFunction {
   mutable Class2Class2FID fmap_;
 };
 
+class LexNullJump : public FeatureFunction {  // indicator features over the null/lexical states of two combined children
+ public:
+  LexNullJump(const std::string& param);  // param is not read by the constructor
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const;  // out_context receives one char: 'N' (null) or 'L' (lex)
+ private:
+  const int fid_lex_null_;   // id of "JumpLexNull"
+  const int fid_null_lex_;   // id of "JumpNullLex"
+  const int fid_null_null_;  // id of "JumpNullNull"
+  const int fid_lex_lex_;    // id of "JumpLexLex"
+};
+
+class NewJump : public FeatureFunction {  // jump features named by source length, direction (R/S/L) and distance
+ public:
+  NewJump(const std::string& param);  // recognised option in param: use_binned_log_lengths
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const;  // out_context receives one unsigned char source index
+ private:
+  void FireFeature(const SentenceMetadata& smeta,  // sets the feature for the jump prev -> cur to 1.0
+                   const int prev_src_index,
+                   const int cur_src_index,
+                   SparseVector<double>* features) const;
+
+  bool use_binned_log_lengths_;  // when true, lengths and jump sizes are log-binned before the feature is named
+  std::string fid_str_;  // identifies configuration uniquely; NOTE(review): not assigned by the constructor in ff_wordalign.cc — confirm
+};
+
 class SourcePOSBigram : public FeatureFunction {
  public:
   SourcePOSBigram(const std::string& param);
@@ -238,6 +275,24 @@ class BlunsomSynchronousParseHack : public FeatureFunction {
   mutable std::vector<std::vector<WordID> > refs_;
 };
 
+// Association features: looks up a word pair (e,f) in a table loaded at
+// construction and adds that pair's vector of feature values to the edge.
+class WordPairFeatures : public FeatureFunction {
+ public:
+  WordPairFeatures(const std::string& param);  // param: path to the feature table; aborts unless exactly one argument
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;  // context is not written by the implementation
+
+ private:
+  std::vector<WordID> fkeys_;  // parallel to values_; sorted source word ids, searched with lower_bound
+  std::vector<std::map<WordID, SparseVector<float> > > values_;  // fkeys_index -> e -> value
+};
+
 class InputIdentity : public FeatureFunction {
  public:
   InputIdentity(const std::string& param);
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index 4476fe63..35d2d15d 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -76,13 +76,13 @@ struct LexicalTransImpl {
     // hack to tell the feature function system how big the sentence pair is
     const int f_start = (use_null ? -1 : 0);
     int prev_node_id = -1;
-    set<WordID> target_vocab; // only set for alignment_only mode
-    if (align_only_) {
-      const Lattice& ref = smeta.GetReference();
-      for (int i = 0; i < ref.size(); ++i) {
-        target_vocab.insert(ref[i][0].label);
-      }
+    set<WordID> target_vocab;
+    const Lattice& ref = smeta.GetReference();
+    for (int i = 0; i < ref.size(); ++i) {
+      target_vocab.insert(ref[i][0].label);
     }
+    bool all_sources_to_all_targets_ = true;
+    set<WordID> trgs_used;
     for (int i = 0; i < e_len; ++i) {  // for each word in the *target*
       Hypergraph::Node* node = forest->AddNode(kXCAT);
       const int new_node_id = node->id_;
@@ -101,10 +101,13 @@ struct LexicalTransImpl {
         assert(rb);
         for (int k = 0; k < rb->GetNumRules(); ++k) {
           TRulePtr rule = rb->GetIthRule(k);
+          const WordID trg_word = rule->e_[0];
           if (align_only_) {
-            if (target_vocab.count(rule->e_[0]) == 0)
+            if (target_vocab.count(trg_word) == 0)
               continue;
           }
+          if (all_sources_to_all_targets_ && (target_vocab.count(trg_word) > 0))
+            trgs_used.insert(trg_word);
           Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
           edge->i_ = j;
           edge->j_ = j+1;
@@ -113,6 +116,21 @@ struct LexicalTransImpl {
           edge->feature_values_ += edge->rule_->GetFeatureValues();
           forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
         }
+        if (all_sources_to_all_targets_) {
+          for (set<WordID>::iterator it = target_vocab.begin(); it != target_vocab.end(); ++it) {
+            if (trgs_used.count(*it)) continue;
+            const WordID ungenerated_trg_word = *it;
+            TRulePtr rule;
+            rule.reset(TRule::CreateLexicalRule(src_sym, ungenerated_trg_word));
+            Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+            edge->i_ = j;
+            edge->j_ = j+1;
+            edge->prev_i_ = i;
+            edge->prev_j_ = i+1;
+            forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+          }
+          trgs_used.clear();
+        }
       }
       if (prev_node_id >= 0) {
         const int comb_node_id = forest->AddNode(kXCAT)->id_;
diff --git a/decoder/trule.cc b/decoder/trule.cc
index a40c4e14..eedf8f30 100644
--- a/decoder/trule.cc
+++ b/decoder/trule.cc
@@ -246,18 +246,18 @@ string TRule::AsString(bool verbose) const {
   int idx = 0;
   if (lhs_ && verbose) {
     os << '[' << TD::Convert(lhs_ * -1) << "] |||";
-    for (int i = 0; i < f_.size(); ++i) {
-      const WordID& w = f_[i];
-      if (w < 0) {
-        int wi = w * -1;
-        ++idx;
-        os << " [" << TD::Convert(wi) << ',' << idx << ']';
-      } else {
-        os << ' ' << TD::Convert(w);
-      }
+  }
+  for (int i = 0; i < f_.size(); ++i) {
+    const WordID& w = f_[i];
+    if (w < 0) {
+      int wi = w * -1;
+      ++idx;
+      os << " [" << TD::Convert(wi) << ',' << idx << ']';
+    } else {
+      os << ' ' << TD::Convert(w);
     }
-    os << " ||| ";
   }
+  os << " ||| ";
   if (idx > 9) {
     cerr << "Too many non-terminals!\n partial: " << os.str() << endl;
     exit(1);
diff --git a/environment/LocalConfig.pm b/environment/LocalConfig.pm
index b047d21c..6e29fd05 100644
--- a/environment/LocalConfig.pm
+++ b/environment/LocalConfig.pm
@@ -36,6 +36,10 @@ my $CCONFIG = {
     'HOST_REGEXP' => qr/^(blacklight.psc.edu|bl1.psc.teragrid.org)$/,
     'QSubMemFlag' => '-l pmem=',
   },
+  'LOCAL' => {
+    'HOST_REGEXP' => qr/local\.net$/,
+    'QSubMemFlag' => '',
+  },
 };
 
 our $senvironment_name;
diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h
index cce6c8a4..f76fc14c 100644
--- a/utils/sparse_vector.h
+++ b/utils/sparse_vector.h
@@ -361,6 +361,18 @@ public:
         return *this;
     }
 
+    // Accumulates a vector whose value type R may differ from T, entry by entry.
+    // Entries that sum to zero are kept; erasing them was left disabled.
+    template <typename R>
+    SparseVector<T> &operator+=(const SparseVector<R> &other) {
+        typedef typename SparseVector<R>::MapType::const_iterator OtherIt;
+        const OtherIt stop = other.values_.end();
+        for (OtherIt cur = other.values_.begin(); cur != stop; ++cur) {
+            values_[cur->first] += cur->second;
+        }
+        return *this;
+    }
+
     SparseVector<T> &operator-=(const SparseVector<T> &other) {
         for (typename MapType::const_iterator
                 it = other.values_.begin(); it != other.values_.end(); ++it)
@@ -512,8 +524,8 @@ public:
       values_.swap(other.values_);
     }
 
-private:
   MapType values_;
+private:
 
 #if HAVE_BOOST_ARCHIVE_TEXT_OARCHIVE_HPP
   friend class boost::serialization::access;
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index f0733449..81ac4198 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -118,15 +118,18 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
 # grammar=$align_dir/grammars/freq_grammar.$direction.gz
 # per_sentence_grammar_file=$align_dir/grammars/psg.$direction
 
+feature_function=WordPairFeatures $align_dir/grammars/wordpairs.$direction.features.gz
 feature_function=LexicalPairIdentity
 feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
 feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map
 feature_function=InputIdentity
 feature_function=OutputIdentity
 feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first
+# the following two are deprecated
 feature_function=MarkovJump +b
 feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first
 feature_function=SourceBigram
+# following is deprecated- should reuse SourceBigram the way LexicalPairIdentity does
 feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first
 EOT
   close CDEC;
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 21f39ac1..60417ec5 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,7 +1,7 @@
-all: corpus.f-e.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
+all: corpus.f-e.lex-grammar.gz wordpairs.f-e.features.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
 
 clean:
-	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg*
+	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg* wordpairs*
 
 SUPPORT_DIR = $(SCRIPT_DIR)/support
 GZIP = /usr/bin/gzip
@@ -9,6 +9,7 @@ ZCAT = zcat
 EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
 EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
 GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl
+GENERATE_WORDPAIR_FEATURES = $(SUPPORT_DIR)/generate_word_pair_features.pl
 ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
 ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
 STEM_F = $(SCRIPT_DIR)/stemmers/$(F_LANG).pl
@@ -66,13 +67,22 @@ corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS)
 	$(MERGE_CORPUS) corpus.e corpus.f > $@
 
 corpus.f-e.model1: corpus.f-e $(MODEL1)
-	$(MODEL1) corpus.f-e > $@
+	$(MODEL1) -v -V corpus.f-e > $@
 
 corpus.e-f.model1: corpus.e-f $(MODEL1)
-	$(MODEL1) corpus.e-f > $@
+	$(MODEL1) -v -V corpus.e-f > $@
 
-corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)
-	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) | $(GZIP) -9 > $@
+corpus.f-e.full-model1: corpus.f-e $(MODEL1)
+	$(MODEL1) -t -999999 -v -V corpus.f-e > $@
+
+corpus.e-f.full-model1: corpus.e-f $(MODEL1)
+	$(MODEL1) -t -999999 -v -V corpus.e-f > $@
+
+corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1
+	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 | $(GZIP) -9 > $@
+
+wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1
+	$(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@
 
 corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e
 	$(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@
diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl
new file mode 100755
index 00000000..b722ee49
--- /dev/null
+++ b/word-aligner/support/generate_word_pair_features.pl
@@ -0,0 +1,432 @@
+#!/usr/bin/perl -w
+use utf8;
+use strict;
+
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1) = @ARGV;  # eight positional arguments, all must name existing files
+die "Usage: $0 corpus.fr-en corpus.f-e.full-model1 corpus.e-f.full-model1 corpus.orthonorm-dict.f corpus.orthonorm-dict.e class.e class.f corpus.f-e.model1\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && $class_e && -f $class_e && $class_f && -f $class_f && $sparse_m1 && -f $sparse_m1;
+
+my %eclass = ();
+my %fclass = ();
+load_classes($class_e, \%eclass);
+load_classes($class_f, \%fclass);
+
+our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14);
+die unless scalar @IDENT_BINS == 15;
+our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS;
+
+my $MIN_MAGNITUDE = 0.001; # minimum value of a feature
+
+our %cache;
+open EF, "<$effile" or die "Can't read $effile: $!";        # parallel corpus
+open M1, "<$model1" or die "Can't read $model1: $!";        # full Model 1 table
+open IM1, "<$imodel1" or die "Can't read $imodel1: $!";     # full inverse Model 1 table
+open SM1, "<$sparse_m1" or die "Can't read $sparse_m1: $!"; # sparse Model 1 table
+binmode(EF,":utf8");
+binmode(M1,":utf8");
+binmode(IM1,":utf8");
+binmode(SM1,":utf8");
+binmode(STDOUT,":utf8");
+my %model1;
+print STDERR "Reading model1...\n";
+my %sizes = ();
+while(<M1>) {
+  chomp;
+  my ($f, $e, $lp) = split /\s+/;
+  $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp));
+  $sizes{$f}++;
+}
+close M1;
+
+my $inv_add = 0;
+my %invm1;
+print STDERR "Reading inverse model1...\n";
+my %esizes=();
+while(<IM1>) {
+  chomp;
+  my ($e, $f, $lp) = split /\s+/;
+  $invm1{$e}->{$f} = sprintf("%.5g", 1e-12 + exp($lp));
+}
+close IM1;
+
+open OE, "<$orthoe" or die "Can't read $orthoe: $!";
+binmode(OE,":utf8");
+my %oe_dict;  # e word -> orthographically normalized form
+while(<OE>) {
+  chomp;
+  my ($a, $b) = split / \|\|\| /, $_;  # "word ||| normalized" per line
+  die "BAD: $_" unless defined $a && defined $b;
+  $oe_dict{$a} = $b;
+}
+close OE;
+
+print STDERR "Reading sparse model 1 from $sparse_m1...\n";
+my %s_m1;
+while(<SM1>) {
+  chomp;
+  my ($f, $e, $lp) = split /\s+/;
+  die unless defined $e && defined $f;
+  $s_m1{$f}->{$e} = 1;
+}
+close SM1;
+
+open OF, "<$orthof" or die "Can't read $orthof: $!";
+binmode(OF,":utf8");
+my %of_dict;  # f word -> orthographically normalized form
+while(<OF>) {
+  chomp;
+  my ($a, $b) = split / \|\|\| /, $_;  # "word ||| normalized" per line
+  die "BAD: $_" unless defined $a && defined $b;
+  $of_dict{$a} = $b;
+}
+close OF;
+$of_dict{'<eps>'} = '<eps>';
+$oe_dict{'<eps>'} = '<eps>';
+
+my $MIN_FEATURE_COUNT = 0;
+my $ADD_PREFIX_ID = 1;
+my $ADD_LEN = 1;
+my $ADD_SIM = 1;
+my $ADD_DICE = 1;
+my $ADD_111 = 1;
+my $ADD_SPARSE_M1 = 0; # this is a very bad feature
+my $SPARSE_111 = 1; # if 1-1-1, then don't include Model1 & Dice features
+my $ADD_ID = 1;
+my $ADD_PUNC = 1;
+my $ADD_NULL = 1;
+my $ADD_MODEL1 = 1;
+my $ADD_NOMODEL1 = 1;
+my $BEAM_RATIO = 50;
+my $BIN_ORTHO = 1;
+my $BIN_DLEN = 1;
+my $BIN_IDENT = 1;
+my $BIN_DICE = 1;
+
+if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
+
+my %fdict;
+my %fcounts;
+my %ecounts;
+
+my %sdict;
+
+while(<EF>) {
+  chomp;
+  my ($f, $e) = split /\s*\|\|\|\s*/;
+  my @es = split /\s+/, $e;
+  my @fs = split /\s+/, $f;
+  for my $ew (@es){
+    die "E: Empty word" if $ew eq '';
+    $ecounts{$ew}++;
+  }
+  push @fs, '<eps>' if $ADD_NULL;
+  my $i = 0;
+  for my $fw (@fs){
+    $i++;
+    die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
+    $fcounts{$fw}++;
+  }
+  for my $fw (@fs){
+    for my $ew (@es){
+      $fdict{$fw}->{$ew}++;
+    }
+  }
+}
+
+print STDERR "Extracting word pair features...\n";
+my $specials = 0;
+my $fc = 1000000;
+my $sids = 1000000;
+for my $f (sort keys %fdict) {
+  my $re = $fdict{$f};
+  my $max;
+  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
+    my $efcount = $re->{$e};
+    unless (defined $max) { $max = $efcount; }
+    my $m1 = $model1{$f}->{$e};
+    my $im1 = $invm1{$e}->{$f};
+    my $is_null = undef;
+    if ($f eq '<eps>') {
+      $is_null = 1;
+      $im1 = 0;  # probability not calcuated
+    }
+    die "No Model1 probability for $e | $f !" unless defined $m1;
+    die "No inverse Model1 probability for $f | $e !" unless defined $im1;
+    my $ident = ($e eq $f);
+    my $total_eandf = $ecounts{$e} + $fcounts{$f};
+    my $dice = 2 * $efcount / $total_eandf;
+    my @feats;
+    my $is_111 = ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1);
+    if ($is_111 && $ADD_111) {
+      push @feats, "OneOneOne=1";
+    }
+    unless ($is_111 && $SPARSE_111) {
+      if ($ADD_SPARSE_M1 && defined $s_m1{$f}->{$e}) {
+        push @feats, "HighM1=1";
+      }
+      if (defined $m1 && $ADD_MODEL1) {
+        if ($m1 > $MIN_MAGNITUDE) {
+          push @feats, "Model1=$m1";
+          my $m1d = sprintf("%.5g", sqrt($m1 * $dice));
+          push @feats, "M1Dice=$m1d" if $m1d > $MIN_MAGNITUDE;
+        } elsif ($ADD_NOMODEL1) {
+          push @feats, 'NoModel1=1';
+        }
+        if ($im1 > $MIN_MAGNITUDE) {
+          push @feats, "InvModel1=$im1" if $im1;
+        } else {
+          push @feats, 'NoInvModel1=1';
+        }
+        my $am1 = sprintf("%.5g", sqrt($m1 * $im1));
+        push @feats, "AgrModel1=$am1" if $am1 > $MIN_MAGNITUDE;
+      }
+      if ($ADD_DICE) {
+        if ($BIN_DICE) {
+          push @feats, dicebin($dice) . '=1';
+        } else {
+          push @feats, "Dice=$dice";
+        }
+      }
+    }
+    my $oe = $oe_dict{$e};
+    die "Can't find orthonorm form for $e" unless defined $oe;
+    my $of = $of_dict{$f};
+    die "Can't find orthonorm form for $f" unless defined $of;
+    my $len_e = length($oe);
+    my $len_f = length($of);
+    if ($ADD_LEN) {
+      if (!$is_null) {
+        my $dlen = abs($len_e - $len_f);
+        if ($BIN_DLEN) {
+          push @feats, dlenbin($dlen) . '=1';
+        } else {
+          push @feats, "DLen=$dlen";
+        }
+      }
+    }
+    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
+    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
+    my $both_non_numeric = (!$e_num && !$f_num);
+
+    unless ($total_eandf > 20) {
+      if ($f_num && $e_num) {
+        my $xf = $of;
+        $xf =~ s/[.,\N{U+0087}]//g;
+        my $xe = $oe;
+        $xe =~ s/[.,\N{U+0087}]//g;
+        if (($of ne $oe) && ($xe eq $xf)) { push @feats, "NumNearIdent=1"; }
+      }
+    }
+
+    if ($ADD_SIM) {
+      my $ld = 0;
+      my $eff = $len_e;
+      if ($eff < $len_f) { $eff = $len_f; }
+      if (!$is_null) {
+        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
+      }
+      if ($BIN_ORTHO) {
+        push @feats, orthobin($ld) . '=1';
+      } else {
+        push @feats, "OrthoSim=$ld";
+      }
+    }
+    my $f_is_punc = ($f =~ /^[!,\-\/"'`:;&=+?.()\[\]«»]+$/);
+    if ($ident && $ADD_ID) {
+      if ($f_is_punc) { push @feats, "IdentPunc=1"; }
+      else {
+        if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
+        if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
+        if ($BIN_IDENT) {
+          push @feats, identbin($len_e) . '=1';
+        } else {
+          push @feats, "Identical=$len_e";
+        }
+      }
+    }
+    if ($ADD_PREFIX_ID && !$ident) {
+      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
+        my $pe = substr $oe, 0, 3;
+        my $pf = substr $of, 0, 3;
+        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+      }
+    }
+    if ($ADD_PUNC) {
+      if ($f_is_punc && $e =~ /[a-z]+/) {
+        push @feats, "PuncMiss=1";
+      }
+    }
+    print "$f ||| $e ||| @feats\n";
+  }
+}
+
+
+sub levenshtein
+{
+    # Edit distance between the two string arguments: the fewest
+    # single-character insertions, deletions and substitutions that
+    # turn the first string into the second
+    #
+    my ($src, $dst) = @_;
+    my $n_src = length $src;
+    my $n_dst = length $dst;
+
+    # Against an empty string the distance is simply the length of
+    # the other string
+    #
+    return $n_dst if ($n_src == 0);
+    return $n_src if ($n_dst == 0);
+
+    # Per-character views of both strings for the comparisons below
+    #
+    my @src_chars = split(//, $src);
+    my @dst_chars = split(//, $dst);
+
+    # Only two rows of the distance table are kept at any time.
+    # @prev starts out as row 0, the distance from the empty string
+    # to every prefix of $dst, i.e. 0..$n_dst
+    #
+    my @prev = (0 .. $n_dst);
+
+    foreach my $r (1 .. $n_src)
+    {
+        # Column 0 is the distance from the first $r chars of $src
+        # to the empty string
+        #
+        my @cur = ($r);
+
+        foreach my $c (1 .. $n_dst)
+        {
+            # A substitution costs 1 only when the two chars differ;
+            # equal chars need no substitution, so the cost is 0
+            #
+            my $subst = ($src_chars[$r-1] eq $dst_chars[$c-1]) ? 0 : 1;
+
+            # Cheapest way to reach this cell:
+            #
+            # - from the cell above, plus 1
+            # - from the cell to the left, plus 1
+            # - from the diagonal cell, plus the substitution cost
+            #
+            $cur[$c] = min([$prev[$c] + 1,
+                            $cur[$c-1] + 1,
+                            $prev[$c-1] + $subst]);
+        }
+
+        @prev = @cur;
+    }
+
+    # After the last pass @prev is the final row, and its last cell is
+    # the distance between the two complete strings
+    #
+    return $prev[$n_dst];
+}
+
+
+# Smallest numeric value in the array referenced by the first argument
+# (undef when that array is empty)
+#
+sub min
+{
+    my ($values) = @_;
+
+    # Start from the first element and compare the rest against it
+    my ($best, @rest) = @$values;
+
+    for my $candidate (@rest)
+    {
+        if ($candidate < $best) { $best = $candidate; }
+    }
+
+    return $best;
+}
+
+# Read a word-class file into the hash referenced by $ref.
+#   $file - path of the file to read; each line is split on whitespace
+#           and its first two fields are taken as the word and its class
+#   $ref  - hash reference that receives the word => class entries
+# Dies if the file cannot be opened or the :utf8 layer cannot be applied.
+sub load_classes {
+  my ($file, $ref) = @_;
+  print STDERR "Reading classes from $file...\n";
+  # Three-argument open keeps characters in $file from being read as part
+  # of the open mode, and the lexical handle avoids sharing a global
+  # bareword filehandle with the rest of the script.
+  open(my $fh, '<', $file) or die "Can't read $file: $!";
+  binmode($fh, ":utf8") or die;
+  # A lexical line variable leaves the caller's $_ untouched.
+  while (my $line = <$fh>) {
+    chomp $line;
+    my ($word, $class) = split /\s+/, $line;
+#    print STDERR "'$word' -> $class\n";
+    $ref->{$word} = $class;
+  }
+  close $fh;
+}
+
+# Map a numeric score onto the name of the bucket that contains it.
+# Buckets are half-open: a score equal to a boundary belongs to the
+# bucket that starts there.
+sub dicebin {
+  my $score = shift;
+  # [exclusive upper bound, bucket name], in ascending order
+  my @buckets = (
+    [0.05, 'DiceLT005'],
+    [0.1,  'Dice005_01'],
+    [0.2,  'Dice01_02'],
+    [0.3,  'Dice02_03'],
+    [0.4,  'Dice03_04'],
+    [0.5,  'Dice04_05'],
+    [0.6,  'Dice05_06'],
+    [0.7,  'Dice06_07'],
+    [0.8,  'Dice07_08'],
+    [0.9,  'Dice08_09'],
+    [1.0,  'Dice09_10'],
+    [1.1,  'Dice10_11'],
+    [1.2,  'Dice11_12'],
+    [1.4,  'Dice12_14'],
+    [1.6,  'Dice14_16'],
+    [1.8,  'Dice16_18'],
+    [2.0,  'Dice18_20'],
+    [2.3,  'Dice20_23'],
+  );
+  foreach my $bucket (@buckets) {
+    my ($upper, $label) = @$bucket;
+    return $label if $score < $upper;
+  }
+  # Everything at or beyond the last boundary shares one bucket
+  if ($score >= 2.3) { return 'DiceGT23'; }
+}
+
+# Map a numeric similarity score onto the name of its bucket.
+# Buckets are half-open: a score equal to a boundary belongs to the
+# bucket that starts there.
+sub orthobin {
+  my $sim = shift;
+  return 'OrthoLT09' if $sim < 0.9;
+  # $names[$k] covers the interval from $edges[$k] up to (not including) $edges[$k+1]
+  my @edges = (0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5, 2.7, 2.9);
+  my @names = qw(Ortho09_11 Ortho11_13 Ortho13_15 Ortho15_17 Ortho17_19
+                 Ortho19_21 Ortho21_23 Ortho23_25 Ortho25_27 Ortho27_29);
+  for my $k (0 .. $#names) {
+    return $names[$k] if $sim >= $edges[$k] && $sim < $edges[$k+1];
+  }
+  # Everything at or beyond the last boundary shares one bucket
+  if ($sim >= 2.9) { return 'OrthoGT29'; }
+}
+
+# Name the bucket for a length difference: one bucket per value from
+# 0 through 9, and a single shared bucket for 10 and above.
+sub dlenbin {
+  my $diff = shift;
+  foreach my $k (0 .. 9) {
+    return "DLen$k" if $diff == $k;
+  }
+  if ($diff >= 10) { return 'DLenGT10'; }
+}
+
+# Name of the bin for the length $x, looked up in the file-level
+# @IDENT_BINS table (indexed directly by $x).
+# Dies when $x is 0; returns $MAX_IDENT_BIN for any $x beyond the table.
+sub identbin {
+  my $x = shift;
+  if ($x == 0) { die; }
+  # Valid indices run from 0 to $#IDENT_BINS, so an $x equal to the table
+  # size is already past the end; with '>' that case returned undef.
+  if ($x >= scalar @IDENT_BINS) { return $MAX_IDENT_BIN; }
+  return $IDENT_BINS[$x];
+}
+
+
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index c96071bf..47d4d945 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -4,27 +4,14 @@ use strict;
 
 my $LIMIT_SIZE=30;
 
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+my ($effile, $model1, $imodel1) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1;
 
+my $ADD_NULL = 1;
 
-my %eclass = ();
-my %fclass = ();
-load_classes($class_e, \%eclass);
-load_classes($class_f, \%fclass);
-
-our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14);
-die unless scalar @IDENT_BINS == 15;
-our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS;
-
-our %cache;
 open EF, "<$effile" or die;
 open M1, "<$model1" or die;
 open IM1, "<$imodel1" or die;
-#open M4, "<$gizaf2e" or die;
-#open IM4, "<$gizae2f" or die;
-#binmode(M4,":utf8");
-#binmode(IM4,":utf8");
 binmode(EF,":utf8");
 binmode(M1,":utf8");
 binmode(IM1,":utf8");
@@ -35,7 +22,7 @@ my %sizes = ();
 while(<M1>) {
   chomp;
   my ($f, $e, $lp) = split /\s+/;
-  $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp));
+  $model1{$f}->{$e} = 1;
   $sizes{$f}++;
 }
 close M1;
@@ -47,10 +34,10 @@ my %esizes=();
 while(<IM1>) {
   chomp;
   my ($e, $f, $lp) = split /\s+/;
-  $invm1{$e}->{$f} = sprintf("%.5g", 1e-12 + exp($lp));
+  $invm1{$e}->{$f} = 1;
   $esizes{$e}++;
   if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
-    $model1{$f}->{$e} = 1e-12;
+    $model1{$f}->{$e} = 1;
     $sizes{$f}++;
     $inv_add++;
   }
@@ -58,72 +45,9 @@ while(<IM1>) {
 close IM1;
 print STDERR "Added $inv_add from inverse model1\n";
 
-open M1, "<$model1" or die;
-binmode(M1,":utf8");
-my $dir_add = 0;
-print STDERR "Reading model1 (again) for extra inverse translations...\n";
-while(<M1>) {
-  chomp;
-  my ($f, $e, $lp) = split /\s+/;
-  if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
-    $invm1{$e}->{$f} = 1e-12;
-    $esizes{$e}++;
-    $dir_add++;
-  }
-}
-close M1;
-print STDERR "Added $dir_add from model 1\n";
 print STDERR "Generating grammars...\n";
-open OE, "<$orthoe" or die;
-binmode(OE,":utf8");
-my %oe_dict;
-while(<OE>) {
-  chomp;
-  my ($a, $b) = split / \|\|\| /, $_;
-  die "BAD: $_" unless defined $a && defined $b;
-  $oe_dict{$a} = $b;
-}
-close OE;
-open OF, "<$orthof" or die;
-binmode(OF,":utf8");
-my %of_dict;
-while(<OF>) {
-  chomp;
-  my ($a, $b) = split / \|\|\| /, $_;
-  die "BAD: $_" unless defined $a && defined $b;
-  $of_dict{$a} = $b;
-}
-close OF;
-$of_dict{'<eps>'} = '<eps>';
-$oe_dict{'<eps>'} = '<eps>';
-
-my $MIN_FEATURE_COUNT = 0;
-my $ADD_PREFIX_ID = 1;
-my $ADD_LEN = 1;
-my $ADD_SIM = 1;
-my $ADD_DICE = 1;
-my $ADD_111 = 1;
-my $ADD_ID = 1;
-my $ADD_PUNC = 1;
-my $ADD_NULL = 1;
-my $ADD_MODEL1 = 1;
-my $ADD_STEM_ID = 0;
-my $ADD_SYM = 0;
-my $BEAM_RATIO = 50;
-my $BIN_ORTHO = 1;
-my $BIN_DLEN = 1;
-my $BIN_IDENT = 1;
-my $BIN_DICE = 1;
-my $ADD_FIDENT = 0;
-
-if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
 
 my %fdict;
-my %fcounts;
-my %ecounts;
-
-my %sdict;
-
 while(<EF>) {
   chomp;
   my ($f, $e) = split /\s*\|\|\|\s*/;
@@ -131,14 +55,12 @@ while(<EF>) {
   my @fs = split /\s+/, $f;
   for my $ew (@es){
     die "E: Empty word" if $ew eq '';
-    $ecounts{$ew}++;
   }
   push @fs, '<eps>' if $ADD_NULL;
   my $i = 0;
   for my $fw (@fs){
     $i++;
     die "F: Empty word\nI=$i FS: @fs" if $fw eq '';
-    $fcounts{$fw}++;
   }
   for my $fw (@fs){
     for my $ew (@es){
@@ -147,7 +69,6 @@ while(<EF>) {
   }
 }
 
-#print STDERR "Loading Giza output...\n";
 my %model4;
 #while(<M4>) {
 #  my $en = <M4>; chomp $en;
@@ -181,305 +102,10 @@ for my $f (sort keys %fdict) {
     my $m4 = $model4{$f}->{$e};
     my $im1 = $invm1{$e}->{$f};
     my $is_good_pair = (defined $m1 || defined $m4);
-    my $is_inv_good_pair = (defined $im1);
     my $ident = ($e eq $f);
     if ($ident) { $is_good_pair = 1; }
-    my $total_eandf = $ecounts{$e} + $fcounts{$f};
-    my $dice = 2 * $efcount / $total_eandf;
-    my @feats;
-    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
-      $is_good_pair = 1;
-      if ($ADD_111) {
-        push @feats, "OneOneOne=1";
-      }
-    }
     next unless $is_good_pair;
-    if (defined $m1 && $ADD_MODEL1) {
-      push @feats, "Model1=$m1";
-      my $m1d = sprintf("%.5g", sqrt($m1 * $dice));
-      push @feats, "Model1Dice=$m1d";
-    }
-    if ($ADD_MODEL1 && !defined $m1) { push @feats, "NoModel1=1"; }
-    if (defined $im1 && $ADD_MODEL1) {
-      push @feats, "InvModel1=$im1";
-    }
-    if (!defined $im1 && $ADD_MODEL1) {
-      push @feats, "NoInvModel1=1";
-    }
-    if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
-      $fc++;
-      push @feats, "F$fc=1";
-    }
-    if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; }
-    my $oe = $oe_dict{$e};
-    die "Can't find orthonorm form for $e" unless defined $oe;
-    my $of = $of_dict{$f};
-    die "Can't find orthonorm form for $f" unless defined $of;
-    my $len_e = length($oe);
-    my $len_f = length($of);
-    if ($ADD_DICE) {
-      if ($BIN_DICE) {
-        push @feats, dicebin($dice) . '=1';
-      } else {
-        push @feats, "Dice=$dice";
-      }
-    }
-    my $is_null = undef;
-    if ($ADD_NULL && $f eq '<eps>') {
-      $is_null = 1;
-    }
-    if ($ADD_LEN) {
-      if (!$is_null) {
-        my $dlen = abs($len_e - $len_f);
-        if ($BIN_DLEN) {
-          push @feats, dlenbin($dlen) . '=1';
-        } else {
-          push @feats, "DLen=$dlen";
-        }
-      }
-    }
-    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
-    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
-    my $both_non_numeric = (!$e_num && !$f_num);
-    unless ($total_eandf > 20) {
-      if ($f_num && $e_num) {
-        my $xf = $of;
-        $xf =~ s/[.,]//g;
-        my $xe = $oe;
-        $xe =~ s/[.,]//g;
-        if (($of ne $oe) && ($xe eq $xf)) { push @feats, "NumNearIdent=1"; }
-      }
-    }
-    if ($ADD_STEM_ID) {
-      my $el = 4;
-      my $fl = 4;
-      if ($oe =~ /^al|re|co/) { $el++; }
-      if ($of =~ /^al|re|co/) { $fl++; }
-      if ($oe =~ /^trans|inter/) { $el+=2; }
-      if ($of =~ /^trans|inter/) { $fl+=2; }
-      if ($fl > length($of)) { $fl = length($of); }
-      if ($el > length($oe)) { $el = length($oe); }
-      my $sf = substr $of, 0, $fl;
-      my $se = substr $oe, 0, $el;
-      my $id = $sdict{$sf}->{$se};
-      if (!$id) {
-        $sids++;
-	$sdict{$sf}->{$se} = $sids;
-	$id = $sids;
-      }
-      push @feats, "S$id=1";
-    }
-    if ($ADD_SIM) {
-      my $ld = 0;
-      my $eff = $len_e;
-      if ($eff < $len_f) { $eff = $len_f; }
-      if (!$is_null) {
-        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
-      }
-      #if ($ld > 1.5) { $is_good_pair = 1; }
-      if ($BIN_ORTHO) {
-        push @feats, orthobin($ld) . '=1';
-      } else {
-        push @feats, "OrthoSim=$ld";
-      }
-    }
-    if ($ident && $ADD_ID) {
-      if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
-      if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
-      if ($BIN_IDENT) {
-        push @feats, identbin($len_e) . '=1';
-      } else {
-        push @feats, "Identical=$len_e";
-      }
-    }
-    if ($ADD_PREFIX_ID && !$ident) {
-      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
-        my $pe = substr $oe, 0, 3;
-        my $pf = substr $of, 0, 3;
-        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
-      }
-    }
-    if ($ADD_PUNC) {
-      if ($f =~ /^[!,\-\/"'`:;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
-        push @feats, "PuncMiss=1";
-      }
-    }
-    my $is_special = ($is_good_pair && !(defined $m1));
-    $specials++ if $is_special;
-    print STDERR "$f -> $e\n" if $is_special;
-    print "$f ||| $e ||| @feats\n" if $is_good_pair;
+    print "$f ||| $e ||| X=0\n" if $is_good_pair;
   }
 }
-print STDERR "Added $specials special rules that were not in the M1 set\n";
-
-
-sub levenshtein
-{
-    # $s1 and $s2 are the two strings
-    # $len1 and $len2 are their respective lengths
-    #
-    my ($s1, $s2) = @_;
-    my ($len1, $len2) = (length $s1, length $s2);
-
-    # If one of the strings is empty, the distance is the length
-    # of the other string
-    #
-    return $len2 if ($len1 == 0);
-    return $len1 if ($len2 == 0);
-
-    my %mat;
-
-    # Init the distance matrix
-    #
-    # The first row to 0..$len1
-    # The first column to 0..$len2
-    # The rest to 0
-    #
-    # The first row and column are initialized so to denote distance
-    # from the empty string
-    #
-    for (my $i = 0; $i <= $len1; ++$i)
-    {
-        for (my $j = 0; $j <= $len2; ++$j)
-        {
-            $mat{$i}{$j} = 0;
-            $mat{0}{$j} = $j;
-        }
-
-        $mat{$i}{0} = $i;
-    }
-
-    # Some char-by-char processing is ahead, so prepare
-    # array of chars from the strings
-    #
-    my @ar1 = split(//, $s1);
-    my @ar2 = split(//, $s2);
-
-    for (my $i = 1; $i <= $len1; ++$i)
-    {
-        for (my $j = 1; $j <= $len2; ++$j)
-        {
-            # Set the cost to 1 iff the ith char of $s1
-            # equals the jth of $s2
-            # 
-            # Denotes a substitution cost. When the char are equal
-            # there is no need to substitute, so the cost is 0
-            #
-            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
-            # Cell $mat{$i}{$j} equals the minimum of:
-            #
-            # - The cell immediately above plus 1
-            # - The cell immediately to the left plus 1
-            # - The cell diagonally above and to the left plus the cost
-            #
-            # We can either insert a new char, delete a char or
-            # substitute an existing char (with an associated cost)
-            #
-            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
-                                $mat{$i}{$j-1} + 1,
-                                $mat{$i-1}{$j-1} + $cost]);
-        }
-    }
-
-    # Finally, the Levenshtein distance equals the rightmost bottom cell
-    # of the matrix
-    #
-    # Note that $mat{$x}{$y} denotes the distance between the substrings
-    # 1..$x and 1..$y
-    #
-    return $mat{$len1}{$len2};
-}
-
-
-# minimal element of a list
-#
-sub min
-{
-    my @list = @{$_[0]};
-    my $min = $list[0];
-
-    foreach my $i (@list)
-    {
-        $min = $i if ($i < $min);
-    }
-
-    return $min;
-}
-
-sub load_classes {
-  my ($file, $ref) = @_;
-  print STDERR "Reading classes from $file...\n";
-  open F, "<$file" or die "Can't read $file: $!";
-  binmode(F, ":utf8") or die;
-  while(<F>) {
-    chomp;
-    my ($word, $class) = split /\s+/;
-#    print STDERR "'$word' -> $class\n";
-    $ref->{$word} = $class;
-  }
-  close F;
-}
-
-sub dicebin {
-  my $x = shift;
-  if ($x < 0.05) { return 'DiceLT005'; }
-  elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; }
-  elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; }
-  elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; }
-  elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; }
-  elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; }
-  elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; }
-  elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; }
-  elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; }
-  elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; }
-  elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; }
-  elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; }
-  elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; }
-  elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; }
-  elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; }
-  elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; }
-  elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; }
-  elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; }
-  elsif ($x >= 2.3) { return 'DiceGT23'; }
-}
-
-sub orthobin {
-  my $x = shift;
-  if ($x < 0.9) { return 'OrthoLT09'; }
-  elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; }
-  elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; }
-  elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; }
-  elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; }
-  elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; }
-  elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; }
-  elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; }
-  elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; }
-  elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; }
-  elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; }
-  elsif ($x >= 2.9) { return 'OrthoGT29'; }
-}
-
-sub dlenbin {
-  my $x = shift;
-  if ($x == 0) { return 'DLen0'; }
-  elsif ($x == 1) { return 'DLen1'; }
-  elsif ($x == 2) { return 'DLen2'; }
-  elsif ($x == 3) { return 'DLen3'; }
-  elsif ($x == 4) { return 'DLen4'; }
-  elsif ($x == 5) { return 'DLen5'; }
-  elsif ($x == 6) { return 'DLen6'; }
-  elsif ($x == 7) { return 'DLen7'; }
-  elsif ($x == 8) { return 'DLen8'; }
-  elsif ($x == 9) { return 'DLen9'; }
-  elsif ($x >= 10) { return 'DLenGT10'; }
-}
-
-sub identbin {
-  my $x = shift;
-  if ($x == 0) { die; }
-  if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; }
-  return $IDENT_BINS[$x];
-}
-
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-12-01 05:27:13 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-12-01 05:27:13 +0000
commit	d52db01a2e224869c6ea72a4a234e888c6fd756c (patch)
tree	c5aff0967b4fcca2ac879aecb4ac68317d3582aa
parent	ca83fbe4c043b2e4e18a21f91e74dfa922eda44e (diff)