From c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 1 Feb 2010 17:38:39 -0500 Subject: word aligner cleanup, new features --- decoder/cdec_ff.cc | 1 + decoder/ff_wordalign.cc | 96 ++++++- decoder/ff_wordalign.h | 13 +- tests/system_tests/unsup-align/cdec.ini | 2 +- tests/system_tests/unsup-align/gold.statistics | 2 +- training/cluster-ptrain.pl | 6 +- word-aligner/aligner.pl | 20 +- word-aligner/classify.pl | 27 -- word-aligner/extract_grammar.pl | 11 - word-aligner/extract_vocab.pl | 20 -- word-aligner/extract_weights.pl | 17 -- word-aligner/invert_grammar.pl | 8 - word-aligner/make_lex_grammar.pl | 339 ------------------------ word-aligner/makefiles/makefile.grammars | 15 +- word-aligner/merge_corpus.pl | 18 -- word-aligner/supplement_weights_file.pl | 37 --- word-aligner/support/classify.pl | 27 ++ word-aligner/support/extract_grammar.pl | 11 + word-aligner/support/extract_vocab.pl | 20 ++ word-aligner/support/extract_weights.pl | 17 ++ word-aligner/support/invert_grammar.pl | 8 + word-aligner/support/make_lex_grammar.pl | 339 ++++++++++++++++++++++++ word-aligner/support/merge_corpus.pl | 18 ++ word-aligner/support/supplement_weights_file.pl | 73 +++++ 24 files changed, 641 insertions(+), 504 deletions(-) delete mode 100755 word-aligner/classify.pl delete mode 100755 word-aligner/extract_grammar.pl delete mode 100755 word-aligner/extract_vocab.pl delete mode 100755 word-aligner/extract_weights.pl delete mode 100755 word-aligner/invert_grammar.pl delete mode 100755 word-aligner/make_lex_grammar.pl delete mode 100755 word-aligner/merge_corpus.pl delete mode 100755 word-aligner/supplement_weights_file.pl create mode 100755 word-aligner/support/classify.pl create mode 100755 word-aligner/support/extract_grammar.pl create mode 100755 word-aligner/support/extract_vocab.pl create mode 100755 word-aligner/support/extract_weights.pl create mode 100755 word-aligner/support/invert_grammar.pl create mode 100755 
word-aligner/support/make_lex_grammar.pl create mode 100755 word-aligner/support/merge_corpus.pl create mode 100755 word-aligner/support/supplement_weights_file.pl diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 34499398..b4381dda 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -16,6 +16,7 @@ void register_feature_functions() { global_ff_registry->Register("RelativeSentencePosition", new FFFactory); global_ff_registry->Register("Model2BinaryFeatures", new FFFactory); global_ff_registry->Register("MarkovJump", new FFFactory); + global_ff_registry->Register("MarkovJumpFClass", new FFFactory); global_ff_registry->Register("SourcePOSBigram", new FFFactory); global_ff_registry->Register("BlunsomSynchronousParseHack", new FFFactory); global_ff_registry->Register("AlignerResults", new FFFactory); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index e3fa91d4..fb90df62 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -1,5 +1,6 @@ #include "ff_wordalign.h" +#include #include #include #include @@ -12,20 +13,20 @@ #include "tdict.h" // Blunsom hack #include "filelib.h" // Blunsom hack -static const size_t MAX_SENTENCE_SIZE = 100; +static const int MAX_SENTENCE_SIZE = 100; using namespace std; Model2BinaryFeatures::Model2BinaryFeatures(const string& param) : fids_(boost::extents[MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE][MAX_SENTENCE_SIZE]) { - for (int i = 0; i < MAX_SENTENCE_SIZE; ++i) { - for (int j = 0; j < MAX_SENTENCE_SIZE; ++j) { + for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) { + for (int j = 0; j < i; ++j) { for (int k = 0; k < MAX_SENTENCE_SIZE; ++k) { int& val = fids_[i][j][k]; val = -1; if (j < i) { ostringstream os; - os << "M2_" << i << '_' << j << ':' << k; + os << "M2_FL:" << i << "_SI:" << j << "_TI:" << k; val = FD::Convert(os.str()); } } @@ -56,8 +57,24 @@ RelativeSentencePosition::RelativeSentencePosition(const string& param) : if (!param.empty()) { cerr << " Loading word classes from " << param << endl; 
condition_on_fclass_ = true; - template_ = "RSP:FC000"; - assert(!"not implemented"); + ReadFile rf(param); + istream& in = *rf.stream(); + set classes; + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + vector v; + TD::ConvertSentence(line, &v); + pos_.push_back(v); + for (int i = 0; i < v.size(); ++i) + classes.insert(v[i]); + for (set::iterator i = classes.begin(); i != classes.end(); ++i) { + ostringstream os; + os << "RelPos_FC:" << TD::Convert(*i); + fids_[*i] = FD::Convert(os.str()); + } + } } else { condition_on_fclass_ = false; } @@ -79,17 +96,22 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme static_cast(edge.prev_i_) / smeta.GetTargetLength()); features->set_value(fid_, val); if (condition_on_fclass_) { - assert(!"not implemented"); + assert(smeta.GetSentenceID() < pos_.size()); + const WordID cur_fclass = pos_[smeta.GetSentenceID()][edge.i_]; + const int fid = fids_.find(cur_fclass)->second; + features->set_value(fid, val); } // cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; } MarkovJumpFClass::MarkovJumpFClass(const string& param) : - FeatureFunction(1) { + FeatureFunction(1), + fids_(MAX_SENTENCE_SIZE) { cerr << " MarkovJumpFClass" << endl; cerr << "Reading source POS tags from " << param << endl; ReadFile rf(param); istream& in = *rf.stream(); + set classes; while(in) { string line; getline(in, line); @@ -97,8 +119,66 @@ MarkovJumpFClass::MarkovJumpFClass(const string& param) : vector v; TD::ConvertSentence(line, &v); pos_.push_back(v); + for (int i = 0; i < v.size(); ++i) + classes.insert(v[i]); } cerr << " (" << pos_.size() << " lines)\n"; + cerr << " Classes: " << classes.size() << endl; + for (int ss = 1; ss < MAX_SENTENCE_SIZE; ++ss) { + map >& cfids = fids_[ss]; + for (set::iterator i = classes.begin(); i != classes.end(); ++i) { + map &fids = 
cfids[*i]; + for (int j = -ss; j <= ss; ++j) { + ostringstream os; + os << "Jump_FL:" << ss << "_FC:" << TD::Convert(*i) << "_J:" << j; + fids[j] = FD::Convert(os.str()); + } + } + } +} + +void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta, + int prev_src_pos, + int cur_src_pos, + SparseVector* features) const { + const int jumpsize = cur_src_pos - prev_src_pos; + assert(smeta.GetSentenceID() < pos_.size()); + const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos]; + const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; + features->set_value(fid, 1.0); +} + +void MarkovJumpFClass::FinalTraversalFeatures(const void* context, + SparseVector* features) const { + int left_index = *static_cast(context); +// int right_index = cur_flen; + // TODO +} + +void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_states, + SparseVector* features, + SparseVector* estimated_features, + void* state) const { + unsigned char& dpstate = *((unsigned char*)state); + if (edge.Arity() == 0) { + dpstate = static_cast(edge.i_); + } else if (edge.Arity() == 1) { + dpstate = *((unsigned char*)ant_states[0]); + } else if (edge.Arity() == 2) { + int left_index = *((unsigned char*)ant_states[0]); + int right_index = *((unsigned char*)ant_states[1]); + if (right_index == -1) + dpstate = static_cast(left_index); + else + dpstate = static_cast(right_index); +// const WordID cur_fclass = pos_[smeta.GetSentenceID()][right_index]; +// cerr << edge.i_ << "," << edge.j_ << ": fclass=" << TD::Convert(cur_fclass) << " j=" << jumpsize << endl; +// const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second; +// features->set_value(fid, 1.0); + FireFeature(smeta, left_index, right_index, features); + } } MarkovJump::MarkovJump(const string& param) : diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index 
c5404887..688750de 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -19,7 +19,8 @@ class RelativeSentencePosition : public FeatureFunction { private: const int fid_; bool condition_on_fclass_; - std::string template_; + std::vector > pos_; + std::map fids_; // fclass -> fid }; class Model2BinaryFeatures : public FeatureFunction { @@ -66,10 +67,14 @@ class MarkovJumpFClass : public FeatureFunction { SparseVector* features, SparseVector* estimated_features, void* context) const; - private: - void FireFeature(WordID src, - WordID trg, + + void FireFeature(const SentenceMetadata& smeta, + int prev_src_pos, + int cur_src_pos, SparseVector* features) const; + + private: + std::vector > > fids_; // flen -> fclass -> jumpsize -> fid std::vector > pos_; }; diff --git a/tests/system_tests/unsup-align/cdec.ini b/tests/system_tests/unsup-align/cdec.ini index 4016a201..37a37214 100644 --- a/tests/system_tests/unsup-align/cdec.ini +++ b/tests/system_tests/unsup-align/cdec.ini @@ -1,6 +1,6 @@ aligner=true grammar=unsup-align.lex-grammar -cubepruning_pop_limit=1000000 +intersection_strategy=full formalism=lexcrf feature_function=RelativeSentencePosition feature_function=MarkovJump diff --git a/tests/system_tests/unsup-align/gold.statistics b/tests/system_tests/unsup-align/gold.statistics index 2f37c2db..975c9d4e 100644 --- a/tests/system_tests/unsup-align/gold.statistics +++ b/tests/system_tests/unsup-align/gold.statistics @@ -90,7 +90,7 @@ constr_paths 4 +lm_nodes 7 +lm_edges 14 +lm_paths 16 -+lm_trans end thet ++lm_trans thet thet constr_nodes 7 constr_edges 10 constr_paths 4 diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl index 7643d4e5..03122df9 100755 --- a/training/cluster-ptrain.pl +++ b/training/cluster-ptrain.pl @@ -29,10 +29,12 @@ my $PRIOR; my $OALG = "lbfgs"; my $sigsq = 1; my $means_file; +my $mem_buffers = 20; my $RESTART_IF_NECESSARY; GetOptions("cdec=s" => \$DECODER, "distributed" => \$DISTRIBUTED, "sigma_squared=f" => 
\$sigsq, + "lbfgs_memory_buffers=i" => \$mem_buffers, "max_iteration=i" => \$max_iteration, "means=s" => \$means_file, "optimizer=s" => \$OALG, @@ -133,7 +135,7 @@ while ($iter < $max_iteration) { my $start = time; my $next_iter = $iter + 1; my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; - my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M 50 $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; + my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; my $cmd = ""; if ($parallel) { $cmd = $pcmd; } @@ -183,6 +185,8 @@ Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init --means FILE if you want means other than 0 --sigma_squared S variance on prior --pmem MEM Memory required for decoder + --lbfgs_memory_buffers Number of buffers to use + with LBFGS optimizer EOT } diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index d203fc53..7eec0e42 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,15 +6,20 @@ use Getopt::Long; my $training_dir = "$SCRIPT_DIR/../training"; die "Can't find training dir: $training_dir" unless -d $training_dir; +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $num_classes = 50; my $nodes = 40; my $pmem = "2500mb"; my $DECODER = "cdec"; GetOptions("cdec=s" => \$DECODER, "jobs=i" => \$nodes, - "pmem=s" => \$pmem + "pmem=s" => \$pmem, + "mkcls=s" => \$mkcls, ) or usage(); usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; + my $in_file = shift @ARGV; die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ 
/^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; @@ -22,13 +27,13 @@ my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; my @stages = qw(nopos relpos markov); my @directions = qw(f-e e-f); -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $corpus = 'c'; my $cwd = getcwd(); @@ -75,7 +80,7 @@ NCLASSES = $num_classes TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary -PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5 +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 export @@ -95,12 +100,16 @@ clean: EOT close TOPLEVEL; +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; + sub make_stage { my ($stage, $direction, $prev_stage) = @_; my $stage_dir = "$align_dir/$stage-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; + my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; open CDEC, ">$stage_dir/cdec.ini" or die; print CDEC <) { - chomp; - my ($word, $cat) = split /\s+/; - die "'$word' '$cat'" unless (defined $word && defined $cat); - $dict{$word} = $cat; - $cc++; -} -close C; -print STDERR "Loaded classes for $cc words\n"; - -while() { - chomp; - my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; - print "@cats\n"; -} - diff --git a/word-aligner/extract_grammar.pl b/word-aligner/extract_grammar.pl deleted file mode 100755 index d7275ef5..00000000 --- a/word-aligner/extract_grammar.pl +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $key = shift @ARGV; -die "Usage: $0 KEY\n" unless defined $key; - -while(<>) { - 
my ($k, @rest) = split / \|\|\| /; - print join(' ||| ', @rest) if ($k eq $key); -} - diff --git a/word-aligner/extract_vocab.pl b/word-aligner/extract_vocab.pl deleted file mode 100755 index 070d4202..00000000 --- a/word-aligner/extract_vocab.pl +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -print STDERR "Extracting vocabulary...\n"; -my %dict = (); -my $wc = 0; -while(<>) { - chomp; - my @words = split /\s+/; - for my $word (@words) { $wc++; $dict{$word}++; } -} - -my $tc = 0; -for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { - print "$word\n"; - $tc++; -} - -print STDERR "$tc types / $wc tokens\n"; - diff --git a/word-aligner/extract_weights.pl b/word-aligner/extract_weights.pl deleted file mode 100755 index dfedd12e..00000000 --- a/word-aligner/extract_weights.pl +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/perl -w -use strict; -my %dict=(); -while(<>) { - chomp; - my ($dummy, $a, $b, $wts) = split / \|\|\| /; - my @weights = split /\s+/, $wts; - for my $w (@weights) { - my ($name, $val) = split /=/, $w; - unless ($dict{$name}) { - my $r = (0.5 - rand) / 5; - $r = sprintf ("%0.4f", $r); - print "$name $r\n"; - $dict{$name}= 1; - } - } -} diff --git a/word-aligner/invert_grammar.pl b/word-aligner/invert_grammar.pl deleted file mode 100755 index 3988388d..00000000 --- a/word-aligner/invert_grammar.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -while(<>) { - my ($f, $e, $scores) = split / \|\|\| /; - print "$e ||| $f ||| $scores"; -} - diff --git a/word-aligner/make_lex_grammar.pl b/word-aligner/make_lex_grammar.pl deleted file mode 100755 index bdb2752c..00000000 --- a/word-aligner/make_lex_grammar.pl +++ /dev/null @@ -1,339 +0,0 @@ -#!/usr/bin/perl -w -use utf8; -use strict; - -my $LIMIT_SIZE=30; - -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; -die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless 
$effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; - -my %eclass = (); -my %fclass = (); -load_classes($class_e, \%eclass); -load_classes($class_f, \%fclass); - -our %cache; -open EF, "<$effile" or die; -open M1, "<$model1" or die; -open IM1, "<$imodel1" or die; -binmode(EF,":utf8"); -binmode(M1,":utf8"); -binmode(IM1,":utf8"); -binmode(STDOUT,":utf8"); -my %model1; -print STDERR "Reading model1...\n"; -my %sizes = (); -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - $model1{$f}->{$e} = 1; - $sizes{$f}++; -} -close M1; - -my $inv_add = 0; -my %invm1; -print STDERR "Reading inverse model1...\n"; -my %esizes=(); -while() { - chomp; - my ($e, $f, $lp) = split /\s+/; - $invm1{$e}->{$f} = 1; - $esizes{$e}++; - if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) { - $model1{$f}->{$e} = 1; - $sizes{$f}++; - $inv_add++; - } -} -close IM1; -print STDERR "Added $inv_add from inverse model1\n"; - -open M1, "<$model1" or die; -binmode(M1,":utf8"); -my $dir_add = 0; -print STDERR "Reading model1 (again) for extra inverse translations...\n"; -while() { - chomp; - my ($f, $e, $lp) = split /\s+/; - if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) { - $invm1{$e}->{$f} = 1; - $esizes{$e}++; - $dir_add++; - } -} -close M1; -print STDERR "Added $dir_add from model 1\n"; -print STDERR "Generating grammars...\n"; -open OE, "<$orthoe" or die; -binmode(OE,":utf8"); -my %oe_dict; -while() { - chomp; - my ($a, $b) = split / \|\|\| /, $_; - die "BAD: $_" unless defined $a && defined $b; - $oe_dict{$a} = $b; -} -close OE; -open OF, "<$orthof" or die; -binmode(OF,":utf8"); -my %of_dict; -while() { - chomp; - my ($a, $b) = split / \|\|\| /, $_; - die "BAD: $_" unless defined $a && defined $b; - $of_dict{$a} = $b; -} -close OF; -$of_dict{''} = ''; -$oe_dict{''} = ''; - -my $MIN_FEATURE_COUNT = 0; -my $ADD_PREFIX_ID = 0; -my $ADD_CLASS_CLASS = 1; -my 
$ADD_LEN = 1; -my $ADD_SIM = 1; -my $ADD_DICE = 1; -my $ADD_111 = 1; -my $ADD_ID = 1; -my $ADD_PUNC = 1; -my $ADD_NULL = 0; -my $ADD_STEM_ID = 1; -my $ADD_SYM = 0; -my $BEAM_RATIO = 50; - -my %fdict; -my %fcounts; -my %ecounts; - -my %sdict; - -while() { - chomp; - my ($f, $e) = split /\s*\|\|\|\s*/; - my @es = split /\s+/, $e; - my @fs = split /\s+/, $f; - for my $ew (@es){ $ecounts{$ew}++; } - push @fs, '' if $ADD_NULL; - for my $fw (@fs){ $fcounts{$fw}++; } - for my $fw (@fs){ - for my $ew (@es){ - $fdict{$fw}->{$ew}++; - } - } -} - -my $specials = 0; -my $fc = 1000000; -my $sids = 1000000; -for my $f (sort keys %fdict) { - my $re = $fdict{$f}; - my $max; - for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { - my $efcount = $re->{$e}; - unless (defined $max) { $max = $efcount; } - my $m1 = $model1{$f}->{$e}; - my $im1 = $invm1{$e}->{$f}; - my $is_good_pair = (defined $m1); - my $is_inv_good_pair = (defined $im1); - my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); - my @feats; - if ($efcount > $MIN_FEATURE_COUNT) { - $fc++; - push @feats, "F$fc=1"; - } - if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; } - my $oe = $oe_dict{$e}; - die "Can't find orthonorm form for $e" unless defined $oe; - my $of = $of_dict{$f}; - die "Can't find orthonorm form for $f" unless defined $of; - my $len_e = length($oe); - my $len_f = length($of); - push @feats, "Dice=$dice" if $ADD_DICE; - if ($ADD_CLASS_CLASS) { - my $ce = $eclass{$e} or die "E- no class for: $e"; - my $cf = $fclass{$f} or die "F- no class for: $f"; - push @feats, "C${cf}_${ce}=1"; - } - my $is_null = undef; - if ($ADD_NULL && $f eq '') { - push @feats, "IsNull=1"; - $is_null = 1; - } - if ($ADD_LEN) { - if (!$is_null) { - my $dlen = abs($len_e - $len_f); - push @feats, "DLen=$dlen"; - } - } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); - my $both_non_numeric = (!$e_num && !$f_num); - if 
($ADD_STEM_ID) { - my $el = 4; - my $fl = 4; - if ($oe =~ /^al|re|co/) { $el++; } - if ($of =~ /^al|re|co/) { $fl++; } - if ($oe =~ /^trans|inter/) { $el+=2; } - if ($of =~ /^trans|inter/) { $fl+=2; } - if ($fl > length($of)) { $fl = length($of); } - if ($el > length($oe)) { $el = length($oe); } - my $sf = substr $of, 0, $fl; - my $se = substr $oe, 0, $el; - my $id = $sdict{$sf}->{$se}; - if (!$id) { - $sids++; - $sdict{$sf}->{$se} = $sids; - $id = $sids; - } - push @feats, "S$id=1"; - } - if ($ADD_PREFIX_ID) { - if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { - my $pe = substr $oe, 0, 3; - my $pf = substr $of, 0, 3; - if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } - } - } - if ($ADD_SIM) { - my $ld = 0; - my $eff = $len_e; - if ($eff < $len_f) { $eff = $len_f; } - if (!$is_null) { - $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); - } - if ($ld > 1.5) { $is_good_pair = 1; } - push @feats, "OrthoSim=$ld"; - } - my $ident = ($e eq $f); - if ($ident) { $is_good_pair = 1; } - if ($ident && $ADD_ID) { push @feats, "Identical=$len_e"; } - if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { - $is_good_pair = 1; - if ($ADD_111) { - push @feats, "OneOneOne=1"; - } - } - if ($ADD_PUNC) { - if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { - push @feats, "PuncMiss=1"; - } - } - my $is_special = ($is_good_pair && !(defined $m1)); - $specials++ if $is_special; - print STDERR "$f -> $e\n" if $is_special; - print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair; - print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair; - } -} -print STDERR "Added $specials special rules that were not in the M1 set\n"; - - -sub levenshtein -{ - # $s1 and $s2 are the two strings - # $len1 and $len2 are their respective lengths - # - my ($s1, $s2) = @_; - my ($len1, $len2) = (length $s1, length $s2); - - # If one of the strings is empty, the distance is the length - # of the other string - # - return $len2 if ($len1 == 0); - return $len1 if ($len2 == 0); - - my 
%mat; - - # Init the distance matrix - # - # The first row to 0..$len1 - # The first column to 0..$len2 - # The rest to 0 - # - # The first row and column are initialized so to denote distance - # from the empty string - # - for (my $i = 0; $i <= $len1; ++$i) - { - for (my $j = 0; $j <= $len2; ++$j) - { - $mat{$i}{$j} = 0; - $mat{0}{$j} = $j; - } - - $mat{$i}{0} = $i; - } - - # Some char-by-char processing is ahead, so prepare - # array of chars from the strings - # - my @ar1 = split(//, $s1); - my @ar2 = split(//, $s2); - - for (my $i = 1; $i <= $len1; ++$i) - { - for (my $j = 1; $j <= $len2; ++$j) - { - # Set the cost to 1 iff the ith char of $s1 - # equals the jth of $s2 - # - # Denotes a substitution cost. When the char are equal - # there is no need to substitute, so the cost is 0 - # - my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; - - # Cell $mat{$i}{$j} equals the minimum of: - # - # - The cell immediately above plus 1 - # - The cell immediately to the left plus 1 - # - The cell diagonally above and to the left plus the cost - # - # We can either insert a new char, delete a char or - # substitute an existing char (with an associated cost) - # - $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, - $mat{$i}{$j-1} + 1, - $mat{$i-1}{$j-1} + $cost]); - } - } - - # Finally, the Levenshtein distance equals the rightmost bottom cell - # of the matrix - # - # Note that $mat{$x}{$y} denotes the distance between the substrings - # 1..$x and 1..$y - # - return $mat{$len1}{$len2}; -} - - -# minimal element of a list -# -sub min -{ - my @list = @{$_[0]}; - my $min = $list[0]; - - foreach my $i (@list) - { - $min = $i if ($i < $min); - } - - return $min; -} - -sub load_classes { - my ($file, $ref) = @_; - print STDERR "Reading classes from $file...\n"; - open F, "<$file" or die "Can't read $file: $!"; - binmode(F, ":utf8") or die; - while() { - chomp; - my ($word, $class) = split /\s+/; -# print STDERR "'$word' -> $class\n"; - $ref->{$word} = $class; - } - close F; -} - diff --git 
a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index a6167010..b89937c1 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -3,18 +3,19 @@ all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.c clean: $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* +SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip ZCAT = zcat -EXTRACT_WEIGHTS = $(SCRIPT_DIR)/extract_weights.pl -EXTRACT_GRAMMAR = $(SCRIPT_DIR)/extract_grammar.pl -SUPPLEMENT_WEIGHTS = $(SCRIPT_DIR)/supplement_weights_file.pl -EXTRACT_VOCAB = $(SCRIPT_DIR)/extract_vocab.pl +EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl +EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl +SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl +EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl -CLASSIFY = $(SCRIPT_DIR)/classify.pl -MAKE_LEX_GRAMMAR = $(SCRIPT_DIR)/make_lex_grammar.pl +CLASSIFY = $(SUPPORT_DIR)/classify.pl +MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl MODEL1 = $(TRAINING_DIR)/model1 -MERGE_CORPUS = $(SCRIPT_DIR)/merge_corpus.pl +MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl orthonorm-dict.e: corpus.e $(EXTRACT_VOCAB) corpus.e > e.voc diff --git a/word-aligner/merge_corpus.pl b/word-aligner/merge_corpus.pl deleted file mode 100755 index 02827903..00000000 --- a/word-aligner/merge_corpus.pl +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/perl -w -use strict; -die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2; - -my ($a, $b) = @ARGV; -open A, "<$a" or die "Can't read $a: $!"; -open B, "<$b" or die "Can't read $a: $!"; - -while() { - chomp; - my $e = ; - die "Mismatched lines in $a and $b!" 
unless defined $e; - print "$_ ||| $e"; -} - -my $e = ; -die "Mismatched lines in $a and $b!" unless !defined $e; - diff --git a/word-aligner/supplement_weights_file.pl b/word-aligner/supplement_weights_file.pl deleted file mode 100755 index 76f668e2..00000000 --- a/word-aligner/supplement_weights_file.pl +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my ($f_classes) = @ARGV; - -die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; - -print <) { - chomp; - my ($x, $cat) = split /\s+/; - $dcats{$cat} = 1; -} - -my @cats = sort keys %dcats; - -for (my $i=0; $i < scalar @cats; $i++) { - my $c1 = $cats[$i]; - for (my $j=0; $j < scalar @cats; $j++) { - my $c2 = $cats[$j]; - print "SP:${c1}_${c2} 0\n"; - } -} - diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl new file mode 100755 index 00000000..893c7b22 --- /dev/null +++ b/word-aligner/support/classify.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2; + +my ($class, $text) = @ARGV; +open C, "<$class" or die "Can't read $class: $!"; +open T, "<$text" or die "Can't read $text: $!"; + +my %dict = (); +my $cc = 0; +while() { + chomp; + my ($word, $cat) = split /\s+/; + die "'$word' '$cat'" unless (defined $word && defined $cat); + $dict{$word} = $cat; + $cc++; +} +close C; +print STDERR "Loaded classes for $cc words\n"; + +while() { + chomp; + my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; + print "@cats\n"; +} + diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl new file mode 100755 index 00000000..d7275ef5 --- /dev/null +++ b/word-aligner/support/extract_grammar.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; + +my $key = shift @ARGV; +die "Usage: $0 KEY\n" unless defined $key; + +while(<>) { + my ($k, @rest) = split / \|\|\| /; + print join(' ||| ', @rest) if ($k eq $key); +} + diff --git 
a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl new file mode 100755 index 00000000..070d4202 --- /dev/null +++ b/word-aligner/support/extract_vocab.pl @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w +use strict; + +print STDERR "Extracting vocabulary...\n"; +my %dict = (); +my $wc = 0; +while(<>) { + chomp; + my @words = split /\s+/; + for my $word (@words) { $wc++; $dict{$word}++; } +} + +my $tc = 0; +for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { + print "$word\n"; + $tc++; +} + +print STDERR "$tc types / $wc tokens\n"; + diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl new file mode 100755 index 00000000..dfedd12e --- /dev/null +++ b/word-aligner/support/extract_weights.pl @@ -0,0 +1,17 @@ +#!/usr/bin/perl -w +use strict; +my %dict=(); +while(<>) { + chomp; + my ($dummy, $a, $b, $wts) = split / \|\|\| /; + my @weights = split /\s+/, $wts; + for my $w (@weights) { + my ($name, $val) = split /=/, $w; + unless ($dict{$name}) { + my $r = (0.5 - rand) / 5; + $r = sprintf ("%0.4f", $r); + print "$name $r\n"; + $dict{$name}= 1; + } + } +} diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl new file mode 100755 index 00000000..3988388d --- /dev/null +++ b/word-aligner/support/invert_grammar.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + my ($f, $e, $scores) = split / \|\|\| /; + print "$e ||| $f ||| $scores"; +} + diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl new file mode 100755 index 00000000..bdb2752c --- /dev/null +++ b/word-aligner/support/make_lex_grammar.pl @@ -0,0 +1,339 @@ +#!/usr/bin/perl -w +use utf8; +use strict; + +my $LIMIT_SIZE=30; + +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f 
$effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;

# --- word-aligner/support/make_lex_grammar.pl (continued) ---
# Generates a lexical translation grammar with sparse features from a
# parallel corpus, Model 1 tables (both directions), orthographic
# normalization maps, and word-class files.
# NOTE(review): every filehandle read below (<EF>, <M1>, <IM1>, <OE>, <OF>, <F>)
# was stripped to "while()" by text extraction; restored from the matching
# open/close/binmode statements — confirm against repository history.

my %eclass = ();
my %fclass = ();
load_classes($class_e, \%eclass);
load_classes($class_f, \%fclass);

our %cache;  # file-scoped; may be shared with code outside this excerpt
open EF, "<$effile" or die "Can't read $effile: $!";
open M1, "<$model1" or die "Can't read $model1: $!";
open IM1, "<$imodel1" or die "Can't read $imodel1: $!";
binmode(EF,":utf8");
binmode(M1,":utf8");
binmode(IM1,":utf8");
binmode(STDOUT,":utf8");

# model1{f}{e} = 1 for every (f,e) pair retained from the direct Model 1 table.
my %model1;
print STDERR "Reading model1...\n";
my %sizes = ();
while(<M1>) {
  chomp;
  my ($f, $e, $lp) = split /\s+/;
  $model1{$f}->{$e} = 1;
  $sizes{$f}++;
}
close M1;

# Supplement the direct table with inverse-Model-1 pairs for source words
# whose translation lists are still below $LIMIT_SIZE (defined above).
my $inv_add = 0;
my %invm1;
print STDERR "Reading inverse model1...\n";
my %esizes=();
while(<IM1>) {
  chomp;
  my ($e, $f, $lp) = split /\s+/;
  $invm1{$e}->{$f} = 1;
  $esizes{$e}++;
  if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
    $model1{$f}->{$e} = 1;
    $sizes{$f}++;
    $inv_add++;
  }
}
close IM1;
print STDERR "Added $inv_add from inverse model1\n";

# Second pass over the direct table to symmetrically supplement the inverse.
open M1, "<$model1" or die "Can't read $model1: $!";
binmode(M1,":utf8");
my $dir_add = 0;
print STDERR "Reading model1 (again) for extra inverse translations...\n";
while(<M1>) {
  chomp;
  my ($f, $e, $lp) = split /\s+/;
  if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
    $invm1{$e}->{$f} = 1;
    $esizes{$e}++;
    $dir_add++;
  }
}
close M1;
print STDERR "Added $dir_add from model 1\n";
print STDERR "Generating grammars...\n";

# Orthographic normalization maps: surface form ||| normalized form.
open OE, "<$orthoe" or die "Can't read $orthoe: $!";
binmode(OE,":utf8");
my %oe_dict;
while(<OE>) {
  chomp;
  my ($a, $b) = split / \|\|\| /, $_;
  die "BAD: $_" unless defined $a && defined $b;
  $oe_dict{$a} = $b;
}
close OE;
open OF, "<$orthof" or die "Can't read $orthof: $!";
binmode(OF,":utf8");
my %of_dict;
while(<OF>) {
  chomp;
  my ($a, $b) = split / \|\|\| /, $_;
  die "BAD: $_" unless defined $a && defined $b;
  $of_dict{$a} = $b;
}
close OF;
$of_dict{''} = '';
$oe_dict{''} = '';

# Feature-template switches.
my $MIN_FEATURE_COUNT = 0;
my $ADD_PREFIX_ID = 0;
my $ADD_CLASS_CLASS = 1;
my $ADD_LEN = 1;
my $ADD_SIM = 1;
my $ADD_DICE = 1;
my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NULL = 0;
my $ADD_STEM_ID = 1;
my $ADD_SYM = 0;
my $BEAM_RATIO = 50;

my %fdict;    # fdict{f}{e} = co-occurrence count
my %fcounts;  # per-source-word counts
my %ecounts;  # per-target-word counts

my %sdict;    # stem-pair -> unique id (for S* features)

# Count co-occurrences over the "f ||| e" corpus.
while(<EF>) {
  chomp;
  my ($f, $e) = split /\s*\|\|\|\s*/;
  my @es = split /\s+/, $e;
  my @fs = split /\s+/, $f;
  for my $ew (@es){ $ecounts{$ew}++; }
  push @fs, '' if $ADD_NULL;
  for my $fw (@fs){ $fcounts{$fw}++; }
  for my $fw (@fs){
    for my $ew (@es){
      $fdict{$fw}->{$ew}++;
    }
  }
}

my $specials = 0;
my $fc = 1000000;   # running id for per-pair F features
my $sids = 1000000; # running id for stem-pair S features
for my $f (sort keys %fdict) {
  my $re = $fdict{$f};
  my $max;
  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
    my $efcount = $re->{$e};
    unless (defined $max) { $max = $efcount; }
    my $m1 = $model1{$f}->{$e};
    my $im1 = $invm1{$e}->{$f};
    my $is_good_pair = (defined $m1);
    my $is_inv_good_pair = (defined $im1);
    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
    my @feats;
    if ($efcount > $MIN_FEATURE_COUNT) {
      $fc++;
      push @feats, "F$fc=1";
    }
    if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; }
    my $oe = $oe_dict{$e};
    die "Can't find orthonorm form for $e" unless defined $oe;
    my $of = $of_dict{$f};
    die "Can't find orthonorm form for $f" unless defined $of;
    my $len_e = length($oe);
    my $len_f = length($of);
    push @feats, "Dice=$dice" if $ADD_DICE;
    if ($ADD_CLASS_CLASS) {
      my $ce = $eclass{$e} or die "E- no class for: $e";
      my $cf = $fclass{$f} or die "F- no class for: $f";
      push @feats, "C${cf}_${ce}=1";
    }
    my $is_null = undef;
    if ($ADD_NULL && $f eq '') {
      push @feats, "IsNull=1";
      $is_null = 1;
    }
    if ($ADD_LEN) {
      if (!$is_null) {
        my $dlen = abs($len_e - $len_f);
        push @feats, "DLen=$dlen";
      }
    }
    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
    my $both_non_numeric = (!$e_num && !$f_num);
    if ($ADD_STEM_ID) {
      # Crude stemming: take a 4-char prefix, lengthened for common
      # prefixes (al/re/co, trans/inter) so they don't collapse stems.
      my $el = 4;
      my $fl = 4;
      if ($oe =~ /^al|re|co/) { $el++; }
      if ($of =~ /^al|re|co/) { $fl++; }
      if ($oe =~ /^trans|inter/) { $el+=2; }
      if ($of =~ /^trans|inter/) { $fl+=2; }
      if ($fl > length($of)) { $fl = length($of); }
      if ($el > length($oe)) { $el = length($oe); }
      my $sf = substr $of, 0, $fl;
      my $se = substr $oe, 0, $el;
      my $id = $sdict{$sf}->{$se};
      if (!$id) {
        $sids++;
        $sdict{$sf}->{$se} = $sids;
        $id = $sids;
      }
      push @feats, "S$id=1";
    }
    if ($ADD_PREFIX_ID) {
      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
        my $pe = substr $oe, 0, 3;
        my $pf = substr $of, 0, 3;
        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
      }
    }
    if ($ADD_SIM) {
      # Length-normalized orthographic similarity; high similarity also
      # promotes the pair to "good" even if Model 1 didn't license it.
      my $ld = 0;
      my $eff = $len_e;
      if ($eff < $len_f) { $eff = $len_f; }
      if (!$is_null) {
        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
      }
      if ($ld > 1.5) { $is_good_pair = 1; }
      push @feats, "OrthoSim=$ld";
    }
    my $ident = ($e eq $f);
    if ($ident) { $is_good_pair = 1; }
    if ($ident && $ADD_ID) { push @feats, "Identical=$len_e"; }
    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
      $is_good_pair = 1;
      if ($ADD_111) {
        push @feats, "OneOneOne=1";
      }
    }
    if ($ADD_PUNC) {
      if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
        push @feats, "PuncMiss=1";
      }
    }
    my $is_special = ($is_good_pair && !(defined $m1));
    $specials++ if $is_special;
    print STDERR "$f -> $e\n" if $is_special;
    print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair;
    print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair;
  }
}
print STDERR "Added $specials special rules that were not in the M1 set\n";


# Classic dynamic-programming Levenshtein edit distance between two strings.
sub levenshtein
{
  my ($s1, $s2) = @_;
  my ($len1, $len2) = (length $s1, length $s2);

  # If one string is empty, the distance is the other string's length.
  return $len2 if ($len1 == 0);
  return $len1 if ($len2 == 0);

  my %mat;

  # Initialize the distance matrix: first row 0..$len1, first column
  # 0..$len2 (distance from the empty string), everything else 0.
  for (my $i = 0; $i <= $len1; ++$i)
  {
    for (my $j = 0; $j <= $len2; ++$j)
    {
      $mat{$i}{$j} = 0;
      $mat{0}{$j} = $j;
    }

    $mat{$i}{0} = $i;
  }

  # Char-by-char processing ahead; split the strings into char arrays.
  my @ar1 = split(//, $s1);
  my @ar2 = split(//, $s2);

  for (my $i = 1; $i <= $len1; ++$i)
  {
    for (my $j = 1; $j <= $len2; ++$j)
    {
      # Substitution cost: 0 when the chars match, else 1.
      my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;

      # Minimum of deletion, insertion, and substitution.
      $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
                          $mat{$i}{$j-1} + 1,
                          $mat{$i-1}{$j-1} + $cost]);
    }
  }

  # The distance between the full strings is the bottom-right cell.
  return $mat{$len1}{$len2};
}


# Minimal element of a list (passed by reference).
sub min
{
  my @list = @{$_[0]};
  my $min = $list[0];

  foreach my $i (@list)
  {
    $min = $i if ($i < $min);
  }

  return $min;
}

# Load a "word class" file into the hashref: each line is "word class".
sub load_classes {
  my ($file, $ref) = @_;
  print STDERR "Reading classes from $file...\n";
  open F, "<$file" or die "Can't read $file: $!";
  binmode(F, ":utf8") or die;
  while(<F>) {
    chomp;
    my ($word, $class) = split /\s+/;
    # print STDERR "'$word' -> $class\n";
    $ref->{$word} = $class;
  }
  close F;
}

# (patch continues: diff --git
a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl (new file mode 100755, index 00000000..02827903)

# --- word-aligner/support/merge_corpus.pl ---
#!/usr/bin/perl -w
use strict;

# Zip two parallel corpus files line-by-line into "f ||| e" format on stdout,
# dying if the files have different numbers of lines.
# NOTE(review): the <A>/<B> filehandle reads were stripped by text extraction
# and have been restored from the surrounding open statements.
die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2;

my ($a, $b) = @ARGV;
open A, "<$a" or die "Can't read $a: $!";
open B, "<$b" or die "Can't read $b: $!";  # bug fix: error message named $a, not $b

while(<A>) {
  chomp;
  my $e = <B>;
  die "Mismatched lines in $a and $b!" unless defined $e;
  print "$_ ||| $e";  # $e keeps its trailing newline, terminating the output line
}

# $b must be exhausted too, or it had more lines than $a.
my $e = <B>;
die "Mismatched lines in $a and $b!" unless !defined $e;

# --- word-aligner/support/supplement_weights_file.pl (new file mode 100755, index 00000000..7f804b90) ---
#!/usr/bin/perl -w
use strict;

# Emit initial (zero) weights for the word aligner's sparse feature templates,
# derived from the source-language word-class file.
my $ADD_FCLASS_JUMP = 1;
my $ADD_MODEL2_BINARY = 0;
my $ADD_FC_RELPOS = 1;

my ($f_classes) = @ARGV;

die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes;

# NOTE(review): the original printed a here-doc of base feature weights here,
# then opened and read the class file; that text was swallowed by extraction
# (everything between "print <" and the next ">"). Restore the here-doc
# contents from repository history — TODO confirm.
print <<'EOT';
EOT

my %dcats = ();
open FC, "<$f_classes" or die "Can't read $f_classes: $!";
while(<FC>) {
  chomp;
  my ($x, $cat) = split /\s+/;
  $dcats{$cat} = 1;
}
close FC;

my @cats = sort keys %dcats;

# One SP (source-pattern) weight per ordered class pair.
my $added = 0;
for (my $i=0; $i < scalar @cats; $i++) {
  my $c1 = $cats[$i];
  for (my $j=0; $j < scalar @cats; $j++) {
    my $c2 = $cats[$j];
    print "SP:${c1}_${c2} 0\n";
    $added++;
  }
}

# Per-sentence-length jump features (and optionally Model 2 binary features).
for (my $ss=1; $ss < 100; $ss++) {
  if ($ADD_FCLASS_JUMP) {
    for (my $i=0; $i < scalar @cats; $i++) {
      my $cat = $cats[$i];
      for (my $j = -$ss; $j <= $ss; $j++) {
        print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n";
        $added++;
      }
    }
  }
  if ($ADD_MODEL2_BINARY) {
    # M2_FL:8_SI:3_TI:2=1
    for (my $i = 0; $i < $ss; $i++) {
      for (my $j = 0; $j < 100; $j++) {
        print "M2_FL:${ss}_SI:${i}_TI:${j} 0\n";
        $added++;
      }
    }
  }
}
if ($ADD_FC_RELPOS) {
  #RelPos_FC:11
  for (my $i=0; $i < scalar @cats; $i++) {
    my $cat = $cats[$i];
    print "RelPos_FC:$cat 0\n";
    $added++;
  }
}

print STDERR "Added $added weights\n";
# -- cgit v1.2.3