From 743f63daf739884051ee5760390420023b07ee26 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Wed, 22 Dec 2010 13:15:42 -0600
Subject: fix compound splitter, new features, more training data

---
 compound-split/de/TRAIN           |   3 +-
 compound-split/de/cdec-train.ini  |   2 +-
 compound-split/de/dev.in-ref      | 119 ++++++++++++++++++++++++++++++++
 compound-split/de/weights.ptinit  |   2 +
 compound-split/de/weights.trained |  37 +++++-----
 decoder/apply_models.cc           |   1 +
 decoder/csplit.cc                 | 140 +++++++++++++++++++-------------------
 decoder/decoder.cc                |  25 +++----
 decoder/ff_csplit.cc              |  21 +++++-
 training/Makefile.am              |   9 ++-
 10 files changed, 247 insertions(+), 112 deletions(-)

diff --git a/compound-split/de/TRAIN b/compound-split/de/TRAIN
index d586050e..6f7184ea 100755
--- a/compound-split/de/TRAIN
+++ b/compound-split/de/TRAIN
@@ -1,2 +1 @@
-../../training/cluster-ptrain.pl cdec-train.ini dev.in-ref weights.ptinit --gaussian_prior --sigma_squared 1 &> training.log &
-
+../../training/mpi_batch_optimize -w weights.cur.gz -t dev.in-ref -d cdec-train.ini -M 200  # run from compound-split/de; repo-relative path instead of a hard-coded ~/cdec checkout
diff --git a/compound-split/de/cdec-train.ini b/compound-split/de/cdec-train.ini
index 58a99106..44f5934d 100644
--- a/compound-split/de/cdec-train.ini
+++ b/compound-split/de/cdec-train.ini
@@ -1,5 +1,5 @@
 formalism=csplit
-crf_uniform_empirical=true
+# crf_uniform_empirical=true
 intersection_strategy=full
 feature_function=CSplit_BasicFeatures large_dict.de.gz badlist.de.gz
 feature_function=CSplit_ReverseCharLM charlm.rev.5gm.de.lm.gz
diff --git a/compound-split/de/dev.in-ref b/compound-split/de/dev.in-ref
index a68f0688..83dae731 100644
--- a/compound-split/de/dev.in-ref
+++ b/compound-split/de/dev.in-ref
@@ -619,3 +619,122 @@ teuersten ||| # teuersten
 kirchenneubau ||| # kirche neu bau
 ostdeutschlands ||| # ost deutschlands
 erfolgen ||| # erfolgen
+rumänien ||| # rumänien
+empört ||| # empört
+berlin ||| # berlin
+rumänische ||| # rumänische
+regierung ||| # regierung
+empört ||| # empört
+ankündigung ||| # ankündigung
+deutschlands ||| # deutschlands
+frankreichs ||| # frankreichs
+beitritt ||| # beitritt
+rumäniens ||| # rumäniens
+bulgariens ||| # bulgariens
+schengen ||| # schengen
+nicht ||| # nicht
+zuzustimmen ||| # zuzustimmen
+bukarest ||| # bukarest
+informationen ||| # informationen
+verletzung ||| # verletzung
+vertrags ||| # vertrags
+lissabon ||| # lissabon
+rumänischer ||| # rumänischer
+zollbeamter ||| ((('#',0,1),),(('zoll',0,1),('zollbeamter',0,2),),(('beamter',0,1),),)
+grenze ||| # grenze
+zwischen ||| # zwischen
+rumänien ||| # rumänien
+republik ||| # republik
+moldau ||| # moldau
+dezember ||| # dezember
+regierung ||| # regierung
+bukarest ||| # bukarest
+empört ||| # empört
+treten ||| # treten
+kontrollen ||| # kontrollen
+grenzen ||| # grenzen
+rumänien ||| # rumänien
+bulgarien ||| # bulgarien
+solange ||| # solange
+beizubehalten ||| # beizubehalten
+länder ||| # länder
+unumkehrbare ||| # unumkehrbare
+fortschritte ||| # fortschritte
+korruption ||| # korruption
+organisierte ||| # organisierte
+kriminalität ||| # kriminalität
+vorweisen ||| # vorweisen
+bukarest ||| # bukarest
+informationen ||| # informationen
+dieser ||| # dieser
+zeitung ||| # zeitung
+überlegt ||| # überlegt
+vertragsverletzung ||| ((('#',0,1),),(('vertrag',0,1),('vertrags',0,1),),(('verletzung',0,1),),)
+einzureichen ||| # einzureichen
+sollten ||| # sollten
+deutschland ||| # deutschland
+frankreich ||| # frankreich
+haltung ||| # haltung
+durchsetzen ||| # durchsetzen
+rumäniens ||| # rumäniens
+außenministerium ||| ((('#',0,1),),(('außen',0,1),),(('ministerium',0,1),),)
+spricht ||| # spricht
+unannehmbaren ||| # unannehmbaren
+präzedenzfall ||| ((('#',0,1),),(('präzedenzfall',0,2),('präzedenz',0,1),),(('fall',0,1),),)
+sondern ||| # sondern
+staatspräsident ||| ((('#',0,1),),(('staatspräsident',0,2),('staats',0,1),('staat',0,1),),(('präsident',0,1),),)
+georgi ||| # georgi
+parwanow ||| # parwanow
+verständnis ||| # verständnis
+bulgarien ||| # bulgarien
+verstehen ||| # verstehen
+auflagen ||| # auflagen
+erfüllen ||| # erfüllen
+eigentliche ||| # eigentliche
+erklärung ||| # erklärung
+verzögerung ||| # verzögerung
+mittwoch ||| # mittwoch
+haltung ||| # haltung
+hintergrund ||| # hintergrund
+streits ||| # streits
+regierung ||| # regierung
+ministerpräsident ||| ((('#',0,1),),(('minister',0,1),),(('präsident',0,1),),)
+grenzkontrollen ||| ((('#',0,1),),(('grenz',0,1),),(('kontrollen',0,1),),)
+entfallen ||| # entfallen
+zweiten ||| # zweiten
+weltkrieg ||| ((('#',0,1),),(('welt',0,1),('weltkrieg',0,2),),(('krieg',0,1),),)
+versteckte ||| # versteckte
+abwehr ||| # abwehr
+admirals ||| # admirals
+canaris ||| # canaris
+sprengsätze ||| # sprengsätze
+apfelsinenkisten ||| ((('#',0,1),),(('apfelsinen',0,1),('apfelsine',0,1),),(('kisten',0,1),),)
+britische ||| # britische
+hafenarbeiter ||| ((('#',0,1),),(('hafen',0,1),),(('arbeiter',0,1),),)
+weigerten ||| # weigerten
+schiffe ||| # schiffe
+entladen ||| # entladen
+zeiten ||| # zeiten
+griechischen ||| # griechischen
+militärdiktatur ||| ((('#',0,1),),(('militär',0,1),),(('diktatur',0,1),),)
+warnte ||| # warnte
+widerstandsgruppe ||| ((('#',0,1),),(('widerstand',0,1),('widerstands',0,1),),(('gruppe',0,1),),)
+pfirsiche ||| # pfirsiche
+aprikosen ||| # aprikosen
+vergiftet ||| # vergiftet
+kuklina ||| # kuklina
+trägerin ||| # trägerin
+alternativen ||| # alternativen
+nobelpreises ||| ((('#',0,1),),(('nobel',0,1),),(('preises',0,1),),)
+kämpft ||| # kämpft
+rechte ||| # rechte
+soldaten ||| # soldaten
+russlands ||| # russlands
+online ||| # online
+sprach ||| # sprach
+menschenrechte ||| ((('#',0,1),),(('menschen',0,1),('mensch',0,1),),(('rechte',0,1),),)
+heimat ||| # heimat
+kaufrausch ||| ((('#',0,1),),(('kauf',0,1),),(('rausch',0,1),),)
+kommerzialisierung ||| # kommerzialisierung
+weihnachten ||| # weihnachten
+funktioniert ||| # funktioniert
diff --git a/compound-split/de/weights.ptinit b/compound-split/de/weights.ptinit
index eaaa3899..eaea77ce 100644
--- a/compound-split/de/weights.ptinit
+++ b/compound-split/de/weights.ptinit
@@ -5,9 +5,11 @@ FugS 0
 FugN 0
 WordCount 0
 InDict 0
+InDictSubWord 0
 Short 0
 Long 0
 OOV 0
+OOVSubWord 0
 ShortRange 0
 HighFreq 0
 MedFreq 0
diff --git a/compound-split/de/weights.trained b/compound-split/de/weights.trained
index 359e5cc7..94c6951f 100644
--- a/compound-split/de/weights.trained
+++ b/compound-split/de/weights.trained
@@ -1,17 +1,20 @@
-LettersSq -0.037643555390228831
-LettersSqrt 0.58198736272513085
-RevCharLM 0.45802141843469085
-FugS 0.26570690067173086
-FugN -0.70672252122442492
-WordCount 0.33774557030334018
-InDict 0.23339787529651213
-Short 0.60862824917301594
-Long -0.58675406875713121
-OOV 0.10434769500682411
-ShortRange -1.0221040223076261
-HighFreq -2.9803896632623825
-MedFreq 0.18811013582723696
-Freq -0.26933190242976746
-Bad -2.3217842031714113
-FreqLen1 -0.28996794292058575
-FreqLen2 -0.21944133928835977
+# Objective = 130.351  (eval count=252)
+LettersSq -0.056135510587750022
+LettersSqrt -2.3295721373391776
+RevCharLM 0.36059050723989938
+FugS -0.65163142842679733
+FugN -1.7541906469311515
+WordCount 19.356942545900733
+InDict -15.19336735406667
+InDictSubWord 0.8665049533783179
+Short 1.0429051684475563
+Long -0.66305657970937237
+OOV 35.550309899439839
+OOVSubWord -2.023997552143789
+ShortRange -1.0433366143574028
+HighFreq -4.9882552529226301
+MedFreq -0.091778951485726443
+Freq 0.4285650067397816
+Bad -62
+FreqLen1 -1.8532115534306581
+FreqLen2 -1.2921088742036031
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 18460950..9390c809 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -177,6 +177,7 @@ public:
 
   void Apply() {
     int num_nodes = in.nodes_.size();
+    assert(num_nodes >= 2);
     int goal_id = num_nodes - 1;
     int pregoal = goal_id - 1;
     int every = 1;
diff --git a/decoder/csplit.cc b/decoder/csplit.cc
index 7d50e3af..4a723822 100644
--- a/decoder/csplit.cc
+++ b/decoder/csplit.cc
@@ -13,14 +13,16 @@ using namespace std;
 
 struct CompoundSplitImpl {
   CompoundSplitImpl(const boost::program_options::variables_map& conf) :
-      fugen_elements_(true),   // TODO configure
+      fugen_elements_(true),
       min_size_(3),
       kXCAT(TD::Convert("X")*-1),
       kWORDBREAK_RULE(new TRule("[X] ||| # ||| #")),
       kTEMPLATE_RULE(new TRule("[X] ||| [X,1] ? ||| [1] ?")),
       kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")),
       kFUGEN_S(FD::Convert("FugS")),
-      kFUGEN_N(FD::Convert("FugN")) {}
+      kFUGEN_N(FD::Convert("FugN")) {
+    // TODO: read fugen_elements_ from conf (it is hard-coded to true above) so Fugenelemente handling can be switched on and off
+  }
 
   void PasteTogetherStrings(const vector<string>& chars,
                             const int i,
@@ -40,73 +42,73 @@ struct CompoundSplitImpl {
 
   void BuildTrellis(const vector<string>& chars,
                     Hypergraph* forest) {
-//    vector<int> nodes(chars.size()+1, -1);
-//    nodes[0] = forest->AddNode(kXCAT)->id_;       // source
-//    const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_;
-//    forest->ConnectEdgeToHeadNode(left_rule, nodes[0]);
-//
-//    const int max_split_ = max(static_cast<int>(chars.size()) - min_size_ + 1, 1);
-//    cerr << "max: " << max_split_ << "  " << " min: " << min_size_ << endl;
-//    for (int i = min_size_; i < max_split_; ++i)
-//      nodes[i] = forest->AddNode(kXCAT)->id_;
-//    assert(nodes.back() == -1);
-//    nodes.back() = forest->AddNode(kXCAT)->id_;   // sink
-//
-//    for (int i = 0; i < max_split_; ++i) {
-//      if (nodes[i] < 0) continue;
-//      const int start = min(i + min_size_, static_cast<int>(chars.size()));
-//      for (int j = start; j <= chars.size(); ++j) {
-//        if (nodes[j] < 0) continue;
-//        string yield;
-//        PasteTogetherStrings(chars, i, j, &yield);
-//        // cerr << "[" << i << "," << j << "] " << yield << endl;
-//        TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE));
-//        rule->e_[1] = rule->f_[1] = TD::Convert(yield);
-//        // cerr << rule->AsString() << endl;
-//        int edge = forest->AddEdge(
-//          rule,
-//          Hypergraph::TailNodeVector(1, nodes[i]))->id_;
-//        forest->ConnectEdgeToHeadNode(edge, nodes[j]);
-//        forest->edges_[edge].i_ = i;
-//        forest->edges_[edge].j_ = j;
-//
-//        // handle "fugenelemente" here
-//        // don't delete "fugenelemente" at the end of words
-//        if (fugen_elements_ && j != chars.size()) {
-//          const int len = yield.size();
-//          string alt;
-//          int fid = 0;
-//          if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') {
-//            alt = yield.substr(0, len - 2);
-//            fid = kFUGEN_S;
-//          } else if (len > (min_size_ + 1) && yield[len-1] == 's') {
-//            alt = yield.substr(0, len - 1);
-//            fid = kFUGEN_S;
-//          } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') {
-//            alt = yield.substr(0, len - 1);
-//            fid = kFUGEN_N;
-//          }
-//          if (alt.size()) {
-//            TRulePtr altrule = TRulePtr(new TRule(*rule));
-//            altrule->e_[1] = TD::Convert(alt);
-//            // cerr << altrule->AsString() << endl;
-//            int edge = forest->AddEdge(
-//              altrule,
-//              Hypergraph::TailNodeVector(1, nodes[i]))->id_;
-//            forest->ConnectEdgeToHeadNode(edge, nodes[j]);
-//            forest->edges_[edge].feature_values_.set_value(fid, 1.0);
-//            forest->edges_[edge].i_ = i;
-//            forest->edges_[edge].j_ = j;
-//          }
-//        }
-//      }
-//    }
-//
-//    // add goal rule
-//    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
-//    Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
-//    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
-//    forest->ConnectEdgeToHeadNode(hg_edge, goal);
+    vector<int> nodes(chars.size()+1, -1);  // nodes[k]: id of the hypergraph node at split position k; -1 means no node exists there
+    nodes[0] = forest->AddNode(kXCAT)->id_;       // source: position 0, the start of the word
+    const int left_rule = forest->AddEdge(kWORDBREAK_RULE, Hypergraph::TailNodeVector())->id_;  // tail-less edge carrying the "#" word-break rule
+    forest->ConnectEdgeToHeadNode(left_rule, nodes[0]);
+
+    const int max_split_ = max(static_cast<int>(chars.size()) - min_size_ + 1, 1);  // exclusive bound on segment start positions; at least 1 so position 0 is always visited
+    // cerr << "max: " << max_split_ << "  " << " min: " << min_size_ << endl;
+    for (int i = min_size_; i < max_split_; ++i)  // interior split points keep at least min_size_ positions on either side
+      nodes[i] = forest->AddNode(kXCAT)->id_;
+    assert(nodes.back() == -1);  // NOTE(review): for an empty chars, nodes.back() is nodes[0], already assigned above, so this presumably fires - confirm callers never pass an empty word
+    nodes.back() = forest->AddNode(kXCAT)->id_;   // sink: position chars.size(), the end of the word
+
+    for (int i = 0; i < max_split_; ++i) {  // i: start position of a candidate segment
+      if (nodes[i] < 0) continue;  // no node was created at this position
+      const int start = min(i + min_size_, static_cast<int>(chars.size()));  // smallest end position tried, clamped to the end of the word
+      for (int j = start; j <= chars.size(); ++j) {  // j: end position of the segment (a node index, so it may equal chars.size()); NOTE(review): int compared with unsigned size()
+        if (nodes[j] < 0) continue;  // no node was created at this position
+        string yield;
+        PasteTogetherStrings(chars, i, j, &yield);  // presumably joins the chars entries between positions i and j into yield - helper body not visible here
+        // cerr << "[" << i << "," << j << "] " << yield << endl;
+        TRulePtr rule = TRulePtr(new TRule(*kTEMPLATE_RULE));  // private copy of the template rule; index 1 on both sides is overwritten next
+        rule->e_[1] = rule->f_[1] = TD::Convert(yield);
+        // cerr << rule->AsString() << endl;
+        int edge = forest->AddEdge(
+          rule,
+          Hypergraph::TailNodeVector(1, nodes[i]))->id_;
+        forest->ConnectEdgeToHeadNode(edge, nodes[j]);  // the edge runs from the node at i (tail) to the node at j (head)
+        forest->edges_[edge].i_ = i;  // record the covered span on the edge
+        forest->edges_[edge].j_ = j;
+
+        // handle "Fugenelemente" (linking elements) here: add a second edge whose target word drops the linking suffix
+        // don't delete "Fugenelemente" at the end of words (hence the j != chars.size() test below)
+        if (fugen_elements_ && j != chars.size()) {
+          const int len = yield.size();  // NOTE(review): byte length of yield, while min_size_ is applied to positions in chars above - presumably differs for multi-byte characters; confirm
+          string alt;
+          int fid = 0;
+          if (len > (min_size_ + 2) && yield[len-1] == 's' && yield[len-2] == 'e') {  // ends in "es": strip both letters
+            alt = yield.substr(0, len - 2);
+            fid = kFUGEN_S;
+          } else if (len > (min_size_ + 1) && yield[len-1] == 's') {  // ends in "s": strip it
+            alt = yield.substr(0, len - 1);
+            fid = kFUGEN_S;
+          } else if (len > (min_size_ + 2) && yield[len-2] == 'e' && yield[len-1] == 'n') {  // ends in "en": only the final 'n' is stripped below
+            alt = yield.substr(0, len - 1);
+            fid = kFUGEN_N;
+          }
+          if (alt.size()) {  // one of the suffix patterns matched
+            TRulePtr altrule = TRulePtr(new TRule(*rule));  // copy of rule; only the target-side word e_[1] changes
+            altrule->e_[1] = TD::Convert(alt);
+            // cerr << altrule->AsString() << endl;
+            int edge = forest->AddEdge(
+              altrule,
+              Hypergraph::TailNodeVector(1, nodes[i]))->id_;
+            forest->ConnectEdgeToHeadNode(edge, nodes[j]);
+            forest->edges_[edge].feature_values_.set_value(fid, 1.0);  // fire FugS or FugN on the alternative edge
+            forest->edges_[edge].i_ = i;
+            forest->edges_[edge].j_ = j;
+          }
+        }
+      }
+    }
+
+    // add goal rule: its tail is the last node added so far (presumably the sink, created last above)
+    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
+    Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
+    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
+    forest->ConnectEdgeToHeadNode(hg_edge, goal);
   }
  private:
   const bool fugen_elements_;
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 3551b584..e28080aa 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -279,7 +279,6 @@ struct DecoderImpl {
   bool encode_b64;
   bool kbest;
   bool unique_kbest;
-  bool crf_uniform_empirical;
   bool get_oracle_forest;
   shared_ptr<WriteFile> extract_file;
   int combine_size;
@@ -379,7 +378,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
         ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
         ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)")
-        ("crf_uniform_empirical", "If there are multple references use (i.e., lattice) a uniform distribution rather than posterior weighting a la EM")
     ("get_oracle_forest,o", "Calculate rescored hypregraph using approximate BLEU scoring of rules")
     ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
         ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
@@ -611,7 +609,6 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   encode_b64 = str("vector_format",conf) == "b64";
   kbest = conf.count("k_best");
   unique_kbest = conf.count("unique_k_best");
-  crf_uniform_empirical = conf.count("crf_uniform_empirical");
   get_oracle_forest = conf.count("get_oracle_forest");
 
   cfg_options.Validate();
@@ -842,14 +839,12 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   if (has_ref) {
     if (HG::Intersect(ref, &forest)) {
       if (!SILENT) forest_stats(forest,"  Constr. forest",show_tree_structure,show_features,feature_weights,oracle.show_derivation);
-      if (crf_uniform_empirical) {
-        if (!SILENT) cerr << "  USING UNIFORM WEIGHTS\n";
-        for (int i = 0; i < forest.edges_.size(); ++i)
-          forest.edges_[i].edge_prob_=prob_t::One();
-      } else {
-        forest.Reweight(feature_weights);
-        if (!SILENT) cerr << "  Constr. VitTree: " << ViterbiFTree(forest) << endl;
-      }
+//      if (crf_uniform_empirical) {
+//        if (!SILENT) cerr << "  USING UNIFORM WEIGHTS\n";
+//        for (int i = 0; i < forest.edges_.size(); ++i)
+//          forest.edges_[i].edge_prob_=prob_t::One(); }
+      forest.Reweight(feature_weights);
+      if (!SILENT) cerr << "  Constr. VitTree: " << ViterbiFTree(forest) << endl;
       if (conf.count("show_partition")) {
          const prob_t z = Inside<prob_t, EdgeProb>(forest);
          cerr << "  Contst. partition  log(Z): " << log(z) << endl;
@@ -878,11 +873,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       if (write_gradient) {
         const prob_t ref_z = InsideOutside<prob_t, EdgeProb, SparseVector<prob_t>, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp);
         ref_exp /= ref_z;
-        if (crf_uniform_empirical) {
-          log_ref_z = ref_exp.dot(feature_weights);
-        } else {
-          log_ref_z = log(ref_z);
-        }
+//        if (crf_uniform_empirical)
+//          log_ref_z = ref_exp.dot(feature_weights);
+        log_ref_z = log(ref_z);
         //cerr << "      MODEL LOG Z: " << log_z << endl;
         //cerr << "  EMPIRICAL LOG Z: " << log_ref_z << endl;
         if ((log_z - log_ref_z) < kMINUS_EPSILON) {
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 1485009b..204b7ce6 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -22,9 +22,11 @@ struct BasicCSplitFeaturesImpl {
       letters_sq_(FD::Convert("LettersSq")),
       letters_sqrt_(FD::Convert("LettersSqrt")),
       in_dict_(FD::Convert("InDict")),
+      in_dict_sub_word_(FD::Convert("InDictSubWord")),
       short_(FD::Convert("Short")),
       long_(FD::Convert("Long")),
       oov_(FD::Convert("OOV")),
+      oov_sub_word_(FD::Convert("OOVSubWord")),
       short_range_(FD::Convert("ShortRange")),
       high_freq_(FD::Convert("HighFreq")),
       med_freq_(FD::Convert("MedFreq")),
@@ -52,15 +54,18 @@ struct BasicCSplitFeaturesImpl {
   }
 
   void TraversalFeaturesImpl(const Hypergraph::Edge& edge,
+                             const int src_word_size,
                              SparseVector<double>* features) const;
 
   const int word_count_;
   const int letters_sq_;
   const int letters_sqrt_;
   const int in_dict_;
+  const int in_dict_sub_word_;
   const int short_;
   const int long_;
   const int oov_;
+  const int oov_sub_word_;
   const int short_range_;
   const int high_freq_;
   const int med_freq_;
@@ -77,7 +82,9 @@ BasicCSplitFeatures::BasicCSplitFeatures(const string& param) :
 
 void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
                                      const Hypergraph::Edge& edge,
+                                     const int src_word_length,
                                      SparseVector<double>* features) const {
+  const bool subword = (edge.i_ > 0) || (edge.j_ < src_word_length);
   features->set_value(word_count_, 1.0);
   features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_));
   features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_));
@@ -108,8 +115,10 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
   if (freq) {
     features->set_value(freq_, freq);
     features->set_value(in_dict_, 1.0);
+    if (subword) features->set_value(in_dict_sub_word_, 1.0);
   } else {
     features->set_value(oov_, 1.0);
+    if (subword) features->set_value(oov_sub_word_, 1.0);
     freq = 99.0f;
   }
   if (bad_words_.count(word) != 0)
@@ -143,7 +152,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl(
   (void) estimated_features;
   if (edge.Arity() == 0) return;
   if (edge.rule_->EWords() != 1) return;
-  pimpl_->TraversalFeaturesImpl(edge, features);
+  pimpl_->TraversalFeaturesImpl(edge, smeta.GetSourceLattice().size(), features);
 }
 
 struct ReverseCharLMCSplitFeatureImpl {
@@ -208,9 +217,17 @@ void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
   if (edge.rule_->EWords() != 1) return;
   const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
   features->set_value(fid_, lpp);
+#if 0
   WordID neighbor_word = 0;
   const WordID word = edge.rule_->e_[1];
-#if 0
+  const char* sword = TD::Convert(word);
+  const int len = strlen(sword);
+  int cur = 0;
+  int chars = 0;
+  while(cur < len) {
+    cur += UTF8Len(sword[cur]);
+    ++chars;
+  }
   if (chars > 4 && (sword[0] == 's' || sword[0] == 'n')) {
     neighbor_word = TD::Convert(string(&sword[1]));
   }
diff --git a/training/Makefile.am b/training/Makefile.am
index cb17aeff..8218ff0a 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -9,9 +9,8 @@ bin_PROGRAMS = \
   plftools \
   collapse_weights \
   cllh_filter_grammar \
-  mpi_online_optimize
-
-#  mpi_batch_optimize
+  mpi_online_optimize \
+  mpi_batch_optimize
 
 noinst_PROGRAMS = \
   lbfgs_test \
@@ -22,8 +21,8 @@ TESTS = lbfgs_test optimize_test
 mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc
 mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
-#mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
-#mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
+mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
 
 if MPI
 bin_PROGRAMS += compute_cllh
-- 
cgit v1.2.3