Diffstat (limited to 'decoder')
-rw-r--r--  decoder/Makefile.am     |   3
-rw-r--r--  decoder/cdec.cc         |  17
-rw-r--r--  decoder/dict_test.cc    |  17
-rw-r--r--  decoder/fdict.cc        | 124
-rw-r--r--  decoder/fdict.h         |   3
-rw-r--r--  decoder/ff_wordalign.cc |  79
-rw-r--r--  decoder/ff_wordalign.h  |   6
-rw-r--r--  decoder/lexalign.cc     |  34
-rw-r--r--  decoder/lextrans.cc     |  12
-rw-r--r--  decoder/lextrans.h      |  12
-rw-r--r--  decoder/stringlib.cc    |   1
11 files changed, 231 insertions, 77 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index d4e2a77c..81cd43e7 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -65,7 +65,8 @@ libcdec_a_SOURCES = \
   ff_csplit.cc \
   ff_tagger.cc \
   freqdict.cc \
-  lexcrf.cc \
+  lexalign.cc \
+  lextrans.cc \
   tagger.cc \
   bottom_up_parser.cc \
   phrasebased_translator.cc \
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index b130e7fd..811a0d04 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -18,7 +18,8 @@
 #include "sampler.h"
 #include "sparse_vector.h"
 #include "tagger.h"
-#include "lexcrf.h"
+#include "lextrans.h"
+#include "lexalign.h"
 #include "csplit.h"
 #include "weights.h"
 #include "tdict.h"
@@ -50,7 +51,7 @@ void ShowBanner() {
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("formalism,f",po::value<string>(),"Decoding formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting), Tagger (sequence labeling)")
+        ("formalism,f",po::value<string>(),"Decoding formalism; values include SCFG, FST, PB, LexTrans (lexical translation model, also disc training), CSplit (compound splitting), Tagger (sequence labeling), LexAlign (alignment only, or EM training)")
         ("input,i",po::value<string>()->default_value("-"),"Source file")
         ("grammar,g",po::value<vector<string> >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)")
         ("weights,w",po::value<string>(),"Feature weights file")
@@ -72,7 +73,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("show_expected_length", "Show the expected translation length under the model")
         ("show_partition,z", "Compute and show the partition (inside score)")
         ("beam_prune", po::value<double>(), "Prune paths from +LM forest")
-        ("lexcrf_use_null", "Support source-side null words in lexical translation")
+        ("lexalign_use_null", "Support source-side null words in lexical translation")
         ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
         ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
         ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
@@ -117,8 +118,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
   const string formalism = LowercaseString((*conf)["formalism"].as<string>());
-  if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit" && formalism != "tagger") {
-    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lexcrf', or 'tagger'\n";
+  if (formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign") {
+    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lextrans', 'lexalign', or 'tagger'\n";
     cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -273,8 +274,10 @@ int main(int argc, char** argv) {
     translator.reset(new PhraseBasedTranslator(conf));
   else if (formalism == "csplit")
     translator.reset(new CompoundSplit(conf));
-  else if (formalism == "lexcrf")
-    translator.reset(new LexicalCRF(conf));
+  else if (formalism == "lextrans")
+    translator.reset(new LexicalTrans(conf));
+  else if (formalism == "lexalign")
+    translator.reset(new LexicalAlign(conf));
   else if (formalism == "tagger")
     translator.reset(new Tagger(conf));
   else
diff --git a/decoder/dict_test.cc b/decoder/dict_test.cc
index 5c5d84f0..2049ec27 100644
--- a/decoder/dict_test.cc
+++ b/decoder/dict_test.cc
@@ -1,8 +1,13 @@
 #include "dict.h"
+#include "fdict.h"
+
+#include <iostream>
 
 #include <gtest/gtest.h>
 #include <cassert>
 
+using namespace std;
+
 class DTest : public testing::Test {
  public:
   DTest() {}
@@ -23,6 +28,18 @@ TEST_F(DTest, Convert) {
   EXPECT_EQ(d.Convert(b), "bar");
 }
 
+TEST_F(DTest, FDictTest) {
+  int fid = FD::Convert("First");
+  EXPECT_GT(fid, 0);
+  EXPECT_EQ(FD::Convert(fid), "First");
+  string x = FD::Escape("=");
+  cerr << x << endl;
+  EXPECT_NE(x, "=");
+  x = FD::Escape(";");
+  cerr << x << endl;
+  EXPECT_NE(x, ";");
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/decoder/fdict.cc b/decoder/fdict.cc
index 8218a5d3..7e1b0e1f 100644
--- a/decoder/fdict.cc
+++ b/decoder/fdict.cc
@@ -1,5 +1,129 @@
 #include "fdict.h"
 
+#include <string>
+
+using namespace std;
+
 Dict FD::dict_;
 bool FD::frozen_ = false;
 
+static int HexPairValue(const char * code) {
+  int value = 0;
+  const char * pch = code;
+  for (;;) {
+    int digit = *pch++;
+    if (digit >= '0' && digit <= '9') {
+      value += digit - '0';
+    }
+    else if (digit >= 'A' && digit <= 'F') {
+      value += digit - 'A' + 10;
+    }
+    else if (digit >= 'a' && digit <= 'f') {
+      value += digit - 'a' + 10;
+    }
+    else {
+      return -1;
+    }
+    if (pch == code + 2)
+      return value;
+    value <<= 4;
+  }
+}
+
+int UrlDecode(const char *source, char *dest)
+{
+  char * start = dest;
+
+  while (*source) {
+    switch (*source) {
+    case '+':
+      *(dest++) = ' ';
+      break;
+    case '%':
+      if (source[1] && source[2]) {
+        int value = HexPairValue(source + 1);
+        if (value >= 0) {
+          *(dest++) = value;
+          source += 2;
+        }
+        else {
+          *dest++ = '?';
+        }
+      }
+      else {
+        *dest++ = '?';
+      }
+      break;
+    default:
+      *dest++ = *source;
+    }
+    source++;
+  }
+
+  *dest = 0;
+  return dest - start;
+}
+
+int UrlEncode(const char *source, char *dest, unsigned max) {
+  static const char *digits = "0123456789ABCDEF";
+  unsigned char ch;
+  unsigned len = 0;
+  char *start = dest;
+
+  while (len < max - 4 && *source)
+  {
+    ch = (unsigned char)*source;
+    if (*source == ' ') {
+      *dest++ = '+';
+    }
+    else if (strchr("=:;,_| %", ch)) {
+      *dest++ = '%';
+      *dest++ = digits[(ch >> 4) & 0x0F];
+      *dest++ = digits[ ch & 0x0F];
+    }
+    else {
+      *dest++ = *source;
+    }
+    source++;
+  }
+  *dest = 0;
+  return start - dest;
+}
+
+std::string UrlDecodeString(const std::string & encoded) {
+  const char * sz_encoded = encoded.c_str();
+  size_t needed_length = encoded.length();
+  for (const char * pch = sz_encoded; *pch; pch++) {
+    if (*pch == '%')
+      needed_length += 2;
+  }
+  needed_length += 10;
+  char stackalloc[64];
+  char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ?
+    (char *)malloc(needed_length) : stackalloc;
+  UrlDecode(encoded.c_str(), buf);
+  std::string result(buf);
+  if (buf != stackalloc) {
+    free(buf);
+  }
+  return result;
+}
+
+std::string UrlEncodeString(const std::string & decoded) {
+  const char * sz_decoded = decoded.c_str();
+  size_t needed_length = decoded.length() * 3 + 3;
+  char stackalloc[64];
+  char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ?
+    (char *)malloc(needed_length) : stackalloc;
+  UrlEncode(decoded.c_str(), buf, needed_length);
+  std::string result(buf);
+  if (buf != stackalloc) {
+    free(buf);
+  }
+  return result;
+}
+
+string FD::Escape(const string& s) {
+  return UrlEncodeString(s);
+}
+
diff --git a/decoder/fdict.h b/decoder/fdict.h
index d05f1706..c4236580 100644
--- a/decoder/fdict.h
+++ b/decoder/fdict.h
@@ -20,6 +20,9 @@ struct FD {
   static inline const std::string& Convert(const WordID& w) {
     return dict_.Convert(w);
   }
+  // Escape any string to a form that can be used as the name
+  // of a weight in a weights file
+  static std::string Escape(const std::string& s);
   static Dict dict_;
  private:
   static bool frozen_;
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index fb90df62..669aa530 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -26,7 +26,7 @@ Model2BinaryFeatures::Model2BinaryFeatures(const string& param) :
         val = -1;
         if (j < i) {
           ostringstream os;
-          os << "M2_FL:" << i << "_SI:" << j << "_TI:" << k;
+          os << "M2FL:" << i << ":TI:" << k << "_SI:" << j;
           val = FD::Convert(os.str());
         }
       }
@@ -181,32 +181,27 @@ void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
+// std::vector<std::map<int, int> > flen2jump2fid_;
 MarkovJump::MarkovJump(const string& param) :
   FeatureFunction(1),
   fid_(FD::Convert("MarkovJump")),
-  individual_params_per_jumpsize_(false),
-  condition_on_flen_(false) {
+  binary_params_(false) {
   cerr << " MarkovJump";
   vector<string> argv;
   int argc = SplitOnWhitespace(param, &argv);
-  if (argc > 0) {
-    if (argv[0] == "--fclasses") {
-      argc--;
-      assert(argc > 0);
-      const string f_class_file = argv[1];
-    }
-    if (argc != 1 || !(argv[0] == "-f" || argv[0] == "-i" || argv[0] == "-if")) {
-      cerr << "MarkovJump: expected parameters to be -f, -i, or -if\n";
-      exit(1);
-    }
-    individual_params_per_jumpsize_ = (argv[0][1] == 'i');
-    condition_on_flen_ = (argv[0][argv[0].size() - 1] == 'f');
-    if (individual_params_per_jumpsize_) {
-      template_ = "Jump:000";
-      cerr << ", individual jump parameters";
-      if (condition_on_flen_) {
-        template_ += ":F00";
-        cerr << " (split by f-length)";
+  if (argc != 1 || !(argv[0] == "-b" || argv[0] == "+b")) {
+    cerr << "MarkovJump: expected parameters to be -b or +b\n";
+    exit(1);
+  }
+  binary_params_ = argv[0] == "+b";
+  if (binary_params_) {
+    flen2jump2fid_.resize(MAX_SENTENCE_SIZE);
+    for (int i = 1; i < MAX_SENTENCE_SIZE; ++i) {
+      map<int, int>& jump2fid = flen2jump2fid_[i];
+      for (int jump = -i; jump <= i; ++jump) {
+        ostringstream os;
+        os << "Jump:FLen:" << i << "_J:" << jump;
+        jump2fid[jump] = FD::Convert(os.str());
       }
     }
   } else {
@@ -215,6 +210,7 @@ MarkovJump::MarkovJump(const string& param) :
   cerr << endl;
 }
 
+// TODO handle NULLs according to Och 2000
 void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                        const Hypergraph::Edge& edge,
                                        const vector<const void*>& ant_states,
@@ -222,8 +218,24 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                        SparseVector<double>* estimated_features,
                                        void* state) const {
   unsigned char& dpstate = *((unsigned char*)state);
+  const int flen = smeta.GetSourceLength();
   if (edge.Arity() == 0) {
     dpstate = static_cast<unsigned int>(edge.i_);
+    if (edge.prev_i_ == 0) {
+      if (binary_params_) {
+        // NULL will be tricky
+        // TODO initial state distribution, not normal jumps
+        const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;
+        features->set_value(fid, 1.0);
+      }
+    } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {
+      // NULL will be tricky
+      if (binary_params_) {
+        int jumpsize = flen - edge.i_;
+        const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
+        features->set_value(fid, 1.0);
+      }
+    }
   } else if (edge.Arity() == 1) {
     dpstate = *((unsigned char*)ant_states[0]);
   } else if (edge.Arity() == 2) {
@@ -234,27 +246,12 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
     else
       dpstate = static_cast<unsigned int>(right_index);
     const int jumpsize = right_index - left_index;
-    features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def
-    if (individual_params_per_jumpsize_) {
-      string fname = template_;
-      int param = jumpsize;
-      if (jumpsize < 0) {
-        param *= -1;
-        fname[5]='L';
-      } else if (jumpsize > 0) {
-        fname[5]='R';
-      }
-      if (param) {
-        fname[6] = '0' + (param / 10);
-        fname[7] = '0' + (param % 10);
-      }
-      if (condition_on_flen_) {
-        const int flen = smeta.GetSourceLength();
-        fname[10] = '0' + (flen / 10);
-        fname[11] = '0' + (flen % 10);
-      }
-      features->set_value(FD::Convert(fname), 1.0);
+    if (binary_params_) {
+      const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
+      features->set_value(fid, 1.0);
+    } else {
+      features->set_value(fid_, fabs(jumpsize - 1)); // Blunsom and Cohn def
     }
   } else {
     assert(!"something really unexpected is happening");
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
index 688750de..c44ad26b 100644
--- a/decoder/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -49,10 +49,8 @@ class MarkovJump : public FeatureFunction {
                              void* out_context) const;
  private:
   const int fid_;
-  bool individual_params_per_jumpsize_;
-  bool condition_on_flen_;
-  bool condition_on_fclass_;
-  std::string template_;
+  bool binary_params_;
+  std::vector<std::map<int, int> > flen2jump2fid_;
 };
 
 class MarkovJumpFClass : public FeatureFunction {
diff --git a/decoder/lexalign.cc b/decoder/lexalign.cc
index ee3b5fe0..8dd77c53 100644
--- a/decoder/lexalign.cc
+++ b/decoder/lexalign.cc
@@ -31,17 +31,24 @@ struct LexicalAlignImpl {
     const WordID& e_i = target[i][0].label;
     Hypergraph::Node* node = forest->AddNode(kXCAT);
     const int new_node_id = node->id_;
+    int num_srcs = 0;
     for (int j = f_start; j < f_len; ++j) { // for each word in the source
       const WordID src_sym = (j < 0 ? kNULL : lattice[j][0].label);
-      TRulePtr& rule = LexRule(src_sym, e_i);
-      Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
-      edge->i_ = j;
-      edge->j_ = j+1;
-      edge->prev_i_ = i;
-      edge->prev_j_ = i+1;
-      edge->feature_values_ += edge->rule_->GetFeatureValues();
-      forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+      const TRulePtr& rule = LexRule(src_sym, e_i);
+      if (rule) {
+        Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
+        edge->i_ = j;
+        edge->j_ = j+1;
+        edge->prev_i_ = i;
+        edge->prev_j_ = i+1;
+        edge->feature_values_ += edge->rule_->GetFeatureValues();
+        ++num_srcs;
+        forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
+      } else {
+        cerr << TD::Convert(src_sym) << " does not translate to " << TD::Convert(e_i) << endl;
+      }
     }
+    assert(num_srcs > 0);
     if (prev_node_id >= 0) {
       const int comb_node_id = forest->AddNode(kXCAT)->id_;
       Hypergraph::TailNodeVector tail(2, prev_node_id);
@@ -66,21 +73,23 @@ struct LexicalAlignImpl {
       return it->second;
     int& fid = e2fid[e];
     if (f == 0) {
-      fid = FD::Convert("Lx_<eps>_" + FD::Escape(TD::Convert(e)));
+      fid = FD::Convert("Lx:<eps>_" + FD::Escape(TD::Convert(e)));
     } else {
-      fid = FD::Convert("Lx_" + FD::Escape(TD::Convert(f)) + "_" + FD::Escape(TD::Convert(e)));
+      fid = FD::Convert("Lx:" + FD::Escape(TD::Convert(f)) + "_" + FD::Escape(TD::Convert(e)));
     }
     return fid;
   }
 
-  inline TRulePtr& LexRule(const WordID& f, const WordID& e) {
+  inline const TRulePtr& LexRule(const WordID& f, const WordID& e) {
+    const int fid = LexFeatureId(f, e);
+    if (!fid) { return kNULL_PTR; }
     map<int, TRulePtr>& e2rule = f2e2rule[f];
     map<int, TRulePtr>::iterator it = e2rule.find(e);
     if (it != e2rule.end()) return it->second;
     TRulePtr& tr = e2rule[e];
     tr.reset(TRule::CreateLexicalRule(f, e));
-    tr->scores_.set_value(LexFeatureId(f, e), 1.0);
+    tr->scores_.set_value(fid, 1.0);
     return tr;
   }
 
@@ -90,6 +99,7 @@ struct LexicalAlignImpl {
   const WordID kNULL;
   const TRulePtr kBINARY;
   const TRulePtr kGOAL_RULE;
+  const TRulePtr kNULL_PTR;
   map<int, map<int, TRulePtr> > f2e2rule;
   map<int, map<int, int> > f2e2fid;
   GrammarPtr grammar;
diff --git a/decoder/lextrans.cc b/decoder/lextrans.cc
index b0e03c69..e7fa1aa1 100644
--- a/decoder/lextrans.cc
+++ b/decoder/lextrans.cc
@@ -1,4 +1,4 @@
-#include "lexcrf.h"
+#include "lextrans.h"
 
 #include <iostream>
 
@@ -10,8 +10,8 @@
 
 using namespace std;
 
-struct LexicalCRFImpl {
-  LexicalCRFImpl(const boost::program_options::variables_map& conf) :
+struct LexicalTransImpl {
+  LexicalTransImpl(const boost::program_options::variables_map& conf) :
       use_null(conf.count("lexcrf_use_null") > 0),
       kXCAT(TD::Convert("X")*-1),
       kNULL(TD::Convert("<eps>")),
@@ -95,10 +95,10 @@ struct LexicalCRFImpl {
   GrammarPtr grammar;
 };
 
-LexicalCRF::LexicalCRF(const boost::program_options::variables_map& conf) :
-  pimpl_(new LexicalCRFImpl(conf)) {}
+LexicalTrans::LexicalTrans(const boost::program_options::variables_map& conf) :
+  pimpl_(new LexicalTransImpl(conf)) {}
 
-bool LexicalCRF::Translate(const string& input,
+bool LexicalTrans::Translate(const string& input,
                            SentenceMetadata* smeta,
                            const vector<double>& weights,
                            Hypergraph* forest) {
diff --git a/decoder/lextrans.h b/decoder/lextrans.h
index 99362c81..9920f79c 100644
--- a/decoder/lextrans.h
+++ b/decoder/lextrans.h
@@ -1,18 +1,18 @@
-#ifndef _LEXCRF_H_
-#define _LEXCRF_H_
+#ifndef _LEXTrans_H_
+#define _LEXTrans_H_
 
 #include "translator.h"
 #include "lattice.h"
 
-struct LexicalCRFImpl;
-struct LexicalCRF : public Translator {
-  LexicalCRF(const boost::program_options::variables_map& conf);
+struct LexicalTransImpl;
+struct LexicalTrans : public Translator {
+  LexicalTrans(const boost::program_options::variables_map& conf);
   bool Translate(const std::string& input,
                  SentenceMetadata* smeta,
                  const std::vector<double>& weights,
                  Hypergraph* forest);
  private:
-  boost::shared_ptr<LexicalCRFImpl> pimpl_;
+  boost::shared_ptr<LexicalTransImpl> pimpl_;
 };
 
 #endif
diff --git a/decoder/stringlib.cc b/decoder/stringlib.cc
index 3ed74bef..3e52ae87 100644
--- a/decoder/stringlib.cc
+++ b/decoder/stringlib.cc
@@ -1,5 +1,6 @@
 #include "stringlib.h"
 
+#include <cstring>
 #include <cstdlib>
 #include <cassert>
 #include <iostream>
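
Note on the new FD::Escape: it is backed by the UrlEncodeString helper added to fdict.cc above, so any character that is meaningful in a cdec weights file ('=', ':', ';', ',', '_', '|', '%', space) is percent-encoded before a feature name is interned. A minimal standalone sketch of the behaviour callers can expect (not part of the patch; linking against the decoder library is assumed):

```cpp
#include <iostream>
#include <string>

#include "fdict.h"

// Sketch of FD::Escape as defined by UrlEncode's reserved set "=:;,_| %":
// reserved characters come back percent-encoded with uppercase hex, so the
// result is safe to use as a weight name in a weights file.
int main() {
  std::cout << FD::Escape("=") << std::endl;         // %3D
  std::cout << FD::Escape("a;b") << std::endl;       // a%3Bb
  std::cout << FD::Escape("new_york") << std::endl;  // new%5Fyork ('_' is escaped too)
  std::cout << FD::Escape("a b") << std::endl;       // a+b (space becomes '+')
  return 0;
}
```

This is why the lexalign feature names switched from "Lx_<f>_<e>" to "Lx:<f>_<e>": with '_' escaped inside the surface words, the literal separators in the template stay unambiguous.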
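The rewritten MarkovJump drops the old "Jump:000" name-template scheme: with the +b flag, the constructor precomputes one feature id per (source length, jump) pair named "Jump:FLen:<flen>_J:<jump>", and TraversalFeaturesImpl only does a map lookup. A standalone sketch of that table using plain STL, where feature-name strings stand in for the integer ids FD::Convert would return and the 100-word bound is an assumption in place of cdec's MAX_SENTENCE_SIZE:

```cpp
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
  const int kMaxSentenceSize = 100;  // assumed stand-in for MAX_SENTENCE_SIZE
  // Analogue of flen2jump2fid_: one entry per (source length, jump) pair.
  std::vector<std::map<int, std::string> > flen2jump2name(kMaxSentenceSize);
  for (int flen = 1; flen < kMaxSentenceSize; ++flen) {
    for (int jump = -flen; jump <= flen; ++jump) {
      std::ostringstream os;
      os << "Jump:FLen:" << flen << "_J:" << jump;
      flen2jump2name[flen][jump] = os.str();
    }
  }
  // A jump of -3 in a 10-word source sentence fires this binary feature:
  std::cout << flen2jump2name[10].find(-3)->second << std::endl;  // Jump:FLen:10_J:-3
  return 0;
}
```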