minor changes to the way the phonotactic prob is calculated

author: Chris Dyer <redpony@gmail.com> 2009-12-07 01:50:49 -0500
committer: Chris Dyer <redpony@gmail.com> 2009-12-07 01:50:49 -0500
commit: ec7edcc7e398bdb040d810094b8416ad9f279d98 (patch)
tree: bdaea01e4317d04946de432d8b3a187881e63e96
parent: d6c82eaee9489a4d187a266a7d76a3e09cde119f (diff)
4 files changed, 53 insertions, 30 deletions
diff --git a/src/csplit.cc b/src/csplit.cc
index 788f3112..21e1b711 100644
--- a/src/csplit.cc
+++ b/src/csplit.cc
@@ -145,6 +145,8 @@ bool CompoundSplit::Translate(const string& input,
   vector<string> in;
   SplitUTF8String(input, &in);
   smeta->SetSourceLength(in.size());  // TODO do utf8 or somethign
+  for (int i = 0; i < in.size(); ++i)
+    smeta->src_lattice_.push_back(vector<LatticeArc>(1, LatticeArc(TD::Convert(in[i]), 0.0, 1)));
   pimpl_->BuildTrellis(in, forest);
   forest->Reweight(weights);
   return true;
diff --git a/src/csplit.h b/src/csplit.h
index 5911af77..54e5329d 100644
--- a/src/csplit.h
+++ b/src/csplit.h
@@ -4,6 +4,13 @@
 #include "translator.h"
 #include "lattice.h"
 
+// this "translator" takes single words (with NO SPACES) and segments
+// them using the approach described in:
+//
+// C. Dyer. (2009) Using a maximum entropy model to build segmentation
+//                 lattices for MT. In Proceedings of NAACL HLT 2009.
+// Note: an extra word-space marker ("#") is inserted at the left edge of
+// the forest!
 struct CompoundSplitImpl;
 struct CompoundSplit : public Translator {
   CompoundSplit(const boost::program_options::variables_map& conf);
diff --git a/src/ff_csplit.cc b/src/ff_csplit.cc
index e24d7d1d..eb106047 100644
--- a/src/ff_csplit.cc
+++ b/src/ff_csplit.cc
@@ -3,15 +3,17 @@
 #include <set>
 #include <cstring>
 
+#include "Vocab.h"
+#include "Ngram.h"
+
+#include "sentence_metadata.h"
+#include "lattice.h"
 #include "tdict.h"
 #include "freqdict.h"
 #include "filelib.h"
 #include "stringlib.h"
 #include "tdict.h"
 
-#include "Vocab.h"
-#include "Ngram.h"
-
 using namespace std;
 
 struct BasicCSplitFeaturesImpl {
@@ -25,6 +27,8 @@ struct BasicCSplitFeaturesImpl {
       high_freq_(FD::Convert("HighFreq")),
       med_freq_(FD::Convert("MedFreq")),
       freq_(FD::Convert("Freq")),
+      fl1_(FD::Convert("FreqLen1")),
+      fl2_(FD::Convert("FreqLen2")),
       bad_(FD::Convert("Bad")) {
     vector<string> argv;
     int argc = SplitOnWhitespace(param, &argv);
@@ -57,6 +61,8 @@ struct BasicCSplitFeaturesImpl {
   const int high_freq_;
   const int med_freq_;
   const int freq_;
+  const int fl1_;
+  const int fl2_;
   const int bad_;
   FreqDict freq_dict_;
   set<WordID> bad_words_;
@@ -78,6 +84,11 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
     cur += UTF8Len(sword[cur]);
     ++chars;
   }
+
+  // These are corrections that attempt to make chars
+  // more like a phoneme count than a letter count. They
+  // are only really meaningful for German and should
+  // probably be removed.
   bool has_sch = strstr(sword, "sch");
   bool has_ch = (!has_sch && strstr(sword, "ch"));
   bool has_ie = strstr(sword, "ie");
@@ -107,6 +118,10 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
     features->set_value(med_freq_, 1.0);
   if (freq < 10.0f && chars < 5)
     features->set_value(short_range_, 1.0);
+
+  // i don't understand these features, but they really help!
+  features->set_value(fl1_, sqrt(chars * freq));
+  features->set_value(fl2_, freq / chars);
 }
 
 void BasicCSplitFeatures::TraversalFeaturesImpl(
@@ -128,6 +143,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl(
 struct ReverseCharLMCSplitFeatureImpl {
   ReverseCharLMCSplitFeatureImpl(const string& param) :
       order_(5),
+      vocab_(*TD::dict_),
       ngram_(vocab_, order_) {
     kBOS = vocab_.getIndex("<s>");
     kEOS = vocab_.getIndex("</s>");
@@ -137,41 +153,30 @@ struct ReverseCharLMCSplitFeatureImpl {
     ngram_.read(file);
   }
 
-  double LeftPhonotacticProb(const char* word) {
+  double LeftPhonotacticProb(const Lattice& inword, const int start) {
+    const int end = inword.size();
     for (int i = 0; i < order_; ++i)
       sc[i] = kBOS;
-    const int len = strlen(word);
-    int cur = 0;
-    int chars = 0;
-    while(cur < len) {
-      cur += UTF8Len(word[cur]);
-      ++chars;
-    }
-    const int sp = min(chars, order_-1);
-    int wend = 0; cur = 0;
-    while(cur < sp) {
-      wend += UTF8Len(word[wend]);
-      ++cur;
-    }
-    int wi = 0;
+    int sp = min(end - start, order_ - 1);
+    // cerr << "[" << start << "," << sp << "]\n";
     int ci = (order_ - sp - 1);
-    // cerr << "WORD: " << word << endl;
-    while (wi != wend) {
-      const int clen = UTF8Len(word[wi]);
-      string cur_char(&word[wi], clen);
-      wi += clen;
-      // cerr << " char: " << cur_char << "  ci=" << ci << endl;
-      sc[ci++] = vocab_.getIndex(cur_char.c_str());
+    int wi = start;
+    while (sp > 0) {
+      sc[ci] = inword[wi][0].label;
+      // cerr << " CHAR: " << TD::Convert(sc[ci]) << "  ci=" << ci << endl;
+      ++wi;
+      ++ci;
+      --sp;
     }
-    // cerr << "  END sp=" << sp << endl;
-    sc[sp] = Vocab_None;
+    // cerr << "  END ci=" << ci << endl;
+    sc[ci] = Vocab_None;
     const double startprob = ngram_.wordProb(kEOS, sc);
     // cerr << "  PROB=" << startprob << endl;
     return startprob;
   }
  private:
   const int order_;
-  Vocab vocab_;
+  Vocab& vocab_;
   VocabIndex kBOS;
   VocabIndex kEOS;
   Ngram ngram_;
@@ -189,9 +194,13 @@ void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
                                      SparseVector<double>* features,
                                      SparseVector<double>* estimated_features,
                                      void* out_context) const {
+  (void) ant_contexts;
+  (void) estimated_features;
+  (void) out_context;
+
   if (edge.Arity() != 1) return;
   if (edge.rule_->EWords() != 1) return;
-  const double lpp = pimpl_->LeftPhonotacticProb(TD::Convert(edge.rule_->e_[1]));
+  const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
   features->set_value(fid_, lpp);
 }
 
diff --git a/src/sentence_metadata.h b/src/sentence_metadata.h
index 0178f1f5..ef9eb388 100644
--- a/src/sentence_metadata.h
+++ b/src/sentence_metadata.h
@@ -27,11 +27,16 @@ struct SentenceMetadata {
   int GetSourceLength() const { return src_len_; }
   int GetTargetLength() const { return trg_len_; }
   int GetSentenceID() const { return sent_id_; }
+  // this will be empty if the translator accepts non-finite-state (non-FS) input!
+  const Lattice& GetSourceLattice() const { return src_lattice_; }
 
  private:
   const int sent_id_;
+  // the following should be set, if possible, by the Translator
   int src_len_;
-
+ public:
+  Lattice src_lattice_;  // this will only be set if inputs are finite state!
+ private:
   // you need to be very careful when depending on these values
   // they will only be set during training / alignment contexts
   const bool has_reference_;
author	Chris Dyer <redpony@gmail.com>	2009-12-07 01:50:49 -0500
committer	Chris Dyer <redpony@gmail.com>	2009-12-07 01:50:49 -0500
commit	ec7edcc7e398bdb040d810094b8416ad9f279d98 (patch)
tree	bdaea01e4317d04946de432d8b3a187881e63e96
parent	d6c82eaee9489a4d187a266a7d76a3e09cde119f (diff)