initial checkin

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 05:12:27 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-22 05:12:27 +0000
commit: 0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree: 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /decoder/ff_csplit.cc
parent: 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
1 files changed, 225 insertions, 0 deletions
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
new file mode 100644
index 00000000..658603e4
--- /dev/null
+++ b/decoder/ff_csplit.cc
@@ -0,0 +1,225 @@
+#include "ff_csplit.h"
+
+#include <set>
+#include <cstring>
+
+#include "Vocab.h"
+#include "Ngram.h"
+
+#include "sentence_metadata.h"
+#include "lattice.h"
+#include "tdict.h"
+#include "freqdict.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "tdict.h"
+
+using namespace std;
+
+struct BasicCSplitFeaturesImpl {
+  BasicCSplitFeaturesImpl(const string& param) :
+      word_count_(FD::Convert("WordCount")),
+      letters_sq_(FD::Convert("LettersSq")),
+      letters_sqrt_(FD::Convert("LettersSqrt")),
+      in_dict_(FD::Convert("InDict")),
+      short_(FD::Convert("Short")),
+      long_(FD::Convert("Long")),
+      oov_(FD::Convert("OOV")),
+      short_range_(FD::Convert("ShortRange")),
+      high_freq_(FD::Convert("HighFreq")),
+      med_freq_(FD::Convert("MedFreq")),
+      freq_(FD::Convert("Freq")),
+      fl1_(FD::Convert("FreqLen1")),
+      fl2_(FD::Convert("FreqLen2")),
+      bad_(FD::Convert("Bad")) {
+    vector<string> argv;
+    int argc = SplitOnWhitespace(param, &argv);
+    if (argc != 1 && argc != 2) {
+      cerr << "Expected: freqdict.txt [badwords.txt]\n";
+      abort();
+    }
+    freq_dict_.Load(argv[0]);
+    if (argc == 2) {
+      ReadFile rf(argv[1]);
+      istream& in = *rf.stream();
+      while(in) {
+        string badword;
+        in >> badword;
+        if (badword.empty()) continue;
+        bad_words_.insert(TD::Convert(badword));
+      }
+    }
+  }
+
+  void TraversalFeaturesImpl(const Hypergraph::Edge& edge,
+                             SparseVector<double>* features) const;
+
+  const int word_count_;
+  const int letters_sq_;
+  const int letters_sqrt_;
+  const int in_dict_;
+  const int short_;
+  const int long_;
+  const int oov_;
+  const int short_range_;
+  const int high_freq_;
+  const int med_freq_;
+  const int freq_;
+  const int fl1_;
+  const int fl2_;
+  const int bad_;
+  FreqDict freq_dict_;
+  set<WordID> bad_words_;
+};
+
+BasicCSplitFeatures::BasicCSplitFeatures(const string& param) :
+  pimpl_(new BasicCSplitFeaturesImpl(param)) {}
+
+void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
+                                     const Hypergraph::Edge& edge,
+                                     SparseVector<double>* features) const {
+  features->set_value(word_count_, 1.0);
+  features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_));
+  features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_));
+  const WordID word = edge.rule_->e_[1];
+  const char* sword = TD::Convert(word);
+  const int len = strlen(sword);
+  int cur = 0;
+  int chars = 0;
+  while(cur < len) {
+    cur += UTF8Len(sword[cur]);
+    ++chars;
+  }
+
+  // these are corrections that attempt to make chars
+  // more like a phoneme count than a letter count, they
+  // are only really meaningful for german and should
+  // probably be gotten rid of
+  bool has_sch = strstr(sword, "sch");
+  bool has_ch = (!has_sch && strstr(sword, "ch"));
+  bool has_ie = strstr(sword, "ie");
+  bool has_zw = strstr(sword, "zw");
+  if (has_sch) chars -= 2;
+  if (has_ch) --chars;
+  if (has_ie) --chars;
+  if (has_zw) --chars;
+
+  float freq = freq_dict_.LookUp(word);
+  if (freq) {
+    features->set_value(freq_, freq);
+    features->set_value(in_dict_, 1.0);
+  } else {
+    features->set_value(oov_, 1.0);
+    freq = 99.0f;
+  }
+  if (bad_words_.count(word) != 0)
+    features->set_value(bad_, 1.0);
+  if (chars < 5)
+    features->set_value(short_, 1.0);
+  if (chars > 10)
+    features->set_value(long_, 1.0);
+  if (freq < 7.0f)
+    features->set_value(high_freq_, 1.0);
+  if (freq > 8.0f && freq < 10.f)
+    features->set_value(med_freq_, 1.0);
+  if (freq < 10.0f && chars < 5)
+    features->set_value(short_range_, 1.0);
+
+  // i don't understand these features, but they really help!
+  features->set_value(fl1_, sqrt(chars * freq));
+  features->set_value(fl2_, freq / chars);
+}
+
+void BasicCSplitFeatures::TraversalFeaturesImpl(
+                                     const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const {
+  (void) smeta;
+  (void) ant_contexts;
+  (void) out_context;
+  (void) estimated_features;
+  if (edge.Arity() == 0) return;
+  if (edge.rule_->EWords() != 1) return;
+  pimpl_->TraversalFeaturesImpl(edge, features);
+}
+
+struct ReverseCharLMCSplitFeatureImpl {
+  ReverseCharLMCSplitFeatureImpl(const string& param) :
+      order_(5),
+      vocab_(*TD::dict_),
+      ngram_(vocab_, order_) {
+    kBOS = vocab_.getIndex("<s>");
+    kEOS = vocab_.getIndex("</s>");
+    File file(param.c_str(), "r", 0);
+    assert(file);
+    cerr << "Reading " << order_ << "-gram LM from " << param << endl;
+    ngram_.read(file);
+  }
+
+  double LeftPhonotacticProb(const Lattice& inword, const int start) {
+    const int end = inword.size();
+    for (int i = 0; i < order_; ++i)
+      sc[i] = kBOS;
+    int sp = min(end - start, order_ - 1);
+    // cerr << "[" << start << "," << sp << "]\n";
+    int ci = (order_ - sp - 1);
+    int wi = start;
+    while (sp > 0) {
+      sc[ci] = inword[wi][0].label;
+      // cerr << " CHAR: " << TD::Convert(sc[ci]) << "  ci=" << ci << endl;
+      ++wi;
+      ++ci;
+      --sp;
+    }
+    // cerr << "  END ci=" << ci << endl;
+    sc[ci] = Vocab_None;
+    const double startprob = ngram_.wordProb(kEOS, sc);
+    // cerr << "  PROB=" << startprob << endl;
+    return startprob;
+  }
+ private:
+  const int order_;
+  Vocab& vocab_;
+  VocabIndex kBOS;
+  VocabIndex kEOS;
+  Ngram ngram_;
+  VocabIndex sc[80];
+};
+
+ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) :
+  pimpl_(new ReverseCharLMCSplitFeatureImpl(param)),
+  fid_(FD::Convert("RevCharLM")) {}
+
+void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
+                                     const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* out_context) const {
+  (void) ant_contexts;
+  (void) estimated_features;
+  (void) out_context;
+
+  if (edge.Arity() != 1) return;
+  if (edge.rule_->EWords() != 1) return;
+  const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
+  features->set_value(fid_, lpp);
+#if 0
+  WordID neighbor_word = 0;
+  const WordID word = edge.rule_->e_[1];
+  if (chars > 4 && (sword[0] == 's' || sword[0] == 'n')) {
+    neighbor_word = TD::Convert(string(&sword[1]));
+  }
+  if (neighbor_word) {
+    float nfreq = freq_dict_.LookUp(neighbor_word);
+    cerr << "COMPARE: " << TD::Convert(word) << " & " << TD::Convert(neighbor_word) << endl;
+    if (!nfreq) nfreq = 99.0f;
+    features->set_value(fdoes_deletion_help_, (freq - nfreq));
+  }
+#endif
+}
+
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-22 05:12:27 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-22 05:12:27 +0000
commit	0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree	8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /decoder/ff_csplit.cc
parent	37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)